diff --git a/.gitattributes b/.gitattributes
index 54a153654417ddaea006800ea5ae57912e55de2d..e4c1d11d06f6212941481ca70eb5d4e9442acb35 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -809,3 +809,251 @@ illustrious_generated/3e2afaad2b7d.png filter=lfs diff=lfs merge=lfs -text
 illustrious_generated/04d6bfa98264.png filter=lfs diff=lfs merge=lfs -text
 illustrious_generated/62a8fa0ac7dd.png filter=lfs diff=lfs merge=lfs -text
 illustrious_generated/d190d03f64a7.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/f6342e8db68a.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/f7ca451e1933.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/6b3c44df8332.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/ed13e74032fb.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/faa1e7049117.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/c17212cc7fda.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/6c268f463a2b.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/a364591ba4c1.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/2ea3ba7918b4.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/2ffb09f5cbc0.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/0d55065059c0.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/85e9723ae8cf.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/e89ab638d462.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/224c2084abb8.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/0b77d88bc5f0.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/91076903bce5.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/7acda55248bc.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/ee32c9618a12.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/698a4bf05f13.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/bf97f1eaffeb.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/62daa562132c.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/9ee7e057c8a2.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/427d956c743b.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/06da7f820423.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/92bcab0aaba1.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/502a84449b45.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/d99abaed93ba.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/3a12bf82c05e.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/433a115b55a3.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/574012fe8664.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/7d22dc2a6fb2.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/4f23c350b644.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/e24085ea542f.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/3cc7f3366f7a.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/5242430c6777.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/6fe5f96649a3.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/12875eda15eb.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/eac29190186c.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/c1276a9fc21b.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/a891e5d92031.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/0367ba694b76.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/f84f116882be.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/b8e81c1a4bd1.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/392a7a129a01.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/1506e01a5598.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/cbd5827b38ea.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/b80b59fe722f.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/a2ca03055273.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/b58cf17494db.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/4c587778617b.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/7c5200560049.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/b78d0c1f0687.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/5c6f22f08540.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/9b2b12c21a2b.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/ec96a311c2cb.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/a28e4715fc8c.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/00f5e16a2236.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/0ef8c1ed2c6c.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/f214facc5681.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/f41b4fc2c7d5.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/9e9a0ce3d676.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/26d2ef2d7d03.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/1e774fcc188d.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/7eab3f4f0c8e.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/f8631de95d70.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/8d95e57fcb27.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/7ac791baad53.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/7b8529c066a0.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/7d8509931e4e.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/9fafd1175b72.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/7023242de1c0.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/99d5b088ccd4.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/2bac6ab4413e.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/00ff6449b55d.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/7b900f6e27b1.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/69e10254baf5.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/93d9e9abc98e.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/095dc81d1160.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/3315198d28df.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/2549abad7eff.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/8a90db3476ef.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/72473c769552.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/bbf3fb096202.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/c5e0eb8a2241.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/8fa96985fc06.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/645e3b996530.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/b9fdc64b985c.png filter=lfs 
diff=lfs merge=lfs -text +illustrious_generated/fa67e15ca2bf.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/9f5c49f2e362.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/e8318516b273.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/e801a5ce2da6.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/cd9145683d1e.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/275253c8ad6b.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/f2a6e0c5c432.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/586dbda7c6ff.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/dff506d177c0.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/c8846919f3a8.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/afbdb8dce1e5.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/fd4c46f2141f.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/ee36cea22c91.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/6ca60a86b836.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/11c7f55b2aab.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/d684bc0d0627.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/4f1602c01d5b.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/45c709323899.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/d7bc7c5ba632.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/0e0acc59ef85.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/1c7a7ed6f359.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/31cbd66704bb.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/dd8a48931525.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/7368d4c82b5f.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/c7e1a60c0f5d.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/be56d67f1e08.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/269ee6e9a79c.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/2bb0e99b92bc.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/afd28993674d.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/585afc2017e2.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/f9c5bdc8bef5.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/8f338d47820a.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/e0443895d658.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/67ea9c16fed3.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/78dfdb4f0521.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/fff7c0390e8a.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/c63799030196.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/fc061ac787c7.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/26185801988b.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/656abae8d0b6.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/5c4a2ea8f842.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/2286bf835a6b.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/dc7501a6f47f.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/38b5363061d5.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/451e48977b1a.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/f7621703575c.png filter=lfs diff=lfs merge=lfs -text 
+illustrious_generated/891dc839571c.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/d1e30fd687b5.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/d1413371999b.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/0ad3307ea09c.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/6fba429dafc5.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/481f3834876a.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/1e54c0c78134.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/a564e408f362.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/ec6650b62802.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/9f447e4cf3d7.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/790ece21df10.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/75e576f27cb6.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/205b715d279f.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/060e926dcc0a.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/733c86338921.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/b9f37572031b.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/43eeb1fb403b.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/d22ef7243fac.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/162e3face5a7.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/765bf9d23c7e.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/47418c15a58f.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/3030bee9df5a.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/e4acb93d313c.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/08e454ab01c2.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/3f43e650c7d7.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/085929212457.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/91d346543b7c.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/891abd7c9fa3.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/1927adcb399a.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/7e49e6b5a30b.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/2cd36314054f.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/b569d3590c66.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/9e8dc59217e8.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/c2c3bea0e9d5.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/05972b153525.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/c9bf921e364a.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/13cdedc9c525.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/d8641bfcdd46.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/34afbd2725c8.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/f0d97f98333f.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/76b2de1037cb.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/a370eb471cd7.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/f5ab32c63fb8.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/5718f8172842.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/b7f508ecce88.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/5f147d77f3ed.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/ac9d950baac7.png filter=lfs 
diff=lfs merge=lfs -text +illustrious_generated/8b674edb3a4e.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/8ad0a744de62.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/5b8f74bcc260.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/78026f131004.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/d305fe437c6f.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/7cce990ade4c.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/c76729f0f827.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/0706f94ebdc3.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/22af9def0424.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/43877698ad33.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/5a0201bebc6d.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/7ad096e9b528.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/46edb49b5dbf.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/bd65b176bfe6.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/073f299a3b06.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/fc885c9be9af.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/bcfc32b88c98.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/e55e6cf94025.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/b4a9600f3647.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/d7ef34bf47ee.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/8cbc6e1dbe62.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/8633a3dff7ea.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/cb335826ba02.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/3048ba382498.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/eca43ddadd85.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/365e7d0f97c2.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/e71b25950c5d.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/59a595c825c8.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/82ee8177ef04.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/36915299353b.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/ca07713b354c.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/fbebd175667e.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/dacfbbcd3fb3.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/8a371dac467c.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/40c498965cbd.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/190beb9306ef.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/bb2041beb345.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/6f1c05af41ca.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/9f741bd68919.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/9bb815cccb98.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/41d42d8f4842.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/13166cbea867.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/e2812aff73e9.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/954594f7f0a6.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/c4b5bff2dbc1.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/980b174e831c.png filter=lfs diff=lfs merge=lfs -text 
+illustrious_generated/ed89a47fd589.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/a8e5c9011eef.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/f1de13ffcad6.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/574fba2c6515.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/591e156ad5fd.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/aef907db00ce.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/3967f8d787ab.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/a1ec0d3b0b0e.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/9da135f5f21e.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/8fd9fbffb954.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/24e5b9fe7d38.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/495f1b55919f.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/19ff2ce2a961.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/e39fecdd2676.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/7663094bacec.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/6d5feb7de870.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/abe90752beb0.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/cae43d7fd0f8.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/3f5c59c8ee7b.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/49712a2e71f1.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/6346f39915f3.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/4c6ea9681419.png filter=lfs diff=lfs merge=lfs -text diff --git a/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/__init__.cpython-312.pyc b/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2899647a6e82fbf639b7d7bc14d07b03c0384be1 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/__init__.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_adapters.cpython-312.pyc b/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_adapters.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b1f7eb47d3bfa78f0838a340db3dda9e38e560b5 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_adapters.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_collections.cpython-312.pyc b/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_collections.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..efb7dc12217ffeb49f6cf64ea9341ff1837b2dba Binary files /dev/null and b/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_collections.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_compat.cpython-312.pyc b/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_compat.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dcc8d6a70c088e238daaeea958534a147ba900cc Binary files /dev/null and b/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_compat.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_functools.cpython-312.pyc 
b/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_functools.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7564852d126318056eedd2950936883061927698 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_functools.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_itertools.cpython-312.pyc b/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_itertools.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dd26baf8ce93b33c66c0a3df4c31242ace7d0575 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_itertools.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_meta.cpython-312.pyc b/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_meta.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a04c10ccfc8c22e8d479ef52468f7fd27b897ab0 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_meta.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_text.cpython-312.pyc b/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_text.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e6697fe51d614b5f5bcf0619a5b9f1731f12b002 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_text.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_typing.cpython-312.pyc b/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_typing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6987f964bfdb3450551bddfd37211382a06c1ef6 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_typing.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/importlib_metadata/compat/__init__.py b/.venv/lib/python3.12/site-packages/importlib_metadata/compat/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/importlib_metadata/compat/__pycache__/__init__.cpython-312.pyc b/.venv/lib/python3.12/site-packages/importlib_metadata/compat/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0a349c77d717de0f69c478b62b328f5f0eba2847 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/importlib_metadata/compat/__pycache__/__init__.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/importlib_metadata/compat/__pycache__/py311.cpython-312.pyc b/.venv/lib/python3.12/site-packages/importlib_metadata/compat/__pycache__/py311.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f12e92b4553516c241336205c5fc505df171eafc Binary files /dev/null and b/.venv/lib/python3.12/site-packages/importlib_metadata/compat/__pycache__/py311.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/importlib_metadata/compat/__pycache__/py39.cpython-312.pyc b/.venv/lib/python3.12/site-packages/importlib_metadata/compat/__pycache__/py39.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..53edb8ac07b7843a354bf656fbf2b0e0a6df7176 Binary files 
/dev/null and b/.venv/lib/python3.12/site-packages/importlib_metadata/compat/__pycache__/py39.cpython-312.pyc differ
diff --git a/.venv/lib/python3.12/site-packages/importlib_metadata/compat/py311.py b/.venv/lib/python3.12/site-packages/importlib_metadata/compat/py311.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a5327436f9b1d9eae371e321c491a270634b3cf
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/importlib_metadata/compat/py311.py
@@ -0,0 +1,22 @@
+import os
+import pathlib
+import sys
+import types
+
+
+def wrap(path):  # pragma: no cover
+    """
+    Workaround for https://github.com/python/cpython/issues/84538
+    to add backward compatibility for walk_up=True.
+    An example affected package is dask-labextension, which uses
+    jupyter-packaging to install JupyterLab javascript files outside
+    of site-packages.
+    """
+
+    def relative_to(root, *, walk_up=False):
+        return pathlib.Path(os.path.relpath(path, root))
+
+    return types.SimpleNamespace(relative_to=relative_to)
+
+
+relative_fix = wrap if sys.version_info < (3, 12) else lambda x: x
diff --git a/.venv/lib/python3.12/site-packages/importlib_metadata/compat/py39.py b/.venv/lib/python3.12/site-packages/importlib_metadata/compat/py39.py
new file mode 100644
index 0000000000000000000000000000000000000000..3eb9c01ecbbdcdf7b79d8840ee91c2fe7a734a1c
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/importlib_metadata/compat/py39.py
@@ -0,0 +1,42 @@
+"""
+Compatibility layer with Python 3.8/3.9
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:  # pragma: no cover
+    # Prevent circular imports on runtime.
+    from .. import Distribution, EntryPoint
+else:
+    Distribution = EntryPoint = Any
+
+from .._typing import md_none
+
+
+def normalized_name(dist: Distribution) -> str | None:
+    """
+    Honor name normalization for distributions that don't provide ``_normalized_name``.
+    """
+    try:
+        return dist._normalized_name
+    except AttributeError:
+        from .. import Prepared  # -> delay to prevent circular imports.
+
+        return Prepared.normalize(
+            getattr(dist, "name", None) or md_none(dist.metadata)['Name']
+        )
+
+
+def ep_matches(ep: EntryPoint, **params) -> bool:
+    """
+    Workaround for ``EntryPoint`` objects without the ``matches`` method.
+    """
+    try:
+        return ep.matches(**params)
+    except AttributeError:
+        from .. import EntryPoint  # -> delay to prevent circular imports.
+
+        # Reconstruct the EntryPoint object to make sure it is compatible.
+        return EntryPoint(ep.name, ep.value, ep.group).matches(**params)
diff --git a/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn.h b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn.h
new file mode 100644
index 0000000000000000000000000000000000000000..7e08847c95f1294bc99e96e737a53cc6ebb7a458
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright 2014-2023 NVIDIA Corporation. All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.
Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/* cudnn : Neural Networks Library */ + +#if !defined(CUDNN_H_) +#define CUDNN_H_ +#if defined(__cplusplus) +extern "C" { +#endif + +#include +#include "cudnn_version.h" +#include "cudnn_graph.h" +#include "cudnn_ops.h" +#include "cudnn_adv.h" +#include "cudnn_cnn.h" + +#if defined(__cplusplus) +} +#endif +#endif /* CUDNN_H_ */ diff --git a/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_adv.h b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_adv.h new file mode 100644 index 0000000000000000000000000000000000000000..5d9bef65d5323dd3354299569d869191a07615cf --- /dev/null +++ b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_adv.h @@ -0,0 +1,669 @@ +/* + * Copyright 2014-2023 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. 
+ * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/* cudnn_adv : cuDNN's advanced and experimental features. + +*/ + +#if !defined(CUDNN_ADV_H_) +#define CUDNN_ADV_H_ + +#include + +#include "cudnn_version.h" +#include "cudnn_ops.h" + +/* These version numbers are autogenerated, do not edit manually. */ +#define CUDNN_ADV_MAJOR 9 +#define CUDNN_ADV_MINOR 10 +#define CUDNN_ADV_PATCH 2 + +#if (CUDNN_ADV_MAJOR != CUDNN_MAJOR) || (CUDNN_ADV_MINOR != CUDNN_MINOR) || (CUDNN_ADV_PATCH != CUDNN_PATCHLEVEL) +#error Version mismatch in cuDNN ADV INFER!!! 
+#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +/* BASIC RNN API */ + +typedef enum { + CUDNN_RNN_ALGO_STANDARD = 0, + CUDNN_RNN_ALGO_PERSIST_STATIC = 1, + CUDNN_RNN_ALGO_PERSIST_DYNAMIC = 2, + CUDNN_RNN_ALGO_PERSIST_STATIC_SMALL_H = 3, + CUDNN_RNN_ALGO_COUNT = 4, +} cudnnRNNAlgo_t; + +typedef enum { + CUDNN_FWD_MODE_INFERENCE = 0, + CUDNN_FWD_MODE_TRAINING = 1, +} cudnnForwardMode_t; + +typedef enum { + CUDNN_RNN_RELU = 0, /* basic RNN cell type with ReLu activation */ + CUDNN_RNN_TANH = 1, /* basic RNN cell type with tanh activation */ + CUDNN_LSTM = 2, /* LSTM with optional recurrent projection and clipping */ + CUDNN_GRU = 3, /* Using h' = tanh(r * Uh(t-1) + Wx) and h = (1 - z) * h' + z * h(t-1); */ +} cudnnRNNMode_t; + +typedef enum { + CUDNN_RNN_NO_BIAS = 0, /* rnn cell formulas do not use biases */ + CUDNN_RNN_SINGLE_INP_BIAS = 1, /* rnn cell formulas use one input bias in input GEMM */ + CUDNN_RNN_DOUBLE_BIAS = 2, /* default, rnn cell formulas use two bias vectors */ + CUDNN_RNN_SINGLE_REC_BIAS = 3 /* rnn cell formulas use one recurrent bias in recurrent GEMM */ +} cudnnRNNBiasMode_t; + +typedef enum { + CUDNN_UNIDIRECTIONAL = 0, /* single direction network */ + CUDNN_BIDIRECTIONAL = 1, /* output concatination at each layer */ +} cudnnDirectionMode_t; + +typedef enum { + CUDNN_LINEAR_INPUT = 0, /* adjustable weight matrix in first layer input GEMM */ + CUDNN_SKIP_INPUT = 1, /* fixed identity matrix in the first layer input GEMM */ +} cudnnRNNInputMode_t; + +typedef enum { + CUDNN_RNN_CLIP_NONE = 0, /* disables LSTM cell clipping */ + CUDNN_RNN_CLIP_MINMAX = 1, /* enables LSTM cell clipping */ +} cudnnRNNClipMode_t; + +typedef enum { + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED = 0, /* padded, outer stride from one time-step to the next */ + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED = 1, /* sequence length sorted and packed as in basic RNN api */ + CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED = 2, /* padded, outer stride from one batch to the next */ +} cudnnRNNDataLayout_t; + +/* For auxFlags in cudnnSetRNNDescriptor_v8() */ +#define CUDNN_RNN_PADDED_IO_DISABLED 0 +#define CUDNN_RNN_PADDED_IO_ENABLED (1U << 0) + +struct cudnnRNNStruct; +typedef struct cudnnRNNStruct *cudnnRNNDescriptor_t; + +struct cudnnRNNDataStruct; +typedef struct cudnnRNNDataStruct *cudnnRNNDataDescriptor_t; + +cudnnStatus_t CUDNNWINAPI +cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t *rnnDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyRNNDescriptor(cudnnRNNDescriptor_t rnnDesc); + +/* + * mathPrec in cudnnSetRNNDescriptor_v8() specifies compute precision. + * Compute precision is further modified by mathType that sets the + * preferred option for using NVIDIA Tensor Cores. dataType specify + * input/output data type and weight/bias type. 
+ */ + +cudnnStatus_t CUDNNWINAPI +cudnnSetRNNDescriptor_v8(cudnnRNNDescriptor_t rnnDesc, + cudnnRNNAlgo_t algo, + cudnnRNNMode_t cellMode, + cudnnRNNBiasMode_t biasMode, + cudnnDirectionMode_t dirMode, + cudnnRNNInputMode_t inputMode, + cudnnDataType_t dataType, + cudnnDataType_t mathPrec, + cudnnMathType_t mathType, + int32_t inputSize, + int32_t hiddenSize, + int32_t projSize, + int32_t numLayers, + cudnnDropoutDescriptor_t dropoutDesc, + uint32_t auxFlags); + +cudnnStatus_t CUDNNWINAPI +cudnnGetRNNDescriptor_v8(cudnnRNNDescriptor_t rnnDesc, + cudnnRNNAlgo_t *algo, + cudnnRNNMode_t *cellMode, + cudnnRNNBiasMode_t *biasMode, + cudnnDirectionMode_t *dirMode, + cudnnRNNInputMode_t *inputMode, + cudnnDataType_t *dataType, + cudnnDataType_t *mathPrec, + cudnnMathType_t *mathType, + int32_t *inputSize, + int32_t *hiddenSize, + int32_t *projSize, + int32_t *numLayers, + cudnnDropoutDescriptor_t *dropoutDesc, + uint32_t *auxFlags); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnRNNSetClip_v8(cudnnRNNDescriptor_t rnnDesc, + cudnnRNNClipMode_t clipMode, + cudnnNanPropagation_t clipNanOpt, + double lclip, + double rclip); + +cudnnStatus_t CUDNNWINAPI +cudnnRNNSetClip_v9(cudnnRNNDescriptor_t rnnDesc, cudnnRNNClipMode_t clipMode, double lclip, double rclip); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnRNNGetClip_v8(cudnnRNNDescriptor_t rnnDesc, + cudnnRNNClipMode_t *clipMode, + cudnnNanPropagation_t *clipNanOpt, + double *lclip, + double *rclip); + +cudnnStatus_t CUDNNWINAPI +cudnnRNNGetClip_v9(cudnnRNNDescriptor_t rnnDesc, cudnnRNNClipMode_t *clipMode, double *lclip, double *rclip); + +cudnnStatus_t CUDNNWINAPI +cudnnBuildRNNDynamic(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, int miniBatch); + +cudnnStatus_t CUDNNWINAPI +cudnnGetRNNTempSpaceSizes(cudnnHandle_t handle, + cudnnRNNDescriptor_t rnnDesc, + cudnnForwardMode_t fwdMode, + cudnnRNNDataDescriptor_t xDesc, + size_t *workSpaceSize, + size_t *reserveSpaceSize); + +cudnnStatus_t CUDNNWINAPI +cudnnGetRNNWeightSpaceSize(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, size_t *weightSpaceSize); + +cudnnStatus_t CUDNNWINAPI +cudnnGetRNNWeightParams(cudnnHandle_t handle, + cudnnRNNDescriptor_t rnnDesc, + int32_t pseudoLayer, + size_t weightSpaceSize, + const void *weightSpace, + int32_t linLayerID, + cudnnTensorDescriptor_t mDesc, + void **mAddr, + cudnnTensorDescriptor_t bDesc, + void **bAddr); + +cudnnStatus_t CUDNNWINAPI +cudnnCreateRNNDataDescriptor(cudnnRNNDataDescriptor_t *rnnDataDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnSetRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc, + cudnnDataType_t dataType, + cudnnRNNDataLayout_t layout, + int maxSeqLength, + int batchSize, + int vectorSize, + const int seqLengthArray[], /* length of each sequence in the batch */ + void *paddingFill); /* symbol for filling padding position in output */ + +cudnnStatus_t CUDNNWINAPI +cudnnGetRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc, + cudnnDataType_t *dataType, + cudnnRNNDataLayout_t *layout, + int *maxSeqLength, + int *batchSize, + int *vectorSize, + int arrayLengthRequested, + int seqLengthArray[], + void *paddingFill); + +cudnnStatus_t CUDNNWINAPI +cudnnRNNForward(cudnnHandle_t handle, + cudnnRNNDescriptor_t rnnDesc, + cudnnForwardMode_t fwdMode, + const int32_t devSeqLengths[], + cudnnRNNDataDescriptor_t xDesc, + const void *x, + cudnnRNNDataDescriptor_t yDesc, + void *y, + cudnnTensorDescriptor_t hDesc, + const void *hx, + 
void *hy, + cudnnTensorDescriptor_t cDesc, + const void *cx, + void *cy, + size_t weightSpaceSize, + const void *weightSpace, + size_t workSpaceSize, + void *workSpace, + size_t reserveSpaceSize, + void *reserveSpace); + +/* Sequence data descriptor */ + +typedef enum { + CUDNN_SEQDATA_TIME_DIM = 0, /* index in time */ + CUDNN_SEQDATA_BATCH_DIM = 1, /* index in batch */ + CUDNN_SEQDATA_BEAM_DIM = 2, /* index in beam */ + CUDNN_SEQDATA_VECT_DIM = 3 /* index in vector */ +} cudnnSeqDataAxis_t; + +struct cudnnSeqDataStruct; +typedef struct cudnnSeqDataStruct *cudnnSeqDataDescriptor_t CUDNN_DEPRECATED; + +#define CUDNN_SEQDATA_DIM_COUNT 4 /* dimension count */ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateSeqDataDescriptor(cudnnSeqDataDescriptor_t *seqDataDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroySeqDataDescriptor(cudnnSeqDataDescriptor_t seqDataDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetSeqDataDescriptor(cudnnSeqDataDescriptor_t seqDataDesc, + cudnnDataType_t dataType, + int nbDims, + const int dimA[], + const cudnnSeqDataAxis_t axes[], + size_t seqLengthArraySize, + const int seqLengthArray[], + void *paddingFill); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetSeqDataDescriptor(const cudnnSeqDataDescriptor_t seqDataDesc, + cudnnDataType_t *dataType, + int *nbDims, + int nbDimsRequested, + int dimA[], + cudnnSeqDataAxis_t axes[], + size_t *seqLengthArraySize, + size_t seqLengthSizeRequested, + int seqLengthArray[], + void *paddingFill); + +/* Multihead Attention */ + +/* + * Multi-head attention options passed via 'attnMode' in cudnnSetAttnDescriptor(). + * Use the bitwise OR operator to combine several settings listed below. Additional + * minor options can be added here w/o changing or introducing new API functions. 
+ */ +#define CUDNN_ATTN_QUERYMAP_ALL_TO_ONE 0 /* multiple Q-s map to a single (K,V) set when beam size > 1 */ +#define CUDNN_ATTN_QUERYMAP_ONE_TO_ONE (1U << 0) /* multiple Q-s map to multiple (K,V) sets when beam size > 1 */ +#define CUDNN_ATTN_DISABLE_PROJ_BIASES 0 /* no biases in attention input and output projections */ +#define CUDNN_ATTN_ENABLE_PROJ_BIASES (1U << 1) /* use biases in attention input and output projections */ + +struct cudnnAttnStruct; +typedef struct cudnnAttnStruct *cudnnAttnDescriptor_t CUDNN_DEPRECATED; + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateAttnDescriptor(cudnnAttnDescriptor_t *attnDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyAttnDescriptor(cudnnAttnDescriptor_t attnDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetAttnDescriptor(cudnnAttnDescriptor_t attnDesc, + unsigned attnMode, + int nHeads, + double smScaler, + cudnnDataType_t dataType, + cudnnDataType_t computePrec, + cudnnMathType_t mathType, + cudnnDropoutDescriptor_t attnDropoutDesc, + cudnnDropoutDescriptor_t postDropoutDesc, + int qSize, + int kSize, + int vSize, + int qProjSize, + int kProjSize, + int vProjSize, + int oProjSize, + int qoMaxSeqLength, + int kvMaxSeqLength, + int maxBatchSize, + int maxBeamSize); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetAttnDescriptor(cudnnAttnDescriptor_t attnDesc, + unsigned *attnMode, + int *nHeads, + double *smScaler, + cudnnDataType_t *dataType, + cudnnDataType_t *computePrec, + cudnnMathType_t *mathType, + cudnnDropoutDescriptor_t *attnDropoutDesc, + cudnnDropoutDescriptor_t *postDropoutDesc, + int *qSize, + int *kSize, + int *vSize, + int *qProjSize, + int *kProjSize, + int *vProjSize, + int *oProjSize, + int *qoMaxSeqLength, + int *kvMaxSeqLength, + int *maxBatchSize, + int *maxBeamSize); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetMultiHeadAttnBuffers(cudnnHandle_t handle, + const cudnnAttnDescriptor_t attnDesc, + size_t *weightSizeInBytes, + size_t *workSpaceSizeInBytes, + size_t *reserveSpaceSizeInBytes); + +typedef enum { + CUDNN_MH_ATTN_Q_WEIGHTS = 0, /* input projection weights for 'queries' */ + CUDNN_MH_ATTN_K_WEIGHTS = 1, /* input projection weights for 'keys' */ + CUDNN_MH_ATTN_V_WEIGHTS = 2, /* input projection weights for 'values' */ + CUDNN_MH_ATTN_O_WEIGHTS = 3, /* output projection weights */ + CUDNN_MH_ATTN_Q_BIASES = 4, /* input projection bias tensor for 'queries' */ + CUDNN_MH_ATTN_K_BIASES = 5, /* input projection bias for 'keys' */ + CUDNN_MH_ATTN_V_BIASES = 6, /* input projection bias for 'values' */ + CUDNN_MH_ATTN_O_BIASES = 7, /* output projection biases */ +} cudnnMultiHeadAttnWeightKind_t; + +#define CUDNN_ATTN_WKIND_COUNT 8 /* Number of attention weight/bias tensors */ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetMultiHeadAttnWeights(cudnnHandle_t handle, + const cudnnAttnDescriptor_t attnDesc, + cudnnMultiHeadAttnWeightKind_t wKind, + size_t weightSizeInBytes, + const void *weights, + cudnnTensorDescriptor_t wDesc, + void **wAddr); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnMultiHeadAttnForward(cudnnHandle_t handle, + const cudnnAttnDescriptor_t attnDesc, + int currIdx, + const int loWinIdx[], + const int hiWinIdx[], + const int devSeqLengthsQO[], + const int devSeqLengthsKV[], + const cudnnSeqDataDescriptor_t qDesc, + const void *queries, + const void *residuals, + const cudnnSeqDataDescriptor_t kDesc, + const void *keys, + const cudnnSeqDataDescriptor_t vDesc, + const void *values, + const cudnnSeqDataDescriptor_t oDesc, + void *out, + 
size_t weightSizeInBytes, + const void *weights, + size_t workSpaceSizeInBytes, + void *workSpace, + size_t reserveSpaceSizeInBytes, + void *reserveSpace); + +/* + * \brief Cross-library version checker. + * This function is implemented differently in each sub-library. Each sublib + * checks whether its own version matches that of its dependencies. + * \returns CUDNN_STATUS_SUCCESS if the version check passes, + * CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH if the versions are inconsistent. + */ +cudnnStatus_t CUDNNWINAPI +cudnnAdvVersionCheck(void); + +typedef enum { + CUDNN_WGRAD_MODE_ADD = 0, /* add partial gradients to wgrad output buffers */ + CUDNN_WGRAD_MODE_SET = 1, /* write partial gradients to wgrad output buffers */ +} cudnnWgradMode_t; + +cudnnStatus_t CUDNNWINAPI +cudnnRNNBackwardData_v8(cudnnHandle_t handle, + cudnnRNNDescriptor_t rnnDesc, + const int32_t devSeqLengths[], + cudnnRNNDataDescriptor_t yDesc, + const void *y, + const void *dy, + cudnnRNNDataDescriptor_t xDesc, + void *dx, + cudnnTensorDescriptor_t hDesc, + const void *hx, + const void *dhy, + void *dhx, + cudnnTensorDescriptor_t cDesc, + const void *cx, + const void *dcy, + void *dcx, + size_t weightSpaceSize, + const void *weightSpace, + size_t workSpaceSize, + void *workSpace, + size_t reserveSpaceSize, + void *reserveSpace); + +cudnnStatus_t CUDNNWINAPI +cudnnRNNBackwardWeights_v8(cudnnHandle_t handle, + cudnnRNNDescriptor_t rnnDesc, + cudnnWgradMode_t addGrad, + const int32_t devSeqLengths[], + cudnnRNNDataDescriptor_t xDesc, + const void *x, + cudnnTensorDescriptor_t hDesc, + const void *hx, + cudnnRNNDataDescriptor_t yDesc, + const void *y, + size_t weightSpaceSize, + void *dweightSpace, + size_t workSpaceSize, + void *workSpace, + size_t reserveSpaceSize, + void *reserveSpace); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnMultiHeadAttnBackwardData(cudnnHandle_t handle, + const cudnnAttnDescriptor_t attnDesc, + const int loWinIdx[], + const int hiWinIdx[], + const int devSeqLengthsDQDO[], + const int devSeqLengthsDKDV[], + const cudnnSeqDataDescriptor_t doDesc, + const void *dout, + const cudnnSeqDataDescriptor_t dqDesc, + void *dqueries, + const void *queries, + const cudnnSeqDataDescriptor_t dkDesc, + void *dkeys, + const void *keys, + const cudnnSeqDataDescriptor_t dvDesc, + void *dvalues, + const void *values, + size_t weightSizeInBytes, + const void *weights, + size_t workSpaceSizeInBytes, + void *workSpace, + size_t reserveSpaceSizeInBytes, + void *reserveSpace); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnMultiHeadAttnBackwardWeights(cudnnHandle_t handle, + const cudnnAttnDescriptor_t attnDesc, + cudnnWgradMode_t addGrad, + const cudnnSeqDataDescriptor_t qDesc, + const void *queries, + const cudnnSeqDataDescriptor_t kDesc, + const void *keys, + const cudnnSeqDataDescriptor_t vDesc, + const void *values, + const cudnnSeqDataDescriptor_t doDesc, + const void *dout, + size_t weightSizeInBytes, + const void *weights, + void *dweights, + size_t workSpaceSizeInBytes, + void *workSpace, + size_t reserveSpaceSizeInBytes, + void *reserveSpace); + +/* + * CTC (Connectionist Temporal Classification) loss descriptor create/destory/set/get functions + */ +/* Input normalization mode for loss function */ +typedef enum { + CUDNN_LOSS_NORMALIZATION_NONE = 0, + CUDNN_LOSS_NORMALIZATION_SOFTMAX = 1, +} cudnnLossNormalizationMode_t; + +cudnnStatus_t CUDNNWINAPI +cudnnCreateCTCLossDescriptor(cudnnCTCLossDescriptor_t *ctcLossDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI 
+cudnnSetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t compType); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetCTCLossDescriptorEx(cudnnCTCLossDescriptor_t ctcLossDesc, + cudnnDataType_t compType, + cudnnLossNormalizationMode_t normMode, + cudnnNanPropagation_t gradMode); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetCTCLossDescriptor_v8(cudnnCTCLossDescriptor_t ctcLossDesc, + cudnnDataType_t compType, + cudnnLossNormalizationMode_t normMode, + cudnnNanPropagation_t gradMode, + int maxLabelLength); + +cudnnStatus_t CUDNNWINAPI +cudnnSetCTCLossDescriptor_v9(cudnnCTCLossDescriptor_t ctcLossDesc, + cudnnDataType_t compType, + cudnnLossNormalizationMode_t normMode, + cudnnCTCGradMode_t ctcGradMode, + int maxLabelLength); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t *compType); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetCTCLossDescriptorEx(cudnnCTCLossDescriptor_t ctcLossDesc, + cudnnDataType_t *compType, + cudnnLossNormalizationMode_t *normMode, + cudnnNanPropagation_t *gradMode); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetCTCLossDescriptor_v8(cudnnCTCLossDescriptor_t ctcLossDesc, + cudnnDataType_t *compType, + cudnnLossNormalizationMode_t *normMode, + cudnnNanPropagation_t *gradMode, + int *maxLabelLength); + +cudnnStatus_t CUDNNWINAPI +cudnnGetCTCLossDescriptor_v9(cudnnCTCLossDescriptor_t ctcLossDesc, + cudnnDataType_t *compType, + cudnnLossNormalizationMode_t *normMode, + cudnnCTCGradMode_t *ctcGradMode, + int *maxLabelLength); + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc); + +/* return the ctc costs and gradients, given the probabilities and labels */ +cudnnStatus_t CUDNNWINAPI +cudnnCTCLoss( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the + timing steps, N is the mini batch size, A is the alphabet size) */ + const void *probs, /* probabilities after softmax, in GPU memory */ + const int hostLabels[], /* labels, in CPU memory */ + const int hostLabelLengths[], /* the length of each label, in CPU memory */ + const int hostInputLengths[], /* the lengths of timing steps in each batch, in CPU memory */ + void *costs, /* the returned costs of CTC, in GPU memory */ + const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A */ + void *gradients, /* the returned CTC gradients, in GPU memory, to compute costs only, set it to NULL */ + cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ + cudnnCTCLossDescriptor_t ctcLossDesc, + void *workspace, /* pointer to the workspace, in GPU memory */ + size_t workSpaceSizeInBytes); /* size of the workspace */ + +/* return the ctc costs and gradients, given the probabilities and labels */ +cudnnStatus_t CUDNNWINAPI +cudnnCTCLoss_v8( + cudnnHandle_t handle, + cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ + cudnnCTCLossDescriptor_t ctcLossDesc, + const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the + timing steps, N is the mini batch size, A is the alphabet size) */ + const void *probs, /* probabilities after softmax, in GPU memory */ + const int labels[], /* labels, in GPU memory */ + const int labelLengths[], /* the length of each label, in GPU memory */ + const int inputLengths[], /* the lengths of timing steps 
in each batch, in GPU memory */ + void *costs, /* the returned costs of CTC, in GPU memory */ + const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A */ + void *gradients, /* the returned CTC gradients, in GPU memory, to compute costs only, set it to NULL */ + size_t workSpaceSizeInBytes, /* size of the workspace */ + void *workspace); /* pointer to the workspace, in GPU memory */ + +/* return the workspace size needed for ctc */ +cudnnStatus_t CUDNNWINAPI +cudnnGetCTCLossWorkspaceSize( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the + timing steps, N is the mini batch size, A is the alphabet size) */ + const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the + dimensions are T,N,A. To compute costs + only, set it to NULL */ + const int *labels, /* labels, in CPU memory */ + const int *labelLengths, /* the length of each label, in CPU memory */ + const int *inputLengths, /* the lengths of timing steps in each batch, in CPU memory */ + cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ + cudnnCTCLossDescriptor_t ctcLossDesc, + size_t *sizeInBytes); /* pointer to the returned workspace size */ + +/* return the workspace size needed for ctc */ +cudnnStatus_t CUDNNWINAPI +cudnnGetCTCLossWorkspaceSize_v8( + cudnnHandle_t handle, + cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ + cudnnCTCLossDescriptor_t ctcLossDesc, + const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the + timing steps, N is the mini batch size, A is the alphabet size) */ + const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the + dimensions are T,N,A. To compute costs + only, set it to NULL */ + size_t *sizeInBytes); /* pointer to the returned workspace size */ + +#if defined(__cplusplus) +} +#endif + +#endif /* CUDNN_ADV_H_ */ diff --git a/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_adv_v9.h b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_adv_v9.h new file mode 100644 index 0000000000000000000000000000000000000000..5d9bef65d5323dd3354299569d869191a07615cf --- /dev/null +++ b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_adv_v9.h @@ -0,0 +1,669 @@ +/* + * Copyright 2014-2023 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. 
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/* cudnn_adv : cuDNN's advanced and experimental features. + +*/ + +#if !defined(CUDNN_ADV_H_) +#define CUDNN_ADV_H_ + +#include + +#include "cudnn_version.h" +#include "cudnn_ops.h" + +/* These version numbers are autogenerated, do not edit manually. */ +#define CUDNN_ADV_MAJOR 9 +#define CUDNN_ADV_MINOR 10 +#define CUDNN_ADV_PATCH 2 + +#if (CUDNN_ADV_MAJOR != CUDNN_MAJOR) || (CUDNN_ADV_MINOR != CUDNN_MINOR) || (CUDNN_ADV_PATCH != CUDNN_PATCHLEVEL) +#error Version mismatch in cuDNN ADV INFER!!! 
+#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +/* BASIC RNN API */ + +typedef enum { + CUDNN_RNN_ALGO_STANDARD = 0, + CUDNN_RNN_ALGO_PERSIST_STATIC = 1, + CUDNN_RNN_ALGO_PERSIST_DYNAMIC = 2, + CUDNN_RNN_ALGO_PERSIST_STATIC_SMALL_H = 3, + CUDNN_RNN_ALGO_COUNT = 4, +} cudnnRNNAlgo_t; + +typedef enum { + CUDNN_FWD_MODE_INFERENCE = 0, + CUDNN_FWD_MODE_TRAINING = 1, +} cudnnForwardMode_t; + +typedef enum { + CUDNN_RNN_RELU = 0, /* basic RNN cell type with ReLu activation */ + CUDNN_RNN_TANH = 1, /* basic RNN cell type with tanh activation */ + CUDNN_LSTM = 2, /* LSTM with optional recurrent projection and clipping */ + CUDNN_GRU = 3, /* Using h' = tanh(r * Uh(t-1) + Wx) and h = (1 - z) * h' + z * h(t-1); */ +} cudnnRNNMode_t; + +typedef enum { + CUDNN_RNN_NO_BIAS = 0, /* rnn cell formulas do not use biases */ + CUDNN_RNN_SINGLE_INP_BIAS = 1, /* rnn cell formulas use one input bias in input GEMM */ + CUDNN_RNN_DOUBLE_BIAS = 2, /* default, rnn cell formulas use two bias vectors */ + CUDNN_RNN_SINGLE_REC_BIAS = 3 /* rnn cell formulas use one recurrent bias in recurrent GEMM */ +} cudnnRNNBiasMode_t; + +typedef enum { + CUDNN_UNIDIRECTIONAL = 0, /* single direction network */ + CUDNN_BIDIRECTIONAL = 1, /* output concatination at each layer */ +} cudnnDirectionMode_t; + +typedef enum { + CUDNN_LINEAR_INPUT = 0, /* adjustable weight matrix in first layer input GEMM */ + CUDNN_SKIP_INPUT = 1, /* fixed identity matrix in the first layer input GEMM */ +} cudnnRNNInputMode_t; + +typedef enum { + CUDNN_RNN_CLIP_NONE = 0, /* disables LSTM cell clipping */ + CUDNN_RNN_CLIP_MINMAX = 1, /* enables LSTM cell clipping */ +} cudnnRNNClipMode_t; + +typedef enum { + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED = 0, /* padded, outer stride from one time-step to the next */ + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED = 1, /* sequence length sorted and packed as in basic RNN api */ + CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED = 2, /* padded, outer stride from one batch to the next */ +} cudnnRNNDataLayout_t; + +/* For auxFlags in cudnnSetRNNDescriptor_v8() */ +#define CUDNN_RNN_PADDED_IO_DISABLED 0 +#define CUDNN_RNN_PADDED_IO_ENABLED (1U << 0) + +struct cudnnRNNStruct; +typedef struct cudnnRNNStruct *cudnnRNNDescriptor_t; + +struct cudnnRNNDataStruct; +typedef struct cudnnRNNDataStruct *cudnnRNNDataDescriptor_t; + +cudnnStatus_t CUDNNWINAPI +cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t *rnnDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyRNNDescriptor(cudnnRNNDescriptor_t rnnDesc); + +/* + * mathPrec in cudnnSetRNNDescriptor_v8() specifies compute precision. + * Compute precision is further modified by mathType that sets the + * preferred option for using NVIDIA Tensor Cores. dataType specify + * input/output data type and weight/bias type. 
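+ *
+ * A minimal illustrative call (a sketch only; the sizes below are
+ * placeholders, not recommended values, and rnnDesc / dropoutDesc are
+ * assumed to have been created earlier):
+ *
+ *   cudnnSetRNNDescriptor_v8(rnnDesc,
+ *                            CUDNN_RNN_ALGO_STANDARD, CUDNN_LSTM,
+ *                            CUDNN_RNN_DOUBLE_BIAS, CUDNN_UNIDIRECTIONAL,
+ *                            CUDNN_LINEAR_INPUT,
+ *                            CUDNN_DATA_FLOAT,    // dataType
+ *                            CUDNN_DATA_FLOAT,    // mathPrec
+ *                            CUDNN_DEFAULT_MATH,  // mathType
+ *                            512,                 // inputSize
+ *                            1024,                // hiddenSize
+ *                            1024,                // projSize (== hiddenSize disables projection)
+ *                            2,                   // numLayers
+ *                            dropoutDesc,
+ *                            CUDNN_RNN_PADDED_IO_ENABLED);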
+ */ + +cudnnStatus_t CUDNNWINAPI +cudnnSetRNNDescriptor_v8(cudnnRNNDescriptor_t rnnDesc, + cudnnRNNAlgo_t algo, + cudnnRNNMode_t cellMode, + cudnnRNNBiasMode_t biasMode, + cudnnDirectionMode_t dirMode, + cudnnRNNInputMode_t inputMode, + cudnnDataType_t dataType, + cudnnDataType_t mathPrec, + cudnnMathType_t mathType, + int32_t inputSize, + int32_t hiddenSize, + int32_t projSize, + int32_t numLayers, + cudnnDropoutDescriptor_t dropoutDesc, + uint32_t auxFlags); + +cudnnStatus_t CUDNNWINAPI +cudnnGetRNNDescriptor_v8(cudnnRNNDescriptor_t rnnDesc, + cudnnRNNAlgo_t *algo, + cudnnRNNMode_t *cellMode, + cudnnRNNBiasMode_t *biasMode, + cudnnDirectionMode_t *dirMode, + cudnnRNNInputMode_t *inputMode, + cudnnDataType_t *dataType, + cudnnDataType_t *mathPrec, + cudnnMathType_t *mathType, + int32_t *inputSize, + int32_t *hiddenSize, + int32_t *projSize, + int32_t *numLayers, + cudnnDropoutDescriptor_t *dropoutDesc, + uint32_t *auxFlags); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnRNNSetClip_v8(cudnnRNNDescriptor_t rnnDesc, + cudnnRNNClipMode_t clipMode, + cudnnNanPropagation_t clipNanOpt, + double lclip, + double rclip); + +cudnnStatus_t CUDNNWINAPI +cudnnRNNSetClip_v9(cudnnRNNDescriptor_t rnnDesc, cudnnRNNClipMode_t clipMode, double lclip, double rclip); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnRNNGetClip_v8(cudnnRNNDescriptor_t rnnDesc, + cudnnRNNClipMode_t *clipMode, + cudnnNanPropagation_t *clipNanOpt, + double *lclip, + double *rclip); + +cudnnStatus_t CUDNNWINAPI +cudnnRNNGetClip_v9(cudnnRNNDescriptor_t rnnDesc, cudnnRNNClipMode_t *clipMode, double *lclip, double *rclip); + +cudnnStatus_t CUDNNWINAPI +cudnnBuildRNNDynamic(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, int miniBatch); + +cudnnStatus_t CUDNNWINAPI +cudnnGetRNNTempSpaceSizes(cudnnHandle_t handle, + cudnnRNNDescriptor_t rnnDesc, + cudnnForwardMode_t fwdMode, + cudnnRNNDataDescriptor_t xDesc, + size_t *workSpaceSize, + size_t *reserveSpaceSize); + +cudnnStatus_t CUDNNWINAPI +cudnnGetRNNWeightSpaceSize(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, size_t *weightSpaceSize); + +cudnnStatus_t CUDNNWINAPI +cudnnGetRNNWeightParams(cudnnHandle_t handle, + cudnnRNNDescriptor_t rnnDesc, + int32_t pseudoLayer, + size_t weightSpaceSize, + const void *weightSpace, + int32_t linLayerID, + cudnnTensorDescriptor_t mDesc, + void **mAddr, + cudnnTensorDescriptor_t bDesc, + void **bAddr); + +cudnnStatus_t CUDNNWINAPI +cudnnCreateRNNDataDescriptor(cudnnRNNDataDescriptor_t *rnnDataDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnSetRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc, + cudnnDataType_t dataType, + cudnnRNNDataLayout_t layout, + int maxSeqLength, + int batchSize, + int vectorSize, + const int seqLengthArray[], /* length of each sequence in the batch */ + void *paddingFill); /* symbol for filling padding position in output */ + +cudnnStatus_t CUDNNWINAPI +cudnnGetRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc, + cudnnDataType_t *dataType, + cudnnRNNDataLayout_t *layout, + int *maxSeqLength, + int *batchSize, + int *vectorSize, + int arrayLengthRequested, + int seqLengthArray[], + void *paddingFill); + +cudnnStatus_t CUDNNWINAPI +cudnnRNNForward(cudnnHandle_t handle, + cudnnRNNDescriptor_t rnnDesc, + cudnnForwardMode_t fwdMode, + const int32_t devSeqLengths[], + cudnnRNNDataDescriptor_t xDesc, + const void *x, + cudnnRNNDataDescriptor_t yDesc, + void *y, + cudnnTensorDescriptor_t hDesc, + const void *hx, + 
void *hy, + cudnnTensorDescriptor_t cDesc, + const void *cx, + void *cy, + size_t weightSpaceSize, + const void *weightSpace, + size_t workSpaceSize, + void *workSpace, + size_t reserveSpaceSize, + void *reserveSpace); + +/* Sequence data descriptor */ + +typedef enum { + CUDNN_SEQDATA_TIME_DIM = 0, /* index in time */ + CUDNN_SEQDATA_BATCH_DIM = 1, /* index in batch */ + CUDNN_SEQDATA_BEAM_DIM = 2, /* index in beam */ + CUDNN_SEQDATA_VECT_DIM = 3 /* index in vector */ +} cudnnSeqDataAxis_t; + +struct cudnnSeqDataStruct; +typedef struct cudnnSeqDataStruct *cudnnSeqDataDescriptor_t CUDNN_DEPRECATED; + +#define CUDNN_SEQDATA_DIM_COUNT 4 /* dimension count */ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateSeqDataDescriptor(cudnnSeqDataDescriptor_t *seqDataDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroySeqDataDescriptor(cudnnSeqDataDescriptor_t seqDataDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetSeqDataDescriptor(cudnnSeqDataDescriptor_t seqDataDesc, + cudnnDataType_t dataType, + int nbDims, + const int dimA[], + const cudnnSeqDataAxis_t axes[], + size_t seqLengthArraySize, + const int seqLengthArray[], + void *paddingFill); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetSeqDataDescriptor(const cudnnSeqDataDescriptor_t seqDataDesc, + cudnnDataType_t *dataType, + int *nbDims, + int nbDimsRequested, + int dimA[], + cudnnSeqDataAxis_t axes[], + size_t *seqLengthArraySize, + size_t seqLengthSizeRequested, + int seqLengthArray[], + void *paddingFill); + +/* Multihead Attention */ + +/* + * Multi-head attention options passed via 'attnMode' in cudnnSetAttnDescriptor(). + * Use the bitwise OR operator to combine several settings listed below. Additional + * minor options can be added here w/o changing or introducing new API functions. 
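+ *
+ * For example (illustrative only), the combination below gives each query
+ * beam its own (K,V) set and enables projection biases; the resulting value
+ * is passed as the 'attnMode' argument of cudnnSetAttnDescriptor():
+ *
+ *   unsigned attnMode = CUDNN_ATTN_QUERYMAP_ONE_TO_ONE | CUDNN_ATTN_ENABLE_PROJ_BIASES;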
+ */ +#define CUDNN_ATTN_QUERYMAP_ALL_TO_ONE 0 /* multiple Q-s map to a single (K,V) set when beam size > 1 */ +#define CUDNN_ATTN_QUERYMAP_ONE_TO_ONE (1U << 0) /* multiple Q-s map to multiple (K,V) sets when beam size > 1 */ +#define CUDNN_ATTN_DISABLE_PROJ_BIASES 0 /* no biases in attention input and output projections */ +#define CUDNN_ATTN_ENABLE_PROJ_BIASES (1U << 1) /* use biases in attention input and output projections */ + +struct cudnnAttnStruct; +typedef struct cudnnAttnStruct *cudnnAttnDescriptor_t CUDNN_DEPRECATED; + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateAttnDescriptor(cudnnAttnDescriptor_t *attnDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyAttnDescriptor(cudnnAttnDescriptor_t attnDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetAttnDescriptor(cudnnAttnDescriptor_t attnDesc, + unsigned attnMode, + int nHeads, + double smScaler, + cudnnDataType_t dataType, + cudnnDataType_t computePrec, + cudnnMathType_t mathType, + cudnnDropoutDescriptor_t attnDropoutDesc, + cudnnDropoutDescriptor_t postDropoutDesc, + int qSize, + int kSize, + int vSize, + int qProjSize, + int kProjSize, + int vProjSize, + int oProjSize, + int qoMaxSeqLength, + int kvMaxSeqLength, + int maxBatchSize, + int maxBeamSize); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetAttnDescriptor(cudnnAttnDescriptor_t attnDesc, + unsigned *attnMode, + int *nHeads, + double *smScaler, + cudnnDataType_t *dataType, + cudnnDataType_t *computePrec, + cudnnMathType_t *mathType, + cudnnDropoutDescriptor_t *attnDropoutDesc, + cudnnDropoutDescriptor_t *postDropoutDesc, + int *qSize, + int *kSize, + int *vSize, + int *qProjSize, + int *kProjSize, + int *vProjSize, + int *oProjSize, + int *qoMaxSeqLength, + int *kvMaxSeqLength, + int *maxBatchSize, + int *maxBeamSize); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetMultiHeadAttnBuffers(cudnnHandle_t handle, + const cudnnAttnDescriptor_t attnDesc, + size_t *weightSizeInBytes, + size_t *workSpaceSizeInBytes, + size_t *reserveSpaceSizeInBytes); + +typedef enum { + CUDNN_MH_ATTN_Q_WEIGHTS = 0, /* input projection weights for 'queries' */ + CUDNN_MH_ATTN_K_WEIGHTS = 1, /* input projection weights for 'keys' */ + CUDNN_MH_ATTN_V_WEIGHTS = 2, /* input projection weights for 'values' */ + CUDNN_MH_ATTN_O_WEIGHTS = 3, /* output projection weights */ + CUDNN_MH_ATTN_Q_BIASES = 4, /* input projection bias tensor for 'queries' */ + CUDNN_MH_ATTN_K_BIASES = 5, /* input projection bias for 'keys' */ + CUDNN_MH_ATTN_V_BIASES = 6, /* input projection bias for 'values' */ + CUDNN_MH_ATTN_O_BIASES = 7, /* output projection biases */ +} cudnnMultiHeadAttnWeightKind_t; + +#define CUDNN_ATTN_WKIND_COUNT 8 /* Number of attention weight/bias tensors */ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetMultiHeadAttnWeights(cudnnHandle_t handle, + const cudnnAttnDescriptor_t attnDesc, + cudnnMultiHeadAttnWeightKind_t wKind, + size_t weightSizeInBytes, + const void *weights, + cudnnTensorDescriptor_t wDesc, + void **wAddr); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnMultiHeadAttnForward(cudnnHandle_t handle, + const cudnnAttnDescriptor_t attnDesc, + int currIdx, + const int loWinIdx[], + const int hiWinIdx[], + const int devSeqLengthsQO[], + const int devSeqLengthsKV[], + const cudnnSeqDataDescriptor_t qDesc, + const void *queries, + const void *residuals, + const cudnnSeqDataDescriptor_t kDesc, + const void *keys, + const cudnnSeqDataDescriptor_t vDesc, + const void *values, + const cudnnSeqDataDescriptor_t oDesc, + void *out, + 
size_t weightSizeInBytes, + const void *weights, + size_t workSpaceSizeInBytes, + void *workSpace, + size_t reserveSpaceSizeInBytes, + void *reserveSpace); + +/* + * \brief Cross-library version checker. + * This function is implemented differently in each sub-library. Each sublib + * checks whether its own version matches that of its dependencies. + * \returns CUDNN_STATUS_SUCCESS if the version check passes, + * CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH if the versions are inconsistent. + */ +cudnnStatus_t CUDNNWINAPI +cudnnAdvVersionCheck(void); + +typedef enum { + CUDNN_WGRAD_MODE_ADD = 0, /* add partial gradients to wgrad output buffers */ + CUDNN_WGRAD_MODE_SET = 1, /* write partial gradients to wgrad output buffers */ +} cudnnWgradMode_t; + +cudnnStatus_t CUDNNWINAPI +cudnnRNNBackwardData_v8(cudnnHandle_t handle, + cudnnRNNDescriptor_t rnnDesc, + const int32_t devSeqLengths[], + cudnnRNNDataDescriptor_t yDesc, + const void *y, + const void *dy, + cudnnRNNDataDescriptor_t xDesc, + void *dx, + cudnnTensorDescriptor_t hDesc, + const void *hx, + const void *dhy, + void *dhx, + cudnnTensorDescriptor_t cDesc, + const void *cx, + const void *dcy, + void *dcx, + size_t weightSpaceSize, + const void *weightSpace, + size_t workSpaceSize, + void *workSpace, + size_t reserveSpaceSize, + void *reserveSpace); + +cudnnStatus_t CUDNNWINAPI +cudnnRNNBackwardWeights_v8(cudnnHandle_t handle, + cudnnRNNDescriptor_t rnnDesc, + cudnnWgradMode_t addGrad, + const int32_t devSeqLengths[], + cudnnRNNDataDescriptor_t xDesc, + const void *x, + cudnnTensorDescriptor_t hDesc, + const void *hx, + cudnnRNNDataDescriptor_t yDesc, + const void *y, + size_t weightSpaceSize, + void *dweightSpace, + size_t workSpaceSize, + void *workSpace, + size_t reserveSpaceSize, + void *reserveSpace); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnMultiHeadAttnBackwardData(cudnnHandle_t handle, + const cudnnAttnDescriptor_t attnDesc, + const int loWinIdx[], + const int hiWinIdx[], + const int devSeqLengthsDQDO[], + const int devSeqLengthsDKDV[], + const cudnnSeqDataDescriptor_t doDesc, + const void *dout, + const cudnnSeqDataDescriptor_t dqDesc, + void *dqueries, + const void *queries, + const cudnnSeqDataDescriptor_t dkDesc, + void *dkeys, + const void *keys, + const cudnnSeqDataDescriptor_t dvDesc, + void *dvalues, + const void *values, + size_t weightSizeInBytes, + const void *weights, + size_t workSpaceSizeInBytes, + void *workSpace, + size_t reserveSpaceSizeInBytes, + void *reserveSpace); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnMultiHeadAttnBackwardWeights(cudnnHandle_t handle, + const cudnnAttnDescriptor_t attnDesc, + cudnnWgradMode_t addGrad, + const cudnnSeqDataDescriptor_t qDesc, + const void *queries, + const cudnnSeqDataDescriptor_t kDesc, + const void *keys, + const cudnnSeqDataDescriptor_t vDesc, + const void *values, + const cudnnSeqDataDescriptor_t doDesc, + const void *dout, + size_t weightSizeInBytes, + const void *weights, + void *dweights, + size_t workSpaceSizeInBytes, + void *workSpace, + size_t reserveSpaceSizeInBytes, + void *reserveSpace); + +/* + * CTC (Connectionist Temporal Classification) loss descriptor create/destory/set/get functions + */ +/* Input normalization mode for loss function */ +typedef enum { + CUDNN_LOSS_NORMALIZATION_NONE = 0, + CUDNN_LOSS_NORMALIZATION_SOFTMAX = 1, +} cudnnLossNormalizationMode_t; + +cudnnStatus_t CUDNNWINAPI +cudnnCreateCTCLossDescriptor(cudnnCTCLossDescriptor_t *ctcLossDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI 
+cudnnSetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t compType); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetCTCLossDescriptorEx(cudnnCTCLossDescriptor_t ctcLossDesc, + cudnnDataType_t compType, + cudnnLossNormalizationMode_t normMode, + cudnnNanPropagation_t gradMode); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetCTCLossDescriptor_v8(cudnnCTCLossDescriptor_t ctcLossDesc, + cudnnDataType_t compType, + cudnnLossNormalizationMode_t normMode, + cudnnNanPropagation_t gradMode, + int maxLabelLength); + +cudnnStatus_t CUDNNWINAPI +cudnnSetCTCLossDescriptor_v9(cudnnCTCLossDescriptor_t ctcLossDesc, + cudnnDataType_t compType, + cudnnLossNormalizationMode_t normMode, + cudnnCTCGradMode_t ctcGradMode, + int maxLabelLength); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t *compType); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetCTCLossDescriptorEx(cudnnCTCLossDescriptor_t ctcLossDesc, + cudnnDataType_t *compType, + cudnnLossNormalizationMode_t *normMode, + cudnnNanPropagation_t *gradMode); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetCTCLossDescriptor_v8(cudnnCTCLossDescriptor_t ctcLossDesc, + cudnnDataType_t *compType, + cudnnLossNormalizationMode_t *normMode, + cudnnNanPropagation_t *gradMode, + int *maxLabelLength); + +cudnnStatus_t CUDNNWINAPI +cudnnGetCTCLossDescriptor_v9(cudnnCTCLossDescriptor_t ctcLossDesc, + cudnnDataType_t *compType, + cudnnLossNormalizationMode_t *normMode, + cudnnCTCGradMode_t *ctcGradMode, + int *maxLabelLength); + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc); + +/* return the ctc costs and gradients, given the probabilities and labels */ +cudnnStatus_t CUDNNWINAPI +cudnnCTCLoss( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the + timing steps, N is the mini batch size, A is the alphabet size) */ + const void *probs, /* probabilities after softmax, in GPU memory */ + const int hostLabels[], /* labels, in CPU memory */ + const int hostLabelLengths[], /* the length of each label, in CPU memory */ + const int hostInputLengths[], /* the lengths of timing steps in each batch, in CPU memory */ + void *costs, /* the returned costs of CTC, in GPU memory */ + const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A */ + void *gradients, /* the returned CTC gradients, in GPU memory, to compute costs only, set it to NULL */ + cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ + cudnnCTCLossDescriptor_t ctcLossDesc, + void *workspace, /* pointer to the workspace, in GPU memory */ + size_t workSpaceSizeInBytes); /* size of the workspace */ + +/* return the ctc costs and gradients, given the probabilities and labels */ +cudnnStatus_t CUDNNWINAPI +cudnnCTCLoss_v8( + cudnnHandle_t handle, + cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ + cudnnCTCLossDescriptor_t ctcLossDesc, + const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the + timing steps, N is the mini batch size, A is the alphabet size) */ + const void *probs, /* probabilities after softmax, in GPU memory */ + const int labels[], /* labels, in GPU memory */ + const int labelLengths[], /* the length of each label, in GPU memory */ + const int inputLengths[], /* the lengths of timing steps 
in each batch, in GPU memory */ + void *costs, /* the returned costs of CTC, in GPU memory */ + const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A */ + void *gradients, /* the returned CTC gradients, in GPU memory, to compute costs only, set it to NULL */ + size_t workSpaceSizeInBytes, /* size of the workspace */ + void *workspace); /* pointer to the workspace, in GPU memory */ + +/* return the workspace size needed for ctc */ +cudnnStatus_t CUDNNWINAPI +cudnnGetCTCLossWorkspaceSize( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the + timing steps, N is the mini batch size, A is the alphabet size) */ + const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the + dimensions are T,N,A. To compute costs + only, set it to NULL */ + const int *labels, /* labels, in CPU memory */ + const int *labelLengths, /* the length of each label, in CPU memory */ + const int *inputLengths, /* the lengths of timing steps in each batch, in CPU memory */ + cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ + cudnnCTCLossDescriptor_t ctcLossDesc, + size_t *sizeInBytes); /* pointer to the returned workspace size */ + +/* return the workspace size needed for ctc */ +cudnnStatus_t CUDNNWINAPI +cudnnGetCTCLossWorkspaceSize_v8( + cudnnHandle_t handle, + cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ + cudnnCTCLossDescriptor_t ctcLossDesc, + const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the + timing steps, N is the mini batch size, A is the alphabet size) */ + const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the + dimensions are T,N,A. To compute costs + only, set it to NULL */ + size_t *sizeInBytes); /* pointer to the returned workspace size */ + +#if defined(__cplusplus) +} +#endif + +#endif /* CUDNN_ADV_H_ */ diff --git a/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_backend.h b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_backend.h new file mode 100644 index 0000000000000000000000000000000000000000..5a378e2087f7a45c423f65d213d98c4fa20f3a52 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_backend.h @@ -0,0 +1,60 @@ +/* + * Copyright 2014-2023 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. 
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef _CUDNN_BACKEND_H_ +#define _CUDNN_BACKEND_H_ + +/* + * The content of this header has been moved into cudnn_graph.h. + * This header is kept for the backward compatibility purpose. + */ + +#include "cudnn_graph.h" + +#endif /* _CUDNN_BACKEND_H_ */ diff --git a/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_backend_v9.h b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_backend_v9.h new file mode 100644 index 0000000000000000000000000000000000000000..5a378e2087f7a45c423f65d213d98c4fa20f3a52 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_backend_v9.h @@ -0,0 +1,60 @@ +/* + * Copyright 2014-2023 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef _CUDNN_BACKEND_H_ +#define _CUDNN_BACKEND_H_ + +/* + * The content of this header has been moved into cudnn_graph.h. + * This header is kept for the backward compatibility purpose. + */ + +#include "cudnn_graph.h" + +#endif /* _CUDNN_BACKEND_H_ */ diff --git a/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_cnn.h b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_cnn.h new file mode 100644 index 0000000000000000000000000000000000000000..e988a8a033df31e35a37aeba12b9a7cdc1d7ed60 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_cnn.h @@ -0,0 +1,693 @@ +/* + * Copyright 2014-2023 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/* + * cudnn_cnn : cuDNN's basic definitions and CNN functions. + */ + +#if !defined(CUDNN_CNN_H_) +#define CUDNN_CNN_H_ + +#pragma once +#include + +#include "cudnn_version.h" +#include "cudnn_ops.h" + +/* These version numbers are autogenerated, do not edit manually. */ +#define CUDNN_CNN_MAJOR 9 +#define CUDNN_CNN_MINOR 10 +#define CUDNN_CNN_PATCH 2 + +#if (CUDNN_CNN_MAJOR != CUDNN_MAJOR) || (CUDNN_CNN_MINOR != CUDNN_MINOR) || (CUDNN_CNN_PATCH != CUDNN_PATCHLEVEL) +#error Version mismatch in cuDNN CNN INFER!!! 
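+/*
+ * Illustrative sketch of the legacy convolution forward workflow declared in
+ * this header (all of these entry points are marked CUDNN_DEPRECATED in
+ * cuDNN 9). handle, xDesc/x, wDesc/w, yDesc/y, workSpace, alpha and beta are
+ * assumed to exist; the mode and algorithm enumerators come from the core
+ * cuDNN headers, and the chosen values are placeholders, not recommendations:
+ *
+ *   cudnnConvolutionDescriptor_t convDesc;
+ *   cudnnCreateConvolutionDescriptor(&convDesc);
+ *   cudnnSetConvolution2dDescriptor(convDesc, 1, 1, 1, 1, 1, 1,
+ *                                   CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT);
+ *   size_t wsSize = 0;
+ *   cudnnGetConvolutionForwardWorkspaceSize(handle, xDesc, wDesc, convDesc, yDesc,
+ *                                           CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, &wsSize);
+ *   cudnnConvolutionForward(handle, &alpha, xDesc, x, wDesc, w, convDesc,
+ *                           CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM,
+ *                           workSpace, wsSize, &beta, yDesc, y);
+ *   cudnnDestroyConvolutionDescriptor(convDesc);
+ */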
+#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +typedef struct cudnnConvolutionStruct *cudnnConvolutionDescriptor_t CUDNN_DEPRECATED; + +typedef struct cudnnConvolutionFwdAlgoPerfStruct { + cudnnConvolutionFwdAlgo_t algo; + cudnnStatus_t status; + float time; + size_t memory; + cudnnDeterminism_t determinism; + cudnnMathType_t mathType; + int reserved[3]; +} cudnnConvolutionFwdAlgoPerf_t CUDNN_DEPRECATED; + +/* Create an instance of convolution descriptor */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t *convDesc); + +/* Destroy an instance of convolution descriptor */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyConvolutionDescriptor(cudnnConvolutionDescriptor_t convDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetConvolutionMathType(cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t mathType); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionMathType(cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t *mathType); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetConvolutionGroupCount(cudnnConvolutionDescriptor_t convDesc, int groupCount); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionGroupCount(cudnnConvolutionDescriptor_t convDesc, int *groupCount); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetConvolutionReorderType(cudnnConvolutionDescriptor_t convDesc, cudnnReorderType_t reorderType); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionReorderType(cudnnConvolutionDescriptor_t convDesc, cudnnReorderType_t *reorderType); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetConvolution2dDescriptor(cudnnConvolutionDescriptor_t convDesc, + int pad_h, /* zero-padding height */ + int pad_w, /* zero-padding width */ + int u, /* vertical filter stride */ + int v, /* horizontal filter stride */ + int dilation_h, /* filter dilation in the vertical dimension */ + int dilation_w, /* filter dilation in the horizontal dimension */ + cudnnConvolutionMode_t mode, + cudnnDataType_t computeType); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolution2dDescriptor(const cudnnConvolutionDescriptor_t convDesc, + int *pad_h, /* zero-padding height */ + int *pad_w, /* zero-padding width */ + int *u, /* vertical filter stride */ + int *v, /* horizontal filter stride */ + int *dilation_h, /* filter dilation in the vertical dimension */ + int *dilation_w, /* filter dilation in the horizontal dimension */ + cudnnConvolutionMode_t *mode, + cudnnDataType_t *computeType); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetConvolutionNdDescriptor(cudnnConvolutionDescriptor_t convDesc, + int arrayLength, /* nbDims-2 size */ + const int padA[], + const int filterStrideA[], + const int dilationA[], + cudnnConvolutionMode_t mode, + cudnnDataType_t computeType); /* convolution data type */ + +/* Helper function to return the dimensions of the output tensor given a convolution descriptor */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionNdDescriptor(const cudnnConvolutionDescriptor_t convDesc, + int arrayLengthRequested, + int *arrayLength, + int padA[], + int strideA[], + int dilationA[], + cudnnConvolutionMode_t *mode, + cudnnDataType_t *computeType); /* convolution data type */ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolution2dForwardOutputDim(const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + const cudnnFilterDescriptor_t filterDesc, + int *n, + int *c, 
+ int *h, + int *w); + +/* Helper function to return the dimensions of the output tensor given a convolution descriptor */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionNdForwardOutputDim(const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + const cudnnFilterDescriptor_t filterDesc, + int nbDims, + int tensorOuputDimA[]); + +/* helper function to provide the convolution forward algo that fit best the requirement */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionForwardAlgorithmMaxCount(cudnnHandle_t handle, int *count); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionForwardAlgorithm_v7(cudnnHandle_t handle, + const cudnnTensorDescriptor_t srcDesc, + const cudnnFilterDescriptor_t filterDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t destDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionFwdAlgoPerf_t *perfResults); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnFindConvolutionForwardAlgorithm(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionFwdAlgoPerf_t *perfResults); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnFindConvolutionForwardAlgorithmEx(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, + void *y, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionFwdAlgoPerf_t *perfResults, + void *workSpace, + size_t workSpaceSizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnIm2Col(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + void *colBuffer); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnReorderFilterAndBias(cudnnHandle_t handle, + const cudnnFilterDescriptor_t filterDesc, + cudnnReorderType_t reorderType, + const void *filterData, + void *reorderedFilterData, + int reorderBias, + const void *biasData, + void *reorderedBiasData); + +/* Helper function to return the minimum size of the workspace to be passed to the convolution given an algo*/ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, + cudnnConvolutionFwdAlgo_t algo, + size_t *sizeInBytes); + +/* Convolution functions: All of the form "output = alpha * Op(inputs) + beta * output" */ + +/* Function to perform the forward pass for batch convolution */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnConvolutionForward(cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionFwdAlgo_t algo, + void *workSpace, + size_t workSpaceSizeInBytes, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y); + +/* Fused conv/bias/activation operation : y = Act( alpha1 * conv(x) + alpha2 * z + bias ) */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI 
+cudnnConvolutionBiasActivationForward(cudnnHandle_t handle, + const void *alpha1, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionFwdAlgo_t algo, + void *workSpace, + size_t workSpaceSizeInBytes, + const void *alpha2, + const cudnnTensorDescriptor_t zDesc, + const void *z, + const cudnnTensorDescriptor_t biasDesc, + const void *bias, + const cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t yDesc, + void *y); + +/* helper function to provide the convolution backward data algo that fit best the requirement */ + +typedef struct cudnnConvolutionBwdDataAlgoPerfStruct { + cudnnConvolutionBwdDataAlgo_t algo; + cudnnStatus_t status; + float time; + size_t memory; + cudnnDeterminism_t determinism; + cudnnMathType_t mathType; + int reserved[3]; +} cudnnConvolutionBwdDataAlgoPerf_t CUDNN_DEPRECATED; + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionBackwardDataAlgorithmMaxCount(cudnnHandle_t handle, int *count); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnFindConvolutionBackwardDataAlgorithm(cudnnHandle_t handle, + const cudnnFilterDescriptor_t wDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdDataAlgoPerf_t *perfResults); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnFindConvolutionBackwardDataAlgorithmEx(cudnnHandle_t handle, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, + void *dx, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdDataAlgoPerf_t *perfResults, + void *workSpace, + size_t workSpaceSizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionBackwardDataAlgorithm_v7(cudnnHandle_t handle, + const cudnnFilterDescriptor_t filterDesc, + const cudnnTensorDescriptor_t diffDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t gradDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdDataAlgoPerf_t *perfResults); + +/* + * convolution algorithm (which requires potentially some workspace) + */ + +/* Helper function to return the minimum size of the workspace to be passed to the convolution given an algo*/ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionBackwardDataWorkspaceSize(cudnnHandle_t handle, + const cudnnFilterDescriptor_t wDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, + cudnnConvolutionBwdDataAlgo_t algo, + size_t *sizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnConvolutionBackwardData(cudnnHandle_t handle, + const void *alpha, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionBwdDataAlgo_t algo, + void *workSpace, + size_t workSpaceSizeInBytes, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx); + +/* Helper function to calculate folding descriptors for dgrad */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetFoldedConvBackwardDataDescriptors(const cudnnHandle_t handle, + const cudnnFilterDescriptor_t 
filterDesc, + const cudnnTensorDescriptor_t diffDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t gradDesc, + const cudnnTensorFormat_t transformFormat, + cudnnFilterDescriptor_t foldedFilterDesc, + cudnnTensorDescriptor_t paddedDiffDesc, + cudnnConvolutionDescriptor_t foldedConvDesc, + cudnnTensorDescriptor_t foldedGradDesc, + cudnnTensorTransformDescriptor_t filterFoldTransDesc, + cudnnTensorTransformDescriptor_t diffPadTransDesc, + cudnnTensorTransformDescriptor_t gradFoldTransDesc, + cudnnTensorTransformDescriptor_t gradUnfoldTransDesc); + +/* cudnnFusedOps... */ +struct cudnnFusedOpsConstParamStruct; +typedef struct cudnnFusedOpsConstParamStruct *cudnnFusedOpsConstParamPack_t CUDNN_DEPRECATED; + +struct cudnnFusedOpsVariantParamStruct; +typedef struct cudnnFusedOpsVariantParamStruct *cudnnFusedOpsVariantParamPack_t CUDNN_DEPRECATED; + +struct cudnnFusedOpsPlanStruct; +typedef struct cudnnFusedOpsPlanStruct *cudnnFusedOpsPlan_t CUDNN_DEPRECATED; + +typedef enum { + /* each op in [ ] can be disabled by passing NULL ptr */ + /* [per channel scale], [per channel bias], [activation], convolution, [generate BN stats] */ + CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS = 0, + /* [per channel scale], [per channel bias], [activation], convolutionBackwardWeights */ + CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD = 1, + /* utility for BN training in BN-conv fusion */ + /* computes the equivalent scale and bias from ySum ySqSum and learned scale, bias */ + /* optionally update running stats and generate saved stats */ + CUDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING = 2, + /* utility for BN inference in BN-conv fusion */ + /* computes the equivalent scale and bias from learned running stats and learned scale, bias */ + CUDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE = 3, + /* reserved for future use: convolution, [per channel scale], [per channel bias], [residual add], [activation] */ + CUDNN_FUSED_CONV_SCALE_BIAS_ADD_ACTIVATION = 4, + /* reserved for future use: [per channel scale], [per channel bias], [residual add], activation, bitmask */ + CUDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK = 5, + /* reserved for future use */ + CUDNN_FUSED_DACTIVATION_FORK_DBATCHNORM = 6, +} cudnnFusedOps_t CUDNN_DEPRECATED; + +typedef enum { + /* set XDESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get XDESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_XDESC = 0, + /* set/get XDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_XDATA_PLACEHOLDER = 1, + /* set/get BN_MODE: pass cudnnBatchNormMode_t* */ + CUDNN_PARAM_BN_MODE = 2, + /* set CUDNN_PARAM_BN_EQSCALEBIAS_DESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get CUDNN_PARAM_BN_EQSCALEBIAS_DESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_BN_EQSCALEBIAS_DESC = 3, + /* set/get BN_EQSCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER = 4, + /* set/get BN_EQBIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER = 5, + /* set ACTIVATION_DESC: pass previously initialized cudnnActivationDescriptor_t */ + /* get ACTIVATION_DESC: pass previously created cudnnActivationDescriptor_t */ + CUDNN_PARAM_ACTIVATION_DESC = 6, + /* set CONV_DESC: pass previously initialized cudnnConvolutionDescriptor_t */ + /* get CONV_DESC: pass previously created cudnnConvolutionDescriptor_t */ + CUDNN_PARAM_CONV_DESC = 7, + /* set WDESC: pass previously initialized 
cudnnFilterDescriptor_t */ + /* get WDESC: pass previously created cudnnFilterDescriptor_t */ + CUDNN_PARAM_WDESC = 8, + /* set/get WDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_WDATA_PLACEHOLDER = 9, + /* set DWDESC: pass previously initialized cudnnFilterDescriptor_t */ + /* get DWDESC: pass previously created cudnnFilterDescriptor_t */ + CUDNN_PARAM_DWDESC = 10, + /* set/get DWDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_DWDATA_PLACEHOLDER = 11, + /* set YDESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get YDESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_YDESC = 12, + /* set/get YDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_YDATA_PLACEHOLDER = 13, + /* set DYDESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get DYDESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_DYDESC = 14, + /* set/get DYDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_DYDATA_PLACEHOLDER = 15, + /* set YSTATS_DESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get YSTATS_DESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_YSTATS_DESC = 16, + /* set/get YSUM_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_YSUM_PLACEHOLDER = 17, + /* set/get YSQSUM_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_YSQSUM_PLACEHOLDER = 18, + /* set CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC = 19, + /* set/get CUDNN_PARAM_BN_SCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_SCALE_PLACEHOLDER = 20, + /* set/get CUDNN_PARAM_BN_BIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_BIAS_PLACEHOLDER = 21, + /* set/get CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER = 22, + /* set/get CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER = 23, + /* set/get CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER = 24, + /* set/get CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER = 25, + + /* set ZDESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get ZDESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_ZDESC = 26, + /* set/get ZDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_ZDATA_PLACEHOLDER = 27, + /* set BN_Z_EQSCALEBIAS_DESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get BN_Z_EQSCALEBIAS_DESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_BN_Z_EQSCALEBIAS_DESC = 28, + /* set/get BN_Z_EQSCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER = 29, + /* set/get BN_Z_EQBIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER = 30, + + /* set ACTIVATION_BITMASK_DESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get ACTIVATION_BITMASK_DESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_ACTIVATION_BITMASK_DESC = 31, + /* set/get 
ACTIVATION_BITMASK_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER = 32, + + /* set DXDESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get DXDESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_DXDESC = 33, + /* set/get DXDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_DXDATA_PLACEHOLDER = 34, + /* set DZDESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get DZDESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_DZDESC = 35, + /* set/get DZDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_DZDATA_PLACEHOLDER = 36, + /* set/get CUDNN_PARAM_BN_DSCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_DSCALE_PLACEHOLDER = 37, + /* set/get CUDNN_PARAM_BN_DBIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_DBIAS_PLACEHOLDER = 38, +} cudnnFusedOpsConstParamLabel_t CUDNN_DEPRECATED; + +typedef enum { + CUDNN_PTR_NULL = 0, + CUDNN_PTR_ELEM_ALIGNED = 1, + CUDNN_PTR_16B_ALIGNED = 2, +} cudnnFusedOpsPointerPlaceHolder_t CUDNN_DEPRECATED; + +typedef enum { + /* set: pass void* pointing to dev memory */ + /* get: pass void** pointing to host memory */ + CUDNN_PTR_XDATA = 0, + CUDNN_PTR_BN_EQSCALE = 1, + CUDNN_PTR_BN_EQBIAS = 2, + CUDNN_PTR_WDATA = 3, + CUDNN_PTR_DWDATA = 4, + CUDNN_PTR_YDATA = 5, + CUDNN_PTR_DYDATA = 6, + CUDNN_PTR_YSUM = 7, + CUDNN_PTR_YSQSUM = 8, + CUDNN_PTR_WORKSPACE = 9, + CUDNN_PTR_BN_SCALE = 10, + CUDNN_PTR_BN_BIAS = 11, + CUDNN_PTR_BN_SAVED_MEAN = 12, + CUDNN_PTR_BN_SAVED_INVSTD = 13, + CUDNN_PTR_BN_RUNNING_MEAN = 14, + CUDNN_PTR_BN_RUNNING_VAR = 15, + CUDNN_PTR_ZDATA = 16, + CUDNN_PTR_BN_Z_EQSCALE = 17, + CUDNN_PTR_BN_Z_EQBIAS = 18, + CUDNN_PTR_ACTIVATION_BITMASK = 19, + CUDNN_PTR_DXDATA = 20, + CUDNN_PTR_DZDATA = 21, + CUDNN_PTR_BN_DSCALE = 22, + CUDNN_PTR_BN_DBIAS = 23, + + /* set/get: pass size_t* pointing to host memory */ + CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES = 100, + /* set/get: pass int64_t* pointing to host memory */ + CUDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT = 101, + /* set/get: pass double* pointing to host memory */ + CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR = 102, + /* set/get: pass double* pointing to host memory */ + CUDNN_SCALAR_DOUBLE_BN_EPSILON = 103, +} cudnnFusedOpsVariantParamLabel_t CUDNN_DEPRECATED; + +cudnnStatus_t CUDNNWINAPI +cudnnCnnVersionCheck(void); + +/* helper function to provide the convolution backward filter algo that fit best the requirement */ + +typedef struct cudnnConvolutionBwdFilterAlgoPerfStruct { + cudnnConvolutionBwdFilterAlgo_t algo; + cudnnStatus_t status; + float time; + size_t memory; + cudnnDeterminism_t determinism; + cudnnMathType_t mathType; + int reserved[3]; +} cudnnConvolutionBwdFilterAlgoPerf_t CUDNN_DEPRECATED; + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(cudnnHandle_t handle, int *count); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnFindConvolutionBackwardFilterAlgorithm(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t dwDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdFilterAlgoPerf_t *perfResults); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnFindConvolutionBackwardFilterAlgorithmEx(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const 
cudnnTensorDescriptor_t dyDesc, + const void *y, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t dwDesc, + void *dw, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdFilterAlgoPerf_t *perfResults, + void *workSpace, + size_t workSpaceSizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionBackwardFilterAlgorithm_v7(cudnnHandle_t handle, + const cudnnTensorDescriptor_t srcDesc, + const cudnnTensorDescriptor_t diffDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t gradDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdFilterAlgoPerf_t *perfResults); + +/* + * convolution algorithm (which requires potentially some workspace) + */ + +/* Helper function to return the minimum size of the workspace to be passed to the convolution given an algo*/ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t gradDesc, + cudnnConvolutionBwdFilterAlgo_t algo, + size_t *sizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnConvolutionBackwardFilter(cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionBwdFilterAlgo_t algo, + void *workSpace, + size_t workSpaceSizeInBytes, + const void *beta, + const cudnnFilterDescriptor_t dwDesc, + void *dw); + +/* Function to compute the bias gradient for batch convolution */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnConvolutionBackwardBias(cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const void *beta, + const cudnnTensorDescriptor_t dbDesc, + void *db); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateFusedOpsConstParamPack(cudnnFusedOpsConstParamPack_t *constPack, cudnnFusedOps_t ops); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyFusedOpsConstParamPack(cudnnFusedOpsConstParamPack_t constPack); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetFusedOpsConstParamPackAttribute(cudnnFusedOpsConstParamPack_t constPack, + cudnnFusedOpsConstParamLabel_t paramLabel, + const void *param); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetFusedOpsConstParamPackAttribute(const cudnnFusedOpsConstParamPack_t constPack, + cudnnFusedOpsConstParamLabel_t paramLabel, + void *param, + int *isNULL); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateFusedOpsVariantParamPack(cudnnFusedOpsVariantParamPack_t *varPack, cudnnFusedOps_t ops); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyFusedOpsVariantParamPack(cudnnFusedOpsVariantParamPack_t varPack); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetFusedOpsVariantParamPackAttribute(cudnnFusedOpsVariantParamPack_t varPack, + cudnnFusedOpsVariantParamLabel_t paramLabel, + void *ptr); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetFusedOpsVariantParamPackAttribute(const cudnnFusedOpsVariantParamPack_t varPack, + cudnnFusedOpsVariantParamLabel_t paramLabel, + void *ptr); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateFusedOpsPlan(cudnnFusedOpsPlan_t *plan, cudnnFusedOps_t ops); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI 
+cudnnDestroyFusedOpsPlan(cudnnFusedOpsPlan_t plan); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnMakeFusedOpsPlan(cudnnHandle_t handle, + cudnnFusedOpsPlan_t plan, + const cudnnFusedOpsConstParamPack_t constPack, + size_t *workspaceSizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnFusedOpsExecute(cudnnHandle_t handle, const cudnnFusedOpsPlan_t plan, cudnnFusedOpsVariantParamPack_t varPack); + +#if defined(__cplusplus) +} +#endif + +#endif /* CUDNN_CNN_H_ */ diff --git a/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_cnn_v9.h b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_cnn_v9.h new file mode 100644 index 0000000000000000000000000000000000000000..e988a8a033df31e35a37aeba12b9a7cdc1d7ed60 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_cnn_v9.h @@ -0,0 +1,693 @@ +/* + * Copyright 2014-2023 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/* + * cudnn_cnn : cuDNN's basic definitions and CNN functions. 
+ */ + +#if !defined(CUDNN_CNN_H_) +#define CUDNN_CNN_H_ + +#pragma once +#include + +#include "cudnn_version.h" +#include "cudnn_ops.h" + +/* These version numbers are autogenerated, do not edit manually. */ +#define CUDNN_CNN_MAJOR 9 +#define CUDNN_CNN_MINOR 10 +#define CUDNN_CNN_PATCH 2 + +#if (CUDNN_CNN_MAJOR != CUDNN_MAJOR) || (CUDNN_CNN_MINOR != CUDNN_MINOR) || (CUDNN_CNN_PATCH != CUDNN_PATCHLEVEL) +#error Version mismatch in cuDNN CNN INFER!!! +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +typedef struct cudnnConvolutionStruct *cudnnConvolutionDescriptor_t CUDNN_DEPRECATED; + +typedef struct cudnnConvolutionFwdAlgoPerfStruct { + cudnnConvolutionFwdAlgo_t algo; + cudnnStatus_t status; + float time; + size_t memory; + cudnnDeterminism_t determinism; + cudnnMathType_t mathType; + int reserved[3]; +} cudnnConvolutionFwdAlgoPerf_t CUDNN_DEPRECATED; + +/* Create an instance of convolution descriptor */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t *convDesc); + +/* Destroy an instance of convolution descriptor */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyConvolutionDescriptor(cudnnConvolutionDescriptor_t convDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetConvolutionMathType(cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t mathType); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionMathType(cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t *mathType); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetConvolutionGroupCount(cudnnConvolutionDescriptor_t convDesc, int groupCount); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionGroupCount(cudnnConvolutionDescriptor_t convDesc, int *groupCount); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetConvolutionReorderType(cudnnConvolutionDescriptor_t convDesc, cudnnReorderType_t reorderType); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionReorderType(cudnnConvolutionDescriptor_t convDesc, cudnnReorderType_t *reorderType); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetConvolution2dDescriptor(cudnnConvolutionDescriptor_t convDesc, + int pad_h, /* zero-padding height */ + int pad_w, /* zero-padding width */ + int u, /* vertical filter stride */ + int v, /* horizontal filter stride */ + int dilation_h, /* filter dilation in the vertical dimension */ + int dilation_w, /* filter dilation in the horizontal dimension */ + cudnnConvolutionMode_t mode, + cudnnDataType_t computeType); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolution2dDescriptor(const cudnnConvolutionDescriptor_t convDesc, + int *pad_h, /* zero-padding height */ + int *pad_w, /* zero-padding width */ + int *u, /* vertical filter stride */ + int *v, /* horizontal filter stride */ + int *dilation_h, /* filter dilation in the vertical dimension */ + int *dilation_w, /* filter dilation in the horizontal dimension */ + cudnnConvolutionMode_t *mode, + cudnnDataType_t *computeType); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetConvolutionNdDescriptor(cudnnConvolutionDescriptor_t convDesc, + int arrayLength, /* nbDims-2 size */ + const int padA[], + const int filterStrideA[], + const int dilationA[], + cudnnConvolutionMode_t mode, + cudnnDataType_t computeType); /* convolution data type */ + +/* Helper function to return the dimensions of the output tensor given a convolution descriptor */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionNdDescriptor(const 
cudnnConvolutionDescriptor_t convDesc, + int arrayLengthRequested, + int *arrayLength, + int padA[], + int strideA[], + int dilationA[], + cudnnConvolutionMode_t *mode, + cudnnDataType_t *computeType); /* convolution data type */ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolution2dForwardOutputDim(const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + const cudnnFilterDescriptor_t filterDesc, + int *n, + int *c, + int *h, + int *w); + +/* Helper function to return the dimensions of the output tensor given a convolution descriptor */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionNdForwardOutputDim(const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + const cudnnFilterDescriptor_t filterDesc, + int nbDims, + int tensorOuputDimA[]); + +/* helper function to provide the convolution forward algo that fit best the requirement */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionForwardAlgorithmMaxCount(cudnnHandle_t handle, int *count); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionForwardAlgorithm_v7(cudnnHandle_t handle, + const cudnnTensorDescriptor_t srcDesc, + const cudnnFilterDescriptor_t filterDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t destDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionFwdAlgoPerf_t *perfResults); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnFindConvolutionForwardAlgorithm(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionFwdAlgoPerf_t *perfResults); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnFindConvolutionForwardAlgorithmEx(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, + void *y, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionFwdAlgoPerf_t *perfResults, + void *workSpace, + size_t workSpaceSizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnIm2Col(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + void *colBuffer); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnReorderFilterAndBias(cudnnHandle_t handle, + const cudnnFilterDescriptor_t filterDesc, + cudnnReorderType_t reorderType, + const void *filterData, + void *reorderedFilterData, + int reorderBias, + const void *biasData, + void *reorderedBiasData); + +/* Helper function to return the minimum size of the workspace to be passed to the convolution given an algo*/ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, + cudnnConvolutionFwdAlgo_t algo, + size_t *sizeInBytes); + +/* Convolution functions: All of the form "output = alpha * Op(inputs) + beta * output" */ + +/* Function to perform the forward pass for batch convolution */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnConvolutionForward(cudnnHandle_t handle, + 
const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionFwdAlgo_t algo, + void *workSpace, + size_t workSpaceSizeInBytes, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y); + +/* Fused conv/bias/activation operation : y = Act( alpha1 * conv(x) + alpha2 * z + bias ) */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnConvolutionBiasActivationForward(cudnnHandle_t handle, + const void *alpha1, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionFwdAlgo_t algo, + void *workSpace, + size_t workSpaceSizeInBytes, + const void *alpha2, + const cudnnTensorDescriptor_t zDesc, + const void *z, + const cudnnTensorDescriptor_t biasDesc, + const void *bias, + const cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t yDesc, + void *y); + +/* helper function to provide the convolution backward data algo that fit best the requirement */ + +typedef struct cudnnConvolutionBwdDataAlgoPerfStruct { + cudnnConvolutionBwdDataAlgo_t algo; + cudnnStatus_t status; + float time; + size_t memory; + cudnnDeterminism_t determinism; + cudnnMathType_t mathType; + int reserved[3]; +} cudnnConvolutionBwdDataAlgoPerf_t CUDNN_DEPRECATED; + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionBackwardDataAlgorithmMaxCount(cudnnHandle_t handle, int *count); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnFindConvolutionBackwardDataAlgorithm(cudnnHandle_t handle, + const cudnnFilterDescriptor_t wDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdDataAlgoPerf_t *perfResults); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnFindConvolutionBackwardDataAlgorithmEx(cudnnHandle_t handle, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, + void *dx, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdDataAlgoPerf_t *perfResults, + void *workSpace, + size_t workSpaceSizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionBackwardDataAlgorithm_v7(cudnnHandle_t handle, + const cudnnFilterDescriptor_t filterDesc, + const cudnnTensorDescriptor_t diffDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t gradDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdDataAlgoPerf_t *perfResults); + +/* + * convolution algorithm (which requires potentially some workspace) + */ + +/* Helper function to return the minimum size of the workspace to be passed to the convolution given an algo*/ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionBackwardDataWorkspaceSize(cudnnHandle_t handle, + const cudnnFilterDescriptor_t wDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, + cudnnConvolutionBwdDataAlgo_t algo, + size_t *sizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnConvolutionBackwardData(cudnnHandle_t handle, + const void *alpha, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const 
cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionBwdDataAlgo_t algo, + void *workSpace, + size_t workSpaceSizeInBytes, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx); + +/* Helper function to calculate folding descriptors for dgrad */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetFoldedConvBackwardDataDescriptors(const cudnnHandle_t handle, + const cudnnFilterDescriptor_t filterDesc, + const cudnnTensorDescriptor_t diffDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t gradDesc, + const cudnnTensorFormat_t transformFormat, + cudnnFilterDescriptor_t foldedFilterDesc, + cudnnTensorDescriptor_t paddedDiffDesc, + cudnnConvolutionDescriptor_t foldedConvDesc, + cudnnTensorDescriptor_t foldedGradDesc, + cudnnTensorTransformDescriptor_t filterFoldTransDesc, + cudnnTensorTransformDescriptor_t diffPadTransDesc, + cudnnTensorTransformDescriptor_t gradFoldTransDesc, + cudnnTensorTransformDescriptor_t gradUnfoldTransDesc); + +/* cudnnFusedOps... */ +struct cudnnFusedOpsConstParamStruct; +typedef struct cudnnFusedOpsConstParamStruct *cudnnFusedOpsConstParamPack_t CUDNN_DEPRECATED; + +struct cudnnFusedOpsVariantParamStruct; +typedef struct cudnnFusedOpsVariantParamStruct *cudnnFusedOpsVariantParamPack_t CUDNN_DEPRECATED; + +struct cudnnFusedOpsPlanStruct; +typedef struct cudnnFusedOpsPlanStruct *cudnnFusedOpsPlan_t CUDNN_DEPRECATED; + +typedef enum { + /* each op in [ ] can be disabled by passing NULL ptr */ + /* [per channel scale], [per channel bias], [activation], convolution, [generate BN stats] */ + CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS = 0, + /* [per channel scale], [per channel bias], [activation], convolutionBackwardWeights */ + CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD = 1, + /* utility for BN training in BN-conv fusion */ + /* computes the equivalent scale and bias from ySum ySqSum and learned scale, bias */ + /* optionally update running stats and generate saved stats */ + CUDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING = 2, + /* utility for BN inference in BN-conv fusion */ + /* computes the equivalent scale and bias from learned running stats and learned scale, bias */ + CUDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE = 3, + /* reserved for future use: convolution, [per channel scale], [per channel bias], [residual add], [activation] */ + CUDNN_FUSED_CONV_SCALE_BIAS_ADD_ACTIVATION = 4, + /* reserved for future use: [per channel scale], [per channel bias], [residual add], activation, bitmask */ + CUDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK = 5, + /* reserved for future use */ + CUDNN_FUSED_DACTIVATION_FORK_DBATCHNORM = 6, +} cudnnFusedOps_t CUDNN_DEPRECATED; + +typedef enum { + /* set XDESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get XDESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_XDESC = 0, + /* set/get XDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_XDATA_PLACEHOLDER = 1, + /* set/get BN_MODE: pass cudnnBatchNormMode_t* */ + CUDNN_PARAM_BN_MODE = 2, + /* set CUDNN_PARAM_BN_EQSCALEBIAS_DESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get CUDNN_PARAM_BN_EQSCALEBIAS_DESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_BN_EQSCALEBIAS_DESC = 3, + /* set/get BN_EQSCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER = 4, + /* set/get BN_EQBIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + 
CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER = 5, + /* set ACTIVATION_DESC: pass previously initialized cudnnActivationDescriptor_t */ + /* get ACTIVATION_DESC: pass previously created cudnnActivationDescriptor_t */ + CUDNN_PARAM_ACTIVATION_DESC = 6, + /* set CONV_DESC: pass previously initialized cudnnConvolutionDescriptor_t */ + /* get CONV_DESC: pass previously created cudnnConvolutionDescriptor_t */ + CUDNN_PARAM_CONV_DESC = 7, + /* set WDESC: pass previously initialized cudnnFilterDescriptor_t */ + /* get WDESC: pass previously created cudnnFilterDescriptor_t */ + CUDNN_PARAM_WDESC = 8, + /* set/get WDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_WDATA_PLACEHOLDER = 9, + /* set DWDESC: pass previously initialized cudnnFilterDescriptor_t */ + /* get DWDESC: pass previously created cudnnFilterDescriptor_t */ + CUDNN_PARAM_DWDESC = 10, + /* set/get DWDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_DWDATA_PLACEHOLDER = 11, + /* set YDESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get YDESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_YDESC = 12, + /* set/get YDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_YDATA_PLACEHOLDER = 13, + /* set DYDESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get DYDESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_DYDESC = 14, + /* set/get DYDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_DYDATA_PLACEHOLDER = 15, + /* set YSTATS_DESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get YSTATS_DESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_YSTATS_DESC = 16, + /* set/get YSUM_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_YSUM_PLACEHOLDER = 17, + /* set/get YSQSUM_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_YSQSUM_PLACEHOLDER = 18, + /* set CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC = 19, + /* set/get CUDNN_PARAM_BN_SCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_SCALE_PLACEHOLDER = 20, + /* set/get CUDNN_PARAM_BN_BIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_BIAS_PLACEHOLDER = 21, + /* set/get CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER = 22, + /* set/get CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER = 23, + /* set/get CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER = 24, + /* set/get CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER = 25, + + /* set ZDESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get ZDESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_ZDESC = 26, + /* set/get ZDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_ZDATA_PLACEHOLDER = 27, + /* set BN_Z_EQSCALEBIAS_DESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get BN_Z_EQSCALEBIAS_DESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_BN_Z_EQSCALEBIAS_DESC = 28, + /* set/get 
BN_Z_EQSCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER = 29, + /* set/get BN_Z_EQBIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER = 30, + + /* set ACTIVATION_BITMASK_DESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get ACTIVATION_BITMASK_DESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_ACTIVATION_BITMASK_DESC = 31, + /* set/get ACTIVATION_BITMASK_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER = 32, + + /* set DXDESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get DXDESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_DXDESC = 33, + /* set/get DXDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_DXDATA_PLACEHOLDER = 34, + /* set DZDESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get DZDESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_DZDESC = 35, + /* set/get DZDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_DZDATA_PLACEHOLDER = 36, + /* set/get CUDNN_PARAM_BN_DSCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_DSCALE_PLACEHOLDER = 37, + /* set/get CUDNN_PARAM_BN_DBIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_DBIAS_PLACEHOLDER = 38, +} cudnnFusedOpsConstParamLabel_t CUDNN_DEPRECATED; + +typedef enum { + CUDNN_PTR_NULL = 0, + CUDNN_PTR_ELEM_ALIGNED = 1, + CUDNN_PTR_16B_ALIGNED = 2, +} cudnnFusedOpsPointerPlaceHolder_t CUDNN_DEPRECATED; + +typedef enum { + /* set: pass void* pointing to dev memory */ + /* get: pass void** pointing to host memory */ + CUDNN_PTR_XDATA = 0, + CUDNN_PTR_BN_EQSCALE = 1, + CUDNN_PTR_BN_EQBIAS = 2, + CUDNN_PTR_WDATA = 3, + CUDNN_PTR_DWDATA = 4, + CUDNN_PTR_YDATA = 5, + CUDNN_PTR_DYDATA = 6, + CUDNN_PTR_YSUM = 7, + CUDNN_PTR_YSQSUM = 8, + CUDNN_PTR_WORKSPACE = 9, + CUDNN_PTR_BN_SCALE = 10, + CUDNN_PTR_BN_BIAS = 11, + CUDNN_PTR_BN_SAVED_MEAN = 12, + CUDNN_PTR_BN_SAVED_INVSTD = 13, + CUDNN_PTR_BN_RUNNING_MEAN = 14, + CUDNN_PTR_BN_RUNNING_VAR = 15, + CUDNN_PTR_ZDATA = 16, + CUDNN_PTR_BN_Z_EQSCALE = 17, + CUDNN_PTR_BN_Z_EQBIAS = 18, + CUDNN_PTR_ACTIVATION_BITMASK = 19, + CUDNN_PTR_DXDATA = 20, + CUDNN_PTR_DZDATA = 21, + CUDNN_PTR_BN_DSCALE = 22, + CUDNN_PTR_BN_DBIAS = 23, + + /* set/get: pass size_t* pointing to host memory */ + CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES = 100, + /* set/get: pass int64_t* pointing to host memory */ + CUDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT = 101, + /* set/get: pass double* pointing to host memory */ + CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR = 102, + /* set/get: pass double* pointing to host memory */ + CUDNN_SCALAR_DOUBLE_BN_EPSILON = 103, +} cudnnFusedOpsVariantParamLabel_t CUDNN_DEPRECATED; + +cudnnStatus_t CUDNNWINAPI +cudnnCnnVersionCheck(void); + +/* helper function to provide the convolution backward filter algo that fit best the requirement */ + +typedef struct cudnnConvolutionBwdFilterAlgoPerfStruct { + cudnnConvolutionBwdFilterAlgo_t algo; + cudnnStatus_t status; + float time; + size_t memory; + cudnnDeterminism_t determinism; + cudnnMathType_t mathType; + int reserved[3]; +} cudnnConvolutionBwdFilterAlgoPerf_t CUDNN_DEPRECATED; + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(cudnnHandle_t handle, int *count); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI 
+cudnnFindConvolutionBackwardFilterAlgorithm(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t dwDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdFilterAlgoPerf_t *perfResults); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnFindConvolutionBackwardFilterAlgorithmEx(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnTensorDescriptor_t dyDesc, + const void *y, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t dwDesc, + void *dw, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdFilterAlgoPerf_t *perfResults, + void *workSpace, + size_t workSpaceSizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionBackwardFilterAlgorithm_v7(cudnnHandle_t handle, + const cudnnTensorDescriptor_t srcDesc, + const cudnnTensorDescriptor_t diffDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t gradDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdFilterAlgoPerf_t *perfResults); + +/* + * convolution algorithm (which requires potentially some workspace) + */ + +/* Helper function to return the minimum size of the workspace to be passed to the convolution given an algo*/ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t gradDesc, + cudnnConvolutionBwdFilterAlgo_t algo, + size_t *sizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnConvolutionBackwardFilter(cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionBwdFilterAlgo_t algo, + void *workSpace, + size_t workSpaceSizeInBytes, + const void *beta, + const cudnnFilterDescriptor_t dwDesc, + void *dw); + +/* Function to compute the bias gradient for batch convolution */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnConvolutionBackwardBias(cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const void *beta, + const cudnnTensorDescriptor_t dbDesc, + void *db); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateFusedOpsConstParamPack(cudnnFusedOpsConstParamPack_t *constPack, cudnnFusedOps_t ops); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyFusedOpsConstParamPack(cudnnFusedOpsConstParamPack_t constPack); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetFusedOpsConstParamPackAttribute(cudnnFusedOpsConstParamPack_t constPack, + cudnnFusedOpsConstParamLabel_t paramLabel, + const void *param); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetFusedOpsConstParamPackAttribute(const cudnnFusedOpsConstParamPack_t constPack, + cudnnFusedOpsConstParamLabel_t paramLabel, + void *param, + int *isNULL); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateFusedOpsVariantParamPack(cudnnFusedOpsVariantParamPack_t *varPack, cudnnFusedOps_t ops); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyFusedOpsVariantParamPack(cudnnFusedOpsVariantParamPack_t varPack); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI 
+cudnnSetFusedOpsVariantParamPackAttribute(cudnnFusedOpsVariantParamPack_t varPack, + cudnnFusedOpsVariantParamLabel_t paramLabel, + void *ptr); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetFusedOpsVariantParamPackAttribute(const cudnnFusedOpsVariantParamPack_t varPack, + cudnnFusedOpsVariantParamLabel_t paramLabel, + void *ptr); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateFusedOpsPlan(cudnnFusedOpsPlan_t *plan, cudnnFusedOps_t ops); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyFusedOpsPlan(cudnnFusedOpsPlan_t plan); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnMakeFusedOpsPlan(cudnnHandle_t handle, + cudnnFusedOpsPlan_t plan, + const cudnnFusedOpsConstParamPack_t constPack, + size_t *workspaceSizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnFusedOpsExecute(cudnnHandle_t handle, const cudnnFusedOpsPlan_t plan, cudnnFusedOpsVariantParamPack_t varPack); + +#if defined(__cplusplus) +} +#endif + +#endif /* CUDNN_CNN_H_ */ diff --git a/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_graph.h b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_graph.h new file mode 100644 index 0000000000000000000000000000000000000000..389fba220c579e08519072255f2aea9a5da2d3e5 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_graph.h @@ -0,0 +1,992 @@ +/* + * Copyright 2014-2023 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 
227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/* + * cudnn_graph : cuDNN's basic definitions operations. + */ + +#if !defined(CUDNN_GRAPH_H_) +#define CUDNN_GRAPH_H_ + +#include +#include + +#include + +#include "cudnn_version.h" + +/* These version numbers are autogenerated, do not edit manually. */ +#define CUDNN_GRAPH_MAJOR 9 +#define CUDNN_GRAPH_MINOR 10 +#define CUDNN_GRAPH_PATCH 2 + +#if (CUDNN_GRAPH_MAJOR != CUDNN_MAJOR) || (CUDNN_GRAPH_MINOR != CUDNN_MINOR) || (CUDNN_GRAPH_PATCH != CUDNN_PATCHLEVEL) +#error Version mismatch in cuDNN GRAPH!!! +#endif + +#ifndef CUDNNWINAPI +#ifdef _WIN32 +#define CUDNNWINAPI __stdcall +#else +#define CUDNNWINAPI +#endif +#endif + +/* Warnings for deprecated API-s are enabled using the CUDNN_WARN_DEPRECATED macro */ +#if defined(CUDNN_WARN_DEPRECATED) && (defined(__GNUC__) || defined(__clang__)) +/* GCC, Intel C/C++, Cray C/C++, CLANG, IBM XL C/C++ little endian */ +#define CUDNN_DEPRECATED __attribute__((deprecated)) +#define CUDNN_DEPRECATED_ENUM __attribute__((deprecated)) +#elif defined(CUDNN_WARN_DEPRECATED) && defined(_MSC_VER) +/* Microsoft Visual C++ */ +#define CUDNN_DEPRECATED __declspec(deprecated) +#define CUDNN_DEPRECATED_ENUM __declspec(deprecated) +#elif defined(CUDNN_WARN_DEPRECATED) && (__cplusplus >= 201402L) +/* C++14 compilers */ +#define CUDNN_DEPRECATED [[deprecated]] +#define CUDNN_DEPRECATED_ENUM [[deprecated]] +#else +/* No support for the deprecated attribute */ +#define CUDNN_DEPRECATED +#define CUDNN_DEPRECATED_ENUM +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +struct cudnnContext; +typedef struct cudnnContext *cudnnHandle_t; + +size_t CUDNNWINAPI +cudnnGetVersion(void); + +size_t CUDNNWINAPI +cudnnGetMaxDeviceVersion(void); + +/* Returns CUDA Runtime version statically linked against cudnn */ +size_t CUDNNWINAPI +cudnnGetCudartVersion(void); + +/* + * CUDNN return codes + */ +typedef enum { + CUDNN_STATUS_SUCCESS = 0, + + /* Uncategorized errors */ + CUDNN_STATUS_NOT_INITIALIZED = 1001, + CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH = 1002, + CUDNN_STATUS_SERIALIZATION_VERSION_MISMATCH = 1003, + CUDNN_STATUS_DEPRECATED = 1004, + CUDNN_STATUS_LICENSE_ERROR = 1005, + CUDNN_STATUS_RUNTIME_IN_PROGRESS = 1006, + CUDNN_STATUS_RUNTIME_FP_OVERFLOW = 1007, + CUDNN_STATUS_SUBLIBRARY_LOADING_FAILED = 1008, + + CUDNN_STATUS_BAD_PARAM = 2000, + CUDNN_STATUS_BAD_PARAM_NULL_POINTER = 2002, + CUDNN_STATUS_BAD_PARAM_MISALIGNED_POINTER = 2003, + CUDNN_STATUS_BAD_PARAM_NOT_FINALIZED = 2004, + CUDNN_STATUS_BAD_PARAM_OUT_OF_BOUND = 2005, + CUDNN_STATUS_BAD_PARAM_SIZE_INSUFFICIENT = 2006, + CUDNN_STATUS_BAD_PARAM_STREAM_MISMATCH = 2007, + CUDNN_STATUS_BAD_PARAM_SHAPE_MISMATCH = 2008, + CUDNN_STATUS_BAD_PARAM_DUPLICATED_ENTRIES = 2009, + CUDNN_STATUS_BAD_PARAM_ATTRIBUTE_TYPE = 2010, + CUDNN_STATUS_BAD_PARAM_CUDA_GRAPH_MISMATCH = 2011, + CUDNN_STATUS_BAD_PARAM_DESCRIPTOR_TYPE = 2012, + + CUDNN_STATUS_NOT_SUPPORTED = 3000, + CUDNN_STATUS_NOT_SUPPORTED_GRAPH_PATTERN = 3001, + CUDNN_STATUS_NOT_SUPPORTED_SHAPE = 3002, + CUDNN_STATUS_NOT_SUPPORTED_DATA_TYPE = 3003, + CUDNN_STATUS_NOT_SUPPORTED_LAYOUT = 3004, + CUDNN_STATUS_NOT_SUPPORTED_INCOMPATIBLE_CUDA_DRIVER = 3005, + 
CUDNN_STATUS_NOT_SUPPORTED_INCOMPATIBLE_CUDART = 3006, + CUDNN_STATUS_NOT_SUPPORTED_ARCH_MISMATCH = 3007, + CUDNN_STATUS_NOT_SUPPORTED_RUNTIME_PREREQUISITE_MISSING = 3008, + CUDNN_STATUS_NOT_SUPPORTED_SUBLIBRARY_UNAVAILABLE = 3009, + CUDNN_STATUS_NOT_SUPPORTED_SHARED_MEMORY_INSUFFICIENT = 3010, + CUDNN_STATUS_NOT_SUPPORTED_PADDING = 3011, + CUDNN_STATUS_NOT_SUPPORTED_BAD_LAUNCH_PARAM = 3012, + CUDNN_STATUS_NOT_SUPPORTED_CUDA_GRAPH_NATIVE_API = 3013, + + CUDNN_STATUS_INTERNAL_ERROR = 4000, + CUDNN_STATUS_INTERNAL_ERROR_COMPILATION_FAILED = 4001, + CUDNN_STATUS_INTERNAL_ERROR_UNEXPECTED_VALUE = 4002, + CUDNN_STATUS_INTERNAL_ERROR_HOST_ALLOCATION_FAILED = 4003, + CUDNN_STATUS_INTERNAL_ERROR_DEVICE_ALLOCATION_FAILED = 4004, + CUDNN_STATUS_INTERNAL_ERROR_BAD_LAUNCH_PARAM = 4005, + CUDNN_STATUS_INTERNAL_ERROR_TEXTURE_CREATION_FAILED = 4006, + + CUDNN_STATUS_EXECUTION_FAILED = 5000, + CUDNN_STATUS_EXECUTION_FAILED_CUDA_DRIVER = 5001, + CUDNN_STATUS_EXECUTION_FAILED_CUBLAS = 5002, + CUDNN_STATUS_EXECUTION_FAILED_CUDART = 5003, + CUDNN_STATUS_EXECUTION_FAILED_CURAND = 5004, + + CUDNN_STATUS_ALLOC_FAILED CUDNN_DEPRECATED_ENUM = CUDNN_STATUS_INTERNAL_ERROR_HOST_ALLOCATION_FAILED, + CUDNN_STATUS_INVALID_VALUE CUDNN_DEPRECATED_ENUM = 2001 /* please transition to CUDNN_STATUS_BAD_PARAM instead */, + CUDNN_STATUS_ARCH_MISMATCH CUDNN_DEPRECATED_ENUM = CUDNN_STATUS_NOT_SUPPORTED_ARCH_MISMATCH, + CUDNN_STATUS_MAPPING_ERROR CUDNN_DEPRECATED_ENUM = CUDNN_STATUS_INTERNAL_ERROR_TEXTURE_CREATION_FAILED, + CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING CUDNN_DEPRECATED_ENUM = + CUDNN_STATUS_NOT_SUPPORTED_RUNTIME_PREREQUISITE_MISSING, + CUDNN_STATUS_VERSION_MISMATCH CUDNN_DEPRECATED_ENUM = CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH, +} cudnnStatus_t; + +#define CUDNN_STATUS_FULL_ERROR_CODE(category, specific_err) ((cudnnStatus_t)(0 + (category) + (specific_err))) +#define CUDNN_STATUS_CATEGORY(full_error_code) ((full_error_code) / 1000 * 1000) +#define CUDNN_STATUS_SPECIFIC_ERROR(full_error_code) ((full_error_code) % 1000) + +/* human-readable error messages */ +const char *CUDNNWINAPI +cudnnGetErrorString(cudnnStatus_t status); + +void CUDNNWINAPI +cudnnGetLastErrorString(char *message, size_t max_size); + +/* Forward definition in this version only */ +typedef struct cudnnRuntimeTag_t cudnnRuntimeTag_t CUDNN_DEPRECATED; + +typedef enum { + CUDNN_ERRQUERY_RAWCODE = 0, + CUDNN_ERRQUERY_NONBLOCKING = 1, + CUDNN_ERRQUERY_BLOCKING = 2, +} cudnnErrQueryMode_t; + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnQueryRuntimeError(cudnnHandle_t handle, cudnnStatus_t *rstatus, cudnnErrQueryMode_t mode, cudnnRuntimeTag_t *tag); + +cudnnStatus_t CUDNNWINAPI +cudnnGetProperty(libraryPropertyType type, int *value); + +cudnnStatus_t CUDNNWINAPI +cudnnCreate(cudnnHandle_t *handle); +cudnnStatus_t CUDNNWINAPI +cudnnDestroy(cudnnHandle_t handle); +cudnnStatus_t CUDNNWINAPI +cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId); +cudnnStatus_t CUDNNWINAPI +cudnnGetStream(cudnnHandle_t handle, cudaStream_t *streamId); +/* + * CUDNN data type + */ +typedef enum { + CUDNN_DATA_FLOAT = 0, + CUDNN_DATA_DOUBLE = 1, + CUDNN_DATA_HALF = 2, + CUDNN_DATA_INT8 = 3, + CUDNN_DATA_INT32 = 4, + CUDNN_DATA_INT8x4 CUDNN_DEPRECATED_ENUM = 5, + CUDNN_DATA_UINT8 = 6, + CUDNN_DATA_UINT8x4 CUDNN_DEPRECATED_ENUM = 7, + CUDNN_DATA_INT8x32 CUDNN_DEPRECATED_ENUM = 8, + CUDNN_DATA_BFLOAT16 = 9, + CUDNN_DATA_INT64 = 10, + CUDNN_DATA_BOOLEAN = 11, + CUDNN_DATA_FP8_E4M3 = 12, + CUDNN_DATA_FP8_E5M2 = 13, + CUDNN_DATA_FAST_FLOAT_FOR_FP8 = 14, + 
CUDNN_DATA_FP8_E8M0 = 15, + CUDNN_DATA_FP4_E2M1 = 16, +} cudnnDataType_t; + +/* + * CUDNN math type + */ +typedef enum { + CUDNN_DEFAULT_MATH = 0, + CUDNN_TENSOR_OP_MATH = 1, + CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION = 2, + CUDNN_FMA_MATH = 3, +} cudnnMathType_t; + +/* + * CUDNN propagate Nan + */ +typedef enum { + CUDNN_NOT_PROPAGATE_NAN CUDNN_DEPRECATED_ENUM = 0, + CUDNN_PROPAGATE_NAN CUDNN_DEPRECATED_ENUM = 1, +} cudnnNanPropagation_t; + +/* + * Behavior for OOB samples. OOB samples are samples where L+R > T is encountered during the gradient calculation. If + * gradMode is set to CUDNN_CTC_SKIP_OOB_GRADIENTS, then the CTC loss function does not write to the gradient buffer for + * that sample. Instead, the current values, even not finite, are retained. If gradMode is set to + * CUDNN_CTC_ZERO_OOB_GRADIENTS, then the gradient for that sample is set to zero. This guarantees a finite gradient. + */ +typedef enum { + CUDNN_CTC_ZERO_OOB_GRADIENTS = 0, + CUDNN_CTC_SKIP_OOB_GRADIENTS = 1, +} cudnnCTCGradMode_t; + +typedef enum { + CUDNN_TENSOR_NCHW = 0, /* row major (wStride = 1, hStride = w) */ + CUDNN_TENSOR_NHWC = 1, /* feature maps interleaved ( cStride = 1 )*/ + CUDNN_TENSOR_NCHW_VECT_C = 2, /* each image point is vector of element of C, vector length in data type */ +} cudnnTensorFormat_t; + +/* + * CUDNN ReduceTensor op type + */ +typedef enum { + CUDNN_REDUCE_TENSOR_ADD = 0, + CUDNN_REDUCE_TENSOR_MUL = 1, + CUDNN_REDUCE_TENSOR_MIN = 2, + CUDNN_REDUCE_TENSOR_MAX = 3, + CUDNN_REDUCE_TENSOR_AMAX = 4, + CUDNN_REDUCE_TENSOR_AVG = 5, + CUDNN_REDUCE_TENSOR_NORM1 = 6, + CUDNN_REDUCE_TENSOR_NORM2 = 7, + CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS = 8, +} cudnnReduceTensorOp_t; + +/* + * activation mode + */ +typedef enum { + CUDNN_ACTIVATION_SIGMOID = 0, + CUDNN_ACTIVATION_RELU = 1, + CUDNN_ACTIVATION_TANH = 2, + CUDNN_ACTIVATION_CLIPPED_RELU = 3, + CUDNN_ACTIVATION_ELU = 4, + CUDNN_ACTIVATION_IDENTITY = 5, + CUDNN_ACTIVATION_SWISH = 6 +} cudnnActivationMode_t CUDNN_DEPRECATED; + +typedef enum { + CUDNN_SEV_FATAL = 0, + CUDNN_SEV_ERROR = 1, + CUDNN_SEV_WARNING = 2, + CUDNN_SEV_INFO = 3, +} cudnnSeverity_t; + +/* Message masks to be used with cudnnSetCallback() */ +#define CUDNN_SEV_ERROR_EN (1U << CUDNN_SEV_ERROR) +#define CUDNN_SEV_WARNING_EN (1U << CUDNN_SEV_WARNING) +#define CUDNN_SEV_INFO_EN (1U << CUDNN_SEV_INFO) + +/* struct containing useful informaiton for each API call */ +typedef struct cudnnDebugStruct { + unsigned cudnn_version; + cudnnStatus_t cudnnStatus; + unsigned time_sec; /* epoch time in seconds */ + unsigned time_usec; /* microseconds part of epoch time */ + unsigned time_delta; /* time since start in seconds */ + cudnnHandle_t handle; /* cudnn handle */ + cudaStream_t stream; /* cuda stream ID */ + unsigned long long pid; /* process ID */ + unsigned long long tid; /* thread ID */ + int cudaDeviceId; /* CUDA device ID */ + int reserved[15]; /* reserved for future use */ +} cudnnDebug_t; + +typedef void (*cudnnCallback_t)(cudnnSeverity_t sev, void *udata, const cudnnDebug_t *dbg, const char *msg); + +cudnnStatus_t CUDNNWINAPI +cudnnSetCallback(unsigned mask, void *udata, cudnnCallback_t fptr); + +cudnnStatus_t CUDNNWINAPI +cudnnGetCallback(unsigned *mask, void **udata, cudnnCallback_t *fptr); + +/* + * \brief Cross-library version checker. + * This function is implemented differently in each sub-library. Each sublib + * checks whether its own version matches that of its dependencies. 
+ * \returns CUDNN_STATUS_SUCCESS if the version check passes, + * CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH if the versions are inconsistent. + */ +cudnnStatus_t CUDNNWINAPI +cudnnGraphVersionCheck(void); + +/* Maximum supported number of tensor dimensions */ +#define CUDNN_DIM_MAX 8 + +/* + * convolution mode + */ +typedef enum { CUDNN_CONVOLUTION = 0, CUDNN_CROSS_CORRELATION = 1 } cudnnConvolutionMode_t; + +/* + * CUDNN Reorder + */ +typedef enum { + CUDNN_DEFAULT_REORDER = 0, + CUDNN_NO_REORDER = 1, +} cudnnReorderType_t CUDNN_DEPRECATED; + +typedef void *cudnnBackendDescriptor_t; + +typedef struct cudnnFractionStruct { + int64_t numerator; + int64_t denominator; +} cudnnFraction_t; + +typedef enum { + CUDNN_POINTWISE_ADD = 0, + CUDNN_POINTWISE_ADD_SQUARE = 5, + CUDNN_POINTWISE_DIV = 6, + CUDNN_POINTWISE_MAX = 3, + CUDNN_POINTWISE_MIN = 2, + CUDNN_POINTWISE_MOD = 7, + CUDNN_POINTWISE_MUL = 1, + CUDNN_POINTWISE_POW = 8, + CUDNN_POINTWISE_SUB = 9, + + CUDNN_POINTWISE_ABS = 10, + CUDNN_POINTWISE_CEIL = 11, + CUDNN_POINTWISE_COS = 12, + CUDNN_POINTWISE_EXP = 13, + CUDNN_POINTWISE_FLOOR = 14, + CUDNN_POINTWISE_LOG = 15, + CUDNN_POINTWISE_NEG = 16, + CUDNN_POINTWISE_RSQRT = 17, + CUDNN_POINTWISE_SIN = 18, + CUDNN_POINTWISE_SQRT = 4, + CUDNN_POINTWISE_TAN = 19, + CUDNN_POINTWISE_ERF = 20, + CUDNN_POINTWISE_IDENTITY = 21, + CUDNN_POINTWISE_RECIPROCAL = 22, + CUDNN_POINTWISE_ATAN2 = 23, + + CUDNN_POINTWISE_RELU_FWD = 100, + CUDNN_POINTWISE_TANH_FWD = 101, + CUDNN_POINTWISE_SIGMOID_FWD = 102, + CUDNN_POINTWISE_ELU_FWD = 103, + CUDNN_POINTWISE_GELU_FWD = 104, + CUDNN_POINTWISE_SOFTPLUS_FWD = 105, + CUDNN_POINTWISE_SWISH_FWD = 106, + CUDNN_POINTWISE_GELU_APPROX_TANH_FWD = 107, + + CUDNN_POINTWISE_RELU_BWD = 200, + CUDNN_POINTWISE_TANH_BWD = 201, + CUDNN_POINTWISE_SIGMOID_BWD = 202, + CUDNN_POINTWISE_ELU_BWD = 203, + CUDNN_POINTWISE_GELU_BWD = 204, + CUDNN_POINTWISE_SOFTPLUS_BWD = 205, + CUDNN_POINTWISE_SWISH_BWD = 206, + CUDNN_POINTWISE_GELU_APPROX_TANH_BWD = 207, + + CUDNN_POINTWISE_CMP_EQ = 300, + CUDNN_POINTWISE_CMP_NEQ = 301, + CUDNN_POINTWISE_CMP_GT = 302, + CUDNN_POINTWISE_CMP_GE = 303, + CUDNN_POINTWISE_CMP_LT = 304, + CUDNN_POINTWISE_CMP_LE = 305, + + CUDNN_POINTWISE_LOGICAL_AND = 400, + CUDNN_POINTWISE_LOGICAL_OR = 401, + CUDNN_POINTWISE_LOGICAL_NOT = 402, + + CUDNN_POINTWISE_GEN_INDEX = 501, + + CUDNN_POINTWISE_BINARY_SELECT = 601, +} cudnnPointwiseMode_t; + +typedef enum { + CUDNN_RESAMPLE_NEAREST = 0, + CUDNN_RESAMPLE_BILINEAR = 1, + CUDNN_RESAMPLE_AVGPOOL = 2, + CUDNN_RESAMPLE_AVGPOOL_INCLUDE_PADDING = 2, + CUDNN_RESAMPLE_AVGPOOL_EXCLUDE_PADDING = 4, + CUDNN_RESAMPLE_MAXPOOL = 3, +} cudnnResampleMode_t; + +typedef enum { + CUDNN_SIGNAL_SET = 0, + CUDNN_SIGNAL_WAIT = 1, +} cudnnSignalMode_t; + +typedef enum { + CUDNN_GENSTATS_SUM_SQSUM = 0, +} cudnnGenStatsMode_t; + +typedef enum { + CUDNN_BN_FINALIZE_STATISTICS_TRAINING = 0, + CUDNN_BN_FINALIZE_STATISTICS_INFERENCE = 1, +} cudnnBnFinalizeStatsMode_t; + +typedef enum { + CUDNN_RNG_DISTRIBUTION_BERNOULLI = 0, + CUDNN_RNG_DISTRIBUTION_UNIFORM = 1, + CUDNN_RNG_DISTRIBUTION_NORMAL = 2, +} cudnnRngDistribution_t; + +typedef enum { + CUDNN_ATTR_POINTWISE_MODE = 0, + CUDNN_ATTR_POINTWISE_MATH_PREC = 1, + CUDNN_ATTR_POINTWISE_NAN_PROPAGATION CUDNN_DEPRECATED_ENUM = 2, + CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP = 3, + CUDNN_ATTR_POINTWISE_RELU_UPPER_CLIP = 4, + CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP_SLOPE = 5, + CUDNN_ATTR_POINTWISE_ELU_ALPHA = 6, + CUDNN_ATTR_POINTWISE_SOFTPLUS_BETA = 7, + CUDNN_ATTR_POINTWISE_SWISH_BETA = 8, + 
CUDNN_ATTR_POINTWISE_AXIS = 9, + + CUDNN_ATTR_CONVOLUTION_COMP_TYPE = 100, + CUDNN_ATTR_CONVOLUTION_CONV_MODE = 101, + CUDNN_ATTR_CONVOLUTION_DILATIONS = 102, + CUDNN_ATTR_CONVOLUTION_FILTER_STRIDES = 103, + CUDNN_ATTR_CONVOLUTION_POST_PADDINGS = 104, + CUDNN_ATTR_CONVOLUTION_PRE_PADDINGS = 105, + CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS = 106, + + CUDNN_ATTR_ENGINEHEUR_MODE = 200, + CUDNN_ATTR_ENGINEHEUR_OPERATION_GRAPH = 201, + CUDNN_ATTR_ENGINEHEUR_RESULTS = 202, + CUDNN_ATTR_ENGINEHEUR_SM_COUNT_TARGET = 203, + CUDNN_ATTR_ENGINEHEUR_DEVICEPROP = 204, + + CUDNN_ATTR_ENGINECFG_ENGINE = 300, + CUDNN_ATTR_ENGINECFG_INTERMEDIATE_INFO = 301, + CUDNN_ATTR_ENGINECFG_KNOB_CHOICES = 302, + CUDNN_ATTR_ENGINECFG_WORKSPACE_SIZE = 303, + CUDNN_ATTR_ENGINECFG_SHARED_MEMORY_USED = 304, + + CUDNN_ATTR_EXECUTION_PLAN_HANDLE CUDNN_DEPRECATED_ENUM = 400, + CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG = 401, + CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE = 402, + CUDNN_ATTR_EXECUTION_PLAN_COMPUTED_INTERMEDIATE_UIDS = 403, + CUDNN_ATTR_EXECUTION_PLAN_RUN_ONLY_INTERMEDIATE_UIDS = 404, + CUDNN_ATTR_EXECUTION_PLAN_JSON_REPRESENTATION = 405, + CUDNN_ATTR_EXECUTION_PLAN_KERNEL_CACHE = 406, + CUDNN_ATTR_EXECUTION_PLAN_DEVICEPROP = 407, + + CUDNN_ATTR_INTERMEDIATE_INFO_UNIQUE_ID = 500, + CUDNN_ATTR_INTERMEDIATE_INFO_SIZE = 501, + CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_DATA_UIDS = 502, + CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_ATTRIBUTES = 503, + + CUDNN_ATTR_KNOB_CHOICE_KNOB_TYPE = 600, + CUDNN_ATTR_KNOB_CHOICE_KNOB_VALUE = 601, + + CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_ALPHA = 700, + CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_BETA = 701, + CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_CONV_DESC = 702, + CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_W = 703, + CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_X = 704, + CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_Y = 705, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_ALPHA = 706, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_BETA = 707, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_CONV_DESC = 708, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_W = 709, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DX = 710, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DY = 711, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_ALPHA = 712, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_BETA = 713, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_CONV_DESC = 714, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DW = 715, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_X = 716, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DY = 717, + + CUDNN_ATTR_OPERATION_POINTWISE_PW_DESCRIPTOR = 750, + CUDNN_ATTR_OPERATION_POINTWISE_XDESC = 751, + CUDNN_ATTR_OPERATION_POINTWISE_BDESC = 752, + CUDNN_ATTR_OPERATION_POINTWISE_YDESC = 753, + CUDNN_ATTR_OPERATION_POINTWISE_ALPHA1 = 754, + CUDNN_ATTR_OPERATION_POINTWISE_ALPHA2 = 755, + CUDNN_ATTR_OPERATION_POINTWISE_DXDESC = 756, + CUDNN_ATTR_OPERATION_POINTWISE_DYDESC = 757, + CUDNN_ATTR_OPERATION_POINTWISE_TDESC = 758, + + CUDNN_ATTR_OPERATION_GENSTATS_MODE = 770, + CUDNN_ATTR_OPERATION_GENSTATS_MATH_PREC = 771, + CUDNN_ATTR_OPERATION_GENSTATS_XDESC = 772, + CUDNN_ATTR_OPERATION_GENSTATS_SUMDESC = 773, + CUDNN_ATTR_OPERATION_GENSTATS_SQSUMDESC = 774, + + CUDNN_ATTR_OPERATION_BN_FINALIZE_STATS_MODE = 780, + CUDNN_ATTR_OPERATION_BN_FINALIZE_MATH_PREC = 781, + CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SUM_DESC = 782, + CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SQ_SUM_DESC = 783, + CUDNN_ATTR_OPERATION_BN_FINALIZE_SCALE_DESC = 784, + CUDNN_ATTR_OPERATION_BN_FINALIZE_BIAS_DESC = 785, + 
CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_MEAN_DESC = 786, + CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_VAR_DESC = 787, + CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_MEAN_DESC = 788, + CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_VAR_DESC = 789, + CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_MEAN_DESC = 790, + CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_INV_STD_DESC = 791, + CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_SCALE_DESC = 792, + CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_BIAS_DESC = 793, + CUDNN_ATTR_OPERATION_BN_FINALIZE_ACCUM_COUNT_DESC = 794, + CUDNN_ATTR_OPERATION_BN_FINALIZE_EPSILON_DESC = 795, + CUDNN_ATTR_OPERATION_BN_FINALIZE_EXP_AVERATE_FACTOR_DESC = 796, + + CUDNN_ATTR_OPERATIONGRAPH_HANDLE CUDNN_DEPRECATED_ENUM = 800, + CUDNN_ATTR_OPERATIONGRAPH_OPS = 801, + CUDNN_ATTR_OPERATIONGRAPH_ENGINE_GLOBAL_COUNT = 802, + CUDNN_ATTR_OPERATIONGRAPH_IS_DYNAMIC_SHAPE_ENABLED = 803, + CUDNN_ATTR_OPERATIONGRAPH_IS_SAME_TOPOLOGY = 804, + + CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT = 900, + CUDNN_ATTR_TENSOR_DATA_TYPE = 901, + CUDNN_ATTR_TENSOR_DIMENSIONS = 902, + CUDNN_ATTR_TENSOR_STRIDES = 903, + CUDNN_ATTR_TENSOR_VECTOR_COUNT = 904, + CUDNN_ATTR_TENSOR_VECTORIZED_DIMENSION = 905, + CUDNN_ATTR_TENSOR_UNIQUE_ID = 906, + CUDNN_ATTR_TENSOR_IS_VIRTUAL = 907, + CUDNN_ATTR_TENSOR_IS_BY_VALUE = 908, + CUDNN_ATTR_TENSOR_REORDERING_MODE = 909, + CUDNN_ATTR_TENSOR_RAGGED_OFFSET_DESC = 913, + + CUDNN_ATTR_VARIANT_PACK_UNIQUE_IDS = 1000, + CUDNN_ATTR_VARIANT_PACK_DATA_POINTERS = 1001, + CUDNN_ATTR_VARIANT_PACK_INTERMEDIATES = 1002, + CUDNN_ATTR_VARIANT_PACK_WORKSPACE = 1003, + + CUDNN_ATTR_LAYOUT_INFO_TENSOR_UID = 1100, + CUDNN_ATTR_LAYOUT_INFO_TYPES = 1101, + + CUDNN_ATTR_KNOB_INFO_TYPE = 1200, + CUDNN_ATTR_KNOB_INFO_MAXIMUM_VALUE = 1201, + CUDNN_ATTR_KNOB_INFO_MINIMUM_VALUE = 1202, + CUDNN_ATTR_KNOB_INFO_STRIDE = 1203, + + CUDNN_ATTR_ENGINE_OPERATION_GRAPH = 1300, + CUDNN_ATTR_ENGINE_GLOBAL_INDEX = 1301, + CUDNN_ATTR_ENGINE_KNOB_INFO = 1302, + CUDNN_ATTR_ENGINE_NUMERICAL_NOTE = 1303, + CUDNN_ATTR_ENGINE_LAYOUT_INFO = 1304, + CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE = 1305, + CUDNN_ATTR_ENGINE_SM_COUNT_TARGET = 1306, + CUDNN_ATTR_ENGINE_DEVICEPROP = 1307, + + CUDNN_ATTR_MATMUL_COMP_TYPE = 1500, + CUDNN_ATTR_MATMUL_PADDING_VALUE = 1503, + + CUDNN_ATTR_OPERATION_MATMUL_ADESC = 1520, + CUDNN_ATTR_OPERATION_MATMUL_BDESC = 1521, + CUDNN_ATTR_OPERATION_MATMUL_CDESC = 1522, + CUDNN_ATTR_OPERATION_MATMUL_DESC = 1523, + CUDNN_ATTR_OPERATION_MATMUL_IRREGULARLY_STRIDED_BATCH_COUNT CUDNN_DEPRECATED_ENUM = 1524, + CUDNN_ATTR_OPERATION_MATMUL_GEMM_M_OVERRIDE_DESC = 1525, + CUDNN_ATTR_OPERATION_MATMUL_GEMM_N_OVERRIDE_DESC = 1526, + CUDNN_ATTR_OPERATION_MATMUL_GEMM_K_OVERRIDE_DESC = 1527, + + CUDNN_ATTR_REDUCTION_OPERATOR = 1600, + CUDNN_ATTR_REDUCTION_COMP_TYPE = 1601, + + CUDNN_ATTR_OPERATION_REDUCTION_XDESC = 1610, + CUDNN_ATTR_OPERATION_REDUCTION_YDESC = 1611, + CUDNN_ATTR_OPERATION_REDUCTION_DESC = 1612, + + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MATH_PREC = 1620, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MEAN_DESC = 1621, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_INVSTD_DESC = 1622, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_BN_SCALE_DESC = 1623, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_X_DESC = 1624, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DY_DESC = 1625, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DBN_SCALE_DESC = 1626, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DBN_BIAS_DESC = 1627, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_DY_SCALE_DESC = 1628, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_X_SCALE_DESC = 1629, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_BIAS = 1630, + 
+ CUDNN_ATTR_RESAMPLE_MODE = 1700, + CUDNN_ATTR_RESAMPLE_COMP_TYPE = 1701, + CUDNN_ATTR_RESAMPLE_SPATIAL_DIMS = 1702, + CUDNN_ATTR_RESAMPLE_POST_PADDINGS = 1703, + CUDNN_ATTR_RESAMPLE_PRE_PADDINGS = 1704, + CUDNN_ATTR_RESAMPLE_STRIDES = 1705, + CUDNN_ATTR_RESAMPLE_WINDOW_DIMS = 1706, + CUDNN_ATTR_RESAMPLE_NAN_PROPAGATION = 1707, + CUDNN_ATTR_RESAMPLE_PADDING_MODE = 1708, + + CUDNN_ATTR_OPERATION_RESAMPLE_FWD_XDESC = 1710, + CUDNN_ATTR_OPERATION_RESAMPLE_FWD_YDESC = 1711, + CUDNN_ATTR_OPERATION_RESAMPLE_FWD_IDXDESC = 1712, + CUDNN_ATTR_OPERATION_RESAMPLE_FWD_ALPHA CUDNN_DEPRECATED_ENUM = 1713, + CUDNN_ATTR_OPERATION_RESAMPLE_FWD_BETA CUDNN_DEPRECATED_ENUM = 1714, + CUDNN_ATTR_OPERATION_RESAMPLE_FWD_DESC = 1716, + + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DXDESC = 1720, + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DYDESC = 1721, + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_IDXDESC = 1722, + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_ALPHA CUDNN_DEPRECATED_ENUM = 1723, + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_BETA CUDNN_DEPRECATED_ENUM = 1724, + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DESC = 1725, + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_XDESC = 1726, + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_YDESC = 1727, + + CUDNN_ATTR_OPERATION_CONCAT_AXIS = 1800, + CUDNN_ATTR_OPERATION_CONCAT_INPUT_DESCS = 1801, + CUDNN_ATTR_OPERATION_CONCAT_INPLACE_INDEX = 1802, + CUDNN_ATTR_OPERATION_CONCAT_OUTPUT_DESC = 1803, + + CUDNN_ATTR_OPERATION_SIGNAL_MODE = 1900, + CUDNN_ATTR_OPERATION_SIGNAL_FLAGDESC = 1901, + CUDNN_ATTR_OPERATION_SIGNAL_VALUE = 1902, + CUDNN_ATTR_OPERATION_SIGNAL_XDESC = 1903, + CUDNN_ATTR_OPERATION_SIGNAL_YDESC = 1904, + + CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_CONTAINER_DESC = 1950, + CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_YDESC = 1951, + CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_SEQUENCE_DESC = 1952, + CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_PAGE_TABLE_DESC = 1953, + + CUDNN_ATTR_OPERATION_NORM_FWD_MODE = 2000, + CUDNN_ATTR_OPERATION_NORM_FWD_PHASE = 2001, + CUDNN_ATTR_OPERATION_NORM_FWD_XDESC = 2002, + CUDNN_ATTR_OPERATION_NORM_FWD_MEAN_DESC = 2003, + CUDNN_ATTR_OPERATION_NORM_FWD_INV_VARIANCE_DESC = 2004, + CUDNN_ATTR_OPERATION_NORM_FWD_SCALE_DESC = 2005, + CUDNN_ATTR_OPERATION_NORM_FWD_BIAS_DESC = 2006, + CUDNN_ATTR_OPERATION_NORM_FWD_EPSILON_DESC = 2007, + CUDNN_ATTR_OPERATION_NORM_FWD_EXP_AVG_FACTOR_DESC = 2008, + CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_MEAN_DESC = 2009, + CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_VAR_DESC = 2010, + CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_MEAN_DESC = 2011, + CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_VAR_DESC = 2012, + CUDNN_ATTR_OPERATION_NORM_FWD_YDESC = 2013, + CUDNN_ATTR_OPERATION_NORM_FWD_PEER_STAT_DESCS = 2014, + + CUDNN_ATTR_OPERATION_NORM_BWD_MODE = 2100, + CUDNN_ATTR_OPERATION_NORM_BWD_XDESC = 2101, + CUDNN_ATTR_OPERATION_NORM_BWD_MEAN_DESC = 2102, + CUDNN_ATTR_OPERATION_NORM_BWD_INV_VARIANCE_DESC = 2103, + CUDNN_ATTR_OPERATION_NORM_BWD_DYDESC = 2104, + CUDNN_ATTR_OPERATION_NORM_BWD_SCALE_DESC = 2105, + CUDNN_ATTR_OPERATION_NORM_BWD_EPSILON_DESC = 2106, + CUDNN_ATTR_OPERATION_NORM_BWD_DSCALE_DESC = 2107, + CUDNN_ATTR_OPERATION_NORM_BWD_DBIAS_DESC = 2108, + CUDNN_ATTR_OPERATION_NORM_BWD_DXDESC = 2109, + CUDNN_ATTR_OPERATION_NORM_BWD_PEER_STAT_DESCS = 2110, + + CUDNN_ATTR_OPERATION_RESHAPE_XDESC = 2200, + CUDNN_ATTR_OPERATION_RESHAPE_YDESC = 2201, + + CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_XDESC = 2250, + CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_YDESC = 2251, + CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_LOWER_BANDWIDTH = 2252, + CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_UPPER_BANDWIDTH = 2253, + 
CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_AXIS = 2254, + CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_PAD_VALUE = 2255, + CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_KV_TOKEN_OFFSET_DESC = 2256, + + CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_XDESC = 2270, + CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_YDESC = 2271, + CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_LOWER_BANDWIDTH = 2272, + CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_UPPER_BANDWIDTH = 2273, + CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_AXIS = 2274, + CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_PAD_VALUE = 2275, + CUDNN_ATTR_OPERATION_CONTRACT_BAND_MAX_TOKEN_VALUE = 2276, + + CUDNN_ATTR_RNG_DISTRIBUTION = 2300, + CUDNN_ATTR_RNG_NORMAL_DIST_MEAN = 2301, + CUDNN_ATTR_RNG_NORMAL_DIST_STANDARD_DEVIATION = 2302, + CUDNN_ATTR_RNG_UNIFORM_DIST_MAXIMUM = 2303, + CUDNN_ATTR_RNG_UNIFORM_DIST_MINIMUM = 2304, + CUDNN_ATTR_RNG_BERNOULLI_DIST_PROBABILITY = 2305, + + CUDNN_ATTR_OPERATION_RNG_YDESC = 2310, + CUDNN_ATTR_OPERATION_RNG_SEED = 2311, + CUDNN_ATTR_OPERATION_RNG_DESC = 2312, + CUDNN_ATTR_OPERATION_RNG_OFFSET_DESC = 2313, + + CUDNN_ATTR_KERNEL_CACHE_OPERATION_GRAPH = 2400, + CUDNN_ATTR_KERNEL_CACHE_IS_ENGINECFG_KERNEL_CACHED = 2401, + CUDNN_ATTR_KERNEL_CACHE_JSON_REPRESENTATION = 2402, + + CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_XDESC = 2500, + CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_YDESC = 2501, + CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_SCALE_DESC = 2502, + CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_MATH_PREC = 2503, + CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_BLOCK_SIZE = 2504, + + CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_XDESC = 2600, + CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_SCALE_DESC = 2601, + CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_YDESC = 2602, + CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_MATH_PREC = 2603, + CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_BLOCK_SIZE = 2604, + + CUDNN_ATTR_DEVICEPROP_DEVICE_ID = 2700, + CUDNN_ATTR_DEVICEPROP_HANDLE = 2701, + CUDNN_ATTR_DEVICEPROP_JSON_REPRESENTATION = 2702, +} cudnnBackendAttributeName_t; + +typedef enum { + CUDNN_TYPE_HANDLE = 0, + CUDNN_TYPE_DATA_TYPE = 1, + CUDNN_TYPE_BOOLEAN = 2, + CUDNN_TYPE_INT64 = 3, + CUDNN_TYPE_FLOAT = 4, + CUDNN_TYPE_DOUBLE = 5, + CUDNN_TYPE_VOID_PTR = 6, + CUDNN_TYPE_CONVOLUTION_MODE = 7, + CUDNN_TYPE_HEUR_MODE = 8, + CUDNN_TYPE_KNOB_TYPE = 9, + CUDNN_TYPE_NAN_PROPOGATION CUDNN_DEPRECATED_ENUM = 10, + CUDNN_TYPE_NUMERICAL_NOTE = 11, + CUDNN_TYPE_LAYOUT_TYPE = 12, + CUDNN_TYPE_ATTRIB_NAME = 13, + CUDNN_TYPE_POINTWISE_MODE = 14, + CUDNN_TYPE_BACKEND_DESCRIPTOR = 15, + CUDNN_TYPE_GENSTATS_MODE = 16, + CUDNN_TYPE_BN_FINALIZE_STATS_MODE = 17, + CUDNN_TYPE_REDUCTION_OPERATOR_TYPE = 18, + CUDNN_TYPE_BEHAVIOR_NOTE = 19, + CUDNN_TYPE_TENSOR_REORDERING_MODE = 20, + CUDNN_TYPE_RESAMPLE_MODE = 21, + CUDNN_TYPE_PADDING_MODE = 22, + CUDNN_TYPE_INT32 = 23, + CUDNN_TYPE_CHAR = 24, + CUDNN_TYPE_SIGNAL_MODE = 25, + CUDNN_TYPE_FRACTION = 26, + CUDNN_TYPE_NORM_MODE = 27, + CUDNN_TYPE_NORM_FWD_PHASE = 28, + CUDNN_TYPE_RNG_DISTRIBUTION = 29, +} cudnnBackendAttributeType_t; + +typedef enum { + CUDNN_BACKEND_POINTWISE_DESCRIPTOR = 0, + CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR = 1, + CUDNN_BACKEND_ENGINE_DESCRIPTOR = 2, + CUDNN_BACKEND_ENGINECFG_DESCRIPTOR = 3, + CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR = 4, + CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR = 5, + CUDNN_BACKEND_INTERMEDIATE_INFO_DESCRIPTOR = 6, + CUDNN_BACKEND_KNOB_CHOICE_DESCRIPTOR = 7, + CUDNN_BACKEND_KNOB_INFO_DESCRIPTOR = 8, + CUDNN_BACKEND_LAYOUT_INFO_DESCRIPTOR = 9, + CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR = 10, 
+ CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR = 11, + CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR = 12, + CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR = 13, + CUDNN_BACKEND_OPERATION_GEN_STATS_DESCRIPTOR = 14, + CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR = 15, + CUDNN_BACKEND_VARIANT_PACK_DESCRIPTOR = 16, + CUDNN_BACKEND_TENSOR_DESCRIPTOR = 17, + CUDNN_BACKEND_MATMUL_DESCRIPTOR = 18, + CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR = 19, + CUDNN_BACKEND_OPERATION_BN_FINALIZE_STATISTICS_DESCRIPTOR = 20, + CUDNN_BACKEND_REDUCTION_DESCRIPTOR = 21, + CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR = 22, + CUDNN_BACKEND_OPERATION_BN_BWD_WEIGHTS_DESCRIPTOR = 23, + CUDNN_BACKEND_RESAMPLE_DESCRIPTOR = 24, + CUDNN_BACKEND_OPERATION_RESAMPLE_FWD_DESCRIPTOR = 25, + CUDNN_BACKEND_OPERATION_RESAMPLE_BWD_DESCRIPTOR = 26, + CUDNN_BACKEND_OPERATION_CONCAT_DESCRIPTOR = 27, + CUDNN_BACKEND_OPERATION_SIGNAL_DESCRIPTOR = 28, + CUDNN_BACKEND_OPERATION_NORM_FORWARD_DESCRIPTOR = 29, + CUDNN_BACKEND_OPERATION_NORM_BACKWARD_DESCRIPTOR = 30, + CUDNN_BACKEND_OPERATION_RESHAPE_DESCRIPTOR = 31, + CUDNN_BACKEND_RNG_DESCRIPTOR = 32, + CUDNN_BACKEND_OPERATION_RNG_DESCRIPTOR = 33, + CUDNN_BACKEND_KERNEL_CACHE_DESCRIPTOR = 34, + CUDNN_BACKEND_OPERATION_PAGED_CACHE_LOAD_DESCRIPTOR = 35, + CUDNN_BACKEND_OPERATION_BLOCK_SCALE_QUANTIZE_DESCRIPTOR = 36, + CUDNN_BACKEND_OPERATION_BLOCK_SCALE_DEQUANTIZE_DESCRIPTOR = 37, + CUDNN_BACKEND_DEVICEPROP_DESCRIPTOR = 38, + CUDNN_BACKEND_OPERATION_EXPAND_BAND_MATRIX_DESCRIPTOR = 39, + CUDNN_BACKEND_OPERATION_CONTRACT_BAND_MATRIX_DESCRIPTOR = 40, +} cudnnBackendDescriptorType_t; + +typedef enum { + CUDNN_NUMERICAL_NOTE_TENSOR_CORE = 0, + CUDNN_NUMERICAL_NOTE_DOWN_CONVERT_INPUTS = 1, + CUDNN_NUMERICAL_NOTE_REDUCED_PRECISION_REDUCTION = 2, + CUDNN_NUMERICAL_NOTE_FFT = 3, + CUDNN_NUMERICAL_NOTE_NONDETERMINISTIC = 4, + CUDNN_NUMERICAL_NOTE_WINOGRAD = 5, + CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_4x4 = 6, + CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_6x6 = 7, + CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_13x13 = 8, + CUDNN_NUMERICAL_NOTE_STRICT_NAN_PROP = 9, + CUDNN_NUMERICAL_NOTE_TYPE_COUNT = 10, +} cudnnBackendNumericalNote_t; + +typedef enum { + CUDNN_BEHAVIOR_NOTE_RUNTIME_COMPILATION = 0, + CUDNN_BEHAVIOR_NOTE_REQUIRES_FILTER_INT8x32_REORDER = 1, + CUDNN_BEHAVIOR_NOTE_REQUIRES_BIAS_INT8x32_REORDER = 2, + CUDNN_BEHAVIOR_NOTE_SUPPORTS_CUDA_GRAPH_NATIVE_API = 3, + CUDNN_BEHAVIOR_NOTE_TYPE_COUNT = 4, +} cudnnBackendBehaviorNote_t; + +typedef enum { + CUDNN_KNOB_TYPE_SPLIT_K CUDNN_DEPRECATED_ENUM = 0, + CUDNN_KNOB_TYPE_SWIZZLE = 1, + CUDNN_KNOB_TYPE_TILE_SIZE = 2, + CUDNN_KNOB_TYPE_USE_TEX CUDNN_DEPRECATED_ENUM = 3, + CUDNN_KNOB_TYPE_EDGE = 4, + CUDNN_KNOB_TYPE_KBLOCK CUDNN_DEPRECATED_ENUM = 5, + CUDNN_KNOB_TYPE_LDGA CUDNN_DEPRECATED_ENUM = 6, + CUDNN_KNOB_TYPE_LDGB CUDNN_DEPRECATED_ENUM = 7, + CUDNN_KNOB_TYPE_CHUNK_K CUDNN_DEPRECATED_ENUM = 8, + CUDNN_KNOB_TYPE_SPLIT_H CUDNN_DEPRECATED_ENUM = 9, + CUDNN_KNOB_TYPE_WINO_TILE CUDNN_DEPRECATED_ENUM = 10, + CUDNN_KNOB_TYPE_MULTIPLY = 11, + CUDNN_KNOB_TYPE_SPLIT_K_BUF = 12, + CUDNN_KNOB_TYPE_TILEK = 13, + CUDNN_KNOB_TYPE_STAGES = 14, + CUDNN_KNOB_TYPE_REDUCTION_MODE = 15, + CUDNN_KNOB_TYPE_CTA_SPLIT_K_MODE CUDNN_DEPRECATED_ENUM = 16, + CUDNN_KNOB_TYPE_SPLIT_K_SLC = 17, + CUDNN_KNOB_TYPE_IDX_MODE = 18, + CUDNN_KNOB_TYPE_SLICED CUDNN_DEPRECATED_ENUM = 19, + CUDNN_KNOB_TYPE_SPLIT_RS CUDNN_DEPRECATED_ENUM = 20, + CUDNN_KNOB_TYPE_SINGLEBUFFER CUDNN_DEPRECATED_ENUM = 21, + CUDNN_KNOB_TYPE_LDGC CUDNN_DEPRECATED_ENUM = 22, + CUDNN_KNOB_TYPE_SPECFILT = 23, + 
CUDNN_KNOB_TYPE_KERNEL_CFG = 24, + CUDNN_KNOB_TYPE_WORKSPACE = 25, + CUDNN_KNOB_TYPE_TILE_CGA CUDNN_DEPRECATED_ENUM = 26, + CUDNN_KNOB_TYPE_TILE_CGA_M = 27, + CUDNN_KNOB_TYPE_TILE_CGA_N = 28, + CUDNN_KNOB_TYPE_BLOCK_SIZE = 29, + CUDNN_KNOB_TYPE_OCCUPANCY = 30, + CUDNN_KNOB_TYPE_ARRAY_SIZE_PER_THREAD = 31, + CUDNN_KNOB_TYPE_NUM_C_PER_BLOCK CUDNN_DEPRECATED_ENUM = 32, + CUDNN_KNOB_TYPE_SPLIT_COLS = 33, + CUDNN_KNOB_TYPE_TILE_ROWS = 34, + CUDNN_KNOB_TYPE_TILE_COLS = 35, + CUDNN_KNOB_TYPE_LOAD_SIZE = 36, + CUDNN_KNOB_TYPE_CTA_COUNT = 37, + CUDNN_KNOB_TYPE_STREAM_K = 38, + CUDNN_KNOB_TYPE_SPLIT_P_SLC = 39, + CUDNN_KNOB_TYPE_TILE_M = 40, + CUDNN_KNOB_TYPE_TILE_N = 41, + CUDNN_KNOB_TYPE_WARP_SPEC_CFG = 42, + CUDNN_KNOB_TYPE_COUNTS = 43, +} cudnnBackendKnobType_t; + +typedef enum { + CUDNN_LAYOUT_TYPE_PREFERRED_NCHW = 0, + CUDNN_LAYOUT_TYPE_PREFERRED_NHWC = 1, + CUDNN_LAYOUT_TYPE_PREFERRED_PAD4CK = 2, + CUDNN_LAYOUT_TYPE_PREFERRED_PAD8CK = 3, + CUDNN_LAYOUT_TYPE_COUNT = 4, +} cudnnBackendLayoutType_t; + +typedef enum { + CUDNN_HEUR_MODE_INSTANT = 0, + CUDNN_HEUR_MODE_B = 1, + CUDNN_HEUR_MODE_FALLBACK = 2, + CUDNN_HEUR_MODE_A = 3, + CUDNN_HEUR_MODES_COUNT = 4, +} cudnnBackendHeurMode_t; + +typedef enum { + CUDNN_TENSOR_REORDERING_NONE = 0, + CUDNN_TENSOR_REORDERING_INT8x32 = 1, + CUDNN_TENSOR_REORDERING_F16x16 = 2, + CUDNN_TENSOR_REORDERING_F8_128x4 = 3, +} cudnnBackendTensorReordering_t; + +typedef enum { + CUDNN_ZERO_PAD = 0, + CUDNN_NEG_INF_PAD = 1, + CUDNN_EDGE_VAL_PAD = 2, +} cudnnPaddingMode_t; + +typedef enum { + CUDNN_LAYER_NORM = 0, + CUDNN_INSTANCE_NORM = 1, + CUDNN_BATCH_NORM = 2, + CUDNN_GROUP_NORM = 3, + CUDNN_RMS_NORM = 4, + CUDNN_ADA_LAYER_NORM = 5, +} cudnnBackendNormMode_t; + +typedef enum { + CUDNN_NORM_FWD_INFERENCE = 0, + CUDNN_NORM_FWD_TRAINING = 1, +} cudnnBackendNormFwdPhase_t; + +cudnnStatus_t CUDNNWINAPI +cudnnBackendCreateDescriptor(cudnnBackendDescriptorType_t descriptorType, cudnnBackendDescriptor_t *descriptor); + +cudnnStatus_t CUDNNWINAPI +cudnnBackendDestroyDescriptor(cudnnBackendDescriptor_t descriptor); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnBackendInitialize(cudnnBackendDescriptor_t descriptor); + +cudnnStatus_t CUDNNWINAPI +cudnnBackendFinalize(cudnnBackendDescriptor_t descriptor); + +cudnnStatus_t CUDNNWINAPI +cudnnBackendSetAttribute(cudnnBackendDescriptor_t descriptor, + cudnnBackendAttributeName_t attributeName, + cudnnBackendAttributeType_t attributeType, + int64_t elementCount, + const void *arrayOfElements); + +cudnnStatus_t CUDNNWINAPI +cudnnBackendGetAttribute(cudnnBackendDescriptor_t const descriptor, + cudnnBackendAttributeName_t attributeName, + cudnnBackendAttributeType_t attributeType, + int64_t requestedElementCount, + int64_t *elementCount, + void *arrayOfElements); + +cudnnStatus_t CUDNNWINAPI +cudnnBackendExecute(cudnnHandle_t handle, cudnnBackendDescriptor_t executionPlan, cudnnBackendDescriptor_t variantPack); + +cudnnStatus_t CUDNNWINAPI +cudnnBackendPopulateCudaGraph(cudnnHandle_t handle, + cudnnBackendDescriptor_t executionPlan, + cudnnBackendDescriptor_t variantPack, + cudaGraph_t graph); + +cudnnStatus_t CUDNNWINAPI +cudnnBackendUpdateCudaGraph(cudnnHandle_t handle, + cudnnBackendDescriptor_t executionPlan, + cudnnBackendDescriptor_t variantPack, + cudaGraph_t graph); + +#if defined(__cplusplus) +} +#endif + +#endif /* CUDNN_GRAPH_H_ */ diff --git a/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_graph_v9.h b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_graph_v9.h new file mode 100644 index 
0000000000000000000000000000000000000000..389fba220c579e08519072255f2aea9a5da2d3e5 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_graph_v9.h @@ -0,0 +1,992 @@ +/* + * Copyright 2014-2023 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/* + * cudnn_graph : cuDNN's basic definitions operations. + */ + +#if !defined(CUDNN_GRAPH_H_) +#define CUDNN_GRAPH_H_ + +#include +#include + +#include + +#include "cudnn_version.h" + +/* These version numbers are autogenerated, do not edit manually. */ +#define CUDNN_GRAPH_MAJOR 9 +#define CUDNN_GRAPH_MINOR 10 +#define CUDNN_GRAPH_PATCH 2 + +#if (CUDNN_GRAPH_MAJOR != CUDNN_MAJOR) || (CUDNN_GRAPH_MINOR != CUDNN_MINOR) || (CUDNN_GRAPH_PATCH != CUDNN_PATCHLEVEL) +#error Version mismatch in cuDNN GRAPH!!! 
+#endif + +#ifndef CUDNNWINAPI +#ifdef _WIN32 +#define CUDNNWINAPI __stdcall +#else +#define CUDNNWINAPI +#endif +#endif + +/* Warnings for deprecated API-s are enabled using the CUDNN_WARN_DEPRECATED macro */ +#if defined(CUDNN_WARN_DEPRECATED) && (defined(__GNUC__) || defined(__clang__)) +/* GCC, Intel C/C++, Cray C/C++, CLANG, IBM XL C/C++ little endian */ +#define CUDNN_DEPRECATED __attribute__((deprecated)) +#define CUDNN_DEPRECATED_ENUM __attribute__((deprecated)) +#elif defined(CUDNN_WARN_DEPRECATED) && defined(_MSC_VER) +/* Microsoft Visual C++ */ +#define CUDNN_DEPRECATED __declspec(deprecated) +#define CUDNN_DEPRECATED_ENUM __declspec(deprecated) +#elif defined(CUDNN_WARN_DEPRECATED) && (__cplusplus >= 201402L) +/* C++14 compilers */ +#define CUDNN_DEPRECATED [[deprecated]] +#define CUDNN_DEPRECATED_ENUM [[deprecated]] +#else +/* No support for the deprecated attribute */ +#define CUDNN_DEPRECATED +#define CUDNN_DEPRECATED_ENUM +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +struct cudnnContext; +typedef struct cudnnContext *cudnnHandle_t; + +size_t CUDNNWINAPI +cudnnGetVersion(void); + +size_t CUDNNWINAPI +cudnnGetMaxDeviceVersion(void); + +/* Returns CUDA Runtime version statically linked against cudnn */ +size_t CUDNNWINAPI +cudnnGetCudartVersion(void); + +/* + * CUDNN return codes + */ +typedef enum { + CUDNN_STATUS_SUCCESS = 0, + + /* Uncategorized errors */ + CUDNN_STATUS_NOT_INITIALIZED = 1001, + CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH = 1002, + CUDNN_STATUS_SERIALIZATION_VERSION_MISMATCH = 1003, + CUDNN_STATUS_DEPRECATED = 1004, + CUDNN_STATUS_LICENSE_ERROR = 1005, + CUDNN_STATUS_RUNTIME_IN_PROGRESS = 1006, + CUDNN_STATUS_RUNTIME_FP_OVERFLOW = 1007, + CUDNN_STATUS_SUBLIBRARY_LOADING_FAILED = 1008, + + CUDNN_STATUS_BAD_PARAM = 2000, + CUDNN_STATUS_BAD_PARAM_NULL_POINTER = 2002, + CUDNN_STATUS_BAD_PARAM_MISALIGNED_POINTER = 2003, + CUDNN_STATUS_BAD_PARAM_NOT_FINALIZED = 2004, + CUDNN_STATUS_BAD_PARAM_OUT_OF_BOUND = 2005, + CUDNN_STATUS_BAD_PARAM_SIZE_INSUFFICIENT = 2006, + CUDNN_STATUS_BAD_PARAM_STREAM_MISMATCH = 2007, + CUDNN_STATUS_BAD_PARAM_SHAPE_MISMATCH = 2008, + CUDNN_STATUS_BAD_PARAM_DUPLICATED_ENTRIES = 2009, + CUDNN_STATUS_BAD_PARAM_ATTRIBUTE_TYPE = 2010, + CUDNN_STATUS_BAD_PARAM_CUDA_GRAPH_MISMATCH = 2011, + CUDNN_STATUS_BAD_PARAM_DESCRIPTOR_TYPE = 2012, + + CUDNN_STATUS_NOT_SUPPORTED = 3000, + CUDNN_STATUS_NOT_SUPPORTED_GRAPH_PATTERN = 3001, + CUDNN_STATUS_NOT_SUPPORTED_SHAPE = 3002, + CUDNN_STATUS_NOT_SUPPORTED_DATA_TYPE = 3003, + CUDNN_STATUS_NOT_SUPPORTED_LAYOUT = 3004, + CUDNN_STATUS_NOT_SUPPORTED_INCOMPATIBLE_CUDA_DRIVER = 3005, + CUDNN_STATUS_NOT_SUPPORTED_INCOMPATIBLE_CUDART = 3006, + CUDNN_STATUS_NOT_SUPPORTED_ARCH_MISMATCH = 3007, + CUDNN_STATUS_NOT_SUPPORTED_RUNTIME_PREREQUISITE_MISSING = 3008, + CUDNN_STATUS_NOT_SUPPORTED_SUBLIBRARY_UNAVAILABLE = 3009, + CUDNN_STATUS_NOT_SUPPORTED_SHARED_MEMORY_INSUFFICIENT = 3010, + CUDNN_STATUS_NOT_SUPPORTED_PADDING = 3011, + CUDNN_STATUS_NOT_SUPPORTED_BAD_LAUNCH_PARAM = 3012, + CUDNN_STATUS_NOT_SUPPORTED_CUDA_GRAPH_NATIVE_API = 3013, + + CUDNN_STATUS_INTERNAL_ERROR = 4000, + CUDNN_STATUS_INTERNAL_ERROR_COMPILATION_FAILED = 4001, + CUDNN_STATUS_INTERNAL_ERROR_UNEXPECTED_VALUE = 4002, + CUDNN_STATUS_INTERNAL_ERROR_HOST_ALLOCATION_FAILED = 4003, + CUDNN_STATUS_INTERNAL_ERROR_DEVICE_ALLOCATION_FAILED = 4004, + CUDNN_STATUS_INTERNAL_ERROR_BAD_LAUNCH_PARAM = 4005, + CUDNN_STATUS_INTERNAL_ERROR_TEXTURE_CREATION_FAILED = 4006, + + CUDNN_STATUS_EXECUTION_FAILED = 5000, + 
CUDNN_STATUS_EXECUTION_FAILED_CUDA_DRIVER = 5001, + CUDNN_STATUS_EXECUTION_FAILED_CUBLAS = 5002, + CUDNN_STATUS_EXECUTION_FAILED_CUDART = 5003, + CUDNN_STATUS_EXECUTION_FAILED_CURAND = 5004, + + CUDNN_STATUS_ALLOC_FAILED CUDNN_DEPRECATED_ENUM = CUDNN_STATUS_INTERNAL_ERROR_HOST_ALLOCATION_FAILED, + CUDNN_STATUS_INVALID_VALUE CUDNN_DEPRECATED_ENUM = 2001 /* please transition to CUDNN_STATUS_BAD_PARAM instead */, + CUDNN_STATUS_ARCH_MISMATCH CUDNN_DEPRECATED_ENUM = CUDNN_STATUS_NOT_SUPPORTED_ARCH_MISMATCH, + CUDNN_STATUS_MAPPING_ERROR CUDNN_DEPRECATED_ENUM = CUDNN_STATUS_INTERNAL_ERROR_TEXTURE_CREATION_FAILED, + CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING CUDNN_DEPRECATED_ENUM = + CUDNN_STATUS_NOT_SUPPORTED_RUNTIME_PREREQUISITE_MISSING, + CUDNN_STATUS_VERSION_MISMATCH CUDNN_DEPRECATED_ENUM = CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH, +} cudnnStatus_t; + +#define CUDNN_STATUS_FULL_ERROR_CODE(category, specific_err) ((cudnnStatus_t)(0 + (category) + (specific_err))) +#define CUDNN_STATUS_CATEGORY(full_error_code) ((full_error_code) / 1000 * 1000) +#define CUDNN_STATUS_SPECIFIC_ERROR(full_error_code) ((full_error_code) % 1000) + +/* human-readable error messages */ +const char *CUDNNWINAPI +cudnnGetErrorString(cudnnStatus_t status); + +void CUDNNWINAPI +cudnnGetLastErrorString(char *message, size_t max_size); + +/* Forward definition in this version only */ +typedef struct cudnnRuntimeTag_t cudnnRuntimeTag_t CUDNN_DEPRECATED; + +typedef enum { + CUDNN_ERRQUERY_RAWCODE = 0, + CUDNN_ERRQUERY_NONBLOCKING = 1, + CUDNN_ERRQUERY_BLOCKING = 2, +} cudnnErrQueryMode_t; + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnQueryRuntimeError(cudnnHandle_t handle, cudnnStatus_t *rstatus, cudnnErrQueryMode_t mode, cudnnRuntimeTag_t *tag); + +cudnnStatus_t CUDNNWINAPI +cudnnGetProperty(libraryPropertyType type, int *value); + +cudnnStatus_t CUDNNWINAPI +cudnnCreate(cudnnHandle_t *handle); +cudnnStatus_t CUDNNWINAPI +cudnnDestroy(cudnnHandle_t handle); +cudnnStatus_t CUDNNWINAPI +cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId); +cudnnStatus_t CUDNNWINAPI +cudnnGetStream(cudnnHandle_t handle, cudaStream_t *streamId); +/* + * CUDNN data type + */ +typedef enum { + CUDNN_DATA_FLOAT = 0, + CUDNN_DATA_DOUBLE = 1, + CUDNN_DATA_HALF = 2, + CUDNN_DATA_INT8 = 3, + CUDNN_DATA_INT32 = 4, + CUDNN_DATA_INT8x4 CUDNN_DEPRECATED_ENUM = 5, + CUDNN_DATA_UINT8 = 6, + CUDNN_DATA_UINT8x4 CUDNN_DEPRECATED_ENUM = 7, + CUDNN_DATA_INT8x32 CUDNN_DEPRECATED_ENUM = 8, + CUDNN_DATA_BFLOAT16 = 9, + CUDNN_DATA_INT64 = 10, + CUDNN_DATA_BOOLEAN = 11, + CUDNN_DATA_FP8_E4M3 = 12, + CUDNN_DATA_FP8_E5M2 = 13, + CUDNN_DATA_FAST_FLOAT_FOR_FP8 = 14, + CUDNN_DATA_FP8_E8M0 = 15, + CUDNN_DATA_FP4_E2M1 = 16, +} cudnnDataType_t; + +/* + * CUDNN math type + */ +typedef enum { + CUDNN_DEFAULT_MATH = 0, + CUDNN_TENSOR_OP_MATH = 1, + CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION = 2, + CUDNN_FMA_MATH = 3, +} cudnnMathType_t; + +/* + * CUDNN propagate Nan + */ +typedef enum { + CUDNN_NOT_PROPAGATE_NAN CUDNN_DEPRECATED_ENUM = 0, + CUDNN_PROPAGATE_NAN CUDNN_DEPRECATED_ENUM = 1, +} cudnnNanPropagation_t; + +/* + * Behavior for OOB samples. OOB samples are samples where L+R > T is encountered during the gradient calculation. If + * gradMode is set to CUDNN_CTC_SKIP_OOB_GRADIENTS, then the CTC loss function does not write to the gradient buffer for + * that sample. Instead, the current values, even not finite, are retained. If gradMode is set to + * CUDNN_CTC_ZERO_OOB_GRADIENTS, then the gradient for that sample is set to zero. This guarantees a finite gradient. 
+ */ +typedef enum { + CUDNN_CTC_ZERO_OOB_GRADIENTS = 0, + CUDNN_CTC_SKIP_OOB_GRADIENTS = 1, +} cudnnCTCGradMode_t; + +typedef enum { + CUDNN_TENSOR_NCHW = 0, /* row major (wStride = 1, hStride = w) */ + CUDNN_TENSOR_NHWC = 1, /* feature maps interleaved ( cStride = 1 )*/ + CUDNN_TENSOR_NCHW_VECT_C = 2, /* each image point is vector of element of C, vector length in data type */ +} cudnnTensorFormat_t; + +/* + * CUDNN ReduceTensor op type + */ +typedef enum { + CUDNN_REDUCE_TENSOR_ADD = 0, + CUDNN_REDUCE_TENSOR_MUL = 1, + CUDNN_REDUCE_TENSOR_MIN = 2, + CUDNN_REDUCE_TENSOR_MAX = 3, + CUDNN_REDUCE_TENSOR_AMAX = 4, + CUDNN_REDUCE_TENSOR_AVG = 5, + CUDNN_REDUCE_TENSOR_NORM1 = 6, + CUDNN_REDUCE_TENSOR_NORM2 = 7, + CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS = 8, +} cudnnReduceTensorOp_t; + +/* + * activation mode + */ +typedef enum { + CUDNN_ACTIVATION_SIGMOID = 0, + CUDNN_ACTIVATION_RELU = 1, + CUDNN_ACTIVATION_TANH = 2, + CUDNN_ACTIVATION_CLIPPED_RELU = 3, + CUDNN_ACTIVATION_ELU = 4, + CUDNN_ACTIVATION_IDENTITY = 5, + CUDNN_ACTIVATION_SWISH = 6 +} cudnnActivationMode_t CUDNN_DEPRECATED; + +typedef enum { + CUDNN_SEV_FATAL = 0, + CUDNN_SEV_ERROR = 1, + CUDNN_SEV_WARNING = 2, + CUDNN_SEV_INFO = 3, +} cudnnSeverity_t; + +/* Message masks to be used with cudnnSetCallback() */ +#define CUDNN_SEV_ERROR_EN (1U << CUDNN_SEV_ERROR) +#define CUDNN_SEV_WARNING_EN (1U << CUDNN_SEV_WARNING) +#define CUDNN_SEV_INFO_EN (1U << CUDNN_SEV_INFO) + +/* struct containing useful informaiton for each API call */ +typedef struct cudnnDebugStruct { + unsigned cudnn_version; + cudnnStatus_t cudnnStatus; + unsigned time_sec; /* epoch time in seconds */ + unsigned time_usec; /* microseconds part of epoch time */ + unsigned time_delta; /* time since start in seconds */ + cudnnHandle_t handle; /* cudnn handle */ + cudaStream_t stream; /* cuda stream ID */ + unsigned long long pid; /* process ID */ + unsigned long long tid; /* thread ID */ + int cudaDeviceId; /* CUDA device ID */ + int reserved[15]; /* reserved for future use */ +} cudnnDebug_t; + +typedef void (*cudnnCallback_t)(cudnnSeverity_t sev, void *udata, const cudnnDebug_t *dbg, const char *msg); + +cudnnStatus_t CUDNNWINAPI +cudnnSetCallback(unsigned mask, void *udata, cudnnCallback_t fptr); + +cudnnStatus_t CUDNNWINAPI +cudnnGetCallback(unsigned *mask, void **udata, cudnnCallback_t *fptr); + +/* + * \brief Cross-library version checker. + * This function is implemented differently in each sub-library. Each sublib + * checks whether its own version matches that of its dependencies. + * \returns CUDNN_STATUS_SUCCESS if the version check passes, + * CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH if the versions are inconsistent. 
+ */ +cudnnStatus_t CUDNNWINAPI +cudnnGraphVersionCheck(void); + +/* Maximum supported number of tensor dimensions */ +#define CUDNN_DIM_MAX 8 + +/* + * convolution mode + */ +typedef enum { CUDNN_CONVOLUTION = 0, CUDNN_CROSS_CORRELATION = 1 } cudnnConvolutionMode_t; + +/* + * CUDNN Reorder + */ +typedef enum { + CUDNN_DEFAULT_REORDER = 0, + CUDNN_NO_REORDER = 1, +} cudnnReorderType_t CUDNN_DEPRECATED; + +typedef void *cudnnBackendDescriptor_t; + +typedef struct cudnnFractionStruct { + int64_t numerator; + int64_t denominator; +} cudnnFraction_t; + +typedef enum { + CUDNN_POINTWISE_ADD = 0, + CUDNN_POINTWISE_ADD_SQUARE = 5, + CUDNN_POINTWISE_DIV = 6, + CUDNN_POINTWISE_MAX = 3, + CUDNN_POINTWISE_MIN = 2, + CUDNN_POINTWISE_MOD = 7, + CUDNN_POINTWISE_MUL = 1, + CUDNN_POINTWISE_POW = 8, + CUDNN_POINTWISE_SUB = 9, + + CUDNN_POINTWISE_ABS = 10, + CUDNN_POINTWISE_CEIL = 11, + CUDNN_POINTWISE_COS = 12, + CUDNN_POINTWISE_EXP = 13, + CUDNN_POINTWISE_FLOOR = 14, + CUDNN_POINTWISE_LOG = 15, + CUDNN_POINTWISE_NEG = 16, + CUDNN_POINTWISE_RSQRT = 17, + CUDNN_POINTWISE_SIN = 18, + CUDNN_POINTWISE_SQRT = 4, + CUDNN_POINTWISE_TAN = 19, + CUDNN_POINTWISE_ERF = 20, + CUDNN_POINTWISE_IDENTITY = 21, + CUDNN_POINTWISE_RECIPROCAL = 22, + CUDNN_POINTWISE_ATAN2 = 23, + + CUDNN_POINTWISE_RELU_FWD = 100, + CUDNN_POINTWISE_TANH_FWD = 101, + CUDNN_POINTWISE_SIGMOID_FWD = 102, + CUDNN_POINTWISE_ELU_FWD = 103, + CUDNN_POINTWISE_GELU_FWD = 104, + CUDNN_POINTWISE_SOFTPLUS_FWD = 105, + CUDNN_POINTWISE_SWISH_FWD = 106, + CUDNN_POINTWISE_GELU_APPROX_TANH_FWD = 107, + + CUDNN_POINTWISE_RELU_BWD = 200, + CUDNN_POINTWISE_TANH_BWD = 201, + CUDNN_POINTWISE_SIGMOID_BWD = 202, + CUDNN_POINTWISE_ELU_BWD = 203, + CUDNN_POINTWISE_GELU_BWD = 204, + CUDNN_POINTWISE_SOFTPLUS_BWD = 205, + CUDNN_POINTWISE_SWISH_BWD = 206, + CUDNN_POINTWISE_GELU_APPROX_TANH_BWD = 207, + + CUDNN_POINTWISE_CMP_EQ = 300, + CUDNN_POINTWISE_CMP_NEQ = 301, + CUDNN_POINTWISE_CMP_GT = 302, + CUDNN_POINTWISE_CMP_GE = 303, + CUDNN_POINTWISE_CMP_LT = 304, + CUDNN_POINTWISE_CMP_LE = 305, + + CUDNN_POINTWISE_LOGICAL_AND = 400, + CUDNN_POINTWISE_LOGICAL_OR = 401, + CUDNN_POINTWISE_LOGICAL_NOT = 402, + + CUDNN_POINTWISE_GEN_INDEX = 501, + + CUDNN_POINTWISE_BINARY_SELECT = 601, +} cudnnPointwiseMode_t; + +typedef enum { + CUDNN_RESAMPLE_NEAREST = 0, + CUDNN_RESAMPLE_BILINEAR = 1, + CUDNN_RESAMPLE_AVGPOOL = 2, + CUDNN_RESAMPLE_AVGPOOL_INCLUDE_PADDING = 2, + CUDNN_RESAMPLE_AVGPOOL_EXCLUDE_PADDING = 4, + CUDNN_RESAMPLE_MAXPOOL = 3, +} cudnnResampleMode_t; + +typedef enum { + CUDNN_SIGNAL_SET = 0, + CUDNN_SIGNAL_WAIT = 1, +} cudnnSignalMode_t; + +typedef enum { + CUDNN_GENSTATS_SUM_SQSUM = 0, +} cudnnGenStatsMode_t; + +typedef enum { + CUDNN_BN_FINALIZE_STATISTICS_TRAINING = 0, + CUDNN_BN_FINALIZE_STATISTICS_INFERENCE = 1, +} cudnnBnFinalizeStatsMode_t; + +typedef enum { + CUDNN_RNG_DISTRIBUTION_BERNOULLI = 0, + CUDNN_RNG_DISTRIBUTION_UNIFORM = 1, + CUDNN_RNG_DISTRIBUTION_NORMAL = 2, +} cudnnRngDistribution_t; + +typedef enum { + CUDNN_ATTR_POINTWISE_MODE = 0, + CUDNN_ATTR_POINTWISE_MATH_PREC = 1, + CUDNN_ATTR_POINTWISE_NAN_PROPAGATION CUDNN_DEPRECATED_ENUM = 2, + CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP = 3, + CUDNN_ATTR_POINTWISE_RELU_UPPER_CLIP = 4, + CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP_SLOPE = 5, + CUDNN_ATTR_POINTWISE_ELU_ALPHA = 6, + CUDNN_ATTR_POINTWISE_SOFTPLUS_BETA = 7, + CUDNN_ATTR_POINTWISE_SWISH_BETA = 8, + CUDNN_ATTR_POINTWISE_AXIS = 9, + + CUDNN_ATTR_CONVOLUTION_COMP_TYPE = 100, + CUDNN_ATTR_CONVOLUTION_CONV_MODE = 101, + CUDNN_ATTR_CONVOLUTION_DILATIONS = 102, + 
CUDNN_ATTR_CONVOLUTION_FILTER_STRIDES = 103, + CUDNN_ATTR_CONVOLUTION_POST_PADDINGS = 104, + CUDNN_ATTR_CONVOLUTION_PRE_PADDINGS = 105, + CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS = 106, + + CUDNN_ATTR_ENGINEHEUR_MODE = 200, + CUDNN_ATTR_ENGINEHEUR_OPERATION_GRAPH = 201, + CUDNN_ATTR_ENGINEHEUR_RESULTS = 202, + CUDNN_ATTR_ENGINEHEUR_SM_COUNT_TARGET = 203, + CUDNN_ATTR_ENGINEHEUR_DEVICEPROP = 204, + + CUDNN_ATTR_ENGINECFG_ENGINE = 300, + CUDNN_ATTR_ENGINECFG_INTERMEDIATE_INFO = 301, + CUDNN_ATTR_ENGINECFG_KNOB_CHOICES = 302, + CUDNN_ATTR_ENGINECFG_WORKSPACE_SIZE = 303, + CUDNN_ATTR_ENGINECFG_SHARED_MEMORY_USED = 304, + + CUDNN_ATTR_EXECUTION_PLAN_HANDLE CUDNN_DEPRECATED_ENUM = 400, + CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG = 401, + CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE = 402, + CUDNN_ATTR_EXECUTION_PLAN_COMPUTED_INTERMEDIATE_UIDS = 403, + CUDNN_ATTR_EXECUTION_PLAN_RUN_ONLY_INTERMEDIATE_UIDS = 404, + CUDNN_ATTR_EXECUTION_PLAN_JSON_REPRESENTATION = 405, + CUDNN_ATTR_EXECUTION_PLAN_KERNEL_CACHE = 406, + CUDNN_ATTR_EXECUTION_PLAN_DEVICEPROP = 407, + + CUDNN_ATTR_INTERMEDIATE_INFO_UNIQUE_ID = 500, + CUDNN_ATTR_INTERMEDIATE_INFO_SIZE = 501, + CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_DATA_UIDS = 502, + CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_ATTRIBUTES = 503, + + CUDNN_ATTR_KNOB_CHOICE_KNOB_TYPE = 600, + CUDNN_ATTR_KNOB_CHOICE_KNOB_VALUE = 601, + + CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_ALPHA = 700, + CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_BETA = 701, + CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_CONV_DESC = 702, + CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_W = 703, + CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_X = 704, + CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_Y = 705, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_ALPHA = 706, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_BETA = 707, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_CONV_DESC = 708, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_W = 709, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DX = 710, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DY = 711, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_ALPHA = 712, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_BETA = 713, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_CONV_DESC = 714, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DW = 715, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_X = 716, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DY = 717, + + CUDNN_ATTR_OPERATION_POINTWISE_PW_DESCRIPTOR = 750, + CUDNN_ATTR_OPERATION_POINTWISE_XDESC = 751, + CUDNN_ATTR_OPERATION_POINTWISE_BDESC = 752, + CUDNN_ATTR_OPERATION_POINTWISE_YDESC = 753, + CUDNN_ATTR_OPERATION_POINTWISE_ALPHA1 = 754, + CUDNN_ATTR_OPERATION_POINTWISE_ALPHA2 = 755, + CUDNN_ATTR_OPERATION_POINTWISE_DXDESC = 756, + CUDNN_ATTR_OPERATION_POINTWISE_DYDESC = 757, + CUDNN_ATTR_OPERATION_POINTWISE_TDESC = 758, + + CUDNN_ATTR_OPERATION_GENSTATS_MODE = 770, + CUDNN_ATTR_OPERATION_GENSTATS_MATH_PREC = 771, + CUDNN_ATTR_OPERATION_GENSTATS_XDESC = 772, + CUDNN_ATTR_OPERATION_GENSTATS_SUMDESC = 773, + CUDNN_ATTR_OPERATION_GENSTATS_SQSUMDESC = 774, + + CUDNN_ATTR_OPERATION_BN_FINALIZE_STATS_MODE = 780, + CUDNN_ATTR_OPERATION_BN_FINALIZE_MATH_PREC = 781, + CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SUM_DESC = 782, + CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SQ_SUM_DESC = 783, + CUDNN_ATTR_OPERATION_BN_FINALIZE_SCALE_DESC = 784, + CUDNN_ATTR_OPERATION_BN_FINALIZE_BIAS_DESC = 785, + CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_MEAN_DESC = 786, + CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_VAR_DESC = 787, + CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_MEAN_DESC = 788, + 
CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_VAR_DESC = 789, + CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_MEAN_DESC = 790, + CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_INV_STD_DESC = 791, + CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_SCALE_DESC = 792, + CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_BIAS_DESC = 793, + CUDNN_ATTR_OPERATION_BN_FINALIZE_ACCUM_COUNT_DESC = 794, + CUDNN_ATTR_OPERATION_BN_FINALIZE_EPSILON_DESC = 795, + CUDNN_ATTR_OPERATION_BN_FINALIZE_EXP_AVERATE_FACTOR_DESC = 796, + + CUDNN_ATTR_OPERATIONGRAPH_HANDLE CUDNN_DEPRECATED_ENUM = 800, + CUDNN_ATTR_OPERATIONGRAPH_OPS = 801, + CUDNN_ATTR_OPERATIONGRAPH_ENGINE_GLOBAL_COUNT = 802, + CUDNN_ATTR_OPERATIONGRAPH_IS_DYNAMIC_SHAPE_ENABLED = 803, + CUDNN_ATTR_OPERATIONGRAPH_IS_SAME_TOPOLOGY = 804, + + CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT = 900, + CUDNN_ATTR_TENSOR_DATA_TYPE = 901, + CUDNN_ATTR_TENSOR_DIMENSIONS = 902, + CUDNN_ATTR_TENSOR_STRIDES = 903, + CUDNN_ATTR_TENSOR_VECTOR_COUNT = 904, + CUDNN_ATTR_TENSOR_VECTORIZED_DIMENSION = 905, + CUDNN_ATTR_TENSOR_UNIQUE_ID = 906, + CUDNN_ATTR_TENSOR_IS_VIRTUAL = 907, + CUDNN_ATTR_TENSOR_IS_BY_VALUE = 908, + CUDNN_ATTR_TENSOR_REORDERING_MODE = 909, + CUDNN_ATTR_TENSOR_RAGGED_OFFSET_DESC = 913, + + CUDNN_ATTR_VARIANT_PACK_UNIQUE_IDS = 1000, + CUDNN_ATTR_VARIANT_PACK_DATA_POINTERS = 1001, + CUDNN_ATTR_VARIANT_PACK_INTERMEDIATES = 1002, + CUDNN_ATTR_VARIANT_PACK_WORKSPACE = 1003, + + CUDNN_ATTR_LAYOUT_INFO_TENSOR_UID = 1100, + CUDNN_ATTR_LAYOUT_INFO_TYPES = 1101, + + CUDNN_ATTR_KNOB_INFO_TYPE = 1200, + CUDNN_ATTR_KNOB_INFO_MAXIMUM_VALUE = 1201, + CUDNN_ATTR_KNOB_INFO_MINIMUM_VALUE = 1202, + CUDNN_ATTR_KNOB_INFO_STRIDE = 1203, + + CUDNN_ATTR_ENGINE_OPERATION_GRAPH = 1300, + CUDNN_ATTR_ENGINE_GLOBAL_INDEX = 1301, + CUDNN_ATTR_ENGINE_KNOB_INFO = 1302, + CUDNN_ATTR_ENGINE_NUMERICAL_NOTE = 1303, + CUDNN_ATTR_ENGINE_LAYOUT_INFO = 1304, + CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE = 1305, + CUDNN_ATTR_ENGINE_SM_COUNT_TARGET = 1306, + CUDNN_ATTR_ENGINE_DEVICEPROP = 1307, + + CUDNN_ATTR_MATMUL_COMP_TYPE = 1500, + CUDNN_ATTR_MATMUL_PADDING_VALUE = 1503, + + CUDNN_ATTR_OPERATION_MATMUL_ADESC = 1520, + CUDNN_ATTR_OPERATION_MATMUL_BDESC = 1521, + CUDNN_ATTR_OPERATION_MATMUL_CDESC = 1522, + CUDNN_ATTR_OPERATION_MATMUL_DESC = 1523, + CUDNN_ATTR_OPERATION_MATMUL_IRREGULARLY_STRIDED_BATCH_COUNT CUDNN_DEPRECATED_ENUM = 1524, + CUDNN_ATTR_OPERATION_MATMUL_GEMM_M_OVERRIDE_DESC = 1525, + CUDNN_ATTR_OPERATION_MATMUL_GEMM_N_OVERRIDE_DESC = 1526, + CUDNN_ATTR_OPERATION_MATMUL_GEMM_K_OVERRIDE_DESC = 1527, + + CUDNN_ATTR_REDUCTION_OPERATOR = 1600, + CUDNN_ATTR_REDUCTION_COMP_TYPE = 1601, + + CUDNN_ATTR_OPERATION_REDUCTION_XDESC = 1610, + CUDNN_ATTR_OPERATION_REDUCTION_YDESC = 1611, + CUDNN_ATTR_OPERATION_REDUCTION_DESC = 1612, + + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MATH_PREC = 1620, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MEAN_DESC = 1621, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_INVSTD_DESC = 1622, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_BN_SCALE_DESC = 1623, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_X_DESC = 1624, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DY_DESC = 1625, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DBN_SCALE_DESC = 1626, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DBN_BIAS_DESC = 1627, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_DY_SCALE_DESC = 1628, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_X_SCALE_DESC = 1629, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_BIAS = 1630, + + CUDNN_ATTR_RESAMPLE_MODE = 1700, + CUDNN_ATTR_RESAMPLE_COMP_TYPE = 1701, + CUDNN_ATTR_RESAMPLE_SPATIAL_DIMS = 1702, + CUDNN_ATTR_RESAMPLE_POST_PADDINGS = 1703, + CUDNN_ATTR_RESAMPLE_PRE_PADDINGS 
= 1704, + CUDNN_ATTR_RESAMPLE_STRIDES = 1705, + CUDNN_ATTR_RESAMPLE_WINDOW_DIMS = 1706, + CUDNN_ATTR_RESAMPLE_NAN_PROPAGATION = 1707, + CUDNN_ATTR_RESAMPLE_PADDING_MODE = 1708, + + CUDNN_ATTR_OPERATION_RESAMPLE_FWD_XDESC = 1710, + CUDNN_ATTR_OPERATION_RESAMPLE_FWD_YDESC = 1711, + CUDNN_ATTR_OPERATION_RESAMPLE_FWD_IDXDESC = 1712, + CUDNN_ATTR_OPERATION_RESAMPLE_FWD_ALPHA CUDNN_DEPRECATED_ENUM = 1713, + CUDNN_ATTR_OPERATION_RESAMPLE_FWD_BETA CUDNN_DEPRECATED_ENUM = 1714, + CUDNN_ATTR_OPERATION_RESAMPLE_FWD_DESC = 1716, + + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DXDESC = 1720, + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DYDESC = 1721, + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_IDXDESC = 1722, + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_ALPHA CUDNN_DEPRECATED_ENUM = 1723, + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_BETA CUDNN_DEPRECATED_ENUM = 1724, + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DESC = 1725, + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_XDESC = 1726, + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_YDESC = 1727, + + CUDNN_ATTR_OPERATION_CONCAT_AXIS = 1800, + CUDNN_ATTR_OPERATION_CONCAT_INPUT_DESCS = 1801, + CUDNN_ATTR_OPERATION_CONCAT_INPLACE_INDEX = 1802, + CUDNN_ATTR_OPERATION_CONCAT_OUTPUT_DESC = 1803, + + CUDNN_ATTR_OPERATION_SIGNAL_MODE = 1900, + CUDNN_ATTR_OPERATION_SIGNAL_FLAGDESC = 1901, + CUDNN_ATTR_OPERATION_SIGNAL_VALUE = 1902, + CUDNN_ATTR_OPERATION_SIGNAL_XDESC = 1903, + CUDNN_ATTR_OPERATION_SIGNAL_YDESC = 1904, + + CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_CONTAINER_DESC = 1950, + CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_YDESC = 1951, + CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_SEQUENCE_DESC = 1952, + CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_PAGE_TABLE_DESC = 1953, + + CUDNN_ATTR_OPERATION_NORM_FWD_MODE = 2000, + CUDNN_ATTR_OPERATION_NORM_FWD_PHASE = 2001, + CUDNN_ATTR_OPERATION_NORM_FWD_XDESC = 2002, + CUDNN_ATTR_OPERATION_NORM_FWD_MEAN_DESC = 2003, + CUDNN_ATTR_OPERATION_NORM_FWD_INV_VARIANCE_DESC = 2004, + CUDNN_ATTR_OPERATION_NORM_FWD_SCALE_DESC = 2005, + CUDNN_ATTR_OPERATION_NORM_FWD_BIAS_DESC = 2006, + CUDNN_ATTR_OPERATION_NORM_FWD_EPSILON_DESC = 2007, + CUDNN_ATTR_OPERATION_NORM_FWD_EXP_AVG_FACTOR_DESC = 2008, + CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_MEAN_DESC = 2009, + CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_VAR_DESC = 2010, + CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_MEAN_DESC = 2011, + CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_VAR_DESC = 2012, + CUDNN_ATTR_OPERATION_NORM_FWD_YDESC = 2013, + CUDNN_ATTR_OPERATION_NORM_FWD_PEER_STAT_DESCS = 2014, + + CUDNN_ATTR_OPERATION_NORM_BWD_MODE = 2100, + CUDNN_ATTR_OPERATION_NORM_BWD_XDESC = 2101, + CUDNN_ATTR_OPERATION_NORM_BWD_MEAN_DESC = 2102, + CUDNN_ATTR_OPERATION_NORM_BWD_INV_VARIANCE_DESC = 2103, + CUDNN_ATTR_OPERATION_NORM_BWD_DYDESC = 2104, + CUDNN_ATTR_OPERATION_NORM_BWD_SCALE_DESC = 2105, + CUDNN_ATTR_OPERATION_NORM_BWD_EPSILON_DESC = 2106, + CUDNN_ATTR_OPERATION_NORM_BWD_DSCALE_DESC = 2107, + CUDNN_ATTR_OPERATION_NORM_BWD_DBIAS_DESC = 2108, + CUDNN_ATTR_OPERATION_NORM_BWD_DXDESC = 2109, + CUDNN_ATTR_OPERATION_NORM_BWD_PEER_STAT_DESCS = 2110, + + CUDNN_ATTR_OPERATION_RESHAPE_XDESC = 2200, + CUDNN_ATTR_OPERATION_RESHAPE_YDESC = 2201, + + CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_XDESC = 2250, + CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_YDESC = 2251, + CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_LOWER_BANDWIDTH = 2252, + CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_UPPER_BANDWIDTH = 2253, + CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_AXIS = 2254, + CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_PAD_VALUE = 2255, + CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_KV_TOKEN_OFFSET_DESC = 2256, + + 
CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_XDESC = 2270, + CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_YDESC = 2271, + CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_LOWER_BANDWIDTH = 2272, + CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_UPPER_BANDWIDTH = 2273, + CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_AXIS = 2274, + CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_PAD_VALUE = 2275, + CUDNN_ATTR_OPERATION_CONTRACT_BAND_MAX_TOKEN_VALUE = 2276, + + CUDNN_ATTR_RNG_DISTRIBUTION = 2300, + CUDNN_ATTR_RNG_NORMAL_DIST_MEAN = 2301, + CUDNN_ATTR_RNG_NORMAL_DIST_STANDARD_DEVIATION = 2302, + CUDNN_ATTR_RNG_UNIFORM_DIST_MAXIMUM = 2303, + CUDNN_ATTR_RNG_UNIFORM_DIST_MINIMUM = 2304, + CUDNN_ATTR_RNG_BERNOULLI_DIST_PROBABILITY = 2305, + + CUDNN_ATTR_OPERATION_RNG_YDESC = 2310, + CUDNN_ATTR_OPERATION_RNG_SEED = 2311, + CUDNN_ATTR_OPERATION_RNG_DESC = 2312, + CUDNN_ATTR_OPERATION_RNG_OFFSET_DESC = 2313, + + CUDNN_ATTR_KERNEL_CACHE_OPERATION_GRAPH = 2400, + CUDNN_ATTR_KERNEL_CACHE_IS_ENGINECFG_KERNEL_CACHED = 2401, + CUDNN_ATTR_KERNEL_CACHE_JSON_REPRESENTATION = 2402, + + CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_XDESC = 2500, + CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_YDESC = 2501, + CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_SCALE_DESC = 2502, + CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_MATH_PREC = 2503, + CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_BLOCK_SIZE = 2504, + + CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_XDESC = 2600, + CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_SCALE_DESC = 2601, + CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_YDESC = 2602, + CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_MATH_PREC = 2603, + CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_BLOCK_SIZE = 2604, + + CUDNN_ATTR_DEVICEPROP_DEVICE_ID = 2700, + CUDNN_ATTR_DEVICEPROP_HANDLE = 2701, + CUDNN_ATTR_DEVICEPROP_JSON_REPRESENTATION = 2702, +} cudnnBackendAttributeName_t; + +typedef enum { + CUDNN_TYPE_HANDLE = 0, + CUDNN_TYPE_DATA_TYPE = 1, + CUDNN_TYPE_BOOLEAN = 2, + CUDNN_TYPE_INT64 = 3, + CUDNN_TYPE_FLOAT = 4, + CUDNN_TYPE_DOUBLE = 5, + CUDNN_TYPE_VOID_PTR = 6, + CUDNN_TYPE_CONVOLUTION_MODE = 7, + CUDNN_TYPE_HEUR_MODE = 8, + CUDNN_TYPE_KNOB_TYPE = 9, + CUDNN_TYPE_NAN_PROPOGATION CUDNN_DEPRECATED_ENUM = 10, + CUDNN_TYPE_NUMERICAL_NOTE = 11, + CUDNN_TYPE_LAYOUT_TYPE = 12, + CUDNN_TYPE_ATTRIB_NAME = 13, + CUDNN_TYPE_POINTWISE_MODE = 14, + CUDNN_TYPE_BACKEND_DESCRIPTOR = 15, + CUDNN_TYPE_GENSTATS_MODE = 16, + CUDNN_TYPE_BN_FINALIZE_STATS_MODE = 17, + CUDNN_TYPE_REDUCTION_OPERATOR_TYPE = 18, + CUDNN_TYPE_BEHAVIOR_NOTE = 19, + CUDNN_TYPE_TENSOR_REORDERING_MODE = 20, + CUDNN_TYPE_RESAMPLE_MODE = 21, + CUDNN_TYPE_PADDING_MODE = 22, + CUDNN_TYPE_INT32 = 23, + CUDNN_TYPE_CHAR = 24, + CUDNN_TYPE_SIGNAL_MODE = 25, + CUDNN_TYPE_FRACTION = 26, + CUDNN_TYPE_NORM_MODE = 27, + CUDNN_TYPE_NORM_FWD_PHASE = 28, + CUDNN_TYPE_RNG_DISTRIBUTION = 29, +} cudnnBackendAttributeType_t; + +typedef enum { + CUDNN_BACKEND_POINTWISE_DESCRIPTOR = 0, + CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR = 1, + CUDNN_BACKEND_ENGINE_DESCRIPTOR = 2, + CUDNN_BACKEND_ENGINECFG_DESCRIPTOR = 3, + CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR = 4, + CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR = 5, + CUDNN_BACKEND_INTERMEDIATE_INFO_DESCRIPTOR = 6, + CUDNN_BACKEND_KNOB_CHOICE_DESCRIPTOR = 7, + CUDNN_BACKEND_KNOB_INFO_DESCRIPTOR = 8, + CUDNN_BACKEND_LAYOUT_INFO_DESCRIPTOR = 9, + CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR = 10, + CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR = 11, + CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR = 12, + CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR = 
13, + CUDNN_BACKEND_OPERATION_GEN_STATS_DESCRIPTOR = 14, + CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR = 15, + CUDNN_BACKEND_VARIANT_PACK_DESCRIPTOR = 16, + CUDNN_BACKEND_TENSOR_DESCRIPTOR = 17, + CUDNN_BACKEND_MATMUL_DESCRIPTOR = 18, + CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR = 19, + CUDNN_BACKEND_OPERATION_BN_FINALIZE_STATISTICS_DESCRIPTOR = 20, + CUDNN_BACKEND_REDUCTION_DESCRIPTOR = 21, + CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR = 22, + CUDNN_BACKEND_OPERATION_BN_BWD_WEIGHTS_DESCRIPTOR = 23, + CUDNN_BACKEND_RESAMPLE_DESCRIPTOR = 24, + CUDNN_BACKEND_OPERATION_RESAMPLE_FWD_DESCRIPTOR = 25, + CUDNN_BACKEND_OPERATION_RESAMPLE_BWD_DESCRIPTOR = 26, + CUDNN_BACKEND_OPERATION_CONCAT_DESCRIPTOR = 27, + CUDNN_BACKEND_OPERATION_SIGNAL_DESCRIPTOR = 28, + CUDNN_BACKEND_OPERATION_NORM_FORWARD_DESCRIPTOR = 29, + CUDNN_BACKEND_OPERATION_NORM_BACKWARD_DESCRIPTOR = 30, + CUDNN_BACKEND_OPERATION_RESHAPE_DESCRIPTOR = 31, + CUDNN_BACKEND_RNG_DESCRIPTOR = 32, + CUDNN_BACKEND_OPERATION_RNG_DESCRIPTOR = 33, + CUDNN_BACKEND_KERNEL_CACHE_DESCRIPTOR = 34, + CUDNN_BACKEND_OPERATION_PAGED_CACHE_LOAD_DESCRIPTOR = 35, + CUDNN_BACKEND_OPERATION_BLOCK_SCALE_QUANTIZE_DESCRIPTOR = 36, + CUDNN_BACKEND_OPERATION_BLOCK_SCALE_DEQUANTIZE_DESCRIPTOR = 37, + CUDNN_BACKEND_DEVICEPROP_DESCRIPTOR = 38, + CUDNN_BACKEND_OPERATION_EXPAND_BAND_MATRIX_DESCRIPTOR = 39, + CUDNN_BACKEND_OPERATION_CONTRACT_BAND_MATRIX_DESCRIPTOR = 40, +} cudnnBackendDescriptorType_t; + +typedef enum { + CUDNN_NUMERICAL_NOTE_TENSOR_CORE = 0, + CUDNN_NUMERICAL_NOTE_DOWN_CONVERT_INPUTS = 1, + CUDNN_NUMERICAL_NOTE_REDUCED_PRECISION_REDUCTION = 2, + CUDNN_NUMERICAL_NOTE_FFT = 3, + CUDNN_NUMERICAL_NOTE_NONDETERMINISTIC = 4, + CUDNN_NUMERICAL_NOTE_WINOGRAD = 5, + CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_4x4 = 6, + CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_6x6 = 7, + CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_13x13 = 8, + CUDNN_NUMERICAL_NOTE_STRICT_NAN_PROP = 9, + CUDNN_NUMERICAL_NOTE_TYPE_COUNT = 10, +} cudnnBackendNumericalNote_t; + +typedef enum { + CUDNN_BEHAVIOR_NOTE_RUNTIME_COMPILATION = 0, + CUDNN_BEHAVIOR_NOTE_REQUIRES_FILTER_INT8x32_REORDER = 1, + CUDNN_BEHAVIOR_NOTE_REQUIRES_BIAS_INT8x32_REORDER = 2, + CUDNN_BEHAVIOR_NOTE_SUPPORTS_CUDA_GRAPH_NATIVE_API = 3, + CUDNN_BEHAVIOR_NOTE_TYPE_COUNT = 4, +} cudnnBackendBehaviorNote_t; + +typedef enum { + CUDNN_KNOB_TYPE_SPLIT_K CUDNN_DEPRECATED_ENUM = 0, + CUDNN_KNOB_TYPE_SWIZZLE = 1, + CUDNN_KNOB_TYPE_TILE_SIZE = 2, + CUDNN_KNOB_TYPE_USE_TEX CUDNN_DEPRECATED_ENUM = 3, + CUDNN_KNOB_TYPE_EDGE = 4, + CUDNN_KNOB_TYPE_KBLOCK CUDNN_DEPRECATED_ENUM = 5, + CUDNN_KNOB_TYPE_LDGA CUDNN_DEPRECATED_ENUM = 6, + CUDNN_KNOB_TYPE_LDGB CUDNN_DEPRECATED_ENUM = 7, + CUDNN_KNOB_TYPE_CHUNK_K CUDNN_DEPRECATED_ENUM = 8, + CUDNN_KNOB_TYPE_SPLIT_H CUDNN_DEPRECATED_ENUM = 9, + CUDNN_KNOB_TYPE_WINO_TILE CUDNN_DEPRECATED_ENUM = 10, + CUDNN_KNOB_TYPE_MULTIPLY = 11, + CUDNN_KNOB_TYPE_SPLIT_K_BUF = 12, + CUDNN_KNOB_TYPE_TILEK = 13, + CUDNN_KNOB_TYPE_STAGES = 14, + CUDNN_KNOB_TYPE_REDUCTION_MODE = 15, + CUDNN_KNOB_TYPE_CTA_SPLIT_K_MODE CUDNN_DEPRECATED_ENUM = 16, + CUDNN_KNOB_TYPE_SPLIT_K_SLC = 17, + CUDNN_KNOB_TYPE_IDX_MODE = 18, + CUDNN_KNOB_TYPE_SLICED CUDNN_DEPRECATED_ENUM = 19, + CUDNN_KNOB_TYPE_SPLIT_RS CUDNN_DEPRECATED_ENUM = 20, + CUDNN_KNOB_TYPE_SINGLEBUFFER CUDNN_DEPRECATED_ENUM = 21, + CUDNN_KNOB_TYPE_LDGC CUDNN_DEPRECATED_ENUM = 22, + CUDNN_KNOB_TYPE_SPECFILT = 23, + CUDNN_KNOB_TYPE_KERNEL_CFG = 24, + CUDNN_KNOB_TYPE_WORKSPACE = 25, + CUDNN_KNOB_TYPE_TILE_CGA CUDNN_DEPRECATED_ENUM = 26, + CUDNN_KNOB_TYPE_TILE_CGA_M = 27, + CUDNN_KNOB_TYPE_TILE_CGA_N = 
28, + CUDNN_KNOB_TYPE_BLOCK_SIZE = 29, + CUDNN_KNOB_TYPE_OCCUPANCY = 30, + CUDNN_KNOB_TYPE_ARRAY_SIZE_PER_THREAD = 31, + CUDNN_KNOB_TYPE_NUM_C_PER_BLOCK CUDNN_DEPRECATED_ENUM = 32, + CUDNN_KNOB_TYPE_SPLIT_COLS = 33, + CUDNN_KNOB_TYPE_TILE_ROWS = 34, + CUDNN_KNOB_TYPE_TILE_COLS = 35, + CUDNN_KNOB_TYPE_LOAD_SIZE = 36, + CUDNN_KNOB_TYPE_CTA_COUNT = 37, + CUDNN_KNOB_TYPE_STREAM_K = 38, + CUDNN_KNOB_TYPE_SPLIT_P_SLC = 39, + CUDNN_KNOB_TYPE_TILE_M = 40, + CUDNN_KNOB_TYPE_TILE_N = 41, + CUDNN_KNOB_TYPE_WARP_SPEC_CFG = 42, + CUDNN_KNOB_TYPE_COUNTS = 43, +} cudnnBackendKnobType_t; + +typedef enum { + CUDNN_LAYOUT_TYPE_PREFERRED_NCHW = 0, + CUDNN_LAYOUT_TYPE_PREFERRED_NHWC = 1, + CUDNN_LAYOUT_TYPE_PREFERRED_PAD4CK = 2, + CUDNN_LAYOUT_TYPE_PREFERRED_PAD8CK = 3, + CUDNN_LAYOUT_TYPE_COUNT = 4, +} cudnnBackendLayoutType_t; + +typedef enum { + CUDNN_HEUR_MODE_INSTANT = 0, + CUDNN_HEUR_MODE_B = 1, + CUDNN_HEUR_MODE_FALLBACK = 2, + CUDNN_HEUR_MODE_A = 3, + CUDNN_HEUR_MODES_COUNT = 4, +} cudnnBackendHeurMode_t; + +typedef enum { + CUDNN_TENSOR_REORDERING_NONE = 0, + CUDNN_TENSOR_REORDERING_INT8x32 = 1, + CUDNN_TENSOR_REORDERING_F16x16 = 2, + CUDNN_TENSOR_REORDERING_F8_128x4 = 3, +} cudnnBackendTensorReordering_t; + +typedef enum { + CUDNN_ZERO_PAD = 0, + CUDNN_NEG_INF_PAD = 1, + CUDNN_EDGE_VAL_PAD = 2, +} cudnnPaddingMode_t; + +typedef enum { + CUDNN_LAYER_NORM = 0, + CUDNN_INSTANCE_NORM = 1, + CUDNN_BATCH_NORM = 2, + CUDNN_GROUP_NORM = 3, + CUDNN_RMS_NORM = 4, + CUDNN_ADA_LAYER_NORM = 5, +} cudnnBackendNormMode_t; + +typedef enum { + CUDNN_NORM_FWD_INFERENCE = 0, + CUDNN_NORM_FWD_TRAINING = 1, +} cudnnBackendNormFwdPhase_t; + +cudnnStatus_t CUDNNWINAPI +cudnnBackendCreateDescriptor(cudnnBackendDescriptorType_t descriptorType, cudnnBackendDescriptor_t *descriptor); + +cudnnStatus_t CUDNNWINAPI +cudnnBackendDestroyDescriptor(cudnnBackendDescriptor_t descriptor); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnBackendInitialize(cudnnBackendDescriptor_t descriptor); + +cudnnStatus_t CUDNNWINAPI +cudnnBackendFinalize(cudnnBackendDescriptor_t descriptor); + +cudnnStatus_t CUDNNWINAPI +cudnnBackendSetAttribute(cudnnBackendDescriptor_t descriptor, + cudnnBackendAttributeName_t attributeName, + cudnnBackendAttributeType_t attributeType, + int64_t elementCount, + const void *arrayOfElements); + +cudnnStatus_t CUDNNWINAPI +cudnnBackendGetAttribute(cudnnBackendDescriptor_t const descriptor, + cudnnBackendAttributeName_t attributeName, + cudnnBackendAttributeType_t attributeType, + int64_t requestedElementCount, + int64_t *elementCount, + void *arrayOfElements); + +cudnnStatus_t CUDNNWINAPI +cudnnBackendExecute(cudnnHandle_t handle, cudnnBackendDescriptor_t executionPlan, cudnnBackendDescriptor_t variantPack); + +cudnnStatus_t CUDNNWINAPI +cudnnBackendPopulateCudaGraph(cudnnHandle_t handle, + cudnnBackendDescriptor_t executionPlan, + cudnnBackendDescriptor_t variantPack, + cudaGraph_t graph); + +cudnnStatus_t CUDNNWINAPI +cudnnBackendUpdateCudaGraph(cudnnHandle_t handle, + cudnnBackendDescriptor_t executionPlan, + cudnnBackendDescriptor_t variantPack, + cudaGraph_t graph); + +#if defined(__cplusplus) +} +#endif + +#endif /* CUDNN_GRAPH_H_ */ diff --git a/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_ops.h b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..471a0e59d67228ab8a74159517418f217ab86324 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_ops.h @@ -0,0 +1,1316 @@ +/* + * 
Copyright 2014-2023 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/* + * cudnn_ops : cuDNN's basic definitions and basic operations. + */ + +#if !defined(CUDNN_OPS_H_) +#define CUDNN_OPS_H_ + +#include + +#include "cudnn_version.h" +#include "cudnn_graph.h" + +/* These version numbers are autogenerated, do not edit manually. */ +#define CUDNN_OPS_MAJOR 9 +#define CUDNN_OPS_MINOR 10 +#define CUDNN_OPS_PATCH 2 + +#if (CUDNN_OPS_MAJOR != CUDNN_MAJOR) || (CUDNN_OPS_MINOR != CUDNN_MINOR) || (CUDNN_OPS_PATCH != CUDNN_PATCHLEVEL) +#error Version mismatch in cuDNN OPS INFER!!! 
+#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +/* Data structures to represent Image/Filter and the Neural Network Layer */ +typedef struct cudnnTensorStruct *cudnnTensorDescriptor_t; +typedef struct cudnnPoolingStruct *cudnnPoolingDescriptor_t CUDNN_DEPRECATED; +typedef struct cudnnFilterStruct *cudnnFilterDescriptor_t CUDNN_DEPRECATED; +typedef struct cudnnLRNStruct *cudnnLRNDescriptor_t; +typedef struct cudnnActivationStruct *cudnnActivationDescriptor_t CUDNN_DEPRECATED; +typedef struct cudnnSpatialTransformerStruct *cudnnSpatialTransformerDescriptor_t; +typedef struct cudnnOpTensorStruct *cudnnOpTensorDescriptor_t CUDNN_DEPRECATED; +typedef struct cudnnReduceTensorStruct *cudnnReduceTensorDescriptor_t CUDNN_DEPRECATED; +typedef struct cudnnCTCLossStruct *cudnnCTCLossDescriptor_t; +typedef struct cudnnTensorTransformStruct *cudnnTensorTransformDescriptor_t CUDNN_DEPRECATED; +/* + * CUDNN Determinism + */ +typedef enum { + CUDNN_NON_DETERMINISTIC = 0, + CUDNN_DETERMINISTIC = 1, +} cudnnDeterminism_t; + +/* Create an instance of a generic Tensor descriptor */ +cudnnStatus_t CUDNNWINAPI +cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnSetTensor4dDescriptor(cudnnTensorDescriptor_t tensorDesc, + cudnnTensorFormat_t format, + cudnnDataType_t dataType, /* image data type */ + int n, /* number of inputs (batch size) */ + int c, /* number of input feature maps */ + int h, /* height of input section */ + int w); /* width of input section */ + +cudnnStatus_t CUDNNWINAPI +cudnnSetTensor4dDescriptorEx(cudnnTensorDescriptor_t tensorDesc, + cudnnDataType_t dataType, /* image data type */ + int n, /* number of inputs (batch size) */ + int c, /* number of input feature maps */ + int h, /* height of input section */ + int w, /* width of input section */ + int nStride, + int cStride, + int hStride, + int wStride); + +cudnnStatus_t CUDNNWINAPI +cudnnGetTensor4dDescriptor(const cudnnTensorDescriptor_t tensorDesc, + cudnnDataType_t *dataType, /* image data type */ + int *n, /* number of inputs (batch size) */ + int *c, /* number of input feature maps */ + int *h, /* height of input section */ + int *w, /* width of input section */ + int *nStride, + int *cStride, + int *hStride, + int *wStride); + +cudnnStatus_t CUDNNWINAPI +cudnnSetTensorNdDescriptor(cudnnTensorDescriptor_t tensorDesc, + cudnnDataType_t dataType, + int nbDims, + const int dimA[], + const int strideA[]); + +cudnnStatus_t CUDNNWINAPI +cudnnSetTensorNdDescriptorEx(cudnnTensorDescriptor_t tensorDesc, + cudnnTensorFormat_t format, + cudnnDataType_t dataType, + int nbDims, + const int dimA[]); + +cudnnStatus_t CUDNNWINAPI +cudnnGetTensorNdDescriptor(const cudnnTensorDescriptor_t tensorDesc, + int nbDimsRequested, + cudnnDataType_t *dataType, + int *nbDims, + int dimA[], + int strideA[]); + +cudnnStatus_t CUDNNWINAPI +cudnnGetTensorSizeInBytes(const cudnnTensorDescriptor_t tensorDesc, size_t *size); + +/* PixelOffset( n, c, h, w ) = n *input_stride + c * feature_stride + h * h_stride + w * w_stride + + 1)Example of all images in row major order one batch of features after the other (with an optional padding on row) + input_stride : c x h x h_stride + feature_stride : h x h_stride + h_stride : >= w ( h_stride = w if no padding) + w_stride : 1 + + + 2)Example of all images in row major with features maps interleaved + input_stride : c x h x h_stride + feature_stride : 1 + h_stride : w x c + w_stride : c + + 3)Example of all images in column major order one batch of features after the 
other (with optional padding on column) + input_stride : c x w x w_stride + feature_stride : w x w_stride + h_stride : 1 + w_stride : >= h + +*/ + +/* Destroy an instance of Tensor4d descriptor */ +cudnnStatus_t CUDNNWINAPI +cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc); + +/* Fold/unfold transforms */ +typedef enum { + CUDNN_TRANSFORM_FOLD = 0U, + CUDNN_TRANSFORM_UNFOLD = 1U, +} cudnnFoldingDirection_t; + +/** Create a destination descriptor for cudnnTransformTensor */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnInitTransformDest(const cudnnTensorTransformDescriptor_t transformDesc, + const cudnnTensorDescriptor_t srcDesc, + cudnnTensorDescriptor_t destDesc, + size_t *destSizeInBytes); + +/** Create an empty tensor transform descriptor */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateTensorTransformDescriptor(cudnnTensorTransformDescriptor_t *transformDesc); + +/** Initialize a previously created tensor transform descriptor. */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc, + const uint32_t nbDims, + const cudnnTensorFormat_t destFormat, + const int32_t padBeforeA[], + const int32_t padAfterA[], + const uint32_t foldA[], + const cudnnFoldingDirection_t direction); + +/** + * Retrieves the values stored in a previously initialized tensor transform + * descriptor. + */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc, + uint32_t nbDimsRequested, + cudnnTensorFormat_t *destFormat, + int32_t padBeforeA[], + int32_t padAfterA[], + uint32_t foldA[], + cudnnFoldingDirection_t *direction); + +/** + * Destroys a previously created tensor transform descriptor. + */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc); + +/* Tensor layout conversion helper (y = alpha * x + beta * y) */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnTransformTensor(cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnTransformTensorEx(cudnnHandle_t handle, + const cudnnTensorTransformDescriptor_t transDesc, + const void *alpha, + const cudnnTensorDescriptor_t srcDesc, + const void *srcData, + const void *beta, + const cudnnTensorDescriptor_t destDesc, + void *destData); + +/* Tensor Bias addition : C = alpha * A + beta * C */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnAddTensor(cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t aDesc, + const void *A, + const void *beta, + const cudnnTensorDescriptor_t cDesc, + void *C); + +/* + * CUDNN OpTensor op type + */ +typedef enum { + CUDNN_OP_TENSOR_ADD = 0, + CUDNN_OP_TENSOR_MUL = 1, + CUDNN_OP_TENSOR_MIN = 2, + CUDNN_OP_TENSOR_MAX = 3, + CUDNN_OP_TENSOR_SQRT = 4, + CUDNN_OP_TENSOR_NOT = 5, +} cudnnOpTensorOp_t; + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateOpTensorDescriptor(cudnnOpTensorDescriptor_t *opTensorDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc, + cudnnOpTensorOp_t opTensorOp, + cudnnDataType_t opTensorCompType, + cudnnNanPropagation_t opTensorNanOpt); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetOpTensorDescriptor(const cudnnOpTensorDescriptor_t opTensorDesc, + cudnnOpTensorOp_t *opTensorOp, + 
cudnnDataType_t *opTensorCompType, + cudnnNanPropagation_t *opTensorNanOpt); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc); + +/* Tensor operation : C = op( alpha1 * A, alpha2 * B ) + beta * C */ +/* B tensor is ignored for CUDNN_OP_TENSOR_SQRT, CUDNN_OP_TENSOR_NOT. */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnOpTensor(cudnnHandle_t handle, + const cudnnOpTensorDescriptor_t opTensorDesc, + const void *alpha1, + const cudnnTensorDescriptor_t aDesc, + const void *A, + const void *alpha2, + const cudnnTensorDescriptor_t bDesc, + const void *B, + const void *beta, + const cudnnTensorDescriptor_t cDesc, + void *C); + +/* + * CUDNN ReduceTensor indices type + */ +typedef enum { + CUDNN_REDUCE_TENSOR_NO_INDICES = 0, + CUDNN_REDUCE_TENSOR_FLATTENED_INDICES = 1, +} cudnnReduceTensorIndices_t CUDNN_DEPRECATED; + +/* + * CUDNN tensor indices type size (all unsigned) + * Currently not supported, default is 32 bit unsigned. + */ +typedef enum { + CUDNN_32BIT_INDICES = 0, + CUDNN_64BIT_INDICES = 1, + CUDNN_16BIT_INDICES = 2, + CUDNN_8BIT_INDICES = 3, +} cudnnIndicesType_t CUDNN_DEPRECATED; + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateReduceTensorDescriptor(cudnnReduceTensorDescriptor_t *reduceTensorDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc, + cudnnReduceTensorOp_t reduceTensorOp, + cudnnDataType_t reduceTensorCompType, + cudnnNanPropagation_t reduceTensorNanOpt, + cudnnReduceTensorIndices_t reduceTensorIndices, + cudnnIndicesType_t reduceTensorIndicesType); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetReduceTensorDescriptor(const cudnnReduceTensorDescriptor_t reduceTensorDesc, + cudnnReduceTensorOp_t *reduceTensorOp, + cudnnDataType_t *reduceTensorCompType, + cudnnNanPropagation_t *reduceTensorNanOpt, + cudnnReduceTensorIndices_t *reduceTensorIndices, + cudnnIndicesType_t *reduceTensorIndicesType); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc); + +/* Helper function to return the minimum size of the index space to be passed to the reduction given the input and + * output tensors */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetReductionIndicesSize(cudnnHandle_t handle, + const cudnnReduceTensorDescriptor_t reduceTensorDesc, + const cudnnTensorDescriptor_t aDesc, + const cudnnTensorDescriptor_t cDesc, + size_t *sizeInBytes); + +/* Helper function to return the minimum size of the workspace to be passed to the reduction given the input and output + * tensors */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetReductionWorkspaceSize(cudnnHandle_t handle, + const cudnnReduceTensorDescriptor_t reduceTensorDesc, + const cudnnTensorDescriptor_t aDesc, + const cudnnTensorDescriptor_t cDesc, + size_t *sizeInBytes); + +/* Tensor operation : C = reduce op( alpha * A ) + beta * C */ +/* The NaN propagation enum applies to only the min and max reduce ops; the other reduce ops propagate NaN as usual. */ +/* The indices space is ignored for reduce ops other than min or max. 
*/ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnReduceTensor(cudnnHandle_t handle, + const cudnnReduceTensorDescriptor_t reduceTensorDesc, + void *indices, + size_t indicesSizeInBytes, + void *workspace, + size_t workspaceSizeInBytes, + const void *alpha, + const cudnnTensorDescriptor_t aDesc, + const void *A, + const void *beta, + const cudnnTensorDescriptor_t cDesc, + void *C); + +/* Set all values of a tensor to a given value : y[i] = value[0] */ +cudnnStatus_t CUDNNWINAPI +cudnnSetTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *valuePtr); + +/* Scale all values of a tensor by a given factor : y[i] = alpha * y[i] */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnScaleTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *alpha); + +/* Create an instance of FilterStruct */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t dataType, /* image data type */ + cudnnTensorFormat_t format, + int k, /* number of output feature maps */ + int c, /* number of input feature maps */ + int h, /* height of each input filter */ + int w); /* width of each input filter */ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetFilter4dDescriptor(const cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t *dataType, /* image data type */ + cudnnTensorFormat_t *format, + int *k, /* number of output feature maps */ + int *c, /* number of input feature maps */ + int *h, /* height of each input filter */ + int *w); /* width of each input filter */ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetFilterNdDescriptor(cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t dataType, /* image data type */ + cudnnTensorFormat_t format, + int nbDims, + const int filterDimA[]); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetFilterNdDescriptor(const cudnnFilterDescriptor_t filterDesc, + int nbDimsRequested, + cudnnDataType_t *dataType, /* image data type */ + cudnnTensorFormat_t *format, + int *nbDims, + int filterDimA[]); +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetFilterSizeInBytes(const cudnnFilterDescriptor_t filterDesc, size_t *size); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnTransformFilter(cudnnHandle_t handle, + const cudnnTensorTransformDescriptor_t transDesc, + const void *alpha, + const cudnnFilterDescriptor_t srcDesc, + const void *srcData, + const void *beta, + const cudnnFilterDescriptor_t destDesc, + void *destData); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc); + +/* + * softmax algorithm + */ +typedef enum { + CUDNN_SOFTMAX_FAST = 0, /* straightforward implementation */ + CUDNN_SOFTMAX_ACCURATE = 1, /* subtract max from every point to avoid overflow */ + CUDNN_SOFTMAX_LOG = 2 +} cudnnSoftmaxAlgorithm_t; + +typedef enum { + CUDNN_SOFTMAX_MODE_INSTANCE = 0, /* compute the softmax over all C, H, W for each N */ + CUDNN_SOFTMAX_MODE_CHANNEL = 1 /* compute the softmax over all C for each H, W, N */ +} cudnnSoftmaxMode_t; + +/* Softmax functions: All of the form "output = alpha * Op(inputs) + beta * output" */ + +/* Function to perform forward softmax */ +cudnnStatus_t CUDNNWINAPI +cudnnSoftmaxForward(cudnnHandle_t handle, + cudnnSoftmaxAlgorithm_t algo, + cudnnSoftmaxMode_t mode, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const 
void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y); + +/* + * pooling mode + */ +typedef enum { + CUDNN_POOLING_MAX = 0, + CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING = 1, /* count for average includes padded values */ + CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING = 2, /* count for average does not include padded values */ + CUDNN_POOLING_MAX_DETERMINISTIC = 3 +} cudnnPoolingMode_t CUDNN_DEPRECATED; + +/* Create an instance of pooling descriptor */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetPooling2dDescriptor(cudnnPoolingDescriptor_t poolingDesc, + cudnnPoolingMode_t mode, + cudnnNanPropagation_t maxpoolingNanOpt, + int windowHeight, + int windowWidth, + int verticalPadding, + int horizontalPadding, + int verticalStride, + int horizontalStride); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetPooling2dDescriptor(const cudnnPoolingDescriptor_t poolingDesc, + cudnnPoolingMode_t *mode, + cudnnNanPropagation_t *maxpoolingNanOpt, + int *windowHeight, + int *windowWidth, + int *verticalPadding, + int *horizontalPadding, + int *verticalStride, + int *horizontalStride); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetPoolingNdDescriptor(cudnnPoolingDescriptor_t poolingDesc, + const cudnnPoolingMode_t mode, + const cudnnNanPropagation_t maxpoolingNanOpt, + int nbDims, + const int windowDimA[], + const int paddingA[], + const int strideA[]); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetPoolingNdDescriptor(const cudnnPoolingDescriptor_t poolingDesc, + int nbDimsRequested, + cudnnPoolingMode_t *mode, + cudnnNanPropagation_t *maxpoolingNanOpt, + int *nbDims, + int windowDimA[], + int paddingA[], + int strideA[]); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + int nbDims, + int outputTensorDimA[]); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + int *n, + int *c, + int *h, + int *w); + +/* Destroy an instance of pooling descriptor */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyPoolingDescriptor(cudnnPoolingDescriptor_t poolingDesc); + +/* Pooling functions: All of the form "output = alpha * Op(inputs) + beta * output" */ + +/* Function to perform forward pooling */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnPoolingForward(cudnnHandle_t handle, + const cudnnPoolingDescriptor_t poolingDesc, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y); + +/* Activation functions: All of the form "output = alpha * Op(inputs) + beta * output" */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t *activationDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetActivationDescriptor(cudnnActivationDescriptor_t activationDesc, + cudnnActivationMode_t mode, + cudnnNanPropagation_t reluNanOpt, + double coef); /* ceiling for clipped RELU, alpha for ELU */ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t activationDesc, + cudnnActivationMode_t *mode, + cudnnNanPropagation_t *reluNanOpt, + double *coef); /* ceiling for clipped RELU, alpha for ELU */ + 
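Editorial aside (not part of the vendored header above): the legacy activation path declared in this hunk — cudnnCreateActivationDescriptor / cudnnSetActivationDescriptor / cudnnActivationForward — is marked CUDNN_DEPRECATED in the 9.x headers but remains callable, and it follows the "output = alpha * Op(inputs) + beta * output" convention stated in the comments. The sketch below shows one plausible way these entry points chain together for a ReLU over an NCHW float tensor; the tensor shape, buffer names, and the omission of status checking are illustrative assumptions, not anything taken from this diff.

#include <cudnn.h>
#include <cuda_runtime.h>
#include <stdio.h>

int main(void) {
    const int n = 1, c = 3, h = 8, w = 8;              /* illustrative NCHW shape */
    const size_t bytes = (size_t)n * c * h * w * sizeof(float);

    cudnnHandle_t handle;
    cudnnCreate(&handle);

    /* Describe x and y with the same 4D descriptor (shapes match for activation). */
    cudnnTensorDescriptor_t xDesc;
    cudnnCreateTensorDescriptor(&xDesc);
    cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w);

    /* Deprecated-but-working descriptor-based activation: ReLU, coef unused for ReLU. */
    cudnnActivationDescriptor_t actDesc;
    cudnnCreateActivationDescriptor(&actDesc);
    cudnnSetActivationDescriptor(actDesc, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0.0);

    float *x = NULL, *y = NULL;                        /* device buffers, contents left uninitialized here */
    cudaMalloc((void **)&x, bytes);
    cudaMalloc((void **)&y, bytes);

    const float alpha = 1.0f, beta = 0.0f;             /* y = alpha * relu(x) + beta * y */
    cudnnStatus_t st = cudnnActivationForward(handle, actDesc, &alpha, xDesc, x, &beta, xDesc, y);
    printf("cudnnActivationForward: %s\n", cudnnGetErrorString(st));

    cudaFree(x);
    cudaFree(y);
    cudnnDestroyActivationDescriptor(actDesc);
    cudnnDestroyTensorDescriptor(xDesc);
    cudnnDestroy(handle);
    return 0;
}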
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double swish_beta); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double *swish_beta); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyActivationDescriptor(cudnnActivationDescriptor_t activationDesc); + +/* Function to perform forward activation */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnActivationForward(cudnnHandle_t handle, + cudnnActivationDescriptor_t activationDesc, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y); + +/* + * Create an instance of LRN (Local Response Normalization) descriptor + * Uses lrnN=5, lrnAlpha=1e-4, lrnBeta=0.75, lrnK=2.0 as defaults from Krizhevsky'12 ImageNet paper + */ +cudnnStatus_t CUDNNWINAPI +cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t *normDesc); + +#define CUDNN_LRN_MIN_N 1 /* minimum allowed lrnN */ +#define CUDNN_LRN_MAX_N 16 /* maximum allowed lrnN */ +#define CUDNN_LRN_MIN_K 1e-5 /* minimum allowed lrnK */ +#define CUDNN_LRN_MIN_BETA 0.01 /* minimum allowed lrnBeta */ + +/* LRN layer mode */ +typedef enum { + CUDNN_LRN_CROSS_CHANNEL_DIM1 = 0, /* Normalize across tensor's dimA[1] dimension */ +} cudnnLRNMode_t; + +/* + * Uses a window [center-lookBehind, center+lookAhead], where + * lookBehind = floor( (lrnN-1)/2 ), lookAhead = lrnN-lookBehind-1. + * Values of double parameters cast to tensor data type. + */ +cudnnStatus_t CUDNNWINAPI +cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned lrnN, double lrnAlpha, double lrnBeta, double lrnK); +/* + * Retrieve the settings currently stored in an LRN layer descriptor + * Any of the provided pointers can be NULL (no corresponding value will be returned) + */ +cudnnStatus_t CUDNNWINAPI +cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned *lrnN, double *lrnAlpha, double *lrnBeta, double *lrnK); + +/* Destroy an instance of LRN descriptor */ +cudnnStatus_t CUDNNWINAPI +cudnnDestroyLRNDescriptor(cudnnLRNDescriptor_t lrnDesc); + +/* LRN functions: output = alpha * normalize(x) + beta * old_y */ + +/* LRN cross-channel forward computation. Double parameters cast to tensor data type */ +cudnnStatus_t CUDNNWINAPI +cudnnLRNCrossChannelForward(cudnnHandle_t handle, + cudnnLRNDescriptor_t normDesc, + cudnnLRNMode_t lrnMode, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y); + +typedef enum { + CUDNN_DIVNORM_PRECOMPUTED_MEANS = 0, +} cudnnDivNormMode_t; + +/* LCN/divisive normalization functions: y = alpha * normalize(x) + beta * y */ +cudnnStatus_t CUDNNWINAPI +cudnnDivisiveNormalizationForward(cudnnHandle_t handle, + cudnnLRNDescriptor_t normDesc, + cudnnDivNormMode_t mode, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, /* same desc for means, temp, temp2 */ + const void *x, + const void *means, /* if NULL, means are assumed to be zero */ + void *temp, + void *temp2, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y); + +typedef enum { + /* bnScale, bnBias tensor dims are 1xCxHxWx.. 
(one value per CHW...-slice, normalized over N slice) */ + CUDNN_BATCHNORM_PER_ACTIVATION = 0, + + /* bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors) */ + CUDNN_BATCHNORM_SPATIAL = 1, + + /* + * bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors). + * May be faster than CUDNN_BATCHNORM_SPATIAL but imposes some limits on the range of values + */ + CUDNN_BATCHNORM_SPATIAL_PERSISTENT = 2, +} cudnnBatchNormMode_t CUDNN_DEPRECATED; + +#define CUDNN_BN_MIN_EPSILON 0.0 /* Minimum epsilon allowed to be used in the Batch Normalization formula */ + +/* + * Derives a tensor descriptor from layer data descriptor for BatchNormalization + * scale, invVariance, bnBias, bnScale tensors. Use this tensor desc for + * bnScaleBiasMeanVarDesc and bnScaleBiasDiffDesc in Batch Normalization forward and backward functions. + */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDeriveBNTensorDescriptor(cudnnTensorDescriptor_t derivedBnDesc, + const cudnnTensorDescriptor_t xDesc, + cudnnBatchNormMode_t mode); + +typedef enum { + CUDNN_BATCHNORM_OPS_BN = 0, /* do batch normalization only */ + CUDNN_BATCHNORM_OPS_BN_ACTIVATION = 1, /* do batchNorm, then activation */ + CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION = 2, /* do batchNorm, then elemWiseAdd, then activation */ +} cudnnBatchNormOps_t CUDNN_DEPRECATED; + +/* + * Performs Batch Normalization during Inference: + * y[i] = bnScale[k]*(x[i]-estimatedMean[k])/sqrt(epsilon+estimatedVariance[k]) + bnBias[k] + * with bnScale, bnBias, runningMean, runningInvVariance tensors indexed + * according to spatial or per-activation mode. Refer to cudnnBatchNormalizationForwardTraining + * above for notes on function arguments. + */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnBatchNormalizationForwardInference(cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + const void *alpha, /* alpha[0] = result blend factor */ + const void *beta, /* beta[0] = dest layer blend factor */ + const cudnnTensorDescriptor_t xDesc, + const void *x, /* NxCxHxW */ + const cudnnTensorDescriptor_t yDesc, + void *y, /* NxCxHxW */ + const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, + const void *bnScale, + const void *bnBias, + const void *estimatedMean, + const void *estimatedVariance, + double epsilon); + +typedef enum { + /* bnScale, bnBias tensor dims are 1xCxHxWx.. (one value per CHW...-slice, normalized over N slice) */ + CUDNN_NORM_PER_ACTIVATION = 0, + + /* bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors) */ + CUDNN_NORM_PER_CHANNEL = 1, +} cudnnNormMode_t CUDNN_DEPRECATED; + +typedef enum { CUDNN_NORM_ALGO_STANDARD = 0, CUDNN_NORM_ALGO_PERSIST = 1 } cudnnNormAlgo_t CUDNN_DEPRECATED; + +/* + * Derives a tensor descriptor from layer data descriptor for Normalization + * scale, invVariance, bnBias, bnScale tensors. Use this tensor desc for + * normScaleBiasMeanVarDesc and normScaleBiasDiffDesc in Normalization forward and backward functions. 
+ */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDeriveNormTensorDescriptor(cudnnTensorDescriptor_t derivedNormScaleBiasDesc, + cudnnTensorDescriptor_t derivedNormMeanVarDesc, + const cudnnTensorDescriptor_t xDesc, + cudnnNormMode_t mode, + int groupCnt); /* Place hold for future work, should be set to 1 now*/ + +typedef enum { + CUDNN_NORM_OPS_NORM = 0, /* do normalization only */ + CUDNN_NORM_OPS_NORM_ACTIVATION = 1, /* do Norm, then activation */ + CUDNN_NORM_OPS_NORM_ADD_ACTIVATION = 2, /* do Norm, then elemWiseAdd, then activation */ +} cudnnNormOps_t CUDNN_DEPRECATED; + +/* + * Performs Normalization during Inference: + * y[i] = normScale[k]*(x[i]-estimatedMean[k])/sqrt(epsilon+estimatedVariance[k]) + normBias[k] + * with normScale, normBias, runningMean, runningInvVariance tensors indexed + * according to per-channel or per-activation mode. Refer to cudnnNormalizationForwardTraining + * above for notes on function arguments. + */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnNormalizationForwardInference(cudnnHandle_t handle, + cudnnNormMode_t mode, + cudnnNormOps_t normOps, + cudnnNormAlgo_t algo, + const void *alpha, /* alpha[0] = result blend factor */ + const void *beta, /* beta[0] = dest layer blend factor */ + const cudnnTensorDescriptor_t xDesc, + const void *x, /* NxCxHxW */ + const cudnnTensorDescriptor_t normScaleBiasDesc, + const void *normScale, + const void *normBias, + const cudnnTensorDescriptor_t normMeanVarDesc, + const void *estimatedMean, + const void *estimatedVariance, + const cudnnTensorDescriptor_t zDesc, + const void *z, + cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t yDesc, + void *y, /* NxCxHxW */ + double epsilon, + int groupCnt); /* Place hold for future work*/ + +/* APIs for spatial transformer network*/ +typedef enum { + CUDNN_SAMPLER_BILINEAR = 0, +} cudnnSamplerType_t; + +cudnnStatus_t CUDNNWINAPI +cudnnCreateSpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t *stDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnSetSpatialTransformerNdDescriptor(cudnnSpatialTransformerDescriptor_t stDesc, + cudnnSamplerType_t samplerType, + cudnnDataType_t dataType, + const int nbDims, + const int dimA[]); + +cudnnStatus_t CUDNNWINAPI +cudnnDestroySpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t stDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnSpatialTfGridGeneratorForward(cudnnHandle_t handle, + const cudnnSpatialTransformerDescriptor_t stDesc, + const void *theta, + void *grid); + +cudnnStatus_t CUDNNWINAPI +cudnnSpatialTfSamplerForward(cudnnHandle_t handle, + cudnnSpatialTransformerDescriptor_t stDesc, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *grid, + const void *beta, + cudnnTensorDescriptor_t yDesc, + void *y); + +typedef struct cudnnDropoutStruct *cudnnDropoutDescriptor_t; + +cudnnStatus_t CUDNNWINAPI +cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t *dropoutDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc); + +/*helper function to determine size of the states to be passed to cudnnSetDropoutDescriptor */ +cudnnStatus_t CUDNNWINAPI +cudnnDropoutGetStatesSize(cudnnHandle_t handle, size_t *sizeInBytes); + +/*helper function to determine size of the reserve space to be passed to dropout forward/backward calls */ +cudnnStatus_t CUDNNWINAPI +cudnnDropoutGetReserveSpaceSize(cudnnTensorDescriptor_t xdesc, size_t *sizeInBytes); + +cudnnStatus_t CUDNNWINAPI +cudnnSetDropoutDescriptor(cudnnDropoutDescriptor_t 
dropoutDesc, + cudnnHandle_t handle, + float dropout, + void *states, + size_t stateSizeInBytes, + unsigned long long seed); + +/* Restores the dropout descriptor to a previously saved-off state */ +cudnnStatus_t CUDNNWINAPI +cudnnRestoreDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, + cudnnHandle_t handle, + float dropout, + void *states, + size_t stateSizeInBytes, + unsigned long long seed); + +cudnnStatus_t CUDNNWINAPI +cudnnGetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, + cudnnHandle_t handle, + float *dropout, + void **states, + unsigned long long *seed); + +cudnnStatus_t CUDNNWINAPI +cudnnDropoutForward(cudnnHandle_t handle, + const cudnnDropoutDescriptor_t dropoutDesc, + const cudnnTensorDescriptor_t xdesc, + const void *x, + const cudnnTensorDescriptor_t ydesc, + void *y, + void *reserveSpace, + size_t reserveSpaceSizeInBytes); + +/* TODO: move these enums out to the appropriate submodule */ +typedef enum { + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM = 0, + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM = 1, + CUDNN_CONVOLUTION_FWD_ALGO_GEMM = 2, + CUDNN_CONVOLUTION_FWD_ALGO_DIRECT = 3, + CUDNN_CONVOLUTION_FWD_ALGO_FFT = 4, + CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING = 5, + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD = 6, + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED = 7, + CUDNN_CONVOLUTION_FWD_ALGO_COUNT = 8 +} cudnnConvolutionFwdAlgo_t; + +typedef enum { + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 = 0, /* non-deterministic */ + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 = 1, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT = 2, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3 = 3, /* non-deterministic */ + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD = 4, /* not implemented */ + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED = 5, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING = 6, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT = 7 +} cudnnConvolutionBwdFilterAlgo_t; + +typedef enum { + CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 = 0, /* non-deterministic */ + CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 = 1, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT = 2, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING = 3, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD = 4, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED = 5, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT = 6 +} cudnnConvolutionBwdDataAlgo_t; + +typedef enum { CUDNN_CTC_LOSS_ALGO_DETERMINISTIC = 0, CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC = 1 } cudnnCTCLossAlgo_t; + +/* + * \brief Cross-library version checker. + * This function is implemented differently in each sub-library. Each sublib + * checks whether its own version matches that of its dependencies. + * \returns CUDNN_STATUS_SUCCESS if the version check passes, + * CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH if the versions are inconsistent. 
+ */ +cudnnStatus_t CUDNNWINAPI +cudnnOpsVersionCheck(void); + +/* Function to perform backward softmax */ +cudnnStatus_t CUDNNWINAPI +cudnnSoftmaxBackward(cudnnHandle_t handle, + cudnnSoftmaxAlgorithm_t algo, + cudnnSoftmaxMode_t mode, + const void *alpha, + const cudnnTensorDescriptor_t yDesc, + const void *y, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx); + +/* Function to perform backward pooling */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnPoolingBackward(cudnnHandle_t handle, + const cudnnPoolingDescriptor_t poolingDesc, + const void *alpha, + const cudnnTensorDescriptor_t yDesc, + const void *y, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx); + +/* Function to perform backward activation */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnActivationBackward(cudnnHandle_t handle, + cudnnActivationDescriptor_t activationDesc, + const void *alpha, + const cudnnTensorDescriptor_t yDesc, + const void *y, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx); + +/* LRN cross-channel backward computation. Double parameters cast to tensor data type */ +cudnnStatus_t CUDNNWINAPI +cudnnLRNCrossChannelBackward(cudnnHandle_t handle, + cudnnLRNDescriptor_t normDesc, + cudnnLRNMode_t lrnMode, + const void *alpha, + const cudnnTensorDescriptor_t yDesc, + const void *y, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx); + +cudnnStatus_t CUDNNWINAPI +cudnnDivisiveNormalizationBackward(cudnnHandle_t handle, + cudnnLRNDescriptor_t normDesc, + cudnnDivNormMode_t mode, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, /* same desc for x, means, dy, temp, temp2 */ + const void *x, + const void *means, /* if NULL, means are assumed to be zero */ + const void *dy, + void *temp, + void *temp2, + const void *beta, + const cudnnTensorDescriptor_t dXdMeansDesc, /* same desc for dx, dMeans */ + void *dx, /* output x differential */ + void *dMeans); /* output means differential, can be NULL */ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + cudnnBatchNormOps_t bnOps, + const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t zDesc, + const cudnnTensorDescriptor_t yDesc, + const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, + const cudnnActivationDescriptor_t activationDesc, + size_t *sizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetBatchNormalizationBackwardExWorkspaceSize(cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + cudnnBatchNormOps_t bnOps, + const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t yDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnTensorDescriptor_t dzDesc, + const cudnnTensorDescriptor_t dxDesc, + const cudnnTensorDescriptor_t dBnScaleBiasDesc, + const cudnnActivationDescriptor_t activationDesc, + size_t *sizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetBatchNormalizationTrainingExReserveSpaceSize(cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + cudnnBatchNormOps_t bnOps, + const 
cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t xDesc, + size_t *sizeInBytes); + +/* Computes y = BN(x). Also accumulates moving averages of mean and inverse variances */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnBatchNormalizationForwardTraining( + cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + + const void *alpha, /* alpha[0] = result blend factor */ + const void *beta, /* beta[0] = dest layer blend factor */ + + const cudnnTensorDescriptor_t xDesc, + const void *x, /* NxCxHxW */ + const cudnnTensorDescriptor_t yDesc, + void *y, /* NxCxHxW */ + + /* Shared desc for the next 6 tensors in the argument list. + Data type to be set as follows: + type = (typeOf(x) == double) ? double : float + Dimensions for this descriptor depend on normalization mode + - Spatial Normalization : tensors are expected to have dims 1xCx1x1 + (normalization is performed across NxHxW) + - Per-Activation Normalization : tensors are expected to have dims of 1xCxHxW + (normalization is performed across N) */ + const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, + + /* 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation */ + const void *bnScale, + const void *bnBias, + + /* MUST use factor=1 in the very first call of a complete training cycle. + Use a factor=1/(1+n) at N-th call to the function to get + Cumulative Moving Average (CMA) behavior + CMA[n] = (x[1]+...+x[n])/n + Since CMA[n+1] = (n*CMA[n]+x[n+1])/(n+1) = + ((n+1)*CMA[n]-CMA[n])/(n+1) + x[n+1]/(n+1) = + CMA[n]*(1-1/(n+1)) + x[n+1]*1/(n+1) */ + double exponentialAverageFactor, + + /* Used in Training phase only. + runningMean = newMean*factor + runningMean*(1-factor) */ + void *resultRunningMean, + /* Output in training mode, input in inference. Is the moving average + of variance[x] (factor is applied in the same way as for runningMean) */ + void *resultRunningVariance, + + /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */ + double epsilon, + + /* Optionally save intermediate results from the forward pass here + - can be reused to speed up backward pass. NULL if unused */ + void *resultSaveMean, + void *resultSaveInvVariance); + +/* Computes y = relu(BN(x) + z). Also accumulates moving averages of mean and inverse variances */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnBatchNormalizationForwardTrainingEx( + cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + cudnnBatchNormOps_t bnOps, + + const void *alpha, /* alpha[0] = result blend factor */ + const void *beta, /* beta[0] = dest layer blend factor */ + + const cudnnTensorDescriptor_t xDesc, + const void *xData, + const cudnnTensorDescriptor_t zDesc, + const void *zData, + const cudnnTensorDescriptor_t yDesc, + void *yData, + + const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, + const void *bnScale, + const void *bnBias, + + double exponentialAverageFactor, + void *resultRunningMean, + void *resultRunningVariance, + + /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */ + double epsilon, + + /* Optionally save intermediate results from the forward pass here + - can be reused to speed up backward pass. NULL if unused */ + void *resultSaveMean, + void *resultSaveInvVariance, + + cudnnActivationDescriptor_t activationDesc, + void *workspace, + size_t workSpaceSizeInBytes, + void *reserveSpace, + size_t reserveSpaceSizeInBytes); + +/* Performs backward pass of Batch Normalization layer. 
Returns x gradient, + * bnScale gradient and bnBias gradient */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnBatchNormalizationBackward(cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + const void *alphaDataDiff, + const void *betaDataDiff, + const void *alphaParamDiff, + const void *betaParamDiff, + const cudnnTensorDescriptor_t xDesc, /* same desc for x, dx, dy */ + const void *x, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnTensorDescriptor_t dxDesc, + void *dx, + /* Shared tensor desc for the 4 tensors below */ + const cudnnTensorDescriptor_t dBnScaleBiasDesc, + const void *bnScale, /* bnBias doesn't affect backpropagation */ + /* scale and bias diff are not backpropagated below this layer */ + void *dBnScaleResult, + void *dBnBiasResult, + /* Same epsilon as forward pass */ + double epsilon, + + /* Optionally cached intermediate results from + forward pass */ + const void *savedMean, + const void *savedInvVariance); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnBatchNormalizationBackwardEx(cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + cudnnBatchNormOps_t bnOps, + + const void *alphaDataDiff, + const void *betaDataDiff, + const void *alphaParamDiff, + const void *betaParamDiff, + const cudnnTensorDescriptor_t xDesc, + const void *xData, + const cudnnTensorDescriptor_t yDesc, + const void *yData, + const cudnnTensorDescriptor_t dyDesc, + const void *dyData, + const cudnnTensorDescriptor_t dzDesc, + void *dzData, + const cudnnTensorDescriptor_t dxDesc, + void *dxData, + + /* Shared tensor desc for the 4 tensors below */ + const cudnnTensorDescriptor_t dBnScaleBiasDesc, + const void *bnScaleData, + const void *bnBiasData, /* needed if there is activation */ + void *dBnScaleData, + void *dBnBiasData, + double epsilon, /* Same epsilon as forward pass */ + + /* Optionally cached intermediate results from + forward pass */ + const void *savedMean, + const void *savedInvVariance, + cudnnActivationDescriptor_t activationDesc, + void *workSpace, + size_t workSpaceSizeInBytes, + void *reserveSpace, + size_t reserveSpaceSizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetNormalizationForwardTrainingWorkspaceSize(cudnnHandle_t handle, + cudnnNormMode_t mode, + cudnnNormOps_t normOps, + cudnnNormAlgo_t algo, + const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t zDesc, + const cudnnTensorDescriptor_t yDesc, + const cudnnTensorDescriptor_t normScaleBiasDesc, + const cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t normMeanVarDesc, + size_t *sizeInBytes, + int groupCnt); /* Place hold for future work, should be set to 1 now*/ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetNormalizationBackwardWorkspaceSize(cudnnHandle_t handle, + cudnnNormMode_t mode, + cudnnNormOps_t normOps, + cudnnNormAlgo_t algo, + const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t yDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnTensorDescriptor_t dzDesc, + const cudnnTensorDescriptor_t dxDesc, + const cudnnTensorDescriptor_t dNormScaleBiasDesc, + const cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t normMeanVarDesc, + size_t *sizeInBytes, + int groupCnt); /* Place hold for future work, should be set to 1 now*/ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetNormalizationTrainingReserveSpaceSize(cudnnHandle_t handle, + cudnnNormMode_t mode, + cudnnNormOps_t normOps, + cudnnNormAlgo_t algo, + const cudnnActivationDescriptor_t activationDesc, + const 
cudnnTensorDescriptor_t xDesc, + size_t *sizeInBytes, + int groupCnt); /* Place hold for future work, should be set to 1 now*/ + +/* Computes y = relu(Norm(x) + z). Also accumulates moving averages of mean and inverse variances */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnNormalizationForwardTraining(cudnnHandle_t handle, + cudnnNormMode_t mode, + cudnnNormOps_t normOps, + cudnnNormAlgo_t algo, + const void *alpha, /* alpha[0] = result blend factor */ + const void *beta, /* beta[0] = dest layer blend factor */ + const cudnnTensorDescriptor_t xDesc, + const void *xData, + const cudnnTensorDescriptor_t normScaleBiasDesc, + const void *normScale, + const void *normBias, + double exponentialAverageFactor, + const cudnnTensorDescriptor_t normMeanVarDesc, + void *resultRunningMean, + void *resultRunningVariance, + /* Has to be >= 0. Should be the same in forward and backward functions. */ + double epsilon, + /* Optionally save intermediate results from the forward pass here + - can be reused to speed up backward pass. NULL if unused */ + void *resultSaveMean, + void *resultSaveInvVariance, + cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t zDesc, + const void *zData, + const cudnnTensorDescriptor_t yDesc, + void *yData, + void *workspace, + size_t workSpaceSizeInBytes, + void *reserveSpace, + size_t reserveSpaceSizeInBytes, + int groupCnt); /* Place hold for future work, should be set to 1 now*/ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnNormalizationBackward(cudnnHandle_t handle, + cudnnNormMode_t mode, + cudnnNormOps_t normOps, + cudnnNormAlgo_t algo, + const void *alphaDataDiff, + const void *betaDataDiff, + const void *alphaParamDiff, + const void *betaParamDiff, + const cudnnTensorDescriptor_t xDesc, + const void *xData, + const cudnnTensorDescriptor_t yDesc, + const void *yData, + const cudnnTensorDescriptor_t dyDesc, + const void *dyData, + const cudnnTensorDescriptor_t dzDesc, + void *dzData, + const cudnnTensorDescriptor_t dxDesc, + void *dxData, + /* Shared tensor desc for the 4 tensors below */ + const cudnnTensorDescriptor_t dNormScaleBiasDesc, + const void *normScaleData, + const void *normBiasData, /* needed if there is activation */ + void *dNormScaleData, + void *dNormBiasData, + double epsilon, /* Same epsilon as forward pass */ + const cudnnTensorDescriptor_t normMeanVarDesc, + /* Optionally cached intermediate results from + forward pass */ + const void *savedMean, + const void *savedInvVariance, + cudnnActivationDescriptor_t activationDesc, + void *workSpace, + size_t workSpaceSizeInBytes, + void *reserveSpace, + size_t reserveSpaceSizeInBytes, + int groupCnt); /* Place hold for future work, should be set to 1 now*/ + +cudnnStatus_t CUDNNWINAPI +cudnnSpatialTfGridGeneratorBackward(cudnnHandle_t handle, + const cudnnSpatialTransformerDescriptor_t stDesc, + const void *dgrid, + void *dtheta); + +cudnnStatus_t CUDNNWINAPI +cudnnSpatialTfSamplerBackward(cudnnHandle_t handle, + cudnnSpatialTransformerDescriptor_t stDesc, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx, + const void *alphaDgrid, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const void *grid, + const void *betaDgrid, + void *dgrid); + +cudnnStatus_t CUDNNWINAPI +cudnnDropoutBackward(cudnnHandle_t handle, + const cudnnDropoutDescriptor_t dropoutDesc, + const cudnnTensorDescriptor_t dydesc, + const void *dy, + const cudnnTensorDescriptor_t dxdesc, + void *dx, + 
void *reserveSpace, + size_t reserveSpaceSizeInBytes); + +#if defined(__cplusplus) +} +#endif + +#endif /* CUDNN_OPS_H_ */ diff --git a/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_ops_v9.h b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_ops_v9.h new file mode 100644 index 0000000000000000000000000000000000000000..471a0e59d67228ab8a74159517418f217ab86324 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_ops_v9.h @@ -0,0 +1,1316 @@ +/* + * Copyright 2014-2023 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/* + * cudnn_ops : cuDNN's basic definitions and basic operations. + */ + +#if !defined(CUDNN_OPS_H_) +#define CUDNN_OPS_H_ + +#include + +#include "cudnn_version.h" +#include "cudnn_graph.h" + +/* These version numbers are autogenerated, do not edit manually. 
*/ +#define CUDNN_OPS_MAJOR 9 +#define CUDNN_OPS_MINOR 10 +#define CUDNN_OPS_PATCH 2 + +#if (CUDNN_OPS_MAJOR != CUDNN_MAJOR) || (CUDNN_OPS_MINOR != CUDNN_MINOR) || (CUDNN_OPS_PATCH != CUDNN_PATCHLEVEL) +#error Version mismatch in cuDNN OPS INFER!!! +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +/* Data structures to represent Image/Filter and the Neural Network Layer */ +typedef struct cudnnTensorStruct *cudnnTensorDescriptor_t; +typedef struct cudnnPoolingStruct *cudnnPoolingDescriptor_t CUDNN_DEPRECATED; +typedef struct cudnnFilterStruct *cudnnFilterDescriptor_t CUDNN_DEPRECATED; +typedef struct cudnnLRNStruct *cudnnLRNDescriptor_t; +typedef struct cudnnActivationStruct *cudnnActivationDescriptor_t CUDNN_DEPRECATED; +typedef struct cudnnSpatialTransformerStruct *cudnnSpatialTransformerDescriptor_t; +typedef struct cudnnOpTensorStruct *cudnnOpTensorDescriptor_t CUDNN_DEPRECATED; +typedef struct cudnnReduceTensorStruct *cudnnReduceTensorDescriptor_t CUDNN_DEPRECATED; +typedef struct cudnnCTCLossStruct *cudnnCTCLossDescriptor_t; +typedef struct cudnnTensorTransformStruct *cudnnTensorTransformDescriptor_t CUDNN_DEPRECATED; +/* + * CUDNN Determinism + */ +typedef enum { + CUDNN_NON_DETERMINISTIC = 0, + CUDNN_DETERMINISTIC = 1, +} cudnnDeterminism_t; + +/* Create an instance of a generic Tensor descriptor */ +cudnnStatus_t CUDNNWINAPI +cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnSetTensor4dDescriptor(cudnnTensorDescriptor_t tensorDesc, + cudnnTensorFormat_t format, + cudnnDataType_t dataType, /* image data type */ + int n, /* number of inputs (batch size) */ + int c, /* number of input feature maps */ + int h, /* height of input section */ + int w); /* width of input section */ + +cudnnStatus_t CUDNNWINAPI +cudnnSetTensor4dDescriptorEx(cudnnTensorDescriptor_t tensorDesc, + cudnnDataType_t dataType, /* image data type */ + int n, /* number of inputs (batch size) */ + int c, /* number of input feature maps */ + int h, /* height of input section */ + int w, /* width of input section */ + int nStride, + int cStride, + int hStride, + int wStride); + +cudnnStatus_t CUDNNWINAPI +cudnnGetTensor4dDescriptor(const cudnnTensorDescriptor_t tensorDesc, + cudnnDataType_t *dataType, /* image data type */ + int *n, /* number of inputs (batch size) */ + int *c, /* number of input feature maps */ + int *h, /* height of input section */ + int *w, /* width of input section */ + int *nStride, + int *cStride, + int *hStride, + int *wStride); + +cudnnStatus_t CUDNNWINAPI +cudnnSetTensorNdDescriptor(cudnnTensorDescriptor_t tensorDesc, + cudnnDataType_t dataType, + int nbDims, + const int dimA[], + const int strideA[]); + +cudnnStatus_t CUDNNWINAPI +cudnnSetTensorNdDescriptorEx(cudnnTensorDescriptor_t tensorDesc, + cudnnTensorFormat_t format, + cudnnDataType_t dataType, + int nbDims, + const int dimA[]); + +cudnnStatus_t CUDNNWINAPI +cudnnGetTensorNdDescriptor(const cudnnTensorDescriptor_t tensorDesc, + int nbDimsRequested, + cudnnDataType_t *dataType, + int *nbDims, + int dimA[], + int strideA[]); + +cudnnStatus_t CUDNNWINAPI +cudnnGetTensorSizeInBytes(const cudnnTensorDescriptor_t tensorDesc, size_t *size); + +/* PixelOffset( n, c, h, w ) = n *input_stride + c * feature_stride + h * h_stride + w * w_stride + + 1)Example of all images in row major order one batch of features after the other (with an optional padding on row) + input_stride : c x h x h_stride + feature_stride : h x h_stride + h_stride : >= w ( h_stride = w if no padding) + w_stride 
: 1 + + + 2)Example of all images in row major with features maps interleaved + input_stride : c x h x h_stride + feature_stride : 1 + h_stride : w x c + w_stride : c + + 3)Example of all images in column major order one batch of features after the other (with optional padding on column) + input_stride : c x w x w_stride + feature_stride : w x w_stride + h_stride : 1 + w_stride : >= h + +*/ + +/* Destroy an instance of Tensor4d descriptor */ +cudnnStatus_t CUDNNWINAPI +cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc); + +/* Fold/unfold transforms */ +typedef enum { + CUDNN_TRANSFORM_FOLD = 0U, + CUDNN_TRANSFORM_UNFOLD = 1U, +} cudnnFoldingDirection_t; + +/** Create a destination descriptor for cudnnTransformTensor */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnInitTransformDest(const cudnnTensorTransformDescriptor_t transformDesc, + const cudnnTensorDescriptor_t srcDesc, + cudnnTensorDescriptor_t destDesc, + size_t *destSizeInBytes); + +/** Create an empty tensor transform descriptor */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateTensorTransformDescriptor(cudnnTensorTransformDescriptor_t *transformDesc); + +/** Initialize a previously created tensor transform descriptor. */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc, + const uint32_t nbDims, + const cudnnTensorFormat_t destFormat, + const int32_t padBeforeA[], + const int32_t padAfterA[], + const uint32_t foldA[], + const cudnnFoldingDirection_t direction); + +/** + * Retrieves the values stored in a previously initialized tensor transform + * descriptor. + */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc, + uint32_t nbDimsRequested, + cudnnTensorFormat_t *destFormat, + int32_t padBeforeA[], + int32_t padAfterA[], + uint32_t foldA[], + cudnnFoldingDirection_t *direction); + +/** + * Destroys a previously created tensor transform descriptor. 
+ */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc); + +/* Tensor layout conversion helper (y = alpha * x + beta * y) */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnTransformTensor(cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnTransformTensorEx(cudnnHandle_t handle, + const cudnnTensorTransformDescriptor_t transDesc, + const void *alpha, + const cudnnTensorDescriptor_t srcDesc, + const void *srcData, + const void *beta, + const cudnnTensorDescriptor_t destDesc, + void *destData); + +/* Tensor Bias addition : C = alpha * A + beta * C */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnAddTensor(cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t aDesc, + const void *A, + const void *beta, + const cudnnTensorDescriptor_t cDesc, + void *C); + +/* + * CUDNN OpTensor op type + */ +typedef enum { + CUDNN_OP_TENSOR_ADD = 0, + CUDNN_OP_TENSOR_MUL = 1, + CUDNN_OP_TENSOR_MIN = 2, + CUDNN_OP_TENSOR_MAX = 3, + CUDNN_OP_TENSOR_SQRT = 4, + CUDNN_OP_TENSOR_NOT = 5, +} cudnnOpTensorOp_t; + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateOpTensorDescriptor(cudnnOpTensorDescriptor_t *opTensorDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc, + cudnnOpTensorOp_t opTensorOp, + cudnnDataType_t opTensorCompType, + cudnnNanPropagation_t opTensorNanOpt); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetOpTensorDescriptor(const cudnnOpTensorDescriptor_t opTensorDesc, + cudnnOpTensorOp_t *opTensorOp, + cudnnDataType_t *opTensorCompType, + cudnnNanPropagation_t *opTensorNanOpt); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc); + +/* Tensor operation : C = op( alpha1 * A, alpha2 * B ) + beta * C */ +/* B tensor is ignored for CUDNN_OP_TENSOR_SQRT, CUDNN_OP_TENSOR_NOT. */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnOpTensor(cudnnHandle_t handle, + const cudnnOpTensorDescriptor_t opTensorDesc, + const void *alpha1, + const cudnnTensorDescriptor_t aDesc, + const void *A, + const void *alpha2, + const cudnnTensorDescriptor_t bDesc, + const void *B, + const void *beta, + const cudnnTensorDescriptor_t cDesc, + void *C); + +/* + * CUDNN ReduceTensor indices type + */ +typedef enum { + CUDNN_REDUCE_TENSOR_NO_INDICES = 0, + CUDNN_REDUCE_TENSOR_FLATTENED_INDICES = 1, +} cudnnReduceTensorIndices_t CUDNN_DEPRECATED; + +/* + * CUDNN tensor indices type size (all unsigned) + * Currently not supported, default is 32 bit unsigned. 
+ */ +typedef enum { + CUDNN_32BIT_INDICES = 0, + CUDNN_64BIT_INDICES = 1, + CUDNN_16BIT_INDICES = 2, + CUDNN_8BIT_INDICES = 3, +} cudnnIndicesType_t CUDNN_DEPRECATED; + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateReduceTensorDescriptor(cudnnReduceTensorDescriptor_t *reduceTensorDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc, + cudnnReduceTensorOp_t reduceTensorOp, + cudnnDataType_t reduceTensorCompType, + cudnnNanPropagation_t reduceTensorNanOpt, + cudnnReduceTensorIndices_t reduceTensorIndices, + cudnnIndicesType_t reduceTensorIndicesType); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetReduceTensorDescriptor(const cudnnReduceTensorDescriptor_t reduceTensorDesc, + cudnnReduceTensorOp_t *reduceTensorOp, + cudnnDataType_t *reduceTensorCompType, + cudnnNanPropagation_t *reduceTensorNanOpt, + cudnnReduceTensorIndices_t *reduceTensorIndices, + cudnnIndicesType_t *reduceTensorIndicesType); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc); + +/* Helper function to return the minimum size of the index space to be passed to the reduction given the input and + * output tensors */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetReductionIndicesSize(cudnnHandle_t handle, + const cudnnReduceTensorDescriptor_t reduceTensorDesc, + const cudnnTensorDescriptor_t aDesc, + const cudnnTensorDescriptor_t cDesc, + size_t *sizeInBytes); + +/* Helper function to return the minimum size of the workspace to be passed to the reduction given the input and output + * tensors */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetReductionWorkspaceSize(cudnnHandle_t handle, + const cudnnReduceTensorDescriptor_t reduceTensorDesc, + const cudnnTensorDescriptor_t aDesc, + const cudnnTensorDescriptor_t cDesc, + size_t *sizeInBytes); + +/* Tensor operation : C = reduce op( alpha * A ) + beta * C */ +/* The NaN propagation enum applies to only the min and max reduce ops; the other reduce ops propagate NaN as usual. */ +/* The indices space is ignored for reduce ops other than min or max. 
*/ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnReduceTensor(cudnnHandle_t handle, + const cudnnReduceTensorDescriptor_t reduceTensorDesc, + void *indices, + size_t indicesSizeInBytes, + void *workspace, + size_t workspaceSizeInBytes, + const void *alpha, + const cudnnTensorDescriptor_t aDesc, + const void *A, + const void *beta, + const cudnnTensorDescriptor_t cDesc, + void *C); + +/* Set all values of a tensor to a given value : y[i] = value[0] */ +cudnnStatus_t CUDNNWINAPI +cudnnSetTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *valuePtr); + +/* Scale all values of a tensor by a given factor : y[i] = alpha * y[i] */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnScaleTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *alpha); + +/* Create an instance of FilterStruct */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t dataType, /* image data type */ + cudnnTensorFormat_t format, + int k, /* number of output feature maps */ + int c, /* number of input feature maps */ + int h, /* height of each input filter */ + int w); /* width of each input filter */ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetFilter4dDescriptor(const cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t *dataType, /* image data type */ + cudnnTensorFormat_t *format, + int *k, /* number of output feature maps */ + int *c, /* number of input feature maps */ + int *h, /* height of each input filter */ + int *w); /* width of each input filter */ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetFilterNdDescriptor(cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t dataType, /* image data type */ + cudnnTensorFormat_t format, + int nbDims, + const int filterDimA[]); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetFilterNdDescriptor(const cudnnFilterDescriptor_t filterDesc, + int nbDimsRequested, + cudnnDataType_t *dataType, /* image data type */ + cudnnTensorFormat_t *format, + int *nbDims, + int filterDimA[]); +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetFilterSizeInBytes(const cudnnFilterDescriptor_t filterDesc, size_t *size); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnTransformFilter(cudnnHandle_t handle, + const cudnnTensorTransformDescriptor_t transDesc, + const void *alpha, + const cudnnFilterDescriptor_t srcDesc, + const void *srcData, + const void *beta, + const cudnnFilterDescriptor_t destDesc, + void *destData); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc); + +/* + * softmax algorithm + */ +typedef enum { + CUDNN_SOFTMAX_FAST = 0, /* straightforward implementation */ + CUDNN_SOFTMAX_ACCURATE = 1, /* subtract max from every point to avoid overflow */ + CUDNN_SOFTMAX_LOG = 2 +} cudnnSoftmaxAlgorithm_t; + +typedef enum { + CUDNN_SOFTMAX_MODE_INSTANCE = 0, /* compute the softmax over all C, H, W for each N */ + CUDNN_SOFTMAX_MODE_CHANNEL = 1 /* compute the softmax over all C for each H, W, N */ +} cudnnSoftmaxMode_t; + +/* Softmax functions: All of the form "output = alpha * Op(inputs) + beta * output" */ + +/* Function to perform forward softmax */ +cudnnStatus_t CUDNNWINAPI +cudnnSoftmaxForward(cudnnHandle_t handle, + cudnnSoftmaxAlgorithm_t algo, + cudnnSoftmaxMode_t mode, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const 
void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y); + +/* + * pooling mode + */ +typedef enum { + CUDNN_POOLING_MAX = 0, + CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING = 1, /* count for average includes padded values */ + CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING = 2, /* count for average does not include padded values */ + CUDNN_POOLING_MAX_DETERMINISTIC = 3 +} cudnnPoolingMode_t CUDNN_DEPRECATED; + +/* Create an instance of pooling descriptor */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetPooling2dDescriptor(cudnnPoolingDescriptor_t poolingDesc, + cudnnPoolingMode_t mode, + cudnnNanPropagation_t maxpoolingNanOpt, + int windowHeight, + int windowWidth, + int verticalPadding, + int horizontalPadding, + int verticalStride, + int horizontalStride); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetPooling2dDescriptor(const cudnnPoolingDescriptor_t poolingDesc, + cudnnPoolingMode_t *mode, + cudnnNanPropagation_t *maxpoolingNanOpt, + int *windowHeight, + int *windowWidth, + int *verticalPadding, + int *horizontalPadding, + int *verticalStride, + int *horizontalStride); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetPoolingNdDescriptor(cudnnPoolingDescriptor_t poolingDesc, + const cudnnPoolingMode_t mode, + const cudnnNanPropagation_t maxpoolingNanOpt, + int nbDims, + const int windowDimA[], + const int paddingA[], + const int strideA[]); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetPoolingNdDescriptor(const cudnnPoolingDescriptor_t poolingDesc, + int nbDimsRequested, + cudnnPoolingMode_t *mode, + cudnnNanPropagation_t *maxpoolingNanOpt, + int *nbDims, + int windowDimA[], + int paddingA[], + int strideA[]); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + int nbDims, + int outputTensorDimA[]); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + int *n, + int *c, + int *h, + int *w); + +/* Destroy an instance of pooling descriptor */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyPoolingDescriptor(cudnnPoolingDescriptor_t poolingDesc); + +/* Pooling functions: All of the form "output = alpha * Op(inputs) + beta * output" */ + +/* Function to perform forward pooling */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnPoolingForward(cudnnHandle_t handle, + const cudnnPoolingDescriptor_t poolingDesc, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y); + +/* Activation functions: All of the form "output = alpha * Op(inputs) + beta * output" */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t *activationDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetActivationDescriptor(cudnnActivationDescriptor_t activationDesc, + cudnnActivationMode_t mode, + cudnnNanPropagation_t reluNanOpt, + double coef); /* ceiling for clipped RELU, alpha for ELU */ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t activationDesc, + cudnnActivationMode_t *mode, + cudnnNanPropagation_t *reluNanOpt, + double *coef); /* ceiling for clipped RELU, alpha for ELU */ + 
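Editorial aside (an illustration, not part of the vendored file): cudnnSoftmaxForward, declared earlier in this hunk and not deprecated, computes output = alpha * softmax(x) + beta * output, where CUDNN_SOFTMAX_ACCURATE subtracts the per-slice maximum before exponentiating and CUDNN_SOFTMAX_MODE_CHANNEL normalizes over C for each (N, H, W) position. A minimal sketch follows; the logits shape, variable names, and absence of error handling are assumptions made for brevity.

#include <cudnn.h>
#include <cuda_runtime.h>
#include <stdio.h>

int main(void) {
    const int n = 2, c = 10, h = 1, w = 1;             /* e.g. a batch of 2 logit vectors of length 10 */
    const size_t bytes = (size_t)n * c * h * w * sizeof(float);

    cudnnHandle_t handle;
    cudnnCreate(&handle);

    /* One descriptor serves both input logits and output probabilities (same shape). */
    cudnnTensorDescriptor_t desc;
    cudnnCreateTensorDescriptor(&desc);
    cudnnSetTensor4dDescriptor(desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w);

    float *logits = NULL, *probs = NULL;               /* device buffers; fill logits before a real call */
    cudaMalloc((void **)&logits, bytes);
    cudaMalloc((void **)&probs, bytes);

    /* probs = alpha * softmax(logits) + beta * probs, softmax taken over the channel dimension. */
    const float alpha = 1.0f, beta = 0.0f;
    cudnnStatus_t st = cudnnSoftmaxForward(handle,
                                           CUDNN_SOFTMAX_ACCURATE,
                                           CUDNN_SOFTMAX_MODE_CHANNEL,
                                           &alpha, desc, logits,
                                           &beta, desc, probs);
    printf("cudnnSoftmaxForward: %s\n", cudnnGetErrorString(st));

    cudaFree(logits);
    cudaFree(probs);
    cudnnDestroyTensorDescriptor(desc);
    cudnnDestroy(handle);
    return 0;
}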
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double swish_beta); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double *swish_beta); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyActivationDescriptor(cudnnActivationDescriptor_t activationDesc); + +/* Function to perform forward activation */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnActivationForward(cudnnHandle_t handle, + cudnnActivationDescriptor_t activationDesc, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y); + +/* + * Create an instance of LRN (Local Response Normalization) descriptor + * Uses lrnN=5, lrnAlpha=1e-4, lrnBeta=0.75, lrnK=2.0 as defaults from Krizhevsky'12 ImageNet paper + */ +cudnnStatus_t CUDNNWINAPI +cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t *normDesc); + +#define CUDNN_LRN_MIN_N 1 /* minimum allowed lrnN */ +#define CUDNN_LRN_MAX_N 16 /* maximum allowed lrnN */ +#define CUDNN_LRN_MIN_K 1e-5 /* minimum allowed lrnK */ +#define CUDNN_LRN_MIN_BETA 0.01 /* minimum allowed lrnBeta */ + +/* LRN layer mode */ +typedef enum { + CUDNN_LRN_CROSS_CHANNEL_DIM1 = 0, /* Normalize across tensor's dimA[1] dimension */ +} cudnnLRNMode_t; + +/* + * Uses a window [center-lookBehind, center+lookAhead], where + * lookBehind = floor( (lrnN-1)/2 ), lookAhead = lrnN-lookBehind-1. + * Values of double parameters cast to tensor data type. + */ +cudnnStatus_t CUDNNWINAPI +cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned lrnN, double lrnAlpha, double lrnBeta, double lrnK); +/* + * Retrieve the settings currently stored in an LRN layer descriptor + * Any of the provided pointers can be NULL (no corresponding value will be returned) + */ +cudnnStatus_t CUDNNWINAPI +cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned *lrnN, double *lrnAlpha, double *lrnBeta, double *lrnK); + +/* Destroy an instance of LRN descriptor */ +cudnnStatus_t CUDNNWINAPI +cudnnDestroyLRNDescriptor(cudnnLRNDescriptor_t lrnDesc); + +/* LRN functions: output = alpha * normalize(x) + beta * old_y */ + +/* LRN cross-channel forward computation. Double parameters cast to tensor data type */ +cudnnStatus_t CUDNNWINAPI +cudnnLRNCrossChannelForward(cudnnHandle_t handle, + cudnnLRNDescriptor_t normDesc, + cudnnLRNMode_t lrnMode, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y); + +typedef enum { + CUDNN_DIVNORM_PRECOMPUTED_MEANS = 0, +} cudnnDivNormMode_t; + +/* LCN/divisive normalization functions: y = alpha * normalize(x) + beta * y */ +cudnnStatus_t CUDNNWINAPI +cudnnDivisiveNormalizationForward(cudnnHandle_t handle, + cudnnLRNDescriptor_t normDesc, + cudnnDivNormMode_t mode, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, /* same desc for means, temp, temp2 */ + const void *x, + const void *means, /* if NULL, means are assumed to be zero */ + void *temp, + void *temp2, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y); + +typedef enum { + /* bnScale, bnBias tensor dims are 1xCxHxWx.. 
(one value per CHW...-slice, normalized over N slice) */ + CUDNN_BATCHNORM_PER_ACTIVATION = 0, + + /* bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors) */ + CUDNN_BATCHNORM_SPATIAL = 1, + + /* + * bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors). + * May be faster than CUDNN_BATCHNORM_SPATIAL but imposes some limits on the range of values + */ + CUDNN_BATCHNORM_SPATIAL_PERSISTENT = 2, +} cudnnBatchNormMode_t CUDNN_DEPRECATED; + +#define CUDNN_BN_MIN_EPSILON 0.0 /* Minimum epsilon allowed to be used in the Batch Normalization formula */ + +/* + * Derives a tensor descriptor from layer data descriptor for BatchNormalization + * scale, invVariance, bnBias, bnScale tensors. Use this tensor desc for + * bnScaleBiasMeanVarDesc and bnScaleBiasDiffDesc in Batch Normalization forward and backward functions. + */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDeriveBNTensorDescriptor(cudnnTensorDescriptor_t derivedBnDesc, + const cudnnTensorDescriptor_t xDesc, + cudnnBatchNormMode_t mode); + +typedef enum { + CUDNN_BATCHNORM_OPS_BN = 0, /* do batch normalization only */ + CUDNN_BATCHNORM_OPS_BN_ACTIVATION = 1, /* do batchNorm, then activation */ + CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION = 2, /* do batchNorm, then elemWiseAdd, then activation */ +} cudnnBatchNormOps_t CUDNN_DEPRECATED; + +/* + * Performs Batch Normalization during Inference: + * y[i] = bnScale[k]*(x[i]-estimatedMean[k])/sqrt(epsilon+estimatedVariance[k]) + bnBias[k] + * with bnScale, bnBias, runningMean, runningInvVariance tensors indexed + * according to spatial or per-activation mode. Refer to cudnnBatchNormalizationForwardTraining + * above for notes on function arguments. + */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnBatchNormalizationForwardInference(cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + const void *alpha, /* alpha[0] = result blend factor */ + const void *beta, /* beta[0] = dest layer blend factor */ + const cudnnTensorDescriptor_t xDesc, + const void *x, /* NxCxHxW */ + const cudnnTensorDescriptor_t yDesc, + void *y, /* NxCxHxW */ + const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, + const void *bnScale, + const void *bnBias, + const void *estimatedMean, + const void *estimatedVariance, + double epsilon); + +typedef enum { + /* bnScale, bnBias tensor dims are 1xCxHxWx.. (one value per CHW...-slice, normalized over N slice) */ + CUDNN_NORM_PER_ACTIVATION = 0, + + /* bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors) */ + CUDNN_NORM_PER_CHANNEL = 1, +} cudnnNormMode_t CUDNN_DEPRECATED; + +typedef enum { CUDNN_NORM_ALGO_STANDARD = 0, CUDNN_NORM_ALGO_PERSIST = 1 } cudnnNormAlgo_t CUDNN_DEPRECATED; + +/* + * Derives a tensor descriptor from layer data descriptor for Normalization + * scale, invVariance, bnBias, bnScale tensors. Use this tensor desc for + * normScaleBiasMeanVarDesc and normScaleBiasDiffDesc in Normalization forward and backward functions. 
+ */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDeriveNormTensorDescriptor(cudnnTensorDescriptor_t derivedNormScaleBiasDesc, + cudnnTensorDescriptor_t derivedNormMeanVarDesc, + const cudnnTensorDescriptor_t xDesc, + cudnnNormMode_t mode, + int groupCnt); /* Place hold for future work, should be set to 1 now*/ + +typedef enum { + CUDNN_NORM_OPS_NORM = 0, /* do normalization only */ + CUDNN_NORM_OPS_NORM_ACTIVATION = 1, /* do Norm, then activation */ + CUDNN_NORM_OPS_NORM_ADD_ACTIVATION = 2, /* do Norm, then elemWiseAdd, then activation */ +} cudnnNormOps_t CUDNN_DEPRECATED; + +/* + * Performs Normalization during Inference: + * y[i] = normScale[k]*(x[i]-estimatedMean[k])/sqrt(epsilon+estimatedVariance[k]) + normBias[k] + * with normScale, normBias, runningMean, runningInvVariance tensors indexed + * according to per-channel or per-activation mode. Refer to cudnnNormalizationForwardTraining + * above for notes on function arguments. + */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnNormalizationForwardInference(cudnnHandle_t handle, + cudnnNormMode_t mode, + cudnnNormOps_t normOps, + cudnnNormAlgo_t algo, + const void *alpha, /* alpha[0] = result blend factor */ + const void *beta, /* beta[0] = dest layer blend factor */ + const cudnnTensorDescriptor_t xDesc, + const void *x, /* NxCxHxW */ + const cudnnTensorDescriptor_t normScaleBiasDesc, + const void *normScale, + const void *normBias, + const cudnnTensorDescriptor_t normMeanVarDesc, + const void *estimatedMean, + const void *estimatedVariance, + const cudnnTensorDescriptor_t zDesc, + const void *z, + cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t yDesc, + void *y, /* NxCxHxW */ + double epsilon, + int groupCnt); /* Place hold for future work*/ + +/* APIs for spatial transformer network*/ +typedef enum { + CUDNN_SAMPLER_BILINEAR = 0, +} cudnnSamplerType_t; + +cudnnStatus_t CUDNNWINAPI +cudnnCreateSpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t *stDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnSetSpatialTransformerNdDescriptor(cudnnSpatialTransformerDescriptor_t stDesc, + cudnnSamplerType_t samplerType, + cudnnDataType_t dataType, + const int nbDims, + const int dimA[]); + +cudnnStatus_t CUDNNWINAPI +cudnnDestroySpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t stDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnSpatialTfGridGeneratorForward(cudnnHandle_t handle, + const cudnnSpatialTransformerDescriptor_t stDesc, + const void *theta, + void *grid); + +cudnnStatus_t CUDNNWINAPI +cudnnSpatialTfSamplerForward(cudnnHandle_t handle, + cudnnSpatialTransformerDescriptor_t stDesc, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *grid, + const void *beta, + cudnnTensorDescriptor_t yDesc, + void *y); + +typedef struct cudnnDropoutStruct *cudnnDropoutDescriptor_t; + +cudnnStatus_t CUDNNWINAPI +cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t *dropoutDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc); + +/*helper function to determine size of the states to be passed to cudnnSetDropoutDescriptor */ +cudnnStatus_t CUDNNWINAPI +cudnnDropoutGetStatesSize(cudnnHandle_t handle, size_t *sizeInBytes); + +/*helper function to determine size of the reserve space to be passed to dropout forward/backward calls */ +cudnnStatus_t CUDNNWINAPI +cudnnDropoutGetReserveSpaceSize(cudnnTensorDescriptor_t xdesc, size_t *sizeInBytes); + +cudnnStatus_t CUDNNWINAPI +cudnnSetDropoutDescriptor(cudnnDropoutDescriptor_t 
dropoutDesc, + cudnnHandle_t handle, + float dropout, + void *states, + size_t stateSizeInBytes, + unsigned long long seed); + +/* Restores the dropout descriptor to a previously saved-off state */ +cudnnStatus_t CUDNNWINAPI +cudnnRestoreDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, + cudnnHandle_t handle, + float dropout, + void *states, + size_t stateSizeInBytes, + unsigned long long seed); + +cudnnStatus_t CUDNNWINAPI +cudnnGetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, + cudnnHandle_t handle, + float *dropout, + void **states, + unsigned long long *seed); + +cudnnStatus_t CUDNNWINAPI +cudnnDropoutForward(cudnnHandle_t handle, + const cudnnDropoutDescriptor_t dropoutDesc, + const cudnnTensorDescriptor_t xdesc, + const void *x, + const cudnnTensorDescriptor_t ydesc, + void *y, + void *reserveSpace, + size_t reserveSpaceSizeInBytes); + +/* TODO: move these enums out to the appropriate submodule */ +typedef enum { + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM = 0, + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM = 1, + CUDNN_CONVOLUTION_FWD_ALGO_GEMM = 2, + CUDNN_CONVOLUTION_FWD_ALGO_DIRECT = 3, + CUDNN_CONVOLUTION_FWD_ALGO_FFT = 4, + CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING = 5, + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD = 6, + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED = 7, + CUDNN_CONVOLUTION_FWD_ALGO_COUNT = 8 +} cudnnConvolutionFwdAlgo_t; + +typedef enum { + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 = 0, /* non-deterministic */ + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 = 1, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT = 2, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3 = 3, /* non-deterministic */ + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD = 4, /* not implemented */ + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED = 5, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING = 6, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT = 7 +} cudnnConvolutionBwdFilterAlgo_t; + +typedef enum { + CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 = 0, /* non-deterministic */ + CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 = 1, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT = 2, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING = 3, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD = 4, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED = 5, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT = 6 +} cudnnConvolutionBwdDataAlgo_t; + +typedef enum { CUDNN_CTC_LOSS_ALGO_DETERMINISTIC = 0, CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC = 1 } cudnnCTCLossAlgo_t; + +/* + * \brief Cross-library version checker. + * This function is implemented differently in each sub-library. Each sublib + * checks whether its own version matches that of its dependencies. + * \returns CUDNN_STATUS_SUCCESS if the version check passes, + * CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH if the versions are inconsistent. 
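
The helper comments above imply a setup order for dropout: query the RNG state size, allocate that buffer on the device, then fill the descriptor. A hedged sketch of that sequence follows; error handling is omitted, the handle is assumed to already exist, and setup_dropout is only an illustrative wrapper name.

/* Sketch of the dropout-descriptor setup order suggested above.
 * Error checking omitted for brevity. */
#include <cudnn.h>
#include <cuda_runtime.h>

void setup_dropout(cudnnHandle_t handle, cudnnDropoutDescriptor_t *desc,
                   void **states, float dropout, unsigned long long seed) {
    size_t stateSize = 0;
    cudnnCreateDropoutDescriptor(desc);
    cudnnDropoutGetStatesSize(handle, &stateSize);   /* size of the RNG state buffer */
    cudaMalloc(states, stateSize);                   /* dropout states live in device memory */
    cudnnSetDropoutDescriptor(*desc, handle, dropout, *states, stateSize, seed);
}
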
+ */ +cudnnStatus_t CUDNNWINAPI +cudnnOpsVersionCheck(void); + +/* Function to perform backward softmax */ +cudnnStatus_t CUDNNWINAPI +cudnnSoftmaxBackward(cudnnHandle_t handle, + cudnnSoftmaxAlgorithm_t algo, + cudnnSoftmaxMode_t mode, + const void *alpha, + const cudnnTensorDescriptor_t yDesc, + const void *y, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx); + +/* Function to perform backward pooling */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnPoolingBackward(cudnnHandle_t handle, + const cudnnPoolingDescriptor_t poolingDesc, + const void *alpha, + const cudnnTensorDescriptor_t yDesc, + const void *y, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx); + +/* Function to perform backward activation */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnActivationBackward(cudnnHandle_t handle, + cudnnActivationDescriptor_t activationDesc, + const void *alpha, + const cudnnTensorDescriptor_t yDesc, + const void *y, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx); + +/* LRN cross-channel backward computation. Double parameters cast to tensor data type */ +cudnnStatus_t CUDNNWINAPI +cudnnLRNCrossChannelBackward(cudnnHandle_t handle, + cudnnLRNDescriptor_t normDesc, + cudnnLRNMode_t lrnMode, + const void *alpha, + const cudnnTensorDescriptor_t yDesc, + const void *y, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx); + +cudnnStatus_t CUDNNWINAPI +cudnnDivisiveNormalizationBackward(cudnnHandle_t handle, + cudnnLRNDescriptor_t normDesc, + cudnnDivNormMode_t mode, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, /* same desc for x, means, dy, temp, temp2 */ + const void *x, + const void *means, /* if NULL, means are assumed to be zero */ + const void *dy, + void *temp, + void *temp2, + const void *beta, + const cudnnTensorDescriptor_t dXdMeansDesc, /* same desc for dx, dMeans */ + void *dx, /* output x differential */ + void *dMeans); /* output means differential, can be NULL */ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + cudnnBatchNormOps_t bnOps, + const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t zDesc, + const cudnnTensorDescriptor_t yDesc, + const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, + const cudnnActivationDescriptor_t activationDesc, + size_t *sizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetBatchNormalizationBackwardExWorkspaceSize(cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + cudnnBatchNormOps_t bnOps, + const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t yDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnTensorDescriptor_t dzDesc, + const cudnnTensorDescriptor_t dxDesc, + const cudnnTensorDescriptor_t dBnScaleBiasDesc, + const cudnnActivationDescriptor_t activationDesc, + size_t *sizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetBatchNormalizationTrainingExReserveSpaceSize(cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + cudnnBatchNormOps_t bnOps, + const 
cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t xDesc, + size_t *sizeInBytes); + +/* Computes y = BN(x). Also accumulates moving averages of mean and inverse variances */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnBatchNormalizationForwardTraining( + cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + + const void *alpha, /* alpha[0] = result blend factor */ + const void *beta, /* beta[0] = dest layer blend factor */ + + const cudnnTensorDescriptor_t xDesc, + const void *x, /* NxCxHxW */ + const cudnnTensorDescriptor_t yDesc, + void *y, /* NxCxHxW */ + + /* Shared desc for the next 6 tensors in the argument list. + Data type to be set as follows: + type = (typeOf(x) == double) ? double : float + Dimensions for this descriptor depend on normalization mode + - Spatial Normalization : tensors are expected to have dims 1xCx1x1 + (normalization is performed across NxHxW) + - Per-Activation Normalization : tensors are expected to have dims of 1xCxHxW + (normalization is performed across N) */ + const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, + + /* 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation */ + const void *bnScale, + const void *bnBias, + + /* MUST use factor=1 in the very first call of a complete training cycle. + Use a factor=1/(1+n) at N-th call to the function to get + Cumulative Moving Average (CMA) behavior + CMA[n] = (x[1]+...+x[n])/n + Since CMA[n+1] = (n*CMA[n]+x[n+1])/(n+1) = + ((n+1)*CMA[n]-CMA[n])/(n+1) + x[n+1]/(n+1) = + CMA[n]*(1-1/(n+1)) + x[n+1]*1/(n+1) */ + double exponentialAverageFactor, + + /* Used in Training phase only. + runningMean = newMean*factor + runningMean*(1-factor) */ + void *resultRunningMean, + /* Output in training mode, input in inference. Is the moving average + of variance[x] (factor is applied in the same way as for runningMean) */ + void *resultRunningVariance, + + /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */ + double epsilon, + + /* Optionally save intermediate results from the forward pass here + - can be reused to speed up backward pass. NULL if unused */ + void *resultSaveMean, + void *resultSaveInvVariance); + +/* Computes y = relu(BN(x) + z). Also accumulates moving averages of mean and inverse variances */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnBatchNormalizationForwardTrainingEx( + cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + cudnnBatchNormOps_t bnOps, + + const void *alpha, /* alpha[0] = result blend factor */ + const void *beta, /* beta[0] = dest layer blend factor */ + + const cudnnTensorDescriptor_t xDesc, + const void *xData, + const cudnnTensorDescriptor_t zDesc, + const void *zData, + const cudnnTensorDescriptor_t yDesc, + void *yData, + + const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, + const void *bnScale, + const void *bnBias, + + double exponentialAverageFactor, + void *resultRunningMean, + void *resultRunningVariance, + + /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */ + double epsilon, + + /* Optionally save intermediate results from the forward pass here + - can be reused to speed up backward pass. NULL if unused */ + void *resultSaveMean, + void *resultSaveInvVariance, + + cudnnActivationDescriptor_t activationDesc, + void *workspace, + size_t workSpaceSizeInBytes, + void *reserveSpace, + size_t reserveSpaceSizeInBytes); + +/* Performs backward pass of Batch Normalization layer. 
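
The exponentialAverageFactor comment above derives that choosing factor = 1/(1+n) on the n-th call turns the running-mean update into a plain cumulative average. A small host-side check of that recurrence, with illustrative batch means:

/* runningMean = newMean*factor + runningMean*(1-factor) with factor = 1/(1+n)
 * reproduces the average of all batch means seen so far. */
#include <stdio.h>

int main(void) {
    double batchMean[3] = {2.0, 4.0, 9.0};   /* illustrative per-batch means */
    double runningMean = 0.0;
    for (int n = 0; n < 3; ++n) {
        double factor = 1.0 / (1.0 + n);     /* factor = 1 on the very first call */
        runningMean = batchMean[n] * factor + runningMean * (1.0 - factor);
    }
    printf("runningMean = %f\n", runningMean);   /* (2+4+9)/3 = 5 */
    return 0;
}
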
Returns x gradient, + * bnScale gradient and bnBias gradient */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnBatchNormalizationBackward(cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + const void *alphaDataDiff, + const void *betaDataDiff, + const void *alphaParamDiff, + const void *betaParamDiff, + const cudnnTensorDescriptor_t xDesc, /* same desc for x, dx, dy */ + const void *x, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnTensorDescriptor_t dxDesc, + void *dx, + /* Shared tensor desc for the 4 tensors below */ + const cudnnTensorDescriptor_t dBnScaleBiasDesc, + const void *bnScale, /* bnBias doesn't affect backpropagation */ + /* scale and bias diff are not backpropagated below this layer */ + void *dBnScaleResult, + void *dBnBiasResult, + /* Same epsilon as forward pass */ + double epsilon, + + /* Optionally cached intermediate results from + forward pass */ + const void *savedMean, + const void *savedInvVariance); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnBatchNormalizationBackwardEx(cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + cudnnBatchNormOps_t bnOps, + + const void *alphaDataDiff, + const void *betaDataDiff, + const void *alphaParamDiff, + const void *betaParamDiff, + const cudnnTensorDescriptor_t xDesc, + const void *xData, + const cudnnTensorDescriptor_t yDesc, + const void *yData, + const cudnnTensorDescriptor_t dyDesc, + const void *dyData, + const cudnnTensorDescriptor_t dzDesc, + void *dzData, + const cudnnTensorDescriptor_t dxDesc, + void *dxData, + + /* Shared tensor desc for the 4 tensors below */ + const cudnnTensorDescriptor_t dBnScaleBiasDesc, + const void *bnScaleData, + const void *bnBiasData, /* needed if there is activation */ + void *dBnScaleData, + void *dBnBiasData, + double epsilon, /* Same epsilon as forward pass */ + + /* Optionally cached intermediate results from + forward pass */ + const void *savedMean, + const void *savedInvVariance, + cudnnActivationDescriptor_t activationDesc, + void *workSpace, + size_t workSpaceSizeInBytes, + void *reserveSpace, + size_t reserveSpaceSizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetNormalizationForwardTrainingWorkspaceSize(cudnnHandle_t handle, + cudnnNormMode_t mode, + cudnnNormOps_t normOps, + cudnnNormAlgo_t algo, + const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t zDesc, + const cudnnTensorDescriptor_t yDesc, + const cudnnTensorDescriptor_t normScaleBiasDesc, + const cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t normMeanVarDesc, + size_t *sizeInBytes, + int groupCnt); /* Place hold for future work, should be set to 1 now*/ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetNormalizationBackwardWorkspaceSize(cudnnHandle_t handle, + cudnnNormMode_t mode, + cudnnNormOps_t normOps, + cudnnNormAlgo_t algo, + const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t yDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnTensorDescriptor_t dzDesc, + const cudnnTensorDescriptor_t dxDesc, + const cudnnTensorDescriptor_t dNormScaleBiasDesc, + const cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t normMeanVarDesc, + size_t *sizeInBytes, + int groupCnt); /* Place hold for future work, should be set to 1 now*/ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetNormalizationTrainingReserveSpaceSize(cudnnHandle_t handle, + cudnnNormMode_t mode, + cudnnNormOps_t normOps, + cudnnNormAlgo_t algo, + const cudnnActivationDescriptor_t activationDesc, + const 
cudnnTensorDescriptor_t xDesc, + size_t *sizeInBytes, + int groupCnt); /* Place hold for future work, should be set to 1 now*/ + +/* Computes y = relu(Norm(x) + z). Also accumulates moving averages of mean and inverse variances */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnNormalizationForwardTraining(cudnnHandle_t handle, + cudnnNormMode_t mode, + cudnnNormOps_t normOps, + cudnnNormAlgo_t algo, + const void *alpha, /* alpha[0] = result blend factor */ + const void *beta, /* beta[0] = dest layer blend factor */ + const cudnnTensorDescriptor_t xDesc, + const void *xData, + const cudnnTensorDescriptor_t normScaleBiasDesc, + const void *normScale, + const void *normBias, + double exponentialAverageFactor, + const cudnnTensorDescriptor_t normMeanVarDesc, + void *resultRunningMean, + void *resultRunningVariance, + /* Has to be >= 0. Should be the same in forward and backward functions. */ + double epsilon, + /* Optionally save intermediate results from the forward pass here + - can be reused to speed up backward pass. NULL if unused */ + void *resultSaveMean, + void *resultSaveInvVariance, + cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t zDesc, + const void *zData, + const cudnnTensorDescriptor_t yDesc, + void *yData, + void *workspace, + size_t workSpaceSizeInBytes, + void *reserveSpace, + size_t reserveSpaceSizeInBytes, + int groupCnt); /* Place hold for future work, should be set to 1 now*/ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnNormalizationBackward(cudnnHandle_t handle, + cudnnNormMode_t mode, + cudnnNormOps_t normOps, + cudnnNormAlgo_t algo, + const void *alphaDataDiff, + const void *betaDataDiff, + const void *alphaParamDiff, + const void *betaParamDiff, + const cudnnTensorDescriptor_t xDesc, + const void *xData, + const cudnnTensorDescriptor_t yDesc, + const void *yData, + const cudnnTensorDescriptor_t dyDesc, + const void *dyData, + const cudnnTensorDescriptor_t dzDesc, + void *dzData, + const cudnnTensorDescriptor_t dxDesc, + void *dxData, + /* Shared tensor desc for the 4 tensors below */ + const cudnnTensorDescriptor_t dNormScaleBiasDesc, + const void *normScaleData, + const void *normBiasData, /* needed if there is activation */ + void *dNormScaleData, + void *dNormBiasData, + double epsilon, /* Same epsilon as forward pass */ + const cudnnTensorDescriptor_t normMeanVarDesc, + /* Optionally cached intermediate results from + forward pass */ + const void *savedMean, + const void *savedInvVariance, + cudnnActivationDescriptor_t activationDesc, + void *workSpace, + size_t workSpaceSizeInBytes, + void *reserveSpace, + size_t reserveSpaceSizeInBytes, + int groupCnt); /* Place hold for future work, should be set to 1 now*/ + +cudnnStatus_t CUDNNWINAPI +cudnnSpatialTfGridGeneratorBackward(cudnnHandle_t handle, + const cudnnSpatialTransformerDescriptor_t stDesc, + const void *dgrid, + void *dtheta); + +cudnnStatus_t CUDNNWINAPI +cudnnSpatialTfSamplerBackward(cudnnHandle_t handle, + cudnnSpatialTransformerDescriptor_t stDesc, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx, + const void *alphaDgrid, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const void *grid, + const void *betaDgrid, + void *dgrid); + +cudnnStatus_t CUDNNWINAPI +cudnnDropoutBackward(cudnnHandle_t handle, + const cudnnDropoutDescriptor_t dropoutDesc, + const cudnnTensorDescriptor_t dydesc, + const void *dy, + const cudnnTensorDescriptor_t dxdesc, + void *dx, + 
void *reserveSpace, + size_t reserveSpaceSizeInBytes); + +#if defined(__cplusplus) +} +#endif + +#endif /* CUDNN_OPS_H_ */ diff --git a/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_v9.h b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_v9.h new file mode 100644 index 0000000000000000000000000000000000000000..7e08847c95f1294bc99e96e737a53cc6ebb7a458 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_v9.h @@ -0,0 +1,68 @@ +/* + * Copyright 2014-2023 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +/* cudnn : Neural Networks Library */ + +#if !defined(CUDNN_H_) +#define CUDNN_H_ +#if defined(__cplusplus) +extern "C" { +#endif + +#include +#include "cudnn_version.h" +#include "cudnn_graph.h" +#include "cudnn_ops.h" +#include "cudnn_adv.h" +#include "cudnn_cnn.h" + +#if defined(__cplusplus) +} +#endif +#endif /* CUDNN_H_ */ diff --git a/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_version.h b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_version.h new file mode 100644 index 0000000000000000000000000000000000000000..1af101fac7672614e3af52cbc32c57bc2104f498 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_version.h @@ -0,0 +1,70 @@ +/* + * Copyright 2014-2023 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/** + * \file: The master cuDNN version file. 
+ */ + +#ifndef CUDNN_VERSION_H_ +#define CUDNN_VERSION_H_ + +#define CUDNN_MAJOR 9 +#define CUDNN_MINOR 10 +#define CUDNN_PATCHLEVEL 2 + +#define CUDNN_VERSION (CUDNN_MAJOR * 10000 + CUDNN_MINOR * 100 + CUDNN_PATCHLEVEL) + +/* cannot use constexpr here since this is a C-only file */ +/* Below is the max SM version this cuDNN library is aware of and supports natively */ + +#define CUDNN_MAX_SM_MAJOR_NUMBER 12 +#define CUDNN_MAX_SM_MINOR_NUMBER 0 +#define CUDNN_MAX_DEVICE_VERSION (CUDNN_MAX_SM_MAJOR_NUMBER * 100 + CUDNN_MAX_SM_MINOR_NUMBER * 10) + +#endif /* CUDNN_VERSION_H */ diff --git a/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_version_v9.h b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_version_v9.h new file mode 100644 index 0000000000000000000000000000000000000000..1af101fac7672614e3af52cbc32c57bc2104f498 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_version_v9.h @@ -0,0 +1,70 @@ +/* + * Copyright 2014-2023 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
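
CUDNN_VERSION as defined above packs major, minor and patch as major*10000 + minor*100 + patch, so 9.10.2 becomes 91002 and plain integer comparisons work at compile time. A hedged sketch of the usual guard follows; it assumes the include path for cudnn_version.h is set up, and cudnnGetVersion() is the usual runtime counterpart reported by the loaded library.

/* 9*10000 + 10*100 + 2 = 91002 for the headers added in this diff. */
#include <stdio.h>
#include <cudnn_version.h>

int main(void) {
#if CUDNN_VERSION >= 90000
    printf("built against cuDNN 9.x or newer: %d\n", CUDNN_VERSION);
#else
    printf("built against a pre-9 cuDNN: %d\n", CUDNN_VERSION);
#endif
    return 0;
}
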
+ */ + +/** + * \file: The master cuDNN version file. + */ + +#ifndef CUDNN_VERSION_H_ +#define CUDNN_VERSION_H_ + +#define CUDNN_MAJOR 9 +#define CUDNN_MINOR 10 +#define CUDNN_PATCHLEVEL 2 + +#define CUDNN_VERSION (CUDNN_MAJOR * 10000 + CUDNN_MINOR * 100 + CUDNN_PATCHLEVEL) + +/* cannot use constexpr here since this is a C-only file */ +/* Below is the max SM version this cuDNN library is aware of and supports natively */ + +#define CUDNN_MAX_SM_MAJOR_NUMBER 12 +#define CUDNN_MAX_SM_MINOR_NUMBER 0 +#define CUDNN_MAX_DEVICE_VERSION (CUDNN_MAX_SM_MAJOR_NUMBER * 100 + CUDNN_MAX_SM_MINOR_NUMBER * 10) + +#endif /* CUDNN_VERSION_H */ diff --git a/.venv/lib/python3.12/site-packages/nvidia_nccl_cu12-2.27.3.dist-info/licenses/License.txt b/.venv/lib/python3.12/site-packages/nvidia_nccl_cu12-2.27.3.dist-info/licenses/License.txt new file mode 100644 index 0000000000000000000000000000000000000000..bcd1867a02a6a8c1e592b92e2e50f34e531f2d87 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/nvidia_nccl_cu12-2.27.3.dist-info/licenses/License.txt @@ -0,0 +1,39 @@ + + Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National + Laboratory, the U.S. Department of Energy, nor the names of their + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + The U.S. Department of Energy funded the development of this software + under subcontract 7078610 with Lawrence Berkeley National Laboratory. + + +This code also includes files from the NVIDIA Tools Extension SDK project. + +See: + + https://github.com/NVIDIA/NVTX + +for more information and license details. diff --git a/.venv/lib/python3.12/site-packages/sklearn/__check_build/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/__check_build/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6e06d16bd4d506966ccc5a6ea42de1a608d8e99e --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/__check_build/__init__.py @@ -0,0 +1,54 @@ +"""Module to give helpful messages to the user that did not +compile scikit-learn properly. 
+""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import os + +INPLACE_MSG = """ +It appears that you are importing a local scikit-learn source tree. For +this, you need to have an inplace install. Maybe you are in the source +directory and you need to try from another location.""" + +STANDARD_MSG = """ +If you have used an installer, please check that it is suited for your +Python version, your operating system and your platform.""" + + +def raise_build_error(e): + # Raise a comprehensible error and list the contents of the + # directory to help debugging on the mailing list. + local_dir = os.path.split(__file__)[0] + msg = STANDARD_MSG + if local_dir == "sklearn/__check_build": + # Picking up the local install: this will work only if the + # install is an 'inplace build' + msg = INPLACE_MSG + dir_content = list() + for i, filename in enumerate(os.listdir(local_dir)): + if (i + 1) % 3: + dir_content.append(filename.ljust(26)) + else: + dir_content.append(filename + "\n") + raise ImportError( + """%s +___________________________________________________________________________ +Contents of %s: +%s +___________________________________________________________________________ +It seems that scikit-learn has not been built correctly. + +If you have installed scikit-learn from source, please do not forget +to build the package before using it. For detailed instructions, see: +https://scikit-learn.org/dev/developers/advanced_installation.html#building-from-source +%s""" + % (e, local_dir, "".join(dir_content).strip(), msg) + ) + + +try: + from ._check_build import check_build # noqa: F401 +except ImportError as e: + raise_build_error(e) diff --git a/.venv/lib/python3.12/site-packages/sklearn/__check_build/_check_build.cpython-312-x86_64-linux-gnu.so b/.venv/lib/python3.12/site-packages/sklearn/__check_build/_check_build.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..74e02aa76589b2223acfb9fdfbe8da3beb3dc778 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/__check_build/_check_build.cpython-312-x86_64-linux-gnu.so differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/__check_build/_check_build.pyx b/.venv/lib/python3.12/site-packages/sklearn/__check_build/_check_build.pyx new file mode 100644 index 0000000000000000000000000000000000000000..0409e73f5e96dc3a4c27889fa44eda8a17d36ef9 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/__check_build/_check_build.pyx @@ -0,0 +1,2 @@ +def check_build(): + return diff --git a/.venv/lib/python3.12/site-packages/sklearn/__check_build/meson.build b/.venv/lib/python3.12/site-packages/sklearn/__check_build/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..5f6115d9765499dc28f477a1506a8298492003f5 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/__check_build/meson.build @@ -0,0 +1,6 @@ +py.extension_module( + '_check_build', + cython_gen.process('_check_build.pyx'), + install: true, + subdir: 'sklearn/__check_build', +) diff --git a/.venv/lib/python3.12/site-packages/sklearn/__pycache__/__init__.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..714515227428299e0390d3ad8ea743e9be703bf6 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/__pycache__/__init__.cpython-312.pyc differ diff --git 
a/.venv/lib/python3.12/site-packages/sklearn/__pycache__/_built_with_meson.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/__pycache__/_built_with_meson.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f5f935f2e11c1cb4d83e397b390a97230473e223 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/__pycache__/_built_with_meson.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/__pycache__/_config.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/__pycache__/_config.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a68226bee1459be761629fc8d25cc26fc37b0f9b Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/__pycache__/_config.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/__pycache__/_distributor_init.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/__pycache__/_distributor_init.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..24f16857cc66b2e3c2635b23dd7124b99b802a52 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/__pycache__/_distributor_init.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/__pycache__/base.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/__pycache__/base.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..261cfecb8d9d846d532dbe7549e5f46c0390c029 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/__pycache__/base.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/__pycache__/exceptions.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/__pycache__/exceptions.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4011fd3ad9eff11c7cf6a4c662b4c90b5df67583 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/__pycache__/exceptions.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/_build_utils/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/_build_utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/_build_utils/tempita.py b/.venv/lib/python3.12/site-packages/sklearn/_build_utils/tempita.py new file mode 100644 index 0000000000000000000000000000000000000000..c8a7a35a62feeed47fbb10ace87411c9bdc16370 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/_build_utils/tempita.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python3 + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import argparse +import os + +from Cython import Tempita as tempita + +# XXX: If this import ever fails (does it really?), vendor either +# cython.tempita or numpy/npy_tempita. + + +def process_tempita(fromfile, outfile=None): + """Process tempita templated file and write out the result. + + The template file is expected to end in `.c.tp` or `.pyx.tp`: + E.g. processing `template.c.in` generates `template.c`. 
+ + """ + with open(fromfile, "r", encoding="utf-8") as f: + template_content = f.read() + + template = tempita.Template(template_content) + content = template.substitute() + + with open(outfile, "w", encoding="utf-8") as f: + f.write(content) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("infile", type=str, help="Path to the input file") + parser.add_argument("-o", "--outdir", type=str, help="Path to the output directory") + parser.add_argument( + "-i", + "--ignore", + type=str, + help=( + "An ignored input - may be useful to add a " + "dependency between custom targets" + ), + ) + args = parser.parse_args() + + if not args.infile.endswith(".tp"): + raise ValueError(f"Unexpected extension: {args.infile}") + + if not args.outdir: + raise ValueError("Missing `--outdir` argument to tempita.py") + + outdir_abs = os.path.join(os.getcwd(), args.outdir) + outfile = os.path.join( + outdir_abs, os.path.splitext(os.path.split(args.infile)[1])[0] + ) + + process_tempita(args.infile, outfile) + + +if __name__ == "__main__": + main() diff --git a/.venv/lib/python3.12/site-packages/sklearn/_build_utils/version.py b/.venv/lib/python3.12/site-packages/sklearn/_build_utils/version.py new file mode 100644 index 0000000000000000000000000000000000000000..922a14917bf3fd2d395a4f5002a39c4d9d9c7ee2 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/_build_utils/version.py @@ -0,0 +1,16 @@ +#!/usr/bin/env python3 +"""Extract version number from __init__.py""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import os + +sklearn_init = os.path.join(os.path.dirname(__file__), "../__init__.py") + +data = open(sklearn_init).readlines() +version_line = next(line for line in data if line.startswith("__version__")) + +version = version_line.strip().split(" = ")[1].replace('"', "").replace("'", "") + +print(version) diff --git a/.venv/lib/python3.12/site-packages/sklearn/_loss/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/_loss/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..97fdd884e517c4a623e6fc180526bde227af0c21 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/_loss/__init__.py @@ -0,0 +1,33 @@ +""" +The :mod:`sklearn._loss` module includes loss function classes suitable for +fitting classification and regression tasks. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from .loss import ( + AbsoluteError, + HalfBinomialLoss, + HalfGammaLoss, + HalfMultinomialLoss, + HalfPoissonLoss, + HalfSquaredError, + HalfTweedieLoss, + HalfTweedieLossIdentity, + HuberLoss, + PinballLoss, +) + +__all__ = [ + "AbsoluteError", + "HalfBinomialLoss", + "HalfGammaLoss", + "HalfMultinomialLoss", + "HalfPoissonLoss", + "HalfSquaredError", + "HalfTweedieLoss", + "HalfTweedieLossIdentity", + "HuberLoss", + "PinballLoss", +] diff --git a/.venv/lib/python3.12/site-packages/sklearn/_loss/_loss.pxd b/.venv/lib/python3.12/site-packages/sklearn/_loss/_loss.pxd new file mode 100644 index 0000000000000000000000000000000000000000..ac01b122a0941c35bc4d440543cf5c981943952a --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/_loss/_loss.pxd @@ -0,0 +1,101 @@ +# Fused types for input like y_true, raw_prediction, sample_weights. 
+ctypedef fused floating_in: + double + float + + +# Fused types for output like gradient and hessian +# We use a different fused types for input (floating_in) and output (floating_out), such +# that input and output can have different dtypes in the same function call. A single +# fused type can only take on one single value (type) for all arguments in one function +# call. +ctypedef fused floating_out: + double + float + + +# Struct to return 2 doubles +ctypedef struct double_pair: + double val1 + double val2 + + +# C base class for loss functions +cdef class CyLossFunction: + cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil + cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil + cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil + + +cdef class CyHalfSquaredError(CyLossFunction): + cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil + cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil + cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil + + +cdef class CyAbsoluteError(CyLossFunction): + cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil + cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil + cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil + + +cdef class CyPinballLoss(CyLossFunction): + cdef readonly double quantile # readonly makes it accessible from Python + cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil + cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil + cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil + + +cdef class CyHuberLoss(CyLossFunction): + cdef public double delta # public makes it accessible from Python + cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil + cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil + cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil + + +cdef class CyHalfPoissonLoss(CyLossFunction): + cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil + cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil + cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil + + +cdef class CyHalfGammaLoss(CyLossFunction): + cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil + cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil + cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil + + +cdef class CyHalfTweedieLoss(CyLossFunction): + cdef readonly double power # readonly makes it accessible from Python + cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil + cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil + cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil + + +cdef class CyHalfTweedieLossIdentity(CyLossFunction): + cdef readonly double power # readonly makes it accessible from Python + cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil + cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil + cdef double_pair cy_grad_hess(self, double y_true, double 
raw_prediction) noexcept nogil + + +cdef class CyHalfBinomialLoss(CyLossFunction): + cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil + cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil + cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil + + +cdef class CyExponentialLoss(CyLossFunction): + cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil + cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil + cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil + + +cdef class CyHalfMultinomialLoss(): + cdef void cy_gradient( + self, + const floating_in y_true, + const floating_in[::1] raw_prediction, + const floating_in sample_weight, + floating_out[::1] gradient_out, + ) noexcept nogil diff --git a/.venv/lib/python3.12/site-packages/sklearn/_loss/_loss.pyx.tp b/.venv/lib/python3.12/site-packages/sklearn/_loss/_loss.pyx.tp new file mode 100644 index 0000000000000000000000000000000000000000..44d5acd530a7f60ac6e08174c5e5197f3fb00735 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/_loss/_loss.pyx.tp @@ -0,0 +1,1505 @@ +{{py: + +""" +Template file to easily generate loops over samples using Tempita +(https://github.com/cython/cython/blob/master/Cython/Tempita/_tempita.py). + +Generated file: _loss.pyx + +Each loss class is generated by a cdef functions on single samples. +The keywords between double braces are substituted during the build. +""" + +doc_HalfSquaredError = ( + """Half Squared Error with identity link. + + Domain: + y_true and y_pred all real numbers + + Link: + y_pred = raw_prediction + """ +) + +doc_AbsoluteError = ( + """Absolute Error with identity link. + + Domain: + y_true and y_pred all real numbers + + Link: + y_pred = raw_prediction + """ +) + +doc_PinballLoss = ( + """Quantile Loss aka Pinball Loss with identity link. + + Domain: + y_true and y_pred all real numbers + quantile in (0, 1) + + Link: + y_pred = raw_prediction + + Note: 2 * cPinballLoss(quantile=0.5) equals cAbsoluteError() + """ +) + +doc_HuberLoss = ( + """Huber Loss with identity link. + + Domain: + y_true and y_pred all real numbers + delta in positive real numbers + + Link: + y_pred = raw_prediction + """ +) + +doc_HalfPoissonLoss = ( + """Half Poisson deviance loss with log-link. + + Domain: + y_true in non-negative real numbers + y_pred in positive real numbers + + Link: + y_pred = exp(raw_prediction) + + Half Poisson deviance with log-link is + y_true * log(y_true/y_pred) + y_pred - y_true + = y_true * log(y_true) - y_true * raw_prediction + + exp(raw_prediction) - y_true + + Dropping constant terms, this gives: + exp(raw_prediction) - y_true * raw_prediction + """ +) + +doc_HalfGammaLoss = ( + """Half Gamma deviance loss with log-link. + + Domain: + y_true and y_pred in positive real numbers + + Link: + y_pred = exp(raw_prediction) + + Half Gamma deviance with log-link is + log(y_pred/y_true) + y_true/y_pred - 1 + = raw_prediction - log(y_true) + y_true * exp(-raw_prediction) - 1 + + Dropping constant terms, this gives: + raw_prediction + y_true * exp(-raw_prediction) + """ +) + +doc_HalfTweedieLoss = ( + """Half Tweedie deviance loss with log-link. 
+ + Domain: + y_true in real numbers if p <= 0 + y_true in non-negative real numbers if 0 < p < 2 + y_true in positive real numbers if p >= 2 + y_pred and power in positive real numbers + + Link: + y_pred = exp(raw_prediction) + + Half Tweedie deviance with log-link and p=power is + max(y_true, 0)**(2-p) / (1-p) / (2-p) + - y_true * y_pred**(1-p) / (1-p) + + y_pred**(2-p) / (2-p) + = max(y_true, 0)**(2-p) / (1-p) / (2-p) + - y_true * exp((1-p) * raw_prediction) / (1-p) + + exp((2-p) * raw_prediction) / (2-p) + + Dropping constant terms, this gives: + exp((2-p) * raw_prediction) / (2-p) + - y_true * exp((1-p) * raw_prediction) / (1-p) + + Notes: + - Poisson with p=1 and Gamma with p=2 have different terms dropped such + that cHalfTweedieLoss is not continuous in p=power at p=1 and p=2. + - While the Tweedie distribution only exists for p<=0 or p>=1, the range + 0= 2 + y_pred and power in positive real numbers, y_pred may be negative for p=0. + + Link: + y_pred = raw_prediction + + Half Tweedie deviance with identity link and p=power is + max(y_true, 0)**(2-p) / (1-p) / (2-p) + - y_true * y_pred**(1-p) / (1-p) + + y_pred**(2-p) / (2-p) + + Notes: + - Here, we do not drop constant terms in contrast to the version with log-link. + """ +) + +doc_HalfBinomialLoss = ( + """Half Binomial deviance loss with logit link. + + Domain: + y_true in [0, 1] + y_pred in (0, 1), i.e. boundaries excluded + + Link: + y_pred = expit(raw_prediction) + """ +) + +doc_ExponentialLoss = ( + """"Exponential loss with (half) logit link + + Domain: + y_true in [0, 1] + y_pred in (0, 1), i.e. boundaries excluded + + Link: + y_pred = expit(2 * raw_prediction) + """ +) + +# loss class name, docstring, param, +# cy_loss, cy_loss_grad, +# cy_grad, cy_grad_hess, +class_list = [ + ("CyHalfSquaredError", doc_HalfSquaredError, None, + "closs_half_squared_error", None, + "cgradient_half_squared_error", "cgrad_hess_half_squared_error"), + ("CyAbsoluteError", doc_AbsoluteError, None, + "closs_absolute_error", None, + "cgradient_absolute_error", "cgrad_hess_absolute_error"), + ("CyPinballLoss", doc_PinballLoss, "quantile", + "closs_pinball_loss", None, + "cgradient_pinball_loss", "cgrad_hess_pinball_loss"), + ("CyHuberLoss", doc_HuberLoss, "delta", + "closs_huber_loss", None, + "cgradient_huber_loss", "cgrad_hess_huber_loss"), + ("CyHalfPoissonLoss", doc_HalfPoissonLoss, None, + "closs_half_poisson", "closs_grad_half_poisson", + "cgradient_half_poisson", "cgrad_hess_half_poisson"), + ("CyHalfGammaLoss", doc_HalfGammaLoss, None, + "closs_half_gamma", "closs_grad_half_gamma", + "cgradient_half_gamma", "cgrad_hess_half_gamma"), + ("CyHalfTweedieLoss", doc_HalfTweedieLoss, "power", + "closs_half_tweedie", "closs_grad_half_tweedie", + "cgradient_half_tweedie", "cgrad_hess_half_tweedie"), + ("CyHalfTweedieLossIdentity", doc_HalfTweedieLossIdentity, "power", + "closs_half_tweedie_identity", "closs_grad_half_tweedie_identity", + "cgradient_half_tweedie_identity", "cgrad_hess_half_tweedie_identity"), + ("CyHalfBinomialLoss", doc_HalfBinomialLoss, None, + "closs_half_binomial", "closs_grad_half_binomial", + "cgradient_half_binomial", "cgrad_hess_half_binomial"), + ("CyExponentialLoss", doc_ExponentialLoss, None, + "closs_exponential", "closs_grad_exponential", + "cgradient_exponential", "cgrad_hess_exponential"), +] +}} + +# Design: +# See https://github.com/scikit-learn/scikit-learn/issues/15123 for reasons. +# a) Merge link functions into loss functions for speed and numerical +# stability, i.e. 
use raw_prediction instead of y_pred in signature. +# b) Pure C functions (nogil) calculate single points (single sample) +# c) Wrap C functions in a loop to get Python functions operating on ndarrays. +# - Write loops manually---use Tempita for this. +# Reason: There is still some performance overhead when using a wrapper +# function "wrap" that carries out the loop and gets as argument a function +# pointer to one of the C functions from b), e.g. +# wrap(closs_half_poisson, y_true, ...) +# - Pass n_threads as argument to prange and propagate option to all callers. +# d) Provide classes (Cython extension types) per loss (names start with Cy) in +# order to have semantical structured objects. +# - Member functions for single points just call the C function from b). +# These are used e.g. in SGD `_plain_sgd`. +# - Member functions operating on ndarrays, see c), looping over calls to C +# functions from b). +# e) Provide convenience Python classes that compose from these extension types +# elsewhere (see loss.py) +# - Example: loss.gradient calls CyLoss.gradient but does some input +# checking like None -> np.empty(). +# +# Note: We require 1-dim ndarrays to be contiguous. + +from cython.parallel import parallel, prange +import numpy as np + +from libc.math cimport exp, fabs, log, log1p, pow +from libc.stdlib cimport malloc, free + + +# ------------------------------------- +# Helper functions +# ------------------------------------- +# Numerically stable version of log(1 + exp(x)) for double precision, see Eq. (10) of +# https://cran.r-project.org/web/packages/Rmpfr/vignettes/log1mexp-note.pdf +# Note: The only important cutoff is at x = 18. All others are to save computation +# time. Compared to the reference, we add the additional case distinction x <= -2 in +# order to use log instead of log1p for improved performance. As with the other +# cutoffs, this is accurate within machine precision of double. +cdef inline double log1pexp(double x) noexcept nogil: + if x <= -37: + return exp(x) + elif x <= -2: + return log1p(exp(x)) + elif x <= 18: + return log(1. + exp(x)) + elif x <= 33.3: + return x + exp(-x) + else: + return x + + +cdef inline double_pair sum_exp_minus_max( + const int i, + const floating_in[:, :] raw_prediction, # IN + floating_out *p # OUT +) noexcept nogil: + # Thread local buffers are used to store part of the results via p. + # The results are stored as follows: + # p[k] = exp(raw_prediction_i_k - max_value) for k = 0 to n_classes-1 + # return.val1 = max_value = max(raw_prediction_i_k, k = 0 to n_classes-1) + # return.val2 = sum_exps = sum(p[k], k = 0 to n_classes-1) = sum of exponentials + # len(p) must be n_classes + # Notes: + # - We return the max value and sum of exps (stored in p) as a double_pair. + # - i needs to be passed (and stays constant) because otherwise Cython does + # not generate optimal code, see + # https://github.com/scikit-learn/scikit-learn/issues/17299 + # - We do not normalize p by calculating p[k] = p[k] / sum_exps. + # This helps to save one loop over k. 
+ cdef: + int k + int n_classes = raw_prediction.shape[1] + double_pair max_value_and_sum_exps # val1 = max_value, val2 = sum_exps + + max_value_and_sum_exps.val1 = raw_prediction[i, 0] + max_value_and_sum_exps.val2 = 0 + for k in range(1, n_classes): + # Compute max value of array for numerical stability + if max_value_and_sum_exps.val1 < raw_prediction[i, k]: + max_value_and_sum_exps.val1 = raw_prediction[i, k] + + for k in range(n_classes): + p[k] = exp(raw_prediction[i, k] - max_value_and_sum_exps.val1) + max_value_and_sum_exps.val2 += p[k] + + return max_value_and_sum_exps + + +# ------------------------------------- +# Single point inline C functions +# ------------------------------------- +# Half Squared Error +cdef inline double closs_half_squared_error( + double y_true, + double raw_prediction +) noexcept nogil: + return 0.5 * (raw_prediction - y_true) * (raw_prediction - y_true) + + +cdef inline double cgradient_half_squared_error( + double y_true, + double raw_prediction +) noexcept nogil: + return raw_prediction - y_true + + +cdef inline double_pair cgrad_hess_half_squared_error( + double y_true, + double raw_prediction +) noexcept nogil: + cdef double_pair gh + gh.val1 = raw_prediction - y_true # gradient + gh.val2 = 1. # hessian + return gh + + +# Absolute Error +cdef inline double closs_absolute_error( + double y_true, + double raw_prediction +) noexcept nogil: + return fabs(raw_prediction - y_true) + + +cdef inline double cgradient_absolute_error( + double y_true, + double raw_prediction +) noexcept nogil: + return 1. if raw_prediction > y_true else -1. + + +cdef inline double_pair cgrad_hess_absolute_error( + double y_true, + double raw_prediction +) noexcept nogil: + cdef double_pair gh + # Note that exact hessian = 0 almost everywhere. Optimization routines like + # in HGBT, however, need a hessian > 0. Therefore, we assign 1. + gh.val1 = 1. if raw_prediction > y_true else -1. # gradient + gh.val2 = 1. # hessian + return gh + + +# Quantile Loss / Pinball Loss +cdef inline double closs_pinball_loss( + double y_true, + double raw_prediction, + double quantile +) noexcept nogil: + return (quantile * (y_true - raw_prediction) if y_true >= raw_prediction + else (1. - quantile) * (raw_prediction - y_true)) + + +cdef inline double cgradient_pinball_loss( + double y_true, + double raw_prediction, + double quantile +) noexcept nogil: + return -quantile if y_true >=raw_prediction else 1. - quantile + + +cdef inline double_pair cgrad_hess_pinball_loss( + double y_true, + double raw_prediction, + double quantile +) noexcept nogil: + cdef double_pair gh + # Note that exact hessian = 0 almost everywhere. Optimization routines like + # in HGBT, however, need a hessian > 0. Therefore, we assign 1. + gh.val1 = -quantile if y_true >=raw_prediction else 1. - quantile # gradient + gh.val2 = 1. 
# hessian + return gh + + +# Huber Loss +cdef inline double closs_huber_loss( + double y_true, + double raw_prediction, + double delta, +) noexcept nogil: + cdef double abserr = fabs(y_true - raw_prediction) + if abserr <= delta: + return 0.5 * abserr**2 + else: + return delta * (abserr - 0.5 * delta) + + +cdef inline double cgradient_huber_loss( + double y_true, + double raw_prediction, + double delta, +) noexcept nogil: + cdef double res = raw_prediction - y_true + if fabs(res) <= delta: + return res + else: + return delta if res >=0 else -delta + + +cdef inline double_pair cgrad_hess_huber_loss( + double y_true, + double raw_prediction, + double delta, +) noexcept nogil: + cdef double_pair gh + gh.val2 = raw_prediction - y_true # used as temporary + if fabs(gh.val2) <= delta: + gh.val1 = gh.val2 # gradient + gh.val2 = 1 # hessian + else: + gh.val1 = delta if gh.val2 >=0 else -delta # gradient + gh.val2 = 0 # hessian + return gh + + +# Half Poisson Deviance with Log-Link, dropping constant terms +cdef inline double closs_half_poisson( + double y_true, + double raw_prediction +) noexcept nogil: + return exp(raw_prediction) - y_true * raw_prediction + + +cdef inline double cgradient_half_poisson( + double y_true, + double raw_prediction +) noexcept nogil: + # y_pred - y_true + return exp(raw_prediction) - y_true + + +cdef inline double_pair closs_grad_half_poisson( + double y_true, + double raw_prediction +) noexcept nogil: + cdef double_pair lg + lg.val2 = exp(raw_prediction) # used as temporary + lg.val1 = lg.val2 - y_true * raw_prediction # loss + lg.val2 -= y_true # gradient + return lg + + +cdef inline double_pair cgrad_hess_half_poisson( + double y_true, + double raw_prediction +) noexcept nogil: + cdef double_pair gh + gh.val2 = exp(raw_prediction) # hessian + gh.val1 = gh.val2 - y_true # gradient + return gh + + +# Half Gamma Deviance with Log-Link, dropping constant terms +cdef inline double closs_half_gamma( + double y_true, + double raw_prediction +) noexcept nogil: + return raw_prediction + y_true * exp(-raw_prediction) + + +cdef inline double cgradient_half_gamma( + double y_true, + double raw_prediction +) noexcept nogil: + return 1. - y_true * exp(-raw_prediction) + + +cdef inline double_pair closs_grad_half_gamma( + double y_true, + double raw_prediction +) noexcept nogil: + cdef double_pair lg + lg.val2 = exp(-raw_prediction) # used as temporary + lg.val1 = raw_prediction + y_true * lg.val2 # loss + lg.val2 = 1. - y_true * lg.val2 # gradient + return lg + + +cdef inline double_pair cgrad_hess_half_gamma( + double y_true, + double raw_prediction +) noexcept nogil: + cdef double_pair gh + gh.val2 = exp(-raw_prediction) # used as temporary + gh.val1 = 1. - y_true * gh.val2 # gradient + gh.val2 *= y_true # hessian + return gh + + +# Half Tweedie Deviance with Log-Link, dropping constant terms +# Note that by dropping constants this is no longer continuous in parameter power. +cdef inline double closs_half_tweedie( + double y_true, + double raw_prediction, + double power +) noexcept nogil: + if power == 0.: + return closs_half_squared_error(y_true, exp(raw_prediction)) + elif power == 1.: + return closs_half_poisson(y_true, raw_prediction) + elif power == 2.: + return closs_half_gamma(y_true, raw_prediction) + else: + return (exp((2. - power) * raw_prediction) / (2. - power) + - y_true * exp((1. - power) * raw_prediction) / (1. 
- power)) + + +cdef inline double cgradient_half_tweedie( + double y_true, + double raw_prediction, + double power +) noexcept nogil: + cdef double exp1 + if power == 0.: + exp1 = exp(raw_prediction) + return exp1 * (exp1 - y_true) + elif power == 1.: + return cgradient_half_poisson(y_true, raw_prediction) + elif power == 2.: + return cgradient_half_gamma(y_true, raw_prediction) + else: + return (exp((2. - power) * raw_prediction) + - y_true * exp((1. - power) * raw_prediction)) + + +cdef inline double_pair closs_grad_half_tweedie( + double y_true, + double raw_prediction, + double power +) noexcept nogil: + cdef double_pair lg + cdef double exp1, exp2 + if power == 0.: + exp1 = exp(raw_prediction) + lg.val1 = closs_half_squared_error(y_true, exp1) # loss + lg.val2 = exp1 * (exp1 - y_true) # gradient + elif power == 1.: + return closs_grad_half_poisson(y_true, raw_prediction) + elif power == 2.: + return closs_grad_half_gamma(y_true, raw_prediction) + else: + exp1 = exp((1. - power) * raw_prediction) + exp2 = exp((2. - power) * raw_prediction) + lg.val1 = exp2 / (2. - power) - y_true * exp1 / (1. - power) # loss + lg.val2 = exp2 - y_true * exp1 # gradient + return lg + + +cdef inline double_pair cgrad_hess_half_tweedie( + double y_true, + double raw_prediction, + double power +) noexcept nogil: + cdef double_pair gh + cdef double exp1, exp2 + if power == 0.: + exp1 = exp(raw_prediction) + gh.val1 = exp1 * (exp1 - y_true) # gradient + gh.val2 = exp1 * (2 * exp1 - y_true) # hessian + elif power == 1.: + return cgrad_hess_half_poisson(y_true, raw_prediction) + elif power == 2.: + return cgrad_hess_half_gamma(y_true, raw_prediction) + else: + exp1 = exp((1. - power) * raw_prediction) + exp2 = exp((2. - power) * raw_prediction) + gh.val1 = exp2 - y_true * exp1 # gradient + gh.val2 = (2. - power) * exp2 - (1. - power) * y_true * exp1 # hessian + return gh + + +# Half Tweedie Deviance with identity link, without dropping constant terms! +# Therefore, best loss value is zero. +cdef inline double closs_half_tweedie_identity( + double y_true, + double raw_prediction, + double power +) noexcept nogil: + cdef double tmp + if power == 0.: + return closs_half_squared_error(y_true, raw_prediction) + elif power == 1.: + if y_true == 0: + return raw_prediction + else: + return y_true * log(y_true/raw_prediction) + raw_prediction - y_true + elif power == 2.: + return log(raw_prediction/y_true) + y_true/raw_prediction - 1. + else: + tmp = pow(raw_prediction, 1. - power) + tmp = raw_prediction * tmp / (2. - power) - y_true * tmp / (1. - power) + if y_true > 0: + tmp += pow(y_true, 2. - power) / ((1. - power) * (2. - power)) + return tmp + + +cdef inline double cgradient_half_tweedie_identity( + double y_true, + double raw_prediction, + double power +) noexcept nogil: + if power == 0.: + return raw_prediction - y_true + elif power == 1.: + return 1. - y_true / raw_prediction + elif power == 2.: + return (raw_prediction - y_true) / (raw_prediction * raw_prediction) + else: + return pow(raw_prediction, -power) * (raw_prediction - y_true) + + +cdef inline double_pair closs_grad_half_tweedie_identity( + double y_true, + double raw_prediction, + double power +) noexcept nogil: + cdef double_pair lg + cdef double tmp + if power == 0.: + lg.val2 = raw_prediction - y_true # gradient + lg.val1 = 0.5 * lg.val2 * lg.val2 # loss + elif power == 1.: + if y_true == 0: + lg.val1 = raw_prediction + else: + lg.val1 = (y_true * log(y_true/raw_prediction) # loss + + raw_prediction - y_true) + lg.val2 = 1. 
- y_true / raw_prediction # gradient + elif power == 2.: + lg.val1 = log(raw_prediction/y_true) + y_true/raw_prediction - 1. # loss + tmp = raw_prediction * raw_prediction + lg.val2 = (raw_prediction - y_true) / tmp # gradient + else: + tmp = pow(raw_prediction, 1. - power) + lg.val1 = (raw_prediction * tmp / (2. - power) # loss + - y_true * tmp / (1. - power)) + if y_true > 0: + lg.val1 += (pow(y_true, 2. - power) + / ((1. - power) * (2. - power))) + lg.val2 = tmp * (1. - y_true / raw_prediction) # gradient + return lg + + +cdef inline double_pair cgrad_hess_half_tweedie_identity( + double y_true, + double raw_prediction, + double power +) noexcept nogil: + cdef double_pair gh + cdef double tmp + if power == 0.: + gh.val1 = raw_prediction - y_true # gradient + gh.val2 = 1. # hessian + elif power == 1.: + gh.val1 = 1. - y_true / raw_prediction # gradient + gh.val2 = y_true / (raw_prediction * raw_prediction) # hessian + elif power == 2.: + tmp = raw_prediction * raw_prediction + gh.val1 = (raw_prediction - y_true) / tmp # gradient + gh.val2 = (-1. + 2. * y_true / raw_prediction) / tmp # hessian + else: + tmp = pow(raw_prediction, -power) + gh.val1 = tmp * (raw_prediction - y_true) # gradient + gh.val2 = tmp * ((1. - power) + power * y_true / raw_prediction) # hessian + return gh + + +# Half Binomial deviance with logit-link, aka log-loss or binary cross entropy +cdef inline double closs_half_binomial( + double y_true, + double raw_prediction +) noexcept nogil: + # log1p(exp(raw_prediction)) - y_true * raw_prediction + return log1pexp(raw_prediction) - y_true * raw_prediction + + +cdef inline double cgradient_half_binomial( + double y_true, + double raw_prediction +) noexcept nogil: + # gradient = y_pred - y_true = expit(raw_prediction) - y_true + # Numerically more stable, see http://fa.bianp.net/blog/2019/evaluate_logistic/ + # if raw_prediction < 0: + # exp_tmp = exp(raw_prediction) + # return ((1 - y_true) * exp_tmp - y_true) / (1 + exp_tmp) + # else: + # exp_tmp = exp(-raw_prediction) + # return ((1 - y_true) - y_true * exp_tmp) / (1 + exp_tmp) + # Note that optimal speed would be achieved, at the cost of precision, by + # return expit(raw_prediction) - y_true + # i.e. no "if else" and an own inline implementation of expit instead of + # from scipy.special.cython_special cimport expit + # The case distinction raw_prediction < 0 in the stable implementation does not + # provide significant better precision apart from protecting overflow of exp(..). + # The branch (if else), however, can incur runtime costs of up to 30%. + # Instead, we help branch prediction by almost always ending in the first if clause + # and making the second branch (else) a bit simpler. This has the exact same + # precision but is faster than the stable implementation. + # As branching criteria, we use the same cutoff as in log1pexp. Note that the + # maximal value to get gradient = -1 with y_true = 1 is -37.439198610162731 + # (based on mpmath), and scipy.special.logit(np.finfo(float).eps) ~ -36.04365. + cdef double exp_tmp + if raw_prediction > -37: + exp_tmp = exp(-raw_prediction) + return ((1 - y_true) - y_true * exp_tmp) / (1 + exp_tmp) + else: + # expit(raw_prediction) = exp(raw_prediction) for raw_prediction <= -37 + return exp(raw_prediction) - y_true + + +cdef inline double_pair closs_grad_half_binomial( + double y_true, + double raw_prediction +) noexcept nogil: + cdef double_pair lg + # Same if else conditions as in log1pexp. 
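For comparison, an unbranched reference for the same loss and gradient can be written with NumPy/SciPy primitives; this sketch trades the branch tuning discussed above for brevity and is not the code used here:

import numpy as np
from scipy.special import expit  # logistic sigmoid

def half_binomial_loss_gradient_reference(y_true, raw_prediction):
    # log1pexp(x) == np.logaddexp(0, x); the gradient is y_pred - y_true.
    loss = np.logaddexp(0.0, raw_prediction) - y_true * raw_prediction
    gradient = expit(raw_prediction) - y_true
    return loss, gradient

y = np.array([0.0, 1.0, 1.0])
raw = np.array([-40.0, 0.0, 25.0])
loss, grad = half_binomial_loss_gradient_reference(y, raw)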
+ if raw_prediction <= -37: + lg.val2 = exp(raw_prediction) # used as temporary + lg.val1 = lg.val2 - y_true * raw_prediction # loss + lg.val2 -= y_true # gradient + elif raw_prediction <= -2: + lg.val2 = exp(raw_prediction) # used as temporary + lg.val1 = log1p(lg.val2) - y_true * raw_prediction # loss + lg.val2 = ((1 - y_true) * lg.val2 - y_true) / (1 + lg.val2) # gradient + elif raw_prediction <= 18: + lg.val2 = exp(-raw_prediction) # used as temporary + # log1p(exp(x)) = log(1 + exp(x)) = x + log1p(exp(-x)) + lg.val1 = log1p(lg.val2) + (1 - y_true) * raw_prediction # loss + lg.val2 = ((1 - y_true) - y_true * lg.val2) / (1 + lg.val2) # gradient + else: + lg.val2 = exp(-raw_prediction) # used as temporary + lg.val1 = lg.val2 + (1 - y_true) * raw_prediction # loss + lg.val2 = ((1 - y_true) - y_true * lg.val2) / (1 + lg.val2) # gradient + return lg + + +cdef inline double_pair cgrad_hess_half_binomial( + double y_true, + double raw_prediction +) noexcept nogil: + # with y_pred = expit(raw) + # hessian = y_pred * (1 - y_pred) = exp( raw) / (1 + exp( raw))**2 + # = exp(-raw) / (1 + exp(-raw))**2 + cdef double_pair gh + # See comment in cgradient_half_binomial. + if raw_prediction > -37: + gh.val2 = exp(-raw_prediction) # used as temporary + gh.val1 = ((1 - y_true) - y_true * gh.val2) / (1 + gh.val2) # gradient + gh.val2 = gh.val2 / (1 + gh.val2)**2 # hessian + else: + gh.val2 = exp(raw_prediction) # = 1. order Taylor in exp(raw_prediction) + gh.val1 = gh.val2 - y_true + return gh + + +# Exponential loss with (half) logit-link, aka boosting loss +cdef inline double closs_exponential( + double y_true, + double raw_prediction +) noexcept nogil: + cdef double tmp = exp(raw_prediction) + return y_true / tmp + (1 - y_true) * tmp + + +cdef inline double cgradient_exponential( + double y_true, + double raw_prediction +) noexcept nogil: + cdef double tmp = exp(raw_prediction) + return -y_true / tmp + (1 - y_true) * tmp + + +cdef inline double_pair closs_grad_exponential( + double y_true, + double raw_prediction +) noexcept nogil: + cdef double_pair lg + lg.val2 = exp(raw_prediction) # used as temporary + + lg.val1 = y_true / lg.val2 + (1 - y_true) * lg.val2 # loss + lg.val2 = -y_true / lg.val2 + (1 - y_true) * lg.val2 # gradient + return lg + + +cdef inline double_pair cgrad_hess_exponential( + double y_true, + double raw_prediction +) noexcept nogil: + # Note that hessian = loss + cdef double_pair gh + gh.val2 = exp(raw_prediction) # used as temporary + + gh.val1 = -y_true / gh.val2 + (1 - y_true) * gh.val2 # gradient + gh.val2 = y_true / gh.val2 + (1 - y_true) * gh.val2 # hessian + return gh + + +# --------------------------------------------------- +# Extension Types for Loss Functions of 1-dim targets +# --------------------------------------------------- +cdef class CyLossFunction: + """Base class for convex loss functions.""" + + def __reduce__(self): + return (self.__class__, ()) + + cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil: + """Compute the loss for a single sample. + + Parameters + ---------- + y_true : double + Observed, true target value. + raw_prediction : double + Raw prediction value (in link space). + + Returns + ------- + double + The loss evaluated at `y_true` and `raw_prediction`. + """ + pass + + cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil: + """Compute gradient of loss w.r.t. raw_prediction for a single sample. + + Parameters + ---------- + y_true : double + Observed, true target value. 
+ raw_prediction : double + Raw prediction value (in link space). + + Returns + ------- + double + The derivative of the loss function w.r.t. `raw_prediction`. + """ + pass + + cdef double_pair cy_grad_hess( + self, double y_true, double raw_prediction + ) noexcept nogil: + """Compute gradient and hessian. + + Gradient and hessian of loss w.r.t. raw_prediction for a single sample. + + This is usually diagonal in raw_prediction_i and raw_prediction_j. + Therefore, we return the diagonal element i=j. + + For a loss with a non-canonical link, this might implement the diagonal + of the Fisher matrix (=expected hessian) instead of the hessian. + + Parameters + ---------- + y_true : double + Observed, true target value. + raw_prediction : double + Raw prediction value (in link space). + + Returns + ------- + double_pair + Gradient and hessian of the loss function w.r.t. `raw_prediction`. + """ + pass + + def loss( + self, + const floating_in[::1] y_true, # IN + const floating_in[::1] raw_prediction, # IN + const floating_in[::1] sample_weight, # IN + floating_out[::1] loss_out, # OUT + int n_threads=1 + ): + """Compute the point-wise loss value for each input. + + The point-wise loss is written to `loss_out` and no array is returned. + + Parameters + ---------- + y_true : array of shape (n_samples,) + Observed, true target values. + raw_prediction : array of shape (n_samples,) + Raw prediction values (in link space). + sample_weight : array of shape (n_samples,) or None + Sample weights. + loss_out : array of shape (n_samples,) + A location into which the result is stored. + n_threads : int + Number of threads used by OpenMP (if any). + """ + pass + + def gradient( + self, + const floating_in[::1] y_true, # IN + const floating_in[::1] raw_prediction, # IN + const floating_in[::1] sample_weight, # IN + floating_out[::1] gradient_out, # OUT + int n_threads=1 + ): + """Compute gradient of loss w.r.t raw_prediction for each input. + + The gradient is written to `gradient_out` and no array is returned. + + Parameters + ---------- + y_true : array of shape (n_samples,) + Observed, true target values. + raw_prediction : array of shape (n_samples,) + Raw prediction values (in link space). + sample_weight : array of shape (n_samples,) or None + Sample weights. + gradient_out : array of shape (n_samples,) + A location into which the result is stored. + n_threads : int + Number of threads used by OpenMP (if any). + """ + pass + + def loss_gradient( + self, + const floating_in[::1] y_true, # IN + const floating_in[::1] raw_prediction, # IN + const floating_in[::1] sample_weight, # IN + floating_out[::1] loss_out, # OUT + floating_out[::1] gradient_out, # OUT + int n_threads=1 + ): + """Compute loss and gradient of loss w.r.t raw_prediction. + + The loss and gradient are written to `loss_out` and `gradient_out` and no arrays + are returned. + + Parameters + ---------- + y_true : array of shape (n_samples,) + Observed, true target values. + raw_prediction : array of shape (n_samples,) + Raw prediction values (in link space). + sample_weight : array of shape (n_samples,) or None + Sample weights. + loss_out : array of shape (n_samples,) or None + A location into which the element-wise loss is stored. + gradient_out : array of shape (n_samples,) + A location into which the gradient is stored. + n_threads : int + Number of threads used by OpenMP (if any). 
+ """ + self.loss(y_true, raw_prediction, sample_weight, loss_out, n_threads) + self.gradient(y_true, raw_prediction, sample_weight, gradient_out, n_threads) + + def gradient_hessian( + self, + const floating_in[::1] y_true, # IN + const floating_in[::1] raw_prediction, # IN + const floating_in[::1] sample_weight, # IN + floating_out[::1] gradient_out, # OUT + floating_out[::1] hessian_out, # OUT + int n_threads=1 + ): + """Compute gradient and hessian of loss w.r.t raw_prediction. + + The gradient and hessian are written to `gradient_out` and `hessian_out` and no + arrays are returned. + + Parameters + ---------- + y_true : array of shape (n_samples,) + Observed, true target values. + raw_prediction : array of shape (n_samples,) + Raw prediction values (in link space). + sample_weight : array of shape (n_samples,) or None + Sample weights. + gradient_out : array of shape (n_samples,) + A location into which the gradient is stored. + hessian_out : array of shape (n_samples,) + A location into which the hessian is stored. + n_threads : int + Number of threads used by OpenMP (if any). + """ + pass + + +{{for name, docstring, param, closs, closs_grad, cgrad, cgrad_hess, in class_list}} +{{py: +if param is None: + with_param = "" +else: + with_param = ", self." + param +}} + +cdef class {{name}}(CyLossFunction): + """{{docstring}}""" + + {{if param is not None}} + def __init__(self, {{param}}): + self.{{param}} = {{param}} + {{endif}} + + {{if param is not None}} + def __reduce__(self): + return (self.__class__, (self.{{param}},)) + {{endif}} + + cdef inline double cy_loss(self, double y_true, double raw_prediction) noexcept nogil: + return {{closs}}(y_true, raw_prediction{{with_param}}) + + cdef inline double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil: + return {{cgrad}}(y_true, raw_prediction{{with_param}}) + + cdef inline double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil: + return {{cgrad_hess}}(y_true, raw_prediction{{with_param}}) + + def loss( + self, + const floating_in[::1] y_true, # IN + const floating_in[::1] raw_prediction, # IN + const floating_in[::1] sample_weight, # IN + floating_out[::1] loss_out, # OUT + int n_threads=1 + ): + cdef: + int i + int n_samples = y_true.shape[0] + + if sample_weight is None: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + loss_out[i] = {{closs}}(y_true[i], raw_prediction[i]{{with_param}}) + else: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + loss_out[i] = sample_weight[i] * {{closs}}(y_true[i], raw_prediction[i]{{with_param}}) + + {{if closs_grad is not None}} + def loss_gradient( + self, + const floating_in[::1] y_true, # IN + const floating_in[::1] raw_prediction, # IN + const floating_in[::1] sample_weight, # IN + floating_out[::1] loss_out, # OUT + floating_out[::1] gradient_out, # OUT + int n_threads=1 + ): + cdef: + int i + int n_samples = y_true.shape[0] + double_pair dbl2 + + if sample_weight is None: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + dbl2 = {{closs_grad}}(y_true[i], raw_prediction[i]{{with_param}}) + loss_out[i] = dbl2.val1 + gradient_out[i] = dbl2.val2 + else: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + dbl2 = {{closs_grad}}(y_true[i], raw_prediction[i]{{with_param}}) + loss_out[i] = sample_weight[i] * dbl2.val1 + gradient_out[i] = sample_weight[i] * dbl2.val2 + + {{endif}} + + def 
gradient( + self, + const floating_in[::1] y_true, # IN + const floating_in[::1] raw_prediction, # IN + const floating_in[::1] sample_weight, # IN + floating_out[::1] gradient_out, # OUT + int n_threads=1 + ): + cdef: + int i + int n_samples = y_true.shape[0] + + if sample_weight is None: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + gradient_out[i] = {{cgrad}}(y_true[i], raw_prediction[i]{{with_param}}) + else: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + gradient_out[i] = sample_weight[i] * {{cgrad}}(y_true[i], raw_prediction[i]{{with_param}}) + + def gradient_hessian( + self, + const floating_in[::1] y_true, # IN + const floating_in[::1] raw_prediction, # IN + const floating_in[::1] sample_weight, # IN + floating_out[::1] gradient_out, # OUT + floating_out[::1] hessian_out, # OUT + int n_threads=1 + ): + cdef: + int i + int n_samples = y_true.shape[0] + double_pair dbl2 + + if sample_weight is None: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + dbl2 = {{cgrad_hess}}(y_true[i], raw_prediction[i]{{with_param}}) + gradient_out[i] = dbl2.val1 + hessian_out[i] = dbl2.val2 + else: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + dbl2 = {{cgrad_hess}}(y_true[i], raw_prediction[i]{{with_param}}) + gradient_out[i] = sample_weight[i] * dbl2.val1 + hessian_out[i] = sample_weight[i] * dbl2.val2 + +{{endfor}} + + +# The multinomial deviance loss is also known as categorical cross-entropy or +# multinomial log-likelihood. +# Here, we do not inherit from CyLossFunction as its cy_gradient method deviates +# from the API. +cdef class CyHalfMultinomialLoss(): + """Half Multinomial deviance loss with multinomial logit link. + + Domain: + y_true in {0, 1, 2, 3, .., n_classes - 1} + y_pred in (0, 1)**n_classes, i.e. interval with boundaries excluded + + Link: + y_pred = softmax(raw_prediction) + + Note: Label encoding is built-in, i.e. {0, 1, 2, 3, .., n_classes - 1} is + mapped to (y_true == k) for k = 0 .. n_classes - 1 which is either 0 or 1. + """ + + # Here we deviate from the CyLossFunction API. SAG/SAGA needs direct access to + # sample-wise gradients which we provide here. + cdef inline void cy_gradient( + self, + const floating_in y_true, + const floating_in[::1] raw_prediction, # IN + const floating_in sample_weight, + floating_out[::1] gradient_out, # OUT + ) noexcept nogil: + """Compute gradient of loss w.r.t. `raw_prediction` for a single sample. + + The gradient of the multinomial logistic loss with respect to a class k, + and for one sample is: + grad_k = - sw * (p[k] - (y==k)) + + where: + p[k] = proba[k] = exp(raw_prediction[k] - logsumexp(raw_prediction)) + sw = sample_weight + + Parameters + ---------- + y_true : double + Observed, true target value. + raw_prediction : array of shape (n_classes,) + Raw prediction values (in link space). + sample_weight : double + Sample weight. + gradient_out : array of shape (n_classs,) + A location into which the gradient is stored. + + Returns + ------- + gradient : double + The derivative of the loss function w.r.t. `raw_prediction`. 
+ """ + cdef: + int k + int n_classes = raw_prediction.shape[0] + double_pair max_value_and_sum_exps + const floating_in[:, :] raw = raw_prediction[None, :] + + max_value_and_sum_exps = sum_exp_minus_max(0, raw, &gradient_out[0]) + for k in range(n_classes): + # gradient_out[k] = p_k = y_pred_k = prob of class k + gradient_out[k] /= max_value_and_sum_exps.val2 + # gradient_k = (p_k - (y_true == k)) * sw + gradient_out[k] = (gradient_out[k] - (y_true == k)) * sample_weight + + def _test_cy_gradient( + self, + const floating_in[::1] y_true, # IN + const floating_in[:, ::1] raw_prediction, # IN + const floating_in[::1] sample_weight, # IN + ): + """For testing only.""" + cdef: + int i, k + int n_samples = y_true.shape[0] + int n_classes = raw_prediction.shape[1] + floating_in [:, ::1] gradient_out + gradient = np.empty((n_samples, n_classes), dtype=np.float64) + gradient_out = gradient + + for i in range(n_samples): + self.cy_gradient( + y_true=y_true[i], + raw_prediction=raw_prediction[i, :], + sample_weight=1.0 if sample_weight is None else sample_weight[i], + gradient_out=gradient_out[i, :], + ) + return gradient + + # Note that we do not assume memory alignment/contiguity of 2d arrays. + # There seems to be little benefit in doing so. Benchmarks proofing the + # opposite are welcome. + def loss( + self, + const floating_in[::1] y_true, # IN + const floating_in[:, :] raw_prediction, # IN + const floating_in[::1] sample_weight, # IN + floating_out[::1] loss_out, # OUT + int n_threads=1 + ): + cdef: + int i, k + int n_samples = y_true.shape[0] + int n_classes = raw_prediction.shape[1] + floating_in max_value, sum_exps + floating_in* p # temporary buffer + double_pair max_value_and_sum_exps + + # We assume n_samples > n_classes. In this case having the inner loop + # over n_classes is a good default. + # TODO: If every memoryview is contiguous and raw_prediction is + # f-contiguous, can we write a better algo (loops) to improve + # performance? + if sample_weight is None: + # inner loop over n_classes + with nogil, parallel(num_threads=n_threads): + # Define private buffer variables as each thread might use its + # own. 
+ p = malloc(sizeof(floating_in) * (n_classes)) + + for i in prange(n_samples, schedule='static'): + max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p) + max_value = max_value_and_sum_exps.val1 + sum_exps = max_value_and_sum_exps.val2 + loss_out[i] = log(sum_exps) + max_value + + # label encoded y_true + k = int(y_true[i]) + loss_out[i] -= raw_prediction[i, k] + + free(p) + else: + with nogil, parallel(num_threads=n_threads): + p = malloc(sizeof(floating_in) * (n_classes)) + + for i in prange(n_samples, schedule='static'): + max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p) + max_value = max_value_and_sum_exps.val1 + sum_exps = max_value_and_sum_exps.val2 + loss_out[i] = log(sum_exps) + max_value + + # label encoded y_true + k = int(y_true[i]) + loss_out[i] -= raw_prediction[i, k] + + loss_out[i] *= sample_weight[i] + + free(p) + + def loss_gradient( + self, + const floating_in[::1] y_true, # IN + const floating_in[:, :] raw_prediction, # IN + const floating_in[::1] sample_weight, # IN + floating_out[::1] loss_out, # OUT + floating_out[:, :] gradient_out, # OUT + int n_threads=1 + ): + cdef: + int i, k + int n_samples = y_true.shape[0] + int n_classes = raw_prediction.shape[1] + floating_in max_value, sum_exps + floating_in* p # temporary buffer + double_pair max_value_and_sum_exps + + if sample_weight is None: + # inner loop over n_classes + with nogil, parallel(num_threads=n_threads): + # Define private buffer variables as each thread might use its + # own. + p = malloc(sizeof(floating_in) * (n_classes)) + + for i in prange(n_samples, schedule='static'): + max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p) + max_value = max_value_and_sum_exps.val1 + sum_exps = max_value_and_sum_exps.val2 + loss_out[i] = log(sum_exps) + max_value + + for k in range(n_classes): + # label decode y_true + if y_true[i] == k: + loss_out[i] -= raw_prediction[i, k] + p[k] /= sum_exps # p_k = y_pred_k = prob of class k + # gradient_k = p_k - (y_true == k) + gradient_out[i, k] = p[k] - (y_true[i] == k) + + free(p) + else: + with nogil, parallel(num_threads=n_threads): + p = malloc(sizeof(floating_in) * (n_classes)) + + for i in prange(n_samples, schedule='static'): + max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p) + max_value = max_value_and_sum_exps.val1 + sum_exps = max_value_and_sum_exps.val2 + loss_out[i] = log(sum_exps) + max_value + + for k in range(n_classes): + # label decode y_true + if y_true[i] == k: + loss_out[i] -= raw_prediction[i, k] + p[k] /= sum_exps # p_k = y_pred_k = prob of class k + # gradient_k = (p_k - (y_true == k)) * sw + gradient_out[i, k] = (p[k] - (y_true[i] == k)) * sample_weight[i] + + loss_out[i] *= sample_weight[i] + + free(p) + + def gradient( + self, + const floating_in[::1] y_true, # IN + const floating_in[:, :] raw_prediction, # IN + const floating_in[::1] sample_weight, # IN + floating_out[:, :] gradient_out, # OUT + int n_threads=1 + ): + cdef: + int i, k + int n_samples = y_true.shape[0] + int n_classes = raw_prediction.shape[1] + floating_in sum_exps + floating_in* p # temporary buffer + double_pair max_value_and_sum_exps + + if sample_weight is None: + # inner loop over n_classes + with nogil, parallel(num_threads=n_threads): + # Define private buffer variables as each thread might use its + # own. 
+ p = malloc(sizeof(floating_in) * (n_classes)) + + for i in prange(n_samples, schedule='static'): + max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p) + sum_exps = max_value_and_sum_exps.val2 + + for k in range(n_classes): + p[k] /= sum_exps # p_k = y_pred_k = prob of class k + # gradient_k = y_pred_k - (y_true == k) + gradient_out[i, k] = p[k] - (y_true[i] == k) + + free(p) + else: + with nogil, parallel(num_threads=n_threads): + p = malloc(sizeof(floating_in) * (n_classes)) + + for i in prange(n_samples, schedule='static'): + max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p) + sum_exps = max_value_and_sum_exps.val2 + + for k in range(n_classes): + p[k] /= sum_exps # p_k = y_pred_k = prob of class k + # gradient_k = (p_k - (y_true == k)) * sw + gradient_out[i, k] = (p[k] - (y_true[i] == k)) * sample_weight[i] + + free(p) + + def gradient_hessian( + self, + const floating_in[::1] y_true, # IN + const floating_in[:, :] raw_prediction, # IN + const floating_in[::1] sample_weight, # IN + floating_out[:, :] gradient_out, # OUT + floating_out[:, :] hessian_out, # OUT + int n_threads=1 + ): + cdef: + int i, k + int n_samples = y_true.shape[0] + int n_classes = raw_prediction.shape[1] + floating_in sum_exps + floating_in* p # temporary buffer + double_pair max_value_and_sum_exps + + if sample_weight is None: + # inner loop over n_classes + with nogil, parallel(num_threads=n_threads): + # Define private buffer variables as each thread might use its + # own. + p = malloc(sizeof(floating_in) * (n_classes)) + + for i in prange(n_samples, schedule='static'): + max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p) + sum_exps = max_value_and_sum_exps.val2 + + for k in range(n_classes): + p[k] /= sum_exps # p_k = y_pred_k = prob of class k + # hessian_k = p_k * (1 - p_k) + # gradient_k = p_k - (y_true == k) + gradient_out[i, k] = p[k] - (y_true[i] == k) + hessian_out[i, k] = p[k] * (1. - p[k]) + + free(p) + else: + with nogil, parallel(num_threads=n_threads): + p = malloc(sizeof(floating_in) * (n_classes)) + + for i in prange(n_samples, schedule='static'): + max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p) + sum_exps = max_value_and_sum_exps.val2 + + for k in range(n_classes): + p[k] /= sum_exps # p_k = y_pred_k = prob of class k + # gradient_k = (p_k - (y_true == k)) * sw + # hessian_k = p_k * (1 - p_k) * sw + gradient_out[i, k] = (p[k] - (y_true[i] == k)) * sample_weight[i] + hessian_out[i, k] = (p[k] * (1. - p[k])) * sample_weight[i] + + free(p) + + # This method simplifies the implementation of hessp in linear models, + # i.e. the matrix-vector product of the full hessian, not only of the + # diagonal (in the classes) approximation as implemented above. + def gradient_proba( + self, + const floating_in[::1] y_true, # IN + const floating_in[:, :] raw_prediction, # IN + const floating_in[::1] sample_weight, # IN + floating_out[:, :] gradient_out, # OUT + floating_out[:, :] proba_out, # OUT + int n_threads=1 + ): + cdef: + int i, k + int n_samples = y_true.shape[0] + int n_classes = raw_prediction.shape[1] + floating_in sum_exps + floating_in* p # temporary buffer + double_pair max_value_and_sum_exps + + if sample_weight is None: + # inner loop over n_classes + with nogil, parallel(num_threads=n_threads): + # Define private buffer variables as each thread might use its + # own. 
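The per-class hessian p_k * (1 - p_k) stored above is the diagonal of the full per-sample multinomial hessian diag(p) - p p^T mentioned in the comment on gradient_proba; a small NumPy check, for illustration only:

import numpy as np
from scipy.special import softmax

p = softmax(np.array([[0.1, -0.3, 0.2]]), axis=1)[0]
full_hessian = np.diag(p) - np.outer(p, p)  # full per-sample hessian over classes
diag_approx = p * (1.0 - p)                 # what gradient_hessian stores per class
assert np.allclose(np.diag(full_hessian), diag_approx)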
+ p = malloc(sizeof(floating_in) * (n_classes)) + + for i in prange(n_samples, schedule='static'): + max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p) + sum_exps = max_value_and_sum_exps.val2 + + for k in range(n_classes): + proba_out[i, k] = p[k] / sum_exps # y_pred_k = prob of class k + # gradient_k = y_pred_k - (y_true == k) + gradient_out[i, k] = proba_out[i, k] - (y_true[i] == k) + + free(p) + else: + with nogil, parallel(num_threads=n_threads): + p = malloc(sizeof(floating_in) * (n_classes)) + + for i in prange(n_samples, schedule='static'): + max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p) + sum_exps = max_value_and_sum_exps.val2 + + for k in range(n_classes): + proba_out[i, k] = p[k] / sum_exps # y_pred_k = prob of class k + # gradient_k = (p_k - (y_true == k)) * sw + gradient_out[i, k] = (proba_out[i, k] - (y_true[i] == k)) * sample_weight[i] + + free(p) diff --git a/.venv/lib/python3.12/site-packages/sklearn/_loss/link.py b/.venv/lib/python3.12/site-packages/sklearn/_loss/link.py new file mode 100644 index 0000000000000000000000000000000000000000..53dff6c2e928541ce58bb71c484e59622143104d --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/_loss/link.py @@ -0,0 +1,282 @@ +""" +Module contains classes for invertible (and differentiable) link functions. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from abc import ABC, abstractmethod +from dataclasses import dataclass + +import numpy as np +from scipy.special import expit, logit +from scipy.stats import gmean + +from ..utils.extmath import softmax + + +@dataclass +class Interval: + low: float + high: float + low_inclusive: bool + high_inclusive: bool + + def __post_init__(self): + """Check that low <= high""" + if self.low > self.high: + raise ValueError( + f"One must have low <= high; got low={self.low}, high={self.high}." + ) + + def includes(self, x): + """Test whether all values of x are in interval range. + + Parameters + ---------- + x : ndarray + Array whose elements are tested to be in interval range. + + Returns + ------- + result : bool + """ + if self.low_inclusive: + low = np.greater_equal(x, self.low) + else: + low = np.greater(x, self.low) + + if not np.all(low): + return False + + if self.high_inclusive: + high = np.less_equal(x, self.high) + else: + high = np.less(x, self.high) + + # Note: np.all returns numpy.bool_ + return bool(np.all(high)) + + +def _inclusive_low_high(interval, dtype=np.float64): + """Generate values low and high to be within the interval range. + + This is used in tests only. + + Returns + ------- + low, high : tuple + The returned values low and high lie within the interval. + """ + eps = 10 * np.finfo(dtype).eps + if interval.low == -np.inf: + low = -1e10 + elif interval.low < 0: + low = interval.low * (1 - eps) + eps + else: + low = interval.low * (1 + eps) + eps + + if interval.high == np.inf: + high = 1e10 + elif interval.high < 0: + high = interval.high * (1 + eps) - eps + else: + high = interval.high * (1 - eps) - eps + + return low, high + + +class BaseLink(ABC): + """Abstract base class for differentiable, invertible link functions. + + Convention: + - link function g: raw_prediction = g(y_pred) + - inverse link h: y_pred = h(raw_prediction) + + For (generalized) linear models, `raw_prediction = X @ coef` is the so + called linear predictor, and `y_pred = h(raw_prediction)` is the predicted + conditional (on X) expected value of the target `y_true`. 
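A quick illustration of this convention using one of the concrete link classes defined below (a private scikit-learn module, not public API; the example values are arbitrary):

import numpy as np
from sklearn._loss.link import LogitLink

link = LogitLink()                       # g = logit, h = expit
y_pred = np.array([0.1, 0.5, 0.9])
raw_prediction = link.link(y_pred)       # raw_prediction = g(y_pred)
assert np.allclose(link.inverse(raw_prediction), y_pred)
assert link.interval_y_pred.includes(y_pred)  # y_pred must lie in (0, 1)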
+ + The methods are not implemented as staticmethods in case a link function needs + parameters. + """ + + is_multiclass = False # used for testing only + + # Usually, raw_prediction may be any real number and y_pred is an open + # interval. + # interval_raw_prediction = Interval(-np.inf, np.inf, False, False) + interval_y_pred = Interval(-np.inf, np.inf, False, False) + + @abstractmethod + def link(self, y_pred, out=None): + """Compute the link function g(y_pred). + + The link function maps (predicted) target values to raw predictions, + i.e. `g(y_pred) = raw_prediction`. + + Parameters + ---------- + y_pred : array + Predicted target values. + out : array + A location into which the result is stored. If provided, it must + have a shape that the inputs broadcast to. If not provided or None, + a freshly-allocated array is returned. + + Returns + ------- + out : array + Output array, element-wise link function. + """ + + @abstractmethod + def inverse(self, raw_prediction, out=None): + """Compute the inverse link function h(raw_prediction). + + The inverse link function maps raw predictions to predicted target + values, i.e. `h(raw_prediction) = y_pred`. + + Parameters + ---------- + raw_prediction : array + Raw prediction values (in link space). + out : array + A location into which the result is stored. If provided, it must + have a shape that the inputs broadcast to. If not provided or None, + a freshly-allocated array is returned. + + Returns + ------- + out : array + Output array, element-wise inverse link function. + """ + + +class IdentityLink(BaseLink): + """The identity link function g(x)=x.""" + + def link(self, y_pred, out=None): + if out is not None: + np.copyto(out, y_pred) + return out + else: + return y_pred + + inverse = link + + +class LogLink(BaseLink): + """The log link function g(x)=log(x).""" + + interval_y_pred = Interval(0, np.inf, False, False) + + def link(self, y_pred, out=None): + return np.log(y_pred, out=out) + + def inverse(self, raw_prediction, out=None): + return np.exp(raw_prediction, out=out) + + +class LogitLink(BaseLink): + """The logit link function g(x)=logit(x).""" + + interval_y_pred = Interval(0, 1, False, False) + + def link(self, y_pred, out=None): + return logit(y_pred, out=out) + + def inverse(self, raw_prediction, out=None): + return expit(raw_prediction, out=out) + + +class HalfLogitLink(BaseLink): + """Half the logit link function g(x)=1/2 * logit(x). + + Used for the exponential loss. + """ + + interval_y_pred = Interval(0, 1, False, False) + + def link(self, y_pred, out=None): + out = logit(y_pred, out=out) + out *= 0.5 + return out + + def inverse(self, raw_prediction, out=None): + return expit(2 * raw_prediction, out) + + +class MultinomialLogit(BaseLink): + """The symmetric multinomial logit function. + + Convention: + - y_pred.shape = raw_prediction.shape = (n_samples, n_classes) + + Notes: + - The inverse link h is the softmax function. + - The sum is over the second axis, i.e. axis=1 (n_classes). + + We have to choose additional constraints in order to make + + y_pred[k] = exp(raw_pred[k]) / sum(exp(raw_pred[k]), k=0..n_classes-1) + + for n_classes classes identifiable and invertible. + We choose the symmetric side constraint where the geometric mean response + is set as reference category, see [2]: + + The symmetric multinomial logit link function for a single data point is + then defined as + + raw_prediction[k] = g(y_pred[k]) = log(y_pred[k]/gmean(y_pred)) + = log(y_pred[k]) - mean(log(y_pred)). 
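The same link written out in NumPy/SciPy terms, as a sketch checking that it is inverted by the softmax (array values are arbitrary):

import numpy as np
from scipy.special import softmax
from scipy.stats import gmean

y_pred = np.array([[0.2, 0.3, 0.5]])
raw = np.log(y_pred / gmean(y_pred, axis=1)[:, np.newaxis])  # symmetric multinomial logit
assert np.allclose(raw.sum(axis=1), 0.0)          # raw predictions are mean centered
assert np.allclose(softmax(raw, axis=1), y_pred)  # softmax inverts the link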
+ + Note that this is equivalent to the definition in [1] and implies mean + centered raw predictions: + + sum(raw_prediction[k], k=0..n_classes-1) = 0. + + For linear models with raw_prediction = X @ coef, this corresponds to + sum(coef[k], k=0..n_classes-1) = 0, i.e. the sum over classes for every + feature is zero. + + Reference + --------- + .. [1] Friedman, Jerome; Hastie, Trevor; Tibshirani, Robert. "Additive + logistic regression: a statistical view of boosting" Ann. Statist. + 28 (2000), no. 2, 337--407. doi:10.1214/aos/1016218223. + https://projecteuclid.org/euclid.aos/1016218223 + + .. [2] Zahid, Faisal Maqbool and Gerhard Tutz. "Ridge estimation for + multinomial logit models with symmetric side constraints." + Computational Statistics 28 (2013): 1017-1034. + http://epub.ub.uni-muenchen.de/11001/1/tr067.pdf + """ + + is_multiclass = True + interval_y_pred = Interval(0, 1, False, False) + + def symmetrize_raw_prediction(self, raw_prediction): + return raw_prediction - np.mean(raw_prediction, axis=1)[:, np.newaxis] + + def link(self, y_pred, out=None): + # geometric mean as reference category + gm = gmean(y_pred, axis=1) + return np.log(y_pred / gm[:, np.newaxis], out=out) + + def inverse(self, raw_prediction, out=None): + if out is None: + return softmax(raw_prediction, copy=True) + else: + np.copyto(out, raw_prediction) + softmax(out, copy=False) + return out + + +_LINKS = { + "identity": IdentityLink, + "log": LogLink, + "logit": LogitLink, + "half_logit": HalfLogitLink, + "multinomial_logit": MultinomialLogit, +} diff --git a/.venv/lib/python3.12/site-packages/sklearn/_loss/loss.py b/.venv/lib/python3.12/site-packages/sklearn/_loss/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..b45ff3322699aa26533d504be6407f9d5acbb5b8 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/_loss/loss.py @@ -0,0 +1,1181 @@ +""" +This module contains loss classes suitable for fitting. + +It is not part of the public API. +Specific losses are used for regression, binary classification or multiclass +classification. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +# Goals: +# - Provide a common private module for loss functions/classes. +# - To be used in: +# - LogisticRegression +# - PoissonRegressor, GammaRegressor, TweedieRegressor +# - HistGradientBoostingRegressor, HistGradientBoostingClassifier +# - GradientBoostingRegressor, GradientBoostingClassifier +# - SGDRegressor, SGDClassifier +# - Replace link module of GLMs. + +import numbers + +import numpy as np +from scipy.special import xlogy + +from ..utils import check_scalar +from ..utils.stats import _weighted_percentile +from ._loss import ( + CyAbsoluteError, + CyExponentialLoss, + CyHalfBinomialLoss, + CyHalfGammaLoss, + CyHalfMultinomialLoss, + CyHalfPoissonLoss, + CyHalfSquaredError, + CyHalfTweedieLoss, + CyHalfTweedieLossIdentity, + CyHuberLoss, + CyPinballLoss, +) +from .link import ( + HalfLogitLink, + IdentityLink, + Interval, + LogitLink, + LogLink, + MultinomialLogit, +) + + +# Note: The shape of raw_prediction for multiclass classifications are +# - GradientBoostingClassifier: (n_samples, n_classes) +# - HistGradientBoostingClassifier: (n_classes, n_samples) +# +# Note: Instead of inheritance like +# +# class BaseLoss(BaseLink, CyLossFunction): +# ... 
+# +# # Note: Naturally, we would inherit in the following order +# # class HalfSquaredError(IdentityLink, CyHalfSquaredError, BaseLoss) +# # But because of https://github.com/cython/cython/issues/4350 we set BaseLoss as +# # the last one. This, of course, changes the MRO. +# class HalfSquaredError(IdentityLink, CyHalfSquaredError, BaseLoss): +# +# we use composition. This way we improve maintainability by avoiding the above +# mentioned Cython edge case and have easier to understand code (which method calls +# which code). +class BaseLoss: + """Base class for a loss function of 1-dimensional targets. + + Conventions: + + - y_true.shape = sample_weight.shape = (n_samples,) + - y_pred.shape = raw_prediction.shape = (n_samples,) + - If is_multiclass is true (multiclass classification), then + y_pred.shape = raw_prediction.shape = (n_samples, n_classes) + Note that this corresponds to the return value of decision_function. + + y_true, y_pred, sample_weight and raw_prediction must either be all float64 + or all float32. + gradient and hessian must be either both float64 or both float32. + + Note that y_pred = link.inverse(raw_prediction). + + Specific loss classes can inherit specific link classes to satisfy + BaseLink's abstractmethods. + + Parameters + ---------- + sample_weight : {None, ndarray} + If sample_weight is None, the hessian might be constant. + n_classes : {None, int} + The number of classes for classification, else None. + + Attributes + ---------- + closs: CyLossFunction + link : BaseLink + interval_y_true : Interval + Valid interval for y_true + interval_y_pred : Interval + Valid Interval for y_pred + differentiable : bool + Indicates whether or not loss function is differentiable in + raw_prediction everywhere. + need_update_leaves_values : bool + Indicates whether decision trees in gradient boosting need to uptade + leave values after having been fit to the (negative) gradients. + approx_hessian : bool + Indicates whether the hessian is approximated or exact. If, + approximated, it should be larger or equal to the exact one. + constant_hessian : bool + Indicates whether the hessian is one for this loss. + is_multiclass : bool + Indicates whether n_classes > 2 is allowed. + """ + + # For gradient boosted decision trees: + # This variable indicates whether the loss requires the leaves values to + # be updated once the tree has been trained. The trees are trained to + # predict a Newton-Raphson step (see grower._finalize_leaf()). But for + # some losses (e.g. least absolute deviation) we need to adjust the tree + # values to account for the "line search" of the gradient descent + # procedure. See the original paper Greedy Function Approximation: A + # Gradient Boosting Machine by Friedman + # (https://statweb.stanford.edu/~jhf/ftp/trebst.pdf) for the theory. + differentiable = True + need_update_leaves_values = False + is_multiclass = False + + def __init__(self, closs, link, n_classes=None): + self.closs = closs + self.link = link + self.approx_hessian = False + self.constant_hessian = False + self.n_classes = n_classes + self.interval_y_true = Interval(-np.inf, np.inf, False, False) + self.interval_y_pred = self.link.interval_y_pred + + def in_y_true_range(self, y): + """Return True if y is in the valid range of y_true. + + Parameters + ---------- + y : ndarray + """ + return self.interval_y_true.includes(y) + + def in_y_pred_range(self, y): + """Return True if y is in the valid range of y_pred. 
+ + Parameters + ---------- + y : ndarray + """ + return self.interval_y_pred.includes(y) + + def loss( + self, + y_true, + raw_prediction, + sample_weight=None, + loss_out=None, + n_threads=1, + ): + """Compute the pointwise loss value for each input. + + Parameters + ---------- + y_true : C-contiguous array of shape (n_samples,) + Observed, true target values. + raw_prediction : C-contiguous array of shape (n_samples,) or array of \ + shape (n_samples, n_classes) + Raw prediction values (in link space). + sample_weight : None or C-contiguous array of shape (n_samples,) + Sample weights. + loss_out : None or C-contiguous array of shape (n_samples,) + A location into which the result is stored. If None, a new array + might be created. + n_threads : int, default=1 + Might use openmp thread parallelism. + + Returns + ------- + loss : array of shape (n_samples,) + Element-wise loss function. + """ + if loss_out is None: + loss_out = np.empty_like(y_true) + # Be graceful to shape (n_samples, 1) -> (n_samples,) + if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1: + raw_prediction = raw_prediction.squeeze(1) + + self.closs.loss( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + loss_out=loss_out, + n_threads=n_threads, + ) + return loss_out + + def loss_gradient( + self, + y_true, + raw_prediction, + sample_weight=None, + loss_out=None, + gradient_out=None, + n_threads=1, + ): + """Compute loss and gradient w.r.t. raw_prediction for each input. + + Parameters + ---------- + y_true : C-contiguous array of shape (n_samples,) + Observed, true target values. + raw_prediction : C-contiguous array of shape (n_samples,) or array of \ + shape (n_samples, n_classes) + Raw prediction values (in link space). + sample_weight : None or C-contiguous array of shape (n_samples,) + Sample weights. + loss_out : None or C-contiguous array of shape (n_samples,) + A location into which the loss is stored. If None, a new array + might be created. + gradient_out : None or C-contiguous array of shape (n_samples,) or array \ + of shape (n_samples, n_classes) + A location into which the gradient is stored. If None, a new array + might be created. + n_threads : int, default=1 + Might use openmp thread parallelism. + + Returns + ------- + loss : array of shape (n_samples,) + Element-wise loss function. + + gradient : array of shape (n_samples,) or (n_samples, n_classes) + Element-wise gradients. + """ + if loss_out is None: + if gradient_out is None: + loss_out = np.empty_like(y_true) + gradient_out = np.empty_like(raw_prediction) + else: + loss_out = np.empty_like(y_true, dtype=gradient_out.dtype) + elif gradient_out is None: + gradient_out = np.empty_like(raw_prediction, dtype=loss_out.dtype) + + # Be graceful to shape (n_samples, 1) -> (n_samples,) + if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1: + raw_prediction = raw_prediction.squeeze(1) + if gradient_out.ndim == 2 and gradient_out.shape[1] == 1: + gradient_out = gradient_out.squeeze(1) + + self.closs.loss_gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + loss_out=loss_out, + gradient_out=gradient_out, + n_threads=n_threads, + ) + return loss_out, gradient_out + + def gradient( + self, + y_true, + raw_prediction, + sample_weight=None, + gradient_out=None, + n_threads=1, + ): + """Compute gradient of loss w.r.t raw_prediction for each input. + + Parameters + ---------- + y_true : C-contiguous array of shape (n_samples,) + Observed, true target values. 
+ raw_prediction : C-contiguous array of shape (n_samples,) or array of \ + shape (n_samples, n_classes) + Raw prediction values (in link space). + sample_weight : None or C-contiguous array of shape (n_samples,) + Sample weights. + gradient_out : None or C-contiguous array of shape (n_samples,) or array \ + of shape (n_samples, n_classes) + A location into which the result is stored. If None, a new array + might be created. + n_threads : int, default=1 + Might use openmp thread parallelism. + + Returns + ------- + gradient : array of shape (n_samples,) or (n_samples, n_classes) + Element-wise gradients. + """ + if gradient_out is None: + gradient_out = np.empty_like(raw_prediction) + + # Be graceful to shape (n_samples, 1) -> (n_samples,) + if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1: + raw_prediction = raw_prediction.squeeze(1) + if gradient_out.ndim == 2 and gradient_out.shape[1] == 1: + gradient_out = gradient_out.squeeze(1) + + self.closs.gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient_out=gradient_out, + n_threads=n_threads, + ) + return gradient_out + + def gradient_hessian( + self, + y_true, + raw_prediction, + sample_weight=None, + gradient_out=None, + hessian_out=None, + n_threads=1, + ): + """Compute gradient and hessian of loss w.r.t raw_prediction. + + Parameters + ---------- + y_true : C-contiguous array of shape (n_samples,) + Observed, true target values. + raw_prediction : C-contiguous array of shape (n_samples,) or array of \ + shape (n_samples, n_classes) + Raw prediction values (in link space). + sample_weight : None or C-contiguous array of shape (n_samples,) + Sample weights. + gradient_out : None or C-contiguous array of shape (n_samples,) or array \ + of shape (n_samples, n_classes) + A location into which the gradient is stored. If None, a new array + might be created. + hessian_out : None or C-contiguous array of shape (n_samples,) or array \ + of shape (n_samples, n_classes) + A location into which the hessian is stored. If None, a new array + might be created. + n_threads : int, default=1 + Might use openmp thread parallelism. + + Returns + ------- + gradient : arrays of shape (n_samples,) or (n_samples, n_classes) + Element-wise gradients. + + hessian : arrays of shape (n_samples,) or (n_samples, n_classes) + Element-wise hessians. + """ + if gradient_out is None: + if hessian_out is None: + gradient_out = np.empty_like(raw_prediction) + hessian_out = np.empty_like(raw_prediction) + else: + gradient_out = np.empty_like(hessian_out) + elif hessian_out is None: + hessian_out = np.empty_like(gradient_out) + + # Be graceful to shape (n_samples, 1) -> (n_samples,) + if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1: + raw_prediction = raw_prediction.squeeze(1) + if gradient_out.ndim == 2 and gradient_out.shape[1] == 1: + gradient_out = gradient_out.squeeze(1) + if hessian_out.ndim == 2 and hessian_out.shape[1] == 1: + hessian_out = hessian_out.squeeze(1) + + self.closs.gradient_hessian( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient_out=gradient_out, + hessian_out=hessian_out, + n_threads=n_threads, + ) + return gradient_out, hessian_out + + def __call__(self, y_true, raw_prediction, sample_weight=None, n_threads=1): + """Compute the weighted average loss. + + Parameters + ---------- + y_true : C-contiguous array of shape (n_samples,) + Observed, true target values. 
+ raw_prediction : C-contiguous array of shape (n_samples,) or array of \ + shape (n_samples, n_classes) + Raw prediction values (in link space). + sample_weight : None or C-contiguous array of shape (n_samples,) + Sample weights. + n_threads : int, default=1 + Might use openmp thread parallelism. + + Returns + ------- + loss : float + Mean or averaged loss function. + """ + return np.average( + self.loss( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=None, + loss_out=None, + n_threads=n_threads, + ), + weights=sample_weight, + ) + + def fit_intercept_only(self, y_true, sample_weight=None): + """Compute raw_prediction of an intercept-only model. + + This can be used as initial estimates of predictions, i.e. before the + first iteration in fit. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + Observed, true target values. + sample_weight : None or array of shape (n_samples,) + Sample weights. + + Returns + ------- + raw_prediction : numpy scalar or array of shape (n_classes,) + Raw predictions of an intercept-only model. + """ + # As default, take weighted average of the target over the samples + # axis=0 and then transform into link-scale (raw_prediction). + y_pred = np.average(y_true, weights=sample_weight, axis=0) + eps = 10 * np.finfo(y_pred.dtype).eps + + if self.interval_y_pred.low == -np.inf: + a_min = None + elif self.interval_y_pred.low_inclusive: + a_min = self.interval_y_pred.low + else: + a_min = self.interval_y_pred.low + eps + + if self.interval_y_pred.high == np.inf: + a_max = None + elif self.interval_y_pred.high_inclusive: + a_max = self.interval_y_pred.high + else: + a_max = self.interval_y_pred.high - eps + + if a_min is None and a_max is None: + return self.link.link(y_pred) + else: + return self.link.link(np.clip(y_pred, a_min, a_max)) + + def constant_to_optimal_zero(self, y_true, sample_weight=None): + """Calculate term dropped in loss. + + With this term added, the loss of perfect predictions is zero. + """ + return np.zeros_like(y_true) + + def init_gradient_and_hessian(self, n_samples, dtype=np.float64, order="F"): + """Initialize arrays for gradients and hessians. + + Unless hessians are constant, arrays are initialized with undefined values. + + Parameters + ---------- + n_samples : int + The number of samples, usually passed to `fit()`. + dtype : {np.float64, np.float32}, default=np.float64 + The dtype of the arrays gradient and hessian. + order : {'C', 'F'}, default='F' + Order of the arrays gradient and hessian. The default 'F' makes the arrays + contiguous along samples. + + Returns + ------- + gradient : C-contiguous array of shape (n_samples,) or array of shape \ + (n_samples, n_classes) + Empty array (allocated but not initialized) to be used as argument + gradient_out. + hessian : C-contiguous array of shape (n_samples,), array of shape + (n_samples, n_classes) or shape (1,) + Empty (allocated but not initialized) array to be used as argument + hessian_out. + If constant_hessian is True (e.g. `HalfSquaredError`), the array is + initialized to ``1``. + """ + if dtype not in (np.float32, np.float64): + raise ValueError( + "Valid options for 'dtype' are np.float32 and np.float64. " + f"Got dtype={dtype} instead." + ) + + if self.is_multiclass: + shape = (n_samples, self.n_classes) + else: + shape = (n_samples,) + gradient = np.empty(shape=shape, dtype=dtype, order=order) + + if self.constant_hessian: + # If the hessians are constant, we consider them equal to 1. 
+ # - This is correct for HalfSquaredError + # - For AbsoluteError, hessians are actually 0, but they are + # always ignored anyway. + hessian = np.ones(shape=(1,), dtype=dtype) + else: + hessian = np.empty(shape=shape, dtype=dtype, order=order) + + return gradient, hessian + + +# Note: Naturally, we would inherit in the following order +# class HalfSquaredError(IdentityLink, CyHalfSquaredError, BaseLoss) +# But because of https://github.com/cython/cython/issues/4350 we +# set BaseLoss as the last one. This, of course, changes the MRO. +class HalfSquaredError(BaseLoss): + """Half squared error with identity link, for regression. + + Domain: + y_true and y_pred all real numbers + + Link: + y_pred = raw_prediction + + For a given sample x_i, half squared error is defined as:: + + loss(x_i) = 0.5 * (y_true_i - raw_prediction_i)**2 + + The factor of 0.5 simplifies the computation of gradients and results in a + unit hessian (and is consistent with what is done in LightGBM). It is also + half the Normal distribution deviance. + """ + + def __init__(self, sample_weight=None): + super().__init__(closs=CyHalfSquaredError(), link=IdentityLink()) + self.constant_hessian = sample_weight is None + + +class AbsoluteError(BaseLoss): + """Absolute error with identity link, for regression. + + Domain: + y_true and y_pred all real numbers + + Link: + y_pred = raw_prediction + + For a given sample x_i, the absolute error is defined as:: + + loss(x_i) = |y_true_i - raw_prediction_i| + + Note that the exact hessian = 0 almost everywhere (except at one point, therefore + differentiable = False). Optimization routines like in HGBT, however, need a + hessian > 0. Therefore, we assign 1. + """ + + differentiable = False + need_update_leaves_values = True + + def __init__(self, sample_weight=None): + super().__init__(closs=CyAbsoluteError(), link=IdentityLink()) + self.approx_hessian = True + self.constant_hessian = sample_weight is None + + def fit_intercept_only(self, y_true, sample_weight=None): + """Compute raw_prediction of an intercept-only model. + + This is the weighted median of the target, i.e. over the samples + axis=0. + """ + if sample_weight is None: + return np.median(y_true, axis=0) + else: + return _weighted_percentile(y_true, sample_weight, 50) + + +class PinballLoss(BaseLoss): + """Quantile loss aka pinball loss, for regression. + + Domain: + y_true and y_pred all real numbers + quantile in (0, 1) + + Link: + y_pred = raw_prediction + + For a given sample x_i, the pinball loss is defined as:: + + loss(x_i) = rho_{quantile}(y_true_i - raw_prediction_i) + + rho_{quantile}(u) = u * (quantile - 1_{u<0}) + = -u *(1 - quantile) if u < 0 + u * quantile if u >= 0 + + Note: 2 * PinballLoss(quantile=0.5) equals AbsoluteError(). + + Note that the exact hessian = 0 almost everywhere (except at one point, therefore + differentiable = False). Optimization routines like in HGBT, however, need a + hessian > 0. Therefore, we assign 1. + + Additional Attributes + --------------------- + quantile : float + The quantile level of the quantile to be estimated. Must be in range (0, 1). 
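As an editor-added aside, the two ways of writing rho_{quantile} in the PinballLoss docstring above can be checked to agree; `u` below stands for the residual y_true - raw_prediction, and the snippet is illustrative only.

import numpy as np

quantile = 0.25
u = np.array([-2.0, -0.5, 0.0, 0.5, 2.0])          # u = y_true - raw_prediction

rho_compact = u * (quantile - (u < 0))              # u * (quantile - 1_{u<0})
rho_branch = np.where(u < 0, -u * (1 - quantile), u * quantile)
np.testing.assert_allclose(rho_compact, rho_branch)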
+ """ + + differentiable = False + need_update_leaves_values = True + + def __init__(self, sample_weight=None, quantile=0.5): + check_scalar( + quantile, + "quantile", + target_type=numbers.Real, + min_val=0, + max_val=1, + include_boundaries="neither", + ) + super().__init__( + closs=CyPinballLoss(quantile=float(quantile)), + link=IdentityLink(), + ) + self.approx_hessian = True + self.constant_hessian = sample_weight is None + + def fit_intercept_only(self, y_true, sample_weight=None): + """Compute raw_prediction of an intercept-only model. + + This is the weighted median of the target, i.e. over the samples + axis=0. + """ + if sample_weight is None: + return np.percentile(y_true, 100 * self.closs.quantile, axis=0) + else: + return _weighted_percentile( + y_true, sample_weight, 100 * self.closs.quantile + ) + + +class HuberLoss(BaseLoss): + """Huber loss, for regression. + + Domain: + y_true and y_pred all real numbers + quantile in (0, 1) + + Link: + y_pred = raw_prediction + + For a given sample x_i, the Huber loss is defined as:: + + loss(x_i) = 1/2 * abserr**2 if abserr <= delta + delta * (abserr - delta/2) if abserr > delta + + abserr = |y_true_i - raw_prediction_i| + delta = quantile(abserr, self.quantile) + + Note: HuberLoss(quantile=1) equals HalfSquaredError and HuberLoss(quantile=0) + equals delta * (AbsoluteError() - delta/2). + + Additional Attributes + --------------------- + quantile : float + The quantile level which defines the breaking point `delta` to distinguish + between absolute error and squared error. Must be in range (0, 1). + + Reference + --------- + .. [1] Friedman, J.H. (2001). :doi:`Greedy function approximation: A gradient + boosting machine <10.1214/aos/1013203451>`. + Annals of Statistics, 29, 1189-1232. + """ + + differentiable = False + need_update_leaves_values = True + + def __init__(self, sample_weight=None, quantile=0.9, delta=0.5): + check_scalar( + quantile, + "quantile", + target_type=numbers.Real, + min_val=0, + max_val=1, + include_boundaries="neither", + ) + self.quantile = quantile # This is better stored outside of Cython. + super().__init__( + closs=CyHuberLoss(delta=float(delta)), + link=IdentityLink(), + ) + self.approx_hessian = True + self.constant_hessian = False + + def fit_intercept_only(self, y_true, sample_weight=None): + """Compute raw_prediction of an intercept-only model. + + This is the weighted median of the target, i.e. over the samples + axis=0. + """ + # See formula before algo 4 in Friedman (2001), but we apply it to y_true, + # not to the residual y_true - raw_prediction. An estimator like + # HistGradientBoostingRegressor might then call it on the residual, e.g. + # fit_intercept_only(y_true - raw_prediction). + if sample_weight is None: + median = np.percentile(y_true, 50, axis=0) + else: + median = _weighted_percentile(y_true, sample_weight, 50) + diff = y_true - median + term = np.sign(diff) * np.minimum(self.closs.delta, np.abs(diff)) + return median + np.average(term, weights=sample_weight) + + +class HalfPoissonLoss(BaseLoss): + """Half Poisson deviance loss with log-link, for regression. 
+ + Domain: + y_true in non-negative real numbers + y_pred in positive real numbers + + Link: + y_pred = exp(raw_prediction) + + For a given sample x_i, half the Poisson deviance is defined as:: + + loss(x_i) = y_true_i * log(y_true_i/exp(raw_prediction_i)) + - y_true_i + exp(raw_prediction_i) + + Half the Poisson deviance is actually the negative log-likelihood up to + constant terms (not involving raw_prediction) and simplifies the + computation of the gradients. + We also skip the constant term `y_true_i * log(y_true_i) - y_true_i`. + """ + + def __init__(self, sample_weight=None): + super().__init__(closs=CyHalfPoissonLoss(), link=LogLink()) + self.interval_y_true = Interval(0, np.inf, True, False) + + def constant_to_optimal_zero(self, y_true, sample_weight=None): + term = xlogy(y_true, y_true) - y_true + if sample_weight is not None: + term *= sample_weight + return term + + +class HalfGammaLoss(BaseLoss): + """Half Gamma deviance loss with log-link, for regression. + + Domain: + y_true and y_pred in positive real numbers + + Link: + y_pred = exp(raw_prediction) + + For a given sample x_i, half Gamma deviance loss is defined as:: + + loss(x_i) = log(exp(raw_prediction_i)/y_true_i) + + y_true/exp(raw_prediction_i) - 1 + + Half the Gamma deviance is actually proportional to the negative log- + likelihood up to constant terms (not involving raw_prediction) and + simplifies the computation of the gradients. + We also skip the constant term `-log(y_true_i) - 1`. + """ + + def __init__(self, sample_weight=None): + super().__init__(closs=CyHalfGammaLoss(), link=LogLink()) + self.interval_y_true = Interval(0, np.inf, False, False) + + def constant_to_optimal_zero(self, y_true, sample_weight=None): + term = -np.log(y_true) - 1 + if sample_weight is not None: + term *= sample_weight + return term + + +class HalfTweedieLoss(BaseLoss): + """Half Tweedie deviance loss with log-link, for regression. + + Domain: + y_true in real numbers for power <= 0 + y_true in non-negative real numbers for 0 < power < 2 + y_true in positive real numbers for 2 <= power + y_pred in positive real numbers + power in real numbers + + Link: + y_pred = exp(raw_prediction) + + For a given sample x_i, half Tweedie deviance loss with p=power is defined + as:: + + loss(x_i) = max(y_true_i, 0)**(2-p) / (1-p) / (2-p) + - y_true_i * exp(raw_prediction_i)**(1-p) / (1-p) + + exp(raw_prediction_i)**(2-p) / (2-p) + + Taking the limits for p=0, 1, 2 gives HalfSquaredError with a log link, + HalfPoissonLoss and HalfGammaLoss. + + We also skip constant terms, but those are different for p=0, 1, 2. + Therefore, the loss is not continuous in `power`. + + Note furthermore that although no Tweedie distribution exists for + 0 < power < 1, it still gives a strictly consistent scoring function for + the expectation. 
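Illustrative sketch, not from the sources: the half Poisson deviance in link space as described above, where adding back the dropped constant y*log(y) - y (what `constant_to_optimal_zero` returns) recovers half the full Poisson deviance.

import numpy as np
from scipy.special import xlogy

y_true = np.array([0.0, 1.0, 3.0])
raw_prediction = np.array([0.5, 0.0, 1.0])     # log link: y_pred = exp(raw_prediction)
y_pred = np.exp(raw_prediction)

# Per-sample loss with the constant y*log(y) - y dropped, as the docstring notes.
loss = y_pred - y_true * raw_prediction
half_deviance = xlogy(y_true, y_true / y_pred) - y_true + y_pred
np.testing.assert_allclose(loss + xlogy(y_true, y_true) - y_true, half_deviance)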
+ """ + + def __init__(self, sample_weight=None, power=1.5): + super().__init__( + closs=CyHalfTweedieLoss(power=float(power)), + link=LogLink(), + ) + if self.closs.power <= 0: + self.interval_y_true = Interval(-np.inf, np.inf, False, False) + elif self.closs.power < 2: + self.interval_y_true = Interval(0, np.inf, True, False) + else: + self.interval_y_true = Interval(0, np.inf, False, False) + + def constant_to_optimal_zero(self, y_true, sample_weight=None): + if self.closs.power == 0: + return HalfSquaredError().constant_to_optimal_zero( + y_true=y_true, sample_weight=sample_weight + ) + elif self.closs.power == 1: + return HalfPoissonLoss().constant_to_optimal_zero( + y_true=y_true, sample_weight=sample_weight + ) + elif self.closs.power == 2: + return HalfGammaLoss().constant_to_optimal_zero( + y_true=y_true, sample_weight=sample_weight + ) + else: + p = self.closs.power + term = np.power(np.maximum(y_true, 0), 2 - p) / (1 - p) / (2 - p) + if sample_weight is not None: + term *= sample_weight + return term + + +class HalfTweedieLossIdentity(BaseLoss): + """Half Tweedie deviance loss with identity link, for regression. + + Domain: + y_true in real numbers for power <= 0 + y_true in non-negative real numbers for 0 < power < 2 + y_true in positive real numbers for 2 <= power + y_pred in positive real numbers for power != 0 + y_pred in real numbers for power = 0 + power in real numbers + + Link: + y_pred = raw_prediction + + For a given sample x_i, half Tweedie deviance loss with p=power is defined + as:: + + loss(x_i) = max(y_true_i, 0)**(2-p) / (1-p) / (2-p) + - y_true_i * raw_prediction_i**(1-p) / (1-p) + + raw_prediction_i**(2-p) / (2-p) + + Note that the minimum value of this loss is 0. + + Note furthermore that although no Tweedie distribution exists for + 0 < power < 1, it still gives a strictly consistent scoring function for + the expectation. + """ + + def __init__(self, sample_weight=None, power=1.5): + super().__init__( + closs=CyHalfTweedieLossIdentity(power=float(power)), + link=IdentityLink(), + ) + if self.closs.power <= 0: + self.interval_y_true = Interval(-np.inf, np.inf, False, False) + elif self.closs.power < 2: + self.interval_y_true = Interval(0, np.inf, True, False) + else: + self.interval_y_true = Interval(0, np.inf, False, False) + + if self.closs.power == 0: + self.interval_y_pred = Interval(-np.inf, np.inf, False, False) + else: + self.interval_y_pred = Interval(0, np.inf, False, False) + + +class HalfBinomialLoss(BaseLoss): + """Half Binomial deviance loss with logit link, for binary classification. + + This is also know as binary cross entropy, log-loss and logistic loss. + + Domain: + y_true in [0, 1], i.e. regression on the unit interval + y_pred in (0, 1), i.e. boundaries excluded + + Link: + y_pred = expit(raw_prediction) + + For a given sample x_i, half Binomial deviance is defined as the negative + log-likelihood of the Binomial/Bernoulli distribution and can be expressed + as:: + + loss(x_i) = log(1 + exp(raw_pred_i)) - y_true_i * raw_pred_i + + See The Elements of Statistical Learning, by Hastie, Tibshirani, Friedman, + section 4.4.1 (about logistic regression). + + Note that the formulation works for classification, y = {0, 1}, as well as + logistic regression, y = [0, 1]. + If you add `constant_to_optimal_zero` to the loss, you get half the + Bernoulli/binomial deviance. 
+ + More details: Inserting the predicted probability y_pred = expit(raw_prediction) + in the loss gives the well known:: + + loss(x_i) = - y_true_i * log(y_pred_i) - (1 - y_true_i) * log(1 - y_pred_i) + """ + + def __init__(self, sample_weight=None): + super().__init__( + closs=CyHalfBinomialLoss(), + link=LogitLink(), + n_classes=2, + ) + self.interval_y_true = Interval(0, 1, True, True) + + def constant_to_optimal_zero(self, y_true, sample_weight=None): + # This is non-zero only if y_true is neither 0 nor 1. + term = xlogy(y_true, y_true) + xlogy(1 - y_true, 1 - y_true) + if sample_weight is not None: + term *= sample_weight + return term + + def predict_proba(self, raw_prediction): + """Predict probabilities. + + Parameters + ---------- + raw_prediction : array of shape (n_samples,) or (n_samples, 1) + Raw prediction values (in link space). + + Returns + ------- + proba : array of shape (n_samples, 2) + Element-wise class probabilities. + """ + # Be graceful to shape (n_samples, 1) -> (n_samples,) + if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1: + raw_prediction = raw_prediction.squeeze(1) + proba = np.empty((raw_prediction.shape[0], 2), dtype=raw_prediction.dtype) + proba[:, 1] = self.link.inverse(raw_prediction) + proba[:, 0] = 1 - proba[:, 1] + return proba + + +class HalfMultinomialLoss(BaseLoss): + """Categorical cross-entropy loss, for multiclass classification. + + Domain: + y_true in {0, 1, 2, 3, .., n_classes - 1} + y_pred has n_classes elements, each element in (0, 1) + + Link: + y_pred = softmax(raw_prediction) + + Note: We assume y_true to be already label encoded. The inverse link is + softmax. But the full link function is the symmetric multinomial logit + function. + + For a given sample x_i, the categorical cross-entropy loss is defined as + the negative log-likelihood of the multinomial distribution, it + generalizes the binary cross-entropy to more than 2 classes:: + + loss_i = log(sum(exp(raw_pred_{i, k}), k=0..n_classes-1)) + - sum(y_true_{i, k} * raw_pred_{i, k}, k=0..n_classes-1) + + See [1]. + + Note that for the hessian, we calculate only the diagonal part in the + classes: If the full hessian for classes k and l and sample i is H_i_k_l, + we calculate H_i_k_k, i.e. k=l. + + Reference + --------- + .. [1] :arxiv:`Simon, Noah, J. Friedman and T. Hastie. + "A Blockwise Descent Algorithm for Group-penalized Multiresponse and + Multinomial Regression". + <1311.6529>` + """ + + is_multiclass = True + + def __init__(self, sample_weight=None, n_classes=3): + super().__init__( + closs=CyHalfMultinomialLoss(), + link=MultinomialLogit(), + n_classes=n_classes, + ) + self.interval_y_true = Interval(0, np.inf, True, False) + self.interval_y_pred = Interval(0, 1, False, False) + + def in_y_true_range(self, y): + """Return True if y is in the valid range of y_true. + + Parameters + ---------- + y : ndarray + """ + return self.interval_y_true.includes(y) and np.all(y.astype(int) == y) + + def fit_intercept_only(self, y_true, sample_weight=None): + """Compute raw_prediction of an intercept-only model. + + This is the softmax of the weighted average of the target, i.e. over + the samples axis=0. + """ + out = np.zeros(self.n_classes, dtype=y_true.dtype) + eps = np.finfo(y_true.dtype).eps + for k in range(self.n_classes): + out[k] = np.average(y_true == k, weights=sample_weight, axis=0) + out[k] = np.clip(out[k], eps, 1 - eps) + return self.link.link(out[None, :]).reshape(-1) + + def predict_proba(self, raw_prediction): + """Predict probabilities. 
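A short, editor-added check of the equivalence stated above: the link-space form log(1 + exp(raw)) - y*raw matches the probability-space log loss once y_pred = expit(raw_prediction).

import numpy as np
from scipy.special import expit

y_true = np.array([0.0, 1.0, 1.0, 0.25])
raw_prediction = np.array([-2.0, 0.5, 3.0, 0.0])

# log(1 + exp(raw)) - y * raw, evaluated in a numerically stable way.
loss_link = np.logaddexp(0.0, raw_prediction) - y_true * raw_prediction
y_pred = expit(raw_prediction)
loss_proba = -y_true * np.log(y_pred) - (1 - y_true) * np.log(1 - y_pred)
np.testing.assert_allclose(loss_link, loss_proba)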
+ + Parameters + ---------- + raw_prediction : array of shape (n_samples, n_classes) + Raw prediction values (in link space). + + Returns + ------- + proba : array of shape (n_samples, n_classes) + Element-wise class probabilities. + """ + return self.link.inverse(raw_prediction) + + def gradient_proba( + self, + y_true, + raw_prediction, + sample_weight=None, + gradient_out=None, + proba_out=None, + n_threads=1, + ): + """Compute gradient and class probabilities fow raw_prediction. + + Parameters + ---------- + y_true : C-contiguous array of shape (n_samples,) + Observed, true target values. + raw_prediction : array of shape (n_samples, n_classes) + Raw prediction values (in link space). + sample_weight : None or C-contiguous array of shape (n_samples,) + Sample weights. + gradient_out : None or array of shape (n_samples, n_classes) + A location into which the gradient is stored. If None, a new array + might be created. + proba_out : None or array of shape (n_samples, n_classes) + A location into which the class probabilities are stored. If None, + a new array might be created. + n_threads : int, default=1 + Might use openmp thread parallelism. + + Returns + ------- + gradient : array of shape (n_samples, n_classes) + Element-wise gradients. + + proba : array of shape (n_samples, n_classes) + Element-wise class probabilities. + """ + if gradient_out is None: + if proba_out is None: + gradient_out = np.empty_like(raw_prediction) + proba_out = np.empty_like(raw_prediction) + else: + gradient_out = np.empty_like(proba_out) + elif proba_out is None: + proba_out = np.empty_like(gradient_out) + + self.closs.gradient_proba( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient_out=gradient_out, + proba_out=proba_out, + n_threads=n_threads, + ) + return gradient_out, proba_out + + +class ExponentialLoss(BaseLoss): + """Exponential loss with (half) logit link, for binary classification. + + This is also know as boosting loss. + + Domain: + y_true in [0, 1], i.e. regression on the unit interval + y_pred in (0, 1), i.e. boundaries excluded + + Link: + y_pred = expit(2 * raw_prediction) + + For a given sample x_i, the exponential loss is defined as:: + + loss(x_i) = y_true_i * exp(-raw_pred_i)) + (1 - y_true_i) * exp(raw_pred_i) + + See: + - J. Friedman, T. Hastie, R. Tibshirani. + "Additive logistic regression: a statistical view of boosting (With discussion + and a rejoinder by the authors)." Ann. Statist. 28 (2) 337 - 407, April 2000. + https://doi.org/10.1214/aos/1016218223 + - A. Buja, W. Stuetzle, Y. Shen. (2005). + "Loss Functions for Binary Class Probability Estimation and Classification: + Structure and Applications." + + Note that the formulation works for classification, y = {0, 1}, as well as + "exponential logistic" regression, y = [0, 1]. + Note that this is a proper scoring rule, but without it's canonical link. + + More details: Inserting the predicted probability + y_pred = expit(2 * raw_prediction) in the loss gives:: + + loss(x_i) = y_true_i * sqrt((1 - y_pred_i) / y_pred_i) + + (1 - y_true_i) * sqrt(y_pred_i / (1 - y_pred_i)) + """ + + def __init__(self, sample_weight=None): + super().__init__( + closs=CyExponentialLoss(), + link=HalfLogitLink(), + n_classes=2, + ) + self.interval_y_true = Interval(0, 1, True, True) + + def constant_to_optimal_zero(self, y_true, sample_weight=None): + # This is non-zero only if y_true is neither 0 nor 1. 
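Editorial sketch of the categorical cross-entropy formula quoted for HalfMultinomialLoss: loss_i = logsumexp(raw_i) - raw_i[y_i], which equals the negative log of the softmax probability of the true class (the softmax being the inverse link used by predict_proba).

import numpy as np
from scipy.special import logsumexp, softmax

raw_prediction = np.array([[0.2, 0.5, 0.3],
                           [2.0, -1.0, 0.0]])
y_true = np.array([1, 0])                        # label-encoded classes

rows = np.arange(y_true.shape[0])
loss = logsumexp(raw_prediction, axis=1) - raw_prediction[rows, y_true]
proba = softmax(raw_prediction, axis=1)          # inverse link
np.testing.assert_allclose(loss, -np.log(proba[rows, y_true]))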
+ term = -2 * np.sqrt(y_true * (1 - y_true)) + if sample_weight is not None: + term *= sample_weight + return term + + def predict_proba(self, raw_prediction): + """Predict probabilities. + + Parameters + ---------- + raw_prediction : array of shape (n_samples,) or (n_samples, 1) + Raw prediction values (in link space). + + Returns + ------- + proba : array of shape (n_samples, 2) + Element-wise class probabilities. + """ + # Be graceful to shape (n_samples, 1) -> (n_samples,) + if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1: + raw_prediction = raw_prediction.squeeze(1) + proba = np.empty((raw_prediction.shape[0], 2), dtype=raw_prediction.dtype) + proba[:, 1] = self.link.inverse(raw_prediction) + proba[:, 0] = 1 - proba[:, 1] + return proba + + +_LOSSES = { + "squared_error": HalfSquaredError, + "absolute_error": AbsoluteError, + "pinball_loss": PinballLoss, + "huber_loss": HuberLoss, + "poisson_loss": HalfPoissonLoss, + "gamma_loss": HalfGammaLoss, + "tweedie_loss": HalfTweedieLoss, + "binomial_loss": HalfBinomialLoss, + "multinomial_loss": HalfMultinomialLoss, + "exponential_loss": ExponentialLoss, +} diff --git a/.venv/lib/python3.12/site-packages/sklearn/_loss/meson.build b/.venv/lib/python3.12/site-packages/sklearn/_loss/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..a4b3425a21cd21b6dfa69d28ac688ede94ef2bea --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/_loss/meson.build @@ -0,0 +1,23 @@ +# .pyx is generated, so this is needed to make Cython compilation work +_loss_cython_tree = [ + fs.copyfile('_loss.pxd') +] + +_loss_pyx = custom_target( + '_loss_pyx', + output: '_loss.pyx', + input: '_loss.pyx.tp', + command: [tempita, '@INPUT@', '-o', '@OUTDIR@'], + # TODO in principle this should go in py.exension_module below. This is + # temporary work-around for dependency issue with .pyx.tp files. For more + # details, see https://github.com/mesonbuild/meson/issues/13212 + depends: _loss_cython_tree, +) + +py.extension_module( + '_loss', + cython_gen.process(_loss_pyx), + dependencies: [openmp_dep], + install: true, + subdir: 'sklearn/_loss', +) diff --git a/.venv/lib/python3.12/site-packages/sklearn/_loss/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/_loss/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/_loss/tests/test_link.py b/.venv/lib/python3.12/site-packages/sklearn/_loss/tests/test_link.py new file mode 100644 index 0000000000000000000000000000000000000000..e5a665f8d48ac9e356971346774a125b18d234d9 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/_loss/tests/test_link.py @@ -0,0 +1,111 @@ +import numpy as np +import pytest +from numpy.testing import assert_allclose, assert_array_equal + +from sklearn._loss.link import ( + _LINKS, + HalfLogitLink, + Interval, + MultinomialLogit, + _inclusive_low_high, +) + +LINK_FUNCTIONS = list(_LINKS.values()) + + +def test_interval_raises(): + """Test that interval with low > high raises ValueError.""" + with pytest.raises( + ValueError, match="One must have low <= high; got low=1, high=0." 
+ ): + Interval(1, 0, False, False) + + +@pytest.mark.parametrize( + "interval", + [ + Interval(0, 1, False, False), + Interval(0, 1, False, True), + Interval(0, 1, True, False), + Interval(0, 1, True, True), + Interval(-np.inf, np.inf, False, False), + Interval(-np.inf, np.inf, False, True), + Interval(-np.inf, np.inf, True, False), + Interval(-np.inf, np.inf, True, True), + Interval(-10, -1, False, False), + Interval(-10, -1, False, True), + Interval(-10, -1, True, False), + Interval(-10, -1, True, True), + ], +) +def test_is_in_range(interval): + # make sure low and high are always within the interval, used for linspace + low, high = _inclusive_low_high(interval) + + x = np.linspace(low, high, num=10) + assert interval.includes(x) + + # x contains lower bound + assert interval.includes(np.r_[x, interval.low]) == interval.low_inclusive + + # x contains upper bound + assert interval.includes(np.r_[x, interval.high]) == interval.high_inclusive + + # x contains upper and lower bound + assert interval.includes(np.r_[x, interval.low, interval.high]) == ( + interval.low_inclusive and interval.high_inclusive + ) + + +@pytest.mark.parametrize("link", LINK_FUNCTIONS) +def test_link_inverse_identity(link, global_random_seed): + # Test that link of inverse gives identity. + rng = np.random.RandomState(global_random_seed) + link = link() + n_samples, n_classes = 100, None + # The values for `raw_prediction` are limited from -20 to 20 because in the + # class `LogitLink` the term `expit(x)` comes very close to 1 for large + # positive x and therefore loses precision. + if link.is_multiclass: + n_classes = 10 + raw_prediction = rng.uniform(low=-20, high=20, size=(n_samples, n_classes)) + if isinstance(link, MultinomialLogit): + raw_prediction = link.symmetrize_raw_prediction(raw_prediction) + elif isinstance(link, HalfLogitLink): + raw_prediction = rng.uniform(low=-10, high=10, size=(n_samples)) + else: + raw_prediction = rng.uniform(low=-20, high=20, size=(n_samples)) + + assert_allclose(link.link(link.inverse(raw_prediction)), raw_prediction) + y_pred = link.inverse(raw_prediction) + assert_allclose(link.inverse(link.link(y_pred)), y_pred) + + +@pytest.mark.parametrize("link", LINK_FUNCTIONS) +def test_link_out_argument(link): + # Test that out argument gets assigned the result. + rng = np.random.RandomState(42) + link = link() + n_samples, n_classes = 100, None + if link.is_multiclass: + n_classes = 10 + raw_prediction = rng.normal(loc=0, scale=10, size=(n_samples, n_classes)) + if isinstance(link, MultinomialLogit): + raw_prediction = link.symmetrize_raw_prediction(raw_prediction) + else: + # So far, the valid interval of raw_prediction is (-inf, inf) and + # we do not need to distinguish. 
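For illustration only: for the logit link used by HalfBinomialLoss, the round trip that `test_link_inverse_identity` above checks boils down to the expit/logit pair; raw values are kept within +/-20 because expit saturates beyond that, as the comment above explains.

import numpy as np
from scipy.special import expit, logit

raw_prediction = np.linspace(-20, 20, num=11)
y_pred = expit(raw_prediction)                   # inverse link: probabilities in (0, 1)
np.testing.assert_allclose(logit(y_pred), raw_prediction)   # link(inverse(x)) == x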
+ raw_prediction = rng.uniform(low=-10, high=10, size=(n_samples)) + + y_pred = link.inverse(raw_prediction, out=None) + out = np.empty_like(raw_prediction) + y_pred_2 = link.inverse(raw_prediction, out=out) + assert_allclose(y_pred, out) + assert_array_equal(out, y_pred_2) + assert np.shares_memory(out, y_pred_2) + + out = np.empty_like(y_pred) + raw_prediction_2 = link.link(y_pred, out=out) + assert_allclose(raw_prediction, out) + assert_array_equal(out, raw_prediction_2) + assert np.shares_memory(out, raw_prediction_2) diff --git a/.venv/lib/python3.12/site-packages/sklearn/_loss/tests/test_loss.py b/.venv/lib/python3.12/site-packages/sklearn/_loss/tests/test_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..4fea32572902366ed70490d67431cab1d1a29f80 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/_loss/tests/test_loss.py @@ -0,0 +1,1358 @@ +import pickle + +import numpy as np +import pytest +from numpy.testing import assert_allclose, assert_array_equal +from pytest import approx +from scipy.optimize import ( + LinearConstraint, + minimize, + minimize_scalar, + newton, +) +from scipy.special import logsumexp + +from sklearn._loss.link import IdentityLink, _inclusive_low_high +from sklearn._loss.loss import ( + _LOSSES, + AbsoluteError, + BaseLoss, + HalfBinomialLoss, + HalfGammaLoss, + HalfMultinomialLoss, + HalfPoissonLoss, + HalfSquaredError, + HalfTweedieLoss, + HalfTweedieLossIdentity, + HuberLoss, + PinballLoss, +) +from sklearn.utils import assert_all_finite +from sklearn.utils._testing import create_memmap_backed_data, skip_if_32bit + +ALL_LOSSES = list(_LOSSES.values()) + +LOSS_INSTANCES = [loss() for loss in ALL_LOSSES] +# HalfTweedieLoss(power=1.5) is already there as default +LOSS_INSTANCES += [ + PinballLoss(quantile=0.25), + HuberLoss(quantile=0.75), + HalfTweedieLoss(power=-1.5), + HalfTweedieLoss(power=0), + HalfTweedieLoss(power=1), + HalfTweedieLoss(power=2), + HalfTweedieLoss(power=3.0), + HalfTweedieLossIdentity(power=0), + HalfTweedieLossIdentity(power=1), + HalfTweedieLossIdentity(power=2), + HalfTweedieLossIdentity(power=3.0), +] + + +def loss_instance_name(param): + if isinstance(param, BaseLoss): + loss = param + name = loss.__class__.__name__ + if isinstance(loss, PinballLoss): + name += f"(quantile={loss.closs.quantile})" + elif isinstance(loss, HuberLoss): + name += f"(quantile={loss.quantile}" + elif hasattr(loss, "closs") and hasattr(loss.closs, "power"): + name += f"(power={loss.closs.power})" + return name + else: + return str(param) + + +def random_y_true_raw_prediction( + loss, n_samples, y_bound=(-100, 100), raw_bound=(-5, 5), seed=42 +): + """Random generate y_true and raw_prediction in valid range.""" + rng = np.random.RandomState(seed) + if loss.is_multiclass: + raw_prediction = np.empty((n_samples, loss.n_classes)) + raw_prediction.flat[:] = rng.uniform( + low=raw_bound[0], + high=raw_bound[1], + size=n_samples * loss.n_classes, + ) + y_true = np.arange(n_samples).astype(float) % loss.n_classes + else: + # If link is identity, we must respect the interval of y_pred: + if isinstance(loss.link, IdentityLink): + low, high = _inclusive_low_high(loss.interval_y_pred) + low = np.amax([low, raw_bound[0]]) + high = np.amin([high, raw_bound[1]]) + raw_bound = (low, high) + raw_prediction = rng.uniform( + low=raw_bound[0], high=raw_bound[1], size=n_samples + ) + # generate a y_true in valid range + low, high = _inclusive_low_high(loss.interval_y_true) + low = max(low, y_bound[0]) + high = min(high, y_bound[1]) + y_true 
= rng.uniform(low, high, size=n_samples) + # set some values at special boundaries + if loss.interval_y_true.low == 0 and loss.interval_y_true.low_inclusive: + y_true[:: (n_samples // 3)] = 0 + if loss.interval_y_true.high == 1 and loss.interval_y_true.high_inclusive: + y_true[1 :: (n_samples // 3)] = 1 + + return y_true, raw_prediction + + +def numerical_derivative(func, x, eps): + """Helper function for numerical (first) derivatives.""" + # For numerical derivatives, see + # https://en.wikipedia.org/wiki/Numerical_differentiation + # https://en.wikipedia.org/wiki/Finite_difference_coefficient + # We use central finite differences of accuracy 4. + h = np.full_like(x, fill_value=eps) + f_minus_2h = func(x - 2 * h) + f_minus_1h = func(x - h) + f_plus_1h = func(x + h) + f_plus_2h = func(x + 2 * h) + return (-f_plus_2h + 8 * f_plus_1h - 8 * f_minus_1h + f_minus_2h) / (12.0 * eps) + + +@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) +def test_loss_boundary(loss): + """Test interval ranges of y_true and y_pred in losses.""" + # make sure low and high are always within the interval, used for linspace + if loss.is_multiclass: + n_classes = 3 # default value + y_true = np.tile(np.linspace(0, n_classes - 1, num=n_classes), 3) + else: + low, high = _inclusive_low_high(loss.interval_y_true) + y_true = np.linspace(low, high, num=10) + + # add boundaries if they are included + if loss.interval_y_true.low_inclusive: + y_true = np.r_[y_true, loss.interval_y_true.low] + if loss.interval_y_true.high_inclusive: + y_true = np.r_[y_true, loss.interval_y_true.high] + + assert loss.in_y_true_range(y_true) + + n = y_true.shape[0] + low, high = _inclusive_low_high(loss.interval_y_pred) + if loss.is_multiclass: + y_pred = np.empty((n, n_classes)) + y_pred[:, 0] = np.linspace(low, high, num=n) + y_pred[:, 1] = 0.5 * (1 - y_pred[:, 0]) + y_pred[:, 2] = 0.5 * (1 - y_pred[:, 0]) + else: + y_pred = np.linspace(low, high, num=n) + + assert loss.in_y_pred_range(y_pred) + + # calculating losses should not fail + raw_prediction = loss.link.link(y_pred) + loss.loss(y_true=y_true, raw_prediction=raw_prediction) + + +# Fixture to test valid value ranges. +Y_COMMON_PARAMS = [ + # (loss, [y success], [y fail]) + (HalfSquaredError(), [-100, 0, 0.1, 100], [-np.inf, np.inf]), + (AbsoluteError(), [-100, 0, 0.1, 100], [-np.inf, np.inf]), + (PinballLoss(), [-100, 0, 0.1, 100], [-np.inf, np.inf]), + (HuberLoss(), [-100, 0, 0.1, 100], [-np.inf, np.inf]), + (HalfPoissonLoss(), [0.1, 100], [-np.inf, -3, -0.1, np.inf]), + (HalfGammaLoss(), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), + (HalfTweedieLoss(power=-3), [0.1, 100], [-np.inf, np.inf]), + (HalfTweedieLoss(power=0), [0.1, 100], [-np.inf, np.inf]), + (HalfTweedieLoss(power=1.5), [0.1, 100], [-np.inf, -3, -0.1, np.inf]), + (HalfTweedieLoss(power=2), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), + (HalfTweedieLoss(power=3), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), + (HalfTweedieLossIdentity(power=-3), [0.1, 100], [-np.inf, np.inf]), + (HalfTweedieLossIdentity(power=0), [-3, -0.1, 0, 0.1, 100], [-np.inf, np.inf]), + (HalfTweedieLossIdentity(power=1.5), [0.1, 100], [-np.inf, -3, -0.1, np.inf]), + (HalfTweedieLossIdentity(power=2), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), + (HalfTweedieLossIdentity(power=3), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), + (HalfBinomialLoss(), [0.1, 0.5, 0.9], [-np.inf, -1, 2, np.inf]), + (HalfMultinomialLoss(), [], [-np.inf, -1, 1.1, np.inf]), +] +# y_pred and y_true do not always have the same domain (valid value range). 
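Editor's sketch of the fourth-order central-difference scheme used by `numerical_derivative` above, restated standalone (the helper name `central_diff` is mine) and checked against a function with a known derivative.

import numpy as np

def central_diff(func, x, eps):
    # Central finite differences of accuracy 4, as in numerical_derivative above.
    h = np.full_like(x, fill_value=eps)
    return (
        -func(x + 2 * h) + 8 * func(x + h) - 8 * func(x - h) + func(x - 2 * h)
    ) / (12.0 * eps)

x = np.array([0.0, 0.5, 1.0])
# d/dx exp(x) = exp(x), recovered to ~1e-13 relative accuracy.
np.testing.assert_allclose(central_diff(np.exp, x, eps=1e-3), np.exp(x), rtol=1e-10)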
+# Hence, we define extra sets of parameters for each of them. +Y_TRUE_PARAMS = [ # type: ignore[var-annotated] + # (loss, [y success], [y fail]) + (HalfPoissonLoss(), [0], []), + (HuberLoss(), [0], []), + (HalfTweedieLoss(power=-3), [-100, -0.1, 0], []), + (HalfTweedieLoss(power=0), [-100, 0], []), + (HalfTweedieLoss(power=1.5), [0], []), + (HalfTweedieLossIdentity(power=-3), [-100, -0.1, 0], []), + (HalfTweedieLossIdentity(power=0), [-100, 0], []), + (HalfTweedieLossIdentity(power=1.5), [0], []), + (HalfBinomialLoss(), [0, 1], []), + (HalfMultinomialLoss(), [0.0, 1.0, 2], []), +] +Y_PRED_PARAMS = [ + # (loss, [y success], [y fail]) + (HalfPoissonLoss(), [], [0]), + (HalfTweedieLoss(power=-3), [], [-3, -0.1, 0]), + (HalfTweedieLoss(power=0), [], [-3, -0.1, 0]), + (HalfTweedieLoss(power=1.5), [], [0]), + (HalfTweedieLossIdentity(power=-3), [], [-3, -0.1, 0]), + (HalfTweedieLossIdentity(power=0), [-3, -0.1, 0], []), + (HalfTweedieLossIdentity(power=1.5), [], [0]), + (HalfBinomialLoss(), [], [0, 1]), + (HalfMultinomialLoss(), [0.1, 0.5], [0, 1]), +] + + +@pytest.mark.parametrize( + "loss, y_true_success, y_true_fail", + Y_COMMON_PARAMS + Y_TRUE_PARAMS, # type: ignore[operator] +) +def test_loss_boundary_y_true(loss, y_true_success, y_true_fail): + """Test boundaries of y_true for loss functions.""" + for y in y_true_success: + assert loss.in_y_true_range(np.array([y])) + for y in y_true_fail: + assert not loss.in_y_true_range(np.array([y])) + + +@pytest.mark.parametrize( + "loss, y_pred_success, y_pred_fail", + Y_COMMON_PARAMS + Y_PRED_PARAMS, # type: ignore[operator] +) +def test_loss_boundary_y_pred(loss, y_pred_success, y_pred_fail): + """Test boundaries of y_pred for loss functions.""" + for y in y_pred_success: + assert loss.in_y_pred_range(np.array([y])) + for y in y_pred_fail: + assert not loss.in_y_pred_range(np.array([y])) + + +@pytest.mark.parametrize( + "loss, y_true, raw_prediction, loss_true, gradient_true, hessian_true", + [ + (HalfSquaredError(), 1.0, 5.0, 8, 4, 1), + (AbsoluteError(), 1.0, 5.0, 4.0, 1.0, None), + (PinballLoss(quantile=0.5), 1.0, 5.0, 2, 0.5, None), + (PinballLoss(quantile=0.25), 1.0, 5.0, 4 * (1 - 0.25), 1 - 0.25, None), + (PinballLoss(quantile=0.25), 5.0, 1.0, 4 * 0.25, -0.25, None), + (HuberLoss(quantile=0.5, delta=3), 1.0, 5.0, 3 * (4 - 3 / 2), None, None), + (HuberLoss(quantile=0.5, delta=3), 1.0, 3.0, 0.5 * 2**2, None, None), + (HalfPoissonLoss(), 2.0, np.log(4), 4 - 2 * np.log(4), 4 - 2, 4), + (HalfGammaLoss(), 2.0, np.log(4), np.log(4) + 2 / 4, 1 - 2 / 4, 2 / 4), + (HalfTweedieLoss(power=3), 2.0, np.log(4), -1 / 4 + 1 / 4**2, None, None), + (HalfTweedieLossIdentity(power=1), 2.0, 4.0, 2 - 2 * np.log(2), None, None), + (HalfTweedieLossIdentity(power=2), 2.0, 4.0, np.log(2) - 1 / 2, None, None), + ( + HalfTweedieLossIdentity(power=3), + 2.0, + 4.0, + -1 / 4 + 1 / 4**2 + 1 / 2 / 2, + None, + None, + ), + ( + HalfBinomialLoss(), + 0.25, + np.log(4), + np.log1p(4) - 0.25 * np.log(4), + None, + None, + ), + # Extreme log loss cases, checked with mpmath: + # import mpmath as mp + # + # # Stolen from scipy + # def mpf2float(x): + # return float(mp.nstr(x, 17, min_fixed=0, max_fixed=0)) + # + # def mp_logloss(y_true, raw): + # with mp.workdps(100): + # y_true, raw = mp.mpf(float(y_true)), mp.mpf(float(raw)) + # out = mp.log1p(mp.exp(raw)) - y_true * raw + # return mpf2float(out) + # + # def mp_gradient(y_true, raw): + # with mp.workdps(100): + # y_true, raw = mp.mpf(float(y_true)), mp.mpf(float(raw)) + # out = mp.mpf(1) / (mp.mpf(1) + mp.exp(-raw)) - y_true + # 
return mpf2float(out) + # + # def mp_hessian(y_true, raw): + # with mp.workdps(100): + # y_true, raw = mp.mpf(float(y_true)), mp.mpf(float(raw)) + # p = mp.mpf(1) / (mp.mpf(1) + mp.exp(-raw)) + # out = p * (mp.mpf(1) - p) + # return mpf2float(out) + # + # y, raw = 0.0, 37. + # mp_logloss(y, raw), mp_gradient(y, raw), mp_hessian(y, raw) + (HalfBinomialLoss(), 0.0, -1e20, 0, 0, 0), + (HalfBinomialLoss(), 1.0, -1e20, 1e20, -1, 0), + (HalfBinomialLoss(), 0.0, -1e3, 0, 0, 0), + (HalfBinomialLoss(), 1.0, -1e3, 1e3, -1, 0), + (HalfBinomialLoss(), 1.0, -37.5, 37.5, -1, 0), + (HalfBinomialLoss(), 1.0, -37.0, 37, 1e-16 - 1, 8.533047625744065e-17), + (HalfBinomialLoss(), 0.0, -37.0, *[8.533047625744065e-17] * 3), + (HalfBinomialLoss(), 1.0, -36.9, 36.9, 1e-16 - 1, 9.430476078526806e-17), + (HalfBinomialLoss(), 0.0, -36.9, *[9.430476078526806e-17] * 3), + (HalfBinomialLoss(), 0.0, 37.0, 37, 1 - 1e-16, 8.533047625744065e-17), + (HalfBinomialLoss(), 1.0, 37.0, *[8.533047625744066e-17] * 3), + (HalfBinomialLoss(), 0.0, 37.5, 37.5, 1, 5.175555005801868e-17), + (HalfBinomialLoss(), 0.0, 232.8, 232.8, 1, 1.4287342391028437e-101), + (HalfBinomialLoss(), 1.0, 1e20, 0, 0, 0), + (HalfBinomialLoss(), 0.0, 1e20, 1e20, 1, 0), + ( + HalfBinomialLoss(), + 1.0, + 232.8, + 0, + -1.4287342391028437e-101, + 1.4287342391028437e-101, + ), + (HalfBinomialLoss(), 1.0, 232.9, 0, 0, 0), + (HalfBinomialLoss(), 1.0, 1e3, 0, 0, 0), + (HalfBinomialLoss(), 0.0, 1e3, 1e3, 1, 0), + ( + HalfMultinomialLoss(n_classes=3), + 0.0, + [0.2, 0.5, 0.3], + logsumexp([0.2, 0.5, 0.3]) - 0.2, + None, + None, + ), + ( + HalfMultinomialLoss(n_classes=3), + 1.0, + [0.2, 0.5, 0.3], + logsumexp([0.2, 0.5, 0.3]) - 0.5, + None, + None, + ), + ( + HalfMultinomialLoss(n_classes=3), + 2.0, + [0.2, 0.5, 0.3], + logsumexp([0.2, 0.5, 0.3]) - 0.3, + None, + None, + ), + ( + HalfMultinomialLoss(n_classes=3), + 2.0, + [1e4, 0, 7e-7], + logsumexp([1e4, 0, 7e-7]) - (7e-7), + None, + None, + ), + ], + ids=loss_instance_name, +) +def test_loss_on_specific_values( + loss, y_true, raw_prediction, loss_true, gradient_true, hessian_true +): + """Test losses, gradients and hessians at specific values.""" + loss1 = loss(y_true=np.array([y_true]), raw_prediction=np.array([raw_prediction])) + grad1 = loss.gradient( + y_true=np.array([y_true]), raw_prediction=np.array([raw_prediction]) + ) + loss2, grad2 = loss.loss_gradient( + y_true=np.array([y_true]), raw_prediction=np.array([raw_prediction]) + ) + grad3, hess = loss.gradient_hessian( + y_true=np.array([y_true]), raw_prediction=np.array([raw_prediction]) + ) + + assert loss1 == approx(loss_true, rel=1e-15, abs=1e-15) + assert loss2 == approx(loss_true, rel=1e-15, abs=1e-15) + + if gradient_true is not None: + assert grad1 == approx(gradient_true, rel=1e-15, abs=1e-15) + assert grad2 == approx(gradient_true, rel=1e-15, abs=1e-15) + assert grad3 == approx(gradient_true, rel=1e-15, abs=1e-15) + + if hessian_true is not None: + assert hess == approx(hessian_true, rel=1e-15, abs=1e-15) + + +@pytest.mark.parametrize("loss", ALL_LOSSES) +@pytest.mark.parametrize("readonly_memmap", [False, True]) +@pytest.mark.parametrize("dtype_in", [np.float32, np.float64]) +@pytest.mark.parametrize("dtype_out", [np.float32, np.float64]) +@pytest.mark.parametrize("sample_weight", [None, 1]) +@pytest.mark.parametrize("out1", [None, 1]) +@pytest.mark.parametrize("out2", [None, 1]) +@pytest.mark.parametrize("n_threads", [1, 2]) +def test_loss_dtype( + loss, readonly_memmap, dtype_in, dtype_out, sample_weight, out1, out2, n_threads +): + """Test 
acceptance of dtypes, readonly and writeable arrays in loss functions. + + Check that loss accepts if all input arrays are either all float32 or all + float64, and all output arrays are either all float32 or all float64. + + Also check that input arrays can be readonly, e.g. memory mapped. + """ + loss = loss() + # generate a y_true and raw_prediction in valid range + n_samples = 5 + y_true, raw_prediction = random_y_true_raw_prediction( + loss=loss, + n_samples=n_samples, + y_bound=(-100, 100), + raw_bound=(-10, 10), + seed=42, + ) + y_true = y_true.astype(dtype_in) + raw_prediction = raw_prediction.astype(dtype_in) + + if sample_weight is not None: + sample_weight = np.array([2.0] * n_samples, dtype=dtype_in) + if out1 is not None: + out1 = np.empty_like(y_true, dtype=dtype_out) + if out2 is not None: + out2 = np.empty_like(raw_prediction, dtype=dtype_out) + + if readonly_memmap: + y_true = create_memmap_backed_data(y_true) + raw_prediction = create_memmap_backed_data(raw_prediction) + if sample_weight is not None: + sample_weight = create_memmap_backed_data(sample_weight) + + l = loss.loss( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + loss_out=out1, + n_threads=n_threads, + ) + assert l is out1 if out1 is not None else True + g = loss.gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient_out=out2, + n_threads=n_threads, + ) + assert g is out2 if out2 is not None else True + l, g = loss.loss_gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + loss_out=out1, + gradient_out=out2, + n_threads=n_threads, + ) + assert l is out1 if out1 is not None else True + assert g is out2 if out2 is not None else True + if out1 is not None and loss.is_multiclass: + out1 = np.empty_like(raw_prediction, dtype=dtype_out) + g, h = loss.gradient_hessian( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient_out=out1, + hessian_out=out2, + n_threads=n_threads, + ) + assert g is out1 if out1 is not None else True + assert h is out2 if out2 is not None else True + loss(y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight) + loss.fit_intercept_only(y_true=y_true, sample_weight=sample_weight) + loss.constant_to_optimal_zero(y_true=y_true, sample_weight=sample_weight) + if hasattr(loss, "predict_proba"): + loss.predict_proba(raw_prediction=raw_prediction) + if hasattr(loss, "gradient_proba"): + g, p = loss.gradient_proba( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient_out=out1, + proba_out=out2, + n_threads=n_threads, + ) + assert g is out1 if out1 is not None else True + assert p is out2 if out2 is not None else True + + +@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) +@pytest.mark.parametrize("sample_weight", [None, "range"]) +def test_loss_same_as_C_functions(loss, sample_weight): + """Test that Python and Cython functions return same results.""" + y_true, raw_prediction = random_y_true_raw_prediction( + loss=loss, + n_samples=20, + y_bound=(-100, 100), + raw_bound=(-10, 10), + seed=42, + ) + if sample_weight == "range": + sample_weight = np.linspace(1, y_true.shape[0], num=y_true.shape[0]) + + out_l1 = np.empty_like(y_true) + out_l2 = np.empty_like(y_true) + out_g1 = np.empty_like(raw_prediction) + out_g2 = np.empty_like(raw_prediction) + out_h1 = np.empty_like(raw_prediction) + out_h2 = np.empty_like(raw_prediction) + loss.loss( + y_true=y_true, + 
raw_prediction=raw_prediction, + sample_weight=sample_weight, + loss_out=out_l1, + ) + loss.closs.loss( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + loss_out=out_l2, + ) + assert_allclose(out_l1, out_l2) + loss.gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient_out=out_g1, + ) + loss.closs.gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient_out=out_g2, + ) + assert_allclose(out_g1, out_g2) + loss.closs.loss_gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + loss_out=out_l1, + gradient_out=out_g1, + ) + loss.closs.loss_gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + loss_out=out_l2, + gradient_out=out_g2, + ) + assert_allclose(out_l1, out_l2) + assert_allclose(out_g1, out_g2) + loss.gradient_hessian( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient_out=out_g1, + hessian_out=out_h1, + ) + loss.closs.gradient_hessian( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient_out=out_g2, + hessian_out=out_h2, + ) + assert_allclose(out_g1, out_g2) + assert_allclose(out_h1, out_h2) + + +@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) +@pytest.mark.parametrize("sample_weight", [None, "range"]) +def test_loss_gradients_are_the_same(loss, sample_weight, global_random_seed): + """Test that loss and gradient are the same across different functions. + + Also test that output arguments contain correct results. + """ + y_true, raw_prediction = random_y_true_raw_prediction( + loss=loss, + n_samples=20, + y_bound=(-100, 100), + raw_bound=(-10, 10), + seed=global_random_seed, + ) + if sample_weight == "range": + sample_weight = np.linspace(1, y_true.shape[0], num=y_true.shape[0]) + + out_l1 = np.empty_like(y_true) + out_l2 = np.empty_like(y_true) + out_g1 = np.empty_like(raw_prediction) + out_g2 = np.empty_like(raw_prediction) + out_g3 = np.empty_like(raw_prediction) + out_h3 = np.empty_like(raw_prediction) + + l1 = loss.loss( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + loss_out=out_l1, + ) + g1 = loss.gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient_out=out_g1, + ) + l2, g2 = loss.loss_gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + loss_out=out_l2, + gradient_out=out_g2, + ) + g3, h3 = loss.gradient_hessian( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient_out=out_g3, + hessian_out=out_h3, + ) + assert_allclose(l1, l2) + assert_array_equal(l1, out_l1) + assert np.shares_memory(l1, out_l1) + assert_array_equal(l2, out_l2) + assert np.shares_memory(l2, out_l2) + assert_allclose(g1, g2) + assert_allclose(g1, g3) + assert_array_equal(g1, out_g1) + assert np.shares_memory(g1, out_g1) + assert_array_equal(g2, out_g2) + assert np.shares_memory(g2, out_g2) + assert_array_equal(g3, out_g3) + assert np.shares_memory(g3, out_g3) + + if hasattr(loss, "gradient_proba"): + assert loss.is_multiclass # only for HalfMultinomialLoss + out_g4 = np.empty_like(raw_prediction) + out_proba = np.empty_like(raw_prediction) + g4, proba = loss.gradient_proba( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient_out=out_g4, + proba_out=out_proba, + ) + assert_allclose(g1, out_g4) + 
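Aside from the editor, nothing sklearn-specific assumed: the `np.shares_memory` assertions above rely on the usual NumPy `out=` convention, namely that the returned array is the provided output buffer, as this toy ufunc call shows.

import numpy as np

x = np.array([1.0, 2.0, 3.0])
out = np.empty_like(x)
result = np.add(x, 1.0, out=out)      # the returned array is the out buffer itself
assert result is out
assert np.shares_memory(result, out)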
assert_allclose(g1, g4) + assert_allclose(proba, out_proba) + assert_allclose(np.sum(proba, axis=1), 1, rtol=1e-11) + + +@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) +@pytest.mark.parametrize("sample_weight", ["ones", "random"]) +def test_sample_weight_multiplies(loss, sample_weight, global_random_seed): + """Test sample weights in loss, gradients and hessians. + + Make sure that passing sample weights to loss, gradient and hessian + computation methods is equivalent to multiplying by the weights. + """ + n_samples = 100 + y_true, raw_prediction = random_y_true_raw_prediction( + loss=loss, + n_samples=n_samples, + y_bound=(-100, 100), + raw_bound=(-5, 5), + seed=global_random_seed, + ) + + if sample_weight == "ones": + sample_weight = np.ones(shape=n_samples, dtype=np.float64) + else: + rng = np.random.RandomState(global_random_seed) + sample_weight = rng.normal(size=n_samples).astype(np.float64) + + assert_allclose( + loss.loss( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + ), + sample_weight + * loss.loss( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=None, + ), + ) + + losses, gradient = loss.loss_gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=None, + ) + losses_sw, gradient_sw = loss.loss_gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + ) + assert_allclose(losses * sample_weight, losses_sw) + if not loss.is_multiclass: + assert_allclose(gradient * sample_weight, gradient_sw) + else: + assert_allclose(gradient * sample_weight[:, None], gradient_sw) + + gradient, hessian = loss.gradient_hessian( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=None, + ) + gradient_sw, hessian_sw = loss.gradient_hessian( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + ) + if not loss.is_multiclass: + assert_allclose(gradient * sample_weight, gradient_sw) + assert_allclose(hessian * sample_weight, hessian_sw) + else: + assert_allclose(gradient * sample_weight[:, None], gradient_sw) + assert_allclose(hessian * sample_weight[:, None], hessian_sw) + + +@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) +def test_graceful_squeezing(loss): + """Test that reshaped raw_prediction gives same results.""" + y_true, raw_prediction = random_y_true_raw_prediction( + loss=loss, + n_samples=20, + y_bound=(-100, 100), + raw_bound=(-10, 10), + seed=42, + ) + + if raw_prediction.ndim == 1: + raw_prediction_2d = raw_prediction[:, None] + assert_allclose( + loss.loss(y_true=y_true, raw_prediction=raw_prediction_2d), + loss.loss(y_true=y_true, raw_prediction=raw_prediction), + ) + assert_allclose( + loss.loss_gradient(y_true=y_true, raw_prediction=raw_prediction_2d), + loss.loss_gradient(y_true=y_true, raw_prediction=raw_prediction), + ) + assert_allclose( + loss.gradient(y_true=y_true, raw_prediction=raw_prediction_2d), + loss.gradient(y_true=y_true, raw_prediction=raw_prediction), + ) + assert_allclose( + loss.gradient_hessian(y_true=y_true, raw_prediction=raw_prediction_2d), + loss.gradient_hessian(y_true=y_true, raw_prediction=raw_prediction), + ) + + +@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) +@pytest.mark.parametrize("sample_weight", [None, "range"]) +def test_loss_of_perfect_prediction(loss, sample_weight): + """Test value of perfect predictions. + + Loss of y_pred = y_true plus constant_to_optimal_zero should sums up to + zero. 
+ """ + if not loss.is_multiclass: + # Use small values such that exp(value) is not nan. + raw_prediction = np.array([-10, -0.1, 0, 0.1, 3, 10]) + # If link is identity, we must respect the interval of y_pred: + if isinstance(loss.link, IdentityLink): + eps = 1e-10 + low = loss.interval_y_pred.low + if not loss.interval_y_pred.low_inclusive: + low = low + eps + high = loss.interval_y_pred.high + if not loss.interval_y_pred.high_inclusive: + high = high - eps + raw_prediction = np.clip(raw_prediction, low, high) + y_true = loss.link.inverse(raw_prediction) + else: + # HalfMultinomialLoss + y_true = np.arange(loss.n_classes).astype(float) + # raw_prediction with entries -exp(10), but +exp(10) on the diagonal + # this is close enough to np.inf which would produce nan + raw_prediction = np.full( + shape=(loss.n_classes, loss.n_classes), + fill_value=-np.exp(10), + dtype=float, + ) + raw_prediction.flat[:: loss.n_classes + 1] = np.exp(10) + + if sample_weight == "range": + sample_weight = np.linspace(1, y_true.shape[0], num=y_true.shape[0]) + + loss_value = loss.loss( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + ) + constant_term = loss.constant_to_optimal_zero( + y_true=y_true, sample_weight=sample_weight + ) + # Comparing loss_value + constant_term to zero would result in large + # round-off errors. + assert_allclose(loss_value, -constant_term, atol=1e-14, rtol=1e-15) + + +@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) +@pytest.mark.parametrize("sample_weight", [None, "range"]) +def test_gradients_hessians_numerically(loss, sample_weight, global_random_seed): + """Test gradients and hessians with numerical derivatives. + + Gradient should equal the numerical derivatives of the loss function. + Hessians should equal the numerical derivatives of gradients. + """ + n_samples = 20 + y_true, raw_prediction = random_y_true_raw_prediction( + loss=loss, + n_samples=n_samples, + y_bound=(-100, 100), + raw_bound=(-5, 5), + seed=global_random_seed, + ) + + if sample_weight == "range": + sample_weight = np.linspace(1, y_true.shape[0], num=y_true.shape[0]) + + g, h = loss.gradient_hessian( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + ) + + assert g.shape == raw_prediction.shape + assert h.shape == raw_prediction.shape + + if not loss.is_multiclass: + + def loss_func(x): + return loss.loss( + y_true=y_true, + raw_prediction=x, + sample_weight=sample_weight, + ) + + g_numeric = numerical_derivative(loss_func, raw_prediction, eps=1e-6) + assert_allclose(g, g_numeric, rtol=5e-6, atol=1e-10) + + def grad_func(x): + return loss.gradient( + y_true=y_true, + raw_prediction=x, + sample_weight=sample_weight, + ) + + h_numeric = numerical_derivative(grad_func, raw_prediction, eps=1e-6) + if loss.approx_hessian: + # TODO: What could we test if loss.approx_hessian? + pass + else: + assert_allclose(h, h_numeric, rtol=5e-6, atol=1e-10) + else: + # For multiclass loss, we should only change the predictions of the + # class for which the derivative is taken for, e.g. offset[:, k] = eps + # for class k. + # As a softmax is computed, offsetting the whole array by a constant + # would have no effect on the probabilities, and thus on the loss. 
+ for k in range(loss.n_classes): + + def loss_func(x): + raw = raw_prediction.copy() + raw[:, k] = x + return loss.loss( + y_true=y_true, + raw_prediction=raw, + sample_weight=sample_weight, + ) + + g_numeric = numerical_derivative(loss_func, raw_prediction[:, k], eps=1e-5) + assert_allclose(g[:, k], g_numeric, rtol=5e-6, atol=1e-10) + + def grad_func(x): + raw = raw_prediction.copy() + raw[:, k] = x + return loss.gradient( + y_true=y_true, + raw_prediction=raw, + sample_weight=sample_weight, + )[:, k] + + h_numeric = numerical_derivative(grad_func, raw_prediction[:, k], eps=1e-6) + if loss.approx_hessian: + # TODO: What could we test if loss.approx_hessian? + pass + else: + assert_allclose(h[:, k], h_numeric, rtol=5e-6, atol=1e-10) + + +@pytest.mark.parametrize( + "loss, x0, y_true", + [ + ("squared_error", -2.0, 42), + ("squared_error", 117.0, 1.05), + ("squared_error", 0.0, 0.0), + # The argmin of binomial_loss for y_true=0 and y_true=1 is resp. + # -inf and +inf due to logit, cf. "complete separation". Therefore, we + # use 0 < y_true < 1. + ("binomial_loss", 0.3, 0.1), + ("binomial_loss", -12, 0.2), + ("binomial_loss", 30, 0.9), + ("poisson_loss", 12.0, 1.0), + ("poisson_loss", 0.0, 2.0), + ("poisson_loss", -22.0, 10.0), + ], +) +@skip_if_32bit +def test_derivatives(loss, x0, y_true): + """Test that gradients are zero at the minimum of the loss. + + We check this on a single value/sample using Halley's method with the + first and second order derivatives computed by the Loss instance. + Note that methods of Loss instances operate on arrays while the newton + root finder expects a scalar or a one-element array for this purpose. + """ + loss = _LOSSES[loss](sample_weight=None) + y_true = np.array([y_true], dtype=np.float64) + x0 = np.array([x0], dtype=np.float64) + + def func(x: np.ndarray) -> np.ndarray: + """Compute loss plus constant term. + + The constant term is such that the minimum function value is zero, + which is required by the Newton method. + """ + return loss.loss( + y_true=y_true, raw_prediction=x + ) + loss.constant_to_optimal_zero(y_true=y_true) + + def fprime(x: np.ndarray) -> np.ndarray: + return loss.gradient(y_true=y_true, raw_prediction=x) + + def fprime2(x: np.ndarray) -> np.ndarray: + return loss.gradient_hessian(y_true=y_true, raw_prediction=x)[1] + + optimum = newton( + func, + x0=x0, + fprime=fprime, + fprime2=fprime2, + maxiter=100, + tol=5e-8, + ) + + # Need to ravel arrays because assert_allclose requires matching + # dimensions. + y_true = y_true.ravel() + optimum = optimum.ravel() + assert_allclose(loss.link.inverse(optimum), y_true) + assert_allclose(func(optimum), 0, atol=1e-14) + assert_allclose(loss.gradient(y_true=y_true, raw_prediction=optimum), 0, atol=5e-7) + + +@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) +@pytest.mark.parametrize("sample_weight", [None, "range"]) +def test_loss_intercept_only(loss, sample_weight): + """Test that fit_intercept_only returns the argmin of the loss. + + Also test that the gradient is zero at the minimum. 
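Editorial sketch of the root-finding pattern used by `test_derivatives` above: `scipy.optimize.newton` with `fprime` and `fprime2` (Halley's method) applied to a toy function whose minimum value is zero, f(x) = (x - 2)**2, instead of an actual loss.

import numpy as np
from scipy.optimize import newton

optimum = newton(
    func=lambda x: (x - 2.0) ** 2,       # plays the role of loss + constant term
    x0=0.0,
    fprime=lambda x: 2.0 * (x - 2.0),    # first derivative (the "gradient")
    fprime2=lambda x: 2.0,               # second derivative (the "hessian")
    maxiter=100,
    tol=5e-8,
)
assert np.isclose(optimum, 2.0, atol=1e-6)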
+ """ + n_samples = 50 + if not loss.is_multiclass: + y_true = loss.link.inverse(np.linspace(-4, 4, num=n_samples)) + else: + y_true = np.arange(n_samples).astype(np.float64) % loss.n_classes + y_true[::5] = 0 # exceedance of class 0 + + if sample_weight == "range": + sample_weight = np.linspace(0.1, 2, num=n_samples) + + a = loss.fit_intercept_only(y_true=y_true, sample_weight=sample_weight) + + # find minimum by optimization + def fun(x): + if not loss.is_multiclass: + raw_prediction = np.full(shape=(n_samples), fill_value=x) + else: + raw_prediction = np.ascontiguousarray( + np.broadcast_to(x, shape=(n_samples, loss.n_classes)) + ) + return loss( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + ) + + if not loss.is_multiclass: + opt = minimize_scalar(fun, tol=1e-7, options={"maxiter": 100}) + grad = loss.gradient( + y_true=y_true, + raw_prediction=np.full_like(y_true, a), + sample_weight=sample_weight, + ) + assert a.shape == tuple() # scalar + assert a.dtype == y_true.dtype + assert_all_finite(a) + a == approx(opt.x, rel=1e-7) + grad.sum() == approx(0, abs=1e-12) + else: + # The constraint corresponds to sum(raw_prediction) = 0. Without it, we would + # need to apply loss.symmetrize_raw_prediction to opt.x before comparing. + opt = minimize( + fun, + np.zeros((loss.n_classes)), + tol=1e-13, + options={"maxiter": 100}, + method="SLSQP", + constraints=LinearConstraint(np.ones((1, loss.n_classes)), 0, 0), + ) + grad = loss.gradient( + y_true=y_true, + raw_prediction=np.tile(a, (n_samples, 1)), + sample_weight=sample_weight, + ) + assert a.dtype == y_true.dtype + assert_all_finite(a) + assert_allclose(a, opt.x, rtol=5e-6, atol=1e-12) + assert_allclose(grad.sum(axis=0), 0, atol=1e-12) + + +@pytest.mark.parametrize( + "loss, func, random_dist", + [ + (HalfSquaredError(), np.mean, "normal"), + (AbsoluteError(), np.median, "normal"), + (PinballLoss(quantile=0.25), lambda x: np.percentile(x, q=25), "normal"), + (HalfPoissonLoss(), np.mean, "poisson"), + (HalfGammaLoss(), np.mean, "exponential"), + (HalfTweedieLoss(), np.mean, "exponential"), + (HalfBinomialLoss(), np.mean, "binomial"), + ], +) +def test_specific_fit_intercept_only(loss, func, random_dist, global_random_seed): + """Test that fit_intercept_only returns the correct functional. + + We test the functional for specific, meaningful distributions, e.g. + squared error estimates the expectation of a probability distribution. + """ + rng = np.random.RandomState(global_random_seed) + if random_dist == "binomial": + y_train = rng.binomial(1, 0.5, size=100) + else: + y_train = getattr(rng, random_dist)(size=100) + baseline_prediction = loss.fit_intercept_only(y_true=y_train) + # Make sure baseline prediction is the expected functional=func, e.g. mean + # or median. 
+    assert_all_finite(baseline_prediction)
+    assert baseline_prediction == approx(loss.link.link(func(y_train)))
+    assert loss.link.inverse(baseline_prediction) == approx(func(y_train))
+    if isinstance(loss.link, IdentityLink):
+        assert_allclose(loss.link.inverse(baseline_prediction), baseline_prediction)
+
+    # Test baseline at boundary
+    if loss.interval_y_true.low_inclusive:
+        y_train.fill(loss.interval_y_true.low)
+        baseline_prediction = loss.fit_intercept_only(y_true=y_train)
+        assert_all_finite(baseline_prediction)
+    if loss.interval_y_true.high_inclusive:
+        y_train.fill(loss.interval_y_true.high)
+        baseline_prediction = loss.fit_intercept_only(y_true=y_train)
+        assert_all_finite(baseline_prediction)
+
+
+def test_multinomial_loss_fit_intercept_only():
+    """Test that fit_intercept_only returns the mean functional for CCE."""
+    rng = np.random.RandomState(0)
+    n_classes = 4
+    loss = HalfMultinomialLoss(n_classes=n_classes)
+    # Same logic as test_specific_fit_intercept_only. Here inverse link
+    # function = softmax and link function = log - symmetry term.
+    y_train = rng.randint(0, n_classes, size=100).astype(np.float64)
+    baseline_prediction = loss.fit_intercept_only(y_true=y_train)
+    assert baseline_prediction.shape == (n_classes,)
+    p = np.zeros(n_classes, dtype=y_train.dtype)
+    for k in range(n_classes):
+        p[k] = (y_train == k).mean()
+    assert_allclose(baseline_prediction, np.log(p) - np.mean(np.log(p)))
+    assert_allclose(baseline_prediction[None, :], loss.link.link(p[None, :]))
+
+    for y_train in (np.zeros(shape=10), np.ones(shape=10)):
+        y_train = y_train.astype(np.float64)
+        baseline_prediction = loss.fit_intercept_only(y_true=y_train)
+        assert baseline_prediction.dtype == y_train.dtype
+        assert_all_finite(baseline_prediction)
+
+
+def test_multinomial_cy_gradient(global_random_seed):
+    """Test that Multinomial cy_gradient gives the same result as gradient.
+
+    CyHalfMultinomialLoss does not inherit from CyLossFunction and has a different API.
+    As a consequence, the functions like `loss` and `gradient` do not rely on `cy_loss`
+    and `cy_gradient`.
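+
+    For reference, the quantity both code paths are expected to produce is the
+    standard softmax cross-entropy gradient, per sample ``i`` and class ``k``
+    (sketch, with ``p = softmax(raw_prediction[i])``)::
+
+        gradient[i, k] = sample_weight[i] * (p[k] - (y_true[i] == k))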
+ """ + n_samples = 100 + n_classes = 5 + loss = HalfMultinomialLoss(n_classes=n_classes) + y_true, raw_prediction = random_y_true_raw_prediction( + loss=loss, + n_samples=n_samples, + seed=global_random_seed, + ) + sample_weight = np.linspace(0.1, 2, num=n_samples) + + grad1 = loss.closs._test_cy_gradient( + y_true=y_true, + raw_prediction=raw_prediction, # needs to be C-contiguous + sample_weight=sample_weight, + ) + grad2 = loss.gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + ) + assert_allclose(grad1, grad2) + + +def test_binomial_and_multinomial_loss(global_random_seed): + """Test that multinomial loss with n_classes = 2 is the same as binomial loss.""" + rng = np.random.RandomState(global_random_seed) + n_samples = 20 + binom = HalfBinomialLoss() + multinom = HalfMultinomialLoss(n_classes=2) + y_train = rng.randint(0, 2, size=n_samples).astype(np.float64) + raw_prediction = rng.normal(size=n_samples) + raw_multinom = np.empty((n_samples, 2)) + raw_multinom[:, 0] = -0.5 * raw_prediction + raw_multinom[:, 1] = 0.5 * raw_prediction + assert_allclose( + binom.loss(y_true=y_train, raw_prediction=raw_prediction), + multinom.loss(y_true=y_train, raw_prediction=raw_multinom), + ) + + +@pytest.mark.parametrize("y_true", (np.array([0.0, 0, 0]), np.array([1.0, 1, 1]))) +@pytest.mark.parametrize("y_pred", (np.array([-5.0, -5, -5]), np.array([3.0, 3, 3]))) +def test_binomial_vs_alternative_formulation(y_true, y_pred, global_dtype): + """Test that both formulations of the binomial deviance agree. + + Often, the binomial deviance or log loss is written in terms of a variable + z in {-1, +1}, but we use y in {0, 1}, hence z = 2 * y - 1. + ESL II Eq. (10.18): + + -loglike(z, f) = log(1 + exp(-2 * z * f)) + + Note: + - ESL 2*f = raw_prediction, hence the factor 2 of ESL disappears. + - Deviance = -2*loglike + .., but HalfBinomialLoss is half of the + deviance, hence the factor of 2 cancels in the comparison. 
+ """ + + def alt_loss(y, raw_pred): + z = 2 * y - 1 + return np.mean(np.log(1 + np.exp(-z * raw_pred))) + + def alt_gradient(y, raw_pred): + # alternative gradient formula according to ESL + z = 2 * y - 1 + return -z / (1 + np.exp(z * raw_pred)) + + bin_loss = HalfBinomialLoss() + + y_true = y_true.astype(global_dtype) + y_pred = y_pred.astype(global_dtype) + datum = (y_true, y_pred) + + assert bin_loss(*datum) == approx(alt_loss(*datum)) + assert_allclose(bin_loss.gradient(*datum), alt_gradient(*datum)) + + +@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) +def test_predict_proba(loss, global_random_seed): + """Test that predict_proba and gradient_proba work as expected.""" + n_samples = 20 + y_true, raw_prediction = random_y_true_raw_prediction( + loss=loss, + n_samples=n_samples, + y_bound=(-100, 100), + raw_bound=(-5, 5), + seed=global_random_seed, + ) + + if hasattr(loss, "predict_proba"): + proba = loss.predict_proba(raw_prediction) + assert proba.shape == (n_samples, loss.n_classes) + assert np.sum(proba, axis=1) == approx(1, rel=1e-11) + + if hasattr(loss, "gradient_proba"): + for grad, proba in ( + (None, None), + (None, np.empty_like(raw_prediction)), + (np.empty_like(raw_prediction), None), + (np.empty_like(raw_prediction), np.empty_like(raw_prediction)), + ): + grad, proba = loss.gradient_proba( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=None, + gradient_out=grad, + proba_out=proba, + ) + assert proba.shape == (n_samples, loss.n_classes) + assert np.sum(proba, axis=1) == approx(1, rel=1e-11) + assert_allclose( + grad, + loss.gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=None, + gradient_out=None, + ), + ) + + +@pytest.mark.parametrize("loss", ALL_LOSSES) +@pytest.mark.parametrize("sample_weight", [None, "range"]) +@pytest.mark.parametrize("dtype", (np.float32, np.float64)) +@pytest.mark.parametrize("order", ("C", "F")) +def test_init_gradient_and_hessians(loss, sample_weight, dtype, order): + """Test that init_gradient_and_hessian works as expected. + + passing sample_weight to a loss correctly influences the constant_hessian + attribute, and consequently the shape of the hessian array. 
+ """ + n_samples = 5 + if sample_weight == "range": + sample_weight = np.ones(n_samples) + loss = loss(sample_weight=sample_weight) + gradient, hessian = loss.init_gradient_and_hessian( + n_samples=n_samples, + dtype=dtype, + order=order, + ) + if loss.constant_hessian: + assert gradient.shape == (n_samples,) + assert hessian.shape == (1,) + elif loss.is_multiclass: + assert gradient.shape == (n_samples, loss.n_classes) + assert hessian.shape == (n_samples, loss.n_classes) + else: + assert hessian.shape == (n_samples,) + assert hessian.shape == (n_samples,) + + assert gradient.dtype == dtype + assert hessian.dtype == dtype + + if order == "C": + assert gradient.flags.c_contiguous + assert hessian.flags.c_contiguous + else: + assert gradient.flags.f_contiguous + assert hessian.flags.f_contiguous + + +@pytest.mark.parametrize("loss", ALL_LOSSES) +@pytest.mark.parametrize( + "params, err_msg", + [ + ( + {"dtype": np.int64}, + f"Valid options for 'dtype' are .* Got dtype={np.int64} instead.", + ), + ], +) +def test_init_gradient_and_hessian_raises(loss, params, err_msg): + """Test that init_gradient_and_hessian raises errors for invalid input.""" + loss = loss() + with pytest.raises((ValueError, TypeError), match=err_msg): + gradient, hessian = loss.init_gradient_and_hessian(n_samples=5, **params) + + +@pytest.mark.parametrize( + "loss, params, err_type, err_msg", + [ + ( + PinballLoss, + {"quantile": None}, + TypeError, + "quantile must be an instance of float, not NoneType.", + ), + ( + PinballLoss, + {"quantile": 0}, + ValueError, + "quantile == 0, must be > 0.", + ), + (PinballLoss, {"quantile": 1.1}, ValueError, "quantile == 1.1, must be < 1."), + ( + HuberLoss, + {"quantile": None}, + TypeError, + "quantile must be an instance of float, not NoneType.", + ), + ( + HuberLoss, + {"quantile": 0}, + ValueError, + "quantile == 0, must be > 0.", + ), + (HuberLoss, {"quantile": 1.1}, ValueError, "quantile == 1.1, must be < 1."), + ], +) +def test_loss_init_parameter_validation(loss, params, err_type, err_msg): + """Test that loss raises errors for invalid input.""" + with pytest.raises(err_type, match=err_msg): + loss(**params) + + +@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) +def test_loss_pickle(loss): + """Test that losses can be pickled.""" + n_samples = 20 + y_true, raw_prediction = random_y_true_raw_prediction( + loss=loss, + n_samples=n_samples, + y_bound=(-100, 100), + raw_bound=(-5, 5), + seed=42, + ) + pickled_loss = pickle.dumps(loss) + unpickled_loss = pickle.loads(pickled_loss) + assert loss(y_true=y_true, raw_prediction=raw_prediction) == approx( + unpickled_loss(y_true=y_true, raw_prediction=raw_prediction) + ) + + +@pytest.mark.parametrize("p", [-1.5, 0, 1, 1.5, 2, 3]) +def test_tweedie_log_identity_consistency(p): + """Test for identical losses when only the link function is different.""" + half_tweedie_log = HalfTweedieLoss(power=p) + half_tweedie_identity = HalfTweedieLossIdentity(power=p) + n_samples = 10 + y_true, raw_prediction = random_y_true_raw_prediction( + loss=half_tweedie_log, n_samples=n_samples, seed=42 + ) + y_pred = half_tweedie_log.link.inverse(raw_prediction) # exp(raw_prediction) + + # Let's compare the loss values, up to some constant term that is dropped + # in HalfTweedieLoss but not in HalfTweedieLossIdentity. 
+ loss_log = half_tweedie_log.loss( + y_true=y_true, raw_prediction=raw_prediction + ) + half_tweedie_log.constant_to_optimal_zero(y_true) + loss_identity = half_tweedie_identity.loss( + y_true=y_true, raw_prediction=y_pred + ) + half_tweedie_identity.constant_to_optimal_zero(y_true) + # Note that HalfTweedieLoss ignores different constant terms than + # HalfTweedieLossIdentity. Constant terms means terms not depending on + # raw_prediction. By adding these terms, `constant_to_optimal_zero`, both losses + # give the same values. + assert_allclose(loss_log, loss_identity) + + # For gradients and hessians, the constant terms do not matter. We have, however, + # to account for the chain rule, i.e. with x=raw_prediction + # gradient_log(x) = d/dx loss_log(x) + # = d/dx loss_identity(exp(x)) + # = exp(x) * gradient_identity(exp(x)) + # Similarly, + # hessian_log(x) = exp(x) * gradient_identity(exp(x)) + # + exp(x)**2 * hessian_identity(x) + gradient_log, hessian_log = half_tweedie_log.gradient_hessian( + y_true=y_true, raw_prediction=raw_prediction + ) + gradient_identity, hessian_identity = half_tweedie_identity.gradient_hessian( + y_true=y_true, raw_prediction=y_pred + ) + assert_allclose(gradient_log, y_pred * gradient_identity) + assert_allclose( + hessian_log, y_pred * gradient_identity + y_pred**2 * hessian_identity + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..de86a59e07113dcc7f9c656e65c7708ee230afa6 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/__init__.py @@ -0,0 +1,56 @@ +"""Popular unsupervised clustering algorithms.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ._affinity_propagation import AffinityPropagation, affinity_propagation +from ._agglomerative import ( + AgglomerativeClustering, + FeatureAgglomeration, + linkage_tree, + ward_tree, +) +from ._bicluster import SpectralBiclustering, SpectralCoclustering +from ._birch import Birch +from ._bisect_k_means import BisectingKMeans +from ._dbscan import DBSCAN, dbscan +from ._hdbscan.hdbscan import HDBSCAN +from ._kmeans import KMeans, MiniBatchKMeans, k_means, kmeans_plusplus +from ._mean_shift import MeanShift, estimate_bandwidth, get_bin_seeds, mean_shift +from ._optics import ( + OPTICS, + cluster_optics_dbscan, + cluster_optics_xi, + compute_optics_graph, +) +from ._spectral import SpectralClustering, spectral_clustering + +__all__ = [ + "DBSCAN", + "HDBSCAN", + "OPTICS", + "AffinityPropagation", + "AgglomerativeClustering", + "Birch", + "BisectingKMeans", + "FeatureAgglomeration", + "KMeans", + "MeanShift", + "MiniBatchKMeans", + "SpectralBiclustering", + "SpectralClustering", + "SpectralCoclustering", + "affinity_propagation", + "cluster_optics_dbscan", + "cluster_optics_xi", + "compute_optics_graph", + "dbscan", + "estimate_bandwidth", + "get_bin_seeds", + "k_means", + "kmeans_plusplus", + "linkage_tree", + "mean_shift", + "spectral_clustering", + "ward_tree", +] diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_affinity_propagation.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/_affinity_propagation.py new file mode 100644 index 0000000000000000000000000000000000000000..c7ae6ed63580d60eb2d889c11cfe84875380c55c --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_affinity_propagation.py @@ -0,0 +1,607 @@ +"""Affinity Propagation clustering 
algorithm.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from numbers import Integral, Real + +import numpy as np + +from .._config import config_context +from ..base import BaseEstimator, ClusterMixin, _fit_context +from ..exceptions import ConvergenceWarning +from ..metrics import euclidean_distances, pairwise_distances_argmin +from ..utils import check_random_state +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.validation import check_is_fitted, validate_data + + +def _equal_similarities_and_preferences(S, preference): + def all_equal_preferences(): + return np.all(preference == preference.flat[0]) + + def all_equal_similarities(): + # Create mask to ignore diagonal of S + mask = np.ones(S.shape, dtype=bool) + np.fill_diagonal(mask, 0) + + return np.all(S[mask].flat == S[mask].flat[0]) + + return all_equal_preferences() and all_equal_similarities() + + +def _affinity_propagation( + S, + *, + preference, + convergence_iter, + max_iter, + damping, + verbose, + return_n_iter, + random_state, +): + """Main affinity propagation algorithm.""" + n_samples = S.shape[0] + if n_samples == 1 or _equal_similarities_and_preferences(S, preference): + # It makes no sense to run the algorithm in this case, so return 1 or + # n_samples clusters, depending on preferences + warnings.warn( + "All samples have mutually equal similarities. " + "Returning arbitrary cluster center(s)." + ) + if preference.flat[0] > S.flat[n_samples - 1]: + return ( + (np.arange(n_samples), np.arange(n_samples), 0) + if return_n_iter + else (np.arange(n_samples), np.arange(n_samples)) + ) + else: + return ( + (np.array([0]), np.array([0] * n_samples), 0) + if return_n_iter + else (np.array([0]), np.array([0] * n_samples)) + ) + + # Place preference on the diagonal of S + S.flat[:: (n_samples + 1)] = preference + + A = np.zeros((n_samples, n_samples)) + R = np.zeros((n_samples, n_samples)) # Initialize messages + # Intermediate results + tmp = np.zeros((n_samples, n_samples)) + + # Remove degeneracies + S += ( + np.finfo(S.dtype).eps * S + np.finfo(S.dtype).tiny * 100 + ) * random_state.standard_normal(size=(n_samples, n_samples)) + + # Execute parallel affinity propagation updates + e = np.zeros((n_samples, convergence_iter)) + + ind = np.arange(n_samples) + + for it in range(max_iter): + # tmp = A + S; compute responsibilities + np.add(A, S, tmp) + I = np.argmax(tmp, axis=1) + Y = tmp[ind, I] # np.max(A + S, axis=1) + tmp[ind, I] = -np.inf + Y2 = np.max(tmp, axis=1) + + # tmp = Rnew + np.subtract(S, Y[:, None], tmp) + tmp[ind, I] = S[ind, I] - Y2 + + # Damping + tmp *= 1 - damping + R *= damping + R += tmp + + # tmp = Rp; compute availabilities + np.maximum(R, 0, tmp) + tmp.flat[:: n_samples + 1] = R.flat[:: n_samples + 1] + + # tmp = -Anew + tmp -= np.sum(tmp, axis=0) + dA = np.diag(tmp).copy() + tmp.clip(0, np.inf, tmp) + tmp.flat[:: n_samples + 1] = dA + + # Damping + tmp *= 1 - damping + A *= damping + A -= tmp + + # Check for convergence + E = (np.diag(A) + np.diag(R)) > 0 + e[:, it % convergence_iter] = E + K = np.sum(E, axis=0) + + if it >= convergence_iter: + se = np.sum(e, axis=1) + unconverged = np.sum((se == convergence_iter) + (se == 0)) != n_samples + if (not unconverged and (K > 0)) or (it == max_iter): + never_converged = False + if verbose: + print("Converged after %d iterations." 
% it) + break + else: + never_converged = True + if verbose: + print("Did not converge") + + I = np.flatnonzero(E) + K = I.size # Identify exemplars + + if K > 0: + if never_converged: + warnings.warn( + ( + "Affinity propagation did not converge, this model " + "may return degenerate cluster centers and labels." + ), + ConvergenceWarning, + ) + c = np.argmax(S[:, I], axis=1) + c[I] = np.arange(K) # Identify clusters + # Refine the final set of exemplars and clusters and return results + for k in range(K): + ii = np.asarray(c == k).nonzero()[0] + j = np.argmax(np.sum(S[ii[:, np.newaxis], ii], axis=0)) + I[k] = ii[j] + + c = np.argmax(S[:, I], axis=1) + c[I] = np.arange(K) + labels = I[c] + # Reduce labels to a sorted, gapless, list + cluster_centers_indices = np.unique(labels) + labels = np.searchsorted(cluster_centers_indices, labels) + else: + warnings.warn( + ( + "Affinity propagation did not converge and this model " + "will not have any cluster centers." + ), + ConvergenceWarning, + ) + labels = np.array([-1] * n_samples) + cluster_centers_indices = [] + + if return_n_iter: + return cluster_centers_indices, labels, it + 1 + else: + return cluster_centers_indices, labels + + +############################################################################### +# Public API + + +@validate_params( + { + "S": ["array-like"], + "return_n_iter": ["boolean"], + }, + prefer_skip_nested_validation=False, +) +def affinity_propagation( + S, + *, + preference=None, + convergence_iter=15, + max_iter=200, + damping=0.5, + copy=True, + verbose=False, + return_n_iter=False, + random_state=None, +): + """Perform Affinity Propagation Clustering of data. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + S : array-like of shape (n_samples, n_samples) + Matrix of similarities between points. + + preference : array-like of shape (n_samples,) or float, default=None + Preferences for each point - points with larger values of + preferences are more likely to be chosen as exemplars. The number of + exemplars, i.e. of clusters, is influenced by the input preferences + value. If the preferences are not passed as arguments, they will be + set to the median of the input similarities (resulting in a moderate + number of clusters). For a smaller amount of clusters, this can be set + to the minimum value of the similarities. + + convergence_iter : int, default=15 + Number of iterations with no change in the number + of estimated clusters that stops the convergence. + + max_iter : int, default=200 + Maximum number of iterations. + + damping : float, default=0.5 + Damping factor between 0.5 and 1. + + copy : bool, default=True + If copy is False, the affinity matrix is modified inplace by the + algorithm, for memory efficiency. + + verbose : bool, default=False + The verbosity level. + + return_n_iter : bool, default=False + Whether or not to return the number of iterations. + + random_state : int, RandomState instance or None, default=None + Pseudo-random number generator to control the starting state. + Use an int for reproducible results across function calls. + See the :term:`Glossary `. + + .. versionadded:: 0.23 + this parameter was previously hardcoded as 0. + + Returns + ------- + cluster_centers_indices : ndarray of shape (n_clusters,) + Index of clusters centers. + + labels : ndarray of shape (n_samples,) + Cluster labels for each point. + + n_iter : int + Number of iterations run. Returned only if `return_n_iter` is + set to True. 
+ + Notes + ----- + For an example usage, + see :ref:`sphx_glr_auto_examples_cluster_plot_affinity_propagation.py`. + You may also check out, + :ref:`sphx_glr_auto_examples_applications_plot_stock_market.py` + + When the algorithm does not converge, it will still return a arrays of + ``cluster_center_indices`` and labels if there are any exemplars/clusters, + however they may be degenerate and should be used with caution. + + When all training samples have equal similarities and equal preferences, + the assignment of cluster centers and labels depends on the preference. + If the preference is smaller than the similarities, a single cluster center + and label ``0`` for every sample will be returned. Otherwise, every + training sample becomes its own cluster center and is assigned a unique + label. + + References + ---------- + Brendan J. Frey and Delbert Dueck, "Clustering by Passing Messages + Between Data Points", Science Feb. 2007 + + Examples + -------- + >>> import numpy as np + >>> from sklearn.cluster import affinity_propagation + >>> from sklearn.metrics.pairwise import euclidean_distances + >>> X = np.array([[1, 2], [1, 4], [1, 0], + ... [4, 2], [4, 4], [4, 0]]) + >>> S = -euclidean_distances(X, squared=True) + >>> cluster_centers_indices, labels = affinity_propagation(S, random_state=0) + >>> cluster_centers_indices + array([0, 3]) + >>> labels + array([0, 0, 0, 1, 1, 1]) + """ + estimator = AffinityPropagation( + damping=damping, + max_iter=max_iter, + convergence_iter=convergence_iter, + copy=copy, + preference=preference, + affinity="precomputed", + verbose=verbose, + random_state=random_state, + ).fit(S) + + if return_n_iter: + return estimator.cluster_centers_indices_, estimator.labels_, estimator.n_iter_ + return estimator.cluster_centers_indices_, estimator.labels_ + + +class AffinityPropagation(ClusterMixin, BaseEstimator): + """Perform Affinity Propagation Clustering of data. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + damping : float, default=0.5 + Damping factor in the range `[0.5, 1.0)` is the extent to + which the current value is maintained relative to + incoming values (weighted 1 - damping). This in order + to avoid numerical oscillations when updating these + values (messages). + + max_iter : int, default=200 + Maximum number of iterations. + + convergence_iter : int, default=15 + Number of iterations with no change in the number + of estimated clusters that stops the convergence. + + copy : bool, default=True + Make a copy of input data. + + preference : array-like of shape (n_samples,) or float, default=None + Preferences for each point - points with larger values of + preferences are more likely to be chosen as exemplars. The number + of exemplars, ie of clusters, is influenced by the input + preferences value. If the preferences are not passed as arguments, + they will be set to the median of the input similarities. + + affinity : {'euclidean', 'precomputed'}, default='euclidean' + Which affinity to use. At the moment 'precomputed' and + ``euclidean`` are supported. 'euclidean' uses the + negative squared euclidean distance between points. + + verbose : bool, default=False + Whether to be verbose. + + random_state : int, RandomState instance or None, default=None + Pseudo-random number generator to control the starting state. + Use an int for reproducible results across function calls. + See the :term:`Glossary `. + + .. versionadded:: 0.23 + this parameter was previously hardcoded as 0. 
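+
+    .. note::
+        The damped message-passing updates applied during ``fit`` have the form
+        ``message = damping * message_old + (1 - damping) * message_new`` for
+        both the responsibility and the availability matrices; this is the
+        quantity controlled by the ``damping`` parameter above.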
+ + Attributes + ---------- + cluster_centers_indices_ : ndarray of shape (n_clusters,) + Indices of cluster centers. + + cluster_centers_ : ndarray of shape (n_clusters, n_features) + Cluster centers (if affinity != ``precomputed``). + + labels_ : ndarray of shape (n_samples,) + Labels of each point. + + affinity_matrix_ : ndarray of shape (n_samples, n_samples) + Stores the affinity matrix used in ``fit``. + + n_iter_ : int + Number of iterations taken to converge. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + AgglomerativeClustering : Recursively merges the pair of + clusters that minimally increases a given linkage distance. + FeatureAgglomeration : Similar to AgglomerativeClustering, + but recursively merges features instead of samples. + KMeans : K-Means clustering. + MiniBatchKMeans : Mini-Batch K-Means clustering. + MeanShift : Mean shift clustering using a flat kernel. + SpectralClustering : Apply clustering to a projection + of the normalized Laplacian. + + Notes + ----- + The algorithmic complexity of affinity propagation is quadratic + in the number of points. + + When the algorithm does not converge, it will still return a arrays of + ``cluster_center_indices`` and labels if there are any exemplars/clusters, + however they may be degenerate and should be used with caution. + + When ``fit`` does not converge, ``cluster_centers_`` is still populated + however it may be degenerate. In such a case, proceed with caution. + If ``fit`` does not converge and fails to produce any ``cluster_centers_`` + then ``predict`` will label every sample as ``-1``. + + When all training samples have equal similarities and equal preferences, + the assignment of cluster centers and labels depends on the preference. + If the preference is smaller than the similarities, ``fit`` will result in + a single cluster center and label ``0`` for every sample. Otherwise, every + training sample becomes its own cluster center and is assigned a unique + label. + + References + ---------- + + Brendan J. Frey and Delbert Dueck, "Clustering by Passing Messages + Between Data Points", Science Feb. 2007 + + Examples + -------- + >>> from sklearn.cluster import AffinityPropagation + >>> import numpy as np + >>> X = np.array([[1, 2], [1, 4], [1, 0], + ... [4, 2], [4, 4], [4, 0]]) + >>> clustering = AffinityPropagation(random_state=5).fit(X) + >>> clustering + AffinityPropagation(random_state=5) + >>> clustering.labels_ + array([0, 0, 0, 1, 1, 1]) + >>> clustering.predict([[0, 0], [4, 4]]) + array([0, 1]) + >>> clustering.cluster_centers_ + array([[1, 2], + [4, 2]]) + + For an example usage, + see :ref:`sphx_glr_auto_examples_cluster_plot_affinity_propagation.py`. 
+ + For a comparison of Affinity Propagation with other clustering algorithms, see + :ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py` + """ + + _parameter_constraints: dict = { + "damping": [Interval(Real, 0.5, 1.0, closed="left")], + "max_iter": [Interval(Integral, 1, None, closed="left")], + "convergence_iter": [Interval(Integral, 1, None, closed="left")], + "copy": ["boolean"], + "preference": [ + "array-like", + Interval(Real, None, None, closed="neither"), + None, + ], + "affinity": [StrOptions({"euclidean", "precomputed"})], + "verbose": ["verbose"], + "random_state": ["random_state"], + } + + def __init__( + self, + *, + damping=0.5, + max_iter=200, + convergence_iter=15, + copy=True, + preference=None, + affinity="euclidean", + verbose=False, + random_state=None, + ): + self.damping = damping + self.max_iter = max_iter + self.convergence_iter = convergence_iter + self.copy = copy + self.verbose = verbose + self.preference = preference + self.affinity = affinity + self.random_state = random_state + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.pairwise = self.affinity == "precomputed" + tags.input_tags.sparse = self.affinity != "precomputed" + return tags + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit the clustering from features, or affinity matrix. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features), or \ + array-like of shape (n_samples, n_samples) + Training instances to cluster, or similarities / affinities between + instances if ``affinity='precomputed'``. If a sparse feature matrix + is provided, it will be converted into a sparse ``csr_matrix``. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + self + Returns the instance itself. + """ + if self.affinity == "precomputed": + X = validate_data(self, X, copy=self.copy, force_writeable=True) + self.affinity_matrix_ = X + else: # self.affinity == "euclidean" + X = validate_data(self, X, accept_sparse="csr") + self.affinity_matrix_ = -euclidean_distances(X, squared=True) + + if self.affinity_matrix_.shape[0] != self.affinity_matrix_.shape[1]: + raise ValueError( + "The matrix of similarities must be a square array. " + f"Got {self.affinity_matrix_.shape} instead." + ) + + if self.preference is None: + preference = np.median(self.affinity_matrix_) + else: + preference = self.preference + preference = np.asarray(preference) + + random_state = check_random_state(self.random_state) + + ( + self.cluster_centers_indices_, + self.labels_, + self.n_iter_, + ) = _affinity_propagation( + self.affinity_matrix_, + max_iter=self.max_iter, + convergence_iter=self.convergence_iter, + preference=preference, + damping=self.damping, + verbose=self.verbose, + return_n_iter=True, + random_state=random_state, + ) + + if self.affinity != "precomputed": + self.cluster_centers_ = X[self.cluster_centers_indices_].copy() + + return self + + def predict(self, X): + """Predict the closest cluster each sample in X belongs to. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + New data to predict. If a sparse matrix is provided, it will be + converted into a sparse ``csr_matrix``. + + Returns + ------- + labels : ndarray of shape (n_samples,) + Cluster labels. 
+ """ + check_is_fitted(self) + X = validate_data(self, X, reset=False, accept_sparse="csr") + if not hasattr(self, "cluster_centers_"): + raise ValueError( + "Predict method is not supported when affinity='precomputed'." + ) + + if self.cluster_centers_.shape[0] > 0: + with config_context(assume_finite=True): + return pairwise_distances_argmin(X, self.cluster_centers_) + else: + warnings.warn( + ( + "This model does not have any cluster centers " + "because affinity propagation did not converge. " + "Labeling every sample as '-1'." + ), + ConvergenceWarning, + ) + return np.array([-1] * X.shape[0]) + + def fit_predict(self, X, y=None): + """Fit clustering from features/affinity matrix; return cluster labels. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features), or \ + array-like of shape (n_samples, n_samples) + Training instances to cluster, or similarities / affinities between + instances if ``affinity='precomputed'``. If a sparse feature matrix + is provided, it will be converted into a sparse ``csr_matrix``. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + labels : ndarray of shape (n_samples,) + Cluster labels. + """ + return super().fit_predict(X, y) diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_agglomerative.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/_agglomerative.py new file mode 100644 index 0000000000000000000000000000000000000000..f068dc934151d0f4a03f32000fb79e2d657f45a2 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_agglomerative.py @@ -0,0 +1,1333 @@ +"""Hierarchical Agglomerative Clustering + +These routines perform some hierarchical agglomerative clustering of some +input data. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from heapq import heapify, heappop, heappush, heappushpop +from numbers import Integral, Real + +import numpy as np +from scipy import sparse +from scipy.sparse.csgraph import connected_components + +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + ClusterMixin, + _fit_context, +) +from ..metrics import DistanceMetric +from ..metrics._dist_metrics import METRIC_MAPPING64 +from ..metrics.pairwise import _VALID_METRICS, paired_distances +from ..utils import check_array +from ..utils._fast_dict import IntFloatDict +from ..utils._param_validation import ( + HasMethods, + Interval, + StrOptions, + validate_params, +) +from ..utils.graph import _fix_connected_components +from ..utils.validation import check_memory, validate_data + +# mypy error: Module 'sklearn.cluster' has no attribute '_hierarchical_fast' +from . import _hierarchical_fast as _hierarchical # type: ignore[attr-defined] +from ._feature_agglomeration import AgglomerationTransform + +############################################################################### +# For non fully-connected graphs + + +def _fix_connectivity(X, connectivity, affinity): + """ + Fixes the connectivity matrix. + + The different steps are: + + - copies it + - makes it symmetric + - converts it to LIL if necessary + - completes it if necessary. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Feature matrix representing `n_samples` samples to be clustered. + + connectivity : sparse matrix, default=None + Connectivity matrix. Defines for each sample the neighboring samples + following a given structure of the data. 
The matrix is assumed to + be symmetric and only the upper triangular half is used. + Default is `None`, i.e, the Ward algorithm is unstructured. + + affinity : {"euclidean", "precomputed"}, default="euclidean" + Which affinity to use. At the moment `precomputed` and + ``euclidean`` are supported. `euclidean` uses the + negative squared Euclidean distance between points. + + Returns + ------- + connectivity : sparse matrix + The fixed connectivity matrix. + + n_connected_components : int + The number of connected components in the graph. + """ + n_samples = X.shape[0] + if connectivity.shape[0] != n_samples or connectivity.shape[1] != n_samples: + raise ValueError( + "Wrong shape for connectivity matrix: %s when X is %s" + % (connectivity.shape, X.shape) + ) + + # Make the connectivity matrix symmetric: + connectivity = connectivity + connectivity.T + + # Convert connectivity matrix to LIL + if not sparse.issparse(connectivity): + connectivity = sparse.lil_matrix(connectivity) + + # `connectivity` is a sparse matrix at this point + if connectivity.format != "lil": + connectivity = connectivity.tolil() + + # Compute the number of nodes + n_connected_components, labels = connected_components(connectivity) + + if n_connected_components > 1: + warnings.warn( + "the number of connected components of the " + "connectivity matrix is %d > 1. Completing it to avoid " + "stopping the tree early." % n_connected_components, + stacklevel=2, + ) + # XXX: Can we do without completing the matrix? + connectivity = _fix_connected_components( + X=X, + graph=connectivity, + n_connected_components=n_connected_components, + component_labels=labels, + metric=affinity, + mode="connectivity", + ) + + return connectivity, n_connected_components + + +def _single_linkage_tree( + connectivity, + n_samples, + n_nodes, + n_clusters, + n_connected_components, + return_distance, +): + """ + Perform single linkage clustering on sparse data via the minimum + spanning tree from scipy.sparse.csgraph, then using union-find to label. + The parent array is then generated by walking through the tree. 
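+
+    Sketch of the idea (not an exact reproduction of the code below): single
+    linkage always merges across the smallest remaining inter-cluster edge, so
+    processing the edges of the minimum spanning tree in increasing order of
+    weight yields exactly the single-linkage dendrogram::
+
+        from scipy.sparse.csgraph import minimum_spanning_tree
+
+        mst = minimum_spanning_tree(connectivity.tocsr()).tocoo()
+        edges = sorted(zip(mst.data, mst.row, mst.col))  # merge in this order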
+ """ + from scipy.sparse.csgraph import minimum_spanning_tree + + # explicitly cast connectivity to ensure safety + connectivity = connectivity.astype(np.float64, copy=False) + + # Ensure zero distances aren't ignored by setting them to "epsilon" + epsilon_value = np.finfo(dtype=connectivity.data.dtype).eps + connectivity.data[connectivity.data == 0] = epsilon_value + + # Use scipy.sparse.csgraph to generate a minimum spanning tree + mst = minimum_spanning_tree(connectivity.tocsr()) + + # Convert the graph to scipy.cluster.hierarchy array format + mst = mst.tocoo() + + # Undo the epsilon values + mst.data[mst.data == epsilon_value] = 0 + + mst_array = np.vstack([mst.row, mst.col, mst.data]).T + + # Sort edges of the min_spanning_tree by weight + mst_array = mst_array[np.argsort(mst_array.T[2], kind="mergesort"), :] + + # Convert edge list into standard hierarchical clustering format + single_linkage_tree = _hierarchical._single_linkage_label(mst_array) + children_ = single_linkage_tree[:, :2].astype(int) + + # Compute parents + parent = np.arange(n_nodes, dtype=np.intp) + for i, (left, right) in enumerate(children_, n_samples): + if n_clusters is not None and i >= n_nodes: + break + if left < n_nodes: + parent[left] = i + if right < n_nodes: + parent[right] = i + + if return_distance: + distances = single_linkage_tree[:, 2] + return children_, n_connected_components, n_samples, parent, distances + return children_, n_connected_components, n_samples, parent + + +############################################################################### +# Hierarchical tree building functions + + +@validate_params( + { + "X": ["array-like"], + "connectivity": ["array-like", "sparse matrix", None], + "n_clusters": [Interval(Integral, 1, None, closed="left"), None], + "return_distance": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def ward_tree(X, *, connectivity=None, n_clusters=None, return_distance=False): + """Ward clustering based on a Feature matrix. + + Recursively merges the pair of clusters that minimally increases + within-cluster variance. + + The inertia matrix uses a Heapq-based representation. + + This is the structured version, that takes into account some topological + structure between samples. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Feature matrix representing `n_samples` samples to be clustered. + + connectivity : {array-like, sparse matrix}, default=None + Connectivity matrix. Defines for each sample the neighboring samples + following a given structure of the data. The matrix is assumed to + be symmetric and only the upper triangular half is used. + Default is None, i.e, the Ward algorithm is unstructured. + + n_clusters : int, default=None + `n_clusters` should be less than `n_samples`. Stop early the + construction of the tree at `n_clusters.` This is useful to decrease + computation time if the number of clusters is not small compared to the + number of samples. In this case, the complete tree is not computed, thus + the 'children' output is of limited use, and the 'parents' output should + rather be used. This option is valid only when specifying a connectivity + matrix. + + return_distance : bool, default=False + If `True`, return the distance between the clusters. + + Returns + ------- + children : ndarray of shape (n_nodes-1, 2) + The children of each non-leaf node. Values less than `n_samples` + correspond to leaves of the tree which are the original samples. 
+ A node `i` greater than or equal to `n_samples` is a non-leaf + node and has children `children_[i - n_samples]`. Alternatively + at the i-th iteration, children[i][0] and children[i][1] + are merged to form node `n_samples + i`. + + n_connected_components : int + The number of connected components in the graph. + + n_leaves : int + The number of leaves in the tree. + + parents : ndarray of shape (n_nodes,) or None + The parent of each node. Only returned when a connectivity matrix + is specified, elsewhere 'None' is returned. + + distances : ndarray of shape (n_nodes-1,) + Only returned if `return_distance` is set to `True` (for compatibility). + The distances between the centers of the nodes. `distances[i]` + corresponds to a weighted Euclidean distance between + the nodes `children[i, 1]` and `children[i, 2]`. If the nodes refer to + leaves of the tree, then `distances[i]` is their unweighted Euclidean + distance. Distances are updated in the following way + (from scipy.hierarchy.linkage): + + The new entry :math:`d(u,v)` is computed as follows, + + .. math:: + + d(u,v) = \\sqrt{\\frac{|v|+|s|} + {T}d(v,s)^2 + + \\frac{|v|+|t|} + {T}d(v,t)^2 + - \\frac{|v|} + {T}d(s,t)^2} + + where :math:`u` is the newly joined cluster consisting of + clusters :math:`s` and :math:`t`, :math:`v` is an unused + cluster in the forest, :math:`T=|v|+|s|+|t|`, and + :math:`|*|` is the cardinality of its argument. This is also + known as the incremental algorithm. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.cluster import ward_tree + >>> X = np.array([[1, 2], [1, 4], [1, 0], + ... [4, 2], [4, 4], [4, 0]]) + >>> children, n_connected_components, n_leaves, parents = ward_tree(X) + >>> children + array([[0, 1], + [3, 5], + [2, 6], + [4, 7], + [8, 9]]) + >>> n_connected_components + 1 + >>> n_leaves + 6 + """ + X = np.asarray(X) + if X.ndim == 1: + X = np.reshape(X, (-1, 1)) + n_samples, n_features = X.shape + + if connectivity is None: + from scipy.cluster import hierarchy # imports PIL + + if n_clusters is not None: + warnings.warn( + ( + "Partial build of the tree is implemented " + "only for structured clustering (i.e. with " + "explicit connectivity). The algorithm " + "will build the full tree and only " + "retain the lower branches required " + "for the specified number of clusters" + ), + stacklevel=2, + ) + X = np.require(X, requirements="W") + out = hierarchy.ward(X) + children_ = out[:, :2].astype(np.intp) + + if return_distance: + distances = out[:, 2] + return children_, 1, n_samples, None, distances + else: + return children_, 1, n_samples, None + + connectivity, n_connected_components = _fix_connectivity( + X, connectivity, affinity="euclidean" + ) + if n_clusters is None: + n_nodes = 2 * n_samples - 1 + else: + if n_clusters > n_samples: + raise ValueError( + "Cannot provide more clusters than samples. " + "%i n_clusters was asked, and there are %i " + "samples." 
% (n_clusters, n_samples) + ) + n_nodes = 2 * n_samples - n_clusters + + # create inertia matrix + coord_row = [] + coord_col = [] + A = [] + for ind, row in enumerate(connectivity.rows): + A.append(row) + # We keep only the upper triangular for the moments + # Generator expressions are faster than arrays on the following + row = [i for i in row if i < ind] + coord_row.extend( + len(row) + * [ + ind, + ] + ) + coord_col.extend(row) + + coord_row = np.array(coord_row, dtype=np.intp, order="C") + coord_col = np.array(coord_col, dtype=np.intp, order="C") + + # build moments as a list + moments_1 = np.zeros(n_nodes, order="C") + moments_1[:n_samples] = 1 + moments_2 = np.zeros((n_nodes, n_features), order="C") + moments_2[:n_samples] = X + inertia = np.empty(len(coord_row), dtype=np.float64, order="C") + _hierarchical.compute_ward_dist(moments_1, moments_2, coord_row, coord_col, inertia) + inertia = list(zip(inertia, coord_row, coord_col)) + heapify(inertia) + + # prepare the main fields + parent = np.arange(n_nodes, dtype=np.intp) + used_node = np.ones(n_nodes, dtype=bool) + children = [] + if return_distance: + distances = np.empty(n_nodes - n_samples) + + not_visited = np.empty(n_nodes, dtype=bool, order="C") + + # recursive merge loop + for k in range(n_samples, n_nodes): + # identify the merge + while True: + inert, i, j = heappop(inertia) + if used_node[i] and used_node[j]: + break + parent[i], parent[j] = k, k + children.append((i, j)) + used_node[i] = used_node[j] = False + if return_distance: # store inertia value + distances[k - n_samples] = inert + + # update the moments + moments_1[k] = moments_1[i] + moments_1[j] + moments_2[k] = moments_2[i] + moments_2[j] + + # update the structure matrix A and the inertia matrix + coord_col = [] + not_visited.fill(1) + not_visited[k] = 0 + _hierarchical._get_parents(A[i], coord_col, parent, not_visited) + _hierarchical._get_parents(A[j], coord_col, parent, not_visited) + # List comprehension is faster than a for loop + [A[col].append(k) for col in coord_col] + A.append(coord_col) + coord_col = np.array(coord_col, dtype=np.intp, order="C") + coord_row = np.empty(coord_col.shape, dtype=np.intp, order="C") + coord_row.fill(k) + n_additions = len(coord_row) + ini = np.empty(n_additions, dtype=np.float64, order="C") + + _hierarchical.compute_ward_dist(moments_1, moments_2, coord_row, coord_col, ini) + + # List comprehension is faster than a for loop + [heappush(inertia, (ini[idx], k, coord_col[idx])) for idx in range(n_additions)] + + # Separate leaves in children (empty lists up to now) + n_leaves = n_samples + # sort children to get consistent output with unstructured version + children = [c[::-1] for c in children] + children = np.array(children) # return numpy array for efficient caching + + if return_distance: + # 2 is scaling factor to compare w/ unstructured version + distances = np.sqrt(2.0 * distances) + return children, n_connected_components, n_leaves, parent, distances + else: + return children, n_connected_components, n_leaves, parent + + +# single average and complete linkage +def linkage_tree( + X, + connectivity=None, + n_clusters=None, + linkage="complete", + affinity="euclidean", + return_distance=False, +): + """Linkage agglomerative clustering based on a Feature matrix. + + The inertia matrix uses a Heapq-based representation. + + This is the structured version, that takes into account some topological + structure between samples. + + Read more in the :ref:`User Guide `. 
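+
+    As a small illustration of the criteria handled here (see ``linkage``
+    below): if the pairwise distances between the members of two clusters are
+    {1, 3, 5}, then "single" linkage scores the merge at 1, "average" at 3 and
+    "complete" at 5.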
+ + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Feature matrix representing `n_samples` samples to be clustered. + + connectivity : sparse matrix, default=None + Connectivity matrix. Defines for each sample the neighboring samples + following a given structure of the data. The matrix is assumed to + be symmetric and only the upper triangular half is used. + Default is `None`, i.e, the Ward algorithm is unstructured. + + n_clusters : int, default=None + Stop early the construction of the tree at `n_clusters`. This is + useful to decrease computation time if the number of clusters is + not small compared to the number of samples. In this case, the + complete tree is not computed, thus the 'children' output is of + limited use, and the 'parents' output should rather be used. + This option is valid only when specifying a connectivity matrix. + + linkage : {"average", "complete", "single"}, default="complete" + Which linkage criteria to use. The linkage criterion determines which + distance to use between sets of observation. + - "average" uses the average of the distances of each observation of + the two sets. + - "complete" or maximum linkage uses the maximum distances between + all observations of the two sets. + - "single" uses the minimum of the distances between all + observations of the two sets. + + affinity : str or callable, default='euclidean' + Which metric to use. Can be 'euclidean', 'manhattan', or any + distance known to paired distance (see metric.pairwise). + + return_distance : bool, default=False + Whether or not to return the distances between the clusters. + + Returns + ------- + children : ndarray of shape (n_nodes-1, 2) + The children of each non-leaf node. Values less than `n_samples` + correspond to leaves of the tree which are the original samples. + A node `i` greater than or equal to `n_samples` is a non-leaf + node and has children `children_[i - n_samples]`. Alternatively + at the i-th iteration, children[i][0] and children[i][1] + are merged to form node `n_samples + i`. + + n_connected_components : int + The number of connected components in the graph. + + n_leaves : int + The number of leaves in the tree. + + parents : ndarray of shape (n_nodes, ) or None + The parent of each node. Only returned when a connectivity matrix + is specified, elsewhere 'None' is returned. + + distances : ndarray of shape (n_nodes-1,) + Returned when `return_distance` is set to `True`. + + distances[i] refers to the distance between children[i][0] and + children[i][1] when they are merged. + + See Also + -------- + ward_tree : Hierarchical clustering with ward linkage. + """ + X = np.asarray(X) + if X.ndim == 1: + X = np.reshape(X, (-1, 1)) + n_samples, n_features = X.shape + + linkage_choices = { + "complete": _hierarchical.max_merge, + "average": _hierarchical.average_merge, + "single": None, + } # Single linkage is handled differently + try: + join_func = linkage_choices[linkage] + except KeyError as e: + raise ValueError( + "Unknown linkage option, linkage should be one of %s, but %s was given" + % (linkage_choices.keys(), linkage) + ) from e + + if affinity == "cosine" and np.any(~np.any(X, axis=1)): + raise ValueError("Cosine affinity cannot be used when X contains zero vectors") + + if connectivity is None: + from scipy.cluster import hierarchy # imports PIL + + if n_clusters is not None: + warnings.warn( + ( + "Partial build of the tree is implemented " + "only for structured clustering (i.e. with " + "explicit connectivity). 
The algorithm " + "will build the full tree and only " + "retain the lower branches required " + "for the specified number of clusters" + ), + stacklevel=2, + ) + + if affinity == "precomputed": + # for the linkage function of hierarchy to work on precomputed + # data, provide as first argument an ndarray of the shape returned + # by sklearn.metrics.pairwise_distances. + if X.shape[0] != X.shape[1]: + raise ValueError( + f"Distance matrix should be square, got matrix of shape {X.shape}" + ) + i, j = np.triu_indices(X.shape[0], k=1) + X = X[i, j] + elif affinity == "l2": + # Translate to something understood by scipy + affinity = "euclidean" + elif affinity in ("l1", "manhattan"): + affinity = "cityblock" + elif callable(affinity): + X = affinity(X) + i, j = np.triu_indices(X.shape[0], k=1) + X = X[i, j] + if ( + linkage == "single" + and affinity != "precomputed" + and not callable(affinity) + and affinity in METRIC_MAPPING64 + ): + # We need the fast cythonized metric from neighbors + dist_metric = DistanceMetric.get_metric(affinity) + + # The Cython routines used require contiguous arrays + X = np.ascontiguousarray(X, dtype=np.double) + + mst = _hierarchical.mst_linkage_core(X, dist_metric) + # Sort edges of the min_spanning_tree by weight + mst = mst[np.argsort(mst.T[2], kind="mergesort"), :] + + # Convert edge list into standard hierarchical clustering format + out = _hierarchical.single_linkage_label(mst) + else: + out = hierarchy.linkage(X, method=linkage, metric=affinity) + children_ = out[:, :2].astype(int, copy=False) + + if return_distance: + distances = out[:, 2] + return children_, 1, n_samples, None, distances + return children_, 1, n_samples, None + + connectivity, n_connected_components = _fix_connectivity( + X, connectivity, affinity=affinity + ) + connectivity = connectivity.tocoo() + # Put the diagonal to zero + diag_mask = connectivity.row != connectivity.col + connectivity.row = connectivity.row[diag_mask] + connectivity.col = connectivity.col[diag_mask] + connectivity.data = connectivity.data[diag_mask] + del diag_mask + + if affinity == "precomputed": + distances = X[connectivity.row, connectivity.col].astype(np.float64, copy=False) + else: + # FIXME We compute all the distances, while we could have only computed + # the "interesting" distances + distances = paired_distances( + X[connectivity.row], X[connectivity.col], metric=affinity + ) + connectivity.data = distances + + if n_clusters is None: + n_nodes = 2 * n_samples - 1 + else: + assert n_clusters <= n_samples + n_nodes = 2 * n_samples - n_clusters + + if linkage == "single": + return _single_linkage_tree( + connectivity, + n_samples, + n_nodes, + n_clusters, + n_connected_components, + return_distance, + ) + + if return_distance: + distances = np.empty(n_nodes - n_samples) + # create inertia heap and connection matrix + A = np.empty(n_nodes, dtype=object) + inertia = list() + + # LIL seems to the best format to access the rows quickly, + # without the numpy overhead of slicing CSR indices and data. 
+ connectivity = connectivity.tolil() + # We are storing the graph in a list of IntFloatDict + for ind, (data, row) in enumerate(zip(connectivity.data, connectivity.rows)): + A[ind] = IntFloatDict( + np.asarray(row, dtype=np.intp), np.asarray(data, dtype=np.float64) + ) + # We keep only the upper triangular for the heap + # Generator expressions are faster than arrays on the following + inertia.extend( + _hierarchical.WeightedEdge(d, ind, r) for r, d in zip(row, data) if r < ind + ) + del connectivity + + heapify(inertia) + + # prepare the main fields + parent = np.arange(n_nodes, dtype=np.intp) + used_node = np.ones(n_nodes, dtype=np.intp) + children = [] + + # recursive merge loop + for k in range(n_samples, n_nodes): + # identify the merge + while True: + edge = heappop(inertia) + if used_node[edge.a] and used_node[edge.b]: + break + i = edge.a + j = edge.b + + if return_distance: + # store distances + distances[k - n_samples] = edge.weight + + parent[i] = parent[j] = k + children.append((i, j)) + # Keep track of the number of elements per cluster + n_i = used_node[i] + n_j = used_node[j] + used_node[k] = n_i + n_j + used_node[i] = used_node[j] = False + + # update the structure matrix A and the inertia matrix + # a clever 'min', or 'max' operation between A[i] and A[j] + coord_col = join_func(A[i], A[j], used_node, n_i, n_j) + for col, d in coord_col: + A[col].append(k, d) + # Here we use the information from coord_col (containing the + # distances) to update the heap + heappush(inertia, _hierarchical.WeightedEdge(d, k, col)) + A[k] = coord_col + # Clear A[i] and A[j] to save memory + A[i] = A[j] = 0 + + # Separate leaves in children (empty lists up to now) + n_leaves = n_samples + + # # return numpy array for efficient caching + children = np.array(children)[:, ::-1] + + if return_distance: + return children, n_connected_components, n_leaves, parent, distances + return children, n_connected_components, n_leaves, parent + + +# Matching names to tree-building strategies +def _complete_linkage(*args, **kwargs): + kwargs["linkage"] = "complete" + return linkage_tree(*args, **kwargs) + + +def _average_linkage(*args, **kwargs): + kwargs["linkage"] = "average" + return linkage_tree(*args, **kwargs) + + +def _single_linkage(*args, **kwargs): + kwargs["linkage"] = "single" + return linkage_tree(*args, **kwargs) + + +_TREE_BUILDERS = dict( + ward=ward_tree, + complete=_complete_linkage, + average=_average_linkage, + single=_single_linkage, +) + +############################################################################### +# Functions for cutting hierarchical clustering tree + + +def _hc_cut(n_clusters, children, n_leaves): + """Function cutting the ward tree for a given number of clusters. + + Parameters + ---------- + n_clusters : int or ndarray + The number of clusters to form. + + children : ndarray of shape (n_nodes-1, 2) + The children of each non-leaf node. Values less than `n_samples` + correspond to leaves of the tree which are the original samples. + A node `i` greater than or equal to `n_samples` is a non-leaf + node and has children `children_[i - n_samples]`. Alternatively + at the i-th iteration, children[i][0] and children[i][1] + are merged to form node `n_samples + i`. + + n_leaves : int + Number of leaves of the tree. + + Returns + ------- + labels : array [n_samples] + Cluster labels for each point. + """ + if n_clusters > n_leaves: + raise ValueError( + "Cannot extract more clusters than samples: " + f"{n_clusters} clusters were given for a tree with {n_leaves} leaves." 
+ ) + # In this function, we store nodes as a heap to avoid recomputing + # the max of the nodes: the first element is always the smallest + # We use negated indices as heaps work on smallest elements, and we + # are interested in largest elements + # children[-1] is the root of the tree + nodes = [-(max(children[-1]) + 1)] + for _ in range(n_clusters - 1): + # As we have a heap, nodes[0] is the smallest element + these_children = children[-nodes[0] - n_leaves] + # Insert the 2 children and remove the largest node + heappush(nodes, -these_children[0]) + heappushpop(nodes, -these_children[1]) + label = np.zeros(n_leaves, dtype=np.intp) + for i, node in enumerate(nodes): + label[_hierarchical._hc_get_descendent(-node, children, n_leaves)] = i + return label + + +############################################################################### + + +class AgglomerativeClustering(ClusterMixin, BaseEstimator): + """ + Agglomerative Clustering. + + Recursively merges pair of clusters of sample data; uses linkage distance. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_clusters : int or None, default=2 + The number of clusters to find. It must be ``None`` if + ``distance_threshold`` is not ``None``. + + metric : str or callable, default="euclidean" + Metric used to compute the linkage. Can be "euclidean", "l1", "l2", + "manhattan", "cosine", or "precomputed". If linkage is "ward", only + "euclidean" is accepted. If "precomputed", a distance matrix is needed + as input for the fit method. If connectivity is None, linkage is + "single" and affinity is not "precomputed" any valid pairwise distance + metric can be assigned. + + For an example of agglomerative clustering with different metrics, see + :ref:`sphx_glr_auto_examples_cluster_plot_agglomerative_clustering_metrics.py`. + + .. versionadded:: 1.2 + + memory : str or object with the joblib.Memory interface, default=None + Used to cache the output of the computation of the tree. + By default, no caching is done. If a string is given, it is the + path to the caching directory. + + connectivity : array-like, sparse matrix, or callable, default=None + Connectivity matrix. Defines for each sample the neighboring + samples following a given structure of the data. + This can be a connectivity matrix itself or a callable that transforms + the data into a connectivity matrix, such as derived from + `kneighbors_graph`. Default is ``None``, i.e, the + hierarchical clustering algorithm is unstructured. + + For an example of connectivity matrix using + :class:`~sklearn.neighbors.kneighbors_graph`, see + :ref:`sphx_glr_auto_examples_cluster_plot_agglomerative_clustering.py`. + + compute_full_tree : 'auto' or bool, default='auto' + Stop early the construction of the tree at ``n_clusters``. This is + useful to decrease computation time if the number of clusters is not + small compared to the number of samples. This option is useful only + when specifying a connectivity matrix. Note also that when varying the + number of clusters and using caching, it may be advantageous to compute + the full tree. It must be ``True`` if ``distance_threshold`` is not + ``None``. By default `compute_full_tree` is "auto", which is equivalent + to `True` when `distance_threshold` is not `None` or that `n_clusters` + is inferior to the maximum between 100 or `0.02 * n_samples`. + Otherwise, "auto" is equivalent to `False`. + + linkage : {'ward', 'complete', 'average', 'single'}, default='ward' + Which linkage criterion to use. 
The linkage criterion determines which + distance to use between sets of observation. The algorithm will merge + the pairs of cluster that minimize this criterion. + + - 'ward' minimizes the variance of the clusters being merged. + - 'average' uses the average of the distances of each observation of + the two sets. + - 'complete' or 'maximum' linkage uses the maximum distances between + all observations of the two sets. + - 'single' uses the minimum of the distances between all observations + of the two sets. + + .. versionadded:: 0.20 + Added the 'single' option + + For examples comparing different `linkage` criteria, see + :ref:`sphx_glr_auto_examples_cluster_plot_linkage_comparison.py`. + + distance_threshold : float, default=None + The linkage distance threshold at or above which clusters will not be + merged. If not ``None``, ``n_clusters`` must be ``None`` and + ``compute_full_tree`` must be ``True``. + + .. versionadded:: 0.21 + + compute_distances : bool, default=False + Computes distances between clusters even if `distance_threshold` is not + used. This can be used to make dendrogram visualization, but introduces + a computational and memory overhead. + + .. versionadded:: 0.24 + + For an example of dendrogram visualization, see + :ref:`sphx_glr_auto_examples_cluster_plot_agglomerative_dendrogram.py`. + + Attributes + ---------- + n_clusters_ : int + The number of clusters found by the algorithm. If + ``distance_threshold=None``, it will be equal to the given + ``n_clusters``. + + labels_ : ndarray of shape (n_samples) + Cluster labels for each point. + + n_leaves_ : int + Number of leaves in the hierarchical tree. + + n_connected_components_ : int + The estimated number of connected components in the graph. + + .. versionadded:: 0.21 + ``n_connected_components_`` was added to replace ``n_components_``. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + children_ : array-like of shape (n_samples-1, 2) + The children of each non-leaf node. Values less than `n_samples` + correspond to leaves of the tree which are the original samples. + A node `i` greater than or equal to `n_samples` is a non-leaf + node and has children `children_[i - n_samples]`. Alternatively + at the i-th iteration, children[i][0] and children[i][1] + are merged to form node `n_samples + i`. + + distances_ : array-like of shape (n_nodes-1,) + Distances between nodes in the corresponding place in `children_`. + Only computed if `distance_threshold` is used or `compute_distances` + is set to `True`. + + See Also + -------- + FeatureAgglomeration : Agglomerative clustering but for features instead of + samples. + ward_tree : Hierarchical clustering with ward linkage. + + Examples + -------- + >>> from sklearn.cluster import AgglomerativeClustering + >>> import numpy as np + >>> X = np.array([[1, 2], [1, 4], [1, 0], + ... 
[4, 2], [4, 4], [4, 0]]) + >>> clustering = AgglomerativeClustering().fit(X) + >>> clustering + AgglomerativeClustering() + >>> clustering.labels_ + array([1, 1, 1, 0, 0, 0]) + + For a comparison of Agglomerative clustering with other clustering algorithms, see + :ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py` + """ + + _parameter_constraints: dict = { + "n_clusters": [Interval(Integral, 1, None, closed="left"), None], + "metric": [ + StrOptions(set(_VALID_METRICS) | {"precomputed"}), + callable, + ], + "memory": [str, HasMethods("cache"), None], + "connectivity": ["array-like", "sparse matrix", callable, None], + "compute_full_tree": [StrOptions({"auto"}), "boolean"], + "linkage": [StrOptions(set(_TREE_BUILDERS.keys()))], + "distance_threshold": [Interval(Real, 0, None, closed="left"), None], + "compute_distances": ["boolean"], + } + + def __init__( + self, + n_clusters=2, + *, + metric="euclidean", + memory=None, + connectivity=None, + compute_full_tree="auto", + linkage="ward", + distance_threshold=None, + compute_distances=False, + ): + self.n_clusters = n_clusters + self.distance_threshold = distance_threshold + self.memory = memory + self.connectivity = connectivity + self.compute_full_tree = compute_full_tree + self.linkage = linkage + self.metric = metric + self.compute_distances = compute_distances + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit the hierarchical clustering from features, or distance matrix. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) or \ + (n_samples, n_samples) + Training instances to cluster, or distances between instances if + ``metric='precomputed'``. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + self : object + Returns the fitted instance. + """ + X = validate_data(self, X, ensure_min_samples=2) + return self._fit(X) + + def _fit(self, X): + """Fit without validation + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) or (n_samples, n_samples) + Training instances to cluster, or distances between instances if + ``metric='precomputed'``. + + Returns + ------- + self : object + Returns the fitted instance. + """ + memory = check_memory(self.memory) + + if not ((self.n_clusters is None) ^ (self.distance_threshold is None)): + raise ValueError( + "Exactly one of n_clusters and " + "distance_threshold has to be set, and the other " + "needs to be None." + ) + + if self.distance_threshold is not None and not self.compute_full_tree: + raise ValueError( + "compute_full_tree must be True if distance_threshold is set." + ) + + if self.linkage == "ward" and self.metric != "euclidean": + raise ValueError( + f"{self.metric} was provided as metric. Ward can only " + "work with euclidean distances." + ) + + tree_builder = _TREE_BUILDERS[self.linkage] + + connectivity = self.connectivity + if self.connectivity is not None: + if callable(self.connectivity): + connectivity = self.connectivity(X) + connectivity = check_array( + connectivity, accept_sparse=["csr", "coo", "lil"] + ) + + n_samples = len(X) + compute_full_tree = self.compute_full_tree + if self.connectivity is None: + compute_full_tree = True + if compute_full_tree == "auto": + if self.distance_threshold is not None: + compute_full_tree = True + else: + # Early stopping is likely to give a speed up only for + # a large number of clusters. 
The actual threshold + # implemented here is heuristic + compute_full_tree = self.n_clusters < max(100, 0.02 * n_samples) + n_clusters = self.n_clusters + if compute_full_tree: + n_clusters = None + + # Construct the tree + kwargs = {} + if self.linkage != "ward": + kwargs["linkage"] = self.linkage + kwargs["affinity"] = self.metric + + distance_threshold = self.distance_threshold + + return_distance = (distance_threshold is not None) or self.compute_distances + + out = memory.cache(tree_builder)( + X, + connectivity=connectivity, + n_clusters=n_clusters, + return_distance=return_distance, + **kwargs, + ) + (self.children_, self.n_connected_components_, self.n_leaves_, parents) = out[ + :4 + ] + + if return_distance: + self.distances_ = out[-1] + + if self.distance_threshold is not None: # distance_threshold is used + self.n_clusters_ = ( + np.count_nonzero(self.distances_ >= distance_threshold) + 1 + ) + else: # n_clusters is used + self.n_clusters_ = self.n_clusters + + # Cut the tree + if compute_full_tree: + self.labels_ = _hc_cut(self.n_clusters_, self.children_, self.n_leaves_) + else: + labels = _hierarchical.hc_get_heads(parents, copy=False) + # copy to avoid holding a reference on the original array + labels = np.copy(labels[:n_samples]) + # Reassign cluster numbers + self.labels_ = np.searchsorted(np.unique(labels), labels) + return self + + def fit_predict(self, X, y=None): + """Fit and return the result of each sample's clustering assignment. + + In addition to fitting, this method also return the result of the + clustering assignment for each sample in the training set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) or \ + (n_samples, n_samples) + Training instances to cluster, or distances between instances if + ``affinity='precomputed'``. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + labels : ndarray of shape (n_samples,) + Cluster labels. + """ + return super().fit_predict(X, y) + + +class FeatureAgglomeration( + ClassNamePrefixFeaturesOutMixin, AgglomerationTransform, AgglomerativeClustering +): + """Agglomerate features. + + Recursively merges pair of clusters of features. + + Refer to + :ref:`sphx_glr_auto_examples_cluster_plot_feature_agglomeration_vs_univariate_selection.py` + for an example comparison of :class:`FeatureAgglomeration` strategy with a + univariate feature selection strategy (based on ANOVA). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_clusters : int or None, default=2 + The number of clusters to find. It must be ``None`` if + ``distance_threshold`` is not ``None``. + + metric : str or callable, default="euclidean" + Metric used to compute the linkage. Can be "euclidean", "l1", "l2", + "manhattan", "cosine", or "precomputed". If linkage is "ward", only + "euclidean" is accepted. If "precomputed", a distance matrix is needed + as input for the fit method. + + .. versionadded:: 1.2 + + memory : str or object with the joblib.Memory interface, default=None + Used to cache the output of the computation of the tree. + By default, no caching is done. If a string is given, it is the + path to the caching directory. + + connectivity : array-like, sparse matrix, or callable, default=None + Connectivity matrix. Defines for each feature the neighboring + features following a given structure of the data. 
+ This can be a connectivity matrix itself or a callable that transforms + the data into a connectivity matrix, such as derived from + `kneighbors_graph`. Default is `None`, i.e, the + hierarchical clustering algorithm is unstructured. + + compute_full_tree : 'auto' or bool, default='auto' + Stop early the construction of the tree at `n_clusters`. This is useful + to decrease computation time if the number of clusters is not small + compared to the number of features. This option is useful only when + specifying a connectivity matrix. Note also that when varying the + number of clusters and using caching, it may be advantageous to compute + the full tree. It must be ``True`` if ``distance_threshold`` is not + ``None``. By default `compute_full_tree` is "auto", which is equivalent + to `True` when `distance_threshold` is not `None` or that `n_clusters` + is inferior to the maximum between 100 or `0.02 * n_samples`. + Otherwise, "auto" is equivalent to `False`. + + linkage : {"ward", "complete", "average", "single"}, default="ward" + Which linkage criterion to use. The linkage criterion determines which + distance to use between sets of features. The algorithm will merge + the pairs of cluster that minimize this criterion. + + - "ward" minimizes the variance of the clusters being merged. + - "complete" or maximum linkage uses the maximum distances between + all features of the two sets. + - "average" uses the average of the distances of each feature of + the two sets. + - "single" uses the minimum of the distances between all features + of the two sets. + + pooling_func : callable, default=np.mean + This combines the values of agglomerated features into a single + value, and should accept an array of shape [M, N] and the keyword + argument `axis=1`, and reduce it to an array of size [M]. + + distance_threshold : float, default=None + The linkage distance threshold at or above which clusters will not be + merged. If not ``None``, ``n_clusters`` must be ``None`` and + ``compute_full_tree`` must be ``True``. + + .. versionadded:: 0.21 + + compute_distances : bool, default=False + Computes distances between clusters even if `distance_threshold` is not + used. This can be used to make dendrogram visualization, but introduces + a computational and memory overhead. + + .. versionadded:: 0.24 + + Attributes + ---------- + n_clusters_ : int + The number of clusters found by the algorithm. If + ``distance_threshold=None``, it will be equal to the given + ``n_clusters``. + + labels_ : array-like of (n_features,) + Cluster labels for each feature. + + n_leaves_ : int + Number of leaves in the hierarchical tree. + + n_connected_components_ : int + The estimated number of connected components in the graph. + + .. versionadded:: 0.21 + ``n_connected_components_`` was added to replace ``n_components_``. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + children_ : array-like of shape (n_nodes-1, 2) + The children of each non-leaf node. Values less than `n_features` + correspond to leaves of the tree which are the original samples. + A node `i` greater than or equal to `n_features` is a non-leaf + node and has children `children_[i - n_features]`. Alternatively + at the i-th iteration, children[i][0] and children[i][1] + are merged to form node `n_features + i`. 
+ + distances_ : array-like of shape (n_nodes-1,) + Distances between nodes in the corresponding place in `children_`. + Only computed if `distance_threshold` is used or `compute_distances` + is set to `True`. + + See Also + -------- + AgglomerativeClustering : Agglomerative clustering samples instead of + features. + ward_tree : Hierarchical clustering with ward linkage. + + Examples + -------- + >>> import numpy as np + >>> from sklearn import datasets, cluster + >>> digits = datasets.load_digits() + >>> images = digits.images + >>> X = np.reshape(images, (len(images), -1)) + >>> agglo = cluster.FeatureAgglomeration(n_clusters=32) + >>> agglo.fit(X) + FeatureAgglomeration(n_clusters=32) + >>> X_reduced = agglo.transform(X) + >>> X_reduced.shape + (1797, 32) + """ + + _parameter_constraints: dict = { + "n_clusters": [Interval(Integral, 1, None, closed="left"), None], + "metric": [ + StrOptions(set(_VALID_METRICS) | {"precomputed"}), + callable, + ], + "memory": [str, HasMethods("cache"), None], + "connectivity": ["array-like", "sparse matrix", callable, None], + "compute_full_tree": [StrOptions({"auto"}), "boolean"], + "linkage": [StrOptions(set(_TREE_BUILDERS.keys()))], + "pooling_func": [callable], + "distance_threshold": [Interval(Real, 0, None, closed="left"), None], + "compute_distances": ["boolean"], + } + + def __init__( + self, + n_clusters=2, + *, + metric="euclidean", + memory=None, + connectivity=None, + compute_full_tree="auto", + linkage="ward", + pooling_func=np.mean, + distance_threshold=None, + compute_distances=False, + ): + super().__init__( + n_clusters=n_clusters, + memory=memory, + connectivity=connectivity, + compute_full_tree=compute_full_tree, + linkage=linkage, + metric=metric, + distance_threshold=distance_threshold, + compute_distances=compute_distances, + ) + self.pooling_func = pooling_func + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit the hierarchical clustering on the data. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + self : object + Returns the transformer. 
+ """ + X = validate_data(self, X, ensure_min_features=2) + super()._fit(X.T) + self._n_features_out = self.n_clusters_ + return self + + @property + def fit_predict(self): + """Fit and return the result of each sample's clustering assignment.""" + raise AttributeError diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_bicluster.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/_bicluster.py new file mode 100644 index 0000000000000000000000000000000000000000..04a4e68024d33350b9fdd844f6bc614e4c22f39a --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_bicluster.py @@ -0,0 +1,621 @@ +"""Spectral biclustering algorithms.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from abc import ABCMeta, abstractmethod +from numbers import Integral + +import numpy as np +from scipy.linalg import norm +from scipy.sparse import dia_matrix, issparse +from scipy.sparse.linalg import eigsh, svds + +from ..base import BaseEstimator, BiclusterMixin, _fit_context +from ..utils import check_random_state, check_scalar +from ..utils._param_validation import Interval, StrOptions +from ..utils.extmath import _randomized_svd, make_nonnegative, safe_sparse_dot +from ..utils.validation import assert_all_finite, validate_data +from ._kmeans import KMeans, MiniBatchKMeans + +__all__ = ["SpectralBiclustering", "SpectralCoclustering"] + + +def _scale_normalize(X): + """Normalize ``X`` by scaling rows and columns independently. + + Returns the normalized matrix and the row and column scaling + factors. + """ + X = make_nonnegative(X) + row_diag = np.asarray(1.0 / np.sqrt(X.sum(axis=1))).squeeze() + col_diag = np.asarray(1.0 / np.sqrt(X.sum(axis=0))).squeeze() + row_diag = np.where(np.isnan(row_diag), 0, row_diag) + col_diag = np.where(np.isnan(col_diag), 0, col_diag) + if issparse(X): + n_rows, n_cols = X.shape + r = dia_matrix((row_diag, [0]), shape=(n_rows, n_rows)) + c = dia_matrix((col_diag, [0]), shape=(n_cols, n_cols)) + an = r @ X @ c + else: + an = row_diag[:, np.newaxis] * X * col_diag + return an, row_diag, col_diag + + +def _bistochastic_normalize(X, max_iter=1000, tol=1e-5): + """Normalize rows and columns of ``X`` simultaneously so that all + rows sum to one constant and all columns sum to a different + constant. + """ + # According to paper, this can also be done more efficiently with + # deviation reduction and balancing algorithms. + X = make_nonnegative(X) + X_scaled = X + for _ in range(max_iter): + X_new, _, _ = _scale_normalize(X_scaled) + if issparse(X): + dist = norm(X_scaled.data - X.data) + else: + dist = norm(X_scaled - X_new) + X_scaled = X_new + if dist is not None and dist < tol: + break + return X_scaled + + +def _log_normalize(X): + """Normalize ``X`` according to Kluger's log-interactions scheme.""" + X = make_nonnegative(X, min_value=1) + if issparse(X): + raise ValueError( + "Cannot compute log of a sparse matrix," + " because log(x) diverges to -infinity as x" + " goes to 0." 
+ ) + L = np.log(X) + row_avg = L.mean(axis=1)[:, np.newaxis] + col_avg = L.mean(axis=0) + avg = L.mean() + return L - row_avg - col_avg + avg + + +class BaseSpectral(BiclusterMixin, BaseEstimator, metaclass=ABCMeta): + """Base class for spectral biclustering.""" + + _parameter_constraints: dict = { + "svd_method": [StrOptions({"randomized", "arpack"})], + "n_svd_vecs": [Interval(Integral, 0, None, closed="left"), None], + "mini_batch": ["boolean"], + "init": [StrOptions({"k-means++", "random"}), np.ndarray], + "n_init": [Interval(Integral, 1, None, closed="left")], + "random_state": ["random_state"], + } + + @abstractmethod + def __init__( + self, + n_clusters=3, + svd_method="randomized", + n_svd_vecs=None, + mini_batch=False, + init="k-means++", + n_init=10, + random_state=None, + ): + self.n_clusters = n_clusters + self.svd_method = svd_method + self.n_svd_vecs = n_svd_vecs + self.mini_batch = mini_batch + self.init = init + self.n_init = n_init + self.random_state = random_state + + @abstractmethod + def _check_parameters(self, n_samples): + """Validate parameters depending on the input data.""" + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Create a biclustering for X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + SpectralBiclustering instance. + """ + X = validate_data(self, X, accept_sparse="csr", dtype=np.float64) + self._check_parameters(X.shape[0]) + self._fit(X) + return self + + def _svd(self, array, n_components, n_discard): + """Returns first `n_components` left and right singular + vectors u and v, discarding the first `n_discard`. + """ + if self.svd_method == "randomized": + kwargs = {} + if self.n_svd_vecs is not None: + kwargs["n_oversamples"] = self.n_svd_vecs + u, _, vt = _randomized_svd( + array, n_components, random_state=self.random_state, **kwargs + ) + + elif self.svd_method == "arpack": + u, _, vt = svds(array, k=n_components, ncv=self.n_svd_vecs) + if np.any(np.isnan(vt)): + # some eigenvalues of A * A.T are negative, causing + # sqrt() to be np.nan. This causes some vectors in vt + # to be np.nan. + A = safe_sparse_dot(array.T, array) + random_state = check_random_state(self.random_state) + # initialize with [-1,1] as in ARPACK + v0 = random_state.uniform(-1, 1, A.shape[0]) + _, v = eigsh(A, ncv=self.n_svd_vecs, v0=v0) + vt = v.T + if np.any(np.isnan(u)): + A = safe_sparse_dot(array, array.T) + random_state = check_random_state(self.random_state) + # initialize with [-1,1] as in ARPACK + v0 = random_state.uniform(-1, 1, A.shape[0]) + _, u = eigsh(A, ncv=self.n_svd_vecs, v0=v0) + + assert_all_finite(u) + assert_all_finite(vt) + u = u[:, n_discard:] + vt = vt[n_discard:] + return u, vt.T + + def _k_means(self, data, n_clusters): + if self.mini_batch: + model = MiniBatchKMeans( + n_clusters, + init=self.init, + n_init=self.n_init, + random_state=self.random_state, + ) + else: + model = KMeans( + n_clusters, + init=self.init, + n_init=self.n_init, + random_state=self.random_state, + ) + model.fit(data) + centroid = model.cluster_centers_ + labels = model.labels_ + return centroid, labels + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + + +class SpectralCoclustering(BaseSpectral): + """Spectral Co-Clustering algorithm (Dhillon, 2001). 
+ + Clusters rows and columns of an array `X` to solve the relaxed + normalized cut of the bipartite graph created from `X` as follows: + the edge between row vertex `i` and column vertex `j` has weight + `X[i, j]`. + + The resulting bicluster structure is block-diagonal, since each + row and each column belongs to exactly one bicluster. + + Supports sparse matrices, as long as they are nonnegative. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_clusters : int, default=3 + The number of biclusters to find. + + svd_method : {'randomized', 'arpack'}, default='randomized' + Selects the algorithm for finding singular vectors. May be + 'randomized' or 'arpack'. If 'randomized', use + :func:`sklearn.utils.extmath.randomized_svd`, which may be faster + for large matrices. If 'arpack', use + :func:`scipy.sparse.linalg.svds`, which is more accurate, but + possibly slower in some cases. + + n_svd_vecs : int, default=None + Number of vectors to use in calculating the SVD. Corresponds + to `ncv` when `svd_method=arpack` and `n_oversamples` when + `svd_method` is 'randomized`. + + mini_batch : bool, default=False + Whether to use mini-batch k-means, which is faster but may get + different results. + + init : {'k-means++', 'random'}, or ndarray of shape \ + (n_clusters, n_features), default='k-means++' + Method for initialization of k-means algorithm; defaults to + 'k-means++'. + + n_init : int, default=10 + Number of random initializations that are tried with the + k-means algorithm. + + If mini-batch k-means is used, the best initialization is + chosen and the algorithm runs once. Otherwise, the algorithm + is run for each initialization and the best solution chosen. + + random_state : int, RandomState instance, default=None + Used for randomizing the singular value decomposition and the k-means + initialization. Use an int to make the randomness deterministic. + See :term:`Glossary `. + + Attributes + ---------- + rows_ : array-like of shape (n_row_clusters, n_rows) + Results of the clustering. `rows[i, r]` is True if + cluster `i` contains row `r`. Available only after calling ``fit``. + + columns_ : array-like of shape (n_column_clusters, n_columns) + Results of the clustering, like `rows`. + + row_labels_ : array-like of shape (n_rows,) + The bicluster label of each row. + + column_labels_ : array-like of shape (n_cols,) + The bicluster label of each column. + + biclusters_ : tuple of two ndarrays + The tuple contains the `rows_` and `columns_` arrays. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + SpectralBiclustering : Partitions rows and columns under the assumption + that the data has an underlying checkerboard structure. + + References + ---------- + * :doi:`Dhillon, Inderjit S, 2001. Co-clustering documents and words using + bipartite spectral graph partitioning. + <10.1145/502512.502550>` + + Examples + -------- + >>> from sklearn.cluster import SpectralCoclustering + >>> import numpy as np + >>> X = np.array([[1, 1], [2, 1], [1, 0], + ... 
[4, 7], [3, 5], [3, 6]]) + >>> clustering = SpectralCoclustering(n_clusters=2, random_state=0).fit(X) + >>> clustering.row_labels_ #doctest: +SKIP + array([0, 1, 1, 0, 0, 0], dtype=int32) + >>> clustering.column_labels_ #doctest: +SKIP + array([0, 0], dtype=int32) + >>> clustering + SpectralCoclustering(n_clusters=2, random_state=0) + + For a more detailed example, see the following: + :ref:`sphx_glr_auto_examples_bicluster_plot_spectral_coclustering.py`. + """ + + _parameter_constraints: dict = { + **BaseSpectral._parameter_constraints, + "n_clusters": [Interval(Integral, 1, None, closed="left")], + } + + def __init__( + self, + n_clusters=3, + *, + svd_method="randomized", + n_svd_vecs=None, + mini_batch=False, + init="k-means++", + n_init=10, + random_state=None, + ): + super().__init__( + n_clusters, svd_method, n_svd_vecs, mini_batch, init, n_init, random_state + ) + + def _check_parameters(self, n_samples): + if self.n_clusters > n_samples: + raise ValueError( + f"n_clusters should be <= n_samples={n_samples}. Got" + f" {self.n_clusters} instead." + ) + + def _fit(self, X): + normalized_data, row_diag, col_diag = _scale_normalize(X) + n_sv = 1 + int(np.ceil(np.log2(self.n_clusters))) + u, v = self._svd(normalized_data, n_sv, n_discard=1) + z = np.vstack((row_diag[:, np.newaxis] * u, col_diag[:, np.newaxis] * v)) + + _, labels = self._k_means(z, self.n_clusters) + + n_rows = X.shape[0] + self.row_labels_ = labels[:n_rows] + self.column_labels_ = labels[n_rows:] + + self.rows_ = np.vstack([self.row_labels_ == c for c in range(self.n_clusters)]) + self.columns_ = np.vstack( + [self.column_labels_ == c for c in range(self.n_clusters)] + ) + + +class SpectralBiclustering(BaseSpectral): + """Spectral biclustering (Kluger, 2003). + + Partitions rows and columns under the assumption that the data has + an underlying checkerboard structure. For instance, if there are + two row partitions and three column partitions, each row will + belong to three biclusters, and each column will belong to two + biclusters. The outer product of the corresponding row and column + label vectors gives this checkerboard structure. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_clusters : int or tuple (n_row_clusters, n_column_clusters), default=3 + The number of row and column clusters in the checkerboard + structure. + + method : {'bistochastic', 'scale', 'log'}, default='bistochastic' + Method of normalizing and converting singular vectors into + biclusters. May be one of 'scale', 'bistochastic', or 'log'. + The authors recommend using 'log'. If the data is sparse, + however, log normalization will not work, which is why the + default is 'bistochastic'. + + .. warning:: + if `method='log'`, the data must not be sparse. + + n_components : int, default=6 + Number of singular vectors to check. + + n_best : int, default=3 + Number of best singular vectors to which to project the data + for clustering. + + svd_method : {'randomized', 'arpack'}, default='randomized' + Selects the algorithm for finding singular vectors. May be + 'randomized' or 'arpack'. If 'randomized', uses + :func:`~sklearn.utils.extmath.randomized_svd`, which may be faster + for large matrices. If 'arpack', uses + `scipy.sparse.linalg.svds`, which is more accurate, but + possibly slower in some cases. + + n_svd_vecs : int, default=None + Number of vectors to use in calculating the SVD. Corresponds + to `ncv` when `svd_method=arpack` and `n_oversamples` when + `svd_method` is 'randomized`. 
+ + mini_batch : bool, default=False + Whether to use mini-batch k-means, which is faster but may get + different results. + + init : {'k-means++', 'random'} or ndarray of shape (n_clusters, n_features), \ + default='k-means++' + Method for initialization of k-means algorithm; defaults to + 'k-means++'. + + n_init : int, default=10 + Number of random initializations that are tried with the + k-means algorithm. + + If mini-batch k-means is used, the best initialization is + chosen and the algorithm runs once. Otherwise, the algorithm + is run for each initialization and the best solution chosen. + + random_state : int, RandomState instance, default=None + Used for randomizing the singular value decomposition and the k-means + initialization. Use an int to make the randomness deterministic. + See :term:`Glossary `. + + Attributes + ---------- + rows_ : array-like of shape (n_row_clusters, n_rows) + Results of the clustering. `rows[i, r]` is True if + cluster `i` contains row `r`. Available only after calling ``fit``. + + columns_ : array-like of shape (n_column_clusters, n_columns) + Results of the clustering, like `rows`. + + row_labels_ : array-like of shape (n_rows,) + Row partition labels. + + column_labels_ : array-like of shape (n_cols,) + Column partition labels. + + biclusters_ : tuple of two ndarrays + The tuple contains the `rows_` and `columns_` arrays. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + SpectralCoclustering : Spectral Co-Clustering algorithm (Dhillon, 2001). + + References + ---------- + + * :doi:`Kluger, Yuval, et. al., 2003. Spectral biclustering of microarray + data: coclustering genes and conditions. + <10.1101/gr.648603>` + + Examples + -------- + >>> from sklearn.cluster import SpectralBiclustering + >>> import numpy as np + >>> X = np.array([[1, 1], [2, 1], [1, 0], + ... [4, 7], [3, 5], [3, 6]]) + >>> clustering = SpectralBiclustering(n_clusters=2, random_state=0).fit(X) + >>> clustering.row_labels_ + array([1, 1, 1, 0, 0, 0], dtype=int32) + >>> clustering.column_labels_ + array([1, 0], dtype=int32) + >>> clustering + SpectralBiclustering(n_clusters=2, random_state=0) + + For a more detailed example, see + :ref:`sphx_glr_auto_examples_bicluster_plot_spectral_biclustering.py` + """ + + _parameter_constraints: dict = { + **BaseSpectral._parameter_constraints, + "n_clusters": [Interval(Integral, 1, None, closed="left"), tuple], + "method": [StrOptions({"bistochastic", "scale", "log"})], + "n_components": [Interval(Integral, 1, None, closed="left")], + "n_best": [Interval(Integral, 1, None, closed="left")], + } + + def __init__( + self, + n_clusters=3, + *, + method="bistochastic", + n_components=6, + n_best=3, + svd_method="randomized", + n_svd_vecs=None, + mini_batch=False, + init="k-means++", + n_init=10, + random_state=None, + ): + super().__init__( + n_clusters, svd_method, n_svd_vecs, mini_batch, init, n_init, random_state + ) + self.method = method + self.n_components = n_components + self.n_best = n_best + + def _check_parameters(self, n_samples): + if isinstance(self.n_clusters, Integral): + if self.n_clusters > n_samples: + raise ValueError( + f"n_clusters should be <= n_samples={n_samples}. Got" + f" {self.n_clusters} instead." 
+ ) + else: # tuple + try: + n_row_clusters, n_column_clusters = self.n_clusters + check_scalar( + n_row_clusters, + "n_row_clusters", + target_type=Integral, + min_val=1, + max_val=n_samples, + ) + check_scalar( + n_column_clusters, + "n_column_clusters", + target_type=Integral, + min_val=1, + max_val=n_samples, + ) + except (ValueError, TypeError) as e: + raise ValueError( + "Incorrect parameter n_clusters has value:" + f" {self.n_clusters}. It should either be a single integer" + " or an iterable with two integers:" + " (n_row_clusters, n_column_clusters)" + " And the values are should be in the" + " range: (1, n_samples)" + ) from e + + if self.n_best > self.n_components: + raise ValueError( + f"n_best={self.n_best} must be <= n_components={self.n_components}." + ) + + def _fit(self, X): + n_sv = self.n_components + if self.method == "bistochastic": + normalized_data = _bistochastic_normalize(X) + n_sv += 1 + elif self.method == "scale": + normalized_data, _, _ = _scale_normalize(X) + n_sv += 1 + elif self.method == "log": + normalized_data = _log_normalize(X) + n_discard = 0 if self.method == "log" else 1 + u, v = self._svd(normalized_data, n_sv, n_discard) + ut = u.T + vt = v.T + + try: + n_row_clusters, n_col_clusters = self.n_clusters + except TypeError: + n_row_clusters = n_col_clusters = self.n_clusters + + best_ut = self._fit_best_piecewise(ut, self.n_best, n_row_clusters) + + best_vt = self._fit_best_piecewise(vt, self.n_best, n_col_clusters) + + self.row_labels_ = self._project_and_cluster(X, best_vt.T, n_row_clusters) + + self.column_labels_ = self._project_and_cluster(X.T, best_ut.T, n_col_clusters) + + self.rows_ = np.vstack( + [ + self.row_labels_ == label + for label in range(n_row_clusters) + for _ in range(n_col_clusters) + ] + ) + self.columns_ = np.vstack( + [ + self.column_labels_ == label + for _ in range(n_row_clusters) + for label in range(n_col_clusters) + ] + ) + + def _fit_best_piecewise(self, vectors, n_best, n_clusters): + """Find the ``n_best`` vectors that are best approximated by piecewise + constant vectors. + + The piecewise vectors are found by k-means; the best is chosen + according to Euclidean distance. 
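+        Concretely, each row of ``vectors`` is quantized into ``n_clusters``
+        constant levels with k-means, and the rows whose piecewise-constant
+        approximation has the smallest Euclidean error are returned.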
+ + """ + + def make_piecewise(v): + centroid, labels = self._k_means(v.reshape(-1, 1), n_clusters) + return centroid[labels].ravel() + + piecewise_vectors = np.apply_along_axis(make_piecewise, axis=1, arr=vectors) + dists = np.apply_along_axis(norm, axis=1, arr=(vectors - piecewise_vectors)) + result = vectors[np.argsort(dists)[:n_best]] + return result + + def _project_and_cluster(self, data, vectors, n_clusters): + """Project ``data`` to ``vectors`` and cluster the result.""" + projected = safe_sparse_dot(data, vectors) + _, labels = self._k_means(projected, n_clusters) + return labels diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_birch.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/_birch.py new file mode 100644 index 0000000000000000000000000000000000000000..4c894a644c8bc8b96b1c285358fd6a9cbf803a47 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_birch.py @@ -0,0 +1,749 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from math import sqrt +from numbers import Integral, Real + +import numpy as np +from scipy import sparse + +from .._config import config_context +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + ClusterMixin, + TransformerMixin, + _fit_context, +) +from ..exceptions import ConvergenceWarning +from ..metrics import pairwise_distances_argmin +from ..metrics.pairwise import euclidean_distances +from ..utils._param_validation import Hidden, Interval, StrOptions +from ..utils.extmath import row_norms +from ..utils.validation import check_is_fitted, validate_data +from . import AgglomerativeClustering + + +def _iterate_sparse_X(X): + """This little hack returns a densified row when iterating over a sparse + matrix, instead of constructing a sparse matrix for every row that is + expensive. + """ + n_samples = X.shape[0] + X_indices = X.indices + X_data = X.data + X_indptr = X.indptr + + for i in range(n_samples): + row = np.zeros(X.shape[1]) + startptr, endptr = X_indptr[i], X_indptr[i + 1] + nonzero_indices = X_indices[startptr:endptr] + row[nonzero_indices] = X_data[startptr:endptr] + yield row + + +def _split_node(node, threshold, branching_factor): + """The node has to be split if there is no place for a new subcluster + in the node. + 1. Two empty nodes and two empty subclusters are initialized. + 2. The pair of distant subclusters are found. + 3. The properties of the empty subclusters and nodes are updated + according to the nearest distance between the subclusters to the + pair of distant subclusters. + 4. The two nodes are set as children to the two subclusters. 
+ """ + new_subcluster1 = _CFSubcluster() + new_subcluster2 = _CFSubcluster() + new_node1 = _CFNode( + threshold=threshold, + branching_factor=branching_factor, + is_leaf=node.is_leaf, + n_features=node.n_features, + dtype=node.init_centroids_.dtype, + ) + new_node2 = _CFNode( + threshold=threshold, + branching_factor=branching_factor, + is_leaf=node.is_leaf, + n_features=node.n_features, + dtype=node.init_centroids_.dtype, + ) + new_subcluster1.child_ = new_node1 + new_subcluster2.child_ = new_node2 + + if node.is_leaf: + if node.prev_leaf_ is not None: + node.prev_leaf_.next_leaf_ = new_node1 + new_node1.prev_leaf_ = node.prev_leaf_ + new_node1.next_leaf_ = new_node2 + new_node2.prev_leaf_ = new_node1 + new_node2.next_leaf_ = node.next_leaf_ + if node.next_leaf_ is not None: + node.next_leaf_.prev_leaf_ = new_node2 + + dist = euclidean_distances( + node.centroids_, Y_norm_squared=node.squared_norm_, squared=True + ) + n_clusters = dist.shape[0] + + farthest_idx = np.unravel_index(dist.argmax(), (n_clusters, n_clusters)) + node1_dist, node2_dist = dist[(farthest_idx,)] + + node1_closer = node1_dist < node2_dist + # make sure node1 is closest to itself even if all distances are equal. + # This can only happen when all node.centroids_ are duplicates leading to all + # distances between centroids being zero. + node1_closer[farthest_idx[0]] = True + + for idx, subcluster in enumerate(node.subclusters_): + if node1_closer[idx]: + new_node1.append_subcluster(subcluster) + new_subcluster1.update(subcluster) + else: + new_node2.append_subcluster(subcluster) + new_subcluster2.update(subcluster) + return new_subcluster1, new_subcluster2 + + +class _CFNode: + """Each node in a CFTree is called a CFNode. + + The CFNode can have a maximum of branching_factor + number of CFSubclusters. + + Parameters + ---------- + threshold : float + Threshold needed for a new subcluster to enter a CFSubcluster. + + branching_factor : int + Maximum number of CF subclusters in each node. + + is_leaf : bool + We need to know if the CFNode is a leaf or not, in order to + retrieve the final subclusters. + + n_features : int + The number of features. + + Attributes + ---------- + subclusters_ : list + List of subclusters for a particular CFNode. + + prev_leaf_ : _CFNode + Useful only if is_leaf is True. + + next_leaf_ : _CFNode + next_leaf. Useful only if is_leaf is True. + the final subclusters. + + init_centroids_ : ndarray of shape (branching_factor + 1, n_features) + Manipulate ``init_centroids_`` throughout rather than centroids_ since + the centroids are just a view of the ``init_centroids_`` . + + init_sq_norm_ : ndarray of shape (branching_factor + 1,) + manipulate init_sq_norm_ throughout. similar to ``init_centroids_``. + + centroids_ : ndarray of shape (branching_factor + 1, n_features) + View of ``init_centroids_``. + + squared_norm_ : ndarray of shape (branching_factor + 1,) + View of ``init_sq_norm_``. + + """ + + def __init__(self, *, threshold, branching_factor, is_leaf, n_features, dtype): + self.threshold = threshold + self.branching_factor = branching_factor + self.is_leaf = is_leaf + self.n_features = n_features + + # The list of subclusters, centroids and squared norms + # to manipulate throughout. 
+ self.subclusters_ = [] + self.init_centroids_ = np.zeros((branching_factor + 1, n_features), dtype=dtype) + self.init_sq_norm_ = np.zeros((branching_factor + 1), dtype) + self.squared_norm_ = [] + self.prev_leaf_ = None + self.next_leaf_ = None + + def append_subcluster(self, subcluster): + n_samples = len(self.subclusters_) + self.subclusters_.append(subcluster) + self.init_centroids_[n_samples] = subcluster.centroid_ + self.init_sq_norm_[n_samples] = subcluster.sq_norm_ + + # Keep centroids and squared norm as views. In this way + # if we change init_centroids and init_sq_norm_, it is + # sufficient, + self.centroids_ = self.init_centroids_[: n_samples + 1, :] + self.squared_norm_ = self.init_sq_norm_[: n_samples + 1] + + def update_split_subclusters(self, subcluster, new_subcluster1, new_subcluster2): + """Remove a subcluster from a node and update it with the + split subclusters. + """ + ind = self.subclusters_.index(subcluster) + self.subclusters_[ind] = new_subcluster1 + self.init_centroids_[ind] = new_subcluster1.centroid_ + self.init_sq_norm_[ind] = new_subcluster1.sq_norm_ + self.append_subcluster(new_subcluster2) + + def insert_cf_subcluster(self, subcluster): + """Insert a new subcluster into the node.""" + if not self.subclusters_: + self.append_subcluster(subcluster) + return False + + threshold = self.threshold + branching_factor = self.branching_factor + # We need to find the closest subcluster among all the + # subclusters so that we can insert our new subcluster. + dist_matrix = np.dot(self.centroids_, subcluster.centroid_) + dist_matrix *= -2.0 + dist_matrix += self.squared_norm_ + closest_index = np.argmin(dist_matrix) + closest_subcluster = self.subclusters_[closest_index] + + # If the subcluster has a child, we need a recursive strategy. + if closest_subcluster.child_ is not None: + split_child = closest_subcluster.child_.insert_cf_subcluster(subcluster) + + if not split_child: + # If it is determined that the child need not be split, we + # can just update the closest_subcluster + closest_subcluster.update(subcluster) + self.init_centroids_[closest_index] = self.subclusters_[ + closest_index + ].centroid_ + self.init_sq_norm_[closest_index] = self.subclusters_[ + closest_index + ].sq_norm_ + return False + + # things not too good. we need to redistribute the subclusters in + # our child node, and add a new subcluster in the parent + # subcluster to accommodate the new child. + else: + new_subcluster1, new_subcluster2 = _split_node( + closest_subcluster.child_, + threshold, + branching_factor, + ) + self.update_split_subclusters( + closest_subcluster, new_subcluster1, new_subcluster2 + ) + + if len(self.subclusters_) > self.branching_factor: + return True + return False + + # good to go! + else: + merged = closest_subcluster.merge_subcluster(subcluster, self.threshold) + if merged: + self.init_centroids_[closest_index] = closest_subcluster.centroid_ + self.init_sq_norm_[closest_index] = closest_subcluster.sq_norm_ + return False + + # not close to any other subclusters, and we still + # have space, so add. + elif len(self.subclusters_) < self.branching_factor: + self.append_subcluster(subcluster) + return False + + # We do not have enough space nor is it closer to an + # other subcluster. We need to split. + else: + self.append_subcluster(subcluster) + return True + + +class _CFSubcluster: + """Each subcluster in a CFNode is called a CFSubcluster. + + A CFSubcluster can have a CFNode has its child. 
+ + Parameters + ---------- + linear_sum : ndarray of shape (n_features,), default=None + Sample. This is kept optional to allow initialization of empty + subclusters. + + Attributes + ---------- + n_samples_ : int + Number of samples that belong to each subcluster. + + linear_sum_ : ndarray + Linear sum of all the samples in a subcluster. Prevents holding + all sample data in memory. + + squared_sum_ : float + Sum of the squared l2 norms of all samples belonging to a subcluster. + + centroid_ : ndarray of shape (branching_factor + 1, n_features) + Centroid of the subcluster. Prevent recomputing of centroids when + ``CFNode.centroids_`` is called. + + child_ : _CFNode + Child Node of the subcluster. Once a given _CFNode is set as the child + of the _CFNode, it is set to ``self.child_``. + + sq_norm_ : ndarray of shape (branching_factor + 1,) + Squared norm of the subcluster. Used to prevent recomputing when + pairwise minimum distances are computed. + """ + + def __init__(self, *, linear_sum=None): + if linear_sum is None: + self.n_samples_ = 0 + self.squared_sum_ = 0.0 + self.centroid_ = self.linear_sum_ = 0 + else: + self.n_samples_ = 1 + self.centroid_ = self.linear_sum_ = linear_sum + self.squared_sum_ = self.sq_norm_ = np.dot( + self.linear_sum_, self.linear_sum_ + ) + self.child_ = None + + def update(self, subcluster): + self.n_samples_ += subcluster.n_samples_ + self.linear_sum_ += subcluster.linear_sum_ + self.squared_sum_ += subcluster.squared_sum_ + self.centroid_ = self.linear_sum_ / self.n_samples_ + self.sq_norm_ = np.dot(self.centroid_, self.centroid_) + + def merge_subcluster(self, nominee_cluster, threshold): + """Check if a cluster is worthy enough to be merged. If + yes then merge. + """ + new_ss = self.squared_sum_ + nominee_cluster.squared_sum_ + new_ls = self.linear_sum_ + nominee_cluster.linear_sum_ + new_n = self.n_samples_ + nominee_cluster.n_samples_ + new_centroid = (1 / new_n) * new_ls + new_sq_norm = np.dot(new_centroid, new_centroid) + + # The squared radius of the cluster is defined: + # r^2 = sum_i ||x_i - c||^2 / n + # with x_i the n points assigned to the cluster and c its centroid: + # c = sum_i x_i / n + # This can be expanded to: + # r^2 = sum_i ||x_i||^2 / n - 2 < sum_i x_i / n, c> + n ||c||^2 / n + # and therefore simplifies to: + # r^2 = sum_i ||x_i||^2 / n - ||c||^2 + sq_radius = new_ss / new_n - new_sq_norm + + if sq_radius <= threshold**2: + ( + self.n_samples_, + self.linear_sum_, + self.squared_sum_, + self.centroid_, + self.sq_norm_, + ) = (new_n, new_ls, new_ss, new_centroid, new_sq_norm) + return True + return False + + @property + def radius(self): + """Return radius of the subcluster""" + # Because of numerical issues, this could become negative + sq_radius = self.squared_sum_ / self.n_samples_ - self.sq_norm_ + return sqrt(max(0, sq_radius)) + + +class Birch( + ClassNamePrefixFeaturesOutMixin, ClusterMixin, TransformerMixin, BaseEstimator +): + """Implements the BIRCH clustering algorithm. + + It is a memory-efficient, online-learning algorithm provided as an + alternative to :class:`MiniBatchKMeans`. It constructs a tree + data structure with the cluster centroids being read off the leaf. + These can be either the final cluster centroids or can be provided as input + to another clustering algorithm such as :class:`AgglomerativeClustering`. + + Read more in the :ref:`User Guide `. + + .. 
versionadded:: 0.16 + + Parameters + ---------- + threshold : float, default=0.5 + The radius of the subcluster obtained by merging a new sample and the + closest subcluster should be lesser than the threshold. Otherwise a new + subcluster is started. Setting this value to be very low promotes + splitting and vice-versa. + + branching_factor : int, default=50 + Maximum number of CF subclusters in each node. If a new samples enters + such that the number of subclusters exceed the branching_factor then + that node is split into two nodes with the subclusters redistributed + in each. The parent subcluster of that node is removed and two new + subclusters are added as parents of the 2 split nodes. + + n_clusters : int, instance of sklearn.cluster model or None, default=3 + Number of clusters after the final clustering step, which treats the + subclusters from the leaves as new samples. + + - `None` : the final clustering step is not performed and the + subclusters are returned as they are. + + - :mod:`sklearn.cluster` Estimator : If a model is provided, the model + is fit treating the subclusters as new samples and the initial data + is mapped to the label of the closest subcluster. + + - `int` : the model fit is :class:`AgglomerativeClustering` with + `n_clusters` set to be equal to the int. + + compute_labels : bool, default=True + Whether or not to compute labels for each fit. + + copy : bool, default=True + Whether or not to make a copy of the given data. If set to False, + the initial data will be overwritten. + + .. deprecated:: 1.6 + `copy` was deprecated in 1.6 and will be removed in 1.8. It has no effect + as the estimator does not perform in-place operations on the input data. + + Attributes + ---------- + root_ : _CFNode + Root of the CFTree. + + dummy_leaf_ : _CFNode + Start pointer to all the leaves. + + subcluster_centers_ : ndarray + Centroids of all subclusters read directly from the leaves. + + subcluster_labels_ : ndarray + Labels assigned to the centroids of the subclusters after + they are clustered globally. + + labels_ : ndarray of shape (n_samples,) + Array of labels assigned to the input data. + if partial_fit is used instead of fit, they are assigned to the + last batch of data. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + MiniBatchKMeans : Alternative implementation that does incremental updates + of the centers' positions using mini-batches. + + Notes + ----- + The tree data structure consists of nodes with each node consisting of + a number of subclusters. The maximum number of subclusters in a node + is determined by the branching factor. Each subcluster maintains a + linear sum, squared sum and the number of samples in that subcluster. + In addition, each subcluster can also have a node as its child, if the + subcluster is not a member of a leaf node. + + For a new point entering the root, it is merged with the subcluster closest + to it and the linear sum, squared sum and the number of samples of that + subcluster are updated. This is done recursively till the properties of + the leaf node are updated. + + See :ref:`sphx_glr_auto_examples_cluster_plot_birch_vs_minibatchkmeans.py` for a + comparison with :class:`~sklearn.cluster.MiniBatchKMeans`. 
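+
+    Calling ``partial_fit`` on successive batches of data updates the same
+    CF tree without rebuilding it from scratch; calling ``partial_fit()``
+    with no arguments afterwards re-runs only the global clustering step
+    on the subclusters collected so far.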
+ + References + ---------- + * Tian Zhang, Raghu Ramakrishnan, Maron Livny + BIRCH: An efficient data clustering method for large databases. + https://www.cs.sfu.ca/CourseCentral/459/han/papers/zhang96.pdf + + * Roberto Perdisci + JBirch - Java implementation of BIRCH clustering algorithm + https://code.google.com/archive/p/jbirch + + Examples + -------- + >>> from sklearn.cluster import Birch + >>> X = [[0, 1], [0.3, 1], [-0.3, 1], [0, -1], [0.3, -1], [-0.3, -1]] + >>> brc = Birch(n_clusters=None) + >>> brc.fit(X) + Birch(n_clusters=None) + >>> brc.predict(X) + array([0, 0, 0, 1, 1, 1]) + + For a comparison of the BIRCH clustering algorithm with other clustering algorithms, + see :ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py` + """ + + _parameter_constraints: dict = { + "threshold": [Interval(Real, 0.0, None, closed="neither")], + "branching_factor": [Interval(Integral, 1, None, closed="neither")], + "n_clusters": [None, ClusterMixin, Interval(Integral, 1, None, closed="left")], + "compute_labels": ["boolean"], + "copy": ["boolean", Hidden(StrOptions({"deprecated"}))], + } + + def __init__( + self, + *, + threshold=0.5, + branching_factor=50, + n_clusters=3, + compute_labels=True, + copy="deprecated", + ): + self.threshold = threshold + self.branching_factor = branching_factor + self.n_clusters = n_clusters + self.compute_labels = compute_labels + self.copy = copy + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """ + Build a CF Tree for the input data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + self + Fitted estimator. + """ + return self._fit(X, partial=False) + + def _fit(self, X, partial): + has_root = getattr(self, "root_", None) + first_call = not (partial and has_root) + + if self.copy != "deprecated" and first_call: + warnings.warn( + "`copy` was deprecated in 1.6 and will be removed in 1.8 since it " + "has no effect internally. Simply leave this parameter to its default " + "value to avoid this warning.", + FutureWarning, + ) + + X = validate_data( + self, + X, + accept_sparse="csr", + reset=first_call, + dtype=[np.float64, np.float32], + ) + threshold = self.threshold + branching_factor = self.branching_factor + + n_samples, n_features = X.shape + + # If partial_fit is called for the first time or fit is called, we + # start a new tree. + if first_call: + # The first root is the leaf. Manipulate this object throughout. + self.root_ = _CFNode( + threshold=threshold, + branching_factor=branching_factor, + is_leaf=True, + n_features=n_features, + dtype=X.dtype, + ) + + # To enable getting back subclusters. + self.dummy_leaf_ = _CFNode( + threshold=threshold, + branching_factor=branching_factor, + is_leaf=True, + n_features=n_features, + dtype=X.dtype, + ) + self.dummy_leaf_.next_leaf_ = self.root_ + self.root_.prev_leaf_ = self.dummy_leaf_ + + # Cannot vectorize. Enough to convince to use cython. 
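+        # Insert samples one row at a time: dense input is iterated
+        # directly, while sparse (CSR) input goes through _iterate_sparse_X,
+        # which yields a densified row per step instead of building a
+        # sparse matrix for every row.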
+ if not sparse.issparse(X): + iter_func = iter + else: + iter_func = _iterate_sparse_X + + for sample in iter_func(X): + subcluster = _CFSubcluster(linear_sum=sample) + split = self.root_.insert_cf_subcluster(subcluster) + + if split: + new_subcluster1, new_subcluster2 = _split_node( + self.root_, threshold, branching_factor + ) + del self.root_ + self.root_ = _CFNode( + threshold=threshold, + branching_factor=branching_factor, + is_leaf=False, + n_features=n_features, + dtype=X.dtype, + ) + self.root_.append_subcluster(new_subcluster1) + self.root_.append_subcluster(new_subcluster2) + + centroids = np.concatenate([leaf.centroids_ for leaf in self._get_leaves()]) + self.subcluster_centers_ = centroids + self._n_features_out = self.subcluster_centers_.shape[0] + + self._global_clustering(X) + return self + + def _get_leaves(self): + """ + Retrieve the leaves of the CF Node. + + Returns + ------- + leaves : list of shape (n_leaves,) + List of the leaf nodes. + """ + leaf_ptr = self.dummy_leaf_.next_leaf_ + leaves = [] + while leaf_ptr is not None: + leaves.append(leaf_ptr) + leaf_ptr = leaf_ptr.next_leaf_ + return leaves + + @_fit_context(prefer_skip_nested_validation=True) + def partial_fit(self, X=None, y=None): + """ + Online learning. Prevents rebuilding of CFTree from scratch. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features), \ + default=None + Input data. If X is not provided, only the global clustering + step is done. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + self + Fitted estimator. + """ + if X is None: + # Perform just the final global clustering step. + self._global_clustering() + return self + else: + return self._fit(X, partial=True) + + def predict(self, X): + """ + Predict data using the ``centroids_`` of subclusters. + + Avoid computation of the row norms of X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data. + + Returns + ------- + labels : ndarray of shape(n_samples,) + Labelled data. + """ + check_is_fitted(self) + X = validate_data(self, X, accept_sparse="csr", reset=False) + return self._predict(X) + + def _predict(self, X): + """Predict data using the ``centroids_`` of subclusters.""" + kwargs = {"Y_norm_squared": self._subcluster_norms} + + with config_context(assume_finite=True): + argmin = pairwise_distances_argmin( + X, self.subcluster_centers_, metric_kwargs=kwargs + ) + return self.subcluster_labels_[argmin] + + def transform(self, X): + """ + Transform X into subcluster centroids dimension. + + Each dimension represents the distance from the sample point to each + cluster centroid. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data. + + Returns + ------- + X_trans : {array-like, sparse matrix} of shape (n_samples, n_clusters) + Transformed data. + """ + check_is_fitted(self) + X = validate_data(self, X, accept_sparse="csr", reset=False) + with config_context(assume_finite=True): + return euclidean_distances(X, self.subcluster_centers_) + + def _global_clustering(self, X=None): + """ + Global clustering for the subclusters obtained after fitting + """ + clusterer = self.n_clusters + centroids = self.subcluster_centers_ + compute_labels = (X is not None) and self.compute_labels + + # Preprocessing for the global clustering. 
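+        # `n_clusters` may be None (keep the leaf subclusters as the final
+        # clusters), an integer (cluster the subcluster centroids with
+        # AgglomerativeClustering), or an already-constructed clustering
+        # estimator. The flag below guards the integer case when fewer
+        # subclusters than requested clusters were found.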
+ not_enough_centroids = False + if isinstance(clusterer, Integral): + clusterer = AgglomerativeClustering(n_clusters=self.n_clusters) + # There is no need to perform the global clustering step. + if len(centroids) < self.n_clusters: + not_enough_centroids = True + + # To use in predict to avoid recalculation. + self._subcluster_norms = row_norms(self.subcluster_centers_, squared=True) + + if clusterer is None or not_enough_centroids: + self.subcluster_labels_ = np.arange(len(centroids)) + if not_enough_centroids: + warnings.warn( + "Number of subclusters found (%d) by BIRCH is less " + "than (%d). Decrease the threshold." + % (len(centroids), self.n_clusters), + ConvergenceWarning, + ) + else: + # The global clustering step that clusters the subclusters of + # the leaves. It assumes the centroids of the subclusters as + # samples and finds the final centroids. + self.subcluster_labels_ = clusterer.fit_predict(self.subcluster_centers_) + + if compute_labels: + self.labels_ = self._predict(X) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.transformer_tags.preserves_dtype = ["float64", "float32"] + tags.input_tags.sparse = True + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_bisect_k_means.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/_bisect_k_means.py new file mode 100644 index 0000000000000000000000000000000000000000..77e24adbf80848b13f36adc1151686746024bf25 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_bisect_k_means.py @@ -0,0 +1,543 @@ +"""Bisecting K-means clustering.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings + +import numpy as np +import scipy.sparse as sp + +from ..base import _fit_context +from ..utils._openmp_helpers import _openmp_effective_n_threads +from ..utils._param_validation import Integral, Interval, StrOptions +from ..utils.extmath import row_norms +from ..utils.validation import ( + _check_sample_weight, + check_is_fitted, + check_random_state, + validate_data, +) +from ._k_means_common import _inertia_dense, _inertia_sparse +from ._kmeans import ( + _BaseKMeans, + _kmeans_single_elkan, + _kmeans_single_lloyd, + _labels_inertia_threadpool_limit, +) + + +class _BisectingTree: + """Tree structure representing the hierarchical clusters of BisectingKMeans.""" + + def __init__(self, center, indices, score): + """Create a new cluster node in the tree. + + The node holds the center of this cluster and the indices of the data points + that belong to it. + """ + self.center = center + self.indices = indices + self.score = score + + self.left = None + self.right = None + + def split(self, labels, centers, scores): + """Split the cluster node into two subclusters.""" + self.left = _BisectingTree( + indices=self.indices[labels == 0], center=centers[0], score=scores[0] + ) + self.right = _BisectingTree( + indices=self.indices[labels == 1], center=centers[1], score=scores[1] + ) + + # reset the indices attribute to save memory + self.indices = None + + def get_cluster_to_bisect(self): + """Return the cluster node to bisect next. + + It's based on the score of the cluster, which can be either the number of + data points assigned to that cluster or the inertia of that cluster + (see `bisecting_strategy` for details). 
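+
+ Among the current leaves, the one with the largest score is returned.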
+ """ + max_score = None + + for cluster_leaf in self.iter_leaves(): + if max_score is None or cluster_leaf.score > max_score: + max_score = cluster_leaf.score + best_cluster_leaf = cluster_leaf + + return best_cluster_leaf + + def iter_leaves(self): + """Iterate over all the cluster leaves in the tree.""" + if self.left is None: + yield self + else: + yield from self.left.iter_leaves() + yield from self.right.iter_leaves() + + +class BisectingKMeans(_BaseKMeans): + """Bisecting K-Means clustering. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.1 + + Parameters + ---------- + n_clusters : int, default=8 + The number of clusters to form as well as the number of + centroids to generate. + + init : {'k-means++', 'random'} or callable, default='random' + Method for initialization: + + 'k-means++' : selects initial cluster centers for k-mean + clustering in a smart way to speed up convergence. See section + Notes in k_init for more details. + + 'random': choose `n_clusters` observations (rows) at random from data + for the initial centroids. + + If a callable is passed, it should take arguments X, n_clusters and a + random state and return an initialization. + + n_init : int, default=1 + Number of time the inner k-means algorithm will be run with different + centroid seeds in each bisection. + That will result producing for each bisection best output of n_init + consecutive runs in terms of inertia. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for centroid initialization + in inner K-Means. Use an int to make the randomness deterministic. + See :term:`Glossary `. + + max_iter : int, default=300 + Maximum number of iterations of the inner k-means algorithm at each + bisection. + + verbose : int, default=0 + Verbosity mode. + + tol : float, default=1e-4 + Relative tolerance with regards to Frobenius norm of the difference + in the cluster centers of two consecutive iterations to declare + convergence. Used in inner k-means algorithm at each bisection to pick + best possible clusters. + + copy_x : bool, default=True + When pre-computing distances it is more numerically accurate to center + the data first. If copy_x is True (default), then the original data is + not modified. If False, the original data is modified, and put back + before the function returns, but small numerical differences may be + introduced by subtracting and then adding the data mean. Note that if + the original data is not C-contiguous, a copy will be made even if + copy_x is False. If the original data is sparse, but not in CSR format, + a copy will be made even if copy_x is False. + + algorithm : {"lloyd", "elkan"}, default="lloyd" + Inner K-means algorithm used in bisection. + The classical EM-style algorithm is `"lloyd"`. + The `"elkan"` variation can be more efficient on some datasets with + well-defined clusters, by using the triangle inequality. However it's + more memory intensive due to the allocation of an extra array of shape + `(n_samples, n_clusters)`. + + bisecting_strategy : {"biggest_inertia", "largest_cluster"},\ + default="biggest_inertia" + Defines how bisection should be performed: + + - "biggest_inertia" means that BisectingKMeans will always check + all calculated cluster for cluster with biggest SSE + (Sum of squared errors) and bisect it. This approach concentrates on + precision, but may be costly in terms of execution time (especially for + larger amount of data points). 
+ + - "largest_cluster" - BisectingKMeans will always split cluster with + largest amount of points assigned to it from all clusters + previously calculated. That should work faster than picking by SSE + ('biggest_inertia') and may produce similar results in most cases. + + Attributes + ---------- + cluster_centers_ : ndarray of shape (n_clusters, n_features) + Coordinates of cluster centers. If the algorithm stops before fully + converging (see ``tol`` and ``max_iter``), these will not be + consistent with ``labels_``. + + labels_ : ndarray of shape (n_samples,) + Labels of each point. + + inertia_ : float + Sum of squared distances of samples to their closest cluster center, + weighted by the sample weights if provided. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + See Also + -------- + KMeans : Original implementation of K-Means algorithm. + + Notes + ----- + It might be inefficient when n_cluster is less than 3, due to unnecessary + calculations for that case. + + Examples + -------- + >>> from sklearn.cluster import BisectingKMeans + >>> import numpy as np + >>> X = np.array([[1, 1], [10, 1], [3, 1], + ... [10, 0], [2, 1], [10, 2], + ... [10, 8], [10, 9], [10, 10]]) + >>> bisect_means = BisectingKMeans(n_clusters=3, random_state=0).fit(X) + >>> bisect_means.labels_ + array([0, 2, 0, 2, 0, 2, 1, 1, 1], dtype=int32) + >>> bisect_means.predict([[0, 0], [12, 3]]) + array([0, 2], dtype=int32) + >>> bisect_means.cluster_centers_ + array([[ 2., 1.], + [10., 9.], + [10., 1.]]) + + For a comparison between BisectingKMeans and K-Means refer to example + :ref:`sphx_glr_auto_examples_cluster_plot_bisect_kmeans.py`. + """ + + _parameter_constraints: dict = { + **_BaseKMeans._parameter_constraints, + "init": [StrOptions({"k-means++", "random"}), callable], + "n_init": [Interval(Integral, 1, None, closed="left")], + "copy_x": ["boolean"], + "algorithm": [StrOptions({"lloyd", "elkan"})], + "bisecting_strategy": [StrOptions({"biggest_inertia", "largest_cluster"})], + } + + def __init__( + self, + n_clusters=8, + *, + init="random", + n_init=1, + random_state=None, + max_iter=300, + verbose=0, + tol=1e-4, + copy_x=True, + algorithm="lloyd", + bisecting_strategy="biggest_inertia", + ): + super().__init__( + n_clusters=n_clusters, + init=init, + max_iter=max_iter, + verbose=verbose, + random_state=random_state, + tol=tol, + n_init=n_init, + ) + + self.copy_x = copy_x + self.algorithm = algorithm + self.bisecting_strategy = bisecting_strategy + + def _warn_mkl_vcomp(self, n_active_threads): + """Warn when vcomp and mkl are both present""" + warnings.warn( + "BisectingKMeans is known to have a memory leak on Windows " + "with MKL, when there are less chunks than available " + "threads. You can avoid it by setting the environment" + f" variable OMP_NUM_THREADS={n_active_threads}." + ) + + def _inertia_per_cluster(self, X, centers, labels, sample_weight): + """Calculate the sum of squared errors (inertia) per cluster. + + Parameters + ---------- + X : {ndarray, csr_matrix} of shape (n_samples, n_features) + The input samples. + + centers : ndarray of shape (n_clusters=2, n_features) + The cluster centers. + + labels : ndarray of shape (n_samples,) + Index of the cluster each sample belongs to. + + sample_weight : ndarray of shape (n_samples,) + The weights for each observation in X. 
+ + Returns + ------- + inertia_per_cluster : ndarray of shape (n_clusters=2,) + Sum of squared errors (inertia) for each cluster. + """ + n_clusters = centers.shape[0] # = 2 since centers comes from a bisection + _inertia = _inertia_sparse if sp.issparse(X) else _inertia_dense + + inertia_per_cluster = np.empty(n_clusters) + for label in range(n_clusters): + inertia_per_cluster[label] = _inertia( + X, sample_weight, centers, labels, self._n_threads, single_label=label + ) + + return inertia_per_cluster + + def _bisect(self, X, x_squared_norms, sample_weight, cluster_to_bisect): + """Split a cluster into 2 subsclusters. + + Parameters + ---------- + X : {ndarray, csr_matrix} of shape (n_samples, n_features) + Training instances to cluster. + + x_squared_norms : ndarray of shape (n_samples,) + Squared euclidean norm of each data point. + + sample_weight : ndarray of shape (n_samples,) + The weights for each observation in X. + + cluster_to_bisect : _BisectingTree node object + The cluster node to split. + """ + X = X[cluster_to_bisect.indices] + x_squared_norms = x_squared_norms[cluster_to_bisect.indices] + sample_weight = sample_weight[cluster_to_bisect.indices] + + best_inertia = None + + # Split samples in X into 2 clusters. + # Repeating `n_init` times to obtain best clusters + for _ in range(self.n_init): + centers_init = self._init_centroids( + X, + x_squared_norms=x_squared_norms, + init=self.init, + random_state=self._random_state, + n_centroids=2, + sample_weight=sample_weight, + ) + + labels, inertia, centers, _ = self._kmeans_single( + X, + sample_weight, + centers_init, + max_iter=self.max_iter, + verbose=self.verbose, + tol=self.tol, + n_threads=self._n_threads, + ) + + # allow small tolerance on the inertia to accommodate for + # non-deterministic rounding errors due to parallel computation + if best_inertia is None or inertia < best_inertia * (1 - 1e-6): + best_labels = labels + best_centers = centers + best_inertia = inertia + + if self.verbose: + print(f"New centroids from bisection: {best_centers}") + + if self.bisecting_strategy == "biggest_inertia": + scores = self._inertia_per_cluster( + X, best_centers, best_labels, sample_weight + ) + else: # bisecting_strategy == "largest_cluster" + # Using minlength to make sure that we have the counts for both labels even + # if all samples are labelled 0. + scores = np.bincount(best_labels, minlength=2) + + cluster_to_bisect.split(best_labels, best_centers, scores) + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None, sample_weight=None): + """Compute bisecting k-means clustering. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + + Training instances to cluster. + + .. note:: The data will be converted to C ordering, + which will cause a memory copy + if the given data is not C-contiguous. + + y : Ignored + Not used, present here for API consistency by convention. + + sample_weight : array-like of shape (n_samples,), default=None + The weights for each observation in X. If None, all observations + are assigned equal weight. `sample_weight` is not used during + initialization if `init` is a callable. + + Returns + ------- + self + Fitted estimator. 
+ """ + X = validate_data( + self, + X, + accept_sparse="csr", + dtype=[np.float64, np.float32], + order="C", + copy=self.copy_x, + accept_large_sparse=False, + ) + + self._check_params_vs_input(X) + + self._random_state = check_random_state(self.random_state) + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + self._n_threads = _openmp_effective_n_threads() + + if self.algorithm == "lloyd" or self.n_clusters == 1: + self._kmeans_single = _kmeans_single_lloyd + self._check_mkl_vcomp(X, X.shape[0]) + else: + self._kmeans_single = _kmeans_single_elkan + + # Subtract of mean of X for more accurate distance computations + if not sp.issparse(X): + self._X_mean = X.mean(axis=0) + X -= self._X_mean + + # Initialize the hierarchical clusters tree + self._bisecting_tree = _BisectingTree( + indices=np.arange(X.shape[0]), + center=X.mean(axis=0), + score=0, + ) + + x_squared_norms = row_norms(X, squared=True) + + for _ in range(self.n_clusters - 1): + # Chose cluster to bisect + cluster_to_bisect = self._bisecting_tree.get_cluster_to_bisect() + + # Split this cluster into 2 subclusters + self._bisect(X, x_squared_norms, sample_weight, cluster_to_bisect) + + # Aggregate final labels and centers from the bisecting tree + self.labels_ = np.full(X.shape[0], -1, dtype=np.int32) + self.cluster_centers_ = np.empty((self.n_clusters, X.shape[1]), dtype=X.dtype) + + for i, cluster_node in enumerate(self._bisecting_tree.iter_leaves()): + self.labels_[cluster_node.indices] = i + self.cluster_centers_[i] = cluster_node.center + cluster_node.label = i # label final clusters for future prediction + cluster_node.indices = None # release memory + + # Restore original data + if not sp.issparse(X): + X += self._X_mean + self.cluster_centers_ += self._X_mean + + _inertia = _inertia_sparse if sp.issparse(X) else _inertia_dense + self.inertia_ = _inertia( + X, sample_weight, self.cluster_centers_, self.labels_, self._n_threads + ) + + self._n_features_out = self.cluster_centers_.shape[0] + + return self + + def predict(self, X): + """Predict which cluster each sample in X belongs to. + + Prediction is made by going down the hierarchical tree + in searching of closest leaf cluster. + + In the vector quantization literature, `cluster_centers_` is called + the code book and each value returned by `predict` is the index of + the closest code in the code book. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + New data to predict. + + Returns + ------- + labels : ndarray of shape (n_samples,) + Index of the cluster each sample belongs to. + """ + check_is_fitted(self) + + X = self._check_test_data(X) + x_squared_norms = row_norms(X, squared=True) + + # sample weights are unused but necessary in cython helpers + sample_weight = np.ones_like(x_squared_norms) + + labels = self._predict_recursive(X, sample_weight, self._bisecting_tree) + + return labels + + def _predict_recursive(self, X, sample_weight, cluster_node): + """Predict recursively by going down the hierarchical tree. + + Parameters + ---------- + X : {ndarray, csr_matrix} of shape (n_samples, n_features) + The data points, currently assigned to `cluster_node`, to predict between + the subclusters of this node. + + sample_weight : ndarray of shape (n_samples,) + The weights for each observation in X. + + cluster_node : _BisectingTree node object + The cluster node of the hierarchical tree. + + Returns + ------- + labels : ndarray of shape (n_samples,) + Index of the cluster each sample belongs to. 
+ """ + if cluster_node.left is None: + # This cluster has no subcluster. Labels are just the label of the cluster. + return np.full(X.shape[0], cluster_node.label, dtype=np.int32) + + # Determine if data points belong to the left or right subcluster + centers = np.vstack((cluster_node.left.center, cluster_node.right.center)) + if hasattr(self, "_X_mean"): + centers += self._X_mean + + cluster_labels = _labels_inertia_threadpool_limit( + X, + sample_weight, + centers, + self._n_threads, + return_inertia=False, + ) + mask = cluster_labels == 0 + + # Compute the labels for each subset of the data points. + labels = np.full(X.shape[0], -1, dtype=np.int32) + + labels[mask] = self._predict_recursive( + X[mask], sample_weight[mask], cluster_node.left + ) + + labels[~mask] = self._predict_recursive( + X[~mask], sample_weight[~mask], cluster_node.right + ) + + return labels + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + tags.transformer_tags.preserves_dtype = ["float64", "float32"] + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_dbscan.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/_dbscan.py new file mode 100644 index 0000000000000000000000000000000000000000..857a332cc2371a6cbbcc8b69c21cd7e432ccbcc6 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_dbscan.py @@ -0,0 +1,480 @@ +""" +DBSCAN: Density-Based Spatial Clustering of Applications with Noise +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from numbers import Integral, Real + +import numpy as np +from scipy import sparse + +from ..base import BaseEstimator, ClusterMixin, _fit_context +from ..metrics.pairwise import _VALID_METRICS +from ..neighbors import NearestNeighbors +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.validation import _check_sample_weight, validate_data +from ._dbscan_inner import dbscan_inner + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "sample_weight": ["array-like", None], + }, + prefer_skip_nested_validation=False, +) +def dbscan( + X, + eps=0.5, + *, + min_samples=5, + metric="minkowski", + metric_params=None, + algorithm="auto", + leaf_size=30, + p=2, + sample_weight=None, + n_jobs=None, +): + """Perform DBSCAN clustering from vector array or distance matrix. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse (CSR) matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) + A feature array, or array of distances between samples if + ``metric='precomputed'``. + + eps : float, default=0.5 + The maximum distance between two samples for one to be considered + as in the neighborhood of the other. This is not a maximum bound + on the distances of points within a cluster. This is the most + important DBSCAN parameter to choose appropriately for your data set + and distance function. + + min_samples : int, default=5 + The number of samples (or total weight) in a neighborhood for a point + to be considered as a core point. This includes the point itself. + + metric : str or callable, default='minkowski' + The metric to use when calculating distance between instances in a + feature array. If metric is a string or callable, it must be one of + the options allowed by :func:`sklearn.metrics.pairwise_distances` for + its metric parameter. 
+ If metric is "precomputed", X is assumed to be a distance matrix and + must be square during fit. + X may be a :term:`sparse graph `, + in which case only "nonzero" elements may be considered neighbors. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + .. versionadded:: 0.19 + + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' + The algorithm to be used by the NearestNeighbors module + to compute pointwise distances and find nearest neighbors. + See NearestNeighbors module documentation for details. + + leaf_size : int, default=30 + Leaf size passed to BallTree or cKDTree. This can affect the speed + of the construction and query, as well as the memory required + to store the tree. The optimal value depends + on the nature of the problem. + + p : float, default=2 + The power of the Minkowski metric to be used to calculate distance + between points. + + sample_weight : array-like of shape (n_samples,), default=None + Weight of each sample, such that a sample with a weight of at least + ``min_samples`` is by itself a core sample; a sample with negative + weight may inhibit its eps-neighbor from being core. + Note that weights are absolute, and default to 1. + + n_jobs : int, default=None + The number of parallel jobs to run for neighbors search. ``None`` means + 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means + using all processors. See :term:`Glossary ` for more details. + If precomputed distance are used, parallel execution is not available + and thus n_jobs will have no effect. + + Returns + ------- + core_samples : ndarray of shape (n_core_samples,) + Indices of core samples. + + labels : ndarray of shape (n_samples,) + Cluster labels for each point. Noisy samples are given the label -1. + + See Also + -------- + DBSCAN : An estimator interface for this clustering algorithm. + OPTICS : A similar estimator interface clustering at multiple values of + eps. Our implementation is optimized for memory usage. + + Notes + ----- + For an example, see :ref:`sphx_glr_auto_examples_cluster_plot_dbscan.py`. + + This implementation bulk-computes all neighborhood queries, which increases + the memory complexity to O(n.d) where d is the average number of neighbors, + while original DBSCAN had memory complexity O(n). It may attract a higher + memory complexity when querying these nearest neighborhoods, depending + on the ``algorithm``. + + One way to avoid the query complexity is to pre-compute sparse + neighborhoods in chunks using + :func:`NearestNeighbors.radius_neighbors_graph + ` with + ``mode='distance'``, then using ``metric='precomputed'`` here. + + Another way to reduce memory and computation time is to remove + (near-)duplicate points and use ``sample_weight`` instead. + + :class:`~sklearn.cluster.OPTICS` provides a similar clustering with lower + memory usage. + + References + ---------- + Ester, M., H. P. Kriegel, J. Sander, and X. Xu, `"A Density-Based + Algorithm for Discovering Clusters in Large Spatial Databases with Noise" + `_. + In: Proceedings of the 2nd International Conference on Knowledge Discovery + and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996 + + Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017). + :doi:`"DBSCAN revisited, revisited: why and how you should (still) use DBSCAN." + <10.1145/3068335>` + ACM Transactions on Database Systems (TODS), 42(3), 19. 
+ + Examples + -------- + >>> from sklearn.cluster import dbscan + >>> X = [[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]] + >>> core_samples, labels = dbscan(X, eps=3, min_samples=2) + >>> core_samples + array([0, 1, 2, 3, 4]) + >>> labels + array([ 0, 0, 0, 1, 1, -1]) + """ + + est = DBSCAN( + eps=eps, + min_samples=min_samples, + metric=metric, + metric_params=metric_params, + algorithm=algorithm, + leaf_size=leaf_size, + p=p, + n_jobs=n_jobs, + ) + est.fit(X, sample_weight=sample_weight) + return est.core_sample_indices_, est.labels_ + + +class DBSCAN(ClusterMixin, BaseEstimator): + """Perform DBSCAN clustering from vector array or distance matrix. + + DBSCAN - Density-Based Spatial Clustering of Applications with Noise. + Finds core samples of high density and expands clusters from them. + Good for data which contains clusters of similar density. + + This implementation has a worst case memory complexity of :math:`O({n}^2)`, + which can occur when the `eps` param is large and `min_samples` is low, + while the original DBSCAN only uses linear memory. + For further details, see the Notes below. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + eps : float, default=0.5 + The maximum distance between two samples for one to be considered + as in the neighborhood of the other. This is not a maximum bound + on the distances of points within a cluster. This is the most + important DBSCAN parameter to choose appropriately for your data set + and distance function. + + min_samples : int, default=5 + The number of samples (or total weight) in a neighborhood for a point to + be considered as a core point. This includes the point itself. If + `min_samples` is set to a higher value, DBSCAN will find denser clusters, + whereas if it is set to a lower value, the found clusters will be more + sparse. + + metric : str, or callable, default='euclidean' + The metric to use when calculating distance between instances in a + feature array. If metric is a string or callable, it must be one of + the options allowed by :func:`sklearn.metrics.pairwise_distances` for + its metric parameter. + If metric is "precomputed", X is assumed to be a distance matrix and + must be square. X may be a :term:`sparse graph`, in which + case only "nonzero" elements may be considered neighbors for DBSCAN. + + .. versionadded:: 0.17 + metric *precomputed* to accept precomputed sparse matrix. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + .. versionadded:: 0.19 + + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' + The algorithm to be used by the NearestNeighbors module + to compute pointwise distances and find nearest neighbors. + See NearestNeighbors module documentation for details. + + leaf_size : int, default=30 + Leaf size passed to BallTree or cKDTree. This can affect the speed + of the construction and query, as well as the memory required + to store the tree. The optimal value depends + on the nature of the problem. + + p : float, default=None + The power of the Minkowski metric to be used to calculate distance + between points. If None, then ``p=2`` (equivalent to the Euclidean + distance). + + n_jobs : int, default=None + The number of parallel jobs to run. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Attributes + ---------- + core_sample_indices_ : ndarray of shape (n_core_samples,) + Indices of core samples. 
+ + components_ : ndarray of shape (n_core_samples, n_features) + Copy of each core sample found by training. + + labels_ : ndarray of shape (n_samples) + Cluster labels for each point in the dataset given to fit(). + Noisy samples are given the label -1. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + OPTICS : A similar clustering at multiple values of eps. Our implementation + is optimized for memory usage. + + Notes + ----- + This implementation bulk-computes all neighborhood queries, which increases + the memory complexity to O(n.d) where d is the average number of neighbors, + while original DBSCAN had memory complexity O(n). It may attract a higher + memory complexity when querying these nearest neighborhoods, depending + on the ``algorithm``. + + One way to avoid the query complexity is to pre-compute sparse + neighborhoods in chunks using + :func:`NearestNeighbors.radius_neighbors_graph + ` with + ``mode='distance'``, then using ``metric='precomputed'`` here. + + Another way to reduce memory and computation time is to remove + (near-)duplicate points and use ``sample_weight`` instead. + + :class:`~sklearn.cluster.OPTICS` provides a similar clustering with lower memory + usage. + + References + ---------- + Ester, M., H. P. Kriegel, J. Sander, and X. Xu, `"A Density-Based + Algorithm for Discovering Clusters in Large Spatial Databases with Noise" + `_. + In: Proceedings of the 2nd International Conference on Knowledge Discovery + and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996 + + Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017). + :doi:`"DBSCAN revisited, revisited: why and how you should (still) use DBSCAN." + <10.1145/3068335>` + ACM Transactions on Database Systems (TODS), 42(3), 19. + + Examples + -------- + >>> from sklearn.cluster import DBSCAN + >>> import numpy as np + >>> X = np.array([[1, 2], [2, 2], [2, 3], + ... [8, 7], [8, 8], [25, 80]]) + >>> clustering = DBSCAN(eps=3, min_samples=2).fit(X) + >>> clustering.labels_ + array([ 0, 0, 0, 1, 1, -1]) + >>> clustering + DBSCAN(eps=3, min_samples=2) + + For an example, see + :ref:`sphx_glr_auto_examples_cluster_plot_dbscan.py`. 
+ + For a comparison of DBSCAN with other clustering algorithms, see + :ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py` + """ + + _parameter_constraints: dict = { + "eps": [Interval(Real, 0.0, None, closed="neither")], + "min_samples": [Interval(Integral, 1, None, closed="left")], + "metric": [ + StrOptions(set(_VALID_METRICS) | {"precomputed"}), + callable, + ], + "metric_params": [dict, None], + "algorithm": [StrOptions({"auto", "ball_tree", "kd_tree", "brute"})], + "leaf_size": [Interval(Integral, 1, None, closed="left")], + "p": [Interval(Real, 0.0, None, closed="left"), None], + "n_jobs": [Integral, None], + } + + def __init__( + self, + eps=0.5, + *, + min_samples=5, + metric="euclidean", + metric_params=None, + algorithm="auto", + leaf_size=30, + p=None, + n_jobs=None, + ): + self.eps = eps + self.min_samples = min_samples + self.metric = metric + self.metric_params = metric_params + self.algorithm = algorithm + self.leaf_size = leaf_size + self.p = p + self.n_jobs = n_jobs + + @_fit_context( + # DBSCAN.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y=None, sample_weight=None): + """Perform DBSCAN clustering from features, or distance matrix. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features), or \ + (n_samples, n_samples) + Training instances to cluster, or distances between instances if + ``metric='precomputed'``. If a sparse matrix is provided, it will + be converted into a sparse ``csr_matrix``. + + y : Ignored + Not used, present here for API consistency by convention. + + sample_weight : array-like of shape (n_samples,), default=None + Weight of each sample, such that a sample with a weight of at least + ``min_samples`` is by itself a core sample; a sample with a + negative weight may inhibit its eps-neighbor from being core. + Note that weights are absolute, and default to 1. + + Returns + ------- + self : object + Returns a fitted instance of self. + """ + X = validate_data(self, X, accept_sparse="csr") + + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X) + + # Calculate neighborhood for all samples. This leaves the original + # point in, which needs to be considered later (i.e. point i is in the + # neighborhood of point i. While True, its useless information) + if self.metric == "precomputed" and sparse.issparse(X): + # set the diagonal to explicit values, as a point is its own + # neighbor + X = X.copy() # copy to avoid in-place modification + with warnings.catch_warnings(): + warnings.simplefilter("ignore", sparse.SparseEfficiencyWarning) + X.setdiag(X.diagonal()) + + neighbors_model = NearestNeighbors( + radius=self.eps, + algorithm=self.algorithm, + leaf_size=self.leaf_size, + metric=self.metric, + metric_params=self.metric_params, + p=self.p, + n_jobs=self.n_jobs, + ) + neighbors_model.fit(X) + # This has worst case O(n^2) memory complexity + neighborhoods = neighbors_model.radius_neighbors(X, return_distance=False) + + if sample_weight is None: + n_neighbors = np.array([len(neighbors) for neighbors in neighborhoods]) + else: + n_neighbors = np.array( + [np.sum(sample_weight[neighbors]) for neighbors in neighborhoods] + ) + + # Initially, all samples are noise. + labels = np.full(X.shape[0], -1, dtype=np.intp) + + # A list of all core samples found. 
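+ # A sample is a core sample when its (weighted) neighborhood count reaches
+ # ``min_samples``; ``dbscan_inner`` then expands clusters outward from the
+ # core samples, filling ``labels`` in place. Points still labelled -1
+ # afterwards are noise.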
+ core_samples = np.asarray(n_neighbors >= self.min_samples, dtype=np.uint8) + dbscan_inner(core_samples, neighborhoods, labels) + + self.core_sample_indices_ = np.where(core_samples)[0] + self.labels_ = labels + + if len(self.core_sample_indices_): + # fix for scipy sparse indexing issue + self.components_ = X[self.core_sample_indices_].copy() + else: + # no core samples + self.components_ = np.empty((0, X.shape[1])) + return self + + def fit_predict(self, X, y=None, sample_weight=None): + """Compute clusters from a data or distance matrix and predict labels. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features), or \ + (n_samples, n_samples) + Training instances to cluster, or distances between instances if + ``metric='precomputed'``. If a sparse matrix is provided, it will + be converted into a sparse ``csr_matrix``. + + y : Ignored + Not used, present here for API consistency by convention. + + sample_weight : array-like of shape (n_samples,), default=None + Weight of each sample, such that a sample with a weight of at least + ``min_samples`` is by itself a core sample; a sample with a + negative weight may inhibit its eps-neighbor from being core. + Note that weights are absolute, and default to 1. + + Returns + ------- + labels : ndarray of shape (n_samples,) + Cluster labels. Noisy samples are given the label -1. + """ + self.fit(X, sample_weight=sample_weight) + return self.labels_ + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.pairwise = self.metric == "precomputed" + tags.input_tags.sparse = True + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_dbscan_inner.cpython-312-x86_64-linux-gnu.so b/.venv/lib/python3.12/site-packages/sklearn/cluster/_dbscan_inner.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..d1a0dd9aec1c17fb677016ce2a6c95872a80bf6e Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/cluster/_dbscan_inner.cpython-312-x86_64-linux-gnu.so differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_dbscan_inner.pyx b/.venv/lib/python3.12/site-packages/sklearn/cluster/_dbscan_inner.pyx new file mode 100644 index 0000000000000000000000000000000000000000..266b214bb269a717fd2eea300fe7445b96bd7cba --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_dbscan_inner.pyx @@ -0,0 +1,41 @@ +# Fast inner loop for DBSCAN. + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from libcpp.vector cimport vector + +from ..utils._typedefs cimport uint8_t, intp_t + + +def dbscan_inner(const uint8_t[::1] is_core, + object[:] neighborhoods, + intp_t[::1] labels): + cdef intp_t i, label_num = 0, v + cdef intp_t[:] neighb + cdef vector[intp_t] stack + + for i in range(labels.shape[0]): + if labels[i] != -1 or not is_core[i]: + continue + + # Depth-first search starting from i, ending at the non-core points. + # This is very similar to the classic algorithm for computing connected + # components, the difference being that we label non-core points as + # part of a cluster (component), but don't expand their neighborhoods. 
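+ # The explicit stack replaces recursion: unlabelled neighbors of a core
+ # point are pushed, popped points that are still unlabelled receive the
+ # current label, and only core points push their own neighbors in turn.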
+ while True: + if labels[i] == -1: + labels[i] = label_num + if is_core[i]: + neighb = neighborhoods[i] + for i in range(neighb.shape[0]): + v = neighb[i] + if labels[v] == -1: + stack.push_back(v) + + if stack.size() == 0: + break + i = stack.back() + stack.pop_back() + + label_num += 1 diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_feature_agglomeration.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/_feature_agglomeration.py new file mode 100644 index 0000000000000000000000000000000000000000..32fcb85625f354bf0dcece88453e7e8f931e03cb --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_feature_agglomeration.py @@ -0,0 +1,76 @@ +""" +Feature agglomeration. Base classes and functions for performing feature +agglomeration. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numpy as np +from scipy.sparse import issparse + +from ..base import TransformerMixin +from ..utils.validation import check_is_fitted, validate_data + +############################################################################### +# Mixin class for feature agglomeration. + + +class AgglomerationTransform(TransformerMixin): + """ + A class for feature agglomeration via the transform interface. + """ + + def transform(self, X): + """ + Transform a new matrix using the built clustering. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) or \ + (n_samples, n_samples) + A M by N array of M observations in N dimensions or a length + M array of M one-dimensional observations. + + Returns + ------- + Y : ndarray of shape (n_samples, n_clusters) or (n_clusters,) + The pooled values for each feature cluster. + """ + check_is_fitted(self) + + X = validate_data(self, X, reset=False) + if self.pooling_func == np.mean and not issparse(X): + size = np.bincount(self.labels_) + n_samples = X.shape[0] + # a fast way to compute the mean of grouped features + nX = np.array( + [np.bincount(self.labels_, X[i, :]) / size for i in range(n_samples)] + ) + else: + nX = [ + self.pooling_func(X[:, self.labels_ == l], axis=1) + for l in np.unique(self.labels_) + ] + nX = np.array(nX).T + return nX + + def inverse_transform(self, X): + """ + Inverse the transformation and return a vector of size `n_features`. + + Parameters + ---------- + X : array-like of shape (n_samples, n_clusters) or (n_clusters,) + The values to be assigned to each cluster of samples. + + Returns + ------- + X_original : ndarray of shape (n_samples, n_features) or (n_features,) + A vector of size `n_samples` with the values of `X` assigned to + each of the cluster of samples. 
+ """ + check_is_fitted(self) + + unil, inverse = np.unique(self.labels_, return_inverse=True) + return X[..., inverse] diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..67dd18fb94b593f0a3125c1f5833f3b9597614ba --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/__init__.py @@ -0,0 +1,2 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/_linkage.pyx b/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/_linkage.pyx new file mode 100644 index 0000000000000000000000000000000000000000..5684193a13d40ed68cabe9b8502a4b59b18d4e1b --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/_linkage.pyx @@ -0,0 +1,274 @@ +# Minimum spanning tree single linkage implementation for hdbscan + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software without +# specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +cimport numpy as cnp +from libc.float cimport DBL_MAX + +import numpy as np +from ...metrics._dist_metrics cimport DistanceMetric64 +from ...cluster._hierarchical_fast cimport UnionFind +from ...cluster._hdbscan._tree cimport HIERARCHY_t +from ...cluster._hdbscan._tree import HIERARCHY_dtype +from ...utils._typedefs cimport intp_t, float64_t, int64_t, uint8_t + +cnp.import_array() + +cdef extern from "numpy/arrayobject.h": + intp_t * PyArray_SHAPE(cnp.PyArrayObject *) + +# Numpy structured dtype representing a single ordered edge in Prim's algorithm +MST_edge_dtype = np.dtype([ + ("current_node", np.int64), + ("next_node", np.int64), + ("distance", np.float64), +]) + +# Packed shouldn't make a difference since they're all 8-byte quantities, +# but it's included just to be safe. 
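+ # The packed struct below mirrors MST_edge_dtype so Prim's algorithm can
+ # fill the edge list without Python overhead: each entry stores the node
+ # already in the tree, the node being attached, and the mutual-reachability
+ # distance of the connecting edge.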
+ctypedef packed struct MST_edge_t: + int64_t current_node + int64_t next_node + float64_t distance + +cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_mutual_reachability( + cnp.ndarray[float64_t, ndim=2] mutual_reachability +): + """Compute the Minimum Spanning Tree (MST) representation of the mutual- + reachability graph using Prim's algorithm. + + Parameters + ---------- + mutual_reachability : ndarray of shape (n_samples, n_samples) + Array of mutual-reachabilities between samples. + + Returns + ------- + mst : ndarray of shape (n_samples - 1,), dtype=MST_edge_dtype + The MST representation of the mutual-reachability graph. The MST is + represented as a collection of edges. + """ + cdef: + # Note: we utilize ndarray's over memory-views to make use of numpy + # binary indexing and sub-selection below. + cnp.ndarray[int64_t, ndim=1, mode='c'] current_labels + cnp.ndarray[float64_t, ndim=1, mode='c'] min_reachability, left, right + cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst + + cnp.ndarray[uint8_t, mode='c'] label_filter + + int64_t n_samples = PyArray_SHAPE( mutual_reachability)[0] + int64_t current_node, new_node_index, new_node, i + + mst = np.empty(n_samples - 1, dtype=MST_edge_dtype) + current_labels = np.arange(n_samples, dtype=np.int64) + current_node = 0 + min_reachability = np.full(n_samples, fill_value=np.inf, dtype=np.float64) + for i in range(0, n_samples - 1): + label_filter = current_labels != current_node + current_labels = current_labels[label_filter] + left = min_reachability[label_filter] + right = mutual_reachability[current_node][current_labels] + min_reachability = np.minimum(left, right) + + new_node_index = np.argmin(min_reachability) + new_node = current_labels[new_node_index] + mst[i].current_node = current_node + mst[i].next_node = new_node + mst[i].distance = min_reachability[new_node_index] + current_node = new_node + + return mst + + +cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_data_matrix( + const float64_t[:, ::1] raw_data, + const float64_t[::1] core_distances, + DistanceMetric64 dist_metric, + float64_t alpha=1.0 +): + """Compute the Minimum Spanning Tree (MST) representation of the mutual- + reachability graph generated from the provided `raw_data` and + `core_distances` using Prim's algorithm. + + Parameters + ---------- + raw_data : ndarray of shape (n_samples, n_features) + Input array of data samples. + + core_distances : ndarray of shape (n_samples,) + An array containing the core-distance calculated for each corresponding + sample. + + dist_metric : DistanceMetric + The distance metric to use when calculating pairwise distances for + determining mutual-reachability. + + Returns + ------- + mst : ndarray of shape (n_samples - 1,), dtype=MST_edge_dtype + The MST representation of the mutual-reachability graph. The MST is + represented as a collection of edges. 
+ """ + + cdef: + uint8_t[::1] in_tree + float64_t[::1] min_reachability + int64_t[::1] current_sources + cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst + + int64_t current_node, source_node, new_node, next_node_source + int64_t i, j, n_samples, num_features + + float64_t current_node_core_dist, new_reachability, mutual_reachability_distance + float64_t next_node_min_reach, pair_distance, next_node_core_dist + + n_samples = raw_data.shape[0] + num_features = raw_data.shape[1] + + mst = np.empty(n_samples - 1, dtype=MST_edge_dtype) + + in_tree = np.zeros(n_samples, dtype=np.uint8) + min_reachability = np.full(n_samples, fill_value=np.inf, dtype=np.float64) + current_sources = np.ones(n_samples, dtype=np.int64) + + current_node = 0 + + # The following loop dynamically updates minimum reachability node-by-node, + # avoiding unnecessary computation where possible. + for i in range(0, n_samples - 1): + + in_tree[current_node] = 1 + + current_node_core_dist = core_distances[current_node] + + new_reachability = DBL_MAX + source_node = 0 + new_node = 0 + + for j in range(n_samples): + if in_tree[j]: + continue + + next_node_min_reach = min_reachability[j] + next_node_source = current_sources[j] + + pair_distance = dist_metric.dist( + &raw_data[current_node, 0], + &raw_data[j, 0], + num_features + ) + + pair_distance /= alpha + + next_node_core_dist = core_distances[j] + mutual_reachability_distance = max( + current_node_core_dist, + next_node_core_dist, + pair_distance + ) + + # If MRD(i, j) is smaller than node j's min_reachability, we update + # node j's min_reachability for future reference. + if mutual_reachability_distance < next_node_min_reach: + min_reachability[j] = mutual_reachability_distance + current_sources[j] = current_node + + # If MRD(i, j) is also smaller than node i's current + # min_reachability, we update and set their edge as the current + # MST edge candidate. + if mutual_reachability_distance < new_reachability: + new_reachability = mutual_reachability_distance + source_node = current_node + new_node = j + + # If the node j is closer to another node already in the tree, we + # make their edge the current MST candidate edge. + elif next_node_min_reach < new_reachability: + new_reachability = next_node_min_reach + source_node = next_node_source + new_node = j + + mst[i].current_node = source_node + mst[i].next_node = new_node + mst[i].distance = new_reachability + current_node = new_node + + return mst + +cpdef cnp.ndarray[HIERARCHY_t, ndim=1, mode="c"] make_single_linkage(const MST_edge_t[::1] mst): + """Construct a single-linkage tree from an MST. + + Parameters + ---------- + mst : ndarray of shape (n_samples - 1,), dtype=MST_edge_dtype + The MST representation of the mutual-reachability graph. The MST is + represented as a collection of edges. + + Returns + ------- + single_linkage : ndarray of shape (n_samples - 1,), dtype=HIERARCHY_dtype + The single-linkage tree tree (dendrogram) built from the MST. 
Each + of the array represents the following: + + - left node/cluster + - right node/cluster + - distance + - new cluster size + """ + cdef: + cnp.ndarray[HIERARCHY_t, ndim=1, mode="c"] single_linkage + + # Note mst.shape[0] is one fewer than the number of samples + int64_t n_samples = mst.shape[0] + 1 + intp_t current_node_cluster, next_node_cluster + int64_t current_node, next_node, i + float64_t distance + UnionFind U = UnionFind(n_samples) + + single_linkage = np.zeros(n_samples - 1, dtype=HIERARCHY_dtype) + + for i in range(n_samples - 1): + + current_node = mst[i].current_node + next_node = mst[i].next_node + distance = mst[i].distance + + current_node_cluster = U.fast_find(current_node) + next_node_cluster = U.fast_find(next_node) + + single_linkage[i].left_node = current_node_cluster + single_linkage[i].right_node = next_node_cluster + single_linkage[i].value = distance + single_linkage[i].cluster_size = U.size[current_node_cluster] + U.size[next_node_cluster] + + U.union(current_node_cluster, next_node_cluster) + + return single_linkage diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/_reachability.pyx b/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/_reachability.pyx new file mode 100644 index 0000000000000000000000000000000000000000..bff686ae0a6369a7891525433a3fd79341dd2022 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/_reachability.pyx @@ -0,0 +1,210 @@ +# mutual reachability distance computations + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software without +# specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +cimport numpy as cnp + +import numpy as np +from scipy.sparse import issparse +from cython cimport floating, integral +from libc.math cimport isfinite, INFINITY +from ...utils._typedefs cimport intp_t +cnp.import_array() + + +def mutual_reachability_graph( + distance_matrix, min_samples=5, max_distance=0.0 +): + """Compute the weighted adjacency matrix of the mutual reachability graph. 
+ + The mutual reachability distance used to build the graph is defined as:: + + max(d_core(x_p), d_core(x_q), d(x_p, x_q)) + + and the core distance `d_core` is defined as the distance between a point + `x_p` and its k-th nearest neighbor. + + Note that all computations are done in-place. + + Parameters + ---------- + distance_matrix : {ndarray, sparse matrix} of shape (n_samples, n_samples) + Array of distances between samples. If sparse, the array must be in + `CSR` format. + + min_samples : int, default=5 + The parameter `k` used to calculate the distance between a point + `x_p` and its k-th nearest neighbor. + + max_distance : float, default=0.0 + The distance which `np.inf` is replaced with. When the true mutual- + reachability distance is measured to be infinite, it is instead + truncated to `max_dist`. Only used when `distance_matrix` is a sparse + matrix. + + Returns + ------- + mututal_reachability_graph: {ndarray, sparse matrix} of shape \ + (n_samples, n_samples) + Weighted adjacency matrix of the mutual reachability graph. + + References + ---------- + .. [1] Campello, R. J., Moulavi, D., & Sander, J. (2013, April). + Density-based clustering based on hierarchical density estimates. + In Pacific-Asia Conference on Knowledge Discovery and Data Mining + (pp. 160-172). Springer Berlin Heidelberg. + """ + further_neighbor_idx = min_samples - 1 + if issparse(distance_matrix): + if distance_matrix.format != "csr": + raise ValueError( + "Only sparse CSR matrices are supported for `distance_matrix`." + ) + _sparse_mutual_reachability_graph( + distance_matrix.data, + distance_matrix.indices, + distance_matrix.indptr, + distance_matrix.shape[0], + further_neighbor_idx=further_neighbor_idx, + max_distance=max_distance, + ) + else: + _dense_mutual_reachability_graph( + distance_matrix, further_neighbor_idx=further_neighbor_idx + ) + return distance_matrix + + +def _dense_mutual_reachability_graph( + floating[:, :] distance_matrix, + intp_t further_neighbor_idx, +): + """Dense implementation of mutual reachability graph. + + The computation is done in-place, i.e. the distance matrix is modified + directly. + + Parameters + ---------- + distance_matrix : ndarray of shape (n_samples, n_samples) + Array of distances between samples. + + further_neighbor_idx : int + The index of the furthest neighbor to use to define the core distances. + """ + cdef: + intp_t i, j, n_samples = distance_matrix.shape[0] + floating mutual_reachability_distance + floating[::1] core_distances + + # We assume that the distance matrix is symmetric. We choose to sort every + # row to have the same implementation than the sparse case that requires + # CSR matrix. + core_distances = np.ascontiguousarray( + np.partition( + distance_matrix, further_neighbor_idx, axis=1 + )[:, further_neighbor_idx] + ) + + with nogil: + # TODO: Update w/ prange with thread count based on + # _openmp_effective_n_threads + for i in range(n_samples): + for j in range(n_samples): + mutual_reachability_distance = max( + core_distances[i], + core_distances[j], + distance_matrix[i, j], + ) + distance_matrix[i, j] = mutual_reachability_distance + + +def _sparse_mutual_reachability_graph( + cnp.ndarray[floating, ndim=1, mode="c"] data, + cnp.ndarray[integral, ndim=1, mode="c"] indices, + cnp.ndarray[integral, ndim=1, mode="c"] indptr, + intp_t n_samples, + intp_t further_neighbor_idx, + floating max_distance, +): + """Sparse implementation of mutual reachability graph. + + The computation is done in-place, i.e. 
the distance matrix is modified + directly. This implementation only accepts `CSR` format sparse matrices. + + Parameters + ---------- + distance_matrix : sparse matrix of shape (n_samples, n_samples) + Sparse matrix of distances between samples. The sparse format should + be `CSR`. + + further_neighbor_idx : int + The index of the furthest neighbor to use to define the core distances. + + max_distance : float + The distance which `np.inf` is replaced with. When the true mutual- + reachability distance is measured to be infinite, it is instead + truncated to `max_dist`. Only used when `distance_matrix` is a sparse + matrix. + """ + cdef: + integral i, col_ind, row_ind + floating mutual_reachability_distance + floating[:] core_distances + floating[:] row_data + + if floating is float: + dtype = np.float32 + else: + dtype = np.float64 + + core_distances = np.empty(n_samples, dtype=dtype) + + for i in range(n_samples): + row_data = data[indptr[i]:indptr[i + 1]] + if further_neighbor_idx < row_data.size: + core_distances[i] = np.partition( + row_data, further_neighbor_idx + )[further_neighbor_idx] + else: + core_distances[i] = INFINITY + + with nogil: + for row_ind in range(n_samples): + for i in range(indptr[row_ind], indptr[row_ind + 1]): + col_ind = indices[i] + mutual_reachability_distance = max( + core_distances[row_ind], core_distances[col_ind], data[i] + ) + if isfinite(mutual_reachability_distance): + data[i] = mutual_reachability_distance + elif max_distance > 0: + data[i] = max_distance diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/_tree.pxd b/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/_tree.pxd new file mode 100644 index 0000000000000000000000000000000000000000..23708b9a38d07884c035b88e260821146075f861 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/_tree.pxd @@ -0,0 +1,49 @@ +# Copyright (c) 2015, Leland McInnes +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software without +# specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
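+
+ # Shared struct declarations for the HDBSCAN tree routines: HIERARCHY_t
+ # mirrors the scipy.cluster.hierarchy linkage-row layout and CONDENSED_t
+ # stores one parent/child edge of the condensed tree.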
+ +from ...utils._typedefs cimport intp_t, float64_t, uint8_t +cimport numpy as cnp + +# This corresponds to the scipy.cluster.hierarchy format +ctypedef packed struct HIERARCHY_t: + intp_t left_node + intp_t right_node + float64_t value + intp_t cluster_size + +# Effectively an edgelist encoding a parent/child pair, along with a value and +# the corresponding cluster_size in each row providing a tree structure. +ctypedef packed struct CONDENSED_t: + intp_t parent + intp_t child + float64_t value + intp_t cluster_size + +cdef extern from "numpy/arrayobject.h": + intp_t * PyArray_SHAPE(cnp.PyArrayObject *) diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/_tree.pyx b/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/_tree.pyx new file mode 100644 index 0000000000000000000000000000000000000000..161092033b915bd9bb51f87750fb156c6a598833 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/_tree.pyx @@ -0,0 +1,799 @@ +# Tree handling (condensing, finding stable clusters) for hdbscan + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software without +# specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
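For orientation, the `HIERARCHY_t` struct declared above follows the `scipy.cluster.hierarchy` linkage layout: two child node ids, the merge distance, and the size of the merged cluster. Below is a purely illustrative sketch of that correspondence on a toy dataset; the variable names are made up for the example and the dtype literal simply repeats the one `_tree.pyx` defines next.

import numpy as np
from scipy.cluster.hierarchy import linkage

HIERARCHY_dtype = np.dtype([
    ("left_node", np.intp),
    ("right_node", np.intp),
    ("value", np.float64),
    ("cluster_size", np.intp),
])

X = np.random.RandomState(0).normal(size=(6, 2))
Z = linkage(X, method="single")  # plain (n_samples - 1, 4) float array
hierarchy = np.rec.fromarrays(
    [
        Z[:, 0].astype(np.intp),   # left child node id
        Z[:, 1].astype(np.intp),   # right child node id
        Z[:, 2],                   # merge distance ("value")
        Z[:, 3].astype(np.intp),   # samples in the merged node ("cluster_size")
    ],
    dtype=HIERARCHY_dtype,
)
print(hierarchy[0])  # first merge: two leaf ids, their distance, cluster_size 2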
+ + +cimport numpy as cnp +from libc.math cimport isinf +import cython + +import numpy as np + +cnp.import_array() + +cdef extern from "numpy/arrayobject.h": + intp_t * PyArray_SHAPE(cnp.PyArrayObject *) + +cdef cnp.float64_t INFTY = np.inf +cdef cnp.intp_t NOISE = -1 + +HIERARCHY_dtype = np.dtype([ + ("left_node", np.intp), + ("right_node", np.intp), + ("value", np.float64), + ("cluster_size", np.intp), +]) + +CONDENSED_dtype = np.dtype([ + ("parent", np.intp), + ("child", np.intp), + ("value", np.float64), + ("cluster_size", np.intp), +]) + +cpdef tuple tree_to_labels( + const HIERARCHY_t[::1] single_linkage_tree, + cnp.intp_t min_cluster_size=10, + cluster_selection_method="eom", + bint allow_single_cluster=False, + cnp.float64_t cluster_selection_epsilon=0.0, + max_cluster_size=None, +): + cdef: + cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree + cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] labels + cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] probabilities + + condensed_tree = _condense_tree(single_linkage_tree, min_cluster_size) + labels, probabilities = _get_clusters( + condensed_tree, + _compute_stability(condensed_tree), + cluster_selection_method, + allow_single_cluster, + cluster_selection_epsilon, + max_cluster_size, + ) + + return (labels, probabilities) + +cdef list bfs_from_hierarchy( + const HIERARCHY_t[::1] hierarchy, + cnp.intp_t bfs_root +): + """ + Perform a breadth first search on a tree in scipy hclust format. + """ + + cdef list process_queue, next_queue, result + cdef cnp.intp_t n_samples = hierarchy.shape[0] + 1 + cdef cnp.intp_t node + process_queue = [bfs_root] + result = [] + + while process_queue: + result.extend(process_queue) + # By construction, node i is formed by the union of nodes + # hierarchy[i - n_samples, 0] and hierarchy[i - n_samples, 1] + process_queue = [ + x - n_samples + for x in process_queue + if x >= n_samples + ] + if process_queue: + next_queue = [] + for node in process_queue: + next_queue.extend( + [ + hierarchy[node].left_node, + hierarchy[node].right_node, + ] + ) + process_queue = next_queue + return result + + +cpdef cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] _condense_tree( + const HIERARCHY_t[::1] hierarchy, + cnp.intp_t min_cluster_size=10 +): + """Condense a tree according to a minimum cluster size. This is akin + to the runt pruning procedure of Stuetzle. The result is a much simpler + tree that is easier to visualize. We include extra information on the + lambda value at which individual points depart clusters for later + analysis and computation. + + Parameters + ---------- + hierarchy : ndarray of shape (n_samples,), dtype=HIERARCHY_dtype + A single linkage hierarchy in scipy.cluster.hierarchy format. + + min_cluster_size : int, optional (default 10) + The minimum size of clusters to consider. Clusters smaller than this + are pruned from the tree. + + Returns + ------- + condensed_tree : ndarray of shape (n_samples,), dtype=CONDENSED_dtype + Effectively an edgelist encoding a parent/child pair, along with a + value and the corresponding cluster_size in each row providing a tree + structure. 
+ """ + + cdef: + cnp.intp_t root = 2 * hierarchy.shape[0] + cnp.intp_t n_samples = hierarchy.shape[0] + 1 + cnp.intp_t next_label = n_samples + 1 + list result_list, node_list = bfs_from_hierarchy(hierarchy, root) + + cnp.intp_t[::1] relabel + cnp.uint8_t[::1] ignore + + cnp.intp_t node, sub_node, left, right + cnp.float64_t lambda_value, distance + cnp.intp_t left_count, right_count + HIERARCHY_t children + + relabel = np.empty(root + 1, dtype=np.intp) + relabel[root] = n_samples + result_list = [] + ignore = np.zeros(len(node_list), dtype=bool) + + for node in node_list: + if ignore[node] or node < n_samples: + continue + + children = hierarchy[node - n_samples] + left = children.left_node + right = children.right_node + distance = children.value + if distance > 0.0: + lambda_value = 1.0 / distance + else: + lambda_value = INFTY + + if left >= n_samples: + left_count = hierarchy[left - n_samples].cluster_size + else: + left_count = 1 + + if right >= n_samples: + right_count = hierarchy[right - n_samples].cluster_size + else: + right_count = 1 + + if left_count >= min_cluster_size and right_count >= min_cluster_size: + relabel[left] = next_label + next_label += 1 + result_list.append( + (relabel[node], relabel[left], lambda_value, left_count) + ) + + relabel[right] = next_label + next_label += 1 + result_list.append( + (relabel[node], relabel[right], lambda_value, right_count) + ) + + elif left_count < min_cluster_size and right_count < min_cluster_size: + for sub_node in bfs_from_hierarchy(hierarchy, left): + if sub_node < n_samples: + result_list.append( + (relabel[node], sub_node, lambda_value, 1) + ) + ignore[sub_node] = True + + for sub_node in bfs_from_hierarchy(hierarchy, right): + if sub_node < n_samples: + result_list.append( + (relabel[node], sub_node, lambda_value, 1) + ) + ignore[sub_node] = True + + elif left_count < min_cluster_size: + relabel[right] = relabel[node] + for sub_node in bfs_from_hierarchy(hierarchy, left): + if sub_node < n_samples: + result_list.append( + (relabel[node], sub_node, lambda_value, 1) + ) + ignore[sub_node] = True + + else: + relabel[left] = relabel[node] + for sub_node in bfs_from_hierarchy(hierarchy, right): + if sub_node < n_samples: + result_list.append( + (relabel[node], sub_node, lambda_value, 1) + ) + ignore[sub_node] = True + + return np.array(result_list, dtype=CONDENSED_dtype) + + +cdef dict _compute_stability( + cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree +): + + cdef: + cnp.float64_t[::1] result, births + cnp.intp_t[:] parents = condensed_tree['parent'] + + cnp.intp_t parent, cluster_size, result_index, idx + cnp.float64_t lambda_val + CONDENSED_t condensed_node + cnp.intp_t largest_child = condensed_tree['child'].max() + cnp.intp_t smallest_cluster = np.min(parents) + cnp.intp_t num_clusters = np.max(parents) - smallest_cluster + 1 + dict stability_dict = {} + + largest_child = max(largest_child, smallest_cluster) + births = np.full(largest_child + 1, np.nan, dtype=np.float64) + + for idx in range(PyArray_SHAPE( condensed_tree)[0]): + condensed_node = condensed_tree[idx] + births[condensed_node.child] = condensed_node.value + + births[smallest_cluster] = 0.0 + + result = np.zeros(num_clusters, dtype=np.float64) + for idx in range(PyArray_SHAPE( condensed_tree)[0]): + condensed_node = condensed_tree[idx] + parent = condensed_node.parent + lambda_val = condensed_node.value + cluster_size = condensed_node.cluster_size + + result_index = parent - smallest_cluster + result[result_index] += (lambda_val - births[parent]) * 
cluster_size + + for idx in range(num_clusters): + stability_dict[idx + smallest_cluster] = result[idx] + + return stability_dict + + +cdef list bfs_from_cluster_tree( + cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree, + cnp.intp_t bfs_root +): + + cdef: + list result = [] + cnp.ndarray[cnp.intp_t, ndim=1] process_queue = ( + np.array([bfs_root], dtype=np.intp) + ) + cnp.ndarray[cnp.intp_t, ndim=1] children = condensed_tree['child'] + cnp.intp_t[:] parents = condensed_tree['parent'] + + while len(process_queue) > 0: + result.extend(process_queue.tolist()) + process_queue = children[np.isin(parents, process_queue)] + + return result + + +cdef cnp.float64_t[::1] max_lambdas(cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree): + + cdef: + cnp.intp_t parent, current_parent, idx + cnp.float64_t lambda_val, max_lambda + cnp.float64_t[::1] deaths + cnp.intp_t largest_parent = condensed_tree['parent'].max() + + deaths = np.zeros(largest_parent + 1, dtype=np.float64) + current_parent = condensed_tree[0].parent + max_lambda = condensed_tree[0].value + + for idx in range(1, PyArray_SHAPE( condensed_tree)[0]): + parent = condensed_tree[idx].parent + lambda_val = condensed_tree[idx].value + + if parent == current_parent: + max_lambda = max(max_lambda, lambda_val) + else: + deaths[current_parent] = max_lambda + current_parent = parent + max_lambda = lambda_val + + deaths[current_parent] = max_lambda # value for last parent + return deaths + + +@cython.final +cdef class TreeUnionFind: + + cdef cnp.intp_t[:, ::1] data + cdef cnp.uint8_t[::1] is_component + + def __init__(self, size): + cdef cnp.intp_t idx + self.data = np.zeros((size, 2), dtype=np.intp) + for idx in range(size): + self.data[idx, 0] = idx + self.is_component = np.ones(size, dtype=np.uint8) + + cdef void union(self, cnp.intp_t x, cnp.intp_t y): + cdef cnp.intp_t x_root = self.find(x) + cdef cnp.intp_t y_root = self.find(y) + + if self.data[x_root, 1] < self.data[y_root, 1]: + self.data[x_root, 0] = y_root + elif self.data[x_root, 1] > self.data[y_root, 1]: + self.data[y_root, 0] = x_root + else: + self.data[y_root, 0] = x_root + self.data[x_root, 1] += 1 + return + + cdef cnp.intp_t find(self, cnp.intp_t x): + if self.data[x, 0] != x: + self.data[x, 0] = self.find(self.data[x, 0]) + self.is_component[x] = False + return self.data[x, 0] + + +cpdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] labelling_at_cut( + const HIERARCHY_t[::1] linkage, + cnp.float64_t cut, + cnp.intp_t min_cluster_size +): + """Given a single linkage tree and a cut value, return the + vector of cluster labels at that cut value. This is useful + for Robust Single Linkage, and extracting DBSCAN results + from a single HDBSCAN run. + + Parameters + ---------- + linkage : ndarray of shape (n_samples,), dtype=HIERARCHY_dtype + The single linkage tree in scipy.cluster.hierarchy format. + + cut : double + The cut value at which to find clusters. + + min_cluster_size : int + The minimum cluster size; clusters below this size at + the cut will be considered noise. + + Returns + ------- + labels : ndarray of shape (n_samples,) + The cluster labels for each point in the data set; + a label of -1 denotes a noise assignment. 
+ """ + + cdef: + cnp.intp_t n, cluster, root, n_samples, cluster_label + cnp.intp_t[::1] unique_labels, cluster_size + cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] result + TreeUnionFind union_find + dict cluster_label_map + HIERARCHY_t node + + root = 2 * linkage.shape[0] + n_samples = root // 2 + 1 + result = np.empty(n_samples, dtype=np.intp) + union_find = TreeUnionFind(root + 1) + + cluster = n_samples + for node in linkage: + if node.value < cut: + union_find.union(node.left_node, cluster) + union_find.union(node.right_node, cluster) + cluster += 1 + + cluster_size = np.zeros(cluster, dtype=np.intp) + for n in range(n_samples): + cluster = union_find.find(n) + cluster_size[cluster] += 1 + result[n] = cluster + + cluster_label_map = {-1: NOISE} + cluster_label = 0 + unique_labels = np.unique(result) + + for cluster in unique_labels: + if cluster_size[cluster] < min_cluster_size: + cluster_label_map[cluster] = NOISE + else: + cluster_label_map[cluster] = cluster_label + cluster_label += 1 + + for n in range(n_samples): + result[n] = cluster_label_map[result[n]] + + return result + + +cpdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] _do_labelling( + cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree, + set clusters, + dict cluster_label_map, + cnp.intp_t allow_single_cluster, + cnp.float64_t cluster_selection_epsilon +): + """Given a condensed tree, clusters and a labeling map for the clusters, + return an array containing the labels of each point based on cluster + membership. Note that this is where points may be marked as noisy + outliers. The determination of some points as noise is in large, single- + cluster datasets is controlled by the `allow_single_cluster` and + `cluster_selection_epsilon` parameters. + + Parameters + ---------- + condensed_tree : ndarray of shape (n_samples,), dtype=CONDENSED_dtype + Effectively an edgelist encoding a parent/child pair, along with a + value and the corresponding cluster_size in each row providing a tree + structure. + + clusters : set + The set of nodes corresponding to identified clusters. These node + values should be the same as those present in `condensed_tree`. + + cluster_label_map : dict + A mapping from the node values present in `clusters` to the labels + which will be returned. + + Returns + ------- + labels : ndarray of shape (n_samples,) + The cluster labels for each point in the data set; + a label of -1 denotes a noise assignment. + """ + + cdef: + cnp.intp_t root_cluster + cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] result + cnp.ndarray[cnp.intp_t, ndim=1] parent_array, child_array + cnp.ndarray[cnp.float64_t, ndim=1] lambda_array + TreeUnionFind union_find + cnp.intp_t n, parent, child, cluster + cnp.float64_t threshold + + child_array = condensed_tree['child'] + parent_array = condensed_tree['parent'] + lambda_array = condensed_tree['value'] + + root_cluster = np.min(parent_array) + result = np.empty(root_cluster, dtype=np.intp) + union_find = TreeUnionFind(np.max(parent_array) + 1) + + for n in range(PyArray_SHAPE( condensed_tree)[0]): + child = child_array[n] + parent = parent_array[n] + if child not in clusters: + union_find.union(parent, child) + + for n in range(root_cluster): + cluster = union_find.find(n) + label = NOISE + if cluster != root_cluster: + label = cluster_label_map[cluster] + elif len(clusters) == 1 and allow_single_cluster: + # There can only be one edge with this particular child hence this + # expression extracts a unique, scalar lambda value. 
+ parent_lambda = lambda_array[child_array == n] + if cluster_selection_epsilon != 0.0: + threshold = 1 / cluster_selection_epsilon + else: + # The threshold should be calculated per-sample based on the + # largest lambda of any simbling node. + threshold = lambda_array[parent_array == cluster].max() + if parent_lambda >= threshold: + label = cluster_label_map[cluster] + + result[n] = label + + return result + + +cdef cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] get_probabilities( + cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree, + dict cluster_map, + cnp.intp_t[::1] labels +): + + cdef: + cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] result + cnp.float64_t[:] lambda_array + cnp.float64_t[::1] deaths + cnp.intp_t[:] child_array, parent_array + cnp.intp_t root_cluster, n, point, cluster_num, cluster + cnp.float64_t max_lambda, lambda_val + + child_array = condensed_tree['child'] + parent_array = condensed_tree['parent'] + lambda_array = condensed_tree['value'] + + result = np.zeros(labels.shape[0]) + deaths = max_lambdas(condensed_tree) + root_cluster = np.min(parent_array) + + for n in range(PyArray_SHAPE( condensed_tree)[0]): + point = child_array[n] + if point >= root_cluster: + continue + + cluster_num = labels[point] + if cluster_num == -1: + continue + + cluster = cluster_map[cluster_num] + max_lambda = deaths[cluster] + if max_lambda == 0.0 or isinf(lambda_array[n]): + result[point] = 1.0 + else: + lambda_val = min(lambda_array[n], max_lambda) + result[point] = lambda_val / max_lambda + + return result + + +cpdef list recurse_leaf_dfs( + cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree, + cnp.intp_t current_node +): + cdef cnp.intp_t[:] children + cdef cnp.intp_t child + + children = cluster_tree[cluster_tree['parent'] == current_node]['child'] + if children.shape[0] == 0: + return [current_node,] + else: + return sum([recurse_leaf_dfs(cluster_tree, child) for child in children], []) + + +cpdef list get_cluster_tree_leaves(cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree): + cdef cnp.intp_t root + if PyArray_SHAPE( cluster_tree)[0] == 0: + return [] + root = cluster_tree['parent'].min() + return recurse_leaf_dfs(cluster_tree, root) + +cdef cnp.intp_t traverse_upwards( + cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree, + cnp.float64_t cluster_selection_epsilon, + cnp.intp_t leaf, + cnp.intp_t allow_single_cluster +): + cdef cnp.intp_t root, parent + cdef cnp.float64_t parent_eps + + root = cluster_tree['parent'].min() + parent = cluster_tree[cluster_tree['child'] == leaf]['parent'] + if parent == root: + if allow_single_cluster: + return parent + else: + return leaf # return node closest to root + + parent_eps = 1 / cluster_tree[cluster_tree['child'] == parent]['value'] + if parent_eps > cluster_selection_epsilon: + return parent + else: + return traverse_upwards( + cluster_tree, + cluster_selection_epsilon, + parent, + allow_single_cluster + ) + +cdef set epsilon_search( + set leaves, + cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree, + cnp.float64_t cluster_selection_epsilon, + cnp.intp_t allow_single_cluster +): + cdef: + list selected_clusters = list() + list processed = list() + cnp.intp_t leaf, epsilon_child, sub_node + cnp.float64_t eps + cnp.uint8_t[:] leaf_nodes + cnp.ndarray[cnp.intp_t, ndim=1] children = cluster_tree['child'] + cnp.ndarray[cnp.float64_t, ndim=1] distances = cluster_tree['value'] + + for leaf in leaves: + leaf_nodes = children == leaf + eps = 1 / distances[leaf_nodes][0] + if eps < cluster_selection_epsilon: + if leaf 
not in processed: + epsilon_child = traverse_upwards( + cluster_tree, + cluster_selection_epsilon, + leaf, + allow_single_cluster + ) + selected_clusters.append(epsilon_child) + + for sub_node in bfs_from_cluster_tree(cluster_tree, epsilon_child): + if sub_node != epsilon_child: + processed.append(sub_node) + else: + selected_clusters.append(leaf) + + return set(selected_clusters) + + +@cython.wraparound(True) +cdef tuple _get_clusters( + cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree, + dict stability, + cluster_selection_method='eom', + cnp.uint8_t allow_single_cluster=False, + cnp.float64_t cluster_selection_epsilon=0.0, + max_cluster_size=None +): + """Given a tree and stability dict, produce the cluster labels + (and probabilities) for a flat clustering based on the chosen + cluster selection method. + + Parameters + ---------- + condensed_tree : ndarray of shape (n_samples,), dtype=CONDENSED_dtype + Effectively an edgelist encoding a parent/child pair, along with a + value and the corresponding cluster_size in each row providing a tree + structure. + + stability : dict + A dictionary mapping cluster_ids to stability values + + cluster_selection_method : string, optional (default 'eom') + The method of selecting clusters. The default is the + Excess of Mass algorithm specified by 'eom'. The alternate + option is 'leaf'. + + allow_single_cluster : boolean, optional (default False) + Whether to allow a single cluster to be selected by the + Excess of Mass algorithm. + + cluster_selection_epsilon: double, optional (default 0.0) + A distance threshold for cluster splits. + + max_cluster_size: int, default=None + The maximum size for clusters located by the EOM clusterer. Can + be overridden by the cluster_selection_epsilon parameter in + rare cases. + + Returns + ------- + labels : ndarray of shape (n_samples,) + An integer array of cluster labels, with -1 denoting noise. + + probabilities : ndarray (n_samples,) + The cluster membership strength of each sample. + + stabilities : ndarray (n_clusters,) + The cluster coherence strengths of each cluster. + """ + cdef: + list node_list + cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree + cnp.uint8_t[::1] child_selection + cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] labels + dict is_cluster, cluster_sizes + cnp.float64_t subtree_stability + cnp.intp_t node, sub_node, cluster, n_samples + cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] probs + + # Assume clusters are ordered by numeric id equivalent to + # a topological sort of the tree; This is valid given the + # current implementation above, so don't change that ... or + # if you do, change this accordingly! 
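+ # The Excess-of-Mass selection below leans on that ordering: node_list is + # sorted in descending order, so child clusters (which carry larger ids) are + # visited before their parents. A node is kept as a cluster when its own + # stability is at least the summed stability of its children (and its size + # does not exceed max_cluster_size); its descendants are then deselected. + # Otherwise the node is rejected and its stability is replaced by the + # children's sum, so the comparison keeps propagating toward the root. For + # example, with stabilities {A: 2.5, B: 0.4, C: 0.3} where B and C are the + # children of A, the subtree sum 0.7 < 2.5 and A is selected while B and C + # are not.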
+ if allow_single_cluster: + node_list = sorted(stability.keys(), reverse=True) + else: + node_list = sorted(stability.keys(), reverse=True)[:-1] + # (exclude root) + + cluster_tree = condensed_tree[condensed_tree['cluster_size'] > 1] + is_cluster = {cluster: True for cluster in node_list} + n_samples = np.max(condensed_tree[condensed_tree['cluster_size'] == 1]['child']) + 1 + + if max_cluster_size is None: + max_cluster_size = n_samples + 1 # Set to a value that will never be triggered + cluster_sizes = { + child: cluster_size for child, cluster_size + in zip(cluster_tree['child'], cluster_tree['cluster_size']) + } + if allow_single_cluster: + # Compute cluster size for the root node + cluster_sizes[node_list[-1]] = np.sum( + cluster_tree[cluster_tree['parent'] == node_list[-1]]['cluster_size']) + + if cluster_selection_method == 'eom': + for node in node_list: + child_selection = (cluster_tree['parent'] == node) + subtree_stability = np.sum([ + stability[child] for + child in cluster_tree['child'][child_selection]]) + if subtree_stability > stability[node] or cluster_sizes[node] > max_cluster_size: + is_cluster[node] = False + stability[node] = subtree_stability + else: + for sub_node in bfs_from_cluster_tree(cluster_tree, node): + if sub_node != node: + is_cluster[sub_node] = False + + if cluster_selection_epsilon != 0.0 and PyArray_SHAPE( cluster_tree)[0] > 0: + eom_clusters = [c for c in is_cluster if is_cluster[c]] + selected_clusters = [] + # first check if eom_clusters only has root node, which skips epsilon check. + if (len(eom_clusters) == 1 and eom_clusters[0] == cluster_tree['parent'].min()): + if allow_single_cluster: + selected_clusters = eom_clusters + else: + selected_clusters = epsilon_search( + set(eom_clusters), + cluster_tree, + cluster_selection_epsilon, + allow_single_cluster + ) + for c in is_cluster: + if c in selected_clusters: + is_cluster[c] = True + else: + is_cluster[c] = False + + elif cluster_selection_method == 'leaf': + leaves = set(get_cluster_tree_leaves(cluster_tree)) + if len(leaves) == 0: + for c in is_cluster: + is_cluster[c] = False + is_cluster[condensed_tree['parent'].min()] = True + + if cluster_selection_epsilon != 0.0: + selected_clusters = epsilon_search( + leaves, + cluster_tree, + cluster_selection_epsilon, + allow_single_cluster + ) + else: + selected_clusters = leaves + + for c in is_cluster: + if c in selected_clusters: + is_cluster[c] = True + else: + is_cluster[c] = False + + clusters = set([c for c in is_cluster if is_cluster[c]]) + cluster_map = {c: n for n, c in enumerate(sorted(list(clusters)))} + reverse_cluster_map = {n: c for c, n in cluster_map.items()} + + labels = _do_labelling( + condensed_tree, + clusters, + cluster_map, + allow_single_cluster, + cluster_selection_epsilon + ) + probs = get_probabilities(condensed_tree, reverse_cluster_map, labels) + + return (labels, probs) diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/hdbscan.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/hdbscan.py new file mode 100644 index 0000000000000000000000000000000000000000..f292a1f65909b6a5a1a0287adbc2996a3dc36381 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/hdbscan.py @@ -0,0 +1,1000 @@ +""" +HDBSCAN: Hierarchical Density-Based Spatial Clustering + of Applications with Noise +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted 
provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software without +# specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +from numbers import Integral, Real +from warnings import warn + +import numpy as np +from scipy.sparse import csgraph, issparse + +from ...base import BaseEstimator, ClusterMixin, _fit_context +from ...metrics import pairwise_distances +from ...metrics._dist_metrics import DistanceMetric +from ...metrics.pairwise import _VALID_METRICS +from ...neighbors import BallTree, KDTree, NearestNeighbors +from ...utils._param_validation import Interval, StrOptions +from ...utils.validation import ( + _allclose_dense_sparse, + _assert_all_finite, + validate_data, +) +from ._linkage import ( + MST_edge_dtype, + make_single_linkage, + mst_from_data_matrix, + mst_from_mutual_reachability, +) +from ._reachability import mutual_reachability_graph +from ._tree import HIERARCHY_dtype, labelling_at_cut, tree_to_labels + +FAST_METRICS = set(KDTree.valid_metrics + BallTree.valid_metrics) + +# Encodings are arbitrary but must be strictly negative. +# The current encodings are chosen as extensions to the -1 noise label. +# Avoided enums so that the end user only deals with simple labels. +_OUTLIER_ENCODING: dict = { + "infinite": { + "label": -2, + # The probability could also be 1, since infinite points are certainly + # infinite outliers, however 0 is convention from the HDBSCAN library + # implementation. + "prob": 0, + }, + "missing": { + "label": -3, + # A nan probability is chosen to emphasize the fact that the + # corresponding data was not considered in the clustering problem. + "prob": np.nan, + }, +} + + +def _brute_mst(mutual_reachability, min_samples): + """ + Builds a minimum spanning tree (MST) from the provided mutual-reachability + values. This function dispatches to a custom Cython implementation for + dense arrays, and `scipy.sparse.csgraph.minimum_spanning_tree` for sparse + arrays/matrices. + + Parameters + ---------- + mututal_reachability_graph: {ndarray, sparse matrix} of shape \ + (n_samples, n_samples) + Weighted adjacency matrix of the mutual reachability graph. + + min_samples : int, default=None + The number of samples in a neighborhood for a point + to be considered as a core point. This includes the point itself. 
+ + Returns + ------- + mst : ndarray of shape (n_samples - 1,), dtype=MST_edge_dtype + The MST representation of the mutual-reachability graph. The MST is + represented as a collection of edges. + """ + if not issparse(mutual_reachability): + return mst_from_mutual_reachability(mutual_reachability) + + # Check if the mutual reachability matrix has any rows which have + # less than `min_samples` non-zero elements. + indptr = mutual_reachability.indptr + num_points = mutual_reachability.shape[0] + if any((indptr[i + 1] - indptr[i]) < min_samples for i in range(num_points)): + raise ValueError( + f"There exist points with fewer than {min_samples} neighbors. Ensure" + " your distance matrix has non-zero values for at least" + f" `min_samples`={min_samples} neighbors for each point (i.e. K-nn" + " graph), or specify a `max_distance` in `metric_params` to use when" + " distances are missing." + ) + # Check connected component on mutual reachability. + # If more than one connected component is present, + # it means that the graph is disconnected. + n_components = csgraph.connected_components( + mutual_reachability, directed=False, return_labels=False + ) + if n_components > 1: + raise ValueError( + f"Sparse mutual reachability matrix has {n_components} connected" + " components. HDBSCAN cannot be performed on a disconnected graph. Ensure" + " that the sparse distance matrix has only one connected component." + ) + + # Compute the minimum spanning tree for the sparse graph + sparse_min_spanning_tree = csgraph.minimum_spanning_tree(mutual_reachability) + rows, cols = sparse_min_spanning_tree.nonzero() + mst = np.rec.fromarrays( + [rows, cols, sparse_min_spanning_tree.data], + dtype=MST_edge_dtype, + ) + return mst + + +def _process_mst(min_spanning_tree): + """ + Builds a single-linkage tree (SLT) from the provided minimum spanning tree + (MST). The MST is first sorted then processed by a custom Cython routine. + + Parameters + ---------- + min_spanning_tree : ndarray of shape (n_samples - 1,), dtype=MST_edge_dtype + The MST representation of the mutual-reachability graph. The MST is + represented as a collection of edges. + + Returns + ------- + single_linkage : ndarray of shape (n_samples - 1,), dtype=HIERARCHY_dtype + The single-linkage tree (dendrogram) built from the MST. + """ + # Sort edges of the min_spanning_tree by weight + row_order = np.argsort(min_spanning_tree["distance"]) + min_spanning_tree = min_spanning_tree[row_order] + # Convert edge list into standard hierarchical clustering format + return make_single_linkage(min_spanning_tree) + + +def _hdbscan_brute( + X, + min_samples=5, + alpha=None, + metric="euclidean", + n_jobs=None, + copy=False, + **metric_params, +): + """ + Builds a single-linkage tree (SLT) from the input data `X`. If + `metric="precomputed"` then `X` must be a symmetric array of distances. + Otherwise, the pairwise distances are calculated directly and passed to + `mutual_reachability_graph`. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) or (n_samples, n_samples) + Either the raw data from which to compute the pairwise distances, + or the precomputed distances. + + min_samples : int, default=None + The number of samples in a neighborhood for a point + to be considered as a core point. This includes the point itself. + + alpha : float, default=1.0 + A distance scaling parameter as used in robust single linkage.
+ + metric : str or callable, default='euclidean' + The metric to use when calculating distance between instances in a + feature array. + + - If metric is a string or callable, it must be one of + the options allowed by :func:`~sklearn.metrics.pairwise_distances` + for its metric parameter. + + - If metric is "precomputed", X is assumed to be a distance matrix and + must be square. + + n_jobs : int, default=None + The number of jobs to use for computing the pairwise distances. This + works by breaking down the pairwise matrix into n_jobs even slices and + computing them in parallel. This parameter is passed directly to + :func:`~sklearn.metrics.pairwise_distances`. + + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + copy : bool, default=False + If `copy=True` then any time an in-place modifications would be made + that would overwrite `X`, a copy will first be made, guaranteeing that + the original data will be unchanged. Currently, it only applies when + `metric="precomputed"`, when passing a dense array or a CSR sparse + array/matrix. + + metric_params : dict, default=None + Arguments passed to the distance metric. + + Returns + ------- + single_linkage : ndarray of shape (n_samples - 1,), dtype=HIERARCHY_dtype + The single-linkage tree tree (dendrogram) built from the MST. + """ + if metric == "precomputed": + if X.shape[0] != X.shape[1]: + raise ValueError( + "The precomputed distance matrix is expected to be symmetric, however" + f" it has shape {X.shape}. Please verify that the" + " distance matrix was constructed correctly." + ) + if not _allclose_dense_sparse(X, X.T): + raise ValueError( + "The precomputed distance matrix is expected to be symmetric, however" + " its values appear to be asymmetric. Please verify that the distance" + " matrix was constructed correctly." + ) + + distance_matrix = X.copy() if copy else X + else: + distance_matrix = pairwise_distances( + X, metric=metric, n_jobs=n_jobs, **metric_params + ) + distance_matrix /= alpha + + max_distance = metric_params.get("max_distance", 0.0) + if issparse(distance_matrix) and distance_matrix.format != "csr": + # we need CSR format to avoid a conversion in `_brute_mst` when calling + # `csgraph.connected_components` + distance_matrix = distance_matrix.tocsr() + + # Note that `distance_matrix` is manipulated in-place, however we do not + # need it for anything else past this point, hence the operation is safe. + mutual_reachability_ = mutual_reachability_graph( + distance_matrix, min_samples=min_samples, max_distance=max_distance + ) + min_spanning_tree = _brute_mst(mutual_reachability_, min_samples=min_samples) + # Warn if the MST couldn't be constructed around the missing distances + if np.isinf(min_spanning_tree["distance"]).any(): + warn( + ( + "The minimum spanning tree contains edge weights with value " + "infinity. Potentially, you are missing too many distances " + "in the initial distance matrix for the given neighborhood " + "size." + ), + UserWarning, + ) + return _process_mst(min_spanning_tree) + + +def _hdbscan_prims( + X, + algo, + min_samples=5, + alpha=1.0, + metric="euclidean", + leaf_size=40, + n_jobs=None, + **metric_params, +): + """ + Builds a single-linkage tree (SLT) from the input data `X`. If + `metric="precomputed"` then `X` must be a symmetric array of distances. + Otherwise, the pairwise distances are calculated directly and passed to + `mutual_reachability_graph`. 
+ + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + The raw data. + + min_samples : int, default=None + The number of samples in a neighborhood for a point + to be considered as a core point. This includes the point itself. + + alpha : float, default=1.0 + A distance scaling parameter as used in robust single linkage. + + metric : str or callable, default='euclidean' + The metric to use when calculating distance between instances in a + feature array. `metric` must be one of the options allowed by + :func:`~sklearn.metrics.pairwise_distances` for its metric + parameter. + + n_jobs : int, default=None + The number of jobs to use for computing the pairwise distances. This + works by breaking down the pairwise matrix into n_jobs even slices and + computing them in parallel. This parameter is passed directly to + :func:`~sklearn.metrics.pairwise_distances`. + + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + copy : bool, default=False + If `copy=True` then any time an in-place modifications would be made + that would overwrite `X`, a copy will first be made, guaranteeing that + the original data will be unchanged. Currently, it only applies when + `metric="precomputed"`, when passing a dense array or a CSR sparse + array/matrix. + + metric_params : dict, default=None + Arguments passed to the distance metric. + + Returns + ------- + single_linkage : ndarray of shape (n_samples - 1,), dtype=HIERARCHY_dtype + The single-linkage tree tree (dendrogram) built from the MST. + """ + # The Cython routines used require contiguous arrays + X = np.asarray(X, order="C") + + # Get distance to kth nearest neighbour + nbrs = NearestNeighbors( + n_neighbors=min_samples, + algorithm=algo, + leaf_size=leaf_size, + metric=metric, + metric_params=metric_params, + n_jobs=n_jobs, + p=None, + ).fit(X) + + neighbors_distances, _ = nbrs.kneighbors(X, min_samples, return_distance=True) + core_distances = np.ascontiguousarray(neighbors_distances[:, -1]) + dist_metric = DistanceMetric.get_metric(metric, **metric_params) + + # Mutual reachability distance is implicit in mst_from_data_matrix + min_spanning_tree = mst_from_data_matrix(X, core_distances, dist_metric, alpha) + return _process_mst(min_spanning_tree) + + +def remap_single_linkage_tree(tree, internal_to_raw, non_finite): + """ + Takes an internal single_linkage_tree structure and adds back in a set of points + that were initially detected as non-finite and returns that new tree. + These points will all be merged into the final node at np.inf distance and + considered noise points. + + Parameters + ---------- + tree : ndarray of shape (n_samples - 1,), dtype=HIERARCHY_dtype + The single-linkage tree tree (dendrogram) built from the MST. 
+ internal_to_raw: dict + A mapping from internal integer index to the raw integer index + non_finite : ndarray + Boolean array of which entries in the raw data are non-finite + """ + finite_count = len(internal_to_raw) + + outlier_count = len(non_finite) + for i, _ in enumerate(tree): + left = tree[i]["left_node"] + right = tree[i]["right_node"] + + if left < finite_count: + tree[i]["left_node"] = internal_to_raw[left] + else: + tree[i]["left_node"] = left + outlier_count + if right < finite_count: + tree[i]["right_node"] = internal_to_raw[right] + else: + tree[i]["right_node"] = right + outlier_count + + outlier_tree = np.zeros(len(non_finite), dtype=HIERARCHY_dtype) + last_cluster_id = max( + tree[tree.shape[0] - 1]["left_node"], tree[tree.shape[0] - 1]["right_node"] + ) + last_cluster_size = tree[tree.shape[0] - 1]["cluster_size"] + for i, outlier in enumerate(non_finite): + outlier_tree[i] = (outlier, last_cluster_id + 1, np.inf, last_cluster_size + 1) + last_cluster_id += 1 + last_cluster_size += 1 + tree = np.concatenate([tree, outlier_tree]) + return tree + + +def _get_finite_row_indices(matrix): + """ + Returns the indices of the purely finite rows of a + sparse matrix or dense ndarray + """ + if issparse(matrix): + row_indices = np.array( + [i for i, row in enumerate(matrix.tolil().data) if np.all(np.isfinite(row))] + ) + else: + (row_indices,) = np.isfinite(matrix.sum(axis=1)).nonzero() + return row_indices + + +class HDBSCAN(ClusterMixin, BaseEstimator): + """Cluster data using hierarchical density-based clustering. + + HDBSCAN - Hierarchical Density-Based Spatial Clustering of Applications + with Noise. Performs :class:`~sklearn.cluster.DBSCAN` over varying epsilon + values and integrates the result to find a clustering that gives the best + stability over epsilon. + This allows HDBSCAN to find clusters of varying densities (unlike + :class:`~sklearn.cluster.DBSCAN`), and be more robust to parameter selection. + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.3 + + Parameters + ---------- + min_cluster_size : int, default=5 + The minimum number of samples in a group for that group to be + considered a cluster; groupings smaller than this size will be left + as noise. + + min_samples : int, default=None + The parameter `k` used to calculate the distance between a point + `x_p` and its k-th nearest neighbor. + When `None`, defaults to `min_cluster_size`. + + cluster_selection_epsilon : float, default=0.0 + A distance threshold. Clusters below this value will be merged. + See [5]_ for more information. + + max_cluster_size : int, default=None + A limit to the size of clusters returned by the `"eom"` cluster + selection algorithm. There is no limit when `max_cluster_size=None`. + Has no effect if `cluster_selection_method="leaf"`. + + metric : str or callable, default='euclidean' + The metric to use when calculating distance between instances in a + feature array. + + - If metric is a string or callable, it must be one of + the options allowed by :func:`~sklearn.metrics.pairwise_distances` + for its metric parameter. + + - If metric is "precomputed", X is assumed to be a distance matrix and + must be square. + + metric_params : dict, default=None + Arguments passed to the distance metric. + + alpha : float, default=1.0 + A distance scaling parameter as used in robust single linkage. + See [3]_ for more information. 
+ + algorithm : {"auto", "brute", "kd_tree", "ball_tree"}, default="auto" + Exactly which algorithm to use for computing core distances; By default + this is set to `"auto"` which attempts to use a + :class:`~sklearn.neighbors.KDTree` tree if possible, otherwise it uses + a :class:`~sklearn.neighbors.BallTree` tree. Both `"kd_tree"` and + `"ball_tree"` algorithms use the + :class:`~sklearn.neighbors.NearestNeighbors` estimator. + + If the `X` passed during `fit` is sparse or `metric` is invalid for + both :class:`~sklearn.neighbors.KDTree` and + :class:`~sklearn.neighbors.BallTree`, then it resolves to use the + `"brute"` algorithm. + + leaf_size : int, default=40 + Leaf size for trees responsible for fast nearest neighbour queries when + a KDTree or a BallTree are used as core-distance algorithms. A large + dataset size and small `leaf_size` may induce excessive memory usage. + If you are running out of memory consider increasing the `leaf_size` + parameter. Ignored for `algorithm="brute"`. + + n_jobs : int, default=None + Number of jobs to run in parallel to calculate distances. + `None` means 1 unless in a :obj:`joblib.parallel_backend` context. + `-1` means using all processors. See :term:`Glossary ` + for more details. + + cluster_selection_method : {"eom", "leaf"}, default="eom" + The method used to select clusters from the condensed tree. The + standard approach for HDBSCAN* is to use an Excess of Mass (`"eom"`) + algorithm to find the most persistent clusters. Alternatively you can + instead select the clusters at the leaves of the tree -- this provides + the most fine grained and homogeneous clusters. + + allow_single_cluster : bool, default=False + By default HDBSCAN* will not produce a single cluster, setting this + to True will override this and allow single cluster results in + the case that you feel this is a valid result for your dataset. + + store_centers : str, default=None + Which, if any, cluster centers to compute and store. The options are: + + - `None` which does not compute nor store any centers. + - `"centroid"` which calculates the center by taking the weighted + average of their positions. Note that the algorithm uses the + euclidean metric and does not guarantee that the output will be + an observed data point. + - `"medoid"` which calculates the center by taking the point in the + fitted data which minimizes the distance to all other points in + the cluster. This is slower than "centroid" since it requires + computing additional pairwise distances between points of the + same cluster but guarantees the output is an observed data point. + The medoid is also well-defined for arbitrary metrics, and does not + depend on a euclidean metric. + - `"both"` which computes and stores both forms of centers. + + copy : bool, default=False + If `copy=True` then any time an in-place modifications would be made + that would overwrite data passed to :term:`fit`, a copy will first be + made, guaranteeing that the original data will be unchanged. + Currently, it only applies when `metric="precomputed"`, when passing + a dense array or a CSR sparse matrix and when `algorithm="brute"`. + + Attributes + ---------- + labels_ : ndarray of shape (n_samples,) + Cluster labels for each point in the dataset given to :term:`fit`. + Outliers are labeled as follows: + + - Noisy samples are given the label -1. + - Samples with infinite elements (+/- np.inf) are given the label -2. + - Samples with missing data are given the label -3, even if they + also have infinite elements. 
+ + probabilities_ : ndarray of shape (n_samples,) + The strength with which each sample is a member of its assigned + cluster. + + - Clustered samples have probabilities proportional to the degree that + they persist as part of the cluster. + - Noisy samples have probability zero. + - Samples with infinite elements (+/- np.inf) have probability 0. + - Samples with missing data have probability `np.nan`. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + centroids_ : ndarray of shape (n_clusters, n_features) + A collection containing the centroid of each cluster calculated under + the standard euclidean metric. The centroids may fall "outside" their + respective clusters if the clusters themselves are non-convex. + + Note that `n_clusters` only counts non-outlier clusters. That is to + say, the `-1, -2, -3` labels for the outlier clusters are excluded. + + medoids_ : ndarray of shape (n_clusters, n_features) + A collection containing the medoid of each cluster calculated under + the whichever metric was passed to the `metric` parameter. The + medoids are points in the original cluster which minimize the average + distance to all other points in that cluster under the chosen metric. + These can be thought of as the result of projecting the `metric`-based + centroid back onto the cluster. + + Note that `n_clusters` only counts non-outlier clusters. That is to + say, the `-1, -2, -3` labels for the outlier clusters are excluded. + + See Also + -------- + DBSCAN : Density-Based Spatial Clustering of Applications + with Noise. + OPTICS : Ordering Points To Identify the Clustering Structure. + Birch : Memory-efficient, online-learning algorithm. + + Notes + ----- + The `min_samples` parameter includes the point itself, whereas the implementation in + `scikit-learn-contrib/hdbscan `_ + does not. To get the same results in both versions, the value of `min_samples` here + must be 1 greater than the value used in `scikit-learn-contrib/hdbscan + `_. + + References + ---------- + + .. [1] :doi:`Campello, R. J., Moulavi, D., & Sander, J. Density-based clustering + based on hierarchical density estimates. + <10.1007/978-3-642-37456-2_14>` + .. [2] :doi:`Campello, R. J., Moulavi, D., Zimek, A., & Sander, J. + Hierarchical density estimates for data clustering, visualization, + and outlier detection.<10.1145/2733381>` + + .. [3] `Chaudhuri, K., & Dasgupta, S. Rates of convergence for the + cluster tree. + `_ + + .. [4] `Moulavi, D., Jaskowiak, P.A., Campello, R.J., Zimek, A. and + Sander, J. Density-Based Clustering Validation. + `_ + + .. [5] :arxiv:`Malzer, C., & Baum, M. "A Hybrid Approach To Hierarchical + Density-based Cluster Selection."<1911.02282>`. 
+ + Examples + -------- + >>> import numpy as np + >>> from sklearn.cluster import HDBSCAN + >>> from sklearn.datasets import load_digits + >>> X, _ = load_digits(return_X_y=True) + >>> hdb = HDBSCAN(min_cluster_size=20) + >>> hdb.fit(X) + HDBSCAN(min_cluster_size=20) + >>> hdb.labels_.shape == (X.shape[0],) + True + >>> np.unique(hdb.labels_).tolist() + [-1, 0, 1, 2, 3, 4, 5, 6, 7] + """ + + _parameter_constraints = { + "min_cluster_size": [Interval(Integral, left=2, right=None, closed="left")], + "min_samples": [Interval(Integral, left=1, right=None, closed="left"), None], + "cluster_selection_epsilon": [ + Interval(Real, left=0, right=None, closed="left") + ], + "max_cluster_size": [ + None, + Interval(Integral, left=1, right=None, closed="left"), + ], + "metric": [ + StrOptions(FAST_METRICS | set(_VALID_METRICS) | {"precomputed"}), + callable, + ], + "metric_params": [dict, None], + "alpha": [Interval(Real, left=0, right=None, closed="neither")], + "algorithm": [StrOptions({"auto", "brute", "kd_tree", "ball_tree"})], + "leaf_size": [Interval(Integral, left=1, right=None, closed="left")], + "n_jobs": [Integral, None], + "cluster_selection_method": [StrOptions({"eom", "leaf"})], + "allow_single_cluster": ["boolean"], + "store_centers": [None, StrOptions({"centroid", "medoid", "both"})], + "copy": ["boolean"], + } + + def __init__( + self, + min_cluster_size=5, + min_samples=None, + cluster_selection_epsilon=0.0, + max_cluster_size=None, + metric="euclidean", + metric_params=None, + alpha=1.0, + algorithm="auto", + leaf_size=40, + n_jobs=None, + cluster_selection_method="eom", + allow_single_cluster=False, + store_centers=None, + copy=False, + ): + self.min_cluster_size = min_cluster_size + self.min_samples = min_samples + self.alpha = alpha + self.max_cluster_size = max_cluster_size + self.cluster_selection_epsilon = cluster_selection_epsilon + self.metric = metric + self.metric_params = metric_params + self.algorithm = algorithm + self.leaf_size = leaf_size + self.n_jobs = n_jobs + self.cluster_selection_method = cluster_selection_method + self.allow_single_cluster = allow_single_cluster + self.store_centers = store_centers + self.copy = copy + + @_fit_context( + # HDBSCAN.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y=None): + """Find clusters based on hierarchical density-based clustering. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features), or \ + ndarray of shape (n_samples, n_samples) + A feature array, or array of distances between samples if + `metric='precomputed'`. + + y : None + Ignored. + + Returns + ------- + self : object + Returns self. + """ + if self.metric == "precomputed" and self.store_centers is not None: + raise ValueError( + "Cannot store centers when using a precomputed distance matrix." + ) + + self._metric_params = self.metric_params or {} + if self.metric != "precomputed": + # Non-precomputed matrices may contain non-finite values. + X = validate_data( + self, + X, + accept_sparse=["csr", "lil"], + ensure_all_finite=False, + dtype=np.float64, + ) + self._raw_data = X + all_finite = True + try: + _assert_all_finite(X.data if issparse(X) else X) + except ValueError: + all_finite = False + + if not all_finite: + # Pass only the purely finite indices into hdbscan + # We will later assign all non-finite points their + # corresponding labels, as specified in `_OUTLIER_ENCODING` + + # Reduce X to make the checks for missing/outlier samples more + # convenient. 
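+ # Summing along axis=1 yields one value per sample that is NaN whenever the + # row contains a NaN and non-finite whenever it contains +/-inf, so both + # index arrays below can be read off a single reduced vector. Rows holding + # NaN as well as infinities reduce to NaN and are therefore flagged as + # missing, matching the precedence documented for `labels_`.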
+ reduced_X = X.sum(axis=1) + + # Samples with missing data are denoted by the presence of + # `np.nan` + missing_index = np.isnan(reduced_X).nonzero()[0] + + # Outlier samples are denoted by the presence of `np.inf` + infinite_index = np.isinf(reduced_X).nonzero()[0] + + # Continue with only finite samples + finite_index = _get_finite_row_indices(X) + internal_to_raw = {x: y for x, y in enumerate(finite_index)} + X = X[finite_index] + elif issparse(X): + # Handle sparse precomputed distance matrices separately + X = validate_data( + self, + X, + accept_sparse=["csr", "lil"], + dtype=np.float64, + force_writeable=True, + ) + else: + # Only non-sparse, precomputed distance matrices are handled here + # and thereby allowed to contain numpy.inf for missing distances + + # Perform data validation after removing infinite values (numpy.inf) + # from the given distance matrix. + X = validate_data( + self, X, ensure_all_finite=False, dtype=np.float64, force_writeable=True + ) + if np.isnan(X).any(): + # TODO: Support np.nan in Cython implementation for precomputed + # dense HDBSCAN + raise ValueError("np.nan values found in precomputed-dense") + if X.shape[0] == 1: + raise ValueError("n_samples=1 while HDBSCAN requires more than one sample") + self._min_samples = ( + self.min_cluster_size if self.min_samples is None else self.min_samples + ) + + if self._min_samples > X.shape[0]: + raise ValueError( + f"min_samples ({self._min_samples}) must be at most the number of" + f" samples in X ({X.shape[0]})" + ) + + mst_func = None + kwargs = dict( + X=X, + min_samples=self._min_samples, + alpha=self.alpha, + metric=self.metric, + n_jobs=self.n_jobs, + **self._metric_params, + ) + if self.algorithm == "kd_tree" and self.metric not in KDTree.valid_metrics: + raise ValueError( + f"{self.metric} is not a valid metric for a KDTree-based algorithm." + " Please select a different metric." + ) + elif ( + self.algorithm == "ball_tree" and self.metric not in BallTree.valid_metrics + ): + raise ValueError( + f"{self.metric} is not a valid metric for a BallTree-based algorithm." + " Please select a different metric." + ) + + if self.algorithm != "auto": + if ( + self.metric != "precomputed" + and issparse(X) + and self.algorithm != "brute" + ): + raise ValueError("Sparse data matrices only support algorithm `brute`.") + + if self.algorithm == "brute": + mst_func = _hdbscan_brute + kwargs["copy"] = self.copy + elif self.algorithm == "kd_tree": + mst_func = _hdbscan_prims + kwargs["algo"] = "kd_tree" + kwargs["leaf_size"] = self.leaf_size + else: + mst_func = _hdbscan_prims + kwargs["algo"] = "ball_tree" + kwargs["leaf_size"] = self.leaf_size + else: + if issparse(X) or self.metric not in FAST_METRICS: + # We can't do much with sparse matrices ... 
+ mst_func = _hdbscan_brute + kwargs["copy"] = self.copy + elif self.metric in KDTree.valid_metrics: + # TODO: Benchmark KD vs Ball Tree efficiency + mst_func = _hdbscan_prims + kwargs["algo"] = "kd_tree" + kwargs["leaf_size"] = self.leaf_size + else: + # Metric is a valid BallTree metric + mst_func = _hdbscan_prims + kwargs["algo"] = "ball_tree" + kwargs["leaf_size"] = self.leaf_size + + self._single_linkage_tree_ = mst_func(**kwargs) + + self.labels_, self.probabilities_ = tree_to_labels( + self._single_linkage_tree_, + self.min_cluster_size, + self.cluster_selection_method, + self.allow_single_cluster, + self.cluster_selection_epsilon, + self.max_cluster_size, + ) + if self.metric != "precomputed" and not all_finite: + # Remap indices to align with original data in the case of + # non-finite entries. Samples with np.inf are mapped to -2 and + # those with np.nan are mapped to -3. + self._single_linkage_tree_ = remap_single_linkage_tree( + self._single_linkage_tree_, + internal_to_raw, + # There may be overlap for points w/ both `np.inf` and `np.nan` + non_finite=set(np.hstack([infinite_index, missing_index])), + ) + new_labels = np.empty(self._raw_data.shape[0], dtype=np.int32) + new_labels[finite_index] = self.labels_ + new_labels[infinite_index] = _OUTLIER_ENCODING["infinite"]["label"] + new_labels[missing_index] = _OUTLIER_ENCODING["missing"]["label"] + self.labels_ = new_labels + + new_probabilities = np.zeros(self._raw_data.shape[0], dtype=np.float64) + new_probabilities[finite_index] = self.probabilities_ + # Infinite outliers have probability 0 by convention, though this + # is arbitrary. + new_probabilities[infinite_index] = _OUTLIER_ENCODING["infinite"]["prob"] + new_probabilities[missing_index] = _OUTLIER_ENCODING["missing"]["prob"] + self.probabilities_ = new_probabilities + + if self.store_centers: + self._weighted_cluster_center(X) + return self + + def fit_predict(self, X, y=None): + """Cluster X and return the associated cluster labels. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features), or \ + ndarray of shape (n_samples, n_samples) + A feature array, or array of distances between samples if + `metric='precomputed'`. + + y : None + Ignored. + + Returns + ------- + y : ndarray of shape (n_samples,) + Cluster labels. + """ + self.fit(X) + return self.labels_ + + def _weighted_cluster_center(self, X): + """Calculate and store the centroids/medoids of each cluster. + + This requires `X` to be a raw feature array, not precomputed + distances. Rather than return outputs directly, this helper method + instead stores them in the `self.{centroids, medoids}_` attributes. + The choice for which attributes are calculated and stored is mediated + by the value of `self.store_centers`. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + The feature array that the estimator was fit with. + + """ + # Number of non-noise clusters + n_clusters = len(set(self.labels_) - {-1, -2}) + mask = np.empty((X.shape[0],), dtype=np.bool_) + make_centroids = self.store_centers in ("centroid", "both") + make_medoids = self.store_centers in ("medoid", "both") + + if make_centroids: + self.centroids_ = np.empty((n_clusters, X.shape[1]), dtype=np.float64) + if make_medoids: + self.medoids_ = np.empty((n_clusters, X.shape[1]), dtype=np.float64) + + # Need to handle iteratively since each cluster may have a different + # number of samples, hence we can't create a homogeneous 3D array.
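+ # In the loop below the centroid is the probability-weighted mean of the + # cluster members. For the medoid, each column of the intra-cluster distance + # matrix is scaled by that member's probability before the row sums are + # minimized, so distances to low-probability members count for less when + # picking the representative point.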
+        for idx in range(n_clusters):
+            mask = self.labels_ == idx
+            data = X[mask]
+            strength = self.probabilities_[mask]
+            if make_centroids:
+                self.centroids_[idx] = np.average(data, weights=strength, axis=0)
+            if make_medoids:
+                # TODO: Implement weighted argmin PWD backend
+                dist_mat = pairwise_distances(
+                    data, metric=self.metric, **self._metric_params
+                )
+                dist_mat = dist_mat * strength
+                medoid_index = np.argmin(dist_mat.sum(axis=1))
+                self.medoids_[idx] = data[medoid_index]
+        return
+
+    def dbscan_clustering(self, cut_distance, min_cluster_size=5):
+        """Return clustering given by DBSCAN without border points.
+
+        Return clustering that would be equivalent to running DBSCAN* for a
+        particular cut_distance (or epsilon). DBSCAN* can be thought of as
+        DBSCAN without the border points. As such, these results may differ
+        slightly from `cluster.DBSCAN` due to the difference in implementation
+        over the non-core points.
+
+        This can also be thought of as a flat clustering derived from a
+        constant-height cut through the single linkage tree.
+
+        This represents the result of selecting a cut value for robust single linkage
+        clustering. The `min_cluster_size` allows the flat clustering to declare noise
+        points (and clusters smaller than `min_cluster_size`).
+
+        Parameters
+        ----------
+        cut_distance : float
+            The mutual reachability distance cut value to use to generate a
+            flat clustering.
+
+        min_cluster_size : int, default=5
+            Clusters smaller than this value will be called 'noise' and remain
+            unclustered in the resulting flat clustering.
+
+        Returns
+        -------
+        labels : ndarray of shape (n_samples,)
+            An array of cluster labels, one per datapoint.
+            Outliers are labeled as follows:
+
+            - Noisy samples are given the label -1.
+            - Samples with infinite elements (+/- np.inf) are given the label -2.
+            - Samples with missing data are given the label -3, even if they
+              also have infinite elements.
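+
+        Examples
+        --------
+        A minimal usage sketch; the dataset, the cut distance and the resulting
+        labels are illustrative only (`make_blobs` is just a convenient data
+        source):
+
+        >>> from sklearn.cluster import HDBSCAN
+        >>> from sklearn.datasets import make_blobs
+        >>> X, _ = make_blobs(n_samples=50, centers=2, random_state=0)
+        >>> hdb = HDBSCAN(min_cluster_size=5).fit(X)
+        >>> flat_labels = hdb.dbscan_clustering(cut_distance=1.0)
+        >>> flat_labels.shape
+        (50,)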
+ """ + labels = labelling_at_cut( + self._single_linkage_tree_, cut_distance, min_cluster_size + ) + # Infer indices from labels generated during `fit` + infinite_index = self.labels_ == _OUTLIER_ENCODING["infinite"]["label"] + missing_index = self.labels_ == _OUTLIER_ENCODING["missing"]["label"] + + # Overwrite infinite/missing outlier samples (otherwise simple noise) + labels[infinite_index] = _OUTLIER_ENCODING["infinite"]["label"] + labels[missing_index] = _OUTLIER_ENCODING["missing"]["label"] + return labels + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + tags.input_tags.allow_nan = self.metric != "precomputed" + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/meson.build b/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..8d880b39a4db58dffa1b282c3633c873755f5245 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/meson.build @@ -0,0 +1,15 @@ +cluster_hdbscan_extension_metadata = { + '_linkage': {'sources': [cython_gen.process('_linkage.pyx'), metrics_cython_tree]}, + '_reachability': {'sources': [cython_gen.process('_reachability.pyx')]}, + '_tree': {'sources': [cython_gen.process('_tree.pyx')]} +} + +foreach ext_name, ext_dict : cluster_hdbscan_extension_metadata + py.extension_module( + ext_name, + ext_dict.get('sources'), + dependencies: [np_dep], + subdir: 'sklearn/cluster/_hdbscan', + install: true + ) +endforeach diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/tests/test_reachibility.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/tests/test_reachibility.py new file mode 100644 index 0000000000000000000000000000000000000000..a336e6be6116d1345a1d4eb0448c2e2f58cd8ecd --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/tests/test_reachibility.py @@ -0,0 +1,63 @@ +import numpy as np +import pytest + +from sklearn.cluster._hdbscan._reachability import mutual_reachability_graph +from sklearn.utils._testing import ( + _convert_container, + assert_allclose, +) + + +def test_mutual_reachability_graph_error_sparse_format(): + """Check that we raise an error if the sparse format is not CSR.""" + rng = np.random.RandomState(0) + X = rng.randn(10, 10) + X = X.T @ X + np.fill_diagonal(X, 0.0) + X = _convert_container(X, "sparse_csc") + + err_msg = "Only sparse CSR matrices are supported" + with pytest.raises(ValueError, match=err_msg): + mutual_reachability_graph(X) + + +@pytest.mark.parametrize("array_type", ["array", "sparse_csr"]) +def test_mutual_reachability_graph_inplace(array_type): + """Check that the operation is happening inplace.""" + rng = np.random.RandomState(0) + X = rng.randn(10, 10) + X = X.T @ X + np.fill_diagonal(X, 0.0) + X = _convert_container(X, array_type) + + mr_graph = mutual_reachability_graph(X) + + assert id(mr_graph) == id(X) + + +def test_mutual_reachability_graph_equivalence_dense_sparse(): + """Check that we get the same results for dense and sparse implementation.""" + rng = np.random.RandomState(0) + X = rng.randn(5, 5) + X_dense = X.T @ X + X_sparse = _convert_container(X_dense, "sparse_csr") + + 
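+    # Illustrative sketch (not used by the assertions below): mutual
+    # reachability is commonly defined as
+    #   mr(a, b) = max(core_k(a), core_k(b), d(a, b)),
+    # where core_k(x) is the distance from x to its k-th nearest neighbour.
+    # The helper assumes a symmetric dense distance matrix with a zero diagonal;
+    # the exact neighbour-counting convention of `mutual_reachability_graph`
+    # is an implementation detail that is not asserted here.
+    def _mutual_reachability_sketch(pairwise_dist, min_samples=3):
+        core = np.sort(pairwise_dist, axis=1)[:, min_samples - 1]
+        return np.maximum(np.maximum(core[:, None], core[None, :]), pairwise_dist)
+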
mr_graph_dense = mutual_reachability_graph(X_dense, min_samples=3) + mr_graph_sparse = mutual_reachability_graph(X_sparse, min_samples=3) + + assert_allclose(mr_graph_dense, mr_graph_sparse.toarray()) + + +@pytest.mark.parametrize("array_type", ["array", "sparse_csr"]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_mutual_reachability_graph_preserves_dtype(array_type, dtype): + """Check that the computation preserve dtype thanks to fused types.""" + rng = np.random.RandomState(0) + X = rng.randn(10, 10) + X = (X.T @ X).astype(dtype) + np.fill_diagonal(X, 0.0) + X = _convert_container(X, array_type) + + assert X.dtype == dtype + mr_graph = mutual_reachability_graph(X) + assert mr_graph.dtype == dtype diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_hierarchical_fast.pxd b/.venv/lib/python3.12/site-packages/sklearn/cluster/_hierarchical_fast.pxd new file mode 100644 index 0000000000000000000000000000000000000000..a10f8c12f34402c872ccc3bd7c14266dcc9b5e7a --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_hierarchical_fast.pxd @@ -0,0 +1,9 @@ +from ..utils._typedefs cimport intp_t + +cdef class UnionFind: + cdef intp_t next_label + cdef intp_t[:] parent + cdef intp_t[:] size + + cdef void union(self, intp_t m, intp_t n) noexcept + cdef intp_t fast_find(self, intp_t n) noexcept diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_hierarchical_fast.pyx b/.venv/lib/python3.12/site-packages/sklearn/cluster/_hierarchical_fast.pyx new file mode 100644 index 0000000000000000000000000000000000000000..36ae0ab0d241432df9f5833901580dc88c30d925 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_hierarchical_fast.pyx @@ -0,0 +1,507 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numpy as np +cimport cython + +from ..metrics._dist_metrics cimport DistanceMetric64 +from ..utils._fast_dict cimport IntFloatDict +from ..utils._typedefs cimport float64_t, intp_t, uint8_t + +# C++ +from cython.operator cimport dereference as deref, preincrement as inc +from libcpp.map cimport map as cpp_map +from libc.math cimport fmax, INFINITY + + +############################################################################### +# Utilities for computing the ward momentum + +def compute_ward_dist( + const float64_t[::1] m_1, + const float64_t[:, ::1] m_2, + const intp_t[::1] coord_row, + const intp_t[::1] coord_col, + float64_t[::1] res +): + cdef intp_t size_max = coord_row.shape[0] + cdef intp_t n_features = m_2.shape[1] + cdef intp_t i, j, row, col + cdef float64_t pa, n + + for i in range(size_max): + row = coord_row[i] + col = coord_col[i] + n = (m_1[row] * m_1[col]) / (m_1[row] + m_1[col]) + pa = 0. + for j in range(n_features): + pa += (m_2[row, j] / m_1[row] - m_2[col, j] / m_1[col]) ** 2 + res[i] = pa * n + + +############################################################################### +# Utilities for cutting and exploring a hierarchical tree + +def _hc_get_descendent(intp_t node, children, intp_t n_leaves): + """ + Function returning all the descendent leaves of a set of nodes in the tree. + + Parameters + ---------- + node : integer + The node for which we want the descendents. + + children : list of pairs, length n_nodes + The children of each non-leaf node. Values less than `n_samples` refer + to leaves of the tree. A greater value `i` indicates a node with + children `children[i - n_samples]`. + + n_leaves : integer + Number of leaves. 
+ + Returns + ------- + descendent : list of int + """ + ind = [node] + if node < n_leaves: + return ind + descendent = [] + + # It is actually faster to do the accounting of the number of + # elements is the list ourselves: len is a lengthy operation on a + # chained list + cdef intp_t i, n_indices = 1 + + while n_indices: + i = ind.pop() + if i < n_leaves: + descendent.append(i) + n_indices -= 1 + else: + ind.extend(children[i - n_leaves]) + n_indices += 1 + return descendent + + +def hc_get_heads(intp_t[:] parents, copy=True): + """Returns the heads of the forest, as defined by parents. + + Parameters + ---------- + parents : array of integers + The parent structure defining the forest (ensemble of trees) + copy : boolean + If copy is False, the input 'parents' array is modified inplace + + Returns + ------- + heads : array of integers of same shape as parents + The indices in the 'parents' of the tree heads + + """ + cdef intp_t parent, node0, node, size + if copy: + parents = np.copy(parents) + size = parents.size + + # Start from the top of the tree and go down + for node0 in range(size - 1, -1, -1): + node = node0 + parent = parents[node] + while parent != node: + parents[node0] = parent + node = parent + parent = parents[node] + return parents + + +def _get_parents( + nodes, + heads, + const intp_t[:] parents, + uint8_t[::1] not_visited +): + """Returns the heads of the given nodes, as defined by parents. + + Modifies 'heads' and 'not_visited' in-place. + + Parameters + ---------- + nodes : list of integers + The nodes to start from + heads : list of integers + A list to hold the results (modified inplace) + parents : array of integers + The parent structure defining the tree + not_visited + The tree nodes to consider (modified inplace) + + """ + cdef intp_t parent, node + + for node in nodes: + parent = parents[node] + while parent != node: + node = parent + parent = parents[node] + if not_visited[node]: + not_visited[node] = 0 + heads.append(node) + + +############################################################################### +# merge strategies implemented on IntFloatDicts + +# These are used in the hierarchical clustering code, to implement +# merging between two clusters, defined as a dict containing node number +# as keys and edge weights as values. + + +def max_merge( + IntFloatDict a, + IntFloatDict b, + const intp_t[:] mask, + intp_t n_a, + intp_t n_b +): + """Merge two IntFloatDicts with the max strategy: when the same key is + present in the two dicts, the max of the two values is used. + + Parameters + ========== + a, b : IntFloatDict object + The IntFloatDicts to merge + mask : ndarray array of dtype integer and of dimension 1 + a mask for keys to ignore: if not mask[key] the corresponding key + is skipped in the output dictionary + n_a, n_b : float + n_a and n_b are weights for a and b for the merge strategy. + They are not used in the case of a max merge. 
+ + Returns + ======= + out : IntFloatDict object + The IntFloatDict resulting from the merge + """ + cdef IntFloatDict out_obj = IntFloatDict.__new__(IntFloatDict) + cdef cpp_map[intp_t, float64_t].iterator a_it = a.my_map.begin() + cdef cpp_map[intp_t, float64_t].iterator a_end = a.my_map.end() + cdef intp_t key + cdef float64_t value + # First copy a into out + while a_it != a_end: + key = deref(a_it).first + if mask[key]: + out_obj.my_map[key] = deref(a_it).second + inc(a_it) + + # Then merge b into out + cdef cpp_map[intp_t, float64_t].iterator out_it = out_obj.my_map.begin() + cdef cpp_map[intp_t, float64_t].iterator out_end = out_obj.my_map.end() + cdef cpp_map[intp_t, float64_t].iterator b_it = b.my_map.begin() + cdef cpp_map[intp_t, float64_t].iterator b_end = b.my_map.end() + while b_it != b_end: + key = deref(b_it).first + value = deref(b_it).second + if mask[key]: + out_it = out_obj.my_map.find(key) + if out_it == out_end: + # Key not found + out_obj.my_map[key] = value + else: + deref(out_it).second = fmax(deref(out_it).second, value) + inc(b_it) + return out_obj + + +def average_merge( + IntFloatDict a, + IntFloatDict b, + const intp_t[:] mask, + intp_t n_a, + intp_t n_b +): + """Merge two IntFloatDicts with the average strategy: when the + same key is present in the two dicts, the weighted average of the two + values is used. + + Parameters + ========== + a, b : IntFloatDict object + The IntFloatDicts to merge + mask : ndarray array of dtype integer and of dimension 1 + a mask for keys to ignore: if not mask[key] the corresponding key + is skipped in the output dictionary + n_a, n_b : float + n_a and n_b are weights for a and b for the merge strategy. + They are used for a weighted mean. + + Returns + ======= + out : IntFloatDict object + The IntFloatDict resulting from the merge + """ + cdef IntFloatDict out_obj = IntFloatDict.__new__(IntFloatDict) + cdef cpp_map[intp_t, float64_t].iterator a_it = a.my_map.begin() + cdef cpp_map[intp_t, float64_t].iterator a_end = a.my_map.end() + cdef intp_t key + cdef float64_t value + cdef float64_t n_out = (n_a + n_b) + # First copy a into out + while a_it != a_end: + key = deref(a_it).first + if mask[key]: + out_obj.my_map[key] = deref(a_it).second + inc(a_it) + + # Then merge b into out + cdef cpp_map[intp_t, float64_t].iterator out_it = out_obj.my_map.begin() + cdef cpp_map[intp_t, float64_t].iterator out_end = out_obj.my_map.end() + cdef cpp_map[intp_t, float64_t].iterator b_it = b.my_map.begin() + cdef cpp_map[intp_t, float64_t].iterator b_end = b.my_map.end() + while b_it != b_end: + key = deref(b_it).first + value = deref(b_it).second + if mask[key]: + out_it = out_obj.my_map.find(key) + if out_it == out_end: + # Key not found + out_obj.my_map[key] = value + else: + deref(out_it).second = (n_a * deref(out_it).second + + n_b * value) / n_out + inc(b_it) + return out_obj + + +############################################################################### +# An edge object for fast comparisons + +cdef class WeightedEdge: + cdef public intp_t a + cdef public intp_t b + cdef public float64_t weight + + def __init__(self, float64_t weight, intp_t a, intp_t b): + self.weight = weight + self.a = a + self.b = b + + def __richcmp__(self, WeightedEdge other, int op): + """Cython-specific comparison method. 
+ + op is the comparison code:: + < 0 + == 2 + > 4 + <= 1 + != 3 + >= 5 + """ + if op == 0: + return self.weight < other.weight + elif op == 1: + return self.weight <= other.weight + elif op == 2: + return self.weight == other.weight + elif op == 3: + return self.weight != other.weight + elif op == 4: + return self.weight > other.weight + elif op == 5: + return self.weight >= other.weight + + def __repr__(self): + return "%s(weight=%f, a=%i, b=%i)" % (self.__class__.__name__, + self.weight, + self.a, self.b) + + +################################################################################ +# Efficient labelling/conversion of MSTs to single linkage hierarchies + +cdef class UnionFind(object): + + def __init__(self, N): + self.parent = np.full(2 * N - 1, -1., dtype=np.intp, order='C') + self.next_label = N + self.size = np.hstack((np.ones(N, dtype=np.intp), + np.zeros(N - 1, dtype=np.intp))) + + cdef void union(self, intp_t m, intp_t n) noexcept: + self.parent[m] = self.next_label + self.parent[n] = self.next_label + self.size[self.next_label] = self.size[m] + self.size[n] + self.next_label += 1 + return + + @cython.wraparound(True) + cdef intp_t fast_find(self, intp_t n) noexcept: + cdef intp_t p + p = n + # find the highest node in the linkage graph so far + while self.parent[n] != -1: + n = self.parent[n] + # provide a shortcut up to the highest node + while self.parent[p] != n: + p, self.parent[p] = self.parent[p], n + return n + + +def _single_linkage_label(const float64_t[:, :] L): + """ + Convert an linkage array or MST to a tree by labelling clusters at merges. + This is done by using a Union find structure to keep track of merges + efficiently. This is the private version of the function that assumes that + ``L`` has been properly validated. See ``single_linkage_label`` for the + user facing version of this function. + + Parameters + ---------- + L: array of shape (n_samples - 1, 3) + The linkage array or MST where each row specifies two samples + to be merged and a distance or weight at which the merge occurs. This + array is assumed to be sorted by the distance/weight. + + Returns + ------- + A tree in the format used by scipy.cluster.hierarchy. + """ + + cdef float64_t[:, ::1] result_arr + + cdef intp_t left, left_cluster, right, right_cluster, index + cdef float64_t delta + + result_arr = np.zeros((L.shape[0], 4), dtype=np.float64) + U = UnionFind(L.shape[0] + 1) + + for index in range(L.shape[0]): + + left = L[index, 0] + right = L[index, 1] + delta = L[index, 2] + + left_cluster = U.fast_find(left) + right_cluster = U.fast_find(right) + + result_arr[index][0] = left_cluster + result_arr[index][1] = right_cluster + result_arr[index][2] = delta + result_arr[index][3] = U.size[left_cluster] + U.size[right_cluster] + + U.union(left_cluster, right_cluster) + + return np.asarray(result_arr) + + +@cython.wraparound(True) +def single_linkage_label(L): + """ + Convert an linkage array or MST to a tree by labelling clusters at merges. + This is done by using a Union find structure to keep track of merges + efficiently. + + Parameters + ---------- + L: array of shape (n_samples - 1, 3) + The linkage array or MST where each row specifies two samples + to be merged and a distance or weight at which the merge occurs. This + array is assumed to be sorted by the distance/weight. + + Returns + ------- + A tree in the format used by scipy.cluster.hierarchy. 
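+
+    Examples
+    --------
+    An illustrative call on a tiny, already sorted MST array describing three
+    merges over four samples (values chosen arbitrarily); each output row holds
+    the two merged cluster ids, the merge distance and the new cluster size:
+
+    >>> import numpy as np
+    >>> L = np.array([[0., 1., 0.5], [2., 3., 1.0], [0., 2., 2.0]])
+    >>> single_linkage_label(L).shape
+    (3, 4)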
+ """ + # Validate L + if L[:, :2].min() < 0 or L[:, :2].max() >= 2 * L.shape[0] + 1: + raise ValueError("Input MST array is not a validly formatted MST array") + + is_sorted = lambda x: np.all(x[:-1] <= x[1:]) + if not is_sorted(L[:, 2]): + raise ValueError("Input MST array must be sorted by weight") + + return _single_linkage_label(L) + + +# Implements MST-LINKAGE-CORE from https://arxiv.org/abs/1109.2378 +def mst_linkage_core( + const float64_t [:, ::1] raw_data, + DistanceMetric64 dist_metric): + """ + Compute the necessary elements of a minimum spanning + tree for computation of single linkage clustering. This + represents the MST-LINKAGE-CORE algorithm (Figure 6) from + :arxiv:`Daniel Mullner, "Modern hierarchical, agglomerative clustering + algorithms" <1109.2378>`. + + In contrast to the scipy implementation is never computes + a full distance matrix, generating distances only as they + are needed and releasing them when no longer needed. + + Parameters + ---------- + raw_data: array of shape (n_samples, n_features) + The array of feature data to be clustered. Must be C-aligned + + dist_metric: DistanceMetric64 + A DistanceMetric64 object conforming to the API from + ``sklearn.metrics._dist_metrics.pxd`` that will be + used to compute distances. + + Returns + ------- + mst_core_data: array of shape (n_samples, 3) + An array providing information from which one + can either compute an MST, or the linkage hierarchy + very efficiently. See :arxiv:`Daniel Mullner, "Modern hierarchical, + agglomerative clustering algorithms" <1109.2378>` algorithm + MST-LINKAGE-CORE for more details. + """ + cdef: + intp_t n_samples = raw_data.shape[0] + uint8_t[:] in_tree = np.zeros(n_samples, dtype=bool) + float64_t[:, ::1] result = np.zeros((n_samples - 1, 3)) + + intp_t current_node = 0 + intp_t new_node + intp_t i + intp_t j + intp_t num_features = raw_data.shape[1] + + float64_t right_value + float64_t left_value + float64_t new_distance + + float64_t[:] current_distances = np.full(n_samples, INFINITY) + + for i in range(n_samples - 1): + + in_tree[current_node] = 1 + + new_distance = INFINITY + new_node = 0 + + for j in range(n_samples): + if in_tree[j]: + continue + + right_value = current_distances[j] + left_value = dist_metric.dist(&raw_data[current_node, 0], + &raw_data[j, 0], + num_features) + + if left_value < right_value: + current_distances[j] = left_value + + if current_distances[j] < new_distance: + new_distance = current_distances[j] + new_node = j + + result[i, 0] = current_node + result[i, 1] = new_node + result[i, 2] = new_distance + current_node = new_node + + return np.array(result) diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_k_means_common.pxd b/.venv/lib/python3.12/site-packages/sklearn/cluster/_k_means_common.pxd new file mode 100644 index 0000000000000000000000000000000000000000..9a41ea68d1bafc0cad55c028e0413e463ddb6d2e --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_k_means_common.pxd @@ -0,0 +1,48 @@ +from cython cimport floating + + +cdef floating _euclidean_dense_dense( + const floating*, + const floating*, + int, + bint +) noexcept nogil + +cdef floating _euclidean_sparse_dense( + const floating[::1], + const int[::1], + const floating[::1], + floating, + bint +) noexcept nogil + +cpdef void _relocate_empty_clusters_dense( + const floating[:, ::1], + const floating[::1], + const floating[:, ::1], + floating[:, ::1], + floating[::1], + const int[::1] +) + +cpdef void _relocate_empty_clusters_sparse( + const floating[::1], + 
const int[::1], + const int[::1], + const floating[::1], + const floating[:, ::1], + floating[:, ::1], + floating[::1], + const int[::1] +) + +cdef void _average_centers( + floating[:, ::1], + const floating[::1] +) + +cdef void _center_shift( + const floating[:, ::1], + const floating[:, ::1], + floating[::1] +) diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_k_means_common.pyx b/.venv/lib/python3.12/site-packages/sklearn/cluster/_k_means_common.pyx new file mode 100644 index 0000000000000000000000000000000000000000..674d4026a67564f266ec709a9f47d77f8f912386 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_k_means_common.pyx @@ -0,0 +1,328 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numpy as np +from cython cimport floating +from cython.parallel cimport prange +from libc.math cimport sqrt + +from ..utils.extmath import row_norms + + +# Number of samples per data chunk defined as a global constant. +CHUNK_SIZE = 256 + + +cdef floating _euclidean_dense_dense( + const floating* a, # IN + const floating* b, # IN + int n_features, + bint squared +) noexcept nogil: + """Euclidean distance between a dense and b dense""" + cdef: + int i + int n = n_features // 4 + int rem = n_features % 4 + floating result = 0 + + # We manually unroll the loop for better cache optimization. + for i in range(n): + result += ( + (a[0] - b[0]) * (a[0] - b[0]) + + (a[1] - b[1]) * (a[1] - b[1]) + + (a[2] - b[2]) * (a[2] - b[2]) + + (a[3] - b[3]) * (a[3] - b[3]) + ) + a += 4 + b += 4 + + for i in range(rem): + result += (a[i] - b[i]) * (a[i] - b[i]) + + return result if squared else sqrt(result) + + +def _euclidean_dense_dense_wrapper( + const floating[::1] a, + const floating[::1] b, + bint squared +): + """Wrapper of _euclidean_dense_dense for testing purpose""" + return _euclidean_dense_dense(&a[0], &b[0], a.shape[0], squared) + + +cdef floating _euclidean_sparse_dense( + const floating[::1] a_data, # IN + const int[::1] a_indices, # IN + const floating[::1] b, # IN + floating b_squared_norm, + bint squared +) noexcept nogil: + """Euclidean distance between a sparse and b dense""" + cdef: + int nnz = a_indices.shape[0] + int i + floating tmp, bi + floating result = 0.0 + + for i in range(nnz): + bi = b[a_indices[i]] + tmp = a_data[i] - bi + result += tmp * tmp - bi * bi + + result += b_squared_norm + + if result < 0: + result = 0.0 + + return result if squared else sqrt(result) + + +def _euclidean_sparse_dense_wrapper( + const floating[::1] a_data, + const int[::1] a_indices, + const floating[::1] b, + floating b_squared_norm, + bint squared +): + """Wrapper of _euclidean_sparse_dense for testing purpose""" + return _euclidean_sparse_dense( + a_data, a_indices, b, b_squared_norm, squared) + + +cpdef floating _inertia_dense( + const floating[:, ::1] X, # IN + const floating[::1] sample_weight, # IN + const floating[:, ::1] centers, # IN + const int[::1] labels, # IN + int n_threads, + int single_label=-1, +): + """Compute inertia for dense input data + + Sum of squared distance between each sample and its assigned center. + + If single_label is >= 0, the inertia is computed only for that label. 
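+
+    Notes
+    -----
+    Ignoring parallelism and the `single_label` filter, the computed quantity
+    corresponds to the following NumPy expression (illustrative sketch)::
+
+        np.sum(sample_weight * ((X - centers[labels]) ** 2).sum(axis=1))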
+ """ + cdef: + int n_samples = X.shape[0] + int n_features = X.shape[1] + int i, j + + floating sq_dist = 0.0 + floating inertia = 0.0 + + for i in prange(n_samples, nogil=True, num_threads=n_threads, + schedule='static'): + j = labels[i] + if single_label < 0 or single_label == j: + sq_dist = _euclidean_dense_dense(&X[i, 0], ¢ers[j, 0], + n_features, True) + inertia += sq_dist * sample_weight[i] + + return inertia + + +cpdef floating _inertia_sparse( + X, # IN + const floating[::1] sample_weight, # IN + const floating[:, ::1] centers, # IN + const int[::1] labels, # IN + int n_threads, + int single_label=-1, +): + """Compute inertia for sparse input data + + Sum of squared distance between each sample and its assigned center. + + If single_label is >= 0, the inertia is computed only for that label. + """ + cdef: + floating[::1] X_data = X.data + int[::1] X_indices = X.indices + int[::1] X_indptr = X.indptr + + int n_samples = X.shape[0] + int i, j + + floating sq_dist = 0.0 + floating inertia = 0.0 + + floating[::1] centers_squared_norms = row_norms(centers, squared=True) + + for i in prange(n_samples, nogil=True, num_threads=n_threads, + schedule='static'): + j = labels[i] + if single_label < 0 or single_label == j: + sq_dist = _euclidean_sparse_dense( + X_data[X_indptr[i]: X_indptr[i + 1]], + X_indices[X_indptr[i]: X_indptr[i + 1]], + centers[j], centers_squared_norms[j], True) + inertia += sq_dist * sample_weight[i] + + return inertia + + +cpdef void _relocate_empty_clusters_dense( + const floating[:, ::1] X, # IN + const floating[::1] sample_weight, # IN + const floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # INOUT + floating[::1] weight_in_clusters, # INOUT + const int[::1] labels # IN +): + """Relocate centers which have no sample assigned to them.""" + cdef: + int[::1] empty_clusters = np.where(np.equal(weight_in_clusters, 0))[0].astype(np.int32) + int n_empty = empty_clusters.shape[0] + + if n_empty == 0: + return + + cdef: + int n_features = X.shape[1] + + floating[::1] distances = ((np.asarray(X) - np.asarray(centers_old)[labels])**2).sum(axis=1) + int[::1] far_from_centers = np.argpartition(distances, -n_empty)[:-n_empty-1:-1].astype(np.int32) + + int new_cluster_id, old_cluster_id, far_idx, idx, k + floating weight + + if np.max(distances) == 0: + # Happens when there are more clusters than non-duplicate samples. Relocating + # is pointless in this case. 
+ return + + for idx in range(n_empty): + + new_cluster_id = empty_clusters[idx] + + far_idx = far_from_centers[idx] + weight = sample_weight[far_idx] + + old_cluster_id = labels[far_idx] + + for k in range(n_features): + centers_new[old_cluster_id, k] -= X[far_idx, k] * weight + centers_new[new_cluster_id, k] = X[far_idx, k] * weight + + weight_in_clusters[new_cluster_id] = weight + weight_in_clusters[old_cluster_id] -= weight + + +cpdef void _relocate_empty_clusters_sparse( + const floating[::1] X_data, # IN + const int[::1] X_indices, # IN + const int[::1] X_indptr, # IN + const floating[::1] sample_weight, # IN + const floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # INOUT + floating[::1] weight_in_clusters, # INOUT + const int[::1] labels # IN +): + """Relocate centers which have no sample assigned to them.""" + cdef: + int[::1] empty_clusters = np.where(np.equal(weight_in_clusters, 0))[0].astype(np.int32) + int n_empty = empty_clusters.shape[0] + + if n_empty == 0: + return + + cdef: + int n_samples = X_indptr.shape[0] - 1 + int i, j, k + + floating[::1] distances = np.zeros(n_samples, dtype=X_data.base.dtype) + floating[::1] centers_squared_norms = row_norms(centers_old, squared=True) + + for i in range(n_samples): + j = labels[i] + distances[i] = _euclidean_sparse_dense( + X_data[X_indptr[i]: X_indptr[i + 1]], + X_indices[X_indptr[i]: X_indptr[i + 1]], + centers_old[j], centers_squared_norms[j], True) + + if np.max(distances) == 0: + # Happens when there are more clusters than non-duplicate samples. Relocating + # is pointless in this case. + return + + cdef: + int[::1] far_from_centers = np.argpartition(distances, -n_empty)[:-n_empty-1:-1].astype(np.int32) + + int new_cluster_id, old_cluster_id, far_idx, idx + floating weight + + for idx in range(n_empty): + + new_cluster_id = empty_clusters[idx] + + far_idx = far_from_centers[idx] + weight = sample_weight[far_idx] + + old_cluster_id = labels[far_idx] + + for k in range(X_indptr[far_idx], X_indptr[far_idx + 1]): + centers_new[old_cluster_id, X_indices[k]] -= X_data[k] * weight + centers_new[new_cluster_id, X_indices[k]] = X_data[k] * weight + + weight_in_clusters[new_cluster_id] = weight + weight_in_clusters[old_cluster_id] -= weight + + +cdef void _average_centers( + floating[:, ::1] centers, # INOUT + const floating[::1] weight_in_clusters # IN +): + """Average new centers wrt weights.""" + cdef: + int n_clusters = centers.shape[0] + int n_features = centers.shape[1] + int j, k + floating alpha + int argmax_weight = np.argmax(weight_in_clusters) + + for j in range(n_clusters): + if weight_in_clusters[j] > 0: + alpha = 1.0 / weight_in_clusters[j] + for k in range(n_features): + centers[j, k] *= alpha + else: + # For convenience, we avoid setting empty clusters at the origin but place + # them at the location of the biggest cluster. 
+ for k in range(n_features): + centers[j, k] = centers[argmax_weight, k] + + +cdef void _center_shift( + const floating[:, ::1] centers_old, # IN + const floating[:, ::1] centers_new, # IN + floating[::1] center_shift # OUT +): + """Compute shift between old and new centers.""" + cdef: + int n_clusters = centers_old.shape[0] + int n_features = centers_old.shape[1] + int j + + for j in range(n_clusters): + center_shift[j] = _euclidean_dense_dense( + ¢ers_new[j, 0], ¢ers_old[j, 0], n_features, False) + + +def _is_same_clustering( + const int[::1] labels1, + const int[::1] labels2, + n_clusters +): + """Check if two arrays of labels are the same up to a permutation of the labels""" + cdef int[::1] mapping = np.full(fill_value=-1, shape=(n_clusters,), dtype=np.int32) + cdef int i + + for i in range(labels1.shape[0]): + if mapping[labels1[i]] == -1: + mapping[labels1[i]] = labels2[i] + elif mapping[labels1[i]] != labels2[i]: + return False + return True diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_k_means_elkan.pyx b/.venv/lib/python3.12/site-packages/sklearn/cluster/_k_means_elkan.pyx new file mode 100644 index 0000000000000000000000000000000000000000..564218a17f7018241d43dd33f55d3f516746a145 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_k_means_elkan.pyx @@ -0,0 +1,686 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from cython cimport floating +from cython.parallel import prange, parallel +from libc.stdlib cimport calloc, free +from libc.string cimport memset + +from ..utils._openmp_helpers cimport omp_lock_t +from ..utils._openmp_helpers cimport omp_init_lock +from ..utils._openmp_helpers cimport omp_destroy_lock +from ..utils._openmp_helpers cimport omp_set_lock +from ..utils._openmp_helpers cimport omp_unset_lock +from ..utils.extmath import row_norms +from ._k_means_common import CHUNK_SIZE +from ._k_means_common cimport _relocate_empty_clusters_dense +from ._k_means_common cimport _relocate_empty_clusters_sparse +from ._k_means_common cimport _euclidean_dense_dense +from ._k_means_common cimport _euclidean_sparse_dense +from ._k_means_common cimport _average_centers +from ._k_means_common cimport _center_shift + + +def init_bounds_dense( + const floating[:, ::1] X, # IN + const floating[:, ::1] centers, # IN + const floating[:, ::1] center_half_distances, # IN + int[::1] labels, # OUT + floating[::1] upper_bounds, # OUT + floating[:, ::1] lower_bounds, # OUT + int n_threads): + """Initialize upper and lower bounds for each sample for dense input data. + + Given X, centers and the pairwise distances divided by 2.0 between the + centers this calculates the upper bounds and lower bounds for each sample. + The upper bound for each sample is set to the distance between the sample + and the closest center. + + The lower bound for each sample is a one-dimensional array of n_clusters. + For each sample i assume that the previously assigned cluster is c1 and the + previous closest distance is dist, for a new cluster c2, the + lower_bound[i][c2] is set to distance between the sample and this new + cluster, if and only if dist > center_half_distances[c1][c2]. This prevents + computation of unnecessary distances for each sample to the clusters that + it is unlikely to be assigned to. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features), dtype=floating + The input data. + + centers : ndarray of shape (n_clusters, n_features), dtype=floating + The cluster centers. 
+ + center_half_distances : ndarray of shape (n_clusters, n_clusters), \ + dtype=floating + The half of the distance between any 2 clusters centers. + + labels : ndarray of shape(n_samples), dtype=int + The label for each sample. This array is modified in place. + + upper_bounds : ndarray of shape(n_samples,), dtype=floating + The upper bound on the distance between each sample and its closest + cluster center. This array is modified in place. + + lower_bounds : ndarray, of shape(n_samples, n_clusters), dtype=floating + The lower bound on the distance between each sample and each cluster + center. This array is modified in place. + + n_threads : int + The number of threads to be used by openmp. + """ + cdef: + int n_samples = X.shape[0] + int n_clusters = centers.shape[0] + int n_features = X.shape[1] + + floating min_dist, dist + int best_cluster, i, j + + for i in prange( + n_samples, num_threads=n_threads, schedule='static', nogil=True + ): + best_cluster = 0 + min_dist = _euclidean_dense_dense(&X[i, 0], ¢ers[0, 0], + n_features, False) + lower_bounds[i, 0] = min_dist + for j in range(1, n_clusters): + if min_dist > center_half_distances[best_cluster, j]: + dist = _euclidean_dense_dense(&X[i, 0], ¢ers[j, 0], + n_features, False) + lower_bounds[i, j] = dist + if dist < min_dist: + min_dist = dist + best_cluster = j + labels[i] = best_cluster + upper_bounds[i] = min_dist + + +def init_bounds_sparse( + X, # IN + const floating[:, ::1] centers, # IN + const floating[:, ::1] center_half_distances, # IN + int[::1] labels, # OUT + floating[::1] upper_bounds, # OUT + floating[:, ::1] lower_bounds, # OUT + int n_threads): + """Initialize upper and lower bounds for each sample for sparse input data. + + Given X, centers and the pairwise distances divided by 2.0 between the + centers this calculates the upper bounds and lower bounds for each sample. + The upper bound for each sample is set to the distance between the sample + and the closest center. + + The lower bound for each sample is a one-dimensional array of n_clusters. + For each sample i assume that the previously assigned cluster is c1 and the + previous closest distance is dist, for a new cluster c2, the + lower_bound[i][c2] is set to distance between the sample and this new + cluster, if and only if dist > center_half_distances[c1][c2]. This prevents + computation of unnecessary distances for each sample to the clusters that + it is unlikely to be assigned to. + + Parameters + ---------- + X : sparse matrix of shape (n_samples, n_features), dtype=floating + The input data. Must be in CSR format. + + centers : ndarray of shape (n_clusters, n_features), dtype=floating + The cluster centers. + + center_half_distances : ndarray of shape (n_clusters, n_clusters), \ + dtype=floating + The half of the distance between any 2 clusters centers. + + labels : ndarray of shape(n_samples), dtype=int + The label for each sample. This array is modified in place. + + upper_bounds : ndarray of shape(n_samples,), dtype=floating + The upper bound on the distance between each sample and its closest + cluster center. This array is modified in place. + + lower_bounds : ndarray of shape(n_samples, n_clusters), dtype=floating + The lower bound on the distance between each sample and each cluster + center. This array is modified in place. + + n_threads : int + The number of threads to be used by openmp. 
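+
+    Notes
+    -----
+    Ignoring sparsity and parallelism, the bound initialisation is roughly
+    equivalent to the following dense NumPy sketch (illustrative only)::
+
+        import numpy as np
+
+        def init_bounds_sketch(X, centers, center_half_distances):
+            n_samples, n_clusters = X.shape[0], centers.shape[0]
+            labels = np.zeros(n_samples, dtype=np.int32)
+            upper = np.empty(n_samples)
+            lower = np.zeros((n_samples, n_clusters))
+            for i in range(n_samples):
+                best, min_dist = 0, np.linalg.norm(X[i] - centers[0])
+                lower[i, 0] = min_dist
+                for j in range(1, n_clusters):
+                    # A center at least twice the current best distance away
+                    # cannot be closer, so its distance is never computed.
+                    if min_dist > center_half_distances[best, j]:
+                        dist = np.linalg.norm(X[i] - centers[j])
+                        lower[i, j] = dist
+                        if dist < min_dist:
+                            min_dist, best = dist, j
+                labels[i], upper[i] = best, min_dist
+            return labels, upper, lower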
+ """ + cdef: + int n_samples = X.shape[0] + int n_clusters = centers.shape[0] + + floating[::1] X_data = X.data + int[::1] X_indices = X.indices + int[::1] X_indptr = X.indptr + + floating min_dist, dist + int best_cluster, i, j + + floating[::1] centers_squared_norms = row_norms(centers, squared=True) + + for i in prange( + n_samples, num_threads=n_threads, schedule='static', nogil=True + ): + best_cluster = 0 + min_dist = _euclidean_sparse_dense( + X_data[X_indptr[i]: X_indptr[i + 1]], + X_indices[X_indptr[i]: X_indptr[i + 1]], + centers[0], centers_squared_norms[0], False) + + lower_bounds[i, 0] = min_dist + for j in range(1, n_clusters): + if min_dist > center_half_distances[best_cluster, j]: + dist = _euclidean_sparse_dense( + X_data[X_indptr[i]: X_indptr[i + 1]], + X_indices[X_indptr[i]: X_indptr[i + 1]], + centers[j], centers_squared_norms[j], False) + lower_bounds[i, j] = dist + if dist < min_dist: + min_dist = dist + best_cluster = j + labels[i] = best_cluster + upper_bounds[i] = min_dist + + +def elkan_iter_chunked_dense( + const floating[:, ::1] X, # IN + const floating[::1] sample_weight, # IN + const floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # OUT + floating[::1] weight_in_clusters, # OUT + const floating[:, ::1] center_half_distances, # IN + const floating[::1] distance_next_center, # IN + floating[::1] upper_bounds, # INOUT + floating[:, ::1] lower_bounds, # INOUT + int[::1] labels, # INOUT + floating[::1] center_shift, # OUT + int n_threads, + bint update_centers=True): + """Single iteration of K-means Elkan algorithm with dense input. + + Update labels and centers (inplace), for one iteration, distributed + over data chunks. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features), dtype=floating + The observations to cluster. + + sample_weight : ndarray of shape (n_samples,), dtype=floating + The weights for each observation in X. + + centers_old : ndarray of shape (n_clusters, n_features), dtype=floating + Centers before previous iteration, placeholder for the centers after + previous iteration. + + centers_new : ndarray of shape (n_clusters, n_features), dtype=floating + Centers after previous iteration, placeholder for the new centers + computed during this iteration. + + weight_in_clusters : ndarray of shape (n_clusters,), dtype=floating + Placeholder for the sums of the weights of every observation assigned + to each center. + + center_half_distances : ndarray of shape (n_clusters, n_clusters), \ + dtype=floating + Half pairwise distances between centers. + + distance_next_center : ndarray of shape (n_clusters,), dtype=floating + Distance between each center its closest center. + + upper_bounds : ndarray of shape (n_samples,), dtype=floating + Upper bound for the distance between each sample and its center, + updated inplace. + + lower_bounds : ndarray of shape (n_samples, n_clusters), dtype=floating + Lower bound for the distance between each sample and each center, + updated inplace. + + labels : ndarray of shape (n_samples,), dtype=int + labels assignment. + + center_shift : ndarray of shape (n_clusters,), dtype=floating + Distance between old and new centers. + + n_threads : int + The number of threads to be used by openmp. + + update_centers : bool + - If True, the labels and the new centers will be computed, i.e. runs + the E-step and the M-step of the algorithm. + - If False, only the labels will be computed, i.e runs the E-step of + the algorithm. This is useful especially when calling predict on a + fitted model. 
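+
+    Notes
+    -----
+    After the centers move, the cached bounds are kept valid by loosening them
+    with the center movement; a NumPy sketch of the final update performed
+    below (``center_shift[j]`` is the distance moved by center ``j``)::
+
+        upper_bounds += center_shift[labels]
+        lower_bounds = np.maximum(lower_bounds - center_shift, 0)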
+ """ + cdef: + int n_samples = X.shape[0] + int n_features = X.shape[1] + int n_clusters = centers_new.shape[0] + + if n_samples == 0: + # An empty array was passed, do nothing and return early (before + # attempting to compute n_chunks). This can typically happen when + # calling the prediction function of a bisecting k-means model with a + # large fraction of outliers. + return + + cdef: + # hard-coded number of samples per chunk. Splitting in chunks is + # necessary to get parallelism. Chunk size chosen to be same as lloyd's + int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples + int n_chunks = n_samples // n_samples_chunk + int n_samples_rem = n_samples % n_samples_chunk + int chunk_idx + int start, end + + int i, j, k + + floating *centers_new_chunk + floating *weight_in_clusters_chunk + + omp_lock_t lock + + # count remainder chunk in total number of chunks + n_chunks += n_samples != n_chunks * n_samples_chunk + + # number of threads should not be bigger than number of chunks + n_threads = min(n_threads, n_chunks) + + if update_centers: + memset(¢ers_new[0, 0], 0, n_clusters * n_features * sizeof(floating)) + memset(&weight_in_clusters[0], 0, n_clusters * sizeof(floating)) + omp_init_lock(&lock) + + with nogil, parallel(num_threads=n_threads): + # thread local buffers + centers_new_chunk = calloc(n_clusters * n_features, sizeof(floating)) + weight_in_clusters_chunk = calloc(n_clusters, sizeof(floating)) + + for chunk_idx in prange(n_chunks, schedule='static'): + start = chunk_idx * n_samples_chunk + if chunk_idx == n_chunks - 1 and n_samples_rem > 0: + end = start + n_samples_rem + else: + end = start + n_samples_chunk + + _update_chunk_dense( + X[start: end], + sample_weight[start: end], + centers_old, + center_half_distances, + distance_next_center, + labels[start: end], + upper_bounds[start: end], + lower_bounds[start: end], + centers_new_chunk, + weight_in_clusters_chunk, + update_centers) + + # reduction from local buffers. + if update_centers: + # The lock is necessary to avoid race conditions when aggregating + # info from different thread-local buffers. + omp_set_lock(&lock) + for j in range(n_clusters): + weight_in_clusters[j] += weight_in_clusters_chunk[j] + for k in range(n_features): + centers_new[j, k] += centers_new_chunk[j * n_features + k] + omp_unset_lock(&lock) + + free(centers_new_chunk) + free(weight_in_clusters_chunk) + + if update_centers: + omp_destroy_lock(&lock) + _relocate_empty_clusters_dense(X, sample_weight, centers_old, + centers_new, weight_in_clusters, labels) + + _average_centers(centers_new, weight_in_clusters) + _center_shift(centers_old, centers_new, center_shift) + + # update lower and upper bounds + for i in range(n_samples): + upper_bounds[i] += center_shift[labels[i]] + + for j in range(n_clusters): + lower_bounds[i, j] -= center_shift[j] + if lower_bounds[i, j] < 0: + lower_bounds[i, j] = 0 + + +cdef void _update_chunk_dense( + const floating[:, ::1] X, # IN + const floating[::1] sample_weight, # IN + const floating[:, ::1] centers_old, # IN + const floating[:, ::1] center_half_distances, # IN + const floating[::1] distance_next_center, # IN + int[::1] labels, # INOUT + floating[::1] upper_bounds, # INOUT + floating[:, ::1] lower_bounds, # INOUT + floating *centers_new, # OUT + floating *weight_in_clusters, # OUT + bint update_centers) noexcept nogil: + """K-means combined EM step for one dense data chunk. + + Compute the partial contribution of a single data chunk to the labels and + centers. 
+ """ + cdef: + int n_samples = labels.shape[0] + int n_clusters = centers_old.shape[0] + int n_features = centers_old.shape[1] + + floating upper_bound, distance + int i, j, k, label + + for i in range(n_samples): + upper_bound = upper_bounds[i] + bounds_tight = 0 + label = labels[i] + + # Next center is not far away from the currently assigned center. + # Sample might need to be assigned to another center. + if not distance_next_center[label] >= upper_bound: + + for j in range(n_clusters): + + # If this holds, then center_index is a good candidate for the + # sample to be relabelled, and we need to confirm this by + # recomputing the upper and lower bounds. + if ( + j != label + and (upper_bound > lower_bounds[i, j]) + and (upper_bound > center_half_distances[label, j]) + ): + + # Recompute upper bound by calculating the actual distance + # between the sample and its current assigned center. + if not bounds_tight: + upper_bound = _euclidean_dense_dense( + &X[i, 0], ¢ers_old[label, 0], n_features, False) + lower_bounds[i, label] = upper_bound + bounds_tight = 1 + + # If the condition still holds, then compute the actual + # distance between the sample and center. If this is less + # than the previous distance, reassign label. + if ( + upper_bound > lower_bounds[i, j] + or (upper_bound > center_half_distances[label, j]) + ): + + distance = _euclidean_dense_dense( + &X[i, 0], ¢ers_old[j, 0], n_features, False) + lower_bounds[i, j] = distance + if distance < upper_bound: + label = j + upper_bound = distance + + labels[i] = label + upper_bounds[i] = upper_bound + + if update_centers: + weight_in_clusters[label] += sample_weight[i] + for k in range(n_features): + centers_new[label * n_features + k] += X[i, k] * sample_weight[i] + + +def elkan_iter_chunked_sparse( + X, # IN + const floating[::1] sample_weight, # IN + const floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # OUT + floating[::1] weight_in_clusters, # OUT + const floating[:, ::1] center_half_distances, # IN + const floating[::1] distance_next_center, # IN + floating[::1] upper_bounds, # INOUT + floating[:, ::1] lower_bounds, # INOUT + int[::1] labels, # INOUT + floating[::1] center_shift, # OUT + int n_threads, + bint update_centers=True): + """Single iteration of K-means Elkan algorithm with sparse input. + + Update labels and centers (inplace), for one iteration, distributed + over data chunks. + + Parameters + ---------- + X : sparse matrix of shape (n_samples, n_features) + The observations to cluster. Must be in CSR format. + + sample_weight : ndarray of shape (n_samples,), dtype=floating + The weights for each observation in X. + + centers_old : ndarray of shape (n_clusters, n_features), dtype=floating + Centers before previous iteration, placeholder for the centers after + previous iteration. + + centers_new : ndarray of shape (n_clusters, n_features), dtype=floating + Centers after previous iteration, placeholder for the new centers + computed during this iteration. + + weight_in_clusters : ndarray of shape (n_clusters,), dtype=floating + Placeholder for the sums of the weights of every observation assigned + to each center. + + center_half_distances : ndarray of shape (n_clusters, n_clusters), \ + dtype=floating + Half pairwise distances between centers. + + distance_next_center : ndarray of shape (n_clusters,), dtype=floating + Distance between each center its closest center. 
+ + upper_bounds : ndarray of shape (n_samples,), dtype=floating + Upper bound for the distance between each sample and its center, + updated inplace. + + lower_bounds : ndarray of shape (n_samples, n_clusters), dtype=floating + Lower bound for the distance between each sample and each center, + updated inplace. + + labels : ndarray of shape (n_samples,), dtype=int + labels assignment. + + center_shift : ndarray of shape (n_clusters,), dtype=floating + Distance between old and new centers. + + n_threads : int + The number of threads to be used by openmp. + + update_centers : bool + - If True, the labels and the new centers will be computed, i.e. runs + the E-step and the M-step of the algorithm. + - If False, only the labels will be computed, i.e runs the E-step of + the algorithm. This is useful especially when calling predict on a + fitted model. + """ + cdef: + int n_samples = X.shape[0] + int n_features = X.shape[1] + int n_clusters = centers_new.shape[0] + + if n_samples == 0: + # An empty array was passed, do nothing and return early (before + # attempting to compute n_chunks). This can typically happen when + # calling the prediction function of a bisecting k-means model with a + # large fraction of outliers. + return + + cdef: + floating[::1] X_data = X.data + int[::1] X_indices = X.indices + int[::1] X_indptr = X.indptr + + # hard-coded number of samples per chunk. Splitting in chunks is + # necessary to get parallelism. Chunk size chosen to be same as lloyd's + int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples + int n_chunks = n_samples // n_samples_chunk + int n_samples_rem = n_samples % n_samples_chunk + int chunk_idx + int start, end + + int i, j, k + + floating[::1] centers_squared_norms = row_norms(centers_old, squared=True) + + floating *centers_new_chunk + floating *weight_in_clusters_chunk + + omp_lock_t lock + + # count remainder chunk in total number of chunks + n_chunks += n_samples != n_chunks * n_samples_chunk + + # number of threads should not be bigger than number of chunks + n_threads = min(n_threads, n_chunks) + + if update_centers: + memset(¢ers_new[0, 0], 0, n_clusters * n_features * sizeof(floating)) + memset(&weight_in_clusters[0], 0, n_clusters * sizeof(floating)) + omp_init_lock(&lock) + + with nogil, parallel(num_threads=n_threads): + # thread local buffers + centers_new_chunk = calloc(n_clusters * n_features, sizeof(floating)) + weight_in_clusters_chunk = calloc(n_clusters, sizeof(floating)) + + for chunk_idx in prange(n_chunks, schedule='static'): + start = chunk_idx * n_samples_chunk + if chunk_idx == n_chunks - 1 and n_samples_rem > 0: + end = start + n_samples_rem + else: + end = start + n_samples_chunk + + _update_chunk_sparse( + X_data[X_indptr[start]: X_indptr[end]], + X_indices[X_indptr[start]: X_indptr[end]], + X_indptr[start: end+1], + sample_weight[start: end], + centers_old, + centers_squared_norms, + center_half_distances, + distance_next_center, + labels[start: end], + upper_bounds[start: end], + lower_bounds[start: end], + centers_new_chunk, + weight_in_clusters_chunk, + update_centers) + + # reduction from local buffers. + if update_centers: + # The lock is necessary to avoid race conditions when aggregating + # info from different thread-local buffers. 
+ omp_set_lock(&lock) + for j in range(n_clusters): + weight_in_clusters[j] += weight_in_clusters_chunk[j] + for k in range(n_features): + centers_new[j, k] += centers_new_chunk[j * n_features + k] + omp_unset_lock(&lock) + + free(centers_new_chunk) + free(weight_in_clusters_chunk) + + if update_centers: + omp_destroy_lock(&lock) + _relocate_empty_clusters_sparse( + X_data, X_indices, X_indptr, sample_weight, + centers_old, centers_new, weight_in_clusters, labels) + + _average_centers(centers_new, weight_in_clusters) + _center_shift(centers_old, centers_new, center_shift) + + # update lower and upper bounds + for i in range(n_samples): + upper_bounds[i] += center_shift[labels[i]] + + for j in range(n_clusters): + lower_bounds[i, j] -= center_shift[j] + if lower_bounds[i, j] < 0: + lower_bounds[i, j] = 0 + + +cdef void _update_chunk_sparse( + const floating[::1] X_data, # IN + const int[::1] X_indices, # IN + const int[::1] X_indptr, # IN + const floating[::1] sample_weight, # IN + const floating[:, ::1] centers_old, # IN + const floating[::1] centers_squared_norms, # IN + const floating[:, ::1] center_half_distances, # IN + const floating[::1] distance_next_center, # IN + int[::1] labels, # INOUT + floating[::1] upper_bounds, # INOUT + floating[:, ::1] lower_bounds, # INOUT + floating *centers_new, # OUT + floating *weight_in_clusters, # OUT + bint update_centers) noexcept nogil: + """K-means combined EM step for one sparse data chunk. + + Compute the partial contribution of a single data chunk to the labels and + centers. + """ + cdef: + int n_samples = labels.shape[0] + int n_clusters = centers_old.shape[0] + int n_features = centers_old.shape[1] + + floating upper_bound, distance + int i, j, k, label + int s = X_indptr[0] + + for i in range(n_samples): + upper_bound = upper_bounds[i] + bounds_tight = 0 + label = labels[i] + + # Next center is not far away from the currently assigned center. + # Sample might need to be assigned to another center. + if not distance_next_center[label] >= upper_bound: + + for j in range(n_clusters): + + # If this holds, then center_index is a good candidate for the + # sample to be relabelled, and we need to confirm this by + # recomputing the upper and lower bounds. + if ( + j != label + and (upper_bound > lower_bounds[i, j]) + and (upper_bound > center_half_distances[label, j]) + ): + + # Recompute upper bound by calculating the actual distance + # between the sample and its current assigned center. + if not bounds_tight: + upper_bound = _euclidean_sparse_dense( + X_data[X_indptr[i] - s: X_indptr[i + 1] - s], + X_indices[X_indptr[i] - s: X_indptr[i + 1] - s], + centers_old[label], centers_squared_norms[label], False) + lower_bounds[i, label] = upper_bound + bounds_tight = 1 + + # If the condition still holds, then compute the actual + # distance between the sample and center. If this is less + # than the previous distance, reassign label. 
+ if ( + upper_bound > lower_bounds[i, j] + or (upper_bound > center_half_distances[label, j]) + ): + distance = _euclidean_sparse_dense( + X_data[X_indptr[i] - s: X_indptr[i + 1] - s], + X_indices[X_indptr[i] - s: X_indptr[i + 1] - s], + centers_old[j], centers_squared_norms[j], False) + lower_bounds[i, j] = distance + if distance < upper_bound: + label = j + upper_bound = distance + + labels[i] = label + upper_bounds[i] = upper_bound + + if update_centers: + weight_in_clusters[label] += sample_weight[i] + for k in range(X_indptr[i] - s, X_indptr[i + 1] - s): + centers_new[label * n_features + X_indices[k]] += X_data[k] * sample_weight[i] diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_k_means_lloyd.pyx b/.venv/lib/python3.12/site-packages/sklearn/cluster/_k_means_lloyd.pyx new file mode 100644 index 0000000000000000000000000000000000000000..a507a6239ab5f836e8c7d23ac0e3e2ab2f7f4d11 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_k_means_lloyd.pyx @@ -0,0 +1,420 @@ +# Licence: BSD 3 clause + +from cython cimport floating +from cython.parallel import prange, parallel +from libc.stdlib cimport malloc, calloc, free +from libc.string cimport memset +from libc.float cimport DBL_MAX, FLT_MAX + +from ..utils._openmp_helpers cimport omp_lock_t +from ..utils._openmp_helpers cimport omp_init_lock +from ..utils._openmp_helpers cimport omp_destroy_lock +from ..utils._openmp_helpers cimport omp_set_lock +from ..utils._openmp_helpers cimport omp_unset_lock +from ..utils.extmath import row_norms +from ..utils._cython_blas cimport _gemm +from ..utils._cython_blas cimport RowMajor, Trans, NoTrans +from ._k_means_common import CHUNK_SIZE +from ._k_means_common cimport _relocate_empty_clusters_dense +from ._k_means_common cimport _relocate_empty_clusters_sparse +from ._k_means_common cimport _average_centers, _center_shift + + +def lloyd_iter_chunked_dense( + const floating[:, ::1] X, # IN + const floating[::1] sample_weight, # IN + const floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # OUT + floating[::1] weight_in_clusters, # OUT + int[::1] labels, # OUT + floating[::1] center_shift, # OUT + int n_threads, + bint update_centers=True): + """Single iteration of K-means lloyd algorithm with dense input. + + Update labels and centers (inplace), for one iteration, distributed + over data chunks. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features), dtype=floating + The observations to cluster. + + sample_weight : ndarray of shape (n_samples,), dtype=floating + The weights for each observation in X. + + centers_old : ndarray of shape (n_clusters, n_features), dtype=floating + Centers before previous iteration, placeholder for the centers after + previous iteration. + + centers_new : ndarray of shape (n_clusters, n_features), dtype=floating + Centers after previous iteration, placeholder for the new centers + computed during this iteration. `centers_new` can be `None` if + `update_centers` is False. + + weight_in_clusters : ndarray of shape (n_clusters,), dtype=floating + Placeholder for the sums of the weights of every observation assigned + to each center. `weight_in_clusters` can be `None` if `update_centers` + is False. + + labels : ndarray of shape (n_samples,), dtype=int + labels assignment. + + center_shift : ndarray of shape (n_clusters,), dtype=floating + Distance between old and new centers. + + n_threads : int + The number of threads to be used by openmp. 
+ + update_centers : bool + - If True, the labels and the new centers will be computed, i.e. runs + the E-step and the M-step of the algorithm. + - If False, only the labels will be computed, i.e runs the E-step of + the algorithm. This is useful especially when calling predict on a + fitted model. + """ + cdef: + int n_samples = X.shape[0] + int n_features = X.shape[1] + int n_clusters = centers_old.shape[0] + + if n_samples == 0: + # An empty array was passed, do nothing and return early (before + # attempting to compute n_chunks). This can typically happen when + # calling the prediction function of a bisecting k-means model with a + # large fraction of outliers. + return + + cdef: + # hard-coded number of samples per chunk. Appeared to be close to + # optimal in all situations. + int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples + int n_chunks = n_samples // n_samples_chunk + int n_samples_rem = n_samples % n_samples_chunk + int chunk_idx + int start, end + + int j, k + + floating[::1] centers_squared_norms = row_norms(centers_old, squared=True) + + floating *centers_new_chunk + floating *weight_in_clusters_chunk + floating *pairwise_distances_chunk + + omp_lock_t lock + + # count remainder chunk in total number of chunks + n_chunks += n_samples != n_chunks * n_samples_chunk + + # number of threads should not be bigger than number of chunks + n_threads = min(n_threads, n_chunks) + + if update_centers: + memset(¢ers_new[0, 0], 0, n_clusters * n_features * sizeof(floating)) + memset(&weight_in_clusters[0], 0, n_clusters * sizeof(floating)) + omp_init_lock(&lock) + + with nogil, parallel(num_threads=n_threads): + # thread local buffers + centers_new_chunk = calloc(n_clusters * n_features, sizeof(floating)) + weight_in_clusters_chunk = calloc(n_clusters, sizeof(floating)) + pairwise_distances_chunk = malloc(n_samples_chunk * n_clusters * sizeof(floating)) + + for chunk_idx in prange(n_chunks, schedule='static'): + start = chunk_idx * n_samples_chunk + if chunk_idx == n_chunks - 1 and n_samples_rem > 0: + end = start + n_samples_rem + else: + end = start + n_samples_chunk + + _update_chunk_dense( + X[start: end], + sample_weight[start: end], + centers_old, + centers_squared_norms, + labels[start: end], + centers_new_chunk, + weight_in_clusters_chunk, + pairwise_distances_chunk, + update_centers) + + # reduction from local buffers. + if update_centers: + # The lock is necessary to avoid race conditions when aggregating + # info from different thread-local buffers. + omp_set_lock(&lock) + for j in range(n_clusters): + weight_in_clusters[j] += weight_in_clusters_chunk[j] + for k in range(n_features): + centers_new[j, k] += centers_new_chunk[j * n_features + k] + + omp_unset_lock(&lock) + + free(centers_new_chunk) + free(weight_in_clusters_chunk) + free(pairwise_distances_chunk) + + if update_centers: + omp_destroy_lock(&lock) + _relocate_empty_clusters_dense( + X, sample_weight, centers_old, centers_new, weight_in_clusters, labels + ) + + _average_centers(centers_new, weight_in_clusters) + _center_shift(centers_old, centers_new, center_shift) + + +cdef void _update_chunk_dense( + const floating[:, ::1] X, # IN + const floating[::1] sample_weight, # IN + const floating[:, ::1] centers_old, # IN + const floating[::1] centers_squared_norms, # IN + int[::1] labels, # OUT + floating *centers_new, # OUT + floating *weight_in_clusters, # OUT + floating *pairwise_distances, # OUT + bint update_centers) noexcept nogil: + """K-means combined EM step for one dense data chunk. 
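lloyd_iter_chunked_dense above splits the samples into CHUNK_SIZE blocks, lets each OpenMP thread accumulate partial center sums in its own calloc'd buffers, and merges them under a lock. A rough single-threaded Python equivalent of that accumulate-then-reduce pattern, assuming a hypothetical per-chunk helper `update_chunk` (sketch only):

    import numpy as np

    CHUNK_SIZE = 256  # plays the role of _k_means_common.CHUNK_SIZE; value illustrative

    def lloyd_iteration_chunked(X, sample_weight, centers, update_chunk):
        n_samples = X.shape[0]
        n_clusters = centers.shape[0]
        labels = np.full(n_samples, -1, dtype=np.int32)
        centers_new = np.zeros_like(centers)
        weight_in_clusters = np.zeros(n_clusters, dtype=X.dtype)

        for start in range(0, n_samples, CHUNK_SIZE):
            end = min(start + CHUNK_SIZE, n_samples)
            # Per-chunk buffers stand in for the thread-local arrays; in the
            # Cython version each OpenMP thread owns one pair of them.
            centers_chunk = np.zeros_like(centers)
            weight_chunk = np.zeros(n_clusters, dtype=X.dtype)
            labels[start:end] = update_chunk(
                X[start:end], sample_weight[start:end], centers,
                centers_chunk, weight_chunk)
            # Reduction; the omp lock makes the same step thread-safe above.
            centers_new += centers_chunk
            weight_in_clusters += weight_chunk

        # _average_centers divides by the accumulated weights (empty clusters
        # are handled separately by the relocation helper).
        nonzero = weight_in_clusters > 0
        centers_new[nonzero] /= weight_in_clusters[nonzero][:, np.newaxis]
        return labels, centers_new, weight_in_clusters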
+ + Compute the partial contribution of a single data chunk to the labels and + centers. + """ + cdef: + int n_samples = labels.shape[0] + int n_clusters = centers_old.shape[0] + int n_features = centers_old.shape[1] + + floating sq_dist, min_sq_dist + int i, j, k, label + + # Instead of computing the full pairwise squared distances matrix, + # ||X - C||² = ||X||² - 2 X.C^T + ||C||², we only need to store + # the - 2 X.C^T + ||C||² term since the argmin for a given sample only + # depends on the centers. + # pairwise_distances = ||C||² + for i in range(n_samples): + for j in range(n_clusters): + pairwise_distances[i * n_clusters + j] = centers_squared_norms[j] + + # pairwise_distances += -2 * X.dot(C.T) + _gemm(RowMajor, NoTrans, Trans, n_samples, n_clusters, n_features, + -2.0, &X[0, 0], n_features, ¢ers_old[0, 0], n_features, + 1.0, pairwise_distances, n_clusters) + + for i in range(n_samples): + min_sq_dist = pairwise_distances[i * n_clusters] + label = 0 + for j in range(1, n_clusters): + sq_dist = pairwise_distances[i * n_clusters + j] + if sq_dist < min_sq_dist: + min_sq_dist = sq_dist + label = j + labels[i] = label + + if update_centers: + weight_in_clusters[label] += sample_weight[i] + for k in range(n_features): + centers_new[label * n_features + k] += X[i, k] * sample_weight[i] + + +def lloyd_iter_chunked_sparse( + X, # IN + const floating[::1] sample_weight, # IN + const floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # OUT + floating[::1] weight_in_clusters, # OUT + int[::1] labels, # OUT + floating[::1] center_shift, # OUT + int n_threads, + bint update_centers=True): + """Single iteration of K-means lloyd algorithm with sparse input. + + Update labels and centers (inplace), for one iteration, distributed + over data chunks. + + Parameters + ---------- + X : sparse matrix of shape (n_samples, n_features), dtype=floating + The observations to cluster. Must be in CSR format. + + sample_weight : ndarray of shape (n_samples,), dtype=floating + The weights for each observation in X. + + centers_old : ndarray of shape (n_clusters, n_features), dtype=floating + Centers before previous iteration, placeholder for the centers after + previous iteration. + + centers_new : ndarray of shape (n_clusters, n_features), dtype=floating + Centers after previous iteration, placeholder for the new centers + computed during this iteration. `centers_new` can be `None` if + `update_centers` is False. + + weight_in_clusters : ndarray of shape (n_clusters,), dtype=floating + Placeholder for the sums of the weights of every observation assigned + to each center. `weight_in_clusters` can be `None` if `update_centers` + is False. + + labels : ndarray of shape (n_samples,), dtype=int + labels assignment. + + center_shift : ndarray of shape (n_clusters,), dtype=floating + Distance between old and new centers. + + n_threads : int + The number of threads to be used by openmp. + + update_centers : bool + - If True, the labels and the new centers will be computed, i.e. runs + the E-step and the M-step of the algorithm. + - If False, only the labels will be computed, i.e runs the E-step of + the algorithm. This is useful especially when calling predict on a + fitted model. + """ + cdef: + int n_samples = X.shape[0] + int n_features = X.shape[1] + int n_clusters = centers_old.shape[0] + + if n_samples == 0: + # An empty array was passed, do nothing and return early (before + # attempting to compute n_chunks). 
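_update_chunk_dense above never materialises the full ||x - c||^2 matrix: because the argmin over centers does not depend on ||x||^2, a single GEMM producing ||c_j||^2 - 2<x_i, c_j> is enough to label a chunk. A NumPy sketch of the same shortcut (illustrative, not the library's API); if true squared distances are needed later, e.g. for inertia, the dropped ||x_i||^2 term has to be added back per row:

    import numpy as np

    def assign_labels_dense(X_chunk, centers):
        # pairwise[i, j] = ||c_j||^2 - 2 <x_i, c_j>; the missing ||x_i||^2 is
        # constant within row i, so it does not change the argmin over j.
        centers_sq_norms = np.einsum("ij,ij->i", centers, centers)
        pairwise = centers_sq_norms[np.newaxis, :] - 2.0 * (X_chunk @ centers.T)
        return np.argmin(pairwise, axis=1).astype(np.int32)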
This can typically happen when + # calling the prediction function of a bisecting k-means model with a + # large fraction of outliers. + return + + cdef: + # Choose same as for dense. Does not have the same impact since with + # sparse data the pairwise distances matrix is not precomputed. + # However, splitting in chunks is necessary to get parallelism. + int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples + int n_chunks = n_samples // n_samples_chunk + int n_samples_rem = n_samples % n_samples_chunk + int chunk_idx + int start = 0, end = 0 + + int j, k + + floating[::1] X_data = X.data + int[::1] X_indices = X.indices + int[::1] X_indptr = X.indptr + + floating[::1] centers_squared_norms = row_norms(centers_old, squared=True) + + floating *centers_new_chunk + floating *weight_in_clusters_chunk + + omp_lock_t lock + + # count remainder chunk in total number of chunks + n_chunks += n_samples != n_chunks * n_samples_chunk + + # number of threads should not be bigger than number of chunks + n_threads = min(n_threads, n_chunks) + + if update_centers: + memset(¢ers_new[0, 0], 0, n_clusters * n_features * sizeof(floating)) + memset(&weight_in_clusters[0], 0, n_clusters * sizeof(floating)) + omp_init_lock(&lock) + + with nogil, parallel(num_threads=n_threads): + # thread local buffers + centers_new_chunk = calloc(n_clusters * n_features, sizeof(floating)) + weight_in_clusters_chunk = calloc(n_clusters, sizeof(floating)) + + for chunk_idx in prange(n_chunks, schedule='static'): + start = chunk_idx * n_samples_chunk + if chunk_idx == n_chunks - 1 and n_samples_rem > 0: + end = start + n_samples_rem + else: + end = start + n_samples_chunk + + _update_chunk_sparse( + X_data[X_indptr[start]: X_indptr[end]], + X_indices[X_indptr[start]: X_indptr[end]], + X_indptr[start: end+1], + sample_weight[start: end], + centers_old, + centers_squared_norms, + labels[start: end], + centers_new_chunk, + weight_in_clusters_chunk, + update_centers) + + # reduction from local buffers. + if update_centers: + # The lock is necessary to avoid race conditions when aggregating + # info from different thread-local buffers. + omp_set_lock(&lock) + for j in range(n_clusters): + weight_in_clusters[j] += weight_in_clusters_chunk[j] + for k in range(n_features): + centers_new[j, k] += centers_new_chunk[j * n_features + k] + omp_unset_lock(&lock) + + free(centers_new_chunk) + free(weight_in_clusters_chunk) + + if update_centers: + omp_destroy_lock(&lock) + _relocate_empty_clusters_sparse( + X_data, X_indices, X_indptr, sample_weight, + centers_old, centers_new, weight_in_clusters, labels) + + _average_centers(centers_new, weight_in_clusters) + _center_shift(centers_old, centers_new, center_shift) + + +cdef void _update_chunk_sparse( + const floating[::1] X_data, # IN + const int[::1] X_indices, # IN + const int[::1] X_indptr, # IN + const floating[::1] sample_weight, # IN + const floating[:, ::1] centers_old, # IN + const floating[::1] centers_squared_norms, # IN + int[::1] labels, # OUT + floating *centers_new, # OUT + floating *weight_in_clusters, # OUT + bint update_centers) noexcept nogil: + """K-means combined EM step for one sparse data chunk. + + Compute the partial contribution of a single data chunk to the labels and + centers. 
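lloyd_iter_chunked_sparse above slices the CSR arrays per chunk and passes the chunk's own indptr slice along, which is why the chunk kernels re-base row offsets with `s = X_indptr[0]`. A small generator showing the equivalent slicing on a SciPy CSR matrix (sketch only):

    def iter_csr_chunks(X_csr, chunk_size):
        # X_csr: a scipy.sparse CSR matrix; yields (data, indices, indptr) views
        # for consecutive row chunks.
        indptr = X_csr.indptr
        n_rows = X_csr.shape[0]
        for start in range(0, n_rows, chunk_size):
            end = min(start + chunk_size, n_rows)
            yield (X_csr.data[indptr[start]:indptr[end]],
                   X_csr.indices[indptr[start]:indptr[end]],
                   indptr[start:end + 1])
            # Within a chunk, row k's entries live at
            # indptr[k] - indptr[start] .. indptr[k + 1] - indptr[start]
            # of the sliced data/indices, exactly the `- s` offset used above.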
+ """ + cdef: + int n_samples = labels.shape[0] + int n_clusters = centers_old.shape[0] + int n_features = centers_old.shape[1] + + floating sq_dist, min_sq_dist + int i, j, k, label + floating max_floating = FLT_MAX if floating is float else DBL_MAX + int s = X_indptr[0] + + # XXX Precompute the pairwise distances matrix is not worth for sparse + # currently. Should be tested when BLAS (sparse x dense) matrix + # multiplication is available. + for i in range(n_samples): + min_sq_dist = max_floating + label = 0 + + for j in range(n_clusters): + sq_dist = 0.0 + for k in range(X_indptr[i] - s, X_indptr[i + 1] - s): + sq_dist += centers_old[j, X_indices[k]] * X_data[k] + + # Instead of computing the full squared distance with each cluster, + # ||X - C||² = ||X||² - 2 X.C^T + ||C||², we only need to compute + # the - 2 X.C^T + ||C||² term since the argmin for a given sample + # only depends on the centers C. + sq_dist = centers_squared_norms[j] -2 * sq_dist + if sq_dist < min_sq_dist: + min_sq_dist = sq_dist + label = j + + labels[i] = label + + if update_centers: + weight_in_clusters[label] += sample_weight[i] + for k in range(X_indptr[i] - s, X_indptr[i + 1] - s): + centers_new[label * n_features + X_indices[k]] += X_data[k] * sample_weight[i] diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_k_means_minibatch.pyx b/.venv/lib/python3.12/site-packages/sklearn/cluster/_k_means_minibatch.pyx new file mode 100644 index 0000000000000000000000000000000000000000..22ca5255e3889574d7155f1e077f84111832cf92 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_k_means_minibatch.pyx @@ -0,0 +1,218 @@ +from cython cimport floating +from cython.parallel cimport parallel, prange +from libc.stdlib cimport malloc, free + + +def _minibatch_update_dense( + const floating[:, ::1] X, # IN + const floating[::1] sample_weight, # IN + const floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # OUT + floating[::1] weight_sums, # INOUT + const int[::1] labels, # IN + int n_threads): + """Update of the centers for dense MiniBatchKMeans. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features), dtype=floating + The observations to cluster. + + sample_weight : ndarray of shape (n_samples,), dtype=floating + The weights for each observation in X. + + centers_old : ndarray of shape (n_clusters, n_features), dtype=floating + Centers before previous iteration, placeholder for the centers after + previous iteration. + + centers_new : ndarray of shape (n_clusters, n_features), dtype=floating + Centers after previous iteration, placeholder for the new centers + computed during this iteration. + + weight_sums : ndarray of shape (n_clusters,), dtype=floating + Current sums of the accumulated weights for each center. + + labels : ndarray of shape (n_samples,), dtype=int + labels assignment. + + n_threads : int + The number of threads to be used by openmp. 
+ """ + cdef: + int n_samples = X.shape[0] + int n_clusters = centers_old.shape[0] + int cluster_idx + + int *indices + + with nogil, parallel(num_threads=n_threads): + indices = malloc(n_samples * sizeof(int)) + + for cluster_idx in prange(n_clusters, schedule="static"): + update_center_dense(cluster_idx, X, sample_weight, + centers_old, centers_new, weight_sums, labels, + indices) + + free(indices) + + +cdef void update_center_dense( + int cluster_idx, + const floating[:, ::1] X, # IN + const floating[::1] sample_weight, # IN + const floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # OUT + floating[::1] weight_sums, # INOUT + const int[::1] labels, # IN + int *indices) noexcept nogil: # TMP + """Update of a single center for dense MinibatchKMeans""" + cdef: + int n_samples = sample_weight.shape[0] + int n_features = centers_old.shape[1] + floating alpha + int n_indices + int k, sample_idx, feature_idx + + floating wsum = 0 + + # indices = np.where(labels == cluster_idx)[0] + k = 0 + for sample_idx in range(n_samples): + if labels[sample_idx] == cluster_idx: + indices[k] = sample_idx + wsum += sample_weight[sample_idx] + k += 1 + n_indices = k + + if wsum > 0: + # Undo the previous count-based scaling for this cluster center + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] = centers_old[cluster_idx, feature_idx] * weight_sums[cluster_idx] + + # Update cluster with new point members + for k in range(n_indices): + sample_idx = indices[k] + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] += X[sample_idx, feature_idx] * sample_weight[sample_idx] + + # Update the count statistics for this center + weight_sums[cluster_idx] += wsum + + # Rescale to compute mean of all points (old and new) + alpha = 1 / weight_sums[cluster_idx] + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] *= alpha + else: + # No sample was assigned to this cluster in this batch of data + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] = centers_old[cluster_idx, feature_idx] + + +def _minibatch_update_sparse( + X, # IN + const floating[::1] sample_weight, # IN + const floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # OUT + floating[::1] weight_sums, # INOUT + const int[::1] labels, # IN + int n_threads): + """Update of the centers for sparse MiniBatchKMeans. + + Parameters + ---------- + X : sparse matrix of shape (n_samples, n_features), dtype=floating + The observations to cluster. Must be in CSR format. + + sample_weight : ndarray of shape (n_samples,), dtype=floating + The weights for each observation in X. + + centers_old : ndarray of shape (n_clusters, n_features), dtype=floating + Centers before previous iteration, placeholder for the centers after + previous iteration. + + centers_new : ndarray of shape (n_clusters, n_features), dtype=floating + Centers after previous iteration, placeholder for the new centers + computed during this iteration. + + weight_sums : ndarray of shape (n_clusters,), dtype=floating + Current sums of the accumulated weights for each center. + + labels : ndarray of shape (n_samples,), dtype=int + labels assignment. + + n_threads : int + The number of threads to be used by openmp. 
+ """ + cdef: + floating[::1] X_data = X.data + int[::1] X_indices = X.indices + int[::1] X_indptr = X.indptr + int n_samples = X.shape[0] + int n_clusters = centers_old.shape[0] + int cluster_idx + + int *indices + + with nogil, parallel(num_threads=n_threads): + indices = malloc(n_samples * sizeof(int)) + + for cluster_idx in prange(n_clusters, schedule="static"): + update_center_sparse(cluster_idx, X_data, X_indices, X_indptr, + sample_weight, centers_old, centers_new, + weight_sums, labels, indices) + + free(indices) + + +cdef void update_center_sparse( + int cluster_idx, + const floating[::1] X_data, # IN + const int[::1] X_indices, # IN + const int[::1] X_indptr, # IN + const floating[::1] sample_weight, # IN + const floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # OUT + floating[::1] weight_sums, # INOUT + const int[::1] labels, # IN + int *indices) noexcept nogil: # TMP + """Update of a single center for sparse MinibatchKMeans""" + cdef: + int n_samples = sample_weight.shape[0] + int n_features = centers_old.shape[1] + floating alpha + int n_indices + int k, sample_idx, feature_idx + + floating wsum = 0 + + # indices = np.where(labels == cluster_idx)[0] + k = 0 + for sample_idx in range(n_samples): + if labels[sample_idx] == cluster_idx: + indices[k] = sample_idx + wsum += sample_weight[sample_idx] + k += 1 + n_indices = k + + if wsum > 0: + # Undo the previous count-based scaling for this cluster center: + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] = centers_old[cluster_idx, feature_idx] * weight_sums[cluster_idx] + + # Update cluster with new point members + for k in range(n_indices): + sample_idx = indices[k] + for feature_idx in range(X_indptr[sample_idx], X_indptr[sample_idx + 1]): + centers_new[cluster_idx, X_indices[feature_idx]] += X_data[feature_idx] * sample_weight[sample_idx] + + # Update the count statistics for this center + weight_sums[cluster_idx] += wsum + + # Rescale to compute mean of all points (old and new) + alpha = 1 / weight_sums[cluster_idx] + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] *= alpha + else: + # No sample was assigned to this cluster in this batch of data + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] = centers_old[cluster_idx, feature_idx] diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_kmeans.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/_kmeans.py new file mode 100644 index 0000000000000000000000000000000000000000..11c85610239ccae163137a1ced0f990325864390 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_kmeans.py @@ -0,0 +1,2303 @@ +"""K-means clustering.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from abc import ABC, abstractmethod +from numbers import Integral, Real + +import numpy as np +import scipy.sparse as sp + +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + ClusterMixin, + TransformerMixin, + _fit_context, +) +from ..exceptions import ConvergenceWarning +from ..metrics.pairwise import _euclidean_distances, euclidean_distances +from ..utils import check_array, check_random_state +from ..utils._openmp_helpers import _openmp_effective_n_threads +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.extmath import row_norms, stable_cumsum +from ..utils.parallel import ( + _get_threadpool_controller, + _threadpool_controller_decorator, +) +from 
..utils.sparsefuncs import mean_variance_axis +from ..utils.sparsefuncs_fast import assign_rows_csr +from ..utils.validation import ( + _check_sample_weight, + _is_arraylike_not_scalar, + check_is_fitted, + validate_data, +) +from ._k_means_common import ( + CHUNK_SIZE, + _inertia_dense, + _inertia_sparse, + _is_same_clustering, +) +from ._k_means_elkan import ( + elkan_iter_chunked_dense, + elkan_iter_chunked_sparse, + init_bounds_dense, + init_bounds_sparse, +) +from ._k_means_lloyd import lloyd_iter_chunked_dense, lloyd_iter_chunked_sparse +from ._k_means_minibatch import _minibatch_update_dense, _minibatch_update_sparse + +############################################################################### +# Initialization heuristic + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "n_clusters": [Interval(Integral, 1, None, closed="left")], + "sample_weight": ["array-like", None], + "x_squared_norms": ["array-like", None], + "random_state": ["random_state"], + "n_local_trials": [Interval(Integral, 1, None, closed="left"), None], + }, + prefer_skip_nested_validation=True, +) +def kmeans_plusplus( + X, + n_clusters, + *, + sample_weight=None, + x_squared_norms=None, + random_state=None, + n_local_trials=None, +): + """Init n_clusters seeds according to k-means++. + + .. versionadded:: 0.24 + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to pick seeds from. + + n_clusters : int + The number of centroids to initialize. + + sample_weight : array-like of shape (n_samples,), default=None + The weights for each observation in `X`. If `None`, all observations + are assigned equal weight. `sample_weight` is ignored if `init` + is a callable or a user provided array. + + .. versionadded:: 1.3 + + x_squared_norms : array-like of shape (n_samples,), default=None + Squared Euclidean norm of each data point. + + random_state : int or RandomState instance, default=None + Determines random number generation for centroid initialization. Pass + an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + n_local_trials : int, default=None + The number of seeding trials for each center (except the first), + of which the one reducing inertia the most is greedily chosen. + Set to None to make the number of trials depend logarithmically + on the number of seeds (2+log(k)) which is the recommended setting. + Setting to 1 disables the greedy cluster selection and recovers the + vanilla k-means++ algorithm which was empirically shown to work less + well than its greedy variant. + + Returns + ------- + centers : ndarray of shape (n_clusters, n_features) + The initial centers for k-means. + + indices : ndarray of shape (n_clusters,) + The index location of the chosen centers in the data array X. For a + given index and center, X[index] = center. + + Notes + ----- + Selects initial cluster centers for k-mean clustering in a smart way + to speed up convergence. see: Arthur, D. and Vassilvitskii, S. + "k-means++: the advantages of careful seeding". ACM-SIAM symposium + on Discrete algorithms. 2007 + + Examples + -------- + + >>> from sklearn.cluster import kmeans_plusplus + >>> import numpy as np + >>> X = np.array([[1, 2], [1, 4], [1, 0], + ... 
[10, 2], [10, 4], [10, 0]]) + >>> centers, indices = kmeans_plusplus(X, n_clusters=2, random_state=0) + >>> centers + array([[10, 2], + [ 1, 0]]) + >>> indices + array([3, 2]) + """ + # Check data + check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + + if X.shape[0] < n_clusters: + raise ValueError( + f"n_samples={X.shape[0]} should be >= n_clusters={n_clusters}." + ) + + # Check parameters + if x_squared_norms is None: + x_squared_norms = row_norms(X, squared=True) + else: + x_squared_norms = check_array(x_squared_norms, dtype=X.dtype, ensure_2d=False) + + if x_squared_norms.shape[0] != X.shape[0]: + raise ValueError( + f"The length of x_squared_norms {x_squared_norms.shape[0]} should " + f"be equal to the length of n_samples {X.shape[0]}." + ) + + random_state = check_random_state(random_state) + + # Call private k-means++ + centers, indices = _kmeans_plusplus( + X, n_clusters, x_squared_norms, sample_weight, random_state, n_local_trials + ) + + return centers, indices + + +def _kmeans_plusplus( + X, n_clusters, x_squared_norms, sample_weight, random_state, n_local_trials=None +): + """Computational component for initialization of n_clusters by + k-means++. Prior validation of data is assumed. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + The data to pick seeds for. + + n_clusters : int + The number of seeds to choose. + + sample_weight : ndarray of shape (n_samples,) + The weights for each observation in `X`. + + x_squared_norms : ndarray of shape (n_samples,) + Squared Euclidean norm of each data point. + + random_state : RandomState instance + The generator used to initialize the centers. + See :term:`Glossary `. + + n_local_trials : int, default=None + The number of seeding trials for each center (except the first), + of which the one reducing inertia the most is greedily chosen. + Set to None to make the number of trials depend logarithmically + on the number of seeds (2+log(k)); this is the default. + + Returns + ------- + centers : ndarray of shape (n_clusters, n_features) + The initial centers for k-means. + + indices : ndarray of shape (n_clusters,) + The index location of the chosen centers in the data array X. For a + given index and center, X[index] = center. + """ + n_samples, n_features = X.shape + + centers = np.empty((n_clusters, n_features), dtype=X.dtype) + + # Set the number of local seeding trials if none is given + if n_local_trials is None: + # This is what Arthur/Vassilvitskii tried, but did not report + # specific results for other than mentioning in the conclusion + # that it helped. 
+ n_local_trials = 2 + int(np.log(n_clusters)) + + # Pick first center randomly and track index of point + center_id = random_state.choice(n_samples, p=sample_weight / sample_weight.sum()) + indices = np.full(n_clusters, -1, dtype=int) + if sp.issparse(X): + centers[0] = X[[center_id]].toarray() + else: + centers[0] = X[center_id] + indices[0] = center_id + + # Initialize list of closest distances and calculate current potential + closest_dist_sq = _euclidean_distances( + centers[0, np.newaxis], X, Y_norm_squared=x_squared_norms, squared=True + ) + current_pot = closest_dist_sq @ sample_weight + + # Pick the remaining n_clusters-1 points + for c in range(1, n_clusters): + # Choose center candidates by sampling with probability proportional + # to the squared distance to the closest existing center + rand_vals = random_state.uniform(size=n_local_trials) * current_pot + candidate_ids = np.searchsorted( + stable_cumsum(sample_weight * closest_dist_sq), rand_vals + ) + # XXX: numerical imprecision can result in a candidate_id out of range + np.clip(candidate_ids, None, closest_dist_sq.size - 1, out=candidate_ids) + + # Compute distances to center candidates + distance_to_candidates = _euclidean_distances( + X[candidate_ids], X, Y_norm_squared=x_squared_norms, squared=True + ) + + # update closest distances squared and potential for each candidate + np.minimum(closest_dist_sq, distance_to_candidates, out=distance_to_candidates) + candidates_pot = distance_to_candidates @ sample_weight.reshape(-1, 1) + + # Decide which candidate is the best + best_candidate = np.argmin(candidates_pot) + current_pot = candidates_pot[best_candidate] + closest_dist_sq = distance_to_candidates[best_candidate] + best_candidate = candidate_ids[best_candidate] + + # Permanently add best center candidate found in local tries + if sp.issparse(X): + centers[c] = X[[best_candidate]].toarray() + else: + centers[c] = X[best_candidate] + indices[c] = best_candidate + + return centers, indices + + +############################################################################### +# K-means batch estimation by EM (expectation maximization) + + +def _tolerance(X, tol): + """Return a tolerance which is dependent on the dataset.""" + if tol == 0: + return 0 + if sp.issparse(X): + variances = mean_variance_axis(X, axis=0)[1] + else: + variances = np.var(X, axis=0) + return np.mean(variances) * tol + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "sample_weight": ["array-like", None], + "return_n_iter": [bool], + }, + prefer_skip_nested_validation=False, +) +def k_means( + X, + n_clusters, + *, + sample_weight=None, + init="k-means++", + n_init="auto", + max_iter=300, + verbose=False, + tol=1e-4, + random_state=None, + copy_x=True, + algorithm="lloyd", + return_n_iter=False, +): + """Perform K-means clustering algorithm. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The observations to cluster. It must be noted that the data + will be converted to C ordering, which will cause a memory copy + if the given data is not C-contiguous. + + n_clusters : int + The number of clusters to form as well as the number of + centroids to generate. + + sample_weight : array-like of shape (n_samples,), default=None + The weights for each observation in `X`. If `None`, all observations + are assigned equal weight. `sample_weight` is not used during + initialization if `init` is a callable or a user provided array. 
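The greedy k-means++ loop above draws each new seed with probability proportional to the weighted squared distance to the closest already-chosen seed, tries `n_local_trials` candidates (2 + log(k) by default) and keeps the one that lowers the potential the most. A toy NumPy version of one such step, assuming dense data and a NumPy random generator `rng` (sketch only):

    import numpy as np

    def kmeanspp_step(X, sample_weight, closest_dist_sq, current_pot, rng, n_local_trials):
        # Sample candidate indices with probability ~ sample_weight * D^2.
        rand_vals = rng.uniform(size=n_local_trials) * current_pot
        cdf = np.cumsum(sample_weight * closest_dist_sq)
        candidate_ids = np.minimum(np.searchsorted(cdf, rand_vals), len(cdf) - 1)

        # Potential left behind by each candidate if it became the next seed.
        dists = ((X[candidate_ids, None, :] - X[None, :, :]) ** 2).sum(axis=-1)
        new_closest = np.minimum(closest_dist_sq, dists)        # (n_trials, n_samples)
        candidates_pot = new_closest @ sample_weight

        best = int(np.argmin(candidates_pot))
        return candidate_ids[best], new_closest[best], candidates_pot[best]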
+ + init : {'k-means++', 'random'}, callable or array-like of shape \ + (n_clusters, n_features), default='k-means++' + Method for initialization: + + - `'k-means++'` : selects initial cluster centers for k-mean + clustering in a smart way to speed up convergence. See section + Notes in k_init for more details. + - `'random'`: choose `n_clusters` observations (rows) at random from data + for the initial centroids. + - If an array is passed, it should be of shape `(n_clusters, n_features)` + and gives the initial centers. + - If a callable is passed, it should take arguments `X`, `n_clusters` and a + random state and return an initialization. + + n_init : 'auto' or int, default="auto" + Number of time the k-means algorithm will be run with different + centroid seeds. The final results will be the best output of + n_init consecutive runs in terms of inertia. + + When `n_init='auto'`, the number of runs depends on the value of init: + 10 if using `init='random'` or `init` is a callable; + 1 if using `init='k-means++'` or `init` is an array-like. + + .. versionadded:: 1.2 + Added 'auto' option for `n_init`. + + .. versionchanged:: 1.4 + Default value for `n_init` changed to `'auto'`. + + max_iter : int, default=300 + Maximum number of iterations of the k-means algorithm to run. + + verbose : bool, default=False + Verbosity mode. + + tol : float, default=1e-4 + Relative tolerance with regards to Frobenius norm of the difference + in the cluster centers of two consecutive iterations to declare + convergence. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for centroid initialization. Use + an int to make the randomness deterministic. + See :term:`Glossary `. + + copy_x : bool, default=True + When pre-computing distances it is more numerically accurate to center + the data first. If `copy_x` is True (default), then the original data is + not modified. If False, the original data is modified, and put back + before the function returns, but small numerical differences may be + introduced by subtracting and then adding the data mean. Note that if + the original data is not C-contiguous, a copy will be made even if + `copy_x` is False. If the original data is sparse, but not in CSR format, + a copy will be made even if `copy_x` is False. + + algorithm : {"lloyd", "elkan"}, default="lloyd" + K-means algorithm to use. The classical EM-style algorithm is `"lloyd"`. + The `"elkan"` variation can be more efficient on some datasets with + well-defined clusters, by using the triangle inequality. However it's + more memory intensive due to the allocation of an extra array of shape + `(n_samples, n_clusters)`. + + .. versionchanged:: 0.18 + Added Elkan algorithm + + .. versionchanged:: 1.1 + Renamed "full" to "lloyd", and deprecated "auto" and "full". + Changed "auto" to use "lloyd" instead of "elkan". + + return_n_iter : bool, default=False + Whether or not to return the number of iterations. + + Returns + ------- + centroid : ndarray of shape (n_clusters, n_features) + Centroids found at the last iteration of k-means. + + label : ndarray of shape (n_samples,) + The `label[i]` is the code or index of the centroid the + i'th observation is closest to. + + inertia : float + The final value of the inertia criterion (sum of squared distances to + the closest centroid for all observations in the training set). + + best_n_iter : int + Number of iterations corresponding to the best results. + Returned only if `return_n_iter` is set to True. 
+ + Examples + -------- + >>> import numpy as np + >>> from sklearn.cluster import k_means + >>> X = np.array([[1, 2], [1, 4], [1, 0], + ... [10, 2], [10, 4], [10, 0]]) + >>> centroid, label, inertia = k_means( + ... X, n_clusters=2, n_init="auto", random_state=0 + ... ) + >>> centroid + array([[10., 2.], + [ 1., 2.]]) + >>> label + array([1, 1, 1, 0, 0, 0], dtype=int32) + >>> inertia + 16.0 + """ + est = KMeans( + n_clusters=n_clusters, + init=init, + n_init=n_init, + max_iter=max_iter, + verbose=verbose, + tol=tol, + random_state=random_state, + copy_x=copy_x, + algorithm=algorithm, + ).fit(X, sample_weight=sample_weight) + if return_n_iter: + return est.cluster_centers_, est.labels_, est.inertia_, est.n_iter_ + else: + return est.cluster_centers_, est.labels_, est.inertia_ + + +def _kmeans_single_elkan( + X, + sample_weight, + centers_init, + max_iter=300, + verbose=False, + tol=1e-4, + n_threads=1, +): + """A single run of k-means elkan, assumes preparation completed prior. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + The observations to cluster. If sparse matrix, must be in CSR format. + + sample_weight : array-like of shape (n_samples,) + The weights for each observation in X. + + centers_init : ndarray of shape (n_clusters, n_features) + The initial centers. + + max_iter : int, default=300 + Maximum number of iterations of the k-means algorithm to run. + + verbose : bool, default=False + Verbosity mode. + + tol : float, default=1e-4 + Relative tolerance with regards to Frobenius norm of the difference + in the cluster centers of two consecutive iterations to declare + convergence. + It's not advised to set `tol=0` since convergence might never be + declared due to rounding errors. Use a very small number instead. + + n_threads : int, default=1 + The number of OpenMP threads to use for the computation. Parallelism is + sample-wise on the main cython loop which assigns each sample to its + closest center. + + Returns + ------- + centroid : ndarray of shape (n_clusters, n_features) + Centroids found at the last iteration of k-means. + + label : ndarray of shape (n_samples,) + label[i] is the code or index of the centroid the + i'th observation is closest to. + + inertia : float + The final value of the inertia criterion (sum of squared distances to + the closest centroid for all observations in the training set). + + n_iter : int + Number of iterations run. + """ + n_samples = X.shape[0] + n_clusters = centers_init.shape[0] + + # Buffers to avoid new allocations at each iteration. 
+ centers = centers_init + centers_new = np.zeros_like(centers) + weight_in_clusters = np.zeros(n_clusters, dtype=X.dtype) + labels = np.full(n_samples, -1, dtype=np.int32) + labels_old = labels.copy() + center_half_distances = euclidean_distances(centers) / 2 + distance_next_center = np.partition( + np.asarray(center_half_distances), kth=1, axis=0 + )[1] + upper_bounds = np.zeros(n_samples, dtype=X.dtype) + lower_bounds = np.zeros((n_samples, n_clusters), dtype=X.dtype) + center_shift = np.zeros(n_clusters, dtype=X.dtype) + + if sp.issparse(X): + init_bounds = init_bounds_sparse + elkan_iter = elkan_iter_chunked_sparse + _inertia = _inertia_sparse + else: + init_bounds = init_bounds_dense + elkan_iter = elkan_iter_chunked_dense + _inertia = _inertia_dense + + init_bounds( + X, + centers, + center_half_distances, + labels, + upper_bounds, + lower_bounds, + n_threads=n_threads, + ) + + strict_convergence = False + + for i in range(max_iter): + elkan_iter( + X, + sample_weight, + centers, + centers_new, + weight_in_clusters, + center_half_distances, + distance_next_center, + upper_bounds, + lower_bounds, + labels, + center_shift, + n_threads, + ) + + # compute new pairwise distances between centers and closest other + # center of each center for next iterations + center_half_distances = euclidean_distances(centers_new) / 2 + distance_next_center = np.partition( + np.asarray(center_half_distances), kth=1, axis=0 + )[1] + + if verbose: + inertia = _inertia(X, sample_weight, centers, labels, n_threads) + print(f"Iteration {i}, inertia {inertia}") + + centers, centers_new = centers_new, centers + + if np.array_equal(labels, labels_old): + # First check the labels for strict convergence. + if verbose: + print(f"Converged at iteration {i}: strict convergence.") + strict_convergence = True + break + else: + # No strict convergence, check for tol based convergence. + center_shift_tot = (center_shift**2).sum() + if center_shift_tot <= tol: + if verbose: + print( + f"Converged at iteration {i}: center shift " + f"{center_shift_tot} within tolerance {tol}." + ) + break + + labels_old[:] = labels + + if not strict_convergence: + # rerun E-step so that predicted labels match cluster centers + elkan_iter( + X, + sample_weight, + centers, + centers, + weight_in_clusters, + center_half_distances, + distance_next_center, + upper_bounds, + lower_bounds, + labels, + center_shift, + n_threads, + update_centers=False, + ) + + inertia = _inertia(X, sample_weight, centers, labels, n_threads) + + return labels, inertia, centers, i + 1 + + +# Threadpoolctl context to limit the number of threads in second level of +# nested parallelism (i.e. BLAS) to avoid oversubscription. +@_threadpool_controller_decorator(limits=1, user_api="blas") +def _kmeans_single_lloyd( + X, + sample_weight, + centers_init, + max_iter=300, + verbose=False, + tol=1e-4, + n_threads=1, +): + """A single run of k-means lloyd, assumes preparation completed prior. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + The observations to cluster. If sparse matrix, must be in CSR format. + + sample_weight : ndarray of shape (n_samples,) + The weights for each observation in X. + + centers_init : ndarray of shape (n_clusters, n_features) + The initial centers. + + max_iter : int, default=300 + Maximum number of iterations of the k-means algorithm to run. 
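Both single-run drivers stop either on strict convergence (labels identical between two iterations) or when the summed squared center shift falls below the scaled tolerance; unless strict convergence was reached, one extra E-step is run so the returned labels match the returned centers. A condensed sketch of that control flow, with a hypothetical `one_iteration` callback standing in for the chunked Cython iteration (illustrative only):

    import numpy as np

    def run_until_converged(one_iteration, labels, max_iter, tol):
        labels_old = labels.copy()
        strict_convergence = False
        for i in range(max_iter):
            center_shift = one_iteration(labels)            # E-step + M-step; updates labels
            if np.array_equal(labels, labels_old):
                strict_convergence = True
                break
            if float((center_shift ** 2).sum()) <= tol:     # tol-based convergence
                break
            labels_old[:] = labels
        if not strict_convergence:
            one_iteration(labels, update_centers=False)     # final E-step only
        return i + 1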
+ + verbose : bool, default=False + Verbosity mode + + tol : float, default=1e-4 + Relative tolerance with regards to Frobenius norm of the difference + in the cluster centers of two consecutive iterations to declare + convergence. + It's not advised to set `tol=0` since convergence might never be + declared due to rounding errors. Use a very small number instead. + + n_threads : int, default=1 + The number of OpenMP threads to use for the computation. Parallelism is + sample-wise on the main cython loop which assigns each sample to its + closest center. + + Returns + ------- + centroid : ndarray of shape (n_clusters, n_features) + Centroids found at the last iteration of k-means. + + label : ndarray of shape (n_samples,) + label[i] is the code or index of the centroid the + i'th observation is closest to. + + inertia : float + The final value of the inertia criterion (sum of squared distances to + the closest centroid for all observations in the training set). + + n_iter : int + Number of iterations run. + """ + n_clusters = centers_init.shape[0] + + # Buffers to avoid new allocations at each iteration. + centers = centers_init + centers_new = np.zeros_like(centers) + labels = np.full(X.shape[0], -1, dtype=np.int32) + labels_old = labels.copy() + weight_in_clusters = np.zeros(n_clusters, dtype=X.dtype) + center_shift = np.zeros(n_clusters, dtype=X.dtype) + + if sp.issparse(X): + lloyd_iter = lloyd_iter_chunked_sparse + _inertia = _inertia_sparse + else: + lloyd_iter = lloyd_iter_chunked_dense + _inertia = _inertia_dense + + strict_convergence = False + + for i in range(max_iter): + lloyd_iter( + X, + sample_weight, + centers, + centers_new, + weight_in_clusters, + labels, + center_shift, + n_threads, + ) + + if verbose: + inertia = _inertia(X, sample_weight, centers, labels, n_threads) + print(f"Iteration {i}, inertia {inertia}.") + + centers, centers_new = centers_new, centers + + if np.array_equal(labels, labels_old): + # First check the labels for strict convergence. + if verbose: + print(f"Converged at iteration {i}: strict convergence.") + strict_convergence = True + break + else: + # No strict convergence, check for tol based convergence. + center_shift_tot = (center_shift**2).sum() + if center_shift_tot <= tol: + if verbose: + print( + f"Converged at iteration {i}: center shift " + f"{center_shift_tot} within tolerance {tol}." + ) + break + + labels_old[:] = labels + + if not strict_convergence: + # rerun E-step so that predicted labels match cluster centers + lloyd_iter( + X, + sample_weight, + centers, + centers, + weight_in_clusters, + labels, + center_shift, + n_threads, + update_centers=False, + ) + + inertia = _inertia(X, sample_weight, centers, labels, n_threads) + + return labels, inertia, centers, i + 1 + + +def _labels_inertia(X, sample_weight, centers, n_threads=1, return_inertia=True): + """E step of the K-means EM algorithm. + + Compute the labels and the inertia of the given samples and centers. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + The input samples to assign to the labels. If sparse matrix, must + be in CSR format. + + sample_weight : ndarray of shape (n_samples,) + The weights for each observation in X. + + x_squared_norms : ndarray of shape (n_samples,) + Precomputed squared euclidean norm of each data point, to speed up + computations. + + centers : ndarray of shape (n_clusters, n_features) + The cluster centers. + + n_threads : int, default=1 + The number of OpenMP threads to use for the computation. 
Parallelism is + sample-wise on the main cython loop which assigns each sample to its + closest center. + + return_inertia : bool, default=True + Whether to compute and return the inertia. + + Returns + ------- + labels : ndarray of shape (n_samples,) + The resulting assignment. + + inertia : float + Sum of squared distances of samples to their closest cluster center. + Inertia is only returned if return_inertia is True. + """ + n_samples = X.shape[0] + n_clusters = centers.shape[0] + + labels = np.full(n_samples, -1, dtype=np.int32) + center_shift = np.zeros(n_clusters, dtype=centers.dtype) + + if sp.issparse(X): + _labels = lloyd_iter_chunked_sparse + _inertia = _inertia_sparse + else: + _labels = lloyd_iter_chunked_dense + _inertia = _inertia_dense + + _labels( + X, + sample_weight, + centers, + centers_new=None, + weight_in_clusters=None, + labels=labels, + center_shift=center_shift, + n_threads=n_threads, + update_centers=False, + ) + + if return_inertia: + inertia = _inertia(X, sample_weight, centers, labels, n_threads) + return labels, inertia + + return labels + + +# Same as _labels_inertia but in a threadpool_limits context. +_labels_inertia_threadpool_limit = _threadpool_controller_decorator( + limits=1, user_api="blas" +)(_labels_inertia) + + +class _BaseKMeans( + ClassNamePrefixFeaturesOutMixin, TransformerMixin, ClusterMixin, BaseEstimator, ABC +): + """Base class for KMeans and MiniBatchKMeans""" + + _parameter_constraints: dict = { + "n_clusters": [Interval(Integral, 1, None, closed="left")], + "init": [StrOptions({"k-means++", "random"}), callable, "array-like"], + "n_init": [ + StrOptions({"auto"}), + Interval(Integral, 1, None, closed="left"), + ], + "max_iter": [Interval(Integral, 1, None, closed="left")], + "tol": [Interval(Real, 0, None, closed="left")], + "verbose": ["verbose"], + "random_state": ["random_state"], + } + + def __init__( + self, + n_clusters, + *, + init, + n_init, + max_iter, + tol, + verbose, + random_state, + ): + self.n_clusters = n_clusters + self.init = init + self.max_iter = max_iter + self.tol = tol + self.n_init = n_init + self.verbose = verbose + self.random_state = random_state + + def _check_params_vs_input(self, X, default_n_init=None): + # n_clusters + if X.shape[0] < self.n_clusters: + raise ValueError( + f"n_samples={X.shape[0]} should be >= n_clusters={self.n_clusters}." + ) + + # tol + self._tol = _tolerance(X, self.tol) + + # n-init + if self.n_init == "auto": + if isinstance(self.init, str) and self.init == "k-means++": + self._n_init = 1 + elif isinstance(self.init, str) and self.init == "random": + self._n_init = default_n_init + elif callable(self.init): + self._n_init = default_n_init + else: # array-like + self._n_init = 1 + else: + self._n_init = self.n_init + + if _is_arraylike_not_scalar(self.init) and self._n_init != 1: + warnings.warn( + ( + "Explicit initial center position passed: performing only" + f" one init in {self.__class__.__name__} instead of " + f"n_init={self._n_init}." + ), + RuntimeWarning, + stacklevel=2, + ) + self._n_init = 1 + + @abstractmethod + def _warn_mkl_vcomp(self, n_active_threads): + """Issue an estimator specific warning when vcomp and mkl are both present + + This method is called by `_check_mkl_vcomp`. + """ + + def _check_mkl_vcomp(self, X, n_samples): + """Check when vcomp and mkl are both present""" + # The BLAS call inside a prange in lloyd_iter_chunked_dense is known to + # cause a small memory leak when there are less chunks than the number + # of available threads. 
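_check_params_vs_input above resolves `n_init='auto'` from the init strategy: a single run for `'k-means++'` or an explicit array of centers, `default_n_init` runs (10 for KMeans) for `'random'` or a callable. The same mapping written as a plain helper (sketch only, hypothetical name):

    def resolve_n_init(n_init, init, default_n_init=10):
        if n_init != "auto":
            return n_init
        if callable(init) or (isinstance(init, str) and init == "random"):
            return default_n_init   # cheap/noisy inits benefit from several restarts
        return 1                    # 'k-means++' or a user-provided array of centers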
It only happens when the OpenMP library is + # vcomp (microsoft OpenMP) and the BLAS library is MKL. see #18653 + if sp.issparse(X): + return + + n_active_threads = int(np.ceil(n_samples / CHUNK_SIZE)) + if n_active_threads < self._n_threads: + modules = _get_threadpool_controller().info() + has_vcomp = "vcomp" in [module["prefix"] for module in modules] + has_mkl = ("mkl", "intel") in [ + (module["internal_api"], module.get("threading_layer", None)) + for module in modules + ] + if has_vcomp and has_mkl: + self._warn_mkl_vcomp(n_active_threads) + + def _validate_center_shape(self, X, centers): + """Check if centers is compatible with X and n_clusters.""" + if centers.shape[0] != self.n_clusters: + raise ValueError( + f"The shape of the initial centers {centers.shape} does not " + f"match the number of clusters {self.n_clusters}." + ) + if centers.shape[1] != X.shape[1]: + raise ValueError( + f"The shape of the initial centers {centers.shape} does not " + f"match the number of features of the data {X.shape[1]}." + ) + + def _check_test_data(self, X): + X = validate_data( + self, + X, + accept_sparse="csr", + reset=False, + dtype=[np.float64, np.float32], + order="C", + accept_large_sparse=False, + ) + return X + + def _init_centroids( + self, + X, + x_squared_norms, + init, + random_state, + sample_weight, + init_size=None, + n_centroids=None, + ): + """Compute the initial centroids. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + The input samples. + + x_squared_norms : ndarray of shape (n_samples,) + Squared euclidean norm of each data point. Pass it if you have it + at hands already to avoid it being recomputed here. + + init : {'k-means++', 'random'}, callable or ndarray of shape \ + (n_clusters, n_features) + Method for initialization. + + random_state : RandomState instance + Determines random number generation for centroid initialization. + See :term:`Glossary `. + + sample_weight : ndarray of shape (n_samples,) + The weights for each observation in X. `sample_weight` is not used + during initialization if `init` is a callable or a user provided + array. + + init_size : int, default=None + Number of samples to randomly sample for speeding up the + initialization (sometimes at the expense of accuracy). + + n_centroids : int, default=None + Number of centroids to initialize. + If left to 'None' the number of centroids will be equal to + number of clusters to form (self.n_clusters). + + Returns + ------- + centers : ndarray of shape (n_clusters, n_features) + Initial centroids of clusters. 
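_init_centroids, whose body follows, can first restrict seeding to `init_size` randomly drawn samples, which MiniBatchKMeans uses to keep initialization cheap. A simplified sketch of that subsampling path, assuming `rng` is a NumPy RandomState (illustrative only):

    def subsample_for_init(X, x_squared_norms, sample_weight, init_size, rng):
        if init_size is None or init_size >= X.shape[0]:
            return X, x_squared_norms, sample_weight
        # Sampling with replacement, as in the code below.
        idx = rng.randint(0, X.shape[0], init_size)
        return X[idx], x_squared_norms[idx], sample_weight[idx]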
+ """ + n_samples = X.shape[0] + n_clusters = self.n_clusters if n_centroids is None else n_centroids + + if init_size is not None and init_size < n_samples: + init_indices = random_state.randint(0, n_samples, init_size) + X = X[init_indices] + x_squared_norms = x_squared_norms[init_indices] + n_samples = X.shape[0] + sample_weight = sample_weight[init_indices] + + if isinstance(init, str) and init == "k-means++": + centers, _ = _kmeans_plusplus( + X, + n_clusters, + random_state=random_state, + x_squared_norms=x_squared_norms, + sample_weight=sample_weight, + ) + elif isinstance(init, str) and init == "random": + seeds = random_state.choice( + n_samples, + size=n_clusters, + replace=False, + p=sample_weight / sample_weight.sum(), + ) + centers = X[seeds] + elif _is_arraylike_not_scalar(self.init): + centers = init + elif callable(init): + centers = init(X, n_clusters, random_state=random_state) + centers = check_array(centers, dtype=X.dtype, copy=False, order="C") + self._validate_center_shape(X, centers) + + if sp.issparse(centers): + centers = centers.toarray() + + return centers + + def fit_predict(self, X, y=None, sample_weight=None): + """Compute cluster centers and predict cluster index for each sample. + + Convenience method; equivalent to calling fit(X) followed by + predict(X). + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + New data to transform. + + y : Ignored + Not used, present here for API consistency by convention. + + sample_weight : array-like of shape (n_samples,), default=None + The weights for each observation in X. If None, all observations + are assigned equal weight. + + Returns + ------- + labels : ndarray of shape (n_samples,) + Index of the cluster each sample belongs to. + """ + return self.fit(X, sample_weight=sample_weight).labels_ + + def predict(self, X): + """Predict the closest cluster each sample in X belongs to. + + In the vector quantization literature, `cluster_centers_` is called + the code book and each value returned by `predict` is the index of + the closest code in the code book. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + New data to predict. + + Returns + ------- + labels : ndarray of shape (n_samples,) + Index of the cluster each sample belongs to. + """ + check_is_fitted(self) + + X = self._check_test_data(X) + + # sample weights are not used by predict but cython helpers expect an array + sample_weight = np.ones(X.shape[0], dtype=X.dtype) + + labels = _labels_inertia_threadpool_limit( + X, + sample_weight, + self.cluster_centers_, + n_threads=self._n_threads, + return_inertia=False, + ) + + return labels + + def fit_transform(self, X, y=None, sample_weight=None): + """Compute clustering and transform X to cluster-distance space. + + Equivalent to fit(X).transform(X), but more efficiently implemented. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + New data to transform. + + y : Ignored + Not used, present here for API consistency by convention. + + sample_weight : array-like of shape (n_samples,), default=None + The weights for each observation in X. If None, all observations + are assigned equal weight. + + Returns + ------- + X_new : ndarray of shape (n_samples, n_clusters) + X transformed in the new space. + """ + return self.fit(X, sample_weight=sample_weight)._transform(X) + + def transform(self, X): + """Transform X to a cluster-distance space. 
+ + In the new space, each dimension is the distance to the cluster + centers. Note that even if X is sparse, the array returned by + `transform` will typically be dense. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + New data to transform. + + Returns + ------- + X_new : ndarray of shape (n_samples, n_clusters) + X transformed in the new space. + """ + check_is_fitted(self) + + X = self._check_test_data(X) + return self._transform(X) + + def _transform(self, X): + """Guts of transform method; no input validation.""" + return euclidean_distances(X, self.cluster_centers_) + + def score(self, X, y=None, sample_weight=None): + """Opposite of the value of X on the K-means objective. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + New data. + + y : Ignored + Not used, present here for API consistency by convention. + + sample_weight : array-like of shape (n_samples,), default=None + The weights for each observation in X. If None, all observations + are assigned equal weight. + + Returns + ------- + score : float + Opposite of the value of X on the K-means objective. + """ + check_is_fitted(self) + + X = self._check_test_data(X) + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + + _, scores = _labels_inertia_threadpool_limit( + X, sample_weight, self.cluster_centers_, self._n_threads + ) + return -scores + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + + +class KMeans(_BaseKMeans): + """K-Means clustering. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + + n_clusters : int, default=8 + The number of clusters to form as well as the number of + centroids to generate. + + For an example of how to choose an optimal value for `n_clusters` refer to + :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_silhouette_analysis.py`. + + init : {'k-means++', 'random'}, callable or array-like of shape \ + (n_clusters, n_features), default='k-means++' + Method for initialization: + + * 'k-means++' : selects initial cluster centroids using sampling \ + based on an empirical probability distribution of the points' \ + contribution to the overall inertia. This technique speeds up \ + convergence. The algorithm implemented is "greedy k-means++". It \ + differs from the vanilla k-means++ by making several trials at \ + each sampling step and choosing the best centroid among them. + + * 'random': choose `n_clusters` observations (rows) at random from \ + data for the initial centroids. + + * If an array is passed, it should be of shape (n_clusters, n_features)\ + and gives the initial centers. + + * If a callable is passed, it should take arguments X, n_clusters and a\ + random state and return an initialization. + + For an example of how to use the different `init` strategies, see + :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_digits.py`. + + For an evaluation of the impact of initialization, see the example + :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_stability_low_dim_dense.py`. + + n_init : 'auto' or int, default='auto' + Number of times the k-means algorithm is run with different centroid + seeds. The final results is the best output of `n_init` consecutive runs + in terms of inertia. Several runs are recommended for sparse + high-dimensional problems (see :ref:`kmeans_sparse_high_dim`). 
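transform above maps each sample to its vector of Euclidean distances to the cluster centers, so the output has shape (n_samples, n_clusters), and score returns the negated inertia. A tiny dense NumPy equivalent of `_transform` (illustrative; sklearn's euclidean_distances adds numerical safeguards and sparse support):

    import numpy as np

    def cluster_distance_space(X, centers):
        diff = X[:, np.newaxis, :] - centers[np.newaxis, :, :]
        return np.sqrt((diff ** 2).sum(axis=-1))   # (n_samples, n_clusters)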
+ + When `n_init='auto'`, the number of runs depends on the value of init: + 10 if using `init='random'` or `init` is a callable; + 1 if using `init='k-means++'` or `init` is an array-like. + + .. versionadded:: 1.2 + Added 'auto' option for `n_init`. + + .. versionchanged:: 1.4 + Default value for `n_init` changed to `'auto'`. + + max_iter : int, default=300 + Maximum number of iterations of the k-means algorithm for a + single run. + + tol : float, default=1e-4 + Relative tolerance with regards to Frobenius norm of the difference + in the cluster centers of two consecutive iterations to declare + convergence. + + verbose : int, default=0 + Verbosity mode. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for centroid initialization. Use + an int to make the randomness deterministic. + See :term:`Glossary `. + + copy_x : bool, default=True + When pre-computing distances it is more numerically accurate to center + the data first. If copy_x is True (default), then the original data is + not modified. If False, the original data is modified, and put back + before the function returns, but small numerical differences may be + introduced by subtracting and then adding the data mean. Note that if + the original data is not C-contiguous, a copy will be made even if + copy_x is False. If the original data is sparse, but not in CSR format, + a copy will be made even if copy_x is False. + + algorithm : {"lloyd", "elkan"}, default="lloyd" + K-means algorithm to use. The classical EM-style algorithm is `"lloyd"`. + The `"elkan"` variation can be more efficient on some datasets with + well-defined clusters, by using the triangle inequality. However it's + more memory intensive due to the allocation of an extra array of shape + `(n_samples, n_clusters)`. + + .. versionchanged:: 0.18 + Added Elkan algorithm + + .. versionchanged:: 1.1 + Renamed "full" to "lloyd", and deprecated "auto" and "full". + Changed "auto" to use "lloyd" instead of "elkan". + + Attributes + ---------- + cluster_centers_ : ndarray of shape (n_clusters, n_features) + Coordinates of cluster centers. If the algorithm stops before fully + converging (see ``tol`` and ``max_iter``), these will not be + consistent with ``labels_``. + + labels_ : ndarray of shape (n_samples,) + Labels of each point + + inertia_ : float + Sum of squared distances of samples to their closest cluster center, + weighted by the sample weights if provided. + + n_iter_ : int + Number of iterations run. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + MiniBatchKMeans : Alternative online implementation that does incremental + updates of the centers positions using mini-batches. + For large scale learning (say n_samples > 10k) MiniBatchKMeans is + probably much faster than the default batch implementation. + + Notes + ----- + The k-means problem is solved using either Lloyd's or Elkan's algorithm. + + The average complexity is given by O(k n T), where n is the number of + samples and T is the number of iteration. + + The worst case complexity is given by O(n^(k+2/p)) with + n = n_samples, p = n_features. + Refer to :doi:`"How slow is the k-means method?" D. Arthur and S. Vassilvitskii - + SoCG2006.<10.1145/1137856.1137880>` for more details. 
+ + In practice, the k-means algorithm is very fast (one of the fastest + clustering algorithms available), but it falls in local minima. That's why + it can be useful to restart it several times. + + If the algorithm stops before fully converging (because of ``tol`` or + ``max_iter``), ``labels_`` and ``cluster_centers_`` will not be consistent, + i.e. the ``cluster_centers_`` will not be the means of the points in each + cluster. Also, the estimator will reassign ``labels_`` after the last + iteration to make ``labels_`` consistent with ``predict`` on the training + set. + + Examples + -------- + + >>> from sklearn.cluster import KMeans + >>> import numpy as np + >>> X = np.array([[1, 2], [1, 4], [1, 0], + ... [10, 2], [10, 4], [10, 0]]) + >>> kmeans = KMeans(n_clusters=2, random_state=0, n_init="auto").fit(X) + >>> kmeans.labels_ + array([1, 1, 1, 0, 0, 0], dtype=int32) + >>> kmeans.predict([[0, 0], [12, 3]]) + array([1, 0], dtype=int32) + >>> kmeans.cluster_centers_ + array([[10., 2.], + [ 1., 2.]]) + + For examples of common problems with K-Means and how to address them see + :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_assumptions.py`. + + For a demonstration of how K-Means can be used to cluster text documents see + :ref:`sphx_glr_auto_examples_text_plot_document_clustering.py`. + + For a comparison between K-Means and MiniBatchKMeans refer to example + :ref:`sphx_glr_auto_examples_cluster_plot_mini_batch_kmeans.py`. + + For a comparison between K-Means and BisectingKMeans refer to example + :ref:`sphx_glr_auto_examples_cluster_plot_bisect_kmeans.py`. + """ + + _parameter_constraints: dict = { + **_BaseKMeans._parameter_constraints, + "copy_x": ["boolean"], + "algorithm": [StrOptions({"lloyd", "elkan"})], + } + + def __init__( + self, + n_clusters=8, + *, + init="k-means++", + n_init="auto", + max_iter=300, + tol=1e-4, + verbose=0, + random_state=None, + copy_x=True, + algorithm="lloyd", + ): + super().__init__( + n_clusters=n_clusters, + init=init, + n_init=n_init, + max_iter=max_iter, + tol=tol, + verbose=verbose, + random_state=random_state, + ) + + self.copy_x = copy_x + self.algorithm = algorithm + + def _check_params_vs_input(self, X): + super()._check_params_vs_input(X, default_n_init=10) + + self._algorithm = self.algorithm + if self._algorithm == "elkan" and self.n_clusters == 1: + warnings.warn( + ( + "algorithm='elkan' doesn't make sense for a single " + "cluster. Using 'lloyd' instead." + ), + RuntimeWarning, + ) + self._algorithm = "lloyd" + + def _warn_mkl_vcomp(self, n_active_threads): + """Warn when vcomp and mkl are both present""" + warnings.warn( + "KMeans is known to have a memory leak on Windows " + "with MKL, when there are less chunks than available " + "threads. You can avoid it by setting the environment" + f" variable OMP_NUM_THREADS={n_active_threads}." + ) + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None, sample_weight=None): + """Compute k-means clustering. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training instances to cluster. It must be noted that the data + will be converted to C ordering, which will cause a memory + copy if the given data is not C-contiguous. + If a sparse matrix is passed, a copy will be made if it's not in + CSR format. + + y : Ignored + Not used, present here for API consistency by convention. + + sample_weight : array-like of shape (n_samples,), default=None + The weights for each observation in X. 
If None, all observations + are assigned equal weight. `sample_weight` is not used during + initialization if `init` is a callable or a user provided array. + + .. versionadded:: 0.20 + + Returns + ------- + self : object + Fitted estimator. + """ + X = validate_data( + self, + X, + accept_sparse="csr", + dtype=[np.float64, np.float32], + order="C", + copy=self.copy_x, + accept_large_sparse=False, + ) + + self._check_params_vs_input(X) + + random_state = check_random_state(self.random_state) + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + self._n_threads = _openmp_effective_n_threads() + + # Validate init array + init = self.init + init_is_array_like = _is_arraylike_not_scalar(init) + if init_is_array_like: + init = check_array(init, dtype=X.dtype, copy=True, order="C") + self._validate_center_shape(X, init) + + # subtract of mean of x for more accurate distance computations + if not sp.issparse(X): + X_mean = X.mean(axis=0) + # The copy was already done above + X -= X_mean + + if init_is_array_like: + init -= X_mean + + # precompute squared norms of data points + x_squared_norms = row_norms(X, squared=True) + + if self._algorithm == "elkan": + kmeans_single = _kmeans_single_elkan + else: + kmeans_single = _kmeans_single_lloyd + self._check_mkl_vcomp(X, X.shape[0]) + + best_inertia, best_labels = None, None + + for i in range(self._n_init): + # Initialize centers + centers_init = self._init_centroids( + X, + x_squared_norms=x_squared_norms, + init=init, + random_state=random_state, + sample_weight=sample_weight, + ) + if self.verbose: + print("Initialization complete") + + # run a k-means once + labels, inertia, centers, n_iter_ = kmeans_single( + X, + sample_weight, + centers_init, + max_iter=self.max_iter, + verbose=self.verbose, + tol=self._tol, + n_threads=self._n_threads, + ) + + # determine if these results are the best so far + # we chose a new run if it has a better inertia and the clustering is + # different from the best so far (it's possible that the inertia is + # slightly better even if the clustering is the same with potentially + # permuted labels, due to rounding errors) + if best_inertia is None or ( + inertia < best_inertia + and not _is_same_clustering(labels, best_labels, self.n_clusters) + ): + best_labels = labels + best_centers = centers + best_inertia = inertia + best_n_iter = n_iter_ + + if not sp.issparse(X): + if not self.copy_x: + X += X_mean + best_centers += X_mean + + distinct_clusters = len(set(best_labels)) + if distinct_clusters < self.n_clusters: + warnings.warn( + "Number of distinct clusters ({}) found smaller than " + "n_clusters ({}). Possibly due to duplicate points " + "in X.".format(distinct_clusters, self.n_clusters), + ConvergenceWarning, + stacklevel=2, + ) + + self.cluster_centers_ = best_centers + self._n_features_out = self.cluster_centers_.shape[0] + self.labels_ = best_labels + self.inertia_ = best_inertia + self.n_iter_ = best_n_iter + return self + + +def _mini_batch_step( + X, + sample_weight, + centers, + centers_new, + weight_sums, + random_state, + random_reassign=False, + reassignment_ratio=0.01, + verbose=False, + n_threads=1, +): + """Incremental update of the centers for the Minibatch K-Means algorithm. + + Parameters + ---------- + + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + The original data array. If sparse, must be in CSR format. + + x_squared_norms : ndarray of shape (n_samples,) + Squared euclidean norm of each data point. 
+ + sample_weight : ndarray of shape (n_samples,) + The weights for each observation in `X`. + + centers : ndarray of shape (n_clusters, n_features) + The cluster centers before the current iteration + + centers_new : ndarray of shape (n_clusters, n_features) + The cluster centers after the current iteration. Modified in-place. + + weight_sums : ndarray of shape (n_clusters,) + The vector in which we keep track of the numbers of points in a + cluster. This array is modified in place. + + random_state : RandomState instance + Determines random number generation for low count centers reassignment. + See :term:`Glossary `. + + random_reassign : boolean, default=False + If True, centers with very low counts are randomly reassigned + to observations. + + reassignment_ratio : float, default=0.01 + Control the fraction of the maximum number of counts for a + center to be reassigned. A higher value means that low count + centers are more likely to be reassigned, which means that the + model will take longer to converge, but should converge in a + better clustering. + + verbose : bool, default=False + Controls the verbosity. + + n_threads : int, default=1 + The number of OpenMP threads to use for the computation. + + Returns + ------- + inertia : float + Sum of squared distances of samples to their closest cluster center. + The inertia is computed after finding the labels and before updating + the centers. + """ + # Perform label assignment to nearest centers + # For better efficiency, it's better to run _mini_batch_step in a + # threadpool_limit context than using _labels_inertia_threadpool_limit here + labels, inertia = _labels_inertia(X, sample_weight, centers, n_threads=n_threads) + + # Update centers according to the labels + if sp.issparse(X): + _minibatch_update_sparse( + X, sample_weight, centers, centers_new, weight_sums, labels, n_threads + ) + else: + _minibatch_update_dense( + X, + sample_weight, + centers, + centers_new, + weight_sums, + labels, + n_threads, + ) + + # Reassign clusters that have very low weight + if random_reassign and reassignment_ratio > 0: + to_reassign = weight_sums < reassignment_ratio * weight_sums.max() + + # pick at most .5 * batch_size samples as new centers + if to_reassign.sum() > 0.5 * X.shape[0]: + indices_dont_reassign = np.argsort(weight_sums)[int(0.5 * X.shape[0]) :] + to_reassign[indices_dont_reassign] = False + n_reassigns = to_reassign.sum() + + if n_reassigns: + # Pick new clusters amongst observations with uniform probability + new_centers = random_state.choice( + X.shape[0], replace=False, size=n_reassigns + ) + if verbose: + print(f"[MiniBatchKMeans] Reassigning {n_reassigns} cluster centers.") + + if sp.issparse(X): + assign_rows_csr( + X, + new_centers.astype(np.intp, copy=False), + np.where(to_reassign)[0].astype(np.intp, copy=False), + centers_new, + ) + else: + centers_new[to_reassign] = X[new_centers] + + # reset counts of reassigned centers, but don't reset them too small + # to avoid instant reassignment. This is a pretty dirty hack as it + # also modifies the learning rates. + weight_sums[to_reassign] = np.min(weight_sums[~to_reassign]) + + return inertia + + +class MiniBatchKMeans(_BaseKMeans): + """ + Mini-Batch K-Means clustering. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + + n_clusters : int, default=8 + The number of clusters to form as well as the number of + centroids to generate. 
+
+    init : {'k-means++', 'random'}, callable or array-like of shape \
+            (n_clusters, n_features), default='k-means++'
+        Method for initialization:
+
+        'k-means++' : selects initial cluster centroids using sampling based on
+        an empirical probability distribution of the points' contribution to the
+        overall inertia. This technique speeds up convergence. The algorithm
+        implemented is "greedy k-means++". It differs from the vanilla k-means++
+        by making several trials at each sampling step and choosing the best centroid
+        among them.
+
+        'random': choose `n_clusters` observations (rows) at random from data
+        for the initial centroids.
+
+        If an array is passed, it should be of shape (n_clusters, n_features)
+        and gives the initial centers.
+
+        If a callable is passed, it should take arguments X, n_clusters and a
+        random state and return an initialization.
+
+        For an evaluation of the impact of initialization, see the example
+        :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_stability_low_dim_dense.py`.
+
+    max_iter : int, default=100
+        Maximum number of iterations over the complete dataset before
+        stopping independently of any early stopping criterion heuristics.
+
+    batch_size : int, default=1024
+        Size of the mini batches.
+        For faster computations, you can set the ``batch_size`` greater than
+        256 * number of cores to enable parallelism on all cores.
+
+        .. versionchanged:: 1.0
+           `batch_size` default changed from 100 to 1024.
+
+    verbose : int, default=0
+        Verbosity mode.
+
+    compute_labels : bool, default=True
+        Compute label assignment and inertia for the complete dataset
+        once the minibatch optimization has converged in fit.
+
+    random_state : int, RandomState instance or None, default=None
+        Determines random number generation for centroid initialization and
+        random reassignment. Use an int to make the randomness deterministic.
+        See :term:`Glossary <random_state>`.
+
+    tol : float, default=0.0
+        Control early stopping based on the relative center changes as
+        measured by a smoothed, variance-normalized estimate of the mean
+        squared position changes of the centers. This early stopping
+        heuristic is closer to the one used for the batch variant of the
+        algorithm but induces a slight computational and memory overhead
+        over the inertia heuristic.
+
+        To disable convergence detection based on normalized center
+        change, set tol to 0.0 (default).
+
+    max_no_improvement : int, default=10
+        Control early stopping based on the consecutive number of mini
+        batches that do not yield an improvement on the smoothed inertia.
+
+        To disable convergence detection based on inertia, set
+        max_no_improvement to None.
+
+    init_size : int, default=None
+        Number of samples to randomly sample for speeding up the
+        initialization (sometimes at the expense of accuracy): the
+        algorithm is initialized by running a batch KMeans on a
+        random subset of the data. This needs to be larger than n_clusters.
+
+        If `None`, the heuristic is `init_size = 3 * batch_size`, unless
+        `3 * batch_size < n_clusters`, in which case
+        `init_size = 3 * n_clusters`.
+
+    n_init : 'auto' or int, default="auto"
+        Number of random initializations that are tried.
+        In contrast to KMeans, the algorithm is only run once, using the best of
+        the `n_init` initializations as measured by inertia. Several runs are
+        recommended for sparse high-dimensional problems (see
+        :ref:`kmeans_sparse_high_dim`).
+ + When `n_init='auto'`, the number of runs depends on the value of init: + 3 if using `init='random'` or `init` is a callable; + 1 if using `init='k-means++'` or `init` is an array-like. + + .. versionadded:: 1.2 + Added 'auto' option for `n_init`. + + .. versionchanged:: 1.4 + Default value for `n_init` changed to `'auto'` in version. + + reassignment_ratio : float, default=0.01 + Control the fraction of the maximum number of counts for a center to + be reassigned. A higher value means that low count centers are more + easily reassigned, which means that the model will take longer to + converge, but should converge in a better clustering. However, too high + a value may cause convergence issues, especially with a small batch + size. + + Attributes + ---------- + + cluster_centers_ : ndarray of shape (n_clusters, n_features) + Coordinates of cluster centers. + + labels_ : ndarray of shape (n_samples,) + Labels of each point (if compute_labels is set to True). + + inertia_ : float + The value of the inertia criterion associated with the chosen + partition if compute_labels is set to True. If compute_labels is set to + False, it's an approximation of the inertia based on an exponentially + weighted average of the batch inertiae. + The inertia is defined as the sum of square distances of samples to + their cluster center, weighted by the sample weights if provided. + + n_iter_ : int + Number of iterations over the full dataset. + + n_steps_ : int + Number of minibatches processed. + + .. versionadded:: 1.0 + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + KMeans : The classic implementation of the clustering method based on the + Lloyd's algorithm. It consumes the whole set of input data at each + iteration. + + Notes + ----- + See https://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf + + When there are too few points in the dataset, some centers may be + duplicated, which means that a proper clustering in terms of the number + of requesting clusters and the number of returned clusters will not + always match. One solution is to set `reassignment_ratio=0`, which + prevents reassignments of clusters that are too small. + + See :ref:`sphx_glr_auto_examples_cluster_plot_birch_vs_minibatchkmeans.py` for a + comparison with :class:`~sklearn.cluster.BIRCH`. + + Examples + -------- + >>> from sklearn.cluster import MiniBatchKMeans + >>> import numpy as np + >>> X = np.array([[1, 2], [1, 4], [1, 0], + ... [4, 2], [4, 0], [4, 4], + ... [4, 5], [0, 1], [2, 2], + ... [3, 2], [5, 5], [1, -1]]) + >>> # manually fit on batches + >>> kmeans = MiniBatchKMeans(n_clusters=2, + ... random_state=0, + ... batch_size=6, + ... n_init="auto") + >>> kmeans = kmeans.partial_fit(X[0:6,:]) + >>> kmeans = kmeans.partial_fit(X[6:12,:]) + >>> kmeans.cluster_centers_ + array([[3.375, 3. ], + [0.75 , 0.5 ]]) + >>> kmeans.predict([[0, 0], [4, 4]]) + array([1, 0], dtype=int32) + >>> # fit on the whole data + >>> kmeans = MiniBatchKMeans(n_clusters=2, + ... random_state=0, + ... batch_size=6, + ... max_iter=10, + ... n_init="auto").fit(X) + >>> kmeans.cluster_centers_ + array([[3.55102041, 2.48979592], + [1.06896552, 1. 
]]) + >>> kmeans.predict([[0, 0], [4, 4]]) + array([1, 0], dtype=int32) + + For a comparison of Mini-Batch K-Means clustering with other clustering algorithms, + see :ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py` + """ + + _parameter_constraints: dict = { + **_BaseKMeans._parameter_constraints, + "batch_size": [Interval(Integral, 1, None, closed="left")], + "compute_labels": ["boolean"], + "max_no_improvement": [Interval(Integral, 0, None, closed="left"), None], + "init_size": [Interval(Integral, 1, None, closed="left"), None], + "reassignment_ratio": [Interval(Real, 0, None, closed="left")], + } + + def __init__( + self, + n_clusters=8, + *, + init="k-means++", + max_iter=100, + batch_size=1024, + verbose=0, + compute_labels=True, + random_state=None, + tol=0.0, + max_no_improvement=10, + init_size=None, + n_init="auto", + reassignment_ratio=0.01, + ): + super().__init__( + n_clusters=n_clusters, + init=init, + max_iter=max_iter, + verbose=verbose, + random_state=random_state, + tol=tol, + n_init=n_init, + ) + + self.max_no_improvement = max_no_improvement + self.batch_size = batch_size + self.compute_labels = compute_labels + self.init_size = init_size + self.reassignment_ratio = reassignment_ratio + + def _check_params_vs_input(self, X): + super()._check_params_vs_input(X, default_n_init=3) + + self._batch_size = min(self.batch_size, X.shape[0]) + + # init_size + self._init_size = self.init_size + if self._init_size is None: + self._init_size = 3 * self._batch_size + if self._init_size < self.n_clusters: + self._init_size = 3 * self.n_clusters + elif self._init_size < self.n_clusters: + warnings.warn( + ( + f"init_size={self._init_size} should be larger than " + f"n_clusters={self.n_clusters}. Setting it to " + "min(3*n_clusters, n_samples)" + ), + RuntimeWarning, + stacklevel=2, + ) + self._init_size = 3 * self.n_clusters + self._init_size = min(self._init_size, X.shape[0]) + + # reassignment_ratio + if self.reassignment_ratio < 0: + raise ValueError( + "reassignment_ratio should be >= 0, got " + f"{self.reassignment_ratio} instead." + ) + + def _warn_mkl_vcomp(self, n_active_threads): + """Warn when vcomp and mkl are both present""" + warnings.warn( + "MiniBatchKMeans is known to have a memory leak on " + "Windows with MKL, when there are less chunks than " + "available threads. You can prevent it by setting " + f"batch_size >= {self._n_threads * CHUNK_SIZE} or by " + "setting the environment variable " + f"OMP_NUM_THREADS={n_active_threads}" + ) + + def _mini_batch_convergence( + self, step, n_steps, n_samples, centers_squared_diff, batch_inertia + ): + """Helper function to encapsulate the early stopping logic""" + # Normalize inertia to be able to compare values when + # batch_size changes + batch_inertia /= self._batch_size + + # count steps starting from 1 for user friendly verbose mode. + step = step + 1 + + # Ignore first iteration because it's inertia from initialization. 
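+        # Its value only reflects the quality of the initial centers, so it
+        # would bias the exponentially weighted average of the inertia that
+        # the early-stopping checks below rely on.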
+ if step == 1: + if self.verbose: + print( + f"Minibatch step {step}/{n_steps}: mean batch " + f"inertia: {batch_inertia}" + ) + return False + + # Compute an Exponentially Weighted Average of the inertia to + # monitor the convergence while discarding minibatch-local stochastic + # variability: https://en.wikipedia.org/wiki/Moving_average + if self._ewa_inertia is None: + self._ewa_inertia = batch_inertia + else: + alpha = self._batch_size * 2.0 / (n_samples + 1) + alpha = min(alpha, 1) + self._ewa_inertia = self._ewa_inertia * (1 - alpha) + batch_inertia * alpha + + # Log progress to be able to monitor convergence + if self.verbose: + print( + f"Minibatch step {step}/{n_steps}: mean batch inertia: " + f"{batch_inertia}, ewa inertia: {self._ewa_inertia}" + ) + + # Early stopping based on absolute tolerance on squared change of + # centers position + if self._tol > 0.0 and centers_squared_diff <= self._tol: + if self.verbose: + print(f"Converged (small centers change) at step {step}/{n_steps}") + return True + + # Early stopping heuristic due to lack of improvement on smoothed + # inertia + if self._ewa_inertia_min is None or self._ewa_inertia < self._ewa_inertia_min: + self._no_improvement = 0 + self._ewa_inertia_min = self._ewa_inertia + else: + self._no_improvement += 1 + + if ( + self.max_no_improvement is not None + and self._no_improvement >= self.max_no_improvement + ): + if self.verbose: + print( + "Converged (lack of improvement in inertia) at step " + f"{step}/{n_steps}" + ) + return True + + return False + + def _random_reassign(self): + """Check if a random reassignment needs to be done. + + Do random reassignments each time 10 * n_clusters samples have been + processed. + + If there are empty clusters we always want to reassign. + """ + self._n_since_last_reassign += self._batch_size + if (self._counts == 0).any() or self._n_since_last_reassign >= ( + 10 * self.n_clusters + ): + self._n_since_last_reassign = 0 + return True + return False + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None, sample_weight=None): + """Compute the centroids on X by chunking it into mini-batches. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training instances to cluster. It must be noted that the data + will be converted to C ordering, which will cause a memory copy + if the given data is not C-contiguous. + If a sparse matrix is passed, a copy will be made if it's not in + CSR format. + + y : Ignored + Not used, present here for API consistency by convention. + + sample_weight : array-like of shape (n_samples,), default=None + The weights for each observation in X. If None, all observations + are assigned equal weight. `sample_weight` is not used during + initialization if `init` is a callable or a user provided array. + + .. versionadded:: 0.20 + + Returns + ------- + self : object + Fitted estimator. 
+ """ + X = validate_data( + self, + X, + accept_sparse="csr", + dtype=[np.float64, np.float32], + order="C", + accept_large_sparse=False, + ) + + self._check_params_vs_input(X) + random_state = check_random_state(self.random_state) + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + self._n_threads = _openmp_effective_n_threads() + n_samples, n_features = X.shape + + # Validate init array + init = self.init + if _is_arraylike_not_scalar(init): + init = check_array(init, dtype=X.dtype, copy=True, order="C") + self._validate_center_shape(X, init) + + self._check_mkl_vcomp(X, self._batch_size) + + # precompute squared norms of data points + x_squared_norms = row_norms(X, squared=True) + + # Validation set for the init + validation_indices = random_state.randint(0, n_samples, self._init_size) + X_valid = X[validation_indices] + sample_weight_valid = sample_weight[validation_indices] + + # perform several inits with random subsets + best_inertia = None + for init_idx in range(self._n_init): + if self.verbose: + print(f"Init {init_idx + 1}/{self._n_init} with method {init}") + + # Initialize the centers using only a fraction of the data as we + # expect n_samples to be very large when using MiniBatchKMeans. + cluster_centers = self._init_centroids( + X, + x_squared_norms=x_squared_norms, + init=init, + random_state=random_state, + init_size=self._init_size, + sample_weight=sample_weight, + ) + + # Compute inertia on a validation set. + _, inertia = _labels_inertia_threadpool_limit( + X_valid, + sample_weight_valid, + cluster_centers, + n_threads=self._n_threads, + ) + + if self.verbose: + print(f"Inertia for init {init_idx + 1}/{self._n_init}: {inertia}") + if best_inertia is None or inertia < best_inertia: + init_centers = cluster_centers + best_inertia = inertia + + centers = init_centers + centers_new = np.empty_like(centers) + + # Initialize counts + self._counts = np.zeros(self.n_clusters, dtype=X.dtype) + + # Attributes to monitor the convergence + self._ewa_inertia = None + self._ewa_inertia_min = None + self._no_improvement = 0 + + # Initialize number of samples seen since last reassignment + self._n_since_last_reassign = 0 + + n_steps = (self.max_iter * n_samples) // self._batch_size + + with _get_threadpool_controller().limit(limits=1, user_api="blas"): + # Perform the iterative optimization until convergence + for i in range(n_steps): + # Sample a minibatch from the full dataset + minibatch_indices = random_state.randint(0, n_samples, self._batch_size) + + # Perform the actual update step on the minibatch data + batch_inertia = _mini_batch_step( + X=X[minibatch_indices], + sample_weight=sample_weight[minibatch_indices], + centers=centers, + centers_new=centers_new, + weight_sums=self._counts, + random_state=random_state, + random_reassign=self._random_reassign(), + reassignment_ratio=self.reassignment_ratio, + verbose=self.verbose, + n_threads=self._n_threads, + ) + + if self._tol > 0.0: + centers_squared_diff = np.sum((centers_new - centers) ** 2) + else: + centers_squared_diff = 0 + + centers, centers_new = centers_new, centers + + # Monitor convergence and do early stopping if necessary + if self._mini_batch_convergence( + i, n_steps, n_samples, centers_squared_diff, batch_inertia + ): + break + + self.cluster_centers_ = centers + self._n_features_out = self.cluster_centers_.shape[0] + + self.n_steps_ = i + 1 + self.n_iter_ = int(np.ceil(((i + 1) * self._batch_size) / n_samples)) + + if self.compute_labels: + self.labels_, self.inertia_ = 
_labels_inertia_threadpool_limit( + X, + sample_weight, + self.cluster_centers_, + n_threads=self._n_threads, + ) + else: + self.inertia_ = self._ewa_inertia * n_samples + + return self + + @_fit_context(prefer_skip_nested_validation=True) + def partial_fit(self, X, y=None, sample_weight=None): + """Update k means estimate on a single mini-batch X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training instances to cluster. It must be noted that the data + will be converted to C ordering, which will cause a memory copy + if the given data is not C-contiguous. + If a sparse matrix is passed, a copy will be made if it's not in + CSR format. + + y : Ignored + Not used, present here for API consistency by convention. + + sample_weight : array-like of shape (n_samples,), default=None + The weights for each observation in X. If None, all observations + are assigned equal weight. `sample_weight` is not used during + initialization if `init` is a callable or a user provided array. + + Returns + ------- + self : object + Return updated estimator. + """ + has_centers = hasattr(self, "cluster_centers_") + + X = validate_data( + self, + X, + accept_sparse="csr", + dtype=[np.float64, np.float32], + order="C", + accept_large_sparse=False, + reset=not has_centers, + ) + + self._random_state = getattr( + self, "_random_state", check_random_state(self.random_state) + ) + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + self.n_steps_ = getattr(self, "n_steps_", 0) + + # precompute squared norms of data points + x_squared_norms = row_norms(X, squared=True) + + if not has_centers: + # this instance has not been fitted yet (fit or partial_fit) + self._check_params_vs_input(X) + self._n_threads = _openmp_effective_n_threads() + + # Validate init array + init = self.init + if _is_arraylike_not_scalar(init): + init = check_array(init, dtype=X.dtype, copy=True, order="C") + self._validate_center_shape(X, init) + + self._check_mkl_vcomp(X, X.shape[0]) + + # initialize the cluster centers + self.cluster_centers_ = self._init_centroids( + X, + x_squared_norms=x_squared_norms, + init=init, + random_state=self._random_state, + init_size=self._init_size, + sample_weight=sample_weight, + ) + + # Initialize counts + self._counts = np.zeros(self.n_clusters, dtype=X.dtype) + + # Initialize number of samples seen since last reassignment + self._n_since_last_reassign = 0 + + with _get_threadpool_controller().limit(limits=1, user_api="blas"): + _mini_batch_step( + X, + sample_weight=sample_weight, + centers=self.cluster_centers_, + centers_new=self.cluster_centers_, + weight_sums=self._counts, + random_state=self._random_state, + random_reassign=self._random_reassign(), + reassignment_ratio=self.reassignment_ratio, + verbose=self.verbose, + n_threads=self._n_threads, + ) + + if self.compute_labels: + self.labels_, self.inertia_ = _labels_inertia_threadpool_limit( + X, + sample_weight, + self.cluster_centers_, + n_threads=self._n_threads, + ) + + self.n_steps_ += 1 + self._n_features_out = self.cluster_centers_.shape[0] + + return self diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_mean_shift.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/_mean_shift.py new file mode 100644 index 0000000000000000000000000000000000000000..1ba4409d14698b482a6854fd1558f014ea3d9f70 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_mean_shift.py @@ -0,0 +1,579 @@ +"""Mean shift clustering algorithm. 
+ +Mean shift clustering aims to discover *blobs* in a smooth density of +samples. It is a centroid based algorithm, which works by updating candidates +for centroids to be the mean of the points within a given region. These +candidates are then filtered in a post-processing stage to eliminate +near-duplicates to form the final set of centroids. + +Seeding is performed using a binning technique for scalability. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from collections import defaultdict +from numbers import Integral, Real + +import numpy as np + +from .._config import config_context +from ..base import BaseEstimator, ClusterMixin, _fit_context +from ..metrics.pairwise import pairwise_distances_argmin +from ..neighbors import NearestNeighbors +from ..utils import check_array, check_random_state, gen_batches +from ..utils._param_validation import Interval, validate_params +from ..utils.parallel import Parallel, delayed +from ..utils.validation import check_is_fitted, validate_data + + +@validate_params( + { + "X": ["array-like"], + "quantile": [Interval(Real, 0, 1, closed="both")], + "n_samples": [Interval(Integral, 1, None, closed="left"), None], + "random_state": ["random_state"], + "n_jobs": [Integral, None], + }, + prefer_skip_nested_validation=True, +) +def estimate_bandwidth(X, *, quantile=0.3, n_samples=None, random_state=0, n_jobs=None): + """Estimate the bandwidth to use with the mean-shift algorithm. + + This function takes time at least quadratic in `n_samples`. For large + datasets, it is wise to subsample by setting `n_samples`. Alternatively, + the parameter `bandwidth` can be set to a small value without estimating + it. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Input points. + + quantile : float, default=0.3 + Should be between [0, 1] + 0.5 means that the median of all pairwise distances is used. + + n_samples : int, default=None + The number of samples to use. If not given, all samples are used. + + random_state : int, RandomState instance, default=None + The generator used to randomly select the samples from input points + for bandwidth estimation. Use an int to make the randomness + deterministic. + See :term:`Glossary `. + + n_jobs : int, default=None + The number of parallel jobs to run for neighbors search. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Returns + ------- + bandwidth : float + The bandwidth parameter. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.cluster import estimate_bandwidth + >>> X = np.array([[1, 1], [2, 1], [1, 0], + ... 
[4, 7], [3, 5], [3, 6]]) + >>> estimate_bandwidth(X, quantile=0.5) + np.float64(1.61) + """ + X = check_array(X) + + random_state = check_random_state(random_state) + if n_samples is not None: + idx = random_state.permutation(X.shape[0])[:n_samples] + X = X[idx] + n_neighbors = int(X.shape[0] * quantile) + if n_neighbors < 1: # cannot fit NearestNeighbors with n_neighbors = 0 + n_neighbors = 1 + nbrs = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=n_jobs) + nbrs.fit(X) + + bandwidth = 0.0 + for batch in gen_batches(len(X), 500): + d, _ = nbrs.kneighbors(X[batch, :], return_distance=True) + bandwidth += np.max(d, axis=1).sum() + + return bandwidth / X.shape[0] + + +# separate function for each seed's iterative loop +def _mean_shift_single_seed(my_mean, X, nbrs, max_iter): + # For each seed, climb gradient until convergence or max_iter + bandwidth = nbrs.get_params()["radius"] + stop_thresh = 1e-3 * bandwidth # when mean has converged + completed_iterations = 0 + while True: + # Find mean of points within bandwidth + i_nbrs = nbrs.radius_neighbors([my_mean], bandwidth, return_distance=False)[0] + points_within = X[i_nbrs] + if len(points_within) == 0: + break # Depending on seeding strategy this condition may occur + my_old_mean = my_mean # save the old mean + my_mean = np.mean(points_within, axis=0) + # If converged or at max_iter, adds the cluster + if ( + np.linalg.norm(my_mean - my_old_mean) <= stop_thresh + or completed_iterations == max_iter + ): + break + completed_iterations += 1 + return tuple(my_mean), len(points_within), completed_iterations + + +@validate_params( + {"X": ["array-like"]}, + prefer_skip_nested_validation=False, +) +def mean_shift( + X, + *, + bandwidth=None, + seeds=None, + bin_seeding=False, + min_bin_freq=1, + cluster_all=True, + max_iter=300, + n_jobs=None, +): + """Perform mean shift clustering of data using a flat kernel. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + + X : array-like of shape (n_samples, n_features) + Input data. + + bandwidth : float, default=None + Kernel bandwidth. If not None, must be in the range [0, +inf). + + If None, the bandwidth is determined using a heuristic based on + the median of all pairwise distances. This will take quadratic time in + the number of samples. The sklearn.cluster.estimate_bandwidth function + can be used to do this more efficiently. + + seeds : array-like of shape (n_seeds, n_features) or None + Point used as initial kernel locations. If None and bin_seeding=False, + each data point is used as a seed. If None and bin_seeding=True, + see bin_seeding. + + bin_seeding : bool, default=False + If true, initial kernel locations are not locations of all + points, but rather the location of the discretized version of + points, where points are binned onto a grid whose coarseness + corresponds to the bandwidth. Setting this option to True will speed + up the algorithm because fewer seeds will be initialized. + Ignored if seeds argument is not None. + + min_bin_freq : int, default=1 + To speed up the algorithm, accept only those bins with at least + min_bin_freq points as seeds. + + cluster_all : bool, default=True + If true, then all points are clustered, even those orphans that are + not within any kernel. Orphans are assigned to the nearest kernel. + If false, then orphans are given cluster label -1. + + max_iter : int, default=300 + Maximum number of iterations, per seed point before the clustering + operation terminates (for that seed point), if has not converged yet. 
+ + n_jobs : int, default=None + The number of jobs to use for the computation. The following tasks benefit + from the parallelization: + + - The search of nearest neighbors for bandwidth estimation and label + assignments. See the details in the docstring of the + ``NearestNeighbors`` class. + - Hill-climbing optimization for all seeds. + + See :term:`Glossary ` for more details. + + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + .. versionadded:: 0.17 + Parallel Execution using *n_jobs*. + + Returns + ------- + + cluster_centers : ndarray of shape (n_clusters, n_features) + Coordinates of cluster centers. + + labels : ndarray of shape (n_samples,) + Cluster labels for each point. + + Notes + ----- + For a usage example, see + :ref:`sphx_glr_auto_examples_cluster_plot_mean_shift.py`. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.cluster import mean_shift + >>> X = np.array([[1, 1], [2, 1], [1, 0], + ... [4, 7], [3, 5], [3, 6]]) + >>> cluster_centers, labels = mean_shift(X, bandwidth=2) + >>> cluster_centers + array([[3.33, 6. ], + [1.33, 0.66]]) + >>> labels + array([1, 1, 1, 0, 0, 0]) + """ + model = MeanShift( + bandwidth=bandwidth, + seeds=seeds, + min_bin_freq=min_bin_freq, + bin_seeding=bin_seeding, + cluster_all=cluster_all, + n_jobs=n_jobs, + max_iter=max_iter, + ).fit(X) + return model.cluster_centers_, model.labels_ + + +def get_bin_seeds(X, bin_size, min_bin_freq=1): + """Find seeds for mean_shift. + + Finds seeds by first binning data onto a grid whose lines are + spaced bin_size apart, and then choosing those bins with at least + min_bin_freq points. + + Parameters + ---------- + + X : array-like of shape (n_samples, n_features) + Input points, the same points that will be used in mean_shift. + + bin_size : float + Controls the coarseness of the binning. Smaller values lead + to more seeding (which is computationally more expensive). If you're + not sure how to set this, set it to the value of the bandwidth used + in clustering.mean_shift. + + min_bin_freq : int, default=1 + Only bins with at least min_bin_freq will be selected as seeds. + Raising this value decreases the number of seeds found, which + makes mean_shift computationally cheaper. + + Returns + ------- + bin_seeds : array-like of shape (n_samples, n_features) + Points used as initial kernel positions in clustering.mean_shift. + """ + if bin_size == 0: + return X + + # Bin points + bin_sizes = defaultdict(int) + for point in X: + binned_point = np.round(point / bin_size) + bin_sizes[tuple(binned_point)] += 1 + + # Select only those bins as seeds which have enough members + bin_seeds = np.array( + [point for point, freq in bin_sizes.items() if freq >= min_bin_freq], + dtype=np.float32, + ) + if len(bin_seeds) == len(X): + warnings.warn( + "Binning data failed with provided bin_size=%f, using data points as seeds." + % bin_size + ) + return X + bin_seeds = bin_seeds * bin_size + return bin_seeds + + +class MeanShift(ClusterMixin, BaseEstimator): + """Mean shift clustering using a flat kernel. + + Mean shift clustering aims to discover "blobs" in a smooth density of + samples. It is a centroid-based algorithm, which works by updating + candidates for centroids to be the mean of the points within a given + region. These candidates are then filtered in a post-processing stage to + eliminate near-duplicates to form the final set of centroids. 
+ + Seeding is performed using a binning technique for scalability. + + For an example of how to use MeanShift clustering, refer to: + :ref:`sphx_glr_auto_examples_cluster_plot_mean_shift.py`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + bandwidth : float, default=None + Bandwidth used in the flat kernel. + + If not given, the bandwidth is estimated using + sklearn.cluster.estimate_bandwidth; see the documentation for that + function for hints on scalability (see also the Notes, below). + + seeds : array-like of shape (n_samples, n_features), default=None + Seeds used to initialize kernels. If not set, + the seeds are calculated by clustering.get_bin_seeds + with bandwidth as the grid size and default values for + other parameters. + + bin_seeding : bool, default=False + If true, initial kernel locations are not locations of all + points, but rather the location of the discretized version of + points, where points are binned onto a grid whose coarseness + corresponds to the bandwidth. Setting this option to True will speed + up the algorithm because fewer seeds will be initialized. + The default value is False. + Ignored if seeds argument is not None. + + min_bin_freq : int, default=1 + To speed up the algorithm, accept only those bins with at least + min_bin_freq points as seeds. + + cluster_all : bool, default=True + If true, then all points are clustered, even those orphans that are + not within any kernel. Orphans are assigned to the nearest kernel. + If false, then orphans are given cluster label -1. + + n_jobs : int, default=None + The number of jobs to use for the computation. The following tasks benefit + from the parallelization: + + - The search of nearest neighbors for bandwidth estimation and label + assignments. See the details in the docstring of the + ``NearestNeighbors`` class. + - Hill-climbing optimization for all seeds. + + See :term:`Glossary ` for more details. + + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + max_iter : int, default=300 + Maximum number of iterations, per seed point before the clustering + operation terminates (for that seed point), if has not converged yet. + + .. versionadded:: 0.22 + + Attributes + ---------- + cluster_centers_ : ndarray of shape (n_clusters, n_features) + Coordinates of cluster centers. + + labels_ : ndarray of shape (n_samples,) + Labels of each point. + + n_iter_ : int + Maximum number of iterations performed on each seed. + + .. versionadded:: 0.22 + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + KMeans : K-Means clustering. + + Notes + ----- + + Scalability: + + Because this implementation uses a flat kernel and + a Ball Tree to look up members of each kernel, the complexity will tend + towards O(T*n*log(n)) in lower dimensions, with n the number of samples + and T the number of points. In higher dimensions the complexity will + tend towards O(T*n^2). + + Scalability can be boosted by using fewer seeds, for example by using + a higher value of min_bin_freq in the get_bin_seeds function. + + Note that the estimate_bandwidth function is much less scalable than the + mean shift algorithm and will be the bottleneck if it is used. 
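+
+    As an illustrative workaround (the subsample size and quantile below are
+    arbitrary choices), the bandwidth can be estimated on a random subsample
+    and then passed explicitly::
+
+        import numpy as np
+        from sklearn.cluster import MeanShift, estimate_bandwidth
+
+        X = np.random.RandomState(42).normal(size=(2000, 2))
+        bandwidth = estimate_bandwidth(X, quantile=0.3, n_samples=500)
+        ms = MeanShift(bandwidth=bandwidth).fit(X)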
+ + References + ---------- + + Dorin Comaniciu and Peter Meer, "Mean Shift: A robust approach toward + feature space analysis". IEEE Transactions on Pattern Analysis and + Machine Intelligence. 2002. pp. 603-619. + + Examples + -------- + >>> from sklearn.cluster import MeanShift + >>> import numpy as np + >>> X = np.array([[1, 1], [2, 1], [1, 0], + ... [4, 7], [3, 5], [3, 6]]) + >>> clustering = MeanShift(bandwidth=2).fit(X) + >>> clustering.labels_ + array([1, 1, 1, 0, 0, 0]) + >>> clustering.predict([[0, 0], [5, 5]]) + array([1, 0]) + >>> clustering + MeanShift(bandwidth=2) + + For a comparison of Mean Shift clustering with other clustering algorithms, see + :ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py` + """ + + _parameter_constraints: dict = { + "bandwidth": [Interval(Real, 0, None, closed="neither"), None], + "seeds": ["array-like", None], + "bin_seeding": ["boolean"], + "min_bin_freq": [Interval(Integral, 1, None, closed="left")], + "cluster_all": ["boolean"], + "n_jobs": [Integral, None], + "max_iter": [Interval(Integral, 0, None, closed="left")], + } + + def __init__( + self, + *, + bandwidth=None, + seeds=None, + bin_seeding=False, + min_bin_freq=1, + cluster_all=True, + n_jobs=None, + max_iter=300, + ): + self.bandwidth = bandwidth + self.seeds = seeds + self.bin_seeding = bin_seeding + self.cluster_all = cluster_all + self.min_bin_freq = min_bin_freq + self.n_jobs = n_jobs + self.max_iter = max_iter + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Perform clustering. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Samples to cluster. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Fitted instance. + """ + X = validate_data(self, X) + bandwidth = self.bandwidth + if bandwidth is None: + bandwidth = estimate_bandwidth(X, n_jobs=self.n_jobs) + + seeds = self.seeds + if seeds is None: + if self.bin_seeding: + seeds = get_bin_seeds(X, bandwidth, self.min_bin_freq) + else: + seeds = X + n_samples, n_features = X.shape + center_intensity_dict = {} + + # We use n_jobs=1 because this will be used in nested calls under + # parallel calls to _mean_shift_single_seed so there is no need for + # for further parallelism. + nbrs = NearestNeighbors(radius=bandwidth, n_jobs=1).fit(X) + + # execute iterations on all seeds in parallel + all_res = Parallel(n_jobs=self.n_jobs)( + delayed(_mean_shift_single_seed)(seed, X, nbrs, self.max_iter) + for seed in seeds + ) + # copy results in a dictionary + for i in range(len(seeds)): + if all_res[i][1]: # i.e. len(points_within) > 0 + center_intensity_dict[all_res[i][0]] = all_res[i][1] + + self.n_iter_ = max([x[2] for x in all_res]) + + if not center_intensity_dict: + # nothing near seeds + raise ValueError( + "No point was within bandwidth=%f of any seed. Try a different seeding" + " strategy or increase the bandwidth." + % bandwidth + ) + + # POST PROCESSING: remove near duplicate points + # If the distance between two kernels is less than the bandwidth, + # then we have to remove one because it is a duplicate. Remove the + # one with fewer points. 
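+        # Sorting by (intensity, center coordinates) below makes the order,
+        # and hence the tie-breaking between kernels that attracted the same
+        # number of points, deterministic.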
+ + sorted_by_intensity = sorted( + center_intensity_dict.items(), + key=lambda tup: (tup[1], tup[0]), + reverse=True, + ) + sorted_centers = np.array([tup[0] for tup in sorted_by_intensity]) + unique = np.ones(len(sorted_centers), dtype=bool) + nbrs = NearestNeighbors(radius=bandwidth, n_jobs=self.n_jobs).fit( + sorted_centers + ) + for i, center in enumerate(sorted_centers): + if unique[i]: + neighbor_idxs = nbrs.radius_neighbors([center], return_distance=False)[ + 0 + ] + unique[neighbor_idxs] = 0 + unique[i] = 1 # leave the current point as unique + cluster_centers = sorted_centers[unique] + + # ASSIGN LABELS: a point belongs to the cluster that it is closest to + nbrs = NearestNeighbors(n_neighbors=1, n_jobs=self.n_jobs).fit(cluster_centers) + labels = np.zeros(n_samples, dtype=int) + distances, idxs = nbrs.kneighbors(X) + if self.cluster_all: + labels = idxs.flatten() + else: + labels.fill(-1) + bool_selector = distances.flatten() <= bandwidth + labels[bool_selector] = idxs.flatten()[bool_selector] + + self.cluster_centers_, self.labels_ = cluster_centers, labels + return self + + def predict(self, X): + """Predict the closest cluster each sample in X belongs to. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + New data to predict. + + Returns + ------- + labels : ndarray of shape (n_samples,) + Index of the cluster each sample belongs to. + """ + check_is_fitted(self) + X = validate_data(self, X, reset=False) + with config_context(assume_finite=True): + return pairwise_distances_argmin(X, self.cluster_centers_) diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_optics.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/_optics.py new file mode 100644 index 0000000000000000000000000000000000000000..4a1a80c9065c2d1504a6c97a926b919374e0a1ee --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_optics.py @@ -0,0 +1,1202 @@ +"""Ordering Points To Identify the Clustering Structure (OPTICS) + +These routines execute the OPTICS algorithm, and implement various +cluster extraction methods of the ordered list. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from numbers import Integral, Real + +import numpy as np +from scipy.sparse import SparseEfficiencyWarning, issparse + +from ..base import BaseEstimator, ClusterMixin, _fit_context +from ..exceptions import DataConversionWarning +from ..metrics import pairwise_distances +from ..metrics.pairwise import _VALID_METRICS, PAIRWISE_BOOLEAN_FUNCTIONS +from ..neighbors import NearestNeighbors +from ..utils import gen_batches +from ..utils._chunking import get_chunk_n_rows +from ..utils._param_validation import ( + HasMethods, + Interval, + RealNotInt, + StrOptions, + validate_params, +) +from ..utils.validation import check_memory, validate_data + + +class OPTICS(ClusterMixin, BaseEstimator): + """Estimate clustering structure from vector array. + + OPTICS (Ordering Points To Identify the Clustering Structure), closely + related to DBSCAN, finds core samples of high density and expands clusters + from them [1]_. Unlike DBSCAN, it keeps cluster hierarchy for a variable + neighborhood radius. Better suited for usage on large datasets than the + current scikit-learn implementation of DBSCAN. + + Clusters are then extracted from the cluster-order using a + DBSCAN-like method (cluster_method = 'dbscan') or an automatic + technique proposed in [1]_ (cluster_method = 'xi'). 
+ + This implementation deviates from the original OPTICS by first performing + k-nearest-neighborhood searches on all points to identify core sizes of + all points (instead of computing neighbors while looping through points). + Reachability distances to only unprocessed points are then computed, to + construct the cluster order, similar to the original OPTICS. + Note that we do not employ a heap to manage the expansion + candidates, so the time complexity will be O(n^2). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + min_samples : int > 1 or float between 0 and 1, default=5 + The number of samples in a neighborhood for a point to be considered as + a core point. Also, up and down steep regions can't have more than + ``min_samples`` consecutive non-steep points. Expressed as an absolute + number or a fraction of the number of samples (rounded to be at least + 2). + + max_eps : float, default=np.inf + The maximum distance between two samples for one to be considered as + in the neighborhood of the other. Default value of ``np.inf`` will + identify clusters across all scales; reducing ``max_eps`` will result + in shorter run times. + + metric : str or callable, default='minkowski' + Metric to use for distance computation. Any metric from scikit-learn + or :mod:`scipy.spatial.distance` can be used. + + If `metric` is a callable function, it is called on each + pair of instances (rows) and the resulting value recorded. The callable + should take two arrays as input and return one value indicating the + distance between them. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. If metric is + "precomputed", `X` is assumed to be a distance matrix and must be + square. + + Valid values for metric are: + + - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', + 'manhattan'] + + - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', + 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', + 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', + 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', + 'yule'] + + Sparse matrices are only supported by scikit-learn metrics. + See :mod:`scipy.spatial.distance` for details on these metrics. + + .. note:: + `'kulsinski'` is deprecated from SciPy 1.9 and will be removed in SciPy 1.11. + + p : float, default=2 + Parameter for the Minkowski metric from + :class:`~sklearn.metrics.pairwise_distances`. When p = 1, this is + equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + cluster_method : {'xi', 'dbscan'}, default='xi' + The extraction method used to extract clusters using the calculated + reachability and ordering. + + eps : float, default=None + The maximum distance between two samples for one to be considered as + in the neighborhood of the other. By default it assumes the same value + as ``max_eps``. + Used only when ``cluster_method='dbscan'``. + + xi : float between 0 and 1, default=0.05 + Determines the minimum steepness on the reachability plot that + constitutes a cluster boundary. For example, an upwards point in the + reachability plot is defined by the ratio from one point to its + successor being at most 1-xi. + Used only when ``cluster_method='xi'``. 
+ + predecessor_correction : bool, default=True + Correct clusters according to the predecessors calculated by OPTICS + [2]_. This parameter has minimal effect on most datasets. + Used only when ``cluster_method='xi'``. + + min_cluster_size : int > 1 or float between 0 and 1, default=None + Minimum number of samples in an OPTICS cluster, expressed as an + absolute number or a fraction of the number of samples (rounded to be + at least 2). If ``None``, the value of ``min_samples`` is used instead. + Used only when ``cluster_method='xi'``. + + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' + Algorithm used to compute the nearest neighbors: + + - 'ball_tree' will use :class:`~sklearn.neighbors.BallTree`. + - 'kd_tree' will use :class:`~sklearn.neighbors.KDTree`. + - 'brute' will use a brute-force search. + - 'auto' (default) will attempt to decide the most appropriate + algorithm based on the values passed to :meth:`fit` method. + + Note: fitting on sparse input will override the setting of + this parameter, using brute force. + + leaf_size : int, default=30 + Leaf size passed to :class:`~sklearn.neighbors.BallTree` or + :class:`~sklearn.neighbors.KDTree`. This can affect the speed of the + construction and query, as well as the memory required to store the + tree. The optimal value depends on the nature of the problem. + + memory : str or object with the joblib.Memory interface, default=None + Used to cache the output of the computation of the tree. + By default, no caching is done. If a string is given, it is the + path to the caching directory. + + n_jobs : int, default=None + The number of parallel jobs to run for neighbors search. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Attributes + ---------- + labels_ : ndarray of shape (n_samples,) + Cluster labels for each point in the dataset given to fit(). + Noisy samples and points which are not included in a leaf cluster + of ``cluster_hierarchy_`` are labeled as -1. + + reachability_ : ndarray of shape (n_samples,) + Reachability distances per sample, indexed by object order. Use + ``clust.reachability_[clust.ordering_]`` to access in cluster order. + + ordering_ : ndarray of shape (n_samples,) + The cluster ordered list of sample indices. + + core_distances_ : ndarray of shape (n_samples,) + Distance at which each sample becomes a core point, indexed by object + order. Points which will never be core have a distance of inf. Use + ``clust.core_distances_[clust.ordering_]`` to access in cluster order. + + predecessor_ : ndarray of shape (n_samples,) + Point that a sample was reached from, indexed by object order. + Seed points have a predecessor of -1. + + cluster_hierarchy_ : ndarray of shape (n_clusters, 2) + The list of clusters in the form of ``[start, end]`` in each row, with + all indices inclusive. The clusters are ordered according to + ``(end, -start)`` (ascending) so that larger clusters encompassing + smaller clusters come after those smaller ones. Since ``labels_`` does + not reflect the hierarchy, usually + ``len(cluster_hierarchy_) > np.unique(optics.labels_)``. Please also + note that these indices are of the ``ordering_``, i.e. + ``X[ordering_][start:end + 1]`` form a cluster. + Only available when ``cluster_method='xi'``. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. 
versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + DBSCAN : A similar clustering for a specified neighborhood radius (eps). + Our implementation is optimized for runtime. + + References + ---------- + .. [1] Ankerst, Mihael, Markus M. Breunig, Hans-Peter Kriegel, + and Jörg Sander. "OPTICS: ordering points to identify the clustering + structure." ACM SIGMOD Record 28, no. 2 (1999): 49-60. + + .. [2] Schubert, Erich, Michael Gertz. + "Improving the Cluster Structure Extracted from OPTICS Plots." Proc. of + the Conference "Lernen, Wissen, Daten, Analysen" (LWDA) (2018): 318-329. + + Examples + -------- + >>> from sklearn.cluster import OPTICS + >>> import numpy as np + >>> X = np.array([[1, 2], [2, 5], [3, 6], + ... [8, 7], [8, 8], [7, 3]]) + >>> clustering = OPTICS(min_samples=2).fit(X) + >>> clustering.labels_ + array([0, 0, 0, 1, 1, 1]) + + For a more detailed example see + :ref:`sphx_glr_auto_examples_cluster_plot_optics.py`. + + For a comparison of OPTICS with other clustering algorithms, see + :ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py` + """ + + _parameter_constraints: dict = { + "min_samples": [ + Interval(Integral, 2, None, closed="left"), + Interval(RealNotInt, 0, 1, closed="both"), + ], + "max_eps": [Interval(Real, 0, None, closed="both")], + "metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable], + "p": [Interval(Real, 1, None, closed="left")], + "metric_params": [dict, None], + "cluster_method": [StrOptions({"dbscan", "xi"})], + "eps": [Interval(Real, 0, None, closed="both"), None], + "xi": [Interval(Real, 0, 1, closed="both")], + "predecessor_correction": ["boolean"], + "min_cluster_size": [ + Interval(Integral, 2, None, closed="left"), + Interval(RealNotInt, 0, 1, closed="right"), + None, + ], + "algorithm": [StrOptions({"auto", "brute", "ball_tree", "kd_tree"})], + "leaf_size": [Interval(Integral, 1, None, closed="left")], + "memory": [str, HasMethods("cache"), None], + "n_jobs": [Integral, None], + } + + def __init__( + self, + *, + min_samples=5, + max_eps=np.inf, + metric="minkowski", + p=2, + metric_params=None, + cluster_method="xi", + eps=None, + xi=0.05, + predecessor_correction=True, + min_cluster_size=None, + algorithm="auto", + leaf_size=30, + memory=None, + n_jobs=None, + ): + self.max_eps = max_eps + self.min_samples = min_samples + self.min_cluster_size = min_cluster_size + self.algorithm = algorithm + self.metric = metric + self.metric_params = metric_params + self.p = p + self.leaf_size = leaf_size + self.cluster_method = cluster_method + self.eps = eps + self.xi = xi + self.predecessor_correction = predecessor_correction + self.memory = memory + self.n_jobs = n_jobs + + @_fit_context( + # Optics.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y=None): + """Perform OPTICS clustering. + + Extracts an ordered list of points and reachability distances, and + performs initial clustering using ``max_eps`` distance specified at + OPTICS object instantiation. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features), or \ + (n_samples, n_samples) if metric='precomputed' + A feature array, or array of distances between samples if + metric='precomputed'. If a sparse matrix is provided, it will be + converted into CSR format. 
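A small usage sketch for the precomputed case described above (editor's illustration, not part of the source): any square distance matrix can be passed to ``fit`` when ``metric='precomputed'``.

import numpy as np
from sklearn.cluster import OPTICS
from sklearn.metrics import pairwise_distances

X = np.array([[1, 2], [2, 5], [3, 6], [8, 7], [8, 8], [7, 3]], dtype=float)
D = pairwise_distances(X)  # square (n_samples, n_samples) Euclidean distance matrix
labels = OPTICS(min_samples=2, metric="precomputed").fit(D).labels_
# should match the Euclidean example in the class docstring: [0 0 0 1 1 1]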
+ + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Returns a fitted instance of self. + """ + dtype = bool if self.metric in PAIRWISE_BOOLEAN_FUNCTIONS else float + if dtype is bool and X.dtype != bool: + msg = ( + "Data will be converted to boolean for" + f" metric {self.metric}, to avoid this warning," + " you may convert the data prior to calling fit." + ) + warnings.warn(msg, DataConversionWarning) + + X = validate_data(self, X, dtype=dtype, accept_sparse="csr") + if self.metric == "precomputed" and issparse(X): + X = X.copy() # copy to avoid in-place modification + with warnings.catch_warnings(): + warnings.simplefilter("ignore", SparseEfficiencyWarning) + # Set each diagonal to an explicit value so each point is its + # own neighbor + X.setdiag(X.diagonal()) + memory = check_memory(self.memory) + + ( + self.ordering_, + self.core_distances_, + self.reachability_, + self.predecessor_, + ) = memory.cache(compute_optics_graph)( + X=X, + min_samples=self.min_samples, + algorithm=self.algorithm, + leaf_size=self.leaf_size, + metric=self.metric, + metric_params=self.metric_params, + p=self.p, + n_jobs=self.n_jobs, + max_eps=self.max_eps, + ) + + # Extract clusters from the calculated orders and reachability + if self.cluster_method == "xi": + labels_, clusters_ = cluster_optics_xi( + reachability=self.reachability_, + predecessor=self.predecessor_, + ordering=self.ordering_, + min_samples=self.min_samples, + min_cluster_size=self.min_cluster_size, + xi=self.xi, + predecessor_correction=self.predecessor_correction, + ) + self.cluster_hierarchy_ = clusters_ + elif self.cluster_method == "dbscan": + if self.eps is None: + eps = self.max_eps + else: + eps = self.eps + + if eps > self.max_eps: + raise ValueError( + "Specify an epsilon smaller than %s. Got %s." % (self.max_eps, eps) + ) + + labels_ = cluster_optics_dbscan( + reachability=self.reachability_, + core_distances=self.core_distances_, + ordering=self.ordering_, + eps=eps, + ) + + self.labels_ = labels_ + return self + + +def _validate_size(size, n_samples, param_name): + if size > n_samples: + raise ValueError( + "%s must be no greater than the number of samples (%d). Got %d" + % (param_name, n_samples, size) + ) + + +# OPTICS helper functions +def _compute_core_distances_(X, neighbors, min_samples, working_memory): + """Compute the k-th nearest neighbor of each sample. + + Equivalent to neighbors.kneighbors(X, self.min_samples)[0][:, -1] + but with more memory efficiency. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data. + neighbors : NearestNeighbors instance + The fitted nearest neighbors estimator. + working_memory : int, default=None + The sought maximum memory for temporary distance matrix chunks. + When None (default), the value of + ``sklearn.get_config()['working_memory']`` is used. + + Returns + ------- + core_distances : ndarray of shape (n_samples,) + Distance at which each sample becomes a core point. + Points which will never be core have a distance of inf. 
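The equivalence noted above can be sketched with the public ``NearestNeighbors`` API (editor's illustration on random data, not part of the source); the helper defined here only adds chunking for memory efficiency.

import numpy as np
from sklearn.neighbors import NearestNeighbors

X = np.random.RandomState(0).rand(50, 2)
min_samples = 5
nbrs = NearestNeighbors(n_neighbors=min_samples).fit(X)
# distance to the min_samples-th nearest neighbour (each point counts as its
# own first neighbour), i.e. the core distance before capping at max_eps
core_distances = nbrs.kneighbors(X, min_samples)[0][:, -1]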
+ """ + n_samples = X.shape[0] + core_distances = np.empty(n_samples) + core_distances.fill(np.nan) + + chunk_n_rows = get_chunk_n_rows( + row_bytes=16 * min_samples, max_n_rows=n_samples, working_memory=working_memory + ) + slices = gen_batches(n_samples, chunk_n_rows) + for sl in slices: + core_distances[sl] = neighbors.kneighbors(X[sl], min_samples)[0][:, -1] + return core_distances + + +@validate_params( + { + "X": [np.ndarray, "sparse matrix"], + "min_samples": [ + Interval(Integral, 2, None, closed="left"), + Interval(RealNotInt, 0, 1, closed="both"), + ], + "max_eps": [Interval(Real, 0, None, closed="both")], + "metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable], + "p": [Interval(Real, 0, None, closed="right"), None], + "metric_params": [dict, None], + "algorithm": [StrOptions({"auto", "brute", "ball_tree", "kd_tree"})], + "leaf_size": [Interval(Integral, 1, None, closed="left")], + "n_jobs": [Integral, None], + }, + prefer_skip_nested_validation=False, # metric is not validated yet +) +def compute_optics_graph( + X, *, min_samples, max_eps, metric, p, metric_params, algorithm, leaf_size, n_jobs +): + """Compute the OPTICS reachability graph. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features), or \ + (n_samples, n_samples) if metric='precomputed' + A feature array, or array of distances between samples if + metric='precomputed'. + + min_samples : int > 1 or float between 0 and 1 + The number of samples in a neighborhood for a point to be considered + as a core point. Expressed as an absolute number or a fraction of the + number of samples (rounded to be at least 2). + + max_eps : float, default=np.inf + The maximum distance between two samples for one to be considered as + in the neighborhood of the other. Default value of ``np.inf`` will + identify clusters across all scales; reducing ``max_eps`` will result + in shorter run times. + + metric : str or callable, default='minkowski' + Metric to use for distance computation. Any metric from scikit-learn + or scipy.spatial.distance can be used. + + If metric is a callable function, it is called on each + pair of instances (rows) and the resulting value recorded. The callable + should take two arrays as input and return one value indicating the + distance between them. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. If metric is + "precomputed", X is assumed to be a distance matrix and must be square. + + Valid values for metric are: + + - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', + 'manhattan'] + + - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', + 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', + 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', + 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', + 'yule'] + + See the documentation for scipy.spatial.distance for details on these + metrics. + + .. note:: + `'kulsinski'` is deprecated from SciPy 1.9 and will be removed in SciPy 1.11. + + p : float, default=2 + Parameter for the Minkowski metric from + :class:`~sklearn.metrics.pairwise_distances`. When p = 1, this is + equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. 
+ + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' + Algorithm used to compute the nearest neighbors: + + - 'ball_tree' will use :class:`~sklearn.neighbors.BallTree`. + - 'kd_tree' will use :class:`~sklearn.neighbors.KDTree`. + - 'brute' will use a brute-force search. + - 'auto' will attempt to decide the most appropriate algorithm + based on the values passed to `fit` method. (default) + + Note: fitting on sparse input will override the setting of + this parameter, using brute force. + + leaf_size : int, default=30 + Leaf size passed to :class:`~sklearn.neighbors.BallTree` or + :class:`~sklearn.neighbors.KDTree`. This can affect the speed of the + construction and query, as well as the memory required to store the + tree. The optimal value depends on the nature of the problem. + + n_jobs : int, default=None + The number of parallel jobs to run for neighbors search. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Returns + ------- + ordering_ : array of shape (n_samples,) + The cluster ordered list of sample indices. + + core_distances_ : array of shape (n_samples,) + Distance at which each sample becomes a core point, indexed by object + order. Points which will never be core have a distance of inf. Use + ``clust.core_distances_[clust.ordering_]`` to access in cluster order. + + reachability_ : array of shape (n_samples,) + Reachability distances per sample, indexed by object order. Use + ``clust.reachability_[clust.ordering_]`` to access in cluster order. + + predecessor_ : array of shape (n_samples,) + Point that a sample was reached from, indexed by object order. + Seed points have a predecessor of -1. + + References + ---------- + .. [1] Ankerst, Mihael, Markus M. Breunig, Hans-Peter Kriegel, + and Jörg Sander. "OPTICS: ordering points to identify the clustering + structure." ACM SIGMOD Record 28, no. 2 (1999): 49-60. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.cluster import compute_optics_graph + >>> X = np.array([[1, 2], [2, 5], [3, 6], + ... [8, 7], [8, 8], [7, 3]]) + >>> ordering, core_distances, reachability, predecessor = compute_optics_graph( + ... X, + ... min_samples=2, + ... max_eps=np.inf, + ... metric="minkowski", + ... p=2, + ... metric_params=None, + ... algorithm="auto", + ... leaf_size=30, + ... n_jobs=None, + ... ) + >>> ordering + array([0, 1, 2, 5, 3, 4]) + >>> core_distances + array([3.16, 1.41, 1.41, 1. , 1. , + 4.12]) + >>> reachability + array([ inf, 3.16, 1.41, 4.12, 1. , + 5. ]) + >>> predecessor + array([-1, 0, 1, 5, 3, 2]) + """ + n_samples = X.shape[0] + _validate_size(min_samples, n_samples, "min_samples") + if min_samples <= 1: + min_samples = max(2, int(min_samples * n_samples)) + + # Start all points as 'unprocessed' ## + reachability_ = np.empty(n_samples) + reachability_.fill(np.inf) + predecessor_ = np.empty(n_samples, dtype=int) + predecessor_.fill(-1) + + nbrs = NearestNeighbors( + n_neighbors=min_samples, + algorithm=algorithm, + leaf_size=leaf_size, + metric=metric, + metric_params=metric_params, + p=p, + n_jobs=n_jobs, + ) + + nbrs.fit(X) + # Here we first do a kNN query for each point, this differs from + # the original OPTICS that only used epsilon range queries. + # TODO: handle working_memory somehow? + core_distances_ = _compute_core_distances_( + X=X, neighbors=nbrs, min_samples=min_samples, working_memory=None + ) + # OPTICS puts an upper limit on these, use inf for undefined. 
+ core_distances_[core_distances_ > max_eps] = np.inf + np.around( + core_distances_, + decimals=np.finfo(core_distances_.dtype).precision, + out=core_distances_, + ) + + # Main OPTICS loop. Not parallelizable. The order that entries are + # written to the 'ordering_' list is important! + # Note that this implementation is O(n^2) theoretically, but + # supposedly with very low constant factors. + processed = np.zeros(X.shape[0], dtype=bool) + ordering = np.zeros(X.shape[0], dtype=int) + for ordering_idx in range(X.shape[0]): + # Choose next based on smallest reachability distance + # (And prefer smaller ids on ties, possibly np.inf!) + index = np.where(processed == 0)[0] + point = index[np.argmin(reachability_[index])] + + processed[point] = True + ordering[ordering_idx] = point + if core_distances_[point] != np.inf: + _set_reach_dist( + core_distances_=core_distances_, + reachability_=reachability_, + predecessor_=predecessor_, + point_index=point, + processed=processed, + X=X, + nbrs=nbrs, + metric=metric, + metric_params=metric_params, + p=p, + max_eps=max_eps, + ) + if np.all(np.isinf(reachability_)): + warnings.warn( + ( + "All reachability values are inf. Set a larger" + " max_eps or all data will be considered outliers." + ), + UserWarning, + ) + return ordering, core_distances_, reachability_, predecessor_ + + +def _set_reach_dist( + core_distances_, + reachability_, + predecessor_, + point_index, + processed, + X, + nbrs, + metric, + metric_params, + p, + max_eps, +): + P = X[point_index : point_index + 1] + # Assume that radius_neighbors is faster without distances + # and we don't need all distances, nevertheless, this means + # we may be doing some work twice. + indices = nbrs.radius_neighbors(P, radius=max_eps, return_distance=False)[0] + + # Getting indices of neighbors that have not been processed + unproc = np.compress(~np.take(processed, indices), indices) + # Neighbors of current point are already processed. + if not unproc.size: + return + + # Only compute distances to unprocessed neighbors: + if metric == "precomputed": + dists = X[[point_index], unproc] + if isinstance(dists, np.matrix): + dists = np.asarray(dists) + dists = dists.ravel() + else: + _params = dict() if metric_params is None else metric_params.copy() + if metric == "minkowski" and "p" not in _params: + # the same logic as neighbors, p is ignored if explicitly set + # in the dict params + _params["p"] = p + dists = pairwise_distances(P, X[unproc], metric, n_jobs=None, **_params).ravel() + + rdists = np.maximum(dists, core_distances_[point_index]) + np.around(rdists, decimals=np.finfo(rdists.dtype).precision, out=rdists) + improved = np.where(rdists < np.take(reachability_, unproc)) + reachability_[unproc[improved]] = rdists[improved] + predecessor_[unproc[improved]] = point_index + + +@validate_params( + { + "reachability": [np.ndarray], + "core_distances": [np.ndarray], + "ordering": [np.ndarray], + "eps": [Interval(Real, 0, None, closed="both")], + }, + prefer_skip_nested_validation=True, +) +def cluster_optics_dbscan(*, reachability, core_distances, ordering, eps): + """Perform DBSCAN extraction for an arbitrary epsilon. + + Extracting the clusters runs in linear time. Note that this results in + ``labels_`` which are close to a :class:`~sklearn.cluster.DBSCAN` with + similar settings and ``eps``, only if ``eps`` is close to ``max_eps``. + + Parameters + ---------- + reachability : ndarray of shape (n_samples,) + Reachability distances calculated by OPTICS (``reachability_``). 
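Because extraction only reads the arrays already computed by OPTICS, one fitted estimator can be re-labelled at several ``eps`` values without re-fitting; a brief editor's sketch (not part of the source):

import numpy as np
from sklearn.cluster import OPTICS, cluster_optics_dbscan

X = np.array([[1, 2], [2, 5], [3, 6], [8, 7], [8, 8], [7, 3]], dtype=float)
clust = OPTICS(min_samples=2).fit(X)
labelings = {
    eps: cluster_optics_dbscan(
        reachability=clust.reachability_,
        core_distances=clust.core_distances_,
        ordering=clust.ordering_,
        eps=eps,
    )
    for eps in (0.5, 2.0, 5.0)  # arbitrary eps values chosen for illustration
}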
+ + core_distances : ndarray of shape (n_samples,) + Distances at which points become core (``core_distances_``). + + ordering : ndarray of shape (n_samples,) + OPTICS ordered point indices (``ordering_``). + + eps : float + DBSCAN ``eps`` parameter. Must be set to < ``max_eps``. Results + will be close to DBSCAN algorithm if ``eps`` and ``max_eps`` are close + to one another. + + Returns + ------- + labels_ : array of shape (n_samples,) + The estimated labels. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.cluster import cluster_optics_dbscan, compute_optics_graph + >>> X = np.array([[1, 2], [2, 5], [3, 6], + ... [8, 7], [8, 8], [7, 3]]) + >>> ordering, core_distances, reachability, predecessor = compute_optics_graph( + ... X, + ... min_samples=2, + ... max_eps=np.inf, + ... metric="minkowski", + ... p=2, + ... metric_params=None, + ... algorithm="auto", + ... leaf_size=30, + ... n_jobs=None, + ... ) + >>> eps = 4.5 + >>> labels = cluster_optics_dbscan( + ... reachability=reachability, + ... core_distances=core_distances, + ... ordering=ordering, + ... eps=eps, + ... ) + >>> labels + array([0, 0, 0, 1, 1, 1]) + """ + n_samples = len(core_distances) + labels = np.zeros(n_samples, dtype=int) + + far_reach = reachability > eps + near_core = core_distances <= eps + labels[ordering] = np.cumsum(far_reach[ordering] & near_core[ordering]) - 1 + labels[far_reach & ~near_core] = -1 + return labels + + +@validate_params( + { + "reachability": [np.ndarray], + "predecessor": [np.ndarray], + "ordering": [np.ndarray], + "min_samples": [ + Interval(Integral, 2, None, closed="left"), + Interval(RealNotInt, 0, 1, closed="both"), + ], + "min_cluster_size": [ + Interval(Integral, 2, None, closed="left"), + Interval(RealNotInt, 0, 1, closed="both"), + None, + ], + "xi": [Interval(Real, 0, 1, closed="both")], + "predecessor_correction": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def cluster_optics_xi( + *, + reachability, + predecessor, + ordering, + min_samples, + min_cluster_size=None, + xi=0.05, + predecessor_correction=True, +): + """Automatically extract clusters according to the Xi-steep method. + + Parameters + ---------- + reachability : ndarray of shape (n_samples,) + Reachability distances calculated by OPTICS (`reachability_`). + + predecessor : ndarray of shape (n_samples,) + Predecessors calculated by OPTICS. + + ordering : ndarray of shape (n_samples,) + OPTICS ordered point indices (`ordering_`). + + min_samples : int > 1 or float between 0 and 1 + The same as the min_samples given to OPTICS. Up and down steep regions + can't have more then ``min_samples`` consecutive non-steep points. + Expressed as an absolute number or a fraction of the number of samples + (rounded to be at least 2). + + min_cluster_size : int > 1 or float between 0 and 1, default=None + Minimum number of samples in an OPTICS cluster, expressed as an + absolute number or a fraction of the number of samples (rounded to be + at least 2). If ``None``, the value of ``min_samples`` is used instead. + + xi : float between 0 and 1, default=0.05 + Determines the minimum steepness on the reachability plot that + constitutes a cluster boundary. For example, an upwards point in the + reachability plot is defined by the ratio from one point to its + successor being at most 1-xi. + + predecessor_correction : bool, default=True + Correct clusters based on the calculated predecessors. + + Returns + ------- + labels : ndarray of shape (n_samples,) + The labels assigned to samples. 
Points which are not included + in any cluster are labeled as -1. + + clusters : ndarray of shape (n_clusters, 2) + The list of clusters in the form of ``[start, end]`` in each row, with + all indices inclusive. The clusters are ordered according to ``(end, + -start)`` (ascending) so that larger clusters encompassing smaller + clusters come after such nested smaller clusters. Since ``labels`` does + not reflect the hierarchy, usually ``len(clusters) > + np.unique(labels)``. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.cluster import cluster_optics_xi, compute_optics_graph + >>> X = np.array([[1, 2], [2, 5], [3, 6], + ... [8, 7], [8, 8], [7, 3]]) + >>> ordering, core_distances, reachability, predecessor = compute_optics_graph( + ... X, + ... min_samples=2, + ... max_eps=np.inf, + ... metric="minkowski", + ... p=2, + ... metric_params=None, + ... algorithm="auto", + ... leaf_size=30, + ... n_jobs=None + ... ) + >>> min_samples = 2 + >>> labels, clusters = cluster_optics_xi( + ... reachability=reachability, + ... predecessor=predecessor, + ... ordering=ordering, + ... min_samples=min_samples, + ... ) + >>> labels + array([0, 0, 0, 1, 1, 1]) + >>> clusters + array([[0, 2], + [3, 5], + [0, 5]]) + """ + n_samples = len(reachability) + _validate_size(min_samples, n_samples, "min_samples") + if min_samples <= 1: + min_samples = max(2, int(min_samples * n_samples)) + if min_cluster_size is None: + min_cluster_size = min_samples + _validate_size(min_cluster_size, n_samples, "min_cluster_size") + if min_cluster_size <= 1: + min_cluster_size = max(2, int(min_cluster_size * n_samples)) + + clusters = _xi_cluster( + reachability[ordering], + predecessor[ordering], + ordering, + xi, + min_samples, + min_cluster_size, + predecessor_correction, + ) + labels = _extract_xi_labels(ordering, clusters) + return labels, clusters + + +def _extend_region(steep_point, xward_point, start, min_samples): + """Extend the area until it's maximal. + + It's the same function for both upward and downward reagions, depending on + the given input parameters. Assuming: + + - steep_{upward/downward}: bool array indicating whether a point is a + steep {upward/downward}; + - upward/downward: bool array indicating whether a point is + upward/downward; + + To extend an upward reagion, ``steep_point=steep_upward`` and + ``xward_point=downward`` are expected, and to extend a downward region, + ``steep_point=steep_downward`` and ``xward_point=upward``. + + Parameters + ---------- + steep_point : ndarray of shape (n_samples,), dtype=bool + True if the point is steep downward (upward). + + xward_point : ndarray of shape (n_samples,), dtype=bool + True if the point is an upward (respectively downward) point. + + start : int + The start of the xward region. + + min_samples : int + The same as the min_samples given to OPTICS. Up and down steep + regions can't have more then ``min_samples`` consecutive non-steep + points. + + Returns + ------- + index : int + The current index iterating over all the samples, i.e. where we are up + to in our search. + + end : int + The end of the region, which can be behind the index. The region + includes the ``end`` index. + """ + n_samples = len(steep_point) + non_xward_points = 0 + index = start + end = start + # find a maximal area + while index < n_samples: + if steep_point[index]: + non_xward_points = 0 + end = index + elif not xward_point[index]: + # it's not a steep point, but still goes up. 
+ non_xward_points += 1 + # region should include no more than min_samples consecutive + # non steep xward points. + if non_xward_points > min_samples: + break + else: + return end + index += 1 + return end + + +def _update_filter_sdas(sdas, mib, xi_complement, reachability_plot): + """Update steep down areas (SDAs) using the new maximum in between (mib) + value, and the given complement of xi, i.e. ``1 - xi``. + """ + if np.isinf(mib): + return [] + res = [ + sda for sda in sdas if mib <= reachability_plot[sda["start"]] * xi_complement + ] + for sda in res: + sda["mib"] = max(sda["mib"], mib) + return res + + +def _correct_predecessor(reachability_plot, predecessor_plot, ordering, s, e): + """Correct for predecessors. + + Applies Algorithm 2 of [1]_. + + Input parameters are ordered by the computer OPTICS ordering. + + .. [1] Schubert, Erich, Michael Gertz. + "Improving the Cluster Structure Extracted from OPTICS Plots." Proc. of + the Conference "Lernen, Wissen, Daten, Analysen" (LWDA) (2018): 318-329. + """ + while s < e: + if reachability_plot[s] > reachability_plot[e]: + return s, e + p_e = predecessor_plot[e] + for i in range(s, e): + if p_e == ordering[i]: + return s, e + e -= 1 + return None, None + + +def _xi_cluster( + reachability_plot, + predecessor_plot, + ordering, + xi, + min_samples, + min_cluster_size, + predecessor_correction, +): + """Automatically extract clusters according to the Xi-steep method. + + This is rouphly an implementation of Figure 19 of the OPTICS paper. + + Parameters + ---------- + reachability_plot : array-like of shape (n_samples,) + The reachability plot, i.e. reachability ordered according to + the calculated ordering, all computed by OPTICS. + + predecessor_plot : array-like of shape (n_samples,) + Predecessors ordered according to the calculated ordering. + + xi : float, between 0 and 1 + Determines the minimum steepness on the reachability plot that + constitutes a cluster boundary. For example, an upwards point in the + reachability plot is defined by the ratio from one point to its + successor being at most 1-xi. + + min_samples : int > 1 + The same as the min_samples given to OPTICS. Up and down steep regions + can't have more then ``min_samples`` consecutive non-steep points. + + min_cluster_size : int > 1 + Minimum number of samples in an OPTICS cluster. + + predecessor_correction : bool + Correct clusters based on the calculated predecessors. + + Returns + ------- + clusters : ndarray of shape (n_clusters, 2) + The list of clusters in the form of [start, end] in each row, with all + indices inclusive. The clusters are ordered in a way that larger + clusters encompassing smaller clusters come after those smaller + clusters. + """ + + # Our implementation adds an inf to the end of reachability plot + # this helps to find potential clusters at the end of the + # reachability plot even if there's no upward region at the end of it. 
+ reachability_plot = np.hstack((reachability_plot, np.inf)) + + xi_complement = 1 - xi + sdas = [] # steep down areas, introduced in section 4.3.2 of the paper + clusters = [] + index = 0 + mib = 0.0 # maximum in between, section 4.3.2 + + # Our implementation corrects a mistake in the original + # paper, i.e., in Definition 9 steep downward point, + # r(p) * (1 - x1) <= r(p + 1) should be + # r(p) * (1 - x1) >= r(p + 1) + with np.errstate(invalid="ignore"): + ratio = reachability_plot[:-1] / reachability_plot[1:] + steep_upward = ratio <= xi_complement + steep_downward = ratio >= 1 / xi_complement + downward = ratio > 1 + upward = ratio < 1 + + # the following loop is almost exactly as Figure 19 of the paper. + # it jumps over the areas which are not either steep down or up areas + for steep_index in iter(np.flatnonzero(steep_upward | steep_downward)): + # just continue if steep_index has been a part of a discovered xward + # area. + if steep_index < index: + continue + + mib = max(mib, np.max(reachability_plot[index : steep_index + 1])) + + # steep downward areas + if steep_downward[steep_index]: + sdas = _update_filter_sdas(sdas, mib, xi_complement, reachability_plot) + D_start = steep_index + D_end = _extend_region(steep_downward, upward, D_start, min_samples) + D = {"start": D_start, "end": D_end, "mib": 0.0} + sdas.append(D) + index = D_end + 1 + mib = reachability_plot[index] + + # steep upward areas + else: + sdas = _update_filter_sdas(sdas, mib, xi_complement, reachability_plot) + U_start = steep_index + U_end = _extend_region(steep_upward, downward, U_start, min_samples) + index = U_end + 1 + mib = reachability_plot[index] + + U_clusters = [] + for D in sdas: + c_start = D["start"] + c_end = U_end + + # line (**), sc2* + if reachability_plot[c_end + 1] * xi_complement < D["mib"]: + continue + + # Definition 11: criterion 4 + D_max = reachability_plot[D["start"]] + if D_max * xi_complement >= reachability_plot[c_end + 1]: + # Find the first index from the left side which is almost + # at the same level as the end of the detected cluster. + while ( + reachability_plot[c_start + 1] > reachability_plot[c_end + 1] + and c_start < D["end"] + ): + c_start += 1 + elif reachability_plot[c_end + 1] * xi_complement >= D_max: + # Find the first index from the right side which is almost + # at the same level as the beginning of the detected + # cluster. + # Our implementation corrects a mistake in the original + # paper, i.e., in Definition 11 4c, r(x) < r(sD) should be + # r(x) > r(sD). + while reachability_plot[c_end - 1] > D_max and c_end > U_start: + c_end -= 1 + + # predecessor correction + if predecessor_correction: + c_start, c_end = _correct_predecessor( + reachability_plot, predecessor_plot, ordering, c_start, c_end + ) + if c_start is None: + continue + + # Definition 11: criterion 3.a + if c_end - c_start + 1 < min_cluster_size: + continue + + # Definition 11: criterion 1 + if c_start > D["end"]: + continue + + # Definition 11: criterion 2 + if c_end < U_start: + continue + + U_clusters.append((c_start, c_end)) + + # add smaller clusters first. + U_clusters.reverse() + clusters.extend(U_clusters) + + return np.array(clusters) + + +def _extract_xi_labels(ordering, clusters): + """Extracts the labels from the clusters returned by `_xi_cluster`. + We rely on the fact that clusters are stored + with the smaller clusters coming before the larger ones. 
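An editor's illustration of the labelling rule just described (not part of the source), reusing the cluster array from the ``cluster_optics_xi`` doctest:

import numpy as np

clusters = np.array([[0, 2], [3, 5], [0, 5]])  # smaller clusters listed first
labels = np.full(6, -1)
label = 0
for start, end in clusters:
    if not np.any(labels[start : end + 1] != -1):
        labels[start : end + 1] = label
        label += 1
# labels -> [0 0 0 1 1 1]; the enclosing [0, 5] cluster is skipped because its
# points already belong to the smaller leaf clusters. The real helper then maps
# these order-based labels back to sample indices via ``ordering``.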
+ + Parameters + ---------- + ordering : array-like of shape (n_samples,) + The ordering of points calculated by OPTICS + + clusters : array-like of shape (n_clusters, 2) + List of clusters i.e. (start, end) tuples, + as returned by `_xi_cluster`. + + Returns + ------- + labels : ndarray of shape (n_samples,) + """ + + labels = np.full(len(ordering), -1, dtype=int) + label = 0 + for c in clusters: + if not np.any(labels[c[0] : (c[1] + 1)] != -1): + labels[c[0] : (c[1] + 1)] = label + label += 1 + labels[ordering] = labels.copy() + return labels diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_spectral.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/_spectral.py new file mode 100644 index 0000000000000000000000000000000000000000..00d23437504e5ad019e49583972f244d85a5dae6 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_spectral.py @@ -0,0 +1,805 @@ +"""Algorithms for spectral clustering""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from numbers import Integral, Real + +import numpy as np +from scipy.linalg import LinAlgError, qr, svd +from scipy.sparse import csc_matrix + +from ..base import BaseEstimator, ClusterMixin, _fit_context +from ..manifold._spectral_embedding import _spectral_embedding +from ..metrics.pairwise import KERNEL_PARAMS, pairwise_kernels +from ..neighbors import NearestNeighbors, kneighbors_graph +from ..utils import as_float_array, check_random_state +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.validation import validate_data +from ._kmeans import k_means + + +def cluster_qr(vectors): + """Find the discrete partition closest to the eigenvector embedding. + + This implementation was proposed in [1]_. + + .. versionadded:: 1.1 + + Parameters + ---------- + vectors : array-like, shape: (n_samples, n_clusters) + The embedding space of the samples. + + Returns + ------- + labels : array of integers, shape: n_samples + The cluster labels of vectors. + + References + ---------- + .. [1] :doi:`Simple, direct, and efficient multi-way spectral clustering, 2019 + Anil Damle, Victor Minden, Lexing Ying + <10.1093/imaiai/iay008>` + + """ + + k = vectors.shape[1] + _, _, piv = qr(vectors.T, pivoting=True) + ut, _, v = svd(vectors[piv[:k], :].T) + vectors = abs(np.dot(vectors, np.dot(ut, v.conj()))) + return vectors.argmax(axis=1) + + +def discretize( + vectors, *, copy=True, max_svd_restarts=30, n_iter_max=20, random_state=None +): + """Search for a partition matrix which is closest to the eigenvector embedding. + + This implementation was proposed in [1]_. + + Parameters + ---------- + vectors : array-like of shape (n_samples, n_clusters) + The embedding space of the samples. + + copy : bool, default=True + Whether to copy vectors, or perform in-place normalization. + + max_svd_restarts : int, default=30 + Maximum number of attempts to restart SVD if convergence fails + + n_iter_max : int, default=30 + Maximum number of iterations to attempt in rotation and partition + matrix search if machine precision convergence is not reached + + random_state : int, RandomState instance, default=None + Determines random number generation for rotation matrix initialization. + Use an int to make the randomness deterministic. + See :term:`Glossary `. + + Returns + ------- + labels : array of integers, shape: n_samples + The labels of the clusters. + + References + ---------- + + .. [1] `Multiclass spectral clustering, 2003 + Stella X. 
Yu, Jianbo Shi + `_ + + Notes + ----- + + The eigenvector embedding is used to iteratively search for the + closest discrete partition. First, the eigenvector embedding is + normalized to the space of partition matrices. An optimal discrete + partition matrix closest to this normalized embedding multiplied by + an initial rotation is calculated. Fixing this discrete partition + matrix, an optimal rotation matrix is calculated. These two + calculations are performed until convergence. The discrete partition + matrix is returned as the clustering solution. Used in spectral + clustering, this method tends to be faster and more robust to random + initialization than k-means. + + """ + + random_state = check_random_state(random_state) + + vectors = as_float_array(vectors, copy=copy) + + eps = np.finfo(float).eps + n_samples, n_components = vectors.shape + + # Normalize the eigenvectors to an equal length of a vector of ones. + # Reorient the eigenvectors to point in the negative direction with respect + # to the first element. This may have to do with constraining the + # eigenvectors to lie in a specific quadrant to make the discretization + # search easier. + norm_ones = np.sqrt(n_samples) + for i in range(vectors.shape[1]): + vectors[:, i] = (vectors[:, i] / np.linalg.norm(vectors[:, i])) * norm_ones + if vectors[0, i] != 0: + vectors[:, i] = -1 * vectors[:, i] * np.sign(vectors[0, i]) + + # Normalize the rows of the eigenvectors. Samples should lie on the unit + # hypersphere centered at the origin. This transforms the samples in the + # embedding space to the space of partition matrices. + vectors = vectors / np.sqrt((vectors**2).sum(axis=1))[:, np.newaxis] + + svd_restarts = 0 + has_converged = False + + # If there is an exception we try to randomize and rerun SVD again + # do this max_svd_restarts times. 
+ while (svd_restarts < max_svd_restarts) and not has_converged: + # Initialize first column of rotation matrix with a row of the + # eigenvectors + rotation = np.zeros((n_components, n_components)) + rotation[:, 0] = vectors[random_state.randint(n_samples), :].T + + # To initialize the rest of the rotation matrix, find the rows + # of the eigenvectors that are as orthogonal to each other as + # possible + c = np.zeros(n_samples) + for j in range(1, n_components): + # Accumulate c to ensure row is as orthogonal as possible to + # previous picks as well as current one + c += np.abs(np.dot(vectors, rotation[:, j - 1])) + rotation[:, j] = vectors[c.argmin(), :].T + + last_objective_value = 0.0 + n_iter = 0 + + while not has_converged: + n_iter += 1 + + t_discrete = np.dot(vectors, rotation) + + labels = t_discrete.argmax(axis=1) + vectors_discrete = csc_matrix( + (np.ones(len(labels)), (np.arange(0, n_samples), labels)), + shape=(n_samples, n_components), + ) + + t_svd = vectors_discrete.T @ vectors + + try: + U, S, Vh = np.linalg.svd(t_svd) + except LinAlgError: + svd_restarts += 1 + print("SVD did not converge, randomizing and trying again") + break + + ncut_value = 2.0 * (n_samples - S.sum()) + if (abs(ncut_value - last_objective_value) < eps) or (n_iter > n_iter_max): + has_converged = True + else: + # otherwise calculate rotation and continue + last_objective_value = ncut_value + rotation = np.dot(Vh.T, U.T) + + if not has_converged: + raise LinAlgError("SVD did not converge") + return labels + + +@validate_params( + {"affinity": ["array-like", "sparse matrix"]}, + prefer_skip_nested_validation=False, +) +def spectral_clustering( + affinity, + *, + n_clusters=8, + n_components=None, + eigen_solver=None, + random_state=None, + n_init=10, + eigen_tol="auto", + assign_labels="kmeans", + verbose=False, +): + """Apply clustering to a projection of the normalized Laplacian. + + In practice Spectral Clustering is very useful when the structure of + the individual clusters is highly non-convex or more generally when + a measure of the center and spread of the cluster is not a suitable + description of the complete cluster. For instance, when clusters are + nested circles on the 2D plane. + + If affinity is the adjacency matrix of a graph, this method can be + used to find normalized graph cuts [1]_, [2]_. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + affinity : {array-like, sparse matrix} of shape (n_samples, n_samples) + The affinity matrix describing the relationship of the samples to + embed. **Must be symmetric**. + + Possible examples: + - adjacency matrix of a graph, + - heat kernel of the pairwise distance matrix of the samples, + - symmetric k-nearest neighbours connectivity matrix of the samples. + + n_clusters : int, default=None + Number of clusters to extract. + + n_components : int, default=n_clusters + Number of eigenvectors to use for the spectral embedding. + + eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'} + The eigenvalue decomposition method. If None then ``'arpack'`` is used. + See [4]_ for more details regarding ``'lobpcg'``. + Eigensolver ``'amg'`` runs ``'lobpcg'`` with optional + Algebraic MultiGrid preconditioning and requires pyamg to be installed. + It can be faster on very large sparse problems [6]_ and [7]_. + + random_state : int, RandomState instance, default=None + A pseudo random number generator used for the initialization + of the lobpcg eigenvectors decomposition when `eigen_solver == + 'amg'`, and for the K-Means initialization. 
Use an int to make + the results deterministic across calls (See + :term:`Glossary `). + + .. note:: + When using `eigen_solver == 'amg'`, + it is necessary to also fix the global numpy seed with + `np.random.seed(int)` to get deterministic results. See + https://github.com/pyamg/pyamg/issues/139 for further + information. + + n_init : int, default=10 + Number of time the k-means algorithm will be run with different + centroid seeds. The final results will be the best output of n_init + consecutive runs in terms of inertia. Only used if + ``assign_labels='kmeans'``. + + eigen_tol : float, default="auto" + Stopping criterion for eigendecomposition of the Laplacian matrix. + If `eigen_tol="auto"` then the passed tolerance will depend on the + `eigen_solver`: + + - If `eigen_solver="arpack"`, then `eigen_tol=0.0`; + - If `eigen_solver="lobpcg"` or `eigen_solver="amg"`, then + `eigen_tol=None` which configures the underlying `lobpcg` solver to + automatically resolve the value according to their heuristics. See, + :func:`scipy.sparse.linalg.lobpcg` for details. + + Note that when using `eigen_solver="lobpcg"` or `eigen_solver="amg"` + values of `tol<1e-5` may lead to convergence issues and should be + avoided. + + .. versionadded:: 1.2 + Added 'auto' option. + + assign_labels : {'kmeans', 'discretize', 'cluster_qr'}, default='kmeans' + The strategy to use to assign labels in the embedding + space. There are three ways to assign labels after the Laplacian + embedding. k-means can be applied and is a popular choice. But it can + also be sensitive to initialization. Discretization is another + approach which is less sensitive to random initialization [3]_. + The cluster_qr method [5]_ directly extracts clusters from eigenvectors + in spectral clustering. In contrast to k-means and discretization, cluster_qr + has no tuning parameters and is not an iterative method, yet may outperform + k-means and discretization in terms of both quality and speed. For a detailed + comparison of clustering strategies, refer to the following example: + :ref:`sphx_glr_auto_examples_cluster_plot_coin_segmentation.py`. + + .. versionchanged:: 1.1 + Added new labeling method 'cluster_qr'. + + verbose : bool, default=False + Verbosity mode. + + .. versionadded:: 0.24 + + Returns + ------- + labels : array of integers, shape: n_samples + The labels of the clusters. + + Notes + ----- + The graph should contain only one connected component, elsewhere + the results make little sense. + + This algorithm solves the normalized cut for `k=2`: it is a + normalized spectral clustering. + + References + ---------- + + .. [1] :doi:`Normalized cuts and image segmentation, 2000 + Jianbo Shi, Jitendra Malik + <10.1109/34.868688>` + + .. [2] :doi:`A Tutorial on Spectral Clustering, 2007 + Ulrike von Luxburg + <10.1007/s11222-007-9033-z>` + + .. [3] `Multiclass spectral clustering, 2003 + Stella X. Yu, Jianbo Shi + `_ + + .. [4] :doi:`Toward the Optimal Preconditioned Eigensolver: + Locally Optimal Block Preconditioned Conjugate Gradient Method, 2001 + A. V. Knyazev + SIAM Journal on Scientific Computing 23, no. 2, pp. 517-541. + <10.1137/S1064827500366124>` + + .. [5] :doi:`Simple, direct, and efficient multi-way spectral clustering, 2019 + Anil Damle, Victor Minden, Lexing Ying + <10.1093/imaiai/iay008>` + + .. [6] :doi:`Multiscale Spectral Image Segmentation Multiscale preconditioning + for computing eigenvalues of graph Laplacians in image segmentation, 2006 + Andrew Knyazev + <10.13140/RG.2.2.35280.02565>` + + .. 
[7] :doi:`Preconditioned spectral clustering for stochastic block partition + streaming graph challenge (Preliminary version at arXiv.) + David Zhuzhunashvili, Andrew Knyazev + <10.1109/HPEC.2017.8091045>` + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics.pairwise import pairwise_kernels + >>> from sklearn.cluster import spectral_clustering + >>> X = np.array([[1, 1], [2, 1], [1, 0], + ... [4, 7], [3, 5], [3, 6]]) + >>> affinity = pairwise_kernels(X, metric='rbf') + >>> spectral_clustering( + ... affinity=affinity, n_clusters=2, assign_labels="discretize", random_state=0 + ... ) + array([1, 1, 1, 0, 0, 0]) + """ + + clusterer = SpectralClustering( + n_clusters=n_clusters, + n_components=n_components, + eigen_solver=eigen_solver, + random_state=random_state, + n_init=n_init, + affinity="precomputed", + eigen_tol=eigen_tol, + assign_labels=assign_labels, + verbose=verbose, + ).fit(affinity) + + return clusterer.labels_ + + +class SpectralClustering(ClusterMixin, BaseEstimator): + """Apply clustering to a projection of the normalized Laplacian. + + In practice Spectral Clustering is very useful when the structure of + the individual clusters is highly non-convex, or more generally when + a measure of the center and spread of the cluster is not a suitable + description of the complete cluster, such as when clusters are + nested circles on the 2D plane. + + If the affinity matrix is the adjacency matrix of a graph, this method + can be used to find normalized graph cuts [1]_, [2]_. + + When calling ``fit``, an affinity matrix is constructed using either + a kernel function such the Gaussian (aka RBF) kernel with Euclidean + distance ``d(X, X)``:: + + np.exp(-gamma * d(X,X) ** 2) + + or a k-nearest neighbors connectivity matrix. + + Alternatively, a user-provided affinity matrix can be specified by + setting ``affinity='precomputed'``. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_clusters : int, default=8 + The dimension of the projection subspace. + + eigen_solver : {'arpack', 'lobpcg', 'amg'}, default=None + The eigenvalue decomposition strategy to use. AMG requires pyamg + to be installed. It can be faster on very large, sparse problems, + but may also lead to instabilities. If None, then ``'arpack'`` is + used. See [4]_ for more details regarding `'lobpcg'`. + + n_components : int, default=None + Number of eigenvectors to use for the spectral embedding. If None, + defaults to `n_clusters`. + + random_state : int, RandomState instance, default=None + A pseudo random number generator used for the initialization + of the lobpcg eigenvectors decomposition when `eigen_solver == + 'amg'`, and for the K-Means initialization. Use an int to make + the results deterministic across calls (See + :term:`Glossary `). + + .. note:: + When using `eigen_solver == 'amg'`, + it is necessary to also fix the global numpy seed with + `np.random.seed(int)` to get deterministic results. See + https://github.com/pyamg/pyamg/issues/139 for further + information. + + n_init : int, default=10 + Number of time the k-means algorithm will be run with different + centroid seeds. The final results will be the best output of n_init + consecutive runs in terms of inertia. Only used if + ``assign_labels='kmeans'``. + + gamma : float, default=1.0 + Kernel coefficient for rbf, poly, sigmoid, laplacian and chi2 kernels. + Ignored for ``affinity='nearest_neighbors'``, ``affinity='precomputed'`` + or ``affinity='precomputed_nearest_neighbors'``. 
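The kernel written out above can also be computed by hand and passed in with ``affinity='precomputed'``; a hedged editor's sketch (the ``gamma`` value is chosen arbitrarily, and this is not part of the source):

import numpy as np
from sklearn.cluster import SpectralClustering
from sklearn.metrics import pairwise_distances

X = np.array([[1, 1], [2, 1], [1, 0], [4, 7], [3, 5], [3, 6]], dtype=float)
gamma = 1.0  # arbitrary kernel width for illustration
affinity = np.exp(-gamma * pairwise_distances(X) ** 2)  # same form as affinity='rbf'
labels = SpectralClustering(
    n_clusters=2, affinity="precomputed", assign_labels="discretize", random_state=0
).fit(affinity).labels_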
+ + affinity : str or callable, default='rbf' + How to construct the affinity matrix. + - 'nearest_neighbors': construct the affinity matrix by computing a + graph of nearest neighbors. + - 'rbf': construct the affinity matrix using a radial basis function + (RBF) kernel. + - 'precomputed': interpret ``X`` as a precomputed affinity matrix, + where larger values indicate greater similarity between instances. + - 'precomputed_nearest_neighbors': interpret ``X`` as a sparse graph + of precomputed distances, and construct a binary affinity matrix + from the ``n_neighbors`` nearest neighbors of each instance. + - one of the kernels supported by + :func:`~sklearn.metrics.pairwise.pairwise_kernels`. + + Only kernels that produce similarity scores (non-negative values that + increase with similarity) should be used. This property is not checked + by the clustering algorithm. + + n_neighbors : int, default=10 + Number of neighbors to use when constructing the affinity matrix using + the nearest neighbors method. Ignored for ``affinity='rbf'``. + + eigen_tol : float, default="auto" + Stopping criterion for eigen decomposition of the Laplacian matrix. + If `eigen_tol="auto"` then the passed tolerance will depend on the + `eigen_solver`: + + - If `eigen_solver="arpack"`, then `eigen_tol=0.0`; + - If `eigen_solver="lobpcg"` or `eigen_solver="amg"`, then + `eigen_tol=None` which configures the underlying `lobpcg` solver to + automatically resolve the value according to their heuristics. See, + :func:`scipy.sparse.linalg.lobpcg` for details. + + Note that when using `eigen_solver="lobpcg"` or `eigen_solver="amg"` + values of `tol<1e-5` may lead to convergence issues and should be + avoided. + + .. versionadded:: 1.2 + Added 'auto' option. + + assign_labels : {'kmeans', 'discretize', 'cluster_qr'}, default='kmeans' + The strategy for assigning labels in the embedding space. There are two + ways to assign labels after the Laplacian embedding. k-means is a + popular choice, but it can be sensitive to initialization. + Discretization is another approach which is less sensitive to random + initialization [3]_. + The cluster_qr method [5]_ directly extract clusters from eigenvectors + in spectral clustering. In contrast to k-means and discretization, cluster_qr + has no tuning parameters and runs no iterations, yet may outperform + k-means and discretization in terms of both quality and speed. + + .. versionchanged:: 1.1 + Added new labeling method 'cluster_qr'. + + degree : float, default=3 + Degree of the polynomial kernel. Ignored by other kernels. + + coef0 : float, default=1 + Zero coefficient for polynomial and sigmoid kernels. + Ignored by other kernels. + + kernel_params : dict of str to any, default=None + Parameters (keyword arguments) and values for kernel passed as + callable object. Ignored by other kernels. + + n_jobs : int, default=None + The number of parallel jobs to run when `affinity='nearest_neighbors'` + or `affinity='precomputed_nearest_neighbors'`. The neighbors search + will be done in parallel. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + verbose : bool, default=False + Verbosity mode. + + .. versionadded:: 0.24 + + Attributes + ---------- + affinity_matrix_ : array-like of shape (n_samples, n_samples) + Affinity matrix used for clustering. Available only after calling + ``fit``. 
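Since the label-assignment strategies described above are interchangeable, a quick editor's comparison sketch on synthetic blobs (not part of the source) can be run as follows:

from sklearn.cluster import SpectralClustering
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=60, centers=3, cluster_std=0.5, random_state=0)
for method in ("kmeans", "discretize", "cluster_qr"):
    labels = SpectralClustering(
        n_clusters=3, assign_labels=method, random_state=0
    ).fit_predict(X)
    # all three strategies should recover the same well-separated blobs,
    # possibly with permuted label ids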
+ + labels_ : ndarray of shape (n_samples,) + Labels of each point + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + sklearn.cluster.KMeans : K-Means clustering. + sklearn.cluster.DBSCAN : Density-Based Spatial Clustering of + Applications with Noise. + + Notes + ----- + A distance matrix for which 0 indicates identical elements and high values + indicate very dissimilar elements can be transformed into an affinity / + similarity matrix that is well-suited for the algorithm by + applying the Gaussian (aka RBF, heat) kernel:: + + np.exp(- dist_matrix ** 2 / (2. * delta ** 2)) + + where ``delta`` is a free parameter representing the width of the Gaussian + kernel. + + An alternative is to take a symmetric version of the k-nearest neighbors + connectivity matrix of the points. + + If the pyamg package is installed, it is used: this greatly + speeds up computation. + + References + ---------- + .. [1] :doi:`Normalized cuts and image segmentation, 2000 + Jianbo Shi, Jitendra Malik + <10.1109/34.868688>` + + .. [2] :doi:`A Tutorial on Spectral Clustering, 2007 + Ulrike von Luxburg + <10.1007/s11222-007-9033-z>` + + .. [3] `Multiclass spectral clustering, 2003 + Stella X. Yu, Jianbo Shi + `_ + + .. [4] :doi:`Toward the Optimal Preconditioned Eigensolver: + Locally Optimal Block Preconditioned Conjugate Gradient Method, 2001 + A. V. Knyazev + SIAM Journal on Scientific Computing 23, no. 2, pp. 517-541. + <10.1137/S1064827500366124>` + + .. [5] :doi:`Simple, direct, and efficient multi-way spectral clustering, 2019 + Anil Damle, Victor Minden, Lexing Ying + <10.1093/imaiai/iay008>` + + Examples + -------- + >>> from sklearn.cluster import SpectralClustering + >>> import numpy as np + >>> X = np.array([[1, 1], [2, 1], [1, 0], + ... [4, 7], [3, 5], [3, 6]]) + >>> clustering = SpectralClustering(n_clusters=2, + ... assign_labels='discretize', + ... 
random_state=0).fit(X) + >>> clustering.labels_ + array([1, 1, 1, 0, 0, 0]) + >>> clustering + SpectralClustering(assign_labels='discretize', n_clusters=2, + random_state=0) + + For a comparison of Spectral clustering with other clustering algorithms, see + :ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py` + """ + + _parameter_constraints: dict = { + "n_clusters": [Interval(Integral, 1, None, closed="left")], + "eigen_solver": [StrOptions({"arpack", "lobpcg", "amg"}), None], + "n_components": [Interval(Integral, 1, None, closed="left"), None], + "random_state": ["random_state"], + "n_init": [Interval(Integral, 1, None, closed="left")], + "gamma": [Interval(Real, 0, None, closed="left")], + "affinity": [ + callable, + StrOptions( + set(KERNEL_PARAMS) + | {"nearest_neighbors", "precomputed", "precomputed_nearest_neighbors"} + ), + ], + "n_neighbors": [Interval(Integral, 1, None, closed="left")], + "eigen_tol": [ + Interval(Real, 0.0, None, closed="left"), + StrOptions({"auto"}), + ], + "assign_labels": [StrOptions({"kmeans", "discretize", "cluster_qr"})], + "degree": [Interval(Real, 0, None, closed="left")], + "coef0": [Interval(Real, None, None, closed="neither")], + "kernel_params": [dict, None], + "n_jobs": [Integral, None], + "verbose": ["verbose"], + } + + def __init__( + self, + n_clusters=8, + *, + eigen_solver=None, + n_components=None, + random_state=None, + n_init=10, + gamma=1.0, + affinity="rbf", + n_neighbors=10, + eigen_tol="auto", + assign_labels="kmeans", + degree=3, + coef0=1, + kernel_params=None, + n_jobs=None, + verbose=False, + ): + self.n_clusters = n_clusters + self.eigen_solver = eigen_solver + self.n_components = n_components + self.random_state = random_state + self.n_init = n_init + self.gamma = gamma + self.affinity = affinity + self.n_neighbors = n_neighbors + self.eigen_tol = eigen_tol + self.assign_labels = assign_labels + self.degree = degree + self.coef0 = coef0 + self.kernel_params = kernel_params + self.n_jobs = n_jobs + self.verbose = verbose + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Perform spectral clustering from features, or affinity matrix. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) + Training instances to cluster, similarities / affinities between + instances if ``affinity='precomputed'``, or distances between + instances if ``affinity='precomputed_nearest_neighbors``. If a + sparse matrix is provided in a format other than ``csr_matrix``, + ``csc_matrix``, or ``coo_matrix``, it will be converted into a + sparse ``csr_matrix``. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + self : object + A fitted instance of the estimator. + """ + X = validate_data( + self, + X, + accept_sparse=["csr", "csc", "coo"], + dtype=np.float64, + ensure_min_samples=2, + ) + allow_squared = self.affinity in [ + "precomputed", + "precomputed_nearest_neighbors", + ] + if X.shape[0] == X.shape[1] and not allow_squared: + warnings.warn( + "The spectral clustering API has changed. ``fit``" + "now constructs an affinity matrix from data. To use" + " a custom affinity matrix, " + "set ``affinity=precomputed``." 
+ ) + + if self.affinity == "nearest_neighbors": + connectivity = kneighbors_graph( + X, n_neighbors=self.n_neighbors, include_self=True, n_jobs=self.n_jobs + ) + self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T) + elif self.affinity == "precomputed_nearest_neighbors": + estimator = NearestNeighbors( + n_neighbors=self.n_neighbors, n_jobs=self.n_jobs, metric="precomputed" + ).fit(X) + connectivity = estimator.kneighbors_graph(X=X, mode="connectivity") + self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T) + elif self.affinity == "precomputed": + self.affinity_matrix_ = X + else: + params = self.kernel_params + if params is None: + params = {} + if not callable(self.affinity): + params["gamma"] = self.gamma + params["degree"] = self.degree + params["coef0"] = self.coef0 + self.affinity_matrix_ = pairwise_kernels( + X, metric=self.affinity, filter_params=True, **params + ) + + random_state = check_random_state(self.random_state) + n_components = ( + self.n_clusters if self.n_components is None else self.n_components + ) + # We now obtain the real valued solution matrix to the + # relaxed Ncut problem, solving the eigenvalue problem + # L_sym x = lambda x and recovering u = D^-1/2 x. + # The first eigenvector is constant only for fully connected graphs + # and should be kept for spectral clustering (drop_first = False) + # See spectral_embedding documentation. + maps = _spectral_embedding( + self.affinity_matrix_, + n_components=n_components, + eigen_solver=self.eigen_solver, + random_state=random_state, + eigen_tol=self.eigen_tol, + drop_first=False, + ) + if self.verbose: + print(f"Computing label assignment using {self.assign_labels}") + + if self.assign_labels == "kmeans": + _, self.labels_, _ = k_means( + maps, + self.n_clusters, + random_state=random_state, + n_init=self.n_init, + verbose=self.verbose, + ) + elif self.assign_labels == "cluster_qr": + self.labels_ = cluster_qr(maps) + else: + self.labels_ = discretize(maps, random_state=random_state) + + return self + + def fit_predict(self, X, y=None): + """Perform spectral clustering on `X` and return cluster labels. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) + Training instances to cluster, similarities / affinities between + instances if ``affinity='precomputed'``, or distances between + instances if ``affinity='precomputed_nearest_neighbors``. If a + sparse matrix is provided in a format other than ``csr_matrix``, + ``csc_matrix``, or ``coo_matrix``, it will be converted into a + sparse ``csr_matrix``. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + labels : ndarray of shape (n_samples,) + Cluster labels. 
+ """ + return super().fit_predict(X, y) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + tags.input_tags.pairwise = self.affinity in [ + "precomputed", + "precomputed_nearest_neighbors", + ] + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/meson.build b/.venv/lib/python3.12/site-packages/sklearn/cluster/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..6c11619f3ca555c58ed43b1d579548d75cf6aea4 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/meson.build @@ -0,0 +1,26 @@ +cluster_extension_metadata = { + '_dbscan_inner': + {'sources': [cython_gen_cpp.process('_dbscan_inner.pyx')]}, + '_hierarchical_fast': + {'sources': [cython_gen_cpp.process('_hierarchical_fast.pyx'), metrics_cython_tree]}, + '_k_means_common': + {'sources': [cython_gen.process('_k_means_common.pyx')], 'dependencies': [openmp_dep]}, + '_k_means_lloyd': + {'sources': [cython_gen.process('_k_means_lloyd.pyx')], 'dependencies': [openmp_dep]}, + '_k_means_elkan': + {'sources': [cython_gen.process('_k_means_elkan.pyx')], 'dependencies': [openmp_dep]}, + '_k_means_minibatch': + {'sources': [cython_gen.process('_k_means_minibatch.pyx')], 'dependencies': [openmp_dep]}, +} + +foreach ext_name, ext_dict : cluster_extension_metadata + py.extension_module( + ext_name, + [ext_dict.get('sources'), utils_cython_tree], + dependencies: [np_dep] + ext_dict.get('dependencies', []), + subdir: 'sklearn/cluster', + install: true + ) +endforeach + +subdir('_hdbscan') diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/common.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/common.py new file mode 100644 index 0000000000000000000000000000000000000000..b1fe047fe230af1c3fbb2ec0b72f3ef20e5aa3aa --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/common.py @@ -0,0 +1,37 @@ +""" +Common utilities for testing clustering. 
+ +""" + +import numpy as np + +############################################################################### +# Generate sample data + + +def generate_clustered_data( + seed=0, n_clusters=3, n_features=2, n_samples_per_cluster=20, std=0.4 +): + prng = np.random.RandomState(seed) + + # the data is voluntary shifted away from zero to check clustering + # algorithm robustness with regards to non centered data + means = ( + np.array( + [ + [1, 1, 1, 0], + [-1, -1, 0, 1], + [1, -1, 1, 1], + [-1, 1, 1, 0], + ] + ) + + 10 + ) + + X = np.empty((0, n_features)) + for i in range(n_clusters): + X = np.r_[ + X, + means[i][:n_features] + std * prng.randn(n_samples_per_cluster, n_features), + ] + return X diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_affinity_propagation.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_affinity_propagation.py new file mode 100644 index 0000000000000000000000000000000000000000..c3138e59111ed849988dd0e6d3433a4bb251e2a1 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_affinity_propagation.py @@ -0,0 +1,321 @@ +""" +Testing for Clustering methods + +""" + +import warnings + +import numpy as np +import pytest + +from sklearn.cluster import AffinityPropagation, affinity_propagation +from sklearn.cluster._affinity_propagation import _equal_similarities_and_preferences +from sklearn.datasets import make_blobs +from sklearn.exceptions import ConvergenceWarning, NotFittedError +from sklearn.metrics import euclidean_distances +from sklearn.utils._testing import assert_allclose, assert_array_equal +from sklearn.utils.fixes import CSR_CONTAINERS + +n_clusters = 3 +centers = np.array([[1, 1], [-1, -1], [1, -1]]) + 10 +X, _ = make_blobs( + n_samples=60, + n_features=2, + centers=centers, + cluster_std=0.4, + shuffle=True, + random_state=0, +) + +# TODO: AffinityPropagation must preserve dtype for its fitted attributes +# and test must be created accordingly to this new behavior. +# For more details, see: https://github.com/scikit-learn/scikit-learn/issues/11000 + + +def test_affinity_propagation(global_random_seed, global_dtype): + """Test consistency of the affinity propagations.""" + S = -euclidean_distances(X.astype(global_dtype, copy=False), squared=True) + preference = np.median(S) * 10 + cluster_centers_indices, labels = affinity_propagation( + S, preference=preference, random_state=global_random_seed + ) + + n_clusters_ = len(cluster_centers_indices) + + assert n_clusters == n_clusters_ + + +def test_affinity_propagation_precomputed(): + """Check equality of precomputed affinity matrix to internally computed affinity + matrix. 
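For orientation, the two input modes this test compares can be sketched as follows (illustrative blob data; only the feature-space mode supports predict):

import numpy as np
from sklearn.cluster import AffinityPropagation
from sklearn.datasets import make_blobs
from sklearn.metrics import euclidean_distances

X, _ = make_blobs(n_samples=60, centers=3, cluster_std=0.4, random_state=0)

# Feature-space mode: similarities are computed internally, predict() works.
af = AffinityPropagation(random_state=0).fit(X)
labels_new = af.predict(X[:5])

# Precomputed mode: pass negative squared Euclidean distances as similarities.
S = -euclidean_distances(X, squared=True)
af_precomputed = AffinityPropagation(affinity="precomputed", random_state=0).fit(S)
labels_precomputed = af_precomputed.labels_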
+ """ + S = -euclidean_distances(X, squared=True) + preference = np.median(S) * 10 + af = AffinityPropagation( + preference=preference, affinity="precomputed", random_state=28 + ) + labels_precomputed = af.fit(S).labels_ + + af = AffinityPropagation(preference=preference, verbose=True, random_state=37) + labels = af.fit(X).labels_ + + assert_array_equal(labels, labels_precomputed) + + cluster_centers_indices = af.cluster_centers_indices_ + + n_clusters_ = len(cluster_centers_indices) + assert np.unique(labels).size == n_clusters_ + assert n_clusters == n_clusters_ + + +def test_affinity_propagation_no_copy(): + """Check behaviour of not copying the input data.""" + S = -euclidean_distances(X, squared=True) + S_original = S.copy() + preference = np.median(S) * 10 + assert not np.allclose(S.diagonal(), preference) + + # with copy=True S should not be modified + affinity_propagation(S, preference=preference, copy=True, random_state=0) + assert_allclose(S, S_original) + assert not np.allclose(S.diagonal(), preference) + assert_allclose(S.diagonal(), np.zeros(S.shape[0])) + + # with copy=False S will be modified inplace + affinity_propagation(S, preference=preference, copy=False, random_state=0) + assert_allclose(S.diagonal(), preference) + + # test that copy=True and copy=False lead to the same result + S = S_original.copy() + af = AffinityPropagation(preference=preference, verbose=True, random_state=0) + + labels = af.fit(X).labels_ + _, labels_no_copy = affinity_propagation( + S, preference=preference, copy=False, random_state=74 + ) + assert_array_equal(labels, labels_no_copy) + + +def test_affinity_propagation_affinity_shape(): + """Check the shape of the affinity matrix when using `affinity_propagation.""" + S = -euclidean_distances(X, squared=True) + err_msg = "The matrix of similarities must be a square array" + with pytest.raises(ValueError, match=err_msg): + affinity_propagation(S[:, :-1]) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_affinity_propagation_precomputed_with_sparse_input(csr_container): + err_msg = "Sparse data was passed for X, but dense data is required" + with pytest.raises(TypeError, match=err_msg): + AffinityPropagation(affinity="precomputed").fit(csr_container((3, 3))) + + +def test_affinity_propagation_predict(global_random_seed, global_dtype): + # Test AffinityPropagation.predict + af = AffinityPropagation(affinity="euclidean", random_state=global_random_seed) + X_ = X.astype(global_dtype, copy=False) + labels = af.fit_predict(X_) + labels2 = af.predict(X_) + assert_array_equal(labels, labels2) + + +def test_affinity_propagation_predict_error(): + # Test exception in AffinityPropagation.predict + # Not fitted. + af = AffinityPropagation(affinity="euclidean") + with pytest.raises(NotFittedError): + af.predict(X) + + # Predict not supported when affinity="precomputed". 
+ S = np.dot(X, X.T) + af = AffinityPropagation(affinity="precomputed", random_state=57) + af.fit(S) + with pytest.raises(ValueError, match="expecting 60 features as input"): + af.predict(X) + + +def test_affinity_propagation_fit_non_convergence(global_dtype): + # In case of non-convergence of affinity_propagation(), the cluster + # centers should be an empty array and training samples should be labelled + # as noise (-1) + X = np.array([[0, 0], [1, 1], [-2, -2]], dtype=global_dtype) + + # Force non-convergence by allowing only a single iteration + af = AffinityPropagation(preference=-10, max_iter=1, random_state=82) + + with pytest.warns(ConvergenceWarning): + af.fit(X) + assert_allclose(np.empty((0, 2)), af.cluster_centers_) + assert_array_equal(np.array([-1, -1, -1]), af.labels_) + + +def test_affinity_propagation_equal_mutual_similarities(global_dtype): + X = np.array([[-1, 1], [1, -1]], dtype=global_dtype) + S = -euclidean_distances(X, squared=True) + + # setting preference > similarity + with pytest.warns(UserWarning, match="mutually equal"): + cluster_center_indices, labels = affinity_propagation(S, preference=0) + + # expect every sample to become an exemplar + assert_array_equal([0, 1], cluster_center_indices) + assert_array_equal([0, 1], labels) + + # setting preference < similarity + with pytest.warns(UserWarning, match="mutually equal"): + cluster_center_indices, labels = affinity_propagation(S, preference=-10) + + # expect one cluster, with arbitrary (first) sample as exemplar + assert_array_equal([0], cluster_center_indices) + assert_array_equal([0, 0], labels) + + # setting different preferences + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + cluster_center_indices, labels = affinity_propagation( + S, preference=[-20, -10], random_state=37 + ) + + # expect one cluster, with highest-preference sample as exemplar + assert_array_equal([1], cluster_center_indices) + assert_array_equal([0, 0], labels) + + +def test_affinity_propagation_predict_non_convergence(global_dtype): + # In case of non-convergence of affinity_propagation(), the cluster + # centers should be an empty array + X = np.array([[0, 0], [1, 1], [-2, -2]], dtype=global_dtype) + + # Force non-convergence by allowing only a single iteration + with pytest.warns(ConvergenceWarning): + af = AffinityPropagation(preference=-10, max_iter=1, random_state=75).fit(X) + + # At prediction time, consider new samples as noise since there are no + # clusters + to_predict = np.array([[2, 2], [3, 3], [4, 4]]) + with pytest.warns(ConvergenceWarning): + y = af.predict(to_predict) + assert_array_equal(np.array([-1, -1, -1]), y) + + +def test_affinity_propagation_non_convergence_regressiontest(global_dtype): + X = np.array( + [[1, 0, 0, 0, 0, 0], [0, 1, 1, 1, 0, 0], [0, 0, 1, 0, 0, 1]], dtype=global_dtype + ) + af = AffinityPropagation(affinity="euclidean", max_iter=2, random_state=34) + msg = ( + "Affinity propagation did not converge, this model may return degenerate" + " cluster centers and labels." 
+ ) + with pytest.warns(ConvergenceWarning, match=msg): + af.fit(X) + + assert_array_equal(np.array([0, 0, 0]), af.labels_) + + +def test_equal_similarities_and_preferences(global_dtype): + # Unequal distances + X = np.array([[0, 0], [1, 1], [-2, -2]], dtype=global_dtype) + S = -euclidean_distances(X, squared=True) + + assert not _equal_similarities_and_preferences(S, np.array(0)) + assert not _equal_similarities_and_preferences(S, np.array([0, 0])) + assert not _equal_similarities_and_preferences(S, np.array([0, 1])) + + # Equal distances + X = np.array([[0, 0], [1, 1]], dtype=global_dtype) + S = -euclidean_distances(X, squared=True) + + # Different preferences + assert not _equal_similarities_and_preferences(S, np.array([0, 1])) + + # Same preferences + assert _equal_similarities_and_preferences(S, np.array([0, 0])) + assert _equal_similarities_and_preferences(S, np.array(0)) + + +def test_affinity_propagation_random_state(): + """Check that different random states lead to different initialisations + by looking at the center locations after two iterations. + """ + centers = [[1, 1], [-1, -1], [1, -1]] + X, labels_true = make_blobs( + n_samples=300, centers=centers, cluster_std=0.5, random_state=0 + ) + # random_state = 0 + ap = AffinityPropagation(convergence_iter=1, max_iter=2, random_state=0) + ap.fit(X) + centers0 = ap.cluster_centers_ + + # random_state = 76 + ap = AffinityPropagation(convergence_iter=1, max_iter=2, random_state=76) + ap.fit(X) + centers76 = ap.cluster_centers_ + # check that the centers have not yet converged to the same solution + assert np.mean((centers0 - centers76) ** 2) > 1 + + +@pytest.mark.parametrize("container", CSR_CONTAINERS + [np.array]) +def test_affinity_propagation_convergence_warning_dense_sparse(container, global_dtype): + """ + Check that having sparse or dense `centers` format should not + influence the convergence. + Non-regression test for gh-13334. 
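The degenerate outcome asserted by the non-convergence tests above can be reproduced in a few lines (a sketch using the same three-point fixture):

import warnings
import numpy as np
from sklearn.cluster import AffinityPropagation
from sklearn.exceptions import ConvergenceWarning

X = np.array([[0, 0], [1, 1], [-2, -2]], dtype=float)
with warnings.catch_warnings():
    warnings.simplefilter("ignore", ConvergenceWarning)
    af = AffinityPropagation(preference=-10, max_iter=1, random_state=0).fit(X)

print(af.cluster_centers_.shape)  # (0, 2): no exemplars were found
print(af.labels_)                 # [-1 -1 -1]: every sample is treated as noise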
+ """ + centers = container(np.zeros((1, 10))) + rng = np.random.RandomState(42) + X = rng.rand(40, 10).astype(global_dtype, copy=False) + y = (4 * rng.rand(40)).astype(int) + ap = AffinityPropagation(random_state=46) + ap.fit(X, y) + ap.cluster_centers_ = centers + with warnings.catch_warnings(): + warnings.simplefilter("error", ConvergenceWarning) + assert_array_equal(ap.predict(X), np.zeros(X.shape[0], dtype=int)) + + +# FIXME; this test is broken with different random states, needs to be revisited +def test_correct_clusters(global_dtype): + # Test to fix incorrect clusters due to dtype change + # (non-regression test for issue #10832) + X = np.array( + [[1, 0, 0, 0], [0, 1, 1, 0], [0, 1, 1, 0], [0, 0, 0, 1]], dtype=global_dtype + ) + afp = AffinityPropagation(preference=1, affinity="precomputed", random_state=0).fit( + X + ) + expected = np.array([0, 1, 1, 2]) + assert_array_equal(afp.labels_, expected) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sparse_input_for_predict(csr_container): + # Test to make sure sparse inputs are accepted for predict + # (non-regression test for issue #20049) + af = AffinityPropagation(affinity="euclidean", random_state=42) + af.fit(X) + labels = af.predict(csr_container((2, 2))) + assert_array_equal(labels, (2, 2)) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sparse_input_for_fit_predict(csr_container): + # Test to make sure sparse inputs are accepted for fit_predict + # (non-regression test for issue #20049) + af = AffinityPropagation(affinity="euclidean", random_state=42) + rng = np.random.RandomState(42) + X = csr_container(rng.randint(0, 2, size=(5, 5))) + labels = af.fit_predict(X) + assert_array_equal(labels, (0, 1, 1, 2, 3)) + + +def test_affinity_propagation_equal_points(): + """Make sure we do not assign multiple clusters to equal points. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/pull/20043 + """ + X = np.zeros((8, 1)) + af = AffinityPropagation(affinity="euclidean", damping=0.5, random_state=42).fit(X) + assert np.all(af.labels_ == 0) diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_bicluster.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_bicluster.py new file mode 100644 index 0000000000000000000000000000000000000000..ebc845a7bf262c60cf9f039e5ce021d841bdf4d4 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_bicluster.py @@ -0,0 +1,264 @@ +"""Testing for Spectral Biclustering methods""" + +import numpy as np +import pytest +from scipy.sparse import issparse + +from sklearn.base import BaseEstimator, BiclusterMixin +from sklearn.cluster import SpectralBiclustering, SpectralCoclustering +from sklearn.cluster._bicluster import ( + _bistochastic_normalize, + _log_normalize, + _scale_normalize, +) +from sklearn.datasets import make_biclusters, make_checkerboard +from sklearn.metrics import consensus_score, v_measure_score +from sklearn.model_selection import ParameterGrid +from sklearn.utils._testing import ( + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) +from sklearn.utils.fixes import CSR_CONTAINERS + + +class MockBiclustering(BiclusterMixin, BaseEstimator): + # Mock object for testing get_submatrix. + def __init__(self): + pass + + def get_indices(self, i): + # Overridden to reproduce old get_submatrix test. 
+ return ( + np.where([True, True, False, False, True])[0], + np.where([False, False, True, True])[0], + ) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_get_submatrix(csr_container): + data = np.arange(20).reshape(5, 4) + model = MockBiclustering() + + for X in (data, csr_container(data), data.tolist()): + submatrix = model.get_submatrix(0, X) + if issparse(submatrix): + submatrix = submatrix.toarray() + assert_array_equal(submatrix, [[2, 3], [6, 7], [18, 19]]) + submatrix[:] = -1 + if issparse(X): + X = X.toarray() + assert np.all(X != -1) + + +def _test_shape_indices(model): + # Test get_shape and get_indices on fitted model. + for i in range(model.n_clusters): + m, n = model.get_shape(i) + i_ind, j_ind = model.get_indices(i) + assert len(i_ind) == m + assert len(j_ind) == n + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_spectral_coclustering(global_random_seed, csr_container): + # Test Dhillon's Spectral CoClustering on a simple problem. + param_grid = { + "svd_method": ["randomized", "arpack"], + "n_svd_vecs": [None, 20], + "mini_batch": [False, True], + "init": ["k-means++"], + "n_init": [10], + } + S, rows, cols = make_biclusters( + (30, 30), 3, noise=0.1, random_state=global_random_seed + ) + S -= S.min() # needs to be nonnegative before making it sparse + S = np.where(S < 1, 0, S) # threshold some values + for mat in (S, csr_container(S)): + for kwargs in ParameterGrid(param_grid): + model = SpectralCoclustering( + n_clusters=3, random_state=global_random_seed, **kwargs + ) + model.fit(mat) + + assert model.rows_.shape == (3, 30) + assert_array_equal(model.rows_.sum(axis=0), np.ones(30)) + assert_array_equal(model.columns_.sum(axis=0), np.ones(30)) + assert consensus_score(model.biclusters_, (rows, cols)) == 1 + + _test_shape_indices(model) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_spectral_biclustering(global_random_seed, csr_container): + # Test Kluger methods on a checkerboard dataset. 
+ S, rows, cols = make_checkerboard( + (30, 30), 3, noise=0.5, random_state=global_random_seed + ) + + non_default_params = { + "method": ["scale", "log"], + "svd_method": ["arpack"], + "n_svd_vecs": [20], + "mini_batch": [True], + } + + for mat in (S, csr_container(S)): + for param_name, param_values in non_default_params.items(): + for param_value in param_values: + model = SpectralBiclustering( + n_clusters=3, + n_init=3, + init="k-means++", + random_state=global_random_seed, + ) + model.set_params(**dict([(param_name, param_value)])) + + if issparse(mat) and model.get_params().get("method") == "log": + # cannot take log of sparse matrix + with pytest.raises(ValueError): + model.fit(mat) + continue + else: + model.fit(mat) + + assert model.rows_.shape == (9, 30) + assert model.columns_.shape == (9, 30) + assert_array_equal(model.rows_.sum(axis=0), np.repeat(3, 30)) + assert_array_equal(model.columns_.sum(axis=0), np.repeat(3, 30)) + assert consensus_score(model.biclusters_, (rows, cols)) == 1 + + _test_shape_indices(model) + + +def _do_scale_test(scaled): + """Check that rows sum to one constant, and columns to another.""" + row_sum = scaled.sum(axis=1) + col_sum = scaled.sum(axis=0) + if issparse(scaled): + row_sum = np.asarray(row_sum).squeeze() + col_sum = np.asarray(col_sum).squeeze() + assert_array_almost_equal(row_sum, np.tile(row_sum.mean(), 100), decimal=1) + assert_array_almost_equal(col_sum, np.tile(col_sum.mean(), 100), decimal=1) + + +def _do_bistochastic_test(scaled): + """Check that rows and columns sum to the same constant.""" + _do_scale_test(scaled) + assert_almost_equal(scaled.sum(axis=0).mean(), scaled.sum(axis=1).mean(), decimal=1) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_scale_normalize(global_random_seed, csr_container): + generator = np.random.RandomState(global_random_seed) + X = generator.rand(100, 100) + for mat in (X, csr_container(X)): + scaled, _, _ = _scale_normalize(mat) + _do_scale_test(scaled) + if issparse(mat): + assert issparse(scaled) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_bistochastic_normalize(global_random_seed, csr_container): + generator = np.random.RandomState(global_random_seed) + X = generator.rand(100, 100) + for mat in (X, csr_container(X)): + scaled = _bistochastic_normalize(mat) + _do_bistochastic_test(scaled) + if issparse(mat): + assert issparse(scaled) + + +def test_log_normalize(global_random_seed): + # adding any constant to a log-scaled matrix should make it + # bistochastic + generator = np.random.RandomState(global_random_seed) + mat = generator.rand(100, 100) + scaled = _log_normalize(mat) + 1 + _do_bistochastic_test(scaled) + + +def test_fit_best_piecewise(global_random_seed): + model = SpectralBiclustering(random_state=global_random_seed) + vectors = np.array([[0, 0, 0, 1, 1, 1], [2, 2, 2, 3, 3, 3], [0, 1, 2, 3, 4, 5]]) + best = model._fit_best_piecewise(vectors, n_best=2, n_clusters=2) + assert_array_equal(best, vectors[:2]) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_project_and_cluster(global_random_seed, csr_container): + model = SpectralBiclustering(random_state=global_random_seed) + data = np.array([[1, 1, 1], [1, 1, 1], [3, 6, 3], [3, 6, 3]]) + vectors = np.array([[1, 0], [0, 1], [0, 0]]) + for mat in (data, csr_container(data)): + labels = model._project_and_cluster(mat, vectors, n_clusters=2) + assert_almost_equal(v_measure_score(labels, [0, 0, 1, 1]), 1.0) + + +def test_perfect_checkerboard(global_random_seed): + # XXX 
Previously failed on build bot (not reproducible) + model = SpectralBiclustering( + 3, svd_method="arpack", random_state=global_random_seed + ) + + S, rows, cols = make_checkerboard( + (30, 30), 3, noise=0, random_state=global_random_seed + ) + model.fit(S) + assert consensus_score(model.biclusters_, (rows, cols)) == 1 + + S, rows, cols = make_checkerboard( + (40, 30), 3, noise=0, random_state=global_random_seed + ) + model.fit(S) + assert consensus_score(model.biclusters_, (rows, cols)) == 1 + + S, rows, cols = make_checkerboard( + (30, 40), 3, noise=0, random_state=global_random_seed + ) + model.fit(S) + assert consensus_score(model.biclusters_, (rows, cols)) == 1 + + +@pytest.mark.parametrize( + "params, type_err, err_msg", + [ + ( + {"n_clusters": 6}, + ValueError, + "n_clusters should be <= n_samples=5", + ), + ( + {"n_clusters": (3, 3, 3)}, + ValueError, + "Incorrect parameter n_clusters", + ), + ( + {"n_clusters": (3, 6)}, + ValueError, + "Incorrect parameter n_clusters", + ), + ( + {"n_components": 3, "n_best": 4}, + ValueError, + "n_best=4 must be <= n_components=3", + ), + ], +) +def test_spectralbiclustering_parameter_validation(params, type_err, err_msg): + """Check parameters validation in `SpectralBiClustering`""" + data = np.arange(25).reshape((5, 5)) + model = SpectralBiclustering(**params) + with pytest.raises(type_err, match=err_msg): + model.fit(data) + + +@pytest.mark.parametrize("est", (SpectralBiclustering(), SpectralCoclustering())) +def test_n_features_in_(est): + X, _, _ = make_biclusters((3, 3), 3, random_state=0) + + assert not hasattr(est, "n_features_in_") + est.fit(X) + assert est.n_features_in_ == 3 diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_birch.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_birch.py new file mode 100644 index 0000000000000000000000000000000000000000..bc87934adaecdb507126097e2de945c677587bee --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_birch.py @@ -0,0 +1,250 @@ +""" +Tests for the birch clustering algorithm. 
+""" + +import numpy as np +import pytest + +from sklearn.cluster import AgglomerativeClustering, Birch +from sklearn.cluster.tests.common import generate_clustered_data +from sklearn.datasets import make_blobs +from sklearn.exceptions import ConvergenceWarning +from sklearn.metrics import pairwise_distances_argmin, v_measure_score +from sklearn.utils._testing import assert_allclose, assert_array_equal +from sklearn.utils.fixes import CSR_CONTAINERS + + +def test_n_samples_leaves_roots(global_random_seed, global_dtype): + # Sanity check for the number of samples in leaves and roots + X, y = make_blobs(n_samples=10, random_state=global_random_seed) + X = X.astype(global_dtype, copy=False) + brc = Birch() + brc.fit(X) + n_samples_root = sum([sc.n_samples_ for sc in brc.root_.subclusters_]) + n_samples_leaves = sum( + [sc.n_samples_ for leaf in brc._get_leaves() for sc in leaf.subclusters_] + ) + assert n_samples_leaves == X.shape[0] + assert n_samples_root == X.shape[0] + + +def test_partial_fit(global_random_seed, global_dtype): + # Test that fit is equivalent to calling partial_fit multiple times + X, y = make_blobs(n_samples=100, random_state=global_random_seed) + X = X.astype(global_dtype, copy=False) + brc = Birch(n_clusters=3) + brc.fit(X) + brc_partial = Birch(n_clusters=None) + brc_partial.partial_fit(X[:50]) + brc_partial.partial_fit(X[50:]) + assert_allclose(brc_partial.subcluster_centers_, brc.subcluster_centers_) + + # Test that same global labels are obtained after calling partial_fit + # with None + brc_partial.set_params(n_clusters=3) + brc_partial.partial_fit(None) + assert_array_equal(brc_partial.subcluster_labels_, brc.subcluster_labels_) + + +def test_birch_predict(global_random_seed, global_dtype): + # Test the predict method predicts the nearest centroid. + rng = np.random.RandomState(global_random_seed) + X = generate_clustered_data(n_clusters=3, n_features=3, n_samples_per_cluster=10) + X = X.astype(global_dtype, copy=False) + + # n_samples * n_samples_per_cluster + shuffle_indices = np.arange(30) + rng.shuffle(shuffle_indices) + X_shuffle = X[shuffle_indices, :] + brc = Birch(n_clusters=4, threshold=1.0) + brc.fit(X_shuffle) + + # Birch must preserve inputs' dtype + assert brc.subcluster_centers_.dtype == global_dtype + + assert_array_equal(brc.labels_, brc.predict(X_shuffle)) + centroids = brc.subcluster_centers_ + nearest_centroid = brc.subcluster_labels_[ + pairwise_distances_argmin(X_shuffle, centroids) + ] + assert_allclose(v_measure_score(nearest_centroid, brc.labels_), 1.0) + + +def test_n_clusters(global_random_seed, global_dtype): + # Test that n_clusters param works properly + X, y = make_blobs(n_samples=100, centers=10, random_state=global_random_seed) + X = X.astype(global_dtype, copy=False) + brc1 = Birch(n_clusters=10) + brc1.fit(X) + assert len(brc1.subcluster_centers_) > 10 + assert len(np.unique(brc1.labels_)) == 10 + + # Test that n_clusters = Agglomerative Clustering gives + # the same results. + gc = AgglomerativeClustering(n_clusters=10) + brc2 = Birch(n_clusters=gc) + brc2.fit(X) + assert_array_equal(brc1.subcluster_labels_, brc2.subcluster_labels_) + assert_array_equal(brc1.labels_, brc2.labels_) + + # Test that a small number of clusters raises a warning. 
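As a compact illustration of the incremental workflow these tests exercise (a sketch on hypothetical blob data, not part of the test suite):

import numpy as np
from sklearn.cluster import Birch
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=100, centers=3, random_state=0)

brc = Birch(n_clusters=None)
brc.partial_fit(X[:50])      # grow the CF-tree incrementally
brc.partial_fit(X[50:])
brc.set_params(n_clusters=3)
brc.partial_fit(None)        # run only the final global clustering step
labels = brc.predict(X)      # nearest-subcluster assignment, as tested above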
+ brc4 = Birch(threshold=10000.0) + with pytest.warns(ConvergenceWarning): + brc4.fit(X) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sparse_X(global_random_seed, global_dtype, csr_container): + # Test that sparse and dense data give same results + X, y = make_blobs(n_samples=100, centers=10, random_state=global_random_seed) + X = X.astype(global_dtype, copy=False) + brc = Birch(n_clusters=10) + brc.fit(X) + + csr = csr_container(X) + brc_sparse = Birch(n_clusters=10) + brc_sparse.fit(csr) + + # Birch must preserve inputs' dtype + assert brc_sparse.subcluster_centers_.dtype == global_dtype + + assert_array_equal(brc.labels_, brc_sparse.labels_) + assert_allclose(brc.subcluster_centers_, brc_sparse.subcluster_centers_) + + +def test_partial_fit_second_call_error_checks(): + # second partial fit calls will error when n_features is not consistent + # with the first call + X, y = make_blobs(n_samples=100) + brc = Birch(n_clusters=3) + brc.partial_fit(X, y) + + msg = "X has 1 features, but Birch is expecting 2 features" + with pytest.raises(ValueError, match=msg): + brc.partial_fit(X[:, [0]], y) + + +def check_branching_factor(node, branching_factor): + subclusters = node.subclusters_ + assert branching_factor >= len(subclusters) + for cluster in subclusters: + if cluster.child_: + check_branching_factor(cluster.child_, branching_factor) + + +def test_branching_factor(global_random_seed, global_dtype): + # Test that nodes have at max branching_factor number of subclusters + X, y = make_blobs(random_state=global_random_seed) + X = X.astype(global_dtype, copy=False) + branching_factor = 9 + + # Purposefully set a low threshold to maximize the subclusters. + brc = Birch(n_clusters=None, branching_factor=branching_factor, threshold=0.01) + brc.fit(X) + check_branching_factor(brc.root_, branching_factor) + brc = Birch(n_clusters=3, branching_factor=branching_factor, threshold=0.01) + brc.fit(X) + check_branching_factor(brc.root_, branching_factor) + + +def check_threshold(birch_instance, threshold): + """Use the leaf linked list for traversal""" + current_leaf = birch_instance.dummy_leaf_.next_leaf_ + while current_leaf: + subclusters = current_leaf.subclusters_ + for sc in subclusters: + assert threshold >= sc.radius + current_leaf = current_leaf.next_leaf_ + + +def test_threshold(global_random_seed, global_dtype): + # Test that the leaf subclusters have a threshold lesser than radius + X, y = make_blobs(n_samples=80, centers=4, random_state=global_random_seed) + X = X.astype(global_dtype, copy=False) + brc = Birch(threshold=0.5, n_clusters=None) + brc.fit(X) + check_threshold(brc, 0.5) + + brc = Birch(threshold=5.0, n_clusters=None) + brc.fit(X) + check_threshold(brc, 5.0) + + +def test_birch_n_clusters_long_int(): + # Check that birch supports n_clusters with np.int64 dtype, for instance + # coming from np.arange. 
#16484 + X, _ = make_blobs(random_state=0) + n_clusters = np.int64(5) + Birch(n_clusters=n_clusters).fit(X) + + +def test_feature_names_out(): + """Check `get_feature_names_out` for `Birch`.""" + X, _ = make_blobs(n_samples=80, n_features=4, random_state=0) + brc = Birch(n_clusters=4) + brc.fit(X) + n_clusters = brc.subcluster_centers_.shape[0] + + names_out = brc.get_feature_names_out() + assert_array_equal([f"birch{i}" for i in range(n_clusters)], names_out) + + +def test_transform_match_across_dtypes(global_random_seed): + X, _ = make_blobs(n_samples=80, n_features=4, random_state=global_random_seed) + brc = Birch(n_clusters=4, threshold=1.1) + Y_64 = brc.fit_transform(X) + Y_32 = brc.fit_transform(X.astype(np.float32)) + + assert_allclose(Y_64, Y_32, atol=1e-6) + + +def test_subcluster_dtype(global_dtype): + X = make_blobs(n_samples=80, n_features=4, random_state=0)[0].astype( + global_dtype, copy=False + ) + brc = Birch(n_clusters=4) + assert brc.fit(X).subcluster_centers_.dtype == global_dtype + + +def test_both_subclusters_updated(): + """Check that both subclusters are updated when a node a split, even when there are + duplicated data points. Non-regression test for #23269. + """ + + X = np.array( + [ + [-2.6192791, -1.5053215], + [-2.9993038, -1.6863596], + [-2.3724914, -1.3438171], + [-2.336792, -1.3417323], + [-2.4089134, -1.3290224], + [-2.3724914, -1.3438171], + [-3.364009, -1.8846745], + [-2.3724914, -1.3438171], + [-2.617677, -1.5003285], + [-2.2960556, -1.3260119], + [-2.3724914, -1.3438171], + [-2.5459878, -1.4533926], + [-2.25979, -1.3003055], + [-2.4089134, -1.3290224], + [-2.3724914, -1.3438171], + [-2.4089134, -1.3290224], + [-2.5459878, -1.4533926], + [-2.3724914, -1.3438171], + [-2.9720619, -1.7058647], + [-2.336792, -1.3417323], + [-2.3724914, -1.3438171], + ], + dtype=np.float32, + ) + + # no error + Birch(branching_factor=5, threshold=1e-5, n_clusters=None).fit(X) + + +# TODO(1.8): Remove +def test_birch_copy_deprecated(): + X, _ = make_blobs(n_samples=80, n_features=4, random_state=0) + brc = Birch(n_clusters=4, copy=True) + with pytest.warns(FutureWarning, match="`copy` was deprecated"): + brc.fit(X) diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_bisect_k_means.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_bisect_k_means.py new file mode 100644 index 0000000000000000000000000000000000000000..799ddbc086ce0a14397fe5cb4aef607903c01228 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_bisect_k_means.py @@ -0,0 +1,158 @@ +import numpy as np +import pytest + +from sklearn.cluster import BisectingKMeans +from sklearn.metrics import v_measure_score +from sklearn.utils._testing import assert_allclose, assert_array_equal +from sklearn.utils.fixes import CSR_CONTAINERS + + +@pytest.mark.parametrize("bisecting_strategy", ["biggest_inertia", "largest_cluster"]) +@pytest.mark.parametrize("init", ["k-means++", "random"]) +def test_three_clusters(bisecting_strategy, init): + """Tries to perform bisect k-means for three clusters to check + if splitting data is performed correctly. 
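The scenario this docstring describes reduces to the following sketch on the same toy array used below; the three groups around [2, 1], [10, 1] and [10, 9] should be recovered.

import numpy as np
from sklearn.cluster import BisectingKMeans

X = np.array(
    [[1, 1], [10, 1], [3, 1], [10, 0], [2, 1], [10, 2], [10, 8], [10, 9], [10, 10]]
)
bkm = BisectingKMeans(n_clusters=3, random_state=0).fit(X)
print(bkm.cluster_centers_)  # approximately [2, 1], [10, 1], [10, 9]
print(bkm.labels_)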
+ """ + X = np.array( + [[1, 1], [10, 1], [3, 1], [10, 0], [2, 1], [10, 2], [10, 8], [10, 9], [10, 10]] + ) + bisect_means = BisectingKMeans( + n_clusters=3, + random_state=0, + bisecting_strategy=bisecting_strategy, + init=init, + ) + bisect_means.fit(X) + + expected_centers = [[2, 1], [10, 1], [10, 9]] + expected_labels = [0, 1, 0, 1, 0, 1, 2, 2, 2] + + assert_allclose( + sorted(expected_centers), sorted(bisect_means.cluster_centers_.tolist()) + ) + assert_allclose(v_measure_score(expected_labels, bisect_means.labels_), 1.0) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sparse(csr_container): + """Test Bisecting K-Means with sparse data. + + Checks if labels and centers are the same between dense and sparse. + """ + + rng = np.random.RandomState(0) + + X = rng.rand(20, 2) + X[X < 0.8] = 0 + X_csr = csr_container(X) + + bisect_means = BisectingKMeans(n_clusters=3, random_state=0) + + bisect_means.fit(X_csr) + sparse_centers = bisect_means.cluster_centers_ + + bisect_means.fit(X) + normal_centers = bisect_means.cluster_centers_ + + # Check if results is the same for dense and sparse data + assert_allclose(normal_centers, sparse_centers, atol=1e-8) + + +@pytest.mark.parametrize("n_clusters", [4, 5]) +def test_n_clusters(n_clusters): + """Test if resulting labels are in range [0, n_clusters - 1].""" + + rng = np.random.RandomState(0) + X = rng.rand(10, 2) + + bisect_means = BisectingKMeans(n_clusters=n_clusters, random_state=0) + bisect_means.fit(X) + + assert_array_equal(np.unique(bisect_means.labels_), np.arange(n_clusters)) + + +def test_one_cluster(): + """Test single cluster.""" + + X = np.array([[1, 2], [10, 2], [10, 8]]) + + bisect_means = BisectingKMeans(n_clusters=1, random_state=0).fit(X) + + # All labels from fit or predict should be equal 0 + assert all(bisect_means.labels_ == 0) + assert all(bisect_means.predict(X) == 0) + + assert_allclose(bisect_means.cluster_centers_, X.mean(axis=0).reshape(1, -1)) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS + [None]) +def test_fit_predict(csr_container): + """Check if labels from fit(X) method are same as from fit(X).predict(X).""" + rng = np.random.RandomState(0) + + X = rng.rand(10, 2) + + if csr_container is not None: + X[X < 0.8] = 0 + X = csr_container(X) + + bisect_means = BisectingKMeans(n_clusters=3, random_state=0) + bisect_means.fit(X) + + assert_array_equal(bisect_means.labels_, bisect_means.predict(X)) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS + [None]) +def test_dtype_preserved(csr_container, global_dtype): + """Check that centers dtype is the same as input data dtype.""" + rng = np.random.RandomState(0) + X = rng.rand(10, 2).astype(global_dtype, copy=False) + + if csr_container is not None: + X[X < 0.8] = 0 + X = csr_container(X) + + km = BisectingKMeans(n_clusters=3, random_state=0) + km.fit(X) + + assert km.cluster_centers_.dtype == global_dtype + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS + [None]) +def test_float32_float64_equivalence(csr_container): + """Check that the results are the same between float32 and float64.""" + rng = np.random.RandomState(0) + X = rng.rand(10, 2) + + if csr_container is not None: + X[X < 0.8] = 0 + X = csr_container(X) + + km64 = BisectingKMeans(n_clusters=3, random_state=0).fit(X) + km32 = BisectingKMeans(n_clusters=3, random_state=0).fit(X.astype(np.float32)) + + assert_allclose(km32.cluster_centers_, km64.cluster_centers_) + assert_array_equal(km32.labels_, km64.labels_) + + 
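The dense/sparse equivalence checked above condenses into this sketch (same random data as the test):

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.cluster import BisectingKMeans

rng = np.random.RandomState(0)
X = rng.rand(20, 2)
X[X < 0.8] = 0  # mostly zeros, so the CSR copy is genuinely sparse

dense = BisectingKMeans(n_clusters=3, random_state=0).fit(X).cluster_centers_
sparse = BisectingKMeans(n_clusters=3, random_state=0).fit(csr_matrix(X)).cluster_centers_
print(np.allclose(dense, sparse, atol=1e-8))  # expected: True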
+@pytest.mark.parametrize("algorithm", ("lloyd", "elkan")) +def test_no_crash_on_empty_bisections(algorithm): + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/27081 + rng = np.random.RandomState(0) + X_train = rng.rand(3000, 10) + bkm = BisectingKMeans(n_clusters=10, algorithm=algorithm).fit(X_train) + + # predict on scaled data to trigger pathologic case + # where the inner mask leads to empty bisections. + X_test = 50 * rng.rand(100, 10) + labels = bkm.predict(X_test) # should not crash with idiv by 0 + assert np.isin(np.unique(labels), np.arange(10)).all() + + +def test_one_feature(): + # Check that no error is raised when there is only one feature + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/27236 + X = np.random.normal(size=(128, 1)) + BisectingKMeans(bisecting_strategy="biggest_inertia", random_state=0).fit(X) diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_dbscan.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_dbscan.py new file mode 100644 index 0000000000000000000000000000000000000000..556f89312d2fc87ab962ab84551f4941ec8b359b --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_dbscan.py @@ -0,0 +1,434 @@ +""" +Tests for DBSCAN clustering algorithm +""" + +import pickle +import warnings + +import numpy as np +import pytest +from scipy.spatial import distance + +from sklearn.cluster import DBSCAN, dbscan +from sklearn.cluster.tests.common import generate_clustered_data +from sklearn.metrics.pairwise import pairwise_distances +from sklearn.neighbors import NearestNeighbors +from sklearn.utils._testing import assert_array_equal +from sklearn.utils.fixes import CSR_CONTAINERS, LIL_CONTAINERS + +n_clusters = 3 +X = generate_clustered_data(n_clusters=n_clusters) + + +def test_dbscan_similarity(): + # Tests the DBSCAN algorithm with a similarity array. + # Parameters chosen specifically for this task. + eps = 0.15 + min_samples = 10 + # Compute similarities + D = distance.squareform(distance.pdist(X)) + D /= np.max(D) + # Compute DBSCAN + core_samples, labels = dbscan( + D, metric="precomputed", eps=eps, min_samples=min_samples + ) + # number of clusters, ignoring noise if present + n_clusters_1 = len(set(labels)) - (1 if -1 in labels else 0) + + assert n_clusters_1 == n_clusters + + db = DBSCAN(metric="precomputed", eps=eps, min_samples=min_samples) + labels = db.fit(D).labels_ + + n_clusters_2 = len(set(labels)) - int(-1 in labels) + assert n_clusters_2 == n_clusters + + +def test_dbscan_feature(): + # Tests the DBSCAN algorithm with a feature vector array. + # Parameters chosen specifically for this task. + # Different eps to other test, because distance is not normalised. 
+ eps = 0.8 + min_samples = 10 + metric = "euclidean" + # Compute DBSCAN + # parameters chosen for task + core_samples, labels = dbscan(X, metric=metric, eps=eps, min_samples=min_samples) + + # number of clusters, ignoring noise if present + n_clusters_1 = len(set(labels)) - int(-1 in labels) + assert n_clusters_1 == n_clusters + + db = DBSCAN(metric=metric, eps=eps, min_samples=min_samples) + labels = db.fit(X).labels_ + + n_clusters_2 = len(set(labels)) - int(-1 in labels) + assert n_clusters_2 == n_clusters + + +@pytest.mark.parametrize("lil_container", LIL_CONTAINERS) +def test_dbscan_sparse(lil_container): + core_sparse, labels_sparse = dbscan(lil_container(X), eps=0.8, min_samples=10) + core_dense, labels_dense = dbscan(X, eps=0.8, min_samples=10) + assert_array_equal(core_dense, core_sparse) + assert_array_equal(labels_dense, labels_sparse) + + +@pytest.mark.parametrize("include_self", [False, True]) +def test_dbscan_sparse_precomputed(include_self): + D = pairwise_distances(X) + nn = NearestNeighbors(radius=0.9).fit(X) + X_ = X if include_self else None + D_sparse = nn.radius_neighbors_graph(X=X_, mode="distance") + # Ensure it is sparse not merely on diagonals: + assert D_sparse.nnz < D.shape[0] * (D.shape[0] - 1) + core_sparse, labels_sparse = dbscan( + D_sparse, eps=0.8, min_samples=10, metric="precomputed" + ) + core_dense, labels_dense = dbscan(D, eps=0.8, min_samples=10, metric="precomputed") + assert_array_equal(core_dense, core_sparse) + assert_array_equal(labels_dense, labels_sparse) + + +def test_dbscan_sparse_precomputed_different_eps(): + # test that precomputed neighbors graph is filtered if computed with + # a radius larger than DBSCAN's eps. + lower_eps = 0.2 + nn = NearestNeighbors(radius=lower_eps).fit(X) + D_sparse = nn.radius_neighbors_graph(X, mode="distance") + dbscan_lower = dbscan(D_sparse, eps=lower_eps, metric="precomputed") + + higher_eps = lower_eps + 0.7 + nn = NearestNeighbors(radius=higher_eps).fit(X) + D_sparse = nn.radius_neighbors_graph(X, mode="distance") + dbscan_higher = dbscan(D_sparse, eps=lower_eps, metric="precomputed") + + assert_array_equal(dbscan_lower[0], dbscan_higher[0]) + assert_array_equal(dbscan_lower[1], dbscan_higher[1]) + + +@pytest.mark.parametrize("metric", ["precomputed", "minkowski"]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS + [None]) +def test_dbscan_input_not_modified(metric, csr_container): + # test that the input is not modified by dbscan + X = np.random.RandomState(0).rand(10, 10) + X = csr_container(X) if csr_container is not None else X + X_copy = X.copy() + dbscan(X, metric=metric) + + if csr_container is not None: + assert_array_equal(X.toarray(), X_copy.toarray()) + else: + assert_array_equal(X, X_copy) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_dbscan_input_not_modified_precomputed_sparse_nodiag(csr_container): + """Check that we don't modify in-place the pre-computed sparse matrix. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/27508 + """ + X = np.random.RandomState(0).rand(10, 10) + # Add zeros on the diagonal that will be implicit when creating + # the sparse matrix. If `X` is modified in-place, the zeros from + # the diagonal will be made explicit. + np.fill_diagonal(X, 0) + X = csr_container(X) + assert all(row != col for row, col in zip(*X.nonzero())) + X_copy = X.copy() + dbscan(X, metric="precomputed") + # Make sure that we did not modify `X` in-place even by creating + # explicit 0s values. 
+ assert X.nnz == X_copy.nnz + assert_array_equal(X.toarray(), X_copy.toarray()) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_dbscan_no_core_samples(csr_container): + rng = np.random.RandomState(0) + X = rng.rand(40, 10) + X[X < 0.8] = 0 + + for X_ in [X, csr_container(X)]: + db = DBSCAN(min_samples=6).fit(X_) + assert_array_equal(db.components_, np.empty((0, X_.shape[1]))) + assert_array_equal(db.labels_, -1) + assert db.core_sample_indices_.shape == (0,) + + +def test_dbscan_callable(): + # Tests the DBSCAN algorithm with a callable metric. + # Parameters chosen specifically for this task. + # Different eps to other test, because distance is not normalised. + eps = 0.8 + min_samples = 10 + # metric is the function reference, not the string key. + metric = distance.euclidean + # Compute DBSCAN + # parameters chosen for task + core_samples, labels = dbscan( + X, metric=metric, eps=eps, min_samples=min_samples, algorithm="ball_tree" + ) + + # number of clusters, ignoring noise if present + n_clusters_1 = len(set(labels)) - int(-1 in labels) + assert n_clusters_1 == n_clusters + + db = DBSCAN(metric=metric, eps=eps, min_samples=min_samples, algorithm="ball_tree") + labels = db.fit(X).labels_ + + n_clusters_2 = len(set(labels)) - int(-1 in labels) + assert n_clusters_2 == n_clusters + + +def test_dbscan_metric_params(): + # Tests that DBSCAN works with the metrics_params argument. + eps = 0.8 + min_samples = 10 + p = 1 + + # Compute DBSCAN with metric_params arg + + with warnings.catch_warnings(record=True) as warns: + db = DBSCAN( + metric="minkowski", + metric_params={"p": p}, + eps=eps, + p=None, + min_samples=min_samples, + algorithm="ball_tree", + ).fit(X) + assert not warns, warns[0].message + core_sample_1, labels_1 = db.core_sample_indices_, db.labels_ + + # Test that sample labels are the same as passing Minkowski 'p' directly + db = DBSCAN( + metric="minkowski", eps=eps, min_samples=min_samples, algorithm="ball_tree", p=p + ).fit(X) + core_sample_2, labels_2 = db.core_sample_indices_, db.labels_ + + assert_array_equal(core_sample_1, core_sample_2) + assert_array_equal(labels_1, labels_2) + + # Minkowski with p=1 should be equivalent to Manhattan distance + db = DBSCAN( + metric="manhattan", eps=eps, min_samples=min_samples, algorithm="ball_tree" + ).fit(X) + core_sample_3, labels_3 = db.core_sample_indices_, db.labels_ + + assert_array_equal(core_sample_1, core_sample_3) + assert_array_equal(labels_1, labels_3) + + with pytest.warns( + SyntaxWarning, + match=( + "Parameter p is found in metric_params. " + "The corresponding parameter from __init__ " + "is ignored." + ), + ): + # Test that checks p is ignored in favor of metric_params={'p': } + db = DBSCAN( + metric="minkowski", + metric_params={"p": p}, + eps=eps, + p=p + 1, + min_samples=min_samples, + algorithm="ball_tree", + ).fit(X) + core_sample_4, labels_4 = db.core_sample_indices_, db.labels_ + + assert_array_equal(core_sample_1, core_sample_4) + assert_array_equal(labels_1, labels_4) + + +def test_dbscan_balltree(): + # Tests the DBSCAN algorithm with balltree for neighbor calculation. 
+ eps = 0.8 + min_samples = 10 + + D = pairwise_distances(X) + core_samples, labels = dbscan( + D, metric="precomputed", eps=eps, min_samples=min_samples + ) + + # number of clusters, ignoring noise if present + n_clusters_1 = len(set(labels)) - int(-1 in labels) + assert n_clusters_1 == n_clusters + + db = DBSCAN(p=2.0, eps=eps, min_samples=min_samples, algorithm="ball_tree") + labels = db.fit(X).labels_ + + n_clusters_2 = len(set(labels)) - int(-1 in labels) + assert n_clusters_2 == n_clusters + + db = DBSCAN(p=2.0, eps=eps, min_samples=min_samples, algorithm="kd_tree") + labels = db.fit(X).labels_ + + n_clusters_3 = len(set(labels)) - int(-1 in labels) + assert n_clusters_3 == n_clusters + + db = DBSCAN(p=1.0, eps=eps, min_samples=min_samples, algorithm="ball_tree") + labels = db.fit(X).labels_ + + n_clusters_4 = len(set(labels)) - int(-1 in labels) + assert n_clusters_4 == n_clusters + + db = DBSCAN(leaf_size=20, eps=eps, min_samples=min_samples, algorithm="ball_tree") + labels = db.fit(X).labels_ + + n_clusters_5 = len(set(labels)) - int(-1 in labels) + assert n_clusters_5 == n_clusters + + +def test_input_validation(): + # DBSCAN.fit should accept a list of lists. + X = [[1.0, 2.0], [3.0, 4.0]] + DBSCAN().fit(X) # must not raise exception + + +def test_pickle(): + obj = DBSCAN() + s = pickle.dumps(obj) + assert type(pickle.loads(s)) is obj.__class__ + + +def test_boundaries(): + # ensure min_samples is inclusive of core point + core, _ = dbscan([[0], [1]], eps=2, min_samples=2) + assert 0 in core + # ensure eps is inclusive of circumference + core, _ = dbscan([[0], [1], [1]], eps=1, min_samples=2) + assert 0 in core + core, _ = dbscan([[0], [1], [1]], eps=0.99, min_samples=2) + assert 0 not in core + + +def test_weighted_dbscan(global_random_seed): + # ensure sample_weight is validated + with pytest.raises(ValueError): + dbscan([[0], [1]], sample_weight=[2]) + with pytest.raises(ValueError): + dbscan([[0], [1]], sample_weight=[2, 3, 4]) + + # ensure sample_weight has an effect + assert_array_equal([], dbscan([[0], [1]], sample_weight=None, min_samples=6)[0]) + assert_array_equal([], dbscan([[0], [1]], sample_weight=[5, 5], min_samples=6)[0]) + assert_array_equal([0], dbscan([[0], [1]], sample_weight=[6, 5], min_samples=6)[0]) + assert_array_equal( + [0, 1], dbscan([[0], [1]], sample_weight=[6, 6], min_samples=6)[0] + ) + + # points within eps of each other: + assert_array_equal( + [0, 1], dbscan([[0], [1]], eps=1.5, sample_weight=[5, 1], min_samples=6)[0] + ) + # and effect of non-positive and non-integer sample_weight: + assert_array_equal( + [], dbscan([[0], [1]], sample_weight=[5, 0], eps=1.5, min_samples=6)[0] + ) + assert_array_equal( + [0, 1], dbscan([[0], [1]], sample_weight=[5.9, 0.1], eps=1.5, min_samples=6)[0] + ) + assert_array_equal( + [0, 1], dbscan([[0], [1]], sample_weight=[6, 0], eps=1.5, min_samples=6)[0] + ) + assert_array_equal( + [], dbscan([[0], [1]], sample_weight=[6, -1], eps=1.5, min_samples=6)[0] + ) + + # for non-negative sample_weight, cores should be identical to repetition + rng = np.random.RandomState(global_random_seed) + sample_weight = rng.randint(0, 5, X.shape[0]) + core1, label1 = dbscan(X, sample_weight=sample_weight) + assert len(label1) == len(X) + + X_repeated = np.repeat(X, sample_weight, axis=0) + core_repeated, label_repeated = dbscan(X_repeated) + core_repeated_mask = np.zeros(X_repeated.shape[0], dtype=bool) + core_repeated_mask[core_repeated] = True + core_mask = np.zeros(X.shape[0], dtype=bool) + core_mask[core1] = True + 
assert_array_equal(np.repeat(core_mask, sample_weight), core_repeated_mask) + + # sample_weight should work with precomputed distance matrix + D = pairwise_distances(X) + core3, label3 = dbscan(D, sample_weight=sample_weight, metric="precomputed") + assert_array_equal(core1, core3) + assert_array_equal(label1, label3) + + # sample_weight should work with estimator + est = DBSCAN().fit(X, sample_weight=sample_weight) + core4 = est.core_sample_indices_ + label4 = est.labels_ + assert_array_equal(core1, core4) + assert_array_equal(label1, label4) + + est = DBSCAN() + label5 = est.fit_predict(X, sample_weight=sample_weight) + core5 = est.core_sample_indices_ + assert_array_equal(core1, core5) + assert_array_equal(label1, label5) + assert_array_equal(label1, est.labels_) + + +@pytest.mark.parametrize("algorithm", ["brute", "kd_tree", "ball_tree"]) +def test_dbscan_core_samples_toy(algorithm): + X = [[0], [2], [3], [4], [6], [8], [10]] + n_samples = len(X) + + # Degenerate case: every sample is a core sample, either with its own + # cluster or including other close core samples. + core_samples, labels = dbscan(X, algorithm=algorithm, eps=1, min_samples=1) + assert_array_equal(core_samples, np.arange(n_samples)) + assert_array_equal(labels, [0, 1, 1, 1, 2, 3, 4]) + + # With eps=1 and min_samples=2 only the 3 samples from the denser area + # are core samples. All other points are isolated and considered noise. + core_samples, labels = dbscan(X, algorithm=algorithm, eps=1, min_samples=2) + assert_array_equal(core_samples, [1, 2, 3]) + assert_array_equal(labels, [-1, 0, 0, 0, -1, -1, -1]) + + # Only the sample in the middle of the dense area is core. Its two + # neighbors are edge samples. Remaining samples are noise. + core_samples, labels = dbscan(X, algorithm=algorithm, eps=1, min_samples=3) + assert_array_equal(core_samples, [2]) + assert_array_equal(labels, [-1, 0, 0, 0, -1, -1, -1]) + + # It's no longer possible to extract core samples with eps=1: + # everything is noise. 
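The sample_weight semantics exercised above, and the core/noise behaviour of the toy example that continues just below, come down to a weighted count of neighbours within eps; a brief sketch:

from sklearn.cluster import dbscan

# Two points one unit apart; min_samples counts weighted mass within eps.
core, labels = dbscan([[0], [1]], eps=1.5, sample_weight=[5, 1], min_samples=6)
print(core)  # [0 1]: the combined weight 5 + 1 reaches min_samples
core, labels = dbscan([[0], [1]], eps=1.5, sample_weight=[5, 0], min_samples=6)
print(core)  # []: a zero-weight neighbour no longer contributes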
+ core_samples, labels = dbscan(X, algorithm=algorithm, eps=1, min_samples=4) + assert_array_equal(core_samples, []) + assert_array_equal(labels, np.full(n_samples, -1.0)) + + +def test_dbscan_precomputed_metric_with_degenerate_input_arrays(): + # see https://github.com/scikit-learn/scikit-learn/issues/4641 for + # more details + X = np.eye(10) + labels = DBSCAN(eps=0.5, metric="precomputed").fit(X).labels_ + assert len(set(labels)) == 1 + + X = np.zeros((10, 10)) + labels = DBSCAN(eps=0.5, metric="precomputed").fit(X).labels_ + assert len(set(labels)) == 1 + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_dbscan_precomputed_metric_with_initial_rows_zero(csr_container): + # sample matrix with initial two row all zero + ar = np.array( + [ + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0], + [0.0, 0.0, 0.1, 0.1, 0.0, 0.0, 0.3], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1], + [0.0, 0.0, 0.0, 0.0, 0.3, 0.1, 0.0], + ] + ) + matrix = csr_container(ar) + labels = DBSCAN(eps=0.2, metric="precomputed", min_samples=2).fit(matrix).labels_ + assert_array_equal(labels, [-1, -1, 0, 0, 0, 1, 1]) diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_feature_agglomeration.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_feature_agglomeration.py new file mode 100644 index 0000000000000000000000000000000000000000..80aa251c358153b0771bd201067fa87f8fb6bfdc --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_feature_agglomeration.py @@ -0,0 +1,55 @@ +""" +Tests for sklearn.cluster._feature_agglomeration +""" + +import numpy as np +from numpy.testing import assert_array_equal + +from sklearn.cluster import FeatureAgglomeration +from sklearn.datasets import make_blobs +from sklearn.utils._testing import assert_array_almost_equal + + +def test_feature_agglomeration(): + n_clusters = 1 + X = np.array([0, 0, 1]).reshape(1, 3) # (n_samples, n_features) + + agglo_mean = FeatureAgglomeration(n_clusters=n_clusters, pooling_func=np.mean) + agglo_median = FeatureAgglomeration(n_clusters=n_clusters, pooling_func=np.median) + agglo_mean.fit(X) + agglo_median.fit(X) + + assert np.size(np.unique(agglo_mean.labels_)) == n_clusters + assert np.size(np.unique(agglo_median.labels_)) == n_clusters + assert np.size(agglo_mean.labels_) == X.shape[1] + assert np.size(agglo_median.labels_) == X.shape[1] + + # Test transform + Xt_mean = agglo_mean.transform(X) + Xt_median = agglo_median.transform(X) + assert Xt_mean.shape[1] == n_clusters + assert Xt_median.shape[1] == n_clusters + assert Xt_mean == np.array([1 / 3.0]) + assert Xt_median == np.array([0.0]) + + # Test inverse transform + X_full_mean = agglo_mean.inverse_transform(Xt_mean) + X_full_median = agglo_median.inverse_transform(Xt_median) + assert np.unique(X_full_mean[0]).size == n_clusters + assert np.unique(X_full_median[0]).size == n_clusters + + assert_array_almost_equal(agglo_mean.transform(X_full_mean), Xt_mean) + assert_array_almost_equal(agglo_median.transform(X_full_median), Xt_median) + + +def test_feature_agglomeration_feature_names_out(): + """Check `get_feature_names_out` for `FeatureAgglomeration`.""" + X, _ = make_blobs(n_features=6, random_state=0) + agglo = FeatureAgglomeration(n_clusters=3) + agglo.fit(X) + n_clusters = agglo.n_clusters_ + + names_out = agglo.get_feature_names_out() + assert_array_equal( + [f"featureagglomeration{i}" for i in range(n_clusters)], names_out 
+ ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_hdbscan.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_hdbscan.py new file mode 100644 index 0000000000000000000000000000000000000000..3b45d9d3cb7aa290e7fac62f359ac518d105579e --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_hdbscan.py @@ -0,0 +1,582 @@ +""" +Tests for HDBSCAN clustering algorithm +Based on the DBSCAN test code +""" + +import numpy as np +import pytest +from scipy import stats +from scipy.spatial import distance + +from sklearn.cluster import HDBSCAN +from sklearn.cluster._hdbscan._tree import ( + CONDENSED_dtype, + _condense_tree, + _do_labelling, +) +from sklearn.cluster._hdbscan.hdbscan import _OUTLIER_ENCODING +from sklearn.datasets import make_blobs +from sklearn.metrics import fowlkes_mallows_score +from sklearn.metrics.pairwise import _VALID_METRICS, euclidean_distances +from sklearn.neighbors import BallTree, KDTree +from sklearn.preprocessing import StandardScaler +from sklearn.utils import shuffle +from sklearn.utils._testing import assert_allclose, assert_array_equal +from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS + +X, y = make_blobs(n_samples=200, random_state=10) +X, y = shuffle(X, y, random_state=7) +X = StandardScaler().fit_transform(X) + +ALGORITHMS = [ + "kd_tree", + "ball_tree", + "brute", + "auto", +] + +OUTLIER_SET = {-1} | {out["label"] for _, out in _OUTLIER_ENCODING.items()} + + +def check_label_quality(labels, threshold=0.99): + n_clusters = len(set(labels) - OUTLIER_SET) + assert n_clusters == 3 + assert fowlkes_mallows_score(labels, y) > threshold + + +@pytest.mark.parametrize("outlier_type", _OUTLIER_ENCODING) +def test_outlier_data(outlier_type): + """ + Tests if np.inf and np.nan data are each treated as special outliers. + """ + outlier = { + "infinite": np.inf, + "missing": np.nan, + }[outlier_type] + prob_check = { + "infinite": lambda x, y: x == y, + "missing": lambda x, y: np.isnan(x), + }[outlier_type] + label = _OUTLIER_ENCODING[outlier_type]["label"] + prob = _OUTLIER_ENCODING[outlier_type]["prob"] + + X_outlier = X.copy() + X_outlier[0] = [outlier, 1] + X_outlier[5] = [outlier, outlier] + model = HDBSCAN().fit(X_outlier) + + (missing_labels_idx,) = (model.labels_ == label).nonzero() + assert_array_equal(missing_labels_idx, [0, 5]) + + (missing_probs_idx,) = (prob_check(model.probabilities_, prob)).nonzero() + assert_array_equal(missing_probs_idx, [0, 5]) + + clean_indices = list(range(1, 5)) + list(range(6, 200)) + clean_model = HDBSCAN().fit(X_outlier[clean_indices]) + assert_array_equal(clean_model.labels_, model.labels_[clean_indices]) + + +def test_hdbscan_distance_matrix(): + """ + Tests that HDBSCAN works with precomputed distance matrices, and throws the + appropriate errors when needed. 
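A compact version of the precomputed-distance usage this test describes, mirroring the module-level fixture:

from sklearn.cluster import HDBSCAN
from sklearn.datasets import make_blobs
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import StandardScaler

X, _ = make_blobs(n_samples=200, random_state=10)
X = StandardScaler().fit_transform(X)

D = euclidean_distances(X)  # full pairwise distance matrix
labels = HDBSCAN(metric="precomputed").fit_predict(D)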
+ """ + D = euclidean_distances(X) + D_original = D.copy() + labels = HDBSCAN(metric="precomputed", copy=True).fit_predict(D) + + assert_allclose(D, D_original) + check_label_quality(labels) + + msg = r"The precomputed distance matrix.*has shape" + with pytest.raises(ValueError, match=msg): + HDBSCAN(metric="precomputed", copy=True).fit_predict(X) + + msg = r"The precomputed distance matrix.*values" + # Ensure the matrix is not symmetric + D[0, 1] = 10 + D[1, 0] = 1 + with pytest.raises(ValueError, match=msg): + HDBSCAN(metric="precomputed").fit_predict(D) + + +@pytest.mark.parametrize("sparse_constructor", [*CSR_CONTAINERS, *CSC_CONTAINERS]) +def test_hdbscan_sparse_distance_matrix(sparse_constructor): + """ + Tests that HDBSCAN works with sparse distance matrices. + """ + D = distance.squareform(distance.pdist(X)) + D /= np.max(D) + + threshold = stats.scoreatpercentile(D.flatten(), 50) + + D[D >= threshold] = 0.0 + D = sparse_constructor(D) + D.eliminate_zeros() + + labels = HDBSCAN(metric="precomputed").fit_predict(D) + check_label_quality(labels) + + +def test_hdbscan_feature_array(): + """ + Tests that HDBSCAN works with feature array, including an arbitrary + goodness of fit check. Note that the check is a simple heuristic. + """ + labels = HDBSCAN().fit_predict(X) + + # Check that clustering is arbitrarily good + # This is a heuristic to guard against regression + check_label_quality(labels) + + +@pytest.mark.parametrize("algo", ALGORITHMS) +@pytest.mark.parametrize("metric", _VALID_METRICS) +def test_hdbscan_algorithms(algo, metric): + """ + Tests that HDBSCAN works with the expected combinations of algorithms and + metrics, or raises the expected errors. + """ + labels = HDBSCAN(algorithm=algo).fit_predict(X) + check_label_quality(labels) + + # Validation for brute is handled by `pairwise_distances` + if algo in ("brute", "auto"): + return + + ALGOS_TREES = { + "kd_tree": KDTree, + "ball_tree": BallTree, + } + metric_params = { + "mahalanobis": {"V": np.eye(X.shape[1])}, + "seuclidean": {"V": np.ones(X.shape[1])}, + "minkowski": {"p": 2}, + "wminkowski": {"p": 2, "w": np.ones(X.shape[1])}, + }.get(metric, None) + + hdb = HDBSCAN( + algorithm=algo, + metric=metric, + metric_params=metric_params, + ) + + if metric not in ALGOS_TREES[algo].valid_metrics: + with pytest.raises(ValueError): + hdb.fit(X) + elif metric == "wminkowski": + with pytest.warns(FutureWarning): + hdb.fit(X) + else: + hdb.fit(X) + + +def test_dbscan_clustering(): + """ + Tests that HDBSCAN can generate a sufficiently accurate dbscan clustering. + This test is more of a sanity check than a rigorous evaluation. + """ + clusterer = HDBSCAN().fit(X) + labels = clusterer.dbscan_clustering(0.3) + + # We use a looser threshold due to dbscan producing a more constrained + # clustering representation + check_label_quality(labels, threshold=0.92) + + +@pytest.mark.parametrize("cut_distance", (0.1, 0.5, 1)) +def test_dbscan_clustering_outlier_data(cut_distance): + """ + Tests if np.inf and np.nan data are each treated as special outliers. 
+ """ + missing_label = _OUTLIER_ENCODING["missing"]["label"] + infinite_label = _OUTLIER_ENCODING["infinite"]["label"] + + X_outlier = X.copy() + X_outlier[0] = [np.inf, 1] + X_outlier[2] = [1, np.nan] + X_outlier[5] = [np.inf, np.nan] + model = HDBSCAN().fit(X_outlier) + labels = model.dbscan_clustering(cut_distance=cut_distance) + + missing_labels_idx = np.flatnonzero(labels == missing_label) + assert_array_equal(missing_labels_idx, [2, 5]) + + infinite_labels_idx = np.flatnonzero(labels == infinite_label) + assert_array_equal(infinite_labels_idx, [0]) + + clean_idx = list(set(range(200)) - set(missing_labels_idx + infinite_labels_idx)) + clean_model = HDBSCAN().fit(X_outlier[clean_idx]) + clean_labels = clean_model.dbscan_clustering(cut_distance=cut_distance) + assert_array_equal(clean_labels, labels[clean_idx]) + + +def test_hdbscan_best_balltree_metric(): + """ + Tests that HDBSCAN using `BallTree` works. + """ + labels = HDBSCAN( + metric="seuclidean", metric_params={"V": np.ones(X.shape[1])} + ).fit_predict(X) + check_label_quality(labels) + + +def test_hdbscan_no_clusters(): + """ + Tests that HDBSCAN correctly does not generate a valid cluster when the + `min_cluster_size` is too large for the data. + """ + labels = HDBSCAN(min_cluster_size=len(X) - 1).fit_predict(X) + assert set(labels).issubset(OUTLIER_SET) + + +def test_hdbscan_min_cluster_size(): + """ + Test that the smallest non-noise cluster has at least `min_cluster_size` + many points + """ + for min_cluster_size in range(2, len(X), 1): + labels = HDBSCAN(min_cluster_size=min_cluster_size).fit_predict(X) + true_labels = [label for label in labels if label != -1] + if len(true_labels) != 0: + assert np.min(np.bincount(true_labels)) >= min_cluster_size + + +def test_hdbscan_callable_metric(): + """ + Tests that HDBSCAN works when passed a callable metric. + """ + metric = distance.euclidean + labels = HDBSCAN(metric=metric).fit_predict(X) + check_label_quality(labels) + + +@pytest.mark.parametrize("tree", ["kd_tree", "ball_tree"]) +def test_hdbscan_precomputed_non_brute(tree): + """ + Tests that HDBSCAN correctly raises an error when passing precomputed data + while requesting a tree-based algorithm. + """ + hdb = HDBSCAN(metric="precomputed", algorithm=tree) + msg = "precomputed is not a valid metric for" + with pytest.raises(ValueError, match=msg): + hdb.fit(X) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_hdbscan_sparse(csr_container): + """ + Tests that HDBSCAN works correctly when passing sparse feature data. + Evaluates correctness by comparing against the same data passed as a dense + array. + """ + + dense_labels = HDBSCAN().fit(X).labels_ + check_label_quality(dense_labels) + + _X_sparse = csr_container(X) + X_sparse = _X_sparse.copy() + sparse_labels = HDBSCAN().fit(X_sparse).labels_ + assert_array_equal(dense_labels, sparse_labels) + + # Compare that the sparse and dense non-precomputed routines return the same labels + # where the 0th observation contains the outlier. + for outlier_val, outlier_type in ((np.inf, "infinite"), (np.nan, "missing")): + X_dense = X.copy() + X_dense[0, 0] = outlier_val + dense_labels = HDBSCAN().fit(X_dense).labels_ + check_label_quality(dense_labels) + assert dense_labels[0] == _OUTLIER_ENCODING[outlier_type]["label"] + + X_sparse = _X_sparse.copy() + X_sparse[0, 0] = outlier_val + sparse_labels = HDBSCAN().fit(X_sparse).labels_ + assert_array_equal(dense_labels, sparse_labels) + + msg = "Sparse data matrices only support algorithm `brute`." 
+ with pytest.raises(ValueError, match=msg): + HDBSCAN(metric="euclidean", algorithm="ball_tree").fit(X_sparse) + + +@pytest.mark.parametrize("algorithm", ALGORITHMS) +def test_hdbscan_centers(algorithm): + """ + Tests that HDBSCAN centers are calculated and stored properly, and are + accurate to the data. + """ + centers = [(0.0, 0.0), (3.0, 3.0)] + H, _ = make_blobs(n_samples=2000, random_state=0, centers=centers, cluster_std=0.5) + hdb = HDBSCAN(store_centers="both").fit(H) + + for center, centroid, medoid in zip(centers, hdb.centroids_, hdb.medoids_): + assert_allclose(center, centroid, rtol=1, atol=0.05) + assert_allclose(center, medoid, rtol=1, atol=0.05) + + # Ensure that nothing is done for noise + hdb = HDBSCAN( + algorithm=algorithm, store_centers="both", min_cluster_size=X.shape[0] + ).fit(X) + assert hdb.centroids_.shape[0] == 0 + assert hdb.medoids_.shape[0] == 0 + + +def test_hdbscan_allow_single_cluster_with_epsilon(): + """ + Tests that HDBSCAN single-cluster selection with epsilon works correctly. + """ + rng = np.random.RandomState(0) + no_structure = rng.rand(150, 2) + # without epsilon we should see many noise points as children of root. + labels = HDBSCAN( + min_cluster_size=5, + cluster_selection_epsilon=0.0, + cluster_selection_method="eom", + allow_single_cluster=True, + ).fit_predict(no_structure) + unique_labels, counts = np.unique(labels, return_counts=True) + assert len(unique_labels) == 2 + + # Arbitrary heuristic. Would prefer something more precise. + assert counts[unique_labels == -1] > 30 + + # for this random seed an epsilon of 0.18 will produce exactly 2 noise + # points at that cut in single linkage. + labels = HDBSCAN( + min_cluster_size=5, + cluster_selection_epsilon=0.18, + cluster_selection_method="eom", + allow_single_cluster=True, + algorithm="kd_tree", + ).fit_predict(no_structure) + unique_labels, counts = np.unique(labels, return_counts=True) + assert len(unique_labels) == 2 + assert counts[unique_labels == -1] == 2 + + +def test_hdbscan_better_than_dbscan(): + """ + Validate that HDBSCAN can properly cluster this difficult synthetic + dataset. Note that DBSCAN fails on this (see HDBSCAN plotting + example) + """ + centers = [[-0.85, -0.85], [-0.85, 0.85], [3, 3], [3, -3]] + X, y = make_blobs( + n_samples=750, + centers=centers, + cluster_std=[0.2, 0.35, 1.35, 1.35], + random_state=0, + ) + labels = HDBSCAN().fit(X).labels_ + + n_clusters = len(set(labels)) - int(-1 in labels) + assert n_clusters == 4 + fowlkes_mallows_score(labels, y) > 0.99 + + +@pytest.mark.parametrize( + "kwargs, X", + [ + ({"metric": "precomputed"}, np.array([[1, np.inf], [np.inf, 1]])), + ({"metric": "precomputed"}, [[1, 2], [2, 1]]), + ({}, [[1, 2], [3, 4]]), + ], +) +def test_hdbscan_usable_inputs(X, kwargs): + """ + Tests that HDBSCAN works correctly for array-likes and precomputed inputs + with non-finite points. + """ + HDBSCAN(min_samples=1, **kwargs).fit(X) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_hdbscan_sparse_distances_too_few_nonzero(csr_container): + """ + Tests that HDBSCAN raises the correct error when there are too few + non-zero distances. 
+ """ + X = csr_container(np.zeros((10, 10))) + + msg = "There exists points with fewer than" + with pytest.raises(ValueError, match=msg): + HDBSCAN(metric="precomputed").fit(X) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_hdbscan_sparse_distances_disconnected_graph(csr_container): + """ + Tests that HDBSCAN raises the correct error when the distance matrix + has multiple connected components. + """ + # Create symmetric sparse matrix with 2 connected components + X = np.zeros((20, 20)) + X[:5, :5] = 1 + X[5:, 15:] = 1 + X = X + X.T + X = csr_container(X) + msg = "HDBSCAN cannot be performed on a disconnected graph" + with pytest.raises(ValueError, match=msg): + HDBSCAN(metric="precomputed").fit(X) + + +def test_hdbscan_tree_invalid_metric(): + """ + Tests that HDBSCAN correctly raises an error for invalid metric choices. + """ + metric_callable = lambda x: x + msg = ( + ".* is not a valid metric for a .*-based algorithm\\. Please select a different" + " metric\\." + ) + + # Callables are not supported for either + with pytest.raises(ValueError, match=msg): + HDBSCAN(algorithm="kd_tree", metric=metric_callable).fit(X) + with pytest.raises(ValueError, match=msg): + HDBSCAN(algorithm="ball_tree", metric=metric_callable).fit(X) + + # The set of valid metrics for KDTree at the time of writing this test is a + # strict subset of those supported in BallTree + metrics_not_kd = list(set(BallTree.valid_metrics) - set(KDTree.valid_metrics)) + if len(metrics_not_kd) > 0: + with pytest.raises(ValueError, match=msg): + HDBSCAN(algorithm="kd_tree", metric=metrics_not_kd[0]).fit(X) + + +def test_hdbscan_too_many_min_samples(): + """ + Tests that HDBSCAN correctly raises an error when setting `min_samples` + larger than the number of samples. + """ + hdb = HDBSCAN(min_samples=len(X) + 1) + msg = r"min_samples (.*) must be at most" + with pytest.raises(ValueError, match=msg): + hdb.fit(X) + + +def test_hdbscan_precomputed_dense_nan(): + """ + Tests that HDBSCAN correctly raises an error when providing precomputed + distances with `np.nan` values. + """ + X_nan = X.copy() + X_nan[0, 0] = np.nan + msg = "np.nan values found in precomputed-dense" + hdb = HDBSCAN(metric="precomputed") + with pytest.raises(ValueError, match=msg): + hdb.fit(X_nan) + + +@pytest.mark.parametrize("allow_single_cluster", [True, False]) +@pytest.mark.parametrize("epsilon", [0, 0.1]) +def test_labelling_distinct(global_random_seed, allow_single_cluster, epsilon): + """ + Tests that the `_do_labelling` helper function correctly assigns labels. 
+ """ + n_samples = 48 + X, y = make_blobs( + n_samples, + random_state=global_random_seed, + # Ensure the clusters are distinct with no overlap + centers=[ + [0, 0], + [10, 0], + [0, 10], + ], + ) + + est = HDBSCAN().fit(X) + condensed_tree = _condense_tree( + est._single_linkage_tree_, min_cluster_size=est.min_cluster_size + ) + clusters = {n_samples + 2, n_samples + 3, n_samples + 4} + cluster_label_map = {n_samples + 2: 0, n_samples + 3: 1, n_samples + 4: 2} + labels = _do_labelling( + condensed_tree=condensed_tree, + clusters=clusters, + cluster_label_map=cluster_label_map, + allow_single_cluster=allow_single_cluster, + cluster_selection_epsilon=epsilon, + ) + + first_with_label = {_y: np.where(y == _y)[0][0] for _y in list(set(y))} + y_to_labels = {_y: labels[first_with_label[_y]] for _y in list(set(y))} + aligned_target = np.vectorize(y_to_labels.get)(y) + assert_array_equal(labels, aligned_target) + + +def test_labelling_thresholding(): + """ + Tests that the `_do_labelling` helper function correctly thresholds the + incoming lambda values given various `cluster_selection_epsilon` values. + """ + n_samples = 5 + MAX_LAMBDA = 1.5 + condensed_tree = np.array( + [ + (5, 2, MAX_LAMBDA, 1), + (5, 1, 0.1, 1), + (5, 0, MAX_LAMBDA, 1), + (5, 3, 0.2, 1), + (5, 4, 0.3, 1), + ], + dtype=CONDENSED_dtype, + ) + labels = _do_labelling( + condensed_tree=condensed_tree, + clusters={n_samples}, + cluster_label_map={n_samples: 0, n_samples + 1: 1}, + allow_single_cluster=True, + cluster_selection_epsilon=1, + ) + num_noise = condensed_tree["value"] < 1 + assert sum(num_noise) == sum(labels == -1) + + labels = _do_labelling( + condensed_tree=condensed_tree, + clusters={n_samples}, + cluster_label_map={n_samples: 0, n_samples + 1: 1}, + allow_single_cluster=True, + cluster_selection_epsilon=0, + ) + # The threshold should be calculated per-sample based on the largest + # lambda of any simbling node. In this case, all points are siblings + # and the largest value is exactly MAX_LAMBDA. + num_noise = condensed_tree["value"] < MAX_LAMBDA + assert sum(num_noise) == sum(labels == -1) + + +@pytest.mark.parametrize("store_centers", ["centroid", "medoid"]) +def test_hdbscan_error_precomputed_and_store_centers(store_centers): + """Check that we raise an error if the centers are requested together with + a precomputed input matrix. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/27893 + """ + rng = np.random.RandomState(0) + X = rng.random((100, 2)) + X_dist = euclidean_distances(X) + err_msg = "Cannot store centers when using a precomputed distance matrix." + with pytest.raises(ValueError, match=err_msg): + HDBSCAN(metric="precomputed", store_centers=store_centers).fit(X_dist) + + +@pytest.mark.parametrize("valid_algo", ["auto", "brute"]) +def test_hdbscan_cosine_metric_valid_algorithm(valid_algo): + """Test that HDBSCAN works with the "cosine" metric when the algorithm is set + to "brute" or "auto". + + Non-regression test for issue #28631 + """ + HDBSCAN(metric="cosine", algorithm=valid_algo).fit_predict(X) + + +@pytest.mark.parametrize("invalid_algo", ["kd_tree", "ball_tree"]) +def test_hdbscan_cosine_metric_invalid_algorithm(invalid_algo): + """Test that HDBSCAN raises an informative error is raised when an unsupported + algorithm is used with the "cosine" metric. 
+ """ + hdbscan = HDBSCAN(metric="cosine", algorithm=invalid_algo) + with pytest.raises(ValueError, match="cosine is not a valid metric"): + hdbscan.fit_predict(X) diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_hierarchical.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_hierarchical.py new file mode 100644 index 0000000000000000000000000000000000000000..222d4f6cd92649b9d59cb3f69f3d350414493984 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_hierarchical.py @@ -0,0 +1,889 @@ +""" +Several basic tests for hierarchical clustering procedures + +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import itertools +import shutil +from functools import partial +from tempfile import mkdtemp + +import numpy as np +import pytest +from scipy.cluster import hierarchy +from scipy.sparse.csgraph import connected_components + +from sklearn.cluster import AgglomerativeClustering, FeatureAgglomeration, ward_tree +from sklearn.cluster._agglomerative import ( + _TREE_BUILDERS, + _fix_connectivity, + _hc_cut, + linkage_tree, +) +from sklearn.cluster._hierarchical_fast import ( + average_merge, + max_merge, + mst_linkage_core, +) +from sklearn.datasets import make_circles, make_moons +from sklearn.feature_extraction.image import grid_to_graph +from sklearn.metrics import DistanceMetric +from sklearn.metrics.cluster import adjusted_rand_score, normalized_mutual_info_score +from sklearn.metrics.pairwise import ( + PAIRED_DISTANCES, + cosine_distances, + manhattan_distances, + pairwise_distances, +) +from sklearn.metrics.tests.test_dist_metrics import METRICS_DEFAULT_PARAMS +from sklearn.neighbors import kneighbors_graph +from sklearn.utils._fast_dict import IntFloatDict +from sklearn.utils._testing import ( + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, + create_memmap_backed_data, + ignore_warnings, +) +from sklearn.utils.fixes import LIL_CONTAINERS + + +def test_linkage_misc(): + # Misc tests on linkage + rng = np.random.RandomState(42) + X = rng.normal(size=(5, 5)) + + with pytest.raises(ValueError): + linkage_tree(X, linkage="foo") + + with pytest.raises(ValueError): + linkage_tree(X, connectivity=np.ones((4, 4))) + + # Smoke test FeatureAgglomeration + FeatureAgglomeration().fit(X) + + # test hierarchical clustering on a precomputed distances matrix + dis = cosine_distances(X) + + res = linkage_tree(dis, affinity="precomputed") + assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0]) + + # test hierarchical clustering on a precomputed distances matrix + res = linkage_tree(X, affinity=manhattan_distances) + assert_array_equal(res[0], linkage_tree(X, affinity="manhattan")[0]) + + +def test_structured_linkage_tree(): + # Check that we obtain the correct solution for structured linkage trees. 
+ rng = np.random.RandomState(0) + mask = np.ones([10, 10], dtype=bool) + # Avoiding a mask with only 'True' entries + mask[4:7, 4:7] = 0 + X = rng.randn(50, 100) + connectivity = grid_to_graph(*mask.shape) + for tree_builder in _TREE_BUILDERS.values(): + children, n_components, n_leaves, parent = tree_builder( + X.T, connectivity=connectivity + ) + n_nodes = 2 * X.shape[1] - 1 + assert len(children) + n_leaves == n_nodes + # Check that ward_tree raises a ValueError with a connectivity matrix + # of the wrong shape + with pytest.raises(ValueError): + tree_builder(X.T, connectivity=np.ones((4, 4))) + # Check that fitting with no samples raises an error + with pytest.raises(ValueError): + tree_builder(X.T[:0], connectivity=connectivity) + + +def test_unstructured_linkage_tree(): + # Check that we obtain the correct solution for unstructured linkage trees. + rng = np.random.RandomState(0) + X = rng.randn(50, 100) + for this_X in (X, X[0]): + # With specified a number of clusters just for the sake of + # raising a warning and testing the warning code + with ignore_warnings(): + with pytest.warns(UserWarning): + children, n_nodes, n_leaves, parent = ward_tree(this_X.T, n_clusters=10) + n_nodes = 2 * X.shape[1] - 1 + assert len(children) + n_leaves == n_nodes + + for tree_builder in _TREE_BUILDERS.values(): + for this_X in (X, X[0]): + with ignore_warnings(): + with pytest.warns(UserWarning): + children, n_nodes, n_leaves, parent = tree_builder( + this_X.T, n_clusters=10 + ) + n_nodes = 2 * X.shape[1] - 1 + assert len(children) + n_leaves == n_nodes + + +def test_height_linkage_tree(): + # Check that the height of the results of linkage tree is sorted. + rng = np.random.RandomState(0) + mask = np.ones([10, 10], dtype=bool) + X = rng.randn(50, 100) + connectivity = grid_to_graph(*mask.shape) + for linkage_func in _TREE_BUILDERS.values(): + children, n_nodes, n_leaves, parent = linkage_func( + X.T, connectivity=connectivity + ) + n_nodes = 2 * X.shape[1] - 1 + assert len(children) + n_leaves == n_nodes + + +def test_zero_cosine_linkage_tree(): + # Check that zero vectors in X produce an error when + # 'cosine' affinity is used + X = np.array([[0, 1], [0, 0]]) + msg = "Cosine affinity cannot be used when X contains zero vectors" + with pytest.raises(ValueError, match=msg): + linkage_tree(X, affinity="cosine") + + +@pytest.mark.parametrize("n_clusters, distance_threshold", [(None, 0.5), (10, None)]) +@pytest.mark.parametrize("compute_distances", [True, False]) +@pytest.mark.parametrize("linkage", ["ward", "complete", "average", "single"]) +def test_agglomerative_clustering_distances( + n_clusters, compute_distances, distance_threshold, linkage +): + # Check that when `compute_distances` is True or `distance_threshold` is + # given, the fitted model has an attribute `distances_`. 
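+ # Background: `distances_` stores the linkage distance of each merge, so a
+ # fully built tree over n samples has n - 1 entries (the shape asserted
+ # below). The attribute is only kept when requested via
+ # `compute_distances=True` or implied by a `distance_threshold`, since
+ # storing it adds overhead.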
+ rng = np.random.RandomState(0) + mask = np.ones([10, 10], dtype=bool) + n_samples = 100 + X = rng.randn(n_samples, 50) + connectivity = grid_to_graph(*mask.shape) + + clustering = AgglomerativeClustering( + n_clusters=n_clusters, + connectivity=connectivity, + linkage=linkage, + distance_threshold=distance_threshold, + compute_distances=compute_distances, + ) + clustering.fit(X) + if compute_distances or (distance_threshold is not None): + assert hasattr(clustering, "distances_") + n_children = clustering.children_.shape[0] + n_nodes = n_children + 1 + assert clustering.distances_.shape == (n_nodes - 1,) + else: + assert not hasattr(clustering, "distances_") + + +@pytest.mark.parametrize("lil_container", LIL_CONTAINERS) +def test_agglomerative_clustering(global_random_seed, lil_container): + # Check that we obtain the correct number of clusters with + # agglomerative clustering. + rng = np.random.RandomState(global_random_seed) + mask = np.ones([10, 10], dtype=bool) + n_samples = 100 + X = rng.randn(n_samples, 50) + connectivity = grid_to_graph(*mask.shape) + for linkage in ("ward", "complete", "average", "single"): + clustering = AgglomerativeClustering( + n_clusters=10, connectivity=connectivity, linkage=linkage + ) + clustering.fit(X) + # test caching + try: + tempdir = mkdtemp() + clustering = AgglomerativeClustering( + n_clusters=10, + connectivity=connectivity, + memory=tempdir, + linkage=linkage, + ) + clustering.fit(X) + labels = clustering.labels_ + assert np.size(np.unique(labels)) == 10 + finally: + shutil.rmtree(tempdir) + # Turn caching off now + clustering = AgglomerativeClustering( + n_clusters=10, connectivity=connectivity, linkage=linkage + ) + # Check that we obtain the same solution with early-stopping of the + # tree building + clustering.compute_full_tree = False + clustering.fit(X) + assert_almost_equal(normalized_mutual_info_score(clustering.labels_, labels), 1) + clustering.connectivity = None + clustering.fit(X) + assert np.size(np.unique(clustering.labels_)) == 10 + # Check that we raise a TypeError on dense matrices + clustering = AgglomerativeClustering( + n_clusters=10, + connectivity=lil_container(connectivity.toarray()[:10, :10]), + linkage=linkage, + ) + with pytest.raises(ValueError): + clustering.fit(X) + + # Test that using ward with another metric than euclidean raises an + # exception + clustering = AgglomerativeClustering( + n_clusters=10, + connectivity=connectivity.toarray(), + metric="manhattan", + linkage="ward", + ) + with pytest.raises(ValueError): + clustering.fit(X) + + # Test using another metric than euclidean works with linkage complete + for metric in PAIRED_DISTANCES.keys(): + # Compare our (structured) implementation to scipy + clustering = AgglomerativeClustering( + n_clusters=10, + connectivity=np.ones((n_samples, n_samples)), + metric=metric, + linkage="complete", + ) + clustering.fit(X) + clustering2 = AgglomerativeClustering( + n_clusters=10, connectivity=None, metric=metric, linkage="complete" + ) + clustering2.fit(X) + assert_almost_equal( + normalized_mutual_info_score(clustering2.labels_, clustering.labels_), 1 + ) + + # Test that using a distance matrix (affinity = 'precomputed') has same + # results (with connectivity constraints) + clustering = AgglomerativeClustering( + n_clusters=10, connectivity=connectivity, linkage="complete" + ) + clustering.fit(X) + X_dist = pairwise_distances(X) + clustering2 = AgglomerativeClustering( + n_clusters=10, + connectivity=connectivity, + metric="precomputed", + linkage="complete", + ) + 
clustering2.fit(X_dist) + assert_array_equal(clustering.labels_, clustering2.labels_) + + +def test_agglomerative_clustering_memory_mapped(): + """AgglomerativeClustering must work on mem-mapped dataset. + + Non-regression test for issue #19875. + """ + rng = np.random.RandomState(0) + Xmm = create_memmap_backed_data(rng.randn(50, 100)) + AgglomerativeClustering(metric="euclidean", linkage="single").fit(Xmm) + + +def test_ward_agglomeration(global_random_seed): + # Check that we obtain the correct solution in a simplistic case + rng = np.random.RandomState(global_random_seed) + mask = np.ones([10, 10], dtype=bool) + X = rng.randn(50, 100) + connectivity = grid_to_graph(*mask.shape) + agglo = FeatureAgglomeration(n_clusters=5, connectivity=connectivity) + agglo.fit(X) + assert np.size(np.unique(agglo.labels_)) == 5 + + X_red = agglo.transform(X) + assert X_red.shape[1] == 5 + X_full = agglo.inverse_transform(X_red) + assert np.unique(X_full[0]).size == 5 + assert_array_almost_equal(agglo.transform(X_full), X_red) + + # Check that fitting with no samples raises a ValueError + with pytest.raises(ValueError): + agglo.fit(X[:0]) + + +def test_single_linkage_clustering(): + # Check that we get the correct result in two emblematic cases + moons, moon_labels = make_moons(noise=0.05, random_state=42) + clustering = AgglomerativeClustering(n_clusters=2, linkage="single") + clustering.fit(moons) + assert_almost_equal( + normalized_mutual_info_score(clustering.labels_, moon_labels), 1 + ) + + circles, circle_labels = make_circles(factor=0.5, noise=0.025, random_state=42) + clustering = AgglomerativeClustering(n_clusters=2, linkage="single") + clustering.fit(circles) + assert_almost_equal( + normalized_mutual_info_score(clustering.labels_, circle_labels), 1 + ) + + +def assess_same_labelling(cut1, cut2): + """Util for comparison with scipy""" + co_clust = [] + for cut in [cut1, cut2]: + n = len(cut) + k = cut.max() + 1 + ecut = np.zeros((n, k)) + ecut[np.arange(n), cut] = 1 + co_clust.append(np.dot(ecut, ecut.T)) + assert (co_clust[0] == co_clust[1]).all() + + +def test_sparse_scikit_vs_scipy(global_random_seed): + # Test scikit linkage with full connectivity (i.e. 
unstructured) vs scipy + n, p, k = 10, 5, 3 + rng = np.random.RandomState(global_random_seed) + + # Not using a lil_matrix here, just to check that non sparse + # matrices are well handled + connectivity = np.ones((n, n)) + for linkage in _TREE_BUILDERS.keys(): + for i in range(5): + X = 0.1 * rng.normal(size=(n, p)) + X -= 4.0 * np.arange(n)[:, np.newaxis] + X -= X.mean(axis=1)[:, np.newaxis] + + out = hierarchy.linkage(X, method=linkage) + + children_ = out[:, :2].astype(int, copy=False) + children, _, n_leaves, _ = _TREE_BUILDERS[linkage]( + X, connectivity=connectivity + ) + + # Sort the order of child nodes per row for consistency + children.sort(axis=1) + assert_array_equal( + children, + children_, + "linkage tree differs from scipy impl for linkage: " + linkage, + ) + + cut = _hc_cut(k, children, n_leaves) + cut_ = _hc_cut(k, children_, n_leaves) + assess_same_labelling(cut, cut_) + + # Test error management in _hc_cut + with pytest.raises(ValueError): + _hc_cut(n_leaves + 1, children, n_leaves) + + +# Make sure our custom mst_linkage_core gives +# the same results as scipy's builtin +def test_vector_scikit_single_vs_scipy_single(global_random_seed): + n_samples, n_features, n_clusters = 10, 5, 3 + rng = np.random.RandomState(global_random_seed) + X = 0.1 * rng.normal(size=(n_samples, n_features)) + X -= 4.0 * np.arange(n_samples)[:, np.newaxis] + X -= X.mean(axis=1)[:, np.newaxis] + + out = hierarchy.linkage(X, method="single") + children_scipy = out[:, :2].astype(int) + + children, _, n_leaves, _ = _TREE_BUILDERS["single"](X) + + # Sort the order of child nodes per row for consistency + children.sort(axis=1) + assert_array_equal( + children, + children_scipy, + "linkage tree differs from scipy impl for single linkage.", + ) + + cut = _hc_cut(n_clusters, children, n_leaves) + cut_scipy = _hc_cut(n_clusters, children_scipy, n_leaves) + assess_same_labelling(cut, cut_scipy) + + +@pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS) +def test_mst_linkage_core_memory_mapped(metric_param_grid): + """The MST-LINKAGE-CORE algorithm must work on mem-mapped dataset. + + Non-regression test for issue #19875. + """ + rng = np.random.RandomState(seed=1) + X = rng.normal(size=(20, 4)) + Xmm = create_memmap_backed_data(X) + metric, param_grid = metric_param_grid + keys = param_grid.keys() + for vals in itertools.product(*param_grid.values()): + kwargs = dict(zip(keys, vals)) + distance_metric = DistanceMetric.get_metric(metric, **kwargs) + mst = mst_linkage_core(X, distance_metric) + mst_mm = mst_linkage_core(Xmm, distance_metric) + np.testing.assert_equal(mst, mst_mm) + + +def test_identical_points(): + # Ensure identical points are handled correctly when using mst with + # a sparse connectivity matrix + X = np.array([[0, 0, 0], [0, 0, 0], [1, 1, 1], [1, 1, 1], [2, 2, 2], [2, 2, 2]]) + true_labels = np.array([0, 0, 1, 1, 2, 2]) + connectivity = kneighbors_graph(X, n_neighbors=3, include_self=False) + connectivity = 0.5 * (connectivity + connectivity.T) + connectivity, n_components = _fix_connectivity(X, connectivity, "euclidean") + + for linkage in ("single", "average", "average", "ward"): + clustering = AgglomerativeClustering( + n_clusters=3, linkage=linkage, connectivity=connectivity + ) + clustering.fit(X) + + assert_almost_equal( + normalized_mutual_info_score(clustering.labels_, true_labels), 1 + ) + + +def test_connectivity_propagation(): + # Check that connectivity in the ward tree is propagated correctly during + # merging. 
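+ # Background: when two nodes are merged, the union of their neighborhoods in
+ # the connectivity graph must be carried over to the new node; if it is not,
+ # later merges can look up edges of nodes that no longer exist and fail with
+ # the IndexError mentioned below. The 15 nearly-duplicate points combined
+ # with a 10-NN graph force many early merges, which makes this an effective
+ # trigger for that failure mode.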
+ X = np.array( + [ + (0.014, 0.120), + (0.014, 0.099), + (0.014, 0.097), + (0.017, 0.153), + (0.017, 0.153), + (0.018, 0.153), + (0.018, 0.153), + (0.018, 0.153), + (0.018, 0.153), + (0.018, 0.153), + (0.018, 0.153), + (0.018, 0.153), + (0.018, 0.152), + (0.018, 0.149), + (0.018, 0.144), + ] + ) + connectivity = kneighbors_graph(X, 10, include_self=False) + ward = AgglomerativeClustering( + n_clusters=4, connectivity=connectivity, linkage="ward" + ) + # If changes are not propagated correctly, fit crashes with an + # IndexError + ward.fit(X) + + +def test_ward_tree_children_order(global_random_seed): + # Check that children are ordered in the same way for both structured and + # unstructured versions of ward_tree. + + # test on five random datasets + n, p = 10, 5 + rng = np.random.RandomState(global_random_seed) + + connectivity = np.ones((n, n)) + for i in range(5): + X = 0.1 * rng.normal(size=(n, p)) + X -= 4.0 * np.arange(n)[:, np.newaxis] + X -= X.mean(axis=1)[:, np.newaxis] + + out_unstructured = ward_tree(X) + out_structured = ward_tree(X, connectivity=connectivity) + + assert_array_equal(out_unstructured[0], out_structured[0]) + + +def test_ward_linkage_tree_return_distance(global_random_seed): + # Test return_distance option on linkage and ward trees + + # test that return_distance when set true, gives same + # output on both structured and unstructured clustering. + n, p = 10, 5 + rng = np.random.RandomState(global_random_seed) + + connectivity = np.ones((n, n)) + for i in range(5): + X = 0.1 * rng.normal(size=(n, p)) + X -= 4.0 * np.arange(n)[:, np.newaxis] + X -= X.mean(axis=1)[:, np.newaxis] + + out_unstructured = ward_tree(X, return_distance=True) + out_structured = ward_tree(X, connectivity=connectivity, return_distance=True) + + # get children + children_unstructured = out_unstructured[0] + children_structured = out_structured[0] + + # check if we got the same clusters + assert_array_equal(children_unstructured, children_structured) + + # check if the distances are the same + dist_unstructured = out_unstructured[-1] + dist_structured = out_structured[-1] + + assert_array_almost_equal(dist_unstructured, dist_structured) + + for linkage in ["average", "complete", "single"]: + structured_items = linkage_tree( + X, connectivity=connectivity, linkage=linkage, return_distance=True + )[-1] + unstructured_items = linkage_tree(X, linkage=linkage, return_distance=True)[ + -1 + ] + structured_dist = structured_items[-1] + unstructured_dist = unstructured_items[-1] + structured_children = structured_items[0] + unstructured_children = unstructured_items[0] + assert_array_almost_equal(structured_dist, unstructured_dist) + assert_array_almost_equal(structured_children, unstructured_children) + + # test on the following dataset where we know the truth + # taken from scipy/cluster/tests/hierarchy_test_data.py + X = np.array( + [ + [1.43054825, -7.5693489], + [6.95887839, 6.82293382], + [2.87137846, -9.68248579], + [7.87974764, -6.05485803], + [8.24018364, -6.09495602], + [7.39020262, 8.54004355], + ] + ) + # truth + linkage_X_ward = np.array( + [ + [3.0, 4.0, 0.36265956, 2.0], + [1.0, 5.0, 1.77045373, 2.0], + [0.0, 2.0, 2.55760419, 2.0], + [6.0, 8.0, 9.10208346, 4.0], + [7.0, 9.0, 24.7784379, 6.0], + ] + ) + + linkage_X_complete = np.array( + [ + [3.0, 4.0, 0.36265956, 2.0], + [1.0, 5.0, 1.77045373, 2.0], + [0.0, 2.0, 2.55760419, 2.0], + [6.0, 8.0, 6.96742194, 4.0], + [7.0, 9.0, 18.77445997, 6.0], + ] + ) + + linkage_X_average = np.array( + [ + [3.0, 4.0, 0.36265956, 2.0], + [1.0, 5.0, 
1.77045373, 2.0], + [0.0, 2.0, 2.55760419, 2.0], + [6.0, 8.0, 6.55832839, 4.0], + [7.0, 9.0, 15.44089605, 6.0], + ] + ) + + n_samples, n_features = np.shape(X) + connectivity_X = np.ones((n_samples, n_samples)) + + out_X_unstructured = ward_tree(X, return_distance=True) + out_X_structured = ward_tree(X, connectivity=connectivity_X, return_distance=True) + + # check that the labels are the same + assert_array_equal(linkage_X_ward[:, :2], out_X_unstructured[0]) + assert_array_equal(linkage_X_ward[:, :2], out_X_structured[0]) + + # check that the distances are correct + assert_array_almost_equal(linkage_X_ward[:, 2], out_X_unstructured[4]) + assert_array_almost_equal(linkage_X_ward[:, 2], out_X_structured[4]) + + linkage_options = ["complete", "average", "single"] + X_linkage_truth = [linkage_X_complete, linkage_X_average] + for linkage, X_truth in zip(linkage_options, X_linkage_truth): + out_X_unstructured = linkage_tree(X, return_distance=True, linkage=linkage) + out_X_structured = linkage_tree( + X, connectivity=connectivity_X, linkage=linkage, return_distance=True + ) + + # check that the labels are the same + assert_array_equal(X_truth[:, :2], out_X_unstructured[0]) + assert_array_equal(X_truth[:, :2], out_X_structured[0]) + + # check that the distances are correct + assert_array_almost_equal(X_truth[:, 2], out_X_unstructured[4]) + assert_array_almost_equal(X_truth[:, 2], out_X_structured[4]) + + +def test_connectivity_fixing_non_lil(): + # Check non regression of a bug if a non item assignable connectivity is + # provided with more than one component. + # create dummy data + x = np.array([[0, 0], [1, 1]]) + # create a mask with several components to force connectivity fixing + m = np.array([[True, False], [False, True]]) + c = grid_to_graph(n_x=2, n_y=2, mask=m) + w = AgglomerativeClustering(connectivity=c, linkage="ward") + with pytest.warns(UserWarning): + w.fit(x) + + +def test_int_float_dict(): + rng = np.random.RandomState(0) + keys = np.unique(rng.randint(100, size=10).astype(np.intp, copy=False)) + values = rng.rand(len(keys)) + + d = IntFloatDict(keys, values) + for key, value in zip(keys, values): + assert d[key] == value + + other_keys = np.arange(50, dtype=np.intp)[::2] + other_values = np.full(50, 0.5)[::2] + other = IntFloatDict(other_keys, other_values) + # Complete smoke test + max_merge(d, other, mask=np.ones(100, dtype=np.intp), n_a=1, n_b=1) + average_merge(d, other, mask=np.ones(100, dtype=np.intp), n_a=1, n_b=1) + + +def test_connectivity_callable(): + rng = np.random.RandomState(0) + X = rng.rand(20, 5) + connectivity = kneighbors_graph(X, 3, include_self=False) + aglc1 = AgglomerativeClustering(connectivity=connectivity) + aglc2 = AgglomerativeClustering( + connectivity=partial(kneighbors_graph, n_neighbors=3, include_self=False) + ) + aglc1.fit(X) + aglc2.fit(X) + assert_array_equal(aglc1.labels_, aglc2.labels_) + + +def test_connectivity_ignores_diagonal(): + rng = np.random.RandomState(0) + X = rng.rand(20, 5) + connectivity = kneighbors_graph(X, 3, include_self=False) + connectivity_include_self = kneighbors_graph(X, 3, include_self=True) + aglc1 = AgglomerativeClustering(connectivity=connectivity) + aglc2 = AgglomerativeClustering(connectivity=connectivity_include_self) + aglc1.fit(X) + aglc2.fit(X) + assert_array_equal(aglc1.labels_, aglc2.labels_) + + +def test_compute_full_tree(): + # Test that the full tree is computed if n_clusters is small + rng = np.random.RandomState(0) + X = rng.randn(10, 2) + connectivity = kneighbors_graph(X, 5, include_self=False) 
+ + # When n_clusters is less, the full tree should be built + # that is the number of merges should be n_samples - 1 + agc = AgglomerativeClustering(n_clusters=2, connectivity=connectivity) + agc.fit(X) + n_samples = X.shape[0] + n_nodes = agc.children_.shape[0] + assert n_nodes == n_samples - 1 + + # When n_clusters is large, greater than max of 100 and 0.02 * n_samples. + # we should stop when there are n_clusters. + n_clusters = 101 + X = rng.randn(200, 2) + connectivity = kneighbors_graph(X, 10, include_self=False) + agc = AgglomerativeClustering(n_clusters=n_clusters, connectivity=connectivity) + agc.fit(X) + n_samples = X.shape[0] + n_nodes = agc.children_.shape[0] + assert n_nodes == n_samples - n_clusters + + +def test_n_components(): + # Test n_components returned by linkage, average and ward tree + rng = np.random.RandomState(0) + X = rng.rand(5, 5) + + # Connectivity matrix having five components. + connectivity = np.eye(5) + + for linkage_func in _TREE_BUILDERS.values(): + assert ignore_warnings(linkage_func)(X, connectivity=connectivity)[1] == 5 + + +def test_affinity_passed_to_fix_connectivity(): + # Test that the affinity parameter is actually passed to the pairwise + # function + + size = 2 + rng = np.random.RandomState(0) + X = rng.randn(size, size) + mask = np.array([True, False, False, True]) + + connectivity = grid_to_graph(n_x=size, n_y=size, mask=mask, return_as=np.ndarray) + + class FakeAffinity: + def __init__(self): + self.counter = 0 + + def increment(self, *args, **kwargs): + self.counter += 1 + return self.counter + + fa = FakeAffinity() + + linkage_tree(X, connectivity=connectivity, affinity=fa.increment) + + assert fa.counter == 3 + + +@pytest.mark.parametrize("linkage", ["ward", "complete", "average"]) +def test_agglomerative_clustering_with_distance_threshold(linkage, global_random_seed): + # Check that we obtain the correct number of clusters with + # agglomerative clustering with distance_threshold. 
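+ # Background: with `n_clusters=None` and a `distance_threshold`, the full
+ # tree is built and then cut so that no surviving merge has a distance at or
+ # above the threshold. The number of clusters therefore equals
+ # np.count_nonzero(distances >= distance_threshold) + 1, which is exactly how
+ # the expected value is recomputed from the tree builder below. Minimal
+ # usage sketch (illustrative values only):
+ #   model = AgglomerativeClustering(n_clusters=None, distance_threshold=10).fit(X)
+ #   model.n_clusters_, model.labels_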
+ rng = np.random.RandomState(global_random_seed) + mask = np.ones([10, 10], dtype=bool) + n_samples = 100 + X = rng.randn(n_samples, 50) + connectivity = grid_to_graph(*mask.shape) + # test when distance threshold is set to 10 + distance_threshold = 10 + for conn in [None, connectivity]: + clustering = AgglomerativeClustering( + n_clusters=None, + distance_threshold=distance_threshold, + connectivity=conn, + linkage=linkage, + ) + clustering.fit(X) + clusters_produced = clustering.labels_ + num_clusters_produced = len(np.unique(clustering.labels_)) + # test if the clusters produced match the point in the linkage tree + # where the distance exceeds the threshold + tree_builder = _TREE_BUILDERS[linkage] + children, n_components, n_leaves, parent, distances = tree_builder( + X, connectivity=conn, n_clusters=None, return_distance=True + ) + num_clusters_at_threshold = ( + np.count_nonzero(distances >= distance_threshold) + 1 + ) + # test number of clusters produced + assert num_clusters_at_threshold == num_clusters_produced + # test clusters produced + clusters_at_threshold = _hc_cut( + n_clusters=num_clusters_produced, children=children, n_leaves=n_leaves + ) + assert np.array_equiv(clusters_produced, clusters_at_threshold) + + +def test_small_distance_threshold(global_random_seed): + rng = np.random.RandomState(global_random_seed) + n_samples = 10 + X = rng.randint(-300, 300, size=(n_samples, 3)) + # this should result in all data in their own clusters, given that + # their pairwise distances are bigger than .1 (which may not be the case + # with a different random seed). + clustering = AgglomerativeClustering( + n_clusters=None, distance_threshold=1.0, linkage="single" + ).fit(X) + # check that the pairwise distances are indeed all larger than .1 + all_distances = pairwise_distances(X, metric="minkowski", p=2) + np.fill_diagonal(all_distances, np.inf) + assert np.all(all_distances > 0.1) + assert clustering.n_clusters_ == n_samples + + +def test_cluster_distances_with_distance_threshold(global_random_seed): + rng = np.random.RandomState(global_random_seed) + n_samples = 100 + X = rng.randint(-10, 10, size=(n_samples, 3)) + # check the distances within the clusters and with other clusters + distance_threshold = 4 + clustering = AgglomerativeClustering( + n_clusters=None, distance_threshold=distance_threshold, linkage="single" + ).fit(X) + labels = clustering.labels_ + D = pairwise_distances(X, metric="minkowski", p=2) + # to avoid taking the 0 diagonal in min() + np.fill_diagonal(D, np.inf) + for label in np.unique(labels): + in_cluster_mask = labels == label + max_in_cluster_distance = ( + D[in_cluster_mask][:, in_cluster_mask].min(axis=0).max() + ) + min_out_cluster_distance = ( + D[in_cluster_mask][:, ~in_cluster_mask].min(axis=0).min() + ) + # single data point clusters only have that inf diagonal here + if in_cluster_mask.sum() > 1: + assert max_in_cluster_distance < distance_threshold + assert min_out_cluster_distance >= distance_threshold + + +@pytest.mark.parametrize("linkage", ["ward", "complete", "average"]) +@pytest.mark.parametrize( + ("threshold", "y_true"), [(0.5, [1, 0]), (1.0, [1, 0]), (1.5, [0, 0])] +) +def test_agglomerative_clustering_with_distance_threshold_edge_case( + linkage, threshold, y_true +): + # test boundary case of distance_threshold matching the distance + X = [[0], [1]] + clusterer = AgglomerativeClustering( + n_clusters=None, distance_threshold=threshold, linkage=linkage + ) + y_pred = clusterer.fit_predict(X) + assert adjusted_rand_score(y_true, y_pred) == 
1 + + +def test_dist_threshold_invalid_parameters(): + X = [[0], [1]] + with pytest.raises(ValueError, match="Exactly one of "): + AgglomerativeClustering(n_clusters=None, distance_threshold=None).fit(X) + + with pytest.raises(ValueError, match="Exactly one of "): + AgglomerativeClustering(n_clusters=2, distance_threshold=1).fit(X) + + X = [[0], [1]] + with pytest.raises(ValueError, match="compute_full_tree must be True if"): + AgglomerativeClustering( + n_clusters=None, distance_threshold=1, compute_full_tree=False + ).fit(X) + + +def test_invalid_shape_precomputed_dist_matrix(): + # Check that an error is raised when affinity='precomputed' + # and a non square matrix is passed (PR #16257). + rng = np.random.RandomState(0) + X = rng.rand(5, 3) + with pytest.raises( + ValueError, + match=r"Distance matrix should be square, got matrix of shape \(5, 3\)", + ): + AgglomerativeClustering(metric="precomputed", linkage="complete").fit(X) + + +def test_precomputed_connectivity_metric_with_2_connected_components(): + """Check that connecting components works when connectivity and + affinity are both precomputed and the number of connected components is + greater than 1. Non-regression test for #16151. + """ + + connectivity_matrix = np.array( + [ + [0, 1, 1, 0, 0], + [0, 0, 1, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 1], + [0, 0, 0, 0, 0], + ] + ) + # ensure that connectivity_matrix has two connected components + assert connected_components(connectivity_matrix)[0] == 2 + + rng = np.random.RandomState(0) + X = rng.randn(5, 10) + + X_dist = pairwise_distances(X) + clusterer_precomputed = AgglomerativeClustering( + metric="precomputed", connectivity=connectivity_matrix, linkage="complete" + ) + msg = "Completing it to avoid stopping the tree early" + with pytest.warns(UserWarning, match=msg): + clusterer_precomputed.fit(X_dist) + + clusterer = AgglomerativeClustering( + connectivity=connectivity_matrix, linkage="complete" + ) + with pytest.warns(UserWarning, match=msg): + clusterer.fit(X) + + assert_array_equal(clusterer.labels_, clusterer_precomputed.labels_) + assert_array_equal(clusterer.children_, clusterer_precomputed.children_) diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_k_means.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_k_means.py new file mode 100644 index 0000000000000000000000000000000000000000..0ab602d32d1330fe738ce7a24cd4b4c68cdf9c15 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_k_means.py @@ -0,0 +1,1364 @@ +"""Testing for K-means""" + +import re +import sys +from io import StringIO + +import numpy as np +import pytest +from scipy import sparse as sp + +from sklearn.base import clone +from sklearn.cluster import KMeans, MiniBatchKMeans, k_means, kmeans_plusplus +from sklearn.cluster._k_means_common import ( + _euclidean_dense_dense_wrapper, + _euclidean_sparse_dense_wrapper, + _inertia_dense, + _inertia_sparse, + _is_same_clustering, + _relocate_empty_clusters_dense, + _relocate_empty_clusters_sparse, +) +from sklearn.cluster._kmeans import _labels_inertia, _mini_batch_step +from sklearn.datasets import make_blobs +from sklearn.exceptions import ConvergenceWarning +from sklearn.metrics import pairwise_distances, pairwise_distances_argmin +from sklearn.metrics.cluster import v_measure_score +from sklearn.metrics.pairwise import euclidean_distances +from sklearn.utils._testing import ( + assert_allclose, + assert_array_equal, + create_memmap_backed_data, +) +from sklearn.utils.extmath import 
row_norms +from sklearn.utils.fixes import CSR_CONTAINERS +from sklearn.utils.parallel import _get_threadpool_controller + +# non centered, sparse centers to check the +centers = np.array( + [ + [0.0, 5.0, 0.0, 0.0, 0.0], + [1.0, 1.0, 4.0, 0.0, 0.0], + [1.0, 0.0, 0.0, 5.0, 1.0], + ] +) +n_samples = 100 +n_clusters, n_features = centers.shape +X, true_labels = make_blobs( + n_samples=n_samples, centers=centers, cluster_std=1.0, random_state=42 +) +X_as_any_csr = [container(X) for container in CSR_CONTAINERS] +data_containers = [np.array] + CSR_CONTAINERS +data_containers_ids = ( + ["dense", "sparse_matrix", "sparse_array"] + if len(X_as_any_csr) == 2 + else ["dense", "sparse_matrix"] +) + + +@pytest.mark.parametrize("array_constr", data_containers, ids=data_containers_ids) +@pytest.mark.parametrize("algo", ["lloyd", "elkan"]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_kmeans_results(array_constr, algo, dtype): + # Checks that KMeans works as intended on toy dataset by comparing with + # expected results computed by hand. + X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]], dtype=dtype) + sample_weight = [3, 1, 1, 3] + init_centers = np.array([[0, 0], [1, 1]], dtype=dtype) + + expected_labels = [0, 0, 1, 1] + expected_inertia = 0.375 + expected_centers = np.array([[0.125, 0], [0.875, 1]], dtype=dtype) + expected_n_iter = 2 + + kmeans = KMeans(n_clusters=2, n_init=1, init=init_centers, algorithm=algo) + kmeans.fit(X, sample_weight=sample_weight) + + assert_array_equal(kmeans.labels_, expected_labels) + assert_allclose(kmeans.inertia_, expected_inertia) + assert_allclose(kmeans.cluster_centers_, expected_centers) + assert kmeans.n_iter_ == expected_n_iter + + +@pytest.mark.parametrize("array_constr", data_containers, ids=data_containers_ids) +@pytest.mark.parametrize("algo", ["lloyd", "elkan"]) +def test_kmeans_relocated_clusters(array_constr, algo): + # check that empty clusters are relocated as expected + X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]]) + + # second center too far from others points will be empty at first iter + init_centers = np.array([[0.5, 0.5], [3, 3]]) + + kmeans = KMeans(n_clusters=2, n_init=1, init=init_centers, algorithm=algo) + kmeans.fit(X) + + expected_n_iter = 3 + expected_inertia = 0.25 + assert_allclose(kmeans.inertia_, expected_inertia) + assert kmeans.n_iter_ == expected_n_iter + + # There are two acceptable ways of relocating clusters in this example, the output + # depends on how the argpartition strategy breaks ties. We accept both outputs. 
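+ # Background: a center that ends an iteration with no assigned points is
+ # relocated onto the sample(s) currently contributing the most to inertia,
+ # i.e. the points farthest from their assigned center. In this symmetric toy
+ # dataset two such samples are tied, so two label-permuted but equivalent
+ # solutions are possible and both are accepted below.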
+ try: + expected_labels = [0, 0, 1, 1] + expected_centers = [[0.25, 0], [0.75, 1]] + assert_array_equal(kmeans.labels_, expected_labels) + assert_allclose(kmeans.cluster_centers_, expected_centers) + except AssertionError: + expected_labels = [1, 1, 0, 0] + expected_centers = [[0.75, 1.0], [0.25, 0.0]] + assert_array_equal(kmeans.labels_, expected_labels) + assert_allclose(kmeans.cluster_centers_, expected_centers) + + +@pytest.mark.parametrize("array_constr", data_containers, ids=data_containers_ids) +def test_relocate_empty_clusters(array_constr): + # test for the _relocate_empty_clusters_(dense/sparse) helpers + + # Synthetic dataset with 3 obvious clusters of different sizes + X = np.array([-10.0, -9.5, -9, -8.5, -8, -1, 1, 9, 9.5, 10]).reshape(-1, 1) + X = array_constr(X) + sample_weight = np.ones(10) + + # centers all initialized to the first point of X + centers_old = np.array([-10.0, -10, -10]).reshape(-1, 1) + + # With this initialization, all points will be assigned to the first center + # At this point a center in centers_new is the weighted sum of the points + # it contains if it's not empty, otherwise it is the same as before. + centers_new = np.array([-16.5, -10, -10]).reshape(-1, 1) + weight_in_clusters = np.array([10.0, 0, 0]) + labels = np.zeros(10, dtype=np.int32) + + if array_constr is np.array: + _relocate_empty_clusters_dense( + X, sample_weight, centers_old, centers_new, weight_in_clusters, labels + ) + else: + _relocate_empty_clusters_sparse( + X.data, + X.indices, + X.indptr, + sample_weight, + centers_old, + centers_new, + weight_in_clusters, + labels, + ) + + # The relocation scheme will take the 2 points farthest from the center and + # assign them to the 2 empty clusters, i.e. points at 10 and at 9.9. The + # first center will be updated to contain the other 8 points. + assert_array_equal(weight_in_clusters, [8, 1, 1]) + assert_allclose(centers_new, [[-36], [10], [9.5]]) + + +@pytest.mark.parametrize("distribution", ["normal", "blobs"]) +@pytest.mark.parametrize("array_constr", data_containers, ids=data_containers_ids) +@pytest.mark.parametrize("tol", [1e-2, 1e-8, 1e-100, 0]) +def test_kmeans_elkan_results(distribution, array_constr, tol, global_random_seed): + # Check that results are identical between lloyd and elkan algorithms + rnd = np.random.RandomState(global_random_seed) + if distribution == "normal": + X = rnd.normal(size=(5000, 10)) + else: + X, _ = make_blobs(random_state=rnd) + X[X < 0] = 0 + X = array_constr(X) + + km_lloyd = KMeans(n_clusters=5, random_state=global_random_seed, n_init=1, tol=tol) + km_elkan = KMeans( + algorithm="elkan", + n_clusters=5, + random_state=global_random_seed, + n_init=1, + tol=tol, + ) + + km_lloyd.fit(X) + km_elkan.fit(X) + assert_allclose(km_elkan.cluster_centers_, km_lloyd.cluster_centers_) + assert_array_equal(km_elkan.labels_, km_lloyd.labels_) + assert km_elkan.n_iter_ == km_lloyd.n_iter_ + assert km_elkan.inertia_ == pytest.approx(km_lloyd.inertia_, rel=1e-6) + + +@pytest.mark.parametrize("algorithm", ["lloyd", "elkan"]) +def test_kmeans_convergence(algorithm, global_random_seed): + # Check that KMeans stops when convergence is reached when tol=0. 
(#16075) + rnd = np.random.RandomState(global_random_seed) + X = rnd.normal(size=(5000, 10)) + max_iter = 300 + + km = KMeans( + algorithm=algorithm, + n_clusters=5, + random_state=global_random_seed, + n_init=1, + tol=0, + max_iter=max_iter, + ).fit(X) + + assert km.n_iter_ < max_iter + + +@pytest.mark.parametrize("X_csr", X_as_any_csr) +def test_minibatch_update_consistency(X_csr, global_random_seed): + # Check that dense and sparse minibatch update give the same results + rng = np.random.RandomState(global_random_seed) + + centers_old = centers + rng.normal(size=centers.shape) + centers_old_csr = centers_old.copy() + + centers_new = np.zeros_like(centers_old) + centers_new_csr = np.zeros_like(centers_old_csr) + + weight_sums = np.zeros(centers_old.shape[0], dtype=X.dtype) + weight_sums_csr = np.zeros(centers_old.shape[0], dtype=X.dtype) + + sample_weight = np.ones(X.shape[0], dtype=X.dtype) + + # extract a small minibatch + X_mb = X[:10] + X_mb_csr = X_csr[:10] + sample_weight_mb = sample_weight[:10] + + # step 1: compute the dense minibatch update + old_inertia = _mini_batch_step( + X_mb, + sample_weight_mb, + centers_old, + centers_new, + weight_sums, + np.random.RandomState(global_random_seed), + random_reassign=False, + ) + assert old_inertia > 0.0 + + # compute the new inertia on the same batch to check that it decreased + labels, new_inertia = _labels_inertia(X_mb, sample_weight_mb, centers_new) + assert new_inertia > 0.0 + assert new_inertia < old_inertia + + # step 2: compute the sparse minibatch update + old_inertia_csr = _mini_batch_step( + X_mb_csr, + sample_weight_mb, + centers_old_csr, + centers_new_csr, + weight_sums_csr, + np.random.RandomState(global_random_seed), + random_reassign=False, + ) + assert old_inertia_csr > 0.0 + + # compute the new inertia on the same batch to check that it decreased + labels_csr, new_inertia_csr = _labels_inertia( + X_mb_csr, sample_weight_mb, centers_new_csr + ) + assert new_inertia_csr > 0.0 + assert new_inertia_csr < old_inertia_csr + + # step 3: check that sparse and dense updates lead to the same results + assert_array_equal(labels, labels_csr) + assert_allclose(centers_new, centers_new_csr) + assert_allclose(old_inertia, old_inertia_csr) + assert_allclose(new_inertia, new_inertia_csr) + + +def _check_fitted_model(km): + # check that the number of clusters centers and distinct labels match + # the expectation + centers = km.cluster_centers_ + assert centers.shape == (n_clusters, n_features) + + labels = km.labels_ + assert np.unique(labels).shape[0] == n_clusters + + # check that the labels assignment are perfect (up to a permutation) + assert_allclose(v_measure_score(true_labels, labels), 1.0) + assert km.inertia_ > 0.0 + + +@pytest.mark.parametrize( + "input_data", + [X] + X_as_any_csr, + ids=data_containers_ids, +) +@pytest.mark.parametrize( + "init", + ["random", "k-means++", centers, lambda X, k, random_state: centers], + ids=["random", "k-means++", "ndarray", "callable"], +) +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_all_init(Estimator, input_data, init): + # Check KMeans and MiniBatchKMeans with all possible init. 
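+ # Background: the parametrization above covers the four supported `init`
+ # forms: "random" (n_clusters samples drawn from X), "k-means++"
+ # (probabilistic seeding that spreads centers by squared distance), an
+ # explicit (n_clusters, n_features) array, and a callable taking
+ # (X, n_clusters, random_state) and returning such an array. String inits
+ # are restarted n_init times, while explicit centers need a single init,
+ # hence the switch below. Illustrative sketch (hypothetical values):
+ #   KMeans(n_clusters=3, init="k-means++", n_init=10, random_state=0).fit(X)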
+ n_init = 10 if isinstance(init, str) else 1 + km = Estimator( + init=init, n_clusters=n_clusters, random_state=42, n_init=n_init + ).fit(input_data) + _check_fitted_model(km) + + +@pytest.mark.parametrize( + "init", + ["random", "k-means++", centers, lambda X, k, random_state: centers], + ids=["random", "k-means++", "ndarray", "callable"], +) +def test_minibatch_kmeans_partial_fit_init(init): + # Check MiniBatchKMeans init with partial_fit + n_init = 10 if isinstance(init, str) else 1 + km = MiniBatchKMeans( + init=init, n_clusters=n_clusters, random_state=0, n_init=n_init + ) + for i in range(100): + # "random" init requires many batches to recover the true labels. + km.partial_fit(X) + _check_fitted_model(km) + + +@pytest.mark.parametrize( + "init, expected_n_init", + [ + ("k-means++", 1), + ("random", "default"), + ( + lambda X, n_clusters, random_state: random_state.uniform( + size=(n_clusters, X.shape[1]) + ), + "default", + ), + ("array-like", 1), + ], +) +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_kmeans_init_auto_with_initial_centroids(Estimator, init, expected_n_init): + """Check that `n_init="auto"` chooses the right number of initializations. + Non-regression test for #26657: + https://github.com/scikit-learn/scikit-learn/pull/26657 + """ + n_sample, n_features, n_clusters = 100, 10, 5 + X = np.random.randn(n_sample, n_features) + if init == "array-like": + init = np.random.randn(n_clusters, n_features) + if expected_n_init == "default": + expected_n_init = 3 if Estimator is MiniBatchKMeans else 10 + + kmeans = Estimator(n_clusters=n_clusters, init=init, n_init="auto").fit(X) + assert kmeans._n_init == expected_n_init + + +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_fortran_aligned_data(Estimator, global_random_seed): + # Check that KMeans works with fortran-aligned data. + X_fortran = np.asfortranarray(X) + centers_fortran = np.asfortranarray(centers) + + km_c = Estimator( + n_clusters=n_clusters, init=centers, n_init=1, random_state=global_random_seed + ).fit(X) + km_f = Estimator( + n_clusters=n_clusters, + init=centers_fortran, + n_init=1, + random_state=global_random_seed, + ).fit(X_fortran) + assert_allclose(km_c.cluster_centers_, km_f.cluster_centers_) + assert_array_equal(km_c.labels_, km_f.labels_) + + +def test_minibatch_kmeans_verbose(): + # Check verbose mode of MiniBatchKMeans for better coverage. + km = MiniBatchKMeans(n_clusters=n_clusters, random_state=42, verbose=1) + old_stdout = sys.stdout + sys.stdout = StringIO() + try: + km.fit(X) + finally: + sys.stdout = old_stdout + + +@pytest.mark.parametrize("algorithm", ["lloyd", "elkan"]) +@pytest.mark.parametrize("tol", [1e-2, 0]) +def test_kmeans_verbose(algorithm, tol, capsys): + # Check verbose mode of KMeans for better coverage. 
+ X = np.random.RandomState(0).normal(size=(5000, 10)) + + KMeans( + algorithm=algorithm, + n_clusters=n_clusters, + random_state=42, + init="random", + n_init=1, + tol=tol, + verbose=1, + ).fit(X) + + captured = capsys.readouterr() + + assert re.search(r"Initialization complete", captured.out) + assert re.search(r"Iteration [0-9]+, inertia", captured.out) + + if tol == 0: + assert re.search(r"strict convergence", captured.out) + else: + assert re.search(r"center shift .* within tolerance", captured.out) + + +def test_minibatch_kmeans_warning_init_size(): + # Check that a warning is raised when init_size is smaller than n_clusters + with pytest.warns( + RuntimeWarning, match=r"init_size.* should be larger than n_clusters" + ): + MiniBatchKMeans(init_size=10, n_clusters=20).fit(X) + + +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_warning_n_init_precomputed_centers(Estimator): + # Check that a warning is raised when n_init > 1 and an array is passed for + # the init parameter. + with pytest.warns( + RuntimeWarning, + match="Explicit initial center position passed: performing only one init", + ): + Estimator(init=centers, n_clusters=n_clusters, n_init=10).fit(X) + + +def test_minibatch_sensible_reassign(global_random_seed): + # check that identical initial clusters are reassigned + # also a regression test for when there are more desired reassignments than + # samples. + zeroed_X, true_labels = make_blobs( + n_samples=100, centers=5, random_state=global_random_seed + ) + zeroed_X[::2, :] = 0 + + km = MiniBatchKMeans( + n_clusters=20, batch_size=10, random_state=global_random_seed, init="random" + ).fit(zeroed_X) + # there should not be too many exact zero cluster centers + num_non_zero_clusters = km.cluster_centers_.any(axis=1).sum() + assert num_non_zero_clusters > 9, f"{num_non_zero_clusters=} is too small" + + # do the same with batch-size > X.shape[0] (regression test) + km = MiniBatchKMeans( + n_clusters=20, batch_size=200, random_state=global_random_seed, init="random" + ).fit(zeroed_X) + # there should not be too many exact zero cluster centers + num_non_zero_clusters = km.cluster_centers_.any(axis=1).sum() + assert num_non_zero_clusters > 9, f"{num_non_zero_clusters=} is too small" + + # do the same with partial_fit API + km = MiniBatchKMeans(n_clusters=20, random_state=global_random_seed, init="random") + for i in range(100): + km.partial_fit(zeroed_X) + # there should not be too many exact zero cluster centers + num_non_zero_clusters = km.cluster_centers_.any(axis=1).sum() + assert num_non_zero_clusters > 9, f"{num_non_zero_clusters=} is too small" + + +@pytest.mark.parametrize( + "input_data", + [X] + X_as_any_csr, + ids=data_containers_ids, +) +def test_minibatch_reassign(input_data, global_random_seed): + # Check the reassignment part of the minibatch step with very high or very + # low reassignment ratio. 
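+    # Start from the "perfect" centers (per-cluster means of the true labels): a
+    # large reassignment_ratio should then degrade them, while a tiny one should
+    # leave them untouched.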
+ perfect_centers = np.empty((n_clusters, n_features)) + for i in range(n_clusters): + perfect_centers[i] = X[true_labels == i].mean(axis=0) + + sample_weight = np.ones(n_samples) + centers_new = np.empty_like(perfect_centers) + + # Give a perfect initialization, but a large reassignment_ratio, as a + # result many centers should be reassigned and the model should no longer + # be good + score_before = -_labels_inertia(input_data, sample_weight, perfect_centers, 1)[1] + + _mini_batch_step( + input_data, + sample_weight, + perfect_centers, + centers_new, + np.zeros(n_clusters), + np.random.RandomState(global_random_seed), + random_reassign=True, + reassignment_ratio=1, + ) + + score_after = -_labels_inertia(input_data, sample_weight, centers_new, 1)[1] + + assert score_before > score_after + + # Give a perfect initialization, with a small reassignment_ratio, + # no center should be reassigned. + _mini_batch_step( + input_data, + sample_weight, + perfect_centers, + centers_new, + np.zeros(n_clusters), + np.random.RandomState(global_random_seed), + random_reassign=True, + reassignment_ratio=1e-15, + ) + + assert_allclose(centers_new, perfect_centers) + + +def test_minibatch_with_many_reassignments(): + # Test for the case that the number of clusters to reassign is bigger + # than the batch_size. Run the test with 100 clusters and a batch_size of + # 10 because it turned out that these values ensure that the number of + # clusters to reassign is always bigger than the batch_size. + MiniBatchKMeans( + n_clusters=100, + batch_size=10, + init_size=n_samples, + random_state=42, + verbose=True, + ).fit(X) + + +def test_minibatch_kmeans_init_size(): + # Check the internal _init_size attribute of MiniBatchKMeans + + # default init size should be 3 * batch_size + km = MiniBatchKMeans(n_clusters=10, batch_size=5, n_init=1).fit(X) + assert km._init_size == 15 + + # if 3 * batch size < n_clusters, it should then be 3 * n_clusters + km = MiniBatchKMeans(n_clusters=10, batch_size=1, n_init=1).fit(X) + assert km._init_size == 30 + + # it should not be larger than n_samples + km = MiniBatchKMeans( + n_clusters=10, batch_size=5, n_init=1, init_size=n_samples + 1 + ).fit(X) + assert km._init_size == n_samples + + +@pytest.mark.parametrize("tol, max_no_improvement", [(1e-4, None), (0, 10)]) +def test_minibatch_declared_convergence(capsys, tol, max_no_improvement): + # Check convergence detection based on ewa batch inertia or on + # small center change. + X, _, centers = make_blobs(centers=3, random_state=0, return_centers=True) + + km = MiniBatchKMeans( + n_clusters=3, + init=centers, + batch_size=20, + tol=tol, + random_state=0, + max_iter=10, + n_init=1, + verbose=1, + max_no_improvement=max_no_improvement, + ) + + km.fit(X) + assert 1 < km.n_iter_ < 10 + + captured = capsys.readouterr() + if max_no_improvement is None: + assert "Converged (small centers change)" in captured.out + if tol == 0: + assert "Converged (lack of improvement in inertia)" in captured.out + + +def test_minibatch_iter_steps(): + # Check consistency of n_iter_ and n_steps_ attributes. 
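+    # For instance (illustrative numbers only), with 100 samples and a batch size
+    # of 30, 7 minibatch steps process 210 samples and therefore span
+    # ceil(7 * 30 / 100) = 3 started epochs.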
+ batch_size = 30 + n_samples = X.shape[0] + km = MiniBatchKMeans(n_clusters=3, batch_size=batch_size, random_state=0).fit(X) + + # n_iter_ is the number of started epochs + assert km.n_iter_ == np.ceil((km.n_steps_ * batch_size) / n_samples) + assert isinstance(km.n_iter_, int) + + # without stopping condition, max_iter should be reached + km = MiniBatchKMeans( + n_clusters=3, + batch_size=batch_size, + random_state=0, + tol=0, + max_no_improvement=None, + max_iter=10, + ).fit(X) + + assert km.n_iter_ == 10 + assert km.n_steps_ == (10 * n_samples) // batch_size + assert isinstance(km.n_steps_, int) + + +def test_kmeans_copyx(): + # Check that copy_x=False returns nearly equal X after de-centering. + my_X = X.copy() + km = KMeans(copy_x=False, n_clusters=n_clusters, random_state=42) + km.fit(my_X) + _check_fitted_model(km) + + # check that my_X is de-centered + assert_allclose(my_X, X) + + +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_score_max_iter(Estimator, global_random_seed): + # Check that fitting KMeans or MiniBatchKMeans with more iterations gives + # better score + X = np.random.RandomState(global_random_seed).randn(100, 10) + + km1 = Estimator(n_init=1, random_state=global_random_seed, max_iter=1) + s1 = km1.fit(X).score(X) + km2 = Estimator(n_init=1, random_state=global_random_seed, max_iter=10) + s2 = km2.fit(X).score(X) + assert s2 > s1 + + +@pytest.mark.parametrize("array_constr", data_containers, ids=data_containers_ids) +@pytest.mark.parametrize( + "Estimator, algorithm", + [(KMeans, "lloyd"), (KMeans, "elkan"), (MiniBatchKMeans, None)], +) +@pytest.mark.parametrize("max_iter", [2, 100]) +def test_kmeans_predict( + Estimator, algorithm, array_constr, max_iter, global_dtype, global_random_seed +): + # Check the predict method and the equivalence between fit.predict and + # fit_predict. + X, _ = make_blobs( + n_samples=200, n_features=10, centers=10, random_state=global_random_seed + ) + X = array_constr(X, dtype=global_dtype) + + km = Estimator( + n_clusters=10, + init="random", + n_init=10, + max_iter=max_iter, + random_state=global_random_seed, + ) + if algorithm is not None: + km.set_params(algorithm=algorithm) + km.fit(X) + labels = km.labels_ + + # re-predict labels for training set using predict + pred = km.predict(X) + assert_array_equal(pred, labels) + + # re-predict labels for training set using fit_predict + pred = km.fit_predict(X) + assert_array_equal(pred, labels) + + # predict centroid labels + pred = km.predict(km.cluster_centers_) + assert_array_equal(pred, np.arange(10)) + + +@pytest.mark.parametrize("X_csr", X_as_any_csr) +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_dense_sparse(Estimator, X_csr, global_random_seed): + # Check that the results are the same for dense and sparse input. 
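+    # Use a random, non-uniform sample_weight so the dense/sparse equivalence also
+    # exercises the weighted code path.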
+    sample_weight = np.random.RandomState(global_random_seed).random_sample(
+        (n_samples,)
+    )
+    km_dense = Estimator(
+        n_clusters=n_clusters, random_state=global_random_seed, n_init=1
+    )
+    km_dense.fit(X, sample_weight=sample_weight)
+    km_sparse = Estimator(
+        n_clusters=n_clusters, random_state=global_random_seed, n_init=1
+    )
+    km_sparse.fit(X_csr, sample_weight=sample_weight)
+
+    assert_array_equal(km_dense.labels_, km_sparse.labels_)
+    assert_allclose(km_dense.cluster_centers_, km_sparse.cluster_centers_)
+
+
+@pytest.mark.parametrize("X_csr", X_as_any_csr)
+@pytest.mark.parametrize(
+    "init", ["random", "k-means++", centers], ids=["random", "k-means++", "ndarray"]
+)
+@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans])
+def test_predict_dense_sparse(Estimator, init, X_csr):
+    # check that models trained on sparse input also work for dense input at
+    # predict time and vice versa.
+    n_init = 10 if isinstance(init, str) else 1
+    km = Estimator(n_clusters=n_clusters, init=init, n_init=n_init, random_state=0)
+
+    km.fit(X_csr)
+    assert_array_equal(km.predict(X), km.labels_)
+
+    km.fit(X)
+    assert_array_equal(km.predict(X_csr), km.labels_)
+
+
+@pytest.mark.parametrize("array_constr", data_containers, ids=data_containers_ids)
+@pytest.mark.parametrize("dtype", [np.int32, np.int64])
+@pytest.mark.parametrize("init", ["k-means++", "ndarray"])
+@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans])
+def test_integer_input(Estimator, array_constr, dtype, init, global_random_seed):
+    # Check that KMeans and MiniBatchKMeans work with integer input.
+    X_dense = np.array([[0, 0], [10, 10], [12, 9], [-1, 1], [2, 0], [8, 10]])
+    X = array_constr(X_dense, dtype=dtype)
+
+    n_init = 1 if init == "ndarray" else 10
+    init = X_dense[:2] if init == "ndarray" else init
+
+    km = Estimator(
+        n_clusters=2, init=init, n_init=n_init, random_state=global_random_seed
+    )
+    if Estimator is MiniBatchKMeans:
+        km.set_params(batch_size=2)
+
+    km.fit(X)
+
+    # Internally integer input should be converted to float64
+    assert km.cluster_centers_.dtype == np.float64
+
+    expected_labels = [0, 1, 1, 0, 0, 1]
+    assert_allclose(v_measure_score(km.labels_, expected_labels), 1.0)
+
+    # Same with partial_fit (#14314)
+    if Estimator is MiniBatchKMeans:
+        km = clone(km).partial_fit(X)
+        assert km.cluster_centers_.dtype == np.float64
+
+
+@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans])
+def test_transform(Estimator, global_random_seed):
+    # Check the transform method
+    km = Estimator(n_clusters=n_clusters, random_state=global_random_seed).fit(X)
+
+    # Transforming cluster_centers_ should return the pairwise distances
+    # between centers
+    Xt = km.transform(km.cluster_centers_)
+    assert_allclose(Xt, pairwise_distances(km.cluster_centers_))
+    # In particular, diagonal must be 0
+    assert_array_equal(Xt.diagonal(), np.zeros(n_clusters))
+
+    # Transforming X should return the pairwise distances between X and the
+    # centers
+    Xt = km.transform(X)
+    assert_allclose(Xt, pairwise_distances(X, km.cluster_centers_))
+
+
+@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans])
+def test_fit_transform(Estimator, global_random_seed):
+    # Check equivalence between fit.transform and fit_transform
+    X1 = Estimator(random_state=global_random_seed, n_init=1).fit(X).transform(X)
+    X2 = Estimator(random_state=global_random_seed, n_init=1).fit_transform(X)
+    assert_allclose(X1, X2)
+
+
+def test_n_init(global_random_seed):
+    # Check that increasing the number of init increases the quality
+    previous_inertia = np.inf
+    for n_init in [1, 5, 10]:
+        # set max_iter=1 to avoid finding the global minimum and get the same
+        # inertia each time
+        km = KMeans(
+            n_clusters=n_clusters,
+            init="random",
+            n_init=n_init,
+            random_state=global_random_seed,
+            max_iter=1,
+        ).fit(X)
+        assert km.inertia_ <= previous_inertia
+        previous_inertia = km.inertia_
+
+
+def test_k_means_function(global_random_seed):
+    # test calling the k_means function directly
+    cluster_centers, labels, inertia = k_means(
+        X, n_clusters=n_clusters, sample_weight=None, random_state=global_random_seed
+    )
+
+    assert cluster_centers.shape == (n_clusters, n_features)
+    assert np.unique(labels).shape[0] == n_clusters
+
+    # check that the labels assignment are perfect (up to a permutation)
+    assert_allclose(v_measure_score(true_labels, labels), 1.0)
+    assert inertia > 0.0
+
+
+@pytest.mark.parametrize(
+    "input_data",
+    [X] + X_as_any_csr,
+    ids=data_containers_ids,
+)
+@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans])
+def test_float_precision(Estimator, input_data, global_random_seed):
+    # Check that the results are the same for single and double precision.
+    km = Estimator(n_init=1, random_state=global_random_seed)
+
+    inertia = {}
+    Xt = {}
+    centers = {}
+    labels = {}
+
+    for dtype in [np.float64, np.float32]:
+        X = input_data.astype(dtype, copy=False)
+        km.fit(X)
+
+        inertia[dtype] = km.inertia_
+        Xt[dtype] = km.transform(X)
+        centers[dtype] = km.cluster_centers_
+        labels[dtype] = km.labels_
+
+        # dtype of cluster centers has to be the dtype of the input data
+        assert km.cluster_centers_.dtype == dtype
+
+        # same with partial_fit
+        if Estimator is MiniBatchKMeans:
+            km.partial_fit(X[0:3])
+            assert km.cluster_centers_.dtype == dtype
+
+    # compare arrays with low precision since the difference between 32 and
+    # 64 bit comes from an accumulation of rounding errors.
+    assert_allclose(inertia[np.float32], inertia[np.float64], rtol=1e-4)
+    assert_allclose(Xt[np.float32], Xt[np.float64], atol=Xt[np.float64].max() * 1e-4)
+    assert_allclose(
+        centers[np.float32], centers[np.float64], atol=centers[np.float64].max() * 1e-4
+    )
+    assert_array_equal(labels[np.float32], labels[np.float64])
+
+
+@pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64])
+@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans])
+def test_centers_not_mutated(Estimator, dtype):
+    # Check that KMeans and MiniBatchKMeans won't mutate the user provided
+    # init centers silently even if input data and init centers have the same
+    # type.
+ X_new_type = X.astype(dtype, copy=False) + centers_new_type = centers.astype(dtype, copy=False) + + km = Estimator(init=centers_new_type, n_clusters=n_clusters, n_init=1) + km.fit(X_new_type) + + assert not np.may_share_memory(km.cluster_centers_, centers_new_type) + + +@pytest.mark.parametrize( + "input_data", + [X] + X_as_any_csr, + ids=data_containers_ids, +) +def test_kmeans_init_fitted_centers(input_data): + # Check that starting fitting from a local optimum shouldn't change the + # solution + km1 = KMeans(n_clusters=n_clusters).fit(input_data) + km2 = KMeans(n_clusters=n_clusters, init=km1.cluster_centers_, n_init=1).fit( + input_data + ) + + assert_allclose(km1.cluster_centers_, km2.cluster_centers_) + + +def test_kmeans_warns_less_centers_than_unique_points(global_random_seed): + # Check KMeans when the number of found clusters is smaller than expected + X = np.asarray([[0, 0], [0, 1], [1, 0], [1, 0]]) # last point is duplicated + km = KMeans(n_clusters=4, random_state=global_random_seed) + + # KMeans should warn that fewer labels than cluster centers have been used + msg = ( + r"Number of distinct clusters \(3\) found smaller than " + r"n_clusters \(4\). Possibly due to duplicate points in X." + ) + with pytest.warns(ConvergenceWarning, match=msg): + km.fit(X) + # only three distinct points, so only three clusters + # can have points assigned to them + assert set(km.labels_) == set(range(3)) + + +def _sort_centers(centers): + return np.sort(centers, axis=0) + + +def test_weighted_vs_repeated(global_random_seed): + # Check that a sample weight of N should yield the same result as an N-fold + # repetition of the sample. Valid only if init is precomputed, otherwise + # rng produces different results. Not valid for MinibatchKMeans due to rng + # to extract minibatches. + sample_weight = np.random.RandomState(global_random_seed).randint( + 1, 5, size=n_samples + ) + X_repeat = np.repeat(X, sample_weight, axis=0) + + km = KMeans( + init=centers, n_init=1, n_clusters=n_clusters, random_state=global_random_seed + ) + + km_weighted = clone(km).fit(X, sample_weight=sample_weight) + repeated_labels = np.repeat(km_weighted.labels_, sample_weight) + km_repeated = clone(km).fit(X_repeat) + + assert_array_equal(km_repeated.labels_, repeated_labels) + assert_allclose(km_weighted.inertia_, km_repeated.inertia_) + assert_allclose( + _sort_centers(km_weighted.cluster_centers_), + _sort_centers(km_repeated.cluster_centers_), + ) + + +@pytest.mark.parametrize( + "input_data", + [X] + X_as_any_csr, + ids=data_containers_ids, +) +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_unit_weights_vs_no_weights(Estimator, input_data, global_random_seed): + # Check that not passing sample weights should be equivalent to passing + # sample weights all equal to one. 
+ sample_weight = np.ones(n_samples) + + km = Estimator(n_clusters=n_clusters, random_state=global_random_seed, n_init=1) + km_none = clone(km).fit(input_data, sample_weight=None) + km_ones = clone(km).fit(input_data, sample_weight=sample_weight) + + assert_array_equal(km_none.labels_, km_ones.labels_) + assert_allclose(km_none.cluster_centers_, km_ones.cluster_centers_) + + +@pytest.mark.parametrize( + "input_data", + [X] + X_as_any_csr, + ids=data_containers_ids, +) +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_scaled_weights(Estimator, input_data, global_random_seed): + # Check that scaling all sample weights by a common factor + # shouldn't change the result + sample_weight = np.random.RandomState(global_random_seed).uniform(size=n_samples) + + km = Estimator(n_clusters=n_clusters, random_state=global_random_seed, n_init=1) + km_orig = clone(km).fit(input_data, sample_weight=sample_weight) + km_scaled = clone(km).fit(input_data, sample_weight=0.5 * sample_weight) + + assert_array_equal(km_orig.labels_, km_scaled.labels_) + assert_allclose(km_orig.cluster_centers_, km_scaled.cluster_centers_) + + +def test_kmeans_elkan_iter_attribute(): + # Regression test on bad n_iter_ value. Previous bug n_iter_ was one off + # it's right value (#11340). + km = KMeans(algorithm="elkan", max_iter=1).fit(X) + assert km.n_iter_ == 1 + + +@pytest.mark.parametrize("array_constr", data_containers, ids=data_containers_ids) +def test_kmeans_empty_cluster_relocated(array_constr): + # check that empty clusters are correctly relocated when using sample + # weights (#13486) + X = array_constr([[-1], [1]]) + sample_weight = [1.9, 0.1] + init = np.array([[-1], [10]]) + + km = KMeans(n_clusters=2, init=init, n_init=1) + km.fit(X, sample_weight=sample_weight) + + assert len(set(km.labels_)) == 2 + assert_allclose(km.cluster_centers_, [[-1], [1]]) + + +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_result_equal_in_diff_n_threads(Estimator, global_random_seed): + # Check that KMeans/MiniBatchKMeans give the same results in parallel mode + # than in sequential mode. + rnd = np.random.RandomState(global_random_seed) + X = rnd.normal(size=(50, 10)) + + with _get_threadpool_controller().limit(limits=1, user_api="openmp"): + result_1 = ( + Estimator(n_clusters=n_clusters, random_state=global_random_seed) + .fit(X) + .labels_ + ) + with _get_threadpool_controller().limit(limits=2, user_api="openmp"): + result_2 = ( + Estimator(n_clusters=n_clusters, random_state=global_random_seed) + .fit(X) + .labels_ + ) + assert_array_equal(result_1, result_2) + + +def test_warning_elkan_1_cluster(): + # Check warning messages specific to KMeans + with pytest.warns( + RuntimeWarning, + match="algorithm='elkan' doesn't make sense for a single cluster", + ): + KMeans(n_clusters=1, algorithm="elkan").fit(X) + + +@pytest.mark.parametrize("array_constr", data_containers, ids=data_containers_ids) +@pytest.mark.parametrize("algo", ["lloyd", "elkan"]) +def test_k_means_1_iteration(array_constr, algo, global_random_seed): + # check the results after a single iteration (E-step M-step E-step) by + # comparing against a pure python implementation. 
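+    # The pure python reference below does one assignment (E) step, one
+    # center-update (M) step and a final assignment, which is what
+    # KMeans(max_iter=1) reports through labels_ and cluster_centers_.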
+ X = np.random.RandomState(global_random_seed).uniform(size=(100, 5)) + init_centers = X[:5] + X = array_constr(X) + + def py_kmeans(X, init): + new_centers = init.copy() + labels = pairwise_distances_argmin(X, init) + for label in range(init.shape[0]): + new_centers[label] = X[labels == label].mean(axis=0) + labels = pairwise_distances_argmin(X, new_centers) + return labels, new_centers + + py_labels, py_centers = py_kmeans(X, init_centers) + + cy_kmeans = KMeans( + n_clusters=5, n_init=1, init=init_centers, algorithm=algo, max_iter=1 + ).fit(X) + cy_labels = cy_kmeans.labels_ + cy_centers = cy_kmeans.cluster_centers_ + + assert_array_equal(py_labels, cy_labels) + assert_allclose(py_centers, cy_centers) + + +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("squared", [True, False]) +def test_euclidean_distance(dtype, squared, global_random_seed): + # Check that the _euclidean_(dense/sparse)_dense helpers produce correct + # results + rng = np.random.RandomState(global_random_seed) + a_sparse = sp.random( + 1, 100, density=0.5, format="csr", random_state=rng, dtype=dtype + ) + a_dense = a_sparse.toarray().reshape(-1) + b = rng.randn(100).astype(dtype, copy=False) + b_squared_norm = (b**2).sum() + + expected = ((a_dense - b) ** 2).sum() + expected = expected if squared else np.sqrt(expected) + + distance_dense_dense = _euclidean_dense_dense_wrapper(a_dense, b, squared) + distance_sparse_dense = _euclidean_sparse_dense_wrapper( + a_sparse.data, a_sparse.indices, b, b_squared_norm, squared + ) + + rtol = 1e-4 if dtype == np.float32 else 1e-7 + assert_allclose(distance_dense_dense, distance_sparse_dense, rtol=rtol) + assert_allclose(distance_dense_dense, expected, rtol=rtol) + assert_allclose(distance_sparse_dense, expected, rtol=rtol) + + +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_inertia(dtype, global_random_seed): + # Check that the _inertia_(dense/sparse) helpers produce correct results. + rng = np.random.RandomState(global_random_seed) + X_sparse = sp.random( + 100, 10, density=0.5, format="csr", random_state=rng, dtype=dtype + ) + X_dense = X_sparse.toarray() + sample_weight = rng.randn(100).astype(dtype, copy=False) + centers = rng.randn(5, 10).astype(dtype, copy=False) + labels = rng.randint(5, size=100, dtype=np.int32) + + distances = ((X_dense - centers[labels]) ** 2).sum(axis=1) + expected = np.sum(distances * sample_weight) + + inertia_dense = _inertia_dense(X_dense, sample_weight, centers, labels, n_threads=1) + inertia_sparse = _inertia_sparse( + X_sparse, sample_weight, centers, labels, n_threads=1 + ) + + rtol = 1e-4 if dtype == np.float32 else 1e-6 + assert_allclose(inertia_dense, inertia_sparse, rtol=rtol) + assert_allclose(inertia_dense, expected, rtol=rtol) + assert_allclose(inertia_sparse, expected, rtol=rtol) + + # Check the single_label parameter. 
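+    # With single_label, only samples whose label equals the requested value should
+    # contribute; the expectation below is recomputed on that mask.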
+    label = 1
+    mask = labels == label
+    distances = ((X_dense[mask] - centers[label]) ** 2).sum(axis=1)
+    expected = np.sum(distances * sample_weight[mask])
+
+    inertia_dense = _inertia_dense(
+        X_dense, sample_weight, centers, labels, n_threads=1, single_label=label
+    )
+    inertia_sparse = _inertia_sparse(
+        X_sparse, sample_weight, centers, labels, n_threads=1, single_label=label
+    )
+
+    assert_allclose(inertia_dense, inertia_sparse, rtol=rtol)
+    assert_allclose(inertia_dense, expected, rtol=rtol)
+    assert_allclose(inertia_sparse, expected, rtol=rtol)
+
+
+@pytest.mark.parametrize("Klass, default_n_init", [(KMeans, 10), (MiniBatchKMeans, 3)])
+def test_n_init_auto(Klass, default_n_init):
+    est = Klass(n_init="auto", init="k-means++")
+    est.fit(X)
+    assert est._n_init == 1
+
+    est = Klass(n_init="auto", init="random")
+    est.fit(X)
+    assert est._n_init == default_n_init
+
+
+@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans])
+def test_sample_weight_unchanged(Estimator):
+    # Check that sample_weight is not modified in place by KMeans (#17204)
+    X = np.array([[1], [2], [4]])
+    sample_weight = np.array([0.5, 0.2, 0.3])
+    Estimator(n_clusters=2, random_state=0).fit(X, sample_weight=sample_weight)
+
+    assert_array_equal(sample_weight, np.array([0.5, 0.2, 0.3]))
+
+
+@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans])
+@pytest.mark.parametrize(
+    "param, match",
+    [
+        ({"n_clusters": n_samples + 1}, r"n_samples.* should be >= n_clusters"),
+        (
+            {"init": X[:2]},
+            r"The shape of the initial centers .* does not match "
+            r"the number of clusters",
+        ),
+        (
+            {"init": lambda X_, k, random_state: X_[:2]},
+            r"The shape of the initial centers .* does not match "
+            r"the number of clusters",
+        ),
+        (
+            {"init": X[:8, :2]},
+            r"The shape of the initial centers .* does not match "
+            r"the number of features of the data",
+        ),
+        (
+            {"init": lambda X_, k, random_state: X_[:8, :2]},
+            r"The shape of the initial centers .* does not match "
+            r"the number of features of the data",
+        ),
+    ],
+)
+def test_wrong_params(Estimator, param, match):
+    # Check that errors are raised with a clear error message when wrong values
+    # are passed for the parameters
+    # Set n_init=1 by default to avoid warning with precomputed init
+    km = Estimator(n_init=1)
+    with pytest.raises(ValueError, match=match):
+        km.set_params(**param).fit(X)
+
+
+@pytest.mark.parametrize(
+    "param, match",
+    [
+        (
+            {"x_squared_norms": X[:2]},
+            r"The length of x_squared_norms .* should "
+            r"be equal to the length of n_samples",
+        ),
+    ],
+)
+def test_kmeans_plusplus_wrong_params(param, match):
+    with pytest.raises(ValueError, match=match):
+        kmeans_plusplus(X, n_clusters, **param)
+
+
+@pytest.mark.parametrize(
+    "input_data",
+    [X] + X_as_any_csr,
+)
+@pytest.mark.parametrize("dtype", [np.float64, np.float32])
+def test_kmeans_plusplus_output(input_data, dtype, global_random_seed):
+    # Check for the correct number of seeds and all positive values
+    data = input_data.astype(dtype)
+    centers, indices = kmeans_plusplus(
+        data, n_clusters, random_state=global_random_seed
+    )
+
+    # Check there are the correct number of indices and that all indices are
+    # positive and within the number of samples
+    assert indices.shape[0] == n_clusters
+    assert (indices >= 0).all()
+    assert (indices <= data.shape[0]).all()
+
+    # Check for the correct number of seeds and that they are bound by the data
+    assert centers.shape[0] == n_clusters
+    assert (centers.max(axis=0) <= data.max(axis=0)).all()
+    assert 
(centers.min(axis=0) >= data.min(axis=0)).all() + + # Check that indices correspond to reported centers + # Use X for comparison rather than data, test still works against centers + # calculated with sparse data. + assert_allclose(X[indices].astype(dtype), centers) + + +@pytest.mark.parametrize("x_squared_norms", [row_norms(X, squared=True), None]) +def test_kmeans_plusplus_norms(x_squared_norms): + # Check that defining x_squared_norms returns the same as default=None. + centers, indices = kmeans_plusplus(X, n_clusters, x_squared_norms=x_squared_norms) + + assert_allclose(X[indices], centers) + + +def test_kmeans_plusplus_dataorder(global_random_seed): + # Check that memory layout does not effect result + centers_c, _ = kmeans_plusplus(X, n_clusters, random_state=global_random_seed) + + X_fortran = np.asfortranarray(X) + + centers_fortran, _ = kmeans_plusplus( + X_fortran, n_clusters, random_state=global_random_seed + ) + + assert_allclose(centers_c, centers_fortran) + + +def test_is_same_clustering(): + # Sanity check for the _is_same_clustering utility function + labels1 = np.array([1, 0, 0, 1, 2, 0, 2, 1], dtype=np.int32) + assert _is_same_clustering(labels1, labels1, 3) + + # these other labels represent the same clustering since we can retrieve the first + # labels by simply renaming the labels: 0 -> 1, 1 -> 2, 2 -> 0. + labels2 = np.array([0, 2, 2, 0, 1, 2, 1, 0], dtype=np.int32) + assert _is_same_clustering(labels1, labels2, 3) + + # these other labels do not represent the same clustering since not all ones are + # mapped to a same value + labels3 = np.array([1, 0, 0, 2, 2, 0, 2, 1], dtype=np.int32) + assert not _is_same_clustering(labels1, labels3, 3) + + +@pytest.mark.parametrize( + "kwargs", ({"init": np.str_("k-means++")}, {"init": [[0, 0], [1, 1]], "n_init": 1}) +) +def test_kmeans_with_array_like_or_np_scalar_init(kwargs): + """Check that init works with numpy scalar strings. + + Non-regression test for #21964. + """ + X = np.asarray([[0, 0], [0.5, 0], [0.5, 1], [1, 1]], dtype=np.float64) + + clustering = KMeans(n_clusters=2, **kwargs) + # Does not raise + clustering.fit(X) + + +@pytest.mark.parametrize( + "Klass, method", + [(KMeans, "fit"), (MiniBatchKMeans, "fit"), (MiniBatchKMeans, "partial_fit")], +) +def test_feature_names_out(Klass, method): + """Check `feature_names_out` for `KMeans` and `MiniBatchKMeans`.""" + class_name = Klass.__name__.lower() + kmeans = Klass() + getattr(kmeans, method)(X) + n_clusters = kmeans.cluster_centers_.shape[0] + + names_out = kmeans.get_feature_names_out() + assert_array_equal([f"{class_name}{i}" for i in range(n_clusters)], names_out) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS + [None]) +def test_predict_does_not_change_cluster_centers(csr_container): + """Check that predict does not change cluster centers. + + Non-regression test for gh-24253. + """ + X, _ = make_blobs(n_samples=200, n_features=10, centers=10, random_state=0) + if csr_container is not None: + X = csr_container(X) + + kmeans = KMeans() + y_pred1 = kmeans.fit_predict(X) + # Make cluster_centers readonly + kmeans.cluster_centers_ = create_memmap_backed_data(kmeans.cluster_centers_) + kmeans.labels_ = create_memmap_backed_data(kmeans.labels_) + + y_pred2 = kmeans.predict(X) + assert_array_equal(y_pred1, y_pred2) + + +@pytest.mark.parametrize("init", ["k-means++", "random"]) +def test_sample_weight_init(init, global_random_seed): + """Check that sample weight is used during init. 
+ + `_init_centroids` is shared across all classes inheriting from _BaseKMeans so + it's enough to check for KMeans. + """ + rng = np.random.RandomState(global_random_seed) + X, _ = make_blobs( + n_samples=200, n_features=10, centers=10, random_state=global_random_seed + ) + x_squared_norms = row_norms(X, squared=True) + + kmeans = KMeans() + clusters_weighted = kmeans._init_centroids( + X=X, + x_squared_norms=x_squared_norms, + init=init, + sample_weight=rng.uniform(size=X.shape[0]), + n_centroids=5, + random_state=np.random.RandomState(global_random_seed), + ) + clusters = kmeans._init_centroids( + X=X, + x_squared_norms=x_squared_norms, + init=init, + sample_weight=np.ones(X.shape[0]), + n_centroids=5, + random_state=np.random.RandomState(global_random_seed), + ) + with pytest.raises(AssertionError): + assert_allclose(clusters_weighted, clusters) + + +@pytest.mark.parametrize("init", ["k-means++", "random"]) +def test_sample_weight_zero(init, global_random_seed): + """Check that if sample weight is 0, this sample won't be chosen. + + `_init_centroids` is shared across all classes inheriting from _BaseKMeans so + it's enough to check for KMeans. + """ + rng = np.random.RandomState(global_random_seed) + X, _ = make_blobs( + n_samples=100, n_features=5, centers=5, random_state=global_random_seed + ) + sample_weight = rng.uniform(size=X.shape[0]) + sample_weight[::2] = 0 + x_squared_norms = row_norms(X, squared=True) + + kmeans = KMeans() + clusters_weighted = kmeans._init_centroids( + X=X, + x_squared_norms=x_squared_norms, + init=init, + sample_weight=sample_weight, + n_centroids=10, + random_state=np.random.RandomState(global_random_seed), + ) + # No center should be one of the 0 sample weight point + # (i.e. be at a distance=0 from it) + d = euclidean_distances(X[::2], clusters_weighted) + assert not np.any(np.isclose(d, 0)) + + +@pytest.mark.parametrize("array_constr", data_containers, ids=data_containers_ids) +@pytest.mark.parametrize("algorithm", ["lloyd", "elkan"]) +def test_relocating_with_duplicates(algorithm, array_constr): + """Check that kmeans stops when there are more centers than non-duplicate samples + + Non-regression test for issue: + https://github.com/scikit-learn/scikit-learn/issues/28055 + """ + X = np.array([[0, 0], [1, 1], [1, 1], [1, 0], [0, 1]]) + km = KMeans(n_clusters=5, init=X, algorithm=algorithm) + + msg = r"Number of distinct clusters \(4\) found smaller than n_clusters \(5\)" + with pytest.warns(ConvergenceWarning, match=msg): + km.fit(array_constr(X)) + + assert km.n_iter_ == 1 diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_mean_shift.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_mean_shift.py new file mode 100644 index 0000000000000000000000000000000000000000..7216a064ccbc729de42688a48cae3b0be6e89bfa --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_mean_shift.py @@ -0,0 +1,215 @@ +""" +Testing for mean shift clustering methods + +""" + +import warnings + +import numpy as np +import pytest + +from sklearn.cluster import MeanShift, estimate_bandwidth, get_bin_seeds, mean_shift +from sklearn.datasets import make_blobs +from sklearn.metrics import v_measure_score +from sklearn.utils._testing import assert_allclose, assert_array_equal + +n_clusters = 3 +centers = np.array([[1, 1], [-1, -1], [1, -1]]) + 10 +X, _ = make_blobs( + n_samples=300, + n_features=2, + centers=centers, + cluster_std=0.4, + shuffle=True, + random_state=11, +) + + +def test_convergence_of_1d_constant_data(): 
+ # Test convergence using 1D constant data + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/28926 + model = MeanShift() + n_iter = model.fit(np.ones(10).reshape(-1, 1)).n_iter_ + assert n_iter < model.max_iter + + +def test_estimate_bandwidth(): + # Test estimate_bandwidth + bandwidth = estimate_bandwidth(X, n_samples=200) + assert 0.9 <= bandwidth <= 1.5 + + +def test_estimate_bandwidth_1sample(global_dtype): + # Test estimate_bandwidth when n_samples=1 and quantile<1, so that + # n_neighbors is set to 1. + bandwidth = estimate_bandwidth( + X.astype(global_dtype, copy=False), n_samples=1, quantile=0.3 + ) + + assert bandwidth.dtype == X.dtype + assert bandwidth == pytest.approx(0.0, abs=1e-5) + + +@pytest.mark.parametrize( + "bandwidth, cluster_all, expected, first_cluster_label", + [(1.2, True, 3, 0), (1.2, False, 4, -1)], +) +def test_mean_shift( + global_dtype, bandwidth, cluster_all, expected, first_cluster_label +): + # Test MeanShift algorithm + X_with_global_dtype = X.astype(global_dtype, copy=False) + ms = MeanShift(bandwidth=bandwidth, cluster_all=cluster_all) + labels = ms.fit(X_with_global_dtype).labels_ + labels_unique = np.unique(labels) + n_clusters_ = len(labels_unique) + assert n_clusters_ == expected + assert labels_unique[0] == first_cluster_label + assert ms.cluster_centers_.dtype == global_dtype + + cluster_centers, labels_mean_shift = mean_shift( + X_with_global_dtype, cluster_all=cluster_all + ) + labels_mean_shift_unique = np.unique(labels_mean_shift) + n_clusters_mean_shift = len(labels_mean_shift_unique) + assert n_clusters_mean_shift == expected + assert labels_mean_shift_unique[0] == first_cluster_label + assert cluster_centers.dtype == global_dtype + + +def test_parallel(global_dtype, global_random_seed): + centers = np.array([[1, 1], [-1, -1], [1, -1]]) + 10 + X, _ = make_blobs( + n_samples=50, + n_features=2, + centers=centers, + cluster_std=0.4, + shuffle=True, + random_state=global_random_seed, + ) + + X = X.astype(global_dtype, copy=False) + + ms1 = MeanShift(n_jobs=2) + ms1.fit(X) + + ms2 = MeanShift() + ms2.fit(X) + + assert_allclose(ms1.cluster_centers_, ms2.cluster_centers_) + assert ms1.cluster_centers_.dtype == ms2.cluster_centers_.dtype + assert_array_equal(ms1.labels_, ms2.labels_) + + +def test_meanshift_predict(global_dtype): + # Test MeanShift.predict + ms = MeanShift(bandwidth=1.2) + X_with_global_dtype = X.astype(global_dtype, copy=False) + labels = ms.fit_predict(X_with_global_dtype) + labels2 = ms.predict(X_with_global_dtype) + assert_array_equal(labels, labels2) + + +def test_meanshift_all_orphans(): + # init away from the data, crash with a sensible warning + ms = MeanShift(bandwidth=0.1, seeds=[[-9, -9], [-10, -10]]) + msg = "No point was within bandwidth=0.1" + with pytest.raises(ValueError, match=msg): + ms.fit( + X, + ) + + +def test_unfitted(): + # Non-regression: before fit, there should be not fitted attributes. 
+ ms = MeanShift() + assert not hasattr(ms, "cluster_centers_") + assert not hasattr(ms, "labels_") + + +def test_cluster_intensity_tie(global_dtype): + X = np.array([[1, 1], [2, 1], [1, 0], [4, 7], [3, 5], [3, 6]], dtype=global_dtype) + c1 = MeanShift(bandwidth=2).fit(X) + + X = np.array([[4, 7], [3, 5], [3, 6], [1, 1], [2, 1], [1, 0]], dtype=global_dtype) + c2 = MeanShift(bandwidth=2).fit(X) + assert_array_equal(c1.labels_, [1, 1, 1, 0, 0, 0]) + assert_array_equal(c2.labels_, [0, 0, 0, 1, 1, 1]) + + +def test_bin_seeds(global_dtype): + # Test the bin seeding technique which can be used in the mean shift + # algorithm + # Data is just 6 points in the plane + X = np.array( + [[1.0, 1.0], [1.4, 1.4], [1.8, 1.2], [2.0, 1.0], [2.1, 1.1], [0.0, 0.0]], + dtype=global_dtype, + ) + + # With a bin coarseness of 1.0 and min_bin_freq of 1, 3 bins should be + # found + ground_truth = {(1.0, 1.0), (2.0, 1.0), (0.0, 0.0)} + test_bins = get_bin_seeds(X, 1, 1) + test_result = set(tuple(p) for p in test_bins) + assert len(ground_truth.symmetric_difference(test_result)) == 0 + + # With a bin coarseness of 1.0 and min_bin_freq of 2, 2 bins should be + # found + ground_truth = {(1.0, 1.0), (2.0, 1.0)} + test_bins = get_bin_seeds(X, 1, 2) + test_result = set(tuple(p) for p in test_bins) + assert len(ground_truth.symmetric_difference(test_result)) == 0 + + # With a bin size of 0.01 and min_bin_freq of 1, 6 bins should be found + # we bail and use the whole data here. + with warnings.catch_warnings(record=True): + test_bins = get_bin_seeds(X, 0.01, 1) + assert_allclose(test_bins, X) + + # tight clusters around [0, 0] and [1, 1], only get two bins + X, _ = make_blobs( + n_samples=100, + n_features=2, + centers=[[0, 0], [1, 1]], + cluster_std=0.1, + random_state=0, + ) + X = X.astype(global_dtype, copy=False) + test_bins = get_bin_seeds(X, 1) + assert_array_equal(test_bins, [[0, 0], [1, 1]]) + + +@pytest.mark.parametrize("max_iter", [1, 100]) +def test_max_iter(max_iter): + clusters1, _ = mean_shift(X, max_iter=max_iter) + ms = MeanShift(max_iter=max_iter).fit(X) + clusters2 = ms.cluster_centers_ + + assert ms.n_iter_ <= ms.max_iter + assert len(clusters1) == len(clusters2) + + for c1, c2 in zip(clusters1, clusters2): + assert np.allclose(c1, c2) + + +def test_mean_shift_zero_bandwidth(global_dtype): + # Check that mean shift works when the estimated bandwidth is 0. + X = np.array([1, 1, 1, 2, 2, 2, 3, 3], dtype=global_dtype).reshape(-1, 1) + + # estimate_bandwidth with default args returns 0 on this dataset + bandwidth = estimate_bandwidth(X) + assert bandwidth == 0 + + # get_bin_seeds with a 0 bin_size should return the dataset itself + assert get_bin_seeds(X, bin_size=bandwidth) is X + + # MeanShift with binning and a 0 estimated bandwidth should be equivalent + # to no binning. 
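+    # bandwidth=None lets MeanShift re-estimate the bandwidth internally (0 here,
+    # as asserted above), so bin seeding degenerates to using every sample as a
+    # seed and both fits should agree.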
+ ms_binning = MeanShift(bin_seeding=True, bandwidth=None).fit(X) + ms_nobinning = MeanShift(bin_seeding=False).fit(X) + expected_labels = np.array([0, 0, 0, 1, 1, 1, 2, 2]) + + assert v_measure_score(ms_binning.labels_, expected_labels) == pytest.approx(1) + assert v_measure_score(ms_nobinning.labels_, expected_labels) == pytest.approx(1) + assert_allclose(ms_binning.cluster_centers_, ms_nobinning.cluster_centers_) diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_optics.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_optics.py new file mode 100644 index 0000000000000000000000000000000000000000..02184ea454d65cc1f2d9d95f265e52d307c30543 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_optics.py @@ -0,0 +1,874 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings + +import numpy as np +import pytest + +from sklearn.cluster import DBSCAN, OPTICS +from sklearn.cluster._optics import _extend_region, _extract_xi_labels +from sklearn.cluster.tests.common import generate_clustered_data +from sklearn.datasets import make_blobs +from sklearn.exceptions import DataConversionWarning, EfficiencyWarning +from sklearn.metrics.cluster import contingency_matrix +from sklearn.metrics.pairwise import pairwise_distances +from sklearn.utils import shuffle +from sklearn.utils._testing import assert_allclose, assert_array_equal +from sklearn.utils.fixes import CSR_CONTAINERS + +rng = np.random.RandomState(0) +n_points_per_cluster = 10 +C1 = [-5, -2] + 0.8 * rng.randn(n_points_per_cluster, 2) +C2 = [4, -1] + 0.1 * rng.randn(n_points_per_cluster, 2) +C3 = [1, -2] + 0.2 * rng.randn(n_points_per_cluster, 2) +C4 = [-2, 3] + 0.3 * rng.randn(n_points_per_cluster, 2) +C5 = [3, -2] + 1.6 * rng.randn(n_points_per_cluster, 2) +C6 = [5, 6] + 2 * rng.randn(n_points_per_cluster, 2) +X = np.vstack((C1, C2, C3, C4, C5, C6)) + + +@pytest.mark.parametrize( + ("r_plot", "end"), + [ + [[10, 8.9, 8.8, 8.7, 7, 10], 3], + [[10, 8.9, 8.8, 8.7, 8.6, 7, 10], 0], + [[10, 8.9, 8.8, 8.7, 7, 6, np.inf], 4], + [[10, 8.9, 8.8, 8.7, 7, 6, np.inf], 4], + ], +) +def test_extend_downward(r_plot, end): + r_plot = np.array(r_plot) + ratio = r_plot[:-1] / r_plot[1:] + steep_downward = ratio >= 1 / 0.9 + upward = ratio < 1 + + e = _extend_region(steep_downward, upward, 0, 2) + assert e == end + + +@pytest.mark.parametrize( + ("r_plot", "end"), + [ + [[1, 2, 2.1, 2.2, 4, 8, 8, np.inf], 6], + [[1, 2, 2.1, 2.2, 2.3, 4, 8, 8, np.inf], 0], + [[1, 2, 2.1, 2, np.inf], 0], + [[1, 2, 2.1, np.inf], 2], + ], +) +def test_extend_upward(r_plot, end): + r_plot = np.array(r_plot) + ratio = r_plot[:-1] / r_plot[1:] + steep_upward = ratio <= 0.9 + downward = ratio > 1 + + e = _extend_region(steep_upward, downward, 0, 2) + assert e == end + + +@pytest.mark.parametrize( + ("ordering", "clusters", "expected"), + [ + [[0, 1, 2, 3], [[0, 1], [2, 3]], [0, 0, 1, 1]], + [[0, 1, 2, 3], [[0, 1], [3, 3]], [0, 0, -1, 1]], + [[0, 1, 2, 3], [[0, 1], [3, 3], [0, 3]], [0, 0, -1, 1]], + [[3, 1, 2, 0], [[0, 1], [3, 3], [0, 3]], [1, 0, -1, 0]], + ], +) +def test_the_extract_xi_labels(ordering, clusters, expected): + labels = _extract_xi_labels(ordering, clusters) + + assert_array_equal(labels, expected) + + +def test_extract_xi(global_dtype): + # small and easy test (no clusters around other clusters) + # but with a clear noise data. + # global_random_seed is not used here since the expected labels + # are hardcoded for these specific data. 
+ rng = np.random.RandomState(0) + n_points_per_cluster = 5 + + C1 = [-5, -2] + 0.8 * rng.randn(n_points_per_cluster, 2) + C2 = [4, -1] + 0.1 * rng.randn(n_points_per_cluster, 2) + C3 = [1, -2] + 0.2 * rng.randn(n_points_per_cluster, 2) + C4 = [-2, 3] + 0.3 * rng.randn(n_points_per_cluster, 2) + C5 = [3, -2] + 0.6 * rng.randn(n_points_per_cluster, 2) + C6 = [5, 6] + 0.2 * rng.randn(n_points_per_cluster, 2) + + X = np.vstack((C1, C2, C3, C4, C5, np.array([[100, 100]]), C6)).astype( + global_dtype, copy=False + ) + expected_labels = np.r_[[2] * 5, [0] * 5, [1] * 5, [3] * 5, [1] * 5, -1, [4] * 5] + X, expected_labels = shuffle(X, expected_labels, random_state=rng) + + clust = OPTICS( + min_samples=3, min_cluster_size=2, max_eps=20, cluster_method="xi", xi=0.4 + ).fit(X) + assert_array_equal(clust.labels_, expected_labels) + + # check float min_samples and min_cluster_size + clust = OPTICS( + min_samples=0.1, min_cluster_size=0.08, max_eps=20, cluster_method="xi", xi=0.4 + ).fit(X) + assert_array_equal(clust.labels_, expected_labels) + + X = np.vstack((C1, C2, C3, C4, C5, np.array([[100, 100]] * 2), C6)).astype( + global_dtype, copy=False + ) + expected_labels = np.r_[ + [1] * 5, [3] * 5, [2] * 5, [0] * 5, [2] * 5, -1, -1, [4] * 5 + ] + X, expected_labels = shuffle(X, expected_labels, random_state=rng) + + clust = OPTICS( + min_samples=3, min_cluster_size=3, max_eps=20, cluster_method="xi", xi=0.3 + ).fit(X) + # this may fail if the predecessor correction is not at work! + assert_array_equal(clust.labels_, expected_labels) + + C1 = [[0, 0], [0, 0.1], [0, -0.1], [0.1, 0]] + C2 = [[10, 10], [10, 9], [10, 11], [9, 10]] + C3 = [[100, 100], [100, 90], [100, 110], [90, 100]] + X = np.vstack((C1, C2, C3)).astype(global_dtype, copy=False) + expected_labels = np.r_[[0] * 4, [1] * 4, [2] * 4] + X, expected_labels = shuffle(X, expected_labels, random_state=rng) + + clust = OPTICS( + min_samples=2, min_cluster_size=2, max_eps=np.inf, cluster_method="xi", xi=0.04 + ).fit(X) + assert_array_equal(clust.labels_, expected_labels) + + +def test_cluster_hierarchy(global_dtype, global_random_seed): + rng = np.random.RandomState(global_random_seed) + n_points_per_cluster = 100 + C1 = [0, 0] + 2 * rng.randn(n_points_per_cluster, 2).astype( + global_dtype, copy=False + ) + C2 = [0, 0] + 50 * rng.randn(n_points_per_cluster, 2).astype( + global_dtype, copy=False + ) + X = np.vstack((C1, C2)) + X = shuffle(X, random_state=rng) + + clusters = OPTICS(min_samples=20, xi=0.2).fit(X).cluster_hierarchy_ + assert clusters.shape == (2, 2) + + # The first cluster should contain all point from C1 but due to how the data is + # generated, some points from C2 may end up in it. + assert 100 <= np.diff(clusters[0]) + 1 <= 115 + # The second cluster should contain all points from C1 and C2. + assert np.diff(clusters[-1]) + 1 == 200 + + +@pytest.mark.parametrize( + "csr_container, metric", + [(None, "minkowski")] + [(container, "euclidean") for container in CSR_CONTAINERS], +) +def test_correct_number_of_clusters(metric, csr_container): + # in 'auto' mode + + n_clusters = 3 + X = generate_clustered_data(n_clusters=n_clusters) + # Parameters chosen specifically for this task. 
+ # Compute OPTICS + clust = OPTICS(max_eps=5.0 * 6.0, min_samples=4, xi=0.1, metric=metric) + clust.fit(csr_container(X) if csr_container is not None else X) + # number of clusters, ignoring noise if present + n_clusters_1 = len(set(clust.labels_)) - int(-1 in clust.labels_) + assert n_clusters_1 == n_clusters + + # check attribute types and sizes + assert clust.labels_.shape == (len(X),) + assert clust.labels_.dtype.kind == "i" + + assert clust.reachability_.shape == (len(X),) + assert clust.reachability_.dtype.kind == "f" + + assert clust.core_distances_.shape == (len(X),) + assert clust.core_distances_.dtype.kind == "f" + + assert clust.ordering_.shape == (len(X),) + assert clust.ordering_.dtype.kind == "i" + assert set(clust.ordering_) == set(range(len(X))) + + +def test_minimum_number_of_sample_check(): + # test that we check a minimum number of samples + msg = "min_samples must be no greater than" + + # Compute OPTICS + X = [[1, 1]] + clust = OPTICS(max_eps=5.0 * 0.3, min_samples=10, min_cluster_size=1.0) + + # Run the fit + with pytest.raises(ValueError, match=msg): + clust.fit(X) + + +def test_bad_extract(): + # Test an extraction of eps too close to original eps + msg = "Specify an epsilon smaller than 0.15. Got 0.3." + centers = [[1, 1], [-1, -1], [1, -1]] + X, labels_true = make_blobs( + n_samples=750, centers=centers, cluster_std=0.4, random_state=0 + ) + + # Compute OPTICS + clust = OPTICS(max_eps=5.0 * 0.03, cluster_method="dbscan", eps=0.3, min_samples=10) + with pytest.raises(ValueError, match=msg): + clust.fit(X) + + +def test_bad_reachability(): + msg = "All reachability values are inf. Set a larger max_eps." + centers = [[1, 1], [-1, -1], [1, -1]] + X, labels_true = make_blobs( + n_samples=750, centers=centers, cluster_std=0.4, random_state=0 + ) + + with pytest.warns(UserWarning, match=msg): + clust = OPTICS(max_eps=5.0 * 0.003, min_samples=10, eps=0.015) + clust.fit(X) + + +def test_nowarn_if_metric_bool_data_bool(): + # make sure no warning is raised if metric and data are both boolean + # non-regression test for + # https://github.com/scikit-learn/scikit-learn/issues/18996 + + pairwise_metric = "rogerstanimoto" + X = np.random.randint(2, size=(5, 2), dtype=bool) + + with warnings.catch_warnings(): + warnings.simplefilter("error", DataConversionWarning) + + OPTICS(metric=pairwise_metric).fit(X) + + +def test_warn_if_metric_bool_data_no_bool(): + # make sure a *single* conversion warning is raised if metric is boolean + # but data isn't + # non-regression test for + # https://github.com/scikit-learn/scikit-learn/issues/18996 + + pairwise_metric = "rogerstanimoto" + X = np.random.randint(2, size=(5, 2), dtype=np.int32) + msg = f"Data will be converted to boolean for metric {pairwise_metric}" + + with pytest.warns(DataConversionWarning, match=msg) as warn_record: + # Silence a DeprecationWarning from joblib <= 1.5.1 in Python 3.14+. 
+ warnings.filterwarnings( + "ignore", + message="'asyncio.iscoroutinefunction' is deprecated", + category=DeprecationWarning, + ) + OPTICS(metric=pairwise_metric).fit(X) + assert len(warn_record) == 1 + + +def test_nowarn_if_metric_no_bool(): + # make sure no conversion warning is raised if + # metric isn't boolean, no matter what the data type is + pairwise_metric = "minkowski" + X_bool = np.random.randint(2, size=(5, 2), dtype=bool) + X_num = np.random.randint(2, size=(5, 2), dtype=np.int32) + + with warnings.catch_warnings(): + warnings.simplefilter("error", DataConversionWarning) + + # fit boolean data + OPTICS(metric=pairwise_metric).fit(X_bool) + # fit numeric data + OPTICS(metric=pairwise_metric).fit(X_num) + + +def test_close_extract(): + # Test extract where extraction eps is close to scaled max_eps + + centers = [[1, 1], [-1, -1], [1, -1]] + X, labels_true = make_blobs( + n_samples=750, centers=centers, cluster_std=0.4, random_state=0 + ) + + # Compute OPTICS + clust = OPTICS(max_eps=1.0, cluster_method="dbscan", eps=0.3, min_samples=10).fit(X) + # Cluster ordering starts at 0; max cluster label = 2 is 3 clusters + assert max(clust.labels_) == 2 + + +@pytest.mark.parametrize("eps", [0.1, 0.3, 0.5]) +@pytest.mark.parametrize("min_samples", [3, 10, 20]) +@pytest.mark.parametrize( + "csr_container, metric", + [(None, "minkowski"), (None, "euclidean")] + + [(container, "euclidean") for container in CSR_CONTAINERS], +) +def test_dbscan_optics_parity(eps, min_samples, metric, global_dtype, csr_container): + # Test that OPTICS clustering labels are <= 5% difference of DBSCAN + + centers = [[1, 1], [-1, -1], [1, -1]] + X, labels_true = make_blobs( + n_samples=150, centers=centers, cluster_std=0.4, random_state=0 + ) + X = csr_container(X) if csr_container is not None else X + + X = X.astype(global_dtype, copy=False) + + # calculate optics with dbscan extract at 0.3 epsilon + op = OPTICS( + min_samples=min_samples, cluster_method="dbscan", eps=eps, metric=metric + ).fit(X) + + # calculate dbscan labels + db = DBSCAN(eps=eps, min_samples=min_samples).fit(X) + + contingency = contingency_matrix(db.labels_, op.labels_) + agree = min( + np.sum(np.max(contingency, axis=0)), np.sum(np.max(contingency, axis=1)) + ) + disagree = X.shape[0] - agree + + percent_mismatch = np.round((disagree - 1) / X.shape[0], 2) + + # verify label mismatch is <= 5% labels + assert percent_mismatch <= 0.05 + + +def test_min_samples_edge_case(global_dtype): + C1 = [[0, 0], [0, 0.1], [0, -0.1]] + C2 = [[10, 10], [10, 9], [10, 11]] + C3 = [[100, 100], [100, 96], [100, 106]] + X = np.vstack((C1, C2, C3)).astype(global_dtype, copy=False) + + expected_labels = np.r_[[0] * 3, [1] * 3, [2] * 3] + clust = OPTICS(min_samples=3, max_eps=7, cluster_method="xi", xi=0.04).fit(X) + assert_array_equal(clust.labels_, expected_labels) + + expected_labels = np.r_[[0] * 3, [1] * 3, [-1] * 3] + clust = OPTICS(min_samples=3, max_eps=3, cluster_method="xi", xi=0.04).fit(X) + assert_array_equal(clust.labels_, expected_labels) + + expected_labels = np.r_[[-1] * 9] + with pytest.warns(UserWarning, match="All reachability values"): + clust = OPTICS(min_samples=4, max_eps=3, cluster_method="xi", xi=0.04).fit(X) + assert_array_equal(clust.labels_, expected_labels) + + +# try arbitrary minimum sizes +@pytest.mark.parametrize("min_cluster_size", range(2, X.shape[0] // 10, 23)) +def test_min_cluster_size(min_cluster_size, global_dtype): + redX = X[::2].astype(global_dtype, copy=False) # reduce for speed + clust = OPTICS(min_samples=9, 
min_cluster_size=min_cluster_size).fit(redX) + cluster_sizes = np.bincount(clust.labels_[clust.labels_ != -1]) + if cluster_sizes.size: + assert min(cluster_sizes) >= min_cluster_size + # check behaviour is the same when min_cluster_size is a fraction + clust_frac = OPTICS( + min_samples=9, + min_cluster_size=min_cluster_size / redX.shape[0], + ) + clust_frac.fit(redX) + assert_array_equal(clust.labels_, clust_frac.labels_) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_min_cluster_size_invalid2(csr_container): + clust = OPTICS(min_cluster_size=len(X) + 1) + with pytest.raises(ValueError, match="must be no greater than the "): + clust.fit(X) + + clust = OPTICS(min_cluster_size=len(X) + 1, metric="euclidean") + with pytest.raises(ValueError, match="must be no greater than the "): + clust.fit(csr_container(X)) + + +def test_processing_order(): + # Ensure that we consider all unprocessed points, + # not only direct neighbors. when picking the next point. + Y = [[0], [10], [-10], [25]] + + clust = OPTICS(min_samples=3, max_eps=15).fit(Y) + assert_array_equal(clust.reachability_, [np.inf, 10, 10, 15]) + assert_array_equal(clust.core_distances_, [10, 15, np.inf, np.inf]) + assert_array_equal(clust.ordering_, [0, 1, 2, 3]) + + +def test_compare_to_ELKI(): + # Expected values, computed with (future) ELKI 0.7.5 using: + # java -jar elki.jar cli -dbc.in csv -dbc.filter FixedDBIDsFilter + # -algorithm clustering.optics.OPTICSHeap -optics.minpts 5 + # where the FixedDBIDsFilter gives 0-indexed ids. + r1 = [ + np.inf, + 1.0574896366427478, + 0.7587934993548423, + 0.7290174038973836, + 0.7290174038973836, + 0.7290174038973836, + 0.6861627576116127, + 0.7587934993548423, + 0.9280118450166668, + 1.1748022534146194, + 3.3355455741292257, + 0.49618389254482587, + 0.2552805046961355, + 0.2552805046961355, + 0.24944622248445714, + 0.24944622248445714, + 0.24944622248445714, + 0.2552805046961355, + 0.2552805046961355, + 0.3086779122185853, + 4.163024452756142, + 1.623152630340929, + 0.45315840475822655, + 0.25468325192031926, + 0.2254004358159971, + 0.18765711877083036, + 0.1821471333893275, + 0.1821471333893275, + 0.18765711877083036, + 0.18765711877083036, + 0.2240202988740153, + 1.154337614548715, + 1.342604473837069, + 1.323308536402633, + 0.8607514948648837, + 0.27219111215810565, + 0.13260875220533205, + 0.13260875220533205, + 0.09890587675958984, + 0.09890587675958984, + 0.13548790801634494, + 0.1575483940837384, + 0.17515137170530226, + 0.17575920159442388, + 0.27219111215810565, + 0.6101447895405373, + 1.3189208094864302, + 1.323308536402633, + 2.2509184159764577, + 2.4517810628594527, + 3.675977064404973, + 3.8264795626020365, + 2.9130735341510614, + 2.9130735341510614, + 2.9130735341510614, + 2.9130735341510614, + 2.8459300127258036, + 2.8459300127258036, + 2.8459300127258036, + 3.0321982337972537, + ] + o1 = [ + 0, + 3, + 6, + 4, + 7, + 8, + 2, + 9, + 5, + 1, + 31, + 30, + 32, + 34, + 33, + 38, + 39, + 35, + 37, + 36, + 44, + 21, + 23, + 24, + 22, + 25, + 27, + 29, + 26, + 28, + 20, + 40, + 45, + 46, + 10, + 15, + 11, + 13, + 17, + 19, + 18, + 12, + 16, + 14, + 47, + 49, + 43, + 48, + 42, + 41, + 53, + 57, + 51, + 52, + 56, + 59, + 54, + 55, + 58, + 50, + ] + p1 = [ + -1, + 0, + 3, + 6, + 6, + 6, + 8, + 3, + 7, + 5, + 1, + 31, + 30, + 30, + 34, + 34, + 34, + 32, + 32, + 37, + 36, + 44, + 21, + 23, + 24, + 22, + 25, + 25, + 22, + 22, + 22, + 21, + 40, + 45, + 46, + 10, + 15, + 15, + 13, + 13, + 15, + 11, + 19, + 15, + 10, + 47, + 12, + 45, + 14, + 43, + 42, + 53, + 57, + 
57, + 57, + 57, + 59, + 59, + 59, + 58, + ] + + # Tests against known extraction array + # Does NOT work with metric='euclidean', because sklearn euclidean has + # worse numeric precision. 'minkowski' is slower but more accurate. + clust1 = OPTICS(min_samples=5).fit(X) + + assert_array_equal(clust1.ordering_, np.array(o1)) + assert_array_equal(clust1.predecessor_[clust1.ordering_], np.array(p1)) + assert_allclose(clust1.reachability_[clust1.ordering_], np.array(r1)) + # ELKI currently does not print the core distances (which are not used much + # in literature, but we can at least ensure to have this consistency: + for i in clust1.ordering_[1:]: + assert clust1.reachability_[i] >= clust1.core_distances_[clust1.predecessor_[i]] + + # Expected values, computed with (future) ELKI 0.7.5 using + r2 = [ + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + 0.27219111215810565, + 0.13260875220533205, + 0.13260875220533205, + 0.09890587675958984, + 0.09890587675958984, + 0.13548790801634494, + 0.1575483940837384, + 0.17515137170530226, + 0.17575920159442388, + 0.27219111215810565, + 0.4928068613197889, + np.inf, + 0.2666183922512113, + 0.18765711877083036, + 0.1821471333893275, + 0.1821471333893275, + 0.1821471333893275, + 0.18715928772277457, + 0.18765711877083036, + 0.18765711877083036, + 0.25468325192031926, + np.inf, + 0.2552805046961355, + 0.2552805046961355, + 0.24944622248445714, + 0.24944622248445714, + 0.24944622248445714, + 0.2552805046961355, + 0.2552805046961355, + 0.3086779122185853, + 0.34466409325984865, + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + ] + o2 = [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 15, + 11, + 13, + 17, + 19, + 18, + 12, + 16, + 14, + 47, + 46, + 20, + 22, + 25, + 23, + 27, + 29, + 24, + 26, + 28, + 21, + 30, + 32, + 34, + 33, + 38, + 39, + 35, + 37, + 36, + 31, + 40, + 41, + 42, + 43, + 44, + 45, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + ] + p2 = [ + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + 10, + 15, + 15, + 13, + 13, + 15, + 11, + 19, + 15, + 10, + 47, + -1, + 20, + 22, + 25, + 25, + 25, + 25, + 22, + 22, + 23, + -1, + 30, + 30, + 34, + 34, + 34, + 32, + 32, + 37, + 38, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + ] + clust2 = OPTICS(min_samples=5, max_eps=0.5).fit(X) + + assert_array_equal(clust2.ordering_, np.array(o2)) + assert_array_equal(clust2.predecessor_[clust2.ordering_], np.array(p2)) + assert_allclose(clust2.reachability_[clust2.ordering_], np.array(r2)) + + index = np.where(clust1.core_distances_ <= 0.5)[0] + assert_allclose(clust1.core_distances_[index], clust2.core_distances_[index]) + + +def test_extract_dbscan(global_dtype, global_random_seed): + # testing an easy dbscan case. Not including clusters with different + # densities. 
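+    # Four well separated blobs of identical density, so a single DBSCAN
+    # extraction at eps=0.5 should recover exactly the labels 0-3.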
+ rng = np.random.RandomState(global_random_seed) + n_points_per_cluster = 20 + C1 = [-5, -2] + 0.2 * rng.randn(n_points_per_cluster, 2) + C2 = [4, -1] + 0.2 * rng.randn(n_points_per_cluster, 2) + C3 = [1, 2] + 0.2 * rng.randn(n_points_per_cluster, 2) + C4 = [-2, 3] + 0.2 * rng.randn(n_points_per_cluster, 2) + X = np.vstack((C1, C2, C3, C4)).astype(global_dtype, copy=False) + + clust = OPTICS(cluster_method="dbscan", eps=0.5).fit(X) + assert_array_equal( + np.sort(np.unique(clust.labels_[clust.labels_ != -1])), [0, 1, 2, 3] + ) + + +@pytest.mark.parametrize("csr_container", [None] + CSR_CONTAINERS) +def test_precomputed_dists(global_dtype, csr_container): + redX = X[::2].astype(global_dtype, copy=False) + dists = pairwise_distances(redX, metric="euclidean") + dists = csr_container(dists) if csr_container is not None else dists + with warnings.catch_warnings(): + warnings.simplefilter("ignore", EfficiencyWarning) + clust1 = OPTICS(min_samples=10, algorithm="brute", metric="precomputed").fit( + dists + ) + clust2 = OPTICS(min_samples=10, algorithm="brute", metric="euclidean").fit(redX) + + assert_allclose(clust1.reachability_, clust2.reachability_) + assert_array_equal(clust1.labels_, clust2.labels_) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_optics_input_not_modified_precomputed_sparse_nodiag( + csr_container, global_random_seed +): + """Check that we don't modify in-place the pre-computed sparse matrix. + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/27508 + """ + X = np.random.RandomState(global_random_seed).rand(6, 6) + # Add zeros on the diagonal that will be implicit when creating + # the sparse matrix. If `X` is modified in-place, the zeros from + # the diagonal will be made explicit. + np.fill_diagonal(X, 0) + X = csr_container(X) + assert all(row != col for row, col in zip(*X.nonzero())) + X_copy = X.copy() + OPTICS(metric="precomputed").fit(X) + # Make sure that we did not modify `X` in-place even by creating + # explicit 0s values. + assert X.nnz == X_copy.nnz + assert_array_equal(X.toarray(), X_copy.toarray()) + + +def test_optics_predecessor_correction_ordering(): + """Check that cluster correction using predecessor is working as expected. + + In the following example, the predecessor correction was not working properly + since it was not using the right indices. + + This non-regression test check that reordering the data does not change the results. 
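+    The labels found on the reordered data are compared against the original
+    labels mapped through the same permutation.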
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/26324 + """ + X_1 = np.array([1, 2, 3, 1, 8, 8, 7, 100]).reshape(-1, 1) + reorder = [0, 1, 2, 4, 5, 6, 7, 3] + X_2 = X_1[reorder] + + optics_1 = OPTICS(min_samples=3, metric="euclidean").fit(X_1) + optics_2 = OPTICS(min_samples=3, metric="euclidean").fit(X_2) + + assert_array_equal(optics_1.labels_[reorder], optics_2.labels_) diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_spectral.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_spectral.py new file mode 100644 index 0000000000000000000000000000000000000000..71b11c9fe151c310f4fd5a60f99323360e493506 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_spectral.py @@ -0,0 +1,335 @@ +"""Testing for Spectral Clustering methods""" + +import pickle +import re + +import numpy as np +import pytest +from scipy.linalg import LinAlgError + +from sklearn.cluster import SpectralClustering, spectral_clustering +from sklearn.cluster._spectral import cluster_qr, discretize +from sklearn.datasets import make_blobs +from sklearn.feature_extraction import img_to_graph +from sklearn.metrics import adjusted_rand_score +from sklearn.metrics.pairwise import kernel_metrics, rbf_kernel +from sklearn.neighbors import NearestNeighbors +from sklearn.utils import check_random_state +from sklearn.utils._testing import assert_array_equal +from sklearn.utils.fixes import COO_CONTAINERS, CSR_CONTAINERS + +try: + from pyamg import smoothed_aggregation_solver # noqa: F401 + + amg_loaded = True +except ImportError: + amg_loaded = False + +centers = np.array([[1, 1], [-1, -1], [1, -1]]) + 10 +X, _ = make_blobs( + n_samples=60, + n_features=2, + centers=centers, + cluster_std=0.4, + shuffle=True, + random_state=0, +) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +@pytest.mark.parametrize("eigen_solver", ("arpack", "lobpcg")) +@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize", "cluster_qr")) +def test_spectral_clustering( + eigen_solver, assign_labels, csr_container, global_random_seed +): + S = np.array( + [ + [1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0], + [1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0], + [1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0], + [0.2, 0.2, 0.2, 1.0, 1.0, 1.0, 1.0], + [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0], + [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0], + [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0], + ] + ) + + for mat in (S, csr_container(S)): + model = SpectralClustering( + random_state=global_random_seed, + n_clusters=2, + affinity="precomputed", + eigen_solver=eigen_solver, + assign_labels=assign_labels, + ).fit(mat) + labels = model.labels_ + if labels[0] == 0: + labels = 1 - labels + + assert adjusted_rand_score(labels, [1, 1, 1, 0, 0, 0, 0]) == 1 + + model_copy = pickle.loads(pickle.dumps(model)) + assert model_copy.n_clusters == model.n_clusters + assert model_copy.eigen_solver == model.eigen_solver + assert_array_equal(model_copy.labels_, model.labels_) + + +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize", "cluster_qr")) +def test_spectral_clustering_sparse(assign_labels, coo_container, global_random_seed): + X, y = make_blobs( + n_samples=20, + random_state=global_random_seed, + centers=[[1, 1], [-1, -1]], + cluster_std=0.01, + ) + + S = rbf_kernel(X, gamma=1) + S = np.maximum(S - 1e-4, 0) + S = coo_container(S) + + labels = ( + SpectralClustering( + random_state=global_random_seed, + n_clusters=2, + affinity="precomputed", + 
assign_labels=assign_labels, + ) + .fit(S) + .labels_ + ) + assert adjusted_rand_score(y, labels) == 1 + + +def test_precomputed_nearest_neighbors_filtering(global_random_seed): + # Test precomputed graph filtering when containing too many neighbors + X, y = make_blobs( + n_samples=250, + random_state=global_random_seed, + centers=[[1, 1, 1], [-1, -1, -1]], + cluster_std=0.01, + ) + + n_neighbors = 2 + results = [] + for additional_neighbors in [0, 10]: + nn = NearestNeighbors(n_neighbors=n_neighbors + additional_neighbors).fit(X) + graph = nn.kneighbors_graph(X, mode="distance") + labels = ( + SpectralClustering( + random_state=global_random_seed, + n_clusters=2, + affinity="precomputed_nearest_neighbors", + n_neighbors=n_neighbors, + ) + .fit(graph) + .labels_ + ) + results.append(labels) + + assert_array_equal(results[0], results[1]) + + +def test_affinities(global_random_seed): + # Note: in the following, random_state has been selected to have + # a dataset that yields a stable eigen decomposition both when built + # on OSX and Linux + X, y = make_blobs( + n_samples=20, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01 + ) + # nearest neighbors affinity + sp = SpectralClustering(n_clusters=2, affinity="nearest_neighbors", random_state=0) + with pytest.warns(UserWarning, match="not fully connected"): + sp.fit(X) + assert adjusted_rand_score(y, sp.labels_) == 1 + + sp = SpectralClustering(n_clusters=2, gamma=2, random_state=global_random_seed) + labels = sp.fit(X).labels_ + assert adjusted_rand_score(y, labels) == 1 + + X = check_random_state(10).rand(10, 5) * 10 + + kernels_available = kernel_metrics() + for kern in kernels_available: + # Additive chi^2 gives a negative similarity matrix which + # doesn't make sense for spectral clustering + if kern != "additive_chi2": + sp = SpectralClustering(n_clusters=2, affinity=kern, random_state=0) + labels = sp.fit(X).labels_ + assert (X.shape[0],) == labels.shape + + sp = SpectralClustering(n_clusters=2, affinity=lambda x, y: 1, random_state=0) + labels = sp.fit(X).labels_ + assert (X.shape[0],) == labels.shape + + def histogram(x, y, **kwargs): + # Histogram kernel implemented as a callable. + assert kwargs == {} # no kernel_params that we didn't ask for + return np.minimum(x, y).sum() + + sp = SpectralClustering(n_clusters=2, affinity=histogram, random_state=0) + labels = sp.fit(X).labels_ + assert (X.shape[0],) == labels.shape + + +def test_cluster_qr(global_random_seed): + # cluster_qr by itself should not be used for clustering generic data + # other than the rows of the eigenvectors within spectral clustering, + # but cluster_qr must still preserve the labels for different dtypes + # of the generic fixed input even if the labels may be meaningless. + random_state = np.random.RandomState(seed=global_random_seed) + n_samples, n_components = 10, 5 + data = random_state.randn(n_samples, n_components) + labels_float64 = cluster_qr(data.astype(np.float64)) + # Each sample is assigned a cluster identifier + assert labels_float64.shape == (n_samples,) + # All components should be covered by the assignment + assert np.array_equal(np.unique(labels_float64), np.arange(n_components)) + # Single precision data should yield the same cluster assignments + labels_float32 = cluster_qr(data.astype(np.float32)) + assert np.array_equal(labels_float64, labels_float32) + + +def test_cluster_qr_permutation_invariance(global_random_seed): + # cluster_qr must be invariant to sample permutation. 
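+    # i.e. clustering a permuted copy of the data must give the same labels as
+    # clustering the original data and then applying that permutation to the labels.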
+ random_state = np.random.RandomState(seed=global_random_seed) + n_samples, n_components = 100, 5 + data = random_state.randn(n_samples, n_components) + perm = random_state.permutation(n_samples) + assert np.array_equal( + cluster_qr(data)[perm], + cluster_qr(data[perm]), + ) + + +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +@pytest.mark.parametrize("n_samples", [50, 100, 150, 500]) +def test_discretize(n_samples, coo_container, global_random_seed): + # Test the discretize using a noise assignment matrix + random_state = np.random.RandomState(seed=global_random_seed) + for n_class in range(2, 10): + # random class labels + y_true = random_state.randint(0, n_class + 1, n_samples) + y_true = np.array(y_true, float) + # noise class assignment matrix + y_indicator = coo_container( + (np.ones(n_samples), (np.arange(n_samples), y_true)), + shape=(n_samples, n_class + 1), + ) + y_true_noisy = y_indicator.toarray() + 0.1 * random_state.randn( + n_samples, n_class + 1 + ) + y_pred = discretize(y_true_noisy, random_state=random_state) + assert adjusted_rand_score(y_true, y_pred) > 0.8 + + +def test_spectral_clustering_with_arpack_amg_solvers(global_random_seed): + # Test that spectral_clustering is the same for arpack and amg solver + # Based on toy example from plot_segmentation_toy.py + + # a small two coin image + x, y = np.indices((40, 40)) + + center1, center2 = (14, 12), (20, 25) + radius1, radius2 = 8, 7 + + circle1 = (x - center1[0]) ** 2 + (y - center1[1]) ** 2 < radius1**2 + circle2 = (x - center2[0]) ** 2 + (y - center2[1]) ** 2 < radius2**2 + + circles = circle1 | circle2 + mask = circles.copy() + img = circles.astype(float) + + graph = img_to_graph(img, mask=mask) + graph.data = np.exp(-graph.data / graph.data.std()) + + labels_arpack = spectral_clustering( + graph, n_clusters=2, eigen_solver="arpack", random_state=global_random_seed + ) + + assert len(np.unique(labels_arpack)) == 2 + + if amg_loaded: + labels_amg = spectral_clustering( + graph, n_clusters=2, eigen_solver="amg", random_state=global_random_seed + ) + assert adjusted_rand_score(labels_arpack, labels_amg) == 1 + else: + with pytest.raises(ValueError): + spectral_clustering(graph, n_clusters=2, eigen_solver="amg", random_state=0) + + +def test_n_components(global_random_seed): + # Test that after adding n_components, result is different and + # n_components = n_clusters by default + X, y = make_blobs( + n_samples=20, + random_state=global_random_seed, + centers=[[1, 1], [-1, -1]], + cluster_std=0.01, + ) + sp = SpectralClustering(n_clusters=2, random_state=global_random_seed) + labels = sp.fit(X).labels_ + # set n_components = n_cluster and test if result is the same + labels_same_ncomp = ( + SpectralClustering( + n_clusters=2, n_components=2, random_state=global_random_seed + ) + .fit(X) + .labels_ + ) + # test that n_components=n_clusters by default + assert_array_equal(labels, labels_same_ncomp) + + # test that n_components affect result + # n_clusters=8 by default, and set n_components=2 + labels_diff_ncomp = ( + SpectralClustering(n_components=2, random_state=global_random_seed) + .fit(X) + .labels_ + ) + assert not np.array_equal(labels, labels_diff_ncomp) + + +@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize", "cluster_qr")) +def test_verbose(assign_labels, capsys): + # Check verbose mode of KMeans for better coverage. 
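+    # The KMeans-specific messages are only asserted for the "kmeans"
+    # parametrization of assign_labels; the label-assignment message is
+    # expected for every parametrization.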
+ X, y = make_blobs( + n_samples=20, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01 + ) + + SpectralClustering(n_clusters=2, random_state=42, verbose=1).fit(X) + + captured = capsys.readouterr() + + assert re.search(r"Computing label assignment using", captured.out) + + if assign_labels == "kmeans": + assert re.search(r"Initialization complete", captured.out) + assert re.search(r"Iteration [0-9]+, inertia", captured.out) + + +def test_spectral_clustering_np_matrix_raises(): + """Check that spectral_clustering raises an informative error when passed + a np.matrix. See #10993""" + X = np.matrix([[0.0, 2.0], [2.0, 0.0]]) + + msg = r"np\.matrix is not supported. Please convert to a numpy array" + with pytest.raises(TypeError, match=msg): + spectral_clustering(X) + + +def test_spectral_clustering_not_infinite_loop(capsys, monkeypatch): + """Check that discretize raises LinAlgError when svd never converges. + + Non-regression test for #21380 + """ + + def new_svd(*args, **kwargs): + raise LinAlgError() + + monkeypatch.setattr(np.linalg, "svd", new_svd) + vectors = np.ones((10, 4)) + + with pytest.raises(LinAlgError, match="SVD did not converge"): + discretize(vectors) diff --git a/.venv/lib/python3.12/site-packages/sklearn/compose/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/compose/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..842a86ba21d9b7e2e738284faf1a394e3f6ae7e9 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/compose/__init__.py @@ -0,0 +1,23 @@ +"""Meta-estimators for building composite models with transformers. + +In addition to its current contents, this module will eventually be home to +refurbished versions of :class:`~sklearn.pipeline.Pipeline` and +:class:`~sklearn.pipeline.FeatureUnion`. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ._column_transformer import ( + ColumnTransformer, + make_column_selector, + make_column_transformer, +) +from ._target import TransformedTargetRegressor + +__all__ = [ + "ColumnTransformer", + "TransformedTargetRegressor", + "make_column_selector", + "make_column_transformer", +] diff --git a/.venv/lib/python3.12/site-packages/sklearn/compose/_column_transformer.py b/.venv/lib/python3.12/site-packages/sklearn/compose/_column_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..940d9194dd97657f0cd746b89b482ea4d75d8852 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/compose/_column_transformer.py @@ -0,0 +1,1604 @@ +""" +The :mod:`sklearn.compose._column_transformer` module implements utilities +to work with heterogeneous data and to apply different transformers to +different columns. 
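+Its public entry points, re-exported by :mod:`sklearn.compose`, are
+``ColumnTransformer``, ``make_column_transformer`` and ``make_column_selector``.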
+""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from collections import Counter +from functools import partial +from itertools import chain +from numbers import Integral, Real + +import numpy as np +from scipy import sparse + +from ..base import TransformerMixin, _fit_context, clone +from ..pipeline import _fit_transform_one, _name_estimators, _transform_one +from ..preprocessing import FunctionTransformer +from ..utils import Bunch +from ..utils._indexing import _determine_key_type, _get_column_indices, _safe_indexing +from ..utils._metadata_requests import METHODS +from ..utils._param_validation import HasMethods, Hidden, Interval, StrOptions +from ..utils._repr_html.estimator import _VisualBlock +from ..utils._set_output import ( + _get_container_adapter, + _get_output_config, + _safe_set_output, +) +from ..utils._tags import get_tags +from ..utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + _raise_for_params, + _routing_enabled, + process_routing, +) +from ..utils.metaestimators import _BaseComposition +from ..utils.parallel import Parallel, delayed +from ..utils.validation import ( + _check_feature_names, + _check_feature_names_in, + _check_n_features, + _get_feature_names, + _is_pandas_df, + _num_samples, + check_array, + check_is_fitted, +) + +__all__ = ["ColumnTransformer", "make_column_selector", "make_column_transformer"] + + +_ERR_MSG_1DCOLUMN = ( + "1D data passed to a transformer that expects 2D data. " + "Try to specify the column selection as a list of one " + "item instead of a scalar." +) + + +class ColumnTransformer(TransformerMixin, _BaseComposition): + """Applies transformers to columns of an array or pandas DataFrame. + + This estimator allows different columns or column subsets of the input + to be transformed separately and the features generated by each transformer + will be concatenated to form a single feature space. + This is useful for heterogeneous or columnar data, to combine several + feature extraction mechanisms or transformations into a single transformer. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.20 + + Parameters + ---------- + transformers : list of tuples + List of (name, transformer, columns) tuples specifying the + transformer objects to be applied to subsets of the data. + + name : str + Like in Pipeline and FeatureUnion, this allows the transformer and + its parameters to be set using ``set_params`` and searched in grid + search. + transformer : {'drop', 'passthrough'} or estimator + Estimator must support :term:`fit` and :term:`transform`. + Special-cased strings 'drop' and 'passthrough' are accepted as + well, to indicate to drop the columns or to pass them through + untransformed, respectively. + columns : str, array-like of str, int, array-like of int, \ + array-like of bool, slice or callable + Indexes the data on its second axis. Integers are interpreted as + positional columns, while strings can reference DataFrame columns + by name. A scalar string or int should be used where + ``transformer`` expects X to be a 1d array-like (vector), + otherwise a 2d array will be passed to the transformer. + A callable is passed the input data `X` and can return any of the + above. To select multiple columns by name or dtype, you can use + :obj:`make_column_selector`. 
+ + remainder : {'drop', 'passthrough'} or estimator, default='drop' + By default, only the specified columns in `transformers` are + transformed and combined in the output, and the non-specified + columns are dropped. (default of ``'drop'``). + By specifying ``remainder='passthrough'``, all remaining columns that + were not specified in `transformers`, but present in the data passed + to `fit` will be automatically passed through. This subset of columns + is concatenated with the output of the transformers. For dataframes, + extra columns not seen during `fit` will be excluded from the output + of `transform`. + By setting ``remainder`` to be an estimator, the remaining + non-specified columns will use the ``remainder`` estimator. The + estimator must support :term:`fit` and :term:`transform`. + Note that using this feature requires that the DataFrame columns + input at :term:`fit` and :term:`transform` have identical order. + + sparse_threshold : float, default=0.3 + If the output of the different transformers contains sparse matrices, + these will be stacked as a sparse matrix if the overall density is + lower than this value. Use ``sparse_threshold=0`` to always return + dense. When the transformed output consists of all dense data, the + stacked result will be dense, and this keyword will be ignored. + + n_jobs : int, default=None + Number of jobs to run in parallel. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + transformer_weights : dict, default=None + Multiplicative weights for features per transformer. The output of the + transformer is multiplied by these weights. Keys are transformer names, + values the weights. + + verbose : bool, default=False + If True, the time elapsed while fitting each transformer will be + printed as it is completed. + + verbose_feature_names_out : bool, str or Callable[[str, str], str], default=True + + - If True, :meth:`ColumnTransformer.get_feature_names_out` will prefix + all feature names with the name of the transformer that generated that + feature. It is equivalent to setting + `verbose_feature_names_out="{transformer_name}__{feature_name}"`. + - If False, :meth:`ColumnTransformer.get_feature_names_out` will not + prefix any feature names and will error if feature names are not + unique. + - If ``Callable[[str, str], str]``, + :meth:`ColumnTransformer.get_feature_names_out` will rename all the features + using the name of the transformer. The first argument of the callable is the + transformer name and the second argument is the feature name. The returned + string will be the new feature name. + - If ``str``, it must be a string ready for formatting. The given string will + be formatted using two field names: ``transformer_name`` and ``feature_name``. + e.g. ``"{feature_name}__{transformer_name}"``. See :meth:`str.format` method + from the standard library for more info. + + .. versionadded:: 1.0 + + .. versionchanged:: 1.6 + `verbose_feature_names_out` can be a callable or a string to be formatted. + + force_int_remainder_cols : bool, default=False + This parameter has no effect. + + .. note:: + If you do not access the list of columns for the remainder columns + in the `transformers_` fitted attribute, you do not need to set + this parameter. + + .. versionadded:: 1.5 + + .. versionchanged:: 1.7 + The default value for `force_int_remainder_cols` will change from + `True` to `False` in version 1.7. + + .. 
deprecated:: 1.7 + `force_int_remainder_cols` is deprecated and will be removed in 1.9. + + Attributes + ---------- + transformers_ : list + The collection of fitted transformers as tuples of (name, + fitted_transformer, column). `fitted_transformer` can be an estimator, + or `'drop'`; `'passthrough'` is replaced with an equivalent + :class:`~sklearn.preprocessing.FunctionTransformer`. In case there were + no columns selected, this will be the unfitted transformer. If there + are remaining columns, the final element is a tuple of the form: + ('remainder', transformer, remaining_columns) corresponding to the + ``remainder`` parameter. If there are remaining columns, then + ``len(transformers_)==len(transformers)+1``, otherwise + ``len(transformers_)==len(transformers)``. + + .. versionadded:: 1.7 + The format of the remaining columns now attempts to match that of the other + transformers: if all columns were provided as column names (`str`), the + remaining columns are stored as column names; if all columns were provided + as mask arrays (`bool`), so are the remaining columns; in all other cases + the remaining columns are stored as indices (`int`). + + named_transformers_ : :class:`~sklearn.utils.Bunch` + Read-only attribute to access any transformer by given name. + Keys are transformer names and values are the fitted transformer + objects. + + sparse_output_ : bool + Boolean flag indicating whether the output of ``transform`` is a + sparse matrix or a dense numpy array, which depends on the output + of the individual transformers and the `sparse_threshold` keyword. + + output_indices_ : dict + A dictionary from each transformer name to a slice, where the slice + corresponds to indices in the transformed output. This is useful to + inspect which transformer is responsible for which transformed + feature(s). + + .. versionadded:: 1.0 + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying transformers expose such an attribute when fit. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + make_column_transformer : Convenience function for + combining the outputs of multiple transformer objects applied to + column subsets of the original feature space. + make_column_selector : Convenience function for selecting + columns based on datatype or the columns name with a regex pattern. + + Notes + ----- + The order of the columns in the transformed feature matrix follows the + order of how the columns are specified in the `transformers` list. + Columns of the original feature matrix that are not specified are + dropped from the resulting transformed feature matrix, unless specified + in the `passthrough` keyword. Those columns specified with `passthrough` + are added at the right to the output of the transformers. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.compose import ColumnTransformer + >>> from sklearn.preprocessing import Normalizer + >>> ct = ColumnTransformer( + ... [("norm1", Normalizer(norm='l1'), [0, 1]), + ... ("norm2", Normalizer(norm='l1'), slice(2, 4))]) + >>> X = np.array([[0., 1., 2., 2.], + ... [1., 1., 0., 1.]]) + >>> # Normalizer scales each row of X to unit norm. A separate scaling + >>> # is applied for the two first and two last elements of each + >>> # row independently. + >>> ct.fit_transform(X) + array([[0. 
, 1. , 0.5, 0.5], + [0.5, 0.5, 0. , 1. ]]) + + :class:`ColumnTransformer` can be configured with a transformer that requires + a 1d array by setting the column to a string: + + >>> from sklearn.feature_extraction.text import CountVectorizer + >>> from sklearn.preprocessing import MinMaxScaler + >>> import pandas as pd # doctest: +SKIP + >>> X = pd.DataFrame({ + ... "documents": ["First item", "second one here", "Is this the last?"], + ... "width": [3, 4, 5], + ... }) # doctest: +SKIP + >>> # "documents" is a string which configures ColumnTransformer to + >>> # pass the documents column as a 1d array to the CountVectorizer + >>> ct = ColumnTransformer( + ... [("text_preprocess", CountVectorizer(), "documents"), + ... ("num_preprocess", MinMaxScaler(), ["width"])]) + >>> X_trans = ct.fit_transform(X) # doctest: +SKIP + + For a more detailed example of usage, see + :ref:`sphx_glr_auto_examples_compose_plot_column_transformer_mixed_types.py`. + """ + + _parameter_constraints: dict = { + "transformers": [list, Hidden(tuple)], + "remainder": [ + StrOptions({"drop", "passthrough"}), + HasMethods(["fit", "transform"]), + HasMethods(["fit_transform", "transform"]), + ], + "sparse_threshold": [Interval(Real, 0, 1, closed="both")], + "n_jobs": [Integral, None], + "transformer_weights": [dict, None], + "verbose": ["verbose"], + "verbose_feature_names_out": ["boolean", str, callable], + "force_int_remainder_cols": ["boolean", Hidden(StrOptions({"deprecated"}))], + } + + def __init__( + self, + transformers, + *, + remainder="drop", + sparse_threshold=0.3, + n_jobs=None, + transformer_weights=None, + verbose=False, + verbose_feature_names_out=True, + force_int_remainder_cols="deprecated", + ): + self.transformers = transformers + self.remainder = remainder + self.sparse_threshold = sparse_threshold + self.n_jobs = n_jobs + self.transformer_weights = transformer_weights + self.verbose = verbose + self.verbose_feature_names_out = verbose_feature_names_out + self.force_int_remainder_cols = force_int_remainder_cols + + @property + def _transformers(self): + """ + Internal list of transformer only containing the name and + transformers, dropping the columns. + + DO NOT USE: This is for the implementation of get_params via + BaseComposition._get_params which expects lists of tuples of len 2. + + To iterate through the transformers, use ``self._iter`` instead. + """ + try: + return [(name, trans) for name, trans, _ in self.transformers] + except (TypeError, ValueError): + return self.transformers + + @_transformers.setter + def _transformers(self, value): + """DO NOT USE: This is for the implementation of set_params via + BaseComposition._get_params which gives lists of tuples of len 2. + """ + try: + self.transformers = [ + (name, trans, col) + for ((name, trans), (_, _, col)) in zip(value, self.transformers) + ] + except (TypeError, ValueError): + self.transformers = value + + def set_output(self, *, transform=None): + """Set the output container when `"transform"` and `"fit_transform"` are called. + + Calling `set_output` will set the output of all estimators in `transformers` + and `transformers_`. + + Parameters + ---------- + transform : {"default", "pandas", "polars"}, default=None + Configure output of `transform` and `fit_transform`. + + - `"default"`: Default output format of a transformer + - `"pandas"`: DataFrame output + - `"polars"`: Polars output + - `None`: Transform configuration is unchanged + + .. versionadded:: 1.4 + `"polars"` option was added. 
+ + Returns + ------- + self : estimator instance + Estimator instance. + """ + super().set_output(transform=transform) + + transformers = ( + trans + for _, trans, _ in chain( + self.transformers, getattr(self, "transformers_", []) + ) + if trans not in {"passthrough", "drop"} + ) + for trans in transformers: + _safe_set_output(trans, transform=transform) + + if self.remainder not in {"passthrough", "drop"}: + _safe_set_output(self.remainder, transform=transform) + + return self + + def get_params(self, deep=True): + """Get parameters for this estimator. + + Returns the parameters given in the constructor as well as the + estimators contained within the `transformers` of the + `ColumnTransformer`. + + Parameters + ---------- + deep : bool, default=True + If True, will return the parameters for this estimator and + contained subobjects that are estimators. + + Returns + ------- + params : dict + Parameter names mapped to their values. + """ + return self._get_params("_transformers", deep=deep) + + def set_params(self, **kwargs): + """Set the parameters of this estimator. + + Valid parameter keys can be listed with ``get_params()``. Note that you + can directly set the parameters of the estimators contained in + `transformers` of `ColumnTransformer`. + + Parameters + ---------- + **kwargs : dict + Estimator parameters. + + Returns + ------- + self : ColumnTransformer + This estimator. + """ + self._set_params("_transformers", **kwargs) + return self + + def _iter(self, fitted, column_as_labels, skip_drop, skip_empty_columns): + """ + Generate (name, trans, columns, weight) tuples. + + + Parameters + ---------- + fitted : bool + If True, use the fitted transformers (``self.transformers_``) to + iterate through transformers, else use the transformers passed by + the user (``self.transformers``). + + column_as_labels : bool + If True, columns are returned as string labels. If False, columns + are returned as they were given by the user. This can only be True + if the ``ColumnTransformer`` is already fitted. + + skip_drop : bool + If True, 'drop' transformers are filtered out. + + skip_empty_columns : bool + If True, transformers with empty selected columns are filtered out. + + Yields + ------ + A generator of tuples containing: + - name : the name of the transformer + - transformer : the transformer object + - columns : the columns for that transformer + - weight : the weight of the transformer + """ + if fitted: + transformers = self.transformers_ + else: + # interleave the validated column specifiers + transformers = [ + (name, trans, column) + for (name, trans, _), column in zip(self.transformers, self._columns) + ] + # add transformer tuple for remainder + if self._remainder[2]: + transformers = chain(transformers, [self._remainder]) + + get_weight = (self.transformer_weights or {}).get + + for name, trans, columns in transformers: + if skip_drop and trans == "drop": + continue + if skip_empty_columns and _is_empty_column_selection(columns): + continue + + if column_as_labels: + # Convert all columns to using their string labels + columns_is_scalar = np.isscalar(columns) + + indices = self._transformer_to_input_indices[name] + columns = self.feature_names_in_[indices] + + if columns_is_scalar: + # selection is done with one dimension + columns = columns[0] + + yield (name, trans, columns, get_weight(name)) + + def _validate_transformers(self): + """Validate names of transformers and the transformers themselves. + + This checks whether given transformers have the required methods, i.e. 
+ `fit` or `fit_transform` and `transform` implemented. + """ + if not self.transformers: + return + + names, transformers, _ = zip(*self.transformers) + + # validate names + self._validate_names(names) + + # validate estimators + for t in transformers: + if t in ("drop", "passthrough"): + continue + if not (hasattr(t, "fit") or hasattr(t, "fit_transform")) or not hasattr( + t, "transform" + ): + # Used to validate the transformers in the `transformers` list + raise TypeError( + "All estimators should implement fit and " + "transform, or can be 'drop' or 'passthrough' " + "specifiers. '%s' (type %s) doesn't." % (t, type(t)) + ) + + def _validate_column_callables(self, X): + """ + Converts callable column specifications. + + This stores a dictionary of the form `{step_name: column_indices}` and + calls the `columns` on `X` if `columns` is a callable for a given + transformer. + + The results are then stored in `self._transformer_to_input_indices`. + """ + all_columns = [] + transformer_to_input_indices = {} + for name, _, columns in self.transformers: + if callable(columns): + columns = columns(X) + all_columns.append(columns) + transformer_to_input_indices[name] = _get_column_indices(X, columns) + + self._columns = all_columns + self._transformer_to_input_indices = transformer_to_input_indices + + def _validate_remainder(self, X): + """ + Validates ``remainder`` and defines ``_remainder`` targeting + the remaining columns. + """ + cols = set(chain(*self._transformer_to_input_indices.values())) + remaining = sorted(set(range(self.n_features_in_)) - cols) + self._transformer_to_input_indices["remainder"] = remaining + remainder_cols = self._get_remainder_cols(remaining) + self._remainder = ("remainder", self.remainder, remainder_cols) + + def _get_remainder_cols_dtype(self): + try: + all_dtypes = {_determine_key_type(c) for (*_, c) in self.transformers} + if len(all_dtypes) == 1: + return next(iter(all_dtypes)) + except ValueError: + # _determine_key_type raises a ValueError if some transformer + # columns are Callables + return "int" + return "int" + + def _get_remainder_cols(self, indices): + dtype = self._get_remainder_cols_dtype() + if dtype == "str": + return list(self.feature_names_in_[indices]) + if dtype == "bool": + return [i in indices for i in range(self.n_features_in_)] + return indices + + @property + def named_transformers_(self): + """Access the fitted transformer by name. + + Read-only attribute to access any transformer by given name. + Keys are transformer names and values are the fitted transformer + objects. + """ + # Use Bunch object to improve autocomplete + return Bunch(**{name: trans for name, trans, _ in self.transformers_}) + + def _get_feature_name_out_for_transformer(self, name, trans, feature_names_in): + """Gets feature names of transformer. + + Used in conjunction with self._iter(fitted=True) in get_feature_names_out. + """ + column_indices = self._transformer_to_input_indices[name] + names = feature_names_in[column_indices] + # An actual transformer + if not hasattr(trans, "get_feature_names_out"): + raise AttributeError( + f"Transformer {name} (type {type(trans).__name__}) does " + "not provide get_feature_names_out." + ) + return trans.get_feature_names_out(names) + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. 
+ + - If `input_features` is `None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then the following input feature names are generated: + `["x0", "x1", ..., "x(n_features_in_ - 1)"]`. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. + """ + check_is_fitted(self) + input_features = _check_feature_names_in(self, input_features) + + # List of tuples (name, feature_names_out) + transformer_with_feature_names_out = [] + for name, trans, *_ in self._iter( + fitted=True, + column_as_labels=False, + skip_empty_columns=True, + skip_drop=True, + ): + feature_names_out = self._get_feature_name_out_for_transformer( + name, trans, input_features + ) + if feature_names_out is None: + continue + transformer_with_feature_names_out.append((name, feature_names_out)) + + if not transformer_with_feature_names_out: + # No feature names + return np.array([], dtype=object) + + return self._add_prefix_for_feature_names_out( + transformer_with_feature_names_out + ) + + def _add_prefix_for_feature_names_out(self, transformer_with_feature_names_out): + """Add prefix for feature names out that includes the transformer names. + + Parameters + ---------- + transformer_with_feature_names_out : list of tuples of (str, array-like of str) + The tuple consistent of the transformer's name and its feature names out. + + Returns + ------- + feature_names_out : ndarray of shape (n_features,), dtype=str + Transformed feature names. + """ + feature_names_out_callable = None + if callable(self.verbose_feature_names_out): + feature_names_out_callable = self.verbose_feature_names_out + elif isinstance(self.verbose_feature_names_out, str): + feature_names_out_callable = partial( + _feature_names_out_with_str_format, + str_format=self.verbose_feature_names_out, + ) + elif self.verbose_feature_names_out is True: + feature_names_out_callable = partial( + _feature_names_out_with_str_format, + str_format="{transformer_name}__{feature_name}", + ) + + if feature_names_out_callable is not None: + # Prefix the feature names out with the transformers name + names = list( + chain.from_iterable( + (feature_names_out_callable(name, i) for i in feature_names_out) + for name, feature_names_out in transformer_with_feature_names_out + ) + ) + return np.asarray(names, dtype=object) + + # verbose_feature_names_out is False + # Check that names are all unique without a prefix + feature_names_count = Counter( + chain.from_iterable(s for _, s in transformer_with_feature_names_out) + ) + top_6_overlap = [ + name for name, count in feature_names_count.most_common(6) if count > 1 + ] + top_6_overlap.sort() + if top_6_overlap: + if len(top_6_overlap) == 6: + # There are more than 5 overlapping names, we only show the 5 + # of the feature names + names_repr = str(top_6_overlap[:5])[:-1] + ", ...]" + else: + names_repr = str(top_6_overlap) + raise ValueError( + f"Output feature names: {names_repr} are not unique. Please set " + "verbose_feature_names_out=True to add prefixes to feature names" + ) + + return np.concatenate( + [name for _, name in transformer_with_feature_names_out], + ) + + def _update_fitted_transformers(self, transformers): + """Set self.transformers_ from given transformers. 
+ + Parameters + ---------- + transformers : list of estimators + The fitted estimators as the output of + `self._call_func_on_transformers(func=_fit_transform_one, ...)`. + That function doesn't include 'drop' or transformers for which no + column is selected. 'drop' is kept as is, and for the no-column + transformers the unfitted transformer is put in + `self.transformers_`. + """ + # transformers are fitted; excludes 'drop' cases + fitted_transformers = iter(transformers) + transformers_ = [] + + for name, old, column, _ in self._iter( + fitted=False, + column_as_labels=False, + skip_drop=False, + skip_empty_columns=False, + ): + if old == "drop": + trans = "drop" + elif _is_empty_column_selection(column): + trans = old + else: + trans = next(fitted_transformers) + transformers_.append((name, trans, column)) + + # sanity check that transformers is exhausted + assert not list(fitted_transformers) + self.transformers_ = transformers_ + + def _validate_output(self, result): + """ + Ensure that the output of each transformer is 2D. Otherwise + hstack can raise an error or produce incorrect results. + """ + names = [ + name + for name, _, _, _ in self._iter( + fitted=True, + column_as_labels=False, + skip_drop=True, + skip_empty_columns=True, + ) + ] + for Xs, name in zip(result, names): + if not getattr(Xs, "ndim", 0) == 2 and not hasattr(Xs, "__dataframe__"): + raise ValueError( + "The output of the '{0}' transformer should be 2D (numpy array, " + "scipy sparse array, dataframe).".format(name) + ) + if _get_output_config("transform", self)["dense"] == "pandas": + return + try: + import pandas as pd + except ImportError: + return + for Xs, name in zip(result, names): + if not _is_pandas_df(Xs): + continue + for col_name, dtype in Xs.dtypes.to_dict().items(): + if getattr(dtype, "na_value", None) is not pd.NA: + continue + if pd.NA not in Xs[col_name].values: + continue + class_name = self.__class__.__name__ + raise ValueError( + f"The output of the '{name}' transformer for column" + f" '{col_name}' has dtype {dtype} and uses pandas.NA to" + " represent null values. Storing this output in a numpy array" + " can cause errors in downstream scikit-learn estimators, and" + " inefficiencies. To avoid this problem you can (i)" + " store the output in a pandas DataFrame by using" + f" {class_name}.set_output(transform='pandas') or (ii) modify" + f" the input data or the '{name}' transformer to avoid the" + " presence of pandas.NA (for example by using" + " pandas.DataFrame.astype)." + ) + + def _record_output_indices(self, Xs): + """ + Record which transformer produced which column. + """ + idx = 0 + self.output_indices_ = {} + + for transformer_idx, (name, _, _, _) in enumerate( + self._iter( + fitted=True, + column_as_labels=False, + skip_drop=True, + skip_empty_columns=True, + ) + ): + n_columns = Xs[transformer_idx].shape[1] + self.output_indices_[name] = slice(idx, idx + n_columns) + idx += n_columns + + # `_iter` only generates transformers that have a non empty + # selection. 
Here we set empty slices for transformers that + # generate no output, which are safe for indexing + all_names = [t[0] for t in self.transformers] + ["remainder"] + for name in all_names: + if name not in self.output_indices_: + self.output_indices_[name] = slice(0, 0) + + def _log_message(self, name, idx, total): + if not self.verbose: + return None + return "(%d of %d) Processing %s" % (idx, total, name) + + def _call_func_on_transformers(self, X, y, func, column_as_labels, routed_params): + """ + Private function to fit and/or transform on demand. + + Parameters + ---------- + X : {array-like, dataframe} of shape (n_samples, n_features) + The data to be used in fit and/or transform. + + y : array-like of shape (n_samples,) + Targets. + + func : callable + Function to call, which can be _fit_transform_one or + _transform_one. + + column_as_labels : bool + Used to iterate through transformers. If True, columns are returned + as strings. If False, columns are returned as they were given by + the user. Can be True only if the ``ColumnTransformer`` is already + fitted. + + routed_params : dict + The routed parameters as the output from ``process_routing``. + + Returns + ------- + Return value (transformers and/or transformed X data) depends + on the passed function. + """ + if func is _fit_transform_one: + fitted = False + else: # func is _transform_one + fitted = True + + transformers = list( + self._iter( + fitted=fitted, + column_as_labels=column_as_labels, + skip_drop=True, + skip_empty_columns=True, + ) + ) + try: + jobs = [] + for idx, (name, trans, columns, weight) in enumerate(transformers, start=1): + if func is _fit_transform_one: + if trans == "passthrough": + output_config = _get_output_config("transform", self) + trans = FunctionTransformer( + accept_sparse=True, + check_inverse=False, + feature_names_out="one-to-one", + ).set_output(transform=output_config["dense"]) + + extra_args = dict( + message_clsname="ColumnTransformer", + message=self._log_message(name, idx, len(transformers)), + ) + else: # func is _transform_one + extra_args = {} + jobs.append( + delayed(func)( + transformer=clone(trans) if not fitted else trans, + X=_safe_indexing(X, columns, axis=1), + y=y, + weight=weight, + **extra_args, + params=routed_params[name], + ) + ) + + return Parallel(n_jobs=self.n_jobs)(jobs) + + except ValueError as e: + if "Expected 2D array, got 1D array instead" in str(e): + raise ValueError(_ERR_MSG_1DCOLUMN) from e + else: + raise + + def fit(self, X, y=None, **params): + """Fit all transformers using X. + + Parameters + ---------- + X : {array-like, dataframe} of shape (n_samples, n_features) + Input data, of which specified subsets are used to fit the + transformers. + + y : array-like of shape (n_samples,...), default=None + Targets for supervised learning. + + **params : dict, default=None + Parameters to be passed to the underlying transformers' ``fit`` and + ``transform`` methods. + + You can only pass this if metadata routing is enabled, which you + can enable using ``sklearn.set_config(enable_metadata_routing=True)``. + + .. versionadded:: 1.4 + + Returns + ------- + self : ColumnTransformer + This estimator. 
+ """ + _raise_for_params(params, self, "fit") + # we use fit_transform to make sure to set sparse_output_ (for which we + # need the transformed data) to have consistent output type in predict + self.fit_transform(X, y=y, **params) + return self + + @_fit_context( + # estimators in ColumnTransformer.transformers are not validated yet + prefer_skip_nested_validation=False + ) + def fit_transform(self, X, y=None, **params): + """Fit all transformers, transform the data and concatenate results. + + Parameters + ---------- + X : {array-like, dataframe} of shape (n_samples, n_features) + Input data, of which specified subsets are used to fit the + transformers. + + y : array-like of shape (n_samples,), default=None + Targets for supervised learning. + + **params : dict, default=None + Parameters to be passed to the underlying transformers' ``fit`` and + ``transform`` methods. + + You can only pass this if metadata routing is enabled, which you + can enable using ``sklearn.set_config(enable_metadata_routing=True)``. + + .. versionadded:: 1.4 + + Returns + ------- + X_t : {array-like, sparse matrix} of \ + shape (n_samples, sum_n_components) + Horizontally stacked results of transformers. sum_n_components is the + sum of n_components (output dimension) over transformers. If + any result is a sparse matrix, everything will be converted to + sparse matrices. + """ + _raise_for_params(params, self, "fit_transform") + _check_feature_names(self, X, reset=True) + + if self.force_int_remainder_cols != "deprecated": + warnings.warn( + "The parameter `force_int_remainder_cols` is deprecated and will be " + "removed in 1.9. It has no effect. Leave it to its default value to " + "avoid this warning.", + FutureWarning, + ) + + X = _check_X(X) + # set n_features_in_ attribute + _check_n_features(self, X, reset=True) + self._validate_transformers() + n_samples = _num_samples(X) + + self._validate_column_callables(X) + self._validate_remainder(X) + + if _routing_enabled(): + routed_params = process_routing(self, "fit_transform", **params) + else: + routed_params = self._get_empty_routing() + + result = self._call_func_on_transformers( + X, + y, + _fit_transform_one, + column_as_labels=False, + routed_params=routed_params, + ) + + if not result: + self._update_fitted_transformers([]) + # All transformers are None + return np.zeros((n_samples, 0)) + + Xs, transformers = zip(*result) + + # determine if concatenated output will be sparse or not + if any(sparse.issparse(X) for X in Xs): + nnz = sum(X.nnz if sparse.issparse(X) else X.size for X in Xs) + total = sum( + X.shape[0] * X.shape[1] if sparse.issparse(X) else X.size for X in Xs + ) + density = nnz / total + self.sparse_output_ = density < self.sparse_threshold + else: + self.sparse_output_ = False + + self._update_fitted_transformers(transformers) + self._validate_output(Xs) + self._record_output_indices(Xs) + + return self._hstack(list(Xs), n_samples=n_samples) + + def transform(self, X, **params): + """Transform X separately by each transformer, concatenate results. + + Parameters + ---------- + X : {array-like, dataframe} of shape (n_samples, n_features) + The data to be transformed by subset. + + **params : dict, default=None + Parameters to be passed to the underlying transformers' ``transform`` + method. + + You can only pass this if metadata routing is enabled, which you + can enable using ``sklearn.set_config(enable_metadata_routing=True)``. + + .. 
versionadded:: 1.4 + + Returns + ------- + X_t : {array-like, sparse matrix} of \ + shape (n_samples, sum_n_components) + Horizontally stacked results of transformers. sum_n_components is the + sum of n_components (output dimension) over transformers. If + any result is a sparse matrix, everything will be converted to + sparse matrices. + """ + _raise_for_params(params, self, "transform") + check_is_fitted(self) + X = _check_X(X) + + # If ColumnTransformer is fit using a dataframe, and now a dataframe is + # passed to be transformed, we select columns by name instead. This + # enables the user to pass X at transform time with extra columns which + # were not present in fit time, and the order of the columns doesn't + # matter. + fit_dataframe_and_transform_dataframe = hasattr(self, "feature_names_in_") and ( + _is_pandas_df(X) or hasattr(X, "__dataframe__") + ) + + n_samples = _num_samples(X) + column_names = _get_feature_names(X) + + if fit_dataframe_and_transform_dataframe: + named_transformers = self.named_transformers_ + # check that all names seen in fit are in transform, unless + # they were dropped + non_dropped_indices = [ + ind + for name, ind in self._transformer_to_input_indices.items() + if name in named_transformers and named_transformers[name] != "drop" + ] + + all_indices = set(chain(*non_dropped_indices)) + all_names = set(self.feature_names_in_[ind] for ind in all_indices) + + diff = all_names - set(column_names) + if diff: + raise ValueError(f"columns are missing: {diff}") + else: + # ndarray was used for fitting or transforming, thus we only + # check that n_features_in_ is consistent + _check_n_features(self, X, reset=False) + + if _routing_enabled(): + routed_params = process_routing(self, "transform", **params) + else: + routed_params = self._get_empty_routing() + + Xs = self._call_func_on_transformers( + X, + None, + _transform_one, + column_as_labels=fit_dataframe_and_transform_dataframe, + routed_params=routed_params, + ) + self._validate_output(Xs) + + if not Xs: + # All transformers are None + return np.zeros((n_samples, 0)) + + return self._hstack(list(Xs), n_samples=n_samples) + + def _hstack(self, Xs, *, n_samples): + """Stacks Xs horizontally. + + This allows subclasses to control the stacking behavior, while reusing + everything else from ColumnTransformer. + + Parameters + ---------- + Xs : list of {array-like, sparse matrix, dataframe} + The container to concatenate. + n_samples : int + The number of samples in the input data to checking the transformation + consistency. + """ + if self.sparse_output_: + try: + # since all columns should be numeric before stacking them + # in a sparse matrix, `check_array` is used for the + # dtype conversion if necessary. + converted_Xs = [ + check_array(X, accept_sparse=True, ensure_all_finite=False) + for X in Xs + ] + except ValueError as e: + raise ValueError( + "For a sparse output, all columns should " + "be a numeric or convertible to a numeric." 
+ ) from e + + return sparse.hstack(converted_Xs).tocsr() + else: + Xs = [f.toarray() if sparse.issparse(f) else f for f in Xs] + adapter = _get_container_adapter("transform", self) + if adapter and all(adapter.is_supported_container(X) for X in Xs): + # rename before stacking as it avoids to error on temporary duplicated + # columns + transformer_names = [ + t[0] + for t in self._iter( + fitted=True, + column_as_labels=False, + skip_drop=True, + skip_empty_columns=True, + ) + ] + feature_names_outs = [X.columns for X in Xs if X.shape[1] != 0] + if self.verbose_feature_names_out: + # `_add_prefix_for_feature_names_out` takes care about raising + # an error if there are duplicated columns. + feature_names_outs = self._add_prefix_for_feature_names_out( + list(zip(transformer_names, feature_names_outs)) + ) + else: + # check for duplicated columns and raise if any + feature_names_outs = list(chain.from_iterable(feature_names_outs)) + feature_names_count = Counter(feature_names_outs) + if any(count > 1 for count in feature_names_count.values()): + duplicated_feature_names = sorted( + name + for name, count in feature_names_count.items() + if count > 1 + ) + err_msg = ( + "Duplicated feature names found before concatenating the" + " outputs of the transformers:" + f" {duplicated_feature_names}.\n" + ) + for transformer_name, X in zip(transformer_names, Xs): + if X.shape[1] == 0: + continue + dup_cols_in_transformer = sorted( + set(X.columns).intersection(duplicated_feature_names) + ) + if len(dup_cols_in_transformer): + err_msg += ( + f"Transformer {transformer_name} has conflicting " + f"columns names: {dup_cols_in_transformer}.\n" + ) + raise ValueError( + err_msg + + "Either make sure that the transformers named above " + "do not generate columns with conflicting names or set " + "verbose_feature_names_out=True to automatically " + "prefix to the output feature names with the name " + "of the transformer to prevent any conflicting " + "names." + ) + + names_idx = 0 + for X in Xs: + if X.shape[1] == 0: + continue + names_out = feature_names_outs[names_idx : names_idx + X.shape[1]] + adapter.rename_columns(X, names_out) + names_idx += X.shape[1] + + output = adapter.hstack(Xs) + output_samples = output.shape[0] + if output_samples != n_samples: + raise ValueError( + "Concatenating DataFrames from the transformer's output lead to" + " an inconsistent number of samples. The output may have Pandas" + " Indexes that do not match, or that transformers are returning" + " number of samples which are not the same as the number input" + " samples." 
+ ) + + return output + + return np.hstack(Xs) + + def _sk_visual_block_(self): + if isinstance(self.remainder, str) and self.remainder == "drop": + transformers = self.transformers + elif hasattr(self, "_remainder"): + remainder_columns = self._remainder[2] + if ( + hasattr(self, "feature_names_in_") + and remainder_columns + and not all(isinstance(col, str) for col in remainder_columns) + ): + remainder_columns = self.feature_names_in_[remainder_columns].tolist() + transformers = chain( + self.transformers, [("remainder", self.remainder, remainder_columns)] + ) + else: + transformers = chain(self.transformers, [("remainder", self.remainder, "")]) + + names, transformers, name_details = zip(*transformers) + return _VisualBlock( + "parallel", transformers, names=names, name_details=name_details + ) + + def __getitem__(self, key): + try: + return self.named_transformers_[key] + except AttributeError as e: + raise TypeError( + "ColumnTransformer is subscriptable after it is fitted" + ) from e + except KeyError as e: + raise KeyError(f"'{key}' is not a valid transformer name") from e + + def _get_empty_routing(self): + """Return empty routing. + + Used while routing can be disabled. + + TODO: Remove when ``set_config(enable_metadata_routing=False)`` is no + more an option. + """ + return Bunch( + **{ + name: Bunch(**{method: {} for method in METHODS}) + for name, step, _, _ in self._iter( + fitted=False, + column_as_labels=False, + skip_drop=True, + skip_empty_columns=True, + ) + } + ) + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.4 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. + """ + router = MetadataRouter(owner=self.__class__.__name__) + # Here we don't care about which columns are used for which + # transformers, and whether or not a transformer is used at all, which + # might happen if no columns are selected for that transformer. We + # request all metadata requested by all transformers. + transformers = chain(self.transformers, [("remainder", self.remainder, None)]) + for name, step, _ in transformers: + method_mapping = MethodMapping() + if hasattr(step, "fit_transform"): + ( + method_mapping.add(caller="fit", callee="fit_transform").add( + caller="fit_transform", callee="fit_transform" + ) + ) + else: + ( + method_mapping.add(caller="fit", callee="fit") + .add(caller="fit", callee="transform") + .add(caller="fit_transform", callee="fit") + .add(caller="fit_transform", callee="transform") + ) + method_mapping.add(caller="transform", callee="transform") + router.add(method_mapping=method_mapping, **{name: step}) + + return router + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + try: + tags.input_tags.sparse = all( + get_tags(trans).input_tags.sparse + for name, trans, _ in self.transformers + if trans not in {"passthrough", "drop"} + ) + except Exception: + # If `transformers` does not comply with our API (list of tuples) + # then it will fail. In this case, we assume that `sparse` is False + # but the parameter validation will raise an error during `fit`. + pass # pragma: no cover + return tags + + +def _check_X(X): + """Use check_array only when necessary, e.g. 
on lists and other non-array-likes.""" + if ( + (hasattr(X, "__array__") and hasattr(X, "shape")) + or hasattr(X, "__dataframe__") + or sparse.issparse(X) + ): + return X + return check_array(X, ensure_all_finite="allow-nan", dtype=object) + + +def _is_empty_column_selection(column): + """ + Return True if the column selection is empty (empty list or all-False + boolean array). + + """ + if ( + hasattr(column, "dtype") + # Not necessarily a numpy dtype, can be a pandas dtype as well + and isinstance(column.dtype, np.dtype) + and np.issubdtype(column.dtype, np.bool_) + ): + return not column.any() + elif hasattr(column, "__len__"): + return len(column) == 0 or ( + all(isinstance(col, bool) for col in column) and not any(column) + ) + else: + return False + + +def _get_transformer_list(estimators): + """ + Construct (name, trans, column) tuples from list + + """ + transformers, columns = zip(*estimators) + names, _ = zip(*_name_estimators(transformers)) + + transformer_list = list(zip(names, transformers, columns)) + return transformer_list + + +# This function is not validated using validate_params because +# it's just a factory for ColumnTransformer. +def make_column_transformer( + *transformers, + remainder="drop", + sparse_threshold=0.3, + n_jobs=None, + verbose=False, + verbose_feature_names_out=True, + force_int_remainder_cols="deprecated", +): + """Construct a ColumnTransformer from the given transformers. + + This is a shorthand for the ColumnTransformer constructor; it does not + require, and does not permit, naming the transformers. Instead, they will + be given names automatically based on their types. It also does not allow + weighting with ``transformer_weights``. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + *transformers : tuples + Tuples of the form (transformer, columns) specifying the + transformer objects to be applied to subsets of the data. + + transformer : {'drop', 'passthrough'} or estimator + Estimator must support :term:`fit` and :term:`transform`. + Special-cased strings 'drop' and 'passthrough' are accepted as + well, to indicate to drop the columns or to pass them through + untransformed, respectively. + columns : str, array-like of str, int, array-like of int, slice, \ + array-like of bool or callable + Indexes the data on its second axis. Integers are interpreted as + positional columns, while strings can reference DataFrame columns + by name. A scalar string or int should be used where + ``transformer`` expects X to be a 1d array-like (vector), + otherwise a 2d array will be passed to the transformer. + A callable is passed the input data `X` and can return any of the + above. To select multiple columns by name or dtype, you can use + :obj:`make_column_selector`. + + remainder : {'drop', 'passthrough'} or estimator, default='drop' + By default, only the specified columns in `transformers` are + transformed and combined in the output, and the non-specified + columns are dropped. (default of ``'drop'``). + By specifying ``remainder='passthrough'``, all remaining columns that + were not specified in `transformers` will be automatically passed + through. This subset of columns is concatenated with the output of + the transformers. + By setting ``remainder`` to be an estimator, the remaining + non-specified columns will use the ``remainder`` estimator. The + estimator must support :term:`fit` and :term:`transform`. 
+ + sparse_threshold : float, default=0.3 + If the transformed output consists of a mix of sparse and dense data, + it will be stacked as a sparse matrix if the density is lower than this + value. Use ``sparse_threshold=0`` to always return dense. + When the transformed output consists of all sparse or all dense data, + the stacked result will be sparse or dense, respectively, and this + keyword will be ignored. + + n_jobs : int, default=None + Number of jobs to run in parallel. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + verbose : bool, default=False + If True, the time elapsed while fitting each transformer will be + printed as it is completed. + + verbose_feature_names_out : bool, default=True + If True, :meth:`ColumnTransformer.get_feature_names_out` will prefix + all feature names with the name of the transformer that generated that + feature. + If False, :meth:`ColumnTransformer.get_feature_names_out` will not + prefix any feature names and will error if feature names are not + unique. + + .. versionadded:: 1.0 + + force_int_remainder_cols : bool, default=True + This parameter has no effect. + + .. note:: + If you do not access the list of columns for the remainder columns + in the :attr:`ColumnTransformer.transformers_` fitted attribute, + you do not need to set this parameter. + + .. versionadded:: 1.5 + + .. versionchanged:: 1.7 + The default value for `force_int_remainder_cols` will change from + `True` to `False` in version 1.7. + + .. deprecated:: 1.7 + `force_int_remainder_cols` is deprecated and will be removed in version 1.9. + + Returns + ------- + ct : ColumnTransformer + Returns a :class:`ColumnTransformer` object. + + See Also + -------- + ColumnTransformer : Class that allows combining the + outputs of multiple transformer objects used on column subsets + of the data into a single feature space. + + Examples + -------- + >>> from sklearn.preprocessing import StandardScaler, OneHotEncoder + >>> from sklearn.compose import make_column_transformer + >>> make_column_transformer( + ... (StandardScaler(), ['numerical_column']), + ... (OneHotEncoder(), ['categorical_column'])) + ColumnTransformer(transformers=[('standardscaler', StandardScaler(...), + ['numerical_column']), + ('onehotencoder', OneHotEncoder(...), + ['categorical_column'])]) + """ + # transformer_weights keyword is not passed through because the user + # would need to know the automatically generated names of the transformers + transformer_list = _get_transformer_list(transformers) + return ColumnTransformer( + transformer_list, + n_jobs=n_jobs, + remainder=remainder, + sparse_threshold=sparse_threshold, + verbose=verbose, + verbose_feature_names_out=verbose_feature_names_out, + force_int_remainder_cols=force_int_remainder_cols, + ) + + +class make_column_selector: + """Create a callable to select columns to be used with + :class:`ColumnTransformer`. + + :func:`make_column_selector` can select columns based on datatype or the + columns name with a regex. When using multiple selection criteria, **all** + criteria must match for a column to be selected. + + For an example of how to use :func:`make_column_selector` within a + :class:`ColumnTransformer` to select columns based on data type (i.e. + `dtype`), refer to + :ref:`sphx_glr_auto_examples_compose_plot_column_transformer_mixed_types.py`. 
+ + Parameters + ---------- + pattern : str, default=None + Name of columns containing this regex pattern will be included. If + None, column selection will not be selected based on pattern. + + dtype_include : column dtype or list of column dtypes, default=None + A selection of dtypes to include. For more details, see + :meth:`pandas.DataFrame.select_dtypes`. + + dtype_exclude : column dtype or list of column dtypes, default=None + A selection of dtypes to exclude. For more details, see + :meth:`pandas.DataFrame.select_dtypes`. + + Returns + ------- + selector : callable + Callable for column selection to be used by a + :class:`ColumnTransformer`. + + See Also + -------- + ColumnTransformer : Class that allows combining the + outputs of multiple transformer objects used on column subsets + of the data into a single feature space. + + Examples + -------- + >>> from sklearn.preprocessing import StandardScaler, OneHotEncoder + >>> from sklearn.compose import make_column_transformer + >>> from sklearn.compose import make_column_selector + >>> import numpy as np + >>> import pandas as pd # doctest: +SKIP + >>> X = pd.DataFrame({'city': ['London', 'London', 'Paris', 'Sallisaw'], + ... 'rating': [5, 3, 4, 5]}) # doctest: +SKIP + >>> ct = make_column_transformer( + ... (StandardScaler(), + ... make_column_selector(dtype_include=np.number)), # rating + ... (OneHotEncoder(), + ... make_column_selector(dtype_include=object))) # city + >>> ct.fit_transform(X) # doctest: +SKIP + array([[ 0.90453403, 1. , 0. , 0. ], + [-1.50755672, 1. , 0. , 0. ], + [-0.30151134, 0. , 1. , 0. ], + [ 0.90453403, 0. , 0. , 1. ]]) + """ + + def __init__(self, pattern=None, *, dtype_include=None, dtype_exclude=None): + self.pattern = pattern + self.dtype_include = dtype_include + self.dtype_exclude = dtype_exclude + + def __call__(self, df): + """Callable for column selection to be used by a + :class:`ColumnTransformer`. + + Parameters + ---------- + df : dataframe of shape (n_features, n_samples) + DataFrame to select columns from. 
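+
+        Returns
+        -------
+        cols : list of str
+            Names of the selected columns (the list produced by
+            ``cols.tolist()`` in the method body below).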
+ """ + if not hasattr(df, "iloc"): + raise ValueError( + "make_column_selector can only be applied to pandas dataframes" + ) + df_row = df.iloc[:1] + if self.dtype_include is not None or self.dtype_exclude is not None: + df_row = df_row.select_dtypes( + include=self.dtype_include, exclude=self.dtype_exclude + ) + cols = df_row.columns + if self.pattern is not None: + cols = cols[cols.str.contains(self.pattern, regex=True)] + return cols.tolist() + + +def _feature_names_out_with_str_format( + transformer_name: str, feature_name: str, str_format: str +) -> str: + return str_format.format( + transformer_name=transformer_name, feature_name=feature_name + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/compose/_target.py b/.venv/lib/python3.12/site-packages/sklearn/compose/_target.py new file mode 100644 index 0000000000000000000000000000000000000000..7f713767b30cb8ce0cb5724f2252e427df05a788 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/compose/_target.py @@ -0,0 +1,397 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings + +import numpy as np + +from ..base import BaseEstimator, RegressorMixin, _fit_context, clone +from ..exceptions import NotFittedError +from ..linear_model import LinearRegression +from ..preprocessing import FunctionTransformer +from ..utils import Bunch, _safe_indexing, check_array +from ..utils._metadata_requests import ( + MetadataRouter, + MethodMapping, + _routing_enabled, + process_routing, +) +from ..utils._param_validation import HasMethods +from ..utils._tags import get_tags +from ..utils.validation import check_is_fitted + +__all__ = ["TransformedTargetRegressor"] + + +class TransformedTargetRegressor(RegressorMixin, BaseEstimator): + """Meta-estimator to regress on a transformed target. + + Useful for applying a non-linear transformation to the target `y` in + regression problems. This transformation can be given as a Transformer + such as the :class:`~sklearn.preprocessing.QuantileTransformer` or as a + function and its inverse such as `np.log` and `np.exp`. + + The computation during :meth:`fit` is:: + + regressor.fit(X, func(y)) + + or:: + + regressor.fit(X, transformer.transform(y)) + + The computation during :meth:`predict` is:: + + inverse_func(regressor.predict(X)) + + or:: + + transformer.inverse_transform(regressor.predict(X)) + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.20 + + Parameters + ---------- + regressor : object, default=None + Regressor object such as derived from + :class:`~sklearn.base.RegressorMixin`. This regressor will + automatically be cloned each time prior to fitting. If `regressor is + None`, :class:`~sklearn.linear_model.LinearRegression` is created and used. + + transformer : object, default=None + Estimator object such as derived from + :class:`~sklearn.base.TransformerMixin`. Cannot be set at the same time + as `func` and `inverse_func`. If `transformer is None` as well as + `func` and `inverse_func`, the transformer will be an identity + transformer. Note that the transformer will be cloned during fitting. + Also, the transformer is restricting `y` to be a numpy array. + + func : function, default=None + Function to apply to `y` before passing to :meth:`fit`. Cannot be set + at the same time as `transformer`. If `func is None`, the function used will be + the identity function. If `func` is set, `inverse_func` also needs to be + provided. The function needs to return a 2-dimensional array. 
+ + inverse_func : function, default=None + Function to apply to the prediction of the regressor. Cannot be set at + the same time as `transformer`. The inverse function is used to return + predictions to the same space of the original training labels. If + `inverse_func` is set, `func` also needs to be provided. The inverse + function needs to return a 2-dimensional array. + + check_inverse : bool, default=True + Whether to check that `transform` followed by `inverse_transform` + or `func` followed by `inverse_func` leads to the original targets. + + Attributes + ---------- + regressor_ : object + Fitted regressor. + + transformer_ : object + Transformer used in :meth:`fit` and :meth:`predict`. + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying regressor exposes such an attribute when fit. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + sklearn.preprocessing.FunctionTransformer : Construct a transformer from an + arbitrary callable. + + Notes + ----- + Internally, the target `y` is always converted into a 2-dimensional array + to be used by scikit-learn transformers. At the time of prediction, the + output will be reshaped to a have the same number of dimensions as `y`. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.linear_model import LinearRegression + >>> from sklearn.compose import TransformedTargetRegressor + >>> tt = TransformedTargetRegressor(regressor=LinearRegression(), + ... func=np.log, inverse_func=np.exp) + >>> X = np.arange(4).reshape(-1, 1) + >>> y = np.exp(2 * X).ravel() + >>> tt.fit(X, y) + TransformedTargetRegressor(...) + >>> tt.score(X, y) + 1.0 + >>> tt.regressor_.coef_ + array([2.]) + + For a more detailed example use case refer to + :ref:`sphx_glr_auto_examples_compose_plot_transformed_target.py`. + """ + + _parameter_constraints: dict = { + "regressor": [HasMethods(["fit", "predict"]), None], + "transformer": [HasMethods("transform"), None], + "func": [callable, None], + "inverse_func": [callable, None], + "check_inverse": ["boolean"], + } + + def __init__( + self, + regressor=None, + *, + transformer=None, + func=None, + inverse_func=None, + check_inverse=True, + ): + self.regressor = regressor + self.transformer = transformer + self.func = func + self.inverse_func = inverse_func + self.check_inverse = check_inverse + + def _fit_transformer(self, y): + """Check transformer and fit transformer. + + Create the default transformer, fit it and make additional inverse + check on a subset (optional). + + """ + if self.transformer is not None and ( + self.func is not None or self.inverse_func is not None + ): + raise ValueError( + "'transformer' and functions 'func'/'inverse_func' cannot both be set." + ) + elif self.transformer is not None: + self.transformer_ = clone(self.transformer) + else: + if (self.func is not None and self.inverse_func is None) or ( + self.func is None and self.inverse_func is not None + ): + lacking_param, existing_param = ( + ("func", "inverse_func") + if self.func is None + else ("inverse_func", "func") + ) + raise ValueError( + f"When '{existing_param}' is provided, '{lacking_param}' must also" + f" be provided. If {lacking_param} is supposed to be the default," + " you need to explicitly pass it the identity function." 
+ ) + self.transformer_ = FunctionTransformer( + func=self.func, + inverse_func=self.inverse_func, + validate=True, + check_inverse=self.check_inverse, + ) + # We are transforming the target here and not the features, so we set the + # output of FunctionTransformer() to be a numpy array (default) and to not + # depend on the global configuration: + self.transformer_.set_output(transform="default") + # XXX: sample_weight is not currently passed to the + # transformer. However, if transformer starts using sample_weight, the + # code should be modified accordingly. At the time to consider the + # sample_prop feature, it is also a good use case to be considered. + self.transformer_.fit(y) + if self.check_inverse: + idx_selected = slice(None, None, max(1, y.shape[0] // 10)) + y_sel = _safe_indexing(y, idx_selected) + y_sel_t = self.transformer_.transform(y_sel) + if not np.allclose(y_sel, self.transformer_.inverse_transform(y_sel_t)): + warnings.warn( + ( + "The provided functions or transformer are" + " not strictly inverse of each other. If" + " you are sure you want to proceed regardless" + ", set 'check_inverse=False'" + ), + UserWarning, + ) + + @_fit_context( + # TransformedTargetRegressor.regressor/transformer are not validated yet. + prefer_skip_nested_validation=False + ) + def fit(self, X, y, **fit_params): + """Fit the model according to the given training data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) + Target values. + + **fit_params : dict + - If `enable_metadata_routing=False` (default): Parameters directly passed + to the `fit` method of the underlying regressor. + + - If `enable_metadata_routing=True`: Parameters safely routed to the `fit` + method of the underlying regressor. + + .. versionchanged:: 1.6 + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + Fitted estimator. + """ + if y is None: + raise ValueError( + f"This {self.__class__.__name__} estimator " + "requires y to be passed, but the target y is None." + ) + y = check_array( + y, + input_name="y", + accept_sparse=False, + ensure_all_finite=True, + ensure_2d=False, + dtype="numeric", + allow_nd=True, + ) + + # store the number of dimension of the target to predict an array of + # similar shape at predict + self._training_dim = y.ndim + + # transformers are designed to modify X which is 2d dimensional, we + # need to modify y accordingly. + if y.ndim == 1: + y_2d = y.reshape(-1, 1) + else: + y_2d = y + self._fit_transformer(y_2d) + + # transform y and convert back to 1d array if needed + y_trans = self.transformer_.transform(y_2d) + # FIXME: a FunctionTransformer can return a 1D array even when validate + # is set to True. Therefore, we need to check the number of dimension + # first. 
+ if y_trans.ndim == 2 and y_trans.shape[1] == 1 and self._training_dim == 1: + y_trans = y_trans.squeeze(axis=1) + + self.regressor_ = self._get_regressor(get_clone=True) + if _routing_enabled(): + routed_params = process_routing(self, "fit", **fit_params) + else: + routed_params = Bunch(regressor=Bunch(fit=fit_params)) + + self.regressor_.fit(X, y_trans, **routed_params.regressor.fit) + + if hasattr(self.regressor_, "feature_names_in_"): + self.feature_names_in_ = self.regressor_.feature_names_in_ + + return self + + def predict(self, X, **predict_params): + """Predict using the base regressor, applying inverse. + + The regressor is used to predict and the `inverse_func` or + `inverse_transform` is applied before returning the prediction. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Samples. + + **predict_params : dict of str -> object + - If `enable_metadata_routing=False` (default): Parameters directly passed + to the `predict` method of the underlying regressor. + + - If `enable_metadata_routing=True`: Parameters safely routed to the + `predict` method of the underlying regressor. + + .. versionchanged:: 1.6 + See :ref:`Metadata Routing User Guide ` + for more details. + + Returns + ------- + y_hat : ndarray of shape (n_samples,) + Predicted values. + """ + check_is_fitted(self) + if _routing_enabled(): + routed_params = process_routing(self, "predict", **predict_params) + else: + routed_params = Bunch(regressor=Bunch(predict=predict_params)) + + pred = self.regressor_.predict(X, **routed_params.regressor.predict) + if pred.ndim == 1: + pred_trans = self.transformer_.inverse_transform(pred.reshape(-1, 1)) + else: + pred_trans = self.transformer_.inverse_transform(pred) + if ( + self._training_dim == 1 + and pred_trans.ndim == 2 + and pred_trans.shape[1] == 1 + ): + pred_trans = pred_trans.squeeze(axis=1) + + return pred_trans + + def __sklearn_tags__(self): + regressor = self._get_regressor() + tags = super().__sklearn_tags__() + tags.regressor_tags.poor_score = True + tags.input_tags.sparse = get_tags(regressor).input_tags.sparse + tags.target_tags.multi_output = get_tags(regressor).target_tags.multi_output + return tags + + @property + def n_features_in_(self): + """Number of features seen during :term:`fit`.""" + # For consistency with other estimators we raise a AttributeError so + # that hasattr() returns False the estimator isn't fitted. + try: + check_is_fitted(self) + except NotFittedError as nfe: + raise AttributeError( + "{} object has no n_features_in_ attribute.".format( + self.__class__.__name__ + ) + ) from nfe + + return self.regressor_.n_features_in_ + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.6 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. 
+ """ + router = MetadataRouter(owner=self.__class__.__name__).add( + regressor=self._get_regressor(), + method_mapping=MethodMapping() + .add(caller="fit", callee="fit") + .add(caller="predict", callee="predict"), + ) + return router + + def _get_regressor(self, get_clone=False): + if self.regressor is None: + return LinearRegression() + + return clone(self.regressor) if get_clone else self.regressor diff --git a/.venv/lib/python3.12/site-packages/sklearn/compose/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/compose/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/compose/tests/test_column_transformer.py b/.venv/lib/python3.12/site-packages/sklearn/compose/tests/test_column_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..c7c69c657f2eab27e71df599d7507be8effe73cc --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/compose/tests/test_column_transformer.py @@ -0,0 +1,2804 @@ +""" +Test the ColumnTransformer. +""" + +import pickle +import re +import warnings + +import joblib +import numpy as np +import pytest +from numpy.testing import assert_allclose +from scipy import sparse + +from sklearn import config_context +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.compose import ( + ColumnTransformer, + make_column_selector, + make_column_transformer, +) +from sklearn.exceptions import NotFittedError +from sklearn.feature_extraction import DictVectorizer +from sklearn.feature_selection import VarianceThreshold +from sklearn.preprocessing import ( + FunctionTransformer, + Normalizer, + OneHotEncoder, + StandardScaler, +) +from sklearn.tests.metadata_routing_common import ( + ConsumingTransformer, + _Registry, + check_recorded_metadata, +) +from sklearn.utils._indexing import _safe_indexing +from sklearn.utils._testing import ( + _convert_container, + assert_allclose_dense_sparse, + assert_almost_equal, + assert_array_equal, +) +from sklearn.utils.fixes import CSR_CONTAINERS, parse_version + + +class Trans(TransformerMixin, BaseEstimator): + def fit(self, X, y=None): + return self + + def transform(self, X, y=None): + # 1D Series -> 2D DataFrame + if hasattr(X, "to_frame"): + return X.to_frame() + # 1D array -> 2D array + if getattr(X, "ndim", 2) == 1: + return np.atleast_2d(X).T + return X + + +class DoubleTrans(BaseEstimator): + def fit(self, X, y=None): + return self + + def transform(self, X): + return 2 * X + + +class SparseMatrixTrans(BaseEstimator): + def __init__(self, csr_container): + self.csr_container = csr_container + + def fit(self, X, y=None): + return self + + def transform(self, X, y=None): + n_samples = len(X) + return self.csr_container(sparse.eye(n_samples, n_samples)) + + +class TransNo2D(BaseEstimator): + def fit(self, X, y=None): + return self + + def transform(self, X, y=None): + return X + + +class TransRaise(BaseEstimator): + def fit(self, X, y=None): + raise ValueError("specific message") + + def transform(self, X, y=None): + raise ValueError("specific message") + + +def test_column_transformer(): + X_array = np.array([[0, 1, 2], [2, 4, 6]]).T + + X_res_first1D = np.array([0, 1, 2]) + X_res_second1D = np.array([2, 4, 6]) + X_res_first = X_res_first1D.reshape(-1, 1) + X_res_both = X_array + + cases = [ + # single column 1D / 2D + (0, X_res_first), + ([0], X_res_first), + # list-like + ([0, 1], X_res_both), + (np.array([0, 1]), X_res_both), + # slice + 
(slice(0, 1), X_res_first), + (slice(0, 2), X_res_both), + # boolean mask + (np.array([True, False]), X_res_first), + ([True, False], X_res_first), + (np.array([True, True]), X_res_both), + ([True, True], X_res_both), + ] + + for selection, res in cases: + ct = ColumnTransformer([("trans", Trans(), selection)], remainder="drop") + assert_array_equal(ct.fit_transform(X_array), res) + assert_array_equal(ct.fit(X_array).transform(X_array), res) + + # callable that returns any of the allowed specifiers + ct = ColumnTransformer( + [("trans", Trans(), lambda x: selection)], remainder="drop" + ) + assert_array_equal(ct.fit_transform(X_array), res) + assert_array_equal(ct.fit(X_array).transform(X_array), res) + + ct = ColumnTransformer([("trans1", Trans(), [0]), ("trans2", Trans(), [1])]) + assert_array_equal(ct.fit_transform(X_array), X_res_both) + assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both) + assert len(ct.transformers_) == 2 + + # test with transformer_weights + transformer_weights = {"trans1": 0.1, "trans2": 10} + both = ColumnTransformer( + [("trans1", Trans(), [0]), ("trans2", Trans(), [1])], + transformer_weights=transformer_weights, + ) + res = np.vstack( + [ + transformer_weights["trans1"] * X_res_first1D, + transformer_weights["trans2"] * X_res_second1D, + ] + ).T + assert_array_equal(both.fit_transform(X_array), res) + assert_array_equal(both.fit(X_array).transform(X_array), res) + assert len(both.transformers_) == 2 + + both = ColumnTransformer( + [("trans", Trans(), [0, 1])], transformer_weights={"trans": 0.1} + ) + assert_array_equal(both.fit_transform(X_array), 0.1 * X_res_both) + assert_array_equal(both.fit(X_array).transform(X_array), 0.1 * X_res_both) + assert len(both.transformers_) == 1 + + +def test_column_transformer_tuple_transformers_parameter(): + X_array = np.array([[0, 1, 2], [2, 4, 6]]).T + + transformers = [("trans1", Trans(), [0]), ("trans2", Trans(), [1])] + + ct_with_list = ColumnTransformer(transformers) + ct_with_tuple = ColumnTransformer(tuple(transformers)) + + assert_array_equal( + ct_with_list.fit_transform(X_array), ct_with_tuple.fit_transform(X_array) + ) + assert_array_equal( + ct_with_list.fit(X_array).transform(X_array), + ct_with_tuple.fit(X_array).transform(X_array), + ) + + +@pytest.mark.parametrize("constructor_name", ["dataframe", "polars"]) +def test_column_transformer_dataframe(constructor_name): + if constructor_name == "dataframe": + dataframe_lib = pytest.importorskip("pandas") + else: + dataframe_lib = pytest.importorskip(constructor_name) + + X_array = np.array([[0, 1, 2], [2, 4, 6]]).T + X_df = _convert_container( + X_array, constructor_name, columns_name=["first", "second"] + ) + + X_res_first = np.array([0, 1, 2]).reshape(-1, 1) + X_res_both = X_array + + cases = [ + # String keys: label based + # list + (["first"], X_res_first), + (["first", "second"], X_res_both), + # slice + (slice("first", "second"), X_res_both), + # int keys: positional + # list + ([0], X_res_first), + ([0, 1], X_res_both), + (np.array([0, 1]), X_res_both), + # slice + (slice(0, 1), X_res_first), + (slice(0, 2), X_res_both), + # boolean mask + (np.array([True, False]), X_res_first), + ([True, False], X_res_first), + ] + if constructor_name == "dataframe": + # Scalars are only supported for pandas dataframes. 
+ cases.extend( + [ + # scalar + (0, X_res_first), + ("first", X_res_first), + ( + dataframe_lib.Series([True, False], index=["first", "second"]), + X_res_first, + ), + ] + ) + + for selection, res in cases: + ct = ColumnTransformer([("trans", Trans(), selection)], remainder="drop") + assert_array_equal(ct.fit_transform(X_df), res) + assert_array_equal(ct.fit(X_df).transform(X_df), res) + + # callable that returns any of the allowed specifiers + ct = ColumnTransformer( + [("trans", Trans(), lambda X: selection)], remainder="drop" + ) + assert_array_equal(ct.fit_transform(X_df), res) + assert_array_equal(ct.fit(X_df).transform(X_df), res) + + ct = ColumnTransformer( + [("trans1", Trans(), ["first"]), ("trans2", Trans(), ["second"])] + ) + assert_array_equal(ct.fit_transform(X_df), X_res_both) + assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both) + assert len(ct.transformers_) == 2 + assert ct.transformers_[-1][0] != "remainder" + + ct = ColumnTransformer([("trans1", Trans(), [0]), ("trans2", Trans(), [1])]) + assert_array_equal(ct.fit_transform(X_df), X_res_both) + assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both) + assert len(ct.transformers_) == 2 + assert ct.transformers_[-1][0] != "remainder" + + # test with transformer_weights + transformer_weights = {"trans1": 0.1, "trans2": 10} + both = ColumnTransformer( + [("trans1", Trans(), ["first"]), ("trans2", Trans(), ["second"])], + transformer_weights=transformer_weights, + ) + res = np.vstack( + [ + transformer_weights["trans1"] * X_df["first"], + transformer_weights["trans2"] * X_df["second"], + ] + ).T + assert_array_equal(both.fit_transform(X_df), res) + assert_array_equal(both.fit(X_df).transform(X_df), res) + assert len(both.transformers_) == 2 + assert both.transformers_[-1][0] != "remainder" + + # test multiple columns + both = ColumnTransformer( + [("trans", Trans(), ["first", "second"])], transformer_weights={"trans": 0.1} + ) + assert_array_equal(both.fit_transform(X_df), 0.1 * X_res_both) + assert_array_equal(both.fit(X_df).transform(X_df), 0.1 * X_res_both) + assert len(both.transformers_) == 1 + assert both.transformers_[-1][0] != "remainder" + + both = ColumnTransformer( + [("trans", Trans(), [0, 1])], transformer_weights={"trans": 0.1} + ) + assert_array_equal(both.fit_transform(X_df), 0.1 * X_res_both) + assert_array_equal(both.fit(X_df).transform(X_df), 0.1 * X_res_both) + assert len(both.transformers_) == 1 + assert both.transformers_[-1][0] != "remainder" + + # ensure pandas object is passed through + + class TransAssert(BaseEstimator): + def __init__(self, expected_type_transform): + self.expected_type_transform = expected_type_transform + + def fit(self, X, y=None): + return self + + def transform(self, X, y=None): + assert isinstance(X, self.expected_type_transform) + if isinstance(X, dataframe_lib.Series): + X = X.to_frame() + return X + + ct = ColumnTransformer( + [ + ( + "trans", + TransAssert(expected_type_transform=dataframe_lib.DataFrame), + ["first", "second"], + ) + ] + ) + ct.fit_transform(X_df) + + if constructor_name == "dataframe": + # DataFrame protocol does not have 1d columns, so we only test on Pandas + # dataframes. 
+ ct = ColumnTransformer( + [ + ( + "trans", + TransAssert(expected_type_transform=dataframe_lib.Series), + "first", + ) + ], + remainder="drop", + ) + ct.fit_transform(X_df) + + # Only test on pandas because the dataframe protocol requires string column + # names + # integer column spec + integer column names -> still use positional + X_df2 = X_df.copy() + X_df2.columns = [1, 0] + ct = ColumnTransformer([("trans", Trans(), 0)], remainder="drop") + assert_array_equal(ct.fit_transform(X_df2), X_res_first) + assert_array_equal(ct.fit(X_df2).transform(X_df2), X_res_first) + + assert len(ct.transformers_) == 2 + assert ct.transformers_[-1][0] == "remainder" + assert ct.transformers_[-1][1] == "drop" + assert_array_equal(ct.transformers_[-1][2], [1]) + + +@pytest.mark.parametrize("pandas", [True, False], ids=["pandas", "numpy"]) +@pytest.mark.parametrize( + "column_selection", + [[], np.array([False, False]), [False, False]], + ids=["list", "bool", "bool_int"], +) +@pytest.mark.parametrize("callable_column", [False, True]) +def test_column_transformer_empty_columns(pandas, column_selection, callable_column): + # test case that ensures that the column transformer does also work when + # a given transformer doesn't have any columns to work on + X_array = np.array([[0, 1, 2], [2, 4, 6]]).T + X_res_both = X_array + + if pandas: + pd = pytest.importorskip("pandas") + X = pd.DataFrame(X_array, columns=["first", "second"]) + else: + X = X_array + + if callable_column: + column = lambda X: column_selection + else: + column = column_selection + + ct = ColumnTransformer( + [("trans1", Trans(), [0, 1]), ("trans2", TransRaise(), column)] + ) + assert_array_equal(ct.fit_transform(X), X_res_both) + assert_array_equal(ct.fit(X).transform(X), X_res_both) + assert len(ct.transformers_) == 2 + assert isinstance(ct.transformers_[1][1], TransRaise) + + ct = ColumnTransformer( + [("trans1", TransRaise(), column), ("trans2", Trans(), [0, 1])] + ) + assert_array_equal(ct.fit_transform(X), X_res_both) + assert_array_equal(ct.fit(X).transform(X), X_res_both) + assert len(ct.transformers_) == 2 + assert isinstance(ct.transformers_[0][1], TransRaise) + + ct = ColumnTransformer([("trans", TransRaise(), column)], remainder="passthrough") + assert_array_equal(ct.fit_transform(X), X_res_both) + assert_array_equal(ct.fit(X).transform(X), X_res_both) + assert len(ct.transformers_) == 2 # including remainder + assert isinstance(ct.transformers_[0][1], TransRaise) + + fixture = np.array([[], [], []]) + ct = ColumnTransformer([("trans", TransRaise(), column)], remainder="drop") + assert_array_equal(ct.fit_transform(X), fixture) + assert_array_equal(ct.fit(X).transform(X), fixture) + assert len(ct.transformers_) == 2 # including remainder + assert isinstance(ct.transformers_[0][1], TransRaise) + + +def test_column_transformer_output_indices(): + # Checks for the output_indices_ attribute + X_array = np.arange(6).reshape(3, 2) + + ct = ColumnTransformer([("trans1", Trans(), [0]), ("trans2", Trans(), [1])]) + X_trans = ct.fit_transform(X_array) + assert ct.output_indices_ == { + "trans1": slice(0, 1), + "trans2": slice(1, 2), + "remainder": slice(0, 0), + } + assert_array_equal(X_trans[:, [0]], X_trans[:, ct.output_indices_["trans1"]]) + assert_array_equal(X_trans[:, [1]], X_trans[:, ct.output_indices_["trans2"]]) + + # test with transformer_weights and multiple columns + ct = ColumnTransformer( + [("trans", Trans(), [0, 1])], transformer_weights={"trans": 0.1} + ) + X_trans = ct.fit_transform(X_array) + assert ct.output_indices_ == 
{"trans": slice(0, 2), "remainder": slice(0, 0)} + assert_array_equal(X_trans[:, [0, 1]], X_trans[:, ct.output_indices_["trans"]]) + assert_array_equal(X_trans[:, []], X_trans[:, ct.output_indices_["remainder"]]) + + # test case that ensures that the attribute does also work when + # a given transformer doesn't have any columns to work on + ct = ColumnTransformer([("trans1", Trans(), [0, 1]), ("trans2", TransRaise(), [])]) + X_trans = ct.fit_transform(X_array) + assert ct.output_indices_ == { + "trans1": slice(0, 2), + "trans2": slice(0, 0), + "remainder": slice(0, 0), + } + assert_array_equal(X_trans[:, [0, 1]], X_trans[:, ct.output_indices_["trans1"]]) + assert_array_equal(X_trans[:, []], X_trans[:, ct.output_indices_["trans2"]]) + assert_array_equal(X_trans[:, []], X_trans[:, ct.output_indices_["remainder"]]) + + ct = ColumnTransformer([("trans", TransRaise(), [])], remainder="passthrough") + X_trans = ct.fit_transform(X_array) + assert ct.output_indices_ == {"trans": slice(0, 0), "remainder": slice(0, 2)} + assert_array_equal(X_trans[:, []], X_trans[:, ct.output_indices_["trans"]]) + assert_array_equal(X_trans[:, [0, 1]], X_trans[:, ct.output_indices_["remainder"]]) + + +def test_column_transformer_output_indices_df(): + # Checks for the output_indices_ attribute with data frames + pd = pytest.importorskip("pandas") + + X_df = pd.DataFrame(np.arange(6).reshape(3, 2), columns=["first", "second"]) + + ct = ColumnTransformer( + [("trans1", Trans(), ["first"]), ("trans2", Trans(), ["second"])] + ) + X_trans = ct.fit_transform(X_df) + assert ct.output_indices_ == { + "trans1": slice(0, 1), + "trans2": slice(1, 2), + "remainder": slice(0, 0), + } + assert_array_equal(X_trans[:, [0]], X_trans[:, ct.output_indices_["trans1"]]) + assert_array_equal(X_trans[:, [1]], X_trans[:, ct.output_indices_["trans2"]]) + assert_array_equal(X_trans[:, []], X_trans[:, ct.output_indices_["remainder"]]) + + ct = ColumnTransformer([("trans1", Trans(), [0]), ("trans2", Trans(), [1])]) + X_trans = ct.fit_transform(X_df) + assert ct.output_indices_ == { + "trans1": slice(0, 1), + "trans2": slice(1, 2), + "remainder": slice(0, 0), + } + assert_array_equal(X_trans[:, [0]], X_trans[:, ct.output_indices_["trans1"]]) + assert_array_equal(X_trans[:, [1]], X_trans[:, ct.output_indices_["trans2"]]) + assert_array_equal(X_trans[:, []], X_trans[:, ct.output_indices_["remainder"]]) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_column_transformer_sparse_array(csr_container): + X_sparse = csr_container(sparse.eye(3, 2)) + + # no distinction between 1D and 2D + X_res_first = X_sparse[:, [0]] + X_res_both = X_sparse + + for col in [(0,), [0], slice(0, 1)]: + for remainder, res in [("drop", X_res_first), ("passthrough", X_res_both)]: + ct = ColumnTransformer( + [("trans", Trans(), col)], remainder=remainder, sparse_threshold=0.8 + ) + assert sparse.issparse(ct.fit_transform(X_sparse)) + assert_allclose_dense_sparse(ct.fit_transform(X_sparse), res) + assert_allclose_dense_sparse(ct.fit(X_sparse).transform(X_sparse), res) + + for col in [[0, 1], slice(0, 2)]: + ct = ColumnTransformer([("trans", Trans(), col)], sparse_threshold=0.8) + assert sparse.issparse(ct.fit_transform(X_sparse)) + assert_allclose_dense_sparse(ct.fit_transform(X_sparse), X_res_both) + assert_allclose_dense_sparse(ct.fit(X_sparse).transform(X_sparse), X_res_both) + + +def test_column_transformer_list(): + X_list = [[1, float("nan"), "a"], [0, 0, "b"]] + expected_result = np.array( + [ + [1, float("nan"), 1, 0], + [-1, 0, 0, 1], + ] + ) 
+ + ct = ColumnTransformer( + [ + ("numerical", StandardScaler(), [0, 1]), + ("categorical", OneHotEncoder(), [2]), + ] + ) + + assert_array_equal(ct.fit_transform(X_list), expected_result) + assert_array_equal(ct.fit(X_list).transform(X_list), expected_result) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_column_transformer_sparse_stacking(csr_container): + X_array = np.array([[0, 1, 2], [2, 4, 6]]).T + col_trans = ColumnTransformer( + [("trans1", Trans(), [0]), ("trans2", SparseMatrixTrans(csr_container), 1)], + sparse_threshold=0.8, + ) + col_trans.fit(X_array) + X_trans = col_trans.transform(X_array) + assert sparse.issparse(X_trans) + assert X_trans.shape == (X_trans.shape[0], X_trans.shape[0] + 1) + assert_array_equal(X_trans.toarray()[:, 1:], np.eye(X_trans.shape[0])) + assert len(col_trans.transformers_) == 2 + assert col_trans.transformers_[-1][0] != "remainder" + + col_trans = ColumnTransformer( + [("trans1", Trans(), [0]), ("trans2", SparseMatrixTrans(csr_container), 1)], + sparse_threshold=0.1, + ) + col_trans.fit(X_array) + X_trans = col_trans.transform(X_array) + assert not sparse.issparse(X_trans) + assert X_trans.shape == (X_trans.shape[0], X_trans.shape[0] + 1) + assert_array_equal(X_trans[:, 1:], np.eye(X_trans.shape[0])) + + +def test_column_transformer_mixed_cols_sparse(): + df = np.array([["a", 1, True], ["b", 2, False]], dtype="O") + + ct = make_column_transformer( + (OneHotEncoder(), [0]), ("passthrough", [1, 2]), sparse_threshold=1.0 + ) + + # this shouldn't fail, since boolean can be coerced into a numeric + # See: https://github.com/scikit-learn/scikit-learn/issues/11912 + X_trans = ct.fit_transform(df) + assert X_trans.format == "csr" + assert_array_equal(X_trans.toarray(), np.array([[1, 0, 1, 1], [0, 1, 2, 0]])) + + ct = make_column_transformer( + (OneHotEncoder(), [0]), ("passthrough", [0]), sparse_threshold=1.0 + ) + with pytest.raises(ValueError, match="For a sparse output, all columns should"): + # this fails since strings `a` and `b` cannot be + # coerced into a numeric. 
+ ct.fit_transform(df) + + +def test_column_transformer_sparse_threshold(): + X_array = np.array([["a", "b"], ["A", "B"]], dtype=object).T + # above data has sparsity of 4 / 8 = 0.5 + + # apply threshold even if all sparse + col_trans = ColumnTransformer( + [("trans1", OneHotEncoder(), [0]), ("trans2", OneHotEncoder(), [1])], + sparse_threshold=0.2, + ) + res = col_trans.fit_transform(X_array) + assert not sparse.issparse(res) + assert not col_trans.sparse_output_ + + # mixed -> sparsity of (4 + 2) / 8 = 0.75 + for thres in [0.75001, 1]: + col_trans = ColumnTransformer( + [ + ("trans1", OneHotEncoder(sparse_output=True), [0]), + ("trans2", OneHotEncoder(sparse_output=False), [1]), + ], + sparse_threshold=thres, + ) + res = col_trans.fit_transform(X_array) + assert sparse.issparse(res) + assert col_trans.sparse_output_ + + for thres in [0.75, 0]: + col_trans = ColumnTransformer( + [ + ("trans1", OneHotEncoder(sparse_output=True), [0]), + ("trans2", OneHotEncoder(sparse_output=False), [1]), + ], + sparse_threshold=thres, + ) + res = col_trans.fit_transform(X_array) + assert not sparse.issparse(res) + assert not col_trans.sparse_output_ + + # if nothing is sparse -> no sparse + for thres in [0.33, 0, 1]: + col_trans = ColumnTransformer( + [ + ("trans1", OneHotEncoder(sparse_output=False), [0]), + ("trans2", OneHotEncoder(sparse_output=False), [1]), + ], + sparse_threshold=thres, + ) + res = col_trans.fit_transform(X_array) + assert not sparse.issparse(res) + assert not col_trans.sparse_output_ + + +def test_column_transformer_error_msg_1D(): + X_array = np.array([[0.0, 1.0, 2.0], [2.0, 4.0, 6.0]]).T + + col_trans = ColumnTransformer([("trans", StandardScaler(), 0)]) + msg = "1D data passed to a transformer" + with pytest.raises(ValueError, match=msg): + col_trans.fit(X_array) + + with pytest.raises(ValueError, match=msg): + col_trans.fit_transform(X_array) + + col_trans = ColumnTransformer([("trans", TransRaise(), 0)]) + for func in [col_trans.fit, col_trans.fit_transform]: + with pytest.raises(ValueError, match="specific message"): + func(X_array) + + +def test_2D_transformer_output(): + X_array = np.array([[0, 1, 2], [2, 4, 6]]).T + + # if one transformer is dropped, test that name is still correct + ct = ColumnTransformer([("trans1", "drop", 0), ("trans2", TransNo2D(), 1)]) + + msg = "the 'trans2' transformer should be 2D" + with pytest.raises(ValueError, match=msg): + ct.fit_transform(X_array) + # because fit is also doing transform, this raises already on fit + with pytest.raises(ValueError, match=msg): + ct.fit(X_array) + + +def test_2D_transformer_output_pandas(): + pd = pytest.importorskip("pandas") + + X_array = np.array([[0, 1, 2], [2, 4, 6]]).T + X_df = pd.DataFrame(X_array, columns=["col1", "col2"]) + + # if one transformer is dropped, test that name is still correct + ct = ColumnTransformer([("trans1", TransNo2D(), "col1")]) + msg = "the 'trans1' transformer should be 2D" + with pytest.raises(ValueError, match=msg): + ct.fit_transform(X_df) + # because fit is also doing transform, this raises already on fit + with pytest.raises(ValueError, match=msg): + ct.fit(X_df) + + +@pytest.mark.parametrize("remainder", ["drop", "passthrough"]) +def test_column_transformer_invalid_columns(remainder): + X_array = np.array([[0, 1, 2], [2, 4, 6]]).T + + # general invalid + for col in [1.5, ["string", 1], slice(1, "s"), np.array([1.0])]: + ct = ColumnTransformer([("trans", Trans(), col)], remainder=remainder) + with pytest.raises(ValueError, match="No valid specification"): + ct.fit(X_array) + + 
# invalid for arrays + for col in ["string", ["string", "other"], slice("a", "b")]: + ct = ColumnTransformer([("trans", Trans(), col)], remainder=remainder) + with pytest.raises(ValueError, match="Specifying the columns"): + ct.fit(X_array) + + # transformed n_features does not match fitted n_features + col = [0, 1] + ct = ColumnTransformer([("trans", Trans(), col)], remainder=remainder) + ct.fit(X_array) + X_array_more = np.array([[0, 1, 2], [2, 4, 6], [3, 6, 9]]).T + msg = "X has 3 features, but ColumnTransformer is expecting 2 features as input." + with pytest.raises(ValueError, match=msg): + ct.transform(X_array_more) + X_array_fewer = np.array( + [ + [0, 1, 2], + ] + ).T + err_msg = ( + "X has 1 features, but ColumnTransformer is expecting 2 features as input." + ) + with pytest.raises(ValueError, match=err_msg): + ct.transform(X_array_fewer) + + +def test_column_transformer_invalid_transformer(): + class NoTrans(BaseEstimator): + def fit(self, X, y=None): + return self + + def predict(self, X): + return X + + X_array = np.array([[0, 1, 2], [2, 4, 6]]).T + ct = ColumnTransformer([("trans", NoTrans(), [0])]) + msg = "All estimators should implement fit and transform" + with pytest.raises(TypeError, match=msg): + ct.fit(X_array) + + +def test_make_column_transformer(): + scaler = StandardScaler() + norm = Normalizer() + ct = make_column_transformer((scaler, "first"), (norm, ["second"])) + names, transformers, columns = zip(*ct.transformers) + assert names == ("standardscaler", "normalizer") + assert transformers == (scaler, norm) + assert columns == ("first", ["second"]) + + +def test_make_column_transformer_pandas(): + pd = pytest.importorskip("pandas") + X_array = np.array([[0, 1, 2], [2, 4, 6]]).T + X_df = pd.DataFrame(X_array, columns=["first", "second"]) + norm = Normalizer() + ct1 = ColumnTransformer([("norm", Normalizer(), X_df.columns)]) + ct2 = make_column_transformer((norm, X_df.columns)) + assert_almost_equal(ct1.fit_transform(X_df), ct2.fit_transform(X_df)) + + +def test_make_column_transformer_kwargs(): + scaler = StandardScaler() + norm = Normalizer() + ct = make_column_transformer( + (scaler, "first"), + (norm, ["second"]), + n_jobs=3, + remainder="drop", + sparse_threshold=0.5, + ) + assert ( + ct.transformers + == make_column_transformer((scaler, "first"), (norm, ["second"])).transformers + ) + assert ct.n_jobs == 3 + assert ct.remainder == "drop" + assert ct.sparse_threshold == 0.5 + # invalid keyword parameters should raise an error message + msg = re.escape( + "make_column_transformer() got an unexpected " + "keyword argument 'transformer_weights'" + ) + with pytest.raises(TypeError, match=msg): + make_column_transformer( + (scaler, "first"), + (norm, ["second"]), + transformer_weights={"pca": 10, "Transf": 1}, + ) + + +def test_make_column_transformer_remainder_transformer(): + scaler = StandardScaler() + norm = Normalizer() + remainder = StandardScaler() + ct = make_column_transformer( + (scaler, "first"), (norm, ["second"]), remainder=remainder + ) + assert ct.remainder == remainder + + +def test_column_transformer_get_set_params(): + ct = ColumnTransformer( + [("trans1", StandardScaler(), [0]), ("trans2", StandardScaler(), [1])] + ) + + exp = { + "n_jobs": None, + "remainder": "drop", + "sparse_threshold": 0.3, + "trans1": ct.transformers[0][1], + "trans1__copy": True, + "trans1__with_mean": True, + "trans1__with_std": True, + "trans2": ct.transformers[1][1], + "trans2__copy": True, + "trans2__with_mean": True, + "trans2__with_std": True, + "transformers": 
ct.transformers, + "transformer_weights": None, + "verbose_feature_names_out": True, + "verbose": False, + "force_int_remainder_cols": "deprecated", + } + + assert ct.get_params() == exp + + ct.set_params(trans1__with_mean=False) + assert not ct.get_params()["trans1__with_mean"] + + ct.set_params(trans1="passthrough") + exp = { + "n_jobs": None, + "remainder": "drop", + "sparse_threshold": 0.3, + "trans1": "passthrough", + "trans2": ct.transformers[1][1], + "trans2__copy": True, + "trans2__with_mean": True, + "trans2__with_std": True, + "transformers": ct.transformers, + "transformer_weights": None, + "verbose_feature_names_out": True, + "verbose": False, + "force_int_remainder_cols": "deprecated", + } + + assert ct.get_params() == exp + + +def test_column_transformer_named_estimators(): + X_array = np.array([[0.0, 1.0, 2.0], [2.0, 4.0, 6.0]]).T + ct = ColumnTransformer( + [ + ("trans1", StandardScaler(), [0]), + ("trans2", StandardScaler(with_std=False), [1]), + ] + ) + assert not hasattr(ct, "transformers_") + ct.fit(X_array) + assert hasattr(ct, "transformers_") + assert isinstance(ct.named_transformers_["trans1"], StandardScaler) + assert isinstance(ct.named_transformers_.trans1, StandardScaler) + assert isinstance(ct.named_transformers_["trans2"], StandardScaler) + assert isinstance(ct.named_transformers_.trans2, StandardScaler) + assert not ct.named_transformers_.trans2.with_std + # check it are fitted transformers + assert ct.named_transformers_.trans1.mean_ == 1.0 + + +def test_column_transformer_cloning(): + X_array = np.array([[0.0, 1.0, 2.0], [2.0, 4.0, 6.0]]).T + + ct = ColumnTransformer([("trans", StandardScaler(), [0])]) + ct.fit(X_array) + assert not hasattr(ct.transformers[0][1], "mean_") + assert hasattr(ct.transformers_[0][1], "mean_") + + ct = ColumnTransformer([("trans", StandardScaler(), [0])]) + ct.fit_transform(X_array) + assert not hasattr(ct.transformers[0][1], "mean_") + assert hasattr(ct.transformers_[0][1], "mean_") + + +def test_column_transformer_get_feature_names(): + X_array = np.array([[0.0, 1.0, 2.0], [2.0, 4.0, 6.0]]).T + ct = ColumnTransformer([("trans", Trans(), [0, 1])]) + # raise correct error when not fitted + with pytest.raises(NotFittedError): + ct.get_feature_names_out() + # raise correct error when no feature names are available + ct.fit(X_array) + msg = re.escape( + "Transformer trans (type Trans) does not provide get_feature_names_out" + ) + with pytest.raises(AttributeError, match=msg): + ct.get_feature_names_out() + + +def test_column_transformer_special_strings(): + # one 'drop' -> ignore + X_array = np.array([[0.0, 1.0, 2.0], [2.0, 4.0, 6.0]]).T + ct = ColumnTransformer([("trans1", Trans(), [0]), ("trans2", "drop", [1])]) + exp = np.array([[0.0], [1.0], [2.0]]) + assert_array_equal(ct.fit_transform(X_array), exp) + assert_array_equal(ct.fit(X_array).transform(X_array), exp) + assert len(ct.transformers_) == 2 + assert ct.transformers_[-1][0] != "remainder" + + # all 'drop' -> return shape 0 array + ct = ColumnTransformer([("trans1", "drop", [0]), ("trans2", "drop", [1])]) + assert_array_equal(ct.fit(X_array).transform(X_array).shape, (3, 0)) + assert_array_equal(ct.fit_transform(X_array).shape, (3, 0)) + assert len(ct.transformers_) == 2 + assert ct.transformers_[-1][0] != "remainder" + + # 'passthrough' + X_array = np.array([[0.0, 1.0, 2.0], [2.0, 4.0, 6.0]]).T + ct = ColumnTransformer([("trans1", Trans(), [0]), ("trans2", "passthrough", [1])]) + exp = X_array + assert_array_equal(ct.fit_transform(X_array), exp) + 
assert_array_equal(ct.fit(X_array).transform(X_array), exp) + assert len(ct.transformers_) == 2 + assert ct.transformers_[-1][0] != "remainder" + + +def test_column_transformer_remainder(): + X_array = np.array([[0, 1, 2], [2, 4, 6]]).T + + X_res_first = np.array([0, 1, 2]).reshape(-1, 1) + X_res_second = np.array([2, 4, 6]).reshape(-1, 1) + X_res_both = X_array + + # default drop + ct = ColumnTransformer([("trans1", Trans(), [0])]) + assert_array_equal(ct.fit_transform(X_array), X_res_first) + assert_array_equal(ct.fit(X_array).transform(X_array), X_res_first) + assert len(ct.transformers_) == 2 + assert ct.transformers_[-1][0] == "remainder" + assert ct.transformers_[-1][1] == "drop" + assert_array_equal(ct.transformers_[-1][2], [1]) + + # specify passthrough + ct = ColumnTransformer([("trans", Trans(), [0])], remainder="passthrough") + assert_array_equal(ct.fit_transform(X_array), X_res_both) + assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both) + assert len(ct.transformers_) == 2 + assert ct.transformers_[-1][0] == "remainder" + assert isinstance(ct.transformers_[-1][1], FunctionTransformer) + assert_array_equal(ct.transformers_[-1][2], [1]) + + # column order is not preserved (passed through added to end) + ct = ColumnTransformer([("trans1", Trans(), [1])], remainder="passthrough") + assert_array_equal(ct.fit_transform(X_array), X_res_both[:, ::-1]) + assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both[:, ::-1]) + assert len(ct.transformers_) == 2 + assert ct.transformers_[-1][0] == "remainder" + assert isinstance(ct.transformers_[-1][1], FunctionTransformer) + assert_array_equal(ct.transformers_[-1][2], [0]) + + # passthrough when all actual transformers are skipped + ct = ColumnTransformer([("trans1", "drop", [0])], remainder="passthrough") + assert_array_equal(ct.fit_transform(X_array), X_res_second) + assert_array_equal(ct.fit(X_array).transform(X_array), X_res_second) + assert len(ct.transformers_) == 2 + assert ct.transformers_[-1][0] == "remainder" + assert isinstance(ct.transformers_[-1][1], FunctionTransformer) + assert_array_equal(ct.transformers_[-1][2], [1]) + + # check default for make_column_transformer + ct = make_column_transformer((Trans(), [0])) + assert ct.remainder == "drop" + + +@pytest.mark.parametrize( + "cols1, cols2, expected_remainder_cols", + [ + ([0], [False, True, False], [2]), # mix types + ([0], [1], [2]), # ints + (lambda x: [0], lambda x: [1], [2]), # callables + (["A"], ["B"], ["C"]), # all strings + ([True, False, False], [False, True, False], [False, False, True]), # all bools + ], +) +def test_column_transformer_remainder_dtypes(cols1, cols2, expected_remainder_cols): + """Check that the remainder columns format matches the format of the other + columns when they're all strings or masks. + """ + X = np.ones((1, 3)) + + if isinstance(cols1, list) and isinstance(cols1[0], str): + pd = pytest.importorskip("pandas") + X = pd.DataFrame(X, columns=["A", "B", "C"]) + + # if inputs are column names store remainder columns as column names + ct = make_column_transformer( + (Trans(), cols1), + (Trans(), cols2), + remainder="passthrough", + ) + ct.fit_transform(X) + assert ct.transformers_[-1][-1] == expected_remainder_cols + + +# TODO(1.9): remove this test +@pytest.mark.parametrize("force_int_remainder_cols", [True, False]) +def test_force_int_remainder_cols_deprecation(force_int_remainder_cols): + """Check that ColumnTransformer raises a FutureWarning when + force_int_remainder_cols is set. 
+ """ + X = np.ones((1, 3)) + ct = ColumnTransformer( + [("T1", Trans(), [0]), ("T2", Trans(), [1])], + remainder="passthrough", + force_int_remainder_cols=force_int_remainder_cols, + ) + + with pytest.warns(FutureWarning, match="`force_int_remainder_cols` is deprecated"): + ct.fit(X) + + +@pytest.mark.parametrize( + "key, expected_cols", + [ + ([0], [1]), + (np.array([0]), [1]), + (slice(0, 1), [1]), + (np.array([True, False]), [False, True]), + ], +) +def test_column_transformer_remainder_numpy(key, expected_cols): + # test different ways that columns are specified with passthrough + X_array = np.array([[0, 1, 2], [2, 4, 6]]).T + X_res_both = X_array + + ct = ColumnTransformer( + [("trans1", Trans(), key)], + remainder="passthrough", + ) + assert_array_equal(ct.fit_transform(X_array), X_res_both) + assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both) + assert len(ct.transformers_) == 2 + assert ct.transformers_[-1][0] == "remainder" + assert isinstance(ct.transformers_[-1][1], FunctionTransformer) + assert ct.transformers_[-1][2] == expected_cols + + +@pytest.mark.parametrize( + "key, expected_cols", + [ + ([0], [1]), + (slice(0, 1), [1]), + (np.array([True, False]), [False, True]), + (["first"], ["second"]), + ("pd-index", ["second"]), + (np.array(["first"]), ["second"]), + (np.array(["first"], dtype=object), ["second"]), + (slice(None, "first"), ["second"]), + (slice("first", "first"), ["second"]), + ], +) +def test_column_transformer_remainder_pandas(key, expected_cols): + # test different ways that columns are specified with passthrough + pd = pytest.importorskip("pandas") + if isinstance(key, str) and key == "pd-index": + key = pd.Index(["first"]) + + X_array = np.array([[0, 1, 2], [2, 4, 6]]).T + X_df = pd.DataFrame(X_array, columns=["first", "second"]) + X_res_both = X_array + + ct = ColumnTransformer( + [("trans1", Trans(), key)], + remainder="passthrough", + ) + assert_array_equal(ct.fit_transform(X_df), X_res_both) + assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both) + assert len(ct.transformers_) == 2 + assert ct.transformers_[-1][0] == "remainder" + assert isinstance(ct.transformers_[-1][1], FunctionTransformer) + assert ct.transformers_[-1][2] == expected_cols + + +@pytest.mark.parametrize( + "key, expected_cols", + [ + ([0], [1, 2]), + (np.array([0]), [1, 2]), + (slice(0, 1), [1, 2]), + (np.array([True, False, False]), [False, True, True]), + ], +) +def test_column_transformer_remainder_transformer(key, expected_cols): + X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T + X_res_both = X_array.copy() + + # second and third columns are doubled when remainder = DoubleTrans + X_res_both[:, 1:3] *= 2 + + ct = ColumnTransformer( + [("trans1", Trans(), key)], + remainder=DoubleTrans(), + ) + + assert_array_equal(ct.fit_transform(X_array), X_res_both) + assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both) + assert len(ct.transformers_) == 2 + assert ct.transformers_[-1][0] == "remainder" + assert isinstance(ct.transformers_[-1][1], DoubleTrans) + assert ct.transformers_[-1][2] == expected_cols + + +def test_column_transformer_no_remaining_remainder_transformer(): + X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T + + ct = ColumnTransformer([("trans1", Trans(), [0, 1, 2])], remainder=DoubleTrans()) + + assert_array_equal(ct.fit_transform(X_array), X_array) + assert_array_equal(ct.fit(X_array).transform(X_array), X_array) + assert len(ct.transformers_) == 1 + assert ct.transformers_[-1][0] != "remainder" + + +def 
test_column_transformer_drops_all_remainder_transformer(): + X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T + + # columns are doubled when remainder = DoubleTrans + X_res_both = 2 * X_array.copy()[:, 1:3] + + ct = ColumnTransformer([("trans1", "drop", [0])], remainder=DoubleTrans()) + + assert_array_equal(ct.fit_transform(X_array), X_res_both) + assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both) + assert len(ct.transformers_) == 2 + assert ct.transformers_[-1][0] == "remainder" + assert isinstance(ct.transformers_[-1][1], DoubleTrans) + assert_array_equal(ct.transformers_[-1][2], [1, 2]) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_column_transformer_sparse_remainder_transformer(csr_container): + X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T + + ct = ColumnTransformer( + [("trans1", Trans(), [0])], + remainder=SparseMatrixTrans(csr_container), + sparse_threshold=0.8, + ) + + X_trans = ct.fit_transform(X_array) + assert sparse.issparse(X_trans) + # SparseMatrixTrans creates 3 features for each column. There is + # one column in ``transformers``, thus: + assert X_trans.shape == (3, 3 + 1) + + exp_array = np.hstack((X_array[:, 0].reshape(-1, 1), np.eye(3))) + assert_array_equal(X_trans.toarray(), exp_array) + assert len(ct.transformers_) == 2 + assert ct.transformers_[-1][0] == "remainder" + assert isinstance(ct.transformers_[-1][1], SparseMatrixTrans) + assert_array_equal(ct.transformers_[-1][2], [1, 2]) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_column_transformer_drop_all_sparse_remainder_transformer(csr_container): + X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T + ct = ColumnTransformer( + [("trans1", "drop", [0])], + remainder=SparseMatrixTrans(csr_container), + sparse_threshold=0.8, + ) + + X_trans = ct.fit_transform(X_array) + assert sparse.issparse(X_trans) + + # SparseMatrixTrans creates 3 features for each column, thus: + assert X_trans.shape == (3, 3) + assert_array_equal(X_trans.toarray(), np.eye(3)) + assert len(ct.transformers_) == 2 + assert ct.transformers_[-1][0] == "remainder" + assert isinstance(ct.transformers_[-1][1], SparseMatrixTrans) + assert_array_equal(ct.transformers_[-1][2], [1, 2]) + + +def test_column_transformer_get_set_params_with_remainder(): + ct = ColumnTransformer( + [("trans1", StandardScaler(), [0])], remainder=StandardScaler() + ) + + exp = { + "n_jobs": None, + "remainder": ct.remainder, + "remainder__copy": True, + "remainder__with_mean": True, + "remainder__with_std": True, + "sparse_threshold": 0.3, + "trans1": ct.transformers[0][1], + "trans1__copy": True, + "trans1__with_mean": True, + "trans1__with_std": True, + "transformers": ct.transformers, + "transformer_weights": None, + "verbose_feature_names_out": True, + "verbose": False, + "force_int_remainder_cols": "deprecated", + } + + assert ct.get_params() == exp + + ct.set_params(remainder__with_std=False) + assert not ct.get_params()["remainder__with_std"] + + ct.set_params(trans1="passthrough") + exp = { + "n_jobs": None, + "remainder": ct.remainder, + "remainder__copy": True, + "remainder__with_mean": True, + "remainder__with_std": False, + "sparse_threshold": 0.3, + "trans1": "passthrough", + "transformers": ct.transformers, + "transformer_weights": None, + "verbose_feature_names_out": True, + "verbose": False, + "force_int_remainder_cols": "deprecated", + } + assert ct.get_params() == exp + + +def test_column_transformer_no_estimators(): + X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 
4]]).astype("float").T + ct = ColumnTransformer([], remainder=StandardScaler()) + + params = ct.get_params() + assert params["remainder__with_mean"] + + X_trans = ct.fit_transform(X_array) + assert X_trans.shape == X_array.shape + assert len(ct.transformers_) == 1 + assert ct.transformers_[-1][0] == "remainder" + assert ct.transformers_[-1][2] == [0, 1, 2] + + +@pytest.mark.parametrize( + ["est", "pattern"], + [ + ( + ColumnTransformer( + [("trans1", Trans(), [0]), ("trans2", Trans(), [1])], + remainder=DoubleTrans(), + ), + ( + r"\[ColumnTransformer\].*\(1 of 3\) Processing trans1.* total=.*\n" + r"\[ColumnTransformer\].*\(2 of 3\) Processing trans2.* total=.*\n" + r"\[ColumnTransformer\].*\(3 of 3\) Processing remainder.* total=.*\n$" + ), + ), + ( + ColumnTransformer( + [("trans1", Trans(), [0]), ("trans2", Trans(), [1])], + remainder="passthrough", + ), + ( + r"\[ColumnTransformer\].*\(1 of 3\) Processing trans1.* total=.*\n" + r"\[ColumnTransformer\].*\(2 of 3\) Processing trans2.* total=.*\n" + r"\[ColumnTransformer\].*\(3 of 3\) Processing remainder.* total=.*\n$" + ), + ), + ( + ColumnTransformer( + [("trans1", Trans(), [0]), ("trans2", "drop", [1])], + remainder="passthrough", + ), + ( + r"\[ColumnTransformer\].*\(1 of 2\) Processing trans1.* total=.*\n" + r"\[ColumnTransformer\].*\(2 of 2\) Processing remainder.* total=.*\n$" + ), + ), + ( + ColumnTransformer( + [("trans1", Trans(), [0]), ("trans2", "passthrough", [1])], + remainder="passthrough", + ), + ( + r"\[ColumnTransformer\].*\(1 of 3\) Processing trans1.* total=.*\n" + r"\[ColumnTransformer\].*\(2 of 3\) Processing trans2.* total=.*\n" + r"\[ColumnTransformer\].*\(3 of 3\) Processing remainder.* total=.*\n$" + ), + ), + ( + ColumnTransformer([("trans1", Trans(), [0])], remainder="passthrough"), + ( + r"\[ColumnTransformer\].*\(1 of 2\) Processing trans1.* total=.*\n" + r"\[ColumnTransformer\].*\(2 of 2\) Processing remainder.* total=.*\n$" + ), + ), + ( + ColumnTransformer( + [("trans1", Trans(), [0]), ("trans2", Trans(), [1])], remainder="drop" + ), + ( + r"\[ColumnTransformer\].*\(1 of 2\) Processing trans1.* total=.*\n" + r"\[ColumnTransformer\].*\(2 of 2\) Processing trans2.* total=.*\n$" + ), + ), + ( + ColumnTransformer([("trans1", Trans(), [0])], remainder="drop"), + r"\[ColumnTransformer\].*\(1 of 1\) Processing trans1.* total=.*\n$", + ), + ], +) +@pytest.mark.parametrize("method", ["fit", "fit_transform"]) +def test_column_transformer_verbose(est, pattern, method, capsys): + X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T + + func = getattr(est, method) + est.set_params(verbose=False) + func(X_array) + assert not capsys.readouterr().out, "Got output for verbose=False" + + est.set_params(verbose=True) + func(X_array) + assert re.match(pattern, capsys.readouterr()[0]) + + +def test_column_transformer_no_estimators_set_params(): + ct = ColumnTransformer([]).set_params(n_jobs=2) + assert ct.n_jobs == 2 + + +def test_column_transformer_callable_specifier(): + # assert that function gets the full array + X_array = np.array([[0, 1, 2], [2, 4, 6]]).T + X_res_first = np.array([[0, 1, 2]]).T + + def func(X): + assert_array_equal(X, X_array) + return [0] + + ct = ColumnTransformer([("trans", Trans(), func)], remainder="drop") + assert_array_equal(ct.fit_transform(X_array), X_res_first) + assert_array_equal(ct.fit(X_array).transform(X_array), X_res_first) + assert callable(ct.transformers[0][2]) + assert ct.transformers_[0][2] == [0] + + +def test_column_transformer_callable_specifier_dataframe(): + # assert that 
function gets the full dataframe + pd = pytest.importorskip("pandas") + X_array = np.array([[0, 1, 2], [2, 4, 6]]).T + X_res_first = np.array([[0, 1, 2]]).T + + X_df = pd.DataFrame(X_array, columns=["first", "second"]) + + def func(X): + assert_array_equal(X.columns, X_df.columns) + assert_array_equal(X.values, X_df.values) + return ["first"] + + ct = ColumnTransformer([("trans", Trans(), func)], remainder="drop") + assert_array_equal(ct.fit_transform(X_df), X_res_first) + assert_array_equal(ct.fit(X_df).transform(X_df), X_res_first) + assert callable(ct.transformers[0][2]) + assert ct.transformers_[0][2] == ["first"] + + +def test_column_transformer_negative_column_indexes(): + X = np.random.randn(2, 2) + X_categories = np.array([[1], [2]]) + X = np.concatenate([X, X_categories], axis=1) + + ohe = OneHotEncoder() + + tf_1 = ColumnTransformer([("ohe", ohe, [-1])], remainder="passthrough") + tf_2 = ColumnTransformer([("ohe", ohe, [2])], remainder="passthrough") + assert_array_equal(tf_1.fit_transform(X), tf_2.fit_transform(X)) + + +@pytest.mark.parametrize("array_type", [np.asarray, *CSR_CONTAINERS]) +def test_column_transformer_mask_indexing(array_type): + # Regression test for #14510 + # Boolean array-like does not behave as boolean array with sparse matrices. + X = np.transpose([[1, 2, 3], [4, 5, 6], [5, 6, 7], [8, 9, 10]]) + X = array_type(X) + column_transformer = ColumnTransformer( + [("identity", FunctionTransformer(), [False, True, False, True])] + ) + X_trans = column_transformer.fit_transform(X) + assert X_trans.shape == (3, 2) + + +def test_n_features_in(): + # make sure n_features_in is what is passed as input to the column + # transformer. + + X = [[1, 2], [3, 4], [5, 6]] + ct = ColumnTransformer([("a", DoubleTrans(), [0]), ("b", DoubleTrans(), [1])]) + assert not hasattr(ct, "n_features_in_") + ct.fit(X) + assert ct.n_features_in_ == 2 + + +@pytest.mark.parametrize( + "cols, pattern, include, exclude", + [ + (["col_int", "col_float"], None, np.number, None), + (["col_int", "col_float"], None, None, [object, "string"]), + (["col_int", "col_float"], None, [int, float], None), + (["col_str"], None, [object, "string"], None), + (["col_float"], None, [float], None), + (["col_float"], None, float, None), + (["col_float"], "at$", [np.number], None), + (["col_int"], None, [int], None), + (["col_int"], "^col_int", [np.number], None), + (["col_float", "col_str"], "float|str", None, None), + (["col_str"], "^col_s", None, [int]), + ([], "str$", float, None), + ( + ["col_int", "col_float", "col_str"], + None, + [np.number, object, "string"], + None, + ), + ], +) +def test_make_column_selector_with_select_dtypes(cols, pattern, include, exclude): + pd = pytest.importorskip("pandas") + + X_df = pd.DataFrame( + { + "col_int": np.array([0, 1, 2], dtype=int), + "col_float": np.array([0.0, 1.0, 2.0], dtype=float), + "col_str": ["one", "two", "three"], + }, + columns=["col_int", "col_float", "col_str"], + ) + + selector = make_column_selector( + dtype_include=include, dtype_exclude=exclude, pattern=pattern + ) + + assert_array_equal(selector(X_df), cols) + + +def test_column_transformer_with_make_column_selector(): + # Functional test for column transformer + column selector + pd = pytest.importorskip("pandas") + X_df = pd.DataFrame( + { + "col_int": np.array([0, 1, 2], dtype=int), + "col_float": np.array([0.0, 1.0, 2.0], dtype=float), + "col_cat": ["one", "two", "one"], + "col_str": ["low", "middle", "high"], + }, + columns=["col_int", "col_float", "col_cat", "col_str"], + ) + X_df["col_str"] = 
X_df["col_str"].astype("category") + + cat_selector = make_column_selector(dtype_include=["category", object, "string"]) + num_selector = make_column_selector(dtype_include=np.number) + + ohe = OneHotEncoder() + scaler = StandardScaler() + + ct_selector = make_column_transformer((ohe, cat_selector), (scaler, num_selector)) + ct_direct = make_column_transformer( + (ohe, ["col_cat", "col_str"]), (scaler, ["col_float", "col_int"]) + ) + + X_selector = ct_selector.fit_transform(X_df) + X_direct = ct_direct.fit_transform(X_df) + + assert_allclose(X_selector, X_direct) + + +def test_make_column_selector_error(): + selector = make_column_selector(dtype_include=np.number) + X = np.array([[0.1, 0.2]]) + msg = "make_column_selector can only be applied to pandas dataframes" + with pytest.raises(ValueError, match=msg): + selector(X) + + +def test_make_column_selector_pickle(): + pd = pytest.importorskip("pandas") + + X_df = pd.DataFrame( + { + "col_int": np.array([0, 1, 2], dtype=int), + "col_float": np.array([0.0, 1.0, 2.0], dtype=float), + "col_str": ["one", "two", "three"], + }, + columns=["col_int", "col_float", "col_str"], + ) + + selector = make_column_selector(dtype_include=[object]) + selector_picked = pickle.loads(pickle.dumps(selector)) + + assert_array_equal(selector(X_df), selector_picked(X_df)) + + +@pytest.mark.parametrize( + "empty_col", + [[], np.array([], dtype=int), lambda x: []], + ids=["list", "array", "callable"], +) +def test_feature_names_empty_columns(empty_col): + pd = pytest.importorskip("pandas") + + df = pd.DataFrame({"col1": ["a", "a", "b"], "col2": ["z", "z", "z"]}) + + ct = ColumnTransformer( + transformers=[ + ("ohe", OneHotEncoder(), ["col1", "col2"]), + ("empty_features", OneHotEncoder(), empty_col), + ], + ) + + ct.fit(df) + assert_array_equal( + ct.get_feature_names_out(), ["ohe__col1_a", "ohe__col1_b", "ohe__col2_z"] + ) + + +@pytest.mark.parametrize( + "selector", + [ + [1], + lambda x: [1], + ["col2"], + lambda x: ["col2"], + [False, True], + lambda x: [False, True], + ], +) +def test_feature_names_out_pandas(selector): + """Checks name when selecting only the second column""" + pd = pytest.importorskip("pandas") + df = pd.DataFrame({"col1": ["a", "a", "b"], "col2": ["z", "z", "z"]}) + ct = ColumnTransformer([("ohe", OneHotEncoder(), selector)]) + ct.fit(df) + + assert_array_equal(ct.get_feature_names_out(), ["ohe__col2_z"]) + + +@pytest.mark.parametrize( + "selector", [[1], lambda x: [1], [False, True], lambda x: [False, True]] +) +def test_feature_names_out_non_pandas(selector): + """Checks name when selecting the second column with numpy array""" + X = [["a", "z"], ["a", "z"], ["b", "z"]] + ct = ColumnTransformer([("ohe", OneHotEncoder(), selector)]) + ct.fit(X) + + assert_array_equal(ct.get_feature_names_out(), ["ohe__x1_z"]) + + +@pytest.mark.parametrize("remainder", ["passthrough", StandardScaler()]) +def test_sk_visual_block_remainder(remainder): + # remainder='passthrough' or an estimator will be shown in repr_html + ohe = OneHotEncoder() + ct = ColumnTransformer( + transformers=[("ohe", ohe, ["col1", "col2"])], remainder=remainder + ) + visual_block = ct._sk_visual_block_() + assert visual_block.names == ("ohe", "remainder") + assert visual_block.name_details == (["col1", "col2"], "") + assert visual_block.estimators == (ohe, remainder) + + +def test_sk_visual_block_remainder_drop(): + # remainder='drop' is not shown in repr_html + ohe = OneHotEncoder() + ct = ColumnTransformer(transformers=[("ohe", ohe, ["col1", "col2"])]) + visual_block = 
ct._sk_visual_block_() + assert visual_block.names == ("ohe",) + assert visual_block.name_details == (["col1", "col2"],) + assert visual_block.estimators == (ohe,) + + +@pytest.mark.parametrize("remainder", ["passthrough", StandardScaler()]) +def test_sk_visual_block_remainder_fitted_pandas(remainder): + # Remainder shows the columns after fitting + pd = pytest.importorskip("pandas") + ohe = OneHotEncoder() + ct = ColumnTransformer( + transformers=[("ohe", ohe, ["col1", "col2"])], + remainder=remainder, + ) + df = pd.DataFrame( + { + "col1": ["a", "b", "c"], + "col2": ["z", "z", "z"], + "col3": [1, 2, 3], + "col4": [3, 4, 5], + } + ) + ct.fit(df) + visual_block = ct._sk_visual_block_() + assert visual_block.names == ("ohe", "remainder") + assert visual_block.name_details == (["col1", "col2"], ["col3", "col4"]) + assert visual_block.estimators == (ohe, remainder) + + +@pytest.mark.parametrize("remainder", ["passthrough", StandardScaler()]) +def test_sk_visual_block_remainder_fitted_numpy(remainder): + # Remainder shows the indices after fitting + X = np.array([[1, 2, 3], [4, 5, 6]], dtype=float) + scaler = StandardScaler() + ct = ColumnTransformer( + transformers=[("scale", scaler, [0, 2])], remainder=remainder + ) + ct.fit(X) + visual_block = ct._sk_visual_block_() + assert visual_block.names == ("scale", "remainder") + assert visual_block.name_details == ([0, 2], [1]) + assert visual_block.estimators == (scaler, remainder) + + +@pytest.mark.parametrize("explicit_colname", ["first", "second", 0, 1]) +@pytest.mark.parametrize("remainder", [Trans(), "passthrough", "drop"]) +def test_column_transformer_reordered_column_names_remainder( + explicit_colname, remainder +): + """Test the interaction between remainder and column transformer""" + pd = pytest.importorskip("pandas") + + X_fit_array = np.array([[0, 1, 2], [2, 4, 6]]).T + X_fit_df = pd.DataFrame(X_fit_array, columns=["first", "second"]) + + X_trans_array = np.array([[2, 4, 6], [0, 1, 2]]).T + X_trans_df = pd.DataFrame(X_trans_array, columns=["second", "first"]) + + tf = ColumnTransformer([("bycol", Trans(), explicit_colname)], remainder=remainder) + + tf.fit(X_fit_df) + X_fit_trans = tf.transform(X_fit_df) + + # Changing the order still works + X_trans = tf.transform(X_trans_df) + assert_allclose(X_trans, X_fit_trans) + + # extra columns are ignored + X_extended_df = X_fit_df.copy() + X_extended_df["third"] = [3, 6, 9] + X_trans = tf.transform(X_extended_df) + assert_allclose(X_trans, X_fit_trans) + + if isinstance(explicit_colname, str): + # Raise error if columns are specified by names but input only allows + # to specify by position, e.g. numpy array instead of a pandas df. 
+ X_array = X_fit_array.copy() + err_msg = "Specifying the columns" + with pytest.raises(ValueError, match=err_msg): + tf.transform(X_array) + + +def test_feature_name_validation_missing_columns_drop_passthough(): + """Test the interaction between {'drop', 'passthrough'} and + missing column names.""" + pd = pytest.importorskip("pandas") + + X = np.ones(shape=(3, 4)) + df = pd.DataFrame(X, columns=["a", "b", "c", "d"]) + + df_dropped = df.drop("c", axis=1) + + # with remainder='passthrough', all columns seen during `fit` must be + # present + tf = ColumnTransformer([("bycol", Trans(), [1])], remainder="passthrough") + tf.fit(df) + msg = r"columns are missing: {'c'}" + with pytest.raises(ValueError, match=msg): + tf.transform(df_dropped) + + # with remainder='drop', it is allowed to have column 'c' missing + tf = ColumnTransformer([("bycol", Trans(), [1])], remainder="drop") + tf.fit(df) + + df_dropped_trans = tf.transform(df_dropped) + df_fit_trans = tf.transform(df) + assert_allclose(df_dropped_trans, df_fit_trans) + + # bycol drops 'c', thus it is allowed for 'c' to be missing + tf = ColumnTransformer([("bycol", "drop", ["c"])], remainder="passthrough") + tf.fit(df) + df_dropped_trans = tf.transform(df_dropped) + df_fit_trans = tf.transform(df) + assert_allclose(df_dropped_trans, df_fit_trans) + + +def test_feature_names_in_(): + """Feature names are stored in column transformer. + + Column transformer deliberately does not check for column name consistency. + It only checks that the non-dropped names seen in `fit` are seen + in `transform`. This behavior is already tested in + `test_feature_name_validation_missing_columns_drop_passthough`""" + + pd = pytest.importorskip("pandas") + + feature_names = ["a", "c", "d"] + df = pd.DataFrame([[1, 2, 3]], columns=feature_names) + ct = ColumnTransformer([("bycol", Trans(), ["a", "d"])], remainder="passthrough") + + ct.fit(df) + assert_array_equal(ct.feature_names_in_, feature_names) + assert isinstance(ct.feature_names_in_, np.ndarray) + assert ct.feature_names_in_.dtype == object + + +class TransWithNames(Trans): + def __init__(self, feature_names_out=None): + self.feature_names_out = feature_names_out + + def get_feature_names_out(self, input_features=None): + if self.feature_names_out is not None: + return np.asarray(self.feature_names_out, dtype=object) + return input_features + + +@pytest.mark.parametrize( + "transformers, remainder, expected_names", + [ + ( + [ + ("bycol1", TransWithNames(), ["d", "c"]), + ("bycol2", "passthrough", ["d"]), + ], + "passthrough", + ["bycol1__d", "bycol1__c", "bycol2__d", "remainder__a", "remainder__b"], + ), + ( + [ + ("bycol1", TransWithNames(), ["d", "c"]), + ("bycol2", "passthrough", ["d"]), + ], + "drop", + ["bycol1__d", "bycol1__c", "bycol2__d"], + ), + ( + [ + ("bycol1", TransWithNames(), ["b"]), + ("bycol2", "drop", ["d"]), + ], + "passthrough", + ["bycol1__b", "remainder__a", "remainder__c"], + ), + ( + [ + ("bycol1", TransWithNames(["pca1", "pca2"]), ["a", "b", "d"]), + ], + "passthrough", + ["bycol1__pca1", "bycol1__pca2", "remainder__c"], + ), + ( + [ + ("bycol1", TransWithNames(["a", "b"]), ["d"]), + ("bycol2", "passthrough", ["b"]), + ], + "drop", + ["bycol1__a", "bycol1__b", "bycol2__b"], + ), + ( + [ + ("bycol1", TransWithNames([f"pca{i}" for i in range(2)]), ["b"]), + ("bycol2", TransWithNames([f"pca{i}" for i in range(2)]), ["b"]), + ], + "passthrough", + [ + "bycol1__pca0", + "bycol1__pca1", + "bycol2__pca0", + "bycol2__pca1", + "remainder__a", + "remainder__c", + "remainder__d", + ], + ), 
+ ( + [ + ("bycol1", "drop", ["d"]), + ], + "drop", + [], + ), + ( + [ + ("bycol1", TransWithNames(), slice(1, 3)), + ], + "drop", + ["bycol1__b", "bycol1__c"], + ), + ( + [ + ("bycol1", TransWithNames(), ["b"]), + ("bycol2", "drop", slice(3, 4)), + ], + "passthrough", + ["bycol1__b", "remainder__a", "remainder__c"], + ), + ( + [ + ("bycol1", TransWithNames(), ["d", "c"]), + ("bycol2", "passthrough", slice(3, 4)), + ], + "passthrough", + ["bycol1__d", "bycol1__c", "bycol2__d", "remainder__a", "remainder__b"], + ), + ( + [ + ("bycol1", TransWithNames(), slice("b", "c")), + ], + "drop", + ["bycol1__b", "bycol1__c"], + ), + ( + [ + ("bycol1", TransWithNames(), ["b"]), + ("bycol2", "drop", slice("c", "d")), + ], + "passthrough", + ["bycol1__b", "remainder__a"], + ), + ( + [ + ("bycol1", TransWithNames(), ["d", "c"]), + ("bycol2", "passthrough", slice("c", "d")), + ], + "passthrough", + [ + "bycol1__d", + "bycol1__c", + "bycol2__c", + "bycol2__d", + "remainder__a", + "remainder__b", + ], + ), + ], +) +def test_verbose_feature_names_out_true(transformers, remainder, expected_names): + """Check feature_names_out for verbose_feature_names_out=True (default)""" + pd = pytest.importorskip("pandas") + df = pd.DataFrame([[1, 2, 3, 4]], columns=["a", "b", "c", "d"]) + ct = ColumnTransformer( + transformers, + remainder=remainder, + ) + ct.fit(df) + + names = ct.get_feature_names_out() + assert isinstance(names, np.ndarray) + assert names.dtype == object + assert_array_equal(names, expected_names) + + +def _feature_names_out_callable_name_clash(trans_name: str, feat_name: str): + return f"{trans_name[:2]}++{feat_name}" + + +def _feature_names_out_callable_upper(trans_name: str, feat_name: str): + return f"{trans_name.upper()}={feat_name.upper()}" + + +@pytest.mark.parametrize( + "transformers, remainder, verbose_feature_names_out, expected_names", + [ + ( + [ + ("bycol1", TransWithNames(), ["d", "c"]), + ("bycol2", "passthrough", ["d"]), + ], + "passthrough", + _feature_names_out_callable_name_clash, + ["by++d", "by++c", "by++d", "re++a", "re++b"], + ), + ( + [ + ("bycol1", TransWithNames(), ["d", "c"]), + ("bycol2", "passthrough", ["d"]), + ], + "drop", + "{feature_name}-{transformer_name}", + ["d-bycol1", "c-bycol1", "d-bycol2"], + ), + ( + [ + ("bycol1", TransWithNames(), ["d", "c"]), + ("bycol2", "passthrough", slice("c", "d")), + ], + "passthrough", + _feature_names_out_callable_upper, + [ + "BYCOL1=D", + "BYCOL1=C", + "BYCOL2=C", + "BYCOL2=D", + "REMAINDER=A", + "REMAINDER=B", + ], + ), + ], +) +def test_verbose_feature_names_out_callable_or_str( + transformers, remainder, verbose_feature_names_out, expected_names +): + """Check feature_names_out for verbose_feature_names_out=True (default)""" + pd = pytest.importorskip("pandas") + df = pd.DataFrame([[1, 2, 3, 4]], columns=["a", "b", "c", "d"]) + ct = ColumnTransformer( + transformers, + remainder=remainder, + verbose_feature_names_out=verbose_feature_names_out, + ) + ct.fit(df) + + names = ct.get_feature_names_out() + assert isinstance(names, np.ndarray) + assert names.dtype == object + assert_array_equal(names, expected_names) + + +@pytest.mark.parametrize( + "transformers, remainder, expected_names", + [ + ( + [ + ("bycol1", TransWithNames(), ["d", "c"]), + ("bycol2", "passthrough", ["a"]), + ], + "passthrough", + ["d", "c", "a", "b"], + ), + ( + [ + ("bycol1", TransWithNames(["a"]), ["d", "c"]), + ("bycol2", "passthrough", ["d"]), + ], + "drop", + ["a", "d"], + ), + ( + [ + ("bycol1", TransWithNames(), ["b"]), + ("bycol2", "drop", ["d"]), + ], 
+ "passthrough", + ["b", "a", "c"], + ), + ( + [ + ("bycol1", TransWithNames(["pca1", "pca2"]), ["a", "b", "d"]), + ], + "passthrough", + ["pca1", "pca2", "c"], + ), + ( + [ + ("bycol1", TransWithNames(["a", "c"]), ["d"]), + ("bycol2", "passthrough", ["d"]), + ], + "drop", + ["a", "c", "d"], + ), + ( + [ + ("bycol1", TransWithNames([f"pca{i}" for i in range(2)]), ["b"]), + ("bycol2", TransWithNames([f"kpca{i}" for i in range(2)]), ["b"]), + ], + "passthrough", + ["pca0", "pca1", "kpca0", "kpca1", "a", "c", "d"], + ), + ( + [ + ("bycol1", "drop", ["d"]), + ], + "drop", + [], + ), + ( + [ + ("bycol1", TransWithNames(), slice(1, 2)), + ("bycol2", "drop", ["d"]), + ], + "passthrough", + ["b", "a", "c"], + ), + ( + [ + ("bycol1", TransWithNames(), ["b"]), + ("bycol2", "drop", slice(3, 4)), + ], + "passthrough", + ["b", "a", "c"], + ), + ( + [ + ("bycol1", TransWithNames(), ["d", "c"]), + ("bycol2", "passthrough", slice(0, 2)), + ], + "drop", + ["d", "c", "a", "b"], + ), + ( + [ + ("bycol1", TransWithNames(), slice("a", "b")), + ("bycol2", "drop", ["d"]), + ], + "passthrough", + ["a", "b", "c"], + ), + ( + [ + ("bycol1", TransWithNames(), ["b"]), + ("bycol2", "drop", slice("c", "d")), + ], + "passthrough", + ["b", "a"], + ), + ( + [ + ("bycol1", TransWithNames(), ["d", "c"]), + ("bycol2", "passthrough", slice("a", "b")), + ], + "drop", + ["d", "c", "a", "b"], + ), + ( + [ + ("bycol1", TransWithNames(), ["d", "c"]), + ("bycol2", "passthrough", slice("b", "b")), + ], + "drop", + ["d", "c", "b"], + ), + ], +) +def test_verbose_feature_names_out_false(transformers, remainder, expected_names): + """Check feature_names_out for verbose_feature_names_out=False""" + pd = pytest.importorskip("pandas") + df = pd.DataFrame([[1, 2, 3, 4]], columns=["a", "b", "c", "d"]) + ct = ColumnTransformer( + transformers, + remainder=remainder, + verbose_feature_names_out=False, + ) + ct.fit(df) + + names = ct.get_feature_names_out() + assert isinstance(names, np.ndarray) + assert names.dtype == object + assert_array_equal(names, expected_names) + + +@pytest.mark.parametrize( + "transformers, remainder, colliding_columns", + [ + ( + [ + ("bycol1", TransWithNames(), ["b"]), + ("bycol2", "passthrough", ["b"]), + ], + "drop", + "['b']", + ), + ( + [ + ("bycol1", TransWithNames(["c", "d"]), ["c"]), + ("bycol2", "passthrough", ["c"]), + ], + "drop", + "['c']", + ), + ( + [ + ("bycol1", TransWithNames(["a"]), ["b"]), + ("bycol2", "passthrough", ["b"]), + ], + "passthrough", + "['a']", + ), + ( + [ + ("bycol1", TransWithNames(["a"]), ["b"]), + ("bycol2", "drop", ["b"]), + ], + "passthrough", + "['a']", + ), + ( + [ + ("bycol1", TransWithNames(["c", "b"]), ["b"]), + ("bycol2", "passthrough", ["c", "b"]), + ], + "drop", + "['b', 'c']", + ), + ( + [ + ("bycol1", TransWithNames(["a"]), ["b"]), + ("bycol2", "passthrough", ["a"]), + ("bycol3", TransWithNames(["a"]), ["b"]), + ], + "passthrough", + "['a']", + ), + ( + [ + ("bycol1", TransWithNames(["a", "b"]), ["b"]), + ("bycol2", "passthrough", ["a"]), + ("bycol3", TransWithNames(["b"]), ["c"]), + ], + "passthrough", + "['a', 'b']", + ), + ( + [ + ("bycol1", TransWithNames([f"pca{i}" for i in range(6)]), ["b"]), + ("bycol2", TransWithNames([f"pca{i}" for i in range(6)]), ["b"]), + ], + "passthrough", + "['pca0', 'pca1', 'pca2', 'pca3', 'pca4', ...]", + ), + ( + [ + ("bycol1", TransWithNames(["a", "b"]), slice(1, 2)), + ("bycol2", "passthrough", ["a"]), + ("bycol3", TransWithNames(["b"]), ["c"]), + ], + "passthrough", + "['a', 'b']", + ), + ( + [ + ("bycol1", TransWithNames(["a", 
"b"]), ["b"]), + ("bycol2", "passthrough", slice(0, 1)), + ("bycol3", TransWithNames(["b"]), ["c"]), + ], + "passthrough", + "['a', 'b']", + ), + ( + [ + ("bycol1", TransWithNames(["a", "b"]), slice("b", "c")), + ("bycol2", "passthrough", ["a"]), + ("bycol3", TransWithNames(["b"]), ["c"]), + ], + "passthrough", + "['a', 'b']", + ), + ( + [ + ("bycol1", TransWithNames(["a", "b"]), ["b"]), + ("bycol2", "passthrough", slice("a", "a")), + ("bycol3", TransWithNames(["b"]), ["c"]), + ], + "passthrough", + "['a', 'b']", + ), + ], +) +def test_verbose_feature_names_out_false_errors( + transformers, remainder, colliding_columns +): + """Check feature_names_out for verbose_feature_names_out=False""" + + pd = pytest.importorskip("pandas") + df = pd.DataFrame([[1, 2, 3, 4]], columns=["a", "b", "c", "d"]) + ct = ColumnTransformer( + transformers, + remainder=remainder, + verbose_feature_names_out=False, + ) + ct.fit(df) + + msg = re.escape( + f"Output feature names: {colliding_columns} are not unique. Please set " + "verbose_feature_names_out=True to add prefixes to feature names" + ) + with pytest.raises(ValueError, match=msg): + ct.get_feature_names_out() + + +@pytest.mark.parametrize("verbose_feature_names_out", [True, False]) +@pytest.mark.parametrize("remainder", ["drop", "passthrough"]) +def test_column_transformer_set_output(verbose_feature_names_out, remainder): + """Check column transformer behavior with set_output.""" + pd = pytest.importorskip("pandas") + df = pd.DataFrame([[1, 2, 3, 4]], columns=["a", "b", "c", "d"], index=[10]) + ct = ColumnTransformer( + [("first", TransWithNames(), ["a", "c"]), ("second", TransWithNames(), ["d"])], + remainder=remainder, + verbose_feature_names_out=verbose_feature_names_out, + ) + X_trans = ct.fit_transform(df) + assert isinstance(X_trans, np.ndarray) + + ct.set_output(transform="pandas") + + df_test = pd.DataFrame([[1, 2, 3, 4]], columns=df.columns, index=[20]) + X_trans = ct.transform(df_test) + assert isinstance(X_trans, pd.DataFrame) + + feature_names_out = ct.get_feature_names_out() + assert_array_equal(X_trans.columns, feature_names_out) + assert_array_equal(X_trans.index, df_test.index) + + +@pytest.mark.parametrize("remainder", ["drop", "passthrough"]) +@pytest.mark.parametrize("fit_transform", [True, False]) +def test_column_transform_set_output_mixed(remainder, fit_transform): + """Check ColumnTransformer outputs mixed types correctly.""" + pd = pytest.importorskip("pandas") + df = pd.DataFrame( + { + "pet": pd.Series(["dog", "cat", "snake"], dtype="category"), + "color": pd.Series(["green", "blue", "red"], dtype="object"), + "age": [1.4, 2.1, 4.4], + "height": [20, 40, 10], + "distance": pd.Series([20, pd.NA, 100], dtype="Int32"), + } + ) + ct = ColumnTransformer( + [ + ( + "color_encode", + OneHotEncoder(sparse_output=False, dtype="int8"), + ["color"], + ), + ("age", StandardScaler(), ["age"]), + ], + remainder=remainder, + verbose_feature_names_out=False, + ).set_output(transform="pandas") + if fit_transform: + X_trans = ct.fit_transform(df) + else: + X_trans = ct.fit(df).transform(df) + + assert isinstance(X_trans, pd.DataFrame) + assert_array_equal(X_trans.columns, ct.get_feature_names_out()) + + expected_dtypes = { + "color_blue": "int8", + "color_green": "int8", + "color_red": "int8", + "age": "float64", + "pet": "category", + "height": "int64", + "distance": "Int32", + } + for col, dtype in X_trans.dtypes.items(): + assert dtype == expected_dtypes[col] + + +@pytest.mark.parametrize("remainder", ["drop", "passthrough"]) +def 
test_column_transform_set_output_after_fitting(remainder): + pd = pytest.importorskip("pandas") + df = pd.DataFrame( + { + "pet": pd.Series(["dog", "cat", "snake"], dtype="category"), + "age": [1.4, 2.1, 4.4], + "height": [20, 40, 10], + } + ) + ct = ColumnTransformer( + [ + ( + "color_encode", + OneHotEncoder(sparse_output=False, dtype="int16"), + ["pet"], + ), + ("age", StandardScaler(), ["age"]), + ], + remainder=remainder, + verbose_feature_names_out=False, + ) + + # fit without calling set_output + X_trans = ct.fit_transform(df) + assert isinstance(X_trans, np.ndarray) + assert X_trans.dtype == "float64" + + ct.set_output(transform="pandas") + X_trans_df = ct.transform(df) + expected_dtypes = { + "pet_cat": "int16", + "pet_dog": "int16", + "pet_snake": "int16", + "height": "int64", + "age": "float64", + } + for col, dtype in X_trans_df.dtypes.items(): + assert dtype == expected_dtypes[col] + + +# PandasOutTransformer that does not define get_feature_names_out and always expects +# the input to be a DataFrame. +class PandasOutTransformer(BaseEstimator): + def __init__(self, offset=1.0): + self.offset = offset + + def fit(self, X, y=None): + pd = pytest.importorskip("pandas") + assert isinstance(X, pd.DataFrame) + return self + + def transform(self, X, y=None): + pd = pytest.importorskip("pandas") + assert isinstance(X, pd.DataFrame) + return X - self.offset + + def set_output(self, transform=None): + # This transformer will always output a DataFrame regardless of the + # configuration. + return self + + +@pytest.mark.parametrize( + "trans_1, expected_verbose_names, expected_non_verbose_names", + [ + ( + PandasOutTransformer(offset=2.0), + ["trans_0__feat1", "trans_1__feat0"], + ["feat1", "feat0"], + ), + ( + "drop", + ["trans_0__feat1"], + ["feat1"], + ), + ( + "passthrough", + ["trans_0__feat1", "trans_1__feat0"], + ["feat1", "feat0"], + ), + ], +) +def test_transformers_with_pandas_out_but_not_feature_names_out( + trans_1, expected_verbose_names, expected_non_verbose_names +): + """Check that set_config(transform="pandas") is compatible with more transformers. + + Specifically, if transformers returns a DataFrame, but does not define + `get_feature_names_out`. + """ + pd = pytest.importorskip("pandas") + + X_df = pd.DataFrame({"feat0": [1.0, 2.0, 3.0], "feat1": [2.0, 3.0, 4.0]}) + ct = ColumnTransformer( + [ + ("trans_0", PandasOutTransformer(offset=3.0), ["feat1"]), + ("trans_1", trans_1, ["feat0"]), + ] + ) + X_trans_np = ct.fit_transform(X_df) + assert isinstance(X_trans_np, np.ndarray) + + # `ct` does not have `get_feature_names_out` because `PandasOutTransformer` does + # not define the method. + with pytest.raises(AttributeError, match="not provide get_feature_names_out"): + ct.get_feature_names_out() + + # The feature names are prefixed because verbose_feature_names_out=True is default + ct.set_output(transform="pandas") + X_trans_df0 = ct.fit_transform(X_df) + assert_array_equal(X_trans_df0.columns, expected_verbose_names) + + ct.set_params(verbose_feature_names_out=False) + X_trans_df1 = ct.fit_transform(X_df) + assert_array_equal(X_trans_df1.columns, expected_non_verbose_names) + + +@pytest.mark.parametrize( + "empty_selection", + [[], np.array([False, False]), [False, False]], + ids=["list", "bool", "bool_int"], +) +def test_empty_selection_pandas_output(empty_selection): + """Check that pandas output works when there is an empty selection. 
+ + Non-regression test for gh-25487 + """ + pd = pytest.importorskip("pandas") + + X = pd.DataFrame([[1.0, 2.2], [3.0, 1.0]], columns=["a", "b"]) + ct = ColumnTransformer( + [ + ("categorical", "passthrough", empty_selection), + ("numerical", StandardScaler(), ["a", "b"]), + ], + verbose_feature_names_out=True, + ) + ct.set_output(transform="pandas") + X_out = ct.fit_transform(X) + assert_array_equal(X_out.columns, ["numerical__a", "numerical__b"]) + + ct.set_params(verbose_feature_names_out=False) + X_out = ct.fit_transform(X) + assert_array_equal(X_out.columns, ["a", "b"]) + + +def test_raise_error_if_index_not_aligned(): + """Check column transformer raises error if indices are not aligned. + + Non-regression test for gh-26210. + """ + pd = pytest.importorskip("pandas") + + X = pd.DataFrame([[1.0, 2.2], [3.0, 1.0]], columns=["a", "b"], index=[8, 3]) + reset_index_transformer = FunctionTransformer( + lambda x: x.reset_index(drop=True), feature_names_out="one-to-one" + ) + + ct = ColumnTransformer( + [ + ("num1", "passthrough", ["a"]), + ("num2", reset_index_transformer, ["b"]), + ], + ) + ct.set_output(transform="pandas") + msg = ( + "Concatenating DataFrames from the transformer's output lead to" + " an inconsistent number of samples. The output may have Pandas" + " Indexes that do not match." + ) + with pytest.raises(ValueError, match=msg): + ct.fit_transform(X) + + +def test_remainder_set_output(): + """Check that the output is set for the remainder. + + Non-regression test for #26306. + """ + + pd = pytest.importorskip("pandas") + df = pd.DataFrame({"a": [True, False, True], "b": [1, 2, 3]}) + + ct = make_column_transformer( + (VarianceThreshold(), make_column_selector(dtype_include=bool)), + remainder=VarianceThreshold(), + verbose_feature_names_out=False, + ) + ct.set_output(transform="pandas") + + out = ct.fit_transform(df) + pd.testing.assert_frame_equal(out, df) + + ct.set_output(transform="default") + out = ct.fit_transform(df) + assert isinstance(out, np.ndarray) + + +def test_transform_pd_na(): + """Check behavior when a tranformer's output contains pandas.NA + + It should raise an error unless the output config is set to 'pandas'. 
+ """ + pd = pytest.importorskip("pandas") + if not hasattr(pd, "Float64Dtype"): + pytest.skip( + "The issue with pd.NA tested here does not happen in old versions that do" + " not have the extension dtypes" + ) + df = pd.DataFrame({"a": [1.5, None]}) + ct = make_column_transformer(("passthrough", ["a"])) + # No warning with non-extension dtypes and np.nan + with warnings.catch_warnings(): + warnings.simplefilter("error") + ct.fit_transform(df) + df = df.convert_dtypes() + + # Error with extension dtype and pd.NA + with pytest.raises(ValueError, match=r"set_output\(transform='pandas'\)"): + ct.fit_transform(df) + + # No error when output is set to pandas + ct.set_output(transform="pandas") + ct.fit_transform(df) + ct.set_output(transform="default") + + # No error when there are no pd.NA + ct.fit_transform(df.fillna(-1.0)) + + +def test_dataframe_different_dataframe_libraries(): + """Check fitting and transforming on pandas and polars dataframes.""" + pd = pytest.importorskip("pandas") + pl = pytest.importorskip("polars") + X_train_np = np.array([[0, 1], [2, 4], [4, 5]]) + X_test_np = np.array([[1, 2], [1, 3], [2, 3]]) + + # Fit on pandas and transform on polars + X_train_pd = pd.DataFrame(X_train_np, columns=["a", "b"]) + X_test_pl = pl.DataFrame(X_test_np, schema=["a", "b"]) + + ct = make_column_transformer((Trans(), [0, 1])) + ct.fit(X_train_pd) + + out_pl_in = ct.transform(X_test_pl) + assert_array_equal(out_pl_in, X_test_np) + + # Fit on polars and transform on pandas + X_train_pl = pl.DataFrame(X_train_np, schema=["a", "b"]) + X_test_pd = pd.DataFrame(X_test_np, columns=["a", "b"]) + ct.fit(X_train_pl) + + out_pd_in = ct.transform(X_test_pd) + assert_array_equal(out_pd_in, X_test_np) + + +def test_column_transformer__getitem__(): + """Check __getitem__ for ColumnTransformer.""" + X = np.array([[0, 1, 2], [3, 4, 5]]) + ct = ColumnTransformer([("t1", Trans(), [0, 1]), ("t2", Trans(), [1, 2])]) + + msg = "ColumnTransformer is subscriptable after it is fitted" + with pytest.raises(TypeError, match=msg): + ct["t1"] + + ct.fit(X) + assert ct["t1"] is ct.named_transformers_["t1"] + assert ct["t2"] is ct.named_transformers_["t2"] + + msg = "'does_not_exist' is not a valid transformer name" + with pytest.raises(KeyError, match=msg): + ct["does_not_exist"] + + +@pytest.mark.parametrize("transform_output", ["default", "pandas"]) +def test_column_transformer_remainder_passthrough_naming_consistency(transform_output): + """Check that when `remainder="passthrough"`, inconsistent naming is handled + correctly by the underlying `FunctionTransformer`. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/28232 + """ + pd = pytest.importorskip("pandas") + X = pd.DataFrame(np.random.randn(10, 4)) + + preprocessor = ColumnTransformer( + transformers=[("scaler", StandardScaler(), [0, 1])], + remainder="passthrough", + ).set_output(transform=transform_output) + X_trans = preprocessor.fit_transform(X) + assert X_trans.shape == X.shape + + expected_column_names = [ + "scaler__x0", + "scaler__x1", + "remainder__x2", + "remainder__x3", + ] + if hasattr(X_trans, "columns"): + assert X_trans.columns.tolist() == expected_column_names + assert preprocessor.get_feature_names_out().tolist() == expected_column_names + + +@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"]) +def test_column_transformer_column_renaming(dataframe_lib): + """Check that we properly rename columns when using `ColumnTransformer` and + selected columns are redundant between transformers. 
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/28260 + """ + lib = pytest.importorskip(dataframe_lib) + + df = lib.DataFrame({"x1": [1, 2, 3], "x2": [10, 20, 30], "x3": [100, 200, 300]}) + + transformer = ColumnTransformer( + transformers=[ + ("A", "passthrough", ["x1", "x2", "x3"]), + ("B", FunctionTransformer(), ["x1", "x2"]), + ("C", StandardScaler(), ["x1", "x3"]), + # special case of a transformer returning 0-columns, e.g feature selector + ( + "D", + FunctionTransformer(lambda x: _safe_indexing(x, [], axis=1)), + ["x1", "x2", "x3"], + ), + ], + verbose_feature_names_out=True, + ).set_output(transform=dataframe_lib) + df_trans = transformer.fit_transform(df) + assert list(df_trans.columns) == [ + "A__x1", + "A__x2", + "A__x3", + "B__x1", + "B__x2", + "C__x1", + "C__x3", + ] + + +@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"]) +def test_column_transformer_error_with_duplicated_columns(dataframe_lib): + """Check that we raise an error when using `ColumnTransformer` and + the columns names are duplicated between transformers.""" + lib = pytest.importorskip(dataframe_lib) + + df = lib.DataFrame({"x1": [1, 2, 3], "x2": [10, 20, 30], "x3": [100, 200, 300]}) + + transformer = ColumnTransformer( + transformers=[ + ("A", "passthrough", ["x1", "x2", "x3"]), + ("B", FunctionTransformer(), ["x1", "x2"]), + ("C", StandardScaler(), ["x1", "x3"]), + # special case of a transformer returning 0-columns, e.g feature selector + ( + "D", + FunctionTransformer(lambda x: _safe_indexing(x, [], axis=1)), + ["x1", "x2", "x3"], + ), + ], + verbose_feature_names_out=False, + ).set_output(transform=dataframe_lib) + err_msg = re.escape( + "Duplicated feature names found before concatenating the outputs of the " + "transformers: ['x1', 'x2', 'x3'].\n" + "Transformer A has conflicting columns names: ['x1', 'x2', 'x3'].\n" + "Transformer B has conflicting columns names: ['x1', 'x2'].\n" + "Transformer C has conflicting columns names: ['x1', 'x3'].\n" + ) + with pytest.raises(ValueError, match=err_msg): + transformer.fit_transform(df) + + +@pytest.mark.skipif( + parse_version(joblib.__version__) < parse_version("1.3"), + reason="requires joblib >= 1.3", +) +def test_column_transformer_auto_memmap(): + """Check that ColumnTransformer works in parallel with joblib's auto-memmapping. + + non-regression test for issue #28781 + """ + X = np.random.RandomState(0).uniform(size=(3, 4)) + + scaler = StandardScaler(copy=False) + + transformer = ColumnTransformer( + transformers=[("scaler", scaler, [0])], + n_jobs=2, + ) + + with joblib.parallel_backend("loky", max_nbytes=1): + Xt = transformer.fit_transform(X) + + assert_allclose(Xt, StandardScaler().fit_transform(X[:, [0]])) + + +def test_column_transformer_non_default_index(): + """Check index handling when both pd.Series and pd.DataFrame slices are used in + ColumnTransformer. + + Non-regression test for issue #31546. 
+ """ + pd = pytest.importorskip("pandas") + df = pd.DataFrame( + { + "dict_col": [{"foo": 1, "bar": 2}, {"foo": 3, "baz": 1}], + "dummy_col": [1, 2], + }, + index=[1, 2], + ) + t = make_column_transformer( + (DictVectorizer(sparse=False), "dict_col"), + (FunctionTransformer(), ["dummy_col"]), + ) + t.set_output(transform="pandas") + X = t.fit_transform(df) + assert list(X.index) == [1, 2] + + +# Metadata Routing Tests +# ====================== + + +@pytest.mark.parametrize("method", ["transform", "fit_transform", "fit"]) +def test_routing_passed_metadata_not_supported(method): + """Test that the right error message is raised when metadata is passed while + not supported when `enable_metadata_routing=False`.""" + + X = np.array([[0, 1, 2], [2, 4, 6]]).T + y = [1, 2, 3] + trs = ColumnTransformer([("trans", Trans(), [0])]).fit(X, y) + + with pytest.raises( + ValueError, match="is only supported if enable_metadata_routing=True" + ): + getattr(trs, method)([[1]], sample_weight=[1], prop="a") + + +@pytest.mark.parametrize("method", ["transform", "fit_transform", "fit"]) +@config_context(enable_metadata_routing=True) +def test_metadata_routing_for_column_transformer(method): + """Test that metadata is routed correctly for column transformer.""" + X = np.array([[0, 1, 2], [2, 4, 6]]).T + y = [1, 2, 3] + registry = _Registry() + sample_weight, metadata = [1], "a" + trs = ColumnTransformer( + [ + ( + "trans", + ConsumingTransformer(registry=registry) + .set_fit_request(sample_weight=True, metadata=True) + .set_transform_request(sample_weight=True, metadata=True), + [0], + ) + ] + ) + + if method == "transform": + trs.fit(X, y, sample_weight=sample_weight, metadata=metadata) + trs.transform(X, sample_weight=sample_weight, metadata=metadata) + else: + getattr(trs, method)(X, y, sample_weight=sample_weight, metadata=metadata) + + assert len(registry) + for _trs in registry: + check_recorded_metadata( + obj=_trs, + method=method, + parent=method, + sample_weight=sample_weight, + metadata=metadata, + ) + + +@config_context(enable_metadata_routing=True) +def test_metadata_routing_no_fit_transform(): + """Test metadata routing when the sub-estimator doesn't implement + ``fit_transform``.""" + + class NoFitTransform(BaseEstimator): + def fit(self, X, y=None, sample_weight=None, metadata=None): + assert sample_weight + assert metadata + return self + + def transform(self, X, sample_weight=None, metadata=None): + assert sample_weight + assert metadata + return X + + X = np.array([[0, 1, 2], [2, 4, 6]]).T + y = [1, 2, 3] + sample_weight, metadata = [1], "a" + trs = ColumnTransformer( + [ + ( + "trans", + NoFitTransform() + .set_fit_request(sample_weight=True, metadata=True) + .set_transform_request(sample_weight=True, metadata=True), + [0], + ) + ] + ) + + trs.fit(X, y, sample_weight=sample_weight, metadata=metadata) + trs.fit_transform(X, y, sample_weight=sample_weight, metadata=metadata) + + +@pytest.mark.parametrize("method", ["transform", "fit_transform", "fit"]) +@config_context(enable_metadata_routing=True) +def test_metadata_routing_error_for_column_transformer(method): + """Test that the right error is raised when metadata is not requested.""" + X = np.array([[0, 1, 2], [2, 4, 6]]).T + y = [1, 2, 3] + sample_weight, metadata = [1], "a" + trs = ColumnTransformer([("trans", ConsumingTransformer(), [0])]) + + error_message = ( + "[sample_weight, metadata] are passed but are not explicitly set as requested" + f" or not requested for ConsumingTransformer.{method}" + ) + with pytest.raises(ValueError, 
match=re.escape(error_message)): + if method == "transform": + trs.fit(X, y) + trs.transform(X, sample_weight=sample_weight, metadata=metadata) + else: + getattr(trs, method)(X, y, sample_weight=sample_weight, metadata=metadata) + + +@config_context(enable_metadata_routing=True) +def test_get_metadata_routing_works_without_fit(): + # Regression test for https://github.com/scikit-learn/scikit-learn/issues/28186 + # Make sure ct.get_metadata_routing() works w/o having called fit. + ct = ColumnTransformer([("trans", ConsumingTransformer(), [0])]) + ct.get_metadata_routing() + + +@config_context(enable_metadata_routing=True) +def test_remainder_request_always_present(): + # Test that remainder request is always present. + ct = ColumnTransformer( + [("trans", StandardScaler(), [0])], + remainder=ConsumingTransformer() + .set_fit_request(metadata=True) + .set_transform_request(metadata=True), + ) + router = ct.get_metadata_routing() + assert router.consumes("fit", ["metadata"]) == set(["metadata"]) + + +@config_context(enable_metadata_routing=True) +def test_unused_transformer_request_present(): + # Test that the request of a transformer is always present even when not + # used due to no selected columns. + ct = ColumnTransformer( + [ + ( + "trans", + ConsumingTransformer() + .set_fit_request(metadata=True) + .set_transform_request(metadata=True), + lambda X: [], + ) + ] + ) + router = ct.get_metadata_routing() + assert router.consumes("fit", ["metadata"]) == set(["metadata"]) + + +# End of Metadata Routing Tests +# ============================= diff --git a/.venv/lib/python3.12/site-packages/sklearn/compose/tests/test_target.py b/.venv/lib/python3.12/site-packages/sklearn/compose/tests/test_target.py new file mode 100644 index 0000000000000000000000000000000000000000..19dcfb5dc7f031f8b1a5303c84e84389fbcccc1e --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/compose/tests/test_target.py @@ -0,0 +1,439 @@ +import warnings + +import numpy as np +import pytest + +from sklearn import config_context, datasets +from sklearn.base import BaseEstimator, TransformerMixin, clone +from sklearn.compose import TransformedTargetRegressor +from sklearn.dummy import DummyRegressor +from sklearn.linear_model import LinearRegression, OrthogonalMatchingPursuit +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import FunctionTransformer, StandardScaler +from sklearn.utils._testing import assert_allclose + +friedman = datasets.make_friedman1(random_state=0) + + +def test_transform_target_regressor_error(): + X, y = friedman + # provide a transformer and functions at the same time + regr = TransformedTargetRegressor( + regressor=LinearRegression(), + transformer=StandardScaler(), + func=np.exp, + inverse_func=np.log, + ) + with pytest.raises( + ValueError, + match="'transformer' and functions 'func'/'inverse_func' cannot both be set.", + ): + regr.fit(X, y) + # fit with sample_weight with a regressor which does not support it + sample_weight = np.ones((y.shape[0],)) + regr = TransformedTargetRegressor( + regressor=OrthogonalMatchingPursuit(), transformer=StandardScaler() + ) + with pytest.raises( + TypeError, + match=r"fit\(\) got an unexpected keyword argument 'sample_weight'", + ): + regr.fit(X, y, sample_weight=sample_weight) + + # one of (func, inverse_func) is given but the other one is not + regr = TransformedTargetRegressor(func=np.exp) + with pytest.raises( + ValueError, + match="When 'func' is provided, 'inverse_func' must also be provided", + ): + regr.fit(X, y) + + regr = 
TransformedTargetRegressor(inverse_func=np.log) + with pytest.raises( + ValueError, + match="When 'inverse_func' is provided, 'func' must also be provided", + ): + regr.fit(X, y) + + +def test_transform_target_regressor_invertible(): + X, y = friedman + regr = TransformedTargetRegressor( + regressor=LinearRegression(), + func=np.sqrt, + inverse_func=np.log, + check_inverse=True, + ) + with pytest.warns( + UserWarning, + match=(r"The provided functions.* are not strictly inverse of each other"), + ): + regr.fit(X, y) + regr = TransformedTargetRegressor( + regressor=LinearRegression(), func=np.sqrt, inverse_func=np.log + ) + regr.set_params(check_inverse=False) + + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + regr.fit(X, y) + + +def _check_standard_scaled(y, y_pred): + y_mean = np.mean(y, axis=0) + y_std = np.std(y, axis=0) + assert_allclose((y - y_mean) / y_std, y_pred) + + +def _check_shifted_by_one(y, y_pred): + assert_allclose(y + 1, y_pred) + + +def test_transform_target_regressor_functions(): + X, y = friedman + regr = TransformedTargetRegressor( + regressor=LinearRegression(), func=np.log, inverse_func=np.exp + ) + y_pred = regr.fit(X, y).predict(X) + # check the transformer output + y_tran = regr.transformer_.transform(y.reshape(-1, 1)).squeeze() + assert_allclose(np.log(y), y_tran) + assert_allclose( + y, regr.transformer_.inverse_transform(y_tran.reshape(-1, 1)).squeeze() + ) + assert y.shape == y_pred.shape + assert_allclose(y_pred, regr.inverse_func(regr.regressor_.predict(X))) + # check the regressor output + lr = LinearRegression().fit(X, regr.func(y)) + assert_allclose(regr.regressor_.coef_.ravel(), lr.coef_.ravel()) + + +def test_transform_target_regressor_functions_multioutput(): + X = friedman[0] + y = np.vstack((friedman[1], friedman[1] ** 2 + 1)).T + regr = TransformedTargetRegressor( + regressor=LinearRegression(), func=np.log, inverse_func=np.exp + ) + y_pred = regr.fit(X, y).predict(X) + # check the transformer output + y_tran = regr.transformer_.transform(y) + assert_allclose(np.log(y), y_tran) + assert_allclose(y, regr.transformer_.inverse_transform(y_tran)) + assert y.shape == y_pred.shape + assert_allclose(y_pred, regr.inverse_func(regr.regressor_.predict(X))) + # check the regressor output + lr = LinearRegression().fit(X, regr.func(y)) + assert_allclose(regr.regressor_.coef_.ravel(), lr.coef_.ravel()) + + +@pytest.mark.parametrize( + "X,y", [friedman, (friedman[0], np.vstack((friedman[1], friedman[1] ** 2 + 1)).T)] +) +def test_transform_target_regressor_1d_transformer(X, y): + # All transformer in scikit-learn expect 2D data. FunctionTransformer with + # validate=False lift this constraint without checking that the input is a + # 2D vector. We check the consistency of the data shape using a 1D and 2D y + # array. 
+ transformer = FunctionTransformer( + func=lambda x: x + 1, inverse_func=lambda x: x - 1 + ) + regr = TransformedTargetRegressor( + regressor=LinearRegression(), transformer=transformer + ) + y_pred = regr.fit(X, y).predict(X) + assert y.shape == y_pred.shape + # consistency forward transform + y_tran = regr.transformer_.transform(y) + _check_shifted_by_one(y, y_tran) + assert y.shape == y_pred.shape + # consistency inverse transform + assert_allclose(y, regr.transformer_.inverse_transform(y_tran).squeeze()) + # consistency of the regressor + lr = LinearRegression() + transformer2 = clone(transformer) + lr.fit(X, transformer2.fit_transform(y)) + y_lr_pred = lr.predict(X) + assert_allclose(y_pred, transformer2.inverse_transform(y_lr_pred)) + assert_allclose(regr.regressor_.coef_, lr.coef_) + + +@pytest.mark.parametrize( + "X,y", [friedman, (friedman[0], np.vstack((friedman[1], friedman[1] ** 2 + 1)).T)] +) +def test_transform_target_regressor_2d_transformer(X, y): + # Check consistency with transformer accepting only 2D array and a 1D/2D y + # array. + transformer = StandardScaler() + regr = TransformedTargetRegressor( + regressor=LinearRegression(), transformer=transformer + ) + y_pred = regr.fit(X, y).predict(X) + assert y.shape == y_pred.shape + # consistency forward transform + if y.ndim == 1: # create a 2D array and squeeze results + y_tran = regr.transformer_.transform(y.reshape(-1, 1)) + else: + y_tran = regr.transformer_.transform(y) + _check_standard_scaled(y, y_tran.squeeze()) + assert y.shape == y_pred.shape + # consistency inverse transform + assert_allclose(y, regr.transformer_.inverse_transform(y_tran).squeeze()) + # consistency of the regressor + lr = LinearRegression() + transformer2 = clone(transformer) + if y.ndim == 1: # create a 2D array and squeeze results + lr.fit(X, transformer2.fit_transform(y.reshape(-1, 1)).squeeze()) + y_lr_pred = lr.predict(X).reshape(-1, 1) + y_pred2 = transformer2.inverse_transform(y_lr_pred).squeeze() + else: + lr.fit(X, transformer2.fit_transform(y)) + y_lr_pred = lr.predict(X) + y_pred2 = transformer2.inverse_transform(y_lr_pred) + + assert_allclose(y_pred, y_pred2) + assert_allclose(regr.regressor_.coef_, lr.coef_) + + +def test_transform_target_regressor_2d_transformer_multioutput(): + # Check consistency with transformer accepting only 2D array and a 2D y + # array. 
+ X = friedman[0] + y = np.vstack((friedman[1], friedman[1] ** 2 + 1)).T + transformer = StandardScaler() + regr = TransformedTargetRegressor( + regressor=LinearRegression(), transformer=transformer + ) + y_pred = regr.fit(X, y).predict(X) + assert y.shape == y_pred.shape + # consistency forward transform + y_tran = regr.transformer_.transform(y) + _check_standard_scaled(y, y_tran) + assert y.shape == y_pred.shape + # consistency inverse transform + assert_allclose(y, regr.transformer_.inverse_transform(y_tran).squeeze()) + # consistency of the regressor + lr = LinearRegression() + transformer2 = clone(transformer) + lr.fit(X, transformer2.fit_transform(y)) + y_lr_pred = lr.predict(X) + assert_allclose(y_pred, transformer2.inverse_transform(y_lr_pred)) + assert_allclose(regr.regressor_.coef_, lr.coef_) + + +def test_transform_target_regressor_3d_target(): + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/18866 + # Check with a 3D target with a transformer that reshapes the target + X = friedman[0] + y = np.tile(friedman[1].reshape(-1, 1, 1), [1, 3, 2]) + + def flatten_data(data): + return data.reshape(data.shape[0], -1) + + def unflatten_data(data): + return data.reshape(data.shape[0], -1, 2) + + transformer = FunctionTransformer(func=flatten_data, inverse_func=unflatten_data) + regr = TransformedTargetRegressor( + regressor=LinearRegression(), transformer=transformer + ) + y_pred = regr.fit(X, y).predict(X) + assert y.shape == y_pred.shape + + +def test_transform_target_regressor_multi_to_single(): + X = friedman[0] + y = np.transpose([friedman[1], (friedman[1] ** 2 + 1)]) + + def func(y): + out = np.sqrt(y[:, 0] ** 2 + y[:, 1] ** 2) + return out[:, np.newaxis] + + def inverse_func(y): + return y + + tt = TransformedTargetRegressor( + func=func, inverse_func=inverse_func, check_inverse=False + ) + tt.fit(X, y) + y_pred_2d_func = tt.predict(X) + assert y_pred_2d_func.shape == (100, 1) + + # force that the function only return a 1D array + def func(y): + return np.sqrt(y[:, 0] ** 2 + y[:, 1] ** 2) + + tt = TransformedTargetRegressor( + func=func, inverse_func=inverse_func, check_inverse=False + ) + tt.fit(X, y) + y_pred_1d_func = tt.predict(X) + assert y_pred_1d_func.shape == (100, 1) + + assert_allclose(y_pred_1d_func, y_pred_2d_func) + + +class DummyCheckerArrayTransformer(TransformerMixin, BaseEstimator): + def fit(self, X, y=None): + assert isinstance(X, np.ndarray) + return self + + def transform(self, X): + assert isinstance(X, np.ndarray) + return X + + def inverse_transform(self, X): + assert isinstance(X, np.ndarray) + return X + + +class DummyCheckerListRegressor(DummyRegressor): + def fit(self, X, y, sample_weight=None): + assert isinstance(X, list) + return super().fit(X, y, sample_weight) + + def predict(self, X): + assert isinstance(X, list) + return super().predict(X) + + +def test_transform_target_regressor_ensure_y_array(): + # check that the target ``y`` passed to the transformer will always be a + # numpy array. Similarly, if ``X`` is passed as a list, we check that the + # predictor receive as it is. 
+ X, y = friedman + tt = TransformedTargetRegressor( + transformer=DummyCheckerArrayTransformer(), + regressor=DummyCheckerListRegressor(), + check_inverse=False, + ) + tt.fit(X.tolist(), y.tolist()) + tt.predict(X.tolist()) + with pytest.raises(AssertionError): + tt.fit(X, y.tolist()) + with pytest.raises(AssertionError): + tt.predict(X) + + +class DummyTransformer(TransformerMixin, BaseEstimator): + """Dummy transformer which count how many time fit was called.""" + + def __init__(self, fit_counter=0): + self.fit_counter = fit_counter + + def fit(self, X, y=None): + self.fit_counter += 1 + return self + + def transform(self, X): + return X + + def inverse_transform(self, X): + return X + + +@pytest.mark.parametrize("check_inverse", [False, True]) +def test_transform_target_regressor_count_fit(check_inverse): + # regression test for gh-issue #11618 + # check that we only call a single time fit for the transformer + X, y = friedman + ttr = TransformedTargetRegressor( + transformer=DummyTransformer(), check_inverse=check_inverse + ) + ttr.fit(X, y) + assert ttr.transformer_.fit_counter == 1 + + +class DummyRegressorWithExtraFitParams(DummyRegressor): + def fit(self, X, y, sample_weight=None, check_input=True): + # on the test below we force this to false, we make sure this is + # actually passed to the regressor + assert not check_input + return super().fit(X, y, sample_weight) + + +def test_transform_target_regressor_pass_fit_parameters(): + X, y = friedman + regr = TransformedTargetRegressor( + regressor=DummyRegressorWithExtraFitParams(), transformer=DummyTransformer() + ) + + regr.fit(X, y, check_input=False) + assert regr.transformer_.fit_counter == 1 + + +def test_transform_target_regressor_route_pipeline(): + X, y = friedman + + regr = TransformedTargetRegressor( + regressor=DummyRegressorWithExtraFitParams(), transformer=DummyTransformer() + ) + estimators = [("normalize", StandardScaler()), ("est", regr)] + + pip = Pipeline(estimators) + pip.fit(X, y, **{"est__check_input": False}) + + assert regr.transformer_.fit_counter == 1 + + +class DummyRegressorWithExtraPredictParams(DummyRegressor): + def predict(self, X, check_input=True): + # In the test below we make sure that the check input parameter is + # passed as false + self.predict_called = True + assert not check_input + return super().predict(X) + + +def test_transform_target_regressor_pass_extra_predict_parameters(): + # Checks that predict kwargs are passed to regressor. 
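+    # A brief note on how the check works: DummyRegressorWithExtraPredictParams
+    # (defined above) asserts that it receives ``check_input=False`` and records
+    # ``predict_called``, so a passing test means the kwarg was forwarded.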
+ X, y = friedman + regr = TransformedTargetRegressor( + regressor=DummyRegressorWithExtraPredictParams(), transformer=DummyTransformer() + ) + + regr.fit(X, y) + regr.predict(X, check_input=False) + assert regr.regressor_.predict_called + + +@pytest.mark.parametrize("output_format", ["pandas", "polars"]) +def test_transform_target_regressor_not_warns_with_global_output_set(output_format): + """Test that TransformedTargetRegressor will not raise warnings if + set_config(transform_output="pandas"/"polars") is set globally; regression test for + issue #29361.""" + X, y = datasets.make_regression() + y = np.abs(y) + 1 + with config_context(transform_output=output_format): + with warnings.catch_warnings(): + warnings.simplefilter("error") + TransformedTargetRegressor( + regressor=LinearRegression(), func=np.log, inverse_func=np.exp + ).fit(X, y) + + +class ValidateDimensionRegressor(BaseEstimator): + """A regressor that expects the target to have a specific number of dimensions.""" + + def __init__(self, ndim): + self.ndim = ndim + + def fit(self, X, y): + assert y.ndim == self.ndim + + def predict(self, X): + pass # pragma: no cover + + +@pytest.mark.parametrize("ndim", [1, 2]) +def test_transform_target_regressor_preserves_input_shape(ndim): + """Check that TransformedTargetRegressor internally preserves the shape of the input + + non-regression test for issue #26530. + """ + X, y = datasets.make_regression(n_samples=10, n_features=5, random_state=42) + if ndim == 2: + y = y.reshape(-1, 1) + + regr = TransformedTargetRegressor(regressor=ValidateDimensionRegressor(ndim)) + regr.fit(X, y) diff --git a/.venv/lib/python3.12/site-packages/sklearn/covariance/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/covariance/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..65817ef7b977b84bcd4c8eb913866d54ce756999 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/covariance/__init__.py @@ -0,0 +1,46 @@ +"""Methods and algorithms to robustly estimate covariance. + +They estimate the covariance of features at given sets of points, as well as the +precision matrix defined as the inverse of the covariance. Covariance estimation is +closely related to the theory of Gaussian graphical models. 
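+
+A minimal usage sketch with synthetic, illustrative data (the estimators in this
+module all follow the usual fit/attribute pattern):
+
+    >>> import numpy as np
+    >>> from sklearn.covariance import EmpiricalCovariance
+    >>> X = np.random.RandomState(0).randn(50, 3)
+    >>> cov = EmpiricalCovariance().fit(X)
+    >>> cov.covariance_.shape
+    (3, 3)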
+""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ._elliptic_envelope import EllipticEnvelope +from ._empirical_covariance import ( + EmpiricalCovariance, + empirical_covariance, + log_likelihood, +) +from ._graph_lasso import GraphicalLasso, GraphicalLassoCV, graphical_lasso +from ._robust_covariance import MinCovDet, fast_mcd +from ._shrunk_covariance import ( + OAS, + LedoitWolf, + ShrunkCovariance, + ledoit_wolf, + ledoit_wolf_shrinkage, + oas, + shrunk_covariance, +) + +__all__ = [ + "OAS", + "EllipticEnvelope", + "EmpiricalCovariance", + "GraphicalLasso", + "GraphicalLassoCV", + "LedoitWolf", + "MinCovDet", + "ShrunkCovariance", + "empirical_covariance", + "fast_mcd", + "graphical_lasso", + "ledoit_wolf", + "ledoit_wolf_shrinkage", + "log_likelihood", + "oas", + "shrunk_covariance", +] diff --git a/.venv/lib/python3.12/site-packages/sklearn/covariance/_elliptic_envelope.py b/.venv/lib/python3.12/site-packages/sklearn/covariance/_elliptic_envelope.py new file mode 100644 index 0000000000000000000000000000000000000000..71fb72ccd683d04a708162774487922b719cbe4c --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/covariance/_elliptic_envelope.py @@ -0,0 +1,266 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from numbers import Real + +import numpy as np + +from ..base import OutlierMixin, _fit_context +from ..metrics import accuracy_score +from ..utils._param_validation import Interval +from ..utils.validation import check_is_fitted +from ._robust_covariance import MinCovDet + + +class EllipticEnvelope(OutlierMixin, MinCovDet): + """An object for detecting outliers in a Gaussian distributed dataset. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + store_precision : bool, default=True + Specify if the estimated precision is stored. + + assume_centered : bool, default=False + If True, the support of robust location and covariance estimates + is computed, and a covariance estimate is recomputed from it, + without centering the data. + Useful to work with data whose mean is significantly equal to + zero but is not exactly zero. + If False, the robust location and covariance are directly computed + with the FastMCD algorithm without additional treatment. + + support_fraction : float, default=None + The proportion of points to be included in the support of the raw + MCD estimate. If None, the minimum value of support_fraction will + be used within the algorithm: `(n_samples + n_features + 1) / 2 * n_samples`. + Range is (0, 1). + + contamination : float, default=0.1 + The amount of contamination of the data set, i.e. the proportion + of outliers in the data set. Range is (0, 0.5]. + + random_state : int, RandomState instance or None, default=None + Determines the pseudo random number generator for shuffling + the data. Pass an int for reproducible results across multiple function + calls. See :term:`Glossary `. + + Attributes + ---------- + location_ : ndarray of shape (n_features,) + Estimated robust location. + + covariance_ : ndarray of shape (n_features, n_features) + Estimated robust covariance matrix. + + precision_ : ndarray of shape (n_features, n_features) + Estimated pseudo inverse matrix. + (stored only if store_precision is True) + + support_ : ndarray of shape (n_samples,) + A mask of the observations that have been used to compute the + robust estimates of location and shape. + + offset_ : float + Offset used to define the decision function from the raw scores. 
+ We have the relation: ``decision_function = score_samples - offset_``. + The offset depends on the contamination parameter and is defined in + such a way we obtain the expected number of outliers (samples with + decision function < 0) in training. + + .. versionadded:: 0.20 + + raw_location_ : ndarray of shape (n_features,) + The raw robust estimated location before correction and re-weighting. + + raw_covariance_ : ndarray of shape (n_features, n_features) + The raw robust estimated covariance before correction and re-weighting. + + raw_support_ : ndarray of shape (n_samples,) + A mask of the observations that have been used to compute + the raw robust estimates of location and shape, before correction + and re-weighting. + + dist_ : ndarray of shape (n_samples,) + Mahalanobis distances of the training set (on which :meth:`fit` is + called) observations. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + EmpiricalCovariance : Maximum likelihood covariance estimator. + GraphicalLasso : Sparse inverse covariance estimation + with an l1-penalized estimator. + LedoitWolf : LedoitWolf Estimator. + MinCovDet : Minimum Covariance Determinant + (robust estimator of covariance). + OAS : Oracle Approximating Shrinkage Estimator. + ShrunkCovariance : Covariance estimator with shrinkage. + + Notes + ----- + Outlier detection from covariance estimation may break or not + perform well in high-dimensional settings. In particular, one will + always take care to work with ``n_samples > n_features ** 2``. + + References + ---------- + .. [1] Rousseeuw, P.J., Van Driessen, K. "A fast algorithm for the + minimum covariance determinant estimator" Technometrics 41(3), 212 + (1999) + + Examples + -------- + >>> import numpy as np + >>> from sklearn.covariance import EllipticEnvelope + >>> true_cov = np.array([[.8, .3], + ... [.3, .4]]) + >>> X = np.random.RandomState(0).multivariate_normal(mean=[0, 0], + ... cov=true_cov, + ... size=500) + >>> cov = EllipticEnvelope(random_state=0).fit(X) + >>> # predict returns 1 for an inlier and -1 for an outlier + >>> cov.predict([[0, 0], + ... [3, 3]]) + array([ 1, -1]) + >>> cov.covariance_ + array([[0.7411, 0.2535], + [0.2535, 0.3053]]) + >>> cov.location_ + array([0.0813 , 0.0427]) + """ + + _parameter_constraints: dict = { + **MinCovDet._parameter_constraints, + "contamination": [Interval(Real, 0, 0.5, closed="right")], + } + + def __init__( + self, + *, + store_precision=True, + assume_centered=False, + support_fraction=None, + contamination=0.1, + random_state=None, + ): + super().__init__( + store_precision=store_precision, + assume_centered=assume_centered, + support_fraction=support_fraction, + random_state=random_state, + ) + self.contamination = contamination + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit the EllipticEnvelope model. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Returns the instance itself. 
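+
+        Notes
+        -----
+        Fitting first runs the parent :class:`MinCovDet` fit, then sets
+        ``offset_`` to the ``contamination`` quantile of the negated
+        Mahalanobis distances of the training samples, so that roughly a
+        ``contamination`` fraction of the training data receives a negative
+        decision function value.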
+ """ + super().fit(X) + self.offset_ = np.percentile(-self.dist_, 100.0 * self.contamination) + return self + + def decision_function(self, X): + """Compute the decision function of the given observations. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data matrix. + + Returns + ------- + decision : ndarray of shape (n_samples,) + Decision function of the samples. + It is equal to the shifted Mahalanobis distances. + The threshold for being an outlier is 0, which ensures a + compatibility with other outlier detection algorithms. + """ + check_is_fitted(self) + negative_mahal_dist = self.score_samples(X) + return negative_mahal_dist - self.offset_ + + def score_samples(self, X): + """Compute the negative Mahalanobis distances. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data matrix. + + Returns + ------- + negative_mahal_distances : array-like of shape (n_samples,) + Opposite of the Mahalanobis distances. + """ + check_is_fitted(self) + return -self.mahalanobis(X) + + def predict(self, X): + """ + Predict labels (1 inlier, -1 outlier) of X according to fitted model. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data matrix. + + Returns + ------- + is_inlier : ndarray of shape (n_samples,) + Returns -1 for anomalies/outliers and +1 for inliers. + """ + values = self.decision_function(X) + is_inlier = np.full(values.shape[0], -1, dtype=int) + is_inlier[values >= 0] = 1 + + return is_inlier + + def score(self, X, y, sample_weight=None): + """Return the mean accuracy on the given test data and labels. + + In multi-label classification, this is the subset accuracy + which is a harsh metric since you require for each sample that + each label set be correctly predicted. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Test samples. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) + True labels for X. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + score : float + Mean accuracy of self.predict(X) w.r.t. y. + """ + return accuracy_score(y, self.predict(X), sample_weight=sample_weight) diff --git a/.venv/lib/python3.12/site-packages/sklearn/covariance/_empirical_covariance.py b/.venv/lib/python3.12/site-packages/sklearn/covariance/_empirical_covariance.py new file mode 100644 index 0000000000000000000000000000000000000000..c8ee198cc477275da749de0c212c4c874937f51b --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/covariance/_empirical_covariance.py @@ -0,0 +1,370 @@ +""" +Maximum likelihood covariance estimator. + +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +# avoid division truncation +import warnings + +import numpy as np +from scipy import linalg + +from sklearn.utils import metadata_routing + +from .. import config_context +from ..base import BaseEstimator, _fit_context +from ..metrics.pairwise import pairwise_distances +from ..utils import check_array +from ..utils._param_validation import validate_params +from ..utils.extmath import fast_logdet +from ..utils.validation import validate_data + + +@validate_params( + { + "emp_cov": [np.ndarray], + "precision": [np.ndarray], + }, + prefer_skip_nested_validation=True, +) +def log_likelihood(emp_cov, precision): + """Compute the sample mean of the log_likelihood under a covariance model. 
+ + Computes the empirical expected log-likelihood, allowing for universal + comparison (beyond this software package), and accounts for normalization + terms and scaling. + + Parameters + ---------- + emp_cov : ndarray of shape (n_features, n_features) + Maximum Likelihood Estimator of covariance. + + precision : ndarray of shape (n_features, n_features) + The precision matrix of the covariance model to be tested. + + Returns + ------- + log_likelihood_ : float + Sample mean of the log-likelihood. + """ + p = precision.shape[0] + log_likelihood_ = -np.sum(emp_cov * precision) + fast_logdet(precision) + log_likelihood_ -= p * np.log(2 * np.pi) + log_likelihood_ /= 2.0 + return log_likelihood_ + + +@validate_params( + { + "X": ["array-like"], + "assume_centered": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def empirical_covariance(X, *, assume_centered=False): + """Compute the Maximum likelihood covariance estimator. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Data from which to compute the covariance estimate. + + assume_centered : bool, default=False + If `True`, data will not be centered before computation. + Useful when working with data whose mean is almost, but not exactly + zero. + If `False`, data will be centered before computation. + + Returns + ------- + covariance : ndarray of shape (n_features, n_features) + Empirical covariance (Maximum Likelihood Estimator). + + Examples + -------- + >>> from sklearn.covariance import empirical_covariance + >>> X = [[1,1,1],[1,1,1],[1,1,1], + ... [0,0,0],[0,0,0],[0,0,0]] + >>> empirical_covariance(X) + array([[0.25, 0.25, 0.25], + [0.25, 0.25, 0.25], + [0.25, 0.25, 0.25]]) + """ + X = check_array(X, ensure_2d=False, ensure_all_finite=False) + + if X.ndim == 1: + X = np.reshape(X, (1, -1)) + + if X.shape[0] == 1: + warnings.warn( + "Only one sample available. You may want to reshape your data array" + ) + + if assume_centered: + covariance = np.dot(X.T, X) / X.shape[0] + else: + covariance = np.cov(X.T, bias=1) + + if covariance.ndim == 0: + covariance = np.array([[covariance]]) + return covariance + + +class EmpiricalCovariance(BaseEstimator): + """Maximum likelihood covariance estimator. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + store_precision : bool, default=True + Specifies if the estimated precision is stored. + + assume_centered : bool, default=False + If True, data are not centered before computation. + Useful when working with data whose mean is almost, but not exactly + zero. + If False (default), data are centered before computation. + + Attributes + ---------- + location_ : ndarray of shape (n_features,) + Estimated location, i.e. the estimated mean. + + covariance_ : ndarray of shape (n_features, n_features) + Estimated covariance matrix. + + precision_ : ndarray of shape (n_features, n_features) + Estimated pseudo-inverse matrix. + (stored only if store_precision is True) + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + EllipticEnvelope : An object for detecting outliers in + a Gaussian distributed dataset. + GraphicalLasso : Sparse inverse covariance estimation + with an l1-penalized estimator. + LedoitWolf : LedoitWolf Estimator. 
+ MinCovDet : Minimum Covariance Determinant + (robust estimator of covariance). + OAS : Oracle Approximating Shrinkage Estimator. + ShrunkCovariance : Covariance estimator with shrinkage. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.covariance import EmpiricalCovariance + >>> from sklearn.datasets import make_gaussian_quantiles + >>> real_cov = np.array([[.8, .3], + ... [.3, .4]]) + >>> rng = np.random.RandomState(0) + >>> X = rng.multivariate_normal(mean=[0, 0], + ... cov=real_cov, + ... size=500) + >>> cov = EmpiricalCovariance().fit(X) + >>> cov.covariance_ + array([[0.7569, 0.2818], + [0.2818, 0.3928]]) + >>> cov.location_ + array([0.0622, 0.0193]) + """ + + # X_test should have been called X + __metadata_request__score = {"X_test": metadata_routing.UNUSED} + + _parameter_constraints: dict = { + "store_precision": ["boolean"], + "assume_centered": ["boolean"], + } + + def __init__(self, *, store_precision=True, assume_centered=False): + self.store_precision = store_precision + self.assume_centered = assume_centered + + def _set_covariance(self, covariance): + """Saves the covariance and precision estimates + + Storage is done accordingly to `self.store_precision`. + Precision stored only if invertible. + + Parameters + ---------- + covariance : array-like of shape (n_features, n_features) + Estimated covariance matrix to be stored, and from which precision + is computed. + """ + covariance = check_array(covariance) + # set covariance + self.covariance_ = covariance + # set precision + if self.store_precision: + self.precision_ = linalg.pinvh(covariance, check_finite=False) + else: + self.precision_ = None + + def get_precision(self): + """Getter for the precision matrix. + + Returns + ------- + precision_ : array-like of shape (n_features, n_features) + The precision matrix associated to the current covariance object. + """ + if self.store_precision: + precision = self.precision_ + else: + precision = linalg.pinvh(self.covariance_, check_finite=False) + return precision + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit the maximum likelihood covariance estimator to X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Returns the instance itself. + """ + X = validate_data(self, X) + if self.assume_centered: + self.location_ = np.zeros(X.shape[1]) + else: + self.location_ = X.mean(0) + covariance = empirical_covariance(X, assume_centered=self.assume_centered) + self._set_covariance(covariance) + + return self + + def score(self, X_test, y=None): + """Compute the log-likelihood of `X_test` under the estimated Gaussian model. + + The Gaussian model is defined by its mean and covariance matrix which are + represented respectively by `self.location_` and `self.covariance_`. + + Parameters + ---------- + X_test : array-like of shape (n_samples, n_features) + Test data of which we compute the likelihood, where `n_samples` is + the number of samples and `n_features` is the number of features. + `X_test` is assumed to be drawn from the same distribution than + the data used in fit (including centering). + + y : Ignored + Not used, present for API consistency by convention. 
+ + Returns + ------- + res : float + The log-likelihood of `X_test` with `self.location_` and `self.covariance_` + as estimators of the Gaussian model mean and covariance matrix respectively. + """ + X_test = validate_data(self, X_test, reset=False) + # compute empirical covariance of the test set + test_cov = empirical_covariance(X_test - self.location_, assume_centered=True) + # compute log likelihood + res = log_likelihood(test_cov, self.get_precision()) + + return res + + def error_norm(self, comp_cov, norm="frobenius", scaling=True, squared=True): + """Compute the Mean Squared Error between two covariance estimators. + + Parameters + ---------- + comp_cov : array-like of shape (n_features, n_features) + The covariance to compare with. + + norm : {"frobenius", "spectral"}, default="frobenius" + The type of norm used to compute the error. Available error types: + - 'frobenius' (default): sqrt(tr(A^t.A)) + - 'spectral': sqrt(max(eigenvalues(A^t.A)) + where A is the error ``(comp_cov - self.covariance_)``. + + scaling : bool, default=True + If True (default), the squared error norm is divided by n_features. + If False, the squared error norm is not rescaled. + + squared : bool, default=True + Whether to compute the squared error norm or the error norm. + If True (default), the squared error norm is returned. + If False, the error norm is returned. + + Returns + ------- + result : float + The Mean Squared Error (in the sense of the Frobenius norm) between + `self` and `comp_cov` covariance estimators. + """ + # compute the error + error = comp_cov - self.covariance_ + # compute the error norm + if norm == "frobenius": + squared_norm = np.sum(error**2) + elif norm == "spectral": + squared_norm = np.amax(linalg.svdvals(np.dot(error.T, error))) + else: + raise NotImplementedError( + "Only spectral and frobenius norms are implemented" + ) + # optionally scale the error norm + if scaling: + squared_norm = squared_norm / error.shape[0] + # finally get either the squared norm or the norm + if squared: + result = squared_norm + else: + result = np.sqrt(squared_norm) + + return result + + def mahalanobis(self, X): + """Compute the squared Mahalanobis distances of given observations. + + For a detailed example of how outliers affects the Mahalanobis distance, + see :ref:`sphx_glr_auto_examples_covariance_plot_mahalanobis_distances.py`. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The observations, the Mahalanobis distances of the which we + compute. Observations are assumed to be drawn from the same + distribution than the data used in fit. + + Returns + ------- + dist : ndarray of shape (n_samples,) + Squared Mahalanobis distances of the observations. + """ + X = validate_data(self, X, reset=False) + + precision = self.get_precision() + with config_context(assume_finite=True): + # compute mahalanobis distances + dist = pairwise_distances( + X, self.location_[np.newaxis, :], metric="mahalanobis", VI=precision + ) + + return np.reshape(dist, (len(X),)) ** 2 diff --git a/.venv/lib/python3.12/site-packages/sklearn/covariance/_graph_lasso.py b/.venv/lib/python3.12/site-packages/sklearn/covariance/_graph_lasso.py new file mode 100644 index 0000000000000000000000000000000000000000..e94663120216dbeab7f8edd963554b9653e58221 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/covariance/_graph_lasso.py @@ -0,0 +1,1145 @@ +"""GraphicalLasso: sparse inverse covariance estimation with an l1-penalized +estimator. 
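+
+Up to an additive constant, the estimators in this module minimize the
+penalized negative log-likelihood
+
+    -log(det(K)) + trace(S @ K) + alpha * sum_{i != j} |K[i, j]|
+
+over precision matrices K, where S is the empirical covariance and alpha
+controls the sparsity of K; the diagonal of K is not penalized.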
+""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import operator +import sys +import time +import warnings +from numbers import Integral, Real + +import numpy as np +from scipy import linalg + +from ..base import _fit_context +from ..exceptions import ConvergenceWarning + +# mypy error: Module 'sklearn.linear_model' has no attribute '_cd_fast' +from ..linear_model import _cd_fast as cd_fast # type: ignore[attr-defined] +from ..linear_model import lars_path_gram +from ..model_selection import check_cv, cross_val_score +from ..utils import Bunch +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + _raise_for_params, + _routing_enabled, + process_routing, +) +from ..utils.parallel import Parallel, delayed +from ..utils.validation import ( + _is_arraylike_not_scalar, + check_random_state, + check_scalar, + validate_data, +) +from . import EmpiricalCovariance, empirical_covariance, log_likelihood + + +# Helper functions to compute the objective and dual objective functions +# of the l1-penalized estimator +def _objective(mle, precision_, alpha): + """Evaluation of the graphical-lasso objective function + + the objective function is made of a shifted scaled version of the + normalized log-likelihood (i.e. its empirical mean over the samples) and a + penalisation term to promote sparsity + """ + p = precision_.shape[0] + cost = -2.0 * log_likelihood(mle, precision_) + p * np.log(2 * np.pi) + cost += alpha * (np.abs(precision_).sum() - np.abs(np.diag(precision_)).sum()) + return cost + + +def _dual_gap(emp_cov, precision_, alpha): + """Expression of the dual gap convergence criterion + + The specific definition is given in Duchi "Projected Subgradient Methods + for Learning Sparse Gaussians". + """ + gap = np.sum(emp_cov * precision_) + gap -= precision_.shape[0] + gap += alpha * (np.abs(precision_).sum() - np.abs(np.diag(precision_)).sum()) + return gap + + +# The g-lasso algorithm +def _graphical_lasso( + emp_cov, + alpha, + *, + cov_init=None, + mode="cd", + tol=1e-4, + enet_tol=1e-4, + max_iter=100, + verbose=False, + eps=np.finfo(np.float64).eps, +): + _, n_features = emp_cov.shape + if alpha == 0: + # Early return without regularization + precision_ = linalg.inv(emp_cov) + cost = -2.0 * log_likelihood(emp_cov, precision_) + cost += n_features * np.log(2 * np.pi) + d_gap = np.sum(emp_cov * precision_) - n_features + return emp_cov, precision_, (cost, d_gap), 0 + + if cov_init is None: + covariance_ = emp_cov.copy() + else: + covariance_ = cov_init.copy() + # As a trivial regularization (Tikhonov like), we scale down the + # off-diagonal coefficients of our starting point: This is needed, as + # in the cross-validation the cov_init can easily be + # ill-conditioned, and the CV loop blows. Beside, this takes + # conservative stand-point on the initial conditions, and it tends to + # make the convergence go faster. 
+ covariance_ *= 0.95 + diagonal = emp_cov.flat[:: n_features + 1] + covariance_.flat[:: n_features + 1] = diagonal + precision_ = linalg.pinvh(covariance_) + + indices = np.arange(n_features) + i = 0 # initialize the counter to be robust to `max_iter=0` + costs = list() + # The different l1 regression solver have different numerical errors + if mode == "cd": + errors = dict(over="raise", invalid="ignore") + else: + errors = dict(invalid="raise") + try: + # be robust to the max_iter=0 edge case, see: + # https://github.com/scikit-learn/scikit-learn/issues/4134 + d_gap = np.inf + # set a sub_covariance buffer + sub_covariance = np.copy(covariance_[1:, 1:], order="C") + for i in range(max_iter): + for idx in range(n_features): + # To keep the contiguous matrix `sub_covariance` equal to + # covariance_[indices != idx].T[indices != idx] + # we only need to update 1 column and 1 line when idx changes + if idx > 0: + di = idx - 1 + sub_covariance[di] = covariance_[di][indices != idx] + sub_covariance[:, di] = covariance_[:, di][indices != idx] + else: + sub_covariance[:] = covariance_[1:, 1:] + row = emp_cov[idx, indices != idx] + with np.errstate(**errors): + if mode == "cd": + # Use coordinate descent + coefs = -( + precision_[indices != idx, idx] + / (precision_[idx, idx] + 1000 * eps) + ) + coefs, _, _, _ = cd_fast.enet_coordinate_descent_gram( + coefs, + alpha, + 0, + sub_covariance, + row, + row, + max_iter, + enet_tol, + check_random_state(None), + False, + ) + else: # mode == "lars" + _, _, coefs = lars_path_gram( + Xy=row, + Gram=sub_covariance, + n_samples=row.size, + alpha_min=alpha / (n_features - 1), + copy_Gram=True, + eps=eps, + method="lars", + return_path=False, + ) + # Update the precision matrix + precision_[idx, idx] = 1.0 / ( + covariance_[idx, idx] + - np.dot(covariance_[indices != idx, idx], coefs) + ) + precision_[indices != idx, idx] = -precision_[idx, idx] * coefs + precision_[idx, indices != idx] = -precision_[idx, idx] * coefs + coefs = np.dot(sub_covariance, coefs) + covariance_[idx, indices != idx] = coefs + covariance_[indices != idx, idx] = coefs + if not np.isfinite(precision_.sum()): + raise FloatingPointError( + "The system is too ill-conditioned for this solver" + ) + d_gap = _dual_gap(emp_cov, precision_, alpha) + cost = _objective(emp_cov, precision_, alpha) + if verbose: + print( + "[graphical_lasso] Iteration % 3i, cost % 3.2e, dual gap %.3e" + % (i, cost, d_gap) + ) + costs.append((cost, d_gap)) + if np.abs(d_gap) < tol: + break + if not np.isfinite(cost) and i > 0: + raise FloatingPointError( + "Non SPD result: the system is too ill-conditioned for this solver" + ) + else: + warnings.warn( + "graphical_lasso: did not converge after %i iteration: dual gap: %.3e" + % (max_iter, d_gap), + ConvergenceWarning, + ) + except FloatingPointError as e: + e.args = (e.args[0] + ". The system is too ill-conditioned for this solver",) + raise e + + return covariance_, precision_, costs, i + 1 + + +def alpha_max(emp_cov): + """Find the maximum alpha for which there are some non-zeros off-diagonal. + + Parameters + ---------- + emp_cov : ndarray of shape (n_features, n_features) + The sample covariance matrix. + + Notes + ----- + This results from the bound for the all the Lasso that are solved + in GraphicalLasso: each time, the row of cov corresponds to Xy. As the + bound for alpha is given by `max(abs(Xy))`, the result follows. 
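+
+    For instance, with ``emp_cov = [[2., .5], [.5, 1.]]`` the largest
+    off-diagonal magnitude is 0.5, so the returned value is 0.5.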
+ """ + A = np.copy(emp_cov) + A.flat[:: A.shape[0] + 1] = 0 + return np.max(np.abs(A)) + + +@validate_params( + { + "emp_cov": ["array-like"], + "return_costs": ["boolean"], + "return_n_iter": ["boolean"], + }, + prefer_skip_nested_validation=False, +) +def graphical_lasso( + emp_cov, + alpha, + *, + mode="cd", + tol=1e-4, + enet_tol=1e-4, + max_iter=100, + verbose=False, + return_costs=False, + eps=np.finfo(np.float64).eps, + return_n_iter=False, +): + """L1-penalized covariance estimator. + + Read more in the :ref:`User Guide `. + + .. versionchanged:: v0.20 + graph_lasso has been renamed to graphical_lasso + + Parameters + ---------- + emp_cov : array-like of shape (n_features, n_features) + Empirical covariance from which to compute the covariance estimate. + + alpha : float + The regularization parameter: the higher alpha, the more + regularization, the sparser the inverse covariance. + Range is (0, inf]. + + mode : {'cd', 'lars'}, default='cd' + The Lasso solver to use: coordinate descent or LARS. Use LARS for + very sparse underlying graphs, where p > n. Elsewhere prefer cd + which is more numerically stable. + + tol : float, default=1e-4 + The tolerance to declare convergence: if the dual gap goes below + this value, iterations are stopped. Range is (0, inf]. + + enet_tol : float, default=1e-4 + The tolerance for the elastic net solver used to calculate the descent + direction. This parameter controls the accuracy of the search direction + for a given column update, not of the overall parameter estimate. Only + used for mode='cd'. Range is (0, inf]. + + max_iter : int, default=100 + The maximum number of iterations. + + verbose : bool, default=False + If verbose is True, the objective function and dual gap are + printed at each iteration. + + return_costs : bool, default=False + If return_costs is True, the objective function and dual gap + at each iteration are returned. + + eps : float, default=eps + The machine-precision regularization in the computation of the + Cholesky diagonal factors. Increase this for very ill-conditioned + systems. Default is `np.finfo(np.float64).eps`. + + return_n_iter : bool, default=False + Whether or not to return the number of iterations. + + Returns + ------- + covariance : ndarray of shape (n_features, n_features) + The estimated covariance matrix. + + precision : ndarray of shape (n_features, n_features) + The estimated (sparse) precision matrix. + + costs : list of (objective, dual_gap) pairs + The list of values of the objective function and the dual gap at + each iteration. Returned only if return_costs is True. + + n_iter : int + Number of iterations. Returned only if `return_n_iter` is set to True. + + See Also + -------- + GraphicalLasso : Sparse inverse covariance estimation + with an l1-penalized estimator. + GraphicalLassoCV : Sparse inverse covariance with + cross-validated choice of the l1 penalty. + + Notes + ----- + The algorithm employed to solve this problem is the GLasso algorithm, + from the Friedman 2008 Biostatistics paper. It is the same algorithm + as in the R `glasso` package. + + One possible difference with the `glasso` R package is that the + diagonal coefficients are not penalized. 
+ + Examples + -------- + >>> import numpy as np + >>> from sklearn.datasets import make_sparse_spd_matrix + >>> from sklearn.covariance import empirical_covariance, graphical_lasso + >>> true_cov = make_sparse_spd_matrix(n_dim=3,random_state=42) + >>> rng = np.random.RandomState(42) + >>> X = rng.multivariate_normal(mean=np.zeros(3), cov=true_cov, size=3) + >>> emp_cov = empirical_covariance(X, assume_centered=True) + >>> emp_cov, _ = graphical_lasso(emp_cov, alpha=0.05) + >>> emp_cov + array([[ 1.687, 0.212, -0.209], + [ 0.212, 0.221, -0.0817], + [-0.209, -0.0817, 0.232]]) + """ + model = GraphicalLasso( + alpha=alpha, + mode=mode, + covariance="precomputed", + tol=tol, + enet_tol=enet_tol, + max_iter=max_iter, + verbose=verbose, + eps=eps, + assume_centered=True, + ).fit(emp_cov) + + output = [model.covariance_, model.precision_] + if return_costs: + output.append(model.costs_) + if return_n_iter: + output.append(model.n_iter_) + return tuple(output) + + +class BaseGraphicalLasso(EmpiricalCovariance): + _parameter_constraints: dict = { + **EmpiricalCovariance._parameter_constraints, + "tol": [Interval(Real, 0, None, closed="right")], + "enet_tol": [Interval(Real, 0, None, closed="right")], + "max_iter": [Interval(Integral, 0, None, closed="left")], + "mode": [StrOptions({"cd", "lars"})], + "verbose": ["verbose"], + "eps": [Interval(Real, 0, None, closed="both")], + } + _parameter_constraints.pop("store_precision") + + def __init__( + self, + tol=1e-4, + enet_tol=1e-4, + max_iter=100, + mode="cd", + verbose=False, + eps=np.finfo(np.float64).eps, + assume_centered=False, + ): + super().__init__(assume_centered=assume_centered) + self.tol = tol + self.enet_tol = enet_tol + self.max_iter = max_iter + self.mode = mode + self.verbose = verbose + self.eps = eps + + +class GraphicalLasso(BaseGraphicalLasso): + """Sparse inverse covariance estimation with an l1-penalized estimator. + + For a usage example see + :ref:`sphx_glr_auto_examples_applications_plot_stock_market.py`. + + Read more in the :ref:`User Guide `. + + .. versionchanged:: v0.20 + GraphLasso has been renamed to GraphicalLasso + + Parameters + ---------- + alpha : float, default=0.01 + The regularization parameter: the higher alpha, the more + regularization, the sparser the inverse covariance. + Range is (0, inf]. + + mode : {'cd', 'lars'}, default='cd' + The Lasso solver to use: coordinate descent or LARS. Use LARS for + very sparse underlying graphs, where p > n. Elsewhere prefer cd + which is more numerically stable. + + covariance : "precomputed", default=None + If covariance is "precomputed", the input data in `fit` is assumed + to be the covariance matrix. If `None`, the empirical covariance + is estimated from the data `X`. + + .. versionadded:: 1.3 + + tol : float, default=1e-4 + The tolerance to declare convergence: if the dual gap goes below + this value, iterations are stopped. Range is (0, inf]. + + enet_tol : float, default=1e-4 + The tolerance for the elastic net solver used to calculate the descent + direction. This parameter controls the accuracy of the search direction + for a given column update, not of the overall parameter estimate. Only + used for mode='cd'. Range is (0, inf]. + + max_iter : int, default=100 + The maximum number of iterations. + + verbose : bool, default=False + If verbose is True, the objective function and dual gap are + plotted at each iteration. + + eps : float, default=eps + The machine-precision regularization in the computation of the + Cholesky diagonal factors. 
Increase this for very ill-conditioned + systems. Default is `np.finfo(np.float64).eps`. + + .. versionadded:: 1.3 + + assume_centered : bool, default=False + If True, data are not centered before computation. + Useful when working with data whose mean is almost, but not exactly + zero. + If False, data are centered before computation. + + Attributes + ---------- + location_ : ndarray of shape (n_features,) + Estimated location, i.e. the estimated mean. + + covariance_ : ndarray of shape (n_features, n_features) + Estimated covariance matrix + + precision_ : ndarray of shape (n_features, n_features) + Estimated pseudo inverse matrix. + + n_iter_ : int + Number of iterations run. + + costs_ : list of (objective, dual_gap) pairs + The list of values of the objective function and the dual gap at + each iteration. Returned only if return_costs is True. + + .. versionadded:: 1.3 + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + graphical_lasso : L1-penalized covariance estimator. + GraphicalLassoCV : Sparse inverse covariance with + cross-validated choice of the l1 penalty. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.covariance import GraphicalLasso + >>> true_cov = np.array([[0.8, 0.0, 0.2, 0.0], + ... [0.0, 0.4, 0.0, 0.0], + ... [0.2, 0.0, 0.3, 0.1], + ... [0.0, 0.0, 0.1, 0.7]]) + >>> np.random.seed(0) + >>> X = np.random.multivariate_normal(mean=[0, 0, 0, 0], + ... cov=true_cov, + ... size=200) + >>> cov = GraphicalLasso().fit(X) + >>> np.around(cov.covariance_, decimals=3) + array([[0.816, 0.049, 0.218, 0.019], + [0.049, 0.364, 0.017, 0.034], + [0.218, 0.017, 0.322, 0.093], + [0.019, 0.034, 0.093, 0.69 ]]) + >>> np.around(cov.location_, decimals=3) + array([0.073, 0.04 , 0.038, 0.143]) + """ + + _parameter_constraints: dict = { + **BaseGraphicalLasso._parameter_constraints, + "alpha": [Interval(Real, 0, None, closed="both")], + "covariance": [StrOptions({"precomputed"}), None], + } + + def __init__( + self, + alpha=0.01, + *, + mode="cd", + covariance=None, + tol=1e-4, + enet_tol=1e-4, + max_iter=100, + verbose=False, + eps=np.finfo(np.float64).eps, + assume_centered=False, + ): + super().__init__( + tol=tol, + enet_tol=enet_tol, + max_iter=max_iter, + mode=mode, + verbose=verbose, + eps=eps, + assume_centered=assume_centered, + ) + self.alpha = alpha + self.covariance = covariance + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit the GraphicalLasso model to X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Data from which to compute the covariance estimate. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Returns the instance itself. 
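+
+        Notes
+        -----
+        When ``covariance="precomputed"``, ``X`` is taken to be an already
+        computed covariance matrix and is used as-is (and ``location_`` is set
+        to zeros); otherwise the empirical covariance of ``X`` is computed
+        first and the penalized estimate is derived from it.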
+ """ + # Covariance does not make sense for a single feature + X = validate_data(self, X, ensure_min_features=2, ensure_min_samples=2) + + if self.covariance == "precomputed": + emp_cov = X.copy() + self.location_ = np.zeros(X.shape[1]) + else: + emp_cov = empirical_covariance(X, assume_centered=self.assume_centered) + if self.assume_centered: + self.location_ = np.zeros(X.shape[1]) + else: + self.location_ = X.mean(0) + + self.covariance_, self.precision_, self.costs_, self.n_iter_ = _graphical_lasso( + emp_cov, + alpha=self.alpha, + cov_init=None, + mode=self.mode, + tol=self.tol, + enet_tol=self.enet_tol, + max_iter=self.max_iter, + verbose=self.verbose, + eps=self.eps, + ) + return self + + +# Cross-validation with GraphicalLasso +def graphical_lasso_path( + X, + alphas, + cov_init=None, + X_test=None, + mode="cd", + tol=1e-4, + enet_tol=1e-4, + max_iter=100, + verbose=False, + eps=np.finfo(np.float64).eps, +): + """l1-penalized covariance estimator along a path of decreasing alphas + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Data from which to compute the covariance estimate. + + alphas : array-like of shape (n_alphas,) + The list of regularization parameters, decreasing order. + + cov_init : array of shape (n_features, n_features), default=None + The initial guess for the covariance. + + X_test : array of shape (n_test_samples, n_features), default=None + Optional test matrix to measure generalisation error. + + mode : {'cd', 'lars'}, default='cd' + The Lasso solver to use: coordinate descent or LARS. Use LARS for + very sparse underlying graphs, where p > n. Elsewhere prefer cd + which is more numerically stable. + + tol : float, default=1e-4 + The tolerance to declare convergence: if the dual gap goes below + this value, iterations are stopped. The tolerance must be a positive + number. + + enet_tol : float, default=1e-4 + The tolerance for the elastic net solver used to calculate the descent + direction. This parameter controls the accuracy of the search direction + for a given column update, not of the overall parameter estimate. Only + used for mode='cd'. The tolerance must be a positive number. + + max_iter : int, default=100 + The maximum number of iterations. This parameter should be a strictly + positive integer. + + verbose : int or bool, default=False + The higher the verbosity flag, the more information is printed + during the fitting. + + eps : float, default=eps + The machine-precision regularization in the computation of the + Cholesky diagonal factors. Increase this for very ill-conditioned + systems. Default is `np.finfo(np.float64).eps`. + + .. versionadded:: 1.3 + + Returns + ------- + covariances_ : list of shape (n_alphas,) of ndarray of shape \ + (n_features, n_features) + The estimated covariance matrices. + + precisions_ : list of shape (n_alphas,) of ndarray of shape \ + (n_features, n_features) + The estimated (sparse) precision matrices. + + scores_ : list of shape (n_alphas,), dtype=float + The generalisation error (log-likelihood) on the test data. + Returned only if test data is passed. 
+ """ + inner_verbose = max(0, verbose - 1) + emp_cov = empirical_covariance(X) + if cov_init is None: + covariance_ = emp_cov.copy() + else: + covariance_ = cov_init + covariances_ = list() + precisions_ = list() + scores_ = list() + if X_test is not None: + test_emp_cov = empirical_covariance(X_test) + + for alpha in alphas: + try: + # Capture the errors, and move on + covariance_, precision_, _, _ = _graphical_lasso( + emp_cov, + alpha=alpha, + cov_init=covariance_, + mode=mode, + tol=tol, + enet_tol=enet_tol, + max_iter=max_iter, + verbose=inner_verbose, + eps=eps, + ) + covariances_.append(covariance_) + precisions_.append(precision_) + if X_test is not None: + this_score = log_likelihood(test_emp_cov, precision_) + except FloatingPointError: + this_score = -np.inf + covariances_.append(np.nan) + precisions_.append(np.nan) + if X_test is not None: + if not np.isfinite(this_score): + this_score = -np.inf + scores_.append(this_score) + if verbose == 1: + sys.stderr.write(".") + elif verbose > 1: + if X_test is not None: + print( + "[graphical_lasso_path] alpha: %.2e, score: %.2e" + % (alpha, this_score) + ) + else: + print("[graphical_lasso_path] alpha: %.2e" % alpha) + if X_test is not None: + return covariances_, precisions_, scores_ + return covariances_, precisions_ + + +class GraphicalLassoCV(BaseGraphicalLasso): + """Sparse inverse covariance w/ cross-validated choice of the l1 penalty. + + See glossary entry for :term:`cross-validation estimator`. + + Read more in the :ref:`User Guide `. + + .. versionchanged:: v0.20 + GraphLassoCV has been renamed to GraphicalLassoCV + + Parameters + ---------- + alphas : int or array-like of shape (n_alphas,), dtype=float, default=4 + If an integer is given, it fixes the number of points on the + grids of alpha to be used. If a list is given, it gives the + grid to be used. See the notes in the class docstring for + more details. Range is [1, inf) for an integer. + Range is (0, inf] for an array-like of floats. + + n_refinements : int, default=4 + The number of times the grid is refined. Not used if explicit + values of alphas are passed. Range is [1, inf). + + cv : int, cross-validation generator or iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross-validation, + - integer, to specify the number of folds. + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs :class:`~sklearn.model_selection.KFold` is used. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.20 + ``cv`` default value if None changed from 3-fold to 5-fold. + + tol : float, default=1e-4 + The tolerance to declare convergence: if the dual gap goes below + this value, iterations are stopped. Range is (0, inf]. + + enet_tol : float, default=1e-4 + The tolerance for the elastic net solver used to calculate the descent + direction. This parameter controls the accuracy of the search direction + for a given column update, not of the overall parameter estimate. Only + used for mode='cd'. Range is (0, inf]. + + max_iter : int, default=100 + Maximum number of iterations. + + mode : {'cd', 'lars'}, default='cd' + The Lasso solver to use: coordinate descent or LARS. Use LARS for + very sparse underlying graphs, where number of features is greater + than number of samples. Elsewhere prefer cd which is more numerically + stable. 
+ + n_jobs : int, default=None + Number of jobs to run in parallel. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + .. versionchanged:: v0.20 + `n_jobs` default changed from 1 to None + + verbose : bool, default=False + If verbose is True, the objective function and duality gap are + printed at each iteration. + + eps : float, default=eps + The machine-precision regularization in the computation of the + Cholesky diagonal factors. Increase this for very ill-conditioned + systems. Default is `np.finfo(np.float64).eps`. + + .. versionadded:: 1.3 + + assume_centered : bool, default=False + If True, data are not centered before computation. + Useful when working with data whose mean is almost, but not exactly + zero. + If False, data are centered before computation. + + Attributes + ---------- + location_ : ndarray of shape (n_features,) + Estimated location, i.e. the estimated mean. + + covariance_ : ndarray of shape (n_features, n_features) + Estimated covariance matrix. + + precision_ : ndarray of shape (n_features, n_features) + Estimated precision matrix (inverse covariance). + + costs_ : list of (objective, dual_gap) pairs + The list of values of the objective function and the dual gap at + each iteration. Returned only if return_costs is True. + + .. versionadded:: 1.3 + + alpha_ : float + Penalization parameter selected. + + cv_results_ : dict of ndarrays + A dict with keys: + + alphas : ndarray of shape (n_alphas,) + All penalization parameters explored. + + split(k)_test_score : ndarray of shape (n_alphas,) + Log-likelihood score on left-out data across (k)th fold. + + .. versionadded:: 1.0 + + mean_test_score : ndarray of shape (n_alphas,) + Mean of scores over the folds. + + .. versionadded:: 1.0 + + std_test_score : ndarray of shape (n_alphas,) + Standard deviation of scores over the folds. + + .. versionadded:: 1.0 + + n_iter_ : int + Number of iterations run for the optimal alpha. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + graphical_lasso : L1-penalized covariance estimator. + GraphicalLasso : Sparse inverse covariance estimation + with an l1-penalized estimator. + + Notes + ----- + The search for the optimal penalization parameter (`alpha`) is done on an + iteratively refined grid: first the cross-validated scores on a grid are + computed, then a new refined grid is centered around the maximum, and so + on. + + One of the challenges which is faced here is that the solvers can + fail to converge to a well-conditioned estimate. The corresponding + values of `alpha` then come out as missing values, but the optimum may + be close to these missing values. + + In `fit`, once the best parameter `alpha` is found through + cross-validation, the model is fit again using the entire training set. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.covariance import GraphicalLassoCV + >>> true_cov = np.array([[0.8, 0.0, 0.2, 0.0], + ... [0.0, 0.4, 0.0, 0.0], + ... [0.2, 0.0, 0.3, 0.1], + ... [0.0, 0.0, 0.1, 0.7]]) + >>> np.random.seed(0) + >>> X = np.random.multivariate_normal(mean=[0, 0, 0, 0], + ... cov=true_cov, + ... 
size=200) + >>> cov = GraphicalLassoCV().fit(X) + >>> np.around(cov.covariance_, decimals=3) + array([[0.816, 0.051, 0.22 , 0.017], + [0.051, 0.364, 0.018, 0.036], + [0.22 , 0.018, 0.322, 0.094], + [0.017, 0.036, 0.094, 0.69 ]]) + >>> np.around(cov.location_, decimals=3) + array([0.073, 0.04 , 0.038, 0.143]) + + For an example comparing :class:`sklearn.covariance.GraphicalLassoCV`, + :func:`sklearn.covariance.ledoit_wolf` shrinkage and the empirical covariance + on high-dimensional gaussian data, see + :ref:`sphx_glr_auto_examples_covariance_plot_sparse_cov.py`. + """ + + _parameter_constraints: dict = { + **BaseGraphicalLasso._parameter_constraints, + "alphas": [Interval(Integral, 0, None, closed="left"), "array-like"], + "n_refinements": [Interval(Integral, 1, None, closed="left")], + "cv": ["cv_object"], + "n_jobs": [Integral, None], + } + + def __init__( + self, + *, + alphas=4, + n_refinements=4, + cv=None, + tol=1e-4, + enet_tol=1e-4, + max_iter=100, + mode="cd", + n_jobs=None, + verbose=False, + eps=np.finfo(np.float64).eps, + assume_centered=False, + ): + super().__init__( + tol=tol, + enet_tol=enet_tol, + max_iter=max_iter, + mode=mode, + verbose=verbose, + eps=eps, + assume_centered=assume_centered, + ) + self.alphas = alphas + self.n_refinements = n_refinements + self.cv = cv + self.n_jobs = n_jobs + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None, **params): + """Fit the GraphicalLasso covariance model to X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Data from which to compute the covariance estimate. + + y : Ignored + Not used, present for API consistency by convention. + + **params : dict, default=None + Parameters to be passed to the CV splitter and the + cross_val_score function. + + .. versionadded:: 1.5 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + Returns the instance itself. 
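+
+        Notes
+        -----
+        When ``alphas`` is an integer, the candidate penalties are generated
+        on a log scale between ``0.01 * alpha_max`` and ``alpha_max`` (the
+        largest penalty for which the off-diagonal entries are not all zero),
+        and the grid is re-centered around the best cross-validated alpha for
+        ``n_refinements`` passes in total. Passing an explicit array of alphas
+        disables this refinement.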
+ """ + # Covariance does not make sense for a single feature + _raise_for_params(params, self, "fit") + + X = validate_data(self, X, ensure_min_features=2) + if self.assume_centered: + self.location_ = np.zeros(X.shape[1]) + else: + self.location_ = X.mean(0) + emp_cov = empirical_covariance(X, assume_centered=self.assume_centered) + + cv = check_cv(self.cv, y, classifier=False) + + # List of (alpha, scores, covs) + path = list() + n_alphas = self.alphas + inner_verbose = max(0, self.verbose - 1) + + if _is_arraylike_not_scalar(n_alphas): + for alpha in self.alphas: + check_scalar( + alpha, + "alpha", + Real, + min_val=0, + max_val=np.inf, + include_boundaries="right", + ) + alphas = self.alphas + n_refinements = 1 + else: + n_refinements = self.n_refinements + alpha_1 = alpha_max(emp_cov) + alpha_0 = 1e-2 * alpha_1 + alphas = np.logspace(np.log10(alpha_0), np.log10(alpha_1), n_alphas)[::-1] + + if _routing_enabled(): + routed_params = process_routing(self, "fit", **params) + else: + routed_params = Bunch(splitter=Bunch(split={})) + + t0 = time.time() + for i in range(n_refinements): + with warnings.catch_warnings(): + # No need to see the convergence warnings on this grid: + # they will always be points that will not converge + # during the cross-validation + warnings.simplefilter("ignore", ConvergenceWarning) + # Compute the cross-validated loss on the current grid + + # NOTE: Warm-restarting graphical_lasso_path has been tried, + # and this did not allow to gain anything + # (same execution time with or without). + this_path = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)( + delayed(graphical_lasso_path)( + X[train], + alphas=alphas, + X_test=X[test], + mode=self.mode, + tol=self.tol, + enet_tol=self.enet_tol, + max_iter=int(0.1 * self.max_iter), + verbose=inner_verbose, + eps=self.eps, + ) + for train, test in cv.split(X, y, **routed_params.splitter.split) + ) + + # Little danse to transform the list in what we need + covs, _, scores = zip(*this_path) + covs = zip(*covs) + scores = zip(*scores) + path.extend(zip(alphas, scores, covs)) + path = sorted(path, key=operator.itemgetter(0), reverse=True) + + # Find the maximum (avoid using built in 'max' function to + # have a fully-reproducible selection of the smallest alpha + # in case of equality) + best_score = -np.inf + last_finite_idx = 0 + for index, (alpha, scores, _) in enumerate(path): + this_score = np.mean(scores) + if this_score >= 0.1 / np.finfo(np.float64).eps: + this_score = np.nan + if np.isfinite(this_score): + last_finite_idx = index + if this_score >= best_score: + best_score = this_score + best_index = index + + # Refine the grid + if best_index == 0: + # We do not need to go back: we have chosen + # the highest value of alpha for which there are + # non-zero coefficients + alpha_1 = path[0][0] + alpha_0 = path[1][0] + elif best_index == last_finite_idx and not best_index == len(path) - 1: + # We have non-converged models on the upper bound of the + # grid, we need to refine the grid there + alpha_1 = path[best_index][0] + alpha_0 = path[best_index + 1][0] + elif best_index == len(path) - 1: + alpha_1 = path[best_index][0] + alpha_0 = 0.01 * path[best_index][0] + else: + alpha_1 = path[best_index - 1][0] + alpha_0 = path[best_index + 1][0] + + if not _is_arraylike_not_scalar(n_alphas): + alphas = np.logspace(np.log10(alpha_1), np.log10(alpha_0), n_alphas + 2) + alphas = alphas[1:-1] + + if self.verbose and n_refinements > 1: + print( + "[GraphicalLassoCV] Done refinement % 2i out of %i: % 3is" + % (i + 1, 
n_refinements, time.time() - t0) + ) + + path = list(zip(*path)) + grid_scores = list(path[1]) + alphas = list(path[0]) + # Finally, compute the score with alpha = 0 + alphas.append(0) + grid_scores.append( + cross_val_score( + EmpiricalCovariance(), + X, + cv=cv, + n_jobs=self.n_jobs, + verbose=inner_verbose, + params=params, + ) + ) + grid_scores = np.array(grid_scores) + + self.cv_results_ = {"alphas": np.array(alphas)} + + for i in range(grid_scores.shape[1]): + self.cv_results_[f"split{i}_test_score"] = grid_scores[:, i] + + self.cv_results_["mean_test_score"] = np.mean(grid_scores, axis=1) + self.cv_results_["std_test_score"] = np.std(grid_scores, axis=1) + + best_alpha = alphas[best_index] + self.alpha_ = best_alpha + + # Finally fit the model with the selected alpha + self.covariance_, self.precision_, self.costs_, self.n_iter_ = _graphical_lasso( + emp_cov, + alpha=best_alpha, + mode=self.mode, + tol=self.tol, + enet_tol=self.enet_tol, + max_iter=self.max_iter, + verbose=inner_verbose, + eps=self.eps, + ) + return self + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.5 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. + """ + router = MetadataRouter(owner=self.__class__.__name__).add( + splitter=check_cv(self.cv), + method_mapping=MethodMapping().add(callee="split", caller="fit"), + ) + return router diff --git a/.venv/lib/python3.12/site-packages/sklearn/covariance/_robust_covariance.py b/.venv/lib/python3.12/site-packages/sklearn/covariance/_robust_covariance.py new file mode 100644 index 0000000000000000000000000000000000000000..81fc194c6e410da364db9eba432e7201e6ab44cb --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/covariance/_robust_covariance.py @@ -0,0 +1,874 @@ +""" +Robust location and covariance estimators. + +Here are implemented estimators that are resistant to outliers. + +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from numbers import Integral, Real + +import numpy as np +from scipy import linalg +from scipy.stats import chi2 + +from ..base import _fit_context +from ..utils import check_array, check_random_state +from ..utils._param_validation import Interval +from ..utils.extmath import fast_logdet +from ..utils.validation import validate_data +from ._empirical_covariance import EmpiricalCovariance, empirical_covariance + + +# Minimum Covariance Determinant +# Implementing of an algorithm by Rousseeuw & Van Driessen described in +# (A Fast Algorithm for the Minimum Covariance Determinant Estimator, +# 1999, American Statistical Association and the American Society +# for Quality, TECHNOMETRICS) +# XXX Is this really a public function? It's not listed in the docs or +# exported by sklearn.covariance. Deprecate? +def c_step( + X, + n_support, + remaining_iterations=30, + initial_estimates=None, + verbose=False, + cov_computation_method=empirical_covariance, + random_state=None, +): + """C_step procedure described in [Rouseeuw1984]_ aiming at computing MCD. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Data set in which we look for the n_support observations whose + scatter matrix has minimum determinant. + + n_support : int + Number of observations to compute the robust estimates of location + and covariance from. 
This parameter must be greater than + `n_samples / 2`. + + remaining_iterations : int, default=30 + Number of iterations to perform. + According to [Rouseeuw1999]_, two iterations are sufficient to get + close to the minimum, and we never need more than 30 to reach + convergence. + + initial_estimates : tuple of shape (2,), default=None + Initial estimates of location and shape from which to run the c_step + procedure: + - initial_estimates[0]: an initial location estimate + - initial_estimates[1]: an initial covariance estimate + + verbose : bool, default=False + Verbose mode. + + cov_computation_method : callable, \ + default=:func:`sklearn.covariance.empirical_covariance` + The function which will be used to compute the covariance. + Must return array of shape (n_features, n_features). + + random_state : int, RandomState instance or None, default=None + Determines the pseudo random number generator for shuffling the data. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + location : ndarray of shape (n_features,) + Robust location estimates. + + covariance : ndarray of shape (n_features, n_features) + Robust covariance estimates. + + support : ndarray of shape (n_samples,) + A mask for the `n_support` observations whose scatter matrix has + minimum determinant. + + References + ---------- + .. [Rouseeuw1999] A Fast Algorithm for the Minimum Covariance Determinant + Estimator, 1999, American Statistical Association and the American + Society for Quality, TECHNOMETRICS + """ + X = np.asarray(X) + random_state = check_random_state(random_state) + return _c_step( + X, + n_support, + remaining_iterations=remaining_iterations, + initial_estimates=initial_estimates, + verbose=verbose, + cov_computation_method=cov_computation_method, + random_state=random_state, + ) + + +def _c_step( + X, + n_support, + random_state, + remaining_iterations=30, + initial_estimates=None, + verbose=False, + cov_computation_method=empirical_covariance, +): + n_samples, n_features = X.shape + dist = np.inf + + # Initialisation + if initial_estimates is None: + # compute initial robust estimates from a random subset + support_indices = random_state.permutation(n_samples)[:n_support] + else: + # get initial robust estimates from the function parameters + location = initial_estimates[0] + covariance = initial_estimates[1] + # run a special iteration for that case (to get an initial support_indices) + precision = linalg.pinvh(covariance) + X_centered = X - location + dist = (np.dot(X_centered, precision) * X_centered).sum(1) + # compute new estimates + support_indices = np.argpartition(dist, n_support - 1)[:n_support] + + X_support = X[support_indices] + location = X_support.mean(0) + covariance = cov_computation_method(X_support) + + # Iterative procedure for Minimum Covariance Determinant computation + det = fast_logdet(covariance) + # If the data already has singular covariance, calculate the precision, + # as the loop below will not be entered. 
+ if np.isinf(det): + precision = linalg.pinvh(covariance) + + previous_det = np.inf + while det < previous_det and remaining_iterations > 0 and not np.isinf(det): + # save old estimates values + previous_location = location + previous_covariance = covariance + previous_det = det + previous_support_indices = support_indices + # compute a new support_indices from the full data set mahalanobis distances + precision = linalg.pinvh(covariance) + X_centered = X - location + dist = (np.dot(X_centered, precision) * X_centered).sum(axis=1) + # compute new estimates + support_indices = np.argpartition(dist, n_support - 1)[:n_support] + X_support = X[support_indices] + location = X_support.mean(axis=0) + covariance = cov_computation_method(X_support) + det = fast_logdet(covariance) + # update remaining iterations for early stopping + remaining_iterations -= 1 + + previous_dist = dist + dist = (np.dot(X - location, precision) * (X - location)).sum(axis=1) + # Check if best fit already found (det => 0, logdet => -inf) + if np.isinf(det): + results = location, covariance, det, support_indices, dist + # Check convergence + if np.allclose(det, previous_det): + # c_step procedure converged + if verbose: + print( + "Optimal couple (location, covariance) found before" + " ending iterations (%d left)" % (remaining_iterations) + ) + results = location, covariance, det, support_indices, dist + elif det > previous_det: + # determinant has increased (should not happen) + warnings.warn( + "Determinant has increased; this should not happen: " + "log(det) > log(previous_det) (%.15f > %.15f). " + "You may want to try with a higher value of " + "support_fraction (current value: %.3f)." + % (det, previous_det, n_support / n_samples), + RuntimeWarning, + ) + results = ( + previous_location, + previous_covariance, + previous_det, + previous_support_indices, + previous_dist, + ) + + # Check early stopping + if remaining_iterations == 0: + if verbose: + print("Maximum number of iterations reached") + results = location, covariance, det, support_indices, dist + + location, covariance, det, support_indices, dist = results + # Convert from list of indices to boolean mask. + support = np.bincount(support_indices, minlength=n_samples).astype(bool) + return location, covariance, det, support, dist + + +def select_candidates( + X, + n_support, + n_trials, + select=1, + n_iter=30, + verbose=False, + cov_computation_method=empirical_covariance, + random_state=None, +): + """Finds the best pure subset of observations to compute MCD from it. + + The purpose of this function is to find the best sets of n_support + observations with respect to a minimization of their covariance + matrix determinant. Equivalently, it removes n_samples-n_support + observations to construct what we call a pure data set (i.e. not + containing outliers). The list of the observations of the pure + data set is referred to as the `support`. + + Starting from a random support, the pure data set is found by the + c_step procedure introduced by Rousseeuw and Van Driessen in + [RV]_. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Data (sub)set in which we look for the n_support purest observations. + + n_support : int + The number of samples the pure data set must contain. + This parameter must be in the range `[(n + p + 1)/2] < n_support < n`. + + n_trials : int or tuple of shape (2,) + Number of different initial sets of observations from which to + run the algorithm. This parameter should be a strictly positive + integer. 
+ Instead of giving a number of trials to perform, one can provide a + list of initial estimates that will be used to iteratively run + c_step procedures. In this case: + - n_trials[0]: array-like, shape (n_trials, n_features) + is the list of `n_trials` initial location estimates + - n_trials[1]: array-like, shape (n_trials, n_features, n_features) + is the list of `n_trials` initial covariances estimates + + select : int, default=1 + Number of best candidates results to return. This parameter must be + a strictly positive integer. + + n_iter : int, default=30 + Maximum number of iterations for the c_step procedure. + (2 is enough to be close to the final solution. "Never" exceeds 20). + This parameter must be a strictly positive integer. + + verbose : bool, default=False + Control the output verbosity. + + cov_computation_method : callable, \ + default=:func:`sklearn.covariance.empirical_covariance` + The function which will be used to compute the covariance. + Must return an array of shape (n_features, n_features). + + random_state : int, RandomState instance or None, default=None + Determines the pseudo random number generator for shuffling the data. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + See Also + --------- + c_step + + Returns + ------- + best_locations : ndarray of shape (select, n_features) + The `select` location estimates computed from the `select` best + supports found in the data set (`X`). + + best_covariances : ndarray of shape (select, n_features, n_features) + The `select` covariance estimates computed from the `select` + best supports found in the data set (`X`). + + best_supports : ndarray of shape (select, n_samples) + The `select` best supports found in the data set (`X`). + + References + ---------- + .. 
[RV] A Fast Algorithm for the Minimum Covariance Determinant + Estimator, 1999, American Statistical Association and the American + Society for Quality, TECHNOMETRICS + """ + random_state = check_random_state(random_state) + + if isinstance(n_trials, Integral): + run_from_estimates = False + elif isinstance(n_trials, tuple): + run_from_estimates = True + estimates_list = n_trials + n_trials = estimates_list[0].shape[0] + else: + raise TypeError( + "Invalid 'n_trials' parameter, expected tuple or integer, got %s (%s)" + % (n_trials, type(n_trials)) + ) + + # compute `n_trials` location and shape estimates candidates in the subset + all_estimates = [] + if not run_from_estimates: + # perform `n_trials` computations from random initial supports + for j in range(n_trials): + all_estimates.append( + _c_step( + X, + n_support, + remaining_iterations=n_iter, + verbose=verbose, + cov_computation_method=cov_computation_method, + random_state=random_state, + ) + ) + else: + # perform computations from every given initial estimates + for j in range(n_trials): + initial_estimates = (estimates_list[0][j], estimates_list[1][j]) + all_estimates.append( + _c_step( + X, + n_support, + remaining_iterations=n_iter, + initial_estimates=initial_estimates, + verbose=verbose, + cov_computation_method=cov_computation_method, + random_state=random_state, + ) + ) + all_locs_sub, all_covs_sub, all_dets_sub, all_supports_sub, all_ds_sub = zip( + *all_estimates + ) + # find the `n_best` best results among the `n_trials` ones + index_best = np.argsort(all_dets_sub)[:select] + best_locations = np.asarray(all_locs_sub)[index_best] + best_covariances = np.asarray(all_covs_sub)[index_best] + best_supports = np.asarray(all_supports_sub)[index_best] + best_ds = np.asarray(all_ds_sub)[index_best] + + return best_locations, best_covariances, best_supports, best_ds + + +def fast_mcd( + X, + support_fraction=None, + cov_computation_method=empirical_covariance, + random_state=None, +): + """Estimate the Minimum Covariance Determinant matrix. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data matrix, with p features and n samples. + + support_fraction : float, default=None + The proportion of points to be included in the support of the raw + MCD estimate. Default is `None`, which implies that the minimum + value of `support_fraction` will be used within the algorithm: + `(n_samples + n_features + 1) / 2 * n_samples`. This parameter must be + in the range (0, 1). + + cov_computation_method : callable, \ + default=:func:`sklearn.covariance.empirical_covariance` + The function which will be used to compute the covariance. + Must return an array of shape (n_features, n_features). + + random_state : int, RandomState instance or None, default=None + Determines the pseudo random number generator for shuffling the data. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + location : ndarray of shape (n_features,) + Robust location of the data. + + covariance : ndarray of shape (n_features, n_features) + Robust covariance of the features. + + support : ndarray of shape (n_samples,), dtype=bool + A mask of the observations that have been used to compute + the robust location and covariance estimates of the data set. 
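A short sketch of calling `fast_mcd` directly on synthetic, contaminated data (illustrative only, not part of the patched file); as the Notes below point out, only the raw estimates are returned, without the correction and reweighting steps applied by `MinCovDet`.

import numpy as np
from sklearn.covariance import fast_mcd

rng = np.random.RandomState(42)
X = rng.randn(100, 2)
X[:10] += 8  # plant a few gross outliers

location, covariance, support, dist = fast_mcd(X, random_state=0)
print(int(support.sum()), "observations kept in the raw support")
print(location)  # raw robust location, barely affected by the shifted points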
+ + Notes + ----- + The FastMCD algorithm has been introduced by Rousseuw and Van Driessen + in "A Fast Algorithm for the Minimum Covariance Determinant Estimator, + 1999, American Statistical Association and the American Society + for Quality, TECHNOMETRICS". + The principle is to compute robust estimates and random subsets before + pooling them into a larger subsets, and finally into the full data set. + Depending on the size of the initial sample, we have one, two or three + such computation levels. + + Note that only raw estimates are returned. If one is interested in + the correction and reweighting steps described in [RouseeuwVan]_, + see the MinCovDet object. + + References + ---------- + + .. [RouseeuwVan] A Fast Algorithm for the Minimum Covariance + Determinant Estimator, 1999, American Statistical Association + and the American Society for Quality, TECHNOMETRICS + + .. [Butler1993] R. W. Butler, P. L. Davies and M. Jhun, + Asymptotics For The Minimum Covariance Determinant Estimator, + The Annals of Statistics, 1993, Vol. 21, No. 3, 1385-1400 + """ + random_state = check_random_state(random_state) + + X = check_array(X, ensure_min_samples=2, estimator="fast_mcd") + n_samples, n_features = X.shape + + # minimum breakdown value + if support_fraction is None: + n_support = min(int(np.ceil(0.5 * (n_samples + n_features + 1))), n_samples) + else: + n_support = int(support_fraction * n_samples) + + # 1-dimensional case quick computation + # (Rousseeuw, P. J. and Leroy, A. M. (2005) References, in Robust + # Regression and Outlier Detection, John Wiley & Sons, chapter 4) + if n_features == 1: + if n_support < n_samples: + # find the sample shortest halves + X_sorted = np.sort(np.ravel(X)) + diff = X_sorted[n_support:] - X_sorted[: (n_samples - n_support)] + halves_start = np.where(diff == np.min(diff))[0] + # take the middle points' mean to get the robust location estimate + location = ( + 0.5 + * (X_sorted[n_support + halves_start] + X_sorted[halves_start]).mean() + ) + support = np.zeros(n_samples, dtype=bool) + X_centered = X - location + support[np.argsort(np.abs(X_centered), 0)[:n_support]] = True + covariance = np.asarray([[np.var(X[support])]]) + location = np.array([location]) + # get precision matrix in an optimized way + precision = linalg.pinvh(covariance) + dist = (np.dot(X_centered, precision) * (X_centered)).sum(axis=1) + else: + support = np.ones(n_samples, dtype=bool) + covariance = np.asarray([[np.var(X)]]) + location = np.asarray([np.mean(X)]) + X_centered = X - location + # get precision matrix in an optimized way + precision = linalg.pinvh(covariance) + dist = (np.dot(X_centered, precision) * (X_centered)).sum(axis=1) + # Starting FastMCD algorithm for p-dimensional case + if (n_samples > 500) and (n_features > 1): + # 1. Find candidate supports on subsets + # a. split the set in subsets of size ~ 300 + n_subsets = n_samples // 300 + n_samples_subsets = n_samples // n_subsets + samples_shuffle = random_state.permutation(n_samples) + h_subset = int(np.ceil(n_samples_subsets * (n_support / float(n_samples)))) + # b. perform a total of 500 trials + n_trials_tot = 500 + # c. select 10 best (location, covariance) for each subset + n_best_sub = 10 + n_trials = max(10, n_trials_tot // n_subsets) + n_best_tot = n_subsets * n_best_sub + all_best_locations = np.zeros((n_best_tot, n_features)) + try: + all_best_covariances = np.zeros((n_best_tot, n_features, n_features)) + except MemoryError: + # The above is too big. 
Let's try with something much small + # (and less optimal) + n_best_tot = 10 + all_best_covariances = np.zeros((n_best_tot, n_features, n_features)) + n_best_sub = 2 + for i in range(n_subsets): + low_bound = i * n_samples_subsets + high_bound = low_bound + n_samples_subsets + current_subset = X[samples_shuffle[low_bound:high_bound]] + best_locations_sub, best_covariances_sub, _, _ = select_candidates( + current_subset, + h_subset, + n_trials, + select=n_best_sub, + n_iter=2, + cov_computation_method=cov_computation_method, + random_state=random_state, + ) + subset_slice = np.arange(i * n_best_sub, (i + 1) * n_best_sub) + all_best_locations[subset_slice] = best_locations_sub + all_best_covariances[subset_slice] = best_covariances_sub + # 2. Pool the candidate supports into a merged set + # (possibly the full dataset) + n_samples_merged = min(1500, n_samples) + h_merged = int(np.ceil(n_samples_merged * (n_support / float(n_samples)))) + if n_samples > 1500: + n_best_merged = 10 + else: + n_best_merged = 1 + # find the best couples (location, covariance) on the merged set + selection = random_state.permutation(n_samples)[:n_samples_merged] + locations_merged, covariances_merged, supports_merged, d = select_candidates( + X[selection], + h_merged, + n_trials=(all_best_locations, all_best_covariances), + select=n_best_merged, + cov_computation_method=cov_computation_method, + random_state=random_state, + ) + # 3. Finally get the overall best (locations, covariance) couple + if n_samples < 1500: + # directly get the best couple (location, covariance) + location = locations_merged[0] + covariance = covariances_merged[0] + support = np.zeros(n_samples, dtype=bool) + dist = np.zeros(n_samples) + support[selection] = supports_merged[0] + dist[selection] = d[0] + else: + # select the best couple on the full dataset + locations_full, covariances_full, supports_full, d = select_candidates( + X, + n_support, + n_trials=(locations_merged, covariances_merged), + select=1, + cov_computation_method=cov_computation_method, + random_state=random_state, + ) + location = locations_full[0] + covariance = covariances_full[0] + support = supports_full[0] + dist = d[0] + elif n_features > 1: + # 1. Find the 10 best couples (location, covariance) + # considering two iterations + n_trials = 30 + n_best = 10 + locations_best, covariances_best, _, _ = select_candidates( + X, + n_support, + n_trials=n_trials, + select=n_best, + n_iter=2, + cov_computation_method=cov_computation_method, + random_state=random_state, + ) + # 2. Select the best couple on the full dataset amongst the 10 + locations_full, covariances_full, supports_full, d = select_candidates( + X, + n_support, + n_trials=(locations_best, covariances_best), + select=1, + cov_computation_method=cov_computation_method, + random_state=random_state, + ) + location = locations_full[0] + covariance = covariances_full[0] + support = supports_full[0] + dist = d[0] + + return location, covariance, support, dist + + +class MinCovDet(EmpiricalCovariance): + """Minimum Covariance Determinant (MCD): robust estimator of covariance. + + The Minimum Covariance Determinant covariance estimator is to be applied + on Gaussian-distributed data, but could still be relevant on data + drawn from a unimodal, symmetric distribution. It is not meant to be used + with multi-modal data (the algorithm used to fit a MinCovDet object is + likely to fail in such a case). + One should consider projection pursuit methods to deal with multi-modal + datasets. 
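A small sketch of using the fitted attributes of `MinCovDet` for outlier flagging on synthetic data (illustrative only, not part of the patched file); it relies on the `support_` and `dist_` attributes documented below.

import numpy as np
from sklearn.covariance import MinCovDet

rng = np.random.RandomState(0)
X = rng.multivariate_normal(mean=[0, 0], cov=[[0.8, 0.3], [0.3, 0.4]], size=500)
X[:25] += 6  # contaminate roughly 5% of the samples

mcd = MinCovDet(random_state=0).fit(X)
outliers = ~mcd.support_          # observations excluded from the reweighted estimate
print(outliers[:25].mean())       # most of the planted outliers are rejected
print(mcd.dist_[:5])              # robust Mahalanobis distances on the training set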
+ + Read more in the :ref:`User Guide `. + + Parameters + ---------- + store_precision : bool, default=True + Specify if the estimated precision is stored. + + assume_centered : bool, default=False + If True, the support of the robust location and the covariance + estimates is computed, and a covariance estimate is recomputed from + it, without centering the data. + Useful to work with data whose mean is significantly equal to + zero but is not exactly zero. + If False, the robust location and covariance are directly computed + with the FastMCD algorithm without additional treatment. + + support_fraction : float, default=None + The proportion of points to be included in the support of the raw + MCD estimate. Default is None, which implies that the minimum + value of support_fraction will be used within the algorithm: + `(n_samples + n_features + 1) / 2 * n_samples`. The parameter must be + in the range (0, 1]. + + random_state : int, RandomState instance or None, default=None + Determines the pseudo random number generator for shuffling the data. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + Attributes + ---------- + raw_location_ : ndarray of shape (n_features,) + The raw robust estimated location before correction and re-weighting. + + raw_covariance_ : ndarray of shape (n_features, n_features) + The raw robust estimated covariance before correction and re-weighting. + + raw_support_ : ndarray of shape (n_samples,) + A mask of the observations that have been used to compute + the raw robust estimates of location and shape, before correction + and re-weighting. + + location_ : ndarray of shape (n_features,) + Estimated robust location. + + For an example of comparing raw robust estimates with + the true location and covariance, refer to + :ref:`sphx_glr_auto_examples_covariance_plot_robust_vs_empirical_covariance.py`. + + covariance_ : ndarray of shape (n_features, n_features) + Estimated robust covariance matrix. + + precision_ : ndarray of shape (n_features, n_features) + Estimated pseudo inverse matrix. + (stored only if store_precision is True) + + support_ : ndarray of shape (n_samples,) + A mask of the observations that have been used to compute + the robust estimates of location and shape. + + dist_ : ndarray of shape (n_samples,) + Mahalanobis distances of the training set (on which :meth:`fit` is + called) observations. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + EllipticEnvelope : An object for detecting outliers in + a Gaussian distributed dataset. + EmpiricalCovariance : Maximum likelihood covariance estimator. + GraphicalLasso : Sparse inverse covariance estimation + with an l1-penalized estimator. + GraphicalLassoCV : Sparse inverse covariance with cross-validated + choice of the l1 penalty. + LedoitWolf : LedoitWolf Estimator. + OAS : Oracle Approximating Shrinkage Estimator. + ShrunkCovariance : Covariance estimator with shrinkage. + + References + ---------- + + .. [Rouseeuw1984] P. J. Rousseeuw. Least median of squares regression. + J. Am Stat Ass, 79:871, 1984. + .. [Rousseeuw] A Fast Algorithm for the Minimum Covariance Determinant + Estimator, 1999, American Statistical Association and the American + Society for Quality, TECHNOMETRICS + .. 
[ButlerDavies] R. W. Butler, P. L. Davies and M. Jhun, + Asymptotics For The Minimum Covariance Determinant Estimator, + The Annals of Statistics, 1993, Vol. 21, No. 3, 1385-1400 + + Examples + -------- + >>> import numpy as np + >>> from sklearn.covariance import MinCovDet + >>> from sklearn.datasets import make_gaussian_quantiles + >>> real_cov = np.array([[.8, .3], + ... [.3, .4]]) + >>> rng = np.random.RandomState(0) + >>> X = rng.multivariate_normal(mean=[0, 0], + ... cov=real_cov, + ... size=500) + >>> cov = MinCovDet(random_state=0).fit(X) + >>> cov.covariance_ + array([[0.7411, 0.2535], + [0.2535, 0.3053]]) + >>> cov.location_ + array([0.0813 , 0.0427]) + """ + + _parameter_constraints: dict = { + **EmpiricalCovariance._parameter_constraints, + "support_fraction": [Interval(Real, 0, 1, closed="right"), None], + "random_state": ["random_state"], + } + _nonrobust_covariance = staticmethod(empirical_covariance) + + def __init__( + self, + *, + store_precision=True, + assume_centered=False, + support_fraction=None, + random_state=None, + ): + self.store_precision = store_precision + self.assume_centered = assume_centered + self.support_fraction = support_fraction + self.random_state = random_state + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit a Minimum Covariance Determinant with the FastMCD algorithm. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Returns the instance itself. + """ + X = validate_data(self, X, ensure_min_samples=2, estimator="MinCovDet") + random_state = check_random_state(self.random_state) + n_samples, n_features = X.shape + # check that the empirical covariance is full rank + if (linalg.svdvals(np.dot(X.T, X)) > 1e-8).sum() != n_features: + warnings.warn( + "The covariance matrix associated to your dataset is not full rank" + ) + # compute and store raw estimates + raw_location, raw_covariance, raw_support, raw_dist = fast_mcd( + X, + support_fraction=self.support_fraction, + cov_computation_method=self._nonrobust_covariance, + random_state=random_state, + ) + if self.assume_centered: + raw_location = np.zeros(n_features) + raw_covariance = self._nonrobust_covariance( + X[raw_support], assume_centered=True + ) + # get precision matrix in an optimized way + precision = linalg.pinvh(raw_covariance) + raw_dist = np.sum(np.dot(X, precision) * X, 1) + self.raw_location_ = raw_location + self.raw_covariance_ = raw_covariance + self.raw_support_ = raw_support + self.location_ = raw_location + self.support_ = raw_support + self.dist_ = raw_dist + # obtain consistency at normal models + self.correct_covariance(X) + # re-weight estimator + self.reweight_covariance(X) + + return self + + def correct_covariance(self, data): + """Apply a correction to raw Minimum Covariance Determinant estimates. + + Correction using the empirical correction factor suggested + by Rousseeuw and Van Driessen in [RVD]_. + + Parameters + ---------- + data : array-like of shape (n_samples, n_features) + The data matrix, with p features and n samples. + The data set must be the one which was used to compute + the raw estimates. + + Returns + ------- + covariance_corrected : ndarray of shape (n_features, n_features) + Corrected robust covariance estimate. + + References + ---------- + + .. 
[RVD] A Fast Algorithm for the Minimum Covariance + Determinant Estimator, 1999, American Statistical Association + and the American Society for Quality, TECHNOMETRICS + """ + + # Check that the covariance of the support data is not equal to 0. + # Otherwise self.dist_ = 0 and thus correction = 0. + n_samples = len(self.dist_) + n_support = np.sum(self.support_) + if n_support < n_samples and np.allclose(self.raw_covariance_, 0): + raise ValueError( + "The covariance matrix of the support data " + "is equal to 0, try to increase support_fraction" + ) + correction = np.median(self.dist_) / chi2(data.shape[1]).isf(0.5) + covariance_corrected = self.raw_covariance_ * correction + self.dist_ /= correction + return covariance_corrected + + def reweight_covariance(self, data): + """Re-weight raw Minimum Covariance Determinant estimates. + + Re-weight observations using Rousseeuw's method (equivalent to + deleting outlying observations from the data set before + computing location and covariance estimates) described + in [RVDriessen]_. + + Parameters + ---------- + data : array-like of shape (n_samples, n_features) + The data matrix, with p features and n samples. + The data set must be the one which was used to compute + the raw estimates. + + Returns + ------- + location_reweighted : ndarray of shape (n_features,) + Re-weighted robust location estimate. + + covariance_reweighted : ndarray of shape (n_features, n_features) + Re-weighted robust covariance estimate. + + support_reweighted : ndarray of shape (n_samples,), dtype=bool + A mask of the observations that have been used to compute + the re-weighted robust location and covariance estimates. + + References + ---------- + + .. [RVDriessen] A Fast Algorithm for the Minimum Covariance + Determinant Estimator, 1999, American Statistical Association + and the American Society for Quality, TECHNOMETRICS + """ + n_samples, n_features = data.shape + mask = self.dist_ < chi2(n_features).isf(0.025) + if self.assume_centered: + location_reweighted = np.zeros(n_features) + else: + location_reweighted = data[mask].mean(0) + covariance_reweighted = self._nonrobust_covariance( + data[mask], assume_centered=self.assume_centered + ) + support_reweighted = np.zeros(n_samples, dtype=bool) + support_reweighted[mask] = True + self._set_covariance(covariance_reweighted) + self.location_ = location_reweighted + self.support_ = support_reweighted + X_centered = data - self.location_ + self.dist_ = np.sum(np.dot(X_centered, self.get_precision()) * X_centered, 1) + return location_reweighted, covariance_reweighted, support_reweighted diff --git a/.venv/lib/python3.12/site-packages/sklearn/covariance/_shrunk_covariance.py b/.venv/lib/python3.12/site-packages/sklearn/covariance/_shrunk_covariance.py new file mode 100644 index 0000000000000000000000000000000000000000..99d6f70f57d6eee24fc442bd42f496bf8ae9a9a2 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/covariance/_shrunk_covariance.py @@ -0,0 +1,822 @@ +""" +Covariance estimators using shrinkage. + +Shrinkage corresponds to regularising `cov` using a convex combination: +shrunk_cov = (1-shrinkage)*cov + shrinkage*structured_estimate. 
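A tiny numeric check of the convex combination above (illustrative only, not part of the patched file), taking the structured estimate as `mu * identity` with `mu = trace(cov) / n_features`, as used throughout this module.

import numpy as np
from sklearn.covariance import empirical_covariance, shrunk_covariance

rng = np.random.RandomState(0)
X = rng.multivariate_normal(mean=[0, 0], cov=[[0.8, 0.3], [0.3, 0.4]], size=500)

cov = empirical_covariance(X)
shrinkage = 0.2
mu = np.trace(cov) / cov.shape[0]
manual = (1 - shrinkage) * cov + shrinkage * mu * np.eye(cov.shape[0])
# Matches the `shrunk_covariance` helper defined later in this file.
assert np.allclose(manual, shrunk_covariance(cov, shrinkage=shrinkage))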
+ +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +# avoid division truncation +import warnings +from numbers import Integral, Real + +import numpy as np + +from ..base import _fit_context +from ..utils import check_array +from ..utils._param_validation import Interval, validate_params +from ..utils.validation import validate_data +from . import EmpiricalCovariance, empirical_covariance + + +def _ledoit_wolf(X, *, assume_centered, block_size): + """Estimate the shrunk Ledoit-Wolf covariance matrix.""" + # for only one feature, the result is the same whatever the shrinkage + if len(X.shape) == 2 and X.shape[1] == 1: + if not assume_centered: + X = X - X.mean() + return np.atleast_2d((X**2).mean()), 0.0 + n_features = X.shape[1] + + # get Ledoit-Wolf shrinkage + shrinkage = ledoit_wolf_shrinkage( + X, assume_centered=assume_centered, block_size=block_size + ) + emp_cov = empirical_covariance(X, assume_centered=assume_centered) + mu = np.sum(np.trace(emp_cov)) / n_features + shrunk_cov = (1.0 - shrinkage) * emp_cov + shrunk_cov.flat[:: n_features + 1] += shrinkage * mu + + return shrunk_cov, shrinkage + + +def _oas(X, *, assume_centered=False): + """Estimate covariance with the Oracle Approximating Shrinkage algorithm. + + The formulation is based on [1]_. + [1] "Shrinkage algorithms for MMSE covariance estimation.", + Chen, Y., Wiesel, A., Eldar, Y. C., & Hero, A. O. + IEEE Transactions on Signal Processing, 58(10), 5016-5029, 2010. + https://arxiv.org/pdf/0907.4698.pdf + """ + if len(X.shape) == 2 and X.shape[1] == 1: + # for only one feature, the result is the same whatever the shrinkage + if not assume_centered: + X = X - X.mean() + return np.atleast_2d((X**2).mean()), 0.0 + + n_samples, n_features = X.shape + + emp_cov = empirical_covariance(X, assume_centered=assume_centered) + + # The shrinkage is defined as: + # shrinkage = min( + # trace(S @ S.T) + trace(S)**2) / ((n + 1) (trace(S @ S.T) - trace(S)**2 / p), 1 + # ) + # where n and p are n_samples and n_features, respectively (cf. Eq. 23 in [1]). + # The factor 2 / p is omitted since it does not impact the value of the estimator + # for large p. + + # Instead of computing trace(S)**2, we can compute the average of the squared + # elements of S that is equal to trace(S)**2 / p**2. + # See the definition of the Frobenius norm: + # https://en.wikipedia.org/wiki/Matrix_norm#Frobenius_norm + alpha = np.mean(emp_cov**2) + mu = np.trace(emp_cov) / n_features + mu_squared = mu**2 + + # The factor 1 / p**2 will cancel out since it is in both the numerator and + # denominator + num = alpha + mu_squared + den = (n_samples + 1) * (alpha - mu_squared / n_features) + shrinkage = 1.0 if den == 0 else min(num / den, 1.0) + + # The shrunk covariance is defined as: + # (1 - shrinkage) * S + shrinkage * F (cf. Eq. 4 in [1]) + # where S is the empirical covariance and F is the shrinkage target defined as + # F = trace(S) / n_features * np.identity(n_features) (cf. Eq. 3 in [1]) + shrunk_cov = (1.0 - shrinkage) * emp_cov + shrunk_cov.flat[:: n_features + 1] += shrinkage * mu + + return shrunk_cov, shrinkage + + +############################################################################### +# Public API +# ShrunkCovariance estimator + + +@validate_params( + { + "emp_cov": ["array-like"], + "shrinkage": [Interval(Real, 0, 1, closed="both")], + }, + prefer_skip_nested_validation=True, +) +def shrunk_covariance(emp_cov, shrinkage=0.1): + """Calculate covariance matrices shrunk on the diagonal. 
+ + Read more in the :ref:`User Guide `. + + Parameters + ---------- + emp_cov : array-like of shape (..., n_features, n_features) + Covariance matrices to be shrunk, at least 2D ndarray. + + shrinkage : float, default=0.1 + Coefficient in the convex combination used for the computation + of the shrunk estimate. Range is [0, 1]. + + Returns + ------- + shrunk_cov : ndarray of shape (..., n_features, n_features) + Shrunk covariance matrices. + + Notes + ----- + The regularized (shrunk) covariance is given by:: + + (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features) + + where `mu = trace(cov) / n_features`. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.datasets import make_gaussian_quantiles + >>> from sklearn.covariance import empirical_covariance, shrunk_covariance + >>> real_cov = np.array([[.8, .3], [.3, .4]]) + >>> rng = np.random.RandomState(0) + >>> X = rng.multivariate_normal(mean=[0, 0], cov=real_cov, size=500) + >>> shrunk_covariance(empirical_covariance(X)) + array([[0.739, 0.254], + [0.254, 0.411]]) + """ + emp_cov = check_array(emp_cov, allow_nd=True) + n_features = emp_cov.shape[-1] + + shrunk_cov = (1.0 - shrinkage) * emp_cov + mu = np.trace(emp_cov, axis1=-2, axis2=-1) / n_features + mu = np.expand_dims(mu, axis=tuple(range(mu.ndim, emp_cov.ndim))) + shrunk_cov += shrinkage * mu * np.eye(n_features) + + return shrunk_cov + + +class ShrunkCovariance(EmpiricalCovariance): + """Covariance estimator with shrinkage. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + store_precision : bool, default=True + Specify if the estimated precision is stored. + + assume_centered : bool, default=False + If True, data will not be centered before computation. + Useful when working with data whose mean is almost, but not exactly + zero. + If False, data will be centered before computation. + + shrinkage : float, default=0.1 + Coefficient in the convex combination used for the computation + of the shrunk estimate. Range is [0, 1]. + + Attributes + ---------- + covariance_ : ndarray of shape (n_features, n_features) + Estimated covariance matrix + + location_ : ndarray of shape (n_features,) + Estimated location, i.e. the estimated mean. + + precision_ : ndarray of shape (n_features, n_features) + Estimated pseudo inverse matrix. + (stored only if store_precision is True) + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + EllipticEnvelope : An object for detecting outliers in + a Gaussian distributed dataset. + EmpiricalCovariance : Maximum likelihood covariance estimator. + GraphicalLasso : Sparse inverse covariance estimation + with an l1-penalized estimator. + GraphicalLassoCV : Sparse inverse covariance with cross-validated + choice of the l1 penalty. + LedoitWolf : LedoitWolf Estimator. + MinCovDet : Minimum Covariance Determinant + (robust estimator of covariance). + OAS : Oracle Approximating Shrinkage Estimator. 
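The `shrunk_covariance` helper defined above also broadcasts over leading axes, as its `emp_cov` parameter of shape `(..., n_features, n_features)` indicates; a brief sketch (illustrative only, not part of the patched file):

import numpy as np
from sklearn.covariance import shrunk_covariance

# A stack of three identical 2x2 covariance matrices, shrunk in a single call.
covs = np.repeat(np.ones((2, 2))[np.newaxis, ...], 3, axis=0)
print(shrunk_covariance(covs, shrinkage=0.5))  # every slice becomes [[1. , 0.5], [0.5, 1. ]]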
+ + Notes + ----- + The regularized covariance is given by: + + (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features) + + where mu = trace(cov) / n_features + + Examples + -------- + >>> import numpy as np + >>> from sklearn.covariance import ShrunkCovariance + >>> from sklearn.datasets import make_gaussian_quantiles + >>> real_cov = np.array([[.8, .3], + ... [.3, .4]]) + >>> rng = np.random.RandomState(0) + >>> X = rng.multivariate_normal(mean=[0, 0], + ... cov=real_cov, + ... size=500) + >>> cov = ShrunkCovariance().fit(X) + >>> cov.covariance_ + array([[0.7387, 0.2536], + [0.2536, 0.4110]]) + >>> cov.location_ + array([0.0622, 0.0193]) + """ + + _parameter_constraints: dict = { + **EmpiricalCovariance._parameter_constraints, + "shrinkage": [Interval(Real, 0, 1, closed="both")], + } + + def __init__(self, *, store_precision=True, assume_centered=False, shrinkage=0.1): + super().__init__( + store_precision=store_precision, assume_centered=assume_centered + ) + self.shrinkage = shrinkage + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit the shrunk covariance model to X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Returns the instance itself. + """ + X = validate_data(self, X) + # Not calling the parent object to fit, to avoid a potential + # matrix inversion when setting the precision + if self.assume_centered: + self.location_ = np.zeros(X.shape[1]) + else: + self.location_ = X.mean(0) + covariance = empirical_covariance(X, assume_centered=self.assume_centered) + covariance = shrunk_covariance(covariance, self.shrinkage) + self._set_covariance(covariance) + + return self + + +# Ledoit-Wolf estimator + + +@validate_params( + { + "X": ["array-like"], + "assume_centered": ["boolean"], + "block_size": [Interval(Integral, 1, None, closed="left")], + }, + prefer_skip_nested_validation=True, +) +def ledoit_wolf_shrinkage(X, assume_centered=False, block_size=1000): + """Estimate the shrunk Ledoit-Wolf covariance matrix. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Data from which to compute the Ledoit-Wolf shrunk covariance shrinkage. + + assume_centered : bool, default=False + If True, data will not be centered before computation. + Useful to work with data whose mean is significantly equal to + zero but is not exactly zero. + If False, data will be centered before computation. + + block_size : int, default=1000 + Size of blocks into which the covariance matrix will be split. + + Returns + ------- + shrinkage : float + Coefficient in the convex combination used for the computation + of the shrunk estimate. 
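A consistency sketch tying `ledoit_wolf_shrinkage` to the other helpers (illustrative only, not part of the patched file): applying the returned coefficient with `shrunk_covariance` reproduces the covariance returned by `ledoit_wolf`, mirroring the checks in the test suite further below.

import numpy as np
from sklearn.covariance import (
    empirical_covariance,
    ledoit_wolf,
    ledoit_wolf_shrinkage,
    shrunk_covariance,
)

rng = np.random.RandomState(0)
X = rng.multivariate_normal(mean=[0, 0], cov=[[0.4, 0.2], [0.2, 0.8]], size=50)

shrinkage = ledoit_wolf_shrinkage(X)
manual = shrunk_covariance(empirical_covariance(X), shrinkage=shrinkage)
cov_lw, shrinkage_lw = ledoit_wolf(X)
assert np.isclose(shrinkage, shrinkage_lw)
assert np.allclose(manual, cov_lw)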
+ + Notes + ----- + The regularized (shrunk) covariance is: + + (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features) + + where mu = trace(cov) / n_features + + Examples + -------- + >>> import numpy as np + >>> from sklearn.covariance import ledoit_wolf_shrinkage + >>> real_cov = np.array([[.4, .2], [.2, .8]]) + >>> rng = np.random.RandomState(0) + >>> X = rng.multivariate_normal(mean=[0, 0], cov=real_cov, size=50) + >>> shrinkage_coefficient = ledoit_wolf_shrinkage(X) + >>> shrinkage_coefficient + np.float64(0.23) + """ + X = check_array(X) + # for only one feature, the result is the same whatever the shrinkage + if len(X.shape) == 2 and X.shape[1] == 1: + return 0.0 + if X.ndim == 1: + X = np.reshape(X, (1, -1)) + + if X.shape[0] == 1: + warnings.warn( + "Only one sample available. You may want to reshape your data array" + ) + n_samples, n_features = X.shape + + # optionally center data + if not assume_centered: + X = X - X.mean(0) + + # A non-blocked version of the computation is present in the tests + # in tests/test_covariance.py + + # number of blocks to split the covariance matrix into + n_splits = int(n_features / block_size) + X2 = X**2 + emp_cov_trace = np.sum(X2, axis=0) / n_samples + mu = np.sum(emp_cov_trace) / n_features + beta_ = 0.0 # sum of the coefficients of + delta_ = 0.0 # sum of the *squared* coefficients of + # starting block computation + for i in range(n_splits): + for j in range(n_splits): + rows = slice(block_size * i, block_size * (i + 1)) + cols = slice(block_size * j, block_size * (j + 1)) + beta_ += np.sum(np.dot(X2.T[rows], X2[:, cols])) + delta_ += np.sum(np.dot(X.T[rows], X[:, cols]) ** 2) + rows = slice(block_size * i, block_size * (i + 1)) + beta_ += np.sum(np.dot(X2.T[rows], X2[:, block_size * n_splits :])) + delta_ += np.sum(np.dot(X.T[rows], X[:, block_size * n_splits :]) ** 2) + for j in range(n_splits): + cols = slice(block_size * j, block_size * (j + 1)) + beta_ += np.sum(np.dot(X2.T[block_size * n_splits :], X2[:, cols])) + delta_ += np.sum(np.dot(X.T[block_size * n_splits :], X[:, cols]) ** 2) + delta_ += np.sum( + np.dot(X.T[block_size * n_splits :], X[:, block_size * n_splits :]) ** 2 + ) + delta_ /= n_samples**2 + beta_ += np.sum( + np.dot(X2.T[block_size * n_splits :], X2[:, block_size * n_splits :]) + ) + # use delta_ to compute beta + beta = 1.0 / (n_features * n_samples) * (beta_ / n_samples - delta_) + # delta is the sum of the squared coefficients of ( - mu*Id) / p + delta = delta_ - 2.0 * mu * emp_cov_trace.sum() + n_features * mu**2 + delta /= n_features + # get final beta as the min between beta and delta + # We do this to prevent shrinking more than "1", which would invert + # the value of covariances + beta = min(beta, delta) + # finally get shrinkage + shrinkage = 0 if beta == 0 else beta / delta + return shrinkage + + +@validate_params( + {"X": ["array-like"]}, + prefer_skip_nested_validation=False, +) +def ledoit_wolf(X, *, assume_centered=False, block_size=1000): + """Estimate the shrunk Ledoit-Wolf covariance matrix. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Data from which to compute the covariance estimate. + + assume_centered : bool, default=False + If True, data will not be centered before computation. + Useful to work with data whose mean is significantly equal to + zero but is not exactly zero. + If False, data will be centered before computation. 
+ + block_size : int, default=1000 + Size of blocks into which the covariance matrix will be split. + This is purely a memory optimization and does not affect results. + + Returns + ------- + shrunk_cov : ndarray of shape (n_features, n_features) + Shrunk covariance. + + shrinkage : float + Coefficient in the convex combination used for the computation + of the shrunk estimate. + + Notes + ----- + The regularized (shrunk) covariance is: + + (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features) + + where mu = trace(cov) / n_features + + Examples + -------- + >>> import numpy as np + >>> from sklearn.covariance import empirical_covariance, ledoit_wolf + >>> real_cov = np.array([[.4, .2], [.2, .8]]) + >>> rng = np.random.RandomState(0) + >>> X = rng.multivariate_normal(mean=[0, 0], cov=real_cov, size=50) + >>> covariance, shrinkage = ledoit_wolf(X) + >>> covariance + array([[0.44, 0.16], + [0.16, 0.80]]) + >>> shrinkage + np.float64(0.23) + """ + estimator = LedoitWolf( + assume_centered=assume_centered, + block_size=block_size, + store_precision=False, + ).fit(X) + + return estimator.covariance_, estimator.shrinkage_ + + +class LedoitWolf(EmpiricalCovariance): + """LedoitWolf Estimator. + + Ledoit-Wolf is a particular form of shrinkage, where the shrinkage + coefficient is computed using O. Ledoit and M. Wolf's formula as + described in "A Well-Conditioned Estimator for Large-Dimensional + Covariance Matrices", Ledoit and Wolf, Journal of Multivariate + Analysis, Volume 88, Issue 2, February 2004, pages 365-411. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + store_precision : bool, default=True + Specify if the estimated precision is stored. + + assume_centered : bool, default=False + If True, data will not be centered before computation. + Useful when working with data whose mean is almost, but not exactly + zero. + If False (default), data will be centered before computation. + + block_size : int, default=1000 + Size of blocks into which the covariance matrix will be split + during its Ledoit-Wolf estimation. This is purely a memory + optimization and does not affect results. + + Attributes + ---------- + covariance_ : ndarray of shape (n_features, n_features) + Estimated covariance matrix. + + location_ : ndarray of shape (n_features,) + Estimated location, i.e. the estimated mean. + + precision_ : ndarray of shape (n_features, n_features) + Estimated pseudo inverse matrix. + (stored only if store_precision is True) + + shrinkage_ : float + Coefficient in the convex combination used for the computation + of the shrunk estimate. Range is [0, 1]. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + EllipticEnvelope : An object for detecting outliers in + a Gaussian distributed dataset. + EmpiricalCovariance : Maximum likelihood covariance estimator. + GraphicalLasso : Sparse inverse covariance estimation + with an l1-penalized estimator. + GraphicalLassoCV : Sparse inverse covariance with cross-validated + choice of the l1 penalty. + MinCovDet : Minimum Covariance Determinant + (robust estimator of covariance). + OAS : Oracle Approximating Shrinkage Estimator. + ShrunkCovariance : Covariance estimator with shrinkage. 
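A side-by-side sketch of the two automatic-shrinkage estimators on synthetic data (illustrative only, not part of the patched file); both shrink toward `mu * identity` with `mu = trace(cov) / n_features`, and differ only in how the shrinkage coefficient is chosen.

import numpy as np
from sklearn.covariance import OAS, LedoitWolf

rng = np.random.RandomState(0)
real_cov = np.array([[0.8, 0.3], [0.3, 0.4]])
X = rng.multivariate_normal(mean=[0, 0], cov=real_cov, size=500)

lw = LedoitWolf().fit(X)
oas = OAS().fit(X)
print(lw.shrinkage_, oas.shrinkage_)   # data-driven shrinkage coefficients
print(lw.covariance_)                  # both estimates stay close to real_cov here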
+ + Notes + ----- + The regularised covariance is: + + (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features) + + where mu = trace(cov) / n_features + and shrinkage is given by the Ledoit and Wolf formula (see References) + + References + ---------- + "A Well-Conditioned Estimator for Large-Dimensional Covariance Matrices", + Ledoit and Wolf, Journal of Multivariate Analysis, Volume 88, Issue 2, + February 2004, pages 365-411. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.covariance import LedoitWolf + >>> real_cov = np.array([[.4, .2], + ... [.2, .8]]) + >>> np.random.seed(0) + >>> X = np.random.multivariate_normal(mean=[0, 0], + ... cov=real_cov, + ... size=50) + >>> cov = LedoitWolf().fit(X) + >>> cov.covariance_ + array([[0.4406, 0.1616], + [0.1616, 0.8022]]) + >>> cov.location_ + array([ 0.0595 , -0.0075]) + + See also :ref:`sphx_glr_auto_examples_covariance_plot_covariance_estimation.py` + and :ref:`sphx_glr_auto_examples_covariance_plot_lw_vs_oas.py` + for more detailed examples. + """ + + _parameter_constraints: dict = { + **EmpiricalCovariance._parameter_constraints, + "block_size": [Interval(Integral, 1, None, closed="left")], + } + + def __init__(self, *, store_precision=True, assume_centered=False, block_size=1000): + super().__init__( + store_precision=store_precision, assume_centered=assume_centered + ) + self.block_size = block_size + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit the Ledoit-Wolf shrunk covariance model to X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Returns the instance itself. + """ + # Not calling the parent object to fit, to avoid computing the + # covariance matrix (and potentially the precision) + X = validate_data(self, X) + if self.assume_centered: + self.location_ = np.zeros(X.shape[1]) + else: + self.location_ = X.mean(0) + covariance, shrinkage = _ledoit_wolf( + X - self.location_, assume_centered=True, block_size=self.block_size + ) + self.shrinkage_ = shrinkage + self._set_covariance(covariance) + + return self + + +# OAS estimator +@validate_params( + {"X": ["array-like"]}, + prefer_skip_nested_validation=False, +) +def oas(X, *, assume_centered=False): + """Estimate covariance with the Oracle Approximating Shrinkage. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Data from which to compute the covariance estimate. + + assume_centered : bool, default=False + If True, data will not be centered before computation. + Useful to work with data whose mean is significantly equal to + zero but is not exactly zero. + If False, data will be centered before computation. + + Returns + ------- + shrunk_cov : array-like of shape (n_features, n_features) + Shrunk covariance. + + shrinkage : float + Coefficient in the convex combination used for the computation + of the shrunk estimate. + + Notes + ----- + The regularised covariance is: + + (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features), + + where mu = trace(cov) / n_features and shrinkage is given by the OAS formula + (see [1]_). + + The shrinkage formulation implemented here differs from Eq. 23 in [1]_. 
In + the original article, formula (23) states that 2/p (p being the number of + features) is multiplied by Trace(cov*cov) in both the numerator and + denominator, but this operation is omitted because for a large p, the value + of 2/p is so small that it doesn't affect the value of the estimator. + + References + ---------- + .. [1] :arxiv:`"Shrinkage algorithms for MMSE covariance estimation.", + Chen, Y., Wiesel, A., Eldar, Y. C., & Hero, A. O. + IEEE Transactions on Signal Processing, 58(10), 5016-5029, 2010. + <0907.4698>` + + Examples + -------- + >>> import numpy as np + >>> from sklearn.covariance import oas + >>> rng = np.random.RandomState(0) + >>> real_cov = [[.8, .3], [.3, .4]] + >>> X = rng.multivariate_normal(mean=[0, 0], cov=real_cov, size=500) + >>> shrunk_cov, shrinkage = oas(X) + >>> shrunk_cov + array([[0.7533, 0.2763], + [0.2763, 0.3964]]) + >>> shrinkage + np.float64(0.0195) + """ + estimator = OAS( + assume_centered=assume_centered, + ).fit(X) + return estimator.covariance_, estimator.shrinkage_ + + +class OAS(EmpiricalCovariance): + """Oracle Approximating Shrinkage Estimator. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + store_precision : bool, default=True + Specify if the estimated precision is stored. + + assume_centered : bool, default=False + If True, data will not be centered before computation. + Useful when working with data whose mean is almost, but not exactly + zero. + If False (default), data will be centered before computation. + + Attributes + ---------- + covariance_ : ndarray of shape (n_features, n_features) + Estimated covariance matrix. + + location_ : ndarray of shape (n_features,) + Estimated location, i.e. the estimated mean. + + precision_ : ndarray of shape (n_features, n_features) + Estimated pseudo inverse matrix. + (stored only if store_precision is True) + + shrinkage_ : float + coefficient in the convex combination used for the computation + of the shrunk estimate. Range is [0, 1]. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + EllipticEnvelope : An object for detecting outliers in + a Gaussian distributed dataset. + EmpiricalCovariance : Maximum likelihood covariance estimator. + GraphicalLasso : Sparse inverse covariance estimation + with an l1-penalized estimator. + GraphicalLassoCV : Sparse inverse covariance with cross-validated + choice of the l1 penalty. + LedoitWolf : LedoitWolf Estimator. + MinCovDet : Minimum Covariance Determinant + (robust estimator of covariance). + ShrunkCovariance : Covariance estimator with shrinkage. + + Notes + ----- + The regularised covariance is: + + (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features), + + where mu = trace(cov) / n_features and shrinkage is given by the OAS formula + (see [1]_). + + The shrinkage formulation implemented here differs from Eq. 23 in [1]_. In + the original article, formula (23) states that 2/p (p being the number of + features) is multiplied by Trace(cov*cov) in both the numerator and + denominator, but this operation is omitted because for a large p, the value + of 2/p is so small that it doesn't affect the value of the estimator. + + References + ---------- + .. [1] :arxiv:`"Shrinkage algorithms for MMSE covariance estimation.", + Chen, Y., Wiesel, A., Eldar, Y. 
C., & Hero, A. O. + IEEE Transactions on Signal Processing, 58(10), 5016-5029, 2010. + <0907.4698>` + + Examples + -------- + >>> import numpy as np + >>> from sklearn.covariance import OAS + >>> from sklearn.datasets import make_gaussian_quantiles + >>> real_cov = np.array([[.8, .3], + ... [.3, .4]]) + >>> rng = np.random.RandomState(0) + >>> X = rng.multivariate_normal(mean=[0, 0], + ... cov=real_cov, + ... size=500) + >>> oas = OAS().fit(X) + >>> oas.covariance_ + array([[0.7533, 0.2763], + [0.2763, 0.3964]]) + >>> oas.precision_ + array([[ 1.7833, -1.2431 ], + [-1.2431, 3.3889]]) + >>> oas.shrinkage_ + np.float64(0.0195) + + See also :ref:`sphx_glr_auto_examples_covariance_plot_covariance_estimation.py` + and :ref:`sphx_glr_auto_examples_covariance_plot_lw_vs_oas.py` + for more detailed examples. + """ + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit the Oracle Approximating Shrinkage covariance model to X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Returns the instance itself. + """ + X = validate_data(self, X) + # Not calling the parent object to fit, to avoid computing the + # covariance matrix (and potentially the precision) + if self.assume_centered: + self.location_ = np.zeros(X.shape[1]) + else: + self.location_ = X.mean(0) + + covariance, shrinkage = _oas(X - self.location_, assume_centered=True) + self.shrinkage_ = shrinkage + self._set_covariance(covariance) + + return self diff --git a/.venv/lib/python3.12/site-packages/sklearn/covariance/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/covariance/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/covariance/tests/test_covariance.py b/.venv/lib/python3.12/site-packages/sklearn/covariance/tests/test_covariance.py new file mode 100644 index 0000000000000000000000000000000000000000..9c55012c158e19df20e4c4770867fc19398213d0 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/covariance/tests/test_covariance.py @@ -0,0 +1,374 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numpy as np +import pytest + +from sklearn import datasets +from sklearn.covariance import ( + OAS, + EmpiricalCovariance, + LedoitWolf, + ShrunkCovariance, + empirical_covariance, + ledoit_wolf, + ledoit_wolf_shrinkage, + oas, + shrunk_covariance, +) +from sklearn.covariance._shrunk_covariance import _ledoit_wolf +from sklearn.utils._testing import ( + assert_allclose, + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) + +from .._shrunk_covariance import _oas + +X, _ = datasets.load_diabetes(return_X_y=True) +X_1d = X[:, 0] +n_samples, n_features = X.shape + + +def test_covariance(): + # Tests Covariance module on a simple dataset. 
+ # test covariance fit from data + cov = EmpiricalCovariance() + cov.fit(X) + emp_cov = empirical_covariance(X) + assert_array_almost_equal(emp_cov, cov.covariance_, 4) + assert_almost_equal(cov.error_norm(emp_cov), 0) + assert_almost_equal(cov.error_norm(emp_cov, norm="spectral"), 0) + assert_almost_equal(cov.error_norm(emp_cov, norm="frobenius"), 0) + assert_almost_equal(cov.error_norm(emp_cov, scaling=False), 0) + assert_almost_equal(cov.error_norm(emp_cov, squared=False), 0) + with pytest.raises(NotImplementedError): + cov.error_norm(emp_cov, norm="foo") + # Mahalanobis distances computation test + mahal_dist = cov.mahalanobis(X) + assert np.amin(mahal_dist) > 0 + + # test with n_features = 1 + X_1d = X[:, 0].reshape((-1, 1)) + cov = EmpiricalCovariance() + cov.fit(X_1d) + assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4) + assert_almost_equal(cov.error_norm(empirical_covariance(X_1d)), 0) + assert_almost_equal(cov.error_norm(empirical_covariance(X_1d), norm="spectral"), 0) + + # test with one sample + # Create X with 1 sample and 5 features + X_1sample = np.arange(5).reshape(1, 5) + cov = EmpiricalCovariance() + warn_msg = "Only one sample available. You may want to reshape your data array" + with pytest.warns(UserWarning, match=warn_msg): + cov.fit(X_1sample) + + assert_array_almost_equal(cov.covariance_, np.zeros(shape=(5, 5), dtype=np.float64)) + + # test integer type + X_integer = np.asarray([[0, 1], [1, 0]]) + result = np.asarray([[0.25, -0.25], [-0.25, 0.25]]) + assert_array_almost_equal(empirical_covariance(X_integer), result) + + # test centered case + cov = EmpiricalCovariance(assume_centered=True) + cov.fit(X) + assert_array_equal(cov.location_, np.zeros(X.shape[1])) + + +@pytest.mark.parametrize("n_matrices", [1, 3]) +def test_shrunk_covariance_func(n_matrices): + """Check `shrunk_covariance` function.""" + + n_features = 2 + cov = np.ones((n_features, n_features)) + cov_target = np.array([[1, 0.5], [0.5, 1]]) + + if n_matrices > 1: + cov = np.repeat(cov[np.newaxis, ...], n_matrices, axis=0) + cov_target = np.repeat(cov_target[np.newaxis, ...], n_matrices, axis=0) + + cov_shrunk = shrunk_covariance(cov, 0.5) + assert_allclose(cov_shrunk, cov_target) + + +def test_shrunk_covariance(): + """Check consistency between `ShrunkCovariance` and `shrunk_covariance`.""" + + # Tests ShrunkCovariance module on a simple dataset. + # compare shrunk covariance obtained from data and from MLE estimate + cov = ShrunkCovariance(shrinkage=0.5) + cov.fit(X) + assert_array_almost_equal( + shrunk_covariance(empirical_covariance(X), shrinkage=0.5), cov.covariance_, 4 + ) + + # same test with shrinkage not provided + cov = ShrunkCovariance() + cov.fit(X) + assert_array_almost_equal( + shrunk_covariance(empirical_covariance(X)), cov.covariance_, 4 + ) + + # same test with shrinkage = 0 (<==> empirical_covariance) + cov = ShrunkCovariance(shrinkage=0.0) + cov.fit(X) + assert_array_almost_equal(empirical_covariance(X), cov.covariance_, 4) + + # test with n_features = 1 + X_1d = X[:, 0].reshape((-1, 1)) + cov = ShrunkCovariance(shrinkage=0.3) + cov.fit(X_1d) + assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4) + + # test shrinkage coeff on a simple data set (without saving precision) + cov = ShrunkCovariance(shrinkage=0.5, store_precision=False) + cov.fit(X) + assert cov.precision_ is None + + +def test_ledoit_wolf(): + # Tests LedoitWolf module on a simple dataset. 
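
The `shrunk_covariance` helper exercised above applies the convex combination `(1 - shrinkage) * cov + shrinkage * mu * identity` to an already-computed covariance matrix; the hard-coded `cov_target` in `test_shrunk_covariance_func` can be reproduced by hand (a small sketch restating the formula, not an additional test):

    import numpy as np
    from sklearn.covariance import shrunk_covariance

    cov = np.ones((2, 2))                       # same input as in the test above
    shrinkage = 0.5
    mu = np.trace(cov) / cov.shape[0]           # average variance, here 1.0
    expected = (1 - shrinkage) * cov + shrinkage * mu * np.eye(2)
    assert np.allclose(shrunk_covariance(cov, shrinkage), expected)
    # expected == [[1.0, 0.5], [0.5, 1.0]], i.e. cov_target from the test
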
+ # test shrinkage coeff on a simple data set + X_centered = X - X.mean(axis=0) + lw = LedoitWolf(assume_centered=True) + lw.fit(X_centered) + shrinkage_ = lw.shrinkage_ + + score_ = lw.score(X_centered) + assert_almost_equal( + ledoit_wolf_shrinkage(X_centered, assume_centered=True), shrinkage_ + ) + assert_almost_equal( + ledoit_wolf_shrinkage(X_centered, assume_centered=True, block_size=6), + shrinkage_, + ) + # compare shrunk covariance obtained from data and from MLE estimate + lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf( + X_centered, assume_centered=True + ) + assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4) + assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_) + # compare estimates given by LW and ShrunkCovariance + scov = ShrunkCovariance(shrinkage=lw.shrinkage_, assume_centered=True) + scov.fit(X_centered) + assert_array_almost_equal(scov.covariance_, lw.covariance_, 4) + + # test with n_features = 1 + X_1d = X[:, 0].reshape((-1, 1)) + lw = LedoitWolf(assume_centered=True) + lw.fit(X_1d) + lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X_1d, assume_centered=True) + assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4) + assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_) + assert_array_almost_equal((X_1d**2).sum() / n_samples, lw.covariance_, 4) + + # test shrinkage coeff on a simple data set (without saving precision) + lw = LedoitWolf(store_precision=False, assume_centered=True) + lw.fit(X_centered) + assert_almost_equal(lw.score(X_centered), score_, 4) + assert lw.precision_ is None + + # Same tests without assuming centered data + # test shrinkage coeff on a simple data set + lw = LedoitWolf() + lw.fit(X) + assert_almost_equal(lw.shrinkage_, shrinkage_, 4) + assert_almost_equal(lw.shrinkage_, ledoit_wolf_shrinkage(X)) + assert_almost_equal(lw.shrinkage_, ledoit_wolf(X)[1]) + assert_almost_equal( + lw.shrinkage_, _ledoit_wolf(X=X, assume_centered=False, block_size=10000)[1] + ) + assert_almost_equal(lw.score(X), score_, 4) + # compare shrunk covariance obtained from data and from MLE estimate + lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X) + assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4) + assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_) + # compare estimates given by LW and ShrunkCovariance + scov = ShrunkCovariance(shrinkage=lw.shrinkage_) + scov.fit(X) + assert_array_almost_equal(scov.covariance_, lw.covariance_, 4) + + # test with n_features = 1 + X_1d = X[:, 0].reshape((-1, 1)) + lw = LedoitWolf() + lw.fit(X_1d) + assert_allclose( + X_1d.var(ddof=0), + _ledoit_wolf(X=X_1d, assume_centered=False, block_size=10000)[0], + ) + lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X_1d) + assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4) + assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_) + assert_array_almost_equal(empirical_covariance(X_1d), lw.covariance_, 4) + + # test with one sample + # warning should be raised when using only 1 sample + X_1sample = np.arange(5).reshape(1, 5) + lw = LedoitWolf() + + warn_msg = "Only one sample available. 
You may want to reshape your data array" + with pytest.warns(UserWarning, match=warn_msg): + lw.fit(X_1sample) + + assert_array_almost_equal(lw.covariance_, np.zeros(shape=(5, 5), dtype=np.float64)) + + # test shrinkage coeff on a simple data set (without saving precision) + lw = LedoitWolf(store_precision=False) + lw.fit(X) + assert_almost_equal(lw.score(X), score_, 4) + assert lw.precision_ is None + + +def _naive_ledoit_wolf_shrinkage(X): + # A simple implementation of the formulas from Ledoit & Wolf + + # The computation below achieves the following computations of the + # "O. Ledoit and M. Wolf, A Well-Conditioned Estimator for + # Large-Dimensional Covariance Matrices" + # beta and delta are given in the beginning of section 3.2 + n_samples, n_features = X.shape + emp_cov = empirical_covariance(X, assume_centered=False) + mu = np.trace(emp_cov) / n_features + delta_ = emp_cov.copy() + delta_.flat[:: n_features + 1] -= mu + delta = (delta_**2).sum() / n_features + X2 = X**2 + beta_ = ( + 1.0 + / (n_features * n_samples) + * np.sum(np.dot(X2.T, X2) / n_samples - emp_cov**2) + ) + + beta = min(beta_, delta) + shrinkage = beta / delta + return shrinkage + + +def test_ledoit_wolf_small(): + # Compare our blocked implementation to the naive implementation + X_small = X[:, :4] + lw = LedoitWolf() + lw.fit(X_small) + shrinkage_ = lw.shrinkage_ + + assert_almost_equal(shrinkage_, _naive_ledoit_wolf_shrinkage(X_small)) + + +def test_ledoit_wolf_large(): + # test that ledoit_wolf doesn't error on data that is wider than block_size + rng = np.random.RandomState(0) + # use a number of features that is larger than the block-size + X = rng.normal(size=(10, 20)) + lw = LedoitWolf(block_size=10).fit(X) + # check that covariance is about diagonal (random normal noise) + assert_almost_equal(lw.covariance_, np.eye(20), 0) + cov = lw.covariance_ + + # check that the result is consistent with not splitting data into blocks. + lw = LedoitWolf(block_size=25).fit(X) + assert_almost_equal(lw.covariance_, cov) + + +@pytest.mark.parametrize( + "ledoit_wolf_fitting_function", [LedoitWolf().fit, ledoit_wolf_shrinkage] +) +def test_ledoit_wolf_empty_array(ledoit_wolf_fitting_function): + """Check that we validate X and raise proper error with 0-sample array.""" + X_empty = np.zeros((0, 2)) + with pytest.raises(ValueError, match="Found array with 0 sample"): + ledoit_wolf_fitting_function(X_empty) + + +def test_oas(): + # Tests OAS module on a simple dataset. 
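
For reference alongside the Ledoit-Wolf tests above: the estimator class, the `ledoit_wolf` function and the `ledoit_wolf_shrinkage` helper are three views of the same computation and agree on a given dataset. A short illustrative sketch (not part of the test suite):

    import numpy as np
    from sklearn.covariance import LedoitWolf, ledoit_wolf, ledoit_wolf_shrinkage

    rng = np.random.RandomState(42)
    X_small = rng.randn(50, 3)

    lw = LedoitWolf().fit(X_small)                      # estimator API
    cov_func, shrinkage_func = ledoit_wolf(X_small)     # function API
    assert np.allclose(lw.covariance_, cov_func)
    assert np.isclose(lw.shrinkage_, shrinkage_func)
    assert np.isclose(lw.shrinkage_, ledoit_wolf_shrinkage(X_small))
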
+ # test shrinkage coeff on a simple data set + X_centered = X - X.mean(axis=0) + oa = OAS(assume_centered=True) + oa.fit(X_centered) + shrinkage_ = oa.shrinkage_ + score_ = oa.score(X_centered) + # compare shrunk covariance obtained from data and from MLE estimate + oa_cov_from_mle, oa_shrinkage_from_mle = oas(X_centered, assume_centered=True) + assert_array_almost_equal(oa_cov_from_mle, oa.covariance_, 4) + assert_almost_equal(oa_shrinkage_from_mle, oa.shrinkage_) + # compare estimates given by OAS and ShrunkCovariance + scov = ShrunkCovariance(shrinkage=oa.shrinkage_, assume_centered=True) + scov.fit(X_centered) + assert_array_almost_equal(scov.covariance_, oa.covariance_, 4) + + # test with n_features = 1 + X_1d = X[:, 0:1] + oa = OAS(assume_centered=True) + oa.fit(X_1d) + oa_cov_from_mle, oa_shrinkage_from_mle = oas(X_1d, assume_centered=True) + assert_array_almost_equal(oa_cov_from_mle, oa.covariance_, 4) + assert_almost_equal(oa_shrinkage_from_mle, oa.shrinkage_) + assert_array_almost_equal((X_1d**2).sum() / n_samples, oa.covariance_, 4) + + # test shrinkage coeff on a simple data set (without saving precision) + oa = OAS(store_precision=False, assume_centered=True) + oa.fit(X_centered) + assert_almost_equal(oa.score(X_centered), score_, 4) + assert oa.precision_ is None + + # Same tests without assuming centered data-------------------------------- + # test shrinkage coeff on a simple data set + oa = OAS() + oa.fit(X) + assert_almost_equal(oa.shrinkage_, shrinkage_, 4) + assert_almost_equal(oa.score(X), score_, 4) + # compare shrunk covariance obtained from data and from MLE estimate + oa_cov_from_mle, oa_shrinkage_from_mle = oas(X) + assert_array_almost_equal(oa_cov_from_mle, oa.covariance_, 4) + assert_almost_equal(oa_shrinkage_from_mle, oa.shrinkage_) + # compare estimates given by OAS and ShrunkCovariance + scov = ShrunkCovariance(shrinkage=oa.shrinkage_) + scov.fit(X) + assert_array_almost_equal(scov.covariance_, oa.covariance_, 4) + + # test with n_features = 1 + X_1d = X[:, 0].reshape((-1, 1)) + oa = OAS() + oa.fit(X_1d) + oa_cov_from_mle, oa_shrinkage_from_mle = oas(X_1d) + assert_array_almost_equal(oa_cov_from_mle, oa.covariance_, 4) + assert_almost_equal(oa_shrinkage_from_mle, oa.shrinkage_) + assert_array_almost_equal(empirical_covariance(X_1d), oa.covariance_, 4) + + # test with one sample + # warning should be raised when using only 1 sample + X_1sample = np.arange(5).reshape(1, 5) + oa = OAS() + warn_msg = "Only one sample available. 
You may want to reshape your data array" + with pytest.warns(UserWarning, match=warn_msg): + oa.fit(X_1sample) + + assert_array_almost_equal(oa.covariance_, np.zeros(shape=(5, 5), dtype=np.float64)) + + # test shrinkage coeff on a simple data set (without saving precision) + oa = OAS(store_precision=False) + oa.fit(X) + assert_almost_equal(oa.score(X), score_, 4) + assert oa.precision_ is None + + # test function _oas without assuming centered data + X_1f = X[:, 0:1] + oa = OAS() + oa.fit(X_1f) + # compare shrunk covariance obtained from data and from MLE estimate + _oa_cov_from_mle, _oa_shrinkage_from_mle = _oas(X_1f) + assert_array_almost_equal(_oa_cov_from_mle, oa.covariance_, 4) + assert_almost_equal(_oa_shrinkage_from_mle, oa.shrinkage_) + assert_array_almost_equal((X_1f**2).sum() / n_samples, oa.covariance_, 4) + + +def test_EmpiricalCovariance_validates_mahalanobis(): + """Checks that EmpiricalCovariance validates data with mahalanobis.""" + cov = EmpiricalCovariance().fit(X) + + msg = f"X has 2 features, but \\w+ is expecting {X.shape[1]} features as input" + with pytest.raises(ValueError, match=msg): + cov.mahalanobis(X[:, :2]) diff --git a/.venv/lib/python3.12/site-packages/sklearn/covariance/tests/test_elliptic_envelope.py b/.venv/lib/python3.12/site-packages/sklearn/covariance/tests/test_elliptic_envelope.py new file mode 100644 index 0000000000000000000000000000000000000000..ca85717fb378243ff8dcb75db1adade9a6c50c18 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/covariance/tests/test_elliptic_envelope.py @@ -0,0 +1,52 @@ +""" +Testing for Elliptic Envelope algorithm (sklearn.covariance.elliptic_envelope). +""" + +import numpy as np +import pytest + +from sklearn.covariance import EllipticEnvelope +from sklearn.exceptions import NotFittedError +from sklearn.utils._testing import ( + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) + + +def test_elliptic_envelope(global_random_seed): + rnd = np.random.RandomState(global_random_seed) + X = rnd.randn(100, 10) + clf = EllipticEnvelope(contamination=0.1) + with pytest.raises(NotFittedError): + clf.predict(X) + with pytest.raises(NotFittedError): + clf.decision_function(X) + clf.fit(X) + y_pred = clf.predict(X) + scores = clf.score_samples(X) + decisions = clf.decision_function(X) + + assert_array_almost_equal(scores, -clf.mahalanobis(X)) + assert_array_almost_equal(clf.mahalanobis(X), clf.dist_) + assert_almost_equal( + clf.score(X, np.ones(100)), (100 - y_pred[y_pred == -1].size) / 100.0 + ) + assert sum(y_pred == -1) == sum(decisions < 0) + + +def test_score_samples(): + X_train = [[1, 1], [1, 2], [2, 1]] + clf1 = EllipticEnvelope(contamination=0.2).fit(X_train) + clf2 = EllipticEnvelope().fit(X_train) + assert_array_equal( + clf1.score_samples([[2.0, 2.0]]), + clf1.decision_function([[2.0, 2.0]]) + clf1.offset_, + ) + assert_array_equal( + clf2.score_samples([[2.0, 2.0]]), + clf2.decision_function([[2.0, 2.0]]) + clf2.offset_, + ) + assert_array_equal( + clf1.score_samples([[2.0, 2.0]]), clf2.score_samples([[2.0, 2.0]]) + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/covariance/tests/test_graphical_lasso.py b/.venv/lib/python3.12/site-packages/sklearn/covariance/tests/test_graphical_lasso.py new file mode 100644 index 0000000000000000000000000000000000000000..9698b64bf4407e216229b9e55fa4cd19896af823 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/covariance/tests/test_graphical_lasso.py @@ -0,0 +1,318 @@ +"""Test the graphical_lasso module.""" + +import sys +from 
io import StringIO + +import numpy as np +import pytest +from numpy.testing import assert_allclose +from scipy import linalg + +from sklearn import config_context, datasets +from sklearn.covariance import ( + GraphicalLasso, + GraphicalLassoCV, + empirical_covariance, + graphical_lasso, +) +from sklearn.datasets import make_sparse_spd_matrix +from sklearn.model_selection import GroupKFold +from sklearn.utils import check_random_state +from sklearn.utils._testing import ( + _convert_container, + assert_array_almost_equal, + assert_array_less, +) + + +def test_graphical_lassos(random_state=1): + """Test the graphical lasso solvers. + + This checks is unstable for some random seeds where the covariance found with "cd" + and "lars" solvers are different (4 cases / 100 tries). + """ + # Sample data from a sparse multivariate normal + dim = 20 + n_samples = 100 + random_state = check_random_state(random_state) + prec = make_sparse_spd_matrix(dim, alpha=0.95, random_state=random_state) + cov = linalg.inv(prec) + X = random_state.multivariate_normal(np.zeros(dim), cov, size=n_samples) + emp_cov = empirical_covariance(X) + + for alpha in (0.0, 0.1, 0.25): + covs = dict() + icovs = dict() + for method in ("cd", "lars"): + cov_, icov_, costs = graphical_lasso( + emp_cov, return_costs=True, alpha=alpha, mode=method + ) + covs[method] = cov_ + icovs[method] = icov_ + costs, dual_gap = np.array(costs).T + # Check that the costs always decrease (doesn't hold if alpha == 0) + if not alpha == 0: + # use 1e-12 since the cost can be exactly 0 + assert_array_less(np.diff(costs), 1e-12) + # Check that the 2 approaches give similar results + assert_allclose(covs["cd"], covs["lars"], atol=5e-4) + assert_allclose(icovs["cd"], icovs["lars"], atol=5e-4) + + # Smoke test the estimator + model = GraphicalLasso(alpha=0.25).fit(X) + model.score(X) + assert_array_almost_equal(model.covariance_, covs["cd"], decimal=4) + assert_array_almost_equal(model.covariance_, covs["lars"], decimal=4) + + # For a centered matrix, assume_centered could be chosen True or False + # Check that this returns indeed the same result for centered data + Z = X - X.mean(0) + precs = list() + for assume_centered in (False, True): + prec_ = GraphicalLasso(assume_centered=assume_centered).fit(Z).precision_ + precs.append(prec_) + assert_array_almost_equal(precs[0], precs[1]) + + +def test_graphical_lasso_when_alpha_equals_0(): + """Test graphical_lasso's early return condition when alpha=0.""" + X = np.random.randn(100, 10) + emp_cov = empirical_covariance(X, assume_centered=True) + + model = GraphicalLasso(alpha=0, covariance="precomputed").fit(emp_cov) + assert_allclose(model.precision_, np.linalg.inv(emp_cov)) + + _, precision = graphical_lasso(emp_cov, alpha=0) + assert_allclose(precision, np.linalg.inv(emp_cov)) + + +@pytest.mark.parametrize("mode", ["cd", "lars"]) +def test_graphical_lasso_n_iter(mode): + X, _ = datasets.make_classification(n_samples=5_000, n_features=20, random_state=0) + emp_cov = empirical_covariance(X) + + _, _, n_iter = graphical_lasso( + emp_cov, 0.2, mode=mode, max_iter=2, return_n_iter=True + ) + assert n_iter == 2 + + +def test_graphical_lasso_iris(): + # Hard-coded solution from R glasso package for alpha=1.0 + # (need to set penalize.diagonal to FALSE) + cov_R = np.array( + [ + [0.68112222, 0.0000000, 0.265820, 0.02464314], + [0.00000000, 0.1887129, 0.000000, 0.00000000], + [0.26582000, 0.0000000, 3.095503, 0.28697200], + [0.02464314, 0.0000000, 0.286972, 0.57713289], + ] + ) + icov_R = np.array( + [ + [1.5190747, 
0.000000, -0.1304475, 0.0000000], + [0.0000000, 5.299055, 0.0000000, 0.0000000], + [-0.1304475, 0.000000, 0.3498624, -0.1683946], + [0.0000000, 0.000000, -0.1683946, 1.8164353], + ] + ) + X = datasets.load_iris().data + emp_cov = empirical_covariance(X) + for method in ("cd", "lars"): + cov, icov = graphical_lasso(emp_cov, alpha=1.0, return_costs=False, mode=method) + assert_array_almost_equal(cov, cov_R) + assert_array_almost_equal(icov, icov_R) + + +def test_graph_lasso_2D(): + # Hard-coded solution from Python skggm package + # obtained by calling `quic(emp_cov, lam=.1, tol=1e-8)` + cov_skggm = np.array([[3.09550269, 1.186972], [1.186972, 0.57713289]]) + + icov_skggm = np.array([[1.52836773, -3.14334831], [-3.14334831, 8.19753385]]) + X = datasets.load_iris().data[:, 2:] + emp_cov = empirical_covariance(X) + for method in ("cd", "lars"): + cov, icov = graphical_lasso(emp_cov, alpha=0.1, return_costs=False, mode=method) + assert_array_almost_equal(cov, cov_skggm) + assert_array_almost_equal(icov, icov_skggm) + + +def test_graphical_lasso_iris_singular(): + # Small subset of rows to test the rank-deficient case + # Need to choose samples such that none of the variances are zero + indices = np.arange(10, 13) + + # Hard-coded solution from R glasso package for alpha=0.01 + cov_R = np.array( + [ + [0.08, 0.056666662595, 0.00229729713223, 0.00153153142149], + [0.056666662595, 0.082222222222, 0.00333333333333, 0.00222222222222], + [0.002297297132, 0.003333333333, 0.00666666666667, 0.00009009009009], + [0.001531531421, 0.002222222222, 0.00009009009009, 0.00222222222222], + ] + ) + icov_R = np.array( + [ + [24.42244057, -16.831679593, 0.0, 0.0], + [-16.83168201, 24.351841681, -6.206896552, -12.5], + [0.0, -6.206896171, 153.103448276, 0.0], + [0.0, -12.499999143, 0.0, 462.5], + ] + ) + X = datasets.load_iris().data[indices, :] + emp_cov = empirical_covariance(X) + for method in ("cd", "lars"): + cov, icov = graphical_lasso( + emp_cov, alpha=0.01, return_costs=False, mode=method + ) + assert_array_almost_equal(cov, cov_R, decimal=5) + assert_array_almost_equal(icov, icov_R, decimal=5) + + +def test_graphical_lasso_cv(random_state=1): + # Sample data from a sparse multivariate normal + dim = 5 + n_samples = 6 + random_state = check_random_state(random_state) + prec = make_sparse_spd_matrix(dim, alpha=0.96, random_state=random_state) + cov = linalg.inv(prec) + X = random_state.multivariate_normal(np.zeros(dim), cov, size=n_samples) + # Capture stdout, to smoke test the verbose mode + orig_stdout = sys.stdout + try: + sys.stdout = StringIO() + # We need verbose very high so that Parallel prints on stdout + GraphicalLassoCV(verbose=100, alphas=5, tol=1e-1).fit(X) + finally: + sys.stdout = orig_stdout + + +@pytest.mark.parametrize("alphas_container_type", ["list", "tuple", "array"]) +def test_graphical_lasso_cv_alphas_iterable(alphas_container_type): + """Check that we can pass an array-like to `alphas`. 
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/22489 + """ + true_cov = np.array( + [ + [0.8, 0.0, 0.2, 0.0], + [0.0, 0.4, 0.0, 0.0], + [0.2, 0.0, 0.3, 0.1], + [0.0, 0.0, 0.1, 0.7], + ] + ) + rng = np.random.RandomState(0) + X = rng.multivariate_normal(mean=[0, 0, 0, 0], cov=true_cov, size=200) + alphas = _convert_container([0.02, 0.03], alphas_container_type) + GraphicalLassoCV(alphas=alphas, tol=1e-1, n_jobs=1).fit(X) + + +@pytest.mark.parametrize( + "alphas,err_type,err_msg", + [ + ([-0.02, 0.03], ValueError, "must be > 0"), + ([0, 0.03], ValueError, "must be > 0"), + (["not_number", 0.03], TypeError, "must be an instance of float"), + ], +) +def test_graphical_lasso_cv_alphas_invalid_array(alphas, err_type, err_msg): + """Check that if an array-like containing a value + outside of (0, inf] is passed to `alphas`, a ValueError is raised. + Check if a string is passed, a TypeError is raised. + """ + true_cov = np.array( + [ + [0.8, 0.0, 0.2, 0.0], + [0.0, 0.4, 0.0, 0.0], + [0.2, 0.0, 0.3, 0.1], + [0.0, 0.0, 0.1, 0.7], + ] + ) + rng = np.random.RandomState(0) + X = rng.multivariate_normal(mean=[0, 0, 0, 0], cov=true_cov, size=200) + + with pytest.raises(err_type, match=err_msg): + GraphicalLassoCV(alphas=alphas, tol=1e-1, n_jobs=1).fit(X) + + +def test_graphical_lasso_cv_scores(): + splits = 4 + n_alphas = 5 + n_refinements = 3 + true_cov = np.array( + [ + [0.8, 0.0, 0.2, 0.0], + [0.0, 0.4, 0.0, 0.0], + [0.2, 0.0, 0.3, 0.1], + [0.0, 0.0, 0.1, 0.7], + ] + ) + rng = np.random.RandomState(0) + X = rng.multivariate_normal(mean=[0, 0, 0, 0], cov=true_cov, size=200) + cov = GraphicalLassoCV(cv=splits, alphas=n_alphas, n_refinements=n_refinements).fit( + X + ) + + _assert_graphical_lasso_cv_scores( + cov=cov, + n_splits=splits, + n_refinements=n_refinements, + n_alphas=n_alphas, + ) + + +@config_context(enable_metadata_routing=True) +def test_graphical_lasso_cv_scores_with_routing(global_random_seed): + """Check that `GraphicalLassoCV` internally dispatches metadata to + the splitter. 
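
The cross-validation bookkeeping these tests assert on can be seen end to end on the same four-feature covariance; a small illustrative sketch (not part of the test suite) that fits `GraphicalLassoCV` and reads back the selected penalty and the CV results:

    import numpy as np
    from sklearn.covariance import GraphicalLassoCV

    true_cov = np.array([
        [0.8, 0.0, 0.2, 0.0],
        [0.0, 0.4, 0.0, 0.0],
        [0.2, 0.0, 0.3, 0.1],
        [0.0, 0.0, 0.1, 0.7],
    ])
    rng = np.random.RandomState(0)
    X_sim = rng.multivariate_normal(mean=np.zeros(4), cov=true_cov, size=200)

    model = GraphicalLassoCV(alphas=4, cv=3).fit(X_sim)
    print(model.alpha_)                   # penalty selected by cross-validation
    print(list(model.cv_results_))        # 'alphas', per-split scores, mean/std scores
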
+ """ + splits = 5 + n_alphas = 5 + n_refinements = 3 + true_cov = np.array( + [ + [0.8, 0.0, 0.2, 0.0], + [0.0, 0.4, 0.0, 0.0], + [0.2, 0.0, 0.3, 0.1], + [0.0, 0.0, 0.1, 0.7], + ] + ) + rng = np.random.RandomState(global_random_seed) + X = rng.multivariate_normal(mean=[0, 0, 0, 0], cov=true_cov, size=300) + n_samples = X.shape[0] + groups = rng.randint(0, 5, n_samples) + params = {"groups": groups} + cv = GroupKFold(n_splits=splits) + cv.set_split_request(groups=True) + + cov = GraphicalLassoCV(cv=cv, alphas=n_alphas, n_refinements=n_refinements).fit( + X, **params + ) + + _assert_graphical_lasso_cv_scores( + cov=cov, + n_splits=splits, + n_refinements=n_refinements, + n_alphas=n_alphas, + ) + + +def _assert_graphical_lasso_cv_scores(cov, n_splits, n_refinements, n_alphas): + cv_results = cov.cv_results_ + # alpha and one for each split + + total_alphas = n_refinements * n_alphas + 1 + keys = ["alphas"] + split_keys = [f"split{i}_test_score" for i in range(n_splits)] + for key in keys + split_keys: + assert key in cv_results + assert len(cv_results[key]) == total_alphas + + cv_scores = np.asarray([cov.cv_results_[key] for key in split_keys]) + expected_mean = cv_scores.mean(axis=0) + expected_std = cv_scores.std(axis=0) + + assert_allclose(cov.cv_results_["mean_test_score"], expected_mean) + assert_allclose(cov.cv_results_["std_test_score"], expected_std) diff --git a/.venv/lib/python3.12/site-packages/sklearn/covariance/tests/test_robust_covariance.py b/.venv/lib/python3.12/site-packages/sklearn/covariance/tests/test_robust_covariance.py new file mode 100644 index 0000000000000000000000000000000000000000..a7bd3996b9e4bdc39af0f961976eb8b727c9a130 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/covariance/tests/test_robust_covariance.py @@ -0,0 +1,171 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import itertools + +import numpy as np +import pytest + +from sklearn import datasets +from sklearn.covariance import MinCovDet, empirical_covariance, fast_mcd +from sklearn.utils._testing import assert_array_almost_equal + +X = datasets.load_iris().data +X_1d = X[:, 0] +n_samples, n_features = X.shape + + +def test_mcd(global_random_seed): + # Tests the FastMCD algorithm implementation + # Small data set + # test without outliers (random independent normal data) + launch_mcd_on_dataset(100, 5, 0, 0.02, 0.1, 75, global_random_seed) + # test with a contaminated data set (medium contamination) + launch_mcd_on_dataset(100, 5, 20, 0.3, 0.3, 65, global_random_seed) + # test with a contaminated data set (strong contamination) + launch_mcd_on_dataset(100, 5, 40, 0.1, 0.1, 50, global_random_seed) + + # Medium data set + launch_mcd_on_dataset(1000, 5, 450, 0.1, 0.1, 540, global_random_seed) + + # Large data set + launch_mcd_on_dataset(1700, 5, 800, 0.1, 0.1, 870, global_random_seed) + + # 1D data set + launch_mcd_on_dataset(500, 1, 100, 0.02, 0.02, 350, global_random_seed) + + # n_samples == n_features + launch_mcd_on_dataset(20, 20, 0, 0.1, 0.1, 15, global_random_seed) + + +def test_fast_mcd_on_invalid_input(): + X = np.arange(100) + msg = "Expected 2D array, got 1D array instead" + with pytest.raises(ValueError, match=msg): + fast_mcd(X) + + +def test_mcd_class_on_invalid_input(): + X = np.arange(100) + mcd = MinCovDet() + msg = "Expected 2D array, got 1D array instead" + with pytest.raises(ValueError, match=msg): + mcd.fit(X) + + +def launch_mcd_on_dataset( + n_samples, n_features, n_outliers, tol_loc, tol_cov, tol_support, seed +): + rand_gen = 
np.random.RandomState(seed) + data = rand_gen.randn(n_samples, n_features) + # add some outliers + outliers_index = rand_gen.permutation(n_samples)[:n_outliers] + outliers_offset = 10.0 * (rand_gen.randint(2, size=(n_outliers, n_features)) - 0.5) + data[outliers_index] += outliers_offset + inliers_mask = np.ones(n_samples).astype(bool) + inliers_mask[outliers_index] = False + + pure_data = data[inliers_mask] + # compute MCD by fitting an object + mcd_fit = MinCovDet(random_state=seed).fit(data) + T = mcd_fit.location_ + S = mcd_fit.covariance_ + H = mcd_fit.support_ + # compare with the estimates learnt from the inliers + error_location = np.mean((pure_data.mean(0) - T) ** 2) + assert error_location < tol_loc + error_cov = np.mean((empirical_covariance(pure_data) - S) ** 2) + assert error_cov < tol_cov + assert np.sum(H) >= tol_support + assert_array_almost_equal(mcd_fit.mahalanobis(data), mcd_fit.dist_) + + +def test_mcd_issue1127(): + # Check that the code does not break with X.shape = (3, 1) + # (i.e. n_support = n_samples) + rnd = np.random.RandomState(0) + X = rnd.normal(size=(3, 1)) + mcd = MinCovDet() + mcd.fit(X) + + +def test_mcd_issue3367(global_random_seed): + # Check that MCD completes when the covariance matrix is singular + # i.e. one of the rows and columns are all zeros + rand_gen = np.random.RandomState(global_random_seed) + + # Think of these as the values for X and Y -> 10 values between -5 and 5 + data_values = np.linspace(-5, 5, 10).tolist() + # Get the cartesian product of all possible coordinate pairs from above set + data = np.array(list(itertools.product(data_values, data_values))) + + # Add a third column that's all zeros to make our data a set of point + # within a plane, which means that the covariance matrix will be singular + data = np.hstack((data, np.zeros((data.shape[0], 1)))) + + # The below line of code should raise an exception if the covariance matrix + # is singular. As a further test, since we have points in XYZ, the + # principle components (Eigenvectors) of these directly relate to the + # geometry of the points. Since it's a plane, we should be able to test + # that the Eigenvector that corresponds to the smallest Eigenvalue is the + # plane normal, specifically [0, 0, 1], since everything is in the XY plane + # (as I've set it up above). To do this one would start by: + # + # evals, evecs = np.linalg.eigh(mcd_fit.covariance_) + # normal = evecs[:, np.argmin(evals)] + # + # After which we need to assert that our `normal` is equal to [0, 0, 1]. + # Do note that there is floating point error associated with this, so it's + # best to subtract the two and then compare some small tolerance (e.g. + # 1e-12). + MinCovDet(random_state=rand_gen).fit(data) + + +def test_mcd_support_covariance_is_zero(): + # Check that MCD returns a ValueError with informative message when the + # covariance of the support data is equal to 0. + X_1 = np.array([0.5, 0.1, 0.1, 0.1, 0.957, 0.1, 0.1, 0.1, 0.4285, 0.1]) + X_1 = X_1.reshape(-1, 1) + X_2 = np.array([0.5, 0.3, 0.3, 0.3, 0.957, 0.3, 0.3, 0.3, 0.4285, 0.3]) + X_2 = X_2.reshape(-1, 1) + msg = ( + "The covariance matrix of the support data is equal to 0, try to " + "increase support_fraction" + ) + for X in [X_1, X_2]: + with pytest.raises(ValueError, match=msg): + MinCovDet().fit(X) + + +def test_mcd_increasing_det_warning(global_random_seed): + # Check that a warning is raised if we observe increasing determinants + # during the c_step. In theory the sequence of determinants should be + # decreasing. 
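
Stepping back from the individual regression tests: the pattern used by `launch_mcd_on_dataset` above, fitting MinCovDet on contaminated data and comparing it with the plain empirical estimate, can be condensed into a small illustrative sketch (not part of the test suite):

    import numpy as np
    from sklearn.covariance import EmpiricalCovariance, MinCovDet

    rng = np.random.RandomState(0)
    X_mix = rng.randn(100, 2)
    X_mix[:10] += 10.0                     # shift 10 samples far away as outliers

    robust = MinCovDet(random_state=0).fit(X_mix)
    mle = EmpiricalCovariance().fit(X_mix)

    print(robust.location_)                # stays close to the inlier mean [0, 0]
    print(mle.location_)                   # pulled towards the outliers
    print(int(robust.support_.sum()))      # samples used for the robust estimate
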
Increasing determinants are likely due to ill-conditioned + # covariance matrices that result in poor precision matrices. + + X = [ + [5.1, 3.5, 1.4, 0.2], + [4.9, 3.0, 1.4, 0.2], + [4.7, 3.2, 1.3, 0.2], + [4.6, 3.1, 1.5, 0.2], + [5.0, 3.6, 1.4, 0.2], + [4.6, 3.4, 1.4, 0.3], + [5.0, 3.4, 1.5, 0.2], + [4.4, 2.9, 1.4, 0.2], + [4.9, 3.1, 1.5, 0.1], + [5.4, 3.7, 1.5, 0.2], + [4.8, 3.4, 1.6, 0.2], + [4.8, 3.0, 1.4, 0.1], + [4.3, 3.0, 1.1, 0.1], + [5.1, 3.5, 1.4, 0.3], + [5.7, 3.8, 1.7, 0.3], + [5.4, 3.4, 1.7, 0.2], + [4.6, 3.6, 1.0, 0.2], + [5.0, 3.0, 1.6, 0.2], + [5.2, 3.5, 1.5, 0.2], + ] + + mcd = MinCovDet(support_fraction=0.5, random_state=global_random_seed) + warn_msg = "Determinant has increased" + with pytest.warns(RuntimeWarning, match=warn_msg): + mcd.fit(X) diff --git a/.venv/lib/python3.12/site-packages/sklearn/cross_decomposition/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/cross_decomposition/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f78f33811e5c7bfd26fac6dda83022e4d8719191 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cross_decomposition/__init__.py @@ -0,0 +1,8 @@ +"""Algorithms for cross decomposition.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ._pls import CCA, PLSSVD, PLSCanonical, PLSRegression + +__all__ = ["CCA", "PLSSVD", "PLSCanonical", "PLSRegression"] diff --git a/.venv/lib/python3.12/site-packages/sklearn/cross_decomposition/_pls.py b/.venv/lib/python3.12/site-packages/sklearn/cross_decomposition/_pls.py new file mode 100644 index 0000000000000000000000000000000000000000..0bf6ec8f01d065f7f170f278c3ba87a0b0ce9823 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cross_decomposition/_pls.py @@ -0,0 +1,1097 @@ +""" +The :mod:`sklearn.pls` module implements Partial Least Squares (PLS). +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from abc import ABCMeta, abstractmethod +from numbers import Integral, Real + +import numpy as np +from scipy.linalg import pinv, svd + +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + MultiOutputMixin, + RegressorMixin, + TransformerMixin, + _fit_context, +) +from ..exceptions import ConvergenceWarning +from ..utils import check_array, check_consistent_length +from ..utils._param_validation import Interval, StrOptions +from ..utils.extmath import svd_flip +from ..utils.validation import FLOAT_DTYPES, check_is_fitted, validate_data + +__all__ = ["PLSSVD", "PLSCanonical", "PLSRegression"] + + +def _pinv2_old(a): + # Used previous scipy pinv2 that was updated in: + # https://github.com/scipy/scipy/pull/10067 + # We can not set `cond` or `rcond` for pinv2 in scipy >= 1.3 to keep the + # same behavior of pinv2 for scipy < 1.3, because the condition used to + # determine the rank is dependent on the output of svd. + u, s, vh = svd(a, full_matrices=False, check_finite=False) + + t = u.dtype.char.lower() + factor = {"f": 1e3, "d": 1e6} + cond = np.max(s) * factor[t] * np.finfo(t).eps + rank = np.sum(s > cond) + + u = u[:, :rank] + u /= s[:rank] + return np.transpose(np.conjugate(np.dot(u, vh[:rank]))) + + +def _get_first_singular_vectors_power_method( + X, y, mode="A", max_iter=500, tol=1e-06, norm_y_weights=False +): + """Return the first left and right singular vectors of X'y. + + Provides an alternative to the svd(X'y) and uses the power method instead. 
+ With norm_y_weights to True and in mode A, this corresponds to the + algorithm section 11.3 of the Wegelin's review, except this starts at the + "update saliences" part. + """ + + eps = np.finfo(X.dtype).eps + try: + y_score = next(col for col in y.T if np.any(np.abs(col) > eps)) + except StopIteration as e: + raise StopIteration("y residual is constant") from e + + x_weights_old = 100 # init to big value for first convergence check + + if mode == "B": + # Precompute pseudo inverse matrices + # Basically: X_pinv = (X.T X)^-1 X.T + # Which requires inverting a (n_features, n_features) matrix. + # As a result, and as detailed in the Wegelin's review, CCA (i.e. mode + # B) will be unstable if n_features > n_samples or n_targets > + # n_samples + X_pinv, y_pinv = _pinv2_old(X), _pinv2_old(y) + + for i in range(max_iter): + if mode == "B": + x_weights = np.dot(X_pinv, y_score) + else: + x_weights = np.dot(X.T, y_score) / np.dot(y_score, y_score) + + x_weights /= np.sqrt(np.dot(x_weights, x_weights)) + eps + x_score = np.dot(X, x_weights) + + if mode == "B": + y_weights = np.dot(y_pinv, x_score) + else: + y_weights = np.dot(y.T, x_score) / np.dot(x_score.T, x_score) + + if norm_y_weights: + y_weights /= np.sqrt(np.dot(y_weights, y_weights)) + eps + + y_score = np.dot(y, y_weights) / (np.dot(y_weights, y_weights) + eps) + + x_weights_diff = x_weights - x_weights_old + if np.dot(x_weights_diff, x_weights_diff) < tol or y.shape[1] == 1: + break + x_weights_old = x_weights + + n_iter = i + 1 + if n_iter == max_iter: + warnings.warn("Maximum number of iterations reached", ConvergenceWarning) + + return x_weights, y_weights, n_iter + + +def _get_first_singular_vectors_svd(X, y): + """Return the first left and right singular vectors of X'y. + + Here the whole SVD is computed. + """ + C = np.dot(X.T, y) + U, _, Vt = svd(C, full_matrices=False) + return U[:, 0], Vt[0, :] + + +def _center_scale_xy(X, y, scale=True): + """Center X, y and scale if the scale parameter==True + + Returns + ------- + X, y, x_mean, y_mean, x_std, y_std + """ + # center + x_mean = X.mean(axis=0) + X -= x_mean + y_mean = y.mean(axis=0) + y -= y_mean + # scale + if scale: + x_std = X.std(axis=0, ddof=1) + x_std[x_std == 0.0] = 1.0 + X /= x_std + y_std = y.std(axis=0, ddof=1) + y_std[y_std == 0.0] = 1.0 + y /= y_std + else: + x_std = np.ones(X.shape[1]) + y_std = np.ones(y.shape[1]) + return X, y, x_mean, y_mean, x_std, y_std + + +def _svd_flip_1d(u, v): + """Same as svd_flip but works on 1d arrays, and is inplace""" + # svd_flip would force us to convert to 2d array and would also return 2d + # arrays. We don't want that. + biggest_abs_val_idx = np.argmax(np.abs(u)) + sign = np.sign(u[biggest_abs_val_idx]) + u *= sign + v *= sign + + +class _PLS( + ClassNamePrefixFeaturesOutMixin, + TransformerMixin, + RegressorMixin, + MultiOutputMixin, + BaseEstimator, + metaclass=ABCMeta, +): + """Partial Least Squares (PLS) + + This class implements the generic PLS algorithm. 
+ + Main ref: Wegelin, a survey of Partial Least Squares (PLS) methods, + with emphasis on the two-block case + https://stat.uw.edu/sites/default/files/files/reports/2000/tr371.pdf + """ + + _parameter_constraints: dict = { + "n_components": [Interval(Integral, 1, None, closed="left")], + "scale": ["boolean"], + "deflation_mode": [StrOptions({"regression", "canonical"})], + "mode": [StrOptions({"A", "B"})], + "algorithm": [StrOptions({"svd", "nipals"})], + "max_iter": [Interval(Integral, 1, None, closed="left")], + "tol": [Interval(Real, 0, None, closed="left")], + "copy": ["boolean"], + } + + @abstractmethod + def __init__( + self, + n_components=2, + *, + scale=True, + deflation_mode="regression", + mode="A", + algorithm="nipals", + max_iter=500, + tol=1e-06, + copy=True, + ): + self.n_components = n_components + self.deflation_mode = deflation_mode + self.mode = mode + self.scale = scale + self.algorithm = algorithm + self.max_iter = max_iter + self.tol = tol + self.copy = copy + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y): + """Fit model to data. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of predictors. + + y : array-like of shape (n_samples,) or (n_samples, n_targets) + Target vectors, where `n_samples` is the number of samples and + `n_targets` is the number of response variables. + + Returns + ------- + self : object + Fitted model. + """ + check_consistent_length(X, y) + X = validate_data( + self, + X, + dtype=np.float64, + force_writeable=True, + copy=self.copy, + ensure_min_samples=2, + ) + y = check_array( + y, + input_name="y", + dtype=np.float64, + force_writeable=True, + copy=self.copy, + ensure_2d=False, + ) + if y.ndim == 1: + self._predict_1d = True + y = y.reshape(-1, 1) + else: + self._predict_1d = False + + n = X.shape[0] + p = X.shape[1] + q = y.shape[1] + + n_components = self.n_components + # With PLSRegression n_components is bounded by the rank of (X.T X) see + # Wegelin page 25. With CCA and PLSCanonical, n_components is bounded + # by the rank of X and the rank of y: see Wegelin page 12 + rank_upper_bound = ( + min(n, p) if self.deflation_mode == "regression" else min(n, p, q) + ) + if n_components > rank_upper_bound: + raise ValueError( + f"`n_components` upper bound is {rank_upper_bound}. " + f"Got {n_components} instead. Reduce `n_components`." + ) + + self._norm_y_weights = self.deflation_mode == "canonical" # 1.1 + norm_y_weights = self._norm_y_weights + + # Scale (in place) + Xk, yk, self._x_mean, self._y_mean, self._x_std, self._y_std = _center_scale_xy( + X, y, self.scale + ) + + self.x_weights_ = np.zeros((p, n_components)) # U + self.y_weights_ = np.zeros((q, n_components)) # V + self._x_scores = np.zeros((n, n_components)) # Xi + self._y_scores = np.zeros((n, n_components)) # Omega + self.x_loadings_ = np.zeros((p, n_components)) # Gamma + self.y_loadings_ = np.zeros((q, n_components)) # Delta + self.n_iter_ = [] + + # This whole thing corresponds to the algorithm in section 4.1 of the + # review from Wegelin. See above for a notation mapping from code to + # paper. + y_eps = np.finfo(yk.dtype).eps + for k in range(n_components): + # Find first left and right singular vectors of the X.T.dot(y) + # cross-covariance matrix. 
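
Both branches below estimate the same pair of singular vectors, the power method iteratively and `_get_first_singular_vectors_svd` via a full SVD, so with default settings `PLSCanonical(algorithm='nipals')` and `PLSCanonical(algorithm='svd')` produce essentially the same weights. A quick illustrative sketch (not part of this module):

    import numpy as np
    from sklearn.cross_decomposition import PLSCanonical

    rng = np.random.RandomState(0)
    X_demo = rng.randn(30, 5)
    Y_demo = X_demo @ rng.randn(5, 3) + 0.1 * rng.randn(30, 3)

    nipals = PLSCanonical(n_components=2, algorithm="nipals").fit(X_demo, Y_demo)
    exact = PLSCanonical(n_components=2, algorithm="svd").fit(X_demo, Y_demo)

    # Weight columns are unit-norm and sign-aligned by _svd_flip_1d, so matching
    # components should have cosine similarity close to 1.
    cosines = np.sum(nipals.x_weights_ * exact.x_weights_, axis=0)
    assert np.all(cosines > 0.99)
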
+ if self.algorithm == "nipals": + # Replace columns that are all close to zero with zeros + yk_mask = np.all(np.abs(yk) < 10 * y_eps, axis=0) + yk[:, yk_mask] = 0.0 + + try: + ( + x_weights, + y_weights, + n_iter_, + ) = _get_first_singular_vectors_power_method( + Xk, + yk, + mode=self.mode, + max_iter=self.max_iter, + tol=self.tol, + norm_y_weights=norm_y_weights, + ) + except StopIteration as e: + if str(e) != "y residual is constant": + raise + warnings.warn(f"y residual is constant at iteration {k}") + break + + self.n_iter_.append(n_iter_) + + elif self.algorithm == "svd": + x_weights, y_weights = _get_first_singular_vectors_svd(Xk, yk) + + # inplace sign flip for consistency across solvers and archs + _svd_flip_1d(x_weights, y_weights) + + # compute scores, i.e. the projections of X and y + x_scores = np.dot(Xk, x_weights) + if norm_y_weights: + y_ss = 1 + else: + y_ss = np.dot(y_weights, y_weights) + y_scores = np.dot(yk, y_weights) / y_ss + + # Deflation: subtract rank-one approx to obtain Xk+1 and yk+1 + x_loadings = np.dot(x_scores, Xk) / np.dot(x_scores, x_scores) + Xk -= np.outer(x_scores, x_loadings) + + if self.deflation_mode == "canonical": + # regress yk on y_score + y_loadings = np.dot(y_scores, yk) / np.dot(y_scores, y_scores) + yk -= np.outer(y_scores, y_loadings) + if self.deflation_mode == "regression": + # regress yk on x_score + y_loadings = np.dot(x_scores, yk) / np.dot(x_scores, x_scores) + yk -= np.outer(x_scores, y_loadings) + + self.x_weights_[:, k] = x_weights + self.y_weights_[:, k] = y_weights + self._x_scores[:, k] = x_scores + self._y_scores[:, k] = y_scores + self.x_loadings_[:, k] = x_loadings + self.y_loadings_[:, k] = y_loadings + + # X was approximated as Xi . Gamma.T + X_(R+1) + # Xi . Gamma.T is a sum of n_components rank-1 matrices. X_(R+1) is + # whatever is left to fully reconstruct X, and can be 0 if X is of rank + # n_components. + # Similarly, y was approximated as Omega . Delta.T + y_(R+1) + + # Compute transformation matrices (rotations_). See User Guide. + self.x_rotations_ = np.dot( + self.x_weights_, + pinv(np.dot(self.x_loadings_.T, self.x_weights_), check_finite=False), + ) + self.y_rotations_ = np.dot( + self.y_weights_, + pinv(np.dot(self.y_loadings_.T, self.y_weights_), check_finite=False), + ) + self.coef_ = np.dot(self.x_rotations_, self.y_loadings_.T) + self.coef_ = (self.coef_ * self._y_std).T / self._x_std + self.intercept_ = self._y_mean + self._n_features_out = self.x_rotations_.shape[1] + return self + + def transform(self, X, y=None, copy=True): + """Apply the dimension reduction. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Samples to transform. + + y : array-like of shape (n_samples, n_targets), default=None + Target vectors. + + copy : bool, default=True + Whether to copy `X` and `y`, or perform in-place normalization. + + Returns + ------- + x_scores, y_scores : array-like or tuple of array-like + Return `x_scores` if `y` is not given, `(x_scores, y_scores)` otherwise. 
+ """ + check_is_fitted(self) + X = validate_data(self, X, copy=copy, dtype=FLOAT_DTYPES, reset=False) + # Normalize + X -= self._x_mean + X /= self._x_std + # Apply rotation + x_scores = np.dot(X, self.x_rotations_) + if y is not None: + y = check_array( + y, input_name="y", ensure_2d=False, copy=copy, dtype=FLOAT_DTYPES + ) + if y.ndim == 1: + y = y.reshape(-1, 1) + y -= self._y_mean + y /= self._y_std + y_scores = np.dot(y, self.y_rotations_) + return x_scores, y_scores + + return x_scores + + def inverse_transform(self, X, y=None): + """Transform data back to its original space. + + Parameters + ---------- + X : array-like of shape (n_samples, n_components) + New data, where `n_samples` is the number of samples + and `n_components` is the number of pls components. + + y : array-like of shape (n_samples,) or (n_samples, n_components) + New target, where `n_samples` is the number of samples + and `n_components` is the number of pls components. + + Returns + ------- + X_original : ndarray of shape (n_samples, n_features) + Return the reconstructed `X` data. + + y_original : ndarray of shape (n_samples, n_targets) + Return the reconstructed `X` target. Only returned when `y` is given. + + Notes + ----- + This transformation will only be exact if `n_components=n_features`. + """ + check_is_fitted(self) + X = check_array(X, input_name="X", dtype=FLOAT_DTYPES) + # From pls space to original space + X_reconstructed = np.matmul(X, self.x_loadings_.T) + # Denormalize + X_reconstructed *= self._x_std + X_reconstructed += self._x_mean + + if y is not None: + y = check_array(y, input_name="y", dtype=FLOAT_DTYPES) + # From pls space to original space + y_reconstructed = np.matmul(y, self.y_loadings_.T) + # Denormalize + y_reconstructed *= self._y_std + y_reconstructed += self._y_mean + return X_reconstructed, y_reconstructed + + return X_reconstructed + + def predict(self, X, copy=True): + """Predict targets of given samples. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Samples. + + copy : bool, default=True + Whether to copy `X` or perform in-place normalization. + + Returns + ------- + y_pred : ndarray of shape (n_samples,) or (n_samples, n_targets) + Returns predicted values. + + Notes + ----- + This call requires the estimation of a matrix of shape + `(n_features, n_targets)`, which may be an issue in high dimensional + space. + """ + check_is_fitted(self) + X = validate_data(self, X, copy=copy, dtype=FLOAT_DTYPES, reset=False) + # Only center X but do not scale it since the coefficients are already scaled + X -= self._x_mean + y_pred = X @ self.coef_.T + self.intercept_ + return y_pred.ravel() if self._predict_1d else y_pred + + def fit_transform(self, X, y=None): + """Learn and apply the dimension reduction on the train data. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of predictors. + + y : array-like of shape (n_samples, n_targets), default=None + Target vectors, where `n_samples` is the number of samples and + `n_targets` is the number of response variables. + + Returns + ------- + self : ndarray of shape (n_samples, n_components) + Return `x_scores` if `y` is not given, `(x_scores, y_scores)` otherwise. 
+ """ + return self.fit(X, y).transform(X, y) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.regressor_tags.poor_score = True + tags.target_tags.required = False + return tags + + +class PLSRegression(_PLS): + """PLS regression. + + PLSRegression is also known as PLS2 or PLS1, depending on the number of + targets. + + For a comparison between other cross decomposition algorithms, see + :ref:`sphx_glr_auto_examples_cross_decomposition_plot_compare_cross_decomposition.py`. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.8 + + Parameters + ---------- + n_components : int, default=2 + Number of components to keep. Should be in `[1, n_features]`. + + scale : bool, default=True + Whether to scale `X` and `y`. + + max_iter : int, default=500 + The maximum number of iterations of the power method when + `algorithm='nipals'`. Ignored otherwise. + + tol : float, default=1e-06 + The tolerance used as convergence criteria in the power method: the + algorithm stops whenever the squared norm of `u_i - u_{i-1}` is less + than `tol`, where `u` corresponds to the left singular vector. + + copy : bool, default=True + Whether to copy `X` and `y` in :term:`fit` before applying centering, + and potentially scaling. If `False`, these operations will be done + inplace, modifying both arrays. + + Attributes + ---------- + x_weights_ : ndarray of shape (n_features, n_components) + The left singular vectors of the cross-covariance matrices of each + iteration. + + y_weights_ : ndarray of shape (n_targets, n_components) + The right singular vectors of the cross-covariance matrices of each + iteration. + + x_loadings_ : ndarray of shape (n_features, n_components) + The loadings of `X`. + + y_loadings_ : ndarray of shape (n_targets, n_components) + The loadings of `y`. + + x_scores_ : ndarray of shape (n_samples, n_components) + The transformed training samples. + + y_scores_ : ndarray of shape (n_samples, n_components) + The transformed training targets. + + x_rotations_ : ndarray of shape (n_features, n_components) + The projection matrix used to transform `X`. + + y_rotations_ : ndarray of shape (n_targets, n_components) + The projection matrix used to transform `y`. + + coef_ : ndarray of shape (n_target, n_features) + The coefficients of the linear model such that `y` is approximated as + `y = X @ coef_.T + intercept_`. + + intercept_ : ndarray of shape (n_targets,) + The intercepts of the linear model such that `y` is approximated as + `y = X @ coef_.T + intercept_`. + + .. versionadded:: 1.1 + + n_iter_ : list of shape (n_components,) + Number of iterations of the power method, for each + component. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + PLSCanonical : Partial Least Squares transformer and regressor. + + Examples + -------- + >>> from sklearn.cross_decomposition import PLSRegression + >>> X = [[0., 0., 1.], [1.,0.,0.], [2.,2.,2.], [2.,5.,4.]] + >>> y = [[0.1, -0.2], [0.9, 1.1], [6.2, 5.9], [11.9, 12.3]] + >>> pls2 = PLSRegression(n_components=2) + >>> pls2.fit(X, y) + PLSRegression() + >>> y_pred = pls2.predict(X) + + For a comparison between PLS Regression and :class:`~sklearn.decomposition.PCA`, see + :ref:`sphx_glr_auto_examples_cross_decomposition_plot_pcr_vs_pls.py`. 
+ """ + + _parameter_constraints: dict = {**_PLS._parameter_constraints} + for param in ("deflation_mode", "mode", "algorithm"): + _parameter_constraints.pop(param) + + # This implementation provides the same results that 3 PLS packages + # provided in the R language (R-project): + # - "mixOmics" with function pls(X, y, mode = "regression") + # - "plspm " with function plsreg2(X, y) + # - "pls" with function oscorespls.fit(X, y) + + def __init__( + self, n_components=2, *, scale=True, max_iter=500, tol=1e-06, copy=True + ): + super().__init__( + n_components=n_components, + scale=scale, + deflation_mode="regression", + mode="A", + algorithm="nipals", + max_iter=max_iter, + tol=tol, + copy=copy, + ) + + def fit(self, X, y): + """Fit model to data. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of predictors. + + y : array-like of shape (n_samples,) or (n_samples, n_targets) + Target vectors, where `n_samples` is the number of samples and + `n_targets` is the number of response variables. + + Returns + ------- + self : object + Fitted model. + """ + super().fit(X, y) + # expose the fitted attributes `x_scores_` and `y_scores_` + self.x_scores_ = self._x_scores + self.y_scores_ = self._y_scores + return self + + +class PLSCanonical(_PLS): + """Partial Least Squares transformer and regressor. + + For a comparison between other cross decomposition algorithms, see + :ref:`sphx_glr_auto_examples_cross_decomposition_plot_compare_cross_decomposition.py`. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.8 + + Parameters + ---------- + n_components : int, default=2 + Number of components to keep. Should be in `[1, min(n_samples, + n_features, n_targets)]`. + + scale : bool, default=True + Whether to scale `X` and `y`. + + algorithm : {'nipals', 'svd'}, default='nipals' + The algorithm used to estimate the first singular vectors of the + cross-covariance matrix. 'nipals' uses the power method while 'svd' + will compute the whole SVD. + + max_iter : int, default=500 + The maximum number of iterations of the power method when + `algorithm='nipals'`. Ignored otherwise. + + tol : float, default=1e-06 + The tolerance used as convergence criteria in the power method: the + algorithm stops whenever the squared norm of `u_i - u_{i-1}` is less + than `tol`, where `u` corresponds to the left singular vector. + + copy : bool, default=True + Whether to copy `X` and `y` in fit before applying centering, and + potentially scaling. If False, these operations will be done inplace, + modifying both arrays. + + Attributes + ---------- + x_weights_ : ndarray of shape (n_features, n_components) + The left singular vectors of the cross-covariance matrices of each + iteration. + + y_weights_ : ndarray of shape (n_targets, n_components) + The right singular vectors of the cross-covariance matrices of each + iteration. + + x_loadings_ : ndarray of shape (n_features, n_components) + The loadings of `X`. + + y_loadings_ : ndarray of shape (n_targets, n_components) + The loadings of `y`. + + x_rotations_ : ndarray of shape (n_features, n_components) + The projection matrix used to transform `X`. + + y_rotations_ : ndarray of shape (n_targets, n_components) + The projection matrix used to transform `y`. + + coef_ : ndarray of shape (n_targets, n_features) + The coefficients of the linear model such that `y` is approximated as + `y = X @ coef_.T + intercept_`. 
+ + intercept_ : ndarray of shape (n_targets,) + The intercepts of the linear model such that `y` is approximated as + `y = X @ coef_.T + intercept_`. + + .. versionadded:: 1.1 + + n_iter_ : list of shape (n_components,) + Number of iterations of the power method, for each + component. Empty if `algorithm='svd'`. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + CCA : Canonical Correlation Analysis. + PLSSVD : Partial Least Square SVD. + + Examples + -------- + >>> from sklearn.cross_decomposition import PLSCanonical + >>> X = [[0., 0., 1.], [1.,0.,0.], [2.,2.,2.], [2.,5.,4.]] + >>> y = [[0.1, -0.2], [0.9, 1.1], [6.2, 5.9], [11.9, 12.3]] + >>> plsca = PLSCanonical(n_components=2) + >>> plsca.fit(X, y) + PLSCanonical() + >>> X_c, y_c = plsca.transform(X, y) + """ + + _parameter_constraints: dict = {**_PLS._parameter_constraints} + for param in ("deflation_mode", "mode"): + _parameter_constraints.pop(param) + + # This implementation provides the same results that the "plspm" package + # provided in the R language (R-project), using the function plsca(X, y). + # Results are equal or collinear with the function + # ``pls(..., mode = "canonical")`` of the "mixOmics" package. The + # difference relies in the fact that mixOmics implementation does not + # exactly implement the Wold algorithm since it does not normalize + # y_weights to one. + + def __init__( + self, + n_components=2, + *, + scale=True, + algorithm="nipals", + max_iter=500, + tol=1e-06, + copy=True, + ): + super().__init__( + n_components=n_components, + scale=scale, + deflation_mode="canonical", + mode="A", + algorithm=algorithm, + max_iter=max_iter, + tol=tol, + copy=copy, + ) + + +class CCA(_PLS): + """Canonical Correlation Analysis, also known as "Mode B" PLS. + + For a comparison between other cross decomposition algorithms, see + :ref:`sphx_glr_auto_examples_cross_decomposition_plot_compare_cross_decomposition.py`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int, default=2 + Number of components to keep. Should be in `[1, min(n_samples, + n_features, n_targets)]`. + + scale : bool, default=True + Whether to scale `X` and `y`. + + max_iter : int, default=500 + The maximum number of iterations of the power method. + + tol : float, default=1e-06 + The tolerance used as convergence criteria in the power method: the + algorithm stops whenever the squared norm of `u_i - u_{i-1}` is less + than `tol`, where `u` corresponds to the left singular vector. + + copy : bool, default=True + Whether to copy `X` and `y` in fit before applying centering, and + potentially scaling. If False, these operations will be done inplace, + modifying both arrays. + + Attributes + ---------- + x_weights_ : ndarray of shape (n_features, n_components) + The left singular vectors of the cross-covariance matrices of each + iteration. + + y_weights_ : ndarray of shape (n_targets, n_components) + The right singular vectors of the cross-covariance matrices of each + iteration. + + x_loadings_ : ndarray of shape (n_features, n_components) + The loadings of `X`. + + y_loadings_ : ndarray of shape (n_targets, n_components) + The loadings of `y`. + + x_rotations_ : ndarray of shape (n_features, n_components) + The projection matrix used to transform `X`. 
+ + y_rotations_ : ndarray of shape (n_targets, n_components) + The projection matrix used to transform `y`. + + coef_ : ndarray of shape (n_targets, n_features) + The coefficients of the linear model such that `y` is approximated as + `y = X @ coef_.T + intercept_`. + + intercept_ : ndarray of shape (n_targets,) + The intercepts of the linear model such that `y` is approximated as + `y = X @ coef_.T + intercept_`. + + .. versionadded:: 1.1 + + n_iter_ : list of shape (n_components,) + Number of iterations of the power method, for each + component. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + PLSCanonical : Partial Least Squares transformer and regressor. + PLSSVD : Partial Least Square SVD. + + Examples + -------- + >>> from sklearn.cross_decomposition import CCA + >>> X = [[0., 0., 1.], [1.,0.,0.], [2.,2.,2.], [3.,5.,4.]] + >>> y = [[0.1, -0.2], [0.9, 1.1], [6.2, 5.9], [11.9, 12.3]] + >>> cca = CCA(n_components=1) + >>> cca.fit(X, y) + CCA(n_components=1) + >>> X_c, y_c = cca.transform(X, y) + """ + + _parameter_constraints: dict = {**_PLS._parameter_constraints} + for param in ("deflation_mode", "mode", "algorithm"): + _parameter_constraints.pop(param) + + def __init__( + self, n_components=2, *, scale=True, max_iter=500, tol=1e-06, copy=True + ): + super().__init__( + n_components=n_components, + scale=scale, + deflation_mode="canonical", + mode="B", + algorithm="nipals", + max_iter=max_iter, + tol=tol, + copy=copy, + ) + + +class PLSSVD(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): + """Partial Least Square SVD. + + This transformer simply performs a SVD on the cross-covariance matrix + `X'y`. It is able to project both the training data `X` and the targets + `y`. The training data `X` is projected on the left singular vectors, while + the targets are projected on the right singular vectors. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.8 + + Parameters + ---------- + n_components : int, default=2 + The number of components to keep. Should be in `[1, + min(n_samples, n_features, n_targets)]`. + + scale : bool, default=True + Whether to scale `X` and `y`. + + copy : bool, default=True + Whether to copy `X` and `y` in fit before applying centering, and + potentially scaling. If `False`, these operations will be done inplace, + modifying both arrays. + + Attributes + ---------- + x_weights_ : ndarray of shape (n_features, n_components) + The left singular vectors of the SVD of the cross-covariance matrix. + Used to project `X` in :meth:`transform`. + + y_weights_ : ndarray of (n_targets, n_components) + The right singular vectors of the SVD of the cross-covariance matrix. + Used to project `X` in :meth:`transform`. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + PLSCanonical : Partial Least Squares transformer and regressor. + CCA : Canonical Correlation Analysis. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.cross_decomposition import PLSSVD + >>> X = np.array([[0., 0., 1.], + ... [1., 0., 0.], + ... [2., 2., 2.], + ... 
[2., 5., 4.]]) + >>> y = np.array([[0.1, -0.2], + ... [0.9, 1.1], + ... [6.2, 5.9], + ... [11.9, 12.3]]) + >>> pls = PLSSVD(n_components=2).fit(X, y) + >>> X_c, y_c = pls.transform(X, y) + >>> X_c.shape, y_c.shape + ((4, 2), (4, 2)) + """ + + _parameter_constraints: dict = { + "n_components": [Interval(Integral, 1, None, closed="left")], + "scale": ["boolean"], + "copy": ["boolean"], + } + + def __init__(self, n_components=2, *, scale=True, copy=True): + self.n_components = n_components + self.scale = scale + self.copy = copy + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y): + """Fit model to data. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training samples. + + y : array-like of shape (n_samples,) or (n_samples, n_targets) + Targets. + + Returns + ------- + self : object + Fitted estimator. + """ + check_consistent_length(X, y) + X = validate_data( + self, + X, + dtype=np.float64, + force_writeable=True, + copy=self.copy, + ensure_min_samples=2, + ) + y = check_array( + y, + input_name="y", + dtype=np.float64, + force_writeable=True, + copy=self.copy, + ensure_2d=False, + ) + if y.ndim == 1: + y = y.reshape(-1, 1) + + # we'll compute the SVD of the cross-covariance matrix = X.T.dot(y) + # This matrix rank is at most min(n_samples, n_features, n_targets) so + # n_components cannot be bigger than that. + n_components = self.n_components + rank_upper_bound = min(X.shape[0], X.shape[1], y.shape[1]) + if n_components > rank_upper_bound: + raise ValueError( + f"`n_components` upper bound is {rank_upper_bound}. " + f"Got {n_components} instead. Reduce `n_components`." + ) + + X, y, self._x_mean, self._y_mean, self._x_std, self._y_std = _center_scale_xy( + X, y, self.scale + ) + + # Compute SVD of cross-covariance matrix + C = np.dot(X.T, y) + U, s, Vt = svd(C, full_matrices=False) + U = U[:, :n_components] + Vt = Vt[:n_components] + U, Vt = svd_flip(U, Vt) + V = Vt.T + + self.x_weights_ = U + self.y_weights_ = V + self._n_features_out = self.x_weights_.shape[1] + return self + + def transform(self, X, y=None): + """ + Apply the dimensionality reduction. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Samples to be transformed. + + y : array-like of shape (n_samples,) or (n_samples, n_targets), \ + default=None + Targets. + + Returns + ------- + x_scores : array-like or tuple of array-like + The transformed data `X_transformed` if `y is not None`, + `(X_transformed, y_transformed)` otherwise. + """ + check_is_fitted(self) + X = validate_data(self, X, dtype=np.float64, reset=False) + Xr = (X - self._x_mean) / self._x_std + x_scores = np.dot(Xr, self.x_weights_) + if y is not None: + y = check_array(y, input_name="y", ensure_2d=False, dtype=np.float64) + if y.ndim == 1: + y = y.reshape(-1, 1) + yr = (y - self._y_mean) / self._y_std + y_scores = np.dot(yr, self.y_weights_) + return x_scores, y_scores + return x_scores + + def fit_transform(self, X, y=None): + """Learn and apply the dimensionality reduction. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training samples. + + y : array-like of shape (n_samples,) or (n_samples, n_targets), \ + default=None + Targets. + + Returns + ------- + out : array-like or tuple of array-like + The transformed data `X_transformed` if `y is not None`, + `(X_transformed, y_transformed)` otherwise. 
+ """ + return self.fit(X, y).transform(X, y) diff --git a/.venv/lib/python3.12/site-packages/sklearn/cross_decomposition/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/cross_decomposition/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/cross_decomposition/tests/test_pls.py b/.venv/lib/python3.12/site-packages/sklearn/cross_decomposition/tests/test_pls.py new file mode 100644 index 0000000000000000000000000000000000000000..7e516d71b6f988710b71c2b8d575a80e42e87d65 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cross_decomposition/tests/test_pls.py @@ -0,0 +1,677 @@ +import warnings + +import numpy as np +import pytest +from numpy.testing import assert_allclose, assert_array_almost_equal, assert_array_equal + +from sklearn.cross_decomposition import CCA, PLSSVD, PLSCanonical, PLSRegression +from sklearn.cross_decomposition._pls import ( + _center_scale_xy, + _get_first_singular_vectors_power_method, + _get_first_singular_vectors_svd, + _svd_flip_1d, +) +from sklearn.datasets import load_linnerud, make_regression +from sklearn.ensemble import VotingRegressor +from sklearn.exceptions import ConvergenceWarning +from sklearn.linear_model import LinearRegression +from sklearn.utils import check_random_state +from sklearn.utils.extmath import svd_flip + + +def assert_matrix_orthogonal(M): + K = np.dot(M.T, M) + assert_array_almost_equal(K, np.diag(np.diag(K))) + + +def test_pls_canonical_basics(): + # Basic checks for PLSCanonical + d = load_linnerud() + X = d.data + y = d.target + + pls = PLSCanonical(n_components=X.shape[1]) + pls.fit(X, y) + + assert_matrix_orthogonal(pls.x_weights_) + assert_matrix_orthogonal(pls.y_weights_) + assert_matrix_orthogonal(pls._x_scores) + assert_matrix_orthogonal(pls._y_scores) + + # Check X = TP' and y = UQ' + T = pls._x_scores + P = pls.x_loadings_ + U = pls._y_scores + Q = pls.y_loadings_ + # Need to scale first + Xc, yc, x_mean, y_mean, x_std, y_std = _center_scale_xy( + X.copy(), y.copy(), scale=True + ) + assert_array_almost_equal(Xc, np.dot(T, P.T)) + assert_array_almost_equal(yc, np.dot(U, Q.T)) + + # Check that rotations on training data lead to scores + Xt = pls.transform(X) + assert_array_almost_equal(Xt, pls._x_scores) + Xt, yt = pls.transform(X, y) + assert_array_almost_equal(Xt, pls._x_scores) + assert_array_almost_equal(yt, pls._y_scores) + + # Check that inverse_transform works + X_back = pls.inverse_transform(Xt) + assert_array_almost_equal(X_back, X) + _, y_back = pls.inverse_transform(Xt, yt) + assert_array_almost_equal(y_back, y) + + +def test_sanity_check_pls_regression(): + # Sanity check for PLSRegression + # The results were checked against the R-packages plspm, misOmics and pls + + d = load_linnerud() + X = d.data + y = d.target + + pls = PLSRegression(n_components=X.shape[1]) + X_trans, _ = pls.fit_transform(X, y) + + # FIXME: one would expect y_trans == pls.y_scores_ but this is not + # the case. 
+ # xref: https://github.com/scikit-learn/scikit-learn/issues/22420 + assert_allclose(X_trans, pls.x_scores_) + + expected_x_weights = np.array( + [ + [-0.61330704, -0.00443647, 0.78983213], + [-0.74697144, -0.32172099, -0.58183269], + [-0.25668686, 0.94682413, -0.19399983], + ] + ) + + expected_x_loadings = np.array( + [ + [-0.61470416, -0.24574278, 0.78983213], + [-0.65625755, -0.14396183, -0.58183269], + [-0.51733059, 1.00609417, -0.19399983], + ] + ) + + expected_y_weights = np.array( + [ + [+0.32456184, 0.29892183, 0.20316322], + [+0.42439636, 0.61970543, 0.19320542], + [-0.13143144, -0.26348971, -0.17092916], + ] + ) + + expected_y_loadings = np.array( + [ + [+0.32456184, 0.29892183, 0.20316322], + [+0.42439636, 0.61970543, 0.19320542], + [-0.13143144, -0.26348971, -0.17092916], + ] + ) + + assert_array_almost_equal(np.abs(pls.x_loadings_), np.abs(expected_x_loadings)) + assert_array_almost_equal(np.abs(pls.x_weights_), np.abs(expected_x_weights)) + assert_array_almost_equal(np.abs(pls.y_loadings_), np.abs(expected_y_loadings)) + assert_array_almost_equal(np.abs(pls.y_weights_), np.abs(expected_y_weights)) + + # The R / Python difference in the signs should be consistent across + # loadings, weights, etc. + x_loadings_sign_flip = np.sign(pls.x_loadings_ / expected_x_loadings) + x_weights_sign_flip = np.sign(pls.x_weights_ / expected_x_weights) + y_weights_sign_flip = np.sign(pls.y_weights_ / expected_y_weights) + y_loadings_sign_flip = np.sign(pls.y_loadings_ / expected_y_loadings) + assert_array_almost_equal(x_loadings_sign_flip, x_weights_sign_flip) + assert_array_almost_equal(y_loadings_sign_flip, y_weights_sign_flip) + + +def test_sanity_check_pls_regression_constant_column_y(): + # Check behavior when the first column of y is constant + # The results are checked against a modified version of plsreg2 + # from the R-package plsdepot + d = load_linnerud() + X = d.data + y = d.target + y[:, 0] = 1 + pls = PLSRegression(n_components=X.shape[1]) + pls.fit(X, y) + + expected_x_weights = np.array( + [ + [-0.6273573, 0.007081799, 0.7786994], + [-0.7493417, -0.277612681, -0.6011807], + [-0.2119194, 0.960666981, -0.1794690], + ] + ) + + expected_x_loadings = np.array( + [ + [-0.6273512, -0.22464538, 0.7786994], + [-0.6643156, -0.09871193, -0.6011807], + [-0.5125877, 1.01407380, -0.1794690], + ] + ) + + expected_y_loadings = np.array( + [ + [0.0000000, 0.0000000, 0.0000000], + [0.4357300, 0.5828479, 0.2174802], + [-0.1353739, -0.2486423, -0.1810386], + ] + ) + + assert_array_almost_equal(np.abs(expected_x_weights), np.abs(pls.x_weights_)) + assert_array_almost_equal(np.abs(expected_x_loadings), np.abs(pls.x_loadings_)) + # For the PLSRegression with default parameters, y_loadings == y_weights + assert_array_almost_equal(np.abs(pls.y_loadings_), np.abs(expected_y_loadings)) + assert_array_almost_equal(np.abs(pls.y_weights_), np.abs(expected_y_loadings)) + + x_loadings_sign_flip = np.sign(expected_x_loadings / pls.x_loadings_) + x_weights_sign_flip = np.sign(expected_x_weights / pls.x_weights_) + # we ignore the first full-zeros row for y + y_loadings_sign_flip = np.sign(expected_y_loadings[1:] / pls.y_loadings_[1:]) + + assert_array_equal(x_loadings_sign_flip, x_weights_sign_flip) + assert_array_equal(x_loadings_sign_flip[1:], y_loadings_sign_flip) + + +def test_sanity_check_pls_canonical(): + # Sanity check for PLSCanonical + # The results were checked against the R-package plspm + + d = load_linnerud() + X = d.data + y = d.target + + pls = PLSCanonical(n_components=X.shape[1]) + pls.fit(X, 
y) + + expected_x_weights = np.array( + [ + [-0.61330704, 0.25616119, -0.74715187], + [-0.74697144, 0.11930791, 0.65406368], + [-0.25668686, -0.95924297, -0.11817271], + ] + ) + + expected_x_rotations = np.array( + [ + [-0.61330704, 0.41591889, -0.62297525], + [-0.74697144, 0.31388326, 0.77368233], + [-0.25668686, -0.89237972, -0.24121788], + ] + ) + + expected_y_weights = np.array( + [ + [+0.58989127, 0.7890047, 0.1717553], + [+0.77134053, -0.61351791, 0.16920272], + [-0.23887670, -0.03267062, 0.97050016], + ] + ) + + expected_y_rotations = np.array( + [ + [+0.58989127, 0.7168115, 0.30665872], + [+0.77134053, -0.70791757, 0.19786539], + [-0.23887670, -0.00343595, 0.94162826], + ] + ) + + assert_array_almost_equal(np.abs(pls.x_rotations_), np.abs(expected_x_rotations)) + assert_array_almost_equal(np.abs(pls.x_weights_), np.abs(expected_x_weights)) + assert_array_almost_equal(np.abs(pls.y_rotations_), np.abs(expected_y_rotations)) + assert_array_almost_equal(np.abs(pls.y_weights_), np.abs(expected_y_weights)) + + x_rotations_sign_flip = np.sign(pls.x_rotations_ / expected_x_rotations) + x_weights_sign_flip = np.sign(pls.x_weights_ / expected_x_weights) + y_rotations_sign_flip = np.sign(pls.y_rotations_ / expected_y_rotations) + y_weights_sign_flip = np.sign(pls.y_weights_ / expected_y_weights) + assert_array_almost_equal(x_rotations_sign_flip, x_weights_sign_flip) + assert_array_almost_equal(y_rotations_sign_flip, y_weights_sign_flip) + + assert_matrix_orthogonal(pls.x_weights_) + assert_matrix_orthogonal(pls.y_weights_) + + assert_matrix_orthogonal(pls._x_scores) + assert_matrix_orthogonal(pls._y_scores) + + +def test_sanity_check_pls_canonical_random(): + # Sanity check for PLSCanonical on random data + # The results were checked against the R-package plspm + n = 500 + p_noise = 10 + q_noise = 5 + # 2 latents vars: + rng = check_random_state(11) + l1 = rng.normal(size=n) + l2 = rng.normal(size=n) + latents = np.array([l1, l1, l2, l2]).T + X = latents + rng.normal(size=4 * n).reshape((n, 4)) + y = latents + rng.normal(size=4 * n).reshape((n, 4)) + X = np.concatenate((X, rng.normal(size=p_noise * n).reshape(n, p_noise)), axis=1) + y = np.concatenate((y, rng.normal(size=q_noise * n).reshape(n, q_noise)), axis=1) + + pls = PLSCanonical(n_components=3) + pls.fit(X, y) + + expected_x_weights = np.array( + [ + [0.65803719, 0.19197924, 0.21769083], + [0.7009113, 0.13303969, -0.15376699], + [0.13528197, -0.68636408, 0.13856546], + [0.16854574, -0.66788088, -0.12485304], + [-0.03232333, -0.04189855, 0.40690153], + [0.1148816, -0.09643158, 0.1613305], + [0.04792138, -0.02384992, 0.17175319], + [-0.06781, -0.01666137, -0.18556747], + [-0.00266945, -0.00160224, 0.11893098], + [-0.00849528, -0.07706095, 0.1570547], + [-0.00949471, -0.02964127, 0.34657036], + [-0.03572177, 0.0945091, 0.3414855], + [0.05584937, -0.02028961, -0.57682568], + [0.05744254, -0.01482333, -0.17431274], + ] + ) + + expected_x_loadings = np.array( + [ + [0.65649254, 0.1847647, 0.15270699], + [0.67554234, 0.15237508, -0.09182247], + [0.19219925, -0.67750975, 0.08673128], + [0.2133631, -0.67034809, -0.08835483], + [-0.03178912, -0.06668336, 0.43395268], + [0.15684588, -0.13350241, 0.20578984], + [0.03337736, -0.03807306, 0.09871553], + [-0.06199844, 0.01559854, -0.1881785], + [0.00406146, -0.00587025, 0.16413253], + [-0.00374239, -0.05848466, 0.19140336], + [0.00139214, -0.01033161, 0.32239136], + [-0.05292828, 0.0953533, 0.31916881], + [0.04031924, -0.01961045, -0.65174036], + [0.06172484, -0.06597366, -0.1244497], + ] + ) + + 
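+ # Note: the sign of each singular vector is arbitrary, which is why the
+ # comparisons below use np.abs() and then only require the sign flips to
+ # be consistent between weights and loadings.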
expected_y_weights = np.array( + [ + [0.66101097, 0.18672553, 0.22826092], + [0.69347861, 0.18463471, -0.23995597], + [0.14462724, -0.66504085, 0.17082434], + [0.22247955, -0.6932605, -0.09832993], + [0.07035859, 0.00714283, 0.67810124], + [0.07765351, -0.0105204, -0.44108074], + [-0.00917056, 0.04322147, 0.10062478], + [-0.01909512, 0.06182718, 0.28830475], + [0.01756709, 0.04797666, 0.32225745], + ] + ) + + expected_y_loadings = np.array( + [ + [0.68568625, 0.1674376, 0.0969508], + [0.68782064, 0.20375837, -0.1164448], + [0.11712173, -0.68046903, 0.12001505], + [0.17860457, -0.6798319, -0.05089681], + [0.06265739, -0.0277703, 0.74729584], + [0.0914178, 0.00403751, -0.5135078], + [-0.02196918, -0.01377169, 0.09564505], + [-0.03288952, 0.09039729, 0.31858973], + [0.04287624, 0.05254676, 0.27836841], + ] + ) + + assert_array_almost_equal(np.abs(pls.x_loadings_), np.abs(expected_x_loadings)) + assert_array_almost_equal(np.abs(pls.x_weights_), np.abs(expected_x_weights)) + assert_array_almost_equal(np.abs(pls.y_loadings_), np.abs(expected_y_loadings)) + assert_array_almost_equal(np.abs(pls.y_weights_), np.abs(expected_y_weights)) + + x_loadings_sign_flip = np.sign(pls.x_loadings_ / expected_x_loadings) + x_weights_sign_flip = np.sign(pls.x_weights_ / expected_x_weights) + y_weights_sign_flip = np.sign(pls.y_weights_ / expected_y_weights) + y_loadings_sign_flip = np.sign(pls.y_loadings_ / expected_y_loadings) + assert_array_almost_equal(x_loadings_sign_flip, x_weights_sign_flip) + assert_array_almost_equal(y_loadings_sign_flip, y_weights_sign_flip) + + assert_matrix_orthogonal(pls.x_weights_) + assert_matrix_orthogonal(pls.y_weights_) + + assert_matrix_orthogonal(pls._x_scores) + assert_matrix_orthogonal(pls._y_scores) + + +def test_convergence_fail(): + # Make sure ConvergenceWarning is raised if max_iter is too small + d = load_linnerud() + X = d.data + y = d.target + pls_nipals = PLSCanonical(n_components=X.shape[1], max_iter=2) + with pytest.warns(ConvergenceWarning): + pls_nipals.fit(X, y) + + +@pytest.mark.parametrize("Est", (PLSSVD, PLSRegression, PLSCanonical)) +def test_attibutes_shapes(Est): + # Make sure attributes are of the correct shape depending on n_components + d = load_linnerud() + X = d.data + y = d.target + n_components = 2 + pls = Est(n_components=n_components) + pls.fit(X, y) + assert all( + attr.shape[1] == n_components for attr in (pls.x_weights_, pls.y_weights_) + ) + + +@pytest.mark.parametrize("Est", (PLSRegression, PLSCanonical, CCA)) +def test_univariate_equivalence(Est): + # Ensure 2D y with 1 column is equivalent to 1D y + d = load_linnerud() + X = d.data + y = d.target + + est = Est(n_components=1) + one_d_coeff = est.fit(X, y[:, 0]).coef_ + two_d_coeff = est.fit(X, y[:, :1]).coef_ + + assert one_d_coeff.shape == two_d_coeff.shape + assert_array_almost_equal(one_d_coeff, two_d_coeff) + + +@pytest.mark.parametrize("Est", (PLSRegression, PLSCanonical, CCA, PLSSVD)) +def test_copy(Est): + # check that the "copy" keyword works + d = load_linnerud() + X = d.data + y = d.target + X_orig = X.copy() + + # copy=True won't modify inplace + pls = Est(copy=True).fit(X, y) + assert_array_equal(X, X_orig) + + # copy=False will modify inplace + with pytest.raises(AssertionError): + Est(copy=False).fit(X, y) + assert_array_almost_equal(X, X_orig) + + if Est is PLSSVD: + return # PLSSVD does not support copy param in predict or transform + + X_orig = X.copy() + with pytest.raises(AssertionError): + pls.transform(X, y, copy=False) + assert_array_almost_equal(X, X_orig) + + X_orig 
= X.copy() + with pytest.raises(AssertionError): + pls.predict(X, copy=False) + assert_array_almost_equal(X, X_orig) + + # Make sure copy=True gives same transform and predictions as predict=False + assert_array_almost_equal( + pls.transform(X, y, copy=True), pls.transform(X.copy(), y.copy(), copy=False) + ) + assert_array_almost_equal( + pls.predict(X, copy=True), pls.predict(X.copy(), copy=False) + ) + + +def _generate_test_scale_and_stability_datasets(): + """Generate dataset for test_scale_and_stability""" + # dataset for non-regression 7818 + rng = np.random.RandomState(0) + n_samples = 1000 + n_targets = 5 + n_features = 10 + Q = rng.randn(n_targets, n_features) + y = rng.randn(n_samples, n_targets) + X = np.dot(y, Q) + 2 * rng.randn(n_samples, n_features) + 1 + X *= 1000 + yield X, y + + # Data set where one of the features is constraint + X, y = load_linnerud(return_X_y=True) + # causes X[:, -1].std() to be zero + X[:, -1] = 1.0 + yield X, y + + X = np.array([[0.0, 0.0, 1.0], [1.0, 0.0, 0.0], [2.0, 2.0, 2.0], [3.0, 5.0, 4.0]]) + y = np.array([[0.1, -0.2], [0.9, 1.1], [6.2, 5.9], [11.9, 12.3]]) + yield X, y + + # Seeds that provide a non-regression test for #18746, where CCA fails + seeds = [530, 741] + for seed in seeds: + rng = np.random.RandomState(seed) + X = rng.randn(4, 3) + y = rng.randn(4, 2) + yield X, y + + +@pytest.mark.parametrize("Est", (CCA, PLSCanonical, PLSRegression, PLSSVD)) +@pytest.mark.parametrize("X, y", _generate_test_scale_and_stability_datasets()) +def test_scale_and_stability(Est, X, y): + """scale=True is equivalent to scale=False on centered/scaled data + This allows to check numerical stability over platforms as well""" + + X_s, y_s, *_ = _center_scale_xy(X, y) + + X_score, y_score = Est(scale=True).fit_transform(X, y) + X_s_score, y_s_score = Est(scale=False).fit_transform(X_s, y_s) + + assert_allclose(X_s_score, X_score, atol=1e-4) + assert_allclose(y_s_score, y_score, atol=1e-4) + + +@pytest.mark.parametrize("Estimator", (PLSSVD, PLSRegression, PLSCanonical, CCA)) +def test_n_components_upper_bounds(Estimator): + """Check the validation of `n_components` upper bounds for `PLS` regressors.""" + rng = np.random.RandomState(0) + X = rng.randn(10, 5) + y = rng.randn(10, 3) + est = Estimator(n_components=10) + err_msg = "`n_components` upper bound is .*. Got 10 instead. Reduce `n_components`." + with pytest.raises(ValueError, match=err_msg): + est.fit(X, y) + + +def test_n_components_upper_PLSRegression(): + """Check the validation of `n_components` upper bounds for PLSRegression.""" + rng = np.random.RandomState(0) + X = rng.randn(20, 64) + y = rng.randn(20, 3) + est = PLSRegression(n_components=30) + err_msg = "`n_components` upper bound is 20. Got 30 instead. Reduce `n_components`." 
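+ # For the regression deflation mode the rank bound is
+ # min(n_samples, n_features) = min(20, 64) = 20; the 3 targets do not
+ # tighten it, hence the message expected above.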
+ with pytest.raises(ValueError, match=err_msg): + est.fit(X, y) + + +@pytest.mark.parametrize("n_samples, n_features", [(100, 10), (100, 200)]) +def test_singular_value_helpers(n_samples, n_features, global_random_seed): + # Make sure SVD and power method give approximately the same results + X, y = make_regression( + n_samples, n_features, n_targets=5, random_state=global_random_seed + ) + u1, v1, _ = _get_first_singular_vectors_power_method(X, y, norm_y_weights=True) + u2, v2 = _get_first_singular_vectors_svd(X, y) + + _svd_flip_1d(u1, v1) + _svd_flip_1d(u2, v2) + + rtol = 1e-3 + # Setting atol because some coordinates are very close to zero + assert_allclose(u1, u2, atol=u2.max() * rtol) + assert_allclose(v1, v2, atol=v2.max() * rtol) + + +def test_one_component_equivalence(global_random_seed): + # PLSSVD, PLSRegression and PLSCanonical should all be equivalent when + # n_components is 1 + X, y = make_regression(100, 10, n_targets=5, random_state=global_random_seed) + svd = PLSSVD(n_components=1).fit(X, y).transform(X) + reg = PLSRegression(n_components=1).fit(X, y).transform(X) + canonical = PLSCanonical(n_components=1).fit(X, y).transform(X) + + rtol = 1e-3 + # Setting atol because some entries are very close to zero + assert_allclose(svd, reg, atol=reg.max() * rtol) + assert_allclose(svd, canonical, atol=canonical.max() * rtol) + + +def test_svd_flip_1d(): + # Make sure svd_flip_1d is equivalent to svd_flip + u = np.array([1, -4, 2]) + v = np.array([1, 2, 3]) + + u_expected, v_expected = svd_flip(u.reshape(-1, 1), v.reshape(1, -1)) + _svd_flip_1d(u, v) # inplace + + assert_allclose(u, u_expected.ravel()) + assert_allclose(u, [-1, 4, -2]) + + assert_allclose(v, v_expected.ravel()) + assert_allclose(v, [-1, -2, -3]) + + +def test_loadings_converges(global_random_seed): + """Test that CCA converges. Non-regression test for #19549.""" + X, y = make_regression( + n_samples=200, n_features=20, n_targets=20, random_state=global_random_seed + ) + + cca = CCA(n_components=10, max_iter=500) + + with warnings.catch_warnings(): + warnings.simplefilter("error", ConvergenceWarning) + + cca.fit(X, y) + + # Loadings converges to reasonable values + assert np.all(np.abs(cca.x_loadings_) < 1) + + +def test_pls_constant_y(): + """Checks warning when y is constant. Non-regression test for #19831""" + rng = np.random.RandomState(42) + x = rng.rand(100, 3) + y = np.zeros(100) + + pls = PLSRegression() + + msg = "y residual is constant at iteration" + with pytest.warns(UserWarning, match=msg): + pls.fit(x, y) + + assert_allclose(pls.x_rotations_, 0) + + +@pytest.mark.parametrize("PLSEstimator", [PLSRegression, PLSCanonical, CCA]) +def test_pls_coef_shape(PLSEstimator): + """Check the shape of `coef_` attribute. 
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/12410 + """ + d = load_linnerud() + X = d.data + y = d.target + + pls = PLSEstimator(copy=True).fit(X, y) + + n_targets, n_features = y.shape[1], X.shape[1] + assert pls.coef_.shape == (n_targets, n_features) + + +@pytest.mark.parametrize("scale", [True, False]) +@pytest.mark.parametrize("PLSEstimator", [PLSRegression, PLSCanonical, CCA]) +def test_pls_prediction(PLSEstimator, scale): + """Check the behaviour of the prediction function.""" + d = load_linnerud() + X = d.data + y = d.target + + pls = PLSEstimator(copy=True, scale=scale).fit(X, y) + y_pred = pls.predict(X, copy=True) + + y_mean = y.mean(axis=0) + X_trans = X - X.mean(axis=0) + + assert_allclose(pls.intercept_, y_mean) + assert_allclose(y_pred, X_trans @ pls.coef_.T + pls.intercept_) + + +@pytest.mark.parametrize("Klass", [CCA, PLSSVD, PLSRegression, PLSCanonical]) +def test_pls_feature_names_out(Klass): + """Check `get_feature_names_out` cross_decomposition module.""" + X, y = load_linnerud(return_X_y=True) + + est = Klass().fit(X, y) + names_out = est.get_feature_names_out() + + class_name_lower = Klass.__name__.lower() + expected_names_out = np.array( + [f"{class_name_lower}{i}" for i in range(est.x_weights_.shape[1])], + dtype=object, + ) + assert_array_equal(names_out, expected_names_out) + + +@pytest.mark.parametrize("Klass", [CCA, PLSSVD, PLSRegression, PLSCanonical]) +def test_pls_set_output(Klass): + """Check `set_output` in cross_decomposition module.""" + pd = pytest.importorskip("pandas") + X, y = load_linnerud(return_X_y=True, as_frame=True) + + est = Klass().set_output(transform="pandas").fit(X, y) + X_trans, y_trans = est.transform(X, y) + assert isinstance(y_trans, np.ndarray) + assert isinstance(X_trans, pd.DataFrame) + assert_array_equal(X_trans.columns, est.get_feature_names_out()) + + +def test_pls_regression_fit_1d_y(): + """Check that when fitting with 1d `y`, prediction should also be 1d. + + Non-regression test for Issue #26549. + """ + X = np.array([[1, 1], [2, 4], [3, 9], [4, 16], [5, 25], [6, 36]]) + y = np.array([2, 6, 12, 20, 30, 42]) + expected = y.copy() + + plsr = PLSRegression().fit(X, y) + y_pred = plsr.predict(X) + assert y_pred.shape == expected.shape + + # Check that it works in VotingRegressor + lr = LinearRegression().fit(X, y) + vr = VotingRegressor([("lr", lr), ("plsr", plsr)]) + y_pred = vr.fit(X, y).predict(X) + assert y_pred.shape == expected.shape + assert_allclose(y_pred, expected) + + +def test_pls_regression_scaling_coef(): + """Check that when using `scale=True`, the coefficients are using the std. dev. from + both `X` and `y`. 
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/27964 + """ + # handcrafted data where we can predict y from X with an additional scaling factor + rng = np.random.RandomState(0) + coef = rng.uniform(size=(3, 5)) + X = rng.normal(scale=10, size=(30, 5)) # add a std of 10 + y = X @ coef.T + + # we need to make sure that the dimension of the latent space is large enough to + # perfectly predict `y` from `X` (no information loss) + pls = PLSRegression(n_components=5, scale=True).fit(X, y) + assert_allclose(pls.coef_, coef) + + # we therefore should be able to predict `y` from `X` + assert_allclose(pls.predict(X), y) diff --git a/.venv/lib/python3.12/site-packages/sklearn/datasets/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8863fe489f3b62740757c3801ee55d7e1e406703 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/datasets/__init__.py @@ -0,0 +1,166 @@ +"""Utilities to load popular datasets and artificial data generators.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import textwrap + +from ._base import ( + clear_data_home, + fetch_file, + get_data_home, + load_breast_cancer, + load_diabetes, + load_digits, + load_files, + load_iris, + load_linnerud, + load_sample_image, + load_sample_images, + load_wine, +) +from ._california_housing import fetch_california_housing +from ._covtype import fetch_covtype +from ._kddcup99 import fetch_kddcup99 +from ._lfw import fetch_lfw_pairs, fetch_lfw_people +from ._olivetti_faces import fetch_olivetti_faces +from ._openml import fetch_openml +from ._rcv1 import fetch_rcv1 +from ._samples_generator import ( + make_biclusters, + make_blobs, + make_checkerboard, + make_circles, + make_classification, + make_friedman1, + make_friedman2, + make_friedman3, + make_gaussian_quantiles, + make_hastie_10_2, + make_low_rank_matrix, + make_moons, + make_multilabel_classification, + make_regression, + make_s_curve, + make_sparse_coded_signal, + make_sparse_spd_matrix, + make_sparse_uncorrelated, + make_spd_matrix, + make_swiss_roll, +) +from ._species_distributions import fetch_species_distributions +from ._svmlight_format_io import ( + dump_svmlight_file, + load_svmlight_file, + load_svmlight_files, +) +from ._twenty_newsgroups import fetch_20newsgroups, fetch_20newsgroups_vectorized + +__all__ = [ + "clear_data_home", + "dump_svmlight_file", + "fetch_20newsgroups", + "fetch_20newsgroups_vectorized", + "fetch_california_housing", + "fetch_covtype", + "fetch_file", + "fetch_kddcup99", + "fetch_lfw_pairs", + "fetch_lfw_people", + "fetch_olivetti_faces", + "fetch_openml", + "fetch_rcv1", + "fetch_species_distributions", + "get_data_home", + "load_breast_cancer", + "load_diabetes", + "load_digits", + "load_files", + "load_iris", + "load_linnerud", + "load_sample_image", + "load_sample_images", + "load_svmlight_file", + "load_svmlight_files", + "load_wine", + "make_biclusters", + "make_blobs", + "make_checkerboard", + "make_circles", + "make_classification", + "make_friedman1", + "make_friedman2", + "make_friedman3", + "make_gaussian_quantiles", + "make_hastie_10_2", + "make_low_rank_matrix", + "make_moons", + "make_multilabel_classification", + "make_regression", + "make_s_curve", + "make_sparse_coded_signal", + "make_sparse_spd_matrix", + "make_sparse_uncorrelated", + "make_spd_matrix", + "make_swiss_roll", +] + + +def __getattr__(name): + if name == "load_boston": + msg = 
textwrap.dedent( + """ + `load_boston` has been removed from scikit-learn since version 1.2. + + The Boston housing prices dataset has an ethical problem: as + investigated in [1], the authors of this dataset engineered a + non-invertible variable "B" assuming that racial self-segregation had a + positive impact on house prices [2]. Furthermore the goal of the + research that led to the creation of this dataset was to study the + impact of air quality but it did not give adequate demonstration of the + validity of this assumption. + + The scikit-learn maintainers therefore strongly discourage the use of + this dataset unless the purpose of the code is to study and educate + about ethical issues in data science and machine learning. + + In this special case, you can fetch the dataset from the original + source:: + + import pandas as pd + import numpy as np + + data_url = "http://lib.stat.cmu.edu/datasets/boston" + raw_df = pd.read_csv(data_url, sep="\\s+", skiprows=22, header=None) + data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]) + target = raw_df.values[1::2, 2] + + Alternative datasets include the California housing dataset and the + Ames housing dataset. You can load the datasets as follows:: + + from sklearn.datasets import fetch_california_housing + housing = fetch_california_housing() + + for the California housing dataset and:: + + from sklearn.datasets import fetch_openml + housing = fetch_openml(name="house_prices", as_frame=True) + + for the Ames housing dataset. + + [1] M Carlisle. + "Racist data destruction?" + + + [2] Harrison Jr, David, and Daniel L. Rubinfeld. + "Hedonic housing prices and the demand for clean air." + Journal of environmental economics and management 5.1 (1978): 81-102. + + """ + ) + raise ImportError(msg) + try: + return globals()[name] + except KeyError: + # This is turned into the appropriate ImportError + raise AttributeError diff --git a/.venv/lib/python3.12/site-packages/sklearn/datasets/_arff_parser.py b/.venv/lib/python3.12/site-packages/sklearn/datasets/_arff_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..fb6e629a73c8d509ef8bc00404311b6c1bdcbb8f --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/datasets/_arff_parser.py @@ -0,0 +1,543 @@ +"""Implementation of ARFF parsers: via LIAC-ARFF and pandas.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import itertools +import re +from collections import OrderedDict +from collections.abc import Generator +from typing import List + +import numpy as np +import scipy as sp + +from ..externals import _arff +from ..externals._arff import ArffSparseDataType +from ..utils._chunking import chunk_generator, get_chunk_n_rows +from ..utils._optional_dependencies import check_pandas_support +from ..utils.fixes import pd_fillna + + +def _split_sparse_columns( + arff_data: ArffSparseDataType, include_columns: List +) -> ArffSparseDataType: + """Obtains several columns from sparse ARFF representation. Additionally, + the column indices are re-labelled, given the columns that are not + included. (e.g., when including [1, 2, 3], the columns will be relabelled + to [0, 1, 2]). + + Parameters + ---------- + arff_data : tuple + A tuple of three lists of equal size; first list indicating the value, + second the x coordinate and the third the y coordinate. + + include_columns : list + A list of columns to include. 
+ + Returns + ------- + arff_data_new : tuple + Subset of arff data with only the include columns indicated by the + include_columns argument. + """ + arff_data_new: ArffSparseDataType = (list(), list(), list()) + reindexed_columns = { + column_idx: array_idx for array_idx, column_idx in enumerate(include_columns) + } + for val, row_idx, col_idx in zip(arff_data[0], arff_data[1], arff_data[2]): + if col_idx in include_columns: + arff_data_new[0].append(val) + arff_data_new[1].append(row_idx) + arff_data_new[2].append(reindexed_columns[col_idx]) + return arff_data_new + + +def _sparse_data_to_array( + arff_data: ArffSparseDataType, include_columns: List +) -> np.ndarray: + # turns the sparse data back into an array (can't use toarray() function, + # as this does only work on numeric data) + num_obs = max(arff_data[1]) + 1 + y_shape = (num_obs, len(include_columns)) + reindexed_columns = { + column_idx: array_idx for array_idx, column_idx in enumerate(include_columns) + } + # TODO: improve for efficiency + y = np.empty(y_shape, dtype=np.float64) + for val, row_idx, col_idx in zip(arff_data[0], arff_data[1], arff_data[2]): + if col_idx in include_columns: + y[row_idx, reindexed_columns[col_idx]] = val + return y + + +def _post_process_frame(frame, feature_names, target_names): + """Post process a dataframe to select the desired columns in `X` and `y`. + + Parameters + ---------- + frame : dataframe + The dataframe to split into `X` and `y`. + + feature_names : list of str + The list of feature names to populate `X`. + + target_names : list of str + The list of target names to populate `y`. + + Returns + ------- + X : dataframe + The dataframe containing the features. + + y : {series, dataframe} or None + The series or dataframe containing the target. + """ + X = frame[feature_names] + if len(target_names) >= 2: + y = frame[target_names] + elif len(target_names) == 1: + y = frame[target_names[0]] + else: + y = None + return X, y + + +def _liac_arff_parser( + gzip_file, + output_arrays_type, + openml_columns_info, + feature_names_to_select, + target_names_to_select, + shape=None, +): + """ARFF parser using the LIAC-ARFF library coded purely in Python. + + This parser is quite slow but consumes a generator. Currently it is needed + to parse sparse datasets. For dense datasets, it is recommended to instead + use the pandas-based parser, although it does not always handles the + dtypes exactly the same. + + Parameters + ---------- + gzip_file : GzipFile instance + The file compressed to be read. + + output_arrays_type : {"numpy", "sparse", "pandas"} + The type of the arrays that will be returned. The possibilities ara: + + - `"numpy"`: both `X` and `y` will be NumPy arrays; + - `"sparse"`: `X` will be sparse matrix and `y` will be a NumPy array; + - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a + pandas Series or DataFrame. + + columns_info : dict + The information provided by OpenML regarding the columns of the ARFF + file. + + feature_names_to_select : list of str + A list of the feature names to be selected. + + target_names_to_select : list of str + A list of the target names to be selected. + + Returns + ------- + X : {ndarray, sparse matrix, dataframe} + The data matrix. + + y : {ndarray, dataframe, series} + The target. + + frame : dataframe or None + A dataframe containing both `X` and `y`. `None` if + `output_array_type != "pandas"`. + + categories : list of str or None + The names of the features that are categorical. `None` if + `output_array_type == "pandas"`. 
+ """ + + def _io_to_generator(gzip_file): + for line in gzip_file: + yield line.decode("utf-8") + + stream = _io_to_generator(gzip_file) + + # find which type (dense or sparse) ARFF type we will have to deal with + return_type = _arff.COO if output_arrays_type == "sparse" else _arff.DENSE_GEN + # we should not let LIAC-ARFF to encode the nominal attributes with NumPy + # arrays to have only numerical values. + encode_nominal = not (output_arrays_type == "pandas") + arff_container = _arff.load( + stream, return_type=return_type, encode_nominal=encode_nominal + ) + columns_to_select = feature_names_to_select + target_names_to_select + + categories = { + name: cat + for name, cat in arff_container["attributes"] + if isinstance(cat, list) and name in columns_to_select + } + if output_arrays_type == "pandas": + pd = check_pandas_support("fetch_openml with as_frame=True") + + columns_info = OrderedDict(arff_container["attributes"]) + columns_names = list(columns_info.keys()) + + # calculate chunksize + first_row = next(arff_container["data"]) + first_df = pd.DataFrame([first_row], columns=columns_names, copy=False) + + row_bytes = first_df.memory_usage(deep=True).sum() + chunksize = get_chunk_n_rows(row_bytes) + + # read arff data with chunks + columns_to_keep = [col for col in columns_names if col in columns_to_select] + dfs = [first_df[columns_to_keep]] + for data in chunk_generator(arff_container["data"], chunksize): + dfs.append( + pd.DataFrame(data, columns=columns_names, copy=False)[columns_to_keep] + ) + # dfs[0] contains only one row, which may not have enough data to infer to + # column's dtype. Here we use `dfs[1]` to configure the dtype in dfs[0] + if len(dfs) >= 2: + dfs[0] = dfs[0].astype(dfs[1].dtypes) + + # liac-arff parser does not depend on NumPy and uses None to represent + # missing values. To be consistent with the pandas parser, we replace + # None with np.nan. + frame = pd.concat(dfs, ignore_index=True) + frame = pd_fillna(pd, frame) + del dfs, first_df + + # cast the columns frame + dtypes = {} + for name in frame.columns: + column_dtype = openml_columns_info[name]["data_type"] + if column_dtype.lower() == "integer": + # Use a pandas extension array instead of np.int64 to be able + # to support missing values. 
+ dtypes[name] = "Int64" + elif column_dtype.lower() == "nominal": + dtypes[name] = "category" + else: + dtypes[name] = frame.dtypes[name] + frame = frame.astype(dtypes) + + X, y = _post_process_frame( + frame, feature_names_to_select, target_names_to_select + ) + else: + arff_data = arff_container["data"] + + feature_indices_to_select = [ + int(openml_columns_info[col_name]["index"]) + for col_name in feature_names_to_select + ] + target_indices_to_select = [ + int(openml_columns_info[col_name]["index"]) + for col_name in target_names_to_select + ] + + if isinstance(arff_data, Generator): + if shape is None: + raise ValueError( + "shape must be provided when arr['data'] is a Generator" + ) + if shape[0] == -1: + count = -1 + else: + count = shape[0] * shape[1] + data = np.fromiter( + itertools.chain.from_iterable(arff_data), + dtype="float64", + count=count, + ) + data = data.reshape(*shape) + X = data[:, feature_indices_to_select] + y = data[:, target_indices_to_select] + elif isinstance(arff_data, tuple): + arff_data_X = _split_sparse_columns(arff_data, feature_indices_to_select) + num_obs = max(arff_data[1]) + 1 + X_shape = (num_obs, len(feature_indices_to_select)) + X = sp.sparse.coo_matrix( + (arff_data_X[0], (arff_data_X[1], arff_data_X[2])), + shape=X_shape, + dtype=np.float64, + ) + X = X.tocsr() + y = _sparse_data_to_array(arff_data, target_indices_to_select) + else: + # This should never happen + raise ValueError( + f"Unexpected type for data obtained from arff: {type(arff_data)}" + ) + + is_classification = { + col_name in categories for col_name in target_names_to_select + } + if not is_classification: + # No target + pass + elif all(is_classification): + y = np.hstack( + [ + np.take( + np.asarray(categories.pop(col_name), dtype="O"), + y[:, i : i + 1].astype(int, copy=False), + ) + for i, col_name in enumerate(target_names_to_select) + ] + ) + elif any(is_classification): + raise ValueError( + "Mix of nominal and non-nominal targets is not currently supported" + ) + + # reshape y back to 1-D array, if there is only 1 target column; + # back to None if there are not target columns + if y.shape[1] == 1: + y = y.reshape((-1,)) + elif y.shape[1] == 0: + y = None + + if output_arrays_type == "pandas": + return X, y, frame, None + return X, y, None, categories + + +def _pandas_arff_parser( + gzip_file, + output_arrays_type, + openml_columns_info, + feature_names_to_select, + target_names_to_select, + read_csv_kwargs=None, +): + """ARFF parser using `pandas.read_csv`. + + This parser uses the metadata fetched directly from OpenML and skips the metadata + headers of ARFF file itself. The data is loaded as a CSV file. + + Parameters + ---------- + gzip_file : GzipFile instance + The GZip compressed file with the ARFF formatted payload. + + output_arrays_type : {"numpy", "sparse", "pandas"} + The type of the arrays that will be returned. The possibilities are: + + - `"numpy"`: both `X` and `y` will be NumPy arrays; + - `"sparse"`: `X` will be sparse matrix and `y` will be a NumPy array; + - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a + pandas Series or DataFrame. + + openml_columns_info : dict + The information provided by OpenML regarding the columns of the ARFF + file. + + feature_names_to_select : list of str + A list of the feature names to be selected to build `X`. + + target_names_to_select : list of str + A list of the target names to be selected to build `y`. + + read_csv_kwargs : dict, default=None + Keyword arguments to pass to `pandas.read_csv`. 
It allows to overwrite + the default options. + + Returns + ------- + X : {ndarray, sparse matrix, dataframe} + The data matrix. + + y : {ndarray, dataframe, series} + The target. + + frame : dataframe or None + A dataframe containing both `X` and `y`. `None` if + `output_array_type != "pandas"`. + + categories : list of str or None + The names of the features that are categorical. `None` if + `output_array_type == "pandas"`. + """ + import pandas as pd + + # read the file until the data section to skip the ARFF metadata headers + for line in gzip_file: + if line.decode("utf-8").lower().startswith("@data"): + break + + dtypes = {} + for name in openml_columns_info: + column_dtype = openml_columns_info[name]["data_type"] + if column_dtype.lower() == "integer": + # Use Int64 to infer missing values from data + # XXX: this line is not covered by our tests. Is this really needed? + dtypes[name] = "Int64" + elif column_dtype.lower() == "nominal": + dtypes[name] = "category" + # since we will not pass `names` when reading the ARFF file, we need to translate + # `dtypes` from column names to column indices to pass to `pandas.read_csv` + dtypes_positional = { + col_idx: dtypes[name] + for col_idx, name in enumerate(openml_columns_info) + if name in dtypes + } + + default_read_csv_kwargs = { + "header": None, + "index_col": False, # always force pandas to not use the first column as index + "na_values": ["?"], # missing values are represented by `?` + "keep_default_na": False, # only `?` is a missing value given the ARFF specs + "comment": "%", # skip line starting by `%` since they are comments + "quotechar": '"', # delimiter to use for quoted strings + "skipinitialspace": True, # skip spaces after delimiter to follow ARFF specs + "escapechar": "\\", + "dtype": dtypes_positional, + } + read_csv_kwargs = {**default_read_csv_kwargs, **(read_csv_kwargs or {})} + frame = pd.read_csv(gzip_file, **read_csv_kwargs) + try: + # Setting the columns while reading the file will select the N first columns + # and not raise a ParserError. Instead, we set the columns after reading the + # file and raise a ParserError if the number of columns does not match the + # number of columns in the metadata given by OpenML. + frame.columns = [name for name in openml_columns_info] + except ValueError as exc: + raise pd.errors.ParserError( + "The number of columns provided by OpenML does not match the number of " + "columns inferred by pandas when reading the file." + ) from exc + + columns_to_select = feature_names_to_select + target_names_to_select + columns_to_keep = [col for col in frame.columns if col in columns_to_select] + frame = frame[columns_to_keep] + + # `pd.read_csv` automatically handles double quotes for quoting non-numeric + # CSV cell values. Contrary to LIAC-ARFF, `pd.read_csv` cannot be configured to + # consider either single quotes and double quotes as valid quoting chars at + # the same time since this case does not occur in regular (non-ARFF) CSV files. + # To mimic the behavior of LIAC-ARFF parser, we manually strip single quotes + # on categories as a post-processing steps if needed. + # + # Note however that we intentionally do not attempt to do this kind of manual + # post-processing of (non-categorical) string-typed columns because we cannot + # resolve the ambiguity of the case of CSV cell with nesting quoting such as + # `"'some string value'"` with pandas. 
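+ # For example, a category stored as 'blue' is stripped to blue, while an
+ # already unquoted category such as blue is returned unchanged.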
+ single_quote_pattern = re.compile(r"^'(?P.*)'$") + + def strip_single_quotes(input_string): + match = re.search(single_quote_pattern, input_string) + if match is None: + return input_string + + return match.group("contents") + + categorical_columns = [ + name + for name, dtype in frame.dtypes.items() + if isinstance(dtype, pd.CategoricalDtype) + ] + for col in categorical_columns: + frame[col] = frame[col].cat.rename_categories(strip_single_quotes) + + X, y = _post_process_frame(frame, feature_names_to_select, target_names_to_select) + + if output_arrays_type == "pandas": + return X, y, frame, None + else: + X, y = X.to_numpy(), y.to_numpy() + + categories = { + name: dtype.categories.tolist() + for name, dtype in frame.dtypes.items() + if isinstance(dtype, pd.CategoricalDtype) + } + return X, y, None, categories + + +def load_arff_from_gzip_file( + gzip_file, + parser, + output_type, + openml_columns_info, + feature_names_to_select, + target_names_to_select, + shape=None, + read_csv_kwargs=None, +): + """Load a compressed ARFF file using a given parser. + + Parameters + ---------- + gzip_file : GzipFile instance + The file compressed to be read. + + parser : {"pandas", "liac-arff"} + The parser used to parse the ARFF file. "pandas" is recommended + but only supports loading dense datasets. + + output_type : {"numpy", "sparse", "pandas"} + The type of the arrays that will be returned. The possibilities ara: + + - `"numpy"`: both `X` and `y` will be NumPy arrays; + - `"sparse"`: `X` will be sparse matrix and `y` will be a NumPy array; + - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a + pandas Series or DataFrame. + + openml_columns_info : dict + The information provided by OpenML regarding the columns of the ARFF + file. + + feature_names_to_select : list of str + A list of the feature names to be selected. + + target_names_to_select : list of str + A list of the target names to be selected. + + read_csv_kwargs : dict, default=None + Keyword arguments to pass to `pandas.read_csv`. It allows to overwrite + the default options. + + Returns + ------- + X : {ndarray, sparse matrix, dataframe} + The data matrix. + + y : {ndarray, dataframe, series} + The target. + + frame : dataframe or None + A dataframe containing both `X` and `y`. `None` if + `output_array_type != "pandas"`. + + categories : list of str or None + The names of the features that are categorical. `None` if + `output_array_type == "pandas"`. + """ + if parser == "liac-arff": + return _liac_arff_parser( + gzip_file, + output_type, + openml_columns_info, + feature_names_to_select, + target_names_to_select, + shape, + ) + elif parser == "pandas": + return _pandas_arff_parser( + gzip_file, + output_type, + openml_columns_info, + feature_names_to_select, + target_names_to_select, + read_csv_kwargs, + ) + else: + raise ValueError( + f"Unknown parser: '{parser}'. Should be 'liac-arff' or 'pandas'." 
+ ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/datasets/_base.py b/.venv/lib/python3.12/site-packages/sklearn/datasets/_base.py new file mode 100644 index 0000000000000000000000000000000000000000..e6e6939ddbc193ebab2022ecad56e23516b7e8a4 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/datasets/_base.py @@ -0,0 +1,1636 @@ +""" +Base IO code for all datasets +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import csv +import gzip +import hashlib +import os +import re +import shutil +import time +import unicodedata +import warnings +from collections import namedtuple +from importlib import resources +from numbers import Integral +from os import environ, listdir, makedirs +from os.path import expanduser, isdir, join, splitext +from pathlib import Path +from tempfile import NamedTemporaryFile +from urllib.error import URLError +from urllib.parse import urlparse +from urllib.request import urlretrieve + +import numpy as np + +from ..preprocessing import scale +from ..utils import Bunch, check_random_state +from ..utils._optional_dependencies import check_pandas_support +from ..utils._param_validation import Interval, StrOptions, validate_params + +DATA_MODULE = "sklearn.datasets.data" +DESCR_MODULE = "sklearn.datasets.descr" +IMAGES_MODULE = "sklearn.datasets.images" + +RemoteFileMetadata = namedtuple("RemoteFileMetadata", ["filename", "url", "checksum"]) + + +@validate_params( + { + "data_home": [str, os.PathLike, None], + }, + prefer_skip_nested_validation=True, +) +def get_data_home(data_home=None) -> str: + """Return the path of the scikit-learn data directory. + + This folder is used by some large dataset loaders to avoid downloading the + data several times. + + By default the data directory is set to a folder named 'scikit_learn_data' in the + user home folder. + + Alternatively, it can be set by the 'SCIKIT_LEARN_DATA' environment + variable or programmatically by giving an explicit folder path. The '~' + symbol is expanded to the user home folder. + + If the folder does not already exist, it is automatically created. + + Parameters + ---------- + data_home : str or path-like, default=None + The path to scikit-learn data directory. If `None`, the default path + is `~/scikit_learn_data`. + + Returns + ------- + data_home: str + The path to scikit-learn data directory. + + Examples + -------- + >>> import os + >>> from sklearn.datasets import get_data_home + >>> data_home_path = get_data_home() + >>> os.path.exists(data_home_path) + True + """ + if data_home is None: + data_home = environ.get("SCIKIT_LEARN_DATA", join("~", "scikit_learn_data")) + data_home = expanduser(data_home) + makedirs(data_home, exist_ok=True) + return data_home + + +@validate_params( + { + "data_home": [str, os.PathLike, None], + }, + prefer_skip_nested_validation=True, +) +def clear_data_home(data_home=None): + """Delete all the content of the data home cache. + + Parameters + ---------- + data_home : str or path-like, default=None + The path to scikit-learn data directory. If `None`, the default path + is `~/scikit_learn_data`. 
+ + Examples + -------- + >>> from sklearn.datasets import clear_data_home + >>> clear_data_home() # doctest: +SKIP + """ + data_home = get_data_home(data_home) + shutil.rmtree(data_home) + + +def _convert_data_dataframe( + caller_name, data, target, feature_names, target_names, sparse_data=False +): + pd = check_pandas_support("{} with as_frame=True".format(caller_name)) + if not sparse_data: + data_df = pd.DataFrame(data, columns=feature_names, copy=False) + else: + data_df = pd.DataFrame.sparse.from_spmatrix(data, columns=feature_names) + + target_df = pd.DataFrame(target, columns=target_names) + combined_df = pd.concat([data_df, target_df], axis=1) + X = combined_df[feature_names] + y = combined_df[target_names] + if y.shape[1] == 1: + y = y.iloc[:, 0] + return combined_df, X, y + + +@validate_params( + { + "container_path": [str, os.PathLike], + "description": [str, None], + "categories": [list, None], + "load_content": ["boolean"], + "shuffle": ["boolean"], + "encoding": [str, None], + "decode_error": [StrOptions({"strict", "ignore", "replace"})], + "random_state": ["random_state"], + "allowed_extensions": [list, None], + }, + prefer_skip_nested_validation=True, +) +def load_files( + container_path, + *, + description=None, + categories=None, + load_content=True, + shuffle=True, + encoding=None, + decode_error="strict", + random_state=0, + allowed_extensions=None, +): + """Load text files with categories as subfolder names. + + Individual samples are assumed to be files stored a two levels folder + structure such as the following: + + .. code-block:: text + + container_folder/ + category_1_folder/ + file_1.txt + file_2.txt + ... + file_42.txt + category_2_folder/ + file_43.txt + file_44.txt + ... + + The folder names are used as supervised signal label names. The individual + file names are not important. + + This function does not try to extract features into a numpy array or scipy + sparse matrix. In addition, if load_content is false it does not try to + load the files in memory. + + To use text files in a scikit-learn classification or clustering algorithm, + you will need to use the :mod:`~sklearn.feature_extraction.text` module to + build a feature extraction transformer that suits your problem. + + If you set load_content=True, you should also specify the encoding of the + text using the 'encoding' parameter. For many modern text files, 'utf-8' + will be the correct encoding. If you leave encoding equal to None, then the + content will be made of bytes instead of Unicode, and you will not be able + to use most functions in :mod:`~sklearn.feature_extraction.text`. + + Similar feature extractors should be built for other kind of unstructured + data input such as images, audio, video, ... + + If you want files with a specific file extension (e.g. `.txt`) then you + can pass a list of those file extensions to `allowed_extensions`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + container_path : str + Path to the main folder holding one subfolder per category. + + description : str, default=None + A paragraph describing the characteristic of the dataset: its source, + reference, etc. + + categories : list of str, default=None + If None (default), load all the categories. If not None, list of + category names to load (other categories ignored). + + load_content : bool, default=True + Whether to load or not the content of the different files. If true a + 'data' attribute containing the text information is present in the data + structure returned. 
If not, a filenames attribute gives the path to the + files. + + shuffle : bool, default=True + Whether or not to shuffle the data: might be important for models that + make the assumption that the samples are independent and identically + distributed (i.i.d.), such as stochastic gradient descent. + + encoding : str, default=None + If None, do not try to decode the content of the files (e.g. for images + or other non-text content). If not None, encoding to use to decode text + files to Unicode if load_content is True. + + decode_error : {'strict', 'ignore', 'replace'}, default='strict' + Instruction on what to do if a byte sequence is given to analyze that + contains characters not of the given `encoding`. Passed as keyword + argument 'errors' to bytes.decode. + + random_state : int, RandomState instance or None, default=0 + Determines random number generation for dataset shuffling. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + allowed_extensions : list of str, default=None + List of desired file extensions to filter the files to be loaded. + + Returns + ------- + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data : list of str + Only present when `load_content=True`. + The raw text data to learn. + target : ndarray + The target labels (integer index). + target_names : list + The names of target classes. + DESCR : str + The full description of the dataset. + filenames: ndarray + The filenames holding the dataset. + + Examples + -------- + >>> from sklearn.datasets import load_files + >>> container_path = "./" + >>> load_files(container_path) # doctest: +SKIP + """ + + target = [] + target_names = [] + filenames = [] + + folders = [ + f for f in sorted(listdir(container_path)) if isdir(join(container_path, f)) + ] + + if categories is not None: + folders = [f for f in folders if f in categories] + + if allowed_extensions is not None: + allowed_extensions = frozenset(allowed_extensions) + + for label, folder in enumerate(folders): + target_names.append(folder) + folder_path = join(container_path, folder) + files = sorted(listdir(folder_path)) + if allowed_extensions is not None: + documents = [ + join(folder_path, file) + for file in files + if os.path.splitext(file)[1] in allowed_extensions + ] + else: + documents = [join(folder_path, file) for file in files] + target.extend(len(documents) * [label]) + filenames.extend(documents) + + # convert to array for fancy indexing + filenames = np.array(filenames) + target = np.array(target) + + if shuffle: + random_state = check_random_state(random_state) + indices = np.arange(filenames.shape[0]) + random_state.shuffle(indices) + filenames = filenames[indices] + target = target[indices] + + if load_content: + data = [] + for filename in filenames: + data.append(Path(filename).read_bytes()) + if encoding is not None: + data = [d.decode(encoding, decode_error) for d in data] + return Bunch( + data=data, + filenames=filenames, + target_names=target_names, + target=target, + DESCR=description, + ) + + return Bunch( + filenames=filenames, target_names=target_names, target=target, DESCR=description + ) + + +def load_csv_data( + data_file_name, + *, + data_module=DATA_MODULE, + descr_file_name=None, + descr_module=DESCR_MODULE, + encoding="utf-8", +): + """Loads `data_file_name` from `data_module with `importlib.resources`. + + Parameters + ---------- + data_file_name : str + Name of csv file to be loaded from `data_module/data_file_name`. 
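# --- Editorial usage sketch; not part of the vendored scikit-learn file. ---
# End-to-end use of load_files (defined above) on a throwaway two-category
# directory tree; the folder layout and file names are hypothetical.
import os
import tempfile

from sklearn.datasets import load_files

root = tempfile.mkdtemp()
for category, text in [("neg", "dull and slow"), ("pos", "sharp and fun")]:
    os.makedirs(os.path.join(root, category), exist_ok=True)
    with open(os.path.join(root, category, "review.txt"), "w", encoding="utf-8") as fh:
        fh.write(text)

reviews = load_files(root, encoding="utf-8", allowed_extensions=[".txt"])
print(reviews.target_names)   # folder names become the class labels: ['neg', 'pos']
print(reviews.data[0])        # decoded text of the first (shuffled) sample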
+ For example `'wine_data.csv'`. + + data_module : str or module, default='sklearn.datasets.data' + Module where data lives. The default is `'sklearn.datasets.data'`. + + descr_file_name : str, default=None + Name of rst file to be loaded from `descr_module/descr_file_name`. + For example `'wine_data.rst'`. See also :func:`load_descr`. + If not None, also returns the corresponding description of + the dataset. + + descr_module : str or module, default='sklearn.datasets.descr' + Module where `descr_file_name` lives. See also :func:`load_descr`. + The default is `'sklearn.datasets.descr'`. + + Returns + ------- + data : ndarray of shape (n_samples, n_features) + A 2D array with each row representing one sample and each column + representing the features of a given sample. + + target : ndarry of shape (n_samples,) + A 1D array holding target variables for all the samples in `data`. + For example target[0] is the target variable for data[0]. + + target_names : ndarry of shape (n_samples,) + A 1D array containing the names of the classifications. For example + target_names[0] is the name of the target[0] class. + + descr : str, optional + Description of the dataset (the content of `descr_file_name`). + Only returned if `descr_file_name` is not None. + + encoding : str, optional + Text encoding of the CSV file. + + .. versionadded:: 1.4 + """ + data_path = resources.files(data_module) / data_file_name + with data_path.open("r", encoding="utf-8") as csv_file: + data_file = csv.reader(csv_file) + temp = next(data_file) + n_samples = int(temp[0]) + n_features = int(temp[1]) + target_names = np.array(temp[2:]) + data = np.empty((n_samples, n_features)) + target = np.empty((n_samples,), dtype=int) + + for i, ir in enumerate(data_file): + data[i] = np.asarray(ir[:-1], dtype=np.float64) + target[i] = np.asarray(ir[-1], dtype=int) + + if descr_file_name is None: + return data, target, target_names + else: + assert descr_module is not None + descr = load_descr(descr_module=descr_module, descr_file_name=descr_file_name) + return data, target, target_names, descr + + +def load_gzip_compressed_csv_data( + data_file_name, + *, + data_module=DATA_MODULE, + descr_file_name=None, + descr_module=DESCR_MODULE, + encoding="utf-8", + **kwargs, +): + """Loads gzip-compressed with `importlib.resources`. + + 1) Open resource file with `importlib.resources.open_binary` + 2) Decompress file obj with `gzip.open` + 3) Load decompressed data with `np.loadtxt` + + Parameters + ---------- + data_file_name : str + Name of gzip-compressed csv file (`'*.csv.gz'`) to be loaded from + `data_module/data_file_name`. For example `'diabetes_data.csv.gz'`. + + data_module : str or module, default='sklearn.datasets.data' + Module where data lives. The default is `'sklearn.datasets.data'`. + + descr_file_name : str, default=None + Name of rst file to be loaded from `descr_module/descr_file_name`. + For example `'wine_data.rst'`. See also :func:`load_descr`. + If not None, also returns the corresponding description of + the dataset. + + descr_module : str or module, default='sklearn.datasets.descr' + Module where `descr_file_name` lives. See also :func:`load_descr`. + The default is `'sklearn.datasets.descr'`. + + encoding : str, default="utf-8" + Name of the encoding that the gzip-decompressed file will be + decoded with. The default is 'utf-8'. + + **kwargs : dict, optional + Keyword arguments to be passed to `np.loadtxt`; + e.g. delimiter=','. 
+ + Returns + ------- + data : ndarray of shape (n_samples, n_features) + A 2D array with each row representing one sample and each column + representing the features and/or target of a given sample. + + descr : str, optional + Description of the dataset (the content of `descr_file_name`). + Only returned if `descr_file_name` is not None. + """ + data_path = resources.files(data_module) / data_file_name + with data_path.open("rb") as compressed_file: + compressed_file = gzip.open(compressed_file, mode="rt", encoding=encoding) + data = np.loadtxt(compressed_file, **kwargs) + + if descr_file_name is None: + return data + else: + assert descr_module is not None + descr = load_descr(descr_module=descr_module, descr_file_name=descr_file_name) + return data, descr + + +def load_descr(descr_file_name, *, descr_module=DESCR_MODULE, encoding="utf-8"): + """Load `descr_file_name` from `descr_module` with `importlib.resources`. + + Parameters + ---------- + descr_file_name : str, default=None + Name of rst file to be loaded from `descr_module/descr_file_name`. + For example `'wine_data.rst'`. See also :func:`load_descr`. + If not None, also returns the corresponding description of + the dataset. + + descr_module : str or module, default='sklearn.datasets.descr' + Module where `descr_file_name` lives. See also :func:`load_descr`. + The default is `'sklearn.datasets.descr'`. + + encoding : str, default="utf-8" + Name of the encoding that `descr_file_name` will be decoded with. + The default is 'utf-8'. + + .. versionadded:: 1.4 + + Returns + ------- + fdescr : str + Content of `descr_file_name`. + """ + path = resources.files(descr_module) / descr_file_name + return path.read_text(encoding=encoding) + + +@validate_params( + { + "return_X_y": ["boolean"], + "as_frame": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_wine(*, return_X_y=False, as_frame=False): + """Load and return the wine dataset (classification). + + .. versionadded:: 0.18 + + The wine dataset is a classic and very easy multi-class classification + dataset. + + ================= ============== + Classes 3 + Samples per class [59,71,48] + Samples total 178 + Dimensionality 13 + Features real, positive + ================= ============== + + The copy of UCI ML Wine Data Set dataset is downloaded and modified to fit + standard format from: + https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + return_X_y : bool, default=False + If True, returns ``(data, target)`` instead of a Bunch object. + See below for more information about the `data` and `target` object. + + as_frame : bool, default=False + If True, the data is a pandas DataFrame including columns with + appropriate dtypes (numeric). The target is + a pandas DataFrame or Series depending on the number of target columns. + If `return_X_y` is True, then (`data`, `target`) will be pandas + DataFrames or Series as described below. + + .. versionadded:: 0.23 + + Returns + ------- + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data : {ndarray, dataframe} of shape (178, 13) + The data matrix. If `as_frame=True`, `data` will be a pandas + DataFrame. + target: {ndarray, Series} of shape (178,) + The classification target. If `as_frame=True`, `target` will be + a pandas Series. + feature_names: list + The names of the dataset columns. + target_names: list + The names of target classes. 
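# --- Editorial usage sketch; not part of the vendored scikit-learn file. ---
# The three access patterns offered by load_wine (and the other small loaders
# in this module); the as_frame variant assumes pandas is installed.
from sklearn.datasets import load_wine

wine = load_wine()                            # Bunch: .data, .target, .DESCR, ...
X, y = load_wine(return_X_y=True)             # plain NumPy arrays
X_df, y_ser = load_wine(return_X_y=True, as_frame=True)  # pandas DataFrame / Series
print(X.shape, y.shape)                       # (178, 13) (178,)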
+ frame: DataFrame of shape (178, 14) + Only present when `as_frame=True`. DataFrame with `data` and + `target`. + + .. versionadded:: 0.23 + DESCR: str + The full description of the dataset. + + (data, target) : tuple if ``return_X_y`` is True + A tuple of two ndarrays by default. The first contains a 2D array of shape + (178, 13) with each row representing one sample and each column representing + the features. The second array of shape (178,) contains the target samples. + + Examples + -------- + Let's say you are interested in the samples 10, 80, and 140, and want to + know their class name. + + >>> from sklearn.datasets import load_wine + >>> data = load_wine() + >>> data.target[[10, 80, 140]] + array([0, 1, 2]) + >>> list(data.target_names) + [np.str_('class_0'), np.str_('class_1'), np.str_('class_2')] + """ + + data, target, target_names, fdescr = load_csv_data( + data_file_name="wine_data.csv", descr_file_name="wine_data.rst" + ) + + feature_names = [ + "alcohol", + "malic_acid", + "ash", + "alcalinity_of_ash", + "magnesium", + "total_phenols", + "flavanoids", + "nonflavanoid_phenols", + "proanthocyanins", + "color_intensity", + "hue", + "od280/od315_of_diluted_wines", + "proline", + ] + + frame = None + target_columns = [ + "target", + ] + if as_frame: + frame, data, target = _convert_data_dataframe( + "load_wine", data, target, feature_names, target_columns + ) + + if return_X_y: + return data, target + + return Bunch( + data=data, + target=target, + frame=frame, + target_names=target_names, + DESCR=fdescr, + feature_names=feature_names, + ) + + +@validate_params( + {"return_X_y": ["boolean"], "as_frame": ["boolean"]}, + prefer_skip_nested_validation=True, +) +def load_iris(*, return_X_y=False, as_frame=False): + """Load and return the iris dataset (classification). + + The iris dataset is a classic and very easy multi-class classification + dataset. + + ================= ============== + Classes 3 + Samples per class 50 + Samples total 150 + Dimensionality 4 + Features real, positive + ================= ============== + + Read more in the :ref:`User Guide `. + + .. versionchanged:: 0.20 + Fixed two wrong data points according to Fisher's paper. + The new version is the same as in R, but not as in the UCI + Machine Learning Repository. + + Parameters + ---------- + return_X_y : bool, default=False + If True, returns ``(data, target)`` instead of a Bunch object. See + below for more information about the `data` and `target` object. + + .. versionadded:: 0.18 + + as_frame : bool, default=False + If True, the data is a pandas DataFrame including columns with + appropriate dtypes (numeric). The target is + a pandas DataFrame or Series depending on the number of target columns. + If `return_X_y` is True, then (`data`, `target`) will be pandas + DataFrames or Series as described below. + + .. versionadded:: 0.23 + + Returns + ------- + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data : {ndarray, dataframe} of shape (150, 4) + The data matrix. If `as_frame=True`, `data` will be a pandas + DataFrame. + target: {ndarray, Series} of shape (150,) + The classification target. If `as_frame=True`, `target` will be + a pandas Series. + feature_names: list + The names of the dataset columns. + target_names: ndarray of shape (3, ) + The names of target classes. + frame: DataFrame of shape (150, 5) + Only present when `as_frame=True`. DataFrame with `data` and + `target`. + + .. 
versionadded:: 0.23 + DESCR: str + The full description of the dataset. + filename: str + The path to the location of the data. + + .. versionadded:: 0.20 + + (data, target) : tuple if ``return_X_y`` is True + A tuple of two ndarray. The first containing a 2D array of shape + (n_samples, n_features) with each row representing one sample and + each column representing the features. The second ndarray of shape + (n_samples,) containing the target samples. + + .. versionadded:: 0.18 + + Examples + -------- + Let's say you are interested in the samples 10, 25, and 50, and want to + know their class name. + + >>> from sklearn.datasets import load_iris + >>> data = load_iris() + >>> data.target[[10, 25, 50]] + array([0, 0, 1]) + >>> list(data.target_names) + [np.str_('setosa'), np.str_('versicolor'), np.str_('virginica')] + + See :ref:`sphx_glr_auto_examples_decomposition_plot_pca_iris.py` for a more + detailed example of how to work with the iris dataset. + """ + data_file_name = "iris.csv" + data, target, target_names, fdescr = load_csv_data( + data_file_name=data_file_name, descr_file_name="iris.rst" + ) + + feature_names = [ + "sepal length (cm)", + "sepal width (cm)", + "petal length (cm)", + "petal width (cm)", + ] + + frame = None + target_columns = [ + "target", + ] + if as_frame: + frame, data, target = _convert_data_dataframe( + "load_iris", data, target, feature_names, target_columns + ) + + if return_X_y: + return data, target + + return Bunch( + data=data, + target=target, + frame=frame, + target_names=target_names, + DESCR=fdescr, + feature_names=feature_names, + filename=data_file_name, + data_module=DATA_MODULE, + ) + + +@validate_params( + {"return_X_y": ["boolean"], "as_frame": ["boolean"]}, + prefer_skip_nested_validation=True, +) +def load_breast_cancer(*, return_X_y=False, as_frame=False): + """Load and return the breast cancer Wisconsin dataset (classification). + + The breast cancer dataset is a classic and very easy binary classification + dataset. + + ================= ============== + Classes 2 + Samples per class 212(M),357(B) + Samples total 569 + Dimensionality 30 + Features real, positive + ================= ============== + + The copy of UCI ML Breast Cancer Wisconsin (Diagnostic) dataset is + downloaded from: + https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + return_X_y : bool, default=False + If True, returns ``(data, target)`` instead of a Bunch object. + See below for more information about the `data` and `target` object. + + .. versionadded:: 0.18 + + as_frame : bool, default=False + If True, the data is a pandas DataFrame including columns with + appropriate dtypes (numeric). The target is + a pandas DataFrame or Series depending on the number of target columns. + If `return_X_y` is True, then (`data`, `target`) will be pandas + DataFrames or Series as described below. + + .. versionadded:: 0.23 + + Returns + ------- + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data : {ndarray, dataframe} of shape (569, 30) + The data matrix. If `as_frame=True`, `data` will be a pandas + DataFrame. + target : {ndarray, Series} of shape (569,) + The classification target. If `as_frame=True`, `target` will be + a pandas Series. + feature_names : ndarray of shape (30,) + The names of the dataset columns. + target_names : ndarray of shape (2,) + The names of target classes. 
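# --- Editorial usage sketch; not part of the vendored scikit-learn file. ---
# Mapping integer targets back to class names with the iris loader defined above.
from sklearn.datasets import load_iris

iris = load_iris()
print(iris.feature_names[:2])                                # first two column names
print([str(iris.target_names[t]) for t in iris.target[:3]])  # ['setosa', 'setosa', 'setosa']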
+ frame : DataFrame of shape (569, 31) + Only present when `as_frame=True`. DataFrame with `data` and + `target`. + + .. versionadded:: 0.23 + DESCR : str + The full description of the dataset. + filename : str + The path to the location of the data. + + .. versionadded:: 0.20 + + (data, target) : tuple if ``return_X_y`` is True + A tuple of two ndarrays by default. The first contains a 2D ndarray of + shape (569, 30) with each row representing one sample and each column + representing the features. The second ndarray of shape (569,) contains + the target samples. If `as_frame=True`, both arrays are pandas objects, + i.e. `X` a dataframe and `y` a series. + + .. versionadded:: 0.18 + + Examples + -------- + Let's say you are interested in the samples 10, 50, and 85, and want to + know their class name. + + >>> from sklearn.datasets import load_breast_cancer + >>> data = load_breast_cancer() + >>> data.target[[10, 50, 85]] + array([0, 1, 0]) + >>> list(data.target_names) + [np.str_('malignant'), np.str_('benign')] + """ + data_file_name = "breast_cancer.csv" + data, target, target_names, fdescr = load_csv_data( + data_file_name=data_file_name, descr_file_name="breast_cancer.rst" + ) + + feature_names = np.array( + [ + "mean radius", + "mean texture", + "mean perimeter", + "mean area", + "mean smoothness", + "mean compactness", + "mean concavity", + "mean concave points", + "mean symmetry", + "mean fractal dimension", + "radius error", + "texture error", + "perimeter error", + "area error", + "smoothness error", + "compactness error", + "concavity error", + "concave points error", + "symmetry error", + "fractal dimension error", + "worst radius", + "worst texture", + "worst perimeter", + "worst area", + "worst smoothness", + "worst compactness", + "worst concavity", + "worst concave points", + "worst symmetry", + "worst fractal dimension", + ] + ) + + frame = None + target_columns = [ + "target", + ] + if as_frame: + frame, data, target = _convert_data_dataframe( + "load_breast_cancer", data, target, feature_names, target_columns + ) + + if return_X_y: + return data, target + + return Bunch( + data=data, + target=target, + frame=frame, + target_names=target_names, + DESCR=fdescr, + feature_names=feature_names, + filename=data_file_name, + data_module=DATA_MODULE, + ) + + +@validate_params( + { + "n_class": [Interval(Integral, 1, 10, closed="both")], + "return_X_y": ["boolean"], + "as_frame": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_digits(*, n_class=10, return_X_y=False, as_frame=False): + """Load and return the digits dataset (classification). + + Each datapoint is a 8x8 image of a digit. + + ================= ============== + Classes 10 + Samples per class ~180 + Samples total 1797 + Dimensionality 64 + Features integers 0-16 + ================= ============== + + This is a copy of the test set of the UCI ML hand-written digits datasets + https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_class : int, default=10 + The number of classes to return. Between 0 and 10. + + return_X_y : bool, default=False + If True, returns ``(data, target)`` instead of a Bunch object. + See below for more information about the `data` and `target` object. + + .. versionadded:: 0.18 + + as_frame : bool, default=False + If True, the data is a pandas DataFrame including columns with + appropriate dtypes (numeric). 
The target is + a pandas DataFrame or Series depending on the number of target columns. + If `return_X_y` is True, then (`data`, `target`) will be pandas + DataFrames or Series as described below. + + .. versionadded:: 0.23 + + Returns + ------- + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data : {ndarray, dataframe} of shape (1797, 64) + The flattened data matrix. If `as_frame=True`, `data` will be + a pandas DataFrame. + target: {ndarray, Series} of shape (1797,) + The classification target. If `as_frame=True`, `target` will be + a pandas Series. + feature_names: list + The names of the dataset columns. + target_names: list + The names of target classes. + + .. versionadded:: 0.20 + + frame: DataFrame of shape (1797, 65) + Only present when `as_frame=True`. DataFrame with `data` and + `target`. + + .. versionadded:: 0.23 + images: {ndarray} of shape (1797, 8, 8) + The raw image data. + DESCR: str + The full description of the dataset. + + (data, target) : tuple if ``return_X_y`` is True + A tuple of two ndarrays by default. The first contains a 2D ndarray of + shape (1797, 64) with each row representing one sample and each column + representing the features. The second ndarray of shape (1797) contains + the target samples. If `as_frame=True`, both arrays are pandas objects, + i.e. `X` a dataframe and `y` a series. + + .. versionadded:: 0.18 + + Examples + -------- + To load the data and visualize the images:: + + >>> from sklearn.datasets import load_digits + >>> digits = load_digits() + >>> print(digits.data.shape) + (1797, 64) + >>> import matplotlib.pyplot as plt + >>> plt.matshow(digits.images[0], cmap="gray") + <...> + >>> plt.show() + """ + + data, fdescr = load_gzip_compressed_csv_data( + data_file_name="digits.csv.gz", descr_file_name="digits.rst", delimiter="," + ) + + target = data[:, -1].astype(int, copy=False) + flat_data = data[:, :-1] + images = flat_data.view() + images.shape = (-1, 8, 8) + + if n_class < 10: + idx = target < n_class + flat_data, target = flat_data[idx], target[idx] + images = images[idx] + + feature_names = [ + "pixel_{}_{}".format(row_idx, col_idx) + for row_idx in range(8) + for col_idx in range(8) + ] + + frame = None + target_columns = [ + "target", + ] + if as_frame: + frame, flat_data, target = _convert_data_dataframe( + "load_digits", flat_data, target, feature_names, target_columns + ) + + if return_X_y: + return flat_data, target + + return Bunch( + data=flat_data, + target=target, + frame=frame, + feature_names=feature_names, + target_names=np.arange(10), + images=images, + DESCR=fdescr, + ) + + +@validate_params( + {"return_X_y": ["boolean"], "as_frame": ["boolean"], "scaled": ["boolean"]}, + prefer_skip_nested_validation=True, +) +def load_diabetes(*, return_X_y=False, as_frame=False, scaled=True): + """Load and return the diabetes dataset (regression). + + ============== ================== + Samples total 442 + Dimensionality 10 + Features real, -.2 < x < .2 + Targets integer 25 - 346 + ============== ================== + + .. note:: + The meaning of each feature (i.e. `feature_names`) might be unclear + (especially for `ltg`) as the documentation of the original dataset is + not explicit. We provide information that seems correct in regard with + the scientific literature in this field of research. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + return_X_y : bool, default=False + If True, returns ``(data, target)`` instead of a Bunch object. 
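# --- Editorial usage sketch; not part of the vendored scikit-learn file. ---
# For load_digits above: the flattened rows in `data` hold the same pixels as
# the 8x8 arrays in `images`, and `n_class` filters both consistently.
import numpy as np

from sklearn.datasets import load_digits

digits = load_digits(n_class=3)               # keep only digits 0, 1 and 2
print(digits.data.shape)                      # (n_samples, 64)
print(np.array_equal(digits.images[0], digits.data[0].reshape(8, 8)))  # True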
+ See below for more information about the `data` and `target` object. + + .. versionadded:: 0.18 + + as_frame : bool, default=False + If True, the data is a pandas DataFrame including columns with + appropriate dtypes (numeric). The target is + a pandas DataFrame or Series depending on the number of target columns. + If `return_X_y` is True, then (`data`, `target`) will be pandas + DataFrames or Series as described below. + + .. versionadded:: 0.23 + + scaled : bool, default=True + If True, the feature variables are mean centered and scaled by the + standard deviation times the square root of `n_samples`. + If False, raw data is returned for the feature variables. + + .. versionadded:: 1.1 + + Returns + ------- + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data : {ndarray, dataframe} of shape (442, 10) + The data matrix. If `as_frame=True`, `data` will be a pandas + DataFrame. + target: {ndarray, Series} of shape (442,) + The regression target. If `as_frame=True`, `target` will be + a pandas Series. + feature_names: list + The names of the dataset columns. + frame: DataFrame of shape (442, 11) + Only present when `as_frame=True`. DataFrame with `data` and + `target`. + + .. versionadded:: 0.23 + DESCR: str + The full description of the dataset. + data_filename: str + The path to the location of the data. + target_filename: str + The path to the location of the target. + + (data, target) : tuple if ``return_X_y`` is True + Returns a tuple of two ndarray of shape (n_samples, n_features) + A 2D array with each row representing one sample and each column + representing the features and/or target of a given sample. + + .. versionadded:: 0.18 + + Examples + -------- + >>> from sklearn.datasets import load_diabetes + >>> diabetes = load_diabetes() + >>> diabetes.target[:3] + array([151., 75., 141.]) + >>> diabetes.data.shape + (442, 10) + """ + data_filename = "diabetes_data_raw.csv.gz" + target_filename = "diabetes_target.csv.gz" + data = load_gzip_compressed_csv_data(data_filename) + target = load_gzip_compressed_csv_data(target_filename) + + if scaled: + data = scale(data, copy=False) + data /= data.shape[0] ** 0.5 + + fdescr = load_descr("diabetes.rst") + + feature_names = ["age", "sex", "bmi", "bp", "s1", "s2", "s3", "s4", "s5", "s6"] + + frame = None + target_columns = [ + "target", + ] + if as_frame: + frame, data, target = _convert_data_dataframe( + "load_diabetes", data, target, feature_names, target_columns + ) + + if return_X_y: + return data, target + + return Bunch( + data=data, + target=target, + frame=frame, + DESCR=fdescr, + feature_names=feature_names, + data_filename=data_filename, + target_filename=target_filename, + data_module=DATA_MODULE, + ) + + +@validate_params( + { + "return_X_y": ["boolean"], + "as_frame": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_linnerud(*, return_X_y=False, as_frame=False): + """Load and return the physical exercise Linnerud dataset. + + This dataset is suitable for multi-output regression tasks. + + ============== ============================ + Samples total 20 + Dimensionality 3 (for both data and target) + Features integer + Targets integer + ============== ============================ + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + return_X_y : bool, default=False + If True, returns ``(data, target)`` instead of a Bunch object. + See below for more information about the `data` and `target` object. + + .. 
versionadded:: 0.18 + + as_frame : bool, default=False + If True, the data is a pandas DataFrame including columns with + appropriate dtypes (numeric, string or categorical). The target is + a pandas DataFrame or Series depending on the number of target columns. + If `return_X_y` is True, then (`data`, `target`) will be pandas + DataFrames or Series as described below. + + .. versionadded:: 0.23 + + Returns + ------- + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data : {ndarray, dataframe} of shape (20, 3) + The data matrix. If `as_frame=True`, `data` will be a pandas + DataFrame. + target: {ndarray, dataframe} of shape (20, 3) + The regression targets. If `as_frame=True`, `target` will be + a pandas DataFrame. + feature_names: list + The names of the dataset columns. + target_names: list + The names of the target columns. + frame: DataFrame of shape (20, 6) + Only present when `as_frame=True`. DataFrame with `data` and + `target`. + + .. versionadded:: 0.23 + DESCR: str + The full description of the dataset. + data_filename: str + The path to the location of the data. + target_filename: str + The path to the location of the target. + + .. versionadded:: 0.20 + + (data, target) : tuple if ``return_X_y`` is True + Returns a tuple of two ndarrays or dataframe of shape + `(20, 3)`. Each row represents one sample and each column represents the + features in `X` and a target in `y` of a given sample. + + .. versionadded:: 0.18 + + Examples + -------- + >>> from sklearn.datasets import load_linnerud + >>> linnerud = load_linnerud() + >>> linnerud.data.shape + (20, 3) + >>> linnerud.target.shape + (20, 3) + """ + data_filename = "linnerud_exercise.csv" + target_filename = "linnerud_physiological.csv" + + data_module_path = resources.files(DATA_MODULE) + # Read header and data + data_path = data_module_path / data_filename + with data_path.open("r", encoding="utf-8") as f: + header_exercise = f.readline().split() + f.seek(0) # reset file obj + data_exercise = np.loadtxt(f, skiprows=1) + + target_path = data_module_path / target_filename + with target_path.open("r", encoding="utf-8") as f: + header_physiological = f.readline().split() + f.seek(0) # reset file obj + data_physiological = np.loadtxt(f, skiprows=1) + + fdescr = load_descr("linnerud.rst") + + frame = None + if as_frame: + (frame, data_exercise, data_physiological) = _convert_data_dataframe( + "load_linnerud", + data_exercise, + data_physiological, + header_exercise, + header_physiological, + ) + if return_X_y: + return data_exercise, data_physiological + + return Bunch( + data=data_exercise, + feature_names=header_exercise, + target=data_physiological, + target_names=header_physiological, + frame=frame, + DESCR=fdescr, + data_filename=data_filename, + target_filename=target_filename, + data_module=DATA_MODULE, + ) + + +def load_sample_images(): + """Load sample images for image manipulation. + + Loads both, ``china`` and ``flower``. + + Read more in the :ref:`User Guide `. + + Returns + ------- + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + images : list of ndarray of shape (427, 640, 3) + The two sample image. + filenames : list + The filenames for the images. + DESCR : str + The full description of the dataset. 
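# --- Editorial usage sketch; not part of the vendored scikit-learn file. ---
# load_diabetes(scaled=False) returns the raw measurements, while the default
# applies the centring/scaling shown above; load_linnerud (also defined above)
# is the small multi-output regression set.
import numpy as np

from sklearn.datasets import load_diabetes, load_linnerud

raw = load_diabetes(scaled=False)
std = load_diabetes()                                   # default: scaled=True
print(raw.data[:, 0].min(), raw.data[:, 0].max())       # age column in raw units
print(np.allclose(std.data.mean(axis=0), 0.0))          # True: columns are centred

linnerud = load_linnerud()
print(linnerud.data.shape, linnerud.target.shape)       # (20, 3) (20, 3)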
+ + Examples + -------- + To load the data and visualize the images: + + >>> from sklearn.datasets import load_sample_images + >>> dataset = load_sample_images() #doctest: +SKIP + >>> len(dataset.images) #doctest: +SKIP + 2 + >>> first_img_data = dataset.images[0] #doctest: +SKIP + >>> first_img_data.shape #doctest: +SKIP + (427, 640, 3) + >>> first_img_data.dtype #doctest: +SKIP + dtype('uint8') + """ + try: + from PIL import Image + except ImportError: + raise ImportError( + "The Python Imaging Library (PIL) is required to load data " + "from jpeg files. Please refer to " + "https://pillow.readthedocs.io/en/stable/installation.html " + "for installing PIL." + ) + + descr = load_descr("README.txt", descr_module=IMAGES_MODULE) + + filenames, images = [], [] + + jpg_paths = sorted( + resource + for resource in resources.files(IMAGES_MODULE).iterdir() + if resource.is_file() and resource.match("*.jpg") + ) + + for path in jpg_paths: + filenames.append(str(path)) + with path.open("rb") as image_file: + pil_image = Image.open(image_file) + image = np.asarray(pil_image) + images.append(image) + + return Bunch(images=images, filenames=filenames, DESCR=descr) + + +@validate_params( + { + "image_name": [StrOptions({"china.jpg", "flower.jpg"})], + }, + prefer_skip_nested_validation=True, +) +def load_sample_image(image_name): + """Load the numpy array of a single sample image. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + image_name : {`china.jpg`, `flower.jpg`} + The name of the sample image loaded. + + Returns + ------- + img : 3D array + The image as a numpy array: height x width x color. + + Examples + -------- + + >>> from sklearn.datasets import load_sample_image + >>> china = load_sample_image('china.jpg') # doctest: +SKIP + >>> china.dtype # doctest: +SKIP + dtype('uint8') + >>> china.shape # doctest: +SKIP + (427, 640, 3) + >>> flower = load_sample_image('flower.jpg') # doctest: +SKIP + >>> flower.dtype # doctest: +SKIP + dtype('uint8') + >>> flower.shape # doctest: +SKIP + (427, 640, 3) + """ + images = load_sample_images() + index = None + for i, filename in enumerate(images.filenames): + if filename.endswith(image_name): + index = i + break + if index is None: + raise AttributeError("Cannot find sample image: %s" % image_name) + return images.images[index] + + +def _pkl_filepath(*args, **kwargs): + """Return filename for Python 3 pickles + + args[-1] is expected to be the ".pkl" filename. For compatibility with + older scikit-learn versions, a suffix is inserted before the extension. + + _pkl_filepath('/path/to/folder', 'filename.pkl') returns + '/path/to/folder/filename_py3.pkl' + + """ + py3_suffix = kwargs.get("py3_suffix", "_py3") + basename, ext = splitext(args[-1]) + basename += py3_suffix + new_args = args[:-1] + (basename + ext,) + return join(*new_args) + + +def _sha256(path): + """Calculate the sha256 hash of the file at path.""" + sha256hash = hashlib.sha256() + chunk_size = 8192 + with open(path, "rb") as f: + while True: + buffer = f.read(chunk_size) + if not buffer: + break + sha256hash.update(buffer) + return sha256hash.hexdigest() + + +def _fetch_remote(remote, dirname=None, n_retries=3, delay=1): + """Helper function to download a remote dataset. + + Fetch a dataset pointed by remote's url, save into path using remote's + filename and ensure its integrity based on the SHA256 checksum of the + downloaded file. + + .. 
versionchanged:: 1.6 + + If the file already exists locally and the SHA256 checksums match, the + path to the local file is returned without re-downloading. + + Parameters + ---------- + remote : RemoteFileMetadata + Named tuple containing remote dataset meta information: url, filename + and checksum. + + dirname : str or Path, default=None + Directory to save the file to. If None, the current working directory + is used. + + n_retries : int, default=3 + Number of retries when HTTP errors are encountered. + + .. versionadded:: 1.5 + + delay : int, default=1 + Number of seconds between retries. + + .. versionadded:: 1.5 + + Returns + ------- + file_path: Path + Full path of the created file. + """ + if dirname is None: + folder_path = Path(".") + else: + folder_path = Path(dirname) + + file_path = folder_path / remote.filename + + if file_path.exists(): + if remote.checksum is None: + return file_path + + checksum = _sha256(file_path) + if checksum == remote.checksum: + return file_path + else: + warnings.warn( + f"SHA256 checksum of existing local file {file_path.name} " + f"({checksum}) differs from expected ({remote.checksum}): " + f"re-downloading from {remote.url} ." + ) + + # We create a temporary file dedicated to this particular download to avoid + # conflicts with parallel downloads. If the download is successful, the + # temporary file is atomically renamed to the final file path (with + # `shutil.move`). We therefore pass `delete=False` to `NamedTemporaryFile`. + # Otherwise, garbage collecting temp_file would raise an error when + # attempting to delete a file that was already renamed. If the download + # fails or the result does not match the expected SHA256 digest, the + # temporary file is removed manually in the except block. + temp_file = NamedTemporaryFile( + prefix=remote.filename + ".part_", dir=folder_path, delete=False + ) + # Note that Python 3.12's `delete_on_close=True` is ignored as we set + # `delete=False` explicitly. So after this line the empty temporary file still + # exists on disk to make sure that it's uniquely reserved for this specific call of + # `_fetch_remote` and therefore it protects against any corruption by parallel + # calls. + temp_file.close() + try: + temp_file_path = Path(temp_file.name) + while True: + try: + urlretrieve(remote.url, temp_file_path) + break + except (URLError, TimeoutError): + if n_retries == 0: + # If no more retries are left, re-raise the caught exception. + raise + warnings.warn(f"Retry downloading from url: {remote.url}") + n_retries -= 1 + time.sleep(delay) + + checksum = _sha256(temp_file_path) + if remote.checksum is not None and remote.checksum != checksum: + raise OSError( + f"The SHA256 checksum of {remote.filename} ({checksum}) " + f"differs from expected ({remote.checksum})." + ) + except (Exception, KeyboardInterrupt): + os.unlink(temp_file.name) + raise + + # The following renaming is atomic whenever temp_file_path and + # file_path are on the same filesystem. This should be the case most of + # the time, but we still use shutil.move instead of os.rename in case + # they are not. + shutil.move(temp_file_path, file_path) + + return file_path + + +def _filter_filename(value, filter_dots=True): + """Derive a name that is safe to use as filename from the given string. + + Adapted from the `slugify` function of django: + https://github.com/django/django/blob/master/django/utils/text.py + + Convert spaces or repeated dashes to single dashes. 
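# --- Editorial usage sketch; not part of the vendored scikit-learn file. ---
# Loading one of the two bundled JPEGs via load_sample_image defined earlier;
# this assumes Pillow is installed.
from sklearn.datasets import load_sample_image

china = load_sample_image("china.jpg")
print(china.shape, china.dtype)   # (427, 640, 3) uint8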
Replace characters that + aren't alphanumerics, underscores, hyphens or dots by underscores. Convert + to lowercase. Also strip leading and trailing whitespace, dashes, and + underscores. + """ + value = unicodedata.normalize("NFKD", value).lower() + if filter_dots: + value = re.sub(r"[^\w\s-]+", "_", value) + else: + value = re.sub(r"[^.\w\s-]+", "_", value) + value = re.sub(r"[\s-]+", "-", value) + return value.strip("-_.") + + +def _derive_folder_and_filename_from_url(url): + parsed_url = urlparse(url) + if not parsed_url.hostname: + raise ValueError(f"Invalid URL: {url}") + folder_components = [_filter_filename(parsed_url.hostname, filter_dots=False)] + path = parsed_url.path + + if "/" in path: + base_folder, raw_filename = path.rsplit("/", 1) + + base_folder = _filter_filename(base_folder) + if base_folder: + folder_components.append(base_folder) + else: + raw_filename = path + + filename = _filter_filename(raw_filename, filter_dots=False) + if not filename: + filename = "downloaded_file" + + return "/".join(folder_components), filename + + +def fetch_file( + url, folder=None, local_filename=None, sha256=None, n_retries=3, delay=1 +): + """Fetch a file from the web if not already present in the local folder. + + If the file already exists locally (and the SHA256 checksums match when + provided), the path to the local file is returned without re-downloading. + + .. versionadded:: 1.6 + + Parameters + ---------- + url : str + URL of the file to download. + + folder : str or Path, default=None + Directory to save the file to. If None, the file is downloaded in a + folder with a name derived from the URL host name and path under + scikit-learn data home folder. + + local_filename : str, default=None + Name of the file to save. If None, the filename is inferred from the + URL. + + sha256 : str, default=None + SHA256 checksum of the file. If None, no checksum is verified. + + n_retries : int, default=3 + Number of retries when HTTP errors are encountered. + + delay : int, default=1 + Number of seconds between retries. + + Returns + ------- + file_path : Path + Full path of the downloaded file. + """ + folder_from_url, filename_from_url = _derive_folder_and_filename_from_url(url) + + if local_filename is None: + local_filename = filename_from_url + + if folder is None: + folder = Path(get_data_home()) / folder_from_url + makedirs(folder, exist_ok=True) + + remote_metadata = RemoteFileMetadata( + filename=local_filename, url=url, checksum=sha256 + ) + return _fetch_remote( + remote_metadata, dirname=folder, n_retries=n_retries, delay=delay + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/datasets/_california_housing.py b/.venv/lib/python3.12/site-packages/sklearn/datasets/_california_housing.py new file mode 100644 index 0000000000000000000000000000000000000000..749f8528da338010a70cfdb59c6ee91d060a3441 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/datasets/_california_housing.py @@ -0,0 +1,248 @@ +"""California housing dataset. + +The original database is available from StatLib + + http://lib.stat.cmu.edu/datasets/ + +The data contains 20,640 observations on 9 variables. + +This dataset contains the average house value as target variable +and the following input variables (features): average income, +housing average age, average rooms, average bedrooms, population, +average occupation, latitude, and longitude in that order. + +References +---------- + +Pace, R. 
Kelley and Ronald Barry, Sparse Spatial Autoregressions, +Statistics and Probability Letters, 33:291-297, 1997. + +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import logging +import tarfile +from numbers import Integral, Real +from os import PathLike, makedirs, remove +from os.path import exists + +import joblib +import numpy as np + +from ..utils import Bunch +from ..utils._param_validation import Interval, validate_params +from . import get_data_home +from ._base import ( + RemoteFileMetadata, + _convert_data_dataframe, + _fetch_remote, + _pkl_filepath, + load_descr, +) + +# The original data can be found at: +# https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.tgz +ARCHIVE = RemoteFileMetadata( + filename="cal_housing.tgz", + url="https://ndownloader.figshare.com/files/5976036", + checksum="aaa5c9a6afe2225cc2aed2723682ae403280c4a3695a2ddda4ffb5d8215ea681", +) + +logger = logging.getLogger(__name__) + + +@validate_params( + { + "data_home": [str, PathLike, None], + "download_if_missing": ["boolean"], + "return_X_y": ["boolean"], + "as_frame": ["boolean"], + "n_retries": [Interval(Integral, 1, None, closed="left")], + "delay": [Interval(Real, 0.0, None, closed="neither")], + }, + prefer_skip_nested_validation=True, +) +def fetch_california_housing( + *, + data_home=None, + download_if_missing=True, + return_X_y=False, + as_frame=False, + n_retries=3, + delay=1.0, +): + """Load the California housing dataset (regression). + + ============== ============== + Samples total 20640 + Dimensionality 8 + Features real + Target real 0.15 - 5. + ============== ============== + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + data_home : str or path-like, default=None + Specify another download and cache folder for the datasets. By default + all scikit-learn data is stored in '~/scikit_learn_data' subfolders. + + download_if_missing : bool, default=True + If False, raise an OSError if the data is not locally available + instead of trying to download the data from the source site. + + return_X_y : bool, default=False + If True, returns ``(data.data, data.target)`` instead of a Bunch + object. + + .. versionadded:: 0.20 + + as_frame : bool, default=False + If True, the data is a pandas DataFrame including columns with + appropriate dtypes (numeric, string or categorical). The target is + a pandas DataFrame or Series depending on the number of target_columns. + + .. versionadded:: 0.23 + + n_retries : int, default=3 + Number of retries when HTTP errors are encountered. + + .. versionadded:: 1.5 + + delay : float, default=1.0 + Number of seconds between retries. + + .. versionadded:: 1.5 + + Returns + ------- + dataset : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data : ndarray, shape (20640, 8) + Each row corresponding to the 8 feature values in order. + If ``as_frame`` is True, ``data`` is a pandas object. + target : numpy array of shape (20640,) + Each value corresponds to the average + house value in units of 100,000. + If ``as_frame`` is True, ``target`` is a pandas object. + feature_names : list of length 8 + Array of ordered feature names used in the dataset. + DESCR : str + Description of the California housing dataset. + frame : pandas DataFrame + Only present when `as_frame=True`. DataFrame with ``data`` and + ``target``. + + .. versionadded:: 0.23 + + (data, target) : tuple if ``return_X_y`` is True + A tuple of two ndarray. 
The first containing a 2D array of + shape (n_samples, n_features) with each row representing one + sample and each column representing the features. The second + ndarray of shape (n_samples,) containing the target samples. + + .. versionadded:: 0.20 + + Notes + ----- + + This dataset consists of 20,640 samples and 9 features. + + Examples + -------- + >>> from sklearn.datasets import fetch_california_housing + >>> housing = fetch_california_housing() + >>> print(housing.data.shape, housing.target.shape) + (20640, 8) (20640,) + >>> print(housing.feature_names[0:6]) + ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup'] + """ + data_home = get_data_home(data_home=data_home) + if not exists(data_home): + makedirs(data_home) + + filepath = _pkl_filepath(data_home, "cal_housing.pkz") + if not exists(filepath): + if not download_if_missing: + raise OSError("Data not found and `download_if_missing` is False") + + logger.info( + "Downloading Cal. housing from {} to {}".format(ARCHIVE.url, data_home) + ) + + archive_path = _fetch_remote( + ARCHIVE, + dirname=data_home, + n_retries=n_retries, + delay=delay, + ) + + with tarfile.open(mode="r:gz", name=archive_path) as f: + cal_housing = np.loadtxt( + f.extractfile("CaliforniaHousing/cal_housing.data"), delimiter="," + ) + # Columns are not in the same order compared to the previous + # URL resource on lib.stat.cmu.edu + columns_index = [8, 7, 2, 3, 4, 5, 6, 1, 0] + cal_housing = cal_housing[:, columns_index] + + joblib.dump(cal_housing, filepath, compress=6) + remove(archive_path) + + else: + cal_housing = joblib.load(filepath) + + feature_names = [ + "MedInc", + "HouseAge", + "AveRooms", + "AveBedrms", + "Population", + "AveOccup", + "Latitude", + "Longitude", + ] + + target, data = cal_housing[:, 0], cal_housing[:, 1:] + + # avg rooms = total rooms / households + data[:, 2] /= data[:, 5] + + # avg bed rooms = total bed rooms / households + data[:, 3] /= data[:, 5] + + # avg occupancy = population / households + data[:, 5] = data[:, 4] / data[:, 5] + + # target in units of 100,000 + target = target / 100000.0 + + descr = load_descr("california_housing.rst") + + X = data + y = target + + frame = None + target_names = [ + "MedHouseVal", + ] + if as_frame: + frame, X, y = _convert_data_dataframe( + "fetch_california_housing", data, target, feature_names, target_names + ) + + if return_X_y: + return X, y + + return Bunch( + data=X, + target=y, + frame=frame, + target_names=target_names, + feature_names=feature_names, + DESCR=descr, + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/datasets/_covtype.py b/.venv/lib/python3.12/site-packages/sklearn/datasets/_covtype.py new file mode 100644 index 0000000000000000000000000000000000000000..6a0138bafa9c5b7bc902883572d3715d8a297c94 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/datasets/_covtype.py @@ -0,0 +1,252 @@ +"""Forest covertype dataset. + +A classic dataset for classification benchmarks, featuring categorical and +real-valued features. + +The dataset page is available from UCI Machine Learning Repository + + https://archive.ics.uci.edu/ml/datasets/Covertype + +Courtesy of Jock A. Blackard and Colorado State University. 
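# --- Editorial usage sketch; not part of the vendored scikit-learn file. ---
# fetch_california_housing (defined above) downloads and caches the archive on
# first use under the scikit-learn data home; as_frame=True assumes pandas.
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing(as_frame=True)
print(housing.frame.shape)                    # (20640, 9): 8 features + MedHouseVal
print(housing.frame["MedHouseVal"].head())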
+""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import logging +import os +from gzip import GzipFile +from numbers import Integral, Real +from os.path import exists, join +from tempfile import TemporaryDirectory + +import joblib +import numpy as np + +from ..utils import Bunch, check_random_state +from ..utils._param_validation import Interval, validate_params +from . import get_data_home +from ._base import ( + RemoteFileMetadata, + _convert_data_dataframe, + _fetch_remote, + _pkl_filepath, + load_descr, +) + +# The original data can be found in: +# https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz +ARCHIVE = RemoteFileMetadata( + filename="covtype.data.gz", + url="https://ndownloader.figshare.com/files/5976039", + checksum="614360d0257557dd1792834a85a1cdebfadc3c4f30b011d56afee7ffb5b15771", +) + +logger = logging.getLogger(__name__) + +# Column names reference: +# https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.info +FEATURE_NAMES = [ + "Elevation", + "Aspect", + "Slope", + "Horizontal_Distance_To_Hydrology", + "Vertical_Distance_To_Hydrology", + "Horizontal_Distance_To_Roadways", + "Hillshade_9am", + "Hillshade_Noon", + "Hillshade_3pm", + "Horizontal_Distance_To_Fire_Points", +] +FEATURE_NAMES += [f"Wilderness_Area_{i}" for i in range(4)] +FEATURE_NAMES += [f"Soil_Type_{i}" for i in range(40)] +TARGET_NAMES = ["Cover_Type"] + + +@validate_params( + { + "data_home": [str, os.PathLike, None], + "download_if_missing": ["boolean"], + "random_state": ["random_state"], + "shuffle": ["boolean"], + "return_X_y": ["boolean"], + "as_frame": ["boolean"], + "n_retries": [Interval(Integral, 1, None, closed="left")], + "delay": [Interval(Real, 0.0, None, closed="neither")], + }, + prefer_skip_nested_validation=True, +) +def fetch_covtype( + *, + data_home=None, + download_if_missing=True, + random_state=None, + shuffle=False, + return_X_y=False, + as_frame=False, + n_retries=3, + delay=1.0, +): + """Load the covertype dataset (classification). + + Download it if necessary. + + ================= ============ + Classes 7 + Samples total 581012 + Dimensionality 54 + Features int + ================= ============ + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + data_home : str or path-like, default=None + Specify another download and cache folder for the datasets. By default + all scikit-learn data is stored in '~/scikit_learn_data' subfolders. + + download_if_missing : bool, default=True + If False, raise an OSError if the data is not locally available + instead of trying to download the data from the source site. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset shuffling. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + shuffle : bool, default=False + Whether to shuffle dataset. + + return_X_y : bool, default=False + If True, returns ``(data.data, data.target)`` instead of a Bunch + object. + + .. versionadded:: 0.20 + + as_frame : bool, default=False + If True, the data is a pandas DataFrame including columns with + appropriate dtypes (numeric). The target is a pandas DataFrame or + Series depending on the number of target columns. If `return_X_y` is + True, then (`data`, `target`) will be pandas DataFrames or Series as + described below. + + .. versionadded:: 0.24 + + n_retries : int, default=3 + Number of retries when HTTP errors are encountered. + + .. 
versionadded:: 1.5 + + delay : float, default=1.0 + Number of seconds between retries. + + .. versionadded:: 1.5 + + Returns + ------- + dataset : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data : ndarray of shape (581012, 54) + Each row corresponds to the 54 features in the dataset. + target : ndarray of shape (581012,) + Each value corresponds to one of + the 7 forest covertypes with values + ranging between 1 to 7. + frame : dataframe of shape (581012, 55) + Only present when `as_frame=True`. Contains `data` and `target`. + DESCR : str + Description of the forest covertype dataset. + feature_names : list + The names of the dataset columns. + target_names: list + The names of the target columns. + + (data, target) : tuple if ``return_X_y`` is True + A tuple of two ndarray. The first containing a 2D array of + shape (n_samples, n_features) with each row representing one + sample and each column representing the features. The second + ndarray of shape (n_samples,) containing the target samples. + + .. versionadded:: 0.20 + + Examples + -------- + >>> from sklearn.datasets import fetch_covtype + >>> cov_type = fetch_covtype() + >>> cov_type.data.shape + (581012, 54) + >>> cov_type.target.shape + (581012,) + >>> # Let's check the 4 first feature names + >>> cov_type.feature_names[:4] + ['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology'] + """ + data_home = get_data_home(data_home=data_home) + covtype_dir = join(data_home, "covertype") + samples_path = _pkl_filepath(covtype_dir, "samples") + targets_path = _pkl_filepath(covtype_dir, "targets") + available = exists(samples_path) and exists(targets_path) + + if download_if_missing and not available: + os.makedirs(covtype_dir, exist_ok=True) + + # Creating temp_dir as a direct subdirectory of the target directory + # guarantees that both reside on the same filesystem, so that we can use + # os.rename to atomically move the data files to their target location. 
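# --- Editorial usage sketch; not part of the vendored scikit-learn file, and
# placed here only as an aside (fetch_covtype's implementation continues below).
# The first call downloads the archive and caches pickles under the data home.
import numpy as np

from sklearn.datasets import fetch_covtype

cov = fetch_covtype()
values, counts = np.unique(cov.target, return_counts=True)
print(values)                                 # the 7 cover types, labelled 1..7
print(cov.data.shape)                         # (581012, 54)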
+ with TemporaryDirectory(dir=covtype_dir) as temp_dir: + logger.info(f"Downloading {ARCHIVE.url}") + archive_path = _fetch_remote( + ARCHIVE, dirname=temp_dir, n_retries=n_retries, delay=delay + ) + Xy = np.genfromtxt(GzipFile(filename=archive_path), delimiter=",") + + X = Xy[:, :-1] + y = Xy[:, -1].astype(np.int32, copy=False) + + samples_tmp_path = _pkl_filepath(temp_dir, "samples") + joblib.dump(X, samples_tmp_path, compress=9) + os.rename(samples_tmp_path, samples_path) + + targets_tmp_path = _pkl_filepath(temp_dir, "targets") + joblib.dump(y, targets_tmp_path, compress=9) + os.rename(targets_tmp_path, targets_path) + + elif not available and not download_if_missing: + raise OSError("Data not found and `download_if_missing` is False") + try: + X, y + except NameError: + X = joblib.load(samples_path) + y = joblib.load(targets_path) + + if shuffle: + ind = np.arange(X.shape[0]) + rng = check_random_state(random_state) + rng.shuffle(ind) + X = X[ind] + y = y[ind] + + fdescr = load_descr("covtype.rst") + + frame = None + if as_frame: + frame, X, y = _convert_data_dataframe( + caller_name="fetch_covtype", + data=X, + target=y, + feature_names=FEATURE_NAMES, + target_names=TARGET_NAMES, + ) + if return_X_y: + return X, y + + return Bunch( + data=X, + target=y, + frame=frame, + target_names=TARGET_NAMES, + feature_names=FEATURE_NAMES, + DESCR=fdescr, + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/datasets/_kddcup99.py b/.venv/lib/python3.12/site-packages/sklearn/datasets/_kddcup99.py new file mode 100644 index 0000000000000000000000000000000000000000..f379da42eb9dfe8877529bb7f8c8d12df39cb812 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/datasets/_kddcup99.py @@ -0,0 +1,429 @@ +"""KDDCUP 99 dataset. + +A classic dataset for anomaly detection. + +The dataset page is available from UCI Machine Learning Repository + +https://archive.ics.uci.edu/ml/machine-learning-databases/kddcup99-mld/kddcup.data.gz + +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import errno +import logging +import os +from gzip import GzipFile +from numbers import Integral, Real +from os.path import exists, join + +import joblib +import numpy as np + +from ..utils import Bunch, check_random_state +from ..utils import shuffle as shuffle_method +from ..utils._param_validation import Interval, StrOptions, validate_params +from . 
import get_data_home +from ._base import ( + RemoteFileMetadata, + _convert_data_dataframe, + _fetch_remote, + load_descr, +) + +# The original data can be found at: +# https://archive.ics.uci.edu/ml/machine-learning-databases/kddcup99-mld/kddcup.data.gz +ARCHIVE = RemoteFileMetadata( + filename="kddcup99_data", + url="https://ndownloader.figshare.com/files/5976045", + checksum="3b6c942aa0356c0ca35b7b595a26c89d343652c9db428893e7494f837b274292", +) + +# The original data can be found at: +# https://archive.ics.uci.edu/ml/machine-learning-databases/kddcup99-mld/kddcup.data_10_percent.gz +ARCHIVE_10_PERCENT = RemoteFileMetadata( + filename="kddcup99_10_data", + url="https://ndownloader.figshare.com/files/5976042", + checksum="8045aca0d84e70e622d1148d7df782496f6333bf6eb979a1b0837c42a9fd9561", +) + +logger = logging.getLogger(__name__) + + +@validate_params( + { + "subset": [StrOptions({"SA", "SF", "http", "smtp"}), None], + "data_home": [str, os.PathLike, None], + "shuffle": ["boolean"], + "random_state": ["random_state"], + "percent10": ["boolean"], + "download_if_missing": ["boolean"], + "return_X_y": ["boolean"], + "as_frame": ["boolean"], + "n_retries": [Interval(Integral, 1, None, closed="left")], + "delay": [Interval(Real, 0.0, None, closed="neither")], + }, + prefer_skip_nested_validation=True, +) +def fetch_kddcup99( + *, + subset=None, + data_home=None, + shuffle=False, + random_state=None, + percent10=True, + download_if_missing=True, + return_X_y=False, + as_frame=False, + n_retries=3, + delay=1.0, +): + """Load the kddcup99 dataset (classification). + + Download it if necessary. + + ================= ==================================== + Classes 23 + Samples total 4898431 + Dimensionality 41 + Features discrete (int) or continuous (float) + ================= ==================================== + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.18 + + Parameters + ---------- + subset : {'SA', 'SF', 'http', 'smtp'}, default=None + To return the corresponding classical subsets of kddcup 99. + If None, return the entire kddcup 99 dataset. + + data_home : str or path-like, default=None + Specify another download and cache folder for the datasets. By default + all scikit-learn data is stored in '~/scikit_learn_data' subfolders. + + .. versionadded:: 0.19 + + shuffle : bool, default=False + Whether to shuffle dataset. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset shuffling and for + selection of abnormal samples if `subset='SA'`. Pass an int for + reproducible output across multiple function calls. + See :term:`Glossary `. + + percent10 : bool, default=True + Whether to load only 10 percent of the data. + + download_if_missing : bool, default=True + If False, raise an OSError if the data is not locally available + instead of trying to download the data from the source site. + + return_X_y : bool, default=False + If True, returns ``(data, target)`` instead of a Bunch object. See + below for more information about the `data` and `target` object. + + .. versionadded:: 0.20 + + as_frame : bool, default=False + If `True`, returns a pandas Dataframe for the ``data`` and ``target`` + objects in the `Bunch` returned object; `Bunch` return object will also + have a ``frame`` member. + + .. versionadded:: 0.24 + + n_retries : int, default=3 + Number of retries when HTTP errors are encountered. + + .. versionadded:: 1.5 + + delay : float, default=1.0 + Number of seconds between retries. + + .. 
versionadded:: 1.5 + + Returns + ------- + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data : {ndarray, dataframe} of shape (494021, 41) + The data matrix to learn. If `as_frame=True`, `data` will be a + pandas DataFrame. + target : {ndarray, series} of shape (494021,) + The regression target for each sample. If `as_frame=True`, `target` + will be a pandas Series. + frame : dataframe of shape (494021, 42) + Only present when `as_frame=True`. Contains `data` and `target`. + DESCR : str + The full description of the dataset. + feature_names : list + The names of the dataset columns + target_names: list + The names of the target columns + + (data, target) : tuple if ``return_X_y`` is True + A tuple of two ndarray. The first containing a 2D array of + shape (n_samples, n_features) with each row representing one + sample and each column representing the features. The second + ndarray of shape (n_samples,) containing the target samples. + + .. versionadded:: 0.20 + """ + data_home = get_data_home(data_home=data_home) + kddcup99 = _fetch_brute_kddcup99( + data_home=data_home, + percent10=percent10, + download_if_missing=download_if_missing, + n_retries=n_retries, + delay=delay, + ) + + data = kddcup99.data + target = kddcup99.target + feature_names = kddcup99.feature_names + target_names = kddcup99.target_names + + if subset == "SA": + s = target == b"normal." + t = np.logical_not(s) + normal_samples = data[s, :] + normal_targets = target[s] + abnormal_samples = data[t, :] + abnormal_targets = target[t] + + n_samples_abnormal = abnormal_samples.shape[0] + # selected abnormal samples: + random_state = check_random_state(random_state) + r = random_state.randint(0, n_samples_abnormal, 3377) + abnormal_samples = abnormal_samples[r] + abnormal_targets = abnormal_targets[r] + + data = np.r_[normal_samples, abnormal_samples] + target = np.r_[normal_targets, abnormal_targets] + + if subset == "SF" or subset == "http" or subset == "smtp": + # select all samples with positive logged_in attribute: + s = data[:, 11] == 1 + data = np.c_[data[s, :11], data[s, 12:]] + feature_names = feature_names[:11] + feature_names[12:] + target = target[s] + + data[:, 0] = np.log((data[:, 0] + 0.1).astype(float, copy=False)) + data[:, 4] = np.log((data[:, 4] + 0.1).astype(float, copy=False)) + data[:, 5] = np.log((data[:, 5] + 0.1).astype(float, copy=False)) + + if subset == "http": + s = data[:, 2] == b"http" + data = data[s] + target = target[s] + data = np.c_[data[:, 0], data[:, 4], data[:, 5]] + feature_names = [feature_names[0], feature_names[4], feature_names[5]] + + if subset == "smtp": + s = data[:, 2] == b"smtp" + data = data[s] + target = target[s] + data = np.c_[data[:, 0], data[:, 4], data[:, 5]] + feature_names = [feature_names[0], feature_names[4], feature_names[5]] + + if subset == "SF": + data = np.c_[data[:, 0], data[:, 2], data[:, 4], data[:, 5]] + feature_names = [ + feature_names[0], + feature_names[2], + feature_names[4], + feature_names[5], + ] + + if shuffle: + data, target = shuffle_method(data, target, random_state=random_state) + + fdescr = load_descr("kddcup99.rst") + + frame = None + if as_frame: + frame, data, target = _convert_data_dataframe( + "fetch_kddcup99", data, target, feature_names, target_names + ) + + if return_X_y: + return data, target + + return Bunch( + data=data, + target=target, + frame=frame, + target_names=target_names, + feature_names=feature_names, + DESCR=fdescr, + ) + + +def _fetch_brute_kddcup99( + data_home=None, 
download_if_missing=True, percent10=True, n_retries=3, delay=1.0 +): + """Load the kddcup99 dataset, downloading it if necessary. + + Parameters + ---------- + data_home : str, default=None + Specify another download and cache folder for the datasets. By default + all scikit-learn data is stored in '~/scikit_learn_data' subfolders. + + download_if_missing : bool, default=True + If False, raise an OSError if the data is not locally available + instead of trying to download the data from the source site. + + percent10 : bool, default=True + Whether to load only 10 percent of the data. + + n_retries : int, default=3 + Number of retries when HTTP errors are encountered. + + delay : float, default=1.0 + Number of seconds between retries. + + Returns + ------- + dataset : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data : ndarray of shape (494021, 41) + Each row corresponds to the 41 features in the dataset. + target : ndarray of shape (494021,) + Each value corresponds to one of the 21 attack types or to the + label 'normal.'. + feature_names : list + The names of the dataset columns + target_names: list + The names of the target columns + DESCR : str + Description of the kddcup99 dataset. + + """ + + data_home = get_data_home(data_home=data_home) + dir_suffix = "-py3" + + if percent10: + kddcup_dir = join(data_home, "kddcup99_10" + dir_suffix) + archive = ARCHIVE_10_PERCENT + else: + kddcup_dir = join(data_home, "kddcup99" + dir_suffix) + archive = ARCHIVE + + samples_path = join(kddcup_dir, "samples") + targets_path = join(kddcup_dir, "targets") + available = exists(samples_path) + + dt = [ + ("duration", int), + ("protocol_type", "S4"), + ("service", "S11"), + ("flag", "S6"), + ("src_bytes", int), + ("dst_bytes", int), + ("land", int), + ("wrong_fragment", int), + ("urgent", int), + ("hot", int), + ("num_failed_logins", int), + ("logged_in", int), + ("num_compromised", int), + ("root_shell", int), + ("su_attempted", int), + ("num_root", int), + ("num_file_creations", int), + ("num_shells", int), + ("num_access_files", int), + ("num_outbound_cmds", int), + ("is_host_login", int), + ("is_guest_login", int), + ("count", int), + ("srv_count", int), + ("serror_rate", float), + ("srv_serror_rate", float), + ("rerror_rate", float), + ("srv_rerror_rate", float), + ("same_srv_rate", float), + ("diff_srv_rate", float), + ("srv_diff_host_rate", float), + ("dst_host_count", int), + ("dst_host_srv_count", int), + ("dst_host_same_srv_rate", float), + ("dst_host_diff_srv_rate", float), + ("dst_host_same_src_port_rate", float), + ("dst_host_srv_diff_host_rate", float), + ("dst_host_serror_rate", float), + ("dst_host_srv_serror_rate", float), + ("dst_host_rerror_rate", float), + ("dst_host_srv_rerror_rate", float), + ("labels", "S16"), + ] + + column_names = [c[0] for c in dt] + target_names = column_names[-1] + feature_names = column_names[:-1] + + if available: + try: + X = joblib.load(samples_path) + y = joblib.load(targets_path) + except Exception as e: + raise OSError( + "The cache for fetch_kddcup99 is invalid, please delete " + f"{kddcup_dir} and run the fetch_kddcup99 again" + ) from e + + elif download_if_missing: + _mkdirp(kddcup_dir) + logger.info("Downloading %s" % archive.url) + _fetch_remote(archive, dirname=kddcup_dir, n_retries=n_retries, delay=delay) + DT = np.dtype(dt) + logger.debug("extracting archive") + archive_path = join(kddcup_dir, archive.filename) + file_ = GzipFile(filename=archive_path, mode="r") + Xy = [] + for line in 
file_.readlines(): + line = line.decode() + Xy.append(line.replace("\n", "").split(",")) + file_.close() + logger.debug("extraction done") + os.remove(archive_path) + + Xy = np.asarray(Xy, dtype=object) + for j in range(42): + Xy[:, j] = Xy[:, j].astype(DT[j]) + + X = Xy[:, :-1] + y = Xy[:, -1] + # XXX bug when compress!=0: + # (error: 'Incorrect data length while decompressing[...] the file + # could be corrupted.') + + joblib.dump(X, samples_path, compress=0) + joblib.dump(y, targets_path, compress=0) + else: + raise OSError("Data not found and `download_if_missing` is False") + + return Bunch( + data=X, + target=y, + feature_names=feature_names, + target_names=[target_names], + ) + + +def _mkdirp(d): + """Ensure directory d exists (like mkdir -p on Unix) + No guarantee that the directory is writable. + """ + try: + os.makedirs(d) + except OSError as e: + if e.errno != errno.EEXIST: + raise diff --git a/.venv/lib/python3.12/site-packages/sklearn/datasets/_lfw.py b/.venv/lib/python3.12/site-packages/sklearn/datasets/_lfw.py new file mode 100644 index 0000000000000000000000000000000000000000..4f725b9250cc5e325659612f0c83c7724288828b --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/datasets/_lfw.py @@ -0,0 +1,648 @@ +"""Labeled Faces in the Wild (LFW) dataset + +This dataset is a collection of JPEG pictures of famous people collected +over the internet, all details are available on the official website: + + http://vis-www.cs.umass.edu/lfw/ +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import logging +from numbers import Integral, Real +from os import PathLike, listdir, makedirs, remove +from os.path import exists, isdir, join + +import numpy as np +from joblib import Memory + +from ..utils import Bunch +from ..utils._param_validation import Hidden, Interval, StrOptions, validate_params +from ..utils.fixes import tarfile_extractall +from ._base import ( + RemoteFileMetadata, + _fetch_remote, + get_data_home, + load_descr, +) + +logger = logging.getLogger(__name__) + +# The original data can be found in: +# http://vis-www.cs.umass.edu/lfw/lfw.tgz +ARCHIVE = RemoteFileMetadata( + filename="lfw.tgz", + url="https://ndownloader.figshare.com/files/5976018", + checksum="055f7d9c632d7370e6fb4afc7468d40f970c34a80d4c6f50ffec63f5a8d536c0", +) + +# The original funneled data can be found in: +# http://vis-www.cs.umass.edu/lfw/lfw-funneled.tgz +FUNNELED_ARCHIVE = RemoteFileMetadata( + filename="lfw-funneled.tgz", + url="https://ndownloader.figshare.com/files/5976015", + checksum="b47c8422c8cded889dc5a13418c4bc2abbda121092b3533a83306f90d900100a", +) + +# The original target data can be found in: +# http://vis-www.cs.umass.edu/lfw/pairsDevTrain.txt', +# http://vis-www.cs.umass.edu/lfw/pairsDevTest.txt', +# http://vis-www.cs.umass.edu/lfw/pairs.txt', +TARGETS = ( + RemoteFileMetadata( + filename="pairsDevTrain.txt", + url="https://ndownloader.figshare.com/files/5976012", + checksum="1d454dada7dfeca0e7eab6f65dc4e97a6312d44cf142207be28d688be92aabfa", + ), + RemoteFileMetadata( + filename="pairsDevTest.txt", + url="https://ndownloader.figshare.com/files/5976009", + checksum="7cb06600ea8b2814ac26e946201cdb304296262aad67d046a16a7ec85d0ff87c", + ), + RemoteFileMetadata( + filename="pairs.txt", + url="https://ndownloader.figshare.com/files/5976006", + checksum="ea42330c62c92989f9d7c03237ed5d591365e89b3e649747777b70e692dc1592", + ), +) + + +# +# Common private utilities for data fetching from the original LFW website +# local disk caching, and image 
decoding. +# + + +def _check_fetch_lfw( + data_home=None, funneled=True, download_if_missing=True, n_retries=3, delay=1.0 +): + """Helper function to download any missing LFW data""" + + data_home = get_data_home(data_home=data_home) + lfw_home = join(data_home, "lfw_home") + + if not exists(lfw_home): + makedirs(lfw_home) + + for target in TARGETS: + target_filepath = join(lfw_home, target.filename) + if not exists(target_filepath): + if download_if_missing: + logger.info("Downloading LFW metadata: %s", target.url) + _fetch_remote( + target, dirname=lfw_home, n_retries=n_retries, delay=delay + ) + else: + raise OSError("%s is missing" % target_filepath) + + if funneled: + data_folder_path = join(lfw_home, "lfw_funneled") + archive = FUNNELED_ARCHIVE + else: + data_folder_path = join(lfw_home, "lfw") + archive = ARCHIVE + + if not exists(data_folder_path): + archive_path = join(lfw_home, archive.filename) + if not exists(archive_path): + if download_if_missing: + logger.info("Downloading LFW data (~200MB): %s", archive.url) + _fetch_remote( + archive, dirname=lfw_home, n_retries=n_retries, delay=delay + ) + else: + raise OSError("%s is missing" % archive_path) + + import tarfile + + logger.debug("Decompressing the data archive to %s", data_folder_path) + with tarfile.open(archive_path, "r:gz") as fp: + tarfile_extractall(fp, path=lfw_home) + + remove(archive_path) + + return lfw_home, data_folder_path + + +def _load_imgs(file_paths, slice_, color, resize): + """Internally used to load images""" + try: + from PIL import Image + except ImportError: + raise ImportError( + "The Python Imaging Library (PIL) is required to load data " + "from jpeg files. Please refer to " + "https://pillow.readthedocs.io/en/stable/installation.html " + "for installing PIL." + ) + + # compute the portion of the images to load to respect the slice_ parameter + # given by the caller + default_slice = (slice(0, 250), slice(0, 250)) + if slice_ is None: + slice_ = default_slice + else: + slice_ = tuple(s or ds for s, ds in zip(slice_, default_slice)) + + h_slice, w_slice = slice_ + h = (h_slice.stop - h_slice.start) // (h_slice.step or 1) + w = (w_slice.stop - w_slice.start) // (w_slice.step or 1) + + if resize is not None: + resize = float(resize) + h = int(resize * h) + w = int(resize * w) + + # allocate some contiguous memory to host the decoded image slices + n_faces = len(file_paths) + if not color: + faces = np.zeros((n_faces, h, w), dtype=np.float32) + else: + faces = np.zeros((n_faces, h, w, 3), dtype=np.float32) + + # iterate over the collected file path to load the jpeg files as numpy + # arrays + for i, file_path in enumerate(file_paths): + if i % 1000 == 0: + logger.debug("Loading face #%05d / %05d", i + 1, n_faces) + + # Checks if jpeg reading worked. Refer to issue #3594 for more + # details. + pil_img = Image.open(file_path) + pil_img = pil_img.crop( + (w_slice.start, h_slice.start, w_slice.stop, h_slice.stop) + ) + if resize is not None: + pil_img = pil_img.resize((w, h)) + face = np.asarray(pil_img, dtype=np.float32) + + if face.ndim == 0: + raise RuntimeError( + "Failed to read the image file %s, " + "Please make sure that libjpeg is installed" % file_path + ) + + face /= 255.0 # scale uint8 coded colors to the [0.0, 1.0] floats + if not color: + # average the color channels to compute a gray levels + # representation + face = face.mean(axis=2) + + faces[i, ...] 
= face + + return faces + + +# +# Task #1: Face Identification on picture with names +# + + +def _fetch_lfw_people( + data_folder_path, slice_=None, color=False, resize=None, min_faces_per_person=0 +): + """Perform the actual data loading for the lfw people dataset + + This operation is meant to be cached by a joblib wrapper. + """ + # scan the data folder content to retain people with more that + # `min_faces_per_person` face pictures + person_names, file_paths = [], [] + for person_name in sorted(listdir(data_folder_path)): + folder_path = join(data_folder_path, person_name) + if not isdir(folder_path): + continue + paths = [join(folder_path, f) for f in sorted(listdir(folder_path))] + n_pictures = len(paths) + if n_pictures >= min_faces_per_person: + person_name = person_name.replace("_", " ") + person_names.extend([person_name] * n_pictures) + file_paths.extend(paths) + + n_faces = len(file_paths) + if n_faces == 0: + raise ValueError( + "min_faces_per_person=%d is too restrictive" % min_faces_per_person + ) + + target_names = np.unique(person_names) + target = np.searchsorted(target_names, person_names) + + faces = _load_imgs(file_paths, slice_, color, resize) + + # shuffle the faces with a deterministic RNG scheme to avoid having + # all faces of the same person in a row, as it would break some + # cross validation and learning algorithms such as SGD and online + # k-means that make an IID assumption + + indices = np.arange(n_faces) + np.random.RandomState(42).shuffle(indices) + faces, target = faces[indices], target[indices] + return faces, target, target_names + + +@validate_params( + { + "data_home": [str, PathLike, None], + "funneled": ["boolean"], + "resize": [Interval(Real, 0, None, closed="neither"), None], + "min_faces_per_person": [Interval(Integral, 0, None, closed="left"), None], + "color": ["boolean"], + "slice_": [tuple, Hidden(None)], + "download_if_missing": ["boolean"], + "return_X_y": ["boolean"], + "n_retries": [Interval(Integral, 1, None, closed="left")], + "delay": [Interval(Real, 0.0, None, closed="neither")], + }, + prefer_skip_nested_validation=True, +) +def fetch_lfw_people( + *, + data_home=None, + funneled=True, + resize=0.5, + min_faces_per_person=0, + color=False, + slice_=(slice(70, 195), slice(78, 172)), + download_if_missing=True, + return_X_y=False, + n_retries=3, + delay=1.0, +): + """Load the Labeled Faces in the Wild (LFW) people dataset \ +(classification). + + Download it if necessary. + + ================= ======================= + Classes 5749 + Samples total 13233 + Dimensionality 5828 + Features real, between 0 and 255 + ================= ======================= + + For a usage example of this dataset, see + :ref:`sphx_glr_auto_examples_applications_plot_face_recognition.py`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + data_home : str or path-like, default=None + Specify another download and cache folder for the datasets. By default + all scikit-learn data is stored in '~/scikit_learn_data' subfolders. + + funneled : bool, default=True + Download and use the funneled variant of the dataset. + + resize : float or None, default=0.5 + Ratio used to resize the each face picture. If `None`, no resizing is + performed. + + min_faces_per_person : int, default=None + The extracted dataset will only retain pictures of people that have at + least `min_faces_per_person` different pictures. + + color : bool, default=False + Keep the 3 RGB channels instead of averaging them to a single + gray level channel. 
If color is True the shape of the data has + one more dimension than the shape with color = False. + + slice_ : tuple of slice, default=(slice(70, 195), slice(78, 172)) + Provide a custom 2D slice (height, width) to extract the + 'interesting' part of the jpeg files and avoid use statistical + correlation from the background. + + download_if_missing : bool, default=True + If False, raise an OSError if the data is not locally available + instead of trying to download the data from the source site. + + return_X_y : bool, default=False + If True, returns ``(dataset.data, dataset.target)`` instead of a Bunch + object. See below for more information about the `dataset.data` and + `dataset.target` object. + + .. versionadded:: 0.20 + + n_retries : int, default=3 + Number of retries when HTTP errors are encountered. + + .. versionadded:: 1.5 + + delay : float, default=1.0 + Number of seconds between retries. + + .. versionadded:: 1.5 + + Returns + ------- + dataset : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data : numpy array of shape (13233, 2914) + Each row corresponds to a ravelled face image + of original size 62 x 47 pixels. + Changing the ``slice_`` or resize parameters will change the + shape of the output. + images : numpy array of shape (13233, 62, 47) + Each row is a face image corresponding to one of the 5749 people in + the dataset. Changing the ``slice_`` + or resize parameters will change the shape of the output. + target : numpy array of shape (13233,) + Labels associated to each face image. + Those labels range from 0-5748 and correspond to the person IDs. + target_names : numpy array of shape (5749,) + Names of all persons in the dataset. + Position in array corresponds to the person ID in the target array. + DESCR : str + Description of the Labeled Faces in the Wild (LFW) dataset. + + (data, target) : tuple if ``return_X_y`` is True + A tuple of two ndarray. The first containing a 2D array of + shape (n_samples, n_features) with each row representing one + sample and each column representing the features. The second + ndarray of shape (n_samples,) containing the target samples. + + .. versionadded:: 0.20 + + Examples + -------- + >>> from sklearn.datasets import fetch_lfw_people + >>> lfw_people = fetch_lfw_people() + >>> lfw_people.data.shape + (13233, 2914) + >>> lfw_people.target.shape + (13233,) + >>> for name in lfw_people.target_names[:5]: + ... 
print(name) + AJ Cook + AJ Lamas + Aaron Eckhart + Aaron Guiel + Aaron Patterson + """ + lfw_home, data_folder_path = _check_fetch_lfw( + data_home=data_home, + funneled=funneled, + download_if_missing=download_if_missing, + n_retries=n_retries, + delay=delay, + ) + logger.debug("Loading LFW people faces from %s", lfw_home) + + # wrap the loader in a memoizing function that will return memmaped data + # arrays for optimal memory usage + m = Memory(location=lfw_home, compress=6, verbose=0) + load_func = m.cache(_fetch_lfw_people) + + # load and memoize the pairs as np arrays + faces, target, target_names = load_func( + data_folder_path, + resize=resize, + min_faces_per_person=min_faces_per_person, + color=color, + slice_=slice_, + ) + + X = faces.reshape(len(faces), -1) + + fdescr = load_descr("lfw.rst") + + if return_X_y: + return X, target + + # pack the results as a Bunch instance + return Bunch( + data=X, images=faces, target=target, target_names=target_names, DESCR=fdescr + ) + + +# +# Task #2: Face Verification on pairs of face pictures +# + + +def _fetch_lfw_pairs( + index_file_path, data_folder_path, slice_=None, color=False, resize=None +): + """Perform the actual data loading for the LFW pairs dataset + + This operation is meant to be cached by a joblib wrapper. + """ + # parse the index file to find the number of pairs to be able to allocate + # the right amount of memory before starting to decode the jpeg files + with open(index_file_path, "rb") as index_file: + split_lines = [ln.decode().strip().split("\t") for ln in index_file] + pair_specs = [sl for sl in split_lines if len(sl) > 2] + n_pairs = len(pair_specs) + + # iterating over the metadata lines for each pair to find the filename to + # decode and load in memory + target = np.zeros(n_pairs, dtype=int) + file_paths = list() + for i, components in enumerate(pair_specs): + if len(components) == 3: + target[i] = 1 + pair = ( + (components[0], int(components[1]) - 1), + (components[0], int(components[2]) - 1), + ) + elif len(components) == 4: + target[i] = 0 + pair = ( + (components[0], int(components[1]) - 1), + (components[2], int(components[3]) - 1), + ) + else: + raise ValueError("invalid line %d: %r" % (i + 1, components)) + for j, (name, idx) in enumerate(pair): + try: + person_folder = join(data_folder_path, name) + except TypeError: + person_folder = join(data_folder_path, str(name, "UTF-8")) + filenames = list(sorted(listdir(person_folder))) + file_path = join(person_folder, filenames[idx]) + file_paths.append(file_path) + + pairs = _load_imgs(file_paths, slice_, color, resize) + shape = list(pairs.shape) + n_faces = shape.pop(0) + shape.insert(0, 2) + shape.insert(0, n_faces // 2) + pairs.shape = shape + + return pairs, target, np.array(["Different persons", "Same person"]) + + +@validate_params( + { + "subset": [StrOptions({"train", "test", "10_folds"})], + "data_home": [str, PathLike, None], + "funneled": ["boolean"], + "resize": [Interval(Real, 0, None, closed="neither"), None], + "color": ["boolean"], + "slice_": [tuple, Hidden(None)], + "download_if_missing": ["boolean"], + "n_retries": [Interval(Integral, 1, None, closed="left")], + "delay": [Interval(Real, 0.0, None, closed="neither")], + }, + prefer_skip_nested_validation=True, +) +def fetch_lfw_pairs( + *, + subset="train", + data_home=None, + funneled=True, + resize=0.5, + color=False, + slice_=(slice(70, 195), slice(78, 172)), + download_if_missing=True, + n_retries=3, + delay=1.0, +): + """Load the Labeled Faces in the Wild (LFW) pairs dataset 
(classification). + + Download it if necessary. + + ================= ======================= + Classes 2 + Samples total 13233 + Dimensionality 5828 + Features real, between 0 and 255 + ================= ======================= + + In the `original paper `_ + the "pairs" version corresponds to the "restricted task", where + the experimenter should not use the name of a person to infer + the equivalence or non-equivalence of two face images that + are not explicitly given in the training set. + + The original images are 250 x 250 pixels, but the default slice and resize + arguments reduce them to 62 x 47. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + subset : {'train', 'test', '10_folds'}, default='train' + Select the dataset to load: 'train' for the development training + set, 'test' for the development test set, and '10_folds' for the + official evaluation set that is meant to be used with a 10-folds + cross validation. + + data_home : str or path-like, default=None + Specify another download and cache folder for the datasets. By + default all scikit-learn data is stored in '~/scikit_learn_data' + subfolders. + + funneled : bool, default=True + Download and use the funneled variant of the dataset. + + resize : float, default=0.5 + Ratio used to resize the each face picture. + + color : bool, default=False + Keep the 3 RGB channels instead of averaging them to a single + gray level channel. If color is True the shape of the data has + one more dimension than the shape with color = False. + + slice_ : tuple of slice, default=(slice(70, 195), slice(78, 172)) + Provide a custom 2D slice (height, width) to extract the + 'interesting' part of the jpeg files and avoid use statistical + correlation from the background. + + download_if_missing : bool, default=True + If False, raise an OSError if the data is not locally available + instead of trying to download the data from the source site. + + n_retries : int, default=3 + Number of retries when HTTP errors are encountered. + + .. versionadded:: 1.5 + + delay : float, default=1.0 + Number of seconds between retries. + + .. versionadded:: 1.5 + + Returns + ------- + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data : ndarray of shape (2200, 5828). Shape depends on ``subset``. + Each row corresponds to 2 ravel'd face images + of original size 62 x 47 pixels. + Changing the ``slice_``, ``resize`` or ``subset`` parameters + will change the shape of the output. + pairs : ndarray of shape (2200, 2, 62, 47). Shape depends on ``subset`` + Each row has 2 face images corresponding + to same or different person from the dataset + containing 5749 people. Changing the ``slice_``, + ``resize`` or ``subset`` parameters will change the shape of the + output. + target : numpy array of shape (2200,). Shape depends on ``subset``. + Labels associated to each pair of images. + The two label values being different persons or the same person. + target_names : numpy array of shape (2,) + Explains the target values of the target array. + 0 corresponds to "Different person", 1 corresponds to "same person". + DESCR : str + Description of the Labeled Faces in the Wild (LFW) dataset. 
+ + Examples + -------- + >>> from sklearn.datasets import fetch_lfw_pairs + >>> lfw_pairs_train = fetch_lfw_pairs(subset='train') + >>> list(lfw_pairs_train.target_names) + [np.str_('Different persons'), np.str_('Same person')] + >>> lfw_pairs_train.pairs.shape + (2200, 2, 62, 47) + >>> lfw_pairs_train.data.shape + (2200, 5828) + >>> lfw_pairs_train.target.shape + (2200,) + """ + lfw_home, data_folder_path = _check_fetch_lfw( + data_home=data_home, + funneled=funneled, + download_if_missing=download_if_missing, + n_retries=n_retries, + delay=delay, + ) + logger.debug("Loading %s LFW pairs from %s", subset, lfw_home) + + # wrap the loader in a memoizing function that will return memmaped data + # arrays for optimal memory usage + m = Memory(location=lfw_home, compress=6, verbose=0) + load_func = m.cache(_fetch_lfw_pairs) + + # select the right metadata file according to the requested subset + label_filenames = { + "train": "pairsDevTrain.txt", + "test": "pairsDevTest.txt", + "10_folds": "pairs.txt", + } + if subset not in label_filenames: + raise ValueError( + "subset='%s' is invalid: should be one of %r" + % (subset, list(sorted(label_filenames.keys()))) + ) + index_file_path = join(lfw_home, label_filenames[subset]) + + # load and memoize the pairs as np arrays + pairs, target, target_names = load_func( + index_file_path, data_folder_path, resize=resize, color=color, slice_=slice_ + ) + + fdescr = load_descr("lfw.rst") + + # pack the results as a Bunch instance + return Bunch( + data=pairs.reshape(len(pairs), -1), + pairs=pairs, + target=target, + target_names=target_names, + DESCR=fdescr, + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/datasets/_olivetti_faces.py b/.venv/lib/python3.12/site-packages/sklearn/datasets/_olivetti_faces.py new file mode 100644 index 0000000000000000000000000000000000000000..efb382b1dcdda0bd3dadc2216da9be21d40dddd2 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/datasets/_olivetti_faces.py @@ -0,0 +1,184 @@ +"""Modified Olivetti faces dataset. + +The original database was available from (now defunct) + + https://www.cl.cam.ac.uk/research/dtg/attarchive/facedatabase.html + +The version retrieved here comes in MATLAB format from the personal +web page of Sam Roweis: + + https://cs.nyu.edu/~roweis/ +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from numbers import Integral, Real +from os import PathLike, makedirs, remove +from os.path import exists + +import joblib +import numpy as np +from scipy.io import loadmat + +from ..utils import Bunch, check_random_state +from ..utils._param_validation import Interval, validate_params +from . 
import get_data_home +from ._base import RemoteFileMetadata, _fetch_remote, _pkl_filepath, load_descr + +# The original data can be found at: +# https://cs.nyu.edu/~roweis/data/olivettifaces.mat +FACES = RemoteFileMetadata( + filename="olivettifaces.mat", + url="https://ndownloader.figshare.com/files/5976027", + checksum="b612fb967f2dc77c9c62d3e1266e0c73d5fca46a4b8906c18e454d41af987794", +) + + +@validate_params( + { + "data_home": [str, PathLike, None], + "shuffle": ["boolean"], + "random_state": ["random_state"], + "download_if_missing": ["boolean"], + "return_X_y": ["boolean"], + "n_retries": [Interval(Integral, 1, None, closed="left")], + "delay": [Interval(Real, 0.0, None, closed="neither")], + }, + prefer_skip_nested_validation=True, +) +def fetch_olivetti_faces( + *, + data_home=None, + shuffle=False, + random_state=0, + download_if_missing=True, + return_X_y=False, + n_retries=3, + delay=1.0, +): + """Load the Olivetti faces data-set from AT&T (classification). + + Download it if necessary. + + ================= ===================== + Classes 40 + Samples total 400 + Dimensionality 4096 + Features real, between 0 and 1 + ================= ===================== + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + data_home : str or path-like, default=None + Specify another download and cache folder for the datasets. By default + all scikit-learn data is stored in '~/scikit_learn_data' subfolders. + + shuffle : bool, default=False + If True the order of the dataset is shuffled to avoid having + images of the same person grouped. + + random_state : int, RandomState instance or None, default=0 + Determines random number generation for dataset shuffling. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + download_if_missing : bool, default=True + If False, raise an OSError if the data is not locally available + instead of trying to download the data from the source site. + + return_X_y : bool, default=False + If True, returns `(data, target)` instead of a `Bunch` object. See + below for more information about the `data` and `target` object. + + .. versionadded:: 0.22 + + n_retries : int, default=3 + Number of retries when HTTP errors are encountered. + + .. versionadded:: 1.5 + + delay : float, default=1.0 + Number of seconds between retries. + + .. versionadded:: 1.5 + + Returns + ------- + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data: ndarray, shape (400, 4096) + Each row corresponds to a ravelled + face image of original size 64 x 64 pixels. + images : ndarray, shape (400, 64, 64) + Each row is a face image + corresponding to one of the 40 subjects of the dataset. + target : ndarray, shape (400,) + Labels associated to each face image. + Those labels are ranging from 0-39 and correspond to the + Subject IDs. + DESCR : str + Description of the modified Olivetti Faces Dataset. + + (data, target) : tuple if `return_X_y=True` + Tuple with the `data` and `target` objects described above. + + .. 
versionadded:: 0.22 + + Examples + -------- + >>> from sklearn.datasets import fetch_olivetti_faces + >>> olivetti_faces = fetch_olivetti_faces() + >>> olivetti_faces.data.shape + (400, 4096) + >>> olivetti_faces.target.shape + (400,) + >>> olivetti_faces.images.shape + (400, 64, 64) + """ + data_home = get_data_home(data_home=data_home) + if not exists(data_home): + makedirs(data_home) + filepath = _pkl_filepath(data_home, "olivetti.pkz") + if not exists(filepath): + if not download_if_missing: + raise OSError("Data not found and `download_if_missing` is False") + + print("downloading Olivetti faces from %s to %s" % (FACES.url, data_home)) + mat_path = _fetch_remote( + FACES, dirname=data_home, n_retries=n_retries, delay=delay + ) + mfile = loadmat(file_name=mat_path) + # delete raw .mat data + remove(mat_path) + + faces = mfile["faces"].T.copy() + joblib.dump(faces, filepath, compress=6) + del mfile + else: + faces = joblib.load(filepath) + + # We want floating point data, but float32 is enough (there is only + # one byte of precision in the original uint8s anyway) + faces = np.float32(faces) + faces = faces - faces.min() + faces /= faces.max() + faces = faces.reshape((400, 64, 64)).transpose(0, 2, 1) + # 10 images per class, 400 images total, each class is contiguous. + target = np.array([i // 10 for i in range(400)]) + if shuffle: + random_state = check_random_state(random_state) + order = random_state.permutation(len(faces)) + faces = faces[order] + target = target[order] + faces_vectorized = faces.reshape(len(faces), -1) + + fdescr = load_descr("olivetti_faces.rst") + + if return_X_y: + return faces_vectorized, target + + return Bunch(data=faces_vectorized, images=faces, target=target, DESCR=fdescr) diff --git a/.venv/lib/python3.12/site-packages/sklearn/datasets/_openml.py b/.venv/lib/python3.12/site-packages/sklearn/datasets/_openml.py new file mode 100644 index 0000000000000000000000000000000000000000..537f6cde499a2384ba08032af4efe004e9321bce --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/datasets/_openml.py @@ -0,0 +1,1164 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import gzip +import hashlib +import json +import os +import shutil +import time +from contextlib import closing +from functools import wraps +from os.path import join +from tempfile import TemporaryDirectory +from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from urllib.error import HTTPError, URLError +from urllib.parse import urlparse +from urllib.request import Request, urlopen +from warnings import warn + +import numpy as np + +from ..utils import Bunch +from ..utils._optional_dependencies import check_pandas_support +from ..utils._param_validation import ( + Integral, + Interval, + Real, + StrOptions, + validate_params, +) +from . 
import get_data_home +from ._arff_parser import load_arff_from_gzip_file + +__all__ = ["fetch_openml"] + +_SEARCH_NAME = "https://api.openml.org/api/v1/json/data/list/data_name/{}/limit/2" +_DATA_INFO = "https://api.openml.org/api/v1/json/data/{}" +_DATA_FEATURES = "https://api.openml.org/api/v1/json/data/features/{}" +_DATA_QUALITIES = "https://api.openml.org/api/v1/json/data/qualities/{}" + +OpenmlQualitiesType = List[Dict[str, str]] +OpenmlFeaturesType = List[Dict[str, str]] + + +def _get_local_path(openml_path: str, data_home: str) -> str: + return os.path.join(data_home, "openml.org", openml_path + ".gz") + + +def _retry_with_clean_cache( + openml_path: str, + data_home: Optional[str], + no_retry_exception: Optional[Exception] = None, +) -> Callable: + """If the first call to the decorated function fails, the local cached + file is removed, and the function is called again. If ``data_home`` is + ``None``, then the function is called once. We can provide a specific + exception to not retry on using `no_retry_exception` parameter. + """ + + def decorator(f): + @wraps(f) + def wrapper(*args, **kw): + if data_home is None: + return f(*args, **kw) + try: + return f(*args, **kw) + except URLError: + raise + except Exception as exc: + if no_retry_exception is not None and isinstance( + exc, no_retry_exception + ): + raise + warn("Invalid cache, redownloading file", RuntimeWarning) + local_path = _get_local_path(openml_path, data_home) + if os.path.exists(local_path): + os.unlink(local_path) + return f(*args, **kw) + + return wrapper + + return decorator + + +def _retry_on_network_error( + n_retries: int = 3, delay: float = 1.0, url: str = "" +) -> Callable: + """If the function call results in a network error, call the function again + up to ``n_retries`` times with a ``delay`` between each call. If the error + has a 412 status code, don't call the function again as this is a specific + OpenML error. + The url parameter is used to give more information to the user about the + error. + """ + + def decorator(f): + @wraps(f) + def wrapper(*args, **kwargs): + retry_counter = n_retries + while True: + try: + return f(*args, **kwargs) + except (URLError, TimeoutError) as e: + # 412 is a specific OpenML error code. + if isinstance(e, HTTPError) and e.code == 412: + raise + if retry_counter == 0: + raise + warn( + f"A network error occurred while downloading {url}. Retrying..." + ) + # Avoid a ResourceWarning on Python 3.14 and later. + if isinstance(e, HTTPError): + e.close() + + retry_counter -= 1 + time.sleep(delay) + + return wrapper + + return decorator + + +def _open_openml_url( + url: str, data_home: Optional[str], n_retries: int = 3, delay: float = 1.0 +): + """ + Returns a resource from OpenML.org. Caches it to data_home if required. + + Parameters + ---------- + url : str + OpenML URL that will be downloaded and cached locally. The path component + of the URL is used to replicate the tree structure as sub-folders of the local + cache folder. + + data_home : str + Directory to which the files will be cached. If None, no caching will + be applied. + + n_retries : int, default=3 + Number of retries when HTTP errors are encountered. Error with status + code 412 won't be retried as they represent OpenML generic errors. + + delay : float, default=1.0 + Number of seconds between retries. + + Returns + ------- + result : stream + A stream to the OpenML resource. 
+ """ + + def is_gzip_encoded(_fsrc): + return _fsrc.info().get("Content-Encoding", "") == "gzip" + + req = Request(url) + req.add_header("Accept-encoding", "gzip") + + if data_home is None: + fsrc = _retry_on_network_error(n_retries, delay, req.full_url)(urlopen)(req) + if is_gzip_encoded(fsrc): + return gzip.GzipFile(fileobj=fsrc, mode="rb") + return fsrc + + openml_path = urlparse(url).path.lstrip("/") + local_path = _get_local_path(openml_path, data_home) + dir_name, file_name = os.path.split(local_path) + if not os.path.exists(local_path): + os.makedirs(dir_name, exist_ok=True) + try: + # Create a tmpdir as a subfolder of dir_name where the final file will + # be moved to if the download is successful. This guarantees that the + # renaming operation to the final location is atomic to ensure the + # concurrence safety of the dataset caching mechanism. + with TemporaryDirectory(dir=dir_name) as tmpdir: + with closing( + _retry_on_network_error(n_retries, delay, req.full_url)(urlopen)( + req + ) + ) as fsrc: + opener: Callable + if is_gzip_encoded(fsrc): + opener = open + else: + opener = gzip.GzipFile + with opener(os.path.join(tmpdir, file_name), "wb") as fdst: + shutil.copyfileobj(fsrc, fdst) + shutil.move(fdst.name, local_path) + except Exception: + if os.path.exists(local_path): + os.unlink(local_path) + raise + + # XXX: First time, decompression will not be necessary (by using fsrc), but + # it will happen nonetheless + return gzip.GzipFile(local_path, "rb") + + +class OpenMLError(ValueError): + """HTTP 412 is a specific OpenML error code, indicating a generic error""" + + pass + + +def _get_json_content_from_openml_api( + url: str, + error_message: Optional[str], + data_home: Optional[str], + n_retries: int = 3, + delay: float = 1.0, +) -> Dict: + """ + Loads json data from the openml api. + + Parameters + ---------- + url : str + The URL to load from. Should be an official OpenML endpoint. + + error_message : str or None + The error message to raise if an acceptable OpenML error is thrown + (acceptable error is, e.g., data id not found. Other errors, like 404's + will throw the native error message). + + data_home : str or None + Location to cache the response. None if no cache is required. + + n_retries : int, default=3 + Number of retries when HTTP errors are encountered. Error with status + code 412 won't be retried as they represent OpenML generic errors. + + delay : float, default=1.0 + Number of seconds between retries. + + Returns + ------- + json_data : json + the json result from the OpenML server if the call was successful. + An exception otherwise. 
+ """ + + @_retry_with_clean_cache(url, data_home=data_home) + def _load_json(): + with closing( + _open_openml_url(url, data_home, n_retries=n_retries, delay=delay) + ) as response: + return json.loads(response.read().decode("utf-8")) + + try: + return _load_json() + except HTTPError as error: + # 412 is an OpenML specific error code, indicating a generic error + # (e.g., data not found) + if error.code != 412: + raise error + + # 412 error, not in except for nicer traceback + raise OpenMLError(error_message) + + +def _get_data_info_by_name( + name: str, + version: Union[int, str], + data_home: Optional[str], + n_retries: int = 3, + delay: float = 1.0, +): + """ + Utilizes the openml dataset listing api to find a dataset by + name/version + OpenML api function: + https://www.openml.org/api_docs#!/data/get_data_list_data_name_data_name + + Parameters + ---------- + name : str + name of the dataset + + version : int or str + If version is an integer, the exact name/version will be obtained from + OpenML. If version is a string (value: "active") it will take the first + version from OpenML that is annotated as active. Any other string + values except "active" are treated as integer. + + data_home : str or None + Location to cache the response. None if no cache is required. + + n_retries : int, default=3 + Number of retries when HTTP errors are encountered. Error with status + code 412 won't be retried as they represent OpenML generic errors. + + delay : float, default=1.0 + Number of seconds between retries. + + Returns + ------- + first_dataset : json + json representation of the first dataset object that adhired to the + search criteria + + """ + if version == "active": + # situation in which we return the oldest active version + url = _SEARCH_NAME.format(name) + "/status/active/" + error_msg = "No active dataset {} found.".format(name) + json_data = _get_json_content_from_openml_api( + url, + error_msg, + data_home=data_home, + n_retries=n_retries, + delay=delay, + ) + res = json_data["data"]["dataset"] + if len(res) > 1: + first_version = version = res[0]["version"] + warning_msg = ( + "Multiple active versions of the dataset matching the name" + f" {name} exist. Versions may be fundamentally different, " + f"returning version {first_version}. " + "Available versions:\n" + ) + for r in res: + warning_msg += f"- version {r['version']}, status: {r['status']}\n" + warning_msg += ( + f" url: https://www.openml.org/search?type=data&id={r['did']}\n" + ) + warn(warning_msg) + return res[0] + + # an integer version has been provided + url = (_SEARCH_NAME + "/data_version/{}").format(name, version) + try: + json_data = _get_json_content_from_openml_api( + url, + error_message=None, + data_home=data_home, + n_retries=n_retries, + delay=delay, + ) + except OpenMLError: + # we can do this in 1 function call if OpenML does not require the + # specification of the dataset status (i.e., return datasets with a + # given name / version regardless of active, deactivated, etc. ) + # TODO: feature request OpenML. 
+ url += "/status/deactivated" + error_msg = "Dataset {} with version {} not found.".format(name, version) + json_data = _get_json_content_from_openml_api( + url, + error_msg, + data_home=data_home, + n_retries=n_retries, + delay=delay, + ) + + return json_data["data"]["dataset"][0] + + +def _get_data_description_by_id( + data_id: int, + data_home: Optional[str], + n_retries: int = 3, + delay: float = 1.0, +) -> Dict[str, Any]: + # OpenML API function: https://www.openml.org/api_docs#!/data/get_data_id + url = _DATA_INFO.format(data_id) + error_message = "Dataset with data_id {} not found.".format(data_id) + json_data = _get_json_content_from_openml_api( + url, + error_message, + data_home=data_home, + n_retries=n_retries, + delay=delay, + ) + return json_data["data_set_description"] + + +def _get_data_features( + data_id: int, + data_home: Optional[str], + n_retries: int = 3, + delay: float = 1.0, +) -> OpenmlFeaturesType: + # OpenML function: + # https://www.openml.org/api_docs#!/data/get_data_features_id + url = _DATA_FEATURES.format(data_id) + error_message = "Dataset with data_id {} not found.".format(data_id) + json_data = _get_json_content_from_openml_api( + url, + error_message, + data_home=data_home, + n_retries=n_retries, + delay=delay, + ) + return json_data["data_features"]["feature"] + + +def _get_data_qualities( + data_id: int, + data_home: Optional[str], + n_retries: int = 3, + delay: float = 1.0, +) -> OpenmlQualitiesType: + # OpenML API function: + # https://www.openml.org/api_docs#!/data/get_data_qualities_id + url = _DATA_QUALITIES.format(data_id) + error_message = "Dataset with data_id {} not found.".format(data_id) + json_data = _get_json_content_from_openml_api( + url, + error_message, + data_home=data_home, + n_retries=n_retries, + delay=delay, + ) + # the qualities might not be available, but we still try to process + # the data + return json_data.get("data_qualities", {}).get("quality", []) + + +def _get_num_samples(data_qualities: OpenmlQualitiesType) -> int: + """Get the number of samples from data qualities. + + Parameters + ---------- + data_qualities : list of dict + Used to retrieve the number of instances (samples) in the dataset. + + Returns + ------- + n_samples : int + The number of samples in the dataset or -1 if data qualities are + unavailable. + """ + # If the data qualities are unavailable, we return -1 + default_n_samples = -1 + + qualities = {d["name"]: d["value"] for d in data_qualities} + return int(float(qualities.get("NumberOfInstances", default_n_samples))) + + +def _load_arff_response( + url: str, + data_home: Optional[str], + parser: str, + output_type: str, + openml_columns_info: dict, + feature_names_to_select: List[str], + target_names_to_select: List[str], + shape: Optional[Tuple[int, int]], + md5_checksum: str, + n_retries: int = 3, + delay: float = 1.0, + read_csv_kwargs: Optional[Dict] = None, +): + """Load the ARFF data associated with the OpenML URL. + + In addition of loading the data, this function will also check the + integrity of the downloaded file from OpenML using MD5 checksum. + + Parameters + ---------- + url : str + The URL of the ARFF file on OpenML. + + data_home : str + The location where to cache the data. + + parser : {"liac-arff", "pandas"} + The parser used to parse the ARFF file. + + output_type : {"numpy", "pandas", "sparse"} + The type of the arrays that will be returned. 
The possibilities are: + + - `"numpy"`: both `X` and `y` will be NumPy arrays; + - `"sparse"`: `X` will be sparse matrix and `y` will be a NumPy array; + - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a + pandas Series or DataFrame. + + openml_columns_info : dict + The information provided by OpenML regarding the columns of the ARFF + file. + + feature_names_to_select : list of str + The list of the features to be selected. + + target_names_to_select : list of str + The list of the target variables to be selected. + + shape : tuple or None + With `parser="liac-arff"`, when using a generator to load the data, + one needs to provide the shape of the data beforehand. + + md5_checksum : str + The MD5 checksum provided by OpenML to check the data integrity. + + n_retries : int, default=3 + The number of times to retry downloading the data if it fails. + + delay : float, default=1.0 + The delay between two consecutive downloads in seconds. + + read_csv_kwargs : dict, default=None + Keyword arguments to pass to `pandas.read_csv` when using the pandas parser. + It allows to overwrite the default options. + + .. versionadded:: 1.3 + + Returns + ------- + X : {ndarray, sparse matrix, dataframe} + The data matrix. + + y : {ndarray, dataframe, series} + The target. + + frame : dataframe or None + A dataframe containing both `X` and `y`. `None` if + `output_array_type != "pandas"`. + + categories : list of str or None + The names of the features that are categorical. `None` if + `output_array_type == "pandas"`. + """ + gzip_file = _open_openml_url(url, data_home, n_retries=n_retries, delay=delay) + with closing(gzip_file): + md5 = hashlib.md5() + for chunk in iter(lambda: gzip_file.read(4096), b""): + md5.update(chunk) + actual_md5_checksum = md5.hexdigest() + + if actual_md5_checksum != md5_checksum: + raise ValueError( + f"md5 checksum of local file for {url} does not match description: " + f"expected: {md5_checksum} but got {actual_md5_checksum}. " + "Downloaded file could have been modified / corrupted, clean cache " + "and retry..." + ) + + def _open_url_and_load_gzip_file(url, data_home, n_retries, delay, arff_params): + gzip_file = _open_openml_url(url, data_home, n_retries=n_retries, delay=delay) + with closing(gzip_file): + return load_arff_from_gzip_file(gzip_file, **arff_params) + + arff_params: Dict = dict( + parser=parser, + output_type=output_type, + openml_columns_info=openml_columns_info, + feature_names_to_select=feature_names_to_select, + target_names_to_select=target_names_to_select, + shape=shape, + read_csv_kwargs=read_csv_kwargs or {}, + ) + try: + X, y, frame, categories = _open_url_and_load_gzip_file( + url, data_home, n_retries, delay, arff_params + ) + except Exception as exc: + if parser != "pandas": + raise + + from pandas.errors import ParserError + + if not isinstance(exc, ParserError): + raise + + # A parsing error could come from providing the wrong quotechar + # to pandas. By default, we use a double quote. Thus, we retry + # with a single quote before to raise the error. 
+ arff_params["read_csv_kwargs"].update(quotechar="'") + X, y, frame, categories = _open_url_and_load_gzip_file( + url, data_home, n_retries, delay, arff_params + ) + + return X, y, frame, categories + + +def _download_data_to_bunch( + url: str, + sparse: bool, + data_home: Optional[str], + *, + as_frame: bool, + openml_columns_info: List[dict], + data_columns: List[str], + target_columns: List[str], + shape: Optional[Tuple[int, int]], + md5_checksum: str, + n_retries: int = 3, + delay: float = 1.0, + parser: str, + read_csv_kwargs: Optional[Dict] = None, +): + """Download ARFF data, load it to a specific container and create to Bunch. + + This function has a mechanism to retry/cache/clean the data. + + Parameters + ---------- + url : str + The URL of the ARFF file on OpenML. + + sparse : bool + Whether the dataset is expected to use the sparse ARFF format. + + data_home : str + The location where to cache the data. + + as_frame : bool + Whether or not to return the data into a pandas DataFrame. + + openml_columns_info : list of dict + The information regarding the columns provided by OpenML for the + ARFF dataset. The information is stored as a list of dictionaries. + + data_columns : list of str + The list of the features to be selected. + + target_columns : list of str + The list of the target variables to be selected. + + shape : tuple or None + With `parser="liac-arff"`, when using a generator to load the data, + one needs to provide the shape of the data beforehand. + + md5_checksum : str + The MD5 checksum provided by OpenML to check the data integrity. + + n_retries : int, default=3 + Number of retries when HTTP errors are encountered. Error with status + code 412 won't be retried as they represent OpenML generic errors. + + delay : float, default=1.0 + Number of seconds between retries. + + parser : {"liac-arff", "pandas"} + The parser used to parse the ARFF file. + + read_csv_kwargs : dict, default=None + Keyword arguments to pass to `pandas.read_csv` when using the pandas parser. + It allows to overwrite the default options. + + .. versionadded:: 1.3 + + Returns + ------- + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + X : {ndarray, sparse matrix, dataframe} + The data matrix. + y : {ndarray, dataframe, series} + The target. + frame : dataframe or None + A dataframe containing both `X` and `y`. `None` if + `output_array_type != "pandas"`. + categories : list of str or None + The names of the features that are categorical. `None` if + `output_array_type == "pandas"`. + """ + # Prepare which columns and data types should be returned for the X and y + features_dict = {feature["name"]: feature for feature in openml_columns_info} + + if sparse: + output_type = "sparse" + elif as_frame: + output_type = "pandas" + else: + output_type = "numpy" + + # XXX: target columns should all be categorical or all numeric + _verify_target_data_type(features_dict, target_columns) + for name in target_columns: + column_info = features_dict[name] + n_missing_values = int(column_info["number_of_missing_values"]) + if n_missing_values > 0: + raise ValueError( + f"Target column '{column_info['name']}' has {n_missing_values} missing " + "values. Missing values are not supported for target columns." + ) + + no_retry_exception = None + if parser == "pandas": + # If we get a ParserError with pandas, then we don't want to retry and we raise + # early. 
+ from pandas.errors import ParserError + + no_retry_exception = ParserError + + X, y, frame, categories = _retry_with_clean_cache( + url, data_home, no_retry_exception + )(_load_arff_response)( + url, + data_home, + parser=parser, + output_type=output_type, + openml_columns_info=features_dict, + feature_names_to_select=data_columns, + target_names_to_select=target_columns, + shape=shape, + md5_checksum=md5_checksum, + n_retries=n_retries, + delay=delay, + read_csv_kwargs=read_csv_kwargs, + ) + + return Bunch( + data=X, + target=y, + frame=frame, + categories=categories, + feature_names=data_columns, + target_names=target_columns, + ) + + +def _verify_target_data_type(features_dict, target_columns): + # verifies the data type of the y array in case there are multiple targets + # (throws an error if these targets do not comply with sklearn support) + if not isinstance(target_columns, list): + raise ValueError("target_column should be list, got: %s" % type(target_columns)) + found_types = set() + for target_column in target_columns: + if target_column not in features_dict: + raise KeyError(f"Could not find target_column='{target_column}'") + if features_dict[target_column]["data_type"] == "numeric": + found_types.add(np.float64) + else: + found_types.add(object) + + # note: we compare to a string, not boolean + if features_dict[target_column]["is_ignore"] == "true": + warn(f"target_column='{target_column}' has flag is_ignore.") + if features_dict[target_column]["is_row_identifier"] == "true": + warn(f"target_column='{target_column}' has flag is_row_identifier.") + if len(found_types) > 1: + raise ValueError( + "Can only handle homogeneous multi-target datasets, " + "i.e., all targets are either numeric or " + "categorical." + ) + + +def _valid_data_column_names(features_list, target_columns): + # logic for determining on which columns can be learned. Note that from the + # OpenML guide follows that columns that have the `is_row_identifier` or + # `is_ignore` flag, these can not be learned on. Also target columns are + # excluded. + valid_data_column_names = [] + for feature in features_list: + if ( + feature["name"] not in target_columns + and feature["is_ignore"] != "true" + and feature["is_row_identifier"] != "true" + ): + valid_data_column_names.append(feature["name"]) + return valid_data_column_names + + +@validate_params( + { + "name": [str, None], + "version": [Interval(Integral, 1, None, closed="left"), StrOptions({"active"})], + "data_id": [Interval(Integral, 1, None, closed="left"), None], + "data_home": [str, os.PathLike, None], + "target_column": [str, list, None], + "cache": [bool], + "return_X_y": [bool], + "as_frame": [bool, StrOptions({"auto"})], + "n_retries": [Interval(Integral, 1, None, closed="left")], + "delay": [Interval(Real, 0.0, None, closed="neither")], + "parser": [ + StrOptions({"auto", "pandas", "liac-arff"}), + ], + "read_csv_kwargs": [dict, None], + }, + prefer_skip_nested_validation=True, +) +def fetch_openml( + name: Optional[str] = None, + *, + version: Union[str, int] = "active", + data_id: Optional[int] = None, + data_home: Optional[Union[str, os.PathLike]] = None, + target_column: Optional[Union[str, List]] = "default-target", + cache: bool = True, + return_X_y: bool = False, + as_frame: Union[str, bool] = "auto", + n_retries: int = 3, + delay: float = 1.0, + parser: str = "auto", + read_csv_kwargs: Optional[Dict] = None, +): + """Fetch dataset from openml by name or dataset id. 
+ + Datasets are uniquely identified by either an integer ID or by a + combination of name and version (i.e. there might be multiple + versions of the 'iris' dataset). Please give either name or data_id + (not both). In case a name is given, a version can also be + provided. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.20 + + .. note:: EXPERIMENTAL + + The API is experimental (particularly the return value structure), + and might have small backward-incompatible changes without notice + or warning in future releases. + + Parameters + ---------- + name : str, default=None + String identifier of the dataset. Note that OpenML can have multiple + datasets with the same name. + + version : int or 'active', default='active' + Version of the dataset. Can only be provided if also ``name`` is given. + If 'active' the oldest version that's still active is used. Since + there may be more than one active version of a dataset, and those + versions may fundamentally be different from one another, setting an + exact version is highly recommended. + + data_id : int, default=None + OpenML ID of the dataset. The most specific way of retrieving a + dataset. If data_id is not given, name (and potential version) are + used to obtain a dataset. + + data_home : str or path-like, default=None + Specify another download and cache folder for the data sets. By default + all scikit-learn data is stored in '~/scikit_learn_data' subfolders. + + target_column : str, list or None, default='default-target' + Specify the column name in the data to use as target. If + 'default-target', the standard target column a stored on the server + is used. If ``None``, all columns are returned as data and the + target is ``None``. If list (of strings), all columns with these names + are returned as multi-target (Note: not all scikit-learn classifiers + can handle all types of multi-output combinations). + + cache : bool, default=True + Whether to cache the downloaded datasets into `data_home`. + + return_X_y : bool, default=False + If True, returns ``(data, target)`` instead of a Bunch object. See + below for more information about the `data` and `target` objects. + + as_frame : bool or 'auto', default='auto' + If True, the data is a pandas DataFrame including columns with + appropriate dtypes (numeric, string or categorical). The target is + a pandas DataFrame or Series depending on the number of target_columns. + The Bunch will contain a ``frame`` attribute with the target and the + data. If ``return_X_y`` is True, then ``(data, target)`` will be pandas + DataFrames or Series as describe above. + + If `as_frame` is 'auto', the data and target will be converted to + DataFrame or Series as if `as_frame` is set to True, unless the dataset + is stored in sparse format. + + If `as_frame` is False, the data and target will be NumPy arrays and + the `data` will only contain numerical values when `parser="liac-arff"` + where the categories are provided in the attribute `categories` of the + `Bunch` instance. When `parser="pandas"`, no ordinal encoding is made. + + .. versionchanged:: 0.24 + The default value of `as_frame` changed from `False` to `'auto'` + in 0.24. + + n_retries : int, default=3 + Number of retries when HTTP errors or network timeouts are encountered. + Error with status code 412 won't be retried as they represent OpenML + generic errors. + + delay : float, default=1.0 + Number of seconds between retries. + + parser : {"auto", "pandas", "liac-arff"}, default="auto" + Parser used to load the ARFF file. 
Two parsers are implemented: + + - `"pandas"`: this is the most efficient parser. However, it requires + pandas to be installed and can only open dense datasets. + - `"liac-arff"`: this is a pure Python ARFF parser that is much less + memory- and CPU-efficient. It deals with sparse ARFF datasets. + + If `"auto"`, the parser is chosen automatically such that `"liac-arff"` + is selected for sparse ARFF datasets, otherwise `"pandas"` is selected. + + .. versionadded:: 1.2 + .. versionchanged:: 1.4 + The default value of `parser` changes from `"liac-arff"` to + `"auto"`. + + read_csv_kwargs : dict, default=None + Keyword arguments passed to :func:`pandas.read_csv` when loading the data + from a ARFF file and using the pandas parser. It can allow to + overwrite some default parameters. + + .. versionadded:: 1.3 + + Returns + ------- + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data : np.array, scipy.sparse.csr_matrix of floats, or pandas DataFrame + The feature matrix. Categorical features are encoded as ordinals. + target : np.array, pandas Series or DataFrame + The regression target or classification labels, if applicable. + Dtype is float if numeric, and object if categorical. If + ``as_frame`` is True, ``target`` is a pandas object. + DESCR : str + The full description of the dataset. + feature_names : list + The names of the dataset columns. + target_names: list + The names of the target columns. + + .. versionadded:: 0.22 + + categories : dict or None + Maps each categorical feature name to a list of values, such + that the value encoded as i is ith in the list. If ``as_frame`` + is True, this is None. + details : dict + More metadata from OpenML. + frame : pandas DataFrame + Only present when `as_frame=True`. DataFrame with ``data`` and + ``target``. + + (data, target) : tuple if ``return_X_y`` is True + + .. note:: EXPERIMENTAL + + This interface is **experimental** and subsequent releases may + change attributes without notice (although there should only be + minor changes to ``data`` and ``target``). + + Missing values in the 'data' are represented as NaN's. Missing values + in 'target' are represented as NaN's (numerical target) or None + (categorical target). + + Notes + ----- + The `"pandas"` and `"liac-arff"` parsers can lead to different data types + in the output. The notable differences are the following: + + - The `"liac-arff"` parser always encodes categorical features as `str` objects. + To the contrary, the `"pandas"` parser instead infers the type while + reading and numerical categories will be casted into integers whenever + possible. + - The `"liac-arff"` parser uses float64 to encode numerical features + tagged as 'REAL' and 'NUMERICAL' in the metadata. The `"pandas"` + parser instead infers if these numerical features corresponds + to integers and uses panda's Integer extension dtype. + - In particular, classification datasets with integer categories are + typically loaded as such `(0, 1, ...)` with the `"pandas"` parser while + `"liac-arff"` will force the use of string encoded class labels such as + `"0"`, `"1"` and so on. + - The `"pandas"` parser will not strip single quotes - i.e. `'` - from + string columns. For instance, a string `'my string'` will be kept as is + while the `"liac-arff"` parser will strip the single quotes. For + categorical columns, the single quotes are stripped from the values. 
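+
+    As an illustration (assuming network access; OpenML dataset id 61 is a
+    copy of iris), the two parsers can be compared on the same dataset::
+
+        from sklearn.datasets import fetch_openml
+
+        X_pd, y_pd = fetch_openml(data_id=61, parser="pandas", return_X_y=True)
+        X_liac, y_liac = fetch_openml(
+            data_id=61, parser="liac-arff", return_X_y=True
+        )
+        X_pd.dtypes.equals(X_liac.dtypes)  # may differ, see the points above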
+ + In addition, when `as_frame=False` is used, the `"liac-arff"` parser + returns ordinally encoded data where the categories are provided in the + attribute `categories` of the `Bunch` instance. Instead, `"pandas"` returns + a NumPy array were the categories are not encoded. + + Examples + -------- + >>> from sklearn.datasets import fetch_openml + >>> adult = fetch_openml("adult", version=2) # doctest: +SKIP + >>> adult.frame.info() # doctest: +SKIP + + RangeIndex: 48842 entries, 0 to 48841 + Data columns (total 15 columns): + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 age 48842 non-null int64 + 1 workclass 46043 non-null category + 2 fnlwgt 48842 non-null int64 + 3 education 48842 non-null category + 4 education-num 48842 non-null int64 + 5 marital-status 48842 non-null category + 6 occupation 46033 non-null category + 7 relationship 48842 non-null category + 8 race 48842 non-null category + 9 sex 48842 non-null category + 10 capital-gain 48842 non-null int64 + 11 capital-loss 48842 non-null int64 + 12 hours-per-week 48842 non-null int64 + 13 native-country 47985 non-null category + 14 class 48842 non-null category + dtypes: category(9), int64(6) + memory usage: 2.7 MB + """ + if cache is False: + # no caching will be applied + data_home = None + else: + data_home = get_data_home(data_home=data_home) + data_home = join(str(data_home), "openml") + + # check valid function arguments. data_id XOR (name, version) should be + # provided + if name is not None: + # OpenML is case-insensitive, but the caching mechanism is not + # convert all data names (str) to lower case + name = name.lower() + if data_id is not None: + raise ValueError( + "Dataset data_id={} and name={} passed, but you can only " + "specify a numeric data_id or a name, not " + "both.".format(data_id, name) + ) + data_info = _get_data_info_by_name( + name, version, data_home, n_retries=n_retries, delay=delay + ) + data_id = data_info["did"] + elif data_id is not None: + # from the previous if statement, it is given that name is None + if version != "active": + raise ValueError( + "Dataset data_id={} and version={} passed, but you can only " + "specify a numeric data_id or a version, not " + "both.".format(data_id, version) + ) + else: + raise ValueError( + "Neither name nor data_id are provided. Please provide name or data_id." + ) + + data_description = _get_data_description_by_id(data_id, data_home) + if data_description["status"] != "active": + warn( + "Version {} of dataset {} is inactive, meaning that issues have " + "been found in the dataset. Try using a newer version from " + "this URL: {}".format( + data_description["version"], + data_description["name"], + data_description["url"], + ) + ) + if "error" in data_description: + warn( + "OpenML registered a problem with the dataset. It might be " + "unusable. Error: {}".format(data_description["error"]) + ) + if "warning" in data_description: + warn( + "OpenML raised a warning on the dataset. It might be " + "unusable. Warning: {}".format(data_description["warning"]) + ) + + return_sparse = data_description["format"].lower() == "sparse_arff" + as_frame = not return_sparse if as_frame == "auto" else as_frame + if parser == "auto": + parser_ = "liac-arff" if return_sparse else "pandas" + else: + parser_ = parser + + if parser_ == "pandas": + try: + check_pandas_support("`fetch_openml`") + except ImportError as exc: + if as_frame: + err_msg = ( + "Returning pandas objects requires pandas to be installed. 
" + "Alternatively, explicitly set `as_frame=False` and " + "`parser='liac-arff'`." + ) + else: + err_msg = ( + f"Using `parser={parser!r}` with dense data requires pandas to be " + "installed. Alternatively, explicitly set `parser='liac-arff'`." + ) + raise ImportError(err_msg) from exc + + if return_sparse: + if as_frame: + raise ValueError( + "Sparse ARFF datasets cannot be loaded with as_frame=True. " + "Use as_frame=False or as_frame='auto' instead." + ) + if parser_ == "pandas": + raise ValueError( + f"Sparse ARFF datasets cannot be loaded with parser={parser!r}. " + "Use parser='liac-arff' or parser='auto' instead." + ) + + # download data features, meta-info about column types + features_list = _get_data_features(data_id, data_home) + + if not as_frame: + for feature in features_list: + if "true" in (feature["is_ignore"], feature["is_row_identifier"]): + continue + if feature["data_type"] == "string": + raise ValueError( + "STRING attributes are not supported for " + "array representation. Try as_frame=True" + ) + + if target_column == "default-target": + # determines the default target based on the data feature results + # (which is currently more reliable than the data description; + # see issue: https://github.com/openml/OpenML/issues/768) + target_columns = [ + feature["name"] + for feature in features_list + if feature["is_target"] == "true" + ] + elif isinstance(target_column, str): + # for code-simplicity, make target_column by default a list + target_columns = [target_column] + elif target_column is None: + target_columns = [] + else: + # target_column already is of type list + target_columns = target_column + data_columns = _valid_data_column_names(features_list, target_columns) + + shape: Optional[Tuple[int, int]] + # determine arff encoding to return + if not return_sparse: + # The shape must include the ignored features to keep the right indexes + # during the arff data conversion. + data_qualities = _get_data_qualities(data_id, data_home) + shape = _get_num_samples(data_qualities), len(features_list) + else: + shape = None + + # obtain the data + url = data_description["url"] + bunch = _download_data_to_bunch( + url, + return_sparse, + data_home, + as_frame=bool(as_frame), + openml_columns_info=features_list, + shape=shape, + target_columns=target_columns, + data_columns=data_columns, + md5_checksum=data_description["md5_checksum"], + n_retries=n_retries, + delay=delay, + parser=parser_, + read_csv_kwargs=read_csv_kwargs, + ) + + if return_X_y: + return bunch.data, bunch.target + + description = "{}\n\nDownloaded from openml.org.".format( + data_description.pop("description") + ) + + bunch.update( + DESCR=description, + details=data_description, + url="https://www.openml.org/d/{}".format(data_id), + ) + + return bunch diff --git a/.venv/lib/python3.12/site-packages/sklearn/datasets/_rcv1.py b/.venv/lib/python3.12/site-packages/sklearn/datasets/_rcv1.py new file mode 100644 index 0000000000000000000000000000000000000000..b673f938f0e46f180e6cbd9235cc79b21fde1154 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/datasets/_rcv1.py @@ -0,0 +1,334 @@ +"""RCV1 dataset. 
+ +The dataset page is available at + + http://jmlr.csail.mit.edu/papers/volume5/lewis04a/ +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import logging +from gzip import GzipFile +from numbers import Integral, Real +from os import PathLike, makedirs, remove +from os.path import exists, join + +import joblib +import numpy as np +import scipy.sparse as sp + +from ..utils import Bunch +from ..utils import shuffle as shuffle_ +from ..utils._param_validation import Interval, StrOptions, validate_params +from . import get_data_home +from ._base import RemoteFileMetadata, _fetch_remote, _pkl_filepath, load_descr +from ._svmlight_format_io import load_svmlight_files + +# The original vectorized data can be found at: +# http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a13-vector-files/lyrl2004_vectors_test_pt0.dat.gz +# http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a13-vector-files/lyrl2004_vectors_test_pt1.dat.gz +# http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a13-vector-files/lyrl2004_vectors_test_pt2.dat.gz +# http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a13-vector-files/lyrl2004_vectors_test_pt3.dat.gz +# http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a13-vector-files/lyrl2004_vectors_train.dat.gz +# while the original stemmed token files can be found +# in the README, section B.12.i.: +# http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/lyrl2004_rcv1v2_README.htm +XY_METADATA = ( + RemoteFileMetadata( + url="https://ndownloader.figshare.com/files/5976069", + checksum="ed40f7e418d10484091b059703eeb95ae3199fe042891dcec4be6696b9968374", + filename="lyrl2004_vectors_test_pt0.dat.gz", + ), + RemoteFileMetadata( + url="https://ndownloader.figshare.com/files/5976066", + checksum="87700668ae45d45d5ca1ef6ae9bd81ab0f5ec88cc95dcef9ae7838f727a13aa6", + filename="lyrl2004_vectors_test_pt1.dat.gz", + ), + RemoteFileMetadata( + url="https://ndownloader.figshare.com/files/5976063", + checksum="48143ac703cbe33299f7ae9f4995db49a258690f60e5debbff8995c34841c7f5", + filename="lyrl2004_vectors_test_pt2.dat.gz", + ), + RemoteFileMetadata( + url="https://ndownloader.figshare.com/files/5976060", + checksum="dfcb0d658311481523c6e6ca0c3f5a3e1d3d12cde5d7a8ce629a9006ec7dbb39", + filename="lyrl2004_vectors_test_pt3.dat.gz", + ), + RemoteFileMetadata( + url="https://ndownloader.figshare.com/files/5976057", + checksum="5468f656d0ba7a83afc7ad44841cf9a53048a5c083eedc005dcdb5cc768924ae", + filename="lyrl2004_vectors_train.dat.gz", + ), +) + +# The original data can be found at: +# http://jmlr.csail.mit.edu/papers/volume5/lewis04a/a08-topic-qrels/rcv1-v2.topics.qrels.gz +TOPICS_METADATA = RemoteFileMetadata( + url="https://ndownloader.figshare.com/files/5976048", + checksum="2a98e5e5d8b770bded93afc8930d88299474317fe14181aee1466cc754d0d1c1", + filename="rcv1v2.topics.qrels.gz", +) + +logger = logging.getLogger(__name__) + + +@validate_params( + { + "data_home": [str, PathLike, None], + "subset": [StrOptions({"train", "test", "all"})], + "download_if_missing": ["boolean"], + "random_state": ["random_state"], + "shuffle": ["boolean"], + "return_X_y": ["boolean"], + "n_retries": [Interval(Integral, 1, None, closed="left")], + "delay": [Interval(Real, 0.0, None, closed="neither")], + }, + prefer_skip_nested_validation=True, +) +def fetch_rcv1( + *, + data_home=None, + subset="all", + download_if_missing=True, + random_state=None, + shuffle=False, + return_X_y=False, + n_retries=3, + delay=1.0, +): + """Load the 
RCV1 multilabel dataset (classification). + + Download it if necessary. + + Version: RCV1-v2, vectors, full sets, topics multilabels. + + ================= ===================== + Classes 103 + Samples total 804414 + Dimensionality 47236 + Features real, between 0 and 1 + ================= ===================== + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.17 + + Parameters + ---------- + data_home : str or path-like, default=None + Specify another download and cache folder for the datasets. By default + all scikit-learn data is stored in '~/scikit_learn_data' subfolders. + + subset : {'train', 'test', 'all'}, default='all' + Select the dataset to load: 'train' for the training set + (23149 samples), 'test' for the test set (781265 samples), + 'all' for both, with the training samples first if shuffle is False. + This follows the official LYRL2004 chronological split. + + download_if_missing : bool, default=True + If False, raise an OSError if the data is not locally available + instead of trying to download the data from the source site. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset shuffling. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + shuffle : bool, default=False + Whether to shuffle dataset. + + return_X_y : bool, default=False + If True, returns ``(dataset.data, dataset.target)`` instead of a Bunch + object. See below for more information about the `dataset.data` and + `dataset.target` object. + + .. versionadded:: 0.20 + + n_retries : int, default=3 + Number of retries when HTTP errors are encountered. + + .. versionadded:: 1.5 + + delay : float, default=1.0 + Number of seconds between retries. + + .. versionadded:: 1.5 + + Returns + ------- + dataset : :class:`~sklearn.utils.Bunch` + Dictionary-like object. Returned only if `return_X_y` is False. + `dataset` has the following attributes: + + - data : sparse matrix of shape (804414, 47236), dtype=np.float64 + The array has 0.16% of non zero values. Will be of CSR format. + - target : sparse matrix of shape (804414, 103), dtype=np.uint8 + Each sample has a value of 1 in its categories, and 0 in others. + The array has 3.15% of non zero values. Will be of CSR format. + - sample_id : ndarray of shape (804414,), dtype=np.uint32, + Identification number of each sample, as ordered in dataset.data. + - target_names : ndarray of shape (103,), dtype=object + Names of each target (RCV1 topics), as ordered in dataset.target. + - DESCR : str + Description of the RCV1 dataset. + + (data, target) : tuple + A tuple consisting of `dataset.data` and `dataset.target`, as + described above. Returned only if `return_X_y` is True. + + .. 
versionadded:: 0.20 + + Examples + -------- + >>> from sklearn.datasets import fetch_rcv1 + >>> rcv1 = fetch_rcv1() + >>> rcv1.data.shape + (804414, 47236) + >>> rcv1.target.shape + (804414, 103) + """ + N_SAMPLES = 804414 + N_FEATURES = 47236 + N_CATEGORIES = 103 + N_TRAIN = 23149 + + data_home = get_data_home(data_home=data_home) + rcv1_dir = join(data_home, "RCV1") + if download_if_missing: + if not exists(rcv1_dir): + makedirs(rcv1_dir) + + samples_path = _pkl_filepath(rcv1_dir, "samples.pkl") + sample_id_path = _pkl_filepath(rcv1_dir, "sample_id.pkl") + sample_topics_path = _pkl_filepath(rcv1_dir, "sample_topics.pkl") + topics_path = _pkl_filepath(rcv1_dir, "topics_names.pkl") + + # load data (X) and sample_id + if download_if_missing and (not exists(samples_path) or not exists(sample_id_path)): + files = [] + for each in XY_METADATA: + logger.info("Downloading %s" % each.url) + file_path = _fetch_remote( + each, dirname=rcv1_dir, n_retries=n_retries, delay=delay + ) + files.append(GzipFile(filename=file_path)) + + Xy = load_svmlight_files(files, n_features=N_FEATURES) + + # Training data is before testing data + X = sp.vstack([Xy[8], Xy[0], Xy[2], Xy[4], Xy[6]]).tocsr() + sample_id = np.hstack((Xy[9], Xy[1], Xy[3], Xy[5], Xy[7])) + sample_id = sample_id.astype(np.uint32, copy=False) + + joblib.dump(X, samples_path, compress=9) + joblib.dump(sample_id, sample_id_path, compress=9) + + # delete archives + for f in files: + f.close() + remove(f.name) + else: + X = joblib.load(samples_path) + sample_id = joblib.load(sample_id_path) + + # load target (y), categories, and sample_id_bis + if download_if_missing and ( + not exists(sample_topics_path) or not exists(topics_path) + ): + logger.info("Downloading %s" % TOPICS_METADATA.url) + topics_archive_path = _fetch_remote( + TOPICS_METADATA, dirname=rcv1_dir, n_retries=n_retries, delay=delay + ) + + # parse the target file + n_cat = -1 + n_doc = -1 + doc_previous = -1 + y = np.zeros((N_SAMPLES, N_CATEGORIES), dtype=np.uint8) + sample_id_bis = np.zeros(N_SAMPLES, dtype=np.int32) + category_names = {} + with GzipFile(filename=topics_archive_path, mode="rb") as f: + for line in f: + line_components = line.decode("ascii").split(" ") + if len(line_components) == 3: + cat, doc, _ = line_components + if cat not in category_names: + n_cat += 1 + category_names[cat] = n_cat + + doc = int(doc) + if doc != doc_previous: + doc_previous = doc + n_doc += 1 + sample_id_bis[n_doc] = doc + y[n_doc, category_names[cat]] = 1 + + # delete archive + remove(topics_archive_path) + + # Samples in X are ordered with sample_id, + # whereas in y, they are ordered with sample_id_bis. + permutation = _find_permutation(sample_id_bis, sample_id) + y = y[permutation, :] + + # save category names in a list, with same order than y + categories = np.empty(N_CATEGORIES, dtype=object) + for k in category_names.keys(): + categories[category_names[k]] = k + + # reorder categories in lexicographic order + order = np.argsort(categories) + categories = categories[order] + y = sp.csr_matrix(y[:, order]) + + joblib.dump(y, sample_topics_path, compress=9) + joblib.dump(categories, topics_path, compress=9) + else: + y = joblib.load(sample_topics_path) + categories = joblib.load(topics_path) + + if subset == "all": + pass + elif subset == "train": + X = X[:N_TRAIN, :] + y = y[:N_TRAIN, :] + sample_id = sample_id[:N_TRAIN] + elif subset == "test": + X = X[N_TRAIN:, :] + y = y[N_TRAIN:, :] + sample_id = sample_id[N_TRAIN:] + else: + raise ValueError( + "Unknown subset parameter. 
Got '%s' instead of one" + " of ('all', 'train', test')" % subset + ) + + if shuffle: + X, y, sample_id = shuffle_(X, y, sample_id, random_state=random_state) + + fdescr = load_descr("rcv1.rst") + + if return_X_y: + return X, y + + return Bunch( + data=X, target=y, sample_id=sample_id, target_names=categories, DESCR=fdescr + ) + + +def _inverse_permutation(p): + """Inverse permutation p.""" + n = p.size + s = np.zeros(n, dtype=np.int32) + i = np.arange(n, dtype=np.int32) + np.put(s, p, i) # s[p] = i + return s + + +def _find_permutation(a, b): + """Find the permutation from a to b.""" + t = np.argsort(a) + u = np.argsort(b) + u_ = _inverse_permutation(u) + return t[u_] diff --git a/.venv/lib/python3.12/site-packages/sklearn/datasets/_samples_generator.py b/.venv/lib/python3.12/site-packages/sklearn/datasets/_samples_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..c3b4622d6a91bc579b505fa5dd8dd429de563198 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/datasets/_samples_generator.py @@ -0,0 +1,2383 @@ +""" +Generate samples of synthetic data sets. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import array +import numbers +from collections.abc import Iterable +from numbers import Integral, Real + +import numpy as np +import scipy.sparse as sp +from scipy import linalg + +from sklearn.utils import Bunch + +from ..preprocessing import MultiLabelBinarizer +from ..utils import check_array, check_random_state +from ..utils import shuffle as util_shuffle +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.random import sample_without_replacement + + +def _generate_hypercube(samples, dimensions, rng): + """Returns distinct binary samples of length dimensions.""" + if dimensions > 30: + return np.hstack( + [ + rng.randint(2, size=(samples, dimensions - 30)), + _generate_hypercube(samples, 30, rng), + ] + ) + out = sample_without_replacement(2**dimensions, samples, random_state=rng).astype( + dtype=">u4", copy=False + ) + out = np.unpackbits(out.view(">u1")).reshape((-1, 32))[:, -dimensions:] + return out + + +@validate_params( + { + "n_samples": [Interval(Integral, 1, None, closed="left")], + "n_features": [Interval(Integral, 1, None, closed="left")], + "n_informative": [Interval(Integral, 1, None, closed="left")], + "n_redundant": [Interval(Integral, 0, None, closed="left")], + "n_repeated": [Interval(Integral, 0, None, closed="left")], + "n_classes": [Interval(Integral, 1, None, closed="left")], + "n_clusters_per_class": [Interval(Integral, 1, None, closed="left")], + "weights": ["array-like", None], + "flip_y": [Interval(Real, 0, 1, closed="both")], + "class_sep": [Interval(Real, 0, None, closed="neither")], + "hypercube": ["boolean"], + "shift": [Interval(Real, None, None, closed="neither"), "array-like", None], + "scale": [Interval(Real, 0, None, closed="neither"), "array-like", None], + "shuffle": ["boolean"], + "random_state": ["random_state"], + "return_X_y": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def make_classification( + n_samples=100, + n_features=20, + *, + n_informative=2, + n_redundant=2, + n_repeated=0, + n_classes=2, + n_clusters_per_class=2, + weights=None, + flip_y=0.01, + class_sep=1.0, + hypercube=True, + shift=0.0, + scale=1.0, + shuffle=True, + random_state=None, + return_X_y=True, +): + """Generate a random n-class classification problem. 
+ + This initially creates clusters of points normally distributed (std=1) + about vertices of an ``n_informative``-dimensional hypercube with sides of + length ``2*class_sep`` and assigns an equal number of clusters to each + class. It introduces interdependence between these features and adds + various types of further noise to the data. + + Without shuffling, ``X`` horizontally stacks features in the following + order: the primary ``n_informative`` features, followed by ``n_redundant`` + linear combinations of the informative features, followed by ``n_repeated`` + duplicates, drawn randomly with replacement from the informative and + redundant features. The remaining features are filled with random noise. + Thus, without shuffling, all useful features are contained in the columns + ``X[:, :n_informative + n_redundant + n_repeated]``. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_samples : int, default=100 + The number of samples. + + n_features : int, default=20 + The total number of features. These comprise ``n_informative`` + informative features, ``n_redundant`` redundant features, + ``n_repeated`` duplicated features and + ``n_features-n_informative-n_redundant-n_repeated`` useless features + drawn at random. + + n_informative : int, default=2 + The number of informative features. Each class is composed of a number + of gaussian clusters each located around the vertices of a hypercube + in a subspace of dimension ``n_informative``. For each cluster, + informative features are drawn independently from N(0, 1) and then + randomly linearly combined within each cluster in order to add + covariance. The clusters are then placed on the vertices of the + hypercube. + + n_redundant : int, default=2 + The number of redundant features. These features are generated as + random linear combinations of the informative features. + + n_repeated : int, default=0 + The number of duplicated features, drawn randomly from the informative + and the redundant features. + + n_classes : int, default=2 + The number of classes (or labels) of the classification problem. + + n_clusters_per_class : int, default=2 + The number of clusters per class. + + weights : array-like of shape (n_classes,) or (n_classes - 1,),\ + default=None + The proportions of samples assigned to each class. If None, then + classes are balanced. Note that if ``len(weights) == n_classes - 1``, + then the last class weight is automatically inferred. + More than ``n_samples`` samples may be returned if the sum of + ``weights`` exceeds 1. Note that the actual class proportions will + not exactly match ``weights`` when ``flip_y`` isn't 0. + + flip_y : float, default=0.01 + The fraction of samples whose class is assigned randomly. Larger + values introduce noise in the labels and make the classification + task harder. Note that the default setting flip_y > 0 might lead + to less than ``n_classes`` in y in some cases. + + class_sep : float, default=1.0 + The factor multiplying the hypercube size. Larger values spread + out the clusters/classes and make the classification task easier. + + hypercube : bool, default=True + If True, the clusters are put on the vertices of a hypercube. If + False, the clusters are put on the vertices of a random polytope. + + shift : float, ndarray of shape (n_features,) or None, default=0.0 + Shift features by the specified value. If None, then features + are shifted by a random value drawn in [-class_sep, class_sep]. 
+ + scale : float, ndarray of shape (n_features,) or None, default=1.0 + Multiply features by the specified value. If None, then features + are scaled by a random value drawn in [1, 100]. Note that scaling + happens after shifting. + + shuffle : bool, default=True + Shuffle the samples and the features. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset creation. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + return_X_y : bool, default=True + If True, a tuple ``(X, y)`` instead of a Bunch object is returned. + + .. versionadded:: 1.7 + + Returns + ------- + data : :class:`~sklearn.utils.Bunch` if `return_X_y` is `False`. + Dictionary-like object, with the following attributes. + + DESCR : str + A description of the function that generated the dataset. + parameter : dict + A dictionary that stores the values of the arguments passed to the + generator function. + feature_info : list of len(n_features) + A description for each generated feature. + X : ndarray of shape (n_samples, n_features) + The generated samples. + y : ndarray of shape (n_samples,) + An integer label for class membership of each sample. + + .. versionadded:: 1.7 + + (X, y) : tuple if ``return_X_y`` is True + A tuple of generated samples and labels. + + See Also + -------- + make_blobs : Simplified variant. + make_multilabel_classification : Unrelated generator for multilabel tasks. + + Notes + ----- + The algorithm is adapted from Guyon [1] and was designed to generate + the "Madelon" dataset. + + References + ---------- + .. [1] I. Guyon, "Design of experiments for the NIPS 2003 variable + selection benchmark", 2003. + + Examples + -------- + >>> from sklearn.datasets import make_classification + >>> X, y = make_classification(random_state=42) + >>> X.shape + (100, 20) + >>> y.shape + (100,) + >>> list(y[:5]) + [np.int64(0), np.int64(0), np.int64(1), np.int64(1), np.int64(0)] + """ + generator = check_random_state(random_state) + + # Count features, clusters and samples + if n_informative + n_redundant + n_repeated > n_features: + raise ValueError( + "Number of informative, redundant and repeated " + "features must sum to less than the number of total" + " features" + ) + # Use log2 to avoid overflow errors + if n_informative < np.log2(n_classes * n_clusters_per_class): + msg = "n_classes({}) * n_clusters_per_class({}) must be" + msg += " smaller or equal 2**n_informative({})={}" + raise ValueError( + msg.format(n_classes, n_clusters_per_class, n_informative, 2**n_informative) + ) + + if weights is not None: + # we define new variable, weight_, instead of modifying user defined parameter. + if len(weights) not in [n_classes, n_classes - 1]: + raise ValueError( + "Weights specified but incompatible with number of classes." 
+ ) + if len(weights) == n_classes - 1: + if isinstance(weights, list): + weights_ = weights + [1.0 - sum(weights)] + else: + weights_ = np.resize(weights, n_classes) + weights_[-1] = 1.0 - sum(weights_[:-1]) + else: + weights_ = weights.copy() + else: + weights_ = [1.0 / n_classes] * n_classes + + n_random = n_features - n_informative - n_redundant - n_repeated + n_clusters = n_classes * n_clusters_per_class + + # Distribute samples among clusters by weight + n_samples_per_cluster = [ + int(n_samples * weights_[k % n_classes] / n_clusters_per_class) + for k in range(n_clusters) + ] + + for i in range(n_samples - sum(n_samples_per_cluster)): + n_samples_per_cluster[i % n_clusters] += 1 + + # Initialize X and y + X = np.zeros((n_samples, n_features)) + y = np.zeros(n_samples, dtype=int) + + # Build the polytope whose vertices become cluster centroids + centroids = _generate_hypercube(n_clusters, n_informative, generator).astype( + float, copy=False + ) + centroids *= 2 * class_sep + centroids -= class_sep + if not hypercube: + centroids *= generator.uniform(size=(n_clusters, 1)) + centroids *= generator.uniform(size=(1, n_informative)) + + # Initially draw informative features from the standard normal + X[:, :n_informative] = generator.standard_normal(size=(n_samples, n_informative)) + + # Create each cluster; a variant of make_blobs + stop = 0 + for k, centroid in enumerate(centroids): + start, stop = stop, stop + n_samples_per_cluster[k] + y[start:stop] = k % n_classes # assign labels + X_k = X[start:stop, :n_informative] # slice a view of the cluster + + A = 2 * generator.uniform(size=(n_informative, n_informative)) - 1 + X_k[...] = np.dot(X_k, A) # introduce random covariance + + X_k += centroid # shift the cluster to a vertex + + # Create redundant features + if n_redundant > 0: + B = 2 * generator.uniform(size=(n_informative, n_redundant)) - 1 + X[:, n_informative : n_informative + n_redundant] = np.dot( + X[:, :n_informative], B + ) + + # Repeat some features + n = n_informative + n_redundant + if n_repeated > 0: + indices = ((n - 1) * generator.uniform(size=n_repeated) + 0.5).astype(np.intp) + X[:, n : n + n_repeated] = X[:, indices] + + # Fill useless features + if n_random > 0: + X[:, -n_random:] = generator.standard_normal(size=(n_samples, n_random)) + + # Randomly replace labels + if flip_y >= 0.0: + flip_mask = generator.uniform(size=n_samples) < flip_y + y[flip_mask] = generator.randint(n_classes, size=flip_mask.sum()) + + # Randomly shift and scale + if shift is None: + shift = (2 * generator.uniform(size=n_features) - 1) * class_sep + X += shift + + if scale is None: + scale = 1 + 100 * generator.uniform(size=n_features) + X *= scale + + indices = np.arange(n_features) + if shuffle: + # Randomly permute samples + X, y = util_shuffle(X, y, random_state=generator) + + # Randomly permute features + generator.shuffle(indices) + X[:, :] = X[:, indices] + + if return_X_y: + return X, y + + # feat_desc describes features in X + feat_desc = ["random"] * n_features + for i, index in enumerate(indices): + if index < n_informative: + feat_desc[i] = "informative" + elif n_informative <= index < n_informative + n_redundant: + feat_desc[i] = "redundant" + elif n <= index < n + n_repeated: + feat_desc[i] = "repeated" + + parameters = { + "n_samples": n_samples, + "n_features": n_features, + "n_informative": n_informative, + "n_redundant": n_redundant, + "n_repeated": n_repeated, + "n_classes": n_classes, + "n_clusters_per_class": n_clusters_per_class, + "weights": weights, + "flip_y": 
flip_y, + "class_sep": class_sep, + "hypercube": hypercube, + "shift": shift, + "scale": scale, + "shuffle": shuffle, + "random_state": random_state, + "return_X_y": return_X_y, + } + + bunch = Bunch( + DESCR=make_classification.__doc__, + parameters=parameters, + feature_info=feat_desc, + X=X, + y=y, + ) + + return bunch + + +@validate_params( + { + "n_samples": [Interval(Integral, 1, None, closed="left")], + "n_features": [Interval(Integral, 1, None, closed="left")], + "n_classes": [Interval(Integral, 1, None, closed="left")], + "n_labels": [Interval(Integral, 0, None, closed="left")], + "length": [Interval(Integral, 1, None, closed="left")], + "allow_unlabeled": ["boolean"], + "sparse": ["boolean"], + "return_indicator": [StrOptions({"dense", "sparse"}), "boolean"], + "return_distributions": ["boolean"], + "random_state": ["random_state"], + }, + prefer_skip_nested_validation=True, +) +def make_multilabel_classification( + n_samples=100, + n_features=20, + *, + n_classes=5, + n_labels=2, + length=50, + allow_unlabeled=True, + sparse=False, + return_indicator="dense", + return_distributions=False, + random_state=None, +): + """Generate a random multilabel classification problem. + + For each sample, the generative process is: + - pick the number of labels: n ~ Poisson(n_labels) + - n times, choose a class c: c ~ Multinomial(theta) + - pick the document length: k ~ Poisson(length) + - k times, choose a word: w ~ Multinomial(theta_c) + + In the above process, rejection sampling is used to make sure that + n is never zero or more than `n_classes`, and that the document length + is never zero. Likewise, we reject classes which have already been chosen. + + For an example of usage, see + :ref:`sphx_glr_auto_examples_datasets_plot_random_multilabel_dataset.py`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_samples : int, default=100 + The number of samples. + + n_features : int, default=20 + The total number of features. + + n_classes : int, default=5 + The number of classes of the classification problem. + + n_labels : int, default=2 + The average number of labels per instance. More precisely, the number + of labels per sample is drawn from a Poisson distribution with + ``n_labels`` as its expected value, but samples are bounded (using + rejection sampling) by ``n_classes``, and must be nonzero if + ``allow_unlabeled`` is False. + + length : int, default=50 + The sum of the features (number of words if documents) is drawn from + a Poisson distribution with this expected value. + + allow_unlabeled : bool, default=True + If ``True``, some instances might not belong to any class. + + sparse : bool, default=False + If ``True``, return a sparse feature matrix. + + .. versionadded:: 0.17 + parameter to allow *sparse* output. + + return_indicator : {'dense', 'sparse'} or False, default='dense' + If ``'dense'`` return ``Y`` in the dense binary indicator format. If + ``'sparse'`` return ``Y`` in the sparse binary indicator format. + ``False`` returns a list of lists of labels. + + return_distributions : bool, default=False + If ``True``, return the prior class probability and conditional + probabilities of features given classes, from which the data was + drawn. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset creation. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + X : ndarray of shape (n_samples, n_features) + The generated samples. 
+ + Y : {ndarray, sparse matrix} of shape (n_samples, n_classes) + The label sets. Sparse matrix should be of CSR format. + + p_c : ndarray of shape (n_classes,) + The probability of each class being drawn. Only returned if + ``return_distributions=True``. + + p_w_c : ndarray of shape (n_features, n_classes) + The probability of each feature being drawn given each class. + Only returned if ``return_distributions=True``. + + Examples + -------- + >>> from sklearn.datasets import make_multilabel_classification + >>> X, y = make_multilabel_classification(n_labels=3, random_state=42) + >>> X.shape + (100, 20) + >>> y.shape + (100, 5) + >>> list(y[:3]) + [array([1, 1, 0, 1, 0]), array([0, 1, 1, 1, 0]), array([0, 1, 0, 0, 0])] + """ + + generator = check_random_state(random_state) + p_c = generator.uniform(size=n_classes) + p_c /= p_c.sum() + cumulative_p_c = np.cumsum(p_c) + p_w_c = generator.uniform(size=(n_features, n_classes)) + p_w_c /= np.sum(p_w_c, axis=0) + + def sample_example(): + _, n_classes = p_w_c.shape + + # pick a nonzero number of labels per document by rejection sampling + y_size = n_classes + 1 + while (not allow_unlabeled and y_size == 0) or y_size > n_classes: + y_size = generator.poisson(n_labels) + + # pick n classes + y = set() + while len(y) != y_size: + # pick a class with probability P(c) + c = np.searchsorted(cumulative_p_c, generator.uniform(size=y_size - len(y))) + y.update(c) + y = list(y) + + # pick a non-zero document length by rejection sampling + n_words = 0 + while n_words == 0: + n_words = generator.poisson(length) + + # generate a document of length n_words + if len(y) == 0: + # if sample does not belong to any class, generate noise word + words = generator.randint(n_features, size=n_words) + return words, y + + # sample words with replacement from selected classes + cumulative_p_w_sample = p_w_c.take(y, axis=1).sum(axis=1).cumsum() + cumulative_p_w_sample /= cumulative_p_w_sample[-1] + words = np.searchsorted(cumulative_p_w_sample, generator.uniform(size=n_words)) + return words, y + + X_indices = array.array("i") + X_indptr = array.array("i", [0]) + Y = [] + for i in range(n_samples): + words, y = sample_example() + X_indices.extend(words) + X_indptr.append(len(X_indices)) + Y.append(y) + X_data = np.ones(len(X_indices), dtype=np.float64) + X = sp.csr_matrix((X_data, X_indices, X_indptr), shape=(n_samples, n_features)) + X.sum_duplicates() + if not sparse: + X = X.toarray() + + # return_indicator can be True due to backward compatibility + if return_indicator in (True, "sparse", "dense"): + lb = MultiLabelBinarizer(sparse_output=(return_indicator == "sparse")) + Y = lb.fit([range(n_classes)]).transform(Y) + if return_distributions: + return X, Y, p_c, p_w_c + return X, Y + + +@validate_params( + { + "n_samples": [Interval(Integral, 1, None, closed="left")], + "random_state": ["random_state"], + }, + prefer_skip_nested_validation=True, +) +def make_hastie_10_2(n_samples=12000, *, random_state=None): + """Generate data for binary classification used in Hastie et al. 2009, Example 10.2. + + The ten features are standard independent Gaussian and + the target ``y`` is defined by:: + + y[i] = 1 if np.sum(X[i] ** 2) > 9.34 else -1 + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_samples : int, default=12000 + The number of samples. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset creation. Pass an int + for reproducible output across multiple function calls. 
+ See :term:`Glossary `. + + Returns + ------- + X : ndarray of shape (n_samples, 10) + The input samples. + + y : ndarray of shape (n_samples,) + The output values. + + See Also + -------- + make_gaussian_quantiles : A generalization of this dataset approach. + + References + ---------- + .. [1] T. Hastie, R. Tibshirani and J. Friedman, "Elements of Statistical + Learning Ed. 2", Springer, 2009. + + Examples + -------- + >>> from sklearn.datasets import make_hastie_10_2 + >>> X, y = make_hastie_10_2(n_samples=24000, random_state=42) + >>> X.shape + (24000, 10) + >>> y.shape + (24000,) + >>> list(y[:5]) + [np.float64(-1.0), np.float64(1.0), np.float64(-1.0), np.float64(1.0), + np.float64(-1.0)] + """ + rs = check_random_state(random_state) + + shape = (n_samples, 10) + X = rs.normal(size=shape).reshape(shape) + y = ((X**2.0).sum(axis=1) > 9.34).astype(np.float64, copy=False) + y[y == 0.0] = -1.0 + + return X, y + + +@validate_params( + { + "n_samples": [Interval(Integral, 1, None, closed="left")], + "n_features": [Interval(Integral, 1, None, closed="left")], + "n_informative": [Interval(Integral, 0, None, closed="left")], + "n_targets": [Interval(Integral, 1, None, closed="left")], + "bias": [Interval(Real, None, None, closed="neither")], + "effective_rank": [Interval(Integral, 1, None, closed="left"), None], + "tail_strength": [Interval(Real, 0, 1, closed="both")], + "noise": [Interval(Real, 0, None, closed="left")], + "shuffle": ["boolean"], + "coef": ["boolean"], + "random_state": ["random_state"], + }, + prefer_skip_nested_validation=True, +) +def make_regression( + n_samples=100, + n_features=100, + *, + n_informative=10, + n_targets=1, + bias=0.0, + effective_rank=None, + tail_strength=0.5, + noise=0.0, + shuffle=True, + coef=False, + random_state=None, +): + """Generate a random regression problem. + + The input set can either be well conditioned (by default) or have a low + rank-fat tail singular profile. See :func:`make_low_rank_matrix` for + more details. + + The output is generated by applying a (potentially biased) random linear + regression model with `n_informative` nonzero regressors to the previously + generated input and some gaussian centered noise with some adjustable + scale. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_samples : int, default=100 + The number of samples. + + n_features : int, default=100 + The number of features. + + n_informative : int, default=10 + The number of informative features, i.e., the number of features used + to build the linear model used to generate the output. + + n_targets : int, default=1 + The number of regression targets, i.e., the dimension of the y output + vector associated with a sample. By default, the output is a scalar. + + bias : float, default=0.0 + The bias term in the underlying linear model. + + effective_rank : int, default=None + If not None: + The approximate number of singular vectors required to explain most + of the input data by linear combinations. Using this kind of + singular spectrum in the input allows the generator to reproduce + the correlations often observed in practice. + If None: + The input set is well conditioned, centered and gaussian with + unit variance. + + tail_strength : float, default=0.5 + The relative importance of the fat noisy tail of the singular values + profile if `effective_rank` is not None. When a float, it should be + between 0 and 1. + + noise : float, default=0.0 + The standard deviation of the gaussian noise applied to the output. 
+ + shuffle : bool, default=True + Shuffle the samples and the features. + + coef : bool, default=False + If True, the coefficients of the underlying linear model are returned. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset creation. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + X : ndarray of shape (n_samples, n_features) + The input samples. + + y : ndarray of shape (n_samples,) or (n_samples, n_targets) + The output values. + + coef : ndarray of shape (n_features,) or (n_features, n_targets) + The coefficient of the underlying linear model. It is returned only if + coef is True. + + Examples + -------- + >>> from sklearn.datasets import make_regression + >>> X, y = make_regression(n_samples=5, n_features=2, noise=1, random_state=42) + >>> X + array([[ 0.4967, -0.1382 ], + [ 0.6476, 1.523], + [-0.2341, -0.2341], + [-0.4694, 0.5425], + [ 1.579, 0.7674]]) + >>> y + array([ 6.737, 37.79, -10.27, 0.4017, 42.22]) + """ + n_informative = min(n_features, n_informative) + generator = check_random_state(random_state) + + if effective_rank is None: + # Randomly generate a well conditioned input set + X = generator.standard_normal(size=(n_samples, n_features)) + + else: + # Randomly generate a low rank, fat tail input set + X = make_low_rank_matrix( + n_samples=n_samples, + n_features=n_features, + effective_rank=effective_rank, + tail_strength=tail_strength, + random_state=generator, + ) + + # Generate a ground truth model with only n_informative features being non + # zeros (the other features are not correlated to y and should be ignored + # by a sparsifying regularizers such as L1 or elastic net) + ground_truth = np.zeros((n_features, n_targets)) + ground_truth[:n_informative, :] = 100 * generator.uniform( + size=(n_informative, n_targets) + ) + + y = np.dot(X, ground_truth) + bias + + # Add noise + if noise > 0.0: + y += generator.normal(scale=noise, size=y.shape) + + # Randomly permute samples and features + if shuffle: + X, y = util_shuffle(X, y, random_state=generator) + + indices = np.arange(n_features) + generator.shuffle(indices) + X[:, :] = X[:, indices] + ground_truth = ground_truth[indices] + + y = np.squeeze(y) + + if coef: + return X, y, np.squeeze(ground_truth) + + else: + return X, y + + +@validate_params( + { + "n_samples": [Interval(Integral, 0, None, closed="left"), tuple], + "shuffle": ["boolean"], + "noise": [Interval(Real, 0, None, closed="left"), None], + "random_state": ["random_state"], + "factor": [Interval(Real, 0, 1, closed="left")], + }, + prefer_skip_nested_validation=True, +) +def make_circles( + n_samples=100, *, shuffle=True, noise=None, random_state=None, factor=0.8 +): + """Make a large circle containing a smaller circle in 2d. + + A simple toy dataset to visualize clustering and classification + algorithms. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_samples : int or tuple of shape (2,), dtype=int, default=100 + If int, it is the total number of points generated. + For odd numbers, the inner circle will have one point more than the + outer circle. + If two-element tuple, number of points in outer circle and inner + circle. + + .. versionchanged:: 0.23 + Added two-element tuple. + + shuffle : bool, default=True + Whether to shuffle the samples. + + noise : float, default=None + Standard deviation of Gaussian noise added to the data. 
+ + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset shuffling and noise. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + factor : float, default=.8 + Scale factor between inner and outer circle in the range `[0, 1)`. + + Returns + ------- + X : ndarray of shape (n_samples, 2) + The generated samples. + + y : ndarray of shape (n_samples,) + The integer labels (0 or 1) for class membership of each sample. + + Examples + -------- + >>> from sklearn.datasets import make_circles + >>> X, y = make_circles(random_state=42) + >>> X.shape + (100, 2) + >>> y.shape + (100,) + >>> list(y[:5]) + [np.int64(1), np.int64(1), np.int64(1), np.int64(0), np.int64(0)] + """ + if isinstance(n_samples, numbers.Integral): + n_samples_out = n_samples // 2 + n_samples_in = n_samples - n_samples_out + else: # n_samples is a tuple + if len(n_samples) != 2: + raise ValueError("When a tuple, n_samples must have exactly two elements.") + n_samples_out, n_samples_in = n_samples + + generator = check_random_state(random_state) + # so as not to have the first point = last point, we set endpoint=False + linspace_out = np.linspace(0, 2 * np.pi, n_samples_out, endpoint=False) + linspace_in = np.linspace(0, 2 * np.pi, n_samples_in, endpoint=False) + outer_circ_x = np.cos(linspace_out) + outer_circ_y = np.sin(linspace_out) + inner_circ_x = np.cos(linspace_in) * factor + inner_circ_y = np.sin(linspace_in) * factor + + X = np.vstack( + [np.append(outer_circ_x, inner_circ_x), np.append(outer_circ_y, inner_circ_y)] + ).T + y = np.hstack( + [np.zeros(n_samples_out, dtype=np.intp), np.ones(n_samples_in, dtype=np.intp)] + ) + if shuffle: + X, y = util_shuffle(X, y, random_state=generator) + + if noise is not None: + X += generator.normal(scale=noise, size=X.shape) + + return X, y + + +@validate_params( + { + "n_samples": [Interval(Integral, 1, None, closed="left"), tuple], + "shuffle": ["boolean"], + "noise": [Interval(Real, 0, None, closed="left"), None], + "random_state": ["random_state"], + }, + prefer_skip_nested_validation=True, +) +def make_moons(n_samples=100, *, shuffle=True, noise=None, random_state=None): + """Make two interleaving half circles. + + A simple toy dataset to visualize clustering and classification + algorithms. Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_samples : int or tuple of shape (2,), dtype=int, default=100 + If int, the total number of points generated. + If two-element tuple, number of points in each of two moons. + + .. versionchanged:: 0.23 + Added two-element tuple. + + shuffle : bool, default=True + Whether to shuffle the samples. + + noise : float, default=None + Standard deviation of Gaussian noise added to the data. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset shuffling and noise. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + X : ndarray of shape (n_samples, 2) + The generated samples. + + y : ndarray of shape (n_samples,) + The integer labels (0 or 1) for class membership of each sample. 
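+
+    Notes
+    -----
+    Both half circles are sampled on ``[0, pi]``; the second moon is a
+    mirrored and vertically shifted copy of the first, roughly::
+
+        outer = (cos(t), sin(t))
+        inner = (1 - cos(t), 1 - sin(t) - 0.5)
+
+    which matches the construction in the function body below.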
+ + Examples + -------- + >>> from sklearn.datasets import make_moons + >>> X, y = make_moons(n_samples=200, noise=0.2, random_state=42) + >>> X.shape + (200, 2) + >>> y.shape + (200,) + """ + + if isinstance(n_samples, numbers.Integral): + n_samples_out = n_samples // 2 + n_samples_in = n_samples - n_samples_out + else: + try: + n_samples_out, n_samples_in = n_samples + except ValueError as e: + raise ValueError( + "`n_samples` can be either an int or a two-element tuple." + ) from e + + generator = check_random_state(random_state) + + outer_circ_x = np.cos(np.linspace(0, np.pi, n_samples_out)) + outer_circ_y = np.sin(np.linspace(0, np.pi, n_samples_out)) + inner_circ_x = 1 - np.cos(np.linspace(0, np.pi, n_samples_in)) + inner_circ_y = 1 - np.sin(np.linspace(0, np.pi, n_samples_in)) - 0.5 + + X = np.vstack( + [np.append(outer_circ_x, inner_circ_x), np.append(outer_circ_y, inner_circ_y)] + ).T + y = np.hstack( + [np.zeros(n_samples_out, dtype=np.intp), np.ones(n_samples_in, dtype=np.intp)] + ) + + if shuffle: + X, y = util_shuffle(X, y, random_state=generator) + + if noise is not None: + X += generator.normal(scale=noise, size=X.shape) + + return X, y + + +@validate_params( + { + "n_samples": [Interval(Integral, 1, None, closed="left"), "array-like"], + "n_features": [Interval(Integral, 1, None, closed="left")], + "centers": [Interval(Integral, 1, None, closed="left"), "array-like", None], + "cluster_std": [Interval(Real, 0, None, closed="left"), "array-like"], + "center_box": [tuple], + "shuffle": ["boolean"], + "random_state": ["random_state"], + "return_centers": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def make_blobs( + n_samples=100, + n_features=2, + *, + centers=None, + cluster_std=1.0, + center_box=(-10.0, 10.0), + shuffle=True, + random_state=None, + return_centers=False, +): + """Generate isotropic Gaussian blobs for clustering. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_samples : int or array-like, default=100 + If int, it is the total number of points equally divided among + clusters. + If array-like, each element of the sequence indicates + the number of samples per cluster. + + .. versionchanged:: v0.20 + one can now pass an array-like to the ``n_samples`` parameter + + n_features : int, default=2 + The number of features for each sample. + + centers : int or array-like of shape (n_centers, n_features), default=None + The number of centers to generate, or the fixed center locations. + If n_samples is an int and centers is None, 3 centers are generated. + If n_samples is array-like, centers must be + either None or an array of length equal to the length of n_samples. + + cluster_std : float or array-like of float, default=1.0 + The standard deviation of the clusters. + + center_box : tuple of float (min, max), default=(-10.0, 10.0) + The bounding box for each cluster center when centers are + generated at random. + + shuffle : bool, default=True + Shuffle the samples. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset creation. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + return_centers : bool, default=False + If True, then return the centers of each cluster. + + .. versionadded:: 0.23 + + Returns + ------- + X : ndarray of shape (n_samples, n_features) + The generated samples. + + y : ndarray of shape (n_samples,) + The integer labels for cluster membership of each sample. 
+ + centers : ndarray of shape (n_centers, n_features) + The centers of each cluster. Only returned if + ``return_centers=True``. + + See Also + -------- + make_classification : A more intricate variant. + + Examples + -------- + >>> from sklearn.datasets import make_blobs + >>> X, y = make_blobs(n_samples=10, centers=3, n_features=2, + ... random_state=0) + >>> print(X.shape) + (10, 2) + >>> y + array([0, 0, 1, 0, 2, 2, 2, 1, 1, 0]) + >>> X, y = make_blobs(n_samples=[3, 3, 4], centers=None, n_features=2, + ... random_state=0) + >>> print(X.shape) + (10, 2) + >>> y + array([0, 1, 2, 0, 2, 2, 2, 1, 1, 0]) + """ + generator = check_random_state(random_state) + + if isinstance(n_samples, numbers.Integral): + # Set n_centers by looking at centers arg + if centers is None: + centers = 3 + + if isinstance(centers, numbers.Integral): + n_centers = centers + centers = generator.uniform( + center_box[0], center_box[1], size=(n_centers, n_features) + ) + + else: + centers = check_array(centers) + n_features = centers.shape[1] + n_centers = centers.shape[0] + + else: + # Set n_centers by looking at [n_samples] arg + n_centers = len(n_samples) + if centers is None: + centers = generator.uniform( + center_box[0], center_box[1], size=(n_centers, n_features) + ) + if not isinstance(centers, Iterable): + raise ValueError( + "Parameter `centers` must be array-like. Got {!r} instead".format( + centers + ) + ) + if len(centers) != n_centers: + raise ValueError( + "Length of `n_samples` not consistent with number of " + f"centers. Got n_samples = {n_samples} and centers = {centers}" + ) + centers = check_array(centers) + n_features = centers.shape[1] + + # stds: if cluster_std is given as list, it must be consistent + # with the n_centers + if hasattr(cluster_std, "__len__") and len(cluster_std) != n_centers: + raise ValueError( + "Length of `clusters_std` not consistent with " + "number of centers. Got centers = {} " + "and cluster_std = {}".format(centers, cluster_std) + ) + + if isinstance(cluster_std, numbers.Real): + cluster_std = np.full(len(centers), cluster_std) + + if isinstance(n_samples, Iterable): + n_samples_per_center = n_samples + else: + n_samples_per_center = [int(n_samples // n_centers)] * n_centers + + for i in range(n_samples % n_centers): + n_samples_per_center[i] += 1 + + cum_sum_n_samples = np.cumsum(n_samples_per_center) + X = np.empty(shape=(sum(n_samples_per_center), n_features), dtype=np.float64) + y = np.empty(shape=(sum(n_samples_per_center),), dtype=int) + + for i, (n, std) in enumerate(zip(n_samples_per_center, cluster_std)): + start_idx = cum_sum_n_samples[i - 1] if i > 0 else 0 + end_idx = cum_sum_n_samples[i] + X[start_idx:end_idx] = generator.normal( + loc=centers[i], scale=std, size=(n, n_features) + ) + y[start_idx:end_idx] = i + + if shuffle: + X, y = util_shuffle(X, y, random_state=generator) + + if return_centers: + return X, y, centers + else: + return X, y + + +@validate_params( + { + "n_samples": [Interval(Integral, 1, None, closed="left")], + "n_features": [Interval(Integral, 5, None, closed="left")], + "noise": [Interval(Real, 0.0, None, closed="left")], + "random_state": ["random_state"], + }, + prefer_skip_nested_validation=True, +) +def make_friedman1(n_samples=100, n_features=10, *, noise=0.0, random_state=None): + """Generate the "Friedman #1" regression problem. + + This dataset is described in Friedman [1] and Breiman [2]. + + Inputs `X` are independent features uniformly distributed on the interval + [0, 1]. 
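A usage sketch of `make_blobs` exercising the array-like `n_samples`, explicit `centers`, per-cluster `cluster_std` and `return_centers` paths of the implementation above; all concrete values are arbitrary::

    import numpy as np
    from sklearn.datasets import make_blobs

    centers = np.array([[0.0, 0.0], [5.0, 5.0], [-5.0, 5.0]])
    X, y, returned_centers = make_blobs(
        n_samples=[50, 30, 20],          # one count per cluster
        centers=centers,                 # fixed center locations
        cluster_std=[0.5, 1.0, 2.0],     # one spread per cluster
        return_centers=True,
        random_state=0,
    )
    print(X.shape, np.bincount(y))               # (100, 2) [50 30 20]
    print(np.allclose(returned_centers, centers))  # True: centers returned as given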
The output `y` is created according to the formula:: + + y(X) = 10 * sin(pi * X[:, 0] * X[:, 1]) + 20 * (X[:, 2] - 0.5) ** 2 \ ++ 10 * X[:, 3] + 5 * X[:, 4] + noise * N(0, 1). + + Out of the `n_features` features, only 5 are actually used to compute + `y`. The remaining features are independent of `y`. + + The number of features has to be >= 5. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_samples : int, default=100 + The number of samples. + + n_features : int, default=10 + The number of features. Should be at least 5. + + noise : float, default=0.0 + The standard deviation of the gaussian noise applied to the output. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset noise. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + X : ndarray of shape (n_samples, n_features) + The input samples. + + y : ndarray of shape (n_samples,) + The output values. + + References + ---------- + .. [1] J. Friedman, "Multivariate adaptive regression splines", The Annals + of Statistics 19 (1), pages 1-67, 1991. + + .. [2] L. Breiman, "Bagging predictors", Machine Learning 24, + pages 123-140, 1996. + + Examples + -------- + >>> from sklearn.datasets import make_friedman1 + >>> X, y = make_friedman1(random_state=42) + >>> X.shape + (100, 10) + >>> y.shape + (100,) + >>> list(y[:3]) + [np.float64(16.8), np.float64(5.87), np.float64(9.46)] + """ + generator = check_random_state(random_state) + + X = generator.uniform(size=(n_samples, n_features)) + y = ( + 10 * np.sin(np.pi * X[:, 0] * X[:, 1]) + + 20 * (X[:, 2] - 0.5) ** 2 + + 10 * X[:, 3] + + 5 * X[:, 4] + + noise * generator.standard_normal(size=(n_samples)) + ) + + return X, y + + +@validate_params( + { + "n_samples": [Interval(Integral, 1, None, closed="left")], + "noise": [Interval(Real, 0, None, closed="left")], + "random_state": ["random_state"], + }, + prefer_skip_nested_validation=True, +) +def make_friedman2(n_samples=100, *, noise=0.0, random_state=None): + """Generate the "Friedman #2" regression problem. + + This dataset is described in Friedman [1] and Breiman [2]. + + Inputs `X` are 4 independent features uniformly distributed on the + intervals:: + + 0 <= X[:, 0] <= 100, + 40 * pi <= X[:, 1] <= 560 * pi, + 0 <= X[:, 2] <= 1, + 1 <= X[:, 3] <= 11. + + The output `y` is created according to the formula:: + + y(X) = (X[:, 0] ** 2 + (X[:, 1] * X[:, 2] \ + - 1 / (X[:, 1] * X[:, 3])) ** 2) ** 0.5 + noise * N(0, 1). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_samples : int, default=100 + The number of samples. + + noise : float, default=0.0 + The standard deviation of the gaussian noise applied to the output. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset noise. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + X : ndarray of shape (n_samples, 4) + The input samples. + + y : ndarray of shape (n_samples,) + The output values. + + References + ---------- + .. [1] J. Friedman, "Multivariate adaptive regression splines", The Annals + of Statistics 19 (1), pages 1-67, 1991. + + .. [2] L. Breiman, "Bagging predictors", Machine Learning 24, + pages 123-140, 1996. 
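Because `make_friedman1` draws `y` deterministically from the first five columns when `noise=0.0`, the docstring formula can be verified directly; the sizes in this sketch are arbitrary::

    import numpy as np
    from sklearn.datasets import make_friedman1

    X, y = make_friedman1(n_samples=500, n_features=10, noise=0.0, random_state=0)
    y_check = (
        10 * np.sin(np.pi * X[:, 0] * X[:, 1])
        + 20 * (X[:, 2] - 0.5) ** 2
        + 10 * X[:, 3]
        + 5 * X[:, 4]
    )
    print(np.allclose(y, y_check))  # True: columns 5..9 do not influence y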
+ + Examples + -------- + >>> from sklearn.datasets import make_friedman2 + >>> X, y = make_friedman2(random_state=42) + >>> X.shape + (100, 4) + >>> y.shape + (100,) + >>> list(y[:3]) + [np.float64(1229.4), np.float64(27.0), np.float64(65.6)] + """ + generator = check_random_state(random_state) + + X = generator.uniform(size=(n_samples, 4)) + X[:, 0] *= 100 + X[:, 1] *= 520 * np.pi + X[:, 1] += 40 * np.pi + X[:, 3] *= 10 + X[:, 3] += 1 + + y = ( + X[:, 0] ** 2 + (X[:, 1] * X[:, 2] - 1 / (X[:, 1] * X[:, 3])) ** 2 + ) ** 0.5 + noise * generator.standard_normal(size=(n_samples)) + + return X, y + + +@validate_params( + { + "n_samples": [Interval(Integral, 1, None, closed="left")], + "noise": [Interval(Real, 0, None, closed="left")], + "random_state": ["random_state"], + }, + prefer_skip_nested_validation=True, +) +def make_friedman3(n_samples=100, *, noise=0.0, random_state=None): + """Generate the "Friedman #3" regression problem. + + This dataset is described in Friedman [1] and Breiman [2]. + + Inputs `X` are 4 independent features uniformly distributed on the + intervals:: + + 0 <= X[:, 0] <= 100, + 40 * pi <= X[:, 1] <= 560 * pi, + 0 <= X[:, 2] <= 1, + 1 <= X[:, 3] <= 11. + + The output `y` is created according to the formula:: + + y(X) = arctan((X[:, 1] * X[:, 2] - 1 / (X[:, 1] * X[:, 3])) \ +/ X[:, 0]) + noise * N(0, 1). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_samples : int, default=100 + The number of samples. + + noise : float, default=0.0 + The standard deviation of the gaussian noise applied to the output. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset noise. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + X : ndarray of shape (n_samples, 4) + The input samples. + + y : ndarray of shape (n_samples,) + The output values. + + References + ---------- + .. [1] J. Friedman, "Multivariate adaptive regression splines", The Annals + of Statistics 19 (1), pages 1-67, 1991. + + .. [2] L. Breiman, "Bagging predictors", Machine Learning 24, + pages 123-140, 1996. + + Examples + -------- + >>> from sklearn.datasets import make_friedman3 + >>> X, y = make_friedman3(random_state=42) + >>> X.shape + (100, 4) + >>> y.shape + (100,) + >>> list(y[:3]) + [np.float64(1.54), np.float64(0.956), np.float64(0.414)] + """ + generator = check_random_state(random_state) + + X = generator.uniform(size=(n_samples, 4)) + X[:, 0] *= 100 + X[:, 1] *= 520 * np.pi + X[:, 1] += 40 * np.pi + X[:, 3] *= 10 + X[:, 3] += 1 + + y = np.arctan( + (X[:, 1] * X[:, 2] - 1 / (X[:, 1] * X[:, 3])) / X[:, 0] + ) + noise * generator.standard_normal(size=(n_samples)) + + return X, y + + +@validate_params( + { + "n_samples": [Interval(Integral, 1, None, closed="left")], + "n_features": [Interval(Integral, 1, None, closed="left")], + "effective_rank": [Interval(Integral, 1, None, closed="left")], + "tail_strength": [Interval(Real, 0, 1, closed="both")], + "random_state": ["random_state"], + }, + prefer_skip_nested_validation=True, +) +def make_low_rank_matrix( + n_samples=100, + n_features=100, + *, + effective_rank=10, + tail_strength=0.5, + random_state=None, +): + """Generate a mostly low rank matrix with bell-shaped singular values. 
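The same kind of check works for `make_friedman2` and `make_friedman3` defined above: with `noise=0.0` the targets can be recomputed from the documented formulas (sample counts are arbitrary)::

    import numpy as np
    from sklearn.datasets import make_friedman2, make_friedman3

    X2, y2 = make_friedman2(n_samples=200, noise=0.0, random_state=0)
    X3, y3 = make_friedman3(n_samples=200, noise=0.0, random_state=0)

    y2_check = np.sqrt(
        X2[:, 0] ** 2 + (X2[:, 1] * X2[:, 2] - 1 / (X2[:, 1] * X2[:, 3])) ** 2
    )
    y3_check = np.arctan(
        (X3[:, 1] * X3[:, 2] - 1 / (X3[:, 1] * X3[:, 3])) / X3[:, 0]
    )
    print(np.allclose(y2, y2_check), np.allclose(y3, y3_check))  # True True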
+ + Most of the variance can be explained by a bell-shaped curve of width + effective_rank: the low rank part of the singular values profile is:: + + (1 - tail_strength) * exp(-1.0 * (i / effective_rank) ** 2) + + The remaining singular values' tail is fat, decreasing as:: + + tail_strength * exp(-0.1 * i / effective_rank). + + The low rank part of the profile can be considered the structured + signal part of the data while the tail can be considered the noisy + part of the data that cannot be summarized by a low number of linear + components (singular vectors). + + This kind of singular profiles is often seen in practice, for instance: + - gray level pictures of faces + - TF-IDF vectors of text documents crawled from the web + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_samples : int, default=100 + The number of samples. + + n_features : int, default=100 + The number of features. + + effective_rank : int, default=10 + The approximate number of singular vectors required to explain most of + the data by linear combinations. + + tail_strength : float, default=0.5 + The relative importance of the fat noisy tail of the singular values + profile. The value should be between 0 and 1. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset creation. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + X : ndarray of shape (n_samples, n_features) + The matrix. + + Examples + -------- + >>> from numpy.linalg import svd + >>> from sklearn.datasets import make_low_rank_matrix + >>> X = make_low_rank_matrix( + ... n_samples=50, + ... n_features=25, + ... effective_rank=5, + ... tail_strength=0.01, + ... random_state=0, + ... ) + >>> X.shape + (50, 25) + """ + generator = check_random_state(random_state) + n = min(n_samples, n_features) + + # Random (ortho normal) vectors + u, _ = linalg.qr( + generator.standard_normal(size=(n_samples, n)), + mode="economic", + check_finite=False, + ) + v, _ = linalg.qr( + generator.standard_normal(size=(n_features, n)), + mode="economic", + check_finite=False, + ) + + # Index of the singular values + singular_ind = np.arange(n, dtype=np.float64) + + # Build the singular profile by assembling signal and noise components + low_rank = (1 - tail_strength) * np.exp(-1.0 * (singular_ind / effective_rank) ** 2) + tail = tail_strength * np.exp(-0.1 * singular_ind / effective_rank) + s = np.identity(n) * (low_rank + tail) + + return np.dot(np.dot(u, s), v.T) + + +@validate_params( + { + "n_samples": [Interval(Integral, 1, None, closed="left")], + "n_components": [Interval(Integral, 1, None, closed="left")], + "n_features": [Interval(Integral, 1, None, closed="left")], + "n_nonzero_coefs": [Interval(Integral, 1, None, closed="left")], + "random_state": ["random_state"], + }, + prefer_skip_nested_validation=True, +) +def make_sparse_coded_signal( + n_samples, + *, + n_components, + n_features, + n_nonzero_coefs, + random_state=None, +): + """Generate a signal as a sparse combination of dictionary elements. + + Returns matrices `Y`, `D` and `X` such that `Y = XD` where `X` is of shape + `(n_samples, n_components)`, `D` is of shape `(n_components, n_features)`, and + each row of `X` has exactly `n_nonzero_coefs` non-zero elements. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_samples : int + Number of samples to generate. + + n_components : int + Number of components in the dictionary. 
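A quick way to see the bell-shaped singular profile produced by `make_low_rank_matrix` is to inspect the singular values of its output; the matrix size, `effective_rank` and `tail_strength` below are arbitrary::

    import numpy as np
    from sklearn.datasets import make_low_rank_matrix

    X = make_low_rank_matrix(
        n_samples=200, n_features=50, effective_rank=5,
        tail_strength=0.01, random_state=0,
    )
    s = np.linalg.svd(X, compute_uv=False)
    # With a weak tail, the first few singular values should carry
    # most of the spectral mass.
    print(s[:10] / s.sum())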
+ + n_features : int + Number of features of the dataset to generate. + + n_nonzero_coefs : int + Number of active (non-zero) coefficients in each sample. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset creation. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + data : ndarray of shape (n_samples, n_features) + The encoded signal (Y). + + dictionary : ndarray of shape (n_components, n_features) + The dictionary with normalized components (D). + + code : ndarray of shape (n_samples, n_components) + The sparse code such that each column of this matrix has exactly + n_nonzero_coefs non-zero items (X). + + Examples + -------- + >>> from sklearn.datasets import make_sparse_coded_signal + >>> data, dictionary, code = make_sparse_coded_signal( + ... n_samples=50, + ... n_components=100, + ... n_features=10, + ... n_nonzero_coefs=4, + ... random_state=0 + ... ) + >>> data.shape + (50, 10) + >>> dictionary.shape + (100, 10) + >>> code.shape + (50, 100) + """ + generator = check_random_state(random_state) + + # generate dictionary + D = generator.standard_normal(size=(n_features, n_components)) + D /= np.sqrt(np.sum((D**2), axis=0)) + + # generate code + X = np.zeros((n_components, n_samples)) + for i in range(n_samples): + idx = np.arange(n_components) + generator.shuffle(idx) + idx = idx[:n_nonzero_coefs] + X[idx, i] = generator.standard_normal(size=n_nonzero_coefs) + + # encode signal + Y = np.dot(D, X) + + # Transpose to have shapes consistent with the rest of the API + Y, D, X = Y.T, D.T, X.T + + return map(np.squeeze, (Y, D, X)) + + +@validate_params( + { + "n_samples": [Interval(Integral, 1, None, closed="left")], + "n_features": [Interval(Integral, 1, None, closed="left")], + "random_state": ["random_state"], + }, + prefer_skip_nested_validation=True, +) +def make_sparse_uncorrelated(n_samples=100, n_features=10, *, random_state=None): + """Generate a random regression problem with sparse uncorrelated design. + + This dataset is described in Celeux et al [1]. as:: + + X ~ N(0, 1) + y(X) = X[:, 0] + 2 * X[:, 1] - 2 * X[:, 2] - 1.5 * X[:, 3] + + Only the first 4 features are informative. The remaining features are + useless. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_samples : int, default=100 + The number of samples. + + n_features : int, default=10 + The number of features. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset creation. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + X : ndarray of shape (n_samples, n_features) + The input samples. + + y : ndarray of shape (n_samples,) + The output values. + + References + ---------- + .. [1] G. Celeux, M. El Anbari, J.-M. Marin, C. P. Robert, + "Regularization in regression: comparing Bayesian and frequentist + methods in a poorly informative situation", 2009. 
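A usage sketch of `make_sparse_coded_signal` as returned above (`Y = XD` after the final transpose); the dimensions and sparsity level are arbitrary::

    import numpy as np
    from sklearn.datasets import make_sparse_coded_signal

    data, dictionary, code = make_sparse_coded_signal(
        n_samples=20, n_components=50, n_features=8,
        n_nonzero_coefs=3, random_state=0,
    )
    print(data.shape, dictionary.shape, code.shape)  # (20, 8) (50, 8) (20, 50)
    # Each sample is an exact combination of dictionary atoms,
    # with n_nonzero_coefs active atoms per row of `code`.
    print(np.allclose(data, code @ dictionary))      # True
    print(np.count_nonzero(code, axis=1))            # 3 for every row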
+ + Examples + -------- + >>> from sklearn.datasets import make_sparse_uncorrelated + >>> X, y = make_sparse_uncorrelated(random_state=0) + >>> X.shape + (100, 10) + >>> y.shape + (100,) + """ + generator = check_random_state(random_state) + + X = generator.normal(loc=0, scale=1, size=(n_samples, n_features)) + y = generator.normal( + loc=(X[:, 0] + 2 * X[:, 1] - 2 * X[:, 2] - 1.5 * X[:, 3]), + scale=np.ones(n_samples), + ) + + return X, y + + +@validate_params( + { + "n_dim": [Interval(Integral, 1, None, closed="left")], + "random_state": ["random_state"], + }, + prefer_skip_nested_validation=True, +) +def make_spd_matrix(n_dim, *, random_state=None): + """Generate a random symmetric, positive-definite matrix. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_dim : int + The matrix dimension. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset creation. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + X : ndarray of shape (n_dim, n_dim) + The random symmetric, positive-definite matrix. + + See Also + -------- + make_sparse_spd_matrix: Generate a sparse symmetric definite positive matrix. + + Examples + -------- + >>> from sklearn.datasets import make_spd_matrix + >>> make_spd_matrix(n_dim=2, random_state=42) + array([[2.093, 0.346], + [0.346, 0.218]]) + """ + generator = check_random_state(random_state) + + A = generator.uniform(size=(n_dim, n_dim)) + U, _, Vt = linalg.svd(np.dot(A.T, A), check_finite=False) + X = np.dot(np.dot(U, 1.0 + np.diag(generator.uniform(size=n_dim))), Vt) + + return X + + +@validate_params( + { + "n_dim": [Interval(Integral, 1, None, closed="left")], + "alpha": [Interval(Real, 0, 1, closed="both")], + "norm_diag": ["boolean"], + "smallest_coef": [Interval(Real, 0, 1, closed="both")], + "largest_coef": [Interval(Real, 0, 1, closed="both")], + "sparse_format": [ + StrOptions({"bsr", "coo", "csc", "csr", "dia", "dok", "lil"}), + None, + ], + "random_state": ["random_state"], + }, + prefer_skip_nested_validation=True, +) +def make_sparse_spd_matrix( + n_dim=1, + *, + alpha=0.95, + norm_diag=False, + smallest_coef=0.1, + largest_coef=0.9, + sparse_format=None, + random_state=None, +): + """Generate a sparse symmetric definite positive matrix. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_dim : int, default=1 + The size of the random matrix to generate. + + .. versionchanged:: 1.4 + Renamed from ``dim`` to ``n_dim``. + + alpha : float, default=0.95 + The probability that a coefficient is zero (see notes). Larger values + enforce more sparsity. The value should be in the range 0 and 1. + + norm_diag : bool, default=False + Whether to normalize the output matrix to make the leading diagonal + elements all 1. + + smallest_coef : float, default=0.1 + The value of the smallest coefficient between 0 and 1. + + largest_coef : float, default=0.9 + The value of the largest coefficient between 0 and 1. + + sparse_format : str, default=None + String representing the output sparse format, such as 'csc', 'csr', etc. + If ``None``, return a dense numpy ndarray. + + .. versionadded:: 1.4 + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset creation. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. 
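The symmetry and positive-definiteness promised by `make_spd_matrix` can be checked numerically; `n_dim=5` is an arbitrary choice::

    import numpy as np
    from sklearn.datasets import make_spd_matrix

    A = make_spd_matrix(n_dim=5, random_state=0)
    print(np.allclose(A, A.T))                 # True: symmetric
    print(np.all(np.linalg.eigvalsh(A) > 0))   # True: all eigenvalues positive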
+ + Returns + ------- + prec : ndarray or sparse matrix of shape (dim, dim) + The generated matrix. If ``sparse_format=None``, this would be an ndarray. + Otherwise, this will be a sparse matrix of the specified format. + + See Also + -------- + make_spd_matrix : Generate a random symmetric, positive-definite matrix. + + Notes + ----- + The sparsity is actually imposed on the cholesky factor of the matrix. + Thus alpha does not translate directly into the filling fraction of + the matrix itself. + + Examples + -------- + >>> from sklearn.datasets import make_sparse_spd_matrix + >>> make_sparse_spd_matrix(n_dim=4, norm_diag=False, random_state=42) + array([[1., 0., 0., 0.], + [0., 1., 0., 0.], + [0., 0., 1., 0.], + [0., 0., 0., 1.]]) + """ + random_state = check_random_state(random_state) + + chol = -sp.eye(n_dim) + aux = sp.random( + m=n_dim, + n=n_dim, + density=1 - alpha, + data_rvs=lambda x: random_state.uniform( + low=smallest_coef, high=largest_coef, size=x + ), + random_state=random_state, + ) + # We need to avoid "coo" format because it does not support slicing + aux = sp.tril(aux, k=-1, format="csc") + + # Permute the lines: we don't want to have asymmetries in the final + # SPD matrix + permutation = random_state.permutation(n_dim) + aux = aux[permutation].T[permutation] + chol += aux + prec = chol.T @ chol + + if norm_diag: + # Form the diagonal vector into a row matrix + d = sp.diags(1.0 / np.sqrt(prec.diagonal())) + prec = d @ prec @ d + + if sparse_format is None: + return prec.toarray() + else: + return prec.asformat(sparse_format) + + +@validate_params( + { + "n_samples": [Interval(Integral, 1, None, closed="left")], + "noise": [Interval(Real, 0, None, closed="left")], + "random_state": ["random_state"], + "hole": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def make_swiss_roll(n_samples=100, *, noise=0.0, random_state=None, hole=False): + """Generate a swiss roll dataset. + + Read more in the :ref:`User Guide `. + + Adapted with permission from Stephen Marsland's code [1]. + + Parameters + ---------- + n_samples : int, default=100 + The number of sample points on the Swiss Roll. + + noise : float, default=0.0 + The standard deviation of the gaussian noise. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset creation. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + hole : bool, default=False + If True generates the swiss roll with hole dataset. + + Returns + ------- + X : ndarray of shape (n_samples, 3) + The points. + + t : ndarray of shape (n_samples,) + The univariate position of the sample according to the main dimension + of the points in the manifold. + + Notes + ----- + The algorithm is from Marsland [1]. + + References + ---------- + .. [1] S. Marsland, "Machine Learning: An Algorithmic Perspective", 2nd edition, + Chapter 6, 2014. 
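A sketch of `make_sparse_spd_matrix` using the `sparse_format` output path added in 1.4; the dimension, `alpha` and chosen format are arbitrary::

    import numpy as np
    from sklearn.datasets import make_sparse_spd_matrix

    prec = make_sparse_spd_matrix(
        n_dim=10, alpha=0.9, norm_diag=True,
        sparse_format="csr", random_state=0,
    )
    dense = prec.toarray()
    print(prec.shape)                              # (10, 10), returned as a CSR matrix
    print(np.allclose(dense, dense.T))             # True: symmetric
    print(np.all(np.linalg.eigvalsh(dense) > 0))   # True: positive definite
    print(np.allclose(np.diag(dense), 1.0))        # True: norm_diag rescales the diagonal to 1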
+ https://homepages.ecs.vuw.ac.nz/~marslast/Code/Ch6/lle.py + + Examples + -------- + >>> from sklearn.datasets import make_swiss_roll + >>> X, t = make_swiss_roll(noise=0.05, random_state=0) + >>> X.shape + (100, 3) + >>> t.shape + (100,) + """ + generator = check_random_state(random_state) + + if not hole: + t = 1.5 * np.pi * (1 + 2 * generator.uniform(size=n_samples)) + y = 21 * generator.uniform(size=n_samples) + else: + corners = np.array( + [[np.pi * (1.5 + i), j * 7] for i in range(3) for j in range(3)] + ) + corners = np.delete(corners, 4, axis=0) + corner_index = generator.choice(8, n_samples) + parameters = generator.uniform(size=(2, n_samples)) * np.array([[np.pi], [7]]) + t, y = corners[corner_index].T + parameters + + x = t * np.cos(t) + z = t * np.sin(t) + + X = np.vstack((x, y, z)) + X += noise * generator.standard_normal(size=(3, n_samples)) + X = X.T + t = np.squeeze(t) + + return X, t + + +@validate_params( + { + "n_samples": [Interval(Integral, 1, None, closed="left")], + "noise": [Interval(Real, 0, None, closed="left")], + "random_state": ["random_state"], + }, + prefer_skip_nested_validation=True, +) +def make_s_curve(n_samples=100, *, noise=0.0, random_state=None): + """Generate an S curve dataset. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_samples : int, default=100 + The number of sample points on the S curve. + + noise : float, default=0.0 + The standard deviation of the gaussian noise. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset creation. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + X : ndarray of shape (n_samples, 3) + The points. + + t : ndarray of shape (n_samples,) + The univariate position of the sample according + to the main dimension of the points in the manifold. + + Examples + -------- + >>> from sklearn.datasets import make_s_curve + >>> X, t = make_s_curve(noise=0.05, random_state=0) + >>> X.shape + (100, 3) + >>> t.shape + (100,) + """ + generator = check_random_state(random_state) + + t = 3 * np.pi * (generator.uniform(size=(1, n_samples)) - 0.5) + X = np.empty(shape=(n_samples, 3), dtype=np.float64) + X[:, 0] = np.sin(t) + X[:, 1] = 2.0 * generator.uniform(size=n_samples) + X[:, 2] = np.sign(t) * (np.cos(t) - 1) + X += noise * generator.standard_normal(size=(3, n_samples)).T + t = np.squeeze(t) + + return X, t + + +@validate_params( + { + "mean": ["array-like", None], + "cov": [Interval(Real, 0, None, closed="left")], + "n_samples": [Interval(Integral, 1, None, closed="left")], + "n_features": [Interval(Integral, 1, None, closed="left")], + "n_classes": [Interval(Integral, 1, None, closed="left")], + "shuffle": ["boolean"], + "random_state": ["random_state"], + }, + prefer_skip_nested_validation=True, +) +def make_gaussian_quantiles( + *, + mean=None, + cov=1.0, + n_samples=100, + n_features=2, + n_classes=3, + shuffle=True, + random_state=None, +): + r"""Generate isotropic Gaussian and label samples by quantile. + + This classification dataset is constructed by taking a multi-dimensional + standard normal distribution and defining classes separated by nested + concentric multi-dimensional spheres such that roughly equal numbers of + samples are in each class (quantiles of the :math:`\chi^2` distribution). + + Read more in the :ref:`User Guide `. 
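Both manifold generators defined above return a 3-D point cloud plus the 1-D manifold coordinate; a short sketch with arbitrary sizes, including the `hole=True` variant of the swiss roll::

    from sklearn.datasets import make_s_curve, make_swiss_roll

    X_roll, t_roll = make_swiss_roll(n_samples=500, noise=0.05, hole=True, random_state=0)
    X_s, t_s = make_s_curve(n_samples=500, noise=0.05, random_state=0)
    print(X_roll.shape, t_roll.shape)  # (500, 3) (500,)
    print(X_s.shape, t_s.shape)        # (500, 3) (500,)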
+ + Parameters + ---------- + mean : array-like of shape (n_features,), default=None + The mean of the multi-dimensional normal distribution. + If None then use the origin (0, 0, ...). + + cov : float, default=1.0 + The covariance matrix will be this value times the unit matrix. This + dataset only produces symmetric normal distributions. + + n_samples : int, default=100 + The total number of points equally divided among classes. + + n_features : int, default=2 + The number of features for each sample. + + n_classes : int, default=3 + The number of classes. + + shuffle : bool, default=True + Shuffle the samples. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset creation. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + X : ndarray of shape (n_samples, n_features) + The generated samples. + + y : ndarray of shape (n_samples,) + The integer labels for quantile membership of each sample. + + Notes + ----- + The dataset is from Zhu et al [1]. + + References + ---------- + .. [1] J. Zhu, H. Zou, S. Rosset, T. Hastie, "Multi-class AdaBoost", 2009. + + Examples + -------- + >>> from sklearn.datasets import make_gaussian_quantiles + >>> X, y = make_gaussian_quantiles(random_state=42) + >>> X.shape + (100, 2) + >>> y.shape + (100,) + >>> list(y[:5]) + [np.int64(2), np.int64(0), np.int64(1), np.int64(0), np.int64(2)] + """ + if n_samples < n_classes: + raise ValueError("n_samples must be at least n_classes") + + generator = check_random_state(random_state) + + if mean is None: + mean = np.zeros(n_features) + else: + mean = np.array(mean) + + # Build multivariate normal distribution + X = generator.multivariate_normal(mean, cov * np.identity(n_features), (n_samples,)) + + # Sort by distance from origin + idx = np.argsort(np.sum((X - mean[np.newaxis, :]) ** 2, axis=1)) + X = X[idx, :] + + # Label by quantile + step = n_samples // n_classes + + y = np.hstack( + [ + np.repeat(np.arange(n_classes), step), + np.repeat(n_classes - 1, n_samples - step * n_classes), + ] + ) + + if shuffle: + X, y = util_shuffle(X, y, random_state=generator) + + return X, y + + +def _shuffle(data, random_state=None): + generator = check_random_state(random_state) + n_rows, n_cols = data.shape + row_idx = generator.permutation(n_rows) + col_idx = generator.permutation(n_cols) + result = data[row_idx][:, col_idx] + return result, row_idx, col_idx + + +@validate_params( + { + "shape": [tuple], + "n_clusters": [Interval(Integral, 1, None, closed="left")], + "noise": [Interval(Real, 0, None, closed="left")], + "minval": [Interval(Real, None, None, closed="neither")], + "maxval": [Interval(Real, None, None, closed="neither")], + "shuffle": ["boolean"], + "random_state": ["random_state"], + }, + prefer_skip_nested_validation=True, +) +def make_biclusters( + shape, + n_clusters, + *, + noise=0.0, + minval=10, + maxval=100, + shuffle=True, + random_state=None, +): + """Generate a constant block diagonal structure array for biclustering. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + shape : tuple of shape (n_rows, n_cols) + The shape of the result. + + n_clusters : int + The number of biclusters. + + noise : float, default=0.0 + The standard deviation of the gaussian noise. + + minval : float, default=10 + Minimum value of a bicluster. + + maxval : float, default=100 + Maximum value of a bicluster. + + shuffle : bool, default=True + Shuffle the samples. 
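A usage sketch of `make_gaussian_quantiles`: with `n_samples` divisible by `n_classes`, the quantile labelling above yields exactly equal class sizes (mean, covariance scale and sizes are arbitrary)::

    import numpy as np
    from sklearn.datasets import make_gaussian_quantiles

    X, y = make_gaussian_quantiles(
        mean=[1.0, -1.0], cov=2.0, n_samples=300,
        n_features=2, n_classes=3, random_state=0,
    )
    print(X.shape, np.bincount(y))  # (300, 2) [100 100 100]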
+ + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset creation. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + X : ndarray of shape `shape` + The generated array. + + rows : ndarray of shape (n_clusters, X.shape[0]) + The indicators for cluster membership of each row. + + cols : ndarray of shape (n_clusters, X.shape[1]) + The indicators for cluster membership of each column. + + See Also + -------- + make_checkerboard: Generate an array with block checkerboard structure for + biclustering. + + References + ---------- + + .. [1] Dhillon, I. S. (2001, August). Co-clustering documents and + words using bipartite spectral graph partitioning. In Proceedings + of the seventh ACM SIGKDD international conference on Knowledge + discovery and data mining (pp. 269-274). ACM. + + Examples + -------- + >>> from sklearn.datasets import make_biclusters + >>> data, rows, cols = make_biclusters( + ... shape=(10, 20), n_clusters=2, random_state=42 + ... ) + >>> data.shape + (10, 20) + >>> rows.shape + (2, 10) + >>> cols.shape + (2, 20) + """ + generator = check_random_state(random_state) + n_rows, n_cols = shape + consts = generator.uniform(minval, maxval, n_clusters) + + # row and column clusters of approximately equal sizes + row_sizes = generator.multinomial(n_rows, np.repeat(1.0 / n_clusters, n_clusters)) + col_sizes = generator.multinomial(n_cols, np.repeat(1.0 / n_clusters, n_clusters)) + + row_labels = np.hstack( + [np.repeat(val, rep) for val, rep in zip(range(n_clusters), row_sizes)] + ) + col_labels = np.hstack( + [np.repeat(val, rep) for val, rep in zip(range(n_clusters), col_sizes)] + ) + + result = np.zeros(shape, dtype=np.float64) + for i in range(n_clusters): + selector = np.outer(row_labels == i, col_labels == i) + result[selector] += consts[i] + + if noise > 0: + result += generator.normal(scale=noise, size=result.shape) + + if shuffle: + result, row_idx, col_idx = _shuffle(result, random_state) + row_labels = row_labels[row_idx] + col_labels = col_labels[col_idx] + + rows = np.vstack([row_labels == c for c in range(n_clusters)]) + cols = np.vstack([col_labels == c for c in range(n_clusters)]) + + return result, rows, cols + + +@validate_params( + { + "shape": [tuple], + "n_clusters": [Interval(Integral, 1, None, closed="left"), "array-like"], + "noise": [Interval(Real, 0, None, closed="left")], + "minval": [Interval(Real, None, None, closed="neither")], + "maxval": [Interval(Real, None, None, closed="neither")], + "shuffle": ["boolean"], + "random_state": ["random_state"], + }, + prefer_skip_nested_validation=True, +) +def make_checkerboard( + shape, + n_clusters, + *, + noise=0.0, + minval=10, + maxval=100, + shuffle=True, + random_state=None, +): + """Generate an array with block checkerboard structure for biclustering. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + shape : tuple of shape (n_rows, n_cols) + The shape of the result. + + n_clusters : int or array-like or shape (n_row_clusters, n_column_clusters) + The number of row and column clusters. + + noise : float, default=0.0 + The standard deviation of the gaussian noise. + + minval : float, default=10 + Minimum value of a bicluster. + + maxval : float, default=100 + Maximum value of a bicluster. + + shuffle : bool, default=True + Shuffle the samples. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset creation. 
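A sketch of `make_biclusters` showing the shapes of the indicator arrays produced above; all numeric values are arbitrary::

    import numpy as np
    from sklearn.datasets import make_biclusters

    data, rows, cols = make_biclusters(
        shape=(30, 40), n_clusters=3, noise=1.0,
        minval=5, maxval=50, random_state=0,
    )
    print(data.shape, rows.shape, cols.shape)  # (30, 40) (3, 30) (3, 40)
    # Every row and every column belongs to exactly one bicluster.
    print(np.all(rows.sum(axis=0) == 1), np.all(cols.sum(axis=0) == 1))  # True True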
Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + X : ndarray of shape `shape` + The generated array. + + rows : ndarray of shape (n_clusters, X.shape[0]) + The indicators for cluster membership of each row. + + cols : ndarray of shape (n_clusters, X.shape[1]) + The indicators for cluster membership of each column. + + See Also + -------- + make_biclusters : Generate an array with constant block diagonal structure + for biclustering. + + References + ---------- + .. [1] Kluger, Y., Basri, R., Chang, J. T., & Gerstein, M. (2003). + Spectral biclustering of microarray data: coclustering genes + and conditions. Genome research, 13(4), 703-716. + + Examples + -------- + >>> from sklearn.datasets import make_checkerboard + >>> data, rows, columns = make_checkerboard(shape=(300, 300), n_clusters=10, + ... random_state=42) + >>> data.shape + (300, 300) + >>> rows.shape + (100, 300) + >>> columns.shape + (100, 300) + >>> print(rows[0][:5], columns[0][:5]) + [False False False True False] [False False False False False] + """ + generator = check_random_state(random_state) + + if hasattr(n_clusters, "__len__"): + n_row_clusters, n_col_clusters = n_clusters + else: + n_row_clusters = n_col_clusters = n_clusters + + # row and column clusters of approximately equal sizes + n_rows, n_cols = shape + row_sizes = generator.multinomial( + n_rows, np.repeat(1.0 / n_row_clusters, n_row_clusters) + ) + col_sizes = generator.multinomial( + n_cols, np.repeat(1.0 / n_col_clusters, n_col_clusters) + ) + + row_labels = np.hstack( + [np.repeat(val, rep) for val, rep in zip(range(n_row_clusters), row_sizes)] + ) + col_labels = np.hstack( + [np.repeat(val, rep) for val, rep in zip(range(n_col_clusters), col_sizes)] + ) + + result = np.zeros(shape, dtype=np.float64) + for i in range(n_row_clusters): + for j in range(n_col_clusters): + selector = np.outer(row_labels == i, col_labels == j) + result[selector] += generator.uniform(minval, maxval) + + if noise > 0: + result += generator.normal(scale=noise, size=result.shape) + + if shuffle: + result, row_idx, col_idx = _shuffle(result, random_state) + row_labels = row_labels[row_idx] + col_labels = col_labels[col_idx] + + rows = np.vstack( + [ + row_labels == label + for label in range(n_row_clusters) + for _ in range(n_col_clusters) + ] + ) + cols = np.vstack( + [ + col_labels == label + for _ in range(n_row_clusters) + for label in range(n_col_clusters) + ] + ) + + return result, rows, cols diff --git a/.venv/lib/python3.12/site-packages/sklearn/datasets/_species_distributions.py b/.venv/lib/python3.12/site-packages/sklearn/datasets/_species_distributions.py new file mode 100644 index 0000000000000000000000000000000000000000..e871949e41312b2600512551f0c3d2593ad8cf64 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/datasets/_species_distributions.py @@ -0,0 +1,289 @@ +""" +============================= +Species distribution dataset +============================= + +This dataset represents the geographic distribution of species. +The dataset is provided by Phillips et. al. (2006). + +The two species are: + + - `"Bradypus variegatus" + `_ , + the Brown-throated Sloth. + + - `"Microryzomys minutus" + `_ , + also known as the Forest Small Rice Rat, a rodent that lives in Peru, + Colombia, Ecuador, Peru, and Venezuela. + +References +---------- + +`"Maximum entropy modeling of species geographic distributions" +`_ S. J. Phillips, +R. P. Anderson, R. E. 
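To round off the generators above, a sketch of `make_checkerboard` with different row and column cluster counts (all values arbitrary); note that one indicator row is emitted per (row-cluster, column-cluster) pair::

    from sklearn.datasets import make_checkerboard

    data, rows, cols = make_checkerboard(
        shape=(30, 30), n_clusters=(4, 3), noise=2.0, random_state=0
    )
    print(data.shape, rows.shape, cols.shape)  # (30, 30) (12, 30) (12, 30)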
Schapire - Ecological Modelling, 190:231-259, 2006. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import logging +from io import BytesIO +from numbers import Integral, Real +from os import PathLike, makedirs, remove +from os.path import exists + +import joblib +import numpy as np + +from ..utils import Bunch +from ..utils._param_validation import Interval, validate_params +from . import get_data_home +from ._base import RemoteFileMetadata, _fetch_remote, _pkl_filepath + +# The original data can be found at: +# https://biodiversityinformatics.amnh.org/open_source/maxent/samples.zip +SAMPLES = RemoteFileMetadata( + filename="samples.zip", + url="https://ndownloader.figshare.com/files/5976075", + checksum="abb07ad284ac50d9e6d20f1c4211e0fd3c098f7f85955e89d321ee8efe37ac28", +) + +# The original data can be found at: +# https://biodiversityinformatics.amnh.org/open_source/maxent/coverages.zip +COVERAGES = RemoteFileMetadata( + filename="coverages.zip", + url="https://ndownloader.figshare.com/files/5976078", + checksum="4d862674d72e79d6cee77e63b98651ec7926043ba7d39dcb31329cf3f6073807", +) + +DATA_ARCHIVE_NAME = "species_coverage.pkz" + + +logger = logging.getLogger(__name__) + + +def _load_coverage(F, header_length=6, dtype=np.int16): + """Load a coverage file from an open file object. + + This will return a numpy array of the given dtype + """ + header = [F.readline() for _ in range(header_length)] + make_tuple = lambda t: (t.split()[0], float(t.split()[1])) + header = dict([make_tuple(line) for line in header]) + + M = np.loadtxt(F, dtype=dtype) + nodata = int(header[b"NODATA_value"]) + if nodata != -9999: + M[nodata] = -9999 + return M + + +def _load_csv(F): + """Load csv file. + + Parameters + ---------- + F : file object + CSV file open in byte mode. + + Returns + ------- + rec : np.ndarray + record array representing the data + """ + names = F.readline().decode("ascii").strip().split(",") + + rec = np.loadtxt(F, skiprows=0, delimiter=",", dtype="S22,f4,f4") + rec.dtype.names = names + return rec + + +def construct_grids(batch): + """Construct the map grid from the batch object + + Parameters + ---------- + batch : Batch object + The object returned by :func:`fetch_species_distributions` + + Returns + ------- + (xgrid, ygrid) : 1-D arrays + The grid corresponding to the values in batch.coverages + """ + # x,y coordinates for corner cells + xmin = batch.x_left_lower_corner + batch.grid_size + xmax = xmin + (batch.Nx * batch.grid_size) + ymin = batch.y_left_lower_corner + batch.grid_size + ymax = ymin + (batch.Ny * batch.grid_size) + + # x coordinates of the grid cells + xgrid = np.arange(xmin, xmax, batch.grid_size) + # y coordinates of the grid cells + ygrid = np.arange(ymin, ymax, batch.grid_size) + + return (xgrid, ygrid) + + +@validate_params( + { + "data_home": [str, PathLike, None], + "download_if_missing": ["boolean"], + "n_retries": [Interval(Integral, 1, None, closed="left")], + "delay": [Interval(Real, 0.0, None, closed="neither")], + }, + prefer_skip_nested_validation=True, +) +def fetch_species_distributions( + *, + data_home=None, + download_if_missing=True, + n_retries=3, + delay=1.0, +): + """Loader for species distribution dataset from Phillips et. al. (2006). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + data_home : str or path-like, default=None + Specify another download and cache folder for the datasets. By default + all scikit-learn data is stored in '~/scikit_learn_data' subfolders. 
+ + download_if_missing : bool, default=True + If False, raise an OSError if the data is not locally available + instead of trying to download the data from the source site. + + n_retries : int, default=3 + Number of retries when HTTP errors are encountered. + + .. versionadded:: 1.5 + + delay : float, default=1.0 + Number of seconds between retries. + + .. versionadded:: 1.5 + + Returns + ------- + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + coverages : array, shape = [14, 1592, 1212] + These represent the 14 features measured + at each point of the map grid. + The latitude/longitude values for the grid are discussed below. + Missing data is represented by the value -9999. + train : record array, shape = (1624,) + The training points for the data. Each point has three fields: + + - train['species'] is the species name + - train['dd long'] is the longitude, in degrees + - train['dd lat'] is the latitude, in degrees + test : record array, shape = (620,) + The test points for the data. Same format as the training data. + Nx, Ny : integers + The number of longitudes (x) and latitudes (y) in the grid + x_left_lower_corner, y_left_lower_corner : floats + The (x,y) position of the lower-left corner, in degrees + grid_size : float + The spacing between points of the grid, in degrees + + Notes + ----- + + This dataset represents the geographic distribution of species. + The dataset is provided by Phillips et. al. (2006). + + The two species are: + + - `"Bradypus variegatus" + `_ , + the Brown-throated Sloth. + + - `"Microryzomys minutus" + `_ , + also known as the Forest Small Rice Rat, a rodent that lives in Peru, + Colombia, Ecuador, Peru, and Venezuela. + + References + ---------- + + * `"Maximum entropy modeling of species geographic distributions" + `_ + S. J. Phillips, R. P. Anderson, R. E. Schapire - Ecological Modelling, + 190:231-259, 2006. + + Examples + -------- + >>> from sklearn.datasets import fetch_species_distributions + >>> species = fetch_species_distributions() + >>> species.train[:5] + array([(b'microryzomys_minutus', -64.7 , -17.85 ), + (b'microryzomys_minutus', -67.8333, -16.3333), + (b'microryzomys_minutus', -67.8833, -16.3 ), + (b'microryzomys_minutus', -67.8 , -16.2667), + (b'microryzomys_minutus', -67.9833, -15.9 )], + dtype=[('species', 'S22'), ('dd long', ' 0 else -1 + + # Special-case float32 but use float64 for everything else; + # the Python code will do further conversions. 
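A usage sketch for the species loader and the `construct_grids` helper defined in the `_species_distributions` module above. Note that the first call downloads the samples and coverages archives into the scikit-learn data home; the shapes shown come from the docstring::

    from sklearn.datasets import fetch_species_distributions
    from sklearn.datasets._species_distributions import construct_grids

    species = fetch_species_distributions()
    print(species.coverages.shape)   # (14, 1592, 1212): 14 features on the map grid
    print(species.train.shape)       # (1624,) training occurrence records

    xgrid, ygrid = construct_grids(species)
    print(xgrid.shape, ygrid.shape)  # 1-D longitude / latitude grids (Nx and Ny points)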
+ if dtype == np.float32: + data = array.array("f") + else: + dtype = np.float64 + data = array.array("d") + + indices = array.array("q") + indptr = array.array("q", [0]) + query = np.arange(0, dtype=np.int64) + + if multilabel: + labels = [] + else: + labels = array.array("d") + + if offset > 0: + f.seek(offset) + # drop the current line that might be truncated and is to be + # fetched by another call + f.readline() + + for line in f: + # skip comments + line_cstr = line + hash_ptr = strchr(line_cstr, 35) # ASCII value of '#' is 35 + if hash_ptr != NULL: + line = line[:hash_ptr - line_cstr] + + line_parts = line.split() + if len(line_parts) == 0: + continue + + target, features = line_parts[0], line_parts[1:] + if multilabel: + if COLON in target: + target, features = [], line_parts[0:] + else: + target = [float(y) for y in target.split(COMMA)] + target.sort() + labels.append(tuple(target)) + else: + array.resize_smart(labels, len(labels) + 1) + labels[len(labels) - 1] = float(target) + + prev_idx = -1 + n_features = len(features) + if n_features and features[0].startswith(qid_prefix): + _, value = features[0].split(COLON, 1) + if query_id: + query.resize(len(query) + 1) + query[len(query) - 1] = np.int64(value) + features.pop(0) + n_features -= 1 + + for i in range(0, n_features): + idx_s, value = features[i].split(COLON, 1) + idx = int(idx_s) + if idx < 0 or not zero_based and idx == 0: + raise ValueError( + "Invalid index %d in SVMlight/LibSVM data file." % idx) + if idx <= prev_idx: + raise ValueError("Feature indices in SVMlight/LibSVM data " + "file should be sorted and unique.") + + array.resize_smart(indices, len(indices) + 1) + indices[len(indices) - 1] = idx + + array.resize_smart(data, len(data) + 1) + data[len(data) - 1] = float(value) + + prev_idx = idx + + # increment index pointer array size + array.resize_smart(indptr, len(indptr) + 1) + indptr[len(indptr) - 1] = len(data) + + if offset_max != -1 and f.tell() > offset_max: + # Stop here and let another call deal with the following. + break + + return (dtype, data, indices, indptr, labels, query) + + +# Two fused types are defined to be able to +# use all possible combinations of parameters. 
+ctypedef fused int_or_float: + cython.integral + cython.floating + signed long long + +ctypedef fused double_or_longlong: + double + signed long long + +ctypedef fused int_or_longlong: + cython.integral + signed long long + + +def get_dense_row_string( + const int_or_float[:, :] X, + Py_ssize_t[:] x_inds, + double_or_longlong[:] x_vals, + Py_ssize_t row, + str value_pattern, + bint one_based, +): + cdef: + Py_ssize_t row_length = X.shape[1] + Py_ssize_t x_nz_used = 0 + Py_ssize_t k + int_or_float val + + for k in range(row_length): + val = X[row, k] + if val == 0: + continue + x_inds[x_nz_used] = k + x_vals[x_nz_used] = val + x_nz_used += 1 + + reprs = [ + value_pattern % (x_inds[i] + one_based, x_vals[i]) + for i in range(x_nz_used) + ] + + return " ".join(reprs) + + +def get_sparse_row_string( + int_or_float[:] X_data, + int[:] X_indptr, + int[:] X_indices, + Py_ssize_t row, + str value_pattern, + bint one_based, +): + cdef: + Py_ssize_t row_start = X_indptr[row] + Py_ssize_t row_end = X_indptr[row+1] + + reprs = [ + value_pattern % (X_indices[i] + one_based, X_data[i]) + for i in range(row_start, row_end) + ] + + return " ".join(reprs) + + +def _dump_svmlight_file( + X, + y, + f, + bint multilabel, + bint one_based, + int_or_longlong[:] query_id, + bint X_is_sp, + bint y_is_sp, +): + cdef bint X_is_integral + cdef bint query_id_is_not_empty = query_id.size > 0 + X_is_integral = X.dtype.kind == "i" + if X_is_integral: + value_pattern = "%d:%d" + else: + value_pattern = "%d:%.16g" + if y.dtype.kind == "i": + label_pattern = "%d" + else: + label_pattern = "%.16g" + + line_pattern = "%s" + if query_id_is_not_empty: + line_pattern += " qid:%d" + line_pattern += " %s\n" + + cdef: + Py_ssize_t num_labels = y.shape[1] + Py_ssize_t x_len = X.shape[0] + Py_ssize_t row_length = X.shape[1] + Py_ssize_t i + Py_ssize_t j + Py_ssize_t col_start + Py_ssize_t col_end + Py_ssize_t[:] x_inds = np.empty(row_length, dtype=np.intp) + signed long long[:] x_vals_int + double[:] x_vals_float + + if not X_is_sp: + if X_is_integral: + x_vals_int = np.zeros(row_length, dtype=np.longlong) + else: + x_vals_float = np.zeros(row_length, dtype=np.float64) + + for i in range(x_len): + if not X_is_sp: + if X_is_integral: + s = get_dense_row_string(X, x_inds, x_vals_int, i, value_pattern, one_based) + else: + s = get_dense_row_string(X, x_inds, x_vals_float, i, value_pattern, one_based) + else: + s = get_sparse_row_string(X.data, X.indptr, X.indices, i, value_pattern, one_based) + if multilabel: + if y_is_sp: + col_start = y.indptr[i] + col_end = y.indptr[i+1] + labels_str = ','.join(tuple(label_pattern % y.indices[j] for j in range(col_start, col_end) if y.data[j] != 0)) + else: + labels_str = ','.join(label_pattern % j for j in range(num_labels) if y[i, j] != 0) + else: + if y_is_sp: + labels_str = label_pattern % y.data[i] + else: + labels_str = label_pattern % y[i, 0] + + if query_id_is_not_empty: + feat = (labels_str, query_id[i], s) + else: + feat = (labels_str, s) + + f.write((line_pattern % feat).encode("utf-8")) diff --git a/.venv/lib/python3.12/site-packages/sklearn/datasets/_svmlight_format_io.py b/.venv/lib/python3.12/site-packages/sklearn/datasets/_svmlight_format_io.py new file mode 100644 index 0000000000000000000000000000000000000000..e3a833efb86c02675a318fd09674e33ad5dfb526 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/datasets/_svmlight_format_io.py @@ -0,0 +1,585 @@ +"""This module implements a loader and dumper for the svmlight format + +This format is a text-based format, with one 
sample per line. It does +not store zero valued features hence is suitable for sparse dataset. + +The first element of each line can be used to store a target variable to +predict. + +This format is used as the default format for both svmlight and the +libsvm command line programs. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import os.path +from contextlib import closing +from numbers import Integral + +import numpy as np +import scipy.sparse as sp + +from .. import __version__ +from ..utils import check_array +from ..utils._param_validation import HasMethods, Interval, StrOptions, validate_params +from ._svmlight_format_fast import ( + _dump_svmlight_file, + _load_svmlight_file, +) + + +@validate_params( + { + "f": [ + str, + Interval(Integral, 0, None, closed="left"), + os.PathLike, + HasMethods("read"), + ], + "n_features": [Interval(Integral, 1, None, closed="left"), None], + "dtype": "no_validation", # delegate validation to numpy + "multilabel": ["boolean"], + "zero_based": ["boolean", StrOptions({"auto"})], + "query_id": ["boolean"], + "offset": [Interval(Integral, 0, None, closed="left")], + "length": [Integral], + }, + prefer_skip_nested_validation=True, +) +def load_svmlight_file( + f, + *, + n_features=None, + dtype=np.float64, + multilabel=False, + zero_based="auto", + query_id=False, + offset=0, + length=-1, +): + """Load datasets in the svmlight / libsvm format into sparse CSR matrix. + + This format is a text-based format, with one sample per line. It does + not store zero valued features hence is suitable for sparse dataset. + + The first element of each line can be used to store a target variable + to predict. + + This format is used as the default format for both svmlight and the + libsvm command line programs. + + Parsing a text based source can be expensive. When repeatedly + working on the same dataset, it is recommended to wrap this + loader with joblib.Memory.cache to store a memmapped backup of the + CSR results of the first call and benefit from the near instantaneous + loading of memmapped structures for the subsequent calls. + + In case the file contains a pairwise preference constraint (known + as "qid" in the svmlight format) these are ignored unless the + query_id parameter is set to True. These pairwise preference + constraints can be used to constraint the combination of samples + when using pairwise loss functions (as is the case in some + learning to rank problems) so that only pairs with the same + query_id value are considered. + + This implementation is written in Cython and is reasonably fast. + However, a faster API-compatible loader is also available at: + https://github.com/mblondel/svmlight-loader + + Parameters + ---------- + f : str, path-like, file-like or int + (Path to) a file to load. If a path ends in ".gz" or ".bz2", it will + be uncompressed on the fly. If an integer is passed, it is assumed to + be a file descriptor. A file-like or file descriptor will not be closed + by this function. A file-like object must be opened in binary mode. + + .. versionchanged:: 1.2 + Path-like objects are now accepted. + + n_features : int, default=None + The number of features to use. If None, it will be inferred. This + argument is useful to load several files that are subsets of a + bigger sliced dataset: each subset might not have examples of + every feature, hence the inferred shape might vary from one + slice to another. 
+ n_features is only required if ``offset`` or ``length`` are passed a + non-default value. + + dtype : numpy data type, default=np.float64 + Data type of dataset to be loaded. This will be the data type of the + output numpy arrays ``X`` and ``y``. + + multilabel : bool, default=False + Samples may have several labels each (see + https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html). + + zero_based : bool or "auto", default="auto" + Whether column indices in f are zero-based (True) or one-based + (False). If column indices are one-based, they are transformed to + zero-based to match Python/NumPy conventions. + If set to "auto", a heuristic check is applied to determine this from + the file contents. Both kinds of files occur "in the wild", but they + are unfortunately not self-identifying. Using "auto" or True should + always be safe when no ``offset`` or ``length`` is passed. + If ``offset`` or ``length`` are passed, the "auto" mode falls back + to ``zero_based=True`` to avoid having the heuristic check yield + inconsistent results on different segments of the file. + + query_id : bool, default=False + If True, will return the query_id array for each file. + + offset : int, default=0 + Ignore the offset first bytes by seeking forward, then + discarding the following bytes up until the next new line + character. + + length : int, default=-1 + If strictly positive, stop reading any new line of data once the + position in the file has reached the (offset + length) bytes threshold. + + Returns + ------- + X : scipy.sparse matrix of shape (n_samples, n_features) + The data matrix. + + y : ndarray of shape (n_samples,), or a list of tuples of length n_samples + The target. It is a list of tuples when ``multilabel=True``, else a + ndarray. + + query_id : array of shape (n_samples,) + The query_id for each sample. Only returned when query_id is set to + True. + + See Also + -------- + load_svmlight_files : Similar function for loading multiple files in this + format, enforcing the same number of features/columns on all of them. 
+ + Examples + -------- + To use joblib.Memory to cache the svmlight file:: + + from joblib import Memory + from sklearn.datasets import load_svmlight_file + mem = Memory("./mycache") + + @mem.cache + def get_data(): + data = load_svmlight_file("mysvmlightfile") + return data[0], data[1] + + X, y = get_data() + """ + return tuple( + load_svmlight_files( + [f], + n_features=n_features, + dtype=dtype, + multilabel=multilabel, + zero_based=zero_based, + query_id=query_id, + offset=offset, + length=length, + ) + ) + + +def _gen_open(f): + if isinstance(f, int): # file descriptor + return open(f, "rb", closefd=False) + elif isinstance(f, os.PathLike): + f = os.fspath(f) + elif not isinstance(f, str): + raise TypeError("expected {str, int, path-like, file-like}, got %s" % type(f)) + + _, ext = os.path.splitext(f) + if ext == ".gz": + import gzip + + return gzip.open(f, "rb") + elif ext == ".bz2": + from bz2 import BZ2File + + return BZ2File(f, "rb") + else: + return open(f, "rb") + + +def _open_and_load(f, dtype, multilabel, zero_based, query_id, offset=0, length=-1): + if hasattr(f, "read"): + actual_dtype, data, ind, indptr, labels, query = _load_svmlight_file( + f, dtype, multilabel, zero_based, query_id, offset, length + ) + else: + with closing(_gen_open(f)) as f: + actual_dtype, data, ind, indptr, labels, query = _load_svmlight_file( + f, dtype, multilabel, zero_based, query_id, offset, length + ) + + # convert from array.array, give data the right dtype + if not multilabel: + labels = np.frombuffer(labels, np.float64) + data = np.frombuffer(data, actual_dtype) + indices = np.frombuffer(ind, np.longlong) + indptr = np.frombuffer(indptr, dtype=np.longlong) # never empty + query = np.frombuffer(query, np.int64) + + data = np.asarray(data, dtype=dtype) # no-op for float{32,64} + return data, indices, indptr, labels, query + + +@validate_params( + { + "files": [ + "array-like", + str, + os.PathLike, + HasMethods("read"), + Interval(Integral, 0, None, closed="left"), + ], + "n_features": [Interval(Integral, 1, None, closed="left"), None], + "dtype": "no_validation", # delegate validation to numpy + "multilabel": ["boolean"], + "zero_based": ["boolean", StrOptions({"auto"})], + "query_id": ["boolean"], + "offset": [Interval(Integral, 0, None, closed="left")], + "length": [Integral], + }, + prefer_skip_nested_validation=True, +) +def load_svmlight_files( + files, + *, + n_features=None, + dtype=np.float64, + multilabel=False, + zero_based="auto", + query_id=False, + offset=0, + length=-1, +): + """Load dataset from multiple files in SVMlight format. + + This function is equivalent to mapping load_svmlight_file over a list of + files, except that the results are concatenated into a single, flat list + and the samples vectors are constrained to all have the same number of + features. + + In case the file contains a pairwise preference constraint (known + as "qid" in the svmlight format) these are ignored unless the + query_id parameter is set to True. These pairwise preference + constraints can be used to constraint the combination of samples + when using pairwise loss functions (as is the case in some + learning to rank problems) so that only pairs with the same + query_id value are considered. + + Parameters + ---------- + files : array-like, dtype=str, path-like, file-like or int + (Paths of) files to load. If a path ends in ".gz" or ".bz2", it will + be uncompressed on the fly. If an integer is passed, it is assumed to + be a file descriptor. 
File-likes and file descriptors will not be + closed by this function. File-like objects must be opened in binary + mode. + + .. versionchanged:: 1.2 + Path-like objects are now accepted. + + n_features : int, default=None + The number of features to use. If None, it will be inferred from the + maximum column index occurring in any of the files. + + This can be set to a higher value than the actual number of features + in any of the input files, but setting it to a lower value will cause + an exception to be raised. + + dtype : numpy data type, default=np.float64 + Data type of dataset to be loaded. This will be the data type of the + output numpy arrays ``X`` and ``y``. + + multilabel : bool, default=False + Samples may have several labels each (see + https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html). + + zero_based : bool or "auto", default="auto" + Whether column indices in f are zero-based (True) or one-based + (False). If column indices are one-based, they are transformed to + zero-based to match Python/NumPy conventions. + If set to "auto", a heuristic check is applied to determine this from + the file contents. Both kinds of files occur "in the wild", but they + are unfortunately not self-identifying. Using "auto" or True should + always be safe when no offset or length is passed. + If offset or length are passed, the "auto" mode falls back + to zero_based=True to avoid having the heuristic check yield + inconsistent results on different segments of the file. + + query_id : bool, default=False + If True, will return the query_id array for each file. + + offset : int, default=0 + Ignore the offset first bytes by seeking forward, then + discarding the following bytes up until the next new line + character. + + length : int, default=-1 + If strictly positive, stop reading any new line of data once the + position in the file has reached the (offset + length) bytes threshold. + + Returns + ------- + [X1, y1, ..., Xn, yn] or [X1, y1, q1, ..., Xn, yn, qn]: list of arrays + Each (Xi, yi) pair is the result from load_svmlight_file(files[i]). + If query_id is set to True, this will return instead (Xi, yi, qi) + triplets. + + See Also + -------- + load_svmlight_file: Similar function for loading a single file in this + format. + + Notes + ----- + When fitting a model to a matrix X_train and evaluating it against a + matrix X_test, it is essential that X_train and X_test have the same + number of features (X_train.shape[1] == X_test.shape[1]). This may not + be the case if you load the files individually with load_svmlight_file. 
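+
+    A minimal sketch of loading a train / test pair together so that both
+    matrices share the same column space, assuming hypothetical files
+    ``train.svm`` and ``test.svm``::
+
+        from sklearn.datasets import load_svmlight_files
+
+        X_train, y_train, X_test, y_test = load_svmlight_files(
+            ["train.svm", "test.svm"]
+        )
+        assert X_train.shape[1] == X_test.shape[1]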
+ + Examples + -------- + To use joblib.Memory to cache the svmlight file:: + + from joblib import Memory + from sklearn.datasets import load_svmlight_file + mem = Memory("./mycache") + + @mem.cache + def get_data(): + data_train, target_train, data_test, target_test = load_svmlight_files( + ["svmlight_file_train", "svmlight_file_test"] + ) + return data_train, target_train, data_test, target_test + + X_train, y_train, X_test, y_test = get_data() + """ + if (offset != 0 or length > 0) and zero_based == "auto": + # disable heuristic search to avoid getting inconsistent results on + # different segments of the file + zero_based = True + + if (offset != 0 or length > 0) and n_features is None: + raise ValueError("n_features is required when offset or length is specified.") + + r = [ + _open_and_load( + f, + dtype, + multilabel, + bool(zero_based), + bool(query_id), + offset=offset, + length=length, + ) + for f in files + ] + + if zero_based is False or ( + zero_based == "auto" and all(len(tmp[1]) and np.min(tmp[1]) > 0 for tmp in r) + ): + for _, indices, _, _, _ in r: + indices -= 1 + + n_f = max(ind[1].max() if len(ind[1]) else 0 for ind in r) + 1 + + if n_features is None: + n_features = n_f + elif n_features < n_f: + raise ValueError( + "n_features was set to {}, but input file contains {} features".format( + n_features, n_f + ) + ) + + result = [] + for data, indices, indptr, y, query_values in r: + shape = (indptr.shape[0] - 1, n_features) + X = sp.csr_matrix((data, indices, indptr), shape) + X.sort_indices() + result += X, y + if query_id: + result.append(query_values) + + return result + + +def _dump_svmlight(X, y, f, multilabel, one_based, comment, query_id): + if comment: + f.write( + ( + "# Generated by dump_svmlight_file from scikit-learn %s\n" % __version__ + ).encode() + ) + f.write( + ("# Column indices are %s-based\n" % ["zero", "one"][one_based]).encode() + ) + + f.write(b"#\n") + f.writelines(b"# %s\n" % line for line in comment.splitlines()) + X_is_sp = sp.issparse(X) + y_is_sp = sp.issparse(y) + if not multilabel and not y_is_sp: + y = y[:, np.newaxis] + _dump_svmlight_file( + X, + y, + f, + multilabel, + one_based, + query_id, + X_is_sp, + y_is_sp, + ) + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "y": ["array-like", "sparse matrix"], + "f": [str, HasMethods(["write"])], + "zero_based": ["boolean"], + "comment": [str, bytes, None], + "query_id": ["array-like", None], + "multilabel": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def dump_svmlight_file( + X, + y, + f, + *, + zero_based=True, + comment=None, + query_id=None, + multilabel=False, +): + """Dump the dataset in svmlight / libsvm file format. + + This format is a text-based format, with one sample per line. It does + not store zero valued features hence is suitable for sparse dataset. + + The first element of each line can be used to store a target variable + to predict. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : {array-like, sparse matrix}, shape = (n_samples,) or (n_samples, n_labels) + Target values. Class labels must be an + integer or float, or array-like objects of integer or float for + multilabel classifications. + + f : str or file-like in binary mode + If string, specifies the path that will contain the data. + If file-like, data will be written to f. f should be opened in binary + mode. 
+ + zero_based : bool, default=True + Whether column indices should be written zero-based (True) or one-based + (False). + + comment : str or bytes, default=None + Comment to insert at the top of the file. This should be either a + Unicode string, which will be encoded as UTF-8, or an ASCII byte + string. + If a comment is given, then it will be preceded by one that identifies + the file as having been dumped by scikit-learn. Note that not all + tools grok comments in SVMlight files. + + query_id : array-like of shape (n_samples,), default=None + Array containing pairwise preference constraints (qid in svmlight + format). + + multilabel : bool, default=False + Samples may have several labels each (see + https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html). + + .. versionadded:: 0.17 + parameter `multilabel` to support multilabel datasets. + + Examples + -------- + >>> from sklearn.datasets import dump_svmlight_file, make_classification + >>> X, y = make_classification(random_state=0) + >>> output_file = "my_dataset.svmlight" + >>> dump_svmlight_file(X, y, output_file) # doctest: +SKIP + """ + if comment is not None: + # Convert comment string to list of lines in UTF-8. + # If a byte string is passed, then check whether it's ASCII; + # if a user wants to get fancy, they'll have to decode themselves. + if isinstance(comment, bytes): + comment.decode("ascii") # just for the exception + else: + comment = comment.encode("utf-8") + if b"\0" in comment: + raise ValueError("comment string contains NUL byte") + + yval = check_array(y, accept_sparse="csr", ensure_2d=False) + if sp.issparse(yval): + if yval.shape[1] != 1 and not multilabel: + raise ValueError( + "expected y of shape (n_samples, 1), got %r" % (yval.shape,) + ) + else: + if yval.ndim != 1 and not multilabel: + raise ValueError("expected y of shape (n_samples,), got %r" % (yval.shape,)) + + Xval = check_array(X, accept_sparse="csr") + if Xval.shape[0] != yval.shape[0]: + raise ValueError( + "X.shape[0] and y.shape[0] should be the same, got %r and %r instead." + % (Xval.shape[0], yval.shape[0]) + ) + + # We had some issues with CSR matrices with unsorted indices (e.g. #1501), + # so sort them here, but first make sure we don't modify the user's X. + # TODO We can do this cheaper; sorted_indices copies the whole matrix. + if yval is y and hasattr(yval, "sorted_indices"): + y = yval.sorted_indices() + else: + y = yval + if hasattr(y, "sort_indices"): + y.sort_indices() + + if Xval is X and hasattr(Xval, "sorted_indices"): + X = Xval.sorted_indices() + else: + X = Xval + if hasattr(X, "sort_indices"): + X.sort_indices() + + if query_id is None: + # NOTE: query_id is passed to Cython functions using a fused type on query_id. + # Yet as of Cython>=3.0, memory views can't be None otherwise the runtime + # would not known which concrete implementation to dispatch the Python call to. + # TODO: simplify interfaces and implementations in _svmlight_format_fast.pyx. 
+ query_id = np.array([], dtype=np.int32) + else: + query_id = np.asarray(query_id) + if query_id.shape[0] != y.shape[0]: + raise ValueError( + "expected query_id of shape (n_samples,), got %r" % (query_id.shape,) + ) + + one_based = not zero_based + + if hasattr(f, "write"): + _dump_svmlight(X, y, f, multilabel, one_based, comment, query_id) + else: + with open(f, "wb") as f: + _dump_svmlight(X, y, f, multilabel, one_based, comment, query_id) diff --git a/.venv/lib/python3.12/site-packages/sklearn/datasets/_twenty_newsgroups.py b/.venv/lib/python3.12/site-packages/sklearn/datasets/_twenty_newsgroups.py new file mode 100644 index 0000000000000000000000000000000000000000..1dc5fb6244f1b9411d9fa3147b4402bf2a68e559 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/datasets/_twenty_newsgroups.py @@ -0,0 +1,625 @@ +"""Caching loader for the 20 newsgroups text classification dataset. + + +The description of the dataset is available on the official website at: + + http://people.csail.mit.edu/jrennie/20Newsgroups/ + +Quoting the introduction: + + The 20 Newsgroups data set is a collection of approximately 20,000 + newsgroup documents, partitioned (nearly) evenly across 20 different + newsgroups. To the best of my knowledge, it was originally collected + by Ken Lang, probably for his Newsweeder: Learning to filter netnews + paper, though he does not explicitly mention this collection. The 20 + newsgroups collection has become a popular data set for experiments + in text applications of machine learning techniques, such as text + classification and text clustering. + +This dataset loader will download the recommended "by date" variant of the +dataset and which features a point in time split between the train and +test sets. The compressed dataset size is around 14 Mb compressed. Once +uncompressed the train set is 52 MB and the test set is 34 MB. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import codecs +import logging +import os +import pickle +import re +import shutil +import tarfile +from contextlib import suppress +from numbers import Integral, Real + +import joblib +import numpy as np +import scipy.sparse as sp + +from .. import preprocessing +from ..feature_extraction.text import CountVectorizer +from ..utils import Bunch, check_random_state +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.fixes import tarfile_extractall +from . 
import get_data_home, load_files +from ._base import ( + RemoteFileMetadata, + _convert_data_dataframe, + _fetch_remote, + _pkl_filepath, + load_descr, +) + +logger = logging.getLogger(__name__) + +# The original data can be found at: +# https://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz +ARCHIVE = RemoteFileMetadata( + filename="20news-bydate.tar.gz", + url="https://ndownloader.figshare.com/files/5975967", + checksum="8f1b2514ca22a5ade8fbb9cfa5727df95fa587f4c87b786e15c759fa66d95610", +) + +CACHE_NAME = "20news-bydate.pkz" +TRAIN_FOLDER = "20news-bydate-train" +TEST_FOLDER = "20news-bydate-test" + + +def _download_20newsgroups(target_dir, cache_path, n_retries, delay): + """Download the 20 newsgroups data and stored it as a zipped pickle.""" + train_path = os.path.join(target_dir, TRAIN_FOLDER) + test_path = os.path.join(target_dir, TEST_FOLDER) + + os.makedirs(target_dir, exist_ok=True) + + logger.info("Downloading dataset from %s (14 MB)", ARCHIVE.url) + archive_path = _fetch_remote( + ARCHIVE, dirname=target_dir, n_retries=n_retries, delay=delay + ) + + logger.debug("Decompressing %s", archive_path) + with tarfile.open(archive_path, "r:gz") as fp: + tarfile_extractall(fp, path=target_dir) + + with suppress(FileNotFoundError): + os.remove(archive_path) + + # Store a zipped pickle + cache = dict( + train=load_files(train_path, encoding="latin1"), + test=load_files(test_path, encoding="latin1"), + ) + compressed_content = codecs.encode(pickle.dumps(cache), "zlib_codec") + with open(cache_path, "wb") as f: + f.write(compressed_content) + + shutil.rmtree(target_dir) + return cache + + +def strip_newsgroup_header(text): + """ + Given text in "news" format, strip the headers, by removing everything + before the first blank line. + + Parameters + ---------- + text : str + The text from which to remove the signature block. + """ + _before, _blankline, after = text.partition("\n\n") + return after + + +_QUOTE_RE = re.compile( + r"(writes in|writes:|wrote:|says:|said:|^In article|^Quoted from|^\||^>)" +) + + +def strip_newsgroup_quoting(text): + """ + Given text in "news" format, strip lines beginning with the quote + characters > or |, plus lines that often introduce a quoted section + (for example, because they contain the string 'writes:'.) + + Parameters + ---------- + text : str + The text from which to remove the signature block. + """ + good_lines = [line for line in text.split("\n") if not _QUOTE_RE.search(line)] + return "\n".join(good_lines) + + +def strip_newsgroup_footer(text): + """ + Given text in "news" format, attempt to remove a signature block. + + As a rough heuristic, we assume that signatures are set apart by either + a blank line or a line made of hyphens, and that it is the last such line + in the file (disregarding blank lines at the end). + + Parameters + ---------- + text : str + The text from which to remove the signature block. 
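+
+    Notes
+    -----
+    A minimal sketch of the heuristic, assuming a post that ends with a
+    ``--`` signature separator::
+
+        post = "\\n".join(["thanks for the help!", "--", "Jane Doe"])
+        strip_newsgroup_footer(post)  # returns "thanks for the help!"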
+ """ + lines = text.strip().split("\n") + for line_num in range(len(lines) - 1, -1, -1): + line = lines[line_num] + if line.strip().strip("-") == "": + break + + if line_num > 0: + return "\n".join(lines[:line_num]) + else: + return text + + +@validate_params( + { + "data_home": [str, os.PathLike, None], + "subset": [StrOptions({"train", "test", "all"})], + "categories": ["array-like", None], + "shuffle": ["boolean"], + "random_state": ["random_state"], + "remove": [tuple], + "download_if_missing": ["boolean"], + "return_X_y": ["boolean"], + "n_retries": [Interval(Integral, 1, None, closed="left")], + "delay": [Interval(Real, 0.0, None, closed="neither")], + }, + prefer_skip_nested_validation=True, +) +def fetch_20newsgroups( + *, + data_home=None, + subset="train", + categories=None, + shuffle=True, + random_state=42, + remove=(), + download_if_missing=True, + return_X_y=False, + n_retries=3, + delay=1.0, +): + """Load the filenames and data from the 20 newsgroups dataset \ +(classification). + + Download it if necessary. + + ================= ========== + Classes 20 + Samples total 18846 + Dimensionality 1 + Features text + ================= ========== + + Read more in the :ref:`User Guide <20newsgroups_dataset>`. + + Parameters + ---------- + data_home : str or path-like, default=None + Specify a download and cache folder for the datasets. If None, + all scikit-learn data is stored in '~/scikit_learn_data' subfolders. + + subset : {'train', 'test', 'all'}, default='train' + Select the dataset to load: 'train' for the training set, 'test' + for the test set, 'all' for both, with shuffled ordering. + + categories : array-like, dtype=str, default=None + If None (default), load all the categories. + If not None, list of category names to load (other categories + ignored). + + shuffle : bool, default=True + Whether or not to shuffle the data: might be important for models that + make the assumption that the samples are independent and identically + distributed (i.i.d.), such as stochastic gradient descent. + + random_state : int, RandomState instance or None, default=42 + Determines random number generation for dataset shuffling. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + remove : tuple, default=() + May contain any subset of ('headers', 'footers', 'quotes'). Each of + these are kinds of text that will be detected and removed from the + newsgroup posts, preventing classifiers from overfitting on + metadata. + + 'headers' removes newsgroup headers, 'footers' removes blocks at the + ends of posts that look like signatures, and 'quotes' removes lines + that appear to be quoting another post. + + 'headers' follows an exact standard; the other filters are not always + correct. + + download_if_missing : bool, default=True + If False, raise an OSError if the data is not locally available + instead of trying to download the data from the source site. + + return_X_y : bool, default=False + If True, returns `(data.data, data.target)` instead of a Bunch + object. + + .. versionadded:: 0.22 + + n_retries : int, default=3 + Number of retries when HTTP errors are encountered. + + .. versionadded:: 1.5 + + delay : float, default=1.0 + Number of seconds between retries. + + .. versionadded:: 1.5 + + Returns + ------- + bunch : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data : list of shape (n_samples,) + The data list to learn. + target: ndarray of shape (n_samples,) + The target labels. 
+ filenames: list of shape (n_samples,) + The path to the location of the data. + DESCR: str + The full description of the dataset. + target_names: list of shape (n_classes,) + The names of target classes. + + (data, target) : tuple if `return_X_y=True` + A tuple of two ndarrays. The first contains a 2D array of shape + (n_samples, n_classes) with each row representing one sample and each + column representing the features. The second array of shape + (n_samples,) contains the target samples. + + .. versionadded:: 0.22 + + Examples + -------- + >>> from sklearn.datasets import fetch_20newsgroups + >>> cats = ['alt.atheism', 'sci.space'] + >>> newsgroups_train = fetch_20newsgroups(subset='train', categories=cats) + >>> list(newsgroups_train.target_names) + ['alt.atheism', 'sci.space'] + >>> newsgroups_train.filenames.shape + (1073,) + >>> newsgroups_train.target.shape + (1073,) + >>> newsgroups_train.target[:10] + array([0, 1, 1, 1, 0, 1, 1, 0, 0, 0]) + """ + + data_home = get_data_home(data_home=data_home) + cache_path = _pkl_filepath(data_home, CACHE_NAME) + twenty_home = os.path.join(data_home, "20news_home") + cache = None + if os.path.exists(cache_path): + try: + with open(cache_path, "rb") as f: + compressed_content = f.read() + uncompressed_content = codecs.decode(compressed_content, "zlib_codec") + cache = pickle.loads(uncompressed_content) + except Exception as e: + print(80 * "_") + print("Cache loading failed") + print(80 * "_") + print(e) + + if cache is None: + if download_if_missing: + logger.info("Downloading 20news dataset. This may take a few minutes.") + cache = _download_20newsgroups( + target_dir=twenty_home, + cache_path=cache_path, + n_retries=n_retries, + delay=delay, + ) + else: + raise OSError("20Newsgroups dataset not found") + + if subset in ("train", "test"): + data = cache[subset] + elif subset == "all": + data_lst = list() + target = list() + filenames = list() + for subset in ("train", "test"): + data = cache[subset] + data_lst.extend(data.data) + target.extend(data.target) + filenames.extend(data.filenames) + + data.data = data_lst + data.target = np.array(target) + data.filenames = np.array(filenames) + + fdescr = load_descr("twenty_newsgroups.rst") + + data.DESCR = fdescr + + if "headers" in remove: + data.data = [strip_newsgroup_header(text) for text in data.data] + if "footers" in remove: + data.data = [strip_newsgroup_footer(text) for text in data.data] + if "quotes" in remove: + data.data = [strip_newsgroup_quoting(text) for text in data.data] + + if categories is not None: + labels = [(data.target_names.index(cat), cat) for cat in categories] + # Sort the categories to have the ordering of the labels + labels.sort() + labels, categories = zip(*labels) + mask = np.isin(data.target, labels) + data.filenames = data.filenames[mask] + data.target = data.target[mask] + # searchsorted to have continuous labels + data.target = np.searchsorted(labels, data.target) + data.target_names = list(categories) + # Use an object array to shuffle: avoids memory copy + data_lst = np.array(data.data, dtype=object) + data_lst = data_lst[mask] + data.data = data_lst.tolist() + + if shuffle: + random_state = check_random_state(random_state) + indices = np.arange(data.target.shape[0]) + random_state.shuffle(indices) + data.filenames = data.filenames[indices] + data.target = data.target[indices] + # Use an object array to shuffle: avoids memory copy + data_lst = np.array(data.data, dtype=object) + data_lst = data_lst[indices] + data.data = data_lst.tolist() + + if return_X_y: + 
return data.data, data.target + + return data + + +@validate_params( + { + "subset": [StrOptions({"train", "test", "all"})], + "remove": [tuple], + "data_home": [str, os.PathLike, None], + "download_if_missing": ["boolean"], + "return_X_y": ["boolean"], + "normalize": ["boolean"], + "as_frame": ["boolean"], + "n_retries": [Interval(Integral, 1, None, closed="left")], + "delay": [Interval(Real, 0.0, None, closed="neither")], + }, + prefer_skip_nested_validation=True, +) +def fetch_20newsgroups_vectorized( + *, + subset="train", + remove=(), + data_home=None, + download_if_missing=True, + return_X_y=False, + normalize=True, + as_frame=False, + n_retries=3, + delay=1.0, +): + """Load and vectorize the 20 newsgroups dataset (classification). + + Download it if necessary. + + This is a convenience function; the transformation is done using the + default settings for + :class:`~sklearn.feature_extraction.text.CountVectorizer`. For more + advanced usage (stopword filtering, n-gram extraction, etc.), combine + fetch_20newsgroups with a custom + :class:`~sklearn.feature_extraction.text.CountVectorizer`, + :class:`~sklearn.feature_extraction.text.HashingVectorizer`, + :class:`~sklearn.feature_extraction.text.TfidfTransformer` or + :class:`~sklearn.feature_extraction.text.TfidfVectorizer`. + + The resulting counts are normalized using + :func:`sklearn.preprocessing.normalize` unless normalize is set to False. + + ================= ========== + Classes 20 + Samples total 18846 + Dimensionality 130107 + Features real + ================= ========== + + Read more in the :ref:`User Guide <20newsgroups_dataset>`. + + Parameters + ---------- + subset : {'train', 'test', 'all'}, default='train' + Select the dataset to load: 'train' for the training set, 'test' + for the test set, 'all' for both, with shuffled ordering. + + remove : tuple, default=() + May contain any subset of ('headers', 'footers', 'quotes'). Each of + these are kinds of text that will be detected and removed from the + newsgroup posts, preventing classifiers from overfitting on + metadata. + + 'headers' removes newsgroup headers, 'footers' removes blocks at the + ends of posts that look like signatures, and 'quotes' removes lines + that appear to be quoting another post. + + data_home : str or path-like, default=None + Specify an download and cache folder for the datasets. If None, + all scikit-learn data is stored in '~/scikit_learn_data' subfolders. + + download_if_missing : bool, default=True + If False, raise an OSError if the data is not locally available + instead of trying to download the data from the source site. + + return_X_y : bool, default=False + If True, returns ``(data.data, data.target)`` instead of a Bunch + object. + + .. versionadded:: 0.20 + + normalize : bool, default=True + If True, normalizes each document's feature vector to unit norm using + :func:`sklearn.preprocessing.normalize`. + + .. versionadded:: 0.22 + + as_frame : bool, default=False + If True, the data is a pandas DataFrame including columns with + appropriate dtypes (numeric, string, or categorical). The target is + a pandas DataFrame or Series depending on the number of + `target_columns`. + + .. versionadded:: 0.24 + + n_retries : int, default=3 + Number of retries when HTTP errors are encountered. + + .. versionadded:: 1.5 + + delay : float, default=1.0 + Number of seconds between retries. + + .. versionadded:: 1.5 + + Returns + ------- + bunch : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. 
+ + data: {sparse matrix, dataframe} of shape (n_samples, n_features) + The input data matrix. If ``as_frame`` is `True`, ``data`` is + a pandas DataFrame with sparse columns. + target: {ndarray, series} of shape (n_samples,) + The target labels. If ``as_frame`` is `True`, ``target`` is a + pandas Series. + target_names: list of shape (n_classes,) + The names of target classes. + DESCR: str + The full description of the dataset. + frame: dataframe of shape (n_samples, n_features + 1) + Only present when `as_frame=True`. Pandas DataFrame with ``data`` + and ``target``. + + .. versionadded:: 0.24 + + (data, target) : tuple if ``return_X_y`` is True + `data` and `target` would be of the format defined in the `Bunch` + description above. + + .. versionadded:: 0.20 + + Examples + -------- + >>> from sklearn.datasets import fetch_20newsgroups_vectorized + >>> newsgroups_vectorized = fetch_20newsgroups_vectorized(subset='test') + >>> newsgroups_vectorized.data.shape + (7532, 130107) + >>> newsgroups_vectorized.target.shape + (7532,) + """ + data_home = get_data_home(data_home=data_home) + filebase = "20newsgroup_vectorized" + if remove: + filebase += "remove-" + "-".join(remove) + target_file = _pkl_filepath(data_home, filebase + ".pkl") + + # we shuffle but use a fixed seed for the memoization + data_train = fetch_20newsgroups( + data_home=data_home, + subset="train", + categories=None, + shuffle=True, + random_state=12, + remove=remove, + download_if_missing=download_if_missing, + n_retries=n_retries, + delay=delay, + ) + + data_test = fetch_20newsgroups( + data_home=data_home, + subset="test", + categories=None, + shuffle=True, + random_state=12, + remove=remove, + download_if_missing=download_if_missing, + n_retries=n_retries, + delay=delay, + ) + + if os.path.exists(target_file): + try: + X_train, X_test, feature_names = joblib.load(target_file) + except ValueError as e: + raise ValueError( + f"The cached dataset located in {target_file} was fetched " + "with an older scikit-learn version and it is not compatible " + "with the scikit-learn version imported. You need to " + f"manually delete the file: {target_file}." 
+ ) from e + else: + vectorizer = CountVectorizer(dtype=np.int16) + X_train = vectorizer.fit_transform(data_train.data).tocsr() + X_test = vectorizer.transform(data_test.data).tocsr() + feature_names = vectorizer.get_feature_names_out() + + joblib.dump((X_train, X_test, feature_names), target_file, compress=9) + + # the data is stored as int16 for compactness + # but normalize needs floats + if normalize: + X_train = X_train.astype(np.float64) + X_test = X_test.astype(np.float64) + preprocessing.normalize(X_train, copy=False) + preprocessing.normalize(X_test, copy=False) + + target_names = data_train.target_names + + if subset == "train": + data = X_train + target = data_train.target + elif subset == "test": + data = X_test + target = data_test.target + elif subset == "all": + data = sp.vstack((X_train, X_test)).tocsr() + target = np.concatenate((data_train.target, data_test.target)) + + fdescr = load_descr("twenty_newsgroups.rst") + + frame = None + target_name = ["category_class"] + + if as_frame: + frame, data, target = _convert_data_dataframe( + "fetch_20newsgroups_vectorized", + data, + target, + feature_names, + target_names=target_name, + sparse_data=True, + ) + + if return_X_y: + return data, target + + return Bunch( + data=data, + target=target, + frame=frame, + target_names=target_names, + feature_names=feature_names, + DESCR=fdescr, + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/datasets/meson.build b/.venv/lib/python3.12/site-packages/sklearn/datasets/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..4efcd279315de3478eae4da682c9760c58a8f92b --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/datasets/meson.build @@ -0,0 +1,7 @@ +py.extension_module( + '_svmlight_format_fast', + cython_gen.process('_svmlight_format_fast.pyx'), + dependencies: [np_dep], + subdir: 'sklearn/datasets', + install: true +) diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6d3fa9b42895a624ac2f3b50a14155c2c5fffd82 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/__init__.py @@ -0,0 +1,54 @@ +"""Matrix decomposition algorithms. + +These include PCA, NMF, ICA, and more. Most of the algorithms of this module can be +regarded as dimensionality reduction techniques. 
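+
+A minimal usage sketch, assuming a 2-D data array ``X``::
+
+    from sklearn.decomposition import PCA
+
+    pca = PCA(n_components=2)
+    X_reduced = pca.fit_transform(X)  # project X onto its top two components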
+""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ..utils.extmath import randomized_svd +from ._dict_learning import ( + DictionaryLearning, + MiniBatchDictionaryLearning, + SparseCoder, + dict_learning, + dict_learning_online, + sparse_encode, +) +from ._factor_analysis import FactorAnalysis +from ._fastica import FastICA, fastica +from ._incremental_pca import IncrementalPCA +from ._kernel_pca import KernelPCA +from ._lda import LatentDirichletAllocation +from ._nmf import ( + NMF, + MiniBatchNMF, + non_negative_factorization, +) +from ._pca import PCA +from ._sparse_pca import MiniBatchSparsePCA, SparsePCA +from ._truncated_svd import TruncatedSVD + +__all__ = [ + "NMF", + "PCA", + "DictionaryLearning", + "FactorAnalysis", + "FastICA", + "IncrementalPCA", + "KernelPCA", + "LatentDirichletAllocation", + "MiniBatchDictionaryLearning", + "MiniBatchNMF", + "MiniBatchSparsePCA", + "SparseCoder", + "SparsePCA", + "TruncatedSVD", + "dict_learning", + "dict_learning_online", + "fastica", + "non_negative_factorization", + "randomized_svd", + "sparse_encode", +] diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/_base.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_base.py new file mode 100644 index 0000000000000000000000000000000000000000..783c316b50f27b784767b019be3605be9b832027 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_base.py @@ -0,0 +1,202 @@ +"""Principal Component Analysis Base Classes""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from abc import ABCMeta, abstractmethod + +import numpy as np +from scipy import linalg + +from ..base import BaseEstimator, ClassNamePrefixFeaturesOutMixin, TransformerMixin +from ..utils._array_api import _fill_or_add_to_diagonal, device, get_namespace +from ..utils.validation import check_is_fitted, validate_data + + +class _BasePCA( + ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator, metaclass=ABCMeta +): + """Base class for PCA methods. + + Warning: This class should not be used directly. + Use derived classes instead. + """ + + def get_covariance(self): + """Compute data covariance with the generative model. + + ``cov = components_.T * S**2 * components_ + sigma2 * eye(n_features)`` + where S**2 contains the explained variances, and sigma2 contains the + noise variances. + + Returns + ------- + cov : array of shape=(n_features, n_features) + Estimated covariance of data. + """ + xp, _ = get_namespace(self.components_) + + components_ = self.components_ + exp_var = self.explained_variance_ + if self.whiten: + components_ = components_ * xp.sqrt(exp_var[:, np.newaxis]) + exp_var_diff = exp_var - self.noise_variance_ + exp_var_diff = xp.where( + exp_var > self.noise_variance_, + exp_var_diff, + xp.asarray(0.0, device=device(exp_var), dtype=exp_var.dtype), + ) + cov = (components_.T * exp_var_diff) @ components_ + _fill_or_add_to_diagonal(cov, self.noise_variance_, xp) + return cov + + def get_precision(self): + """Compute data precision matrix with the generative model. + + Equals the inverse of the covariance but computed with + the matrix inversion lemma for efficiency. + + Returns + ------- + precision : array, shape=(n_features, n_features) + Estimated precision of data. 
+ """ + xp, is_array_api_compliant = get_namespace(self.components_) + + n_features = self.components_.shape[1] + + # handle corner cases first + if self.n_components_ == 0: + return xp.eye(n_features) / self.noise_variance_ + + if is_array_api_compliant: + linalg_inv = xp.linalg.inv + else: + linalg_inv = linalg.inv + + if self.noise_variance_ == 0.0: + return linalg_inv(self.get_covariance()) + + # Get precision using matrix inversion lemma + components_ = self.components_ + exp_var = self.explained_variance_ + if self.whiten: + components_ = components_ * xp.sqrt(exp_var[:, np.newaxis]) + exp_var_diff = exp_var - self.noise_variance_ + exp_var_diff = xp.where( + exp_var > self.noise_variance_, + exp_var_diff, + xp.asarray(0.0, device=device(exp_var)), + ) + precision = components_ @ components_.T / self.noise_variance_ + _fill_or_add_to_diagonal(precision, 1.0 / exp_var_diff, xp) + precision = components_.T @ linalg_inv(precision) @ components_ + precision /= -(self.noise_variance_**2) + _fill_or_add_to_diagonal(precision, 1.0 / self.noise_variance_, xp) + return precision + + @abstractmethod + def fit(self, X, y=None): + """Placeholder for fit. Subclasses should implement this method! + + Fit the model with X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + self : object + Returns the instance itself. + """ + + def transform(self, X): + """Apply dimensionality reduction to X. + + X is projected on the first principal components previously extracted + from a training set. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + New data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + Returns + ------- + X_new : array-like of shape (n_samples, n_components) + Projection of X in the first principal components, where `n_samples` + is the number of samples and `n_components` is the number of the components. + """ + xp, _ = get_namespace(X, self.components_, self.explained_variance_) + + check_is_fitted(self) + + X = validate_data( + self, + X, + dtype=[xp.float64, xp.float32], + accept_sparse=("csr", "csc"), + reset=False, + ) + return self._transform(X, xp=xp, x_is_centered=False) + + def _transform(self, X, xp, x_is_centered=False): + X_transformed = X @ self.components_.T + if not x_is_centered: + # Apply the centering after the projection. + # For dense X this avoids copying or mutating the data passed by + # the caller. + # For sparse X it keeps sparsity and avoids having to wrap X into + # a linear operator. + X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T + if self.whiten: + # For some solvers (such as "arpack" and "covariance_eigh"), on + # rank deficient data, some components can have a variance + # arbitrarily close to zero, leading to non-finite results when + # whitening. To avoid this problem we clip the variance below. + scale = xp.sqrt(self.explained_variance_) + min_scale = xp.finfo(scale.dtype).eps + scale[scale < min_scale] = min_scale + X_transformed /= scale + return X_transformed + + def inverse_transform(self, X): + """Transform data back to its original space. + + In other words, return an input `X_original` whose transform would be X. 
+ + Parameters + ---------- + X : array-like of shape (n_samples, n_components) + New data, where `n_samples` is the number of samples + and `n_components` is the number of components. + + Returns + ------- + X_original : array-like of shape (n_samples, n_features) + Original data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + Notes + ----- + If whitening is enabled, inverse_transform will compute the + exact inverse operation, which includes reversing whitening. + """ + xp, _ = get_namespace(X) + + if self.whiten: + scaled_components = ( + xp.sqrt(self.explained_variance_[:, np.newaxis]) * self.components_ + ) + return X @ scaled_components + self.mean_ + else: + return X @ self.components_ + self.mean_ + + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/_cdnmf_fast.pyx b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_cdnmf_fast.pyx new file mode 100644 index 0000000000000000000000000000000000000000..b2a07fb275bded974524b0c372a931de850d9142 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_cdnmf_fast.pyx @@ -0,0 +1,38 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from cython cimport floating +from libc.math cimport fabs + + +def _update_cdnmf_fast(floating[:, ::1] W, floating[:, :] HHt, + floating[:, :] XHt, Py_ssize_t[::1] permutation): + cdef: + floating violation = 0 + Py_ssize_t n_components = W.shape[1] + Py_ssize_t n_samples = W.shape[0] # n_features for H update + floating grad, pg, hess + Py_ssize_t i, r, s, t + + with nogil: + for s in range(n_components): + t = permutation[s] + + for i in range(n_samples): + # gradient = GW[t, i] where GW = np.dot(W, HHt) - XHt + grad = -XHt[i, t] + + for r in range(n_components): + grad += HHt[t, r] * W[i, r] + + # projected gradient + pg = min(0., grad) if W[i, t] == 0 else grad + violation += fabs(pg) + + # Hessian + hess = HHt[t, t] + + if hess != 0: + W[i, t] = max(W[i, t] - grad / hess, 0.) 
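+    # `violation` accumulates the absolute projected gradient over all updated
+    # entries; the caller can use it as a convergence measure across sweeps.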
+ + return violation diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/_dict_learning.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_dict_learning.py new file mode 100644 index 0000000000000000000000000000000000000000..ae40e28e9f013295dc5b2c4c8dd365fda7ac6bc6 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_dict_learning.py @@ -0,0 +1,2329 @@ +"""Dictionary learning.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import itertools +import sys +import time +from numbers import Integral, Real + +import numpy as np +from joblib import effective_n_jobs +from scipy import linalg + +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + TransformerMixin, + _fit_context, +) +from ..linear_model import Lars, Lasso, LassoLars, orthogonal_mp_gram +from ..utils import check_array, check_random_state, gen_batches, gen_even_slices +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.extmath import _randomized_svd, row_norms, svd_flip +from ..utils.parallel import Parallel, delayed +from ..utils.validation import check_is_fitted, validate_data + + +def _check_positive_coding(method, positive): + if positive and method in ["omp", "lars"]: + raise ValueError( + "Positive constraint not supported for '{}' coding method.".format(method) + ) + + +def _sparse_encode_precomputed( + X, + dictionary, + *, + gram=None, + cov=None, + algorithm="lasso_lars", + regularization=None, + copy_cov=True, + init=None, + max_iter=1000, + verbose=0, + positive=False, +): + """Generic sparse coding with precomputed Gram and/or covariance matrices. + + Each row of the result is the solution to a Lasso problem. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Data matrix. + + dictionary : ndarray of shape (n_components, n_features) + The dictionary matrix against which to solve the sparse coding of + the data. Some of the algorithms assume normalized rows. + + gram : ndarray of shape (n_components, n_components), default=None + Precomputed Gram matrix, `dictionary * dictionary'` + gram can be `None` if method is 'threshold'. + + cov : ndarray of shape (n_components, n_samples), default=None + Precomputed covariance, `dictionary * X'`. + + algorithm : {'lasso_lars', 'lasso_cd', 'lars', 'omp', 'threshold'}, \ + default='lasso_lars' + The algorithm used: + + * `'lars'`: uses the least angle regression method + (`linear_model.lars_path`); + * `'lasso_lars'`: uses Lars to compute the Lasso solution; + * `'lasso_cd'`: uses the coordinate descent method to compute the + Lasso solution (`linear_model.Lasso`). lasso_lars will be faster if + the estimated components are sparse; + * `'omp'`: uses orthogonal matching pursuit to estimate the sparse + solution; + * `'threshold'`: squashes to zero all coefficients less than + regularization from the projection `dictionary * data'`. + + regularization : int or float, default=None + The regularization parameter. It corresponds to alpha when + algorithm is `'lasso_lars'`, `'lasso_cd'` or `'threshold'`. + Otherwise it corresponds to `n_nonzero_coefs`. + + init : ndarray of shape (n_samples, n_components), default=None + Initialization value of the sparse code. Only used if + `algorithm='lasso_cd'`. + + max_iter : int, default=1000 + Maximum number of iterations to perform if `algorithm='lasso_cd'` or + `'lasso_lars'`. 
+ + copy_cov : bool, default=True + Whether to copy the precomputed covariance matrix; if `False`, it may + be overwritten. + + verbose : int, default=0 + Controls the verbosity; the higher, the more messages. + + positive: bool, default=False + Whether to enforce a positivity constraint on the sparse code. + + .. versionadded:: 0.20 + + Returns + ------- + code : ndarray of shape (n_components, n_features) + The sparse codes. + """ + n_samples, n_features = X.shape + n_components = dictionary.shape[0] + + if algorithm == "lasso_lars": + alpha = float(regularization) / n_features # account for scaling + try: + err_mgt = np.seterr(all="ignore") + + # Not passing in verbose=max(0, verbose-1) because Lars.fit already + # corrects the verbosity level. + lasso_lars = LassoLars( + alpha=alpha, + fit_intercept=False, + verbose=verbose, + precompute=gram, + fit_path=False, + positive=positive, + max_iter=max_iter, + ) + lasso_lars.fit(dictionary.T, X.T, Xy=cov) + new_code = lasso_lars.coef_ + finally: + np.seterr(**err_mgt) + + elif algorithm == "lasso_cd": + alpha = float(regularization) / n_features # account for scaling + + # TODO: Make verbosity argument for Lasso? + # sklearn.linear_model.coordinate_descent.enet_path has a verbosity + # argument that we could pass in from Lasso. + clf = Lasso( + alpha=alpha, + fit_intercept=False, + precompute=gram, + max_iter=max_iter, + warm_start=True, + positive=positive, + ) + + if init is not None: + # In some workflows using coordinate descent algorithms: + # - users might provide NumPy arrays with read-only buffers + # - `joblib` might memmap arrays making their buffer read-only + # TODO: move this handling (which is currently too broad) + # closer to the actual private function which need buffers to be writable. + if not init.flags["WRITEABLE"]: + init = np.array(init) + clf.coef_ = init + + clf.fit(dictionary.T, X.T, check_input=False) + new_code = clf.coef_ + + elif algorithm == "lars": + try: + err_mgt = np.seterr(all="ignore") + + # Not passing in verbose=max(0, verbose-1) because Lars.fit already + # corrects the verbosity level. 
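+            # unlike the lasso variants above, 'lars' targets an exact number
+            # of nonzero coefficients per sample (n_nonzero_coefs) rather than
+            # penalizing the L1 norm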
+ lars = Lars( + fit_intercept=False, + verbose=verbose, + precompute=gram, + n_nonzero_coefs=int(regularization), + fit_path=False, + ) + lars.fit(dictionary.T, X.T, Xy=cov) + new_code = lars.coef_ + finally: + np.seterr(**err_mgt) + + elif algorithm == "threshold": + new_code = (np.sign(cov) * np.maximum(np.abs(cov) - regularization, 0)).T + if positive: + np.clip(new_code, 0, None, out=new_code) + + elif algorithm == "omp": + new_code = orthogonal_mp_gram( + Gram=gram, + Xy=cov, + n_nonzero_coefs=int(regularization), + tol=None, + norms_squared=row_norms(X, squared=True), + copy_Xy=copy_cov, + ).T + + return new_code.reshape(n_samples, n_components) + + +@validate_params( + { + "X": ["array-like"], + "dictionary": ["array-like"], + "gram": ["array-like", None], + "cov": ["array-like", None], + "algorithm": [ + StrOptions({"lasso_lars", "lasso_cd", "lars", "omp", "threshold"}) + ], + "n_nonzero_coefs": [Interval(Integral, 1, None, closed="left"), None], + "alpha": [Interval(Real, 0, None, closed="left"), None], + "copy_cov": ["boolean"], + "init": ["array-like", None], + "max_iter": [Interval(Integral, 0, None, closed="left")], + "n_jobs": [Integral, None], + "check_input": ["boolean"], + "verbose": ["verbose"], + "positive": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +# XXX : could be moved to the linear_model module +def sparse_encode( + X, + dictionary, + *, + gram=None, + cov=None, + algorithm="lasso_lars", + n_nonzero_coefs=None, + alpha=None, + copy_cov=True, + init=None, + max_iter=1000, + n_jobs=None, + check_input=True, + verbose=0, + positive=False, +): + """Sparse coding. + + Each row of the result is the solution to a sparse coding problem. + The goal is to find a sparse array `code` such that:: + + X ~= code * dictionary + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Data matrix. + + dictionary : array-like of shape (n_components, n_features) + The dictionary matrix against which to solve the sparse coding of + the data. Some of the algorithms assume normalized rows for meaningful + output. + + gram : array-like of shape (n_components, n_components), default=None + Precomputed Gram matrix, `dictionary * dictionary'`. + + cov : array-like of shape (n_components, n_samples), default=None + Precomputed covariance, `dictionary' * X`. + + algorithm : {'lasso_lars', 'lasso_cd', 'lars', 'omp', 'threshold'}, \ + default='lasso_lars' + The algorithm used: + + * `'lars'`: uses the least angle regression method + (`linear_model.lars_path`); + * `'lasso_lars'`: uses Lars to compute the Lasso solution; + * `'lasso_cd'`: uses the coordinate descent method to compute the + Lasso solution (`linear_model.Lasso`). lasso_lars will be faster if + the estimated components are sparse; + * `'omp'`: uses orthogonal matching pursuit to estimate the sparse + solution; + * `'threshold'`: squashes to zero all coefficients less than + regularization from the projection `dictionary * data'`. + + n_nonzero_coefs : int, default=None + Number of nonzero coefficients to target in each column of the + solution. This is only used by `algorithm='lars'` and `algorithm='omp'` + and is overridden by `alpha` in the `omp` case. If `None`, then + `n_nonzero_coefs=int(n_features / 10)`. + + alpha : float, default=None + If `algorithm='lasso_lars'` or `algorithm='lasso_cd'`, `alpha` is the + penalty applied to the L1 norm. 
+ If `algorithm='threshold'`, `alpha` is the absolute value of the + threshold below which coefficients will be squashed to zero. + If `algorithm='omp'`, `alpha` is the tolerance parameter: the value of + the reconstruction error targeted. In this case, it overrides + `n_nonzero_coefs`. + If `None`, default to 1. + + copy_cov : bool, default=True + Whether to copy the precomputed covariance matrix; if `False`, it may + be overwritten. + + init : ndarray of shape (n_samples, n_components), default=None + Initialization value of the sparse codes. Only used if + `algorithm='lasso_cd'`. + + max_iter : int, default=1000 + Maximum number of iterations to perform if `algorithm='lasso_cd'` or + `'lasso_lars'`. + + n_jobs : int, default=None + Number of parallel jobs to run. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + check_input : bool, default=True + If `False`, the input arrays X and dictionary will not be checked. + + verbose : int, default=0 + Controls the verbosity; the higher, the more messages. + + positive : bool, default=False + Whether to enforce positivity when finding the encoding. + + .. versionadded:: 0.20 + + Returns + ------- + code : ndarray of shape (n_samples, n_components) + The sparse codes. + + See Also + -------- + sklearn.linear_model.lars_path : Compute Least Angle Regression or Lasso + path using LARS algorithm. + sklearn.linear_model.orthogonal_mp : Solves Orthogonal Matching Pursuit problems. + sklearn.linear_model.Lasso : Train Linear Model with L1 prior as regularizer. + SparseCoder : Find a sparse representation of data from a fixed precomputed + dictionary. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.decomposition import sparse_encode + >>> X = np.array([[-1, -1, -1], [0, 0, 3]]) + >>> dictionary = np.array( + ... [[0, 1, 0], + ... [-1, -1, 2], + ... [1, 1, 1], + ... [0, 1, 1], + ... [0, 2, 1]], + ... dtype=np.float64 + ... 
) + >>> sparse_encode(X, dictionary, alpha=1e-10) + array([[ 0., 0., -1., 0., 0.], + [ 0., 1., 1., 0., 0.]]) + """ + if check_input: + if algorithm == "lasso_cd": + dictionary = check_array( + dictionary, order="C", dtype=[np.float64, np.float32] + ) + X = check_array(X, order="C", dtype=[np.float64, np.float32]) + else: + dictionary = check_array(dictionary) + X = check_array(X) + + if dictionary.shape[1] != X.shape[1]: + raise ValueError( + "Dictionary and X have different numbers of features:" + "dictionary.shape: {} X.shape{}".format(dictionary.shape, X.shape) + ) + + _check_positive_coding(algorithm, positive) + + return _sparse_encode( + X, + dictionary, + gram=gram, + cov=cov, + algorithm=algorithm, + n_nonzero_coefs=n_nonzero_coefs, + alpha=alpha, + copy_cov=copy_cov, + init=init, + max_iter=max_iter, + n_jobs=n_jobs, + verbose=verbose, + positive=positive, + ) + + +def _sparse_encode( + X, + dictionary, + *, + gram=None, + cov=None, + algorithm="lasso_lars", + n_nonzero_coefs=None, + alpha=None, + copy_cov=True, + init=None, + max_iter=1000, + n_jobs=None, + verbose=0, + positive=False, +): + """Sparse coding without input/parameter validation.""" + + n_samples, n_features = X.shape + n_components = dictionary.shape[0] + + if algorithm in ("lars", "omp"): + regularization = n_nonzero_coefs + if regularization is None: + regularization = min(max(n_features / 10, 1), n_components) + else: + regularization = alpha + if regularization is None: + regularization = 1.0 + + if gram is None and algorithm != "threshold": + gram = np.dot(dictionary, dictionary.T) + + if cov is None and algorithm != "lasso_cd": + copy_cov = False + cov = np.dot(dictionary, X.T) + + if effective_n_jobs(n_jobs) == 1 or algorithm == "threshold": + code = _sparse_encode_precomputed( + X, + dictionary, + gram=gram, + cov=cov, + algorithm=algorithm, + regularization=regularization, + copy_cov=copy_cov, + init=init, + max_iter=max_iter, + verbose=verbose, + positive=positive, + ) + return code + + # Enter parallel code block + n_samples = X.shape[0] + n_components = dictionary.shape[0] + code = np.empty((n_samples, n_components)) + slices = list(gen_even_slices(n_samples, effective_n_jobs(n_jobs))) + + code_views = Parallel(n_jobs=n_jobs, verbose=verbose)( + delayed(_sparse_encode_precomputed)( + X[this_slice], + dictionary, + gram=gram, + cov=cov[:, this_slice] if cov is not None else None, + algorithm=algorithm, + regularization=regularization, + copy_cov=copy_cov, + init=init[this_slice] if init is not None else None, + max_iter=max_iter, + verbose=verbose, + positive=positive, + ) + for this_slice in slices + ) + for this_slice, this_view in zip(slices, code_views): + code[this_slice] = this_view + return code + + +def _update_dict( + dictionary, + Y, + code, + A=None, + B=None, + verbose=False, + random_state=None, + positive=False, +): + """Update the dense dictionary factor in place. + + Parameters + ---------- + dictionary : ndarray of shape (n_components, n_features) + Value of the dictionary at the previous iteration. + + Y : ndarray of shape (n_samples, n_features) + Data matrix. + + code : ndarray of shape (n_samples, n_components) + Sparse coding of the data against which to optimize the dictionary. + + A : ndarray of shape (n_components, n_components), default=None + Together with `B`, sufficient stats of the online model to update the + dictionary. + + B : ndarray of shape (n_features, n_components), default=None + Together with `A`, sufficient stats of the online model to update the + dictionary. 
+ + verbose: bool, default=False + Degree of output the procedure will print. + + random_state : int, RandomState instance or None, default=None + Used for randomly initializing the dictionary. Pass an int for + reproducible results across multiple function calls. + See :term:`Glossary `. + + positive : bool, default=False + Whether to enforce positivity when finding the dictionary. + + .. versionadded:: 0.20 + """ + n_samples, n_components = code.shape + random_state = check_random_state(random_state) + + if A is None: + A = code.T @ code + if B is None: + B = Y.T @ code + + n_unused = 0 + + for k in range(n_components): + if A[k, k] > 1e-6: + # 1e-6 is arbitrary but consistent with the spams implementation + dictionary[k] += (B[:, k] - A[k] @ dictionary) / A[k, k] + else: + # kth atom is almost never used -> sample a new one from the data + newd = Y[random_state.choice(n_samples)] + + # add small noise to avoid making the sparse coding ill conditioned + noise_level = 0.01 * (newd.std() or 1) # avoid 0 std + noise = random_state.normal(0, noise_level, size=len(newd)) + + dictionary[k] = newd + noise + code[:, k] = 0 + n_unused += 1 + + if positive: + np.clip(dictionary[k], 0, None, out=dictionary[k]) + + # Projection on the constraint set ||V_k|| <= 1 + dictionary[k] /= max(linalg.norm(dictionary[k]), 1) + + if verbose and n_unused > 0: + print(f"{n_unused} unused atoms resampled.") + + +def _dict_learning( + X, + n_components, + *, + alpha, + max_iter, + tol, + method, + n_jobs, + dict_init, + code_init, + callback, + verbose, + random_state, + return_n_iter, + positive_dict, + positive_code, + method_max_iter, +): + """Main dictionary learning algorithm""" + t0 = time.time() + # Init the code and the dictionary with SVD of Y + if code_init is not None and dict_init is not None: + code = np.array(code_init, order="F") + # Don't copy V, it will happen below + dictionary = dict_init + else: + code, S, dictionary = linalg.svd(X, full_matrices=False) + # flip the initial code's sign to enforce deterministic output + code, dictionary = svd_flip(code, dictionary) + dictionary = S[:, np.newaxis] * dictionary + r = len(dictionary) + if n_components <= r: # True even if n_components=None + code = code[:, :n_components] + dictionary = dictionary[:n_components, :] + else: + code = np.c_[code, np.zeros((len(code), n_components - r))] + dictionary = np.r_[ + dictionary, np.zeros((n_components - r, dictionary.shape[1])) + ] + + # Fortran-order dict better suited for the sparse coding which is the + # bottleneck of this algorithm. 
+ dictionary = np.asfortranarray(dictionary) + + errors = [] + current_cost = np.nan + + if verbose == 1: + print("[dict_learning]", end=" ") + + # If max_iter is 0, number of iterations returned should be zero + ii = -1 + + for ii in range(max_iter): + dt = time.time() - t0 + if verbose == 1: + sys.stdout.write(".") + sys.stdout.flush() + elif verbose: + print( + "Iteration % 3i (elapsed time: % 3is, % 4.1fmn, current cost % 7.3f)" + % (ii, dt, dt / 60, current_cost) + ) + + # Update code + code = sparse_encode( + X, + dictionary, + algorithm=method, + alpha=alpha, + init=code, + n_jobs=n_jobs, + positive=positive_code, + max_iter=method_max_iter, + verbose=verbose, + ) + + # Update dictionary in place + _update_dict( + dictionary, + X, + code, + verbose=verbose, + random_state=random_state, + positive=positive_dict, + ) + + # Cost function + current_cost = 0.5 * np.sum((X - code @ dictionary) ** 2) + alpha * np.sum( + np.abs(code) + ) + errors.append(current_cost) + + if ii > 0: + dE = errors[-2] - errors[-1] + # assert(dE >= -tol * errors[-1]) + if dE < tol * errors[-1]: + if verbose == 1: + # A line return + print("") + elif verbose: + print("--- Convergence reached after %d iterations" % ii) + break + if ii % 5 == 0 and callback is not None: + callback(locals()) + + if return_n_iter: + return code, dictionary, errors, ii + 1 + else: + return code, dictionary, errors + + +@validate_params( + { + "X": ["array-like"], + "return_code": ["boolean"], + "method": [StrOptions({"cd", "lars"})], + "method_max_iter": [Interval(Integral, 0, None, closed="left")], + }, + prefer_skip_nested_validation=False, +) +def dict_learning_online( + X, + n_components=2, + *, + alpha=1, + max_iter=100, + return_code=True, + dict_init=None, + callback=None, + batch_size=256, + verbose=False, + shuffle=True, + n_jobs=None, + method="lars", + random_state=None, + positive_dict=False, + positive_code=False, + method_max_iter=1000, + tol=1e-3, + max_no_improvement=10, +): + """Solve a dictionary learning matrix factorization problem online. + + Finds the best dictionary and the corresponding sparse code for + approximating the data matrix X by solving:: + + (U^*, V^*) = argmin 0.5 || X - U V ||_Fro^2 + alpha * || U ||_1,1 + (U,V) + with || V_k ||_2 = 1 for all 0 <= k < n_components + + where V is the dictionary and U is the sparse code. ||.||_Fro stands for + the Frobenius norm and ||.||_1,1 stands for the entry-wise matrix norm + which is the sum of the absolute values of all the entries in the matrix. + This is accomplished by repeatedly iterating over mini-batches by slicing + the input data. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Data matrix. + + n_components : int or None, default=2 + Number of dictionary atoms to extract. If None, then ``n_components`` + is set to ``n_features``. + + alpha : float, default=1 + Sparsity controlling parameter. + + max_iter : int, default=100 + Maximum number of iterations over the complete dataset before + stopping independently of any early stopping criterion heuristics. + + .. versionadded:: 1.1 + + return_code : bool, default=True + Whether to also return the code U or just the dictionary `V`. + + dict_init : ndarray of shape (n_components, n_features), default=None + Initial values for the dictionary for warm restart scenarios. + If `None`, the initial values for the dictionary are created + with an SVD decomposition of the data via + :func:`~sklearn.utils.extmath.randomized_svd`. 
+ + callback : callable, default=None + A callable that gets invoked at the end of each iteration. + + batch_size : int, default=256 + The number of samples to take in each batch. + + .. versionchanged:: 1.3 + The default value of `batch_size` changed from 3 to 256 in version 1.3. + + verbose : bool, default=False + To control the verbosity of the procedure. + + shuffle : bool, default=True + Whether to shuffle the data before splitting it in batches. + + n_jobs : int, default=None + Number of parallel jobs to run. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + method : {'lars', 'cd'}, default='lars' + * `'lars'`: uses the least angle regression method to solve the lasso + problem (`linear_model.lars_path`); + * `'cd'`: uses the coordinate descent method to compute the + Lasso solution (`linear_model.Lasso`). Lars will be faster if + the estimated components are sparse. + + random_state : int, RandomState instance or None, default=None + Used for initializing the dictionary when ``dict_init`` is not + specified, randomly shuffling the data when ``shuffle`` is set to + ``True``, and updating the dictionary. Pass an int for reproducible + results across multiple function calls. + See :term:`Glossary `. + + positive_dict : bool, default=False + Whether to enforce positivity when finding the dictionary. + + .. versionadded:: 0.20 + + positive_code : bool, default=False + Whether to enforce positivity when finding the code. + + .. versionadded:: 0.20 + + method_max_iter : int, default=1000 + Maximum number of iterations to perform when solving the lasso problem. + + .. versionadded:: 0.22 + + tol : float, default=1e-3 + Control early stopping based on the norm of the differences in the + dictionary between 2 steps. + + To disable early stopping based on changes in the dictionary, set + `tol` to 0.0. + + .. versionadded:: 1.1 + + max_no_improvement : int, default=10 + Control early stopping based on the consecutive number of mini batches + that does not yield an improvement on the smoothed cost function. + + To disable convergence detection based on cost function, set + `max_no_improvement` to None. + + .. versionadded:: 1.1 + + Returns + ------- + code : ndarray of shape (n_samples, n_components), + The sparse code (only returned if `return_code=True`). + + dictionary : ndarray of shape (n_components, n_features), + The solutions to the dictionary learning problem. + + n_iter : int + Number of iterations run. Returned only if `return_n_iter` is + set to `True`. + + See Also + -------- + dict_learning : Solve a dictionary learning matrix factorization problem. + DictionaryLearning : Find a dictionary that sparsely encodes data. + MiniBatchDictionaryLearning : A faster, less accurate, version of the dictionary + learning algorithm. + SparsePCA : Sparse Principal Components Analysis. + MiniBatchSparsePCA : Mini-batch Sparse Principal Components Analysis. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.datasets import make_sparse_coded_signal + >>> from sklearn.decomposition import dict_learning_online + >>> X, _, _ = make_sparse_coded_signal( + ... n_samples=30, n_components=15, n_features=20, n_nonzero_coefs=10, + ... random_state=42, + ... ) + >>> U, V = dict_learning_online( + ... X, n_components=15, alpha=0.2, max_iter=20, batch_size=3, random_state=42 + ... 
) + + We can check the level of sparsity of `U`: + + >>> np.mean(U == 0) + np.float64(0.53) + + We can compare the average squared euclidean norm of the reconstruction + error of the sparse coded signal relative to the squared euclidean norm of + the original signal: + + >>> X_hat = U @ V + >>> np.mean(np.sum((X_hat - X) ** 2, axis=1) / np.sum(X ** 2, axis=1)) + np.float64(0.053) + """ + transform_algorithm = "lasso_" + method + + est = MiniBatchDictionaryLearning( + n_components=n_components, + alpha=alpha, + max_iter=max_iter, + n_jobs=n_jobs, + fit_algorithm=method, + batch_size=batch_size, + shuffle=shuffle, + dict_init=dict_init, + random_state=random_state, + transform_algorithm=transform_algorithm, + transform_alpha=alpha, + positive_code=positive_code, + positive_dict=positive_dict, + transform_max_iter=method_max_iter, + verbose=verbose, + callback=callback, + tol=tol, + max_no_improvement=max_no_improvement, + ).fit(X) + + if not return_code: + return est.components_ + else: + code = est.transform(X) + return code, est.components_ + + +@validate_params( + { + "X": ["array-like"], + "method": [StrOptions({"lars", "cd"})], + "return_n_iter": ["boolean"], + "method_max_iter": [Interval(Integral, 0, None, closed="left")], + }, + prefer_skip_nested_validation=False, +) +def dict_learning( + X, + n_components, + *, + alpha, + max_iter=100, + tol=1e-8, + method="lars", + n_jobs=None, + dict_init=None, + code_init=None, + callback=None, + verbose=False, + random_state=None, + return_n_iter=False, + positive_dict=False, + positive_code=False, + method_max_iter=1000, +): + """Solve a dictionary learning matrix factorization problem. + + Finds the best dictionary and the corresponding sparse code for + approximating the data matrix X by solving:: + + (U^*, V^*) = argmin 0.5 || X - U V ||_Fro^2 + alpha * || U ||_1,1 + (U,V) + with || V_k ||_2 = 1 for all 0 <= k < n_components + + where V is the dictionary and U is the sparse code. ||.||_Fro stands for + the Frobenius norm and ||.||_1,1 stands for the entry-wise matrix norm + which is the sum of the absolute values of all the entries in the matrix. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Data matrix. + + n_components : int + Number of dictionary atoms to extract. + + alpha : int or float + Sparsity controlling parameter. + + max_iter : int, default=100 + Maximum number of iterations to perform. + + tol : float, default=1e-8 + Tolerance for the stopping condition. + + method : {'lars', 'cd'}, default='lars' + The method used: + + * `'lars'`: uses the least angle regression method to solve the lasso + problem (`linear_model.lars_path`); + * `'cd'`: uses the coordinate descent method to compute the + Lasso solution (`linear_model.Lasso`). Lars will be faster if + the estimated components are sparse. + + n_jobs : int, default=None + Number of parallel jobs to run. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + dict_init : ndarray of shape (n_components, n_features), default=None + Initial value for the dictionary for warm restart scenarios. Only used + if `code_init` and `dict_init` are not None. + + code_init : ndarray of shape (n_samples, n_components), default=None + Initial value for the sparse code for warm restart scenarios. Only used + if `code_init` and `dict_init` are not None. 
+ + callback : callable, default=None + Callable that gets invoked every five iterations. + + verbose : bool, default=False + To control the verbosity of the procedure. + + random_state : int, RandomState instance or None, default=None + Used for randomly initializing the dictionary. Pass an int for + reproducible results across multiple function calls. + See :term:`Glossary `. + + return_n_iter : bool, default=False + Whether or not to return the number of iterations. + + positive_dict : bool, default=False + Whether to enforce positivity when finding the dictionary. + + .. versionadded:: 0.20 + + positive_code : bool, default=False + Whether to enforce positivity when finding the code. + + .. versionadded:: 0.20 + + method_max_iter : int, default=1000 + Maximum number of iterations to perform. + + .. versionadded:: 0.22 + + Returns + ------- + code : ndarray of shape (n_samples, n_components) + The sparse code factor in the matrix factorization. + + dictionary : ndarray of shape (n_components, n_features), + The dictionary factor in the matrix factorization. + + errors : array + Vector of errors at each iteration. + + n_iter : int + Number of iterations run. Returned only if `return_n_iter` is + set to True. + + See Also + -------- + dict_learning_online : Solve a dictionary learning matrix factorization + problem online. + DictionaryLearning : Find a dictionary that sparsely encodes data. + MiniBatchDictionaryLearning : A faster, less accurate version + of the dictionary learning algorithm. + SparsePCA : Sparse Principal Components Analysis. + MiniBatchSparsePCA : Mini-batch Sparse Principal Components Analysis. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.datasets import make_sparse_coded_signal + >>> from sklearn.decomposition import dict_learning + >>> X, _, _ = make_sparse_coded_signal( + ... n_samples=30, n_components=15, n_features=20, n_nonzero_coefs=10, + ... random_state=42, + ... 
) + >>> U, V, errors = dict_learning(X, n_components=15, alpha=0.1, random_state=42) + + We can check the level of sparsity of `U`: + + >>> np.mean(U == 0) + np.float64(0.62) + + We can compare the average squared euclidean norm of the reconstruction + error of the sparse coded signal relative to the squared euclidean norm of + the original signal: + + >>> X_hat = U @ V + >>> np.mean(np.sum((X_hat - X) ** 2, axis=1) / np.sum(X ** 2, axis=1)) + np.float64(0.0192) + """ + estimator = DictionaryLearning( + n_components=n_components, + alpha=alpha, + max_iter=max_iter, + tol=tol, + fit_algorithm=method, + n_jobs=n_jobs, + dict_init=dict_init, + callback=callback, + code_init=code_init, + verbose=verbose, + random_state=random_state, + positive_code=positive_code, + positive_dict=positive_dict, + transform_max_iter=method_max_iter, + ).set_output(transform="default") + code = estimator.fit_transform(X) + if return_n_iter: + return ( + code, + estimator.components_, + estimator.error_, + estimator.n_iter_, + ) + return code, estimator.components_, estimator.error_ + + +class _BaseSparseCoding(ClassNamePrefixFeaturesOutMixin, TransformerMixin): + """Base class from SparseCoder and DictionaryLearning algorithms.""" + + def __init__( + self, + transform_algorithm, + transform_n_nonzero_coefs, + transform_alpha, + split_sign, + n_jobs, + positive_code, + transform_max_iter, + ): + self.transform_algorithm = transform_algorithm + self.transform_n_nonzero_coefs = transform_n_nonzero_coefs + self.transform_alpha = transform_alpha + self.transform_max_iter = transform_max_iter + self.split_sign = split_sign + self.n_jobs = n_jobs + self.positive_code = positive_code + + def _transform(self, X, dictionary): + """Private method allowing to accommodate both DictionaryLearning and + SparseCoder.""" + X = validate_data(self, X, reset=False) + + if hasattr(self, "alpha") and self.transform_alpha is None: + transform_alpha = self.alpha + else: + transform_alpha = self.transform_alpha + + code = sparse_encode( + X, + dictionary, + algorithm=self.transform_algorithm, + n_nonzero_coefs=self.transform_n_nonzero_coefs, + alpha=transform_alpha, + max_iter=self.transform_max_iter, + n_jobs=self.n_jobs, + positive=self.positive_code, + ) + + if self.split_sign: + # feature vector is split into a positive and negative side + n_samples, n_features = code.shape + split_code = np.empty((n_samples, 2 * n_features)) + split_code[:, :n_features] = np.maximum(code, 0) + split_code[:, n_features:] = -np.minimum(code, 0) + code = split_code + + return code + + def transform(self, X): + """Encode the data as a sparse combination of the dictionary atoms. + + Coding method is determined by the object parameter + `transform_algorithm`. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Test data to be transformed, must have the same number of + features as the data used to train the model. + + Returns + ------- + X_new : ndarray of shape (n_samples, n_components) + Transformed data. 
+ """ + check_is_fitted(self) + return self._transform(X, self.components_) + + def _inverse_transform(self, code, dictionary): + """Private method allowing to accommodate both DictionaryLearning and + SparseCoder.""" + code = check_array(code) + # compute number of expected features in code + expected_n_components = dictionary.shape[0] + if self.split_sign: + expected_n_components += expected_n_components + if not code.shape[1] == expected_n_components: + raise ValueError( + "The number of components in the code is different from the " + "number of components in the dictionary." + f"Expected {expected_n_components}, got {code.shape[1]}." + ) + if self.split_sign: + n_samples, n_features = code.shape + n_features //= 2 + code = code[:, :n_features] - code[:, n_features:] + + return code @ dictionary + + def inverse_transform(self, X): + """Transform data back to its original space. + + Parameters + ---------- + X : array-like of shape (n_samples, n_components) + Data to be transformed back. Must have the same number of + components as the data used to train the model. + + Returns + ------- + X_original : ndarray of shape (n_samples, n_features) + Transformed data. + """ + check_is_fitted(self) + return self._inverse_transform(X, self.components_) + + +class SparseCoder(_BaseSparseCoding, BaseEstimator): + """Sparse coding. + + Finds a sparse representation of data against a fixed, precomputed + dictionary. + + Each row of the result is the solution to a sparse coding problem. + The goal is to find a sparse array `code` such that:: + + X ~= code * dictionary + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + dictionary : ndarray of shape (n_components, n_features) + The dictionary atoms used for sparse coding. Lines are assumed to be + normalized to unit norm. + + transform_algorithm : {'lasso_lars', 'lasso_cd', 'lars', 'omp', \ + 'threshold'}, default='omp' + Algorithm used to transform the data: + + - `'lars'`: uses the least angle regression method + (`linear_model.lars_path`); + - `'lasso_lars'`: uses Lars to compute the Lasso solution; + - `'lasso_cd'`: uses the coordinate descent method to compute the + Lasso solution (linear_model.Lasso). `'lasso_lars'` will be faster if + the estimated components are sparse; + - `'omp'`: uses orthogonal matching pursuit to estimate the sparse + solution; + - `'threshold'`: squashes to zero all coefficients less than alpha from + the projection ``dictionary * X'``. + + transform_n_nonzero_coefs : int, default=None + Number of nonzero coefficients to target in each column of the + solution. This is only used by `algorithm='lars'` and `algorithm='omp'` + and is overridden by `alpha` in the `omp` case. If `None`, then + `transform_n_nonzero_coefs=int(n_features / 10)`. + + transform_alpha : float, default=None + If `algorithm='lasso_lars'` or `algorithm='lasso_cd'`, `alpha` is the + penalty applied to the L1 norm. + If `algorithm='threshold'`, `alpha` is the absolute value of the + threshold below which coefficients will be squashed to zero. + If `algorithm='omp'`, `alpha` is the tolerance parameter: the value of + the reconstruction error targeted. In this case, it overrides + `n_nonzero_coefs`. + If `None`, default to 1. + + split_sign : bool, default=False + Whether to split the sparse feature vector into the concatenation of + its negative part and its positive part. This can improve the + performance of downstream classifiers. + + n_jobs : int, default=None + Number of parallel jobs to run. 
+ ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + positive_code : bool, default=False + Whether to enforce positivity when finding the code. + + .. versionadded:: 0.20 + + transform_max_iter : int, default=1000 + Maximum number of iterations to perform if `algorithm='lasso_cd'` or + `lasso_lars`. + + .. versionadded:: 0.22 + + Attributes + ---------- + n_components_ : int + Number of atoms. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + DictionaryLearning : Find a dictionary that sparsely encodes data. + MiniBatchDictionaryLearning : A faster, less accurate, version of the + dictionary learning algorithm. + MiniBatchSparsePCA : Mini-batch Sparse Principal Components Analysis. + SparsePCA : Sparse Principal Components Analysis. + sparse_encode : Sparse coding where each row of the result is the solution + to a sparse coding problem. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.decomposition import SparseCoder + >>> X = np.array([[-1, -1, -1], [0, 0, 3]]) + >>> dictionary = np.array( + ... [[0, 1, 0], + ... [-1, -1, 2], + ... [1, 1, 1], + ... [0, 1, 1], + ... [0, 2, 1]], + ... dtype=np.float64 + ... ) + >>> coder = SparseCoder( + ... dictionary=dictionary, transform_algorithm='lasso_lars', + ... transform_alpha=1e-10, + ... ) + >>> coder.transform(X) + array([[ 0., 0., -1., 0., 0.], + [ 0., 1., 1., 0., 0.]]) + """ + + def __init__( + self, + dictionary, + *, + transform_algorithm="omp", + transform_n_nonzero_coefs=None, + transform_alpha=None, + split_sign=False, + n_jobs=None, + positive_code=False, + transform_max_iter=1000, + ): + super().__init__( + transform_algorithm, + transform_n_nonzero_coefs, + transform_alpha, + split_sign, + n_jobs, + positive_code, + transform_max_iter, + ) + self.dictionary = dictionary + + def fit(self, X, y=None): + """Do nothing and return the estimator unchanged. + + This method is just there to implement the usual API and hence + work in pipelines. + + Parameters + ---------- + X : Ignored + Not used, present for API consistency by convention. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Returns the instance itself. + """ + return self + + def transform(self, X, y=None): + """Encode the data as a sparse combination of the dictionary atoms. + + Coding method is determined by the object parameter + `transform_algorithm`. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + X_new : ndarray of shape (n_samples, n_components) + Transformed data. + """ + return super()._transform(X, self.dictionary) + + def inverse_transform(self, X): + """Transform data back to its original space. + + Parameters + ---------- + X : array-like of shape (n_samples, n_components) + Data to be transformed back. Must have the same number of + components as the data used to train the model. + + Returns + ------- + X_original : ndarray of shape (n_samples, n_features) + Transformed data. 
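+
+        Notes
+        -----
+        The reconstruction is a plain linear map: with ``split_sign=False``
+        this returns ``X @ dictionary``; with ``split_sign=True`` the positive
+        and negative halves of the code are recombined before the product.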
+ """ + return self._inverse_transform(X, self.dictionary) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.requires_fit = False + tags.transformer_tags.preserves_dtype = ["float64", "float32"] + return tags + + @property + def n_components_(self): + """Number of atoms.""" + return self.dictionary.shape[0] + + @property + def n_features_in_(self): + """Number of features seen during `fit`.""" + return self.dictionary.shape[1] + + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.n_components_ + + +class DictionaryLearning(_BaseSparseCoding, BaseEstimator): + """Dictionary learning. + + Finds a dictionary (a set of atoms) that performs well at sparsely + encoding the fitted data. + + Solves the optimization problem:: + + (U^*,V^*) = argmin 0.5 || X - U V ||_Fro^2 + alpha * || U ||_1,1 + (U,V) + with || V_k ||_2 <= 1 for all 0 <= k < n_components + + ||.||_Fro stands for the Frobenius norm and ||.||_1,1 stands for + the entry-wise matrix norm which is the sum of the absolute values + of all the entries in the matrix. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int, default=None + Number of dictionary elements to extract. If None, then ``n_components`` + is set to ``n_features``. + + alpha : float, default=1.0 + Sparsity controlling parameter. + + max_iter : int, default=1000 + Maximum number of iterations to perform. + + tol : float, default=1e-8 + Tolerance for numerical error. + + fit_algorithm : {'lars', 'cd'}, default='lars' + * `'lars'`: uses the least angle regression method to solve the lasso + problem (:func:`~sklearn.linear_model.lars_path`); + * `'cd'`: uses the coordinate descent method to compute the + Lasso solution (:class:`~sklearn.linear_model.Lasso`). Lars will be + faster if the estimated components are sparse. + + .. versionadded:: 0.17 + *cd* coordinate descent method to improve speed. + + transform_algorithm : {'lasso_lars', 'lasso_cd', 'lars', 'omp', \ + 'threshold'}, default='omp' + Algorithm used to transform the data: + + - `'lars'`: uses the least angle regression method + (:func:`~sklearn.linear_model.lars_path`); + - `'lasso_lars'`: uses Lars to compute the Lasso solution. + - `'lasso_cd'`: uses the coordinate descent method to compute the + Lasso solution (:class:`~sklearn.linear_model.Lasso`). `'lasso_lars'` + will be faster if the estimated components are sparse. + - `'omp'`: uses orthogonal matching pursuit to estimate the sparse + solution. + - `'threshold'`: squashes to zero all coefficients less than alpha from + the projection ``dictionary * X'``. + + .. versionadded:: 0.17 + *lasso_cd* coordinate descent method to improve speed. + + transform_n_nonzero_coefs : int, default=None + Number of nonzero coefficients to target in each column of the + solution. This is only used by `algorithm='lars'` and + `algorithm='omp'`. If `None`, then + `transform_n_nonzero_coefs=int(n_features / 10)`. + + transform_alpha : float, default=None + If `algorithm='lasso_lars'` or `algorithm='lasso_cd'`, `alpha` is the + penalty applied to the L1 norm. + If `algorithm='threshold'`, `alpha` is the absolute value of the + threshold below which coefficients will be squashed to zero. + If `None`, defaults to `alpha`. + + .. versionchanged:: 1.2 + When None, default value changed from 1.0 to `alpha`. + + n_jobs : int or None, default=None + Number of parallel jobs to run. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. 
+ ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + code_init : ndarray of shape (n_samples, n_components), default=None + Initial value for the code, for warm restart. Only used if `code_init` + and `dict_init` are not None. + + dict_init : ndarray of shape (n_components, n_features), default=None + Initial values for the dictionary, for warm restart. Only used if + `code_init` and `dict_init` are not None. + + callback : callable, default=None + Callable that gets invoked every five iterations. + + .. versionadded:: 1.3 + + verbose : bool, default=False + To control the verbosity of the procedure. + + split_sign : bool, default=False + Whether to split the sparse feature vector into the concatenation of + its negative part and its positive part. This can improve the + performance of downstream classifiers. + + random_state : int, RandomState instance or None, default=None + Used for initializing the dictionary when ``dict_init`` is not + specified, randomly shuffling the data when ``shuffle`` is set to + ``True``, and updating the dictionary. Pass an int for reproducible + results across multiple function calls. + See :term:`Glossary `. + + positive_code : bool, default=False + Whether to enforce positivity when finding the code. + + .. versionadded:: 0.20 + + positive_dict : bool, default=False + Whether to enforce positivity when finding the dictionary. + + .. versionadded:: 0.20 + + transform_max_iter : int, default=1000 + Maximum number of iterations to perform if `algorithm='lasso_cd'` or + `'lasso_lars'`. + + .. versionadded:: 0.22 + + Attributes + ---------- + components_ : ndarray of shape (n_components, n_features) + dictionary atoms extracted from the data + + error_ : array + vector of errors at each iteration + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + Number of iterations run. + + See Also + -------- + MiniBatchDictionaryLearning: A faster, less accurate, version of the + dictionary learning algorithm. + MiniBatchSparsePCA : Mini-batch Sparse Principal Components Analysis. + SparseCoder : Find a sparse representation of data from a fixed, + precomputed dictionary. + SparsePCA : Sparse Principal Components Analysis. + + References + ---------- + + J. Mairal, F. Bach, J. Ponce, G. Sapiro, 2009: Online dictionary learning + for sparse coding (https://www.di.ens.fr/~fbach/mairal_icml09.pdf) + + Examples + -------- + >>> import numpy as np + >>> from sklearn.datasets import make_sparse_coded_signal + >>> from sklearn.decomposition import DictionaryLearning + >>> X, dictionary, code = make_sparse_coded_signal( + ... n_samples=30, n_components=15, n_features=20, n_nonzero_coefs=10, + ... random_state=42, + ... ) + >>> dict_learner = DictionaryLearning( + ... n_components=15, transform_algorithm='lasso_lars', transform_alpha=0.1, + ... random_state=42, + ... 
) + >>> X_transformed = dict_learner.fit(X).transform(X) + + We can check the level of sparsity of `X_transformed`: + + >>> np.mean(X_transformed == 0) + np.float64(0.527) + + We can compare the average squared euclidean norm of the reconstruction + error of the sparse coded signal relative to the squared euclidean norm of + the original signal: + + >>> X_hat = X_transformed @ dict_learner.components_ + >>> np.mean(np.sum((X_hat - X) ** 2, axis=1) / np.sum(X ** 2, axis=1)) + np.float64(0.056) + """ + + _parameter_constraints: dict = { + "n_components": [Interval(Integral, 1, None, closed="left"), None], + "alpha": [Interval(Real, 0, None, closed="left")], + "max_iter": [Interval(Integral, 0, None, closed="left")], + "tol": [Interval(Real, 0, None, closed="left")], + "fit_algorithm": [StrOptions({"lars", "cd"})], + "transform_algorithm": [ + StrOptions({"lasso_lars", "lasso_cd", "lars", "omp", "threshold"}) + ], + "transform_n_nonzero_coefs": [Interval(Integral, 1, None, closed="left"), None], + "transform_alpha": [Interval(Real, 0, None, closed="left"), None], + "n_jobs": [Integral, None], + "code_init": [np.ndarray, None], + "dict_init": [np.ndarray, None], + "callback": [callable, None], + "verbose": ["verbose"], + "split_sign": ["boolean"], + "random_state": ["random_state"], + "positive_code": ["boolean"], + "positive_dict": ["boolean"], + "transform_max_iter": [Interval(Integral, 0, None, closed="left")], + } + + def __init__( + self, + n_components=None, + *, + alpha=1, + max_iter=1000, + tol=1e-8, + fit_algorithm="lars", + transform_algorithm="omp", + transform_n_nonzero_coefs=None, + transform_alpha=None, + n_jobs=None, + code_init=None, + dict_init=None, + callback=None, + verbose=False, + split_sign=False, + random_state=None, + positive_code=False, + positive_dict=False, + transform_max_iter=1000, + ): + super().__init__( + transform_algorithm, + transform_n_nonzero_coefs, + transform_alpha, + split_sign, + n_jobs, + positive_code, + transform_max_iter, + ) + self.n_components = n_components + self.alpha = alpha + self.max_iter = max_iter + self.tol = tol + self.fit_algorithm = fit_algorithm + self.code_init = code_init + self.dict_init = dict_init + self.callback = callback + self.verbose = verbose + self.random_state = random_state + self.positive_dict = positive_dict + + def fit(self, X, y=None): + """Fit the model from data in X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Returns the instance itself. + """ + self.fit_transform(X) + return self + + @_fit_context(prefer_skip_nested_validation=True) + def fit_transform(self, X, y=None): + """Fit the model from data in X and return the transformed data. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + V : ndarray of shape (n_samples, n_components) + Transformed data. 
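+
+        Notes
+        -----
+        The dictionary is estimated with the batch solver (alternating sparse
+        coding and dictionary updates); the code returned here is the one from
+        the final iteration of that solver, not a re-encoding of ``X`` with
+        ``transform_algorithm``. Use :meth:`transform` for the latter.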
+ """ + _check_positive_coding(method=self.fit_algorithm, positive=self.positive_code) + + method = "lasso_" + self.fit_algorithm + + random_state = check_random_state(self.random_state) + X = validate_data(self, X) + + if self.n_components is None: + n_components = X.shape[1] + else: + n_components = self.n_components + + V, U, E, self.n_iter_ = _dict_learning( + X, + n_components, + alpha=self.alpha, + tol=self.tol, + max_iter=self.max_iter, + method=method, + method_max_iter=self.transform_max_iter, + n_jobs=self.n_jobs, + code_init=self.code_init, + dict_init=self.dict_init, + callback=self.callback, + verbose=self.verbose, + random_state=random_state, + return_n_iter=True, + positive_dict=self.positive_dict, + positive_code=self.positive_code, + ) + self.components_ = U + self.error_ = E + + return V + + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.transformer_tags.preserves_dtype = ["float64", "float32"] + return tags + + +class MiniBatchDictionaryLearning(_BaseSparseCoding, BaseEstimator): + """Mini-batch dictionary learning. + + Finds a dictionary (a set of atoms) that performs well at sparsely + encoding the fitted data. + + Solves the optimization problem:: + + (U^*,V^*) = argmin 0.5 || X - U V ||_Fro^2 + alpha * || U ||_1,1 + (U,V) + with || V_k ||_2 <= 1 for all 0 <= k < n_components + + ||.||_Fro stands for the Frobenius norm and ||.||_1,1 stands for + the entry-wise matrix norm which is the sum of the absolute values + of all the entries in the matrix. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int, default=None + Number of dictionary elements to extract. + + alpha : float, default=1 + Sparsity controlling parameter. + + max_iter : int, default=1_000 + Maximum number of iterations over the complete dataset before + stopping independently of any early stopping criterion heuristics. + + .. versionadded:: 1.1 + + fit_algorithm : {'lars', 'cd'}, default='lars' + The algorithm used: + + - `'lars'`: uses the least angle regression method to solve the lasso + problem (`linear_model.lars_path`) + - `'cd'`: uses the coordinate descent method to compute the + Lasso solution (`linear_model.Lasso`). Lars will be faster if + the estimated components are sparse. + + n_jobs : int, default=None + Number of parallel jobs to run. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + batch_size : int, default=256 + Number of samples in each mini-batch. + + .. versionchanged:: 1.3 + The default value of `batch_size` changed from 3 to 256 in version 1.3. + + shuffle : bool, default=True + Whether to shuffle the samples before forming batches. + + dict_init : ndarray of shape (n_components, n_features), default=None + Initial value of the dictionary for warm restart scenarios. + + transform_algorithm : {'lasso_lars', 'lasso_cd', 'lars', 'omp', \ + 'threshold'}, default='omp' + Algorithm used to transform the data: + + - `'lars'`: uses the least angle regression method + (`linear_model.lars_path`); + - `'lasso_lars'`: uses Lars to compute the Lasso solution. + - `'lasso_cd'`: uses the coordinate descent method to compute the + Lasso solution (`linear_model.Lasso`). `'lasso_lars'` will be faster + if the estimated components are sparse. 
+ - `'omp'`: uses orthogonal matching pursuit to estimate the sparse + solution. + - `'threshold'`: squashes to zero all coefficients less than alpha from + the projection ``dictionary * X'``. + + transform_n_nonzero_coefs : int, default=None + Number of nonzero coefficients to target in each column of the + solution. This is only used by `algorithm='lars'` and + `algorithm='omp'`. If `None`, then + `transform_n_nonzero_coefs=int(n_features / 10)`. + + transform_alpha : float, default=None + If `algorithm='lasso_lars'` or `algorithm='lasso_cd'`, `alpha` is the + penalty applied to the L1 norm. + If `algorithm='threshold'`, `alpha` is the absolute value of the + threshold below which coefficients will be squashed to zero. + If `None`, defaults to `alpha`. + + .. versionchanged:: 1.2 + When None, default value changed from 1.0 to `alpha`. + + verbose : bool or int, default=False + To control the verbosity of the procedure. + + split_sign : bool, default=False + Whether to split the sparse feature vector into the concatenation of + its negative part and its positive part. This can improve the + performance of downstream classifiers. + + random_state : int, RandomState instance or None, default=None + Used for initializing the dictionary when ``dict_init`` is not + specified, randomly shuffling the data when ``shuffle`` is set to + ``True``, and updating the dictionary. Pass an int for reproducible + results across multiple function calls. + See :term:`Glossary `. + + positive_code : bool, default=False + Whether to enforce positivity when finding the code. + + .. versionadded:: 0.20 + + positive_dict : bool, default=False + Whether to enforce positivity when finding the dictionary. + + .. versionadded:: 0.20 + + transform_max_iter : int, default=1000 + Maximum number of iterations to perform if `algorithm='lasso_cd'` or + `'lasso_lars'`. + + .. versionadded:: 0.22 + + callback : callable, default=None + A callable that gets invoked at the end of each iteration. + + .. versionadded:: 1.1 + + tol : float, default=1e-3 + Control early stopping based on the norm of the differences in the + dictionary between 2 steps. + + To disable early stopping based on changes in the dictionary, set + `tol` to 0.0. + + .. versionadded:: 1.1 + + max_no_improvement : int, default=10 + Control early stopping based on the consecutive number of mini batches + that does not yield an improvement on the smoothed cost function. + + To disable convergence detection based on cost function, set + `max_no_improvement` to None. + + .. versionadded:: 1.1 + + Attributes + ---------- + components_ : ndarray of shape (n_components, n_features) + Components extracted from the data. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + Number of iterations over the full dataset. + + n_steps_ : int + Number of mini-batches processed. + + .. versionadded:: 1.1 + + See Also + -------- + DictionaryLearning : Find a dictionary that sparsely encodes data. + MiniBatchSparsePCA : Mini-batch Sparse Principal Components Analysis. + SparseCoder : Find a sparse representation of data from a fixed, + precomputed dictionary. + SparsePCA : Sparse Principal Components Analysis. + + References + ---------- + + J. Mairal, F. Bach, J. Ponce, G. 
Sapiro, 2009: Online dictionary learning + for sparse coding (https://www.di.ens.fr/~fbach/mairal_icml09.pdf) + + Examples + -------- + >>> import numpy as np + >>> from sklearn.datasets import make_sparse_coded_signal + >>> from sklearn.decomposition import MiniBatchDictionaryLearning + >>> X, dictionary, code = make_sparse_coded_signal( + ... n_samples=30, n_components=15, n_features=20, n_nonzero_coefs=10, + ... random_state=42) + >>> dict_learner = MiniBatchDictionaryLearning( + ... n_components=15, batch_size=3, transform_algorithm='lasso_lars', + ... transform_alpha=0.1, max_iter=20, random_state=42) + >>> X_transformed = dict_learner.fit_transform(X) + + We can check the level of sparsity of `X_transformed`: + + >>> np.mean(X_transformed == 0) > 0.5 + np.True_ + + We can compare the average squared euclidean norm of the reconstruction + error of the sparse coded signal relative to the squared euclidean norm of + the original signal: + + >>> X_hat = X_transformed @ dict_learner.components_ + >>> np.mean(np.sum((X_hat - X) ** 2, axis=1) / np.sum(X ** 2, axis=1)) + np.float64(0.052) + """ + + _parameter_constraints: dict = { + "n_components": [Interval(Integral, 1, None, closed="left"), None], + "alpha": [Interval(Real, 0, None, closed="left")], + "max_iter": [Interval(Integral, 0, None, closed="left")], + "fit_algorithm": [StrOptions({"cd", "lars"})], + "n_jobs": [None, Integral], + "batch_size": [Interval(Integral, 1, None, closed="left")], + "shuffle": ["boolean"], + "dict_init": [None, np.ndarray], + "transform_algorithm": [ + StrOptions({"lasso_lars", "lasso_cd", "lars", "omp", "threshold"}) + ], + "transform_n_nonzero_coefs": [Interval(Integral, 1, None, closed="left"), None], + "transform_alpha": [Interval(Real, 0, None, closed="left"), None], + "verbose": ["verbose"], + "split_sign": ["boolean"], + "random_state": ["random_state"], + "positive_code": ["boolean"], + "positive_dict": ["boolean"], + "transform_max_iter": [Interval(Integral, 0, None, closed="left")], + "callback": [None, callable], + "tol": [Interval(Real, 0, None, closed="left")], + "max_no_improvement": [Interval(Integral, 0, None, closed="left"), None], + } + + def __init__( + self, + n_components=None, + *, + alpha=1, + max_iter=1_000, + fit_algorithm="lars", + n_jobs=None, + batch_size=256, + shuffle=True, + dict_init=None, + transform_algorithm="omp", + transform_n_nonzero_coefs=None, + transform_alpha=None, + verbose=False, + split_sign=False, + random_state=None, + positive_code=False, + positive_dict=False, + transform_max_iter=1000, + callback=None, + tol=1e-3, + max_no_improvement=10, + ): + super().__init__( + transform_algorithm, + transform_n_nonzero_coefs, + transform_alpha, + split_sign, + n_jobs, + positive_code, + transform_max_iter, + ) + self.n_components = n_components + self.alpha = alpha + self.max_iter = max_iter + self.fit_algorithm = fit_algorithm + self.dict_init = dict_init + self.verbose = verbose + self.shuffle = shuffle + self.batch_size = batch_size + self.split_sign = split_sign + self.random_state = random_state + self.positive_dict = positive_dict + self.callback = callback + self.max_no_improvement = max_no_improvement + self.tol = tol + + def _check_params(self, X): + # n_components + self._n_components = self.n_components + if self._n_components is None: + self._n_components = X.shape[1] + + # fit_algorithm + _check_positive_coding(self.fit_algorithm, self.positive_code) + self._fit_algorithm = "lasso_" + self.fit_algorithm + + # batch_size + self._batch_size = 
min(self.batch_size, X.shape[0]) + + def _initialize_dict(self, X, random_state): + """Initialization of the dictionary.""" + if self.dict_init is not None: + dictionary = self.dict_init + else: + # Init V with SVD of X + _, S, dictionary = _randomized_svd( + X, self._n_components, random_state=random_state + ) + dictionary = S[:, np.newaxis] * dictionary + + if self._n_components <= len(dictionary): + dictionary = dictionary[: self._n_components, :] + else: + dictionary = np.concatenate( + ( + dictionary, + np.zeros( + (self._n_components - len(dictionary), dictionary.shape[1]), + dtype=dictionary.dtype, + ), + ) + ) + + dictionary = check_array(dictionary, order="F", dtype=X.dtype, copy=False) + dictionary = np.require(dictionary, requirements="W") + + return dictionary + + def _update_inner_stats(self, X, code, batch_size, step): + """Update the inner stats inplace.""" + if step < batch_size - 1: + theta = (step + 1) * batch_size + else: + theta = batch_size**2 + step + 1 - batch_size + beta = (theta + 1 - batch_size) / (theta + 1) + + self._A *= beta + self._A += code.T @ code / batch_size + self._B *= beta + self._B += X.T @ code / batch_size + + def _minibatch_step(self, X, dictionary, random_state, step): + """Perform the update on the dictionary for one minibatch.""" + batch_size = X.shape[0] + + # Compute code for this batch + code = _sparse_encode( + X, + dictionary, + algorithm=self._fit_algorithm, + alpha=self.alpha, + n_jobs=self.n_jobs, + positive=self.positive_code, + max_iter=self.transform_max_iter, + verbose=self.verbose, + ) + + batch_cost = ( + 0.5 * ((X - code @ dictionary) ** 2).sum() + + self.alpha * np.sum(np.abs(code)) + ) / batch_size + + # Update inner stats + self._update_inner_stats(X, code, batch_size, step) + + # Update dictionary + _update_dict( + dictionary, + X, + code, + self._A, + self._B, + verbose=self.verbose, + random_state=random_state, + positive=self.positive_dict, + ) + + return batch_cost + + def _check_convergence( + self, X, batch_cost, new_dict, old_dict, n_samples, step, n_steps + ): + """Helper function to encapsulate the early stopping logic. + + Early stopping is based on two factors: + - A small change of the dictionary between two minibatch updates. This is + controlled by the tol parameter. + - No more improvement on a smoothed estimate of the objective function for a + a certain number of consecutive minibatch updates. This is controlled by + the max_no_improvement parameter. + """ + batch_size = X.shape[0] + + # counts steps starting from 1 for user friendly verbose mode. 
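+        # Sketch of the smoothing applied below (the numbers are illustrative
+        # only): with alpha = batch_size / (n_samples + 1), e.g. 256 / 10001
+        # ~= 0.026, the update ewa_cost <- (1 - alpha) * ewa_cost + alpha *
+        # batch_cost means that roughly the last n_samples / batch_size
+        # mini-batches dominate the smoothed cost monitored by
+        # max_no_improvement.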
+ step = step + 1 + + # Ignore 100 first steps or 1 epoch to avoid initializing the ewa_cost with a + # too bad value + if step <= min(100, n_samples / batch_size): + if self.verbose: + print(f"Minibatch step {step}/{n_steps}: mean batch cost: {batch_cost}") + return False + + # Compute an Exponentially Weighted Average of the cost function to + # monitor the convergence while discarding minibatch-local stochastic + # variability: https://en.wikipedia.org/wiki/Moving_average + if self._ewa_cost is None: + self._ewa_cost = batch_cost + else: + alpha = batch_size / (n_samples + 1) + alpha = min(alpha, 1) + self._ewa_cost = self._ewa_cost * (1 - alpha) + batch_cost * alpha + + if self.verbose: + print( + f"Minibatch step {step}/{n_steps}: mean batch cost: " + f"{batch_cost}, ewa cost: {self._ewa_cost}" + ) + + # Early stopping based on change of dictionary + dict_diff = linalg.norm(new_dict - old_dict) / self._n_components + if self.tol > 0 and dict_diff <= self.tol: + if self.verbose: + print(f"Converged (small dictionary change) at step {step}/{n_steps}") + return True + + # Early stopping heuristic due to lack of improvement on smoothed + # cost function + if self._ewa_cost_min is None or self._ewa_cost < self._ewa_cost_min: + self._no_improvement = 0 + self._ewa_cost_min = self._ewa_cost + else: + self._no_improvement += 1 + + if ( + self.max_no_improvement is not None + and self._no_improvement >= self.max_no_improvement + ): + if self.verbose: + print( + "Converged (lack of improvement in objective function) " + f"at step {step}/{n_steps}" + ) + return True + + return False + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit the model from data in X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Returns the instance itself. 
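+
+        Notes
+        -----
+        Fitting cycles over (optionally shuffled) mini-batches for at most
+        ``max_iter`` passes over the data, stopping earlier when the
+        dictionary change between two updates falls below ``tol`` or when the
+        smoothed objective has not improved for ``max_no_improvement``
+        consecutive mini-batches.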
+ """ + X = validate_data( + self, X, dtype=[np.float64, np.float32], order="C", copy=False + ) + + self._check_params(X) + self._random_state = check_random_state(self.random_state) + + dictionary = self._initialize_dict(X, self._random_state) + old_dict = dictionary.copy() + + if self.shuffle: + X_train = X.copy() + self._random_state.shuffle(X_train) + else: + X_train = X + + n_samples, n_features = X_train.shape + + if self.verbose: + print("[dict_learning]") + + # Inner stats + self._A = np.zeros( + (self._n_components, self._n_components), dtype=X_train.dtype + ) + self._B = np.zeros((n_features, self._n_components), dtype=X_train.dtype) + + # Attributes to monitor the convergence + self._ewa_cost = None + self._ewa_cost_min = None + self._no_improvement = 0 + + batches = gen_batches(n_samples, self._batch_size) + batches = itertools.cycle(batches) + n_steps_per_iter = int(np.ceil(n_samples / self._batch_size)) + n_steps = self.max_iter * n_steps_per_iter + + i = -1 # to allow max_iter = 0 + + for i, batch in zip(range(n_steps), batches): + X_batch = X_train[batch] + + batch_cost = self._minibatch_step( + X_batch, dictionary, self._random_state, i + ) + + if self._check_convergence( + X_batch, batch_cost, dictionary, old_dict, n_samples, i, n_steps + ): + break + + # XXX callback param added for backward compat in #18975 but a common + # unified callback API should be preferred + if self.callback is not None: + self.callback(locals()) + + old_dict[:] = dictionary + + self.n_steps_ = i + 1 + self.n_iter_ = np.ceil(self.n_steps_ / n_steps_per_iter) + self.components_ = dictionary + + return self + + @_fit_context(prefer_skip_nested_validation=True) + def partial_fit(self, X, y=None): + """Update the model using the data in X as a mini-batch. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Return the instance itself. + """ + has_components = hasattr(self, "components_") + + X = validate_data( + self, X, dtype=[np.float64, np.float32], order="C", reset=not has_components + ) + + if not has_components: + # This instance has not been fitted yet (fit or partial_fit) + self._check_params(X) + self._random_state = check_random_state(self.random_state) + + dictionary = self._initialize_dict(X, self._random_state) + + self.n_steps_ = 0 + + self._A = np.zeros((self._n_components, self._n_components), dtype=X.dtype) + self._B = np.zeros((X.shape[1], self._n_components), dtype=X.dtype) + else: + dictionary = self.components_ + + self._minibatch_step(X, dictionary, self._random_state, self.n_steps_) + + self.components_ = dictionary + self.n_steps_ += 1 + + return self + + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.transformer_tags.preserves_dtype = ["float64", "float32"] + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/_factor_analysis.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_factor_analysis.py new file mode 100644 index 0000000000000000000000000000000000000000..d6d5e72a5b7d3a3b032a1465de639f60ebc58d7f --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_factor_analysis.py @@ -0,0 +1,457 @@ +"""Factor Analysis. 
+ +A latent linear variable model. + +FactorAnalysis is similar to probabilistic PCA implemented by PCA.score +While PCA assumes Gaussian noise with the same variance for each +feature, the FactorAnalysis model assumes different variances for +each of them. + +This implementation is based on David Barber's Book, +Bayesian Reasoning and Machine Learning, +http://www.cs.ucl.ac.uk/staff/d.barber/brml, +Algorithm 21.1 +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from math import log, sqrt +from numbers import Integral, Real + +import numpy as np +from scipy import linalg + +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + TransformerMixin, + _fit_context, +) +from ..exceptions import ConvergenceWarning +from ..utils import check_random_state +from ..utils._param_validation import Interval, StrOptions +from ..utils.extmath import _randomized_svd, fast_logdet, squared_norm +from ..utils.validation import check_is_fitted, validate_data + + +class FactorAnalysis(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): + """Factor Analysis (FA). + + A simple linear generative model with Gaussian latent variables. + + The observations are assumed to be caused by a linear transformation of + lower dimensional latent factors and added Gaussian noise. + Without loss of generality the factors are distributed according to a + Gaussian with zero mean and unit covariance. The noise is also zero mean + and has an arbitrary diagonal covariance matrix. + + If we would restrict the model further, by assuming that the Gaussian + noise is even isotropic (all diagonal entries are the same) we would obtain + :class:`PCA`. + + FactorAnalysis performs a maximum likelihood estimate of the so-called + `loading` matrix, the transformation of the latent variables to the + observed ones, using SVD based approach. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.13 + + Parameters + ---------- + n_components : int, default=None + Dimensionality of latent space, the number of components + of ``X`` that are obtained after ``transform``. + If None, n_components is set to the number of features. + + tol : float, default=1e-2 + Stopping tolerance for log-likelihood increase. + + copy : bool, default=True + Whether to make a copy of X. If ``False``, the input X gets overwritten + during fitting. + + max_iter : int, default=1000 + Maximum number of iterations. + + noise_variance_init : array-like of shape (n_features,), default=None + The initial guess of the noise variance for each feature. + If None, it defaults to np.ones(n_features). + + svd_method : {'lapack', 'randomized'}, default='randomized' + Which SVD method to use. If 'lapack' use standard SVD from + scipy.linalg, if 'randomized' use fast ``randomized_svd`` function. + Defaults to 'randomized'. For most applications 'randomized' will + be sufficiently precise while providing significant speed gains. + Accuracy can also be improved by setting higher values for + `iterated_power`. If this is not sufficient, for maximum precision + you should choose 'lapack'. + + iterated_power : int, default=3 + Number of iterations for the power method. 3 by default. Only used + if ``svd_method`` equals 'randomized'. + + rotation : {'varimax', 'quartimax'}, default=None + If not None, apply the indicated rotation. Currently, varimax and + quartimax are implemented. See + `"The varimax criterion for analytic rotation in factor analysis" + `_ + H. F. Kaiser, 1958. + + .. 
versionadded:: 0.24 + + random_state : int or RandomState instance, default=0 + Only used when ``svd_method`` equals 'randomized'. Pass an int for + reproducible results across multiple function calls. + See :term:`Glossary `. + + Attributes + ---------- + components_ : ndarray of shape (n_components, n_features) + Components with maximum variance. + + loglike_ : list of shape (n_iterations,) + The log likelihood at each iteration. + + noise_variance_ : ndarray of shape (n_features,) + The estimated noise variance for each feature. + + n_iter_ : int + Number of iterations run. + + mean_ : ndarray of shape (n_features,) + Per-feature empirical mean, estimated from the training set. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + PCA: Principal component analysis is also a latent linear variable model + which however assumes equal noise variance for each feature. + This extra assumption makes probabilistic PCA faster as it can be + computed in closed form. + FastICA: Independent component analysis, a latent variable model with + non-Gaussian latent variables. + + References + ---------- + - David Barber, Bayesian Reasoning and Machine Learning, + Algorithm 21.1. + + - Christopher M. Bishop: Pattern Recognition and Machine Learning, + Chapter 12.2.4. + + Examples + -------- + >>> from sklearn.datasets import load_digits + >>> from sklearn.decomposition import FactorAnalysis + >>> X, _ = load_digits(return_X_y=True) + >>> transformer = FactorAnalysis(n_components=7, random_state=0) + >>> X_transformed = transformer.fit_transform(X) + >>> X_transformed.shape + (1797, 7) + """ + + _parameter_constraints: dict = { + "n_components": [Interval(Integral, 0, None, closed="left"), None], + "tol": [Interval(Real, 0.0, None, closed="left")], + "copy": ["boolean"], + "max_iter": [Interval(Integral, 1, None, closed="left")], + "noise_variance_init": ["array-like", None], + "svd_method": [StrOptions({"randomized", "lapack"})], + "iterated_power": [Interval(Integral, 0, None, closed="left")], + "rotation": [StrOptions({"varimax", "quartimax"}), None], + "random_state": ["random_state"], + } + + def __init__( + self, + n_components=None, + *, + tol=1e-2, + copy=True, + max_iter=1000, + noise_variance_init=None, + svd_method="randomized", + iterated_power=3, + rotation=None, + random_state=0, + ): + self.n_components = n_components + self.copy = copy + self.tol = tol + self.max_iter = max_iter + self.svd_method = svd_method + + self.noise_variance_init = noise_variance_init + self.iterated_power = iterated_power + self.random_state = random_state + self.rotation = rotation + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit the FactorAnalysis model to X using SVD based approach. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data. + + y : Ignored + Ignored parameter. + + Returns + ------- + self : object + FactorAnalysis class instance. 
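+
+        Notes
+        -----
+        Each iteration rescales the centered data by the current per-feature
+        noise estimate, takes a truncated SVD (``lapack`` or ``randomized``
+        according to ``svd_method``), updates the loading matrix and the noise
+        variances, and stops once the log-likelihood gain falls below ``tol``
+        or ``max_iter`` is reached (a ``ConvergenceWarning`` is raised in the
+        latter case).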
+ """ + X = validate_data( + self, X, copy=self.copy, dtype=np.float64, force_writeable=True + ) + + n_samples, n_features = X.shape + n_components = self.n_components + if n_components is None: + n_components = n_features + + self.mean_ = np.mean(X, axis=0) + X -= self.mean_ + + # some constant terms + nsqrt = sqrt(n_samples) + llconst = n_features * log(2.0 * np.pi) + n_components + var = np.var(X, axis=0) + + if self.noise_variance_init is None: + psi = np.ones(n_features, dtype=X.dtype) + else: + if len(self.noise_variance_init) != n_features: + raise ValueError( + "noise_variance_init dimension does not " + "with number of features : %d != %d" + % (len(self.noise_variance_init), n_features) + ) + psi = np.array(self.noise_variance_init) + + loglike = [] + old_ll = -np.inf + SMALL = 1e-12 + + # we'll modify svd outputs to return unexplained variance + # to allow for unified computation of loglikelihood + if self.svd_method == "lapack": + + def my_svd(X): + _, s, Vt = linalg.svd(X, full_matrices=False, check_finite=False) + return ( + s[:n_components], + Vt[:n_components], + squared_norm(s[n_components:]), + ) + + else: # svd_method == "randomized" + random_state = check_random_state(self.random_state) + + def my_svd(X): + _, s, Vt = _randomized_svd( + X, + n_components, + random_state=random_state, + n_iter=self.iterated_power, + ) + return s, Vt, squared_norm(X) - squared_norm(s) + + for i in range(self.max_iter): + # SMALL helps numerics + sqrt_psi = np.sqrt(psi) + SMALL + s, Vt, unexp_var = my_svd(X / (sqrt_psi * nsqrt)) + s **= 2 + # Use 'maximum' here to avoid sqrt problems. + W = np.sqrt(np.maximum(s - 1.0, 0.0))[:, np.newaxis] * Vt + del Vt + W *= sqrt_psi + + # loglikelihood + ll = llconst + np.sum(np.log(s)) + ll += unexp_var + np.sum(np.log(psi)) + ll *= -n_samples / 2.0 + loglike.append(ll) + if (ll - old_ll) < self.tol: + break + old_ll = ll + + psi = np.maximum(var - np.sum(W**2, axis=0), SMALL) + else: + warnings.warn( + "FactorAnalysis did not converge." + " You might want" + " to increase the number of iterations.", + ConvergenceWarning, + ) + + self.components_ = W + if self.rotation is not None: + self.components_ = self._rotate(W) + self.noise_variance_ = psi + self.loglike_ = loglike + self.n_iter_ = i + 1 + return self + + def transform(self, X): + """Apply dimensionality reduction to X using the model. + + Compute the expected mean of the latent variables. + See Barber, 21.2.33 (or Bishop, 12.66). + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data. + + Returns + ------- + X_new : ndarray of shape (n_samples, n_components) + The latent variables of X. + """ + check_is_fitted(self) + + X = validate_data(self, X, reset=False) + Ih = np.eye(len(self.components_)) + + X_transformed = X - self.mean_ + + Wpsi = self.components_ / self.noise_variance_ + cov_z = linalg.inv(Ih + np.dot(Wpsi, self.components_.T)) + tmp = np.dot(X_transformed, Wpsi.T) + X_transformed = np.dot(tmp, cov_z) + + return X_transformed + + def get_covariance(self): + """Compute data covariance with the FactorAnalysis model. + + ``cov = components_.T * components_ + diag(noise_variance)`` + + Returns + ------- + cov : ndarray of shape (n_features, n_features) + Estimated covariance of data. + """ + check_is_fitted(self) + + cov = np.dot(self.components_.T, self.components_) + cov.flat[:: len(cov) + 1] += self.noise_variance_ # modify diag inplace + return cov + + def get_precision(self): + """Compute data precision matrix with the FactorAnalysis model. 
+ + Returns + ------- + precision : ndarray of shape (n_features, n_features) + Estimated precision of data. + """ + check_is_fitted(self) + + n_features = self.components_.shape[1] + + # handle corner cases first + if self.n_components == 0: + return np.diag(1.0 / self.noise_variance_) + if self.n_components == n_features: + return linalg.inv(self.get_covariance()) + + # Get precision using matrix inversion lemma + components_ = self.components_ + precision = np.dot(components_ / self.noise_variance_, components_.T) + precision.flat[:: len(precision) + 1] += 1.0 + precision = np.dot(components_.T, np.dot(linalg.inv(precision), components_)) + precision /= self.noise_variance_[:, np.newaxis] + precision /= -self.noise_variance_[np.newaxis, :] + precision.flat[:: len(precision) + 1] += 1.0 / self.noise_variance_ + return precision + + def score_samples(self, X): + """Compute the log-likelihood of each sample. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + The data. + + Returns + ------- + ll : ndarray of shape (n_samples,) + Log-likelihood of each sample under the current model. + """ + check_is_fitted(self) + X = validate_data(self, X, reset=False) + Xr = X - self.mean_ + precision = self.get_precision() + n_features = X.shape[1] + log_like = -0.5 * (Xr * (np.dot(Xr, precision))).sum(axis=1) + log_like -= 0.5 * (n_features * log(2.0 * np.pi) - fast_logdet(precision)) + return log_like + + def score(self, X, y=None): + """Compute the average log-likelihood of the samples. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + The data. + + y : Ignored + Ignored parameter. + + Returns + ------- + ll : float + Average log-likelihood of the samples under the current model. + """ + return np.mean(self.score_samples(X)) + + def _rotate(self, components, n_components=None, tol=1e-6): + "Rotate the factor analysis solution." + # note that tol is not exposed + return _ortho_rotation(components.T, method=self.rotation, tol=tol)[ + : self.n_components + ] + + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] + + +def _ortho_rotation(components, method="varimax", tol=1e-6, max_iter=100): + """Return rotated components.""" + nrow, ncol = components.shape + rotation_matrix = np.eye(ncol) + var = 0 + + for _ in range(max_iter): + comp_rot = np.dot(components, rotation_matrix) + if method == "varimax": + tmp = comp_rot * np.transpose((comp_rot**2).sum(axis=0) / nrow) + elif method == "quartimax": + tmp = 0 + u, s, v = np.linalg.svd(np.dot(components.T, comp_rot**3 - tmp)) + rotation_matrix = np.dot(u, v) + var_new = np.sum(s) + if var != 0 and var_new < var * (1 + tol): + break + var = var_new + + return np.dot(components, rotation_matrix).T diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/_fastica.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_fastica.py new file mode 100644 index 0000000000000000000000000000000000000000..efda7bfca56b60f361d6bafa1edf0d66effe3ef6 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_fastica.py @@ -0,0 +1,804 @@ +""" +Python implementation of the fast ICA algorithms. + +Reference: Tables 8.3 and 8.4 page 196 in the book: +Independent Component Analysis, by Hyvarinen et al. 
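+
+A rough NumPy sketch of the parallel fixed-point update implemented in this
+module (purely illustrative, with synthetic data that is centered and whitened
+first; the real code below adds tolerance checks and more careful bookkeeping):
+
+>>> import numpy as np
+>>> rng = np.random.RandomState(0)
+>>> S = rng.uniform(-1, 1, size=(2, 1000))            # independent non-Gaussian sources
+>>> X = np.array([[1.0, 0.5], [0.5, 1.0]]) @ S        # observed mixtures
+>>> X -= X.mean(axis=1, keepdims=True)
+>>> d, E = np.linalg.eigh(np.cov(X))
+>>> X_white = (E / np.sqrt(d)).T @ X                  # whiten to unit covariance
+>>> W = np.linalg.qr(rng.standard_normal((2, 2)))[0]  # random orthonormal start
+>>> for _ in range(50):
+...     gwtx = np.tanh(W @ X_white)                   # logcosh non-linearity
+...     g_wtx = (1.0 - gwtx ** 2).mean(axis=1)        # mean of its derivative
+...     W1 = gwtx @ X_white.T / X_white.shape[1] - g_wtx[:, np.newaxis] * W
+...     s, u = np.linalg.eigh(W1 @ W1.T)              # symmetric decorrelation
+...     W = (u * (1.0 / np.sqrt(s))) @ u.T @ W1
+>>> assert np.allclose(W @ W.T, np.eye(2))            # unmixing matrix stays orthonormal
+
+``W @ X_white`` then approximates the sources up to sign and permutation.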
+""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from numbers import Integral, Real + +import numpy as np +from scipy import linalg + +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + TransformerMixin, + _fit_context, +) +from ..exceptions import ConvergenceWarning +from ..utils import as_float_array, check_array, check_random_state +from ..utils._param_validation import Interval, Options, StrOptions, validate_params +from ..utils.validation import check_is_fitted, validate_data + +__all__ = ["FastICA", "fastica"] + + +def _gs_decorrelation(w, W, j): + """ + Orthonormalize w wrt the first j rows of W. + + Parameters + ---------- + w : ndarray of shape (n,) + Array to be orthogonalized + + W : ndarray of shape (p, n) + Null space definition + + j : int < p + The no of (from the first) rows of Null space W wrt which w is + orthogonalized. + + Notes + ----- + Assumes that W is orthogonal + w changed in place + """ + w -= np.linalg.multi_dot([w, W[:j].T, W[:j]]) + return w + + +def _sym_decorrelation(W): + """Symmetric decorrelation + i.e. W <- (W * W.T) ^{-1/2} * W + """ + s, u = linalg.eigh(np.dot(W, W.T)) + # Avoid sqrt of negative values because of rounding errors. Note that + # np.sqrt(tiny) is larger than tiny and therefore this clipping also + # prevents division by zero in the next step. + s = np.clip(s, a_min=np.finfo(W.dtype).tiny, a_max=None) + + # u (resp. s) contains the eigenvectors (resp. square roots of + # the eigenvalues) of W * W.T + return np.linalg.multi_dot([u * (1.0 / np.sqrt(s)), u.T, W]) + + +def _ica_def(X, tol, g, fun_args, max_iter, w_init): + """Deflationary FastICA using fun approx to neg-entropy function + + Used internally by FastICA. + """ + + n_components = w_init.shape[0] + W = np.zeros((n_components, n_components), dtype=X.dtype) + n_iter = [] + + # j is the index of the extracted component + for j in range(n_components): + w = w_init[j, :].copy() + w /= np.sqrt((w**2).sum()) + + for i in range(max_iter): + gwtx, g_wtx = g(np.dot(w.T, X), fun_args) + + w1 = (X * gwtx).mean(axis=1) - g_wtx.mean() * w + + _gs_decorrelation(w1, W, j) + + w1 /= np.sqrt((w1**2).sum()) + + lim = np.abs(np.abs((w1 * w).sum()) - 1) + w = w1 + if lim < tol: + break + + n_iter.append(i + 1) + W[j, :] = w + + return W, max(n_iter) + + +def _ica_par(X, tol, g, fun_args, max_iter, w_init): + """Parallel FastICA. + + Used internally by FastICA --main loop + + """ + W = _sym_decorrelation(w_init) + del w_init + p_ = float(X.shape[1]) + for ii in range(max_iter): + gwtx, g_wtx = g(np.dot(W, X), fun_args) + W1 = _sym_decorrelation(np.dot(gwtx, X.T) / p_ - g_wtx[:, np.newaxis] * W) + del gwtx, g_wtx + # builtin max, abs are faster than numpy counter parts. + # np.einsum allows having the lowest memory footprint. + # It is faster than np.diag(np.dot(W1, W.T)). + lim = max(abs(abs(np.einsum("ij,ij->i", W1, W)) - 1)) + W = W1 + if lim < tol: + break + else: + warnings.warn( + ( + "FastICA did not converge. Consider increasing " + "tolerance or the maximum number of iterations." + ), + ConvergenceWarning, + ) + + return W, ii + 1 + + +# Some standard non-linear functions. +# XXX: these should be optimized, as they can be a bottleneck. +def _logcosh(x, fun_args=None): + alpha = fun_args.get("alpha", 1.0) # comment it out? 
+ + x *= alpha + gx = np.tanh(x, x) # apply the tanh inplace + g_x = np.empty(x.shape[0], dtype=x.dtype) + # XXX compute in chunks to avoid extra allocation + for i, gx_i in enumerate(gx): # please don't vectorize. + g_x[i] = (alpha * (1 - gx_i**2)).mean() + return gx, g_x + + +def _exp(x, fun_args): + exp = np.exp(-(x**2) / 2) + gx = x * exp + g_x = (1 - x**2) * exp + return gx, g_x.mean(axis=-1) + + +def _cube(x, fun_args): + return x**3, (3 * x**2).mean(axis=-1) + + +@validate_params( + { + "X": ["array-like"], + "return_X_mean": ["boolean"], + "compute_sources": ["boolean"], + "return_n_iter": ["boolean"], + }, + prefer_skip_nested_validation=False, +) +def fastica( + X, + n_components=None, + *, + algorithm="parallel", + whiten="unit-variance", + fun="logcosh", + fun_args=None, + max_iter=200, + tol=1e-04, + w_init=None, + whiten_solver="svd", + random_state=None, + return_X_mean=False, + compute_sources=True, + return_n_iter=False, +): + """Perform Fast Independent Component Analysis. + + The implementation is based on [1]_. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the number of features. + + n_components : int, default=None + Number of components to use. If None is passed, all are used. + + algorithm : {'parallel', 'deflation'}, default='parallel' + Specify which algorithm to use for FastICA. + + whiten : str or bool, default='unit-variance' + Specify the whitening strategy to use. + + - If 'arbitrary-variance', a whitening with variance + arbitrary is used. + - If 'unit-variance', the whitening matrix is rescaled to ensure that + each recovered source has unit variance. + - If False, the data is already considered to be whitened, and no + whitening is performed. + + .. versionchanged:: 1.3 + The default value of `whiten` changed to 'unit-variance' in 1.3. + + fun : {'logcosh', 'exp', 'cube'} or callable, default='logcosh' + The functional form of the G function used in the + approximation to neg-entropy. Could be either 'logcosh', 'exp', + or 'cube'. + You can also provide your own function. It should return a tuple + containing the value of the function, and of its derivative, in the + point. The derivative should be averaged along its last dimension. + Example:: + + def my_g(x): + return x ** 3, (3 * x ** 2).mean(axis=-1) + + fun_args : dict, default=None + Arguments to send to the functional form. + If empty or None and if fun='logcosh', fun_args will take value + {'alpha' : 1.0}. + + max_iter : int, default=200 + Maximum number of iterations to perform. + + tol : float, default=1e-4 + A positive scalar giving the tolerance at which the + un-mixing matrix is considered to have converged. + + w_init : ndarray of shape (n_components, n_components), default=None + Initial un-mixing array. If `w_init=None`, then an array of values + drawn from a normal distribution is used. + + whiten_solver : {"eigh", "svd"}, default="svd" + The solver to use for whitening. + + - "svd" is more stable numerically if the problem is degenerate, and + often faster when `n_samples <= n_features`. + + - "eigh" is generally more memory efficient when + `n_samples >= n_features`, and can be faster when + `n_samples >= 50 * n_features`. + + .. versionadded:: 1.2 + + random_state : int, RandomState instance or None, default=None + Used to initialize ``w_init`` when not specified, with a + normal distribution. 
Pass an int, for reproducible results + across multiple function calls. + See :term:`Glossary `. + + return_X_mean : bool, default=False + If True, X_mean is returned too. + + compute_sources : bool, default=True + If False, sources are not computed, but only the rotation matrix. + This can save memory when working with big data. Defaults to True. + + return_n_iter : bool, default=False + Whether or not to return the number of iterations. + + Returns + ------- + K : ndarray of shape (n_components, n_features) or None + If whiten is 'True', K is the pre-whitening matrix that projects data + onto the first n_components principal components. If whiten is 'False', + K is 'None'. + + W : ndarray of shape (n_components, n_components) + The square matrix that unmixes the data after whitening. + The mixing matrix is the pseudo-inverse of matrix ``W K`` + if K is not None, else it is the inverse of W. + + S : ndarray of shape (n_samples, n_components) or None + Estimated source matrix. + + X_mean : ndarray of shape (n_features,) + The mean over features. Returned only if return_X_mean is True. + + n_iter : int + If the algorithm is "deflation", n_iter is the + maximum number of iterations run across all components. Else + they are just the number of iterations taken to converge. This is + returned only when return_n_iter is set to `True`. + + Notes + ----- + The data matrix X is considered to be a linear combination of + non-Gaussian (independent) components i.e. X = AS where columns of S + contain the independent components and A is a linear mixing + matrix. In short ICA attempts to `un-mix' the data by estimating an + un-mixing matrix W where ``S = W K X.`` + While FastICA was proposed to estimate as many sources + as features, it is possible to estimate less by setting + n_components < n_features. It this case K is not a square matrix + and the estimated A is the pseudo-inverse of ``W K``. + + This implementation was originally made for data of shape + [n_features, n_samples]. Now the input is transposed + before the algorithm is applied. This makes it slightly + faster for Fortran-ordered input. + + References + ---------- + .. [1] A. Hyvarinen and E. Oja, "Fast Independent Component Analysis", + Algorithms and Applications, Neural Networks, 13(4-5), 2000, + pp. 411-430. + + Examples + -------- + >>> from sklearn.datasets import load_digits + >>> from sklearn.decomposition import fastica + >>> X, _ = load_digits(return_X_y=True) + >>> K, W, S = fastica(X, n_components=7, random_state=0, whiten='unit-variance') + >>> K.shape + (7, 64) + >>> W.shape + (7, 7) + >>> S.shape + (1797, 7) + """ + est = FastICA( + n_components=n_components, + algorithm=algorithm, + whiten=whiten, + fun=fun, + fun_args=fun_args, + max_iter=max_iter, + tol=tol, + w_init=w_init, + whiten_solver=whiten_solver, + random_state=random_state, + ) + est._validate_params() + S = est._fit_transform(X, compute_sources=compute_sources) + + if est.whiten in ["unit-variance", "arbitrary-variance"]: + K = est.whitening_ + X_mean = est.mean_ + else: + K = None + X_mean = None + + returned_values = [K, est._unmixing, S] + if return_X_mean: + returned_values.append(X_mean) + if return_n_iter: + returned_values.append(est.n_iter_) + + return returned_values + + +class FastICA(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): + """FastICA: a fast algorithm for Independent Component Analysis. + + The implementation is based on [1]_. + + Read more in the :ref:`User Guide `. 
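+
+    As an illustrative check of the functional interface documented above (the
+    data below is synthetic and not part of the upstream docstring), the
+    returned pieces satisfy ``S = (X - X_mean) @ (W K).T``:
+
+    >>> import numpy as np
+    >>> from sklearn.decomposition import fastica
+    >>> rng = np.random.RandomState(0)
+    >>> S_true = rng.uniform(-1, 1, size=(1000, 2))       # independent sources
+    >>> X = S_true @ np.array([[1.0, 0.5], [0.5, 1.0]])   # mixed observations
+    >>> K, W, S, X_mean = fastica(X, random_state=0, return_X_mean=True)
+    >>> S.shape
+    (1000, 2)
+    >>> assert np.allclose(S, (X - X_mean) @ (W @ K).T)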
+ + Parameters + ---------- + n_components : int, default=None + Number of components to use. If None is passed, all are used. + + algorithm : {'parallel', 'deflation'}, default='parallel' + Specify which algorithm to use for FastICA. + + whiten : str or bool, default='unit-variance' + Specify the whitening strategy to use. + + - If 'arbitrary-variance', a whitening with variance + arbitrary is used. + - If 'unit-variance', the whitening matrix is rescaled to ensure that + each recovered source has unit variance. + - If False, the data is already considered to be whitened, and no + whitening is performed. + + .. versionchanged:: 1.3 + The default value of `whiten` changed to 'unit-variance' in 1.3. + + fun : {'logcosh', 'exp', 'cube'} or callable, default='logcosh' + The functional form of the G function used in the + approximation to neg-entropy. Could be either 'logcosh', 'exp', + or 'cube'. + You can also provide your own function. It should return a tuple + containing the value of the function, and of its derivative, in the + point. The derivative should be averaged along its last dimension. + Example:: + + def my_g(x): + return x ** 3, (3 * x ** 2).mean(axis=-1) + + fun_args : dict, default=None + Arguments to send to the functional form. + If empty or None and if fun='logcosh', fun_args will take value + {'alpha' : 1.0}. + + max_iter : int, default=200 + Maximum number of iterations during fit. + + tol : float, default=1e-4 + A positive scalar giving the tolerance at which the + un-mixing matrix is considered to have converged. + + w_init : array-like of shape (n_components, n_components), default=None + Initial un-mixing array. If `w_init=None`, then an array of values + drawn from a normal distribution is used. + + whiten_solver : {"eigh", "svd"}, default="svd" + The solver to use for whitening. + + - "svd" is more stable numerically if the problem is degenerate, and + often faster when `n_samples <= n_features`. + + - "eigh" is generally more memory efficient when + `n_samples >= n_features`, and can be faster when + `n_samples >= 50 * n_features`. + + .. versionadded:: 1.2 + + random_state : int, RandomState instance or None, default=None + Used to initialize ``w_init`` when not specified, with a + normal distribution. Pass an int, for reproducible results + across multiple function calls. + See :term:`Glossary `. + + Attributes + ---------- + components_ : ndarray of shape (n_components, n_features) + The linear operator to apply to the data to get the independent + sources. This is equal to the unmixing matrix when ``whiten`` is + False, and equal to ``np.dot(unmixing_matrix, self.whitening_)`` when + ``whiten`` is True. + + mixing_ : ndarray of shape (n_features, n_components) + The pseudo-inverse of ``components_``. It is the linear operator + that maps independent sources to the data. + + mean_ : ndarray of shape(n_features,) + The mean over features. Only set if `self.whiten` is True. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + If the algorithm is "deflation", n_iter is the + maximum number of iterations run across all components. Else + they are just the number of iterations taken to converge. + + whitening_ : ndarray of shape (n_components, n_features) + Only set if whiten is 'True'. 
This is the pre-whitening matrix + that projects data onto the first `n_components` principal components. + + See Also + -------- + PCA : Principal component analysis (PCA). + IncrementalPCA : Incremental principal components analysis (IPCA). + KernelPCA : Kernel Principal component analysis (KPCA). + MiniBatchSparsePCA : Mini-batch Sparse Principal Components Analysis. + SparsePCA : Sparse Principal Components Analysis (SparsePCA). + + References + ---------- + .. [1] A. Hyvarinen and E. Oja, Independent Component Analysis: + Algorithms and Applications, Neural Networks, 13(4-5), 2000, + pp. 411-430. + + Examples + -------- + >>> from sklearn.datasets import load_digits + >>> from sklearn.decomposition import FastICA + >>> X, _ = load_digits(return_X_y=True) + >>> transformer = FastICA(n_components=7, + ... random_state=0, + ... whiten='unit-variance') + >>> X_transformed = transformer.fit_transform(X) + >>> X_transformed.shape + (1797, 7) + """ + + _parameter_constraints: dict = { + "n_components": [Interval(Integral, 1, None, closed="left"), None], + "algorithm": [StrOptions({"parallel", "deflation"})], + "whiten": [ + StrOptions({"arbitrary-variance", "unit-variance"}), + Options(bool, {False}), + ], + "fun": [StrOptions({"logcosh", "exp", "cube"}), callable], + "fun_args": [dict, None], + "max_iter": [Interval(Integral, 1, None, closed="left")], + "tol": [Interval(Real, 0.0, None, closed="left")], + "w_init": ["array-like", None], + "whiten_solver": [StrOptions({"eigh", "svd"})], + "random_state": ["random_state"], + } + + def __init__( + self, + n_components=None, + *, + algorithm="parallel", + whiten="unit-variance", + fun="logcosh", + fun_args=None, + max_iter=200, + tol=1e-4, + w_init=None, + whiten_solver="svd", + random_state=None, + ): + super().__init__() + self.n_components = n_components + self.algorithm = algorithm + self.whiten = whiten + self.fun = fun + self.fun_args = fun_args + self.max_iter = max_iter + self.tol = tol + self.w_init = w_init + self.whiten_solver = whiten_solver + self.random_state = random_state + + def _fit_transform(self, X, compute_sources=False): + """Fit the model. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + compute_sources : bool, default=False + If False, sources are not computes but only the rotation matrix. + This can save memory when working with big data. Defaults to False. + + Returns + ------- + S : ndarray of shape (n_samples, n_components) or None + Sources matrix. `None` if `compute_sources` is `False`. 
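+
+        As a small illustrative check (synthetic data, not part of the upstream
+        docstring), the fitted ``mixing_`` attribute is the pseudo-inverse of
+        ``components_``, as stated in the class docstring above:
+
+        >>> import numpy as np
+        >>> from sklearn.decomposition import FastICA
+        >>> rng = np.random.RandomState(0)
+        >>> S = rng.uniform(-1, 1, size=(1000, 2))
+        >>> X = S @ np.array([[1.0, 0.5], [0.5, 1.0]])
+        >>> ica = FastICA(random_state=0).fit(X)
+        >>> assert np.allclose(ica.mixing_, np.linalg.pinv(ica.components_))
+        >>> ica.components_.shape
+        (2, 2)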
+ """ + XT = validate_data( + self, + X, + copy=self.whiten, + dtype=[np.float64, np.float32], + ensure_min_samples=2, + ).T + fun_args = {} if self.fun_args is None else self.fun_args + random_state = check_random_state(self.random_state) + + alpha = fun_args.get("alpha", 1.0) + if not 1 <= alpha <= 2: + raise ValueError("alpha must be in [1,2]") + + if self.fun == "logcosh": + g = _logcosh + elif self.fun == "exp": + g = _exp + elif self.fun == "cube": + g = _cube + elif callable(self.fun): + + def g(x, fun_args): + return self.fun(x, **fun_args) + + n_features, n_samples = XT.shape + n_components = self.n_components + if not self.whiten and n_components is not None: + n_components = None + warnings.warn("Ignoring n_components with whiten=False.") + + if n_components is None: + n_components = min(n_samples, n_features) + if n_components > min(n_samples, n_features): + n_components = min(n_samples, n_features) + warnings.warn( + "n_components is too large: it will be set to %s" % n_components + ) + + if self.whiten: + # Centering the features of X + X_mean = XT.mean(axis=-1) + XT -= X_mean[:, np.newaxis] + + # Whitening and preprocessing by PCA + if self.whiten_solver == "eigh": + # Faster when num_samples >> n_features + d, u = linalg.eigh(XT.dot(X)) + sort_indices = np.argsort(d)[::-1] + eps = np.finfo(d.dtype).eps * 10 + degenerate_idx = d < eps + if np.any(degenerate_idx): + warnings.warn( + "There are some small singular values, using " + "whiten_solver = 'svd' might lead to more " + "accurate results." + ) + d[degenerate_idx] = eps # For numerical issues + np.sqrt(d, out=d) + d, u = d[sort_indices], u[:, sort_indices] + elif self.whiten_solver == "svd": + u, d = linalg.svd(XT, full_matrices=False, check_finite=False)[:2] + + # Give consistent eigenvectors for both svd solvers + u *= np.sign(u[0]) + + K = (u / d).T[:n_components] # see (6.33) p.140 + del u, d + X1 = np.dot(K, XT) + # see (13.6) p.267 Here X1 is white and data + # in X has been projected onto a subspace by PCA + X1 *= np.sqrt(n_samples) + else: + # X must be casted to floats to avoid typing issues with numpy + # 2.0 and the line below + X1 = as_float_array(XT, copy=False) # copy has been taken care of + + w_init = self.w_init + if w_init is None: + w_init = np.asarray( + random_state.normal(size=(n_components, n_components)), dtype=X1.dtype + ) + + else: + w_init = np.asarray(w_init) + if w_init.shape != (n_components, n_components): + raise ValueError( + "w_init has invalid shape -- should be %(shape)s" + % {"shape": (n_components, n_components)} + ) + + kwargs = { + "tol": self.tol, + "g": g, + "fun_args": fun_args, + "max_iter": self.max_iter, + "w_init": w_init, + } + + if self.algorithm == "parallel": + W, n_iter = _ica_par(X1, **kwargs) + elif self.algorithm == "deflation": + W, n_iter = _ica_def(X1, **kwargs) + del X1 + + self.n_iter_ = n_iter + + if compute_sources: + if self.whiten: + S = np.linalg.multi_dot([W, K, XT]).T + else: + S = np.dot(W, XT).T + else: + S = None + + if self.whiten: + if self.whiten == "unit-variance": + if not compute_sources: + S = np.linalg.multi_dot([W, K, XT]).T + S_std = np.std(S, axis=0, keepdims=True) + S /= S_std + W /= S_std.T + + self.components_ = np.dot(W, K) + self.mean_ = X_mean + self.whitening_ = K + else: + self.components_ = W + + self.mixing_ = linalg.pinv(self.components_, check_finite=False) + self._unmixing = W + + return S + + @_fit_context(prefer_skip_nested_validation=True) + def fit_transform(self, X, y=None): + """Fit the model and recover the sources from 
X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + X_new : ndarray of shape (n_samples, n_components) + Estimated sources obtained by transforming the data with the + estimated unmixing matrix. + """ + return self._fit_transform(X, compute_sources=True) + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit the model to X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Returns the instance itself. + """ + self._fit_transform(X, compute_sources=False) + return self + + def transform(self, X, copy=True): + """Recover the sources from X (apply the unmixing matrix). + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Data to transform, where `n_samples` is the number of samples + and `n_features` is the number of features. + + copy : bool, default=True + If False, data passed to fit can be overwritten. Defaults to True. + + Returns + ------- + X_new : ndarray of shape (n_samples, n_components) + Estimated sources obtained by transforming the data with the + estimated unmixing matrix. + """ + check_is_fitted(self) + + X = validate_data( + self, + X, + copy=(copy and self.whiten), + dtype=[np.float64, np.float32], + reset=False, + ) + if self.whiten: + X -= self.mean_ + + return np.dot(X, self.components_.T) + + def inverse_transform(self, X, copy=True): + """Transform the sources back to the mixed data (apply mixing matrix). + + Parameters + ---------- + X : array-like of shape (n_samples, n_components) + Sources, where `n_samples` is the number of samples + and `n_components` is the number of components. + copy : bool, default=True + If False, data passed to fit are overwritten. Defaults to True. + + Returns + ------- + X_original : ndarray of shape (n_samples, n_features) + Reconstructed data obtained with the mixing matrix. 
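+
+        As an illustrative sketch (synthetic data, not from the upstream
+        docstring): when no dimensionality is discarded, ``inverse_transform``
+        undoes ``transform`` up to floating point error:
+
+        >>> import numpy as np
+        >>> from sklearn.decomposition import FastICA
+        >>> rng = np.random.RandomState(0)
+        >>> S = rng.uniform(-1, 1, size=(500, 2))
+        >>> X = S @ np.array([[2.0, 1.0], [1.0, 2.0]])
+        >>> ica = FastICA(random_state=0).fit(X)
+        >>> X_back = ica.inverse_transform(ica.transform(X))
+        >>> assert np.allclose(X_back, X)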
+ """ + check_is_fitted(self) + + X = check_array(X, copy=(copy and self.whiten), dtype=[np.float64, np.float32]) + X = np.dot(X, self.mixing_.T) + if self.whiten: + X += self.mean_ + + return X + + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.transformer_tags.preserves_dtype = ["float64", "float32"] + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/_incremental_pca.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_incremental_pca.py new file mode 100644 index 0000000000000000000000000000000000000000..da617ef8fa787402810e17a563ce3152b5e1da89 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_incremental_pca.py @@ -0,0 +1,426 @@ +"""Incremental Principal Components Analysis.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from numbers import Integral + +import numpy as np +from scipy import linalg, sparse + +from sklearn.utils import metadata_routing + +from ..base import _fit_context +from ..utils import gen_batches +from ..utils._param_validation import Interval +from ..utils.extmath import _incremental_mean_and_var, svd_flip +from ..utils.validation import validate_data +from ._base import _BasePCA + + +class IncrementalPCA(_BasePCA): + """Incremental principal components analysis (IPCA). + + Linear dimensionality reduction using Singular Value Decomposition of + the data, keeping only the most significant singular vectors to + project the data to a lower dimensional space. The input data is centered + but not scaled for each feature before applying the SVD. + + Depending on the size of the input data, this algorithm can be much more + memory efficient than a PCA, and allows sparse input. + + This algorithm has constant memory complexity, on the order + of ``batch_size * n_features``, enabling use of np.memmap files without + loading the entire file into memory. For sparse matrices, the input + is converted to dense in batches (in order to be able to subtract the + mean) which avoids storing the entire dense matrix at any one time. + + The computational overhead of each SVD is + ``O(batch_size * n_features ** 2)``, but only 2 * batch_size samples + remain in memory at a time. There will be ``n_samples / batch_size`` SVD + computations to get the principal components, versus 1 large SVD of + complexity ``O(n_samples * n_features ** 2)`` for PCA. + + For a usage example, see + :ref:`sphx_glr_auto_examples_decomposition_plot_incremental_pca.py`. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.16 + + Parameters + ---------- + n_components : int, default=None + Number of components to keep. If ``n_components`` is ``None``, + then ``n_components`` is set to ``min(n_samples, n_features)``. + + whiten : bool, default=False + When True (False by default) the ``components_`` vectors are divided + by ``n_samples`` times ``components_`` to ensure uncorrelated outputs + with unit component-wise variances. + + Whitening will remove some information from the transformed signal + (the relative variance scales of the components) but can sometimes + improve the predictive accuracy of the downstream estimators by + making data respect some hard-wired assumptions. + + copy : bool, default=True + If False, X will be overwritten. ``copy=False`` can be used to + save memory but is unsafe for general use. 
+ + batch_size : int, default=None + The number of samples to use for each batch. Only used when calling + ``fit``. If ``batch_size`` is ``None``, then ``batch_size`` + is inferred from the data and set to ``5 * n_features``, to provide a + balance between approximation accuracy and memory consumption. + + Attributes + ---------- + components_ : ndarray of shape (n_components, n_features) + Principal axes in feature space, representing the directions of + maximum variance in the data. Equivalently, the right singular + vectors of the centered input data, parallel to its eigenvectors. + The components are sorted by decreasing ``explained_variance_``. + + explained_variance_ : ndarray of shape (n_components,) + Variance explained by each of the selected components. + + explained_variance_ratio_ : ndarray of shape (n_components,) + Percentage of variance explained by each of the selected components. + If all components are stored, the sum of explained variances is equal + to 1.0. + + singular_values_ : ndarray of shape (n_components,) + The singular values corresponding to each of the selected components. + The singular values are equal to the 2-norms of the ``n_components`` + variables in the lower-dimensional space. + + mean_ : ndarray of shape (n_features,) + Per-feature empirical mean, aggregate over calls to ``partial_fit``. + + var_ : ndarray of shape (n_features,) + Per-feature empirical variance, aggregate over calls to + ``partial_fit``. + + noise_variance_ : float + The estimated noise covariance following the Probabilistic PCA model + from Tipping and Bishop 1999. See "Pattern Recognition and + Machine Learning" by C. Bishop, 12.2.1 p. 574 or + http://www.miketipping.com/papers/met-mppca.pdf. + + n_components_ : int + The estimated number of components. Relevant when + ``n_components=None``. + + n_samples_seen_ : int + The number of samples processed by the estimator. Will be reset on + new calls to fit, but increments across ``partial_fit`` calls. + + batch_size_ : int + Inferred batch size from ``batch_size``. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + PCA : Principal component analysis (PCA). + KernelPCA : Kernel Principal component analysis (KPCA). + SparsePCA : Sparse Principal Components Analysis (SparsePCA). + TruncatedSVD : Dimensionality reduction using truncated SVD. + + Notes + ----- + Implements the incremental PCA model from: + *D. Ross, J. Lim, R. Lin, M. Yang, Incremental Learning for Robust Visual + Tracking, International Journal of Computer Vision, Volume 77, Issue 1-3, + pp. 125-141, May 2008.* + See https://www.cs.toronto.edu/~dross/ivt/RossLimLinYang_ijcv.pdf + + This model is an extension of the Sequential Karhunen-Loeve Transform from: + :doi:`A. Levy and M. Lindenbaum, Sequential Karhunen-Loeve Basis Extraction and + its Application to Images, IEEE Transactions on Image Processing, Volume 9, + Number 8, pp. 1371-1374, August 2000. <10.1109/83.855432>` + + We have specifically abstained from an optimization used by authors of both + papers, a QR decomposition used in specific situations to reduce the + algorithmic complexity of the SVD. The source for this technique is + *Matrix Computations, Third Edition, G. Holub and C. Van Loan, Chapter 5, + section 5.4.4, pp 252-253.*. 
This technique has been omitted because it is + advantageous only when decomposing a matrix with ``n_samples`` (rows) + >= 5/3 * ``n_features`` (columns), and hurts the readability of the + implemented algorithm. This would be a good opportunity for future + optimization, if it is deemed necessary. + + References + ---------- + D. Ross, J. Lim, R. Lin, M. Yang. Incremental Learning for Robust Visual + Tracking, International Journal of Computer Vision, Volume 77, + Issue 1-3, pp. 125-141, May 2008. + + G. Golub and C. Van Loan. Matrix Computations, Third Edition, Chapter 5, + Section 5.4.4, pp. 252-253. + + Examples + -------- + >>> from sklearn.datasets import load_digits + >>> from sklearn.decomposition import IncrementalPCA + >>> from scipy import sparse + >>> X, _ = load_digits(return_X_y=True) + >>> transformer = IncrementalPCA(n_components=7, batch_size=200) + >>> # either partially fit on smaller batches of data + >>> transformer.partial_fit(X[:100, :]) + IncrementalPCA(batch_size=200, n_components=7) + >>> # or let the fit function itself divide the data into batches + >>> X_sparse = sparse.csr_matrix(X) + >>> X_transformed = transformer.fit_transform(X_sparse) + >>> X_transformed.shape + (1797, 7) + """ + + __metadata_request__partial_fit = {"check_input": metadata_routing.UNUSED} + + _parameter_constraints: dict = { + "n_components": [Interval(Integral, 1, None, closed="left"), None], + "whiten": ["boolean"], + "copy": ["boolean"], + "batch_size": [Interval(Integral, 1, None, closed="left"), None], + } + + def __init__(self, n_components=None, *, whiten=False, copy=True, batch_size=None): + self.n_components = n_components + self.whiten = whiten + self.copy = copy + self.batch_size = batch_size + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit the model with X, using minibatches of size batch_size. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Returns the instance itself. + """ + self.components_ = None + self.n_samples_seen_ = 0 + self.mean_ = 0.0 + self.var_ = 0.0 + self.singular_values_ = None + self.explained_variance_ = None + self.explained_variance_ratio_ = None + self.noise_variance_ = None + + X = validate_data( + self, + X, + accept_sparse=["csr", "csc", "lil"], + copy=self.copy, + dtype=[np.float64, np.float32], + force_writeable=True, + ) + n_samples, n_features = X.shape + + if self.batch_size is None: + self.batch_size_ = 5 * n_features + else: + self.batch_size_ = self.batch_size + + for batch in gen_batches( + n_samples, self.batch_size_, min_batch_size=self.n_components or 0 + ): + X_batch = X[batch] + if sparse.issparse(X_batch): + X_batch = X_batch.toarray() + self.partial_fit(X_batch, check_input=False) + + return self + + @_fit_context(prefer_skip_nested_validation=True) + def partial_fit(self, X, y=None, check_input=True): + """Incremental fit with X. All of X is processed as a single batch. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : Ignored + Not used, present for API consistency by convention. + + check_input : bool, default=True + Run check_array on X. 
+ + Returns + ------- + self : object + Returns the instance itself. + """ + first_pass = not hasattr(self, "components_") + + if check_input: + if sparse.issparse(X): + raise TypeError( + "IncrementalPCA.partial_fit does not support " + "sparse input. Either convert data to dense " + "or use IncrementalPCA.fit to do so in batches." + ) + X = validate_data( + self, + X, + copy=self.copy, + dtype=[np.float64, np.float32], + force_writeable=True, + reset=first_pass, + ) + n_samples, n_features = X.shape + if first_pass: + self.components_ = None + + if self.n_components is None: + if self.components_ is None: + self.n_components_ = min(n_samples, n_features) + else: + self.n_components_ = self.components_.shape[0] + elif not self.n_components <= n_features: + raise ValueError( + "n_components=%r invalid for n_features=%d, need " + "more rows than columns for IncrementalPCA " + "processing" % (self.n_components, n_features) + ) + elif self.n_components > n_samples and first_pass: + raise ValueError( + f"n_components={self.n_components} must be less or equal to " + f"the batch number of samples {n_samples} for the first " + "partial_fit call." + ) + else: + self.n_components_ = self.n_components + + if (self.components_ is not None) and ( + self.components_.shape[0] != self.n_components_ + ): + raise ValueError( + "Number of input features has changed from %i " + "to %i between calls to partial_fit! Try " + "setting n_components to a fixed value." + % (self.components_.shape[0], self.n_components_) + ) + + # This is the first partial_fit + if not hasattr(self, "n_samples_seen_"): + self.n_samples_seen_ = 0 + self.mean_ = 0.0 + self.var_ = 0.0 + + # Update stats - they are 0 if this is the first step + col_mean, col_var, n_total_samples = _incremental_mean_and_var( + X, + last_mean=self.mean_, + last_variance=self.var_, + last_sample_count=np.repeat(self.n_samples_seen_, X.shape[1]), + ) + n_total_samples = n_total_samples[0] + + # Whitening + if self.n_samples_seen_ == 0: + # If it is the first step, simply whiten X + X -= col_mean + else: + col_batch_mean = np.mean(X, axis=0) + X -= col_batch_mean + # Build matrix of combined previous basis and new data + mean_correction = np.sqrt( + (self.n_samples_seen_ / n_total_samples) * n_samples + ) * (self.mean_ - col_batch_mean) + X = np.vstack( + ( + self.singular_values_.reshape((-1, 1)) * self.components_, + X, + mean_correction, + ) + ) + + U, S, Vt = linalg.svd(X, full_matrices=False, check_finite=False) + U, Vt = svd_flip(U, Vt, u_based_decision=False) + explained_variance = S**2 / (n_total_samples - 1) + explained_variance_ratio = S**2 / np.sum(col_var * n_total_samples) + + self.n_samples_seen_ = n_total_samples + self.components_ = Vt[: self.n_components_] + self.singular_values_ = S[: self.n_components_] + self.mean_ = col_mean + self.var_ = col_var + self.explained_variance_ = explained_variance[: self.n_components_] + self.explained_variance_ratio_ = explained_variance_ratio[: self.n_components_] + # we already checked `self.n_components <= n_samples` above + if self.n_components_ not in (n_samples, n_features): + self.noise_variance_ = explained_variance[self.n_components_ :].mean() + else: + self.noise_variance_ = 0.0 + return self + + def transform(self, X): + """Apply dimensionality reduction to X. + + X is projected on the first principal components previously extracted + from a training set, using minibatches of size batch_size if X is + sparse. 
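+
+        As an illustrative sketch (synthetic data, not from the upstream
+        docstring): feeding successive mini-batches to ``partial_fit`` gives
+        the same model as a single ``fit`` with a matching ``batch_size``,
+        since ``fit`` itself just loops over ``partial_fit``:
+
+        >>> import numpy as np
+        >>> from sklearn.decomposition import IncrementalPCA
+        >>> rng = np.random.RandomState(0)
+        >>> X = rng.standard_normal((100, 10))
+        >>> ipca_batched = IncrementalPCA(n_components=3)
+        >>> for chunk in np.array_split(X, 5):        # five batches of 20 samples
+        ...     _ = ipca_batched.partial_fit(chunk)
+        >>> ipca_full = IncrementalPCA(n_components=3, batch_size=20).fit(X)
+        >>> assert np.allclose(ipca_batched.mean_, X.mean(axis=0))
+        >>> assert np.allclose(ipca_batched.components_, ipca_full.components_)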
+ + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + New data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + Returns + ------- + X_new : ndarray of shape (n_samples, n_components) + Projection of X in the first principal components. + + Examples + -------- + + >>> import numpy as np + >>> from sklearn.decomposition import IncrementalPCA + >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], + ... [1, 1], [2, 1], [3, 2]]) + >>> ipca = IncrementalPCA(n_components=2, batch_size=3) + >>> ipca.fit(X) + IncrementalPCA(batch_size=3, n_components=2) + >>> ipca.transform(X) # doctest: +SKIP + """ + if sparse.issparse(X): + n_samples = X.shape[0] + output = [] + for batch in gen_batches( + n_samples, self.batch_size_, min_batch_size=self.n_components or 0 + ): + output.append(super().transform(X[batch].toarray())) + return np.vstack(output) + else: + return super().transform(X) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + # Beware that fit accepts sparse data but partial_fit doesn't + tags.input_tags.sparse = True + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/_kernel_pca.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_kernel_pca.py new file mode 100644 index 0000000000000000000000000000000000000000..cd862079a1682deed3705f42da5672af8ca10acb --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_kernel_pca.py @@ -0,0 +1,579 @@ +"""Kernel Principal Components Analysis.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from numbers import Integral, Real + +import numpy as np +from scipy import linalg +from scipy.linalg import eigh +from scipy.sparse.linalg import eigsh + +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + TransformerMixin, + _fit_context, +) +from ..exceptions import NotFittedError +from ..metrics.pairwise import pairwise_kernels +from ..preprocessing import KernelCenterer +from ..utils._arpack import _init_arpack_v0 +from ..utils._param_validation import Interval, StrOptions +from ..utils.extmath import _randomized_eigsh, svd_flip +from ..utils.validation import ( + _check_psd_eigenvalues, + check_is_fitted, + validate_data, +) + + +class KernelPCA(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): + """Kernel Principal component analysis (KPCA). + + Non-linear dimensionality reduction through the use of kernels [1]_, see also + :ref:`metrics`. + + It uses the :func:`scipy.linalg.eigh` LAPACK implementation of the full SVD + or the :func:`scipy.sparse.linalg.eigsh` ARPACK implementation of the + truncated SVD, depending on the shape of the input data and the number of + components to extract. It can also use a randomized truncated SVD by the + method proposed in [3]_, see `eigen_solver`. + + For a usage example and comparison between + Principal Components Analysis (PCA) and its kernelized version (KPCA), see + :ref:`sphx_glr_auto_examples_decomposition_plot_kernel_pca.py`. + + For a usage example in denoising images using KPCA, see + :ref:`sphx_glr_auto_examples_applications_plot_digits_denoising.py`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int, default=None + Number of components. If None, all non-zero components are kept. + + kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'cosine', 'precomputed'} \ + or callable, default='linear' + Kernel used for PCA. 
+ + gamma : float, default=None + Kernel coefficient for rbf, poly and sigmoid kernels. Ignored by other + kernels. If ``gamma`` is ``None``, then it is set to ``1/n_features``. + + degree : float, default=3 + Degree for poly kernels. Ignored by other kernels. + + coef0 : float, default=1 + Independent term in poly and sigmoid kernels. + Ignored by other kernels. + + kernel_params : dict, default=None + Parameters (keyword arguments) and + values for kernel passed as callable object. + Ignored by other kernels. + + alpha : float, default=1.0 + Hyperparameter of the ridge regression that learns the + inverse transform (when fit_inverse_transform=True). + + fit_inverse_transform : bool, default=False + Learn the inverse transform for non-precomputed kernels + (i.e. learn to find the pre-image of a point). This method is based + on [2]_. + + eigen_solver : {'auto', 'dense', 'arpack', 'randomized'}, \ + default='auto' + Select eigensolver to use. If `n_components` is much + less than the number of training samples, randomized (or arpack to a + smaller extent) may be more efficient than the dense eigensolver. + Randomized SVD is performed according to the method of Halko et al + [3]_. + + auto : + the solver is selected by a default policy based on n_samples + (the number of training samples) and `n_components`: + if the number of components to extract is less than 10 (strict) and + the number of samples is more than 200 (strict), the 'arpack' + method is enabled. Otherwise the exact full eigenvalue + decomposition is computed and optionally truncated afterwards + ('dense' method). + dense : + run exact full eigenvalue decomposition calling the standard + LAPACK solver via `scipy.linalg.eigh`, and select the components + by postprocessing + arpack : + run SVD truncated to n_components calling ARPACK solver using + `scipy.sparse.linalg.eigsh`. It requires strictly + 0 < n_components < n_samples + randomized : + run randomized SVD by the method of Halko et al. [3]_. The current + implementation selects eigenvalues based on their module; therefore + using this method can lead to unexpected results if the kernel is + not positive semi-definite. See also [4]_. + + .. versionchanged:: 1.0 + `'randomized'` was added. + + tol : float, default=0 + Convergence tolerance for arpack. + If 0, optimal value will be chosen by arpack. + + max_iter : int, default=None + Maximum number of iterations for arpack. + If None, optimal value will be chosen by arpack. + + iterated_power : int >= 0, or 'auto', default='auto' + Number of iterations for the power method computed by + svd_solver == 'randomized'. When 'auto', it is set to 7 when + `n_components < 0.1 * min(X.shape)`, other it is set to 4. + + .. versionadded:: 1.0 + + remove_zero_eig : bool, default=False + If True, then all components with zero eigenvalues are removed, so + that the number of components in the output may be < n_components + (and sometimes even zero due to numerical instability). + When n_components is None, this parameter is ignored and components + with zero eigenvalues are removed regardless. + + random_state : int, RandomState instance or None, default=None + Used when ``eigen_solver`` == 'arpack' or 'randomized'. Pass an int + for reproducible results across multiple function calls. + See :term:`Glossary `. + + .. versionadded:: 0.18 + + copy_X : bool, default=True + If True, input X is copied and stored by the model in the `X_fit_` + attribute. 
If no further changes will be done to X, setting + `copy_X=False` saves memory by storing a reference. + + .. versionadded:: 0.18 + + n_jobs : int, default=None + The number of parallel jobs to run. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + .. versionadded:: 0.18 + + Attributes + ---------- + eigenvalues_ : ndarray of shape (n_components,) + Eigenvalues of the centered kernel matrix in decreasing order. + If `n_components` and `remove_zero_eig` are not set, + then all values are stored. + + eigenvectors_ : ndarray of shape (n_samples, n_components) + Eigenvectors of the centered kernel matrix. If `n_components` and + `remove_zero_eig` are not set, then all components are stored. + + dual_coef_ : ndarray of shape (n_samples, n_features) + Inverse transform matrix. Only available when + ``fit_inverse_transform`` is True. + + X_transformed_fit_ : ndarray of shape (n_samples, n_components) + Projection of the fitted data on the kernel principal components. + Only available when ``fit_inverse_transform`` is True. + + X_fit_ : ndarray of shape (n_samples, n_features) + The data used to fit the model. If `copy_X=False`, then `X_fit_` is + a reference. This attribute is used for the calls to transform. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + gamma_ : float + Kernel coefficient for rbf, poly and sigmoid kernels. When `gamma` + is explicitly provided, this is just the same as `gamma`. When `gamma` + is `None`, this is the actual value of kernel coefficient. + + .. versionadded:: 1.3 + + See Also + -------- + FastICA : A fast algorithm for Independent Component Analysis. + IncrementalPCA : Incremental Principal Component Analysis. + NMF : Non-Negative Matrix Factorization. + PCA : Principal Component Analysis. + SparsePCA : Sparse Principal Component Analysis. + TruncatedSVD : Dimensionality reduction using truncated SVD. + + References + ---------- + .. [1] `Schölkopf, Bernhard, Alexander Smola, and Klaus-Robert Müller. + "Kernel principal component analysis." + International conference on artificial neural networks. + Springer, Berlin, Heidelberg, 1997. + `_ + + .. [2] `Bakır, Gökhan H., Jason Weston, and Bernhard Schölkopf. + "Learning to find pre-images." + Advances in neural information processing systems 16 (2004): 449-456. + `_ + + .. [3] :arxiv:`Halko, Nathan, Per-Gunnar Martinsson, and Joel A. Tropp. + "Finding structure with randomness: Probabilistic algorithms for + constructing approximate matrix decompositions." + SIAM review 53.2 (2011): 217-288. <0909.4061>` + + .. [4] `Martinsson, Per-Gunnar, Vladimir Rokhlin, and Mark Tygert. + "A randomized algorithm for the decomposition of matrices." + Applied and Computational Harmonic Analysis 30.1 (2011): 47-68. 
+ `_ + + Examples + -------- + >>> from sklearn.datasets import load_digits + >>> from sklearn.decomposition import KernelPCA + >>> X, _ = load_digits(return_X_y=True) + >>> transformer = KernelPCA(n_components=7, kernel='linear') + >>> X_transformed = transformer.fit_transform(X) + >>> X_transformed.shape + (1797, 7) + """ + + _parameter_constraints: dict = { + "n_components": [ + Interval(Integral, 1, None, closed="left"), + None, + ], + "kernel": [ + StrOptions({"linear", "poly", "rbf", "sigmoid", "cosine", "precomputed"}), + callable, + ], + "gamma": [ + Interval(Real, 0, None, closed="left"), + None, + ], + "degree": [Interval(Real, 0, None, closed="left")], + "coef0": [Interval(Real, None, None, closed="neither")], + "kernel_params": [dict, None], + "alpha": [Interval(Real, 0, None, closed="left")], + "fit_inverse_transform": ["boolean"], + "eigen_solver": [StrOptions({"auto", "dense", "arpack", "randomized"})], + "tol": [Interval(Real, 0, None, closed="left")], + "max_iter": [ + Interval(Integral, 1, None, closed="left"), + None, + ], + "iterated_power": [ + Interval(Integral, 0, None, closed="left"), + StrOptions({"auto"}), + ], + "remove_zero_eig": ["boolean"], + "random_state": ["random_state"], + "copy_X": ["boolean"], + "n_jobs": [None, Integral], + } + + def __init__( + self, + n_components=None, + *, + kernel="linear", + gamma=None, + degree=3, + coef0=1, + kernel_params=None, + alpha=1.0, + fit_inverse_transform=False, + eigen_solver="auto", + tol=0, + max_iter=None, + iterated_power="auto", + remove_zero_eig=False, + random_state=None, + copy_X=True, + n_jobs=None, + ): + self.n_components = n_components + self.kernel = kernel + self.kernel_params = kernel_params + self.gamma = gamma + self.degree = degree + self.coef0 = coef0 + self.alpha = alpha + self.fit_inverse_transform = fit_inverse_transform + self.eigen_solver = eigen_solver + self.tol = tol + self.max_iter = max_iter + self.iterated_power = iterated_power + self.remove_zero_eig = remove_zero_eig + self.random_state = random_state + self.n_jobs = n_jobs + self.copy_X = copy_X + + def _get_kernel(self, X, Y=None): + if callable(self.kernel): + params = self.kernel_params or {} + else: + params = {"gamma": self.gamma_, "degree": self.degree, "coef0": self.coef0} + return pairwise_kernels( + X, Y, metric=self.kernel, filter_params=True, n_jobs=self.n_jobs, **params + ) + + def _fit_transform_in_place(self, K): + """Fit's using kernel K""" + # center kernel in place + K = self._centerer.fit(K).transform(K, copy=False) + + # adjust n_components according to user inputs + if self.n_components is None: + n_components = K.shape[0] # use all dimensions + else: + n_components = min(K.shape[0], self.n_components) + + # compute eigenvectors + if self.eigen_solver == "auto": + if K.shape[0] > 200 and n_components < 10: + eigen_solver = "arpack" + else: + eigen_solver = "dense" + else: + eigen_solver = self.eigen_solver + + if eigen_solver == "dense": + # Note: subset_by_index specifies the indices of smallest/largest to return + self.eigenvalues_, self.eigenvectors_ = eigh( + K, subset_by_index=(K.shape[0] - n_components, K.shape[0] - 1) + ) + elif eigen_solver == "arpack": + v0 = _init_arpack_v0(K.shape[0], self.random_state) + self.eigenvalues_, self.eigenvectors_ = eigsh( + K, n_components, which="LA", tol=self.tol, maxiter=self.max_iter, v0=v0 + ) + elif eigen_solver == "randomized": + self.eigenvalues_, self.eigenvectors_ = _randomized_eigsh( + K, + n_components=n_components, + n_iter=self.iterated_power, + 
random_state=self.random_state, + selection="module", + ) + + # make sure that the eigenvalues are ok and fix numerical issues + self.eigenvalues_ = _check_psd_eigenvalues( + self.eigenvalues_, enable_warnings=False + ) + + # flip eigenvectors' sign to enforce deterministic output + self.eigenvectors_, _ = svd_flip(u=self.eigenvectors_, v=None) + + # sort eigenvectors in descending order + indices = self.eigenvalues_.argsort()[::-1] + self.eigenvalues_ = self.eigenvalues_[indices] + self.eigenvectors_ = self.eigenvectors_[:, indices] + + # remove eigenvectors with a zero eigenvalue (null space) if required + if self.remove_zero_eig or self.n_components is None: + self.eigenvectors_ = self.eigenvectors_[:, self.eigenvalues_ > 0] + self.eigenvalues_ = self.eigenvalues_[self.eigenvalues_ > 0] + + # Maintenance note on Eigenvectors normalization + # ---------------------------------------------- + # there is a link between + # the eigenvectors of K=Phi(X)'Phi(X) and the ones of Phi(X)Phi(X)' + # if v is an eigenvector of K + # then Phi(X)v is an eigenvector of Phi(X)Phi(X)' + # if u is an eigenvector of Phi(X)Phi(X)' + # then Phi(X)'u is an eigenvector of Phi(X)'Phi(X) + # + # At this stage our self.eigenvectors_ (the v) have norm 1, we need to scale + # them so that eigenvectors in kernel feature space (the u) have norm=1 + # instead + # + # We COULD scale them here: + # self.eigenvectors_ = self.eigenvectors_ / np.sqrt(self.eigenvalues_) + # + # But choose to perform that LATER when needed, in `fit()` and in + # `transform()`. + + return K + + def _fit_inverse_transform(self, X_transformed, X): + if hasattr(X, "tocsr"): + raise NotImplementedError( + "Inverse transform not implemented for sparse matrices!" + ) + + n_samples = X_transformed.shape[0] + K = self._get_kernel(X_transformed) + K.flat[:: n_samples + 1] += self.alpha + self.dual_coef_ = linalg.solve(K, X, assume_a="pos", overwrite_a=True) + self.X_transformed_fit_ = X_transformed + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit the model from data in X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Returns the instance itself. + """ + if self.fit_inverse_transform and self.kernel == "precomputed": + raise ValueError("Cannot fit_inverse_transform with a precomputed kernel.") + X = validate_data(self, X, accept_sparse="csr", copy=self.copy_X) + self.gamma_ = 1 / X.shape[1] if self.gamma is None else self.gamma + self._centerer = KernelCenterer().set_output(transform="default") + K = self._get_kernel(X) + # When kernel="precomputed", K is X but it's safe to perform in place operations + # on K because a copy was made before if requested by copy_X. + self._fit_transform_in_place(K) + + if self.fit_inverse_transform: + # no need to use the kernel to transform X, use shortcut expression + X_transformed = self.eigenvectors_ * np.sqrt(self.eigenvalues_) + + self._fit_inverse_transform(X_transformed, X) + + self.X_fit_ = X + return self + + def fit_transform(self, X, y=None, **params): + """Fit the model from data in X and transform X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples + and `n_features` is the number of features. 
+ + y : Ignored + Not used, present for API consistency by convention. + + **params : kwargs + Parameters (keyword arguments) and values passed to + the fit_transform instance. + + Returns + ------- + X_new : ndarray of shape (n_samples, n_components) + Transformed values. + """ + self.fit(X, **params) + + # no need to use the kernel to transform X, use shortcut expression + X_transformed = self.eigenvectors_ * np.sqrt(self.eigenvalues_) + + if self.fit_inverse_transform: + self._fit_inverse_transform(X_transformed, X) + + return X_transformed + + def transform(self, X): + """Transform X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples + and `n_features` is the number of features. + + Returns + ------- + X_new : ndarray of shape (n_samples, n_components) + Projection of X in the first principal components, where `n_samples` + is the number of samples and `n_components` is the number of the components. + """ + check_is_fitted(self) + X = validate_data(self, X, accept_sparse="csr", reset=False) + + # Compute centered gram matrix between X and training data X_fit_ + K = self._centerer.transform(self._get_kernel(X, self.X_fit_)) + + # scale eigenvectors (properly account for null-space for dot product) + non_zeros = np.flatnonzero(self.eigenvalues_) + scaled_alphas = np.zeros_like(self.eigenvectors_) + scaled_alphas[:, non_zeros] = self.eigenvectors_[:, non_zeros] / np.sqrt( + self.eigenvalues_[non_zeros] + ) + + # Project with a scalar product between K and the scaled eigenvectors + return np.dot(K, scaled_alphas) + + def inverse_transform(self, X): + """Transform X back to original space. + + ``inverse_transform`` approximates the inverse transformation using + a learned pre-image. The pre-image is learned by kernel ridge + regression of the original data on their low-dimensional representation + vectors. + + .. note: + :meth:`~sklearn.decomposition.fit` internally uses a centered + kernel. As the centered kernel no longer contains the information + of the mean of kernel features, such information is not taken into + account in reconstruction. + + .. note:: + When users want to compute inverse transformation for 'linear' + kernel, it is recommended that they use + :class:`~sklearn.decomposition.PCA` instead. Unlike + :class:`~sklearn.decomposition.PCA`, + :class:`~sklearn.decomposition.KernelPCA`'s ``inverse_transform`` + does not reconstruct the mean of data when 'linear' kernel is used + due to the use of centered kernel. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_components) + Training vector, where `n_samples` is the number of samples + and `n_features` is the number of features. + + Returns + ------- + X_original : ndarray of shape (n_samples, n_features) + Original data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + References + ---------- + `Bakır, Gökhan H., Jason Weston, and Bernhard Schölkopf. + "Learning to find pre-images." + Advances in neural information processing systems 16 (2004): 449-456. + `_ + """ + if not self.fit_inverse_transform: + raise NotFittedError( + "The fit_inverse_transform parameter was not" + " set to True when instantiating and hence " + "the inverse transform is not available." 
+ ) + + K = self._get_kernel(X, self.X_transformed_fit_) + return np.dot(K, self.dual_coef_) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + tags.transformer_tags.preserves_dtype = ["float64", "float32"] + tags.input_tags.pairwise = self.kernel == "precomputed" + return tags + + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.eigenvalues_.shape[0] diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/_lda.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_lda.py new file mode 100644 index 0000000000000000000000000000000000000000..94b1413745a2214572a06f7bfceaeffd5403ba48 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_lda.py @@ -0,0 +1,959 @@ +""" + +============================================================= +Online Latent Dirichlet Allocation with variational inference +============================================================= + +This implementation is modified from Matthew D. Hoffman's onlineldavb code +Link: https://github.com/blei-lab/onlineldavb +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from numbers import Integral, Real + +import numpy as np +import scipy.sparse as sp +from joblib import effective_n_jobs +from scipy.special import gammaln, logsumexp + +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + TransformerMixin, + _fit_context, +) +from ..utils import check_random_state, gen_batches, gen_even_slices +from ..utils._param_validation import Interval, StrOptions +from ..utils.parallel import Parallel, delayed +from ..utils.validation import check_is_fitted, check_non_negative, validate_data +from ._online_lda_fast import ( + _dirichlet_expectation_1d as cy_dirichlet_expectation_1d, +) +from ._online_lda_fast import ( + _dirichlet_expectation_2d, +) +from ._online_lda_fast import ( + mean_change as cy_mean_change, +) + +EPS = np.finfo(float).eps + + +def _update_doc_distribution( + X, + exp_topic_word_distr, + doc_topic_prior, + max_doc_update_iter, + mean_change_tol, + cal_sstats, + random_state, +): + """E-step: update document-topic distribution. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Document word matrix. + + exp_topic_word_distr : ndarray of shape (n_topics, n_features) + Exponential value of expectation of log topic word distribution. + In the literature, this is `exp(E[log(beta)])`. + + doc_topic_prior : float + Prior of document topic distribution `theta`. + + max_doc_update_iter : int + Max number of iterations for updating document topic distribution in + the E-step. + + mean_change_tol : float + Stopping tolerance for updating document topic distribution in E-step. + + cal_sstats : bool + Parameter that indicate to calculate sufficient statistics or not. + Set `cal_sstats` to `True` when we need to run M-step. + + random_state : RandomState instance or None + Parameter that indicate how to initialize document topic distribution. + Set `random_state` to None will initialize document topic distribution + to a constant number. + + Returns + ------- + (doc_topic_distr, suff_stats) : + `doc_topic_distr` is unnormalized topic distribution for each document. + In the literature, this is `gamma`. we can calculate `E[log(theta)]` + from it. + `suff_stats` is expected sufficient statistics for the M-step. + When `cal_sstats == False`, this will be None. 
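For intuition, a minimal NumPy sketch (made-up values) of how the unnormalized `gamma` returned by this E-step maps to the per-document topic proportions that `LatentDirichletAllocation.transform(..., normalize=True)` reports, i.e. a simple row normalization:

    import numpy as np

    gamma = np.array([[2.0, 6.0, 2.0],    # hypothetical unnormalized doc-topic weights
                      [1.0, 1.0, 8.0]])
    theta = gamma / gamma.sum(axis=1, keepdims=True)  # rows now sum to 1
    # theta -> [[0.2, 0.6, 0.2],
    #           [0.1, 0.1, 0.8]]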
+ + """ + is_sparse_x = sp.issparse(X) + n_samples, n_features = X.shape + n_topics = exp_topic_word_distr.shape[0] + + if random_state: + doc_topic_distr = random_state.gamma(100.0, 0.01, (n_samples, n_topics)).astype( + X.dtype, copy=False + ) + else: + doc_topic_distr = np.ones((n_samples, n_topics), dtype=X.dtype) + + # In the literature, this is `exp(E[log(theta)])` + exp_doc_topic = np.exp(_dirichlet_expectation_2d(doc_topic_distr)) + + # diff on `component_` (only calculate it when `cal_diff` is True) + suff_stats = ( + np.zeros(exp_topic_word_distr.shape, dtype=X.dtype) if cal_sstats else None + ) + + if is_sparse_x: + X_data = X.data + X_indices = X.indices + X_indptr = X.indptr + + # These cython functions are called in a nested loop on usually very small arrays + # (length=n_topics). In that case, finding the appropriate signature of the + # fused-typed function can be more costly than its execution, hence the dispatch + # is done outside of the loop. + ctype = "float" if X.dtype == np.float32 else "double" + mean_change = cy_mean_change[ctype] + dirichlet_expectation_1d = cy_dirichlet_expectation_1d[ctype] + eps = np.finfo(X.dtype).eps + + for idx_d in range(n_samples): + if is_sparse_x: + ids = X_indices[X_indptr[idx_d] : X_indptr[idx_d + 1]] + cnts = X_data[X_indptr[idx_d] : X_indptr[idx_d + 1]] + else: + ids = np.nonzero(X[idx_d, :])[0] + cnts = X[idx_d, ids] + + doc_topic_d = doc_topic_distr[idx_d, :] + # The next one is a copy, since the inner loop overwrites it. + exp_doc_topic_d = exp_doc_topic[idx_d, :].copy() + exp_topic_word_d = exp_topic_word_distr[:, ids] + + # Iterate between `doc_topic_d` and `norm_phi` until convergence + for _ in range(0, max_doc_update_iter): + last_d = doc_topic_d + + # The optimal phi_{dwk} is proportional to + # exp(E[log(theta_{dk})]) * exp(E[log(beta_{dw})]). + norm_phi = np.dot(exp_doc_topic_d, exp_topic_word_d) + eps + + doc_topic_d = exp_doc_topic_d * np.dot(cnts / norm_phi, exp_topic_word_d.T) + # Note: adds doc_topic_prior to doc_topic_d, in-place. + dirichlet_expectation_1d(doc_topic_d, doc_topic_prior, exp_doc_topic_d) + + if mean_change(last_d, doc_topic_d) < mean_change_tol: + break + doc_topic_distr[idx_d, :] = doc_topic_d + + # Contribution of document d to the expected sufficient + # statistics for the M step. + if cal_sstats: + norm_phi = np.dot(exp_doc_topic_d, exp_topic_word_d) + eps + suff_stats[:, ids] += np.outer(exp_doc_topic_d, cnts / norm_phi) + + return (doc_topic_distr, suff_stats) + + +class LatentDirichletAllocation( + ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator +): + """Latent Dirichlet Allocation with online variational Bayes algorithm. + + The implementation is based on [1]_ and [2]_. + + .. versionadded:: 0.17 + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int, default=10 + Number of topics. + + .. versionchanged:: 0.19 + ``n_topics`` was renamed to ``n_components`` + + doc_topic_prior : float, default=None + Prior of document topic distribution `theta`. If the value is None, + defaults to `1 / n_components`. + In [1]_, this is called `alpha`. + + topic_word_prior : float, default=None + Prior of topic word distribution `beta`. If the value is None, defaults + to `1 / n_components`. + In [1]_, this is called `eta`. + + learning_method : {'batch', 'online'}, default='batch' + Method used to update `_component`. Only used in :meth:`fit` method. + In general, if the data size is large, the online update will be much + faster than the batch update. 
+ + Valid options: + + - 'batch': Batch variational Bayes method. Use all training data in each EM + update. Old `components_` will be overwritten in each iteration. + - 'online': Online variational Bayes method. In each EM update, use mini-batch + of training data to update the ``components_`` variable incrementally. The + learning rate is controlled by the ``learning_decay`` and the + ``learning_offset`` parameters. + + .. versionchanged:: 0.20 + The default learning method is now ``"batch"``. + + learning_decay : float, default=0.7 + It is a parameter that control learning rate in the online learning + method. The value should be set between (0.5, 1.0] to guarantee + asymptotic convergence. When the value is 0.0 and batch_size is + ``n_samples``, the update method is same as batch learning. In the + literature, this is called kappa. + + learning_offset : float, default=10.0 + A (positive) parameter that downweights early iterations in online + learning. It should be greater than 1.0. In the literature, this is + called tau_0. + + max_iter : int, default=10 + The maximum number of passes over the training data (aka epochs). + It only impacts the behavior in the :meth:`fit` method, and not the + :meth:`partial_fit` method. + + batch_size : int, default=128 + Number of documents to use in each EM iteration. Only used in online + learning. + + evaluate_every : int, default=-1 + How often to evaluate perplexity. Only used in `fit` method. + set it to 0 or negative number to not evaluate perplexity in + training at all. Evaluating perplexity can help you check convergence + in training process, but it will also increase total training time. + Evaluating perplexity in every iteration might increase training time + up to two-fold. + + total_samples : int, default=1e6 + Total number of documents. Only used in the :meth:`partial_fit` method. + + perp_tol : float, default=1e-1 + Perplexity tolerance. Only used when ``evaluate_every`` is greater than 0. + + mean_change_tol : float, default=1e-3 + Stopping tolerance for updating document topic distribution in E-step. + + max_doc_update_iter : int, default=100 + Max number of iterations for updating document topic distribution in + the E-step. + + n_jobs : int, default=None + The number of jobs to use in the E-step. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + verbose : int, default=0 + Verbosity level. + + random_state : int, RandomState instance or None, default=None + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + Attributes + ---------- + components_ : ndarray of shape (n_components, n_features) + Variational parameters for topic word distribution. Since the complete + conditional for topic word distribution is a Dirichlet, + ``components_[i, j]`` can be viewed as pseudocount that represents the + number of times word `j` was assigned to topic `i`. + It can also be viewed as distribution over the words for each topic + after normalization: + ``model.components_ / model.components_.sum(axis=1)[:, np.newaxis]``. + + exp_dirichlet_component_ : ndarray of shape (n_components, n_features) + Exponential value of expectation of log topic word distribution. + In the literature, this is `exp(E[log(beta)])`. + + n_batch_iter_ : int + Number of iterations of the EM step. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. 
versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + Number of passes over the dataset. + + bound_ : float + Final perplexity score on training set. + + doc_topic_prior_ : float + Prior of document topic distribution `theta`. If the value is None, + it is `1 / n_components`. + + random_state_ : RandomState instance + RandomState instance that is generated either from a seed, the random + number generator or by `np.random`. + + topic_word_prior_ : float + Prior of topic word distribution `beta`. If the value is None, it is + `1 / n_components`. + + See Also + -------- + sklearn.discriminant_analysis.LinearDiscriminantAnalysis: + A classifier with a linear decision boundary, generated by fitting + class conditional densities to the data and using Bayes' rule. + + References + ---------- + .. [1] "Online Learning for Latent Dirichlet Allocation", Matthew D. + Hoffman, David M. Blei, Francis Bach, 2010 + https://github.com/blei-lab/onlineldavb + + .. [2] "Stochastic Variational Inference", Matthew D. Hoffman, + David M. Blei, Chong Wang, John Paisley, 2013 + + Examples + -------- + >>> from sklearn.decomposition import LatentDirichletAllocation + >>> from sklearn.datasets import make_multilabel_classification + >>> # This produces a feature matrix of token counts, similar to what + >>> # CountVectorizer would produce on text. + >>> X, _ = make_multilabel_classification(random_state=0) + >>> lda = LatentDirichletAllocation(n_components=5, + ... random_state=0) + >>> lda.fit(X) + LatentDirichletAllocation(...) + >>> # get topics for some given samples: + >>> lda.transform(X[-2:]) + array([[0.00360392, 0.25499205, 0.0036211 , 0.64236448, 0.09541846], + [0.15297572, 0.00362644, 0.44412786, 0.39568399, 0.003586 ]]) + """ + + _parameter_constraints: dict = { + "n_components": [Interval(Integral, 0, None, closed="neither")], + "doc_topic_prior": [None, Interval(Real, 0, 1, closed="both")], + "topic_word_prior": [None, Interval(Real, 0, 1, closed="both")], + "learning_method": [StrOptions({"batch", "online"})], + "learning_decay": [Interval(Real, 0, 1, closed="both")], + "learning_offset": [Interval(Real, 1.0, None, closed="left")], + "max_iter": [Interval(Integral, 0, None, closed="left")], + "batch_size": [Interval(Integral, 0, None, closed="neither")], + "evaluate_every": [Interval(Integral, None, None, closed="neither")], + "total_samples": [Interval(Real, 0, None, closed="neither")], + "perp_tol": [Interval(Real, 0, None, closed="left")], + "mean_change_tol": [Interval(Real, 0, None, closed="left")], + "max_doc_update_iter": [Interval(Integral, 0, None, closed="left")], + "n_jobs": [None, Integral], + "verbose": ["verbose"], + "random_state": ["random_state"], + } + + def __init__( + self, + n_components=10, + *, + doc_topic_prior=None, + topic_word_prior=None, + learning_method="batch", + learning_decay=0.7, + learning_offset=10.0, + max_iter=10, + batch_size=128, + evaluate_every=-1, + total_samples=1e6, + perp_tol=1e-1, + mean_change_tol=1e-3, + max_doc_update_iter=100, + n_jobs=None, + verbose=0, + random_state=None, + ): + self.n_components = n_components + self.doc_topic_prior = doc_topic_prior + self.topic_word_prior = topic_word_prior + self.learning_method = learning_method + self.learning_decay = learning_decay + self.learning_offset = learning_offset + self.max_iter = max_iter + self.batch_size = 
batch_size + self.evaluate_every = evaluate_every + self.total_samples = total_samples + self.perp_tol = perp_tol + self.mean_change_tol = mean_change_tol + self.max_doc_update_iter = max_doc_update_iter + self.n_jobs = n_jobs + self.verbose = verbose + self.random_state = random_state + + def _init_latent_vars(self, n_features, dtype=np.float64): + """Initialize latent variables.""" + + self.random_state_ = check_random_state(self.random_state) + self.n_batch_iter_ = 1 + self.n_iter_ = 0 + + if self.doc_topic_prior is None: + self.doc_topic_prior_ = 1.0 / self.n_components + else: + self.doc_topic_prior_ = self.doc_topic_prior + + if self.topic_word_prior is None: + self.topic_word_prior_ = 1.0 / self.n_components + else: + self.topic_word_prior_ = self.topic_word_prior + + init_gamma = 100.0 + init_var = 1.0 / init_gamma + # In the literature, this is called `lambda` + self.components_ = self.random_state_.gamma( + init_gamma, init_var, (self.n_components, n_features) + ).astype(dtype, copy=False) + + # In the literature, this is `exp(E[log(beta)])` + self.exp_dirichlet_component_ = np.exp( + _dirichlet_expectation_2d(self.components_) + ) + + def _e_step(self, X, cal_sstats, random_init, parallel=None): + """E-step in EM update. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Document word matrix. + + cal_sstats : bool + Parameter that indicate whether to calculate sufficient statistics + or not. Set ``cal_sstats`` to True when we need to run M-step. + + random_init : bool + Parameter that indicate whether to initialize document topic + distribution randomly in the E-step. Set it to True in training + steps. + + parallel : joblib.Parallel, default=None + Pre-initialized instance of joblib.Parallel. + + Returns + ------- + (doc_topic_distr, suff_stats) : + `doc_topic_distr` is unnormalized topic distribution for each + document. In the literature, this is called `gamma`. + `suff_stats` is expected sufficient statistics for the M-step. + When `cal_sstats == False`, it will be None. + + """ + + # Run e-step in parallel + random_state = self.random_state_ if random_init else None + + # TODO: make Parallel._effective_n_jobs public instead? + n_jobs = effective_n_jobs(self.n_jobs) + if parallel is None: + parallel = Parallel(n_jobs=n_jobs, verbose=max(0, self.verbose - 1)) + results = parallel( + delayed(_update_doc_distribution)( + X[idx_slice, :], + self.exp_dirichlet_component_, + self.doc_topic_prior_, + self.max_doc_update_iter, + self.mean_change_tol, + cal_sstats, + random_state, + ) + for idx_slice in gen_even_slices(X.shape[0], n_jobs) + ) + + # merge result + doc_topics, sstats_list = zip(*results) + doc_topic_distr = np.vstack(doc_topics) + + if cal_sstats: + # This step finishes computing the sufficient statistics for the + # M-step. + suff_stats = np.zeros(self.components_.shape, dtype=self.components_.dtype) + for sstats in sstats_list: + suff_stats += sstats + suff_stats *= self.exp_dirichlet_component_ + else: + suff_stats = None + + return (doc_topic_distr, suff_stats) + + def _em_step(self, X, total_samples, batch_update, parallel=None): + """EM update for 1 iteration. + + update `component_` by batch VB or online VB. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Document word matrix. + + total_samples : int + Total number of documents. It is only used when + batch_update is `False`. + + batch_update : bool + Parameter that controls updating method. 
+ `True` for batch learning, `False` for online learning. + + parallel : joblib.Parallel, default=None + Pre-initialized instance of joblib.Parallel + + Returns + ------- + doc_topic_distr : ndarray of shape (n_samples, n_components) + Unnormalized document topic distribution. + """ + + # E-step + _, suff_stats = self._e_step( + X, cal_sstats=True, random_init=True, parallel=parallel + ) + + # M-step + if batch_update: + self.components_ = self.topic_word_prior_ + suff_stats + else: + # online update + # In the literature, the weight is `rho` + weight = np.power( + self.learning_offset + self.n_batch_iter_, -self.learning_decay + ) + doc_ratio = float(total_samples) / X.shape[0] + self.components_ *= 1 - weight + self.components_ += weight * ( + self.topic_word_prior_ + doc_ratio * suff_stats + ) + + # update `component_` related variables + self.exp_dirichlet_component_ = np.exp( + _dirichlet_expectation_2d(self.components_) + ) + self.n_batch_iter_ += 1 + return + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.positive_only = True + tags.input_tags.sparse = True + tags.transformer_tags.preserves_dtype = ["float32", "float64"] + return tags + + def _check_non_neg_array(self, X, reset_n_features, whom): + """check X format + + check X format and make sure no negative value in X. + + Parameters + ---------- + X : array-like or sparse matrix + + """ + dtype = [np.float64, np.float32] if reset_n_features else self.components_.dtype + + X = validate_data( + self, + X, + reset=reset_n_features, + accept_sparse="csr", + dtype=dtype, + ) + check_non_negative(X, whom) + + return X + + @_fit_context(prefer_skip_nested_validation=True) + def partial_fit(self, X, y=None): + """Online VB with Mini-Batch update. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Document word matrix. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + self + Partially fitted estimator. + """ + first_time = not hasattr(self, "components_") + + X = self._check_non_neg_array( + X, reset_n_features=first_time, whom="LatentDirichletAllocation.partial_fit" + ) + n_samples, n_features = X.shape + batch_size = self.batch_size + + # initialize parameters or check + if first_time: + self._init_latent_vars(n_features, dtype=X.dtype) + + if n_features != self.components_.shape[1]: + raise ValueError( + "The provided data has %d dimensions while " + "the model was trained with feature size %d." + % (n_features, self.components_.shape[1]) + ) + + n_jobs = effective_n_jobs(self.n_jobs) + with Parallel(n_jobs=n_jobs, verbose=max(0, self.verbose - 1)) as parallel: + for idx_slice in gen_batches(n_samples, batch_size): + self._em_step( + X[idx_slice, :], + total_samples=self.total_samples, + batch_update=False, + parallel=parallel, + ) + + return self + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Learn model for the data X with variational Bayes method. + + When `learning_method` is 'online', use mini-batch update. + Otherwise, use batch update. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Document word matrix. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + self + Fitted estimator. 
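As a usage sketch of the batch/online choice this fitting path implements (toy count matrix with arbitrary values; only the public estimator API is used):

    import numpy as np
    from sklearn.decomposition import LatentDirichletAllocation

    # fake document-word counts: 100 documents over a 20-term vocabulary
    X = np.random.RandomState(0).poisson(1.0, size=(100, 20))
    lda = LatentDirichletAllocation(
        n_components=5, learning_method="online", batch_size=16,
        max_iter=5, random_state=0,
    )
    doc_topics = lda.fit_transform(X)  # (100, 5): one row of topic proportions per document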
+ """ + X = self._check_non_neg_array( + X, reset_n_features=True, whom="LatentDirichletAllocation.fit" + ) + n_samples, n_features = X.shape + max_iter = self.max_iter + evaluate_every = self.evaluate_every + learning_method = self.learning_method + + batch_size = self.batch_size + + # initialize parameters + self._init_latent_vars(n_features, dtype=X.dtype) + # change to perplexity later + last_bound = None + n_jobs = effective_n_jobs(self.n_jobs) + with Parallel(n_jobs=n_jobs, verbose=max(0, self.verbose - 1)) as parallel: + for i in range(max_iter): + if learning_method == "online": + for idx_slice in gen_batches(n_samples, batch_size): + self._em_step( + X[idx_slice, :], + total_samples=n_samples, + batch_update=False, + parallel=parallel, + ) + else: + # batch update + self._em_step( + X, total_samples=n_samples, batch_update=True, parallel=parallel + ) + + # check perplexity + if evaluate_every > 0 and (i + 1) % evaluate_every == 0: + doc_topics_distr, _ = self._e_step( + X, cal_sstats=False, random_init=False, parallel=parallel + ) + bound = self._perplexity_precomp_distr( + X, doc_topics_distr, sub_sampling=False + ) + if self.verbose: + print( + "iteration: %d of max_iter: %d, perplexity: %.4f" + % (i + 1, max_iter, bound) + ) + + if last_bound and abs(last_bound - bound) < self.perp_tol: + break + last_bound = bound + + elif self.verbose: + print("iteration: %d of max_iter: %d" % (i + 1, max_iter)) + self.n_iter_ += 1 + + # calculate final perplexity value on train set + doc_topics_distr, _ = self._e_step( + X, cal_sstats=False, random_init=False, parallel=parallel + ) + self.bound_ = self._perplexity_precomp_distr( + X, doc_topics_distr, sub_sampling=False + ) + + return self + + def _unnormalized_transform(self, X): + """Transform data X according to fitted model. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Document word matrix. + + Returns + ------- + doc_topic_distr : ndarray of shape (n_samples, n_components) + Document topic distribution for X. + """ + doc_topic_distr, _ = self._e_step(X, cal_sstats=False, random_init=False) + + return doc_topic_distr + + def transform(self, X, *, normalize=True): + """Transform data X according to the fitted model. + + .. versionchanged:: 0.18 + `doc_topic_distr` is now normalized. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Document word matrix. + + normalize : bool, default=True + Whether to normalize the document topic distribution. + + Returns + ------- + doc_topic_distr : ndarray of shape (n_samples, n_components) + Document topic distribution for X. + """ + check_is_fitted(self) + X = self._check_non_neg_array( + X, reset_n_features=False, whom="LatentDirichletAllocation.transform" + ) + doc_topic_distr = self._unnormalized_transform(X) + if normalize: + doc_topic_distr /= doc_topic_distr.sum(axis=1)[:, np.newaxis] + return doc_topic_distr + + def fit_transform(self, X, y=None, *, normalize=True): + """ + Fit to data, then transform it. + + Fits transformer to `X` and `y` and returns a transformed version of `X`. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Input samples. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs), \ + default=None + Target values (None for unsupervised transformations). + + normalize : bool, default=True + Whether to normalize the document topic distribution in `transform`. 
+ + Returns + ------- + X_new : ndarray array of shape (n_samples, n_components) + Transformed array. + """ + return self.fit(X, y).transform(X, normalize=normalize) + + def _approx_bound(self, X, doc_topic_distr, sub_sampling): + """Estimate the variational bound. + + Estimate the variational bound over "all documents" using only the + documents passed in as X. Since log-likelihood of each word cannot + be computed directly, we use this bound to estimate it. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Document word matrix. + + doc_topic_distr : ndarray of shape (n_samples, n_components) + Document topic distribution. In the literature, this is called + gamma. + + sub_sampling : bool, default=False + Compensate for subsampling of documents. + It is used in calculate bound in online learning. + + Returns + ------- + score : float + + """ + + def _loglikelihood(prior, distr, dirichlet_distr, size): + # calculate log-likelihood + score = np.sum((prior - distr) * dirichlet_distr) + score += np.sum(gammaln(distr) - gammaln(prior)) + score += np.sum(gammaln(prior * size) - gammaln(np.sum(distr, 1))) + return score + + is_sparse_x = sp.issparse(X) + n_samples, n_components = doc_topic_distr.shape + n_features = self.components_.shape[1] + score = 0 + + dirichlet_doc_topic = _dirichlet_expectation_2d(doc_topic_distr) + dirichlet_component_ = _dirichlet_expectation_2d(self.components_) + doc_topic_prior = self.doc_topic_prior_ + topic_word_prior = self.topic_word_prior_ + + if is_sparse_x: + X_data = X.data + X_indices = X.indices + X_indptr = X.indptr + + # E[log p(docs | theta, beta)] + for idx_d in range(0, n_samples): + if is_sparse_x: + ids = X_indices[X_indptr[idx_d] : X_indptr[idx_d + 1]] + cnts = X_data[X_indptr[idx_d] : X_indptr[idx_d + 1]] + else: + ids = np.nonzero(X[idx_d, :])[0] + cnts = X[idx_d, ids] + temp = ( + dirichlet_doc_topic[idx_d, :, np.newaxis] + dirichlet_component_[:, ids] + ) + norm_phi = logsumexp(temp, axis=0) + score += np.dot(cnts, norm_phi) + + # compute E[log p(theta | alpha) - log q(theta | gamma)] + score += _loglikelihood( + doc_topic_prior, doc_topic_distr, dirichlet_doc_topic, self.n_components + ) + + # Compensate for the subsampling of the population of documents + if sub_sampling: + doc_ratio = float(self.total_samples) / n_samples + score *= doc_ratio + + # E[log p(beta | eta) - log q (beta | lambda)] + score += _loglikelihood( + topic_word_prior, self.components_, dirichlet_component_, n_features + ) + + return score + + def score(self, X, y=None): + """Calculate approximate log-likelihood as score. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Document word matrix. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + score : float + Use approximate bound as score. + """ + check_is_fitted(self) + X = self._check_non_neg_array( + X, reset_n_features=False, whom="LatentDirichletAllocation.score" + ) + + doc_topic_distr = self._unnormalized_transform(X) + score = self._approx_bound(X, doc_topic_distr, sub_sampling=False) + return score + + def _perplexity_precomp_distr(self, X, doc_topic_distr=None, sub_sampling=False): + """Calculate approximate perplexity for data X with ability to accept + precomputed doc_topic_distr + + Perplexity is defined as exp(-1. * log-likelihood per word) + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Document word matrix. 
+ + doc_topic_distr : ndarray of shape (n_samples, n_components), \ + default=None + Document topic distribution. + If it is None, it will be generated by applying transform on X. + + Returns + ------- + score : float + Perplexity score. + """ + if doc_topic_distr is None: + doc_topic_distr = self._unnormalized_transform(X) + else: + n_samples, n_components = doc_topic_distr.shape + if n_samples != X.shape[0]: + raise ValueError( + "Number of samples in X and doc_topic_distr do not match." + ) + + if n_components != self.n_components: + raise ValueError("Number of topics does not match.") + + current_samples = X.shape[0] + bound = self._approx_bound(X, doc_topic_distr, sub_sampling) + + if sub_sampling: + word_cnt = X.sum() * (float(self.total_samples) / current_samples) + else: + word_cnt = X.sum() + perword_bound = bound / word_cnt + + return np.exp(-1.0 * perword_bound) + + def perplexity(self, X, sub_sampling=False): + """Calculate approximate perplexity for data X. + + Perplexity is defined as exp(-1. * log-likelihood per word) + + .. versionchanged:: 0.19 + *doc_topic_distr* argument has been deprecated and is ignored + because user no longer has access to unnormalized distribution + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Document word matrix. + + sub_sampling : bool + Do sub-sampling or not. + + Returns + ------- + score : float + Perplexity score. + """ + check_is_fitted(self) + X = self._check_non_neg_array( + X, reset_n_features=True, whom="LatentDirichletAllocation.perplexity" + ) + return self._perplexity_precomp_distr(X, sub_sampling=sub_sampling) + + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/_nmf.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_nmf.py new file mode 100644 index 0000000000000000000000000000000000000000..4c963538619a38d35bb74affb7ad8ebb64c071eb --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_nmf.py @@ -0,0 +1,2409 @@ +"""Non-negative matrix factorization.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import itertools +import time +import warnings +from abc import ABC +from math import sqrt +from numbers import Integral, Real + +import numpy as np +import scipy.sparse as sp +from scipy import linalg + +from .._config import config_context +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + TransformerMixin, + _fit_context, +) +from ..exceptions import ConvergenceWarning +from ..utils import check_array, check_random_state, gen_batches +from ..utils._param_validation import ( + Interval, + StrOptions, + validate_params, +) +from ..utils.extmath import _randomized_svd, safe_sparse_dot, squared_norm +from ..utils.validation import ( + check_is_fitted, + check_non_negative, + validate_data, +) +from ._cdnmf_fast import _update_cdnmf_fast + +EPSILON = np.finfo(np.float32).eps + + +def norm(x): + """Dot product-based Euclidean norm implementation. + + See: http://fa.bianp.net/blog/2011/computing-the-vector-norm/ + + Parameters + ---------- + x : array-like + Vector for which to compute the norm. + """ + return sqrt(squared_norm(x)) + + +def trace_dot(X, Y): + """Trace of np.dot(X, Y.T). + + Parameters + ---------- + X : array-like + First matrix. + Y : array-like + Second matrix. 
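A quick NumPy check of the identity this helper relies on (arbitrary small matrices):

    import numpy as np

    X = np.arange(6.0).reshape(2, 3)
    Y = np.ones((2, 3))
    # trace(X @ Y.T) equals the sum of element-wise products,
    # i.e. the dot product of the raveled arrays
    assert np.isclose(np.trace(X @ Y.T), np.dot(X.ravel(), Y.ravel()))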
+ """ + return np.dot(X.ravel(), Y.ravel()) + + +def _check_init(A, shape, whom): + A = check_array(A) + if shape[0] != "auto" and A.shape[0] != shape[0]: + raise ValueError( + f"Array with wrong first dimension passed to {whom}. Expected {shape[0]}, " + f"but got {A.shape[0]}." + ) + if shape[1] != "auto" and A.shape[1] != shape[1]: + raise ValueError( + f"Array with wrong second dimension passed to {whom}. Expected {shape[1]}, " + f"but got {A.shape[1]}." + ) + check_non_negative(A, whom) + if np.max(A) == 0: + raise ValueError(f"Array passed to {whom} is full of zeros.") + + +def _beta_divergence(X, W, H, beta, square_root=False): + """Compute the beta-divergence of X and dot(W, H). + + Parameters + ---------- + X : float or array-like of shape (n_samples, n_features) + + W : float or array-like of shape (n_samples, n_components) + + H : float or array-like of shape (n_components, n_features) + + beta : float or {'frobenius', 'kullback-leibler', 'itakura-saito'} + Parameter of the beta-divergence. + If beta == 2, this is half the Frobenius *squared* norm. + If beta == 1, this is the generalized Kullback-Leibler divergence. + If beta == 0, this is the Itakura-Saito divergence. + Else, this is the general beta-divergence. + + square_root : bool, default=False + If True, return np.sqrt(2 * res) + For beta == 2, it corresponds to the Frobenius norm. + + Returns + ------- + res : float + Beta divergence of X and np.dot(X, H). + """ + beta = _beta_loss_to_float(beta) + + # The method can be called with scalars + if not sp.issparse(X): + X = np.atleast_2d(X) + W = np.atleast_2d(W) + H = np.atleast_2d(H) + + # Frobenius norm + if beta == 2: + # Avoid the creation of the dense np.dot(W, H) if X is sparse. + if sp.issparse(X): + norm_X = np.dot(X.data, X.data) + norm_WH = trace_dot(np.linalg.multi_dot([W.T, W, H]), H) + cross_prod = trace_dot((X @ H.T), W) + res = (norm_X + norm_WH - 2.0 * cross_prod) / 2.0 + else: + res = squared_norm(X - np.dot(W, H)) / 2.0 + + if square_root: + return np.sqrt(res * 2) + else: + return res + + if sp.issparse(X): + # compute np.dot(W, H) only where X is nonzero + WH_data = _special_sparse_dot(W, H, X).data + X_data = X.data + else: + WH = np.dot(W, H) + WH_data = WH.ravel() + X_data = X.ravel() + + # do not affect the zeros: here 0 ** (-1) = 0 and not infinity + indices = X_data > EPSILON + WH_data = WH_data[indices] + X_data = X_data[indices] + + # used to avoid division by zero + WH_data[WH_data < EPSILON] = EPSILON + + # generalized Kullback-Leibler divergence + if beta == 1: + # fast and memory efficient computation of np.sum(np.dot(W, H)) + sum_WH = np.dot(np.sum(W, axis=0), np.sum(H, axis=1)) + # computes np.sum(X * log(X / WH)) only where X is nonzero + div = X_data / WH_data + res = np.dot(X_data, np.log(div)) + # add full np.sum(np.dot(W, H)) - np.sum(X) + res += sum_WH - X_data.sum() + + # Itakura-Saito divergence + elif beta == 0: + div = X_data / WH_data + res = np.sum(div) - np.prod(X.shape) - np.sum(np.log(div)) + + # beta-divergence, beta not in (0, 1, 2) + else: + if sp.issparse(X): + # slow loop, but memory efficient computation of : + # np.sum(np.dot(W, H) ** beta) + sum_WH_beta = 0 + for i in range(X.shape[1]): + sum_WH_beta += np.sum(np.dot(W, H[:, i]) ** beta) + + else: + sum_WH_beta = np.sum(WH**beta) + + sum_X_WH = np.dot(X_data, WH_data ** (beta - 1)) + res = (X_data**beta).sum() - beta * sum_X_WH + res += sum_WH_beta * (beta - 1) + res /= beta * (beta - 1) + + if square_root: + res = max(res, 0) # avoid negative number due to rounding 
errors + return np.sqrt(2 * res) + else: + return res + + +def _special_sparse_dot(W, H, X): + """Computes np.dot(W, H), only where X is non zero.""" + if sp.issparse(X): + ii, jj = X.nonzero() + n_vals = ii.shape[0] + dot_vals = np.empty(n_vals) + n_components = W.shape[1] + + batch_size = max(n_components, n_vals // n_components) + for start in range(0, n_vals, batch_size): + batch = slice(start, start + batch_size) + dot_vals[batch] = np.multiply(W[ii[batch], :], H.T[jj[batch], :]).sum( + axis=1 + ) + + WH = sp.coo_matrix((dot_vals, (ii, jj)), shape=X.shape) + return WH.tocsr() + else: + return np.dot(W, H) + + +def _beta_loss_to_float(beta_loss): + """Convert string beta_loss to float.""" + beta_loss_map = {"frobenius": 2, "kullback-leibler": 1, "itakura-saito": 0} + if isinstance(beta_loss, str): + beta_loss = beta_loss_map[beta_loss] + return beta_loss + + +def _initialize_nmf(X, n_components, init=None, eps=1e-6, random_state=None): + """Algorithms for NMF initialization. + + Computes an initial guess for the non-negative + rank k matrix approximation for X: X = WH. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data matrix to be decomposed. + + n_components : int + The number of components desired in the approximation. + + init : {'random', 'nndsvd', 'nndsvda', 'nndsvdar'}, default=None + Method used to initialize the procedure. + Valid options: + + - None: 'nndsvda' if n_components <= min(n_samples, n_features), + otherwise 'random'. + + - 'random': non-negative random matrices, scaled with: + sqrt(X.mean() / n_components) + + - 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD) + initialization (better for sparseness) + + - 'nndsvda': NNDSVD with zeros filled with the average of X + (better when sparsity is not desired) + + - 'nndsvdar': NNDSVD with zeros filled with small random values + (generally faster, less accurate alternative to NNDSVDa + for when sparsity is not desired) + + - 'custom': use custom matrices W and H + + .. versionchanged:: 1.1 + When `init=None` and n_components is less than n_samples and n_features + defaults to `nndsvda` instead of `nndsvd`. + + eps : float, default=1e-6 + Truncate all values less then this in output to zero. + + random_state : int, RandomState instance or None, default=None + Used when ``init`` == 'nndsvdar' or 'random'. Pass an int for + reproducible results across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + W : array-like of shape (n_samples, n_components) + Initial guesses for solving X ~= WH. + + H : array-like of shape (n_components, n_features) + Initial guesses for solving X ~= WH. + + References + ---------- + C. Boutsidis, E. 
Gallopoulos: SVD based initialization: A head start for + nonnegative matrix factorization - Pattern Recognition, 2008 + http://tinyurl.com/nndsvd + """ + check_non_negative(X, "NMF initialization") + n_samples, n_features = X.shape + + if ( + init is not None + and init != "random" + and n_components > min(n_samples, n_features) + ): + raise ValueError( + "init = '{}' can only be used when " + "n_components <= min(n_samples, n_features)".format(init) + ) + + if init is None: + if n_components <= min(n_samples, n_features): + init = "nndsvda" + else: + init = "random" + + # Random initialization + if init == "random": + avg = np.sqrt(X.mean() / n_components) + rng = check_random_state(random_state) + H = avg * rng.standard_normal(size=(n_components, n_features)).astype( + X.dtype, copy=False + ) + W = avg * rng.standard_normal(size=(n_samples, n_components)).astype( + X.dtype, copy=False + ) + np.abs(H, out=H) + np.abs(W, out=W) + return W, H + + # NNDSVD initialization + U, S, V = _randomized_svd(X, n_components, random_state=random_state) + W = np.zeros_like(U) + H = np.zeros_like(V) + + # The leading singular triplet is non-negative + # so it can be used as is for initialization. + W[:, 0] = np.sqrt(S[0]) * np.abs(U[:, 0]) + H[0, :] = np.sqrt(S[0]) * np.abs(V[0, :]) + + for j in range(1, n_components): + x, y = U[:, j], V[j, :] + + # extract positive and negative parts of column vectors + x_p, y_p = np.maximum(x, 0), np.maximum(y, 0) + x_n, y_n = np.abs(np.minimum(x, 0)), np.abs(np.minimum(y, 0)) + + # and their norms + x_p_nrm, y_p_nrm = norm(x_p), norm(y_p) + x_n_nrm, y_n_nrm = norm(x_n), norm(y_n) + + m_p, m_n = x_p_nrm * y_p_nrm, x_n_nrm * y_n_nrm + + # choose update + if m_p > m_n: + u = x_p / x_p_nrm + v = y_p / y_p_nrm + sigma = m_p + else: + u = x_n / x_n_nrm + v = y_n / y_n_nrm + sigma = m_n + + lbd = np.sqrt(S[j] * sigma) + W[:, j] = lbd * u + H[j, :] = lbd * v + + W[W < eps] = 0 + H[H < eps] = 0 + + if init == "nndsvd": + pass + elif init == "nndsvda": + avg = X.mean() + W[W == 0] = avg + H[H == 0] = avg + elif init == "nndsvdar": + rng = check_random_state(random_state) + avg = X.mean() + W[W == 0] = abs(avg * rng.standard_normal(size=len(W[W == 0])) / 100) + H[H == 0] = abs(avg * rng.standard_normal(size=len(H[H == 0])) / 100) + else: + raise ValueError( + "Invalid init parameter: got %r instead of one of %r" + % (init, (None, "random", "nndsvd", "nndsvda", "nndsvdar")) + ) + + return W, H + + +def _update_coordinate_descent(X, W, Ht, l1_reg, l2_reg, shuffle, random_state): + """Helper function for _fit_coordinate_descent. + + Update W to minimize the objective function, iterating once over all + coordinates. By symmetry, to update H, one can call + _update_coordinate_descent(X.T, Ht, W, ...). + + """ + n_components = Ht.shape[1] + + HHt = np.dot(Ht.T, Ht) + XHt = safe_sparse_dot(X, Ht) + + # L2 regularization corresponds to increase of the diagonal of HHt + if l2_reg != 0.0: + # adds l2_reg only on the diagonal + HHt.flat[:: n_components + 1] += l2_reg + # L1 regularization corresponds to decrease of each element of XHt + if l1_reg != 0.0: + XHt -= l1_reg + + if shuffle: + permutation = random_state.permutation(n_components) + else: + permutation = np.arange(n_components) + # The following seems to be required on 64-bit Windows w/ Python 3.5. 
+ permutation = np.asarray(permutation, dtype=np.intp) + return _update_cdnmf_fast(W, HHt, XHt, permutation) + + +def _fit_coordinate_descent( + X, + W, + H, + tol=1e-4, + max_iter=200, + l1_reg_W=0, + l1_reg_H=0, + l2_reg_W=0, + l2_reg_H=0, + update_H=True, + verbose=0, + shuffle=False, + random_state=None, +): + """Compute Non-negative Matrix Factorization (NMF) with Coordinate Descent + + The objective function is minimized with an alternating minimization of W + and H. Each minimization is done with a cyclic (up to a permutation of the + features) Coordinate Descent. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Constant matrix. + + W : array-like of shape (n_samples, n_components) + Initial guess for the solution. + + H : array-like of shape (n_components, n_features) + Initial guess for the solution. + + tol : float, default=1e-4 + Tolerance of the stopping condition. + + max_iter : int, default=200 + Maximum number of iterations before timing out. + + l1_reg_W : float, default=0. + L1 regularization parameter for W. + + l1_reg_H : float, default=0. + L1 regularization parameter for H. + + l2_reg_W : float, default=0. + L2 regularization parameter for W. + + l2_reg_H : float, default=0. + L2 regularization parameter for H. + + update_H : bool, default=True + Set to True, both W and H will be estimated from initial guesses. + Set to False, only W will be estimated. + + verbose : int, default=0 + The verbosity level. + + shuffle : bool, default=False + If true, randomize the order of coordinates in the CD solver. + + random_state : int, RandomState instance or None, default=None + Used to randomize the coordinates in the CD solver, when + ``shuffle`` is set to ``True``. Pass an int for reproducible + results across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + W : ndarray of shape (n_samples, n_components) + Solution to the non-negative least squares problem. + + H : ndarray of shape (n_components, n_features) + Solution to the non-negative least squares problem. + + n_iter : int + The number of iterations done by the algorithm. + + References + ---------- + .. [1] :doi:`"Fast local algorithms for large scale nonnegative matrix and tensor + factorizations" <10.1587/transfun.E92.A.708>` + Cichocki, Andrzej, and P. H. A. N. Anh-Huy. IEICE transactions on fundamentals + of electronics, communications and computer sciences 92.3: 708-721, 2009. 
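The same coordinate-descent path is reachable through the public estimator; a minimal sketch with made-up non-negative data:

    import numpy as np
    from sklearn.decomposition import NMF

    X = np.abs(np.random.RandomState(0).standard_normal((6, 4)))  # toy non-negative matrix
    nmf = NMF(n_components=2, solver="cd", init="nndsvda", max_iter=500, random_state=0)
    W = nmf.fit_transform(X)          # (6, 2)
    H = nmf.components_               # (2, 4)
    print(np.linalg.norm(X - W @ H))  # Frobenius reconstruction error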
+ """ + # so W and Ht are both in C order in memory + Ht = check_array(H.T, order="C") + X = check_array(X, accept_sparse="csr") + + rng = check_random_state(random_state) + + for n_iter in range(1, max_iter + 1): + violation = 0.0 + + # Update W + violation += _update_coordinate_descent( + X, W, Ht, l1_reg_W, l2_reg_W, shuffle, rng + ) + # Update H + if update_H: + violation += _update_coordinate_descent( + X.T, Ht, W, l1_reg_H, l2_reg_H, shuffle, rng + ) + + if n_iter == 1: + violation_init = violation + + if violation_init == 0: + break + + if verbose: + print("violation:", violation / violation_init) + + if violation / violation_init <= tol: + if verbose: + print("Converged at iteration", n_iter + 1) + break + + return W, Ht.T, n_iter + + +def _multiplicative_update_w( + X, + W, + H, + beta_loss, + l1_reg_W, + l2_reg_W, + gamma, + H_sum=None, + HHt=None, + XHt=None, + update_H=True, +): + """Update W in Multiplicative Update NMF.""" + if beta_loss == 2: + # Numerator + if XHt is None: + XHt = safe_sparse_dot(X, H.T) + if update_H: + # avoid a copy of XHt, which will be re-computed (update_H=True) + numerator = XHt + else: + # preserve the XHt, which is not re-computed (update_H=False) + numerator = XHt.copy() + + # Denominator + if HHt is None: + HHt = np.dot(H, H.T) + denominator = np.dot(W, HHt) + + else: + # Numerator + # if X is sparse, compute WH only where X is non zero + WH_safe_X = _special_sparse_dot(W, H, X) + if sp.issparse(X): + WH_safe_X_data = WH_safe_X.data + X_data = X.data + else: + WH_safe_X_data = WH_safe_X + X_data = X + # copy used in the Denominator + WH = WH_safe_X.copy() + if beta_loss - 1.0 < 0: + WH[WH < EPSILON] = EPSILON + + # to avoid taking a negative power of zero + if beta_loss - 2.0 < 0: + WH_safe_X_data[WH_safe_X_data < EPSILON] = EPSILON + + if beta_loss == 1: + np.divide(X_data, WH_safe_X_data, out=WH_safe_X_data) + elif beta_loss == 0: + # speeds up computation time + # refer to /numpy/numpy/issues/9363 + WH_safe_X_data **= -1 + WH_safe_X_data **= 2 + # element-wise multiplication + WH_safe_X_data *= X_data + else: + WH_safe_X_data **= beta_loss - 2 + # element-wise multiplication + WH_safe_X_data *= X_data + + # here numerator = dot(X * (dot(W, H) ** (beta_loss - 2)), H.T) + numerator = safe_sparse_dot(WH_safe_X, H.T) + + # Denominator + if beta_loss == 1: + if H_sum is None: + H_sum = np.sum(H, axis=1) # shape(n_components, ) + denominator = H_sum[np.newaxis, :] + + else: + # computation of WHHt = dot(dot(W, H) ** beta_loss - 1, H.T) + if sp.issparse(X): + # memory efficient computation + # (compute row by row, avoiding the dense matrix WH) + WHHt = np.empty(W.shape) + for i in range(X.shape[0]): + WHi = np.dot(W[i, :], H) + if beta_loss - 1 < 0: + WHi[WHi < EPSILON] = EPSILON + WHi **= beta_loss - 1 + WHHt[i, :] = np.dot(WHi, H.T) + else: + WH **= beta_loss - 1 + WHHt = np.dot(WH, H.T) + denominator = WHHt + + # Add L1 and L2 regularization + if l1_reg_W > 0: + denominator += l1_reg_W + if l2_reg_W > 0: + denominator = denominator + l2_reg_W * W + denominator[denominator == 0] = EPSILON + + numerator /= denominator + delta_W = numerator + + # gamma is in ]0, 1] + if gamma != 1: + delta_W **= gamma + + W *= delta_W + + return W, H_sum, HHt, XHt + + +def _multiplicative_update_h( + X, W, H, beta_loss, l1_reg_H, l2_reg_H, gamma, A=None, B=None, rho=None +): + """update H in Multiplicative Update NMF.""" + if beta_loss == 2: + numerator = safe_sparse_dot(W.T, X) + denominator = np.linalg.multi_dot([W.T, W, H]) + + else: + # Numerator + WH_safe_X = 
_special_sparse_dot(W, H, X) + if sp.issparse(X): + WH_safe_X_data = WH_safe_X.data + X_data = X.data + else: + WH_safe_X_data = WH_safe_X + X_data = X + # copy used in the Denominator + WH = WH_safe_X.copy() + if beta_loss - 1.0 < 0: + WH[WH < EPSILON] = EPSILON + + # to avoid division by zero + if beta_loss - 2.0 < 0: + WH_safe_X_data[WH_safe_X_data < EPSILON] = EPSILON + + if beta_loss == 1: + np.divide(X_data, WH_safe_X_data, out=WH_safe_X_data) + elif beta_loss == 0: + # speeds up computation time + # refer to /numpy/numpy/issues/9363 + WH_safe_X_data **= -1 + WH_safe_X_data **= 2 + # element-wise multiplication + WH_safe_X_data *= X_data + else: + WH_safe_X_data **= beta_loss - 2 + # element-wise multiplication + WH_safe_X_data *= X_data + + # here numerator = dot(W.T, (dot(W, H) ** (beta_loss - 2)) * X) + numerator = safe_sparse_dot(W.T, WH_safe_X) + + # Denominator + if beta_loss == 1: + W_sum = np.sum(W, axis=0) # shape(n_components, ) + W_sum[W_sum == 0] = 1.0 + denominator = W_sum[:, np.newaxis] + + # beta_loss not in (1, 2) + else: + # computation of WtWH = dot(W.T, dot(W, H) ** beta_loss - 1) + if sp.issparse(X): + # memory efficient computation + # (compute column by column, avoiding the dense matrix WH) + WtWH = np.empty(H.shape) + for i in range(X.shape[1]): + WHi = np.dot(W, H[:, i]) + if beta_loss - 1 < 0: + WHi[WHi < EPSILON] = EPSILON + WHi **= beta_loss - 1 + WtWH[:, i] = np.dot(W.T, WHi) + else: + WH **= beta_loss - 1 + WtWH = np.dot(W.T, WH) + denominator = WtWH + + # Add L1 and L2 regularization + if l1_reg_H > 0: + denominator += l1_reg_H + if l2_reg_H > 0: + denominator = denominator + l2_reg_H * H + denominator[denominator == 0] = EPSILON + + if A is not None and B is not None: + # Updates for the online nmf + if gamma != 1: + H **= 1 / gamma + numerator *= H + A *= rho + B *= rho + A += numerator + B += denominator + H = A / B + + if gamma != 1: + H **= gamma + else: + delta_H = numerator + delta_H /= denominator + if gamma != 1: + delta_H **= gamma + H *= delta_H + + return H + + +def _fit_multiplicative_update( + X, + W, + H, + beta_loss="frobenius", + max_iter=200, + tol=1e-4, + l1_reg_W=0, + l1_reg_H=0, + l2_reg_W=0, + l2_reg_H=0, + update_H=True, + verbose=0, +): + """Compute Non-negative Matrix Factorization with Multiplicative Update. + + The objective function is _beta_divergence(X, WH) and is minimized with an + alternating minimization of W and H. Each minimization is done with a + Multiplicative Update. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Constant input matrix. + + W : array-like of shape (n_samples, n_components) + Initial guess for the solution. + + H : array-like of shape (n_components, n_features) + Initial guess for the solution. + + beta_loss : float or {'frobenius', 'kullback-leibler', \ + 'itakura-saito'}, default='frobenius' + String must be in {'frobenius', 'kullback-leibler', 'itakura-saito'}. + Beta divergence to be minimized, measuring the distance between X + and the dot product WH. Note that values different from 'frobenius' + (or 2) and 'kullback-leibler' (or 1) lead to significantly slower + fits. Note that for beta_loss <= 0 (or 'itakura-saito'), the input + matrix X cannot contain zeros. + + max_iter : int, default=200 + Number of iterations. + + tol : float, default=1e-4 + Tolerance of the stopping condition. + + l1_reg_W : float, default=0. + L1 regularization parameter for W. + + l1_reg_H : float, default=0. + L1 regularization parameter for H. + + l2_reg_W : float, default=0. 
+ L2 regularization parameter for W. + + l2_reg_H : float, default=0. + L2 regularization parameter for H. + + update_H : bool, default=True + Set to True, both W and H will be estimated from initial guesses. + Set to False, only W will be estimated. + + verbose : int, default=0 + The verbosity level. + + Returns + ------- + W : ndarray of shape (n_samples, n_components) + Solution to the non-negative least squares problem. + + H : ndarray of shape (n_components, n_features) + Solution to the non-negative least squares problem. + + n_iter : int + The number of iterations done by the algorithm. + + References + ---------- + Lee, D. D., & Seung, H., S. (2001). Algorithms for Non-negative Matrix + Factorization. Adv. Neural Inform. Process. Syst.. 13. + Fevotte, C., & Idier, J. (2011). Algorithms for nonnegative matrix + factorization with the beta-divergence. Neural Computation, 23(9). + """ + start_time = time.time() + + beta_loss = _beta_loss_to_float(beta_loss) + + # gamma for Maximization-Minimization (MM) algorithm [Fevotte 2011] + if beta_loss < 1: + gamma = 1.0 / (2.0 - beta_loss) + elif beta_loss > 2: + gamma = 1.0 / (beta_loss - 1.0) + else: + gamma = 1.0 + + # used for the convergence criterion + error_at_init = _beta_divergence(X, W, H, beta_loss, square_root=True) + previous_error = error_at_init + + H_sum, HHt, XHt = None, None, None + for n_iter in range(1, max_iter + 1): + # update W + # H_sum, HHt and XHt are saved and reused if not update_H + W, H_sum, HHt, XHt = _multiplicative_update_w( + X, + W, + H, + beta_loss=beta_loss, + l1_reg_W=l1_reg_W, + l2_reg_W=l2_reg_W, + gamma=gamma, + H_sum=H_sum, + HHt=HHt, + XHt=XHt, + update_H=update_H, + ) + + # necessary for stability with beta_loss < 1 + if beta_loss < 1: + W[W < np.finfo(np.float64).eps] = 0.0 + + # update H (only at fit or fit_transform) + if update_H: + H = _multiplicative_update_h( + X, + W, + H, + beta_loss=beta_loss, + l1_reg_H=l1_reg_H, + l2_reg_H=l2_reg_H, + gamma=gamma, + ) + + # These values will be recomputed since H changed + H_sum, HHt, XHt = None, None, None + + # necessary for stability with beta_loss < 1 + if beta_loss <= 1: + H[H < np.finfo(np.float64).eps] = 0.0 + + # test convergence criterion every 10 iterations + if tol > 0 and n_iter % 10 == 0: + error = _beta_divergence(X, W, H, beta_loss, square_root=True) + + if verbose: + iter_time = time.time() + print( + "Epoch %02d reached after %.3f seconds, error: %f" + % (n_iter, iter_time - start_time, error) + ) + + if (previous_error - error) / error_at_init < tol: + break + previous_error = error + + # do not print if we have already printed in the convergence test + if verbose and (tol == 0 or n_iter % 10 != 0): + end_time = time.time() + print( + "Epoch %02d reached after %.3f seconds." % (n_iter, end_time - start_time) + ) + + return W, H, n_iter + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "W": ["array-like", None], + "H": ["array-like", None], + "update_H": ["boolean"], + }, + prefer_skip_nested_validation=False, +) +def non_negative_factorization( + X, + W=None, + H=None, + n_components="auto", + *, + init=None, + update_H=True, + solver="cd", + beta_loss="frobenius", + tol=1e-4, + max_iter=200, + alpha_W=0.0, + alpha_H="same", + l1_ratio=0.0, + random_state=None, + verbose=0, + shuffle=False, +): + """Compute Non-negative Matrix Factorization (NMF). + + Find two non-negative matrices (W, H) whose product approximates the non- + negative matrix X. 
This factorization can be used for example for + dimensionality reduction, source separation or topic extraction. + + The objective function is: + + .. math:: + + L(W, H) &= 0.5 * ||X - WH||_{loss}^2 + + &+ alpha\\_W * l1\\_ratio * n\\_features * ||vec(W)||_1 + + &+ alpha\\_H * l1\\_ratio * n\\_samples * ||vec(H)||_1 + + &+ 0.5 * alpha\\_W * (1 - l1\\_ratio) * n\\_features * ||W||_{Fro}^2 + + &+ 0.5 * alpha\\_H * (1 - l1\\_ratio) * n\\_samples * ||H||_{Fro}^2, + + where :math:`||A||_{Fro}^2 = \\sum_{i,j} A_{ij}^2` (Frobenius norm) and + :math:`||vec(A)||_1 = \\sum_{i,j} abs(A_{ij})` (Elementwise L1 norm) + + The generic norm :math:`||X - WH||_{loss}^2` may represent + the Frobenius norm or another supported beta-divergence loss. + The choice between options is controlled by the `beta_loss` parameter. + + The regularization terms are scaled by `n_features` for `W` and by `n_samples` for + `H` to keep their impact balanced with respect to one another and to the data fit + term as independent as possible of the size `n_samples` of the training set. + + The objective function is minimized with an alternating minimization of W + and H. If H is given and update_H=False, it solves for W only. + + Note that the transformed data is named W and the components matrix is named H. In + the NMF literature, the naming convention is usually the opposite since the data + matrix X is transposed. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Constant matrix. + + W : array-like of shape (n_samples, n_components), default=None + If `init='custom'`, it is used as initial guess for the solution. + If `update_H=False`, it is initialised as an array of zeros, unless + `solver='mu'`, then it is filled with values calculated by + `np.sqrt(X.mean() / self._n_components)`. + If `None`, uses the initialisation method specified in `init`. + + H : array-like of shape (n_components, n_features), default=None + If `init='custom'`, it is used as initial guess for the solution. + If `update_H=False`, it is used as a constant, to solve for W only. + If `None`, uses the initialisation method specified in `init`. + + n_components : int or {'auto'} or None, default='auto' + Number of components. If `None`, all features are kept. + If `n_components='auto'`, the number of components is automatically inferred + from `W` or `H` shapes. + + .. versionchanged:: 1.4 + Added `'auto'` value. + + .. versionchanged:: 1.6 + Default value changed from `None` to `'auto'`. + + init : {'random', 'nndsvd', 'nndsvda', 'nndsvdar', 'custom'}, default=None + Method used to initialize the procedure. + + Valid options: + + - None: 'nndsvda' if n_components < n_features, otherwise 'random'. + - 'random': non-negative random matrices, scaled with: + `sqrt(X.mean() / n_components)` + - 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD) + initialization (better for sparseness) + - 'nndsvda': NNDSVD with zeros filled with the average of X + (better when sparsity is not desired) + - 'nndsvdar': NNDSVD with zeros filled with small random values + (generally faster, less accurate alternative to NNDSVDa + for when sparsity is not desired) + - 'custom': If `update_H=True`, use custom matrices W and H which must both + be provided. If `update_H=False`, then only custom matrix H is used. + + .. versionchanged:: 0.23 + The default value of `init` changed from 'random' to None in 0.23. + + .. 
versionchanged:: 1.1 + When `init=None` and n_components is less than n_samples and n_features + defaults to `nndsvda` instead of `nndsvd`. + + update_H : bool, default=True + Set to True, both W and H will be estimated from initial guesses. + Set to False, only W will be estimated. + + solver : {'cd', 'mu'}, default='cd' + Numerical solver to use: + + - 'cd' is a Coordinate Descent solver that uses Fast Hierarchical + Alternating Least Squares (Fast HALS). + - 'mu' is a Multiplicative Update solver. + + .. versionadded:: 0.17 + Coordinate Descent solver. + + .. versionadded:: 0.19 + Multiplicative Update solver. + + beta_loss : float or {'frobenius', 'kullback-leibler', \ + 'itakura-saito'}, default='frobenius' + Beta divergence to be minimized, measuring the distance between X + and the dot product WH. Note that values different from 'frobenius' + (or 2) and 'kullback-leibler' (or 1) lead to significantly slower + fits. Note that for beta_loss <= 0 (or 'itakura-saito'), the input + matrix X cannot contain zeros. Used only in 'mu' solver. + + .. versionadded:: 0.19 + + tol : float, default=1e-4 + Tolerance of the stopping condition. + + max_iter : int, default=200 + Maximum number of iterations before timing out. + + alpha_W : float, default=0.0 + Constant that multiplies the regularization terms of `W`. Set it to zero + (default) to have no regularization on `W`. + + .. versionadded:: 1.0 + + alpha_H : float or "same", default="same" + Constant that multiplies the regularization terms of `H`. Set it to zero to + have no regularization on `H`. If "same" (default), it takes the same value as + `alpha_W`. + + .. versionadded:: 1.0 + + l1_ratio : float, default=0.0 + The regularization mixing parameter, with 0 <= l1_ratio <= 1. + For l1_ratio = 0 the penalty is an elementwise L2 penalty + (aka Frobenius Norm). + For l1_ratio = 1 it is an elementwise L1 penalty. + For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2. + + random_state : int, RandomState instance or None, default=None + Used for NMF initialisation (when ``init`` == 'nndsvdar' or + 'random'), and in Coordinate Descent. Pass an int for reproducible + results across multiple function calls. + See :term:`Glossary `. + + verbose : int, default=0 + The verbosity level. + + shuffle : bool, default=False + If true, randomize the order of coordinates in the CD solver. + + Returns + ------- + W : ndarray of shape (n_samples, n_components) + Solution to the non-negative least squares problem. + + H : ndarray of shape (n_components, n_features) + Solution to the non-negative least squares problem. + + n_iter : int + Actual number of iterations. + + References + ---------- + .. [1] :doi:`"Fast local algorithms for large scale nonnegative matrix and tensor + factorizations" <10.1587/transfun.E92.A.708>` + Cichocki, Andrzej, and P. H. A. N. Anh-Huy. IEICE transactions on fundamentals + of electronics, communications and computer sciences 92.3: 708-721, 2009. + + .. [2] :doi:`"Algorithms for nonnegative matrix factorization with the + beta-divergence" <10.1162/NECO_a_00168>` + Fevotte, C., & Idier, J. (2011). Neural Computation, 23(9). + + Examples + -------- + >>> import numpy as np + >>> X = np.array([[1,1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]]) + >>> from sklearn.decomposition import non_negative_factorization + >>> W, H, n_iter = non_negative_factorization( + ... 
X, n_components=2, init='random', random_state=0) + """ + est = NMF( + n_components=n_components, + init=init, + solver=solver, + beta_loss=beta_loss, + tol=tol, + max_iter=max_iter, + random_state=random_state, + alpha_W=alpha_W, + alpha_H=alpha_H, + l1_ratio=l1_ratio, + verbose=verbose, + shuffle=shuffle, + ) + est._validate_params() + + X = check_array(X, accept_sparse=("csr", "csc"), dtype=[np.float64, np.float32]) + + with config_context(assume_finite=True): + W, H, n_iter = est._fit_transform(X, W=W, H=H, update_H=update_H) + + return W, H, n_iter + + +class _BaseNMF(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator, ABC): + """Base class for NMF and MiniBatchNMF.""" + + _parameter_constraints: dict = { + "n_components": [ + Interval(Integral, 1, None, closed="left"), + None, + StrOptions({"auto"}), + ], + "init": [ + StrOptions({"random", "nndsvd", "nndsvda", "nndsvdar", "custom"}), + None, + ], + "beta_loss": [ + StrOptions({"frobenius", "kullback-leibler", "itakura-saito"}), + Real, + ], + "tol": [Interval(Real, 0, None, closed="left")], + "max_iter": [Interval(Integral, 1, None, closed="left")], + "random_state": ["random_state"], + "alpha_W": [Interval(Real, 0, None, closed="left")], + "alpha_H": [Interval(Real, 0, None, closed="left"), StrOptions({"same"})], + "l1_ratio": [Interval(Real, 0, 1, closed="both")], + "verbose": ["verbose"], + } + + def __init__( + self, + n_components="auto", + *, + init=None, + beta_loss="frobenius", + tol=1e-4, + max_iter=200, + random_state=None, + alpha_W=0.0, + alpha_H="same", + l1_ratio=0.0, + verbose=0, + ): + self.n_components = n_components + self.init = init + self.beta_loss = beta_loss + self.tol = tol + self.max_iter = max_iter + self.random_state = random_state + self.alpha_W = alpha_W + self.alpha_H = alpha_H + self.l1_ratio = l1_ratio + self.verbose = verbose + + def _check_params(self, X): + # n_components + self._n_components = self.n_components + if self._n_components is None: + self._n_components = X.shape[1] + + # beta_loss + self._beta_loss = _beta_loss_to_float(self.beta_loss) + + def _check_w_h(self, X, W, H, update_H): + """Check W and H, or initialize them.""" + n_samples, n_features = X.shape + + if self.init == "custom" and update_H: + _check_init(H, (self._n_components, n_features), "NMF (input H)") + _check_init(W, (n_samples, self._n_components), "NMF (input W)") + if self._n_components == "auto": + self._n_components = H.shape[0] + + if H.dtype != X.dtype or W.dtype != X.dtype: + raise TypeError( + "H and W should have the same dtype as X. Got " + "H.dtype = {} and W.dtype = {}.".format(H.dtype, W.dtype) + ) + + elif not update_H: + if W is not None: + warnings.warn( + "When update_H=False, the provided initial W is not used.", + RuntimeWarning, + ) + + _check_init(H, (self._n_components, n_features), "NMF (input H)") + if self._n_components == "auto": + self._n_components = H.shape[0] + + if H.dtype != X.dtype: + raise TypeError( + "H should have the same dtype as X. Got H.dtype = {}.".format( + H.dtype + ) + ) + + # 'mu' solver should not be initialized by zeros + if self.solver == "mu": + avg = np.sqrt(X.mean() / self._n_components) + W = np.full((n_samples, self._n_components), avg, dtype=X.dtype) + else: + W = np.zeros((n_samples, self._n_components), dtype=X.dtype) + + else: + if W is not None or H is not None: + warnings.warn( + ( + "When init!='custom', provided W or H are ignored. Set " + " init='custom' to use them as initialization." 
+ ), + RuntimeWarning, + ) + + if self._n_components == "auto": + self._n_components = X.shape[1] + + W, H = _initialize_nmf( + X, self._n_components, init=self.init, random_state=self.random_state + ) + + return W, H + + def _compute_regularization(self, X): + """Compute scaled regularization terms.""" + n_samples, n_features = X.shape + alpha_W = self.alpha_W + alpha_H = self.alpha_W if self.alpha_H == "same" else self.alpha_H + + l1_reg_W = n_features * alpha_W * self.l1_ratio + l1_reg_H = n_samples * alpha_H * self.l1_ratio + l2_reg_W = n_features * alpha_W * (1.0 - self.l1_ratio) + l2_reg_H = n_samples * alpha_H * (1.0 - self.l1_ratio) + + return l1_reg_W, l1_reg_H, l2_reg_W, l2_reg_H + + def fit(self, X, y=None, **params): + """Learn a NMF model for the data X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : Ignored + Not used, present for API consistency by convention. + + **params : kwargs + Parameters (keyword arguments) and values passed to + the fit_transform instance. + + Returns + ------- + self : object + Returns the instance itself. + """ + # param validation is done in fit_transform + + self.fit_transform(X, **params) + return self + + def inverse_transform(self, X): + """Transform data back to its original space. + + .. versionadded:: 0.18 + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples, n_components) + Transformed data matrix. + + Returns + ------- + X_original : ndarray of shape (n_samples, n_features) + Returns a data matrix of the original shape. + """ + + check_is_fitted(self) + return X @ self.components_ + + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.positive_only = True + tags.input_tags.sparse = True + tags.transformer_tags.preserves_dtype = ["float64", "float32"] + return tags + + +class NMF(_BaseNMF): + """Non-Negative Matrix Factorization (NMF). + + Find two non-negative matrices, i.e. matrices with all non-negative elements, (W, H) + whose product approximates the non-negative matrix X. This factorization can be used + for example for dimensionality reduction, source separation or topic extraction. + + The objective function is: + + .. math:: + + L(W, H) &= 0.5 * ||X - WH||_{loss}^2 + + &+ alpha\\_W * l1\\_ratio * n\\_features * ||vec(W)||_1 + + &+ alpha\\_H * l1\\_ratio * n\\_samples * ||vec(H)||_1 + + &+ 0.5 * alpha\\_W * (1 - l1\\_ratio) * n\\_features * ||W||_{Fro}^2 + + &+ 0.5 * alpha\\_H * (1 - l1\\_ratio) * n\\_samples * ||H||_{Fro}^2, + + where :math:`||A||_{Fro}^2 = \\sum_{i,j} A_{ij}^2` (Frobenius norm) and + :math:`||vec(A)||_1 = \\sum_{i,j} abs(A_{ij})` (Elementwise L1 norm). + + The generic norm :math:`||X - WH||_{loss}` may represent + the Frobenius norm or another supported beta-divergence loss. + The choice between options is controlled by the `beta_loss` parameter. + + The regularization terms are scaled by `n_features` for `W` and by `n_samples` for + `H` to keep their impact balanced with respect to one another and to the data fit + term as independent as possible of the size `n_samples` of the training set. + + The objective function is minimized with an alternating minimization of W + and H. + + Note that the transformed data is named W and the components matrix is named H. 
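To make the scaling performed by `_compute_regularization` above concrete, here is a tiny standalone sketch; the sample and feature counts are invented purely for illustration.

# Hypothetical sizes and penalties, mirroring _compute_regularization above.
n_samples, n_features = 100, 20
alpha_W, alpha_H, l1_ratio = 0.1, "same", 0.5

alpha_H = alpha_W if alpha_H == "same" else alpha_H
l1_reg_W = n_features * alpha_W * l1_ratio          # 20 * 0.1 * 0.5 = 1.0
l1_reg_H = n_samples * alpha_H * l1_ratio           # 100 * 0.1 * 0.5 = 5.0
l2_reg_W = n_features * alpha_W * (1.0 - l1_ratio)  # 1.0
l2_reg_H = n_samples * alpha_H * (1.0 - l1_ratio)   # 5.0

Scaling the W penalties by n_features and the H penalties by n_samples is what keeps the two penalty terms comparable to each other and to the data-fit term, as stated in the class docstrings.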
In + the NMF literature, the naming convention is usually the opposite since the data + matrix X is transposed. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int or {'auto'} or None, default='auto' + Number of components. If `None`, all features are kept. + If `n_components='auto'`, the number of components is automatically inferred + from W or H shapes. + + .. versionchanged:: 1.4 + Added `'auto'` value. + + .. versionchanged:: 1.6 + Default value changed from `None` to `'auto'`. + + init : {'random', 'nndsvd', 'nndsvda', 'nndsvdar', 'custom'}, default=None + Method used to initialize the procedure. + Valid options: + + - `None`: 'nndsvda' if n_components <= min(n_samples, n_features), + otherwise random. + + - `'random'`: non-negative random matrices, scaled with: + `sqrt(X.mean() / n_components)` + + - `'nndsvd'`: Nonnegative Double Singular Value Decomposition (NNDSVD) + initialization (better for sparseness) + + - `'nndsvda'`: NNDSVD with zeros filled with the average of X + (better when sparsity is not desired) + + - `'nndsvdar'` NNDSVD with zeros filled with small random values + (generally faster, less accurate alternative to NNDSVDa + for when sparsity is not desired) + + - `'custom'`: Use custom matrices `W` and `H` which must both be provided. + + .. versionchanged:: 1.1 + When `init=None` and n_components is less than n_samples and n_features + defaults to `nndsvda` instead of `nndsvd`. + + solver : {'cd', 'mu'}, default='cd' + Numerical solver to use: + + - 'cd' is a Coordinate Descent solver. + - 'mu' is a Multiplicative Update solver. + + .. versionadded:: 0.17 + Coordinate Descent solver. + + .. versionadded:: 0.19 + Multiplicative Update solver. + + beta_loss : float or {'frobenius', 'kullback-leibler', \ + 'itakura-saito'}, default='frobenius' + Beta divergence to be minimized, measuring the distance between X + and the dot product WH. Note that values different from 'frobenius' + (or 2) and 'kullback-leibler' (or 1) lead to significantly slower + fits. Note that for beta_loss <= 0 (or 'itakura-saito'), the input + matrix X cannot contain zeros. Used only in 'mu' solver. + + .. versionadded:: 0.19 + + tol : float, default=1e-4 + Tolerance of the stopping condition. + + max_iter : int, default=200 + Maximum number of iterations before timing out. + + random_state : int, RandomState instance or None, default=None + Used for initialisation (when ``init`` == 'nndsvdar' or + 'random'), and in Coordinate Descent. Pass an int for reproducible + results across multiple function calls. + See :term:`Glossary `. + + alpha_W : float, default=0.0 + Constant that multiplies the regularization terms of `W`. Set it to zero + (default) to have no regularization on `W`. + + .. versionadded:: 1.0 + + alpha_H : float or "same", default="same" + Constant that multiplies the regularization terms of `H`. Set it to zero to + have no regularization on `H`. If "same" (default), it takes the same value as + `alpha_W`. + + .. versionadded:: 1.0 + + l1_ratio : float, default=0.0 + The regularization mixing parameter, with 0 <= l1_ratio <= 1. + For l1_ratio = 0 the penalty is an elementwise L2 penalty + (aka Frobenius Norm). + For l1_ratio = 1 it is an elementwise L1 penalty. + For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2. + + .. versionadded:: 0.17 + Regularization parameter *l1_ratio* used in the Coordinate Descent + solver. + + verbose : int, default=0 + Whether to be verbose. 
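As the `solver` and `beta_loss` entries above indicate, only the 'mu' solver supports divergences other than Frobenius. A short hedged usage sketch, reusing the same toy array as the docstring examples:

import numpy as np
from sklearn.decomposition import NMF

X = np.array([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]])
model = NMF(n_components=2, solver="mu", beta_loss="kullback-leibler",
            init="nndsvda", max_iter=500, random_state=0)
W = model.fit_transform(X)   # minimizes the KL divergence between X and W @ H
H = model.components_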
+ + shuffle : bool, default=False + If true, randomize the order of coordinates in the CD solver. + + .. versionadded:: 0.17 + *shuffle* parameter used in the Coordinate Descent solver. + + Attributes + ---------- + components_ : ndarray of shape (n_components, n_features) + Factorization matrix, sometimes called 'dictionary'. + + n_components_ : int + The number of components. It is same as the `n_components` parameter + if it was given. Otherwise, it will be same as the number of + features. + + reconstruction_err_ : float + Frobenius norm of the matrix difference, or beta-divergence, between + the training data ``X`` and the reconstructed data ``WH`` from + the fitted model. + + n_iter_ : int + Actual number of iterations. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + DictionaryLearning : Find a dictionary that sparsely encodes data. + MiniBatchSparsePCA : Mini-batch Sparse Principal Components Analysis. + PCA : Principal component analysis. + SparseCoder : Find a sparse representation of data from a fixed, + precomputed dictionary. + SparsePCA : Sparse Principal Components Analysis. + TruncatedSVD : Dimensionality reduction using truncated SVD. + + References + ---------- + .. [1] :doi:`"Fast local algorithms for large scale nonnegative matrix and tensor + factorizations" <10.1587/transfun.E92.A.708>` + Cichocki, Andrzej, and P. H. A. N. Anh-Huy. IEICE transactions on fundamentals + of electronics, communications and computer sciences 92.3: 708-721, 2009. + + .. [2] :doi:`"Algorithms for nonnegative matrix factorization with the + beta-divergence" <10.1162/NECO_a_00168>` + Fevotte, C., & Idier, J. (2011). Neural Computation, 23(9). + + Examples + -------- + >>> import numpy as np + >>> X = np.array([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]]) + >>> from sklearn.decomposition import NMF + >>> model = NMF(n_components=2, init='random', random_state=0) + >>> W = model.fit_transform(X) + >>> H = model.components_ + """ + + _parameter_constraints: dict = { + **_BaseNMF._parameter_constraints, + "solver": [StrOptions({"mu", "cd"})], + "shuffle": ["boolean"], + } + + def __init__( + self, + n_components="auto", + *, + init=None, + solver="cd", + beta_loss="frobenius", + tol=1e-4, + max_iter=200, + random_state=None, + alpha_W=0.0, + alpha_H="same", + l1_ratio=0.0, + verbose=0, + shuffle=False, + ): + super().__init__( + n_components=n_components, + init=init, + beta_loss=beta_loss, + tol=tol, + max_iter=max_iter, + random_state=random_state, + alpha_W=alpha_W, + alpha_H=alpha_H, + l1_ratio=l1_ratio, + verbose=verbose, + ) + + self.solver = solver + self.shuffle = shuffle + + def _check_params(self, X): + super()._check_params(X) + + # solver + if self.solver != "mu" and self.beta_loss not in (2, "frobenius"): + # 'mu' is the only solver that handles other beta losses than 'frobenius' + raise ValueError( + f"Invalid beta_loss parameter: solver {self.solver!r} does not handle " + f"beta_loss = {self.beta_loss!r}" + ) + if self.solver == "mu" and self.init == "nndsvd": + warnings.warn( + ( + "The multiplicative update ('mu') solver cannot update " + "zeros present in the initialization, and so leads to " + "poorer results when used jointly with init='nndsvd'. 
" + "You may try init='nndsvda' or init='nndsvdar' instead." + ), + UserWarning, + ) + + return self + + @_fit_context(prefer_skip_nested_validation=True) + def fit_transform(self, X, y=None, W=None, H=None): + """Learn a NMF model for the data X and returns the transformed data. + + This is more efficient than calling fit followed by transform. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : Ignored + Not used, present for API consistency by convention. + + W : array-like of shape (n_samples, n_components), default=None + If `init='custom'`, it is used as initial guess for the solution. + If `None`, uses the initialisation method specified in `init`. + + H : array-like of shape (n_components, n_features), default=None + If `init='custom'`, it is used as initial guess for the solution. + If `None`, uses the initialisation method specified in `init`. + + Returns + ------- + W : ndarray of shape (n_samples, n_components) + Transformed data. + """ + X = validate_data( + self, X, accept_sparse=("csr", "csc"), dtype=[np.float64, np.float32] + ) + + with config_context(assume_finite=True): + W, H, n_iter = self._fit_transform(X, W=W, H=H) + + self.reconstruction_err_ = _beta_divergence( + X, W, H, self._beta_loss, square_root=True + ) + + self.n_components_ = H.shape[0] + self.components_ = H + self.n_iter_ = n_iter + + return W + + def _fit_transform(self, X, y=None, W=None, H=None, update_H=True): + """Learn a NMF model for the data X and returns the transformed data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Data matrix to be decomposed + + y : Ignored + + W : array-like of shape (n_samples, n_components), default=None + If `init='custom'`, it is used as initial guess for the solution. + If `update_H=False`, it is initialised as an array of zeros, unless + `solver='mu'`, then it is filled with values calculated by + `np.sqrt(X.mean() / self._n_components)`. + If `None`, uses the initialisation method specified in `init`. + + H : array-like of shape (n_components, n_features), default=None + If `init='custom'`, it is used as initial guess for the solution. + If `update_H=False`, it is used as a constant, to solve for W only. + If `None`, uses the initialisation method specified in `init`. + + update_H : bool, default=True + If True, both W and H will be estimated from initial guesses, + this corresponds to a call to the 'fit_transform' method. + If False, only W will be estimated, this corresponds to a call + to the 'transform' method. + + Returns + ------- + W : ndarray of shape (n_samples, n_components) + Transformed data. + + H : ndarray of shape (n_components, n_features) + Factorization matrix, sometimes called 'dictionary'. + + n_iter_ : int + Actual number of iterations. + """ + # check parameters + self._check_params(X) + + if X.min() == 0 and self._beta_loss <= 0: + raise ValueError( + "When beta_loss <= 0 and X contains zeros, " + "the solver may diverge. Please add small values " + "to X, or use a positive beta_loss." 
+ ) + + # initialize or check W and H + W, H = self._check_w_h(X, W, H, update_H) + + # scale the regularization terms + l1_reg_W, l1_reg_H, l2_reg_W, l2_reg_H = self._compute_regularization(X) + + if self.solver == "cd": + W, H, n_iter = _fit_coordinate_descent( + X, + W, + H, + self.tol, + self.max_iter, + l1_reg_W, + l1_reg_H, + l2_reg_W, + l2_reg_H, + update_H=update_H, + verbose=self.verbose, + shuffle=self.shuffle, + random_state=self.random_state, + ) + elif self.solver == "mu": + W, H, n_iter, *_ = _fit_multiplicative_update( + X, + W, + H, + self._beta_loss, + self.max_iter, + self.tol, + l1_reg_W, + l1_reg_H, + l2_reg_W, + l2_reg_H, + update_H, + self.verbose, + ) + else: + raise ValueError("Invalid solver parameter '%s'." % self.solver) + + if n_iter == self.max_iter and self.tol > 0: + warnings.warn( + "Maximum number of iterations %d reached. Increase " + "it to improve convergence." % self.max_iter, + ConvergenceWarning, + ) + + return W, H, n_iter + + def transform(self, X): + """Transform the data X according to the fitted NMF model. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples + and `n_features` is the number of features. + + Returns + ------- + W : ndarray of shape (n_samples, n_components) + Transformed data. + """ + check_is_fitted(self) + X = validate_data( + self, + X, + accept_sparse=("csr", "csc"), + dtype=[np.float64, np.float32], + reset=False, + ensure_non_negative=True, + ) + + with config_context(assume_finite=True): + W, *_ = self._fit_transform(X, H=self.components_, update_H=False) + + return W + + +class MiniBatchNMF(_BaseNMF): + """Mini-Batch Non-Negative Matrix Factorization (NMF). + + .. versionadded:: 1.1 + + Find two non-negative matrices, i.e. matrices with all non-negative elements, + (`W`, `H`) whose product approximates the non-negative matrix `X`. This + factorization can be used for example for dimensionality reduction, source + separation or topic extraction. + + The objective function is: + + .. math:: + + L(W, H) &= 0.5 * ||X - WH||_{loss}^2 + + &+ alpha\\_W * l1\\_ratio * n\\_features * ||vec(W)||_1 + + &+ alpha\\_H * l1\\_ratio * n\\_samples * ||vec(H)||_1 + + &+ 0.5 * alpha\\_W * (1 - l1\\_ratio) * n\\_features * ||W||_{Fro}^2 + + &+ 0.5 * alpha\\_H * (1 - l1\\_ratio) * n\\_samples * ||H||_{Fro}^2, + + where :math:`||A||_{Fro}^2 = \\sum_{i,j} A_{ij}^2` (Frobenius norm) and + :math:`||vec(A)||_1 = \\sum_{i,j} abs(A_{ij})` (Elementwise L1 norm). + + The generic norm :math:`||X - WH||_{loss}^2` may represent + the Frobenius norm or another supported beta-divergence loss. + The choice between options is controlled by the `beta_loss` parameter. + + The objective function is minimized with an alternating minimization of `W` + and `H`. + + Note that the transformed data is named `W` and the components matrix is + named `H`. In the NMF literature, the naming convention is usually the opposite + since the data matrix `X` is transposed. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int or {'auto'} or None, default='auto' + Number of components. If `None`, all features are kept. + If `n_components='auto'`, the number of components is automatically inferred + from W or H shapes. + + .. versionchanged:: 1.4 + Added `'auto'` value. + + .. versionchanged:: 1.6 + Default value changed from `None` to `'auto'`. 
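The `transform` method above reuses the fitted `components_` and re-runs the solver with `update_H=False`, i.e. it only solves for W on new data. A hedged sketch with arbitrary random matrices:

import numpy as np
from sklearn.decomposition import NMF

rng = np.random.default_rng(0)
X_train = np.abs(rng.standard_normal((50, 8)))
X_new = np.abs(rng.standard_normal((5, 8)))

nmf = NMF(n_components=3, init="nndsvda", max_iter=400, random_state=0)
W_train = nmf.fit_transform(X_train)  # estimates both W and H
W_new = nmf.transform(X_new)          # estimates W only, H fixed to nmf.components_
X_approx = W_new @ nmf.components_    # equivalent to nmf.inverse_transform(W_new)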
+ + init : {'random', 'nndsvd', 'nndsvda', 'nndsvdar', 'custom'}, default=None + Method used to initialize the procedure. + Valid options: + + - `None`: 'nndsvda' if `n_components <= min(n_samples, n_features)`, + otherwise random. + + - `'random'`: non-negative random matrices, scaled with: + `sqrt(X.mean() / n_components)` + + - `'nndsvd'`: Nonnegative Double Singular Value Decomposition (NNDSVD) + initialization (better for sparseness). + + - `'nndsvda'`: NNDSVD with zeros filled with the average of X + (better when sparsity is not desired). + + - `'nndsvdar'` NNDSVD with zeros filled with small random values + (generally faster, less accurate alternative to NNDSVDa + for when sparsity is not desired). + + - `'custom'`: Use custom matrices `W` and `H` which must both be provided. + + batch_size : int, default=1024 + Number of samples in each mini-batch. Large batch sizes + give better long-term convergence at the cost of a slower start. + + beta_loss : float or {'frobenius', 'kullback-leibler', \ + 'itakura-saito'}, default='frobenius' + Beta divergence to be minimized, measuring the distance between `X` + and the dot product `WH`. Note that values different from 'frobenius' + (or 2) and 'kullback-leibler' (or 1) lead to significantly slower + fits. Note that for `beta_loss <= 0` (or 'itakura-saito'), the input + matrix `X` cannot contain zeros. + + tol : float, default=1e-4 + Control early stopping based on the norm of the differences in `H` + between 2 steps. To disable early stopping based on changes in `H`, set + `tol` to 0.0. + + max_no_improvement : int, default=10 + Control early stopping based on the consecutive number of mini batches + that does not yield an improvement on the smoothed cost function. + To disable convergence detection based on cost function, set + `max_no_improvement` to None. + + max_iter : int, default=200 + Maximum number of iterations over the complete dataset before + timing out. + + alpha_W : float, default=0.0 + Constant that multiplies the regularization terms of `W`. Set it to zero + (default) to have no regularization on `W`. + + alpha_H : float or "same", default="same" + Constant that multiplies the regularization terms of `H`. Set it to zero to + have no regularization on `H`. If "same" (default), it takes the same value as + `alpha_W`. + + l1_ratio : float, default=0.0 + The regularization mixing parameter, with 0 <= l1_ratio <= 1. + For l1_ratio = 0 the penalty is an elementwise L2 penalty + (aka Frobenius Norm). + For l1_ratio = 1 it is an elementwise L1 penalty. + For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2. + + forget_factor : float, default=0.7 + Amount of rescaling of past information. Its value could be 1 with + finite datasets. Choosing values < 1 is recommended with online + learning as more recent batches will weight more than past batches. + + fresh_restarts : bool, default=False + Whether to completely solve for W at each step. Doing fresh restarts will likely + lead to a better solution for a same number of iterations but it is much slower. + + fresh_restarts_max_iter : int, default=30 + Maximum number of iterations when solving for W at each step. Only used when + doing fresh restarts. These iterations may be stopped early based on a small + change of W controlled by `tol`. + + transform_max_iter : int, default=None + Maximum number of iterations when solving for W at transform time. + If None, it defaults to `max_iter`. 
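Note that `forget_factor` is not applied per sample: `_check_params` further down rescales it per mini-batch as `rho = forget_factor ** (batch_size / n_samples)`, and the accumulators A and B in `_multiplicative_update_h` are multiplied by this rho before each batch update. A quick numeric check with made-up sizes:

forget_factor = 0.7
batch_size, n_samples = 1024, 10240
rho = forget_factor ** (batch_size / n_samples)  # 0.7 ** 0.1, roughly 0.965
# Past sufficient statistics are discounted by ~3.5% per mini-batch here,
# so older batches fade geometrically while recent ones dominate.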
+ + random_state : int, RandomState instance or None, default=None + Used for initialisation (when ``init`` == 'nndsvdar' or + 'random'), and in Coordinate Descent. Pass an int for reproducible + results across multiple function calls. + See :term:`Glossary `. + + verbose : bool, default=False + Whether to be verbose. + + Attributes + ---------- + components_ : ndarray of shape (n_components, n_features) + Factorization matrix, sometimes called 'dictionary'. + + n_components_ : int + The number of components. It is same as the `n_components` parameter + if it was given. Otherwise, it will be same as the number of + features. + + reconstruction_err_ : float + Frobenius norm of the matrix difference, or beta-divergence, between + the training data `X` and the reconstructed data `WH` from + the fitted model. + + n_iter_ : int + Actual number of started iterations over the whole dataset. + + n_steps_ : int + Number of mini-batches processed. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + See Also + -------- + NMF : Non-negative matrix factorization. + MiniBatchDictionaryLearning : Finds a dictionary that can best be used to represent + data using a sparse code. + + References + ---------- + .. [1] :doi:`"Fast local algorithms for large scale nonnegative matrix and tensor + factorizations" <10.1587/transfun.E92.A.708>` + Cichocki, Andrzej, and P. H. A. N. Anh-Huy. IEICE transactions on fundamentals + of electronics, communications and computer sciences 92.3: 708-721, 2009. + + .. [2] :doi:`"Algorithms for nonnegative matrix factorization with the + beta-divergence" <10.1162/NECO_a_00168>` + Fevotte, C., & Idier, J. (2011). Neural Computation, 23(9). + + .. [3] :doi:`"Online algorithms for nonnegative matrix factorization with the + Itakura-Saito divergence" <10.1109/ASPAA.2011.6082314>` + Lefevre, A., Bach, F., Fevotte, C. (2011). WASPA. 
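Because `partial_fit` (defined at the end of this class) updates the same accumulators one batch at a time, the estimator can also be trained out of core. A hedged sketch with invented shapes and chunking:

import numpy as np
from sklearn.decomposition import MiniBatchNMF

rng = np.random.default_rng(0)
X = np.abs(rng.standard_normal((2000, 30)))       # stand-in for a large dataset

mbnmf = MiniBatchNMF(n_components=5, batch_size=256, random_state=0)
for chunk in np.array_split(X, 8):                # pretend chunks arrive one by one
    mbnmf.partial_fit(chunk)

W = mbnmf.transform(X)                            # project using the learned components_
print(mbnmf.n_steps_, mbnmf.components_.shape)    # 8 steps, (5, 30) dictionary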
+ + Examples + -------- + >>> import numpy as np + >>> X = np.array([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]]) + >>> from sklearn.decomposition import MiniBatchNMF + >>> model = MiniBatchNMF(n_components=2, init='random', random_state=0) + >>> W = model.fit_transform(X) + >>> H = model.components_ + """ + + _parameter_constraints: dict = { + **_BaseNMF._parameter_constraints, + "max_no_improvement": [Interval(Integral, 1, None, closed="left"), None], + "batch_size": [Interval(Integral, 1, None, closed="left")], + "forget_factor": [Interval(Real, 0, 1, closed="both")], + "fresh_restarts": ["boolean"], + "fresh_restarts_max_iter": [Interval(Integral, 1, None, closed="left")], + "transform_max_iter": [Interval(Integral, 1, None, closed="left"), None], + } + + def __init__( + self, + n_components="auto", + *, + init=None, + batch_size=1024, + beta_loss="frobenius", + tol=1e-4, + max_no_improvement=10, + max_iter=200, + alpha_W=0.0, + alpha_H="same", + l1_ratio=0.0, + forget_factor=0.7, + fresh_restarts=False, + fresh_restarts_max_iter=30, + transform_max_iter=None, + random_state=None, + verbose=0, + ): + super().__init__( + n_components=n_components, + init=init, + beta_loss=beta_loss, + tol=tol, + max_iter=max_iter, + random_state=random_state, + alpha_W=alpha_W, + alpha_H=alpha_H, + l1_ratio=l1_ratio, + verbose=verbose, + ) + + self.max_no_improvement = max_no_improvement + self.batch_size = batch_size + self.forget_factor = forget_factor + self.fresh_restarts = fresh_restarts + self.fresh_restarts_max_iter = fresh_restarts_max_iter + self.transform_max_iter = transform_max_iter + + def _check_params(self, X): + super()._check_params(X) + + # batch_size + self._batch_size = min(self.batch_size, X.shape[0]) + + # forget_factor + self._rho = self.forget_factor ** (self._batch_size / X.shape[0]) + + # gamma for Maximization-Minimization (MM) algorithm [Fevotte 2011] + if self._beta_loss < 1: + self._gamma = 1.0 / (2.0 - self._beta_loss) + elif self._beta_loss > 2: + self._gamma = 1.0 / (self._beta_loss - 1.0) + else: + self._gamma = 1.0 + + # transform_max_iter + self._transform_max_iter = ( + self.max_iter + if self.transform_max_iter is None + else self.transform_max_iter + ) + + return self + + def _solve_W(self, X, H, max_iter): + """Minimize the objective function w.r.t W. + + Update W with H being fixed, until convergence. This is the heart + of `transform` but it's also used during `fit` when doing fresh restarts. + """ + avg = np.sqrt(X.mean() / self._n_components) + W = np.full((X.shape[0], self._n_components), avg, dtype=X.dtype) + W_buffer = W.copy() + + # Get scaled regularization terms. Done for each minibatch to take into account + # variable sizes of minibatches. + l1_reg_W, _, l2_reg_W, _ = self._compute_regularization(X) + + for _ in range(max_iter): + W, *_ = _multiplicative_update_w( + X, W, H, self._beta_loss, l1_reg_W, l2_reg_W, self._gamma + ) + + W_diff = linalg.norm(W - W_buffer) / linalg.norm(W) + if self.tol > 0 and W_diff <= self.tol: + break + + W_buffer[:] = W + + return W + + def _minibatch_step(self, X, W, H, update_H): + """Perform the update of W and H for one minibatch.""" + batch_size = X.shape[0] + + # get scaled regularization terms. Done for each minibatch to take into account + # variable sizes of minibatches. 
+ l1_reg_W, l1_reg_H, l2_reg_W, l2_reg_H = self._compute_regularization(X) + + # update W + if self.fresh_restarts or W is None: + W = self._solve_W(X, H, self.fresh_restarts_max_iter) + else: + W, *_ = _multiplicative_update_w( + X, W, H, self._beta_loss, l1_reg_W, l2_reg_W, self._gamma + ) + + # necessary for stability with beta_loss < 1 + if self._beta_loss < 1: + W[W < np.finfo(np.float64).eps] = 0.0 + + batch_cost = ( + _beta_divergence(X, W, H, self._beta_loss) + + l1_reg_W * W.sum() + + l1_reg_H * H.sum() + + l2_reg_W * (W**2).sum() + + l2_reg_H * (H**2).sum() + ) / batch_size + + # update H (only at fit or fit_transform) + if update_H: + H[:] = _multiplicative_update_h( + X, + W, + H, + beta_loss=self._beta_loss, + l1_reg_H=l1_reg_H, + l2_reg_H=l2_reg_H, + gamma=self._gamma, + A=self._components_numerator, + B=self._components_denominator, + rho=self._rho, + ) + + # necessary for stability with beta_loss < 1 + if self._beta_loss <= 1: + H[H < np.finfo(np.float64).eps] = 0.0 + + return batch_cost + + def _minibatch_convergence( + self, X, batch_cost, H, H_buffer, n_samples, step, n_steps + ): + """Helper function to encapsulate the early stopping logic""" + batch_size = X.shape[0] + + # counts steps starting from 1 for user friendly verbose mode. + step = step + 1 + + # Ignore first iteration because H is not updated yet. + if step == 1: + if self.verbose: + print(f"Minibatch step {step}/{n_steps}: mean batch cost: {batch_cost}") + return False + + # Compute an Exponentially Weighted Average of the cost function to + # monitor the convergence while discarding minibatch-local stochastic + # variability: https://en.wikipedia.org/wiki/Moving_average + if self._ewa_cost is None: + self._ewa_cost = batch_cost + else: + alpha = batch_size / (n_samples + 1) + alpha = min(alpha, 1) + self._ewa_cost = self._ewa_cost * (1 - alpha) + batch_cost * alpha + + # Log progress to be able to monitor convergence + if self.verbose: + print( + f"Minibatch step {step}/{n_steps}: mean batch cost: " + f"{batch_cost}, ewa cost: {self._ewa_cost}" + ) + + # Early stopping based on change of H + H_diff = linalg.norm(H - H_buffer) / linalg.norm(H) + if self.tol > 0 and H_diff <= self.tol: + if self.verbose: + print(f"Converged (small H change) at step {step}/{n_steps}") + return True + + # Early stopping heuristic due to lack of improvement on smoothed + # cost function + if self._ewa_cost_min is None or self._ewa_cost < self._ewa_cost_min: + self._no_improvement = 0 + self._ewa_cost_min = self._ewa_cost + else: + self._no_improvement += 1 + + if ( + self.max_no_improvement is not None + and self._no_improvement >= self.max_no_improvement + ): + if self.verbose: + print( + "Converged (lack of improvement in objective function) " + f"at step {step}/{n_steps}" + ) + return True + + return False + + @_fit_context(prefer_skip_nested_validation=True) + def fit_transform(self, X, y=None, W=None, H=None): + """Learn a NMF model for the data X and returns the transformed data. + + This is more efficient than calling fit followed by transform. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Data matrix to be decomposed. + + y : Ignored + Not used, present here for API consistency by convention. + + W : array-like of shape (n_samples, n_components), default=None + If `init='custom'`, it is used as initial guess for the solution. + If `None`, uses the initialisation method specified in `init`. 
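A standalone sketch of the exponentially weighted average that `_minibatch_convergence` above uses to smooth the per-batch cost before checking for lack of improvement; the cost values below are fabricated:

batch_size, n_samples = 256, 2000
alpha = min(batch_size / (n_samples + 1), 1)      # smoothing weight, ~0.128 here

ewa_cost = None
for batch_cost in [1.0, 0.80, 0.75, 0.74, 0.74]:  # made-up per-batch costs
    if ewa_cost is None:
        ewa_cost = batch_cost                     # first batch seeds the average
    else:
        ewa_cost = ewa_cost * (1 - alpha) + batch_cost * alpha
print(ewa_cost)                                   # smoothed cost, lags the raw values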
+ + H : array-like of shape (n_components, n_features), default=None + If `init='custom'`, it is used as initial guess for the solution. + If `None`, uses the initialisation method specified in `init`. + + Returns + ------- + W : ndarray of shape (n_samples, n_components) + Transformed data. + """ + X = validate_data( + self, X, accept_sparse=("csr", "csc"), dtype=[np.float64, np.float32] + ) + + with config_context(assume_finite=True): + W, H, n_iter, n_steps = self._fit_transform(X, W=W, H=H) + + self.reconstruction_err_ = _beta_divergence( + X, W, H, self._beta_loss, square_root=True + ) + + self.n_components_ = H.shape[0] + self.components_ = H + self.n_iter_ = n_iter + self.n_steps_ = n_steps + + return W + + def _fit_transform(self, X, W=None, H=None, update_H=True): + """Learn a NMF model for the data X and returns the transformed data. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + Data matrix to be decomposed. + + W : array-like of shape (n_samples, n_components), default=None + If `init='custom'`, it is used as initial guess for the solution. + If `update_H=False`, it is initialised as an array of zeros, unless + `solver='mu'`, then it is filled with values calculated by + `np.sqrt(X.mean() / self._n_components)`. + If `None`, uses the initialisation method specified in `init`. + + H : array-like of shape (n_components, n_features), default=None + If `init='custom'`, it is used as initial guess for the solution. + If `update_H=False`, it is used as a constant, to solve for W only. + If `None`, uses the initialisation method specified in `init`. + + update_H : bool, default=True + If True, both W and H will be estimated from initial guesses, + this corresponds to a call to the `fit_transform` method. + If False, only W will be estimated, this corresponds to a call + to the `transform` method. + + Returns + ------- + W : ndarray of shape (n_samples, n_components) + Transformed data. + + H : ndarray of shape (n_components, n_features) + Factorization matrix, sometimes called 'dictionary'. + + n_iter : int + Actual number of started iterations over the whole dataset. + + n_steps : int + Number of mini-batches processed. + """ + check_non_negative(X, "MiniBatchNMF (input X)") + self._check_params(X) + + if X.min() == 0 and self._beta_loss <= 0: + raise ValueError( + "When beta_loss <= 0 and X contains zeros, " + "the solver may diverge. Please add small values " + "to X, or use a positive beta_loss." 
+ ) + + n_samples = X.shape[0] + + # initialize or check W and H + W, H = self._check_w_h(X, W, H, update_H) + H_buffer = H.copy() + + # Initialize auxiliary matrices + self._components_numerator = H.copy() + self._components_denominator = np.ones(H.shape, dtype=H.dtype) + + # Attributes to monitor the convergence + self._ewa_cost = None + self._ewa_cost_min = None + self._no_improvement = 0 + + batches = gen_batches(n_samples, self._batch_size) + batches = itertools.cycle(batches) + n_steps_per_iter = int(np.ceil(n_samples / self._batch_size)) + n_steps = self.max_iter * n_steps_per_iter + + for i, batch in zip(range(n_steps), batches): + batch_cost = self._minibatch_step(X[batch], W[batch], H, update_H) + + if update_H and self._minibatch_convergence( + X[batch], batch_cost, H, H_buffer, n_samples, i, n_steps + ): + break + + H_buffer[:] = H + + if self.fresh_restarts: + W = self._solve_W(X, H, self._transform_max_iter) + + n_steps = i + 1 + n_iter = int(np.ceil(n_steps / n_steps_per_iter)) + + if n_iter == self.max_iter and self.tol > 0: + warnings.warn( + ( + f"Maximum number of iterations {self.max_iter} reached. " + "Increase it to improve convergence." + ), + ConvergenceWarning, + ) + + return W, H, n_iter, n_steps + + def transform(self, X): + """Transform the data X according to the fitted MiniBatchNMF model. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Data matrix to be transformed by the model. + + Returns + ------- + W : ndarray of shape (n_samples, n_components) + Transformed data. + """ + check_is_fitted(self) + X = validate_data( + self, + X, + accept_sparse=("csr", "csc"), + dtype=[np.float64, np.float32], + reset=False, + ) + + W = self._solve_W(X, self.components_, self._transform_max_iter) + + return W + + @_fit_context(prefer_skip_nested_validation=True) + def partial_fit(self, X, y=None, W=None, H=None): + """Update the model using the data in `X` as a mini-batch. + + This method is expected to be called several times consecutively + on different chunks of a dataset so as to implement out-of-core + or online learning. + + This is especially useful when the whole dataset is too big to fit in + memory at once (see :ref:`scaling_strategies`). + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Data matrix to be decomposed. + + y : Ignored + Not used, present here for API consistency by convention. + + W : array-like of shape (n_samples, n_components), default=None + If `init='custom'`, it is used as initial guess for the solution. + Only used for the first call to `partial_fit`. + + H : array-like of shape (n_components, n_features), default=None + If `init='custom'`, it is used as initial guess for the solution. + Only used for the first call to `partial_fit`. + + Returns + ------- + self + Returns the instance itself. 
+ """ + has_components = hasattr(self, "components_") + + X = validate_data( + self, + X, + accept_sparse=("csr", "csc"), + dtype=[np.float64, np.float32], + reset=not has_components, + ) + + if not has_components: + # This instance has not been fitted yet (fit or partial_fit) + self._check_params(X) + _, H = self._check_w_h(X, W=W, H=H, update_H=True) + + self._components_numerator = H.copy() + self._components_denominator = np.ones(H.shape, dtype=H.dtype) + self.n_steps_ = 0 + else: + H = self.components_ + + self._minibatch_step(X, None, H, update_H=True) + + self.n_components_ = H.shape[0] + self.components_ = H + self.n_steps_ += 1 + + return self diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/_online_lda_fast.pyx b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_online_lda_fast.pyx new file mode 100644 index 0000000000000000000000000000000000000000..14f45ba9675f5e6b188d3dfdd1c06bdf4136f0ca --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_online_lda_fast.pyx @@ -0,0 +1,110 @@ +import numpy as np + + +from cython cimport floating +from libc.math cimport exp, fabs, log + +from ..utils._typedefs cimport float64_t, intp_t + + +def mean_change(const floating[:] arr_1, const floating[:] arr_2): + """Calculate the mean difference between two arrays. + + Equivalent to np.abs(arr_1 - arr2).mean(). + """ + + cdef float64_t total, diff + cdef intp_t i, size + + size = arr_1.shape[0] + total = 0.0 + for i in range(size): + diff = fabs(arr_1[i] - arr_2[i]) + total += diff + + return total / size + + +def _dirichlet_expectation_1d( + floating[:] doc_topic, + floating doc_topic_prior, + floating[:] out +): + """Dirichlet expectation for a single sample: + exp(E[log(theta)]) for theta ~ Dir(doc_topic) + after adding doc_topic_prior to doc_topic, in-place. + + Equivalent to + doc_topic += doc_topic_prior + out[:] = np.exp(psi(doc_topic) - psi(np.sum(doc_topic))) + """ + + cdef floating dt, psi_total, total + cdef intp_t i, size + + size = doc_topic.shape[0] + + total = 0.0 + for i in range(size): + dt = doc_topic[i] + doc_topic_prior + doc_topic[i] = dt + total += dt + psi_total = psi(total) + + for i in range(size): + out[i] = exp(psi(doc_topic[i]) - psi_total) + + +def _dirichlet_expectation_2d(const floating[:, :] arr): + """Dirichlet expectation for multiple samples: + E[log(theta)] for theta ~ Dir(arr). + + Equivalent to psi(arr) - psi(np.sum(arr, axis=1))[:, np.newaxis]. + + Note that unlike _dirichlet_expectation_1d, this function doesn't compute + the exp and doesn't add in the prior. + """ + cdef floating row_total, psi_row_total + cdef floating[:, :] d_exp + cdef intp_t i, j, n_rows, n_cols + + n_rows = arr.shape[0] + n_cols = arr.shape[1] + + d_exp = np.empty_like(arr) + for i in range(n_rows): + row_total = 0 + for j in range(n_cols): + row_total += arr[i, j] + psi_row_total = psi(row_total) + + for j in range(n_cols): + d_exp[i, j] = psi(arr[i, j]) - psi_row_total + + return d_exp.base + + +# Psi function for positive arguments. Optimized for speed, not accuracy. +# +# After: J. Bernardo (1976). Algorithm AS 103: Psi (Digamma) Function. +# https://www.uv.es/~bernardo/1976AppStatist.pdf +cdef floating psi(floating x) noexcept nogil: + cdef double EULER = 0.577215664901532860606512090082402431 + if x <= 1e-6: + # psi(x) = -EULER - 1/x + O(x) + return -EULER - 1. / x + + cdef floating r, result = 0 + + # psi(x + 1) = psi(x) + 1/x + while x < 6: + result -= 1. 
/ x + x += 1 + + # psi(x) = log(x) - 1/(2x) - 1/(12x**2) + 1/(120x**4) - 1/(252x**6) + # + O(1/x**8) + r = 1. / x + result += log(x) - .5 * r + r = r * r + result -= r * ((1./12.) - r * ((1./120.) - r * (1./252.))) + return result diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/_pca.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_pca.py new file mode 100644 index 0000000000000000000000000000000000000000..1b0d21d5d38be9288a7e3c3405bd4ca449cfd808 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_pca.py @@ -0,0 +1,857 @@ +"""Principal Component Analysis.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from math import lgamma, log, sqrt +from numbers import Integral, Real + +import numpy as np +from scipy import linalg +from scipy.sparse import issparse +from scipy.sparse.linalg import svds + +from ..base import _fit_context +from ..utils import check_random_state +from ..utils._arpack import _init_arpack_v0 +from ..utils._array_api import _convert_to_numpy, get_namespace +from ..utils._param_validation import Interval, RealNotInt, StrOptions +from ..utils.extmath import _randomized_svd, fast_logdet, stable_cumsum, svd_flip +from ..utils.sparsefuncs import _implicit_column_offset, mean_variance_axis +from ..utils.validation import check_is_fitted, validate_data +from ._base import _BasePCA + + +def _assess_dimension(spectrum, rank, n_samples): + """Compute the log-likelihood of a rank ``rank`` dataset. + + The dataset is assumed to be embedded in gaussian noise of shape(n, + dimf) having spectrum ``spectrum``. This implements the method of + T. P. Minka. + + Parameters + ---------- + spectrum : ndarray of shape (n_features,) + Data spectrum. + rank : int + Tested rank value. It should be strictly lower than n_features, + otherwise the method isn't specified (division by zero in equation + (31) from the paper). + n_samples : int + Number of samples. + + Returns + ------- + ll : float + The log-likelihood. + + References + ---------- + This implements the method of `Thomas P. Minka: + Automatic Choice of Dimensionality for PCA. NIPS 2000: 598-604 + `_ + """ + xp, _ = get_namespace(spectrum) + + n_features = spectrum.shape[0] + if not 1 <= rank < n_features: + raise ValueError("the tested rank should be in [1, n_features - 1]") + + eps = 1e-15 + + if spectrum[rank - 1] < eps: + # When the tested rank is associated with a small eigenvalue, there's + # no point in computing the log-likelihood: it's going to be very + # small and won't be the max anyway. Also, it can lead to numerical + # issues below when computing pa, in particular in log((spectrum[i] - + # spectrum[j]) because this will take the log of something very small. 
+ return -xp.inf + + pu = -rank * log(2.0) + for i in range(1, rank + 1): + pu += ( + lgamma((n_features - i + 1) / 2.0) - log(xp.pi) * (n_features - i + 1) / 2.0 + ) + + pl = xp.sum(xp.log(spectrum[:rank])) + pl = -pl * n_samples / 2.0 + + v = max(eps, xp.sum(spectrum[rank:]) / (n_features - rank)) + pv = -log(v) * n_samples * (n_features - rank) / 2.0 + + m = n_features * rank - rank * (rank + 1.0) / 2.0 + pp = log(2.0 * xp.pi) * (m + rank) / 2.0 + + pa = 0.0 + spectrum_ = xp.asarray(spectrum, copy=True) + spectrum_[rank:n_features] = v + for i in range(rank): + for j in range(i + 1, spectrum.shape[0]): + pa += log( + (spectrum[i] - spectrum[j]) * (1.0 / spectrum_[j] - 1.0 / spectrum_[i]) + ) + log(n_samples) + + ll = pu + pl + pv + pp - pa / 2.0 - rank * log(n_samples) / 2.0 + + return ll + + +def _infer_dimension(spectrum, n_samples): + """Infers the dimension of a dataset with a given spectrum. + + The returned value will be in [1, n_features - 1]. + """ + xp, _ = get_namespace(spectrum) + + ll = xp.empty_like(spectrum) + ll[0] = -xp.inf # we don't want to return n_components = 0 + for rank in range(1, spectrum.shape[0]): + ll[rank] = _assess_dimension(spectrum, rank, n_samples) + return xp.argmax(ll) + + +class PCA(_BasePCA): + """Principal component analysis (PCA). + + Linear dimensionality reduction using Singular Value Decomposition of the + data to project it to a lower dimensional space. The input data is centered + but not scaled for each feature before applying the SVD. + + It uses the LAPACK implementation of the full SVD or a randomized truncated + SVD by the method of Halko et al. 2009, depending on the shape of the input + data and the number of components to extract. + + With sparse inputs, the ARPACK implementation of the truncated SVD can be + used (i.e. through :func:`scipy.sparse.linalg.svds`). Alternatively, one + may consider :class:`TruncatedSVD` where the data are not centered. + + Notice that this class only supports sparse inputs for some solvers such as + "arpack" and "covariance_eigh". See :class:`TruncatedSVD` for an + alternative with sparse data. + + For a usage example, see + :ref:`sphx_glr_auto_examples_decomposition_plot_pca_iris.py` + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int, float or 'mle', default=None + Number of components to keep. + if n_components is not set all components are kept:: + + n_components == min(n_samples, n_features) + + If ``n_components == 'mle'`` and ``svd_solver == 'full'``, Minka's + MLE is used to guess the dimension. Use of ``n_components == 'mle'`` + will interpret ``svd_solver == 'auto'`` as ``svd_solver == 'full'``. + + If ``0 < n_components < 1`` and ``svd_solver == 'full'``, select the + number of components such that the amount of variance that needs to be + explained is greater than the percentage specified by n_components. + + If ``svd_solver == 'arpack'``, the number of components must be + strictly less than the minimum of n_features and n_samples. + + Hence, the None case results in:: + + n_components == min(n_samples, n_features) - 1 + + copy : bool, default=True + If False, data passed to fit are overwritten and running + fit(X).transform(X) will not yield the expected results, + use fit_transform(X) instead. + + whiten : bool, default=False + When True (False by default) the `components_` vectors are multiplied + by the square root of n_samples and then divided by the singular values + to ensure uncorrelated outputs with unit component-wise variances. 
+ + Whitening will remove some information from the transformed signal + (the relative variance scales of the components) but can sometime + improve the predictive accuracy of the downstream estimators by + making their data respect some hard-wired assumptions. + + svd_solver : {'auto', 'full', 'covariance_eigh', 'arpack', 'randomized'},\ + default='auto' + "auto" : + The solver is selected by a default 'auto' policy is based on `X.shape` and + `n_components`: if the input data has fewer than 1000 features and + more than 10 times as many samples, then the "covariance_eigh" + solver is used. Otherwise, if the input data is larger than 500x500 + and the number of components to extract is lower than 80% of the + smallest dimension of the data, then the more efficient + "randomized" method is selected. Otherwise the exact "full" SVD is + computed and optionally truncated afterwards. + "full" : + Run exact full SVD calling the standard LAPACK solver via + `scipy.linalg.svd` and select the components by postprocessing + "covariance_eigh" : + Precompute the covariance matrix (on centered data), run a + classical eigenvalue decomposition on the covariance matrix + typically using LAPACK and select the components by postprocessing. + This solver is very efficient for n_samples >> n_features and small + n_features. It is, however, not tractable otherwise for large + n_features (large memory footprint required to materialize the + covariance matrix). Also note that compared to the "full" solver, + this solver effectively doubles the condition number and is + therefore less numerical stable (e.g. on input data with a large + range of singular values). + "arpack" : + Run SVD truncated to `n_components` calling ARPACK solver via + `scipy.sparse.linalg.svds`. It requires strictly + `0 < n_components < min(X.shape)` + "randomized" : + Run randomized SVD by the method of Halko et al. + + .. versionadded:: 0.18.0 + + .. versionchanged:: 1.5 + Added the 'covariance_eigh' solver. + + tol : float, default=0.0 + Tolerance for singular values computed by svd_solver == 'arpack'. + Must be of range [0.0, infinity). + + .. versionadded:: 0.18.0 + + iterated_power : int or 'auto', default='auto' + Number of iterations for the power method computed by + svd_solver == 'randomized'. + Must be of range [0, infinity). + + .. versionadded:: 0.18.0 + + n_oversamples : int, default=10 + This parameter is only relevant when `svd_solver="randomized"`. + It corresponds to the additional number of random vectors to sample the + range of `X` so as to ensure proper conditioning. See + :func:`~sklearn.utils.extmath.randomized_svd` for more details. + + .. versionadded:: 1.1 + + power_iteration_normalizer : {'auto', 'QR', 'LU', 'none'}, default='auto' + Power iteration normalizer for randomized SVD solver. + Not used by ARPACK. See :func:`~sklearn.utils.extmath.randomized_svd` + for more details. + + .. versionadded:: 1.1 + + random_state : int, RandomState instance or None, default=None + Used when the 'arpack' or 'randomized' solvers are used. Pass an int + for reproducible results across multiple function calls. + See :term:`Glossary `. + + .. versionadded:: 0.18.0 + + Attributes + ---------- + components_ : ndarray of shape (n_components, n_features) + Principal axes in feature space, representing the directions of + maximum variance in the data. Equivalently, the right singular + vectors of the centered input data, parallel to its eigenvectors. + The components are sorted by decreasing ``explained_variance_``. 
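The fractional form of `n_components` described above (keep as many components as needed to reach a target explained-variance ratio, used with the 'full' solver) in a short sketch; the data is random and purely illustrative:

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
X = rng.standard_normal((200, 10)) @ rng.standard_normal((10, 10))

pca = PCA(n_components=0.95, svd_solver="full").fit(X)
print(pca.n_components_)                       # however many components reach 95%
print(pca.explained_variance_ratio_.sum())     # at least 0.95 by construction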
+ + explained_variance_ : ndarray of shape (n_components,) + The amount of variance explained by each of the selected components. + The variance estimation uses `n_samples - 1` degrees of freedom. + + Equal to n_components largest eigenvalues + of the covariance matrix of X. + + .. versionadded:: 0.18 + + explained_variance_ratio_ : ndarray of shape (n_components,) + Percentage of variance explained by each of the selected components. + + If ``n_components`` is not set then all components are stored and the + sum of the ratios is equal to 1.0. + + singular_values_ : ndarray of shape (n_components,) + The singular values corresponding to each of the selected components. + The singular values are equal to the 2-norms of the ``n_components`` + variables in the lower-dimensional space. + + .. versionadded:: 0.19 + + mean_ : ndarray of shape (n_features,) + Per-feature empirical mean, estimated from the training set. + + Equal to `X.mean(axis=0)`. + + n_components_ : int + The estimated number of components. When n_components is set + to 'mle' or a number between 0 and 1 (with svd_solver == 'full') this + number is estimated from input data. Otherwise it equals the parameter + n_components, or the lesser value of n_features and n_samples + if n_components is None. + + n_samples_ : int + Number of samples in the training data. + + noise_variance_ : float + The estimated noise covariance following the Probabilistic PCA model + from Tipping and Bishop 1999. See "Pattern Recognition and + Machine Learning" by C. Bishop, 12.2.1 p. 574 or + http://www.miketipping.com/papers/met-mppca.pdf. It is required to + compute the estimated data covariance and score samples. + + Equal to the average of (min(n_features, n_samples) - n_components) + smallest eigenvalues of the covariance matrix of X. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + KernelPCA : Kernel Principal Component Analysis. + SparsePCA : Sparse Principal Component Analysis. + TruncatedSVD : Dimensionality reduction using truncated SVD. + IncrementalPCA : Incremental Principal Component Analysis. + + References + ---------- + For n_components == 'mle', this class uses the method from: + `Minka, T. P.. "Automatic choice of dimensionality for PCA". + In NIPS, pp. 598-604 `_ + + Implements the probabilistic PCA model from: + `Tipping, M. E., and Bishop, C. M. (1999). "Probabilistic principal + component analysis". Journal of the Royal Statistical Society: + Series B (Statistical Methodology), 61(3), 611-622. + `_ + via the score and score_samples methods. + + For svd_solver == 'arpack', refer to `scipy.sparse.linalg.svds`. + + For svd_solver == 'randomized', see: + :doi:`Halko, N., Martinsson, P. G., and Tropp, J. A. (2011). + "Finding structure with randomness: Probabilistic algorithms for + constructing approximate matrix decompositions". + SIAM review, 53(2), 217-288. + <10.1137/090771806>` + and also + :doi:`Martinsson, P. G., Rokhlin, V., and Tygert, M. (2011). + "A randomized algorithm for the decomposition of matrices". + Applied and Computational Harmonic Analysis, 30(1), 47-68. 
+ <10.1016/j.acha.2010.02.003>` + + Examples + -------- + >>> import numpy as np + >>> from sklearn.decomposition import PCA + >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]) + >>> pca = PCA(n_components=2) + >>> pca.fit(X) + PCA(n_components=2) + >>> print(pca.explained_variance_ratio_) + [0.9924 0.0075] + >>> print(pca.singular_values_) + [6.30061 0.54980] + + >>> pca = PCA(n_components=2, svd_solver='full') + >>> pca.fit(X) + PCA(n_components=2, svd_solver='full') + >>> print(pca.explained_variance_ratio_) + [0.9924 0.00755] + >>> print(pca.singular_values_) + [6.30061 0.54980] + + >>> pca = PCA(n_components=1, svd_solver='arpack') + >>> pca.fit(X) + PCA(n_components=1, svd_solver='arpack') + >>> print(pca.explained_variance_ratio_) + [0.99244] + >>> print(pca.singular_values_) + [6.30061] + """ + + _parameter_constraints: dict = { + "n_components": [ + Interval(Integral, 0, None, closed="left"), + Interval(RealNotInt, 0, 1, closed="neither"), + StrOptions({"mle"}), + None, + ], + "copy": ["boolean"], + "whiten": ["boolean"], + "svd_solver": [ + StrOptions({"auto", "full", "covariance_eigh", "arpack", "randomized"}) + ], + "tol": [Interval(Real, 0, None, closed="left")], + "iterated_power": [ + StrOptions({"auto"}), + Interval(Integral, 0, None, closed="left"), + ], + "n_oversamples": [Interval(Integral, 1, None, closed="left")], + "power_iteration_normalizer": [StrOptions({"auto", "QR", "LU", "none"})], + "random_state": ["random_state"], + } + + def __init__( + self, + n_components=None, + *, + copy=True, + whiten=False, + svd_solver="auto", + tol=0.0, + iterated_power="auto", + n_oversamples=10, + power_iteration_normalizer="auto", + random_state=None, + ): + self.n_components = n_components + self.copy = copy + self.whiten = whiten + self.svd_solver = svd_solver + self.tol = tol + self.iterated_power = iterated_power + self.n_oversamples = n_oversamples + self.power_iteration_normalizer = power_iteration_normalizer + self.random_state = random_state + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit the model with X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : Ignored + Ignored. + + Returns + ------- + self : object + Returns the instance itself. + """ + self._fit(X) + return self + + @_fit_context(prefer_skip_nested_validation=True) + def fit_transform(self, X, y=None): + """Fit the model with X and apply the dimensionality reduction on X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : Ignored + Ignored. + + Returns + ------- + X_new : ndarray of shape (n_samples, n_components) + Transformed values. + + Notes + ----- + This method returns a Fortran-ordered array. To convert it to a + C-ordered array, use 'np.ascontiguousarray'. + """ + U, S, _, X, x_is_centered, xp = self._fit(X) + if U is not None: + U = U[:, : self.n_components_] + + if self.whiten: + # X_new = X * V / S * sqrt(n_samples) = U * sqrt(n_samples) + U *= sqrt(X.shape[0] - 1) + else: + # X_new = X * V = U * S * Vt * V = U * S + U *= S[: self.n_components_] + + return U + else: # solver="covariance_eigh" does not compute U at fit time. 
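+            # (Editorial note) This branch is only reached for the
+            # "covariance_eigh" solver, which never materializes U during
+            # fit; the transformed output is therefore obtained by
+            # projecting X onto the fitted components via _transform.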
+ return self._transform(X, xp, x_is_centered=x_is_centered) + + def _fit(self, X): + """Dispatch to the right submethod depending on the chosen solver.""" + xp, is_array_api_compliant = get_namespace(X) + + # Raise an error for sparse input and unsupported svd_solver + if issparse(X) and self.svd_solver not in ["auto", "arpack", "covariance_eigh"]: + raise TypeError( + 'PCA only support sparse inputs with the "arpack" and' + f' "covariance_eigh" solvers, while "{self.svd_solver}" was passed. See' + " TruncatedSVD for a possible alternative." + ) + if self.svd_solver == "arpack" and is_array_api_compliant: + raise ValueError( + "PCA with svd_solver='arpack' is not supported for Array API inputs." + ) + + # Validate the data, without ever forcing a copy as any solver that + # supports sparse input data and the `covariance_eigh` solver are + # written in a way to avoid the need for any inplace modification of + # the input data contrary to the other solvers. + # The copy will happen + # later, only if needed, once the solver negotiation below is done. + X = validate_data( + self, + X, + dtype=[xp.float64, xp.float32], + force_writeable=True, + accept_sparse=("csr", "csc"), + ensure_2d=True, + copy=False, + ) + self._fit_svd_solver = self.svd_solver + if self._fit_svd_solver == "auto" and issparse(X): + self._fit_svd_solver = "arpack" + + if self.n_components is None: + if self._fit_svd_solver != "arpack": + n_components = min(X.shape) + else: + n_components = min(X.shape) - 1 + else: + n_components = self.n_components + + if self._fit_svd_solver == "auto": + # Tall and skinny problems are best handled by precomputing the + # covariance matrix. + if X.shape[1] <= 1_000 and X.shape[0] >= 10 * X.shape[1]: + self._fit_svd_solver = "covariance_eigh" + # Small problem or n_components == 'mle', just call full PCA + elif max(X.shape) <= 500 or n_components == "mle": + self._fit_svd_solver = "full" + elif 1 <= n_components < 0.8 * min(X.shape): + self._fit_svd_solver = "randomized" + # This is also the case of n_components in (0, 1) + else: + self._fit_svd_solver = "full" + + # Call different fits for either full or truncated SVD + if self._fit_svd_solver in ("full", "covariance_eigh"): + return self._fit_full(X, n_components, xp, is_array_api_compliant) + elif self._fit_svd_solver in ["arpack", "randomized"]: + return self._fit_truncated(X, n_components, xp) + + def _fit_full(self, X, n_components, xp, is_array_api_compliant): + """Fit the model by computing full SVD on X.""" + n_samples, n_features = X.shape + + if n_components == "mle": + if n_samples < n_features: + raise ValueError( + "n_components='mle' is only supported if n_samples >= n_features" + ) + elif not 0 <= n_components <= min(n_samples, n_features): + raise ValueError( + f"n_components={n_components} must be between 0 and " + f"min(n_samples, n_features)={min(n_samples, n_features)} with " + f"svd_solver={self._fit_svd_solver!r}" + ) + + self.mean_ = xp.mean(X, axis=0) + # When X is a scipy sparse matrix, self.mean_ is a numpy matrix, so we need + # to transform it to a 1D array. Note that this is not the case when X + # is a scipy sparse array. + # TODO: remove the following two lines when scikit-learn only depends + # on scipy versions that no longer support scipy.sparse matrices. 
+ self.mean_ = xp.reshape(xp.asarray(self.mean_), (-1,)) + + if self._fit_svd_solver == "full": + X_centered = xp.asarray(X, copy=True) if self.copy else X + X_centered -= self.mean_ + x_is_centered = not self.copy + + if not is_array_api_compliant: + # Use scipy.linalg with NumPy/SciPy inputs for the sake of not + # introducing unanticipated behavior changes. In the long run we + # could instead decide to always use xp.linalg.svd for all inputs, + # but that would make this code rely on numpy's SVD instead of + # scipy's. It's not 100% clear whether they use the same LAPACK + # solver by default though (assuming both are built against the + # same BLAS). + U, S, Vt = linalg.svd(X_centered, full_matrices=False) + else: + U, S, Vt = xp.linalg.svd(X_centered, full_matrices=False) + explained_variance_ = (S**2) / (n_samples - 1) + + else: + assert self._fit_svd_solver == "covariance_eigh" + # In the following, we center the covariance matrix C afterwards + # (without centering the data X first) to avoid an unnecessary copy + # of X. Note that the mean_ attribute is still needed to center + # test data in the transform method. + # + # Note: at the time of writing, `xp.cov` does not exist in the + # Array API standard: + # https://github.com/data-apis/array-api/issues/43 + # + # Besides, using `numpy.cov`, as of numpy 1.26.0, would not be + # memory efficient for our use case when `n_samples >> n_features`: + # `numpy.cov` centers a copy of the data before computing the + # matrix product instead of subtracting a small `(n_features, + # n_features)` square matrix from the gram matrix X.T @ X, as we do + # below. + x_is_centered = False + C = X.T @ X + C -= ( + n_samples + * xp.reshape(self.mean_, (-1, 1)) + * xp.reshape(self.mean_, (1, -1)) + ) + C /= n_samples - 1 + eigenvals, eigenvecs = xp.linalg.eigh(C) + + # When X is a scipy sparse matrix, the following two datastructures + # are returned as instances of the soft-deprecated numpy.matrix + # class. Note that this problem does not occur when X is a scipy + # sparse array (or another other kind of supported array). + # TODO: remove the following two lines when scikit-learn only + # depends on scipy versions that no longer support scipy.sparse + # matrices. + eigenvals = xp.reshape(xp.asarray(eigenvals), (-1,)) + eigenvecs = xp.asarray(eigenvecs) + + eigenvals = xp.flip(eigenvals, axis=0) + eigenvecs = xp.flip(eigenvecs, axis=1) + + # The covariance matrix C is positive semi-definite by + # construction. However, the eigenvalues returned by xp.linalg.eigh + # can be slightly negative due to numerical errors. This would be + # an issue for the subsequent sqrt, hence the manual clipping. + eigenvals[eigenvals < 0.0] = 0.0 + explained_variance_ = eigenvals + + # Re-construct SVD of centered X indirectly and make it consistent + # with the other solvers. + S = xp.sqrt(eigenvals * (n_samples - 1)) + Vt = eigenvecs.T + U = None + + # flip eigenvectors' sign to enforce deterministic output + U, Vt = svd_flip(U, Vt, u_based_decision=False) + + components_ = Vt + + # Get variance explained by singular values + total_var = xp.sum(explained_variance_) + explained_variance_ratio_ = explained_variance_ / total_var + singular_values_ = xp.asarray(S, copy=True) # Store the singular values. 
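+
+        # (Editorial note) The block below resolves the user-facing
+        # ``n_components`` into an integer: 'mle' applies Minka's estimator
+        # (_infer_dimension) to the eigenvalue spectrum, while a float in
+        # (0, 1) keeps the smallest number of components whose cumulative
+        # explained-variance ratio exceeds that threshold.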
+ + # Postprocess the number of components required + if n_components == "mle": + n_components = _infer_dimension(explained_variance_, n_samples) + elif 0 < n_components < 1.0: + # number of components for which the cumulated explained + # variance percentage is superior to the desired threshold + # side='right' ensures that number of features selected + # their variance is always greater than n_components float + # passed. More discussion in issue: #15669 + if is_array_api_compliant: + # Convert to numpy as xp.cumsum and xp.searchsorted are not + # part of the Array API standard yet: + # + # https://github.com/data-apis/array-api/issues/597 + # https://github.com/data-apis/array-api/issues/688 + # + # Furthermore, it's not always safe to call them for namespaces + # that already implement them: for instance as + # cupy.searchsorted does not accept a float as second argument. + explained_variance_ratio_np = _convert_to_numpy( + explained_variance_ratio_, xp=xp + ) + else: + explained_variance_ratio_np = explained_variance_ratio_ + ratio_cumsum = stable_cumsum(explained_variance_ratio_np) + n_components = np.searchsorted(ratio_cumsum, n_components, side="right") + 1 + + # Compute noise covariance using Probabilistic PCA model + # The sigma2 maximum likelihood (cf. eq. 12.46) + if n_components < min(n_features, n_samples): + self.noise_variance_ = xp.mean(explained_variance_[n_components:]) + else: + self.noise_variance_ = 0.0 + + self.n_samples_ = n_samples + self.n_components_ = n_components + # Assign a copy of the result of the truncation of the components in + # order to: + # - release the memory used by the discarded components, + # - ensure that the kept components are allocated contiguously in + # memory to make the transform method faster by leveraging cache + # locality. + self.components_ = xp.asarray(components_[:n_components, :], copy=True) + + # We do the same for the other arrays for the sake of consistency. + self.explained_variance_ = xp.asarray( + explained_variance_[:n_components], copy=True + ) + self.explained_variance_ratio_ = xp.asarray( + explained_variance_ratio_[:n_components], copy=True + ) + self.singular_values_ = xp.asarray(singular_values_[:n_components], copy=True) + + return U, S, Vt, X, x_is_centered, xp + + def _fit_truncated(self, X, n_components, xp): + """Fit the model by computing truncated SVD (by ARPACK or randomized) + on X. 
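+
+        Sparse input is centered implicitly (via an implicit column offset)
+        so that it is never densified.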
+ """ + n_samples, n_features = X.shape + + svd_solver = self._fit_svd_solver + if isinstance(n_components, str): + raise ValueError( + "n_components=%r cannot be a string with svd_solver='%s'" + % (n_components, svd_solver) + ) + elif not 1 <= n_components <= min(n_samples, n_features): + raise ValueError( + "n_components=%r must be between 1 and " + "min(n_samples, n_features)=%r with " + "svd_solver='%s'" + % (n_components, min(n_samples, n_features), svd_solver) + ) + elif svd_solver == "arpack" and n_components == min(n_samples, n_features): + raise ValueError( + "n_components=%r must be strictly less than " + "min(n_samples, n_features)=%r with " + "svd_solver='%s'" + % (n_components, min(n_samples, n_features), svd_solver) + ) + + random_state = check_random_state(self.random_state) + + # Center data + total_var = None + if issparse(X): + self.mean_, var = mean_variance_axis(X, axis=0) + total_var = var.sum() * n_samples / (n_samples - 1) # ddof=1 + X_centered = _implicit_column_offset(X, self.mean_) + x_is_centered = False + else: + self.mean_ = xp.mean(X, axis=0) + X_centered = xp.asarray(X, copy=True) if self.copy else X + X_centered -= self.mean_ + x_is_centered = not self.copy + + if svd_solver == "arpack": + v0 = _init_arpack_v0(min(X.shape), random_state) + U, S, Vt = svds(X_centered, k=n_components, tol=self.tol, v0=v0) + # svds doesn't abide by scipy.linalg.svd/randomized_svd + # conventions, so reverse its outputs. + S = S[::-1] + # flip eigenvectors' sign to enforce deterministic output + U, Vt = svd_flip(U[:, ::-1], Vt[::-1], u_based_decision=False) + + elif svd_solver == "randomized": + # sign flipping is done inside + U, S, Vt = _randomized_svd( + X_centered, + n_components=n_components, + n_oversamples=self.n_oversamples, + n_iter=self.iterated_power, + power_iteration_normalizer=self.power_iteration_normalizer, + flip_sign=False, + random_state=random_state, + ) + U, Vt = svd_flip(U, Vt, u_based_decision=False) + + self.n_samples_ = n_samples + self.components_ = Vt + self.n_components_ = n_components + + # Get variance explained by singular values + self.explained_variance_ = (S**2) / (n_samples - 1) + + # Workaround in-place variance calculation since at the time numpy + # did not have a way to calculate variance in-place. + # + # TODO: update this code to either: + # * Use the array-api variance calculation, unless memory usage suffers + # * Update sklearn.utils.extmath._incremental_mean_and_var to support array-api + # See: https://github.com/scikit-learn/scikit-learn/pull/18689#discussion_r1335540991 + if total_var is None: + N = X.shape[0] - 1 + X_centered **= 2 + total_var = xp.sum(X_centered) / N + + self.explained_variance_ratio_ = self.explained_variance_ / total_var + self.singular_values_ = xp.asarray(S, copy=True) # Store the singular values. + + if self.n_components_ < min(n_features, n_samples): + self.noise_variance_ = total_var - xp.sum(self.explained_variance_) + self.noise_variance_ /= min(n_features, n_samples) - n_components + else: + self.noise_variance_ = 0.0 + + return U, S, Vt, X, x_is_centered, xp + + def score_samples(self, X): + """Return the log-likelihood of each sample. + + See. "Pattern Recognition and Machine Learning" + by C. Bishop, 12.2.1 p. 574 + or http://www.miketipping.com/papers/met-mppca.pdf + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data. + + Returns + ------- + ll : ndarray of shape (n_samples,) + Log-likelihood of each sample under the current model. 
+ """ + check_is_fitted(self) + xp, _ = get_namespace(X) + X = validate_data(self, X, dtype=[xp.float64, xp.float32], reset=False) + Xr = X - self.mean_ + n_features = X.shape[1] + precision = self.get_precision() + log_like = -0.5 * xp.sum(Xr * (Xr @ precision), axis=1) + log_like -= 0.5 * (n_features * log(2.0 * np.pi) - fast_logdet(precision)) + return log_like + + def score(self, X, y=None): + """Return the average log-likelihood of all samples. + + See. "Pattern Recognition and Machine Learning" + by C. Bishop, 12.2.1 p. 574 + or http://www.miketipping.com/papers/met-mppca.pdf + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data. + + y : Ignored + Ignored. + + Returns + ------- + ll : float + Average log-likelihood of the samples under the current model. + """ + xp, _ = get_namespace(X) + return float(xp.mean(self.score_samples(X))) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.transformer_tags.preserves_dtype = ["float64", "float32"] + tags.array_api_support = True + tags.input_tags.sparse = self.svd_solver in ( + "auto", + "arpack", + "covariance_eigh", + ) + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/_sparse_pca.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_sparse_pca.py new file mode 100644 index 0000000000000000000000000000000000000000..2717230c9df92511543eed80c4c52d39e54d15d3 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_sparse_pca.py @@ -0,0 +1,548 @@ +"""Matrix factorization with Sparse PCA.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from numbers import Integral, Real + +import numpy as np + +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + TransformerMixin, + _fit_context, +) +from ..linear_model import ridge_regression +from ..utils import check_random_state +from ..utils._param_validation import Interval, StrOptions +from ..utils.extmath import svd_flip +from ..utils.validation import check_array, check_is_fitted, validate_data +from ._dict_learning import MiniBatchDictionaryLearning, dict_learning + + +class _BaseSparsePCA(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): + """Base class for SparsePCA and MiniBatchSparsePCA""" + + _parameter_constraints: dict = { + "n_components": [None, Interval(Integral, 1, None, closed="left")], + "alpha": [Interval(Real, 0.0, None, closed="left")], + "ridge_alpha": [Interval(Real, 0.0, None, closed="left")], + "max_iter": [Interval(Integral, 0, None, closed="left")], + "tol": [Interval(Real, 0.0, None, closed="left")], + "method": [StrOptions({"lars", "cd"})], + "n_jobs": [Integral, None], + "verbose": ["verbose"], + "random_state": ["random_state"], + } + + def __init__( + self, + n_components=None, + *, + alpha=1, + ridge_alpha=0.01, + max_iter=1000, + tol=1e-8, + method="lars", + n_jobs=None, + verbose=False, + random_state=None, + ): + self.n_components = n_components + self.alpha = alpha + self.ridge_alpha = ridge_alpha + self.max_iter = max_iter + self.tol = tol + self.method = method + self.n_jobs = n_jobs + self.verbose = verbose + self.random_state = random_state + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit the model from data in X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples + and `n_features` is the number of features. 
+ + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + self : object + Returns the instance itself. + """ + random_state = check_random_state(self.random_state) + X = validate_data(self, X) + + self.mean_ = X.mean(axis=0) + X = X - self.mean_ + + if self.n_components is None: + n_components = X.shape[1] + else: + n_components = self.n_components + + return self._fit(X, n_components, random_state) + + def transform(self, X): + """Least Squares projection of the data onto the sparse components. + + To avoid instability issues in case the system is under-determined, + regularization can be applied (Ridge regression) via the + `ridge_alpha` parameter. + + Note that Sparse PCA components orthogonality is not enforced as in PCA + hence one cannot use a simple linear projection. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Test data to be transformed, must have the same number of + features as the data used to train the model. + + Returns + ------- + X_new : ndarray of shape (n_samples, n_components) + Transformed data. + """ + check_is_fitted(self) + + X = validate_data(self, X, reset=False) + X = X - self.mean_ + + U = ridge_regression( + self.components_.T, X.T, self.ridge_alpha, solver="cholesky" + ) + + return U + + def inverse_transform(self, X): + """Transform data from the latent space to the original space. + + This inversion is an approximation due to the loss of information + induced by the forward decomposition. + + .. versionadded:: 1.2 + + Parameters + ---------- + X : ndarray of shape (n_samples, n_components) + Data in the latent space. + + Returns + ------- + X_original : ndarray of shape (n_samples, n_features) + Reconstructed data in the original space. + """ + check_is_fitted(self) + X = check_array(X) + + return (X @ self.components_) + self.mean_ + + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.transformer_tags.preserves_dtype = ["float64", "float32"] + return tags + + +class SparsePCA(_BaseSparsePCA): + """Sparse Principal Components Analysis (SparsePCA). + + Finds the set of sparse components that can optimally reconstruct + the data. The amount of sparseness is controllable by the coefficient + of the L1 penalty, given by the parameter alpha. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int, default=None + Number of sparse atoms to extract. If None, then ``n_components`` + is set to ``n_features``. + + alpha : float, default=1 + Sparsity controlling parameter. Higher values lead to sparser + components. + + ridge_alpha : float, default=0.01 + Amount of ridge shrinkage to apply in order to improve + conditioning when calling the transform method. + + max_iter : int, default=1000 + Maximum number of iterations to perform. + + tol : float, default=1e-8 + Tolerance for the stopping condition. + + method : {'lars', 'cd'}, default='lars' + Method to be used for optimization. + lars: uses the least angle regression method to solve the lasso problem + (linear_model.lars_path) + cd: uses the coordinate descent method to compute the + Lasso solution (linear_model.Lasso). Lars will be faster if + the estimated components are sparse. + + n_jobs : int, default=None + Number of parallel jobs to run. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. 
See :term:`Glossary ` + for more details. + + U_init : ndarray of shape (n_samples, n_components), default=None + Initial values for the loadings for warm restart scenarios. Only used + if `U_init` and `V_init` are not None. + + V_init : ndarray of shape (n_components, n_features), default=None + Initial values for the components for warm restart scenarios. Only used + if `U_init` and `V_init` are not None. + + verbose : int or bool, default=False + Controls the verbosity; the higher, the more messages. Defaults to 0. + + random_state : int, RandomState instance or None, default=None + Used during dictionary learning. Pass an int for reproducible results + across multiple function calls. + See :term:`Glossary `. + + Attributes + ---------- + components_ : ndarray of shape (n_components, n_features) + Sparse components extracted from the data. + + error_ : ndarray + Vector of errors at each iteration. + + n_components_ : int + Estimated number of components. + + .. versionadded:: 0.23 + + n_iter_ : int + Number of iterations run. + + mean_ : ndarray of shape (n_features,) + Per-feature empirical mean, estimated from the training set. + Equal to ``X.mean(axis=0)``. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + PCA : Principal Component Analysis implementation. + MiniBatchSparsePCA : Mini batch variant of `SparsePCA` that is faster but less + accurate. + DictionaryLearning : Generic dictionary learning problem using a sparse code. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.datasets import make_friedman1 + >>> from sklearn.decomposition import SparsePCA + >>> X, _ = make_friedman1(n_samples=200, n_features=30, random_state=0) + >>> transformer = SparsePCA(n_components=5, random_state=0) + >>> transformer.fit(X) + SparsePCA(...) 
+ >>> X_transformed = transformer.transform(X) + >>> X_transformed.shape + (200, 5) + >>> # most values in the components_ are zero (sparsity) + >>> np.mean(transformer.components_ == 0) + np.float64(0.9666) + """ + + _parameter_constraints: dict = { + **_BaseSparsePCA._parameter_constraints, + "U_init": [None, np.ndarray], + "V_init": [None, np.ndarray], + } + + def __init__( + self, + n_components=None, + *, + alpha=1, + ridge_alpha=0.01, + max_iter=1000, + tol=1e-8, + method="lars", + n_jobs=None, + U_init=None, + V_init=None, + verbose=False, + random_state=None, + ): + super().__init__( + n_components=n_components, + alpha=alpha, + ridge_alpha=ridge_alpha, + max_iter=max_iter, + tol=tol, + method=method, + n_jobs=n_jobs, + verbose=verbose, + random_state=random_state, + ) + self.U_init = U_init + self.V_init = V_init + + def _fit(self, X, n_components, random_state): + """Specialized `fit` for SparsePCA.""" + + code_init = self.V_init.T if self.V_init is not None else None + dict_init = self.U_init.T if self.U_init is not None else None + code, dictionary, E, self.n_iter_ = dict_learning( + X.T, + n_components, + alpha=self.alpha, + tol=self.tol, + max_iter=self.max_iter, + method=self.method, + n_jobs=self.n_jobs, + verbose=self.verbose, + random_state=random_state, + code_init=code_init, + dict_init=dict_init, + return_n_iter=True, + ) + # flip eigenvectors' sign to enforce deterministic output + code, dictionary = svd_flip(code, dictionary, u_based_decision=True) + self.components_ = code.T + components_norm = np.linalg.norm(self.components_, axis=1)[:, np.newaxis] + components_norm[components_norm == 0] = 1 + self.components_ /= components_norm + self.n_components_ = len(self.components_) + + self.error_ = E + return self + + +class MiniBatchSparsePCA(_BaseSparsePCA): + """Mini-batch Sparse Principal Components Analysis. + + Finds the set of sparse components that can optimally reconstruct + the data. The amount of sparseness is controllable by the coefficient + of the L1 penalty, given by the parameter alpha. + + For an example comparing sparse PCA to PCA, see + :ref:`sphx_glr_auto_examples_decomposition_plot_faces_decomposition.py` + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int, default=None + Number of sparse atoms to extract. If None, then ``n_components`` + is set to ``n_features``. + + alpha : int, default=1 + Sparsity controlling parameter. Higher values lead to sparser + components. + + ridge_alpha : float, default=0.01 + Amount of ridge shrinkage to apply in order to improve + conditioning when calling the transform method. + + max_iter : int, default=1_000 + Maximum number of iterations over the complete dataset before + stopping independently of any early stopping criterion heuristics. + + .. versionadded:: 1.2 + + callback : callable, default=None + Callable that gets invoked every five iterations. + + batch_size : int, default=3 + The number of features to take in each mini batch. + + verbose : int or bool, default=False + Controls the verbosity; the higher, the more messages. Defaults to 0. + + shuffle : bool, default=True + Whether to shuffle the data before splitting it in batches. + + n_jobs : int, default=None + Number of parallel jobs to run. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + method : {'lars', 'cd'}, default='lars' + Method to be used for optimization. 
+ lars: uses the least angle regression method to solve the lasso problem + (linear_model.lars_path) + cd: uses the coordinate descent method to compute the + Lasso solution (linear_model.Lasso). Lars will be faster if + the estimated components are sparse. + + random_state : int, RandomState instance or None, default=None + Used for random shuffling when ``shuffle`` is set to ``True``, + during online dictionary learning. Pass an int for reproducible results + across multiple function calls. + See :term:`Glossary `. + + tol : float, default=1e-3 + Control early stopping based on the norm of the differences in the + dictionary between 2 steps. + + To disable early stopping based on changes in the dictionary, set + `tol` to 0.0. + + .. versionadded:: 1.1 + + max_no_improvement : int or None, default=10 + Control early stopping based on the consecutive number of mini batches + that does not yield an improvement on the smoothed cost function. + + To disable convergence detection based on cost function, set + `max_no_improvement` to `None`. + + .. versionadded:: 1.1 + + Attributes + ---------- + components_ : ndarray of shape (n_components, n_features) + Sparse components extracted from the data. + + n_components_ : int + Estimated number of components. + + .. versionadded:: 0.23 + + n_iter_ : int + Number of iterations run. + + mean_ : ndarray of shape (n_features,) + Per-feature empirical mean, estimated from the training set. + Equal to ``X.mean(axis=0)``. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + DictionaryLearning : Find a dictionary that sparsely encodes data. + IncrementalPCA : Incremental principal components analysis. + PCA : Principal component analysis. + SparsePCA : Sparse Principal Components Analysis. + TruncatedSVD : Dimensionality reduction using truncated SVD. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.datasets import make_friedman1 + >>> from sklearn.decomposition import MiniBatchSparsePCA + >>> X, _ = make_friedman1(n_samples=200, n_features=30, random_state=0) + >>> transformer = MiniBatchSparsePCA(n_components=5, batch_size=50, + ... max_iter=10, random_state=0) + >>> transformer.fit(X) + MiniBatchSparsePCA(...) 
+ >>> X_transformed = transformer.transform(X) + >>> X_transformed.shape + (200, 5) + >>> # most values in the components_ are zero (sparsity) + >>> np.mean(transformer.components_ == 0) + np.float64(0.9) + """ + + _parameter_constraints: dict = { + **_BaseSparsePCA._parameter_constraints, + "max_iter": [Interval(Integral, 0, None, closed="left")], + "callback": [None, callable], + "batch_size": [Interval(Integral, 1, None, closed="left")], + "shuffle": ["boolean"], + "max_no_improvement": [Interval(Integral, 0, None, closed="left"), None], + } + + def __init__( + self, + n_components=None, + *, + alpha=1, + ridge_alpha=0.01, + max_iter=1_000, + callback=None, + batch_size=3, + verbose=False, + shuffle=True, + n_jobs=None, + method="lars", + random_state=None, + tol=1e-3, + max_no_improvement=10, + ): + super().__init__( + n_components=n_components, + alpha=alpha, + ridge_alpha=ridge_alpha, + max_iter=max_iter, + tol=tol, + method=method, + n_jobs=n_jobs, + verbose=verbose, + random_state=random_state, + ) + self.callback = callback + self.batch_size = batch_size + self.shuffle = shuffle + self.max_no_improvement = max_no_improvement + + def _fit(self, X, n_components, random_state): + """Specialized `fit` for MiniBatchSparsePCA.""" + + transform_algorithm = "lasso_" + self.method + est = MiniBatchDictionaryLearning( + n_components=n_components, + alpha=self.alpha, + max_iter=self.max_iter, + dict_init=None, + batch_size=self.batch_size, + shuffle=self.shuffle, + n_jobs=self.n_jobs, + fit_algorithm=self.method, + random_state=random_state, + transform_algorithm=transform_algorithm, + transform_alpha=self.alpha, + verbose=self.verbose, + callback=self.callback, + tol=self.tol, + max_no_improvement=self.max_no_improvement, + ) + est.set_output(transform="default") + est.fit(X.T) + + self.components_, self.n_iter_ = est.transform(X.T).T, est.n_iter_ + + components_norm = np.linalg.norm(self.components_, axis=1)[:, np.newaxis] + components_norm[components_norm == 0] = 1 + self.components_ /= components_norm + self.n_components_ = len(self.components_) + + return self diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/_truncated_svd.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_truncated_svd.py new file mode 100644 index 0000000000000000000000000000000000000000..6165aba4e8db6a0eaa8c81d54a98214e6c782cae --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_truncated_svd.py @@ -0,0 +1,322 @@ +"""Truncated SVD for sparse matrices, aka latent semantic analysis (LSA).""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from numbers import Integral, Real + +import numpy as np +import scipy.sparse as sp +from scipy.sparse.linalg import svds + +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + TransformerMixin, + _fit_context, +) +from ..utils import check_array, check_random_state +from ..utils._arpack import _init_arpack_v0 +from ..utils._param_validation import Interval, StrOptions +from ..utils.extmath import _randomized_svd, safe_sparse_dot, svd_flip +from ..utils.sparsefuncs import mean_variance_axis +from ..utils.validation import check_is_fitted, validate_data + +__all__ = ["TruncatedSVD"] + + +class TruncatedSVD(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): + """Dimensionality reduction using truncated SVD (aka LSA). + + This transformer performs linear dimensionality reduction by means of + truncated singular value decomposition (SVD). 
Contrary to PCA, this + estimator does not center the data before computing the singular value + decomposition. This means it can work with sparse matrices + efficiently. + + In particular, truncated SVD works on term count/tf-idf matrices as + returned by the vectorizers in :mod:`sklearn.feature_extraction.text`. In + that context, it is known as latent semantic analysis (LSA). + + This estimator supports two algorithms: a fast randomized SVD solver, and + a "naive" algorithm that uses ARPACK as an eigensolver on `X * X.T` or + `X.T * X`, whichever is more efficient. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int, default=2 + Desired dimensionality of output data. + If algorithm='arpack', must be strictly less than the number of features. + If algorithm='randomized', must be less than or equal to the number of features. + The default value is useful for visualisation. For LSA, a value of + 100 is recommended. + + algorithm : {'arpack', 'randomized'}, default='randomized' + SVD solver to use. Either "arpack" for the ARPACK wrapper in SciPy + (scipy.sparse.linalg.svds), or "randomized" for the randomized + algorithm due to Halko (2009). + + n_iter : int, default=5 + Number of iterations for randomized SVD solver. Not used by ARPACK. The + default is larger than the default in + :func:`~sklearn.utils.extmath.randomized_svd` to handle sparse + matrices that may have large slowly decaying spectrum. + + n_oversamples : int, default=10 + Number of oversamples for randomized SVD solver. Not used by ARPACK. + See :func:`~sklearn.utils.extmath.randomized_svd` for a complete + description. + + .. versionadded:: 1.1 + + power_iteration_normalizer : {'auto', 'QR', 'LU', 'none'}, default='auto' + Power iteration normalizer for randomized SVD solver. + Not used by ARPACK. See :func:`~sklearn.utils.extmath.randomized_svd` + for more details. + + .. versionadded:: 1.1 + + random_state : int, RandomState instance or None, default=None + Used during randomized svd. Pass an int for reproducible results across + multiple function calls. + See :term:`Glossary `. + + tol : float, default=0.0 + Tolerance for ARPACK. 0 means machine precision. Ignored by randomized + SVD solver. + + Attributes + ---------- + components_ : ndarray of shape (n_components, n_features) + The right singular vectors of the input data. + + explained_variance_ : ndarray of shape (n_components,) + The variance of the training samples transformed by a projection to + each component. + + explained_variance_ratio_ : ndarray of shape (n_components,) + Percentage of variance explained by each of the selected components. + + singular_values_ : ndarray of shape (n_components,) + The singular values corresponding to each of the selected components. + The singular values are equal to the 2-norms of the ``n_components`` + variables in the lower-dimensional space. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + DictionaryLearning : Find a dictionary that sparsely encodes data. + FactorAnalysis : A simple linear generative model with + Gaussian latent variables. + IncrementalPCA : Incremental principal components analysis. + KernelPCA : Kernel Principal component analysis. + NMF : Non-Negative Matrix Factorization. 
+ PCA : Principal component analysis. + + Notes + ----- + SVD suffers from a problem called "sign indeterminacy", which means the + sign of the ``components_`` and the output from transform depend on the + algorithm and random state. To work around this, fit instances of this + class to data once, then keep the instance around to do transformations. + + References + ---------- + :arxiv:`Halko, et al. (2009). "Finding structure with randomness: + Stochastic algorithms for constructing approximate matrix decompositions" + <0909.4061>` + + Examples + -------- + >>> from sklearn.decomposition import TruncatedSVD + >>> from scipy.sparse import csr_matrix + >>> import numpy as np + >>> np.random.seed(0) + >>> X_dense = np.random.rand(100, 100) + >>> X_dense[:, 2 * np.arange(50)] = 0 + >>> X = csr_matrix(X_dense) + >>> svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42) + >>> svd.fit(X) + TruncatedSVD(n_components=5, n_iter=7, random_state=42) + >>> print(svd.explained_variance_ratio_) + [0.0157 0.0512 0.0499 0.0479 0.0453] + >>> print(svd.explained_variance_ratio_.sum()) + 0.2102 + >>> print(svd.singular_values_) + [35.2410 4.5981 4.5420 4.4486 4.3288] + """ + + _parameter_constraints: dict = { + "n_components": [Interval(Integral, 1, None, closed="left")], + "algorithm": [StrOptions({"arpack", "randomized"})], + "n_iter": [Interval(Integral, 0, None, closed="left")], + "n_oversamples": [Interval(Integral, 1, None, closed="left")], + "power_iteration_normalizer": [StrOptions({"auto", "OR", "LU", "none"})], + "random_state": ["random_state"], + "tol": [Interval(Real, 0, None, closed="left")], + } + + def __init__( + self, + n_components=2, + *, + algorithm="randomized", + n_iter=5, + n_oversamples=10, + power_iteration_normalizer="auto", + random_state=None, + tol=0.0, + ): + self.algorithm = algorithm + self.n_components = n_components + self.n_iter = n_iter + self.n_oversamples = n_oversamples + self.power_iteration_normalizer = power_iteration_normalizer + self.random_state = random_state + self.tol = tol + + def fit(self, X, y=None): + """Fit model on training data X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + self : object + Returns the transformer object. + """ + self.fit_transform(X) + return self + + @_fit_context(prefer_skip_nested_validation=True) + def fit_transform(self, X, y=None): + """Fit model to X and perform dimensionality reduction on X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + X_new : ndarray of shape (n_samples, n_components) + Reduced version of X. This will always be a dense array. + """ + X = validate_data(self, X, accept_sparse=["csr", "csc"], ensure_min_features=2) + random_state = check_random_state(self.random_state) + + if self.algorithm == "arpack": + v0 = _init_arpack_v0(min(X.shape), random_state) + U, Sigma, VT = svds(X, k=self.n_components, tol=self.tol, v0=v0) + # svds doesn't abide by scipy.linalg.svd/randomized_svd + # conventions, so reverse its outputs. + Sigma = Sigma[::-1] + # u_based_decision=False is needed to be consistent with PCA. 
+ U, VT = svd_flip(U[:, ::-1], VT[::-1], u_based_decision=False) + + elif self.algorithm == "randomized": + if self.n_components > X.shape[1]: + raise ValueError( + f"n_components({self.n_components}) must be <=" + f" n_features({X.shape[1]})." + ) + U, Sigma, VT = _randomized_svd( + X, + self.n_components, + n_iter=self.n_iter, + n_oversamples=self.n_oversamples, + power_iteration_normalizer=self.power_iteration_normalizer, + random_state=random_state, + flip_sign=False, + ) + U, VT = svd_flip(U, VT, u_based_decision=False) + + self.components_ = VT + + # As a result of the SVD approximation error on X ~ U @ Sigma @ V.T, + # X @ V is not the same as U @ Sigma + if self.algorithm == "randomized" or ( + self.algorithm == "arpack" and self.tol > 0 + ): + X_transformed = safe_sparse_dot(X, self.components_.T) + else: + X_transformed = U * Sigma + + # Calculate explained variance & explained variance ratio + self.explained_variance_ = exp_var = np.var(X_transformed, axis=0) + if sp.issparse(X): + _, full_var = mean_variance_axis(X, axis=0) + full_var = full_var.sum() + else: + full_var = np.var(X, axis=0).sum() + self.explained_variance_ratio_ = exp_var / full_var + self.singular_values_ = Sigma # Store the singular values. + + return X_transformed + + def transform(self, X): + """Perform dimensionality reduction on X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + New data. + + Returns + ------- + X_new : ndarray of shape (n_samples, n_components) + Reduced version of X. This will always be a dense array. + """ + check_is_fitted(self) + X = validate_data(self, X, accept_sparse=["csr", "csc"], reset=False) + return safe_sparse_dot(X, self.components_.T) + + def inverse_transform(self, X): + """Transform X back to its original space. + + Returns an array X_original whose transform would be X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_components) + New data. + + Returns + ------- + X_original : ndarray of shape (n_samples, n_features) + Note that this is always a dense array. 
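+
+        Examples
+        --------
+        A minimal round-trip sketch (illustrative); the reconstruction is
+        only approximate since just ``n_components`` directions are kept:
+
+        >>> import numpy as np
+        >>> from sklearn.decomposition import TruncatedSVD
+        >>> X = np.random.RandomState(0).rand(10, 6)
+        >>> svd = TruncatedSVD(n_components=3, random_state=0).fit(X)
+        >>> svd.inverse_transform(svd.transform(X)).shape
+        (10, 6)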
+ """ + X = check_array(X) + return np.dot(X, self.components_) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + tags.transformer_tags.preserves_dtype = ["float64", "float32"] + return tags + + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/meson.build b/.venv/lib/python3.12/site-packages/sklearn/decomposition/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..75b67a46981f4e394b73332980b4088087d6bc23 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/meson.build @@ -0,0 +1,14 @@ +py.extension_module( + '_online_lda_fast', + [cython_gen.process('_online_lda_fast.pyx'), utils_cython_tree], + subdir: 'sklearn/decomposition', + install: true +) + +py.extension_module( + '_cdnmf_fast', + cython_gen.process('_cdnmf_fast.pyx'), + dependencies: [np_dep], + subdir: 'sklearn/decomposition', + install: true +) diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_dict_learning.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_dict_learning.py new file mode 100644 index 0000000000000000000000000000000000000000..717c56d0abdbecb8636033f9515a41a8f35f1151 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_dict_learning.py @@ -0,0 +1,988 @@ +import itertools +import warnings +from functools import partial + +import numpy as np +import pytest + +import sklearn +from sklearn.base import clone +from sklearn.decomposition import ( + DictionaryLearning, + MiniBatchDictionaryLearning, + SparseCoder, + dict_learning, + dict_learning_online, + sparse_encode, +) +from sklearn.decomposition._dict_learning import _update_dict +from sklearn.exceptions import ConvergenceWarning +from sklearn.utils import check_array +from sklearn.utils._testing import ( + TempMemmap, + assert_allclose, + assert_array_almost_equal, + assert_array_equal, + ignore_warnings, +) +from sklearn.utils.estimator_checks import ( + check_transformer_data_not_an_array, + check_transformer_general, + check_transformers_unfitted, +) +from sklearn.utils.parallel import Parallel + +rng_global = np.random.RandomState(0) +n_samples, n_features = 10, 8 +X = rng_global.randn(n_samples, n_features) + + +def test_sparse_encode_shapes_omp(): + rng = np.random.RandomState(0) + algorithms = ["omp", "lasso_lars", "lasso_cd", "lars", "threshold"] + for n_components, n_samples in itertools.product([1, 5], [1, 9]): + X_ = rng.randn(n_samples, n_features) + dictionary = rng.randn(n_components, n_features) + for algorithm, n_jobs in itertools.product(algorithms, [1, 2]): + code = sparse_encode(X_, dictionary, algorithm=algorithm, n_jobs=n_jobs) + assert code.shape == (n_samples, n_components) + + +def test_dict_learning_shapes(): + n_components = 5 + dico = DictionaryLearning(n_components, random_state=0).fit(X) + assert dico.components_.shape == (n_components, n_features) + + n_components = 1 + dico = DictionaryLearning(n_components, random_state=0).fit(X) + assert dico.components_.shape == (n_components, n_features) + assert dico.transform(X).shape == 
(X.shape[0], n_components) + + +def test_dict_learning_overcomplete(): + n_components = 12 + dico = DictionaryLearning(n_components, random_state=0).fit(X) + assert dico.components_.shape == (n_components, n_features) + + +def test_max_iter(): + def ricker_function(resolution, center, width): + """Discrete sub-sampled Ricker (Mexican hat) wavelet""" + x = np.linspace(0, resolution - 1, resolution) + x = ( + (2 / (np.sqrt(3 * width) * np.pi**0.25)) + * (1 - (x - center) ** 2 / width**2) + * np.exp(-((x - center) ** 2) / (2 * width**2)) + ) + return x + + def ricker_matrix(width, resolution, n_components): + """Dictionary of Ricker (Mexican hat) wavelets""" + centers = np.linspace(0, resolution - 1, n_components) + D = np.empty((n_components, resolution)) + for i, center in enumerate(centers): + D[i] = ricker_function(resolution, center, width) + D /= np.sqrt(np.sum(D**2, axis=1))[:, np.newaxis] + return D + + transform_algorithm = "lasso_cd" + resolution = 1024 + subsampling = 3 # subsampling factor + n_components = resolution // subsampling + + # Compute a wavelet dictionary + D_multi = np.r_[ + tuple( + ricker_matrix( + width=w, resolution=resolution, n_components=n_components // 5 + ) + for w in (10, 50, 100, 500, 1000) + ) + ] + + X = np.linspace(0, resolution - 1, resolution) + first_quarter = X < resolution / 4 + X[first_quarter] = 3.0 + X[np.logical_not(first_quarter)] = -1.0 + X = X.reshape(1, -1) + + # check that the underlying model fails to converge + with pytest.warns(ConvergenceWarning): + model = SparseCoder( + D_multi, transform_algorithm=transform_algorithm, transform_max_iter=1 + ) + model.fit_transform(X) + + # check that the underlying model converges w/o warnings + with warnings.catch_warnings(): + warnings.simplefilter("error", ConvergenceWarning) + model = SparseCoder( + D_multi, transform_algorithm=transform_algorithm, transform_max_iter=2000 + ) + model.fit_transform(X) + + +def test_dict_learning_lars_positive_parameter(): + n_components = 5 + alpha = 1 + err_msg = "Positive constraint not supported for 'lars' coding method." 
+ with pytest.raises(ValueError, match=err_msg): + dict_learning(X, n_components, alpha=alpha, positive_code=True) + + +@pytest.mark.parametrize( + "transform_algorithm", + [ + "lasso_lars", + "lasso_cd", + "threshold", + ], +) +@pytest.mark.parametrize("positive_code", [False, True]) +@pytest.mark.parametrize("positive_dict", [False, True]) +def test_dict_learning_positivity(transform_algorithm, positive_code, positive_dict): + n_components = 5 + dico = DictionaryLearning( + n_components, + transform_algorithm=transform_algorithm, + random_state=0, + positive_code=positive_code, + positive_dict=positive_dict, + fit_algorithm="cd", + ).fit(X) + + code = dico.transform(X) + if positive_dict: + assert (dico.components_ >= 0).all() + else: + assert (dico.components_ < 0).any() + if positive_code: + assert (code >= 0).all() + else: + assert (code < 0).any() + + +@pytest.mark.parametrize("positive_dict", [False, True]) +def test_dict_learning_lars_dict_positivity(positive_dict): + n_components = 5 + dico = DictionaryLearning( + n_components, + transform_algorithm="lars", + random_state=0, + positive_dict=positive_dict, + fit_algorithm="cd", + ).fit(X) + + if positive_dict: + assert (dico.components_ >= 0).all() + else: + assert (dico.components_ < 0).any() + + +def test_dict_learning_lars_code_positivity(): + n_components = 5 + dico = DictionaryLearning( + n_components, + transform_algorithm="lars", + random_state=0, + positive_code=True, + fit_algorithm="cd", + ).fit(X) + + err_msg = "Positive constraint not supported for '{}' coding method." + err_msg = err_msg.format("lars") + with pytest.raises(ValueError, match=err_msg): + dico.transform(X) + + +def test_dict_learning_reconstruction(): + n_components = 12 + dico = DictionaryLearning( + n_components, transform_algorithm="omp", transform_alpha=0.001, random_state=0 + ) + code = dico.fit(X).transform(X) + assert_array_almost_equal(np.dot(code, dico.components_), X) + assert_array_almost_equal(dico.inverse_transform(code), X) + + dico.set_params(transform_algorithm="lasso_lars") + code = dico.transform(X) + assert_array_almost_equal(np.dot(code, dico.components_), X, decimal=2) + assert_array_almost_equal(dico.inverse_transform(code), X, decimal=2) + + # test error raised for wrong code size + with pytest.raises(ValueError, match="Expected 12, got 11."): + dico.inverse_transform(code[:, :-1]) + + # used to test lars here too, but there's no guarantee the number of + # nonzero atoms is right. 
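+
+
+# Editorial sketch (not part of the upstream test suite): the reconstruction
+# identity exercised above boils down to ``X ~= code @ dico.components_``,
+# for instance:
+#
+#     dico = DictionaryLearning(12, transform_algorithm="omp",
+#                               transform_alpha=0.001, random_state=0).fit(X)
+#     code = dico.transform(X)
+#     np.allclose(code @ dico.components_, X, atol=1e-4)
+#
+# which is expected to hold given the assertions in the tests above.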
+ + +def test_dict_learning_reconstruction_parallel(): + # regression test that parallel reconstruction works with n_jobs>1 + n_components = 12 + dico = DictionaryLearning( + n_components, + transform_algorithm="omp", + transform_alpha=0.001, + random_state=0, + n_jobs=4, + ) + code = dico.fit(X).transform(X) + assert_array_almost_equal(np.dot(code, dico.components_), X) + + dico.set_params(transform_algorithm="lasso_lars") + code = dico.transform(X) + assert_array_almost_equal(np.dot(code, dico.components_), X, decimal=2) + + +def test_dict_learning_lassocd_readonly_data(): + n_components = 12 + with TempMemmap(X) as X_read_only: + dico = DictionaryLearning( + n_components, + transform_algorithm="lasso_cd", + transform_alpha=0.001, + random_state=0, + n_jobs=4, + ) + with ignore_warnings(category=ConvergenceWarning): + code = dico.fit(X_read_only).transform(X_read_only) + assert_array_almost_equal( + np.dot(code, dico.components_), X_read_only, decimal=2 + ) + + +def test_dict_learning_nonzero_coefs(): + n_components = 4 + dico = DictionaryLearning( + n_components, + transform_algorithm="lars", + transform_n_nonzero_coefs=3, + random_state=0, + ) + code = dico.fit(X).transform(X[np.newaxis, 1]) + assert len(np.flatnonzero(code)) == 3 + + dico.set_params(transform_algorithm="omp") + code = dico.transform(X[np.newaxis, 1]) + assert len(np.flatnonzero(code)) == 3 + + +def test_dict_learning_split(): + n_components = 5 + dico = DictionaryLearning( + n_components, transform_algorithm="threshold", random_state=0 + ) + code = dico.fit(X).transform(X) + Xr = dico.inverse_transform(code) + + dico.split_sign = True + split_code = dico.transform(X) + + assert_array_almost_equal( + split_code[:, :n_components] - split_code[:, n_components:], code + ) + + Xr2 = dico.inverse_transform(split_code) + assert_array_almost_equal(Xr, Xr2) + + +def test_dict_learning_online_shapes(): + rng = np.random.RandomState(0) + n_components = 8 + + code, dictionary = dict_learning_online( + X, + n_components=n_components, + batch_size=4, + max_iter=10, + method="cd", + random_state=rng, + return_code=True, + ) + assert code.shape == (n_samples, n_components) + assert dictionary.shape == (n_components, n_features) + assert np.dot(code, dictionary).shape == X.shape + + dictionary = dict_learning_online( + X, + n_components=n_components, + batch_size=4, + max_iter=10, + method="cd", + random_state=rng, + return_code=False, + ) + assert dictionary.shape == (n_components, n_features) + + +def test_dict_learning_online_lars_positive_parameter(): + err_msg = "Positive constraint not supported for 'lars' coding method." 
+ with pytest.raises(ValueError, match=err_msg): + dict_learning_online(X, batch_size=4, max_iter=10, positive_code=True) + + +@pytest.mark.parametrize( + "transform_algorithm", + [ + "lasso_lars", + "lasso_cd", + "threshold", + ], +) +@pytest.mark.parametrize("positive_code", [False, True]) +@pytest.mark.parametrize("positive_dict", [False, True]) +def test_minibatch_dictionary_learning_positivity( + transform_algorithm, positive_code, positive_dict +): + n_components = 8 + dico = MiniBatchDictionaryLearning( + n_components, + batch_size=4, + max_iter=10, + transform_algorithm=transform_algorithm, + random_state=0, + positive_code=positive_code, + positive_dict=positive_dict, + fit_algorithm="cd", + ).fit(X) + + code = dico.transform(X) + if positive_dict: + assert (dico.components_ >= 0).all() + else: + assert (dico.components_ < 0).any() + if positive_code: + assert (code >= 0).all() + else: + assert (code < 0).any() + + +@pytest.mark.parametrize("positive_dict", [False, True]) +def test_minibatch_dictionary_learning_lars(positive_dict): + n_components = 8 + + dico = MiniBatchDictionaryLearning( + n_components, + batch_size=4, + max_iter=10, + transform_algorithm="lars", + random_state=0, + positive_dict=positive_dict, + fit_algorithm="cd", + ).fit(X) + + if positive_dict: + assert (dico.components_ >= 0).all() + else: + assert (dico.components_ < 0).any() + + +@pytest.mark.parametrize("positive_code", [False, True]) +@pytest.mark.parametrize("positive_dict", [False, True]) +def test_dict_learning_online_positivity(positive_code, positive_dict): + rng = np.random.RandomState(0) + n_components = 8 + + code, dictionary = dict_learning_online( + X, + n_components=n_components, + batch_size=4, + method="cd", + alpha=1, + random_state=rng, + positive_dict=positive_dict, + positive_code=positive_code, + ) + if positive_dict: + assert (dictionary >= 0).all() + else: + assert (dictionary < 0).any() + if positive_code: + assert (code >= 0).all() + else: + assert (code < 0).any() + + +def test_dict_learning_online_verbosity(): + # test verbosity for better coverage + n_components = 5 + import sys + from io import StringIO + + old_stdout = sys.stdout + try: + sys.stdout = StringIO() + + # convergence monitoring verbosity + dico = MiniBatchDictionaryLearning( + n_components, batch_size=4, max_iter=5, verbose=1, tol=0.1, random_state=0 + ) + dico.fit(X) + dico = MiniBatchDictionaryLearning( + n_components, + batch_size=4, + max_iter=5, + verbose=1, + max_no_improvement=2, + random_state=0, + ) + dico.fit(X) + # higher verbosity level + dico = MiniBatchDictionaryLearning( + n_components, batch_size=4, max_iter=5, verbose=2, random_state=0 + ) + dico.fit(X) + + # function API verbosity + dict_learning_online( + X, + n_components=n_components, + batch_size=4, + alpha=1, + verbose=1, + random_state=0, + ) + dict_learning_online( + X, + n_components=n_components, + batch_size=4, + alpha=1, + verbose=2, + random_state=0, + ) + finally: + sys.stdout = old_stdout + + assert dico.components_.shape == (n_components, n_features) + + +def test_dict_learning_online_estimator_shapes(): + n_components = 5 + dico = MiniBatchDictionaryLearning( + n_components, batch_size=4, max_iter=5, random_state=0 + ) + dico.fit(X) + assert dico.components_.shape == (n_components, n_features) + + +def test_dict_learning_online_overcomplete(): + n_components = 12 + dico = MiniBatchDictionaryLearning( + n_components, batch_size=4, max_iter=5, random_state=0 + ).fit(X) + assert dico.components_.shape == (n_components, n_features) + 
+ +def test_dict_learning_online_initialization(): + n_components = 12 + rng = np.random.RandomState(0) + V = rng.randn(n_components, n_features) + dico = MiniBatchDictionaryLearning( + n_components, batch_size=4, max_iter=0, dict_init=V, random_state=0 + ).fit(X) + assert_array_equal(dico.components_, V) + + +def test_dict_learning_online_readonly_initialization(): + n_components = 12 + rng = np.random.RandomState(0) + V = rng.randn(n_components, n_features) + V.setflags(write=False) + MiniBatchDictionaryLearning( + n_components, + batch_size=4, + max_iter=1, + dict_init=V, + random_state=0, + shuffle=False, + ).fit(X) + + +def test_dict_learning_online_partial_fit(): + n_components = 12 + rng = np.random.RandomState(0) + V = rng.randn(n_components, n_features) # random init + V /= np.sum(V**2, axis=1)[:, np.newaxis] + dict1 = MiniBatchDictionaryLearning( + n_components, + max_iter=10, + batch_size=1, + alpha=1, + shuffle=False, + dict_init=V, + max_no_improvement=None, + tol=0.0, + random_state=0, + ).fit(X) + dict2 = MiniBatchDictionaryLearning( + n_components, alpha=1, dict_init=V, random_state=0 + ) + for i in range(10): + for sample in X: + dict2.partial_fit(sample[np.newaxis, :]) + + assert not np.all(sparse_encode(X, dict1.components_, alpha=1) == 0) + assert_array_almost_equal(dict1.components_, dict2.components_, decimal=2) + + # partial_fit should ignore max_iter (#17433) + assert dict1.n_steps_ == dict2.n_steps_ == 100 + + +def test_sparse_encode_shapes(): + n_components = 12 + rng = np.random.RandomState(0) + V = rng.randn(n_components, n_features) # random init + V /= np.sum(V**2, axis=1)[:, np.newaxis] + for algo in ("lasso_lars", "lasso_cd", "lars", "omp", "threshold"): + code = sparse_encode(X, V, algorithm=algo) + assert code.shape == (n_samples, n_components) + + +@pytest.mark.parametrize("algo", ["lasso_lars", "lasso_cd", "threshold"]) +@pytest.mark.parametrize("positive", [False, True]) +def test_sparse_encode_positivity(algo, positive): + n_components = 12 + rng = np.random.RandomState(0) + V = rng.randn(n_components, n_features) # random init + V /= np.sum(V**2, axis=1)[:, np.newaxis] + code = sparse_encode(X, V, algorithm=algo, positive=positive) + if positive: + assert (code >= 0).all() + else: + assert (code < 0).any() + + +@pytest.mark.parametrize("algo", ["lars", "omp"]) +def test_sparse_encode_unavailable_positivity(algo): + n_components = 12 + rng = np.random.RandomState(0) + V = rng.randn(n_components, n_features) # random init + V /= np.sum(V**2, axis=1)[:, np.newaxis] + err_msg = "Positive constraint not supported for '{}' coding method." 
+ err_msg = err_msg.format(algo) + with pytest.raises(ValueError, match=err_msg): + sparse_encode(X, V, algorithm=algo, positive=True) + + +def test_sparse_encode_input(): + n_components = 100 + rng = np.random.RandomState(0) + V = rng.randn(n_components, n_features) # random init + V /= np.sum(V**2, axis=1)[:, np.newaxis] + Xf = check_array(X, order="F") + for algo in ("lasso_lars", "lasso_cd", "lars", "omp", "threshold"): + a = sparse_encode(X, V, algorithm=algo) + b = sparse_encode(Xf, V, algorithm=algo) + assert_array_almost_equal(a, b) + + +def test_sparse_encode_error(): + n_components = 12 + rng = np.random.RandomState(0) + V = rng.randn(n_components, n_features) # random init + V /= np.sum(V**2, axis=1)[:, np.newaxis] + code = sparse_encode(X, V, alpha=0.001) + assert not np.all(code == 0) + assert np.sqrt(np.sum((np.dot(code, V) - X) ** 2)) < 0.1 + + +def test_sparse_encode_error_default_sparsity(): + rng = np.random.RandomState(0) + X = rng.randn(100, 64) + D = rng.randn(2, 64) + code = ignore_warnings(sparse_encode)(X, D, algorithm="omp", n_nonzero_coefs=None) + assert code.shape == (100, 2) + + +def test_sparse_coder_estimator(): + n_components = 12 + rng = np.random.RandomState(0) + V = rng.randn(n_components, n_features) # random init + V /= np.sum(V**2, axis=1)[:, np.newaxis] + coder = SparseCoder( + dictionary=V, transform_algorithm="lasso_lars", transform_alpha=0.001 + ) + code = coder.fit_transform(X) + Xr = coder.inverse_transform(code) + assert not np.all(code == 0) + assert np.sqrt(np.sum((np.dot(code, V) - X) ** 2)) < 0.1 + np.testing.assert_allclose(Xr, np.dot(code, V)) + + +def test_sparse_coder_estimator_clone(): + n_components = 12 + rng = np.random.RandomState(0) + V = rng.randn(n_components, n_features) # random init + V /= np.sum(V**2, axis=1)[:, np.newaxis] + coder = SparseCoder( + dictionary=V, transform_algorithm="lasso_lars", transform_alpha=0.001 + ) + cloned = clone(coder) + assert id(cloned) != id(coder) + np.testing.assert_allclose(cloned.dictionary, coder.dictionary) + assert id(cloned.dictionary) != id(coder.dictionary) + assert cloned.n_components_ == coder.n_components_ + assert cloned.n_features_in_ == coder.n_features_in_ + data = np.random.rand(n_samples, n_features).astype(np.float32) + np.testing.assert_allclose(cloned.transform(data), coder.transform(data)) + + +def test_sparse_coder_parallel_mmap(): + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/5956 + # Test that SparseCoder does not error by passing reading only + # arrays to child processes + + rng = np.random.RandomState(777) + n_components, n_features = 40, 64 + init_dict = rng.rand(n_components, n_features) + # Ensure that `data` is >2M. Joblib memory maps arrays + # if they are larger than 1MB. 
The 4 accounts for float32 + # data type + n_samples = int(2e6) // (4 * n_features) + data = np.random.rand(n_samples, n_features).astype(np.float32) + + sc = SparseCoder(init_dict, transform_algorithm="omp", n_jobs=2) + sc.fit_transform(data) + + +def test_sparse_coder_common_transformer(): + rng = np.random.RandomState(777) + n_components, n_features = 40, 3 + init_dict = rng.rand(n_components, n_features) + + sc = SparseCoder(init_dict) + + check_transformer_data_not_an_array(sc.__class__.__name__, sc) + check_transformer_general(sc.__class__.__name__, sc) + check_transformer_general_memmap = partial( + check_transformer_general, readonly_memmap=True + ) + check_transformer_general_memmap(sc.__class__.__name__, sc) + check_transformers_unfitted(sc.__class__.__name__, sc) + + +def test_sparse_coder_n_features_in(): + d = np.array([[1, 2, 3], [1, 2, 3]]) + sc = SparseCoder(d) + assert sc.n_features_in_ == d.shape[1] + + +def test_update_dict(): + # Check the dict update in batch mode vs online mode + # Non-regression test for #4866 + rng = np.random.RandomState(0) + + code = np.array([[0.5, -0.5], [0.1, 0.9]]) + dictionary = np.array([[1.0, 0.0], [0.6, 0.8]]) + + X = np.dot(code, dictionary) + rng.randn(2, 2) + + # full batch update + newd_batch = dictionary.copy() + _update_dict(newd_batch, X, code) + + # online update + A = np.dot(code.T, code) + B = np.dot(X.T, code) + newd_online = dictionary.copy() + _update_dict(newd_online, X, code, A, B) + + assert_allclose(newd_batch, newd_online) + + +@pytest.mark.parametrize( + "algorithm", ("lasso_lars", "lasso_cd", "lars", "threshold", "omp") +) +@pytest.mark.parametrize("data_type", (np.float32, np.float64)) +# Note: do not check integer input because `lasso_lars` and `lars` fail with +# `ValueError` in `_lars_path_solver` +def test_sparse_encode_dtype_match(data_type, algorithm): + n_components = 6 + rng = np.random.RandomState(0) + dictionary = rng.randn(n_components, n_features) + code = sparse_encode( + X.astype(data_type), dictionary.astype(data_type), algorithm=algorithm + ) + assert code.dtype == data_type + + +@pytest.mark.parametrize( + "algorithm", ("lasso_lars", "lasso_cd", "lars", "threshold", "omp") +) +def test_sparse_encode_numerical_consistency(algorithm): + # verify numerical consistency among np.float32 and np.float64 + rtol = 1e-4 + n_components = 6 + rng = np.random.RandomState(0) + dictionary = rng.randn(n_components, n_features) + code_32 = sparse_encode( + X.astype(np.float32), dictionary.astype(np.float32), algorithm=algorithm + ) + code_64 = sparse_encode( + X.astype(np.float64), dictionary.astype(np.float64), algorithm=algorithm + ) + assert_allclose(code_32, code_64, rtol=rtol) + + +@pytest.mark.parametrize( + "transform_algorithm", ("lasso_lars", "lasso_cd", "lars", "threshold", "omp") +) +@pytest.mark.parametrize("data_type", (np.float32, np.float64)) +# Note: do not check integer input because `lasso_lars` and `lars` fail with +# `ValueError` in `_lars_path_solver` +def test_sparse_coder_dtype_match(data_type, transform_algorithm): + # Verify preserving dtype for transform in sparse coder + n_components = 6 + rng = np.random.RandomState(0) + dictionary = rng.randn(n_components, n_features) + coder = SparseCoder( + dictionary.astype(data_type), transform_algorithm=transform_algorithm + ) + code = coder.transform(X.astype(data_type)) + assert code.dtype == data_type + + +@pytest.mark.parametrize("fit_algorithm", ("lars", "cd")) +@pytest.mark.parametrize( + "transform_algorithm", ("lasso_lars", "lasso_cd", "lars", 
"threshold", "omp") +) +@pytest.mark.parametrize( + "data_type, expected_type", + ( + (np.float32, np.float32), + (np.float64, np.float64), + (np.int32, np.float64), + (np.int64, np.float64), + ), +) +def test_dictionary_learning_dtype_match( + data_type, + expected_type, + fit_algorithm, + transform_algorithm, +): + # Verify preserving dtype for fit and transform in dictionary learning class + dict_learner = DictionaryLearning( + n_components=8, + fit_algorithm=fit_algorithm, + transform_algorithm=transform_algorithm, + random_state=0, + ) + dict_learner.fit(X.astype(data_type)) + assert dict_learner.components_.dtype == expected_type + assert dict_learner.transform(X.astype(data_type)).dtype == expected_type + + +@pytest.mark.parametrize("fit_algorithm", ("lars", "cd")) +@pytest.mark.parametrize( + "transform_algorithm", ("lasso_lars", "lasso_cd", "lars", "threshold", "omp") +) +@pytest.mark.parametrize( + "data_type, expected_type", + ( + (np.float32, np.float32), + (np.float64, np.float64), + (np.int32, np.float64), + (np.int64, np.float64), + ), +) +def test_minibatch_dictionary_learning_dtype_match( + data_type, + expected_type, + fit_algorithm, + transform_algorithm, +): + # Verify preserving dtype for fit and transform in minibatch dictionary learning + dict_learner = MiniBatchDictionaryLearning( + n_components=8, + batch_size=10, + fit_algorithm=fit_algorithm, + transform_algorithm=transform_algorithm, + max_iter=100, + tol=1e-1, + random_state=0, + ) + dict_learner.fit(X.astype(data_type)) + + assert dict_learner.components_.dtype == expected_type + assert dict_learner.transform(X.astype(data_type)).dtype == expected_type + assert dict_learner._A.dtype == expected_type + assert dict_learner._B.dtype == expected_type + + +@pytest.mark.parametrize("method", ("lars", "cd")) +@pytest.mark.parametrize( + "data_type, expected_type", + ( + (np.float32, np.float32), + (np.float64, np.float64), + (np.int32, np.float64), + (np.int64, np.float64), + ), +) +def test_dict_learning_dtype_match(data_type, expected_type, method): + # Verify output matrix dtype + rng = np.random.RandomState(0) + n_components = 8 + code, dictionary, _ = dict_learning( + X.astype(data_type), + n_components=n_components, + alpha=1, + random_state=rng, + method=method, + ) + assert code.dtype == expected_type + assert dictionary.dtype == expected_type + + +@pytest.mark.parametrize("method", ("lars", "cd")) +def test_dict_learning_numerical_consistency(method): + # verify numerically consistent among np.float32 and np.float64 + rtol = 1e-6 + n_components = 4 + alpha = 2 + + U_64, V_64, _ = dict_learning( + X.astype(np.float64), + n_components=n_components, + alpha=alpha, + random_state=0, + method=method, + ) + U_32, V_32, _ = dict_learning( + X.astype(np.float32), + n_components=n_components, + alpha=alpha, + random_state=0, + method=method, + ) + + # Optimal solution (U*, V*) is not unique. + # If (U*, V*) is optimal solution, (-U*,-V*) is also optimal, + # and (column permutated U*, row permutated V*) are also optional + # as long as holding UV. + # So here UV, ||U||_1,1 and sum(||V_k||_2^2) are verified + # instead of comparing directly U and V. 
+ assert_allclose(np.matmul(U_64, V_64), np.matmul(U_32, V_32), rtol=rtol) + assert_allclose(np.sum(np.abs(U_64)), np.sum(np.abs(U_32)), rtol=rtol) + assert_allclose(np.sum(V_64**2), np.sum(V_32**2), rtol=rtol) + # verify an obtained solution is not degenerate + assert np.mean(U_64 != 0.0) > 0.05 + assert np.count_nonzero(U_64 != 0.0) == np.count_nonzero(U_32 != 0.0) + + +@pytest.mark.parametrize("method", ("lars", "cd")) +@pytest.mark.parametrize( + "data_type, expected_type", + ( + (np.float32, np.float32), + (np.float64, np.float64), + (np.int32, np.float64), + (np.int64, np.float64), + ), +) +def test_dict_learning_online_dtype_match(data_type, expected_type, method): + # Verify output matrix dtype + rng = np.random.RandomState(0) + n_components = 8 + code, dictionary = dict_learning_online( + X.astype(data_type), + n_components=n_components, + alpha=1, + batch_size=10, + random_state=rng, + method=method, + ) + assert code.dtype == expected_type + assert dictionary.dtype == expected_type + + +@pytest.mark.parametrize("method", ("lars", "cd")) +def test_dict_learning_online_numerical_consistency(method): + # verify numerically consistent among np.float32 and np.float64 + rtol = 1e-4 + n_components = 4 + alpha = 1 + + U_64, V_64 = dict_learning_online( + X.astype(np.float64), + n_components=n_components, + max_iter=1_000, + alpha=alpha, + batch_size=10, + random_state=0, + method=method, + tol=0.0, + max_no_improvement=None, + ) + U_32, V_32 = dict_learning_online( + X.astype(np.float32), + n_components=n_components, + max_iter=1_000, + alpha=alpha, + batch_size=10, + random_state=0, + method=method, + tol=0.0, + max_no_improvement=None, + ) + + # Optimal solution (U*, V*) is not unique. + # If (U*, V*) is optimal solution, (-U*,-V*) is also optimal, + # and (column permutated U*, row permutated V*) are also optional + # as long as holding UV. + # So here UV, ||U||_1,1 and sum(||V_k||_2) are verified + # instead of comparing directly U and V. + assert_allclose(np.matmul(U_64, V_64), np.matmul(U_32, V_32), rtol=rtol) + assert_allclose(np.sum(np.abs(U_64)), np.sum(np.abs(U_32)), rtol=rtol) + assert_allclose(np.sum(V_64**2), np.sum(V_32**2), rtol=rtol) + # verify an obtained solution is not degenerate + assert np.mean(U_64 != 0.0) > 0.05 + assert np.count_nonzero(U_64 != 0.0) == np.count_nonzero(U_32 != 0.0) + + +@pytest.mark.parametrize( + "estimator", + [ + SparseCoder(X.T), + DictionaryLearning(), + MiniBatchDictionaryLearning(batch_size=4, max_iter=10), + ], + ids=lambda x: x.__class__.__name__, +) +def test_get_feature_names_out(estimator): + """Check feature names for dict learning estimators.""" + estimator.fit(X) + n_components = X.shape[1] + + feature_names_out = estimator.get_feature_names_out() + estimator_name = estimator.__class__.__name__.lower() + assert_array_equal( + feature_names_out, + [f"{estimator_name}{i}" for i in range(n_components)], + ) + + +def test_cd_work_on_joblib_memmapped_data(monkeypatch): + monkeypatch.setattr( + sklearn.decomposition._dict_learning, + "Parallel", + partial(Parallel, max_nbytes=100), + ) + + rng = np.random.RandomState(0) + X_train = rng.randn(10, 10) + + dict_learner = DictionaryLearning( + n_components=5, + random_state=0, + n_jobs=2, + fit_algorithm="cd", + max_iter=50, + verbose=True, + ) + + # This must run and complete without error. 
+ dict_learner.fit(X_train) diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_factor_analysis.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_factor_analysis.py new file mode 100644 index 0000000000000000000000000000000000000000..9175829695b0d8695cdf294419869d8aaf066489 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_factor_analysis.py @@ -0,0 +1,109 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from itertools import combinations + +import numpy as np +import pytest + +from sklearn.decomposition import FactorAnalysis +from sklearn.decomposition._factor_analysis import _ortho_rotation +from sklearn.exceptions import ConvergenceWarning +from sklearn.utils._testing import assert_almost_equal, assert_array_almost_equal + + +def test_factor_analysis(global_random_seed): + # Test FactorAnalysis ability to recover the data covariance structure + rng = np.random.RandomState(global_random_seed) + n_samples, n_features, n_components = 20, 5, 3 + + # Some random settings for the generative model + W = rng.randn(n_components, n_features) + # latent variable of dim 3, 20 of it + h = rng.randn(n_samples, n_components) + # using gamma to model different noise variance + # per component + noise = rng.gamma(1, size=n_features) * rng.randn(n_samples, n_features) + + # generate observations + # wlog, mean is 0 + X = np.dot(h, W) + noise + + fas = [] + for method in ["randomized", "lapack"]: + fa = FactorAnalysis(n_components=n_components, svd_method=method) + fa.fit(X) + fas.append(fa) + + X_t = fa.transform(X) + assert X_t.shape == (n_samples, n_components) + + assert_almost_equal(fa.loglike_[-1], fa.score_samples(X).sum()) + assert_almost_equal(fa.score_samples(X).mean(), fa.score(X)) + + diff = np.all(np.diff(fa.loglike_)) + assert diff > 0.0, "Log likelihood dif not increase" + + # Sample Covariance + scov = np.cov(X, rowvar=0.0, bias=1.0) + + # Model Covariance + mcov = fa.get_covariance() + diff = np.sum(np.abs(scov - mcov)) / W.size + assert diff < 0.2, "Mean absolute difference is %f" % diff + fa = FactorAnalysis( + n_components=n_components, noise_variance_init=np.ones(n_features) + ) + with pytest.raises(ValueError): + fa.fit(X[:, :2]) + + def f(x, y): + return np.abs(getattr(x, y)) # sign will not be equal + + fa1, fa2 = fas + for attr in ["loglike_", "components_", "noise_variance_"]: + assert_almost_equal(f(fa1, attr), f(fa2, attr)) + + fa1.max_iter = 1 + fa1.verbose = True + with pytest.warns(ConvergenceWarning): + fa1.fit(X) + + # Test get_covariance and get_precision with n_components == n_features + # with n_components < n_features and with n_components == 0 + for n_components in [0, 2, X.shape[1]]: + fa.n_components = n_components + fa.fit(X) + cov = fa.get_covariance() + precision = fa.get_precision() + assert_array_almost_equal(np.dot(cov, precision), np.eye(X.shape[1]), 12) + + # test rotation + n_components = 2 + + results, projections = {}, {} + for method in (None, "varimax", "quartimax"): + fa_var = FactorAnalysis(n_components=n_components, rotation=method) + results[method] = fa_var.fit_transform(X) + projections[method] = fa_var.get_covariance() + for rot1, rot2 in combinations([None, "varimax", "quartimax"], 2): + assert not np.allclose(results[rot1], results[rot2]) + assert np.allclose(projections[rot1], projections[rot2], atol=3) + + # test against R's psych::principal with rotate="varimax" + # (i.e., the values below stem from rotating the 
components in R) + # R's factor analysis returns quite different values; therefore, we only + # test the rotation itself + factors = np.array( + [ + [0.89421016, -0.35854928, -0.27770122, 0.03773647], + [-0.45081822, -0.89132754, 0.0932195, -0.01787973], + [0.99500666, -0.02031465, 0.05426497, -0.11539407], + [0.96822861, -0.06299656, 0.24411001, 0.07540887], + ] + ) + r_solution = np.array( + [[0.962, 0.052], [-0.141, 0.989], [0.949, -0.300], [0.937, -0.251]] + ) + rotated = _ortho_rotation(factors[:, :n_components], method="varimax").T + assert_array_almost_equal(np.abs(rotated), np.abs(r_solution), decimal=3) diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_fastica.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_fastica.py new file mode 100644 index 0000000000000000000000000000000000000000..6f8c9c55db621a36b6153aeef28a830ac7675a25 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_fastica.py @@ -0,0 +1,457 @@ +""" +Test the fastica algorithm. +""" + +import itertools +import os +import warnings + +import numpy as np +import pytest +from scipy import stats + +from sklearn.decomposition import PCA, FastICA, fastica +from sklearn.decomposition._fastica import _gs_decorrelation +from sklearn.exceptions import ConvergenceWarning +from sklearn.utils._testing import assert_allclose, ignore_warnings + + +def center_and_norm(x, axis=-1): + """Centers and norms x **in place** + + Parameters + ----------- + x: ndarray + Array with an axis of observations (statistical units) measured on + random variables. + axis: int, optional + Axis along which the mean and variance are calculated. + """ + x = np.rollaxis(x, axis) + x -= x.mean(axis=0) + x /= x.std(axis=0) + + +def test_gs(global_random_seed): + # Test gram schmidt orthonormalization + # generate a random orthogonal matrix + rng = np.random.RandomState(global_random_seed) + W, _, _ = np.linalg.svd(rng.randn(10, 10)) + w = rng.randn(10) + _gs_decorrelation(w, W, 10) + assert (w**2).sum() < 1.0e-10 + w = rng.randn(10) + u = _gs_decorrelation(w, W, 5) + tmp = np.dot(u, W.T) + assert (tmp[:5] ** 2).sum() < 1.0e-10 + + +def test_fastica_attributes_dtypes(global_dtype): + rng = np.random.RandomState(0) + X = rng.random_sample((100, 10)).astype(global_dtype, copy=False) + fica = FastICA( + n_components=5, max_iter=1000, whiten="unit-variance", random_state=0 + ).fit(X) + assert fica.components_.dtype == global_dtype + assert fica.mixing_.dtype == global_dtype + assert fica.mean_.dtype == global_dtype + assert fica.whitening_.dtype == global_dtype + + +def test_fastica_return_dtypes(global_dtype): + rng = np.random.RandomState(0) + X = rng.random_sample((100, 10)).astype(global_dtype, copy=False) + k_, mixing_, s_ = fastica( + X, max_iter=1000, whiten="unit-variance", random_state=rng + ) + assert k_.dtype == global_dtype + assert mixing_.dtype == global_dtype + assert s_.dtype == global_dtype + + +@pytest.mark.parametrize("add_noise", [True, False]) +def test_fastica_simple(add_noise, global_random_seed, global_dtype): + if ( + global_random_seed == 20 + and global_dtype == np.float32 + and not add_noise + and os.getenv("DISTRIB") == "ubuntu" + ): + pytest.xfail( + "FastICA instability with Ubuntu Atlas build with float32 " + "global_dtype. For more details, see " + "https://github.com/scikit-learn/scikit-learn/issues/24131#issuecomment-1208091119" + ) + + # Test the FastICA algorithm on very simple data. 
+ rng = np.random.RandomState(global_random_seed) + n_samples = 1000 + # Generate two sources: + s1 = (2 * np.sin(np.linspace(0, 100, n_samples)) > 0) - 1 + s2 = stats.t.rvs(1, size=n_samples, random_state=global_random_seed) + s = np.c_[s1, s2].T + center_and_norm(s) + s = s.astype(global_dtype) + s1, s2 = s + + # Mixing angle + phi = 0.6 + mixing = np.array([[np.cos(phi), np.sin(phi)], [np.sin(phi), -np.cos(phi)]]) + mixing = mixing.astype(global_dtype) + m = np.dot(mixing, s) + + if add_noise: + m += 0.1 * rng.randn(2, 1000) + + center_and_norm(m) + + # function as fun arg + def g_test(x): + return x**3, (3 * x**2).mean(axis=-1) + + algos = ["parallel", "deflation"] + nls = ["logcosh", "exp", "cube", g_test] + whitening = ["arbitrary-variance", "unit-variance", False] + for algo, nl, whiten in itertools.product(algos, nls, whitening): + if whiten: + k_, mixing_, s_ = fastica( + m.T, fun=nl, whiten=whiten, algorithm=algo, random_state=rng + ) + with pytest.raises(ValueError): + fastica(m.T, fun=np.tanh, whiten=whiten, algorithm=algo) + else: + pca = PCA(n_components=2, whiten=True, random_state=rng) + X = pca.fit_transform(m.T) + k_, mixing_, s_ = fastica( + X, fun=nl, algorithm=algo, whiten=False, random_state=rng + ) + with pytest.raises(ValueError): + fastica(X, fun=np.tanh, algorithm=algo) + s_ = s_.T + # Check that the mixing model described in the docstring holds: + if whiten: + # XXX: exact reconstruction to standard relative tolerance is not + # possible. This is probably expected when add_noise is True but we + # also need a non-trivial atol in float32 when add_noise is False. + # + # Note that the 2 sources are non-Gaussian in this test. + atol = 1e-5 if global_dtype == np.float32 else 0 + assert_allclose(np.dot(np.dot(mixing_, k_), m), s_, atol=atol) + + center_and_norm(s_) + s1_, s2_ = s_ + # Check to see if the sources have been estimated + # in the wrong order + if abs(np.dot(s1_, s2)) > abs(np.dot(s1_, s1)): + s2_, s1_ = s_ + s1_ *= np.sign(np.dot(s1_, s1)) + s2_ *= np.sign(np.dot(s2_, s2)) + + # Check that we have estimated the original sources + if not add_noise: + assert_allclose(np.dot(s1_, s1) / n_samples, 1, atol=1e-2) + assert_allclose(np.dot(s2_, s2) / n_samples, 1, atol=1e-2) + else: + assert_allclose(np.dot(s1_, s1) / n_samples, 1, atol=1e-1) + assert_allclose(np.dot(s2_, s2) / n_samples, 1, atol=1e-1) + + # Test FastICA class + _, _, sources_fun = fastica( + m.T, fun=nl, algorithm=algo, random_state=global_random_seed + ) + ica = FastICA(fun=nl, algorithm=algo, random_state=global_random_seed) + sources = ica.fit_transform(m.T) + assert ica.components_.shape == (2, 2) + assert sources.shape == (1000, 2) + + assert_allclose(sources_fun, sources) + # Set atol to account for the different magnitudes of the elements in sources + # (from 1e-4 to 1e1). + atol = np.max(np.abs(sources)) * (1e-5 if global_dtype == np.float32 else 1e-7) + assert_allclose(sources, ica.transform(m.T), atol=atol) + + assert ica.mixing_.shape == (2, 2) + + ica = FastICA(fun=np.tanh, algorithm=algo) + with pytest.raises(ValueError): + ica.fit(m.T) + + +def test_fastica_nowhiten(): + m = [[0, 1], [1, 0]] + + # test for issue #697 + ica = FastICA(n_components=1, whiten=False, random_state=0) + warn_msg = "Ignoring n_components with whiten=False." + with pytest.warns(UserWarning, match=warn_msg): + ica.fit(m) + assert hasattr(ica, "mixing_") + + +def test_fastica_convergence_fail(global_random_seed): + # Test the FastICA algorithm on very simple data + # (see test_non_square_fastica). 
+ # Ensure a ConvergenceWarning raised if the tolerance is sufficiently low. + rng = np.random.RandomState(global_random_seed) + + n_samples = 1000 + # Generate two sources: + t = np.linspace(0, 100, n_samples) + s1 = np.sin(t) + s2 = np.ceil(np.sin(np.pi * t)) + s = np.c_[s1, s2].T + center_and_norm(s) + + # Mixing matrix + mixing = rng.randn(6, 2) + m = np.dot(mixing, s) + + # Do fastICA with tolerance 0. to ensure failing convergence + warn_msg = ( + "FastICA did not converge. Consider increasing tolerance " + "or the maximum number of iterations." + ) + with pytest.warns(ConvergenceWarning, match=warn_msg): + ica = FastICA( + algorithm="parallel", n_components=2, random_state=rng, max_iter=2, tol=0.0 + ) + ica.fit(m.T) + + +@pytest.mark.parametrize("add_noise", [True, False]) +def test_non_square_fastica(global_random_seed, add_noise): + # Test the FastICA algorithm on very simple data. + rng = np.random.RandomState(global_random_seed) + + n_samples = 1000 + # Generate two sources: + t = np.linspace(0, 100, n_samples) + s1 = np.sin(t) + s2 = np.ceil(np.sin(np.pi * t)) + s = np.c_[s1, s2].T + center_and_norm(s) + s1, s2 = s + + # Mixing matrix + mixing = rng.randn(6, 2) + m = np.dot(mixing, s) + + if add_noise: + m += 0.1 * rng.randn(6, n_samples) + + center_and_norm(m) + + k_, mixing_, s_ = fastica( + m.T, n_components=2, whiten="unit-variance", random_state=rng + ) + s_ = s_.T + + # Check that the mixing model described in the docstring holds: + assert_allclose(s_, np.dot(np.dot(mixing_, k_), m)) + + center_and_norm(s_) + s1_, s2_ = s_ + # Check to see if the sources have been estimated + # in the wrong order + if abs(np.dot(s1_, s2)) > abs(np.dot(s1_, s1)): + s2_, s1_ = s_ + s1_ *= np.sign(np.dot(s1_, s1)) + s2_ *= np.sign(np.dot(s2_, s2)) + + # Check that we have estimated the original sources + if not add_noise: + assert_allclose(np.dot(s1_, s1) / n_samples, 1, atol=1e-3) + assert_allclose(np.dot(s2_, s2) / n_samples, 1, atol=1e-3) + + +def test_fit_transform(global_random_seed, global_dtype): + """Test unit variance of transformed data using FastICA algorithm. + + Check that `fit_transform` gives the same result as applying + `fit` and then `transform`. + + Bug #13056 + """ + # multivariate uniform data in [0, 1] + rng = np.random.RandomState(global_random_seed) + X = rng.random_sample((100, 10)).astype(global_dtype) + max_iter = 300 + for whiten, n_components in [["unit-variance", 5], [False, None]]: + n_components_ = n_components if n_components is not None else X.shape[1] + + ica = FastICA( + n_components=n_components, max_iter=max_iter, whiten=whiten, random_state=0 + ) + with warnings.catch_warnings(): + # make sure that numerical errors do not cause sqrt of negative + # values + warnings.simplefilter("error", RuntimeWarning) + # XXX: for some seeds, the model does not converge. + # However this is not what we test here. 
+ warnings.simplefilter("ignore", ConvergenceWarning) + Xt = ica.fit_transform(X) + assert ica.components_.shape == (n_components_, 10) + assert Xt.shape == (X.shape[0], n_components_) + + ica2 = FastICA( + n_components=n_components, max_iter=max_iter, whiten=whiten, random_state=0 + ) + with warnings.catch_warnings(): + # make sure that numerical errors do not cause sqrt of negative + # values + warnings.simplefilter("error", RuntimeWarning) + warnings.simplefilter("ignore", ConvergenceWarning) + ica2.fit(X) + assert ica2.components_.shape == (n_components_, 10) + Xt2 = ica2.transform(X) + + # XXX: we have to set atol for this test to pass for all seeds when + # fitting with float32 data. Is this revealing a bug? + if global_dtype: + atol = np.abs(Xt2).mean() / 1e6 + else: + atol = 0.0 # the default rtol is enough for float64 data + assert_allclose(Xt, Xt2, atol=atol) + + +@pytest.mark.filterwarnings("ignore:Ignoring n_components with whiten=False.") +@pytest.mark.parametrize( + "whiten, n_components, expected_mixing_shape", + [ + ("arbitrary-variance", 5, (10, 5)), + ("arbitrary-variance", 10, (10, 10)), + ("unit-variance", 5, (10, 5)), + ("unit-variance", 10, (10, 10)), + (False, 5, (10, 10)), + (False, 10, (10, 10)), + ], +) +def test_inverse_transform( + whiten, n_components, expected_mixing_shape, global_random_seed, global_dtype +): + # Test FastICA.inverse_transform + n_samples = 100 + rng = np.random.RandomState(global_random_seed) + X = rng.random_sample((n_samples, 10)).astype(global_dtype) + + ica = FastICA(n_components=n_components, random_state=rng, whiten=whiten) + with warnings.catch_warnings(): + # For some dataset (depending on the value of global_dtype) the model + # can fail to converge but this should not impact the definition of + # a valid inverse transform. + warnings.simplefilter("ignore", ConvergenceWarning) + Xt = ica.fit_transform(X) + assert ica.mixing_.shape == expected_mixing_shape + X2 = ica.inverse_transform(Xt) + assert X.shape == X2.shape + + # reversibility test in non-reduction case + if n_components == X.shape[1]: + # XXX: we have to set atol for this test to pass for all seeds when + # fitting with float32 data. Is this revealing a bug? + if global_dtype: + # XXX: dividing by a smaller number makes + # tests fail for some seeds. + atol = np.abs(X2).mean() / 1e5 + else: + atol = 0.0 # the default rtol is enough for float64 data + assert_allclose(X, X2, atol=atol) + + +def test_fastica_errors(): + n_features = 3 + n_samples = 10 + rng = np.random.RandomState(0) + X = rng.random_sample((n_samples, n_features)) + w_init = rng.randn(n_features + 1, n_features + 1) + with pytest.raises(ValueError, match=r"alpha must be in \[1,2\]"): + fastica(X, fun_args={"alpha": 0}) + with pytest.raises( + ValueError, match=r"w_init has invalid shape.+should be \(3L?, 3L?\)" + ): + fastica(X, w_init=w_init) + + +def test_fastica_whiten_unit_variance(global_random_seed): + """Test unit variance of transformed data using FastICA algorithm. 
+ + Bug #13056 + """ + rng = np.random.RandomState(global_random_seed) + X = rng.random_sample((100, 10)) + n_components = X.shape[1] + ica = FastICA(n_components=n_components, whiten="unit-variance", random_state=0) + Xt = ica.fit_transform(X) + + assert np.var(Xt) == pytest.approx(1.0) + + +@pytest.mark.parametrize("whiten", ["arbitrary-variance", "unit-variance", False]) +@pytest.mark.parametrize("return_X_mean", [True, False]) +@pytest.mark.parametrize("return_n_iter", [True, False]) +def test_fastica_output_shape(whiten, return_X_mean, return_n_iter): + n_features = 3 + n_samples = 10 + rng = np.random.RandomState(0) + X = rng.random_sample((n_samples, n_features)) + + expected_len = 3 + return_X_mean + return_n_iter + + out = fastica( + X, whiten=whiten, return_n_iter=return_n_iter, return_X_mean=return_X_mean + ) + + assert len(out) == expected_len + if not whiten: + assert out[0] is None + + +@pytest.mark.parametrize("add_noise", [True, False]) +def test_fastica_simple_different_solvers(add_noise, global_random_seed): + """Test FastICA is consistent between whiten_solvers.""" + rng = np.random.RandomState(global_random_seed) + n_samples = 1000 + # Generate two sources: + s1 = (2 * np.sin(np.linspace(0, 100, n_samples)) > 0) - 1 + s2 = stats.t.rvs(1, size=n_samples, random_state=rng) + s = np.c_[s1, s2].T + center_and_norm(s) + s1, s2 = s + + # Mixing angle + phi = rng.rand() * 2 * np.pi + mixing = np.array([[np.cos(phi), np.sin(phi)], [np.sin(phi), -np.cos(phi)]]) + m = np.dot(mixing, s) + + if add_noise: + m += 0.1 * rng.randn(2, 1000) + + center_and_norm(m) + + outs = {} + for solver in ("svd", "eigh"): + ica = FastICA(random_state=0, whiten="unit-variance", whiten_solver=solver) + sources = ica.fit_transform(m.T) + outs[solver] = sources + assert ica.components_.shape == (2, 2) + assert sources.shape == (1000, 2) + + # compared numbers are not all on the same magnitude. Using a small atol to + # make the test less brittle + assert_allclose(outs["eigh"], outs["svd"], atol=1e-12) + + +def test_fastica_eigh_low_rank_warning(global_random_seed): + """Test FastICA eigh solver raises warning for low-rank data.""" + rng = np.random.RandomState(global_random_seed) + A = rng.randn(10, 2) + X = A @ A.T + ica = FastICA(random_state=0, whiten="unit-variance", whiten_solver="eigh") + msg = "There are some small singular values" + + with pytest.warns(UserWarning, match=msg): + with ignore_warnings(category=ConvergenceWarning): + # The FastICA solver may not converge for some data with specific + # random seeds but this happens after the whiten step so this is + # not want we want to test here. 
+ ica.fit(X) diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_incremental_pca.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_incremental_pca.py new file mode 100644 index 0000000000000000000000000000000000000000..c4ea1c222901c0159fdf90c9675cdaad4da60450 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_incremental_pca.py @@ -0,0 +1,487 @@ +"""Tests for Incremental PCA.""" + +import itertools +import warnings + +import numpy as np +import pytest +from numpy.testing import assert_allclose, assert_array_equal + +from sklearn import datasets +from sklearn.decomposition import PCA, IncrementalPCA +from sklearn.utils._testing import ( + assert_allclose_dense_sparse, + assert_almost_equal, + assert_array_almost_equal, +) +from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS, LIL_CONTAINERS + +iris = datasets.load_iris() + + +def test_incremental_pca(): + # Incremental PCA on dense arrays. + X = iris.data + batch_size = X.shape[0] // 3 + ipca = IncrementalPCA(n_components=2, batch_size=batch_size) + pca = PCA(n_components=2) + pca.fit_transform(X) + + X_transformed = ipca.fit_transform(X) + + assert X_transformed.shape == (X.shape[0], 2) + np.testing.assert_allclose( + ipca.explained_variance_ratio_.sum(), + pca.explained_variance_ratio_.sum(), + rtol=1e-3, + ) + + for n_components in [1, 2, X.shape[1]]: + ipca = IncrementalPCA(n_components, batch_size=batch_size) + ipca.fit(X) + cov = ipca.get_covariance() + precision = ipca.get_precision() + np.testing.assert_allclose( + np.dot(cov, precision), np.eye(X.shape[1]), atol=1e-13 + ) + + +@pytest.mark.parametrize( + "sparse_container", CSC_CONTAINERS + CSR_CONTAINERS + LIL_CONTAINERS +) +def test_incremental_pca_sparse(sparse_container): + # Incremental PCA on sparse arrays. + X = iris.data + pca = PCA(n_components=2) + pca.fit_transform(X) + X_sparse = sparse_container(X) + batch_size = X_sparse.shape[0] // 3 + ipca = IncrementalPCA(n_components=2, batch_size=batch_size) + + X_transformed = ipca.fit_transform(X_sparse) + + assert X_transformed.shape == (X_sparse.shape[0], 2) + np.testing.assert_allclose( + ipca.explained_variance_ratio_.sum(), + pca.explained_variance_ratio_.sum(), + rtol=1e-3, + ) + + for n_components in [1, 2, X.shape[1]]: + ipca = IncrementalPCA(n_components, batch_size=batch_size) + ipca.fit(X_sparse) + cov = ipca.get_covariance() + precision = ipca.get_precision() + np.testing.assert_allclose( + np.dot(cov, precision), np.eye(X_sparse.shape[1]), atol=1e-13 + ) + + with pytest.raises( + TypeError, + match=( + "IncrementalPCA.partial_fit does not support " + "sparse input. Either convert data to dense " + "or use IncrementalPCA.fit to do so in batches." + ), + ): + ipca.partial_fit(X_sparse) + + +def test_incremental_pca_check_projection(global_random_seed): + # Test that the projection of data is correct. 
+ rng = np.random.RandomState(global_random_seed) + n, p = 100, 3 + X = rng.randn(n, p) * 0.1 + X[:10] += np.array([3, 4, 5]) + Xt = 0.1 * rng.randn(1, p) + np.array([3, 4, 5]) + + # Get the reconstruction of the generated data X + # Note that Xt has the same "components" as X, just separated + # This is what we want to ensure is recreated correctly + Yt = IncrementalPCA(n_components=2).fit(X).transform(Xt) + + # Normalize + Yt /= np.sqrt((Yt**2).sum()) + + # Make sure that the first element of Yt is ~1, this means + # the reconstruction worked as expected + assert_almost_equal(np.abs(Yt[0][0]), 1.0, 1) + + +def test_incremental_pca_inverse(global_random_seed): + # Test that the projection of data can be inverted. + rng = np.random.RandomState(global_random_seed) + n, p = 50, 3 + X = rng.randn(n, p) # spherical data + X[:, 1] *= 0.00001 # make middle component relatively small + X += [5, 4, 3] # make a large mean + + # same check that we can find the original data from the transformed + # signal (since the data is almost of rank n_components) + ipca = IncrementalPCA(n_components=2, batch_size=10).fit(X) + Y = ipca.transform(X) + Y_inverse = ipca.inverse_transform(Y) + assert_almost_equal(X, Y_inverse, decimal=3) + + +def test_incremental_pca_validation(): + # Test that n_components is <= n_features. + X = np.array([[0, 1, 0], [1, 0, 0]]) + n_samples, n_features = X.shape + n_components = 4 + with pytest.raises( + ValueError, + match=( + "n_components={} invalid" + " for n_features={}, need more rows than" + " columns for IncrementalPCA" + " processing".format(n_components, n_features) + ), + ): + IncrementalPCA(n_components, batch_size=10).fit(X) + + # Test that n_components is also <= n_samples in first call to partial fit. + n_components = 3 + with pytest.raises( + ValueError, + match=( + f"n_components={n_components} must be less or equal to the batch " + f"number of samples {n_samples} for the first partial_fit call." + ), + ): + IncrementalPCA(n_components=n_components).partial_fit(X) + + +def test_n_samples_equal_n_components(): + # Ensures no warning is raised when n_samples==n_components + # Non-regression test for gh-19050 + ipca = IncrementalPCA(n_components=5) + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + ipca.partial_fit(np.random.randn(5, 7)) + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + ipca.fit(np.random.randn(5, 7)) + + +def test_n_components_none(): + # Ensures that n_components == None is handled correctly + rng = np.random.RandomState(1999) + for n_samples, n_features in [(50, 10), (10, 50)]: + X = rng.rand(n_samples, n_features) + ipca = IncrementalPCA(n_components=None) + + # First partial_fit call, ipca.n_components_ is inferred from + # min(X.shape) + ipca.partial_fit(X) + assert ipca.n_components_ == min(X.shape) + + # Second partial_fit call, ipca.n_components_ is inferred from + # ipca.components_ computed from the first partial_fit call + ipca.partial_fit(X) + assert ipca.n_components_ == ipca.components_.shape[0] + + +def test_incremental_pca_set_params(): + # Test that components_ sign is stable over batch sizes. 
+ rng = np.random.RandomState(1999) + n_samples = 100 + n_features = 20 + X = rng.randn(n_samples, n_features) + X2 = rng.randn(n_samples, n_features) + X3 = rng.randn(n_samples, n_features) + ipca = IncrementalPCA(n_components=20) + ipca.fit(X) + # Decreasing number of components + ipca.set_params(n_components=10) + with pytest.raises(ValueError): + ipca.partial_fit(X2) + # Increasing number of components + ipca.set_params(n_components=15) + with pytest.raises(ValueError): + ipca.partial_fit(X3) + # Returning to original setting + ipca.set_params(n_components=20) + ipca.partial_fit(X) + + +def test_incremental_pca_num_features_change(): + # Test that changing n_components will raise an error. + rng = np.random.RandomState(1999) + n_samples = 100 + X = rng.randn(n_samples, 20) + X2 = rng.randn(n_samples, 50) + ipca = IncrementalPCA(n_components=None) + ipca.fit(X) + with pytest.raises(ValueError): + ipca.partial_fit(X2) + + +def test_incremental_pca_batch_signs(global_random_seed): + # Test that components_ sign is stable over batch sizes. + rng = np.random.RandomState(global_random_seed) + n_samples = 100 + n_features = 3 + X = rng.randn(n_samples, n_features) + all_components = [] + batch_sizes = np.arange(10, 20) + for batch_size in batch_sizes: + ipca = IncrementalPCA(n_components=None, batch_size=batch_size).fit(X) + all_components.append(ipca.components_) + + for i, j in itertools.pairwise(all_components): + assert_almost_equal(np.sign(i), np.sign(j), decimal=6) + + +def test_incremental_pca_partial_fit_small_batch(): + # Test that there is no minimum batch size after the first partial_fit + # Non-regression test + rng = np.random.RandomState(1999) + n, p = 50, 3 + X = rng.randn(n, p) # spherical data + X[:, 1] *= 0.00001 # make middle component relatively small + X += [5, 4, 3] # make a large mean + + n_components = p + pipca = IncrementalPCA(n_components=n_components) + pipca.partial_fit(X[:n_components]) + for idx in range(n_components, n): + pipca.partial_fit(X[idx : idx + 1]) + + pca = PCA(n_components=n_components) + pca.fit(X) + + assert_allclose(pca.components_, pipca.components_, atol=1e-3) + + +def test_incremental_pca_batch_values(global_random_seed): + # Test that components_ values are stable over batch sizes. + rng = np.random.RandomState(global_random_seed) + n_samples = 100 + n_features = 3 + X = rng.randn(n_samples, n_features) + all_components = [] + batch_sizes = np.arange(20, 40, 3) + for batch_size in batch_sizes: + ipca = IncrementalPCA(n_components=None, batch_size=batch_size).fit(X) + all_components.append(ipca.components_) + + for i, j in itertools.pairwise(all_components): + assert_almost_equal(i, j, decimal=1) + + +def test_incremental_pca_batch_rank(): + # Test sample size in each batch is always larger or equal to n_components + rng = np.random.RandomState(1999) + n_samples = 100 + n_features = 20 + X = rng.randn(n_samples, n_features) + all_components = [] + batch_sizes = np.arange(20, 90, 3) + for batch_size in batch_sizes: + ipca = IncrementalPCA(n_components=20, batch_size=batch_size).fit(X) + all_components.append(ipca.components_) + + for components_i, components_j in itertools.pairwise(all_components): + assert_allclose_dense_sparse(components_i, components_j) + + +def test_incremental_pca_partial_fit(global_random_seed): + # Test that fit and partial_fit get equivalent results. 
+ rng = np.random.RandomState(global_random_seed) + n, p = 50, 3 + X = rng.randn(n, p) # spherical data + X[:, 1] *= 0.00001 # make middle component relatively small + X += [5, 4, 3] # make a large mean + + # same check that we can find the original data from the transformed + # signal (since the data is almost of rank n_components) + batch_size = 10 + ipca = IncrementalPCA(n_components=2, batch_size=batch_size).fit(X) + pipca = IncrementalPCA(n_components=2, batch_size=batch_size) + # Add one to make sure endpoint is included + batch_itr = np.arange(0, n + 1, batch_size) + for i, j in itertools.pairwise(batch_itr): + pipca.partial_fit(X[i:j, :]) + assert_almost_equal(ipca.components_, pipca.components_, decimal=3) + + +def test_incremental_pca_against_pca_iris(): + # Test that IncrementalPCA and PCA are approximate (to a sign flip). + X = iris.data + + Y_pca = PCA(n_components=2).fit_transform(X) + Y_ipca = IncrementalPCA(n_components=2, batch_size=25).fit_transform(X) + + assert_almost_equal(np.abs(Y_pca), np.abs(Y_ipca), 1) + + +def test_incremental_pca_against_pca_random_data(global_random_seed): + # Test that IncrementalPCA and PCA are approximate (to a sign flip). + rng = np.random.RandomState(global_random_seed) + n_samples = 100 + n_features = 3 + X = rng.randn(n_samples, n_features) + 5 * rng.rand(1, n_features) + + Y_pca = PCA(n_components=3).fit_transform(X) + Y_ipca = IncrementalPCA(n_components=3, batch_size=25).fit_transform(X) + + assert_almost_equal(np.abs(Y_pca), np.abs(Y_ipca), 1) + + +def test_explained_variances(): + # Test that PCA and IncrementalPCA calculations match + X = datasets.make_low_rank_matrix( + 1000, 100, tail_strength=0.0, effective_rank=10, random_state=1999 + ) + prec = 3 + n_samples, n_features = X.shape + for nc in [None, 99]: + pca = PCA(n_components=nc).fit(X) + ipca = IncrementalPCA(n_components=nc, batch_size=100).fit(X) + assert_almost_equal( + pca.explained_variance_, ipca.explained_variance_, decimal=prec + ) + assert_almost_equal( + pca.explained_variance_ratio_, ipca.explained_variance_ratio_, decimal=prec + ) + assert_almost_equal(pca.noise_variance_, ipca.noise_variance_, decimal=prec) + + +def test_singular_values(global_random_seed): + # Check that the IncrementalPCA output has the correct singular values + + rng = np.random.RandomState(global_random_seed) + n_samples = 1000 + n_features = 100 + + X = datasets.make_low_rank_matrix( + n_samples, n_features, tail_strength=0.0, effective_rank=10, random_state=rng + ) + + pca = PCA(n_components=10, svd_solver="full", random_state=rng).fit(X) + ipca = IncrementalPCA(n_components=10, batch_size=150).fit(X) + assert_array_almost_equal(pca.singular_values_, ipca.singular_values_, 2) + + # Compare to the Frobenius norm + X_pca = pca.transform(X) + X_ipca = ipca.transform(X) + assert_array_almost_equal( + np.sum(pca.singular_values_**2.0), np.linalg.norm(X_pca, "fro") ** 2.0, 12 + ) + assert_array_almost_equal( + np.sum(ipca.singular_values_**2.0), np.linalg.norm(X_ipca, "fro") ** 2.0, 2 + ) + + # Compare to the 2-norms of the score vectors + assert_array_almost_equal( + pca.singular_values_, np.sqrt(np.sum(X_pca**2.0, axis=0)), 12 + ) + assert_array_almost_equal( + ipca.singular_values_, np.sqrt(np.sum(X_ipca**2.0, axis=0)), 2 + ) + + # Set the singular values and see what we get back + rng = np.random.RandomState(global_random_seed) + n_samples = 100 + n_features = 110 + + X = datasets.make_low_rank_matrix( + n_samples, n_features, tail_strength=0.0, effective_rank=3, random_state=rng + ) + + pca 
= PCA(n_components=3, svd_solver="full", random_state=rng) + ipca = IncrementalPCA(n_components=3, batch_size=100) + + X_pca = pca.fit_transform(X) + X_pca /= np.sqrt(np.sum(X_pca**2.0, axis=0)) + X_pca[:, 0] *= 3.142 + X_pca[:, 1] *= 2.718 + + X_hat = np.dot(X_pca, pca.components_) + pca.fit(X_hat) + ipca.fit(X_hat) + assert_array_almost_equal(pca.singular_values_, [3.142, 2.718, 1.0], 14) + assert_array_almost_equal(ipca.singular_values_, [3.142, 2.718, 1.0], 14) + + +def test_whitening(global_random_seed): + # Test that PCA and IncrementalPCA transforms match to sign flip. + X = datasets.make_low_rank_matrix( + 1000, 10, tail_strength=0.0, effective_rank=2, random_state=global_random_seed + ) + atol = 1e-3 + for nc in [None, 9]: + pca = PCA(whiten=True, n_components=nc).fit(X) + ipca = IncrementalPCA(whiten=True, n_components=nc, batch_size=250).fit(X) + + # Since the data is rank deficient, some components are pure noise. We + # should not expect those dimensions to carry any signal and their + # values might be arbitrarily changed by implementation details of the + # internal SVD solver. We therefore filter them out before comparison. + stable_mask = pca.explained_variance_ratio_ > 1e-12 + + Xt_pca = pca.transform(X) + Xt_ipca = ipca.transform(X) + assert_allclose( + np.abs(Xt_pca)[:, stable_mask], + np.abs(Xt_ipca)[:, stable_mask], + atol=atol, + ) + + # The noisy dimensions are in the null space of the inverse transform, + # so they are not influencing the reconstruction. We therefore don't + # need to apply the mask here. + Xinv_ipca = ipca.inverse_transform(Xt_ipca) + Xinv_pca = pca.inverse_transform(Xt_pca) + assert_allclose(X, Xinv_ipca, atol=atol) + assert_allclose(X, Xinv_pca, atol=atol) + assert_allclose(Xinv_pca, Xinv_ipca, atol=atol) + + +def test_incremental_pca_partial_fit_float_division(): + # Test to ensure float division is used in all versions of Python + # (non-regression test for issue #9489) + + rng = np.random.RandomState(0) + A = rng.randn(5, 3) + 2 + B = rng.randn(7, 3) + 5 + + pca = IncrementalPCA(n_components=2) + pca.partial_fit(A) + # Set n_samples_seen_ to be a floating point number instead of an int + pca.n_samples_seen_ = float(pca.n_samples_seen_) + pca.partial_fit(B) + singular_vals_float_samples_seen = pca.singular_values_ + + pca2 = IncrementalPCA(n_components=2) + pca2.partial_fit(A) + pca2.partial_fit(B) + singular_vals_int_samples_seen = pca2.singular_values_ + + np.testing.assert_allclose( + singular_vals_float_samples_seen, singular_vals_int_samples_seen + ) + + +def test_incremental_pca_fit_overflow_error(): + # Test for overflow error on Windows OS + # (non-regression test for issue #17693) + rng = np.random.RandomState(0) + A = rng.rand(500000, 2) + + ipca = IncrementalPCA(n_components=2, batch_size=10000) + ipca.fit(A) + + pca = PCA(n_components=2) + pca.fit(A) + + np.testing.assert_allclose(ipca.singular_values_, pca.singular_values_) + + +def test_incremental_pca_feature_names_out(): + """Check feature names out for IncrementalPCA.""" + ipca = IncrementalPCA(n_components=2).fit(iris.data) + + names = ipca.get_feature_names_out() + assert_array_equal([f"incrementalpca{i}" for i in range(2)], names) diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_kernel_pca.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_kernel_pca.py new file mode 100644 index 0000000000000000000000000000000000000000..57ae75c184622679b1db7350eab8b1ff9f94296e --- /dev/null +++ 
b/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_kernel_pca.py @@ -0,0 +1,566 @@ +import warnings + +import numpy as np +import pytest + +import sklearn +from sklearn.datasets import load_iris, make_blobs, make_circles +from sklearn.decomposition import PCA, KernelPCA +from sklearn.exceptions import NotFittedError +from sklearn.linear_model import Perceptron +from sklearn.metrics.pairwise import rbf_kernel +from sklearn.model_selection import GridSearchCV +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.utils._testing import ( + assert_allclose, + assert_array_almost_equal, + assert_array_equal, +) +from sklearn.utils.fixes import CSR_CONTAINERS +from sklearn.utils.validation import _check_psd_eigenvalues + + +def test_kernel_pca(global_random_seed): + """Nominal test for all solvers and all known kernels + a custom one + + It tests + - that fit_transform is equivalent to fit+transform + - that the shapes of transforms and inverse transforms are correct + """ + rng = np.random.RandomState(global_random_seed) + X_fit = rng.random_sample((5, 4)) + X_pred = rng.random_sample((2, 4)) + + def histogram(x, y, **kwargs): + # Histogram kernel implemented as a callable. + assert kwargs == {} # no kernel_params that we didn't ask for + return np.minimum(x, y).sum() + + for eigen_solver in ("auto", "dense", "arpack", "randomized"): + for kernel in ("linear", "rbf", "poly", histogram): + # histogram kernel produces singular matrix inside linalg.solve + # XXX use a least-squares approximation? + inv = not callable(kernel) + + # transform fit data + kpca = KernelPCA( + 4, kernel=kernel, eigen_solver=eigen_solver, fit_inverse_transform=inv + ) + X_fit_transformed = kpca.fit_transform(X_fit) + X_fit_transformed2 = kpca.fit(X_fit).transform(X_fit) + assert_array_almost_equal( + np.abs(X_fit_transformed), np.abs(X_fit_transformed2) + ) + + # non-regression test: previously, gamma would be 0 by default, + # forcing all eigenvalues to 0 under the poly kernel + assert X_fit_transformed.size != 0 + + # transform new data + X_pred_transformed = kpca.transform(X_pred) + assert X_pred_transformed.shape[1] == X_fit_transformed.shape[1] + + # inverse transform + if inv: + X_pred2 = kpca.inverse_transform(X_pred_transformed) + assert X_pred2.shape == X_pred.shape + + +def test_kernel_pca_invalid_parameters(): + """Check that kPCA raises an error if the parameters are invalid + + Tests fitting inverse transform with a precomputed kernel raises a + ValueError. + """ + estimator = KernelPCA( + n_components=10, fit_inverse_transform=True, kernel="precomputed" + ) + err_ms = "Cannot fit_inverse_transform with a precomputed kernel" + with pytest.raises(ValueError, match=err_ms): + estimator.fit(np.random.randn(10, 10)) + + +def test_kernel_pca_consistent_transform(global_random_seed): + """Check robustness to mutations in the original training array + + Test that after fitting a kPCA model, it stays independent of any + mutation of the values of the original data object by relying on an + internal copy. 
+ """ + # X_fit_ needs to retain the old, unmodified copy of X + state = np.random.RandomState(global_random_seed) + X = state.rand(10, 10) + kpca = KernelPCA(random_state=state).fit(X) + transformed1 = kpca.transform(X) + + X_copy = X.copy() + X[:, 0] = 666 + transformed2 = kpca.transform(X_copy) + assert_array_almost_equal(transformed1, transformed2) + + +def test_kernel_pca_deterministic_output(global_random_seed): + """Test that Kernel PCA produces deterministic output + + Tests that the same inputs and random state produce the same output. + """ + rng = np.random.RandomState(global_random_seed) + X = rng.rand(10, 10) + eigen_solver = ("arpack", "dense") + + for solver in eigen_solver: + transformed_X = np.zeros((20, 2)) + for i in range(20): + kpca = KernelPCA(n_components=2, eigen_solver=solver, random_state=rng) + transformed_X[i, :] = kpca.fit_transform(X)[0] + assert_allclose(transformed_X, np.tile(transformed_X[0, :], 20).reshape(20, 2)) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_kernel_pca_sparse(csr_container, global_random_seed): + """Test that kPCA works on a sparse data input. + + Same test as ``test_kernel_pca except inverse_transform`` since it's not + implemented for sparse matrices. + """ + rng = np.random.RandomState(global_random_seed) + X_fit = csr_container(rng.random_sample((5, 4))) + X_pred = csr_container(rng.random_sample((2, 4))) + + for eigen_solver in ("auto", "arpack", "randomized"): + for kernel in ("linear", "rbf", "poly"): + # transform fit data + kpca = KernelPCA( + 4, + kernel=kernel, + eigen_solver=eigen_solver, + fit_inverse_transform=False, + random_state=0, + ) + X_fit_transformed = kpca.fit_transform(X_fit) + X_fit_transformed2 = kpca.fit(X_fit).transform(X_fit) + assert_array_almost_equal( + np.abs(X_fit_transformed), np.abs(X_fit_transformed2) + ) + + # transform new data + X_pred_transformed = kpca.transform(X_pred) + assert X_pred_transformed.shape[1] == X_fit_transformed.shape[1] + + # inverse transform: not available for sparse matrices + # XXX: should we raise another exception type here? For instance: + # NotImplementedError. + with pytest.raises(NotFittedError): + kpca.inverse_transform(X_pred_transformed) + + +@pytest.mark.parametrize("solver", ["auto", "dense", "arpack", "randomized"]) +@pytest.mark.parametrize("n_features", [4, 10]) +def test_kernel_pca_linear_kernel(solver, n_features, global_random_seed): + """Test that kPCA with linear kernel is equivalent to PCA for all solvers. + + KernelPCA with linear kernel should produce the same output as PCA. + """ + rng = np.random.RandomState(global_random_seed) + X_fit = rng.random_sample((5, n_features)) + X_pred = rng.random_sample((2, n_features)) + + # for a linear kernel, kernel PCA should find the same projection as PCA + # modulo the sign (direction) + # fit only the first four components: fifth is near zero eigenvalue, so + # can be trimmed due to roundoff error + n_comps = 3 if solver == "arpack" else 4 + assert_array_almost_equal( + np.abs(KernelPCA(n_comps, eigen_solver=solver).fit(X_fit).transform(X_pred)), + np.abs( + PCA(n_comps, svd_solver=solver if solver != "dense" else "full") + .fit(X_fit) + .transform(X_pred) + ), + ) + + +def test_kernel_pca_n_components(): + """Test that `n_components` is correctly taken into account for projections + + For all solvers this tests that the output has the correct shape depending + on the selected number of components. 
+ """ + rng = np.random.RandomState(0) + X_fit = rng.random_sample((5, 4)) + X_pred = rng.random_sample((2, 4)) + + for eigen_solver in ("dense", "arpack", "randomized"): + for c in [1, 2, 4]: + kpca = KernelPCA(n_components=c, eigen_solver=eigen_solver) + shape = kpca.fit(X_fit).transform(X_pred).shape + + assert shape == (2, c) + + +def test_remove_zero_eig(): + """Check that the ``remove_zero_eig`` parameter works correctly. + + Tests that the null-space (Zero) eigenvalues are removed when + remove_zero_eig=True, whereas they are not by default. + """ + X = np.array([[1 - 1e-30, 1], [1, 1], [1, 1 - 1e-20]]) + + # n_components=None (default) => remove_zero_eig is True + kpca = KernelPCA() + Xt = kpca.fit_transform(X) + assert Xt.shape == (3, 0) + + kpca = KernelPCA(n_components=2) + Xt = kpca.fit_transform(X) + assert Xt.shape == (3, 2) + + kpca = KernelPCA(n_components=2, remove_zero_eig=True) + Xt = kpca.fit_transform(X) + assert Xt.shape == (3, 0) + + +def test_leave_zero_eig(): + """Non-regression test for issue #12141 (PR #12143) + + This test checks that fit().transform() returns the same result as + fit_transform() in case of non-removed zero eigenvalue. + """ + X_fit = np.array([[1, 1], [0, 0]]) + + # Assert that even with all np warnings on, there is no div by zero warning + with warnings.catch_warnings(): + # There might be warnings about the kernel being badly conditioned, + # but there should not be warnings about division by zero. + # (Numpy division by zero warning can have many message variants, but + # at least we know that it is a RuntimeWarning so lets check only this) + warnings.simplefilter("error", RuntimeWarning) + with np.errstate(all="warn"): + k = KernelPCA(n_components=2, remove_zero_eig=False, eigen_solver="dense") + # Fit, then transform + A = k.fit(X_fit).transform(X_fit) + # Do both at once + B = k.fit_transform(X_fit) + # Compare + assert_array_almost_equal(np.abs(A), np.abs(B)) + + +def test_kernel_pca_precomputed(global_random_seed): + """Test that kPCA works with a precomputed kernel, for all solvers""" + rng = np.random.RandomState(global_random_seed) + X_fit = rng.random_sample((5, 4)) + X_pred = rng.random_sample((2, 4)) + + for eigen_solver in ("dense", "arpack", "randomized"): + X_kpca = ( + KernelPCA(4, eigen_solver=eigen_solver, random_state=0) + .fit(X_fit) + .transform(X_pred) + ) + + X_kpca2 = ( + KernelPCA( + 4, eigen_solver=eigen_solver, kernel="precomputed", random_state=0 + ) + .fit(np.dot(X_fit, X_fit.T)) + .transform(np.dot(X_pred, X_fit.T)) + ) + + X_kpca_train = KernelPCA( + 4, eigen_solver=eigen_solver, kernel="precomputed", random_state=0 + ).fit_transform(np.dot(X_fit, X_fit.T)) + + X_kpca_train2 = ( + KernelPCA( + 4, eigen_solver=eigen_solver, kernel="precomputed", random_state=0 + ) + .fit(np.dot(X_fit, X_fit.T)) + .transform(np.dot(X_fit, X_fit.T)) + ) + + assert_array_almost_equal(np.abs(X_kpca), np.abs(X_kpca2)) + + assert_array_almost_equal(np.abs(X_kpca_train), np.abs(X_kpca_train2)) + + +@pytest.mark.parametrize("solver", ["auto", "dense", "arpack", "randomized"]) +def test_kernel_pca_precomputed_non_symmetric(solver): + """Check that the kernel centerer works. + + Tests that a non symmetric precomputed kernel is actually accepted + because the kernel centerer does its job correctly. 
+ """ + + # a non symmetric gram matrix + K = [[1, 2], [3, 40]] + kpca = KernelPCA( + kernel="precomputed", eigen_solver=solver, n_components=1, random_state=0 + ) + kpca.fit(K) # no error + + # same test with centered kernel + Kc = [[9, -9], [-9, 9]] + kpca_c = KernelPCA( + kernel="precomputed", eigen_solver=solver, n_components=1, random_state=0 + ) + kpca_c.fit(Kc) + + # comparison between the non-centered and centered versions + assert_array_equal(kpca.eigenvectors_, kpca_c.eigenvectors_) + assert_array_equal(kpca.eigenvalues_, kpca_c.eigenvalues_) + + +def test_gridsearch_pipeline(): + """Check that kPCA works as expected in a grid search pipeline + + Test if we can do a grid-search to find parameters to separate + circles with a perceptron model. + """ + X, y = make_circles(n_samples=400, factor=0.3, noise=0.05, random_state=0) + kpca = KernelPCA(kernel="rbf", n_components=2) + pipeline = Pipeline([("kernel_pca", kpca), ("Perceptron", Perceptron(max_iter=5))]) + param_grid = dict(kernel_pca__gamma=2.0 ** np.arange(-2, 2)) + grid_search = GridSearchCV(pipeline, cv=3, param_grid=param_grid) + grid_search.fit(X, y) + assert grid_search.best_score_ == 1 + + +def test_gridsearch_pipeline_precomputed(): + """Check that kPCA works as expected in a grid search pipeline (2) + + Test if we can do a grid-search to find parameters to separate + circles with a perceptron model. This test uses a precomputed kernel. + """ + X, y = make_circles(n_samples=400, factor=0.3, noise=0.05, random_state=0) + kpca = KernelPCA(kernel="precomputed", n_components=2) + pipeline = Pipeline([("kernel_pca", kpca), ("Perceptron", Perceptron(max_iter=5))]) + param_grid = dict(Perceptron__max_iter=np.arange(1, 5)) + grid_search = GridSearchCV(pipeline, cv=3, param_grid=param_grid) + X_kernel = rbf_kernel(X, gamma=2.0) + grid_search.fit(X_kernel, y) + assert grid_search.best_score_ == 1 + + +def test_nested_circles(): + """Check that kPCA projects in a space where nested circles are separable + + Tests that 2D nested circles become separable with a perceptron when + projected in the first 2 kPCA using an RBF kernel, while raw samples + are not directly separable in the original space. + """ + X, y = make_circles(n_samples=400, factor=0.3, noise=0.05, random_state=0) + + # 2D nested circles are not linearly separable + train_score = Perceptron(max_iter=5).fit(X, y).score(X, y) + assert train_score < 0.8 + + # Project the circles data into the first 2 components of a RBF Kernel + # PCA model. + # Note that the gamma value is data dependent. If this test breaks + # and the gamma value has to be updated, the Kernel PCA example will + # have to be updated too. + kpca = KernelPCA( + kernel="rbf", n_components=2, fit_inverse_transform=True, gamma=2.0 + ) + X_kpca = kpca.fit_transform(X) + + # The data is perfectly linearly separable in that space + train_score = Perceptron(max_iter=5).fit(X_kpca, y).score(X_kpca, y) + assert train_score == 1.0 + + +def test_kernel_conditioning(): + """Check that ``_check_psd_eigenvalues`` is correctly called in kPCA + + Non-regression test for issue #12140 (PR #12145). 
+ """ + + # create a pathological X leading to small non-zero eigenvalue + X = [[5, 1], [5 + 1e-8, 1e-8], [5 + 1e-8, 0]] + kpca = KernelPCA(kernel="linear", n_components=2, fit_inverse_transform=True) + kpca.fit(X) + + # check that the small non-zero eigenvalue was correctly set to zero + assert kpca.eigenvalues_.min() == 0 + assert np.all(kpca.eigenvalues_ == _check_psd_eigenvalues(kpca.eigenvalues_)) + + +@pytest.mark.parametrize("solver", ["auto", "dense", "arpack", "randomized"]) +def test_precomputed_kernel_not_psd(solver): + """Check how KernelPCA works with non-PSD kernels depending on n_components + + Tests for all methods what happens with a non PSD gram matrix (this + can happen in an isomap scenario, or with custom kernel functions, or + maybe with ill-posed datasets). + + When ``n_component`` is large enough to capture a negative eigenvalue, an + error should be raised. Otherwise, KernelPCA should run without error + since the negative eigenvalues are not selected. + """ + + # a non PSD kernel with large eigenvalues, already centered + # it was captured from an isomap call and multiplied by 100 for compacity + K = [ + [4.48, -1.0, 8.07, 2.33, 2.33, 2.33, -5.76, -12.78], + [-1.0, -6.48, 4.5, -1.24, -1.24, -1.24, -0.81, 7.49], + [8.07, 4.5, 15.48, 2.09, 2.09, 2.09, -11.1, -23.23], + [2.33, -1.24, 2.09, 4.0, -3.65, -3.65, 1.02, -0.9], + [2.33, -1.24, 2.09, -3.65, 4.0, -3.65, 1.02, -0.9], + [2.33, -1.24, 2.09, -3.65, -3.65, 4.0, 1.02, -0.9], + [-5.76, -0.81, -11.1, 1.02, 1.02, 1.02, 4.86, 9.75], + [-12.78, 7.49, -23.23, -0.9, -0.9, -0.9, 9.75, 21.46], + ] + # this gram matrix has 5 positive eigenvalues and 3 negative ones + # [ 52.72, 7.65, 7.65, 5.02, 0. , -0. , -6.13, -15.11] + + # 1. ask for enough components to get a significant negative one + kpca = KernelPCA(kernel="precomputed", eigen_solver=solver, n_components=7) + # make sure that the appropriate error is raised + with pytest.raises(ValueError, match="There are significant negative eigenvalues"): + kpca.fit(K) + + # 2. ask for a small enough n_components to get only positive ones + kpca = KernelPCA(kernel="precomputed", eigen_solver=solver, n_components=2) + if solver == "randomized": + # the randomized method is still inconsistent with the others on this + # since it selects the eigenvalues based on the largest 2 modules, not + # on the largest 2 values. 
+ # + # At least we can ensure that we return an error instead of returning + # the wrong eigenvalues + with pytest.raises( + ValueError, match="There are significant negative eigenvalues" + ): + kpca.fit(K) + else: + # general case: make sure that it works + kpca.fit(K) + + +@pytest.mark.parametrize("n_components", [4, 10, 20]) +def test_kernel_pca_solvers_equivalence(n_components): + """Check that 'dense' 'arpack' & 'randomized' solvers give similar results""" + + # Generate random data + n_train, n_test = 1_000, 100 + X, _ = make_circles( + n_samples=(n_train + n_test), factor=0.3, noise=0.05, random_state=0 + ) + X_fit, X_pred = X[:n_train, :], X[n_train:, :] + + # reference (full) + ref_pred = ( + KernelPCA(n_components, eigen_solver="dense", random_state=0) + .fit(X_fit) + .transform(X_pred) + ) + + # arpack + a_pred = ( + KernelPCA(n_components, eigen_solver="arpack", random_state=0) + .fit(X_fit) + .transform(X_pred) + ) + # check that the result is still correct despite the approx + assert_array_almost_equal(np.abs(a_pred), np.abs(ref_pred)) + + # randomized + r_pred = ( + KernelPCA(n_components, eigen_solver="randomized", random_state=0) + .fit(X_fit) + .transform(X_pred) + ) + # check that the result is still correct despite the approximation + assert_array_almost_equal(np.abs(r_pred), np.abs(ref_pred)) + + +def test_kernel_pca_inverse_transform_reconstruction(): + """Test if the reconstruction is a good approximation. + + Note that in general it is not possible to get an arbitrarily good + reconstruction because of kernel centering that does not + preserve all the information of the original data. + """ + X, *_ = make_blobs(n_samples=100, n_features=4, random_state=0) + + kpca = KernelPCA( + n_components=20, kernel="rbf", fit_inverse_transform=True, alpha=1e-3 + ) + X_trans = kpca.fit_transform(X) + X_reconst = kpca.inverse_transform(X_trans) + assert np.linalg.norm(X - X_reconst) / np.linalg.norm(X) < 1e-1 + + +def test_kernel_pca_raise_not_fitted_error(): + X = np.random.randn(15).reshape(5, 3) + kpca = KernelPCA() + kpca.fit(X) + with pytest.raises(NotFittedError): + kpca.inverse_transform(X) + + +def test_32_64_decomposition_shape(): + """Test that the decomposition is similar for 32 and 64 bits data + + Non regression test for + https://github.com/scikit-learn/scikit-learn/issues/18146 + """ + X, y = make_blobs( + n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], random_state=0, cluster_std=0.1 + ) + X = StandardScaler().fit_transform(X) + X -= X.min() + + # Compare the shapes (corresponds to the number of non-zero eigenvalues) + kpca = KernelPCA() + assert kpca.fit_transform(X).shape == kpca.fit_transform(X.astype(np.float32)).shape + + +def test_kernel_pca_feature_names_out(): + """Check feature names out for KernelPCA.""" + X, *_ = make_blobs(n_samples=100, n_features=4, random_state=0) + kpca = KernelPCA(n_components=2).fit(X) + + names = kpca.get_feature_names_out() + assert_array_equal([f"kernelpca{i}" for i in range(2)], names) + + +def test_kernel_pca_inverse_correct_gamma(global_random_seed): + """Check that gamma is set correctly when not provided. 
+ + Non-regression test for #26280 + """ + rng = np.random.RandomState(global_random_seed) + X = rng.random_sample((5, 4)) + + kwargs = { + "n_components": 2, + "random_state": rng, + "fit_inverse_transform": True, + "kernel": "rbf", + } + + expected_gamma = 1 / X.shape[1] + kpca1 = KernelPCA(gamma=None, **kwargs).fit(X) + kpca2 = KernelPCA(gamma=expected_gamma, **kwargs).fit(X) + + assert kpca1.gamma_ == expected_gamma + assert kpca2.gamma_ == expected_gamma + + X1_recon = kpca1.inverse_transform(kpca1.transform(X)) + X2_recon = kpca2.inverse_transform(kpca1.transform(X)) + + assert_allclose(X1_recon, X2_recon) + + +def test_kernel_pca_pandas_output(): + """Check that KernelPCA works with pandas output when the solver is arpack. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/27579 + """ + pytest.importorskip("pandas") + X, _ = load_iris(as_frame=True, return_X_y=True) + with sklearn.config_context(transform_output="pandas"): + KernelPCA(n_components=2, eigen_solver="arpack").fit_transform(X) diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_nmf.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_nmf.py new file mode 100644 index 0000000000000000000000000000000000000000..17be798b3f3921460cb2378c798209de60c963a4 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_nmf.py @@ -0,0 +1,1010 @@ +import re +import sys +from io import StringIO + +import numpy as np +import pytest +from scipy import linalg + +from sklearn.base import clone +from sklearn.decomposition import NMF, MiniBatchNMF, non_negative_factorization +from sklearn.decomposition import _nmf as nmf # For testing internals +from sklearn.exceptions import ConvergenceWarning +from sklearn.utils._testing import ( + assert_allclose, + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) +from sklearn.utils.extmath import squared_norm +from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS + + +@pytest.mark.parametrize( + ["Estimator", "solver"], + [[NMF, {"solver": "cd"}], [NMF, {"solver": "mu"}], [MiniBatchNMF, {}]], +) +def test_convergence_warning(Estimator, solver): + convergence_warning = ( + "Maximum number of iterations 1 reached. Increase it to improve convergence." + ) + A = np.ones((2, 2)) + with pytest.warns(ConvergenceWarning, match=convergence_warning): + Estimator(max_iter=1, n_components="auto", **solver).fit(A) + + +def test_initialize_nn_output(): + # Test that initialization does not return negative values + rng = np.random.mtrand.RandomState(42) + data = np.abs(rng.randn(10, 10)) + for init in ("random", "nndsvd", "nndsvda", "nndsvdar"): + W, H = nmf._initialize_nmf(data, 10, init=init, random_state=0) + assert not ((W < 0).any() or (H < 0).any()) + + +@pytest.mark.filterwarnings( + r"ignore:The multiplicative update \('mu'\) solver cannot update zeros present in" + r" the initialization", +) +def test_parameter_checking(): + # Here we only check for invalid parameter values that are not already + # automatically tested in the common tests. 
+ + A = np.ones((2, 2)) + + msg = "Invalid beta_loss parameter: solver 'cd' does not handle beta_loss = 1.0" + with pytest.raises(ValueError, match=msg): + NMF(solver="cd", beta_loss=1.0).fit(A) + msg = "Negative values in data passed to" + with pytest.raises(ValueError, match=msg): + NMF().fit(-A) + clf = NMF(2, tol=0.1).fit(A) + with pytest.raises(ValueError, match=msg): + clf.transform(-A) + with pytest.raises(ValueError, match=msg): + nmf._initialize_nmf(-A, 2, "nndsvd") + + for init in ["nndsvd", "nndsvda", "nndsvdar"]: + msg = re.escape( + "init = '{}' can only be used when " + "n_components <= min(n_samples, n_features)".format(init) + ) + with pytest.raises(ValueError, match=msg): + NMF(3, init=init).fit(A) + with pytest.raises(ValueError, match=msg): + MiniBatchNMF(3, init=init).fit(A) + with pytest.raises(ValueError, match=msg): + nmf._initialize_nmf(A, 3, init) + + +def test_initialize_close(): + # Test NNDSVD error + # Test that _initialize_nmf error is less than the standard deviation of + # the entries in the matrix. + rng = np.random.mtrand.RandomState(42) + A = np.abs(rng.randn(10, 10)) + W, H = nmf._initialize_nmf(A, 10, init="nndsvd") + error = linalg.norm(np.dot(W, H) - A) + sdev = linalg.norm(A - A.mean()) + assert error <= sdev + + +def test_initialize_variants(): + # Test NNDSVD variants correctness + # Test that the variants 'nndsvda' and 'nndsvdar' differ from basic + # 'nndsvd' only where the basic version has zeros. + rng = np.random.mtrand.RandomState(42) + data = np.abs(rng.randn(10, 10)) + W0, H0 = nmf._initialize_nmf(data, 10, init="nndsvd") + Wa, Ha = nmf._initialize_nmf(data, 10, init="nndsvda") + War, Har = nmf._initialize_nmf(data, 10, init="nndsvdar", random_state=0) + + for ref, evl in ((W0, Wa), (W0, War), (H0, Ha), (H0, Har)): + assert_almost_equal(evl[ref != 0], ref[ref != 0]) + + +# ignore UserWarning raised when both solver='mu' and init='nndsvd' +@pytest.mark.filterwarnings( + r"ignore:The multiplicative update \('mu'\) solver cannot update zeros present in" + r" the initialization" +) +@pytest.mark.parametrize( + ["Estimator", "solver"], + [[NMF, {"solver": "cd"}], [NMF, {"solver": "mu"}], [MiniBatchNMF, {}]], +) +@pytest.mark.parametrize("init", (None, "nndsvd", "nndsvda", "nndsvdar", "random")) +@pytest.mark.parametrize("alpha_W", (0.0, 1.0)) +@pytest.mark.parametrize("alpha_H", (0.0, 1.0, "same")) +def test_nmf_fit_nn_output(Estimator, solver, init, alpha_W, alpha_H): + # Test that the decomposition does not contain negative values + A = np.c_[5.0 - np.arange(1, 6), 5.0 + np.arange(1, 6)] + model = Estimator( + n_components=2, + init=init, + alpha_W=alpha_W, + alpha_H=alpha_H, + random_state=0, + **solver, + ) + transf = model.fit_transform(A) + assert not ((model.components_ < 0).any() or (transf < 0).any()) + + +@pytest.mark.parametrize( + ["Estimator", "solver"], + [[NMF, {"solver": "cd"}], [NMF, {"solver": "mu"}], [MiniBatchNMF, {}]], +) +def test_nmf_fit_close(Estimator, solver): + rng = np.random.mtrand.RandomState(42) + # Test that the fit is not too far away + pnmf = Estimator( + 5, + init="nndsvdar", + random_state=0, + max_iter=600, + **solver, + ) + X = np.abs(rng.randn(6, 5)) + assert pnmf.fit(X).reconstruction_err_ < 0.1 + + +def test_nmf_true_reconstruction(): + # Test that the fit is not too far away from an exact solution + # (by construction) + n_samples = 15 + n_features = 10 + n_components = 5 + beta_loss = 1 + batch_size = 3 + max_iter = 1000 + + rng = np.random.mtrand.RandomState(42) + W_true = np.zeros([n_samples, 
n_components]) + W_array = np.abs(rng.randn(n_samples)) + for j in range(n_components): + W_true[j % n_samples, j] = W_array[j % n_samples] + H_true = np.zeros([n_components, n_features]) + H_array = np.abs(rng.randn(n_components)) + for j in range(n_features): + H_true[j % n_components, j] = H_array[j % n_components] + X = np.dot(W_true, H_true) + + model = NMF( + n_components=n_components, + solver="mu", + beta_loss=beta_loss, + max_iter=max_iter, + random_state=0, + ) + transf = model.fit_transform(X) + X_calc = np.dot(transf, model.components_) + + assert model.reconstruction_err_ < 0.1 + assert_allclose(X, X_calc) + + mbmodel = MiniBatchNMF( + n_components=n_components, + beta_loss=beta_loss, + batch_size=batch_size, + random_state=0, + max_iter=max_iter, + ) + transf = mbmodel.fit_transform(X) + X_calc = np.dot(transf, mbmodel.components_) + + assert mbmodel.reconstruction_err_ < 0.1 + assert_allclose(X, X_calc, atol=1) + + +@pytest.mark.parametrize("solver", ["cd", "mu"]) +def test_nmf_transform(solver): + # Test that fit_transform is equivalent to fit.transform for NMF + # Test that NMF.transform returns close values + rng = np.random.mtrand.RandomState(42) + A = np.abs(rng.randn(6, 5)) + m = NMF( + solver=solver, + n_components=3, + init="random", + random_state=0, + tol=1e-6, + ) + ft = m.fit_transform(A) + t = m.transform(A) + assert_allclose(ft, t, atol=1e-1) + + +def test_minibatch_nmf_transform(): + # Test that fit_transform is equivalent to fit.transform for MiniBatchNMF + # Only guaranteed with fresh restarts + rng = np.random.mtrand.RandomState(42) + A = np.abs(rng.randn(6, 5)) + m = MiniBatchNMF( + n_components=3, + random_state=0, + tol=1e-3, + fresh_restarts=True, + ) + ft = m.fit_transform(A) + t = m.transform(A) + assert_allclose(ft, t) + + +@pytest.mark.parametrize( + ["Estimator", "solver"], + [[NMF, {"solver": "cd"}], [NMF, {"solver": "mu"}], [MiniBatchNMF, {}]], +) +def test_nmf_transform_custom_init(Estimator, solver): + # Smoke test that checks if NMF.transform works with custom initialization + random_state = np.random.RandomState(0) + A = np.abs(random_state.randn(6, 5)) + n_components = 4 + avg = np.sqrt(A.mean() / n_components) + H_init = np.abs(avg * random_state.randn(n_components, 5)) + W_init = np.abs(avg * random_state.randn(6, n_components)) + + m = Estimator( + n_components=n_components, init="custom", random_state=0, tol=1e-3, **solver + ) + m.fit_transform(A, W=W_init, H=H_init) + m.transform(A) + + +@pytest.mark.parametrize("solver", ("cd", "mu")) +def test_nmf_inverse_transform(solver): + # Test that NMF.inverse_transform returns close values + random_state = np.random.RandomState(0) + A = np.abs(random_state.randn(6, 4)) + m = NMF( + solver=solver, + n_components=4, + init="random", + random_state=0, + max_iter=1000, + ) + ft = m.fit_transform(A) + A_new = m.inverse_transform(ft) + assert_array_almost_equal(A, A_new, decimal=2) + + +def test_mbnmf_inverse_transform(): + # Test that MiniBatchNMF.transform followed by MiniBatchNMF.inverse_transform + # is close to the identity + rng = np.random.RandomState(0) + A = np.abs(rng.randn(6, 4)) + nmf = MiniBatchNMF( + random_state=rng, + max_iter=500, + init="nndsvdar", + fresh_restarts=True, + ) + ft = nmf.fit_transform(A) + A_new = nmf.inverse_transform(ft) + assert_allclose(A, A_new, rtol=1e-3, atol=1e-2) + + +@pytest.mark.parametrize("Estimator", [NMF, MiniBatchNMF]) +def test_n_components_greater_n_features(Estimator): + # Smoke test for the case of more components than features. 
+ rng = np.random.mtrand.RandomState(42) + A = np.abs(rng.randn(30, 10)) + Estimator(n_components=15, random_state=0, tol=1e-2).fit(A) + + +@pytest.mark.parametrize( + ["Estimator", "solver"], + [[NMF, {"solver": "cd"}], [NMF, {"solver": "mu"}], [MiniBatchNMF, {}]], +) +@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS) +@pytest.mark.parametrize("alpha_W", (0.0, 1.0)) +@pytest.mark.parametrize("alpha_H", (0.0, 1.0, "same")) +def test_nmf_sparse_input(Estimator, solver, sparse_container, alpha_W, alpha_H): + # Test that sparse matrices are accepted as input + rng = np.random.mtrand.RandomState(42) + A = np.abs(rng.randn(10, 10)) + A[:, 2 * np.arange(5)] = 0 + A_sparse = sparse_container(A) + + est1 = Estimator( + n_components=5, + init="random", + alpha_W=alpha_W, + alpha_H=alpha_H, + random_state=0, + tol=0, + max_iter=100, + **solver, + ) + est2 = clone(est1) + + W1 = est1.fit_transform(A) + W2 = est2.fit_transform(A_sparse) + H1 = est1.components_ + H2 = est2.components_ + + assert_allclose(W1, W2) + assert_allclose(H1, H2) + + +@pytest.mark.parametrize( + ["Estimator", "solver"], + [[NMF, {"solver": "cd"}], [NMF, {"solver": "mu"}], [MiniBatchNMF, {}]], +) +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_nmf_sparse_transform(Estimator, solver, csc_container): + # Test that transform works on sparse data. Issue #2124 + rng = np.random.mtrand.RandomState(42) + A = np.abs(rng.randn(3, 2)) + A[1, 1] = 0 + A = csc_container(A) + + model = Estimator(random_state=0, n_components=2, max_iter=400, **solver) + A_fit_tr = model.fit_transform(A) + A_tr = model.transform(A) + assert_allclose(A_fit_tr, A_tr, atol=1e-1) + + +@pytest.mark.parametrize("init", ["random", "nndsvd"]) +@pytest.mark.parametrize("solver", ("cd", "mu")) +@pytest.mark.parametrize("alpha_W", (0.0, 1.0)) +@pytest.mark.parametrize("alpha_H", (0.0, 1.0, "same")) +def test_non_negative_factorization_consistency(init, solver, alpha_W, alpha_H): + # Test that the function is called in the same way, either directly + # or through the NMF class + max_iter = 500 + rng = np.random.mtrand.RandomState(42) + A = np.abs(rng.randn(10, 10)) + A[:, 2 * np.arange(5)] = 0 + + W_nmf, H, _ = non_negative_factorization( + A, + init=init, + solver=solver, + max_iter=max_iter, + alpha_W=alpha_W, + alpha_H=alpha_H, + random_state=1, + tol=1e-2, + ) + W_nmf_2, H, _ = non_negative_factorization( + A, + H=H, + update_H=False, + init=init, + solver=solver, + max_iter=max_iter, + alpha_W=alpha_W, + alpha_H=alpha_H, + random_state=1, + tol=1e-2, + ) + + model_class = NMF( + init=init, + solver=solver, + max_iter=max_iter, + alpha_W=alpha_W, + alpha_H=alpha_H, + random_state=1, + tol=1e-2, + ) + W_cls = model_class.fit_transform(A) + W_cls_2 = model_class.transform(A) + + assert_allclose(W_nmf, W_cls) + assert_allclose(W_nmf_2, W_cls_2) + + +def test_non_negative_factorization_checking(): + # Note that the validity of parameter types and range of possible values + # for scalar numerical or str parameters is already checked in the common + # tests. Here we only check for problems that cannot be captured by simple + # declarative constraints on the valid parameter values. 
+ + A = np.ones((2, 2)) + # Test parameters checking in public function + nnmf = non_negative_factorization + msg = re.escape("Negative values in data passed to NMF (input H)") + with pytest.raises(ValueError, match=msg): + nnmf(A, A, -A, 2, init="custom") + msg = re.escape("Negative values in data passed to NMF (input W)") + with pytest.raises(ValueError, match=msg): + nnmf(A, -A, A, 2, init="custom") + msg = re.escape("Array passed to NMF (input H) is full of zeros") + with pytest.raises(ValueError, match=msg): + nnmf(A, A, 0 * A, 2, init="custom") + + +def _beta_divergence_dense(X, W, H, beta): + """Compute the beta-divergence of X and W.H for dense array only. + + Used as a reference for testing nmf._beta_divergence. + """ + WH = np.dot(W, H) + + if beta == 2: + return squared_norm(X - WH) / 2 + + WH_Xnonzero = WH[X != 0] + X_nonzero = X[X != 0] + np.maximum(WH_Xnonzero, 1e-9, out=WH_Xnonzero) + + if beta == 1: + res = np.sum(X_nonzero * np.log(X_nonzero / WH_Xnonzero)) + res += WH.sum() - X.sum() + + elif beta == 0: + div = X_nonzero / WH_Xnonzero + res = np.sum(div) - X.size - np.sum(np.log(div)) + else: + res = (X_nonzero**beta).sum() + res += (beta - 1) * (WH**beta).sum() + res -= beta * (X_nonzero * (WH_Xnonzero ** (beta - 1))).sum() + res /= beta * (beta - 1) + + return res + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_beta_divergence(csr_container): + # Compare _beta_divergence with the reference _beta_divergence_dense + n_samples = 20 + n_features = 10 + n_components = 5 + beta_losses = [0.0, 0.5, 1.0, 1.5, 2.0, 3.0] + + # initialization + rng = np.random.mtrand.RandomState(42) + X = rng.randn(n_samples, n_features) + np.clip(X, 0, None, out=X) + X_csr = csr_container(X) + W, H = nmf._initialize_nmf(X, n_components, init="random", random_state=42) + + for beta in beta_losses: + ref = _beta_divergence_dense(X, W, H, beta) + loss = nmf._beta_divergence(X, W, H, beta) + loss_csr = nmf._beta_divergence(X_csr, W, H, beta) + + assert_almost_equal(ref, loss, decimal=7) + assert_almost_equal(ref, loss_csr, decimal=7) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_special_sparse_dot(csr_container): + # Test the function that computes np.dot(W, H), only where X is non zero. 
+ n_samples = 10 + n_features = 5 + n_components = 3 + rng = np.random.mtrand.RandomState(42) + X = rng.randn(n_samples, n_features) + np.clip(X, 0, None, out=X) + X_csr = csr_container(X) + + W = np.abs(rng.randn(n_samples, n_components)) + H = np.abs(rng.randn(n_components, n_features)) + + WH_safe = nmf._special_sparse_dot(W, H, X_csr) + WH = nmf._special_sparse_dot(W, H, X) + + # test that both results have same values, in X_csr nonzero elements + ii, jj = X_csr.nonzero() + WH_safe_data = np.asarray(WH_safe[ii, jj]).ravel() + assert_array_almost_equal(WH_safe_data, WH[ii, jj], decimal=10) + + # test that WH_safe and X_csr have the same sparse structure + assert_array_equal(WH_safe.indices, X_csr.indices) + assert_array_equal(WH_safe.indptr, X_csr.indptr) + assert_array_equal(WH_safe.shape, X_csr.shape) + + +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_nmf_multiplicative_update_sparse(csr_container): + # Compare sparse and dense input in multiplicative update NMF + # Also test continuity of the results with respect to beta_loss parameter + n_samples = 20 + n_features = 10 + n_components = 5 + alpha = 0.1 + l1_ratio = 0.5 + n_iter = 20 + + # initialization + rng = np.random.mtrand.RandomState(1337) + X = rng.randn(n_samples, n_features) + X = np.abs(X) + X_csr = csr_container(X) + W0, H0 = nmf._initialize_nmf(X, n_components, init="random", random_state=42) + + for beta_loss in (-1.2, 0, 0.2, 1.0, 2.0, 2.5): + # Reference with dense array X + W, H = W0.copy(), H0.copy() + W1, H1, _ = non_negative_factorization( + X, + W, + H, + n_components, + init="custom", + update_H=True, + solver="mu", + beta_loss=beta_loss, + max_iter=n_iter, + alpha_W=alpha, + l1_ratio=l1_ratio, + random_state=42, + ) + + # Compare with sparse X + W, H = W0.copy(), H0.copy() + W2, H2, _ = non_negative_factorization( + X_csr, + W, + H, + n_components, + init="custom", + update_H=True, + solver="mu", + beta_loss=beta_loss, + max_iter=n_iter, + alpha_W=alpha, + l1_ratio=l1_ratio, + random_state=42, + ) + + assert_allclose(W1, W2, atol=1e-7) + assert_allclose(H1, H2, atol=1e-7) + + # Compare with almost same beta_loss, since some values have a specific + # behavior, but the results should be continuous w.r.t beta_loss + beta_loss -= 1.0e-5 + W, H = W0.copy(), H0.copy() + W3, H3, _ = non_negative_factorization( + X_csr, + W, + H, + n_components, + init="custom", + update_H=True, + solver="mu", + beta_loss=beta_loss, + max_iter=n_iter, + alpha_W=alpha, + l1_ratio=l1_ratio, + random_state=42, + ) + + assert_allclose(W1, W3, atol=1e-4) + assert_allclose(H1, H3, atol=1e-4) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_nmf_negative_beta_loss(csr_container): + # Test that an error is raised if beta_loss < 0 and X contains zeros. + # Test that the output has not NaN values when the input contains zeros. + n_samples = 6 + n_features = 5 + n_components = 3 + + rng = np.random.mtrand.RandomState(42) + X = rng.randn(n_samples, n_features) + np.clip(X, 0, None, out=X) + X_csr = csr_container(X) + + def _assert_nmf_no_nan(X, beta_loss): + W, H, _ = non_negative_factorization( + X, + init="random", + n_components=n_components, + solver="mu", + beta_loss=beta_loss, + random_state=0, + max_iter=1000, + ) + assert not np.any(np.isnan(W)) + assert not np.any(np.isnan(H)) + + msg = "When beta_loss <= 0 and X contains zeros, the solver may diverge." 
+ for beta_loss in (-0.6, 0.0): + with pytest.raises(ValueError, match=msg): + _assert_nmf_no_nan(X, beta_loss) + _assert_nmf_no_nan(X + 1e-9, beta_loss) + + for beta_loss in (0.2, 1.0, 1.2, 2.0, 2.5): + _assert_nmf_no_nan(X, beta_loss) + _assert_nmf_no_nan(X_csr, beta_loss) + + +@pytest.mark.parametrize("beta_loss", [-0.5, 0.0]) +def test_minibatch_nmf_negative_beta_loss(beta_loss): + """Check that an error is raised if beta_loss < 0 and X contains zeros.""" + rng = np.random.RandomState(0) + X = rng.normal(size=(6, 5)) + X[X < 0] = 0 + + nmf = MiniBatchNMF(beta_loss=beta_loss, random_state=0) + + msg = "When beta_loss <= 0 and X contains zeros, the solver may diverge." + with pytest.raises(ValueError, match=msg): + nmf.fit(X) + + +@pytest.mark.parametrize( + ["Estimator", "solver"], + [[NMF, {"solver": "cd"}], [NMF, {"solver": "mu"}], [MiniBatchNMF, {}]], +) +def test_nmf_regularization(Estimator, solver): + # Test the effect of L1 and L2 regularizations + n_samples = 6 + n_features = 5 + n_components = 3 + rng = np.random.mtrand.RandomState(42) + X = np.abs(rng.randn(n_samples, n_features)) + + # L1 regularization should increase the number of zeros + l1_ratio = 1.0 + regul = Estimator( + n_components=n_components, + alpha_W=0.5, + l1_ratio=l1_ratio, + random_state=42, + **solver, + ) + model = Estimator( + n_components=n_components, + alpha_W=0.0, + l1_ratio=l1_ratio, + random_state=42, + **solver, + ) + + W_regul = regul.fit_transform(X) + W_model = model.fit_transform(X) + + H_regul = regul.components_ + H_model = model.components_ + + eps = np.finfo(np.float64).eps + W_regul_n_zeros = W_regul[W_regul <= eps].size + W_model_n_zeros = W_model[W_model <= eps].size + H_regul_n_zeros = H_regul[H_regul <= eps].size + H_model_n_zeros = H_model[H_model <= eps].size + + assert W_regul_n_zeros > W_model_n_zeros + assert H_regul_n_zeros > H_model_n_zeros + + # L2 regularization should decrease the sum of the squared norm + # of the matrices W and H + l1_ratio = 0.0 + regul = Estimator( + n_components=n_components, + alpha_W=0.5, + l1_ratio=l1_ratio, + random_state=42, + **solver, + ) + model = Estimator( + n_components=n_components, + alpha_W=0.0, + l1_ratio=l1_ratio, + random_state=42, + **solver, + ) + + W_regul = regul.fit_transform(X) + W_model = model.fit_transform(X) + + H_regul = regul.components_ + H_model = model.components_ + + assert (linalg.norm(W_model)) ** 2.0 + (linalg.norm(H_model)) ** 2.0 > ( + linalg.norm(W_regul) + ) ** 2.0 + (linalg.norm(H_regul)) ** 2.0 + + +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +@pytest.mark.parametrize("solver", ("cd", "mu")) +def test_nmf_decreasing(solver): + # test that the objective function is decreasing at each iteration + n_samples = 20 + n_features = 15 + n_components = 10 + alpha = 0.1 + l1_ratio = 0.5 + tol = 0.0 + + # initialization + rng = np.random.mtrand.RandomState(42) + X = rng.randn(n_samples, n_features) + np.abs(X, X) + W0, H0 = nmf._initialize_nmf(X, n_components, init="random", random_state=42) + + for beta_loss in (-1.2, 0, 0.2, 1.0, 2.0, 2.5): + if solver != "mu" and beta_loss != 2: + # not implemented + continue + W, H = W0.copy(), H0.copy() + previous_loss = None + for _ in range(30): + # one more iteration starting from the previous results + W, H, _ = non_negative_factorization( + X, + W, + H, + beta_loss=beta_loss, + init="custom", + n_components=n_components, + max_iter=1, + alpha_W=alpha, + solver=solver, + tol=tol, + l1_ratio=l1_ratio, + verbose=0, + random_state=0, + update_H=True, + 
) + + loss = ( + nmf._beta_divergence(X, W, H, beta_loss) + + alpha * l1_ratio * n_features * W.sum() + + alpha * l1_ratio * n_samples * H.sum() + + alpha * (1 - l1_ratio) * n_features * (W**2).sum() + + alpha * (1 - l1_ratio) * n_samples * (H**2).sum() + ) + if previous_loss is not None: + assert previous_loss > loss + previous_loss = loss + + +def test_nmf_underflow(): + # Regression test for an underflow issue in _beta_divergence + rng = np.random.RandomState(0) + n_samples, n_features, n_components = 10, 2, 2 + X = np.abs(rng.randn(n_samples, n_features)) * 10 + W = np.abs(rng.randn(n_samples, n_components)) * 10 + H = np.abs(rng.randn(n_components, n_features)) + + X[0, 0] = 0 + ref = nmf._beta_divergence(X, W, H, beta=1.0) + X[0, 0] = 1e-323 + res = nmf._beta_divergence(X, W, H, beta=1.0) + assert_almost_equal(res, ref) + + +@pytest.mark.parametrize( + "dtype_in, dtype_out", + [ + (np.float32, np.float32), + (np.float64, np.float64), + (np.int32, np.float64), + (np.int64, np.float64), + ], +) +@pytest.mark.parametrize( + ["Estimator", "solver"], + [[NMF, {"solver": "cd"}], [NMF, {"solver": "mu"}], [MiniBatchNMF, {}]], +) +def test_nmf_dtype_match(Estimator, solver, dtype_in, dtype_out): + # Check that NMF preserves dtype (float32 and float64) + X = np.random.RandomState(0).randn(20, 15).astype(dtype_in, copy=False) + np.abs(X, out=X) + + nmf = Estimator( + alpha_W=1.0, + alpha_H=1.0, + tol=1e-2, + random_state=0, + **solver, + ) + + assert nmf.fit(X).transform(X).dtype == dtype_out + assert nmf.fit_transform(X).dtype == dtype_out + assert nmf.components_.dtype == dtype_out + + +@pytest.mark.parametrize( + ["Estimator", "solver"], + [[NMF, {"solver": "cd"}], [NMF, {"solver": "mu"}], [MiniBatchNMF, {}]], +) +def test_nmf_float32_float64_consistency(Estimator, solver): + # Check that the result of NMF is the same between float32 and float64 + X = np.random.RandomState(0).randn(50, 7) + np.abs(X, out=X) + nmf32 = Estimator(random_state=0, tol=1e-3, **solver) + W32 = nmf32.fit_transform(X.astype(np.float32)) + nmf64 = Estimator(random_state=0, tol=1e-3, **solver) + W64 = nmf64.fit_transform(X) + + assert_allclose(W32, W64, atol=1e-5) + + +@pytest.mark.parametrize("Estimator", [NMF, MiniBatchNMF]) +def test_nmf_custom_init_dtype_error(Estimator): + # Check that an error is raise if custom H and/or W don't have the same + # dtype as X. + rng = np.random.RandomState(0) + X = rng.random_sample((20, 15)) + H = rng.random_sample((15, 15)).astype(np.float32) + W = rng.random_sample((20, 15)) + + with pytest.raises(TypeError, match="should have the same dtype as X"): + Estimator(init="custom").fit(X, H=H, W=W) + + with pytest.raises(TypeError, match="should have the same dtype as X"): + non_negative_factorization(X, H=H, update_H=False) + + +@pytest.mark.parametrize("beta_loss", [-0.5, 0, 0.5, 1, 1.5, 2, 2.5]) +def test_nmf_minibatchnmf_equivalence(beta_loss): + # Test that MiniBatchNMF is equivalent to NMF when batch_size = n_samples and + # forget_factor 0.0 (stopping criterion put aside) + rng = np.random.mtrand.RandomState(42) + X = np.abs(rng.randn(48, 5)) + + nmf = NMF( + n_components=5, + beta_loss=beta_loss, + solver="mu", + random_state=0, + tol=0, + ) + mbnmf = MiniBatchNMF( + n_components=5, + beta_loss=beta_loss, + random_state=0, + tol=0, + max_no_improvement=None, + batch_size=X.shape[0], + forget_factor=0.0, + ) + W = nmf.fit_transform(X) + mbW = mbnmf.fit_transform(X) + assert_allclose(W, mbW) + + +def test_minibatch_nmf_partial_fit(): + # Check fit / partial_fit equivalence. 
Applicable only with fresh restarts. + rng = np.random.mtrand.RandomState(42) + X = np.abs(rng.randn(100, 5)) + + n_components = 5 + batch_size = 10 + max_iter = 2 + + mbnmf1 = MiniBatchNMF( + n_components=n_components, + init="custom", + random_state=0, + max_iter=max_iter, + batch_size=batch_size, + tol=0, + max_no_improvement=None, + fresh_restarts=False, + ) + mbnmf2 = MiniBatchNMF(n_components=n_components, init="custom", random_state=0) + + # Force the same init of H (W is recomputed anyway) to be able to compare results. + W, H = nmf._initialize_nmf( + X, n_components=n_components, init="random", random_state=0 + ) + + mbnmf1.fit(X, W=W, H=H) + for i in range(max_iter): + for j in range(batch_size): + mbnmf2.partial_fit(X[j : j + batch_size], W=W[:batch_size], H=H) + + assert mbnmf1.n_steps_ == mbnmf2.n_steps_ + assert_allclose(mbnmf1.components_, mbnmf2.components_) + + +def test_feature_names_out(): + """Check feature names out for NMF.""" + random_state = np.random.RandomState(0) + X = np.abs(random_state.randn(10, 4)) + nmf = NMF(n_components=3).fit(X) + + names = nmf.get_feature_names_out() + assert_array_equal([f"nmf{i}" for i in range(3)], names) + + +def test_minibatch_nmf_verbose(): + # Check verbose mode of MiniBatchNMF for better coverage. + A = np.random.RandomState(0).random_sample((100, 10)) + nmf = MiniBatchNMF(tol=1e-2, random_state=0, verbose=1) + old_stdout = sys.stdout + sys.stdout = StringIO() + try: + nmf.fit(A) + finally: + sys.stdout = old_stdout + + +@pytest.mark.parametrize("Estimator", [NMF, MiniBatchNMF]) +def test_nmf_n_components_auto(Estimator): + # Check that n_components is correctly inferred + # from the provided custom initialization. + rng = np.random.RandomState(0) + X = rng.random_sample((6, 5)) + W = rng.random_sample((6, 2)) + H = rng.random_sample((2, 5)) + est = Estimator( + n_components="auto", + init="custom", + random_state=0, + tol=1e-6, + ) + est.fit_transform(X, W=W, H=H) + assert est._n_components == H.shape[0] + + +def test_nmf_non_negative_factorization_n_components_auto(): + # Check that n_components is correctly inferred from the provided + # custom initialization. + rng = np.random.RandomState(0) + X = rng.random_sample((6, 5)) + W_init = rng.random_sample((6, 2)) + H_init = rng.random_sample((2, 5)) + W, H, _ = non_negative_factorization( + X, W=W_init, H=H_init, init="custom", n_components="auto" + ) + assert H.shape == H_init.shape + assert W.shape == W_init.shape + + +def test_nmf_n_components_auto_no_h_update(): + # Tests that non_negative_factorization does not fail when setting + # n_components="auto" also tests that the inferred n_component + # value is the right one. 
+ rng = np.random.RandomState(0) + X = rng.random_sample((6, 5)) + H_true = rng.random_sample((2, 5)) + W, H, _ = non_negative_factorization( + X, H=H_true, n_components="auto", update_H=False + ) # should not fail + assert_allclose(H, H_true) + assert W.shape == (X.shape[0], H_true.shape[0]) + + +def test_nmf_w_h_not_used_warning(): + # Check that warnings are raised if user provided W and H are not used + # and initialization overrides value of W or H + rng = np.random.RandomState(0) + X = rng.random_sample((6, 5)) + W_init = rng.random_sample((6, 2)) + H_init = rng.random_sample((2, 5)) + with pytest.warns( + RuntimeWarning, + match="When init!='custom', provided W or H are ignored", + ): + non_negative_factorization(X, H=H_init, update_H=True, n_components="auto") + + with pytest.warns( + RuntimeWarning, + match="When init!='custom', provided W or H are ignored", + ): + non_negative_factorization( + X, W=W_init, H=H_init, update_H=True, n_components="auto" + ) + + with pytest.warns( + RuntimeWarning, match="When update_H=False, the provided initial W is not used." + ): + # When update_H is False, W is ignored regardless of init + # TODO: use the provided W when init="custom". + non_negative_factorization( + X, W=W_init, H=H_init, update_H=False, n_components="auto" + ) + + +def test_nmf_custom_init_shape_error(): + # Check that an informative error is raised when custom initialization does not + # have the right shape + rng = np.random.RandomState(0) + X = rng.random_sample((6, 5)) + H = rng.random_sample((2, 5)) + nmf = NMF(n_components=2, init="custom", random_state=0) + + with pytest.raises(ValueError, match="Array with wrong first dimension passed"): + nmf.fit(X, H=H, W=rng.random_sample((5, 2))) + + with pytest.raises(ValueError, match="Array with wrong second dimension passed"): + nmf.fit(X, H=H, W=rng.random_sample((6, 3))) diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_online_lda.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_online_lda.py new file mode 100644 index 0000000000000000000000000000000000000000..c3dafa1912eba231d9e7a8aaeb719203483255ad --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_online_lda.py @@ -0,0 +1,482 @@ +import sys +from io import StringIO + +import numpy as np +import pytest +from numpy.testing import assert_array_equal +from scipy.linalg import block_diag +from scipy.special import psi + +from sklearn.decomposition import LatentDirichletAllocation +from sklearn.decomposition._online_lda_fast import ( + _dirichlet_expectation_1d, + _dirichlet_expectation_2d, +) +from sklearn.exceptions import NotFittedError +from sklearn.utils._testing import ( + assert_allclose, + assert_almost_equal, + assert_array_almost_equal, + if_safe_multiprocessing_with_blas, +) +from sklearn.utils.fixes import CSR_CONTAINERS + + +def _build_sparse_array(csr_container): + # Create 3 topics and each topic has 3 distinct words. + # (Each word only belongs to a single topic.) 
+ n_components = 3 + block = np.full((3, 3), n_components, dtype=int) + blocks = [block] * n_components + X = block_diag(*blocks) + X = csr_container(X) + return (n_components, X) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_lda_default_prior_params(csr_container): + # default prior parameter should be `1 / topics` + # and verbose params should not affect result + n_components, X = _build_sparse_array(csr_container) + prior = 1.0 / n_components + lda_1 = LatentDirichletAllocation( + n_components=n_components, + doc_topic_prior=prior, + topic_word_prior=prior, + random_state=0, + ) + lda_2 = LatentDirichletAllocation(n_components=n_components, random_state=0) + topic_distr_1 = lda_1.fit_transform(X) + topic_distr_2 = lda_2.fit_transform(X) + assert_almost_equal(topic_distr_1, topic_distr_2) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_lda_fit_batch(csr_container): + # Test LDA batch learning_offset (`fit` method with 'batch' learning) + rng = np.random.RandomState(0) + n_components, X = _build_sparse_array(csr_container) + lda = LatentDirichletAllocation( + n_components=n_components, + evaluate_every=1, + learning_method="batch", + random_state=rng, + ) + lda.fit(X) + + correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)] + for component in lda.components_: + # Find top 3 words in each LDA component + top_idx = set(component.argsort()[-3:][::-1]) + assert tuple(sorted(top_idx)) in correct_idx_grps + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_lda_fit_online(csr_container): + # Test LDA online learning (`fit` method with 'online' learning) + rng = np.random.RandomState(0) + n_components, X = _build_sparse_array(csr_container) + lda = LatentDirichletAllocation( + n_components=n_components, + learning_offset=10.0, + evaluate_every=1, + learning_method="online", + random_state=rng, + ) + lda.fit(X) + + correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)] + for component in lda.components_: + # Find top 3 words in each LDA component + top_idx = set(component.argsort()[-3:][::-1]) + assert tuple(sorted(top_idx)) in correct_idx_grps + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_lda_partial_fit(csr_container): + # Test LDA online learning (`partial_fit` method) + # (same as test_lda_batch) + rng = np.random.RandomState(0) + n_components, X = _build_sparse_array(csr_container) + lda = LatentDirichletAllocation( + n_components=n_components, + learning_offset=10.0, + total_samples=100, + random_state=rng, + ) + for i in range(3): + lda.partial_fit(X) + + correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)] + for c in lda.components_: + top_idx = set(c.argsort()[-3:][::-1]) + assert tuple(sorted(top_idx)) in correct_idx_grps + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_lda_dense_input(csr_container): + # Test LDA with dense input. + rng = np.random.RandomState(0) + n_components, X = _build_sparse_array(csr_container) + lda = LatentDirichletAllocation( + n_components=n_components, learning_method="batch", random_state=rng + ) + lda.fit(X.toarray()) + + correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)] + for component in lda.components_: + # Find top 3 words in each LDA component + top_idx = set(component.argsort()[-3:][::-1]) + assert tuple(sorted(top_idx)) in correct_idx_grps + + +def test_lda_transform(): + # Test LDA transform. 
+ # Transform result cannot be negative and should be normalized by default + rng = np.random.RandomState(0) + X = rng.randint(5, size=(20, 10)) + n_components = 3 + lda = LatentDirichletAllocation(n_components=n_components, random_state=rng) + X_trans = lda.fit_transform(X) + assert (X_trans > 0.0).any() + assert_array_almost_equal(np.sum(X_trans, axis=1), np.ones(X_trans.shape[0])) + + X_trans_unnormalized = lda.transform(X, normalize=False) + assert_array_almost_equal( + X_trans, X_trans_unnormalized / X_trans_unnormalized.sum(axis=1)[:, np.newaxis] + ) + + +@pytest.mark.parametrize("method", ("online", "batch")) +def test_lda_fit_transform(method): + # Test LDA fit_transform & transform + # fit_transform and transform result should be the same + rng = np.random.RandomState(0) + X = rng.randint(10, size=(50, 20)) + lda = LatentDirichletAllocation( + n_components=5, learning_method=method, random_state=rng + ) + X_fit = lda.fit_transform(X) + X_trans = lda.transform(X) + assert_array_almost_equal(X_fit, X_trans, 4) + + +def test_lda_negative_input(): + # test pass dense matrix with sparse negative input. + X = np.full((5, 10), -1.0) + lda = LatentDirichletAllocation() + regex = r"^Negative values in data passed" + with pytest.raises(ValueError, match=regex): + lda.fit(X) + + +def test_lda_no_component_error(): + # test `perplexity` before `fit` + rng = np.random.RandomState(0) + X = rng.randint(4, size=(20, 10)) + lda = LatentDirichletAllocation() + regex = ( + "This LatentDirichletAllocation instance is not fitted yet. " + "Call 'fit' with appropriate arguments before using this " + "estimator." + ) + with pytest.raises(NotFittedError, match=regex): + lda.perplexity(X) + + +@if_safe_multiprocessing_with_blas +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +@pytest.mark.parametrize("method", ("online", "batch")) +def test_lda_multi_jobs(method, csr_container): + n_components, X = _build_sparse_array(csr_container) + # Test LDA batch training with multi CPU + rng = np.random.RandomState(0) + lda = LatentDirichletAllocation( + n_components=n_components, + n_jobs=2, + learning_method=method, + evaluate_every=1, + random_state=rng, + ) + lda.fit(X) + + correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)] + for c in lda.components_: + top_idx = set(c.argsort()[-3:][::-1]) + assert tuple(sorted(top_idx)) in correct_idx_grps + + +@if_safe_multiprocessing_with_blas +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_lda_partial_fit_multi_jobs(csr_container): + # Test LDA online training with multi CPU + rng = np.random.RandomState(0) + n_components, X = _build_sparse_array(csr_container) + lda = LatentDirichletAllocation( + n_components=n_components, + n_jobs=2, + learning_offset=5.0, + total_samples=30, + random_state=rng, + ) + for i in range(2): + lda.partial_fit(X) + + correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)] + for c in lda.components_: + top_idx = set(c.argsort()[-3:][::-1]) + assert tuple(sorted(top_idx)) in correct_idx_grps + + +def test_lda_preplexity_mismatch(): + # test dimension mismatch in `perplexity` method + rng = np.random.RandomState(0) + n_components = rng.randint(3, 6) + n_samples = rng.randint(6, 10) + X = np.random.randint(4, size=(n_samples, 10)) + lda = LatentDirichletAllocation( + n_components=n_components, + learning_offset=5.0, + total_samples=20, + random_state=rng, + ) + lda.fit(X) + # invalid samples + invalid_n_samples = rng.randint(4, size=(n_samples + 1, n_components)) + with pytest.raises(ValueError, match=r"Number of 
samples"): + lda._perplexity_precomp_distr(X, invalid_n_samples) + # invalid topic number + invalid_n_components = rng.randint(4, size=(n_samples, n_components + 1)) + with pytest.raises(ValueError, match=r"Number of topics"): + lda._perplexity_precomp_distr(X, invalid_n_components) + + +@pytest.mark.parametrize("method", ("online", "batch")) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_lda_perplexity(method, csr_container): + # Test LDA perplexity for batch training + # perplexity should be lower after each iteration + n_components, X = _build_sparse_array(csr_container) + lda_1 = LatentDirichletAllocation( + n_components=n_components, + max_iter=1, + learning_method=method, + total_samples=100, + random_state=0, + ) + lda_2 = LatentDirichletAllocation( + n_components=n_components, + max_iter=10, + learning_method=method, + total_samples=100, + random_state=0, + ) + lda_1.fit(X) + perp_1 = lda_1.perplexity(X, sub_sampling=False) + + lda_2.fit(X) + perp_2 = lda_2.perplexity(X, sub_sampling=False) + assert perp_1 >= perp_2 + + perp_1_subsampling = lda_1.perplexity(X, sub_sampling=True) + perp_2_subsampling = lda_2.perplexity(X, sub_sampling=True) + assert perp_1_subsampling >= perp_2_subsampling + + +@pytest.mark.parametrize("method", ("online", "batch")) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_lda_score(method, csr_container): + # Test LDA score for batch training + # score should be higher after each iteration + n_components, X = _build_sparse_array(csr_container) + lda_1 = LatentDirichletAllocation( + n_components=n_components, + max_iter=1, + learning_method=method, + total_samples=100, + random_state=0, + ) + lda_2 = LatentDirichletAllocation( + n_components=n_components, + max_iter=10, + learning_method=method, + total_samples=100, + random_state=0, + ) + lda_1.fit_transform(X) + score_1 = lda_1.score(X) + + lda_2.fit_transform(X) + score_2 = lda_2.score(X) + assert score_2 >= score_1 + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_perplexity_input_format(csr_container): + # Test LDA perplexity for sparse and dense input + # score should be the same for both dense and sparse input + n_components, X = _build_sparse_array(csr_container) + lda = LatentDirichletAllocation( + n_components=n_components, + max_iter=1, + learning_method="batch", + total_samples=100, + random_state=0, + ) + lda.fit(X) + perp_1 = lda.perplexity(X) + perp_2 = lda.perplexity(X.toarray()) + assert_almost_equal(perp_1, perp_2) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_lda_score_perplexity(csr_container): + # Test the relationship between LDA score and perplexity + n_components, X = _build_sparse_array(csr_container) + lda = LatentDirichletAllocation( + n_components=n_components, max_iter=10, random_state=0 + ) + lda.fit(X) + perplexity_1 = lda.perplexity(X, sub_sampling=False) + + score = lda.score(X) + perplexity_2 = np.exp(-1.0 * (score / np.sum(X.data))) + assert_almost_equal(perplexity_1, perplexity_2) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_lda_fit_perplexity(csr_container): + # Test that the perplexity computed during fit is consistent with what is + # returned by the perplexity method + n_components, X = _build_sparse_array(csr_container) + lda = LatentDirichletAllocation( + n_components=n_components, + max_iter=1, + learning_method="batch", + random_state=0, + evaluate_every=1, + ) + lda.fit(X) + + # Perplexity computed at end of fit method + perplexity1 = lda.bound_ 
+ + # Result of perplexity method on the train set + perplexity2 = lda.perplexity(X) + + assert_almost_equal(perplexity1, perplexity2) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_lda_empty_docs(csr_container): + """Test LDA on empty document (all-zero rows).""" + Z = np.zeros((5, 4)) + for X in [Z, csr_container(Z)]: + lda = LatentDirichletAllocation(max_iter=750).fit(X) + assert_almost_equal( + lda.components_.sum(axis=0), np.ones(lda.components_.shape[1]) + ) + + +def test_dirichlet_expectation(): + """Test Cython version of Dirichlet expectation calculation.""" + x = np.logspace(-100, 10, 10000) + expectation = np.empty_like(x) + _dirichlet_expectation_1d(x, 0, expectation) + assert_allclose(expectation, np.exp(psi(x) - psi(np.sum(x))), atol=1e-19) + + x = x.reshape(100, 100) + assert_allclose( + _dirichlet_expectation_2d(x), + psi(x) - psi(np.sum(x, axis=1)[:, np.newaxis]), + rtol=1e-11, + atol=3e-9, + ) + + +def check_verbosity( + verbose, evaluate_every, expected_lines, expected_perplexities, csr_container +): + n_components, X = _build_sparse_array(csr_container) + lda = LatentDirichletAllocation( + n_components=n_components, + max_iter=3, + learning_method="batch", + verbose=verbose, + evaluate_every=evaluate_every, + random_state=0, + ) + out = StringIO() + old_out, sys.stdout = sys.stdout, out + try: + lda.fit(X) + finally: + sys.stdout = old_out + + n_lines = out.getvalue().count("\n") + n_perplexity = out.getvalue().count("perplexity") + assert expected_lines == n_lines + assert expected_perplexities == n_perplexity + + +@pytest.mark.parametrize( + "verbose,evaluate_every,expected_lines,expected_perplexities", + [ + (False, 1, 0, 0), + (False, 0, 0, 0), + (True, 0, 3, 0), + (True, 1, 3, 3), + (True, 2, 3, 1), + ], +) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_verbosity( + verbose, evaluate_every, expected_lines, expected_perplexities, csr_container +): + check_verbosity( + verbose, evaluate_every, expected_lines, expected_perplexities, csr_container + ) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_lda_feature_names_out(csr_container): + """Check feature names out for LatentDirichletAllocation.""" + n_components, X = _build_sparse_array(csr_container) + lda = LatentDirichletAllocation(n_components=n_components).fit(X) + + names = lda.get_feature_names_out() + assert_array_equal( + [f"latentdirichletallocation{i}" for i in range(n_components)], names + ) + + +@pytest.mark.parametrize("learning_method", ("batch", "online")) +def test_lda_dtype_match(learning_method, global_dtype): + """Check data type preservation of fitted attributes.""" + rng = np.random.RandomState(0) + X = rng.uniform(size=(20, 10)).astype(global_dtype, copy=False) + + lda = LatentDirichletAllocation( + n_components=5, random_state=0, learning_method=learning_method + ) + lda.fit(X) + assert lda.components_.dtype == global_dtype + assert lda.exp_dirichlet_component_.dtype == global_dtype + + +@pytest.mark.parametrize("learning_method", ("batch", "online")) +def test_lda_numerical_consistency(learning_method, global_random_seed): + """Check numerical consistency between np.float32 and np.float64.""" + rng = np.random.RandomState(global_random_seed) + X64 = rng.uniform(size=(20, 10)) + X32 = X64.astype(np.float32) + + lda_64 = LatentDirichletAllocation( + n_components=5, random_state=global_random_seed, learning_method=learning_method + ).fit(X64) + lda_32 = LatentDirichletAllocation( + n_components=5, 
random_state=global_random_seed, learning_method=learning_method + ).fit(X32) + + assert_allclose(lda_32.components_, lda_64.components_) + assert_allclose(lda_32.transform(X32), lda_64.transform(X64)) diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_pca.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_pca.py new file mode 100644 index 0000000000000000000000000000000000000000..2b97138c4dea385b55fd91c6a1ec9f0d8298226b --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_pca.py @@ -0,0 +1,1154 @@ +import os +import re +import warnings + +import numpy as np +import pytest +import scipy as sp +from numpy.testing import assert_array_equal + +from sklearn import config_context, datasets +from sklearn.base import clone +from sklearn.datasets import load_iris, make_classification, make_low_rank_matrix +from sklearn.decomposition import PCA +from sklearn.decomposition._pca import _assess_dimension, _infer_dimension +from sklearn.utils._array_api import ( + _atol_for_type, + _convert_to_numpy, + _get_namespace_device_dtype_ids, + yield_namespace_device_dtype_combinations, +) +from sklearn.utils._array_api import device as array_device +from sklearn.utils._test_common.instance_generator import _get_check_estimator_ids +from sklearn.utils._testing import _array_api_for_tests, assert_allclose +from sklearn.utils.estimator_checks import ( + check_array_api_input_and_values, +) +from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS + +iris = datasets.load_iris() +PCA_SOLVERS = ["full", "covariance_eigh", "arpack", "randomized", "auto"] + +# `SPARSE_M` and `SPARSE_N` could be larger, but be aware: +# * SciPy's generation of random sparse matrix can be costly +# * A (SPARSE_M, SPARSE_N) dense array is allocated to compare against +SPARSE_M, SPARSE_N = 1000, 300 # arbitrary +SPARSE_MAX_COMPONENTS = min(SPARSE_M, SPARSE_N) + + +def _check_fitted_pca_close(pca1, pca2, rtol=1e-7, atol=1e-12): + assert_allclose(pca1.components_, pca2.components_, rtol=rtol, atol=atol) + assert_allclose( + pca1.explained_variance_, pca2.explained_variance_, rtol=rtol, atol=atol + ) + assert_allclose(pca1.singular_values_, pca2.singular_values_, rtol=rtol, atol=atol) + assert_allclose(pca1.mean_, pca2.mean_, rtol=rtol, atol=atol) + assert_allclose(pca1.noise_variance_, pca2.noise_variance_, rtol=rtol, atol=atol) + + assert pca1.n_components_ == pca2.n_components_ + assert pca1.n_samples_ == pca2.n_samples_ + assert pca1.n_features_in_ == pca2.n_features_in_ + + +@pytest.mark.parametrize("svd_solver", PCA_SOLVERS) +@pytest.mark.parametrize("n_components", range(1, iris.data.shape[1])) +def test_pca(svd_solver, n_components): + X = iris.data + pca = PCA(n_components=n_components, svd_solver=svd_solver) + + # check the shape of fit.transform + X_r = pca.fit(X).transform(X) + assert X_r.shape[1] == n_components + + # check the equivalence of fit.transform and fit_transform + X_r2 = pca.fit_transform(X) + assert_allclose(X_r, X_r2) + X_r = pca.transform(X) + assert_allclose(X_r, X_r2) + + # Test get_covariance and get_precision + cov = pca.get_covariance() + precision = pca.get_precision() + assert_allclose(np.dot(cov, precision), np.eye(X.shape[1]), atol=1e-12) + + +@pytest.mark.parametrize("density", [0.01, 0.1, 0.30]) +@pytest.mark.parametrize("n_components", [1, 2, 10]) +@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS) +@pytest.mark.parametrize("svd_solver", ["arpack", "covariance_eigh"]) 
+@pytest.mark.parametrize("scale", [1, 10, 100]) +def test_pca_sparse( + global_random_seed, svd_solver, sparse_container, n_components, density, scale +): + """Check that the results are the same for sparse and dense input.""" + + # Set atol in addition of the default rtol to account for the very wide range of + # result values (1e-8 to 1e0). + atol = 1e-12 + transform_atol = 1e-10 + + random_state = np.random.default_rng(global_random_seed) + X = sparse_container( + sp.sparse.random( + SPARSE_M, + SPARSE_N, + random_state=random_state, + density=density, + ) + ) + # Scale the data + vary the column means + scale_vector = random_state.random(X.shape[1]) * scale + X = X.multiply(scale_vector) + + pca = PCA( + n_components=n_components, + svd_solver=svd_solver, + random_state=global_random_seed, + ) + pca.fit(X) + + Xd = X.toarray() + pcad = PCA( + n_components=n_components, + svd_solver=svd_solver, + random_state=global_random_seed, + ) + pcad.fit(Xd) + + # Fitted attributes equality + _check_fitted_pca_close(pca, pcad, atol=atol) + + # Test transform + X2 = sparse_container( + sp.sparse.random( + SPARSE_M, + SPARSE_N, + random_state=random_state, + density=density, + ) + ) + X2d = X2.toarray() + + assert_allclose(pca.transform(X2), pca.transform(X2d), atol=transform_atol) + assert_allclose(pca.transform(X2), pcad.transform(X2d), atol=transform_atol) + + +@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS) +def test_pca_sparse_fit_transform(global_random_seed, sparse_container): + random_state = np.random.default_rng(global_random_seed) + X = sparse_container( + sp.sparse.random( + SPARSE_M, + SPARSE_N, + random_state=random_state, + density=0.01, + ) + ) + X2 = sparse_container( + sp.sparse.random( + SPARSE_M, + SPARSE_N, + random_state=random_state, + density=0.01, + ) + ) + + pca_fit = PCA(n_components=10, svd_solver="arpack", random_state=global_random_seed) + pca_fit_transform = PCA( + n_components=10, svd_solver="arpack", random_state=global_random_seed + ) + + pca_fit.fit(X) + transformed_X = pca_fit_transform.fit_transform(X) + + _check_fitted_pca_close(pca_fit, pca_fit_transform) + assert_allclose(transformed_X, pca_fit_transform.transform(X)) + assert_allclose(transformed_X, pca_fit.transform(X)) + assert_allclose(pca_fit.transform(X2), pca_fit_transform.transform(X2)) + + +@pytest.mark.parametrize("svd_solver", ["randomized", "full"]) +@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS) +def test_sparse_pca_solver_error(global_random_seed, svd_solver, sparse_container): + random_state = np.random.RandomState(global_random_seed) + X = sparse_container( + sp.sparse.random( + SPARSE_M, + SPARSE_N, + random_state=random_state, + ) + ) + pca = PCA(n_components=30, svd_solver=svd_solver) + error_msg_pattern = ( + 'PCA only support sparse inputs with the "arpack" and "covariance_eigh"' + f' solvers, while "{svd_solver}" was passed' + ) + with pytest.raises(TypeError, match=error_msg_pattern): + pca.fit(X) + + +@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS) +def test_sparse_pca_auto_arpack_singluar_values_consistency( + global_random_seed, sparse_container +): + """Check that "auto" and "arpack" solvers are equivalent for sparse inputs.""" + random_state = np.random.RandomState(global_random_seed) + X = sparse_container( + sp.sparse.random( + SPARSE_M, + SPARSE_N, + random_state=random_state, + ) + ) + pca_arpack = PCA(n_components=10, svd_solver="arpack").fit(X) + pca_auto = PCA(n_components=10, 
svd_solver="auto").fit(X) + assert_allclose(pca_arpack.singular_values_, pca_auto.singular_values_, rtol=5e-3) + + +def test_no_empty_slice_warning(): + # test if we avoid numpy warnings for computing over empty arrays + n_components = 10 + n_features = n_components + 2 # anything > n_comps triggered it in 0.16 + X = np.random.uniform(-1, 1, size=(n_components, n_features)) + pca = PCA(n_components=n_components) + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + pca.fit(X) + + +@pytest.mark.parametrize("copy", [True, False]) +@pytest.mark.parametrize("solver", PCA_SOLVERS) +def test_whitening(solver, copy): + # Check that PCA output has unit-variance + rng = np.random.RandomState(0) + n_samples = 100 + n_features = 80 + n_components = 30 + rank = 50 + + # some low rank data with correlated features + X = np.dot( + rng.randn(n_samples, rank), + np.dot(np.diag(np.linspace(10.0, 1.0, rank)), rng.randn(rank, n_features)), + ) + # the component-wise variance of the first 50 features is 3 times the + # mean component-wise variance of the remaining 30 features + X[:, :50] *= 3 + + assert X.shape == (n_samples, n_features) + + # the component-wise variance is thus highly varying: + assert X.std(axis=0).std() > 43.8 + + # whiten the data while projecting to the lower dim subspace + X_ = X.copy() # make sure we keep an original across iterations. + pca = PCA( + n_components=n_components, + whiten=True, + copy=copy, + svd_solver=solver, + random_state=0, + iterated_power=7, + ) + # test fit_transform + X_whitened = pca.fit_transform(X_.copy()) + assert X_whitened.shape == (n_samples, n_components) + X_whitened2 = pca.transform(X_) + assert_allclose(X_whitened, X_whitened2, rtol=5e-4) + + assert_allclose(X_whitened.std(ddof=1, axis=0), np.ones(n_components)) + assert_allclose(X_whitened.mean(axis=0), np.zeros(n_components), atol=1e-12) + + X_ = X.copy() + pca = PCA( + n_components=n_components, whiten=False, copy=copy, svd_solver=solver + ).fit(X_.copy()) + X_unwhitened = pca.transform(X_) + assert X_unwhitened.shape == (n_samples, n_components) + + # in that case the output components still have varying variances + assert X_unwhitened.std(axis=0).std() == pytest.approx(74.1, rel=1e-1) + # we always center, so no test for non-centering. + + +@pytest.mark.parametrize( + "other_svd_solver", sorted(list(set(PCA_SOLVERS) - {"full", "auto"})) +) +@pytest.mark.parametrize("data_shape", ["tall", "wide"]) +@pytest.mark.parametrize("rank_deficient", [False, True]) +@pytest.mark.parametrize("whiten", [False, True]) +def test_pca_solver_equivalence( + other_svd_solver, + data_shape, + rank_deficient, + whiten, + global_random_seed, + global_dtype, +): + if data_shape == "tall": + n_samples, n_features = 100, 30 + else: + n_samples, n_features = 30, 100 + n_samples_test = 10 + + if rank_deficient: + rng = np.random.default_rng(global_random_seed) + rank = min(n_samples, n_features) // 2 + X = rng.standard_normal( + size=(n_samples + n_samples_test, rank) + ) @ rng.standard_normal(size=(rank, n_features)) + else: + X = make_low_rank_matrix( + n_samples=n_samples + n_samples_test, + n_features=n_features, + tail_strength=0.5, + random_state=global_random_seed, + ) + # With a non-zero tail strength, the data is actually full-rank. 
+ rank = min(n_samples, n_features) + + X = X.astype(global_dtype, copy=False) + X_train, X_test = X[:n_samples], X[n_samples:] + + if global_dtype == np.float32: + tols = dict(atol=3e-2, rtol=1e-5) + variance_threshold = 1e-5 + else: + tols = dict(atol=1e-10, rtol=1e-12) + variance_threshold = 1e-12 + + extra_other_kwargs = {} + if other_svd_solver == "randomized": + # Only check for a truncated result with a large number of iterations + # to make sure that we can recover precise results. + n_components = 10 + extra_other_kwargs = {"iterated_power": 50} + elif other_svd_solver == "arpack": + # Test all components except the last one which cannot be estimated by + # arpack. + n_components = np.minimum(n_samples, n_features) - 1 + else: + # Test all components to high precision. + n_components = None + + pca_full = PCA(n_components=n_components, svd_solver="full", whiten=whiten) + pca_other = PCA( + n_components=n_components, + svd_solver=other_svd_solver, + whiten=whiten, + random_state=global_random_seed, + **extra_other_kwargs, + ) + X_trans_full_train = pca_full.fit_transform(X_train) + assert np.isfinite(X_trans_full_train).all() + assert X_trans_full_train.dtype == global_dtype + X_trans_other_train = pca_other.fit_transform(X_train) + assert np.isfinite(X_trans_other_train).all() + assert X_trans_other_train.dtype == global_dtype + + assert (pca_full.explained_variance_ >= 0).all() + assert_allclose(pca_full.explained_variance_, pca_other.explained_variance_, **tols) + assert_allclose( + pca_full.explained_variance_ratio_, + pca_other.explained_variance_ratio_, + **tols, + ) + reference_components = pca_full.components_ + assert np.isfinite(reference_components).all() + other_components = pca_other.components_ + assert np.isfinite(other_components).all() + + # For some choice of n_components and data distribution, some components + # might be pure noise, let's ignore them in the comparison: + stable = pca_full.explained_variance_ > variance_threshold + assert stable.sum() > 1 + assert_allclose(reference_components[stable], other_components[stable], **tols) + + # As a result the output of fit_transform should be the same: + assert_allclose( + X_trans_other_train[:, stable], X_trans_full_train[:, stable], **tols + ) + + # And similarly for the output of transform on new data (except for the + # last component that can be underdetermined): + X_trans_full_test = pca_full.transform(X_test) + assert np.isfinite(X_trans_full_test).all() + assert X_trans_full_test.dtype == global_dtype + X_trans_other_test = pca_other.transform(X_test) + assert np.isfinite(X_trans_other_test).all() + assert X_trans_other_test.dtype == global_dtype + assert_allclose(X_trans_other_test[:, stable], X_trans_full_test[:, stable], **tols) + + # Check that inverse transform reconstructions for both solvers are + # compatible. + X_recons_full_test = pca_full.inverse_transform(X_trans_full_test) + assert np.isfinite(X_recons_full_test).all() + assert X_recons_full_test.dtype == global_dtype + X_recons_other_test = pca_other.inverse_transform(X_trans_other_test) + assert np.isfinite(X_recons_other_test).all() + assert X_recons_other_test.dtype == global_dtype + + if pca_full.components_.shape[0] == pca_full.components_.shape[1]: + # In this case, the models should have learned the same invertible + # transform. They should therefore both be able to reconstruct the test + # data. 
+ assert_allclose(X_recons_full_test, X_test, **tols) + assert_allclose(X_recons_other_test, X_test, **tols) + elif pca_full.components_.shape[0] < rank: + # In the absence of noisy components, both models should be able to + # reconstruct the same low-rank approximation of the original data. + assert pca_full.explained_variance_.min() > variance_threshold + assert_allclose(X_recons_full_test, X_recons_other_test, **tols) + else: + # When n_features > n_samples and n_components is larger than the rank + # of the training set, the output of the `inverse_transform` function + # is ill-defined. We can only check that we reach the same fixed point + # after another round of transform: + assert_allclose( + pca_full.transform(X_recons_full_test)[:, stable], + pca_other.transform(X_recons_other_test)[:, stable], + **tols, + ) + + +@pytest.mark.parametrize( + "X", + [ + np.random.RandomState(0).randn(100, 80), + datasets.make_classification(100, 80, n_informative=78, random_state=0)[0], + np.random.RandomState(0).randn(10, 100), + ], + ids=["random-tall", "correlated-tall", "random-wide"], +) +@pytest.mark.parametrize("svd_solver", PCA_SOLVERS) +def test_pca_explained_variance_empirical(X, svd_solver): + pca = PCA(n_components=2, svd_solver=svd_solver, random_state=0) + X_pca = pca.fit_transform(X) + assert_allclose(pca.explained_variance_, np.var(X_pca, ddof=1, axis=0)) + + expected_result = np.linalg.eig(np.cov(X, rowvar=False))[0] + expected_result = sorted(expected_result, reverse=True)[:2] + assert_allclose(pca.explained_variance_, expected_result, rtol=5e-3) + + +@pytest.mark.parametrize("svd_solver", ["arpack", "randomized"]) +def test_pca_singular_values_consistency(svd_solver): + rng = np.random.RandomState(0) + n_samples, n_features = 100, 80 + X = rng.randn(n_samples, n_features) + + pca_full = PCA(n_components=2, svd_solver="full", random_state=rng) + pca_other = PCA(n_components=2, svd_solver=svd_solver, random_state=rng) + + pca_full.fit(X) + pca_other.fit(X) + + assert_allclose(pca_full.singular_values_, pca_other.singular_values_, rtol=5e-3) + + +@pytest.mark.parametrize("svd_solver", PCA_SOLVERS) +def test_pca_singular_values(svd_solver): + rng = np.random.RandomState(0) + n_samples, n_features = 100, 80 + X = rng.randn(n_samples, n_features) + + pca = PCA(n_components=2, svd_solver=svd_solver, random_state=rng) + X_trans = pca.fit_transform(X) + + # compare to the Frobenius norm + assert_allclose( + np.sum(pca.singular_values_**2), np.linalg.norm(X_trans, "fro") ** 2 + ) + # Compare to the 2-norms of the score vectors + assert_allclose(pca.singular_values_, np.sqrt(np.sum(X_trans**2, axis=0))) + + # set the singular values and see what we get back + n_samples, n_features = 100, 110 + X = rng.randn(n_samples, n_features) + + pca = PCA(n_components=3, svd_solver=svd_solver, random_state=rng) + X_trans = pca.fit_transform(X) + X_trans /= np.sqrt(np.sum(X_trans**2, axis=0)) + X_trans[:, 0] *= 3.142 + X_trans[:, 1] *= 2.718 + X_hat = np.dot(X_trans, pca.components_) + pca.fit(X_hat) + assert_allclose(pca.singular_values_, [3.142, 2.718, 1.0]) + + +@pytest.mark.parametrize("svd_solver", PCA_SOLVERS) +def test_pca_check_projection(svd_solver): + # Test that the projection of data is correct + rng = np.random.RandomState(0) + n, p = 100, 3 + X = rng.randn(n, p) * 0.1 + X[:10] += np.array([3, 4, 5]) + Xt = 0.1 * rng.randn(1, p) + np.array([3, 4, 5]) + + Yt = PCA(n_components=2, svd_solver=svd_solver).fit(X).transform(Xt) + Yt /= np.sqrt((Yt**2).sum()) + + assert_allclose(np.abs(Yt[0][0]),
1.0, rtol=5e-3) + + +@pytest.mark.parametrize("svd_solver", PCA_SOLVERS) +def test_pca_check_projection_list(svd_solver): + # Test that the projection of data is correct + X = [[1.0, 0.0], [0.0, 1.0]] + pca = PCA(n_components=1, svd_solver=svd_solver, random_state=0) + X_trans = pca.fit_transform(X) + assert X_trans.shape == (2, 1) + assert_allclose(X_trans.mean(), 0.00, atol=1e-12) + assert_allclose(X_trans.std(), 0.71, rtol=5e-3) + + +@pytest.mark.parametrize("svd_solver", ["full", "arpack", "randomized"]) +@pytest.mark.parametrize("whiten", [False, True]) +def test_pca_inverse(svd_solver, whiten): + # Test that the projection of data can be inverted + rng = np.random.RandomState(0) + n, p = 50, 3 + X = rng.randn(n, p) # spherical data + X[:, 1] *= 0.00001 # make middle component relatively small + X += [5, 4, 3] # make a large mean + + # same check that we can find the original data from the transformed + # signal (since the data is almost of rank n_components) + pca = PCA(n_components=2, svd_solver=svd_solver, whiten=whiten).fit(X) + Y = pca.transform(X) + Y_inverse = pca.inverse_transform(Y) + assert_allclose(X, Y_inverse, rtol=5e-6) + + +@pytest.mark.parametrize( + "data", [np.array([[0, 1, 0], [1, 0, 0]]), np.array([[0, 1, 0], [1, 0, 0]]).T] +) +@pytest.mark.parametrize( + "svd_solver, n_components, err_msg", + [ + ("arpack", 0, r"must be between 1 and min\(n_samples, n_features\)"), + ("randomized", 0, r"must be between 1 and min\(n_samples, n_features\)"), + ("arpack", 2, r"must be strictly less than min"), + ( + "auto", + 3, + ( + r"n_components=3 must be between 0 and min\(n_samples, " + r"n_features\)=2 with svd_solver='full'" + ), + ), + ], +) +def test_pca_validation(svd_solver, data, n_components, err_msg): + # Ensures that solver-specific extreme inputs for the n_components + # parameter raise errors + smallest_d = 2 # The smallest dimension + pca_fitted = PCA(n_components, svd_solver=svd_solver) + + with pytest.raises(ValueError, match=err_msg): + pca_fitted.fit(data) + + # Additional case for arpack + if svd_solver == "arpack": + n_components = smallest_d + + err_msg = ( + "n_components={}L? must be strictly less than " + r"min\(n_samples, n_features\)={}L?
with " + "svd_solver='arpack'".format(n_components, smallest_d) + ) + with pytest.raises(ValueError, match=err_msg): + PCA(n_components, svd_solver=svd_solver).fit(data) + + +@pytest.mark.parametrize( + "solver, n_components_", + [ + ("full", min(iris.data.shape)), + ("arpack", min(iris.data.shape) - 1), + ("randomized", min(iris.data.shape)), + ], +) +@pytest.mark.parametrize("data", [iris.data, iris.data.T]) +def test_n_components_none(data, solver, n_components_): + pca = PCA(svd_solver=solver) + pca.fit(data) + assert pca.n_components_ == n_components_ + + +@pytest.mark.parametrize("svd_solver", ["auto", "full"]) +def test_n_components_mle(svd_solver): + # Ensure that n_components == 'mle' doesn't raise error for auto/full + rng = np.random.RandomState(0) + n_samples, n_features = 600, 10 + X = rng.randn(n_samples, n_features) + pca = PCA(n_components="mle", svd_solver=svd_solver) + pca.fit(X) + assert pca.n_components_ == 1 + + +@pytest.mark.parametrize("svd_solver", ["arpack", "randomized"]) +def test_n_components_mle_error(svd_solver): + # Ensure that n_components == 'mle' will raise an error for unsupported + # solvers + rng = np.random.RandomState(0) + n_samples, n_features = 600, 10 + X = rng.randn(n_samples, n_features) + pca = PCA(n_components="mle", svd_solver=svd_solver) + err_msg = "n_components='mle' cannot be a string with svd_solver='{}'".format( + svd_solver + ) + with pytest.raises(ValueError, match=err_msg): + pca.fit(X) + + +def test_pca_dim(): + # Check automated dimensionality setting + rng = np.random.RandomState(0) + n, p = 100, 5 + X = rng.randn(n, p) * 0.1 + X[:10] += np.array([3, 4, 5, 1, 2]) + pca = PCA(n_components="mle", svd_solver="full").fit(X) + assert pca.n_components == "mle" + assert pca.n_components_ == 1 + + +def test_infer_dim_1(): + # TODO: explain what this is testing + # Or at least use explicit variable names... + n, p = 1000, 5 + rng = np.random.RandomState(0) + X = ( + rng.randn(n, p) * 0.1 + + rng.randn(n, 1) * np.array([3, 4, 5, 1, 2]) + + np.array([1, 0, 7, 4, 6]) + ) + pca = PCA(n_components=p, svd_solver="full") + pca.fit(X) + spect = pca.explained_variance_ + ll = np.array([_assess_dimension(spect, k, n) for k in range(1, p)]) + assert ll[1] > ll.max() - 0.01 * n + + +def test_infer_dim_2(): + # TODO: explain what this is testing + # Or at least use explicit variable names... 
+ n, p = 1000, 5 + rng = np.random.RandomState(0) + X = rng.randn(n, p) * 0.1 + X[:10] += np.array([3, 4, 5, 1, 2]) + X[10:20] += np.array([6, 0, 7, 2, -1]) + pca = PCA(n_components=p, svd_solver="full") + pca.fit(X) + spect = pca.explained_variance_ + assert _infer_dimension(spect, n) > 1 + + +def test_infer_dim_3(): + n, p = 100, 5 + rng = np.random.RandomState(0) + X = rng.randn(n, p) * 0.1 + X[:10] += np.array([3, 4, 5, 1, 2]) + X[10:20] += np.array([6, 0, 7, 2, -1]) + X[30:40] += 2 * np.array([-1, 1, -1, 1, -1]) + pca = PCA(n_components=p, svd_solver="full") + pca.fit(X) + spect = pca.explained_variance_ + assert _infer_dimension(spect, n) > 2 + + +@pytest.mark.parametrize( + "X, n_components, n_components_validated", + [ + (iris.data, 0.95, 2), # row > col + (iris.data, 0.01, 1), # row > col + (np.random.RandomState(0).rand(5, 20), 0.5, 2), + ], # row < col +) +def test_infer_dim_by_explained_variance(X, n_components, n_components_validated): + pca = PCA(n_components=n_components, svd_solver="full") + pca.fit(X) + assert pca.n_components == pytest.approx(n_components) + assert pca.n_components_ == n_components_validated + + +@pytest.mark.parametrize("svd_solver", PCA_SOLVERS) +def test_pca_score(svd_solver): + # Test that probabilistic PCA scoring yields a reasonable score + n, p = 1000, 3 + rng = np.random.RandomState(0) + X = rng.randn(n, p) * 0.1 + np.array([3, 4, 5]) + pca = PCA(n_components=2, svd_solver=svd_solver) + pca.fit(X) + + ll1 = pca.score(X) + h = -0.5 * np.log(2 * np.pi * np.exp(1) * 0.1**2) * p + assert_allclose(ll1 / h, 1, rtol=5e-2) + + ll2 = pca.score(rng.randn(n, p) * 0.2 + np.array([3, 4, 5])) + assert ll1 > ll2 + + pca = PCA(n_components=2, whiten=True, svd_solver=svd_solver) + pca.fit(X) + ll2 = pca.score(X) + assert ll1 > ll2 + + +def test_pca_score3(): + # Check that probabilistic PCA selects the right model + n, p = 200, 3 + rng = np.random.RandomState(0) + Xl = rng.randn(n, p) + rng.randn(n, 1) * np.array([3, 4, 5]) + np.array([1, 0, 7]) + Xt = rng.randn(n, p) + rng.randn(n, 1) * np.array([3, 4, 5]) + np.array([1, 0, 7]) + ll = np.zeros(p) + for k in range(p): + pca = PCA(n_components=k, svd_solver="full") + pca.fit(Xl) + ll[k] = pca.score(Xt) + + assert ll.argmax() == 1 + + +@pytest.mark.parametrize("svd_solver", PCA_SOLVERS) +def test_pca_sanity_noise_variance(svd_solver): + # Sanity check for the noise_variance_. 
For more details see + # https://github.com/scikit-learn/scikit-learn/issues/7568 + # https://github.com/scikit-learn/scikit-learn/issues/8541 + # https://github.com/scikit-learn/scikit-learn/issues/8544 + X, _ = datasets.load_digits(return_X_y=True) + pca = PCA(n_components=30, svd_solver=svd_solver, random_state=0) + pca.fit(X) + assert np.all((pca.explained_variance_ - pca.noise_variance_) >= 0) + + +@pytest.mark.parametrize("svd_solver", ["arpack", "randomized"]) +def test_pca_score_consistency_solvers(svd_solver): + # Check the consistency of score between solvers + X, _ = datasets.load_digits(return_X_y=True) + pca_full = PCA(n_components=30, svd_solver="full", random_state=0) + pca_other = PCA(n_components=30, svd_solver=svd_solver, random_state=0) + pca_full.fit(X) + pca_other.fit(X) + assert_allclose(pca_full.score(X), pca_other.score(X), rtol=5e-6) + + +# arpack raises ValueError for n_components == min(n_samples, n_features) +@pytest.mark.parametrize("svd_solver", ["full", "randomized"]) +def test_pca_zero_noise_variance_edge_cases(svd_solver): + # ensure that noise_variance_ is 0 in edge cases + # when n_components == min(n_samples, n_features) + n, p = 100, 3 + rng = np.random.RandomState(0) + X = rng.randn(n, p) * 0.1 + np.array([3, 4, 5]) + + pca = PCA(n_components=p, svd_solver=svd_solver) + pca.fit(X) + assert pca.noise_variance_ == 0 + # Non-regression test for gh-12489 + # ensure no divide-by-zero error for n_components == n_features < n_samples + pca.score(X) + + pca.fit(X.T) + assert pca.noise_variance_ == 0 + # Non-regression test for gh-12489 + # ensure no divide-by-zero error for n_components == n_samples < n_features + pca.score(X.T) + + +@pytest.mark.parametrize( + "n_samples, n_features, n_components, expected_solver", + [ + # case: n_samples < 10 * n_features and max(X.shape) <= 500 => 'full' + (10, 50, 5, "full"), + # case: n_samples > 10 * n_features and n_features < 500 => 'covariance_eigh' + (1000, 50, 50, "covariance_eigh"), + # case: n_components >= .8 * min(X.shape) => 'full' + (1000, 500, 400, "full"), + # n_components >= 1 and n_components < .8*min(X.shape) => 'randomized' + (1000, 500, 10, "randomized"), + # case: n_components in (0,1) => 'full' + (1000, 500, 0.5, "full"), + ], +) +def test_pca_svd_solver_auto(n_samples, n_features, n_components, expected_solver): + data = np.random.RandomState(0).uniform(size=(n_samples, n_features)) + pca_auto = PCA(n_components=n_components, random_state=0) + pca_test = PCA( + n_components=n_components, svd_solver=expected_solver, random_state=0 + ) + pca_auto.fit(data) + assert pca_auto._fit_svd_solver == expected_solver + pca_test.fit(data) + assert_allclose(pca_auto.components_, pca_test.components_) + + +@pytest.mark.parametrize("svd_solver", PCA_SOLVERS) +def test_pca_deterministic_output(svd_solver): + rng = np.random.RandomState(0) + X = rng.rand(10, 10) + + transformed_X = np.zeros((20, 2)) + for i in range(20): + pca = PCA(n_components=2, svd_solver=svd_solver, random_state=rng) + transformed_X[i, :] = pca.fit_transform(X)[0] + assert_allclose(transformed_X, np.tile(transformed_X[0, :], 20).reshape(20, 2)) + + +@pytest.mark.parametrize("svd_solver", PCA_SOLVERS) +def test_pca_dtype_preservation(svd_solver, global_random_seed): + check_pca_float_dtype_preservation(svd_solver, global_random_seed) + check_pca_int_dtype_upcast_to_double(svd_solver) + + +def check_pca_float_dtype_preservation(svd_solver, seed): + # Ensure that PCA does not upscale the dtype when input is float32 + X = 
np.random.RandomState(seed).rand(1000, 4) + X_float64 = X.astype(np.float64, copy=False) + X_float32 = X.astype(np.float32) + + pca_64 = PCA(n_components=3, svd_solver=svd_solver, random_state=seed).fit( + X_float64 + ) + pca_32 = PCA(n_components=3, svd_solver=svd_solver, random_state=seed).fit( + X_float32 + ) + + assert pca_64.components_.dtype == np.float64 + assert pca_32.components_.dtype == np.float32 + assert pca_64.transform(X_float64).dtype == np.float64 + assert pca_32.transform(X_float32).dtype == np.float32 + + # The atol and rtol are set such that the test passes for all random seeds + # on all supported platforms on our CI and conda-forge with the default + # random seed. + assert_allclose(pca_64.components_, pca_32.components_, rtol=1e-3, atol=1e-3) + + +def check_pca_int_dtype_upcast_to_double(svd_solver): + # Ensure that all int types will be upcast to float64 + X_i64 = np.random.RandomState(0).randint(0, 1000, (1000, 4)) + X_i64 = X_i64.astype(np.int64, copy=False) + X_i32 = X_i64.astype(np.int32, copy=False) + + pca_64 = PCA(n_components=3, svd_solver=svd_solver, random_state=0).fit(X_i64) + pca_32 = PCA(n_components=3, svd_solver=svd_solver, random_state=0).fit(X_i32) + + assert pca_64.components_.dtype == np.float64 + assert pca_32.components_.dtype == np.float64 + assert pca_64.transform(X_i64).dtype == np.float64 + assert pca_32.transform(X_i32).dtype == np.float64 + + assert_allclose(pca_64.components_, pca_32.components_, rtol=1e-4) + + +def test_pca_n_components_mostly_explained_variance_ratio(): + # when n_components is the second highest cumulative sum of the + # explained_variance_ratio_, then n_components_ should equal the + # number of features in the dataset #15669 + X, y = load_iris(return_X_y=True) + pca1 = PCA().fit(X, y) + + n_components = pca1.explained_variance_ratio_.cumsum()[-2] + pca2 = PCA(n_components=n_components).fit(X, y) + assert pca2.n_components_ == X.shape[1] + + +def test_assess_dimension_bad_rank(): + # Test error when tested rank not in [1, n_features - 1] + spectrum = np.array([1, 1e-30, 1e-30, 1e-30]) + n_samples = 10 + for rank in (0, 5): + with pytest.raises(ValueError, match=r"should be in \[1, n_features - 1\]"): + _assess_dimension(spectrum, rank, n_samples) + + +def test_small_eigenvalues_mle(): + # Test rank associated with tiny eigenvalues are given a log-likelihood of + # -inf. 
The inferred rank will be 1 + spectrum = np.array([1, 1e-30, 1e-30, 1e-30]) + + assert _assess_dimension(spectrum, rank=1, n_samples=10) > -np.inf + + for rank in (2, 3): + assert _assess_dimension(spectrum, rank, 10) == -np.inf + + assert _infer_dimension(spectrum, 10) == 1 + + +def test_mle_redundant_data(): + # Test 'mle' with pathological X: only one relevant feature should give a + # rank of 1 + X, _ = datasets.make_classification( + n_features=20, + n_informative=1, + n_repeated=18, + n_redundant=1, + n_clusters_per_class=1, + random_state=42, + ) + pca = PCA(n_components="mle").fit(X) + assert pca.n_components_ == 1 + + +def test_fit_mle_too_few_samples(): + # Tests that an error is raised when the number of samples is smaller + # than the number of features during an mle fit + X, _ = datasets.make_classification(n_samples=20, n_features=21, random_state=42) + + pca = PCA(n_components="mle", svd_solver="full") + with pytest.raises( + ValueError, + match="n_components='mle' is only supported if n_samples >= n_features", + ): + pca.fit(X) + + +def test_mle_simple_case(): + # non-regression test for issue + # https://github.com/scikit-learn/scikit-learn/issues/16730 + n_samples, n_dim = 1000, 10 + X = np.random.RandomState(0).randn(n_samples, n_dim) + X[:, -1] = np.mean(X[:, :-1], axis=-1) # true X dim is ndim - 1 + pca_skl = PCA("mle", svd_solver="full") + pca_skl.fit(X) + assert pca_skl.n_components_ == n_dim - 1 + + +def test_assess_dimesion_rank_one(): + # Make sure assess_dimension works properly on a matrix of rank 1 + n_samples, n_features = 9, 6 + X = np.ones((n_samples, n_features)) # rank 1 matrix + _, s, _ = np.linalg.svd(X, full_matrices=True) + # except for rank 1, all eigenvalues are 0 resp. close to 0 (FP) + assert_allclose(s[1:], np.zeros(n_features - 1), atol=1e-12) + + assert np.isfinite(_assess_dimension(s, rank=1, n_samples=n_samples)) + for rank in range(2, n_features): + assert _assess_dimension(s, rank, n_samples) == -np.inf + + +def test_pca_randomized_svd_n_oversamples(): + """Check that exposing and setting `n_oversamples` will provide accurate results + even when `X` has a large number of features. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/20589 + """ + rng = np.random.RandomState(0) + n_features = 100 + X = rng.randn(1_000, n_features) + + # The default value of `n_oversamples` will lead to inaccurate results + # We force it to the number of features.
+ pca_randomized = PCA( + n_components=1, + svd_solver="randomized", + n_oversamples=n_features, + random_state=0, + ).fit(X) + pca_full = PCA(n_components=1, svd_solver="full").fit(X) + pca_arpack = PCA(n_components=1, svd_solver="arpack", random_state=0).fit(X) + + assert_allclose(np.abs(pca_full.components_), np.abs(pca_arpack.components_)) + assert_allclose(np.abs(pca_randomized.components_), np.abs(pca_arpack.components_)) + + +def test_feature_names_out(): + """Check feature names out for PCA.""" + pca = PCA(n_components=2).fit(iris.data) + + names = pca.get_feature_names_out() + assert_array_equal([f"pca{i}" for i in range(2)], names) + + +@pytest.mark.parametrize("copy", [True, False]) +def test_variance_correctness(copy): + """Check the accuracy of PCA's internal variance calculation""" + rng = np.random.RandomState(0) + X = rng.randn(1000, 200) + pca = PCA().fit(X) + pca_var = pca.explained_variance_ / pca.explained_variance_ratio_ + true_var = np.var(X, ddof=1, axis=0).sum() + np.testing.assert_allclose(pca_var, true_var) + + +def check_array_api_get_precision(name, estimator, array_namespace, device, dtype_name): + xp = _array_api_for_tests(array_namespace, device) + iris_np = iris.data.astype(dtype_name) + iris_xp = xp.asarray(iris_np, device=device) + + estimator.fit(iris_np) + precision_np = estimator.get_precision() + covariance_np = estimator.get_covariance() + + rtol = 2e-4 if iris_np.dtype == "float32" else 2e-7 + with config_context(array_api_dispatch=True): + estimator_xp = clone(estimator).fit(iris_xp) + precision_xp = estimator_xp.get_precision() + assert precision_xp.shape == (4, 4) + assert precision_xp.dtype == iris_xp.dtype + + assert_allclose( + _convert_to_numpy(precision_xp, xp=xp), + precision_np, + rtol=rtol, + atol=_atol_for_type(dtype_name), + ) + covariance_xp = estimator_xp.get_covariance() + assert covariance_xp.shape == (4, 4) + assert covariance_xp.dtype == iris_xp.dtype + + assert_allclose( + _convert_to_numpy(covariance_xp, xp=xp), + covariance_np, + rtol=rtol, + atol=_atol_for_type(dtype_name), + ) + + +@pytest.mark.parametrize( + "array_namespace, device, dtype_name", + yield_namespace_device_dtype_combinations(), + ids=_get_namespace_device_dtype_ids, +) +@pytest.mark.parametrize( + "check", + [check_array_api_input_and_values, check_array_api_get_precision], + ids=_get_check_estimator_ids, +) +@pytest.mark.parametrize( + "estimator", + [ + PCA(n_components=2, svd_solver="full"), + PCA(n_components=2, svd_solver="full", whiten=True), + PCA(n_components=0.1, svd_solver="full", whiten=True), + PCA(n_components=2, svd_solver="covariance_eigh"), + PCA(n_components=2, svd_solver="covariance_eigh", whiten=True), + PCA( + n_components=2, + svd_solver="randomized", + power_iteration_normalizer="QR", + random_state=0, # how to use global_random_seed here? + ), + ], + ids=_get_check_estimator_ids, +) +def test_pca_array_api_compliance( + estimator, check, array_namespace, device, dtype_name +): + name = estimator.__class__.__name__ + check(name, estimator, array_namespace, device=device, dtype_name=dtype_name) + + +@pytest.mark.parametrize( + "array_namespace, device, dtype_name", + yield_namespace_device_dtype_combinations(), + ids=_get_namespace_device_dtype_ids, +) +@pytest.mark.parametrize( + "check", + [check_array_api_get_precision], + ids=_get_check_estimator_ids, +) +@pytest.mark.parametrize( + "estimator", + [ + # PCA with mle cannot use check_array_api_input_and_values because of + # rounding errors in the noisy (low variance) components. 
Even checking + # the shape of the `components_` is problematic because the number of + # components depends on trimming threshold of the mle algorithm which + # can depend on device-specific rounding errors. + PCA(n_components="mle", svd_solver="full"), + ], + ids=_get_check_estimator_ids, +) +def test_pca_mle_array_api_compliance( + estimator, check, array_namespace, device, dtype_name +): + name = estimator.__class__.__name__ + check(name, estimator, array_namespace, device=device, dtype_name=dtype_name) + + # Simpler variant of the generic check_array_api_input checker tailored for + # the specific case of PCA with mle-trimmed components. + xp = _array_api_for_tests(array_namespace, device) + + X, y = make_classification(random_state=42) + X = X.astype(dtype_name, copy=False) + atol = _atol_for_type(X.dtype) + + est = clone(estimator) + + X_xp = xp.asarray(X, device=device) + y_xp = xp.asarray(y, device=device) + + est.fit(X, y) + + components_np = est.components_ + explained_variance_np = est.explained_variance_ + + est_xp = clone(est) + with config_context(array_api_dispatch=True): + est_xp.fit(X_xp, y_xp) + components_xp = est_xp.components_ + assert array_device(components_xp) == array_device(X_xp) + components_xp_np = _convert_to_numpy(components_xp, xp=xp) + + explained_variance_xp = est_xp.explained_variance_ + assert array_device(explained_variance_xp) == array_device(X_xp) + explained_variance_xp_np = _convert_to_numpy(explained_variance_xp, xp=xp) + + assert components_xp_np.dtype == components_np.dtype + assert components_xp_np.shape[1] == components_np.shape[1] + assert explained_variance_xp_np.dtype == explained_variance_np.dtype + + # Check that the explained variance values match for the + # common components: + min_components = min(components_xp_np.shape[0], components_np.shape[0]) + assert_allclose( + explained_variance_xp_np[:min_components], + explained_variance_np[:min_components], + atol=atol, + ) + + # If the number of components differ, check that the explained variance of + # the trimmed components is very small. + if components_xp_np.shape[0] != components_np.shape[0]: + reference_variance = explained_variance_np[-1] + extra_variance_np = explained_variance_np[min_components:] + extra_variance_xp_np = explained_variance_xp_np[min_components:] + assert all(np.abs(extra_variance_np - reference_variance) < atol) + assert all(np.abs(extra_variance_xp_np - reference_variance) < atol) + + +@pytest.mark.skipif( + os.environ.get("SCIPY_ARRAY_API") != "1", reason="SCIPY_ARRAY_API not set to 1." +) +def test_array_api_error_and_warnings_on_unsupported_params(): + xp = pytest.importorskip("array_api_strict") + iris_xp = xp.asarray(iris.data) + + pca = PCA(n_components=2, svd_solver="arpack", random_state=0) + expected_msg = re.escape( + "PCA with svd_solver='arpack' is not supported for Array API inputs." + ) + with pytest.raises(ValueError, match=expected_msg): + with config_context(array_api_dispatch=True): + pca.fit(iris_xp) + + pca.set_params(svd_solver="randomized", power_iteration_normalizer="LU") + expected_msg = re.escape( + "Array API does not support LU factorization. Set" + " `power_iteration_normalizer='QR'` instead." + ) + with pytest.raises(ValueError, match=expected_msg): + with config_context(array_api_dispatch=True): + pca.fit(iris_xp) + + pca.set_params(svd_solver="randomized", power_iteration_normalizer="auto") + expected_msg = re.escape( + "Array API does not support LU factorization, falling back to QR instead. 
Set" + " `power_iteration_normalizer='QR'` explicitly to silence this warning." + ) + with pytest.warns(UserWarning, match=expected_msg): + with config_context(array_api_dispatch=True): + pca.fit(iris_xp) diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_sparse_pca.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_sparse_pca.py new file mode 100644 index 0000000000000000000000000000000000000000..f8c71a5d0e752580dd90b6670804bf91b8ab0b72 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_sparse_pca.py @@ -0,0 +1,347 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + + +import numpy as np +import pytest +from numpy.testing import assert_array_equal + +from sklearn.datasets import make_low_rank_matrix +from sklearn.decomposition import PCA, MiniBatchSparsePCA, SparsePCA +from sklearn.utils import check_random_state +from sklearn.utils._testing import ( + assert_allclose, + assert_array_almost_equal, + if_safe_multiprocessing_with_blas, +) +from sklearn.utils.extmath import svd_flip + + +def generate_toy_data(n_components, n_samples, image_size, random_state=None): + n_features = image_size[0] * image_size[1] + + rng = check_random_state(random_state) + U = rng.randn(n_samples, n_components) + V = rng.randn(n_components, n_features) + + centers = [(3, 3), (6, 7), (8, 1)] + sz = [1, 2, 1] + for k in range(n_components): + img = np.zeros(image_size) + xmin, xmax = centers[k][0] - sz[k], centers[k][0] + sz[k] + ymin, ymax = centers[k][1] - sz[k], centers[k][1] + sz[k] + img[xmin:xmax][:, ymin:ymax] = 1.0 + V[k, :] = img.ravel() + + # Y is defined by : Y = UV + noise + Y = np.dot(U, V) + Y += 0.1 * rng.randn(Y.shape[0], Y.shape[1]) # Add noise + return Y, U, V + + +# SparsePCA can be a bit slow. 
To avoid having test times go up, we +# test different aspects of the code in the same test + + +def test_correct_shapes(): + rng = np.random.RandomState(0) + X = rng.randn(12, 10) + spca = SparsePCA(n_components=8, random_state=rng) + U = spca.fit_transform(X) + assert spca.components_.shape == (8, 10) + assert U.shape == (12, 8) + # test overcomplete decomposition + spca = SparsePCA(n_components=13, random_state=rng) + U = spca.fit_transform(X) + assert spca.components_.shape == (13, 10) + assert U.shape == (12, 13) + + +def test_fit_transform(global_random_seed): + alpha = 1 + rng = np.random.RandomState(global_random_seed) + Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng) # wide array + spca_lars = SparsePCA( + n_components=3, method="lars", alpha=alpha, random_state=global_random_seed + ) + spca_lars.fit(Y) + + # Test that CD gives similar results + spca_lasso = SparsePCA( + n_components=3, method="cd", random_state=global_random_seed, alpha=alpha + ) + spca_lasso.fit(Y) + assert_array_almost_equal(spca_lasso.components_, spca_lars.components_) + + +@if_safe_multiprocessing_with_blas +def test_fit_transform_parallel(global_random_seed): + alpha = 1 + rng = np.random.RandomState(global_random_seed) + Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng) # wide array + spca_lars = SparsePCA( + n_components=3, method="lars", alpha=alpha, random_state=global_random_seed + ) + spca_lars.fit(Y) + U1 = spca_lars.transform(Y) + # Test multiple CPUs + spca = SparsePCA( + n_components=3, + n_jobs=2, + method="lars", + alpha=alpha, + random_state=global_random_seed, + ).fit(Y) + U2 = spca.transform(Y) + assert not np.all(spca_lars.components_ == 0) + assert_array_almost_equal(U1, U2) + + +def test_transform_nan(global_random_seed): + # Test that SparsePCA won't return NaN when there is 0 feature in all + # samples. 
+ rng = np.random.RandomState(global_random_seed) + Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng) # wide array + Y[:, 0] = 0 + estimator = SparsePCA(n_components=8, random_state=global_random_seed) + assert not np.any(np.isnan(estimator.fit_transform(Y))) + + +def test_fit_transform_tall(global_random_seed): + rng = np.random.RandomState(global_random_seed) + Y, _, _ = generate_toy_data(3, 65, (8, 8), random_state=rng) # tall array + spca_lars = SparsePCA(n_components=3, method="lars", random_state=rng) + U1 = spca_lars.fit_transform(Y) + spca_lasso = SparsePCA(n_components=3, method="cd", random_state=rng) + U2 = spca_lasso.fit(Y).transform(Y) + assert_array_almost_equal(U1, U2) + + +def test_initialization(global_random_seed): + rng = np.random.RandomState(global_random_seed) + U_init = rng.randn(5, 3) + V_init = rng.randn(3, 4) + model = SparsePCA( + n_components=3, U_init=U_init, V_init=V_init, max_iter=0, random_state=rng + ) + model.fit(rng.randn(5, 4)) + + expected_components = V_init / np.linalg.norm(V_init, axis=1, keepdims=True) + expected_components = svd_flip(u=expected_components.T, v=None)[0].T + assert_allclose(model.components_, expected_components) + + +def test_mini_batch_correct_shapes(): + rng = np.random.RandomState(0) + X = rng.randn(12, 10) + pca = MiniBatchSparsePCA(n_components=8, max_iter=1, random_state=rng) + U = pca.fit_transform(X) + assert pca.components_.shape == (8, 10) + assert U.shape == (12, 8) + # test overcomplete decomposition + pca = MiniBatchSparsePCA(n_components=13, max_iter=1, random_state=rng) + U = pca.fit_transform(X) + assert pca.components_.shape == (13, 10) + assert U.shape == (12, 13) + + +def test_scaling_fit_transform(global_random_seed): + alpha = 1 + rng = np.random.RandomState(global_random_seed) + Y, _, _ = generate_toy_data(3, 1000, (8, 8), random_state=rng) + spca_lars = SparsePCA(n_components=3, method="lars", alpha=alpha, random_state=rng) + results_train = spca_lars.fit_transform(Y) + results_test = spca_lars.transform(Y[:10]) + assert_allclose(results_train[0], results_test[0]) + + +def test_pca_vs_spca(global_random_seed): + rng = np.random.RandomState(global_random_seed) + Y, _, _ = generate_toy_data(3, 1000, (8, 8), random_state=rng) + Z, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng) + spca = SparsePCA(alpha=0, ridge_alpha=0, n_components=2, random_state=rng) + pca = PCA(n_components=2, random_state=rng) + pca.fit(Y) + spca.fit(Y) + results_test_pca = pca.transform(Z) + results_test_spca = spca.transform(Z) + assert_allclose( + np.abs(spca.components_.dot(pca.components_.T)), np.eye(2), atol=1e-4 + ) + results_test_pca *= np.sign(results_test_pca[0, :]) + results_test_spca *= np.sign(results_test_spca[0, :]) + assert_allclose(results_test_pca, results_test_spca, atol=1e-4) + + +@pytest.mark.parametrize("SPCA", [SparsePCA, MiniBatchSparsePCA]) +@pytest.mark.parametrize("n_components", [None, 3]) +def test_spca_n_components_(SPCA, n_components): + rng = np.random.RandomState(0) + n_samples, n_features = 12, 10 + X = rng.randn(n_samples, n_features) + + model = SPCA(n_components=n_components).fit(X) + + if n_components is not None: + assert model.n_components_ == n_components + else: + assert model.n_components_ == n_features + + +@pytest.mark.parametrize("SPCA", (SparsePCA, MiniBatchSparsePCA)) +@pytest.mark.parametrize("method", ("lars", "cd")) +@pytest.mark.parametrize( + "data_type, expected_type", + ( + (np.float32, np.float32), + (np.float64, np.float64), + (np.int32, np.float64), + (np.int64, 
np.float64), + ), +) +def test_sparse_pca_dtype_match(SPCA, method, data_type, expected_type): + # Verify output matrix dtype + n_samples, n_features, n_components = 12, 10, 3 + rng = np.random.RandomState(0) + input_array = rng.randn(n_samples, n_features).astype(data_type) + model = SPCA(n_components=n_components, method=method) + transformed = model.fit_transform(input_array) + + assert transformed.dtype == expected_type + assert model.components_.dtype == expected_type + + +@pytest.mark.parametrize("SPCA", (SparsePCA, MiniBatchSparsePCA)) +@pytest.mark.parametrize("method", ("lars", "cd")) +def test_sparse_pca_numerical_consistency(SPCA, method, global_random_seed): + # Verify numerical consistency between np.float32 and np.float64 + n_samples, n_features, n_components = 20, 20, 5 + input_array = make_low_rank_matrix( + n_samples=n_samples, + n_features=n_features, + effective_rank=n_components, + random_state=global_random_seed, + ) + + model_32 = SPCA( + n_components=n_components, + method=method, + random_state=global_random_seed, + ) + transformed_32 = model_32.fit_transform(input_array.astype(np.float32)) + + model_64 = SPCA( + n_components=n_components, + method=method, + random_state=global_random_seed, + ) + transformed_64 = model_64.fit_transform(input_array.astype(np.float64)) + assert_allclose(transformed_64, transformed_32) + assert_allclose(model_64.components_, model_32.components_) + + +@pytest.mark.parametrize("SPCA", [SparsePCA, MiniBatchSparsePCA]) +def test_spca_feature_names_out(SPCA): + """Check feature names out for *SparsePCA.""" + rng = np.random.RandomState(0) + n_samples, n_features = 12, 10 + X = rng.randn(n_samples, n_features) + + model = SPCA(n_components=4).fit(X) + names = model.get_feature_names_out() + + estimator_name = SPCA.__name__.lower() + assert_array_equal([f"{estimator_name}{i}" for i in range(4)], names) + + +def test_spca_early_stopping(global_random_seed): + """Check that `tol` and `max_no_improvement` act as early stopping.""" + rng = np.random.RandomState(global_random_seed) + n_samples, n_features = 50, 10 + X = rng.randn(n_samples, n_features) + + # vary the tolerance to force the early stopping of one of the models + model_early_stopped = MiniBatchSparsePCA( + max_iter=100, tol=0.5, random_state=global_random_seed + ).fit(X) + model_not_early_stopped = MiniBatchSparsePCA( + max_iter=100, tol=1e-3, random_state=global_random_seed + ).fit(X) + assert model_early_stopped.n_iter_ < model_not_early_stopped.n_iter_ + + # force the max number of no improvement to a large value to check that + # it does help to early stop + model_early_stopped = MiniBatchSparsePCA( + max_iter=100, tol=1e-6, max_no_improvement=2, random_state=global_random_seed + ).fit(X) + model_not_early_stopped = MiniBatchSparsePCA( + max_iter=100, tol=1e-6, max_no_improvement=100, random_state=global_random_seed + ).fit(X) + assert model_early_stopped.n_iter_ < model_not_early_stopped.n_iter_ + + +def test_equivalence_components_pca_spca(global_random_seed): + """Check the equivalence of the components found by PCA and SparsePCA.
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/23932 + """ + rng = np.random.RandomState(global_random_seed) + X = rng.randn(50, 4) + + n_components = 2 + pca = PCA( + n_components=n_components, + svd_solver="randomized", + random_state=0, + ).fit(X) + spca = SparsePCA( + n_components=n_components, + method="lars", + ridge_alpha=0, + alpha=0, + random_state=0, + ).fit(X) + + assert_allclose(pca.components_, spca.components_) + + +def test_sparse_pca_inverse_transform(global_random_seed): + """Check that `inverse_transform` in `SparsePCA` and `PCA` are similar.""" + rng = np.random.RandomState(global_random_seed) + n_samples, n_features = 10, 5 + X = rng.randn(n_samples, n_features) + + n_components = 2 + spca = SparsePCA( + n_components=n_components, + alpha=1e-12, + ridge_alpha=1e-12, + random_state=global_random_seed, + ) + pca = PCA(n_components=n_components, random_state=global_random_seed) + X_trans_spca = spca.fit_transform(X) + X_trans_pca = pca.fit_transform(X) + assert_allclose( + spca.inverse_transform(X_trans_spca), pca.inverse_transform(X_trans_pca) + ) + + +@pytest.mark.parametrize("SPCA", [SparsePCA, MiniBatchSparsePCA]) +def test_transform_inverse_transform_round_trip(SPCA, global_random_seed): + """Check the `transform` and `inverse_transform` round trip with no loss of + information. + """ + rng = np.random.RandomState(global_random_seed) + n_samples, n_features = 10, 5 + X = rng.randn(n_samples, n_features) + + n_components = n_features + spca = SPCA( + n_components=n_components, + alpha=1e-12, + ridge_alpha=1e-12, + random_state=global_random_seed, + ) + X_trans_spca = spca.fit_transform(X) + assert_allclose(spca.inverse_transform(X_trans_spca), X) diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_truncated_svd.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_truncated_svd.py new file mode 100644 index 0000000000000000000000000000000000000000..07b35c873ee3e2faad40808bcf3337e81f78ff8a --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_truncated_svd.py @@ -0,0 +1,212 @@ +"""Test truncated SVD transformer.""" + +import numpy as np +import pytest +import scipy.sparse as sp + +from sklearn.decomposition import PCA, TruncatedSVD +from sklearn.utils import check_random_state +from sklearn.utils._testing import assert_allclose, assert_array_less + +SVD_SOLVERS = ["arpack", "randomized"] + + +@pytest.fixture(scope="module") +def X_sparse(): + # Make an X that looks somewhat like a small tf-idf matrix. + rng = check_random_state(42) + X = sp.random(60, 55, density=0.2, format="csr", random_state=rng) + X.data[:] = 1 + np.log(X.data) + return X + + +@pytest.mark.parametrize("solver", ["randomized"]) +@pytest.mark.parametrize("kind", ("dense", "sparse")) +def test_solvers(X_sparse, solver, kind): + X = X_sparse if kind == "sparse" else X_sparse.toarray() + svd_a = TruncatedSVD(30, algorithm="arpack") + svd = TruncatedSVD(30, algorithm=solver, random_state=42, n_oversamples=100) + + Xa = svd_a.fit_transform(X)[:, :6] + Xr = svd.fit_transform(X)[:, :6] + assert_allclose(Xa, Xr, rtol=2e-3) + + comp_a = np.abs(svd_a.components_) + comp = np.abs(svd.components_) + # All elements are equal, but some elements are more equal than others. 
+ assert_allclose(comp_a[:9], comp[:9], rtol=1e-3) + assert_allclose(comp_a[9:], comp[9:], atol=1e-2) + + +@pytest.mark.parametrize("n_components", (10, 25, 41, 55)) +def test_attributes(n_components, X_sparse): + n_features = X_sparse.shape[1] + tsvd = TruncatedSVD(n_components).fit(X_sparse) + assert tsvd.n_components == n_components + assert tsvd.components_.shape == (n_components, n_features) + + +@pytest.mark.parametrize( + "algorithm, n_components", + [ + ("arpack", 55), + ("arpack", 56), + ("randomized", 56), + ], +) +def test_too_many_components(X_sparse, algorithm, n_components): + tsvd = TruncatedSVD(n_components=n_components, algorithm=algorithm) + with pytest.raises(ValueError): + tsvd.fit(X_sparse) + + +@pytest.mark.parametrize("fmt", ("array", "csr", "csc", "coo", "lil")) +def test_sparse_formats(fmt, X_sparse): + n_samples = X_sparse.shape[0] + Xfmt = X_sparse.toarray() if fmt == "dense" else getattr(X_sparse, "to" + fmt)() + tsvd = TruncatedSVD(n_components=11) + Xtrans = tsvd.fit_transform(Xfmt) + assert Xtrans.shape == (n_samples, 11) + Xtrans = tsvd.transform(Xfmt) + assert Xtrans.shape == (n_samples, 11) + + +@pytest.mark.parametrize("algo", SVD_SOLVERS) +def test_inverse_transform(algo, X_sparse): + # We need a lot of components for the reconstruction to be "almost + # equal" in all positions. XXX Test means or sums instead? + tsvd = TruncatedSVD(n_components=52, random_state=42, algorithm=algo) + Xt = tsvd.fit_transform(X_sparse) + Xinv = tsvd.inverse_transform(Xt) + assert_allclose(Xinv, X_sparse.toarray(), rtol=1e-1, atol=2e-1) + + +def test_integers(X_sparse): + n_samples = X_sparse.shape[0] + Xint = X_sparse.astype(np.int64) + tsvd = TruncatedSVD(n_components=6) + Xtrans = tsvd.fit_transform(Xint) + assert Xtrans.shape == (n_samples, tsvd.n_components) + + +@pytest.mark.parametrize("kind", ("dense", "sparse")) +@pytest.mark.parametrize("n_components", [10, 20]) +@pytest.mark.parametrize("solver", SVD_SOLVERS) +def test_explained_variance(X_sparse, kind, n_components, solver): + X = X_sparse if kind == "sparse" else X_sparse.toarray() + svd = TruncatedSVD(n_components, algorithm=solver) + X_tr = svd.fit_transform(X) + # Assert that all the values are greater than 0 + assert_array_less(0.0, svd.explained_variance_ratio_) + + # Assert that total explained variance is less than 1 + assert_array_less(svd.explained_variance_ratio_.sum(), 1.0) + + # Test that explained_variance is correct + total_variance = np.var(X_sparse.toarray(), axis=0).sum() + variances = np.var(X_tr, axis=0) + true_explained_variance_ratio = variances / total_variance + + assert_allclose( + svd.explained_variance_ratio_, + true_explained_variance_ratio, + ) + + +@pytest.mark.parametrize("kind", ("dense", "sparse")) +@pytest.mark.parametrize("solver", SVD_SOLVERS) +def test_explained_variance_components_10_20(X_sparse, kind, solver): + X = X_sparse if kind == "sparse" else X_sparse.toarray() + svd_10 = TruncatedSVD(10, algorithm=solver, n_iter=10).fit(X) + svd_20 = TruncatedSVD(20, algorithm=solver, n_iter=10).fit(X) + + # Assert the 1st component is equal + assert_allclose( + svd_10.explained_variance_ratio_, + svd_20.explained_variance_ratio_[:10], + rtol=5e-3, + ) + + # Assert that 20 components has higher explained variance than 10 + assert ( + svd_20.explained_variance_ratio_.sum() > svd_10.explained_variance_ratio_.sum() + ) + + +@pytest.mark.parametrize("solver", SVD_SOLVERS) +def test_singular_values_consistency(solver, global_random_seed): + # Check that the TruncatedSVD output has the 
correct singular values + rng = np.random.RandomState(global_random_seed) + n_samples, n_features = 100, 80 + X = rng.randn(n_samples, n_features) + + pca = TruncatedSVD(n_components=2, algorithm=solver, random_state=rng).fit(X) + + # Compare to the Frobenius norm + X_pca = pca.transform(X) + assert_allclose( + np.sum(pca.singular_values_**2.0), + np.linalg.norm(X_pca, "fro") ** 2.0, + rtol=1e-2, + ) + + # Compare to the 2-norms of the score vectors + assert_allclose( + pca.singular_values_, np.sqrt(np.sum(X_pca**2.0, axis=0)), rtol=1e-2 + ) + + +@pytest.mark.parametrize("solver", SVD_SOLVERS) +def test_singular_values_expected(solver, global_random_seed): + # Set the singular values and see what we get back + rng = np.random.RandomState(global_random_seed) + n_samples = 100 + n_features = 110 + + X = rng.randn(n_samples, n_features) + + pca = TruncatedSVD(n_components=3, algorithm=solver, random_state=rng) + X_pca = pca.fit_transform(X) + + X_pca /= np.sqrt(np.sum(X_pca**2.0, axis=0)) + X_pca[:, 0] *= 3.142 + X_pca[:, 1] *= 2.718 + + X_hat_pca = np.dot(X_pca, pca.components_) + pca.fit(X_hat_pca) + assert_allclose(pca.singular_values_, [3.142, 2.718, 1.0], rtol=1e-14) + + +def test_truncated_svd_eq_pca(X_sparse): + # TruncatedSVD should be equal to PCA on centered data + + X_dense = X_sparse.toarray() + + X_c = X_dense - X_dense.mean(axis=0) + + params = dict(n_components=10, random_state=42) + + svd = TruncatedSVD(algorithm="arpack", **params) + pca = PCA(svd_solver="arpack", **params) + + Xt_svd = svd.fit_transform(X_c) + Xt_pca = pca.fit_transform(X_c) + + assert_allclose(Xt_svd, Xt_pca, rtol=1e-9) + assert_allclose(pca.mean_, 0, atol=1e-9) + assert_allclose(svd.components_, pca.components_) + + +@pytest.mark.parametrize( + "algorithm, tol", [("randomized", 0.0), ("arpack", 1e-6), ("arpack", 0.0)] +) +@pytest.mark.parametrize("kind", ("dense", "sparse")) +def test_fit_transform(X_sparse, algorithm, tol, kind): + # fit_transform(X) should equal fit(X).transform(X) + X = X_sparse if kind == "sparse" else X_sparse.toarray() + svd = TruncatedSVD( + n_components=5, n_iter=7, random_state=42, algorithm=algorithm, tol=tol + ) + X_transformed_1 = svd.fit_transform(X) + X_transformed_2 = svd.fit(X).transform(X) + assert_allclose(X_transformed_1, X_transformed_2) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..62a538d340318f3eeb745e77e1b13a1a5ea809af --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/__init__.py @@ -0,0 +1,45 @@ +"""Ensemble-based methods for classification, regression and anomaly detection.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ._bagging import BaggingClassifier, BaggingRegressor +from ._base import BaseEnsemble +from ._forest import ( + ExtraTreesClassifier, + ExtraTreesRegressor, + RandomForestClassifier, + RandomForestRegressor, + RandomTreesEmbedding, +) +from ._gb import GradientBoostingClassifier, GradientBoostingRegressor +from ._hist_gradient_boosting.gradient_boosting import ( + HistGradientBoostingClassifier, + HistGradientBoostingRegressor, +) +from ._iforest import IsolationForest +from ._stacking import StackingClassifier, StackingRegressor +from ._voting import VotingClassifier, VotingRegressor +from ._weight_boosting import AdaBoostClassifier, AdaBoostRegressor + +__all__ = [ + "AdaBoostClassifier", + "AdaBoostRegressor", + 
"BaggingClassifier", + "BaggingRegressor", + "BaseEnsemble", + "ExtraTreesClassifier", + "ExtraTreesRegressor", + "GradientBoostingClassifier", + "GradientBoostingRegressor", + "HistGradientBoostingClassifier", + "HistGradientBoostingRegressor", + "IsolationForest", + "RandomForestClassifier", + "RandomForestRegressor", + "RandomTreesEmbedding", + "StackingClassifier", + "StackingRegressor", + "VotingClassifier", + "VotingRegressor", +] diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_bagging.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_bagging.py new file mode 100644 index 0000000000000000000000000000000000000000..34b613b15281aa946fac14178afed662dbbf3449 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_bagging.py @@ -0,0 +1,1480 @@ +"""Bagging meta-estimator.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import itertools +import numbers +from abc import ABCMeta, abstractmethod +from functools import partial +from numbers import Integral +from warnings import warn + +import numpy as np + +from ..base import ClassifierMixin, RegressorMixin, _fit_context +from ..metrics import accuracy_score, r2_score +from ..tree import DecisionTreeClassifier, DecisionTreeRegressor +from ..utils import ( + Bunch, + _safe_indexing, + check_random_state, + column_or_1d, +) +from ..utils._mask import indices_to_mask +from ..utils._param_validation import HasMethods, Interval, RealNotInt +from ..utils._tags import get_tags +from ..utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + _raise_for_params, + _routing_enabled, + get_routing_for_object, + process_routing, +) +from ..utils.metaestimators import available_if +from ..utils.multiclass import check_classification_targets +from ..utils.parallel import Parallel, delayed +from ..utils.random import sample_without_replacement +from ..utils.validation import ( + _check_method_params, + _check_sample_weight, + _estimator_has, + check_is_fitted, + has_fit_parameter, + validate_data, +) +from ._base import BaseEnsemble, _partition_estimators + +__all__ = ["BaggingClassifier", "BaggingRegressor"] + +MAX_INT = np.iinfo(np.int32).max + + +def _generate_indices(random_state, bootstrap, n_population, n_samples): + """Draw randomly sampled indices.""" + # Draw sample indices + if bootstrap: + indices = random_state.randint(0, n_population, n_samples) + else: + indices = sample_without_replacement( + n_population, n_samples, random_state=random_state + ) + + return indices + + +def _generate_bagging_indices( + random_state, + bootstrap_features, + bootstrap_samples, + n_features, + n_samples, + max_features, + max_samples, +): + """Randomly draw feature and sample indices.""" + # Get valid random state + random_state = check_random_state(random_state) + + # Draw indices + feature_indices = _generate_indices( + random_state, bootstrap_features, n_features, max_features + ) + sample_indices = _generate_indices( + random_state, bootstrap_samples, n_samples, max_samples + ) + + return feature_indices, sample_indices + + +def _parallel_build_estimators( + n_estimators, + ensemble, + X, + y, + seeds, + total_n_estimators, + verbose, + check_input, + fit_params, +): + """Private function used to build a batch of estimators within a job.""" + # Retrieve settings + n_samples, n_features = X.shape + max_features = ensemble._max_features + max_samples = ensemble._max_samples + bootstrap = ensemble.bootstrap + bootstrap_features = ensemble.bootstrap_features + 
has_check_input = has_fit_parameter(ensemble.estimator_, "check_input") + requires_feature_indexing = bootstrap_features or max_features != n_features + + # Build estimators + estimators = [] + estimators_features = [] + + # TODO: (slep6) remove if condition for unrouted sample_weight when metadata + # routing can't be disabled. + support_sample_weight = has_fit_parameter(ensemble.estimator_, "sample_weight") + if not _routing_enabled() and ( + not support_sample_weight and fit_params.get("sample_weight") is not None + ): + raise ValueError( + "The base estimator doesn't support sample weight, but sample_weight is " + "passed to the fit method." + ) + + for i in range(n_estimators): + if verbose > 1: + print( + "Building estimator %d of %d for this parallel run (total %d)..." + % (i + 1, n_estimators, total_n_estimators) + ) + + random_state = seeds[i] + estimator = ensemble._make_estimator(append=False, random_state=random_state) + + if has_check_input: + estimator_fit = partial(estimator.fit, check_input=check_input) + else: + estimator_fit = estimator.fit + + # Draw random feature, sample indices + features, indices = _generate_bagging_indices( + random_state, + bootstrap_features, + bootstrap, + n_features, + n_samples, + max_features, + max_samples, + ) + + fit_params_ = fit_params.copy() + + # TODO(SLEP6): remove if condition for unrouted sample_weight when metadata + # routing can't be disabled. + # 1. If routing is enabled, we will check if the routing supports sample + # weight and use it if it does. + # 2. If routing is not enabled, we will check if the base + # estimator supports sample_weight and use it if it does. + + # Note: Row sampling can be achieved either through setting sample_weight or + # by indexing. The former is more efficient. Therefore, use this method + # if possible, otherwise use indexing. 
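+ # Illustrative sketch (hypothetical values): with n_samples=5 and a
+ # bootstrap draw of indices [0, 2, 2, 3],
+ #     np.bincount([0, 2, 2, 3], minlength=5) == array([1, 0, 2, 1, 0]),
+ # so multiplying the sample weights by these counts is, for estimators
+ # that honour sample_weight, equivalent to fitting on the resampled rows
+ # X[[0, 2, 2, 3]] without copying X.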
+ if _routing_enabled(): + request_or_router = get_routing_for_object(ensemble.estimator_) + consumes_sample_weight = request_or_router.consumes( + "fit", ("sample_weight",) + ) + else: + consumes_sample_weight = support_sample_weight + if consumes_sample_weight: + # Draw sub samples, using sample weights, and then fit + curr_sample_weight = _check_sample_weight( + fit_params_.pop("sample_weight", None), X + ).copy() + + if bootstrap: + sample_counts = np.bincount(indices, minlength=n_samples) + curr_sample_weight *= sample_counts + else: + not_indices_mask = ~indices_to_mask(indices, n_samples) + curr_sample_weight[not_indices_mask] = 0 + + fit_params_["sample_weight"] = curr_sample_weight + X_ = X[:, features] if requires_feature_indexing else X + estimator_fit(X_, y, **fit_params_) + else: + # cannot use sample_weight, so use indexing + y_ = _safe_indexing(y, indices) + X_ = _safe_indexing(X, indices) + fit_params_ = _check_method_params(X, params=fit_params_, indices=indices) + if requires_feature_indexing: + X_ = X_[:, features] + estimator_fit(X_, y_, **fit_params_) + + estimators.append(estimator) + estimators_features.append(features) + + return estimators, estimators_features + + +def _parallel_predict_proba( + estimators, + estimators_features, + X, + n_classes, + predict_params=None, + predict_proba_params=None, +): + """Private function used to compute (proba-)predictions within a job.""" + n_samples = X.shape[0] + proba = np.zeros((n_samples, n_classes)) + + for estimator, features in zip(estimators, estimators_features): + if hasattr(estimator, "predict_proba"): + proba_estimator = estimator.predict_proba( + X[:, features], **(predict_params or {}) + ) + + if n_classes == len(estimator.classes_): + proba += proba_estimator + + else: + proba[:, estimator.classes_] += proba_estimator[ + :, range(len(estimator.classes_)) + ] + + else: + # Resort to voting + predictions = estimator.predict( + X[:, features], **(predict_proba_params or {}) + ) + + for i in range(n_samples): + proba[i, predictions[i]] += 1 + + return proba + + +def _parallel_predict_log_proba(estimators, estimators_features, X, n_classes, params): + """Private function used to compute log probabilities within a job.""" + n_samples = X.shape[0] + log_proba = np.empty((n_samples, n_classes)) + log_proba.fill(-np.inf) + all_classes = np.arange(n_classes, dtype=int) + + for estimator, features in zip(estimators, estimators_features): + log_proba_estimator = estimator.predict_log_proba(X[:, features], **params) + + if n_classes == len(estimator.classes_): + log_proba = np.logaddexp(log_proba, log_proba_estimator) + + else: + log_proba[:, estimator.classes_] = np.logaddexp( + log_proba[:, estimator.classes_], + log_proba_estimator[:, range(len(estimator.classes_))], + ) + + missing = np.setdiff1d(all_classes, estimator.classes_) + log_proba[:, missing] = np.logaddexp(log_proba[:, missing], -np.inf) + + return log_proba + + +def _parallel_decision_function(estimators, estimators_features, X, params): + """Private function used to compute decisions within a job.""" + return sum( + estimator.decision_function(X[:, features], **params) + for estimator, features in zip(estimators, estimators_features) + ) + + +def _parallel_predict_regression(estimators, estimators_features, X, params): + """Private function used to compute predictions within a job.""" + return sum( + estimator.predict(X[:, features], **params) + for estimator, features in zip(estimators, estimators_features) + ) + + +class BaseBagging(BaseEnsemble, 
metaclass=ABCMeta): + """Base class for Bagging meta-estimator. + + Warning: This class should not be used directly. Use derived classes + instead. + """ + + _parameter_constraints: dict = { + "estimator": [HasMethods(["fit", "predict"]), None], + "n_estimators": [Interval(Integral, 1, None, closed="left")], + "max_samples": [ + Interval(Integral, 1, None, closed="left"), + Interval(RealNotInt, 0, 1, closed="right"), + ], + "max_features": [ + Interval(Integral, 1, None, closed="left"), + Interval(RealNotInt, 0, 1, closed="right"), + ], + "bootstrap": ["boolean"], + "bootstrap_features": ["boolean"], + "oob_score": ["boolean"], + "warm_start": ["boolean"], + "n_jobs": [None, Integral], + "random_state": ["random_state"], + "verbose": ["verbose"], + } + + @abstractmethod + def __init__( + self, + estimator=None, + n_estimators=10, + *, + max_samples=1.0, + max_features=1.0, + bootstrap=True, + bootstrap_features=False, + oob_score=False, + warm_start=False, + n_jobs=None, + random_state=None, + verbose=0, + ): + super().__init__( + estimator=estimator, + n_estimators=n_estimators, + ) + self.max_samples = max_samples + self.max_features = max_features + self.bootstrap = bootstrap + self.bootstrap_features = bootstrap_features + self.oob_score = oob_score + self.warm_start = warm_start + self.n_jobs = n_jobs + self.random_state = random_state + self.verbose = verbose + + @_fit_context( + # BaseBagging.estimator is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y, sample_weight=None, **fit_params): + """Build a Bagging ensemble of estimators from the training set (X, y). + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrices are accepted only if + they are supported by the base estimator. + + y : array-like of shape (n_samples,) + The target values (class labels in classification, real numbers in + regression). + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. + Note that this is supported only if the base estimator supports + sample weighting. + **fit_params : dict + Parameters to pass to the underlying estimators. + + .. versionadded:: 1.5 + + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + Fitted estimator. + """ + _raise_for_params(fit_params, self, "fit") + + # Convert data (X is required to be 2d and indexable) + X, y = validate_data( + self, + X, + y, + accept_sparse=["csr", "csc"], + dtype=None, + ensure_all_finite=False, + multi_output=True, + ) + + return self._fit( + X, + y, + max_samples=self.max_samples, + sample_weight=sample_weight, + **fit_params, + ) + + def _parallel_args(self): + return {} + + def _fit( + self, + X, + y, + max_samples=None, + max_depth=None, + check_input=True, + sample_weight=None, + **fit_params, + ): + """Build a Bagging ensemble of estimators from the training + set (X, y). + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrices are accepted only if + they are supported by the base estimator. + + y : array-like of shape (n_samples,) + The target values (class labels in classification, real numbers in + regression). 
+ + max_samples : int or float, default=None + Argument to use instead of self.max_samples. + + max_depth : int, default=None + Override value used when constructing base estimator. Only + supported if the base estimator has a max_depth parameter. + + check_input : bool, default=True + Override value used when fitting base estimator. Only supported + if the base estimator has a check_input parameter for fit function. + If the meta-estimator already checks the input, set this value to + False to prevent redundant input validation. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. + Note that this is supported only if the base estimator supports + sample weighting. + + **fit_params : dict, default=None + Parameters to pass to the :term:`fit` method of the underlying + estimator. + + Returns + ------- + self : object + Fitted estimator. + """ + random_state = check_random_state(self.random_state) + + # Remap output + n_samples = X.shape[0] + self._n_samples = n_samples + y = self._validate_y(y) + + # Check parameters + self._validate_estimator(self._get_estimator()) + + if sample_weight is not None: + fit_params["sample_weight"] = sample_weight + + if _routing_enabled(): + routed_params = process_routing(self, "fit", **fit_params) + else: + routed_params = Bunch() + routed_params.estimator = Bunch(fit=fit_params) + if "sample_weight" in fit_params: + routed_params.estimator.fit["sample_weight"] = fit_params[ + "sample_weight" + ] + + if max_depth is not None: + self.estimator_.max_depth = max_depth + + # Validate max_samples + if max_samples is None: + max_samples = self.max_samples + elif not isinstance(max_samples, numbers.Integral): + max_samples = int(max_samples * X.shape[0]) + + if max_samples > X.shape[0]: + raise ValueError("max_samples must be <= n_samples") + + # Store validated integer row sampling value + self._max_samples = max_samples + + # Validate max_features + if isinstance(self.max_features, numbers.Integral): + max_features = self.max_features + elif isinstance(self.max_features, float): + max_features = int(self.max_features * self.n_features_in_) + + if max_features > self.n_features_in_: + raise ValueError("max_features must be <= n_features") + + max_features = max(1, int(max_features)) + + # Store validated integer feature sampling value + self._max_features = max_features + + # Other checks + if not self.bootstrap and self.oob_score: + raise ValueError("Out of bag estimation only available if bootstrap=True") + + if self.warm_start and self.oob_score: + raise ValueError("Out of bag estimate only available if warm_start=False") + + if hasattr(self, "oob_score_") and self.warm_start: + del self.oob_score_ + + if not self.warm_start or not hasattr(self, "estimators_"): + # Free allocated memory, if any + self.estimators_ = [] + self.estimators_features_ = [] + + n_more_estimators = self.n_estimators - len(self.estimators_) + + if n_more_estimators < 0: + raise ValueError( + "n_estimators=%d must be larger or equal to " + "len(estimators_)=%d when warm_start==True" + % (self.n_estimators, len(self.estimators_)) + ) + + elif n_more_estimators == 0: + warn( + "Warm-start fitting without increasing n_estimators does not " + "fit new trees." 
+ ) + return self + + # Parallel loop + n_jobs, n_estimators, starts = _partition_estimators( + n_more_estimators, self.n_jobs + ) + total_n_estimators = sum(n_estimators) + + # Advance random state to state after training + # the first n_estimators + if self.warm_start and len(self.estimators_) > 0: + random_state.randint(MAX_INT, size=len(self.estimators_)) + + seeds = random_state.randint(MAX_INT, size=n_more_estimators) + self._seeds = seeds + + all_results = Parallel( + n_jobs=n_jobs, verbose=self.verbose, **self._parallel_args() + )( + delayed(_parallel_build_estimators)( + n_estimators[i], + self, + X, + y, + seeds[starts[i] : starts[i + 1]], + total_n_estimators, + verbose=self.verbose, + check_input=check_input, + fit_params=routed_params.estimator.fit, + ) + for i in range(n_jobs) + ) + + # Reduce + self.estimators_ += list( + itertools.chain.from_iterable(t[0] for t in all_results) + ) + self.estimators_features_ += list( + itertools.chain.from_iterable(t[1] for t in all_results) + ) + + if self.oob_score: + self._set_oob_score(X, y) + + return self + + @abstractmethod + def _set_oob_score(self, X, y): + """Calculate out of bag predictions and score.""" + + def _validate_y(self, y): + if len(y.shape) == 1 or y.shape[1] == 1: + return column_or_1d(y, warn=True) + return y + + def _get_estimators_indices(self): + # Get drawn indices along both sample and feature axes + for seed in self._seeds: + # Operations accessing random_state must be performed identically + # to those in `_parallel_build_estimators()` + feature_indices, sample_indices = _generate_bagging_indices( + seed, + self.bootstrap_features, + self.bootstrap, + self.n_features_in_, + self._n_samples, + self._max_features, + self._max_samples, + ) + + yield feature_indices, sample_indices + + @property + def estimators_samples_(self): + """ + The subset of drawn samples for each base estimator. + + Returns a dynamically generated list of indices identifying + the samples used for fitting each member of the ensemble, i.e., + the in-bag samples. + + Note: the list is re-created at each call to the property in order + to reduce the object memory footprint by not storing the sampling + data. Thus fetching the property may be slower than expected. + """ + return [sample_indices for _, sample_indices in self._get_estimators_indices()] + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.5 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. 
+ """ + router = MetadataRouter(owner=self.__class__.__name__) + + method_mapping = MethodMapping() + method_mapping.add(caller="fit", callee="fit").add( + caller="decision_function", callee="decision_function" + ) + + # the router needs to be built depending on whether the sub-estimator has a + # `predict_proba` method (as BaggingClassifier decides dynamically at runtime): + if hasattr(self._get_estimator(), "predict_proba"): + ( + method_mapping.add(caller="predict", callee="predict_proba").add( + caller="predict_proba", callee="predict_proba" + ) + ) + + else: + ( + method_mapping.add(caller="predict", callee="predict").add( + caller="predict_proba", callee="predict" + ) + ) + + # the router needs to be built depending on whether the sub-estimator has a + # `predict_log_proba` method (as BaggingClassifier decides dynamically at + # runtime): + if hasattr(self._get_estimator(), "predict_log_proba"): + method_mapping.add(caller="predict_log_proba", callee="predict_log_proba") + + else: + # if `predict_log_proba` is not available in BaggingClassifier's + # sub-estimator, the routing should go to its `predict_proba` if it is + # available or else to its `predict` method; according to how + # `sample_weight` is passed to the respective methods dynamically at + # runtime: + if hasattr(self._get_estimator(), "predict_proba"): + method_mapping.add(caller="predict_log_proba", callee="predict_proba") + + else: + method_mapping.add(caller="predict_log_proba", callee="predict") + + router.add(estimator=self._get_estimator(), method_mapping=method_mapping) + return router + + @abstractmethod + def _get_estimator(self): + """Resolve which estimator to return.""" + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = get_tags(self._get_estimator()).input_tags.sparse + tags.input_tags.allow_nan = get_tags(self._get_estimator()).input_tags.allow_nan + return tags + + +class BaggingClassifier(ClassifierMixin, BaseBagging): + """A Bagging classifier. + + A Bagging classifier is an ensemble meta-estimator that fits base + classifiers each on random subsets of the original dataset and then + aggregate their individual predictions (either by voting or by averaging) + to form a final prediction. Such a meta-estimator can typically be used as + a way to reduce the variance of a black-box estimator (e.g., a decision + tree), by introducing randomization into its construction procedure and + then making an ensemble out of it. + + This algorithm encompasses several works from the literature. When random + subsets of the dataset are drawn as random subsets of the samples, then + this algorithm is known as Pasting [1]_. If samples are drawn with + replacement, then the method is known as Bagging [2]_. When random subsets + of the dataset are drawn as random subsets of the features, then the method + is known as Random Subspaces [3]_. Finally, when base estimators are built + on subsets of both samples and features, then the method is known as + Random Patches [4]_. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.15 + + Parameters + ---------- + estimator : object, default=None + The base estimator to fit on random subsets of the dataset. + If None, then the base estimator is a + :class:`~sklearn.tree.DecisionTreeClassifier`. + + .. versionadded:: 1.2 + `base_estimator` was renamed to `estimator`. + + n_estimators : int, default=10 + The number of base estimators in the ensemble. 
+ + max_samples : int or float, default=1.0 + The number of samples to draw from X to train each base estimator (with + replacement by default, see `bootstrap` for more details). + + - If int, then draw `max_samples` samples. + - If float, then draw `max_samples * X.shape[0]` samples. + + max_features : int or float, default=1.0 + The number of features to draw from X to train each base estimator ( + without replacement by default, see `bootstrap_features` for more + details). + + - If int, then draw `max_features` features. + - If float, then draw `max(1, int(max_features * n_features_in_))` features. + + bootstrap : bool, default=True + Whether samples are drawn with replacement. If False, sampling + without replacement is performed. + + bootstrap_features : bool, default=False + Whether features are drawn with replacement. + + oob_score : bool, default=False + Whether to use out-of-bag samples to estimate + the generalization error. Only available if bootstrap=True. + + warm_start : bool, default=False + When set to True, reuse the solution of the previous call to fit + and add more estimators to the ensemble, otherwise, just fit + a whole new ensemble. See :term:`the Glossary `. + + .. versionadded:: 0.17 + *warm_start* constructor parameter. + + n_jobs : int, default=None + The number of jobs to run in parallel for both :meth:`fit` and + :meth:`predict`. ``None`` means 1 unless in a + :obj:`joblib.parallel_backend` context. ``-1`` means using all + processors. See :term:`Glossary ` for more details. + + random_state : int, RandomState instance or None, default=None + Controls the random resampling of the original dataset + (sample wise and feature wise). + If the base estimator accepts a `random_state` attribute, a different + seed is generated for each instance in the ensemble. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + verbose : int, default=0 + Controls the verbosity when fitting and predicting. + + Attributes + ---------- + estimator_ : estimator + The base estimator from which the ensemble is grown. + + .. versionadded:: 1.2 + `base_estimator_` was renamed to `estimator_`. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + estimators_ : list of estimators + The collection of fitted base estimators. + + estimators_samples_ : list of arrays + The subset of drawn samples (i.e., the in-bag samples) for each base + estimator. Each subset is defined by an array of the indices selected. + + estimators_features_ : list of arrays + The subset of drawn features for each base estimator. + + classes_ : ndarray of shape (n_classes,) + The classes labels. + + n_classes_ : int or list + The number of classes. + + oob_score_ : float + Score of the training dataset obtained using an out-of-bag estimate. + This attribute exists only when ``oob_score`` is True. + + oob_decision_function_ : ndarray of shape (n_samples, n_classes) + Decision function computed with out-of-bag estimate on the training + set. If n_estimators is small it might be possible that a data point + was never left out during the bootstrap. In this case, + `oob_decision_function_` might contain NaN. This attribute exists + only when ``oob_score`` is True. + + See Also + -------- + BaggingRegressor : A Bagging regressor. 
+ + References + ---------- + + .. [1] L. Breiman, "Pasting small votes for classification in large + databases and on-line", Machine Learning, 36(1), 85-103, 1999. + + .. [2] L. Breiman, "Bagging predictors", Machine Learning, 24(2), 123-140, + 1996. + + .. [3] T. Ho, "The random subspace method for constructing decision + forests", Pattern Analysis and Machine Intelligence, 20(8), 832-844, + 1998. + + .. [4] G. Louppe and P. Geurts, "Ensembles on Random Patches", Machine + Learning and Knowledge Discovery in Databases, 346-361, 2012. + + Examples + -------- + >>> from sklearn.svm import SVC + >>> from sklearn.ensemble import BaggingClassifier + >>> from sklearn.datasets import make_classification + >>> X, y = make_classification(n_samples=100, n_features=4, + ... n_informative=2, n_redundant=0, + ... random_state=0, shuffle=False) + >>> clf = BaggingClassifier(estimator=SVC(), + ... n_estimators=10, random_state=0).fit(X, y) + >>> clf.predict([[0, 0, 0, 0]]) + array([1]) + """ + + def __init__( + self, + estimator=None, + n_estimators=10, + *, + max_samples=1.0, + max_features=1.0, + bootstrap=True, + bootstrap_features=False, + oob_score=False, + warm_start=False, + n_jobs=None, + random_state=None, + verbose=0, + ): + super().__init__( + estimator=estimator, + n_estimators=n_estimators, + max_samples=max_samples, + max_features=max_features, + bootstrap=bootstrap, + bootstrap_features=bootstrap_features, + oob_score=oob_score, + warm_start=warm_start, + n_jobs=n_jobs, + random_state=random_state, + verbose=verbose, + ) + + def _get_estimator(self): + """Resolve which estimator to return (default is DecisionTreeClassifier)""" + if self.estimator is None: + return DecisionTreeClassifier() + return self.estimator + + def _set_oob_score(self, X, y): + n_samples = y.shape[0] + n_classes_ = self.n_classes_ + + predictions = np.zeros((n_samples, n_classes_)) + + for estimator, samples, features in zip( + self.estimators_, self.estimators_samples_, self.estimators_features_ + ): + # Create mask for OOB samples + mask = ~indices_to_mask(samples, n_samples) + + if hasattr(estimator, "predict_proba"): + predictions[mask, :] += estimator.predict_proba( + (X[mask, :])[:, features] + ) + + else: + p = estimator.predict((X[mask, :])[:, features]) + j = 0 + + for i in range(n_samples): + if mask[i]: + predictions[i, p[j]] += 1 + j += 1 + + if (predictions.sum(axis=1) == 0).any(): + warn( + "Some inputs do not have OOB scores. " + "This probably means too few estimators were used " + "to compute any reliable oob estimates." + ) + + oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis] + oob_score = accuracy_score(y, np.argmax(predictions, axis=1)) + + self.oob_decision_function_ = oob_decision_function + self.oob_score_ = oob_score + + def _validate_y(self, y): + y = column_or_1d(y, warn=True) + check_classification_targets(y) + self.classes_, y = np.unique(y, return_inverse=True) + self.n_classes_ = len(self.classes_) + + return y + + def predict(self, X, **params): + """Predict class for X. + + The predicted class of an input sample is computed as the class with + the highest mean predicted probability. If base estimators do not + implement a ``predict_proba`` method, then it resorts to voting. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrices are accepted only if + they are supported by the base estimator. 
+ + **params : dict + Parameters routed to the `predict_proba` (if available) or the `predict` + method (otherwise) of the sub-estimators via the metadata routing API. + + .. versionadded:: 1.7 + + Only available if + `sklearn.set_config(enable_metadata_routing=True)` is set. See + :ref:`Metadata Routing User Guide ` for more + details. + + Returns + ------- + y : ndarray of shape (n_samples,) + The predicted classes. + """ + _raise_for_params(params, self, "predict") + + predicted_probabilitiy = self.predict_proba(X, **params) + return self.classes_.take((np.argmax(predicted_probabilitiy, axis=1)), axis=0) + + def predict_proba(self, X, **params): + """Predict class probabilities for X. + + The predicted class probabilities of an input sample is computed as + the mean predicted class probabilities of the base estimators in the + ensemble. If base estimators do not implement a ``predict_proba`` + method, then it resorts to voting and the predicted class probabilities + of an input sample represents the proportion of estimators predicting + each class. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrices are accepted only if + they are supported by the base estimator. + + **params : dict + Parameters routed to the `predict_proba` (if available) or the `predict` + method (otherwise) of the sub-estimators via the metadata routing API. + + .. versionadded:: 1.7 + + Only available if + `sklearn.set_config(enable_metadata_routing=True)` is set. See + :ref:`Metadata Routing User Guide ` for more + details. + + Returns + ------- + p : ndarray of shape (n_samples, n_classes) + The class probabilities of the input samples. The order of the + classes corresponds to that in the attribute :term:`classes_`. + """ + _raise_for_params(params, self, "predict_proba") + + check_is_fitted(self) + # Check data + X = validate_data( + self, + X, + accept_sparse=["csr", "csc"], + dtype=None, + ensure_all_finite=False, + reset=False, + ) + + if _routing_enabled(): + routed_params = process_routing(self, "predict_proba", **params) + else: + routed_params = Bunch() + routed_params.estimator = Bunch(predict_proba=Bunch()) + + # Parallel loop + n_jobs, _, starts = _partition_estimators(self.n_estimators, self.n_jobs) + + all_proba = Parallel( + n_jobs=n_jobs, verbose=self.verbose, **self._parallel_args() + )( + delayed(_parallel_predict_proba)( + self.estimators_[starts[i] : starts[i + 1]], + self.estimators_features_[starts[i] : starts[i + 1]], + X, + self.n_classes_, + predict_params=routed_params.estimator.get("predict", None), + predict_proba_params=routed_params.estimator.get("predict_proba", None), + ) + for i in range(n_jobs) + ) + + # Reduce + proba = sum(all_proba) / self.n_estimators + + return proba + + def predict_log_proba(self, X, **params): + """Predict class log-probabilities for X. + + The predicted class log-probabilities of an input sample is computed as + the log of the mean predicted class probabilities of the base + estimators in the ensemble. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrices are accepted only if + they are supported by the base estimator. + + **params : dict + Parameters routed to the `predict_log_proba`, the `predict_proba` or the + `proba` method of the sub-estimators via the metadata routing API. 
The + routing is tried in the mentioned order depending on whether this method is + available on the sub-estimator. + + .. versionadded:: 1.7 + + Only available if + `sklearn.set_config(enable_metadata_routing=True)` is set. See + :ref:`Metadata Routing User Guide ` for more + details. + + Returns + ------- + p : ndarray of shape (n_samples, n_classes) + The class log-probabilities of the input samples. The order of the + classes corresponds to that in the attribute :term:`classes_`. + """ + _raise_for_params(params, self, "predict_log_proba") + + check_is_fitted(self) + + if hasattr(self.estimator_, "predict_log_proba"): + # Check data + X = validate_data( + self, + X, + accept_sparse=["csr", "csc"], + dtype=None, + ensure_all_finite=False, + reset=False, + ) + + if _routing_enabled(): + routed_params = process_routing(self, "predict_log_proba", **params) + else: + routed_params = Bunch() + routed_params.estimator = Bunch(predict_log_proba=Bunch()) + + # Parallel loop + n_jobs, _, starts = _partition_estimators(self.n_estimators, self.n_jobs) + + all_log_proba = Parallel(n_jobs=n_jobs, verbose=self.verbose)( + delayed(_parallel_predict_log_proba)( + self.estimators_[starts[i] : starts[i + 1]], + self.estimators_features_[starts[i] : starts[i + 1]], + X, + self.n_classes_, + params=routed_params.estimator.predict_log_proba, + ) + for i in range(n_jobs) + ) + + # Reduce + log_proba = all_log_proba[0] + + for j in range(1, len(all_log_proba)): + log_proba = np.logaddexp(log_proba, all_log_proba[j]) + + log_proba -= np.log(self.n_estimators) + + else: + log_proba = np.log(self.predict_proba(X, **params)) + + return log_proba + + @available_if( + _estimator_has("decision_function", delegates=("estimators_", "estimator")) + ) + def decision_function(self, X, **params): + """Average of the decision functions of the base classifiers. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrices are accepted only if + they are supported by the base estimator. + + **params : dict + Parameters routed to the `decision_function` method of the sub-estimators + via the metadata routing API. + + .. versionadded:: 1.7 + + Only available if + `sklearn.set_config(enable_metadata_routing=True)` is set. See + :ref:`Metadata Routing User Guide ` for more + details. + + Returns + ------- + score : ndarray of shape (n_samples, k) + The decision function of the input samples. The columns correspond + to the classes in sorted order, as they appear in the attribute + ``classes_``. Regression and binary classification are special + cases with ``k == 1``, otherwise ``k==n_classes``. 
+ """ + _raise_for_params(params, self, "decision_function") + + check_is_fitted(self) + + # Check data + X = validate_data( + self, + X, + accept_sparse=["csr", "csc"], + dtype=None, + ensure_all_finite=False, + reset=False, + ) + + if _routing_enabled(): + routed_params = process_routing(self, "decision_function", **params) + else: + routed_params = Bunch() + routed_params.estimator = Bunch(decision_function=Bunch()) + + # Parallel loop + n_jobs, _, starts = _partition_estimators(self.n_estimators, self.n_jobs) + + all_decisions = Parallel(n_jobs=n_jobs, verbose=self.verbose)( + delayed(_parallel_decision_function)( + self.estimators_[starts[i] : starts[i + 1]], + self.estimators_features_[starts[i] : starts[i + 1]], + X, + params=routed_params.estimator.decision_function, + ) + for i in range(n_jobs) + ) + + # Reduce + decisions = sum(all_decisions) / self.n_estimators + + return decisions + + +class BaggingRegressor(RegressorMixin, BaseBagging): + """A Bagging regressor. + + A Bagging regressor is an ensemble meta-estimator that fits base + regressors each on random subsets of the original dataset and then + aggregate their individual predictions (either by voting or by averaging) + to form a final prediction. Such a meta-estimator can typically be used as + a way to reduce the variance of a black-box estimator (e.g., a decision + tree), by introducing randomization into its construction procedure and + then making an ensemble out of it. + + This algorithm encompasses several works from the literature. When random + subsets of the dataset are drawn as random subsets of the samples, then + this algorithm is known as Pasting [1]_. If samples are drawn with + replacement, then the method is known as Bagging [2]_. When random subsets + of the dataset are drawn as random subsets of the features, then the method + is known as Random Subspaces [3]_. Finally, when base estimators are built + on subsets of both samples and features, then the method is known as + Random Patches [4]_. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.15 + + Parameters + ---------- + estimator : object, default=None + The base estimator to fit on random subsets of the dataset. + If None, then the base estimator is a + :class:`~sklearn.tree.DecisionTreeRegressor`. + + .. versionadded:: 1.2 + `base_estimator` was renamed to `estimator`. + + n_estimators : int, default=10 + The number of base estimators in the ensemble. + + max_samples : int or float, default=1.0 + The number of samples to draw from X to train each base estimator (with + replacement by default, see `bootstrap` for more details). + + - If int, then draw `max_samples` samples. + - If float, then draw `max_samples * X.shape[0]` samples. + + max_features : int or float, default=1.0 + The number of features to draw from X to train each base estimator ( + without replacement by default, see `bootstrap_features` for more + details). + + - If int, then draw `max_features` features. + - If float, then draw `max(1, int(max_features * n_features_in_))` features. + + bootstrap : bool, default=True + Whether samples are drawn with replacement. If False, sampling + without replacement is performed. + + bootstrap_features : bool, default=False + Whether features are drawn with replacement. + + oob_score : bool, default=False + Whether to use out-of-bag samples to estimate + the generalization error. Only available if bootstrap=True. 
+ + warm_start : bool, default=False + When set to True, reuse the solution of the previous call to fit + and add more estimators to the ensemble, otherwise, just fit + a whole new ensemble. See :term:`the Glossary `. + + n_jobs : int, default=None + The number of jobs to run in parallel for both :meth:`fit` and + :meth:`predict`. ``None`` means 1 unless in a + :obj:`joblib.parallel_backend` context. ``-1`` means using all + processors. See :term:`Glossary ` for more details. + + random_state : int, RandomState instance or None, default=None + Controls the random resampling of the original dataset + (sample wise and feature wise). + If the base estimator accepts a `random_state` attribute, a different + seed is generated for each instance in the ensemble. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + verbose : int, default=0 + Controls the verbosity when fitting and predicting. + + Attributes + ---------- + estimator_ : estimator + The base estimator from which the ensemble is grown. + + .. versionadded:: 1.2 + `base_estimator_` was renamed to `estimator_`. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + estimators_ : list of estimators + The collection of fitted sub-estimators. + + estimators_samples_ : list of arrays + The subset of drawn samples (i.e., the in-bag samples) for each base + estimator. Each subset is defined by an array of the indices selected. + + estimators_features_ : list of arrays + The subset of drawn features for each base estimator. + + oob_score_ : float + Score of the training dataset obtained using an out-of-bag estimate. + This attribute exists only when ``oob_score`` is True. + + oob_prediction_ : ndarray of shape (n_samples,) + Prediction computed with out-of-bag estimate on the training + set. If n_estimators is small it might be possible that a data point + was never left out during the bootstrap. In this case, + `oob_prediction_` might contain NaN. This attribute exists only + when ``oob_score`` is True. + + See Also + -------- + BaggingClassifier : A Bagging classifier. + + References + ---------- + + .. [1] L. Breiman, "Pasting small votes for classification in large + databases and on-line", Machine Learning, 36(1), 85-103, 1999. + + .. [2] L. Breiman, "Bagging predictors", Machine Learning, 24(2), 123-140, + 1996. + + .. [3] T. Ho, "The random subspace method for constructing decision + forests", Pattern Analysis and Machine Intelligence, 20(8), 832-844, + 1998. + + .. [4] G. Louppe and P. Geurts, "Ensembles on Random Patches", Machine + Learning and Knowledge Discovery in Databases, 346-361, 2012. + + Examples + -------- + >>> from sklearn.svm import SVR + >>> from sklearn.ensemble import BaggingRegressor + >>> from sklearn.datasets import make_regression + >>> X, y = make_regression(n_samples=100, n_features=4, + ... n_informative=2, n_targets=1, + ... random_state=0, shuffle=False) + >>> regr = BaggingRegressor(estimator=SVR(), + ... 
n_estimators=10, random_state=0).fit(X, y) + >>> regr.predict([[0, 0, 0, 0]]) + array([-2.8720]) + """ + + def __init__( + self, + estimator=None, + n_estimators=10, + *, + max_samples=1.0, + max_features=1.0, + bootstrap=True, + bootstrap_features=False, + oob_score=False, + warm_start=False, + n_jobs=None, + random_state=None, + verbose=0, + ): + super().__init__( + estimator=estimator, + n_estimators=n_estimators, + max_samples=max_samples, + max_features=max_features, + bootstrap=bootstrap, + bootstrap_features=bootstrap_features, + oob_score=oob_score, + warm_start=warm_start, + n_jobs=n_jobs, + random_state=random_state, + verbose=verbose, + ) + + def predict(self, X, **params): + """Predict regression target for X. + + The predicted regression target of an input sample is computed as the + mean predicted regression targets of the estimators in the ensemble. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrices are accepted only if + they are supported by the base estimator. + + **params : dict + Parameters routed to the `predict` method of the sub-estimators via the + metadata routing API. + + .. versionadded:: 1.7 + + Only available if + `sklearn.set_config(enable_metadata_routing=True)` is set. See + :ref:`Metadata Routing User Guide ` for more + details. + + Returns + ------- + y : ndarray of shape (n_samples,) + The predicted values. + """ + _raise_for_params(params, self, "predict") + + check_is_fitted(self) + # Check data + X = validate_data( + self, + X, + accept_sparse=["csr", "csc"], + dtype=None, + ensure_all_finite=False, + reset=False, + ) + + if _routing_enabled(): + routed_params = process_routing(self, "predict", **params) + else: + routed_params = Bunch() + routed_params.estimator = Bunch(predict=Bunch()) + + # Parallel loop + n_jobs, _, starts = _partition_estimators(self.n_estimators, self.n_jobs) + + all_y_hat = Parallel(n_jobs=n_jobs, verbose=self.verbose)( + delayed(_parallel_predict_regression)( + self.estimators_[starts[i] : starts[i + 1]], + self.estimators_features_[starts[i] : starts[i + 1]], + X, + params=routed_params.estimator.predict, + ) + for i in range(n_jobs) + ) + + # Reduce + y_hat = sum(all_y_hat) / self.n_estimators + + return y_hat + + def _set_oob_score(self, X, y): + n_samples = y.shape[0] + + predictions = np.zeros((n_samples,)) + n_predictions = np.zeros((n_samples,)) + + for estimator, samples, features in zip( + self.estimators_, self.estimators_samples_, self.estimators_features_ + ): + # Create mask for OOB samples + mask = ~indices_to_mask(samples, n_samples) + + predictions[mask] += estimator.predict((X[mask, :])[:, features]) + n_predictions[mask] += 1 + + if (n_predictions == 0).any(): + warn( + "Some inputs do not have OOB scores. " + "This probably means too few estimators were used " + "to compute any reliable oob estimates." 
+ ) + n_predictions[n_predictions == 0] = 1 + + predictions /= n_predictions + + self.oob_prediction_ = predictions + self.oob_score_ = r2_score(y, predictions) + + def _get_estimator(self): + """Resolve which estimator to return (default is DecisionTreeClassifier)""" + if self.estimator is None: + return DecisionTreeRegressor() + return self.estimator diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_base.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_base.py new file mode 100644 index 0000000000000000000000000000000000000000..e04645eec174f8d6adad6346ad2e0729577f0b5e --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_base.py @@ -0,0 +1,307 @@ +"""Base class for ensemble-based estimators.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from abc import ABCMeta, abstractmethod + +import numpy as np +from joblib import effective_n_jobs + +from ..base import BaseEstimator, MetaEstimatorMixin, clone, is_classifier, is_regressor +from ..utils import Bunch, check_random_state +from ..utils._tags import get_tags +from ..utils._user_interface import _print_elapsed_time +from ..utils.metadata_routing import _routing_enabled +from ..utils.metaestimators import _BaseComposition + + +def _fit_single_estimator( + estimator, X, y, fit_params, message_clsname=None, message=None +): + """Private function used to fit an estimator within a job.""" + # TODO(SLEP6): remove if-condition for unrouted sample_weight when metadata + # routing can't be disabled. + if not _routing_enabled() and "sample_weight" in fit_params: + try: + with _print_elapsed_time(message_clsname, message): + estimator.fit(X, y, sample_weight=fit_params["sample_weight"]) + except TypeError as exc: + if "unexpected keyword argument 'sample_weight'" in str(exc): + raise TypeError( + "Underlying estimator {} does not support sample weights.".format( + estimator.__class__.__name__ + ) + ) from exc + raise + else: + with _print_elapsed_time(message_clsname, message): + estimator.fit(X, y, **fit_params) + return estimator + + +def _set_random_states(estimator, random_state=None): + """Set fixed random_state parameters for an estimator. + + Finds all parameters ending ``random_state`` and sets them to integers + derived from ``random_state``. + + Parameters + ---------- + estimator : estimator supporting get/set_params + Estimator with potential randomness managed by random_state + parameters. + + random_state : int, RandomState instance or None, default=None + Pseudo-random number generator to control the generation of the random + integers. Pass an int for reproducible output across multiple function + calls. + See :term:`Glossary `. + + Notes + ----- + This does not necessarily set *all* ``random_state`` attributes that + control an estimator's randomness, only those accessible through + ``estimator.get_params()``. ``random_state``s not controlled include + those belonging to: + + * cross-validation splitters + * ``scipy.stats`` rvs + """ + random_state = check_random_state(random_state) + to_set = {} + for key in sorted(estimator.get_params(deep=True)): + if key == "random_state" or key.endswith("__random_state"): + to_set[key] = random_state.randint(np.iinfo(np.int32).max) + + if to_set: + estimator.set_params(**to_set) + + +class BaseEnsemble(MetaEstimatorMixin, BaseEstimator, metaclass=ABCMeta): + """Base class for all ensemble classes. + + Warning: This class should not be used directly. Use derived classes + instead. 
+ + Parameters + ---------- + estimator : object + The base estimator from which the ensemble is built. + + n_estimators : int, default=10 + The number of estimators in the ensemble. + + estimator_params : list of str, default=tuple() + The list of attributes to use as parameters when instantiating a + new base estimator. If none are given, default parameters are used. + + Attributes + ---------- + estimator_ : estimator + The base estimator from which the ensemble is grown. + + estimators_ : list of estimators + The collection of fitted base estimators. + """ + + @abstractmethod + def __init__( + self, + estimator=None, + *, + n_estimators=10, + estimator_params=tuple(), + ): + # Set parameters + self.estimator = estimator + self.n_estimators = n_estimators + self.estimator_params = estimator_params + + # Don't instantiate estimators now! Parameters of estimator might + # still change. Eg., when grid-searching with the nested object syntax. + # self.estimators_ needs to be filled by the derived classes in fit. + + def _validate_estimator(self, default=None): + """Check the base estimator. + + Sets the `estimator_` attributes. + """ + if self.estimator is not None: + self.estimator_ = self.estimator + else: + self.estimator_ = default + + def _make_estimator(self, append=True, random_state=None): + """Make and configure a copy of the `estimator_` attribute. + + Warning: This method should be used to properly instantiate new + sub-estimators. + """ + estimator = clone(self.estimator_) + estimator.set_params(**{p: getattr(self, p) for p in self.estimator_params}) + + if random_state is not None: + _set_random_states(estimator, random_state) + + if append: + self.estimators_.append(estimator) + + return estimator + + def __len__(self): + """Return the number of estimators in the ensemble.""" + return len(self.estimators_) + + def __getitem__(self, index): + """Return the index'th estimator in the ensemble.""" + return self.estimators_[index] + + def __iter__(self): + """Return iterator over estimators in the ensemble.""" + return iter(self.estimators_) + + +def _partition_estimators(n_estimators, n_jobs): + """Private function used to partition estimators between jobs.""" + # Compute the number of jobs + n_jobs = min(effective_n_jobs(n_jobs), n_estimators) + + # Partition estimators between jobs + n_estimators_per_job = np.full(n_jobs, n_estimators // n_jobs, dtype=int) + n_estimators_per_job[: n_estimators % n_jobs] += 1 + starts = np.cumsum(n_estimators_per_job) + + return n_jobs, n_estimators_per_job.tolist(), [0] + starts.tolist() + + +class _BaseHeterogeneousEnsemble( + MetaEstimatorMixin, _BaseComposition, metaclass=ABCMeta +): + """Base class for heterogeneous ensemble of learners. + + Parameters + ---------- + estimators : list of (str, estimator) tuples + The ensemble of estimators to use in the ensemble. Each element of the + list is defined as a tuple of string (i.e. name of the estimator) and + an estimator instance. An estimator can be set to `'drop'` using + `set_params`. + + Attributes + ---------- + estimators_ : list of estimators + The elements of the estimators parameter, having been fitted on the + training data. If an estimator has been set to `'drop'`, it will not + appear in `estimators_`. + """ + + @property + def named_estimators(self): + """Dictionary to access any fitted sub-estimators by name. 
+ + Returns + ------- + :class:`~sklearn.utils.Bunch` + """ + return Bunch(**dict(self.estimators)) + + @abstractmethod + def __init__(self, estimators): + self.estimators = estimators + + def _validate_estimators(self): + if len(self.estimators) == 0 or not all( + isinstance(item, (tuple, list)) and isinstance(item[0], str) + for item in self.estimators + ): + raise ValueError( + "Invalid 'estimators' attribute, 'estimators' should be a " + "non-empty list of (string, estimator) tuples." + ) + names, estimators = zip(*self.estimators) + # defined by MetaEstimatorMixin + self._validate_names(names) + + has_estimator = any(est != "drop" for est in estimators) + if not has_estimator: + raise ValueError( + "All estimators are dropped. At least one is required " + "to be an estimator." + ) + + is_estimator_type = is_classifier if is_classifier(self) else is_regressor + + for est in estimators: + if est != "drop" and not is_estimator_type(est): + raise ValueError( + "The estimator {} should be a {}.".format( + est.__class__.__name__, is_estimator_type.__name__[3:] + ) + ) + + return names, estimators + + def set_params(self, **params): + """ + Set the parameters of an estimator from the ensemble. + + Valid parameter keys can be listed with `get_params()`. Note that you + can directly set the parameters of the estimators contained in + `estimators`. + + Parameters + ---------- + **params : keyword arguments + Specific parameters using e.g. + `set_params(parameter_name=new_value)`. In addition, to setting the + parameters of the estimator, the individual estimator of the + estimators can also be set, or can be removed by setting them to + 'drop'. + + Returns + ------- + self : object + Estimator instance. + """ + super()._set_params("estimators", **params) + return self + + def get_params(self, deep=True): + """ + Get the parameters of an estimator from the ensemble. + + Returns the parameters given in the constructor as well as the + estimators contained within the `estimators` parameter. + + Parameters + ---------- + deep : bool, default=True + Setting it to True gets the various estimators and the parameters + of the estimators as well. + + Returns + ------- + params : dict + Parameter and estimator names mapped to their values or parameter + names mapped to their values. + """ + return super()._get_params("estimators", deep=deep) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + try: + tags.input_tags.allow_nan = all( + get_tags(est[1]).input_tags.allow_nan if est[1] != "drop" else True + for est in self.estimators + ) + tags.input_tags.sparse = all( + get_tags(est[1]).input_tags.sparse if est[1] != "drop" else True + for est in self.estimators + ) + except Exception: + # If `estimators` does not comply with our API (list of tuples) then it will + # fail. In this case, we assume that `allow_nan` and `sparse` are False but + # the parameter validation will raise an error during `fit`. + pass # pragma: no cover + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_forest.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_forest.py new file mode 100644 index 0000000000000000000000000000000000000000..5b27e789b1d137126336f62955bd50895d7fa4f7 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_forest.py @@ -0,0 +1,3045 @@ +""" +Forest of trees-based ensemble methods. + +Those methods include random forests and extremely randomized trees. 
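# Hedged usage example of the heterogeneous-ensemble conventions documented above,
# shown through the public VotingClassifier (a _BaseHeterogeneousEnsemble
# subclass): members are addressed by name and can be removed with 'drop'.
from sklearn.datasets import load_iris
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
vote = VotingClassifier(
    estimators=[
        ("lr", LogisticRegression(max_iter=1000)),
        ("tree", DecisionTreeClassifier(random_state=0)),
    ]
)
vote.set_params(tree="drop")   # drop a member by name via set_params
vote.fit(X, y)
print(len(vote.estimators_))   # 1: the dropped member is not fitted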
+ +The module structure is the following: + +- The ``BaseForest`` base class implements a common ``fit`` method for all + the estimators in the module. The ``fit`` method of the base ``Forest`` + class calls the ``fit`` method of each sub-estimator on random samples + (with replacement, a.k.a. bootstrap) of the training set. + + The init of the sub-estimator is further delegated to the + ``BaseEnsemble`` constructor. + +- The ``ForestClassifier`` and ``ForestRegressor`` base classes further + implement the prediction logic by computing an average of the predicted + outcomes of the sub-estimators. + +- The ``RandomForestClassifier`` and ``RandomForestRegressor`` derived + classes provide the user with concrete implementations of + the forest ensemble method using classical, deterministic + ``DecisionTreeClassifier`` and ``DecisionTreeRegressor`` as + sub-estimator implementations. + +- The ``ExtraTreesClassifier`` and ``ExtraTreesRegressor`` derived + classes provide the user with concrete implementations of the + forest ensemble method using the extremely randomized trees + ``ExtraTreeClassifier`` and ``ExtraTreeRegressor`` as + sub-estimator implementations. + +Single and multi-output problems are both handled. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import threading +from abc import ABCMeta, abstractmethod +from numbers import Integral, Real +from warnings import catch_warnings, simplefilter, warn + +import numpy as np +from scipy.sparse import hstack as sparse_hstack +from scipy.sparse import issparse + +from ..base import ( + ClassifierMixin, + MultiOutputMixin, + RegressorMixin, + TransformerMixin, + _fit_context, + is_classifier, +) +from ..exceptions import DataConversionWarning +from ..metrics import accuracy_score, r2_score +from ..preprocessing import OneHotEncoder +from ..tree import ( + BaseDecisionTree, + DecisionTreeClassifier, + DecisionTreeRegressor, + ExtraTreeClassifier, + ExtraTreeRegressor, +) +from ..tree._tree import DOUBLE, DTYPE +from ..utils import check_random_state, compute_sample_weight +from ..utils._param_validation import Interval, RealNotInt, StrOptions +from ..utils._tags import get_tags +from ..utils.multiclass import check_classification_targets, type_of_target +from ..utils.parallel import Parallel, delayed +from ..utils.validation import ( + _check_feature_names_in, + _check_sample_weight, + _num_samples, + check_is_fitted, + validate_data, +) +from ._base import BaseEnsemble, _partition_estimators + +__all__ = [ + "ExtraTreesClassifier", + "ExtraTreesRegressor", + "RandomForestClassifier", + "RandomForestRegressor", + "RandomTreesEmbedding", +] + +MAX_INT = np.iinfo(np.int32).max + + +def _get_n_samples_bootstrap(n_samples, max_samples): + """ + Get the number of samples in a bootstrap sample. + + Parameters + ---------- + n_samples : int + Number of samples in the dataset. + max_samples : int or float + The maximum number of samples to draw from the total available: + - if float, this indicates a fraction of the total and should be + the interval `(0.0, 1.0]`; + - if int, this indicates the exact number of samples; + - if None, this indicates the total number of samples. + + Returns + ------- + n_samples_bootstrap : int + The total number of samples to draw for the bootstrap sample. 
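# Sketch (illustrative name, not the private helper itself) of how `max_samples`
# is resolved into a bootstrap size, following the rules documented above:
# None -> all rows, int -> exact count, float -> fraction of the training set.
def resolve_n_samples_bootstrap(n_samples, max_samples):
    if max_samples is None:
        return n_samples
    if isinstance(max_samples, int):
        return max_samples                           # must not exceed n_samples
    return max(round(n_samples * max_samples), 1)    # float in (0.0, 1.0]

print(resolve_n_samples_bootstrap(1000, None))  # 1000
print(resolve_n_samples_bootstrap(1000, 256))   # 256
print(resolve_n_samples_bootstrap(1000, 0.1))   # 100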
+ """ + if max_samples is None: + return n_samples + + if isinstance(max_samples, Integral): + if max_samples > n_samples: + msg = "`max_samples` must be <= n_samples={} but got value {}" + raise ValueError(msg.format(n_samples, max_samples)) + return max_samples + + if isinstance(max_samples, Real): + return max(round(n_samples * max_samples), 1) + + +def _generate_sample_indices(random_state, n_samples, n_samples_bootstrap): + """ + Private function used to _parallel_build_trees function.""" + + random_instance = check_random_state(random_state) + sample_indices = random_instance.randint( + 0, n_samples, n_samples_bootstrap, dtype=np.int32 + ) + + return sample_indices + + +def _generate_unsampled_indices(random_state, n_samples, n_samples_bootstrap): + """ + Private function used to forest._set_oob_score function.""" + sample_indices = _generate_sample_indices( + random_state, n_samples, n_samples_bootstrap + ) + sample_counts = np.bincount(sample_indices, minlength=n_samples) + unsampled_mask = sample_counts == 0 + indices_range = np.arange(n_samples) + unsampled_indices = indices_range[unsampled_mask] + + return unsampled_indices + + +def _parallel_build_trees( + tree, + bootstrap, + X, + y, + sample_weight, + tree_idx, + n_trees, + verbose=0, + class_weight=None, + n_samples_bootstrap=None, + missing_values_in_feature_mask=None, +): + """ + Private function used to fit a single tree in parallel.""" + if verbose > 1: + print("building tree %d of %d" % (tree_idx + 1, n_trees)) + + if bootstrap: + n_samples = X.shape[0] + if sample_weight is None: + curr_sample_weight = np.ones((n_samples,), dtype=np.float64) + else: + curr_sample_weight = sample_weight.copy() + + indices = _generate_sample_indices( + tree.random_state, n_samples, n_samples_bootstrap + ) + sample_counts = np.bincount(indices, minlength=n_samples) + curr_sample_weight *= sample_counts + + if class_weight == "subsample": + with catch_warnings(): + simplefilter("ignore", DeprecationWarning) + curr_sample_weight *= compute_sample_weight("auto", y, indices=indices) + elif class_weight == "balanced_subsample": + curr_sample_weight *= compute_sample_weight("balanced", y, indices=indices) + + tree._fit( + X, + y, + sample_weight=curr_sample_weight, + check_input=False, + missing_values_in_feature_mask=missing_values_in_feature_mask, + ) + else: + tree._fit( + X, + y, + sample_weight=sample_weight, + check_input=False, + missing_values_in_feature_mask=missing_values_in_feature_mask, + ) + + return tree + + +class BaseForest(MultiOutputMixin, BaseEnsemble, metaclass=ABCMeta): + """ + Base class for forests of trees. + + Warning: This class should not be used directly. Use derived classes + instead. 
+ """ + + _parameter_constraints: dict = { + "n_estimators": [Interval(Integral, 1, None, closed="left")], + "bootstrap": ["boolean"], + "oob_score": ["boolean", callable], + "n_jobs": [Integral, None], + "random_state": ["random_state"], + "verbose": ["verbose"], + "warm_start": ["boolean"], + "max_samples": [ + None, + Interval(RealNotInt, 0.0, 1.0, closed="right"), + Interval(Integral, 1, None, closed="left"), + ], + } + + @abstractmethod + def __init__( + self, + estimator, + n_estimators=100, + *, + estimator_params=tuple(), + bootstrap=False, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + class_weight=None, + max_samples=None, + ): + super().__init__( + estimator=estimator, + n_estimators=n_estimators, + estimator_params=estimator_params, + ) + + self.bootstrap = bootstrap + self.oob_score = oob_score + self.n_jobs = n_jobs + self.random_state = random_state + self.verbose = verbose + self.warm_start = warm_start + self.class_weight = class_weight + self.max_samples = max_samples + + def apply(self, X): + """ + Apply trees in the forest to X, return leaf indices. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, its dtype will be converted to + ``dtype=np.float32``. If a sparse matrix is provided, it will be + converted into a sparse ``csr_matrix``. + + Returns + ------- + X_leaves : ndarray of shape (n_samples, n_estimators) + For each datapoint x in X and for each tree in the forest, + return the index of the leaf x ends up in. + """ + X = self._validate_X_predict(X) + results = Parallel( + n_jobs=self.n_jobs, + verbose=self.verbose, + prefer="threads", + )(delayed(tree.apply)(X, check_input=False) for tree in self.estimators_) + + return np.array(results).T + + def decision_path(self, X): + """ + Return the decision path in the forest. + + .. versionadded:: 0.18 + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, its dtype will be converted to + ``dtype=np.float32``. If a sparse matrix is provided, it will be + converted into a sparse ``csr_matrix``. + + Returns + ------- + indicator : sparse matrix of shape (n_samples, n_nodes) + Return a node indicator matrix where non zero elements indicates + that the samples goes through the nodes. The matrix is of CSR + format. + + n_nodes_ptr : ndarray of shape (n_estimators + 1,) + The columns from indicator[n_nodes_ptr[i]:n_nodes_ptr[i+1]] + gives the indicator value for the i-th estimator. + """ + X = self._validate_X_predict(X) + indicators = Parallel( + n_jobs=self.n_jobs, + verbose=self.verbose, + prefer="threads", + )( + delayed(tree.decision_path)(X, check_input=False) + for tree in self.estimators_ + ) + + n_nodes = [0] + n_nodes.extend([i.shape[1] for i in indicators]) + n_nodes_ptr = np.array(n_nodes).cumsum() + + return sparse_hstack(indicators).tocsr(), n_nodes_ptr + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, sample_weight=None): + """ + Build a forest of trees from the training set (X, y). + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Internally, its dtype will be converted + to ``dtype=np.float32``. If a sparse matrix is provided, it will be + converted into a sparse ``csc_matrix``. 
+ + y : array-like of shape (n_samples,) or (n_samples, n_outputs) + The target values (class labels in classification, real numbers in + regression). + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. Splits + that would create child nodes with net zero or negative weight are + ignored while searching for a split in each node. In the case of + classification, splits are also ignored if they would result in any + single class carrying a negative weight in either child node. + + Returns + ------- + self : object + Fitted estimator. + """ + # Validate or convert input data + if issparse(y): + raise ValueError("sparse multilabel-indicator for y is not supported.") + + X, y = validate_data( + self, + X, + y, + multi_output=True, + accept_sparse="csc", + dtype=DTYPE, + ensure_all_finite=False, + ) + # _compute_missing_values_in_feature_mask checks if X has missing values and + # will raise an error if the underlying tree base estimator can't handle missing + # values. Only the criterion is required to determine if the tree supports + # missing values. + estimator = type(self.estimator)(criterion=self.criterion) + missing_values_in_feature_mask = ( + estimator._compute_missing_values_in_feature_mask( + X, estimator_name=self.__class__.__name__ + ) + ) + + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X) + + if issparse(X): + # Pre-sort indices to avoid that each individual tree of the + # ensemble sorts the indices. + X.sort_indices() + + y = np.atleast_1d(y) + if y.ndim == 2 and y.shape[1] == 1: + warn( + ( + "A column-vector y was passed when a 1d array was" + " expected. Please change the shape of y to " + "(n_samples,), for example using ravel()." + ), + DataConversionWarning, + stacklevel=2, + ) + + if y.ndim == 1: + # reshape is necessary to preserve the data contiguity against vs + # [:, np.newaxis] that does not. + y = np.reshape(y, (-1, 1)) + + if self.criterion == "poisson": + if np.any(y < 0): + raise ValueError( + "Some value(s) of y are negative which is " + "not allowed for Poisson regression." + ) + if np.sum(y) <= 0: + raise ValueError( + "Sum of y is not strictly positive which " + "is necessary for Poisson regression." + ) + + self._n_samples, self.n_outputs_ = y.shape + + y, expanded_class_weight = self._validate_y_class_weight(y) + + if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: + y = np.ascontiguousarray(y, dtype=DOUBLE) + + if expanded_class_weight is not None: + if sample_weight is not None: + sample_weight = sample_weight * expanded_class_weight + else: + sample_weight = expanded_class_weight + + if not self.bootstrap and self.max_samples is not None: + raise ValueError( + "`max_sample` cannot be set if `bootstrap=False`. " + "Either switch to `bootstrap=True` or set " + "`max_sample=None`." 
+ ) + elif self.bootstrap: + n_samples_bootstrap = _get_n_samples_bootstrap( + n_samples=X.shape[0], max_samples=self.max_samples + ) + else: + n_samples_bootstrap = None + + self._n_samples_bootstrap = n_samples_bootstrap + + self._validate_estimator() + + if not self.bootstrap and self.oob_score: + raise ValueError("Out of bag estimation only available if bootstrap=True") + + random_state = check_random_state(self.random_state) + + if not self.warm_start or not hasattr(self, "estimators_"): + # Free allocated memory, if any + self.estimators_ = [] + + n_more_estimators = self.n_estimators - len(self.estimators_) + + if n_more_estimators < 0: + raise ValueError( + "n_estimators=%d must be larger or equal to " + "len(estimators_)=%d when warm_start==True" + % (self.n_estimators, len(self.estimators_)) + ) + + elif n_more_estimators == 0: + warn( + "Warm-start fitting without increasing n_estimators does not " + "fit new trees." + ) + else: + if self.warm_start and len(self.estimators_) > 0: + # We draw from the random state to get the random state we + # would have got if we hadn't used a warm_start. + random_state.randint(MAX_INT, size=len(self.estimators_)) + + trees = [ + self._make_estimator(append=False, random_state=random_state) + for i in range(n_more_estimators) + ] + + # Parallel loop: we prefer the threading backend as the Cython code + # for fitting the trees is internally releasing the Python GIL + # making threading more efficient than multiprocessing in + # that case. However, for joblib 0.12+ we respect any + # parallel_backend contexts set at a higher level, + # since correctness does not rely on using threads. + trees = Parallel( + n_jobs=self.n_jobs, + verbose=self.verbose, + prefer="threads", + )( + delayed(_parallel_build_trees)( + t, + self.bootstrap, + X, + y, + sample_weight, + i, + len(trees), + verbose=self.verbose, + class_weight=self.class_weight, + n_samples_bootstrap=n_samples_bootstrap, + missing_values_in_feature_mask=missing_values_in_feature_mask, + ) + for i, t in enumerate(trees) + ) + + # Collect newly grown trees + self.estimators_.extend(trees) + + if self.oob_score and ( + n_more_estimators > 0 or not hasattr(self, "oob_score_") + ): + y_type = type_of_target(y) + if y_type == "unknown" or ( + is_classifier(self) and y_type == "multiclass-multioutput" + ): + # FIXME: we could consider to support multiclass-multioutput if + # we introduce or reuse a constructor parameter (e.g. + # oob_score) allowing our user to pass a callable defining the + # scoring strategy on OOB sample. + raise ValueError( + "The type of target cannot be used to compute OOB " + f"estimates. Got {y_type} while only the following are " + "supported: continuous, continuous-multioutput, binary, " + "multiclass, multilabel-indicator." + ) + + if callable(self.oob_score): + self._set_oob_score_and_attributes( + X, y, scoring_function=self.oob_score + ) + else: + self._set_oob_score_and_attributes(X, y) + + # Decapsulate classes_ attributes + if hasattr(self, "classes_") and self.n_outputs_ == 1: + self.n_classes_ = self.n_classes_[0] + self.classes_ = self.classes_[0] + + return self + + @abstractmethod + def _set_oob_score_and_attributes(self, X, y, scoring_function=None): + """Compute and set the OOB score and attributes. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data matrix. + y : ndarray of shape (n_samples, n_outputs) + The target matrix. + scoring_function : callable, default=None + Scoring function for OOB score. 
Default depends on whether + this is a regression (R2 score) or classification problem + (accuracy score). + """ + + def _compute_oob_predictions(self, X, y): + """Compute and set the OOB score. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data matrix. + y : ndarray of shape (n_samples, n_outputs) + The target matrix. + + Returns + ------- + oob_pred : ndarray of shape (n_samples, n_classes, n_outputs) or \ + (n_samples, 1, n_outputs) + The OOB predictions. + """ + # Prediction requires X to be in CSR format + if issparse(X): + X = X.tocsr() + + n_samples = y.shape[0] + n_outputs = self.n_outputs_ + if is_classifier(self) and hasattr(self, "n_classes_"): + # n_classes_ is a ndarray at this stage + # all the supported type of target will have the same number of + # classes in all outputs + oob_pred_shape = (n_samples, self.n_classes_[0], n_outputs) + else: + # for regression, n_classes_ does not exist and we create an empty + # axis to be consistent with the classification case and make + # the array operations compatible with the 2 settings + oob_pred_shape = (n_samples, 1, n_outputs) + + oob_pred = np.zeros(shape=oob_pred_shape, dtype=np.float64) + n_oob_pred = np.zeros((n_samples, n_outputs), dtype=np.int64) + + n_samples_bootstrap = _get_n_samples_bootstrap( + n_samples, + self.max_samples, + ) + for estimator in self.estimators_: + unsampled_indices = _generate_unsampled_indices( + estimator.random_state, + n_samples, + n_samples_bootstrap, + ) + + y_pred = self._get_oob_predictions(estimator, X[unsampled_indices, :]) + oob_pred[unsampled_indices, ...] += y_pred + n_oob_pred[unsampled_indices, :] += 1 + + for k in range(n_outputs): + if (n_oob_pred == 0).any(): + warn( + ( + "Some inputs do not have OOB scores. This probably means " + "too few trees were used to compute any reliable OOB " + "estimates." + ), + UserWarning, + ) + n_oob_pred[n_oob_pred == 0] = 1 + oob_pred[..., k] /= n_oob_pred[..., [k]] + + return oob_pred + + def _validate_y_class_weight(self, y): + # Default implementation + return y, None + + def _validate_X_predict(self, X): + """ + Validate X whenever one tries to predict, apply, predict_proba.""" + check_is_fitted(self) + if self.estimators_[0]._support_missing_values(X): + ensure_all_finite = "allow-nan" + else: + ensure_all_finite = True + + X = validate_data( + self, + X, + dtype=DTYPE, + accept_sparse="csr", + reset=False, + ensure_all_finite=ensure_all_finite, + ) + if issparse(X) and (X.indices.dtype != np.intc or X.indptr.dtype != np.intc): + raise ValueError("No support for np.int64 index based sparse matrices") + return X + + @property + def feature_importances_(self): + """ + The impurity-based feature importances. + + The higher, the more important the feature. + The importance of a feature is computed as the (normalized) + total reduction of the criterion brought by that feature. It is also + known as the Gini importance. + + Warning: impurity-based feature importances can be misleading for + high cardinality features (many unique values). See + :func:`sklearn.inspection.permutation_importance` as an alternative. + + Returns + ------- + feature_importances_ : ndarray of shape (n_features,) + The values of this array sum to 1, unless all trees are single node + trees consisting of only the root node, in which case it will be an + array of zeros. 
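# Quick illustration of the OOB machinery described above: with bootstrap=True and
# oob_score=True the forest scores itself on the rows each tree did not see.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=300, n_features=8, random_state=0)
forest = RandomForestClassifier(
    n_estimators=100, bootstrap=True, oob_score=True, random_state=0
).fit(X, y)
print(forest.oob_score_)                     # accuracy on out-of-bag predictions
print(forest.oob_decision_function_.shape)   # (300, 2)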
+ """ + check_is_fitted(self) + + all_importances = Parallel(n_jobs=self.n_jobs, prefer="threads")( + delayed(getattr)(tree, "feature_importances_") + for tree in self.estimators_ + if tree.tree_.node_count > 1 + ) + + if not all_importances: + return np.zeros(self.n_features_in_, dtype=np.float64) + + all_importances = np.mean(all_importances, axis=0, dtype=np.float64) + return all_importances / np.sum(all_importances) + + def _get_estimators_indices(self): + # Get drawn indices along both sample and feature axes + for tree in self.estimators_: + if not self.bootstrap: + yield np.arange(self._n_samples, dtype=np.int32) + else: + # tree.random_state is actually an immutable integer seed rather + # than a mutable RandomState instance, so it's safe to use it + # repeatedly when calling this property. + seed = tree.random_state + # Operations accessing random_state must be performed identically + # to those in `_parallel_build_trees()` + yield _generate_sample_indices( + seed, self._n_samples, self._n_samples_bootstrap + ) + + @property + def estimators_samples_(self): + """The subset of drawn samples for each base estimator. + + Returns a dynamically generated list of indices identifying + the samples used for fitting each member of the ensemble, i.e., + the in-bag samples. + + Note: the list is re-created at each call to the property in order + to reduce the object memory footprint by not storing the sampling + data. Thus fetching the property may be slower than expected. + """ + return [sample_indices for sample_indices in self._get_estimators_indices()] + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + # Only the criterion is required to determine if the tree supports + # missing values + estimator = type(self.estimator)(criterion=self.criterion) + tags.input_tags.allow_nan = get_tags(estimator).input_tags.allow_nan + return tags + + +def _accumulate_prediction(predict, X, out, lock): + """ + This is a utility function for joblib's Parallel. + + It can't go locally in ForestClassifier or ForestRegressor, because joblib + complains that it cannot pickle it when placed there. + """ + prediction = predict(X, check_input=False) + with lock: + if len(out) == 1: + out[0] += prediction + else: + for i in range(len(out)): + out[i] += prediction[i] + + +class ForestClassifier(ClassifierMixin, BaseForest, metaclass=ABCMeta): + """ + Base class for forest of trees-based classifiers. + + Warning: This class should not be used directly. Use derived classes + instead. + """ + + @abstractmethod + def __init__( + self, + estimator, + n_estimators=100, + *, + estimator_params=tuple(), + bootstrap=False, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + class_weight=None, + max_samples=None, + ): + super().__init__( + estimator=estimator, + n_estimators=n_estimators, + estimator_params=estimator_params, + bootstrap=bootstrap, + oob_score=oob_score, + n_jobs=n_jobs, + random_state=random_state, + verbose=verbose, + warm_start=warm_start, + class_weight=class_weight, + max_samples=max_samples, + ) + + @staticmethod + def _get_oob_predictions(tree, X): + """Compute the OOB predictions for an individual tree. + + Parameters + ---------- + tree : DecisionTreeClassifier object + A single decision tree classifier. + X : ndarray of shape (n_samples, n_features) + The OOB samples. + + Returns + ------- + y_pred : ndarray of shape (n_samples, n_classes, n_outputs) + The OOB associated predictions. 
+ """ + y_pred = tree.predict_proba(X, check_input=False) + y_pred = np.asarray(y_pred) + if y_pred.ndim == 2: + # binary and multiclass + y_pred = y_pred[..., np.newaxis] + else: + # Roll the first `n_outputs` axis to the last axis. We will reshape + # from a shape of (n_outputs, n_samples, n_classes) to a shape of + # (n_samples, n_classes, n_outputs). + y_pred = np.rollaxis(y_pred, axis=0, start=3) + return y_pred + + def _set_oob_score_and_attributes(self, X, y, scoring_function=None): + """Compute and set the OOB score and attributes. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data matrix. + y : ndarray of shape (n_samples, n_outputs) + The target matrix. + scoring_function : callable, default=None + Scoring function for OOB score. Defaults to `accuracy_score`. + """ + self.oob_decision_function_ = super()._compute_oob_predictions(X, y) + if self.oob_decision_function_.shape[-1] == 1: + # drop the n_outputs axis if there is a single output + self.oob_decision_function_ = self.oob_decision_function_.squeeze(axis=-1) + + if scoring_function is None: + scoring_function = accuracy_score + + self.oob_score_ = scoring_function( + y, np.argmax(self.oob_decision_function_, axis=1) + ) + + def _validate_y_class_weight(self, y): + check_classification_targets(y) + + y = np.copy(y) + expanded_class_weight = None + + if self.class_weight is not None: + y_original = np.copy(y) + + self.classes_ = [] + self.n_classes_ = [] + + y_store_unique_indices = np.zeros(y.shape, dtype=int) + for k in range(self.n_outputs_): + classes_k, y_store_unique_indices[:, k] = np.unique( + y[:, k], return_inverse=True + ) + self.classes_.append(classes_k) + self.n_classes_.append(classes_k.shape[0]) + y = y_store_unique_indices + + if self.class_weight is not None: + valid_presets = ("balanced", "balanced_subsample") + if isinstance(self.class_weight, str): + if self.class_weight not in valid_presets: + raise ValueError( + "Valid presets for class_weight include " + '"balanced" and "balanced_subsample".' + 'Given "%s".' % self.class_weight + ) + if self.warm_start: + warn( + 'class_weight presets "balanced" or ' + '"balanced_subsample" are ' + "not recommended for warm_start if the fitted data " + "differs from the full dataset. In order to use " + '"balanced" weights, use compute_class_weight ' + '("balanced", classes, y). In place of y you can use ' + "a large enough sample of the full training set " + "target to properly estimate the class frequency " + "distributions. Pass the resulting weights as the " + "class_weight parameter." + ) + + if self.class_weight != "balanced_subsample" or not self.bootstrap: + if self.class_weight == "balanced_subsample": + class_weight = "balanced" + else: + class_weight = self.class_weight + expanded_class_weight = compute_sample_weight(class_weight, y_original) + + return y, expanded_class_weight + + def predict(self, X): + """ + Predict class for X. + + The predicted class of an input sample is a vote by the trees in + the forest, weighted by their probability estimates. That is, + the predicted class is the one with highest mean probability + estimate across the trees. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, its dtype will be converted to + ``dtype=np.float32``. If a sparse matrix is provided, it will be + converted into a sparse ``csr_matrix``. + + Returns + ------- + y : ndarray of shape (n_samples,) or (n_samples, n_outputs) + The predicted classes. 
+ """ + proba = self.predict_proba(X) + + if self.n_outputs_ == 1: + return self.classes_.take(np.argmax(proba, axis=1), axis=0) + + else: + n_samples = proba[0].shape[0] + # all dtypes should be the same, so just take the first + class_type = self.classes_[0].dtype + predictions = np.empty((n_samples, self.n_outputs_), dtype=class_type) + + for k in range(self.n_outputs_): + predictions[:, k] = self.classes_[k].take( + np.argmax(proba[k], axis=1), axis=0 + ) + + return predictions + + def predict_proba(self, X): + """ + Predict class probabilities for X. + + The predicted class probabilities of an input sample are computed as + the mean predicted class probabilities of the trees in the forest. + The class probability of a single tree is the fraction of samples of + the same class in a leaf. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, its dtype will be converted to + ``dtype=np.float32``. If a sparse matrix is provided, it will be + converted into a sparse ``csr_matrix``. + + Returns + ------- + p : ndarray of shape (n_samples, n_classes), or a list of such arrays + The class probabilities of the input samples. The order of the + classes corresponds to that in the attribute :term:`classes_`. + """ + check_is_fitted(self) + # Check data + X = self._validate_X_predict(X) + + # Assign chunk of trees to jobs + n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) + + # avoid storing the output of every estimator by summing them here + all_proba = [ + np.zeros((X.shape[0], j), dtype=np.float64) + for j in np.atleast_1d(self.n_classes_) + ] + lock = threading.Lock() + Parallel(n_jobs=n_jobs, verbose=self.verbose, require="sharedmem")( + delayed(_accumulate_prediction)(e.predict_proba, X, all_proba, lock) + for e in self.estimators_ + ) + + for proba in all_proba: + proba /= len(self.estimators_) + + if len(all_proba) == 1: + return all_proba[0] + else: + return all_proba + + def predict_log_proba(self, X): + """ + Predict class log-probabilities for X. + + The predicted class log-probabilities of an input sample is computed as + the log of the mean predicted class probabilities of the trees in the + forest. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, its dtype will be converted to + ``dtype=np.float32``. If a sparse matrix is provided, it will be + converted into a sparse ``csr_matrix``. + + Returns + ------- + p : ndarray of shape (n_samples, n_classes), or a list of such arrays + The class probabilities of the input samples. The order of the + classes corresponds to that in the attribute :term:`classes_`. + """ + proba = self.predict_proba(X) + + if self.n_outputs_ == 1: + return np.log(proba) + + else: + for k in range(self.n_outputs_): + proba[k] = np.log(proba[k]) + + return proba + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.classifier_tags.multi_label = True + tags.input_tags.sparse = True + return tags + + +class ForestRegressor(RegressorMixin, BaseForest, metaclass=ABCMeta): + """ + Base class for forest of trees-based regressors. + + Warning: This class should not be used directly. Use derived classes + instead. 
+ """ + + @abstractmethod + def __init__( + self, + estimator, + n_estimators=100, + *, + estimator_params=tuple(), + bootstrap=False, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + max_samples=None, + ): + super().__init__( + estimator, + n_estimators=n_estimators, + estimator_params=estimator_params, + bootstrap=bootstrap, + oob_score=oob_score, + n_jobs=n_jobs, + random_state=random_state, + verbose=verbose, + warm_start=warm_start, + max_samples=max_samples, + ) + + def predict(self, X): + """ + Predict regression target for X. + + The predicted regression target of an input sample is computed as the + mean predicted regression targets of the trees in the forest. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, its dtype will be converted to + ``dtype=np.float32``. If a sparse matrix is provided, it will be + converted into a sparse ``csr_matrix``. + + Returns + ------- + y : ndarray of shape (n_samples,) or (n_samples, n_outputs) + The predicted values. + """ + check_is_fitted(self) + # Check data + X = self._validate_X_predict(X) + + # Assign chunk of trees to jobs + n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) + + # avoid storing the output of every estimator by summing them here + if self.n_outputs_ > 1: + y_hat = np.zeros((X.shape[0], self.n_outputs_), dtype=np.float64) + else: + y_hat = np.zeros((X.shape[0]), dtype=np.float64) + + # Parallel loop + lock = threading.Lock() + Parallel(n_jobs=n_jobs, verbose=self.verbose, require="sharedmem")( + delayed(_accumulate_prediction)(e.predict, X, [y_hat], lock) + for e in self.estimators_ + ) + + y_hat /= len(self.estimators_) + + return y_hat + + @staticmethod + def _get_oob_predictions(tree, X): + """Compute the OOB predictions for an individual tree. + + Parameters + ---------- + tree : DecisionTreeRegressor object + A single decision tree regressor. + X : ndarray of shape (n_samples, n_features) + The OOB samples. + + Returns + ------- + y_pred : ndarray of shape (n_samples, 1, n_outputs) + The OOB associated predictions. + """ + y_pred = tree.predict(X, check_input=False) + if y_pred.ndim == 1: + # single output regression + y_pred = y_pred[:, np.newaxis, np.newaxis] + else: + # multioutput regression + y_pred = y_pred[:, np.newaxis, :] + return y_pred + + def _set_oob_score_and_attributes(self, X, y, scoring_function=None): + """Compute and set the OOB score and attributes. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data matrix. + y : ndarray of shape (n_samples, n_outputs) + The target matrix. + scoring_function : callable, default=None + Scoring function for OOB score. Defaults to `r2_score`. + """ + self.oob_prediction_ = super()._compute_oob_predictions(X, y).squeeze(axis=1) + if self.oob_prediction_.shape[-1] == 1: + # drop the n_outputs axis if there is a single output + self.oob_prediction_ = self.oob_prediction_.squeeze(axis=-1) + + if scoring_function is None: + scoring_function = r2_score + + self.oob_score_ = scoring_function(y, self.oob_prediction_) + + def _compute_partial_dependence_recursion(self, grid, target_features): + """Fast partial dependence computation. + + Parameters + ---------- + grid : ndarray of shape (n_samples, n_target_features), dtype=DTYPE + The grid points on which the partial dependence should be + evaluated. 
+ target_features : ndarray of shape (n_target_features), dtype=np.intp + The set of target features for which the partial dependence + should be evaluated. + + Returns + ------- + averaged_predictions : ndarray of shape (n_samples,) + The value of the partial dependence function on each grid point. + """ + grid = np.asarray(grid, dtype=DTYPE, order="C") + target_features = np.asarray(target_features, dtype=np.intp, order="C") + averaged_predictions = np.zeros( + shape=grid.shape[0], dtype=np.float64, order="C" + ) + + for tree in self.estimators_: + # Note: we don't sum in parallel because the GIL isn't released in + # the fast method. + tree.tree_.compute_partial_dependence( + grid, target_features, averaged_predictions + ) + # Average over the forest + averaged_predictions /= len(self.estimators_) + + return averaged_predictions + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + + +class RandomForestClassifier(ForestClassifier): + """ + A random forest classifier. + + A random forest is a meta estimator that fits a number of decision tree + classifiers on various sub-samples of the dataset and uses averaging to + improve the predictive accuracy and control over-fitting. + Trees in the forest use the best split strategy, i.e. equivalent to passing + `splitter="best"` to the underlying :class:`~sklearn.tree.DecisionTreeClassifier`. + The sub-sample size is controlled with the `max_samples` parameter if + `bootstrap=True` (default), otherwise the whole dataset is used to build + each tree. + + For a comparison between tree-based ensemble models see the example + :ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py`. + + This estimator has native support for missing values (NaNs). During training, + the tree grower learns at each split point whether samples with missing values + should go to the left or right child, based on the potential gain. When predicting, + samples with missing values are assigned to the left or right child consequently. + If no missing values were encountered for a given feature during training, then + samples with missing values are mapped to whichever child has the most samples. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_estimators : int, default=100 + The number of trees in the forest. + + .. versionchanged:: 0.22 + The default value of ``n_estimators`` changed from 10 to 100 + in 0.22. + + criterion : {"gini", "entropy", "log_loss"}, default="gini" + The function to measure the quality of a split. Supported criteria are + "gini" for the Gini impurity and "log_loss" and "entropy" both for the + Shannon information gain, see :ref:`tree_mathematical_formulation`. + Note: This parameter is tree-specific. + + max_depth : int, default=None + The maximum depth of the tree. If None, then nodes are expanded until + all leaves are pure or until all leaves contain less than + min_samples_split samples. + + min_samples_split : int or float, default=2 + The minimum number of samples required to split an internal node: + + - If int, then consider `min_samples_split` as the minimum number. + - If float, then `min_samples_split` is a fraction and + `ceil(min_samples_split * n_samples)` are the minimum + number of samples for each split. + + .. versionchanged:: 0.18 + Added float values for fractions. + + min_samples_leaf : int or float, default=1 + The minimum number of samples required to be at a leaf node. 
+ A split point at any depth will only be considered if it leaves at + least ``min_samples_leaf`` training samples in each of the left and + right branches. This may have the effect of smoothing the model, + especially in regression. + + - If int, then consider `min_samples_leaf` as the minimum number. + - If float, then `min_samples_leaf` is a fraction and + `ceil(min_samples_leaf * n_samples)` are the minimum + number of samples for each node. + + .. versionchanged:: 0.18 + Added float values for fractions. + + min_weight_fraction_leaf : float, default=0.0 + The minimum weighted fraction of the sum total of weights (of all + the input samples) required to be at a leaf node. Samples have + equal weight when sample_weight is not provided. + + max_features : {"sqrt", "log2", None}, int or float, default="sqrt" + The number of features to consider when looking for the best split: + + - If int, then consider `max_features` features at each split. + - If float, then `max_features` is a fraction and + `max(1, int(max_features * n_features_in_))` features are considered at each + split. + - If "sqrt", then `max_features=sqrt(n_features)`. + - If "log2", then `max_features=log2(n_features)`. + - If None, then `max_features=n_features`. + + .. versionchanged:: 1.1 + The default of `max_features` changed from `"auto"` to `"sqrt"`. + + Note: the search for a split does not stop until at least one + valid partition of the node samples is found, even if it requires to + effectively inspect more than ``max_features`` features. + + max_leaf_nodes : int, default=None + Grow trees with ``max_leaf_nodes`` in best-first fashion. + Best nodes are defined as relative reduction in impurity. + If None then unlimited number of leaf nodes. + + min_impurity_decrease : float, default=0.0 + A node will be split if this split induces a decrease of the impurity + greater than or equal to this value. + + The weighted impurity decrease equation is the following:: + + N_t / N * (impurity - N_t_R / N_t * right_impurity + - N_t_L / N_t * left_impurity) + + where ``N`` is the total number of samples, ``N_t`` is the number of + samples at the current node, ``N_t_L`` is the number of samples in the + left child, and ``N_t_R`` is the number of samples in the right child. + + ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, + if ``sample_weight`` is passed. + + .. versionadded:: 0.19 + + bootstrap : bool, default=True + Whether bootstrap samples are used when building trees. If False, the + whole dataset is used to build each tree. + + oob_score : bool or callable, default=False + Whether to use out-of-bag samples to estimate the generalization score. + By default, :func:`~sklearn.metrics.accuracy_score` is used. + Provide a callable with signature `metric(y_true, y_pred)` to use a + custom metric. Only available if `bootstrap=True`. + + For an illustration of out-of-bag (OOB) error estimation, see the example + :ref:`sphx_glr_auto_examples_ensemble_plot_ensemble_oob.py`. + + n_jobs : int, default=None + The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`, + :meth:`decision_path` and :meth:`apply` are all parallelized over the + trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` + context. ``-1`` means using all processors. See :term:`Glossary + ` for more details. 
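# Worked instance of the weighted impurity-decrease formula quoted above, for a
# node holding 100 of N=400 samples that splits into children of 60 and 40 samples
# (the numbers here are made up for illustration).
N, N_t, N_t_L, N_t_R = 400, 100, 60, 40
impurity, left_impurity, right_impurity = 0.48, 0.30, 0.20

decrease = N_t / N * (
    impurity - N_t_R / N_t * right_impurity - N_t_L / N_t * left_impurity
)
print(round(decrease, 4))  # 0.055 -> the split is kept if min_impurity_decrease <= 0.055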
+ + random_state : int, RandomState instance or None, default=None + Controls both the randomness of the bootstrapping of the samples used + when building trees (if ``bootstrap=True``) and the sampling of the + features to consider when looking for the best split at each node + (if ``max_features < n_features``). + See :term:`Glossary ` for details. + + verbose : int, default=0 + Controls the verbosity when fitting and predicting. + + warm_start : bool, default=False + When set to ``True``, reuse the solution of the previous call to fit + and add more estimators to the ensemble, otherwise, just fit a whole + new forest. See :term:`Glossary ` and + :ref:`tree_ensemble_warm_start` for details. + + class_weight : {"balanced", "balanced_subsample"}, dict or list of dicts, \ + default=None + Weights associated with classes in the form ``{class_label: weight}``. + If not given, all classes are supposed to have weight one. For + multi-output problems, a list of dicts can be provided in the same + order as the columns of y. + + Note that for multioutput (including multilabel) weights should be + defined for each class of every column in its own dict. For example, + for four-class multilabel classification weights should be + [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of + [{1:1}, {2:5}, {3:1}, {4:1}]. + + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))`` + + The "balanced_subsample" mode is the same as "balanced" except that + weights are computed based on the bootstrap sample for every tree + grown. + + For multi-output, the weights of each column of y will be multiplied. + + Note that these weights will be multiplied with sample_weight (passed + through the fit method) if sample_weight is specified. + + ccp_alpha : non-negative float, default=0.0 + Complexity parameter used for Minimal Cost-Complexity Pruning. The + subtree with the largest cost complexity that is smaller than + ``ccp_alpha`` will be chosen. By default, no pruning is performed. See + :ref:`minimal_cost_complexity_pruning` for details. See + :ref:`sphx_glr_auto_examples_tree_plot_cost_complexity_pruning.py` + for an example of such pruning. + + .. versionadded:: 0.22 + + max_samples : int or float, default=None + If bootstrap is True, the number of samples to draw from X + to train each base estimator. + + - If None (default), then draw `X.shape[0]` samples. + - If int, then draw `max_samples` samples. + - If float, then draw `max(round(n_samples * max_samples), 1)` samples. Thus, + `max_samples` should be in the interval `(0.0, 1.0]`. + + .. versionadded:: 0.22 + + monotonic_cst : array-like of int of shape (n_features), default=None + Indicates the monotonicity constraint to enforce on each feature. + - 1: monotonic increase + - 0: no constraint + - -1: monotonic decrease + + If monotonic_cst is None, no constraints are applied. + + Monotonicity constraints are not supported for: + - multiclass classifications (i.e. when `n_classes > 2`), + - multioutput classifications (i.e. when `n_outputs_ > 1`), + - classifications trained on data with missing values. + + The constraints hold over the probability of the positive class. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.4 + + Attributes + ---------- + estimator_ : :class:`~sklearn.tree.DecisionTreeClassifier` + The child estimator template used to create the collection of fitted + sub-estimators. + + .. 
versionadded:: 1.2 + `base_estimator_` was renamed to `estimator_`. + + estimators_ : list of DecisionTreeClassifier + The collection of fitted sub-estimators. + + classes_ : ndarray of shape (n_classes,) or a list of such arrays + The classes labels (single output problem), or a list of arrays of + class labels (multi-output problem). + + n_classes_ : int or list + The number of classes (single output problem), or a list containing the + number of classes for each output (multi-output problem). + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_outputs_ : int + The number of outputs when ``fit`` is performed. + + feature_importances_ : ndarray of shape (n_features,) + The impurity-based feature importances. + The higher, the more important the feature. + The importance of a feature is computed as the (normalized) + total reduction of the criterion brought by that feature. It is also + known as the Gini importance. + + Warning: impurity-based feature importances can be misleading for + high cardinality features (many unique values). See + :func:`sklearn.inspection.permutation_importance` as an alternative. + + oob_score_ : float + Score of the training dataset obtained using an out-of-bag estimate. + This attribute exists only when ``oob_score`` is True. + + oob_decision_function_ : ndarray of shape (n_samples, n_classes) or \ + (n_samples, n_classes, n_outputs) + Decision function computed with out-of-bag estimate on the training + set. If n_estimators is small it might be possible that a data point + was never left out during the bootstrap. In this case, + `oob_decision_function_` might contain NaN. This attribute exists + only when ``oob_score`` is True. + + estimators_samples_ : list of arrays + The subset of drawn samples (i.e., the in-bag samples) for each base + estimator. Each subset is defined by an array of the indices selected. + + .. versionadded:: 1.4 + + See Also + -------- + sklearn.tree.DecisionTreeClassifier : A decision tree classifier. + sklearn.ensemble.ExtraTreesClassifier : Ensemble of extremely randomized + tree classifiers. + sklearn.ensemble.HistGradientBoostingClassifier : A Histogram-based Gradient + Boosting Classification Tree, very fast for big datasets (n_samples >= + 10_000). + + Notes + ----- + The default values for the parameters controlling the size of the trees + (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and + unpruned trees which can potentially be very large on some data sets. To + reduce memory consumption, the complexity and size of the trees should be + controlled by setting those parameter values. + + The features are always randomly permuted at each split. Therefore, + the best found split may vary, even with the same training data, + ``max_features=n_features`` and ``bootstrap=False``, if the improvement + of the criterion is identical for several splits enumerated during the + search of the best split. To obtain a deterministic behaviour during + fitting, ``random_state`` has to be fixed. + + References + ---------- + .. [1] L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32, 2001. 
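# Hedged illustration of the `monotonic_cst` option documented above: constrain
# the predicted probability of the positive class to be non-decreasing in the
# first feature and leave the others unconstrained.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=200, n_features=3, n_informative=3,
                           n_redundant=0, random_state=0)
clf = RandomForestClassifier(
    n_estimators=50, monotonic_cst=[1, 0, 0], random_state=0
).fit(X, y)
print(clf.predict_proba(X[:3])[:, 1])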
+ + Examples + -------- + >>> from sklearn.ensemble import RandomForestClassifier + >>> from sklearn.datasets import make_classification + >>> X, y = make_classification(n_samples=1000, n_features=4, + ... n_informative=2, n_redundant=0, + ... random_state=0, shuffle=False) + >>> clf = RandomForestClassifier(max_depth=2, random_state=0) + >>> clf.fit(X, y) + RandomForestClassifier(...) + >>> print(clf.predict([[0, 0, 0, 0]])) + [1] + """ + + _parameter_constraints: dict = { + **ForestClassifier._parameter_constraints, + **DecisionTreeClassifier._parameter_constraints, + "class_weight": [ + StrOptions({"balanced_subsample", "balanced"}), + dict, + list, + None, + ], + } + _parameter_constraints.pop("splitter") + + def __init__( + self, + n_estimators=100, + *, + criterion="gini", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="sqrt", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + bootstrap=True, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + class_weight=None, + ccp_alpha=0.0, + max_samples=None, + monotonic_cst=None, + ): + super().__init__( + estimator=DecisionTreeClassifier(), + n_estimators=n_estimators, + estimator_params=( + "criterion", + "max_depth", + "min_samples_split", + "min_samples_leaf", + "min_weight_fraction_leaf", + "max_features", + "max_leaf_nodes", + "min_impurity_decrease", + "random_state", + "ccp_alpha", + "monotonic_cst", + ), + bootstrap=bootstrap, + oob_score=oob_score, + n_jobs=n_jobs, + random_state=random_state, + verbose=verbose, + warm_start=warm_start, + class_weight=class_weight, + max_samples=max_samples, + ) + + self.criterion = criterion + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.min_weight_fraction_leaf = min_weight_fraction_leaf + self.max_features = max_features + self.max_leaf_nodes = max_leaf_nodes + self.min_impurity_decrease = min_impurity_decrease + self.monotonic_cst = monotonic_cst + self.ccp_alpha = ccp_alpha + + +class RandomForestRegressor(ForestRegressor): + """ + A random forest regressor. + + A random forest is a meta estimator that fits a number of decision tree + regressors on various sub-samples of the dataset and uses averaging to + improve the predictive accuracy and control over-fitting. + Trees in the forest use the best split strategy, i.e. equivalent to passing + `splitter="best"` to the underlying :class:`~sklearn.tree.DecisionTreeRegressor`. + The sub-sample size is controlled with the `max_samples` parameter if + `bootstrap=True` (default), otherwise the whole dataset is used to build + each tree. + + This estimator has native support for missing values (NaNs). During training, + the tree grower learns at each split point whether samples with missing values + should go to the left or right child, based on the potential gain. When predicting, + samples with missing values are assigned to the left or right child consequently. + If no missing values were encountered for a given feature during training, then + samples with missing values are mapped to whichever child has the most samples. + + For a comparison between tree-based ensemble models see the example + :ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_estimators : int, default=100 + The number of trees in the forest. + + .. 
versionchanged:: 0.22 + The default value of ``n_estimators`` changed from 10 to 100 + in 0.22. + + criterion : {"squared_error", "absolute_error", "friedman_mse", "poisson"}, \ + default="squared_error" + The function to measure the quality of a split. Supported criteria + are "squared_error" for the mean squared error, which is equal to + variance reduction as feature selection criterion and minimizes the L2 + loss using the mean of each terminal node, "friedman_mse", which uses + mean squared error with Friedman's improvement score for potential + splits, "absolute_error" for the mean absolute error, which minimizes + the L1 loss using the median of each terminal node, and "poisson" which + uses reduction in Poisson deviance to find splits. + Training using "absolute_error" is significantly slower + than when using "squared_error". + + .. versionadded:: 0.18 + Mean Absolute Error (MAE) criterion. + + .. versionadded:: 1.0 + Poisson criterion. + + max_depth : int, default=None + The maximum depth of the tree. If None, then nodes are expanded until + all leaves are pure or until all leaves contain less than + min_samples_split samples. + + min_samples_split : int or float, default=2 + The minimum number of samples required to split an internal node: + + - If int, then consider `min_samples_split` as the minimum number. + - If float, then `min_samples_split` is a fraction and + `ceil(min_samples_split * n_samples)` are the minimum + number of samples for each split. + + .. versionchanged:: 0.18 + Added float values for fractions. + + min_samples_leaf : int or float, default=1 + The minimum number of samples required to be at a leaf node. + A split point at any depth will only be considered if it leaves at + least ``min_samples_leaf`` training samples in each of the left and + right branches. This may have the effect of smoothing the model, + especially in regression. + + - If int, then consider `min_samples_leaf` as the minimum number. + - If float, then `min_samples_leaf` is a fraction and + `ceil(min_samples_leaf * n_samples)` are the minimum + number of samples for each node. + + .. versionchanged:: 0.18 + Added float values for fractions. + + min_weight_fraction_leaf : float, default=0.0 + The minimum weighted fraction of the sum total of weights (of all + the input samples) required to be at a leaf node. Samples have + equal weight when sample_weight is not provided. + + max_features : {"sqrt", "log2", None}, int or float, default=1.0 + The number of features to consider when looking for the best split: + + - If int, then consider `max_features` features at each split. + - If float, then `max_features` is a fraction and + `max(1, int(max_features * n_features_in_))` features are considered at each + split. + - If "sqrt", then `max_features=sqrt(n_features)`. + - If "log2", then `max_features=log2(n_features)`. + - If None or 1.0, then `max_features=n_features`. + + .. note:: + The default of 1.0 is equivalent to bagged trees and more + randomness can be achieved by setting smaller values, e.g. 0.3. + + .. versionchanged:: 1.1 + The default of `max_features` changed from `"auto"` to 1.0. + + Note: the search for a split does not stop until at least one + valid partition of the node samples is found, even if it requires to + effectively inspect more than ``max_features`` features. + + max_leaf_nodes : int, default=None + Grow trees with ``max_leaf_nodes`` in best-first fashion. + Best nodes are defined as relative reduction in impurity. + If None then unlimited number of leaf nodes. 
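# Hedged sketch of the criterion choices listed above: "poisson" is only valid for
# non-negative targets whose sum is strictly positive, as enforced in fit().
import numpy as np
from sklearn.ensemble import RandomForestRegressor

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 3))
y = rng.poisson(lam=3.0, size=200).astype(float)  # count targets, all >= 0

reg = RandomForestRegressor(n_estimators=30, criterion="poisson", random_state=0)
reg.fit(X, y)
print(reg.predict(X[:3]))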
+ + min_impurity_decrease : float, default=0.0 + A node will be split if this split induces a decrease of the impurity + greater than or equal to this value. + + The weighted impurity decrease equation is the following:: + + N_t / N * (impurity - N_t_R / N_t * right_impurity + - N_t_L / N_t * left_impurity) + + where ``N`` is the total number of samples, ``N_t`` is the number of + samples at the current node, ``N_t_L`` is the number of samples in the + left child, and ``N_t_R`` is the number of samples in the right child. + + ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, + if ``sample_weight`` is passed. + + .. versionadded:: 0.19 + + bootstrap : bool, default=True + Whether bootstrap samples are used when building trees. If False, the + whole dataset is used to build each tree. + + oob_score : bool or callable, default=False + Whether to use out-of-bag samples to estimate the generalization score. + By default, :func:`~sklearn.metrics.r2_score` is used. + Provide a callable with signature `metric(y_true, y_pred)` to use a + custom metric. Only available if `bootstrap=True`. + + For an illustration of out-of-bag (OOB) error estimation, see the example + :ref:`sphx_glr_auto_examples_ensemble_plot_ensemble_oob.py`. + + n_jobs : int, default=None + The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`, + :meth:`decision_path` and :meth:`apply` are all parallelized over the + trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` + context. ``-1`` means using all processors. See :term:`Glossary + ` for more details. + + random_state : int, RandomState instance or None, default=None + Controls both the randomness of the bootstrapping of the samples used + when building trees (if ``bootstrap=True``) and the sampling of the + features to consider when looking for the best split at each node + (if ``max_features < n_features``). + See :term:`Glossary ` for details. + + verbose : int, default=0 + Controls the verbosity when fitting and predicting. + + warm_start : bool, default=False + When set to ``True``, reuse the solution of the previous call to fit + and add more estimators to the ensemble, otherwise, just fit a whole + new forest. See :term:`Glossary ` and + :ref:`tree_ensemble_warm_start` for details. + + ccp_alpha : non-negative float, default=0.0 + Complexity parameter used for Minimal Cost-Complexity Pruning. The + subtree with the largest cost complexity that is smaller than + ``ccp_alpha`` will be chosen. By default, no pruning is performed. See + :ref:`minimal_cost_complexity_pruning` for details. See + :ref:`sphx_glr_auto_examples_tree_plot_cost_complexity_pruning.py` + for an example of such pruning. + + .. versionadded:: 0.22 + + max_samples : int or float, default=None + If bootstrap is True, the number of samples to draw from X + to train each base estimator. + + - If None (default), then draw `X.shape[0]` samples. + - If int, then draw `max_samples` samples. + - If float, then draw `max(round(n_samples * max_samples), 1)` samples. Thus, + `max_samples` should be in the interval `(0.0, 1.0]`. + + .. versionadded:: 0.22 + + monotonic_cst : array-like of int of shape (n_features), default=None + Indicates the monotonicity constraint to enforce on each feature. + - 1: monotonically increasing + - 0: no constraint + - -1: monotonically decreasing + + If monotonic_cst is None, no constraints are applied. + + Monotonicity constraints are not supported for: + - multioutput regressions (i.e. 
when `n_outputs_ > 1`), + - regressions trained on data with missing values. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.4 + + Attributes + ---------- + estimator_ : :class:`~sklearn.tree.DecisionTreeRegressor` + The child estimator template used to create the collection of fitted + sub-estimators. + + .. versionadded:: 1.2 + `base_estimator_` was renamed to `estimator_`. + + estimators_ : list of DecisionTreeRegressor + The collection of fitted sub-estimators. + + feature_importances_ : ndarray of shape (n_features,) + The impurity-based feature importances. + The higher, the more important the feature. + The importance of a feature is computed as the (normalized) + total reduction of the criterion brought by that feature. It is also + known as the Gini importance. + + Warning: impurity-based feature importances can be misleading for + high cardinality features (many unique values). See + :func:`sklearn.inspection.permutation_importance` as an alternative. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_outputs_ : int + The number of outputs when ``fit`` is performed. + + oob_score_ : float + Score of the training dataset obtained using an out-of-bag estimate. + This attribute exists only when ``oob_score`` is True. + + oob_prediction_ : ndarray of shape (n_samples,) or (n_samples, n_outputs) + Prediction computed with out-of-bag estimate on the training set. + This attribute exists only when ``oob_score`` is True. + + estimators_samples_ : list of arrays + The subset of drawn samples (i.e., the in-bag samples) for each base + estimator. Each subset is defined by an array of the indices selected. + + .. versionadded:: 1.4 + + See Also + -------- + sklearn.tree.DecisionTreeRegressor : A decision tree regressor. + sklearn.ensemble.ExtraTreesRegressor : Ensemble of extremely randomized + tree regressors. + sklearn.ensemble.HistGradientBoostingRegressor : A Histogram-based Gradient + Boosting Regression Tree, very fast for big datasets (n_samples >= + 10_000). + + Notes + ----- + The default values for the parameters controlling the size of the trees + (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and + unpruned trees which can potentially be very large on some data sets. To + reduce memory consumption, the complexity and size of the trees should be + controlled by setting those parameter values. + + The features are always randomly permuted at each split. Therefore, + the best found split may vary, even with the same training data, + ``max_features=n_features`` and ``bootstrap=False``, if the improvement + of the criterion is identical for several splits enumerated during the + search of the best split. To obtain a deterministic behaviour during + fitting, ``random_state`` has to be fixed. + + The default value ``max_features=1.0`` uses ``n_features`` + rather than ``n_features / 3``. The latter was originally suggested in + [1], whereas the former was more recently justified empirically in [2]. + + References + ---------- + .. [1] L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32, 2001. + + .. [2] P. Geurts, D. Ernst., and L. Wehenkel, "Extremely randomized + trees", Machine Learning, 63(1), 3-42, 2006. 
+ + Examples + -------- + >>> from sklearn.ensemble import RandomForestRegressor + >>> from sklearn.datasets import make_regression + >>> X, y = make_regression(n_features=4, n_informative=2, + ... random_state=0, shuffle=False) + >>> regr = RandomForestRegressor(max_depth=2, random_state=0) + >>> regr.fit(X, y) + RandomForestRegressor(...) + >>> print(regr.predict([[0, 0, 0, 0]])) + [-8.32987858] + """ + + _parameter_constraints: dict = { + **ForestRegressor._parameter_constraints, + **DecisionTreeRegressor._parameter_constraints, + } + _parameter_constraints.pop("splitter") + + def __init__( + self, + n_estimators=100, + *, + criterion="squared_error", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features=1.0, + max_leaf_nodes=None, + min_impurity_decrease=0.0, + bootstrap=True, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + ccp_alpha=0.0, + max_samples=None, + monotonic_cst=None, + ): + super().__init__( + estimator=DecisionTreeRegressor(), + n_estimators=n_estimators, + estimator_params=( + "criterion", + "max_depth", + "min_samples_split", + "min_samples_leaf", + "min_weight_fraction_leaf", + "max_features", + "max_leaf_nodes", + "min_impurity_decrease", + "random_state", + "ccp_alpha", + "monotonic_cst", + ), + bootstrap=bootstrap, + oob_score=oob_score, + n_jobs=n_jobs, + random_state=random_state, + verbose=verbose, + warm_start=warm_start, + max_samples=max_samples, + ) + + self.criterion = criterion + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.min_weight_fraction_leaf = min_weight_fraction_leaf + self.max_features = max_features + self.max_leaf_nodes = max_leaf_nodes + self.min_impurity_decrease = min_impurity_decrease + self.ccp_alpha = ccp_alpha + self.monotonic_cst = monotonic_cst + + +class ExtraTreesClassifier(ForestClassifier): + """ + An extra-trees classifier. + + This class implements a meta estimator that fits a number of + randomized decision trees (a.k.a. extra-trees) on various sub-samples + of the dataset and uses averaging to improve the predictive accuracy + and control over-fitting. + + This estimator has native support for missing values (NaNs) for + random splits. During training, a random threshold will be chosen + to split the non-missing values on. Then the non-missing values will be sent + to the left and right child based on the randomly selected threshold, while + the missing values will also be randomly sent to the left or right child. + This is repeated for every feature considered at each split. The best split + among these is chosen. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_estimators : int, default=100 + The number of trees in the forest. + + .. versionchanged:: 0.22 + The default value of ``n_estimators`` changed from 10 to 100 + in 0.22. + + criterion : {"gini", "entropy", "log_loss"}, default="gini" + The function to measure the quality of a split. Supported criteria are + "gini" for the Gini impurity and "log_loss" and "entropy" both for the + Shannon information gain, see :ref:`tree_mathematical_formulation`. + Note: This parameter is tree-specific. + + max_depth : int, default=None + The maximum depth of the tree. If None, then nodes are expanded until + all leaves are pure or until all leaves contain less than + min_samples_split samples. 
+ + min_samples_split : int or float, default=2 + The minimum number of samples required to split an internal node: + + - If int, then consider `min_samples_split` as the minimum number. + - If float, then `min_samples_split` is a fraction and + `ceil(min_samples_split * n_samples)` are the minimum + number of samples for each split. + + .. versionchanged:: 0.18 + Added float values for fractions. + + min_samples_leaf : int or float, default=1 + The minimum number of samples required to be at a leaf node. + A split point at any depth will only be considered if it leaves at + least ``min_samples_leaf`` training samples in each of the left and + right branches. This may have the effect of smoothing the model, + especially in regression. + + - If int, then consider `min_samples_leaf` as the minimum number. + - If float, then `min_samples_leaf` is a fraction and + `ceil(min_samples_leaf * n_samples)` are the minimum + number of samples for each node. + + .. versionchanged:: 0.18 + Added float values for fractions. + + min_weight_fraction_leaf : float, default=0.0 + The minimum weighted fraction of the sum total of weights (of all + the input samples) required to be at a leaf node. Samples have + equal weight when sample_weight is not provided. + + max_features : {"sqrt", "log2", None}, int or float, default="sqrt" + The number of features to consider when looking for the best split: + + - If int, then consider `max_features` features at each split. + - If float, then `max_features` is a fraction and + `max(1, int(max_features * n_features_in_))` features are considered at each + split. + - If "sqrt", then `max_features=sqrt(n_features)`. + - If "log2", then `max_features=log2(n_features)`. + - If None, then `max_features=n_features`. + + .. versionchanged:: 1.1 + The default of `max_features` changed from `"auto"` to `"sqrt"`. + + Note: the search for a split does not stop until at least one + valid partition of the node samples is found, even if it requires to + effectively inspect more than ``max_features`` features. + + max_leaf_nodes : int, default=None + Grow trees with ``max_leaf_nodes`` in best-first fashion. + Best nodes are defined as relative reduction in impurity. + If None then unlimited number of leaf nodes. + + min_impurity_decrease : float, default=0.0 + A node will be split if this split induces a decrease of the impurity + greater than or equal to this value. + + The weighted impurity decrease equation is the following:: + + N_t / N * (impurity - N_t_R / N_t * right_impurity + - N_t_L / N_t * left_impurity) + + where ``N`` is the total number of samples, ``N_t`` is the number of + samples at the current node, ``N_t_L`` is the number of samples in the + left child, and ``N_t_R`` is the number of samples in the right child. + + ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, + if ``sample_weight`` is passed. + + .. versionadded:: 0.19 + + bootstrap : bool, default=False + Whether bootstrap samples are used when building trees. If False, the + whole dataset is used to build each tree. + + oob_score : bool or callable, default=False + Whether to use out-of-bag samples to estimate the generalization score. + By default, :func:`~sklearn.metrics.accuracy_score` is used. + Provide a callable with signature `metric(y_true, y_pred)` to use a + custom metric. Only available if `bootstrap=True`. + + For an illustration of out-of-bag (OOB) error estimation, see the example + :ref:`sphx_glr_auto_examples_ensemble_plot_ensemble_oob.py`. 
+ + n_jobs : int, default=None + The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`, + :meth:`decision_path` and :meth:`apply` are all parallelized over the + trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` + context. ``-1`` means using all processors. See :term:`Glossary + ` for more details. + + random_state : int, RandomState instance or None, default=None + Controls 3 sources of randomness: + + - the bootstrapping of the samples used when building trees + (if ``bootstrap=True``) + - the sampling of the features to consider when looking for the best + split at each node (if ``max_features < n_features``) + - the draw of the splits for each of the `max_features` + + See :term:`Glossary ` for details. + + verbose : int, default=0 + Controls the verbosity when fitting and predicting. + + warm_start : bool, default=False + When set to ``True``, reuse the solution of the previous call to fit + and add more estimators to the ensemble, otherwise, just fit a whole + new forest. See :term:`Glossary ` and + :ref:`tree_ensemble_warm_start` for details. + + class_weight : {"balanced", "balanced_subsample"}, dict or list of dicts, \ + default=None + Weights associated with classes in the form ``{class_label: weight}``. + If not given, all classes are supposed to have weight one. For + multi-output problems, a list of dicts can be provided in the same + order as the columns of y. + + Note that for multioutput (including multilabel) weights should be + defined for each class of every column in its own dict. For example, + for four-class multilabel classification weights should be + [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of + [{1:1}, {2:5}, {3:1}, {4:1}]. + + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))`` + + The "balanced_subsample" mode is the same as "balanced" except that + weights are computed based on the bootstrap sample for every tree + grown. + + For multi-output, the weights of each column of y will be multiplied. + + Note that these weights will be multiplied with sample_weight (passed + through the fit method) if sample_weight is specified. + + ccp_alpha : non-negative float, default=0.0 + Complexity parameter used for Minimal Cost-Complexity Pruning. The + subtree with the largest cost complexity that is smaller than + ``ccp_alpha`` will be chosen. By default, no pruning is performed. See + :ref:`minimal_cost_complexity_pruning` for details. See + :ref:`sphx_glr_auto_examples_tree_plot_cost_complexity_pruning.py` + for an example of such pruning. + + .. versionadded:: 0.22 + + max_samples : int or float, default=None + If bootstrap is True, the number of samples to draw from X + to train each base estimator. + + - If None (default), then draw `X.shape[0]` samples. + - If int, then draw `max_samples` samples. + - If float, then draw `max_samples * X.shape[0]` samples. Thus, + `max_samples` should be in the interval `(0.0, 1.0]`. + + .. versionadded:: 0.22 + + monotonic_cst : array-like of int of shape (n_features), default=None + Indicates the monotonicity constraint to enforce on each feature. + - 1: monotonically increasing + - 0: no constraint + - -1: monotonically decreasing + + If monotonic_cst is None, no constraints are applied. + + Monotonicity constraints are not supported for: + - multiclass classifications (i.e. when `n_classes > 2`), + - multioutput classifications (i.e. 
when `n_outputs_ > 1`), + - classifications trained on data with missing values. + + The constraints hold over the probability of the positive class. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.4 + + Attributes + ---------- + estimator_ : :class:`~sklearn.tree.ExtraTreeClassifier` + The child estimator template used to create the collection of fitted + sub-estimators. + + .. versionadded:: 1.2 + `base_estimator_` was renamed to `estimator_`. + + estimators_ : list of DecisionTreeClassifier + The collection of fitted sub-estimators. + + classes_ : ndarray of shape (n_classes,) or a list of such arrays + The classes labels (single output problem), or a list of arrays of + class labels (multi-output problem). + + n_classes_ : int or list + The number of classes (single output problem), or a list containing the + number of classes for each output (multi-output problem). + + feature_importances_ : ndarray of shape (n_features,) + The impurity-based feature importances. + The higher, the more important the feature. + The importance of a feature is computed as the (normalized) + total reduction of the criterion brought by that feature. It is also + known as the Gini importance. + + Warning: impurity-based feature importances can be misleading for + high cardinality features (many unique values). See + :func:`sklearn.inspection.permutation_importance` as an alternative. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_outputs_ : int + The number of outputs when ``fit`` is performed. + + oob_score_ : float + Score of the training dataset obtained using an out-of-bag estimate. + This attribute exists only when ``oob_score`` is True. + + oob_decision_function_ : ndarray of shape (n_samples, n_classes) or \ + (n_samples, n_classes, n_outputs) + Decision function computed with out-of-bag estimate on the training + set. If n_estimators is small it might be possible that a data point + was never left out during the bootstrap. In this case, + `oob_decision_function_` might contain NaN. This attribute exists + only when ``oob_score`` is True. + + estimators_samples_ : list of arrays + The subset of drawn samples (i.e., the in-bag samples) for each base + estimator. Each subset is defined by an array of the indices selected. + + .. versionadded:: 1.4 + + See Also + -------- + ExtraTreesRegressor : An extra-trees regressor with random splits. + RandomForestClassifier : A random forest classifier with optimal splits. + RandomForestRegressor : Ensemble regressor using trees with optimal splits. + + Notes + ----- + The default values for the parameters controlling the size of the trees + (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and + unpruned trees which can potentially be very large on some data sets. To + reduce memory consumption, the complexity and size of the trees should be + controlled by setting those parameter values. + + References + ---------- + .. [1] P. Geurts, D. Ernst., and L. Wehenkel, "Extremely randomized + trees", Machine Learning, 63(1), 3-42, 2006. 
+ + Examples + -------- + >>> from sklearn.ensemble import ExtraTreesClassifier + >>> from sklearn.datasets import make_classification + >>> X, y = make_classification(n_features=4, random_state=0) + >>> clf = ExtraTreesClassifier(n_estimators=100, random_state=0) + >>> clf.fit(X, y) + ExtraTreesClassifier(random_state=0) + >>> clf.predict([[0, 0, 0, 0]]) + array([1]) + """ + + _parameter_constraints: dict = { + **ForestClassifier._parameter_constraints, + **DecisionTreeClassifier._parameter_constraints, + "class_weight": [ + StrOptions({"balanced_subsample", "balanced"}), + dict, + list, + None, + ], + } + _parameter_constraints.pop("splitter") + + def __init__( + self, + n_estimators=100, + *, + criterion="gini", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="sqrt", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + bootstrap=False, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + class_weight=None, + ccp_alpha=0.0, + max_samples=None, + monotonic_cst=None, + ): + super().__init__( + estimator=ExtraTreeClassifier(), + n_estimators=n_estimators, + estimator_params=( + "criterion", + "max_depth", + "min_samples_split", + "min_samples_leaf", + "min_weight_fraction_leaf", + "max_features", + "max_leaf_nodes", + "min_impurity_decrease", + "random_state", + "ccp_alpha", + "monotonic_cst", + ), + bootstrap=bootstrap, + oob_score=oob_score, + n_jobs=n_jobs, + random_state=random_state, + verbose=verbose, + warm_start=warm_start, + class_weight=class_weight, + max_samples=max_samples, + ) + + self.criterion = criterion + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.min_weight_fraction_leaf = min_weight_fraction_leaf + self.max_features = max_features + self.max_leaf_nodes = max_leaf_nodes + self.min_impurity_decrease = min_impurity_decrease + self.ccp_alpha = ccp_alpha + self.monotonic_cst = monotonic_cst + + +class ExtraTreesRegressor(ForestRegressor): + """ + An extra-trees regressor. + + This class implements a meta estimator that fits a number of + randomized decision trees (a.k.a. extra-trees) on various sub-samples + of the dataset and uses averaging to improve the predictive accuracy + and control over-fitting. + + This estimator has native support for missing values (NaNs) for + random splits. During training, a random threshold will be chosen + to split the non-missing values on. Then the non-missing values will be sent + to the left and right child based on the randomly selected threshold, while + the missing values will also be randomly sent to the left or right child. + This is repeated for every feature considered at each split. The best split + among these is chosen. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_estimators : int, default=100 + The number of trees in the forest. + + .. versionchanged:: 0.22 + The default value of ``n_estimators`` changed from 10 to 100 + in 0.22. + + criterion : {"squared_error", "absolute_error", "friedman_mse", "poisson"}, \ + default="squared_error" + The function to measure the quality of a split. 
Supported criteria + are "squared_error" for the mean squared error, which is equal to + variance reduction as feature selection criterion and minimizes the L2 + loss using the mean of each terminal node, "friedman_mse", which uses + mean squared error with Friedman's improvement score for potential + splits, "absolute_error" for the mean absolute error, which minimizes + the L1 loss using the median of each terminal node, and "poisson" which + uses reduction in Poisson deviance to find splits. + Training using "absolute_error" is significantly slower + than when using "squared_error". + + .. versionadded:: 0.18 + Mean Absolute Error (MAE) criterion. + + max_depth : int, default=None + The maximum depth of the tree. If None, then nodes are expanded until + all leaves are pure or until all leaves contain less than + min_samples_split samples. + + min_samples_split : int or float, default=2 + The minimum number of samples required to split an internal node: + + - If int, then consider `min_samples_split` as the minimum number. + - If float, then `min_samples_split` is a fraction and + `ceil(min_samples_split * n_samples)` are the minimum + number of samples for each split. + + .. versionchanged:: 0.18 + Added float values for fractions. + + min_samples_leaf : int or float, default=1 + The minimum number of samples required to be at a leaf node. + A split point at any depth will only be considered if it leaves at + least ``min_samples_leaf`` training samples in each of the left and + right branches. This may have the effect of smoothing the model, + especially in regression. + + - If int, then consider `min_samples_leaf` as the minimum number. + - If float, then `min_samples_leaf` is a fraction and + `ceil(min_samples_leaf * n_samples)` are the minimum + number of samples for each node. + + .. versionchanged:: 0.18 + Added float values for fractions. + + min_weight_fraction_leaf : float, default=0.0 + The minimum weighted fraction of the sum total of weights (of all + the input samples) required to be at a leaf node. Samples have + equal weight when sample_weight is not provided. + + max_features : {"sqrt", "log2", None}, int or float, default=1.0 + The number of features to consider when looking for the best split: + + - If int, then consider `max_features` features at each split. + - If float, then `max_features` is a fraction and + `max(1, int(max_features * n_features_in_))` features are considered at each + split. + - If "sqrt", then `max_features=sqrt(n_features)`. + - If "log2", then `max_features=log2(n_features)`. + - If None or 1.0, then `max_features=n_features`. + + .. note:: + The default of 1.0 is equivalent to bagged trees and more + randomness can be achieved by setting smaller values, e.g. 0.3. + + .. versionchanged:: 1.1 + The default of `max_features` changed from `"auto"` to 1.0. + + Note: the search for a split does not stop until at least one + valid partition of the node samples is found, even if it requires to + effectively inspect more than ``max_features`` features. + + max_leaf_nodes : int, default=None + Grow trees with ``max_leaf_nodes`` in best-first fashion. + Best nodes are defined as relative reduction in impurity. + If None then unlimited number of leaf nodes. + + min_impurity_decrease : float, default=0.0 + A node will be split if this split induces a decrease of the impurity + greater than or equal to this value. 
+ + The weighted impurity decrease equation is the following:: + + N_t / N * (impurity - N_t_R / N_t * right_impurity + - N_t_L / N_t * left_impurity) + + where ``N`` is the total number of samples, ``N_t`` is the number of + samples at the current node, ``N_t_L`` is the number of samples in the + left child, and ``N_t_R`` is the number of samples in the right child. + + ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, + if ``sample_weight`` is passed. + + .. versionadded:: 0.19 + + bootstrap : bool, default=False + Whether bootstrap samples are used when building trees. If False, the + whole dataset is used to build each tree. + + oob_score : bool or callable, default=False + Whether to use out-of-bag samples to estimate the generalization score. + By default, :func:`~sklearn.metrics.r2_score` is used. + Provide a callable with signature `metric(y_true, y_pred)` to use a + custom metric. Only available if `bootstrap=True`. + + For an illustration of out-of-bag (OOB) error estimation, see the example + :ref:`sphx_glr_auto_examples_ensemble_plot_ensemble_oob.py`. + + n_jobs : int, default=None + The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`, + :meth:`decision_path` and :meth:`apply` are all parallelized over the + trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` + context. ``-1`` means using all processors. See :term:`Glossary + ` for more details. + + random_state : int, RandomState instance or None, default=None + Controls 3 sources of randomness: + + - the bootstrapping of the samples used when building trees + (if ``bootstrap=True``) + - the sampling of the features to consider when looking for the best + split at each node (if ``max_features < n_features``) + - the draw of the splits for each of the `max_features` + + See :term:`Glossary ` for details. + + verbose : int, default=0 + Controls the verbosity when fitting and predicting. + + warm_start : bool, default=False + When set to ``True``, reuse the solution of the previous call to fit + and add more estimators to the ensemble, otherwise, just fit a whole + new forest. See :term:`Glossary ` and + :ref:`tree_ensemble_warm_start` for details. + + ccp_alpha : non-negative float, default=0.0 + Complexity parameter used for Minimal Cost-Complexity Pruning. The + subtree with the largest cost complexity that is smaller than + ``ccp_alpha`` will be chosen. By default, no pruning is performed. See + :ref:`minimal_cost_complexity_pruning` for details. See + :ref:`sphx_glr_auto_examples_tree_plot_cost_complexity_pruning.py` + for an example of such pruning. + + .. versionadded:: 0.22 + + max_samples : int or float, default=None + If bootstrap is True, the number of samples to draw from X + to train each base estimator. + + - If None (default), then draw `X.shape[0]` samples. + - If int, then draw `max_samples` samples. + - If float, then draw `max_samples * X.shape[0]` samples. Thus, + `max_samples` should be in the interval `(0.0, 1.0]`. + + .. versionadded:: 0.22 + + monotonic_cst : array-like of int of shape (n_features), default=None + Indicates the monotonicity constraint to enforce on each feature. + - 1: monotonically increasing + - 0: no constraint + - -1: monotonically decreasing + + If monotonic_cst is None, no constraints are applied. + + Monotonicity constraints are not supported for: + - multioutput regressions (i.e. when `n_outputs_ > 1`), + - regressions trained on data with missing values. + + Read more in the :ref:`User Guide `. + + .. 
versionadded:: 1.4 + + Attributes + ---------- + estimator_ : :class:`~sklearn.tree.ExtraTreeRegressor` + The child estimator template used to create the collection of fitted + sub-estimators. + + .. versionadded:: 1.2 + `base_estimator_` was renamed to `estimator_`. + + estimators_ : list of DecisionTreeRegressor + The collection of fitted sub-estimators. + + feature_importances_ : ndarray of shape (n_features,) + The impurity-based feature importances. + The higher, the more important the feature. + The importance of a feature is computed as the (normalized) + total reduction of the criterion brought by that feature. It is also + known as the Gini importance. + + Warning: impurity-based feature importances can be misleading for + high cardinality features (many unique values). See + :func:`sklearn.inspection.permutation_importance` as an alternative. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_outputs_ : int + The number of outputs. + + oob_score_ : float + Score of the training dataset obtained using an out-of-bag estimate. + This attribute exists only when ``oob_score`` is True. + + oob_prediction_ : ndarray of shape (n_samples,) or (n_samples, n_outputs) + Prediction computed with out-of-bag estimate on the training set. + This attribute exists only when ``oob_score`` is True. + + estimators_samples_ : list of arrays + The subset of drawn samples (i.e., the in-bag samples) for each base + estimator. Each subset is defined by an array of the indices selected. + + .. versionadded:: 1.4 + + See Also + -------- + ExtraTreesClassifier : An extra-trees classifier with random splits. + RandomForestClassifier : A random forest classifier with optimal splits. + RandomForestRegressor : Ensemble regressor using trees with optimal splits. + + Notes + ----- + The default values for the parameters controlling the size of the trees + (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and + unpruned trees which can potentially be very large on some data sets. To + reduce memory consumption, the complexity and size of the trees should be + controlled by setting those parameter values. + + References + ---------- + .. [1] P. Geurts, D. Ernst., and L. Wehenkel, "Extremely randomized trees", + Machine Learning, 63(1), 3-42, 2006. + + Examples + -------- + >>> from sklearn.datasets import load_diabetes + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.ensemble import ExtraTreesRegressor + >>> X, y = load_diabetes(return_X_y=True) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, random_state=0) + >>> reg = ExtraTreesRegressor(n_estimators=100, random_state=0).fit( + ... X_train, y_train) + >>> reg.score(X_test, y_test) + 0.2727... 
+ """ + + _parameter_constraints: dict = { + **ForestRegressor._parameter_constraints, + **DecisionTreeRegressor._parameter_constraints, + } + _parameter_constraints.pop("splitter") + + def __init__( + self, + n_estimators=100, + *, + criterion="squared_error", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features=1.0, + max_leaf_nodes=None, + min_impurity_decrease=0.0, + bootstrap=False, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + ccp_alpha=0.0, + max_samples=None, + monotonic_cst=None, + ): + super().__init__( + estimator=ExtraTreeRegressor(), + n_estimators=n_estimators, + estimator_params=( + "criterion", + "max_depth", + "min_samples_split", + "min_samples_leaf", + "min_weight_fraction_leaf", + "max_features", + "max_leaf_nodes", + "min_impurity_decrease", + "random_state", + "ccp_alpha", + "monotonic_cst", + ), + bootstrap=bootstrap, + oob_score=oob_score, + n_jobs=n_jobs, + random_state=random_state, + verbose=verbose, + warm_start=warm_start, + max_samples=max_samples, + ) + + self.criterion = criterion + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.min_weight_fraction_leaf = min_weight_fraction_leaf + self.max_features = max_features + self.max_leaf_nodes = max_leaf_nodes + self.min_impurity_decrease = min_impurity_decrease + self.ccp_alpha = ccp_alpha + self.monotonic_cst = monotonic_cst + + +class RandomTreesEmbedding(TransformerMixin, BaseForest): + """ + An ensemble of totally random trees. + + An unsupervised transformation of a dataset to a high-dimensional + sparse representation. A datapoint is coded according to which leaf of + each tree it is sorted into. Using a one-hot encoding of the leaves, + this leads to a binary coding with as many ones as there are trees in + the forest. + + The dimensionality of the resulting representation is + ``n_out <= n_estimators * max_leaf_nodes``. If ``max_leaf_nodes == None``, + the number of leaf nodes is at most ``n_estimators * 2 ** max_depth``. + + For an example of applying Random Trees Embedding to non-linear + classification, see + :ref:`sphx_glr_auto_examples_ensemble_plot_random_forest_embedding.py`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_estimators : int, default=100 + Number of trees in the forest. + + .. versionchanged:: 0.22 + The default value of ``n_estimators`` changed from 10 to 100 + in 0.22. + + max_depth : int, default=5 + The maximum depth of each tree. If None, then nodes are expanded until + all leaves are pure or until all leaves contain less than + min_samples_split samples. + + min_samples_split : int or float, default=2 + The minimum number of samples required to split an internal node: + + - If int, then consider `min_samples_split` as the minimum number. + - If float, then `min_samples_split` is a fraction and + `ceil(min_samples_split * n_samples)` is the minimum + number of samples for each split. + + .. versionchanged:: 0.18 + Added float values for fractions. + + min_samples_leaf : int or float, default=1 + The minimum number of samples required to be at a leaf node. + A split point at any depth will only be considered if it leaves at + least ``min_samples_leaf`` training samples in each of the left and + right branches. This may have the effect of smoothing the model, + especially in regression. + + - If int, then consider `min_samples_leaf` as the minimum number. 
+ - If float, then `min_samples_leaf` is a fraction and + `ceil(min_samples_leaf * n_samples)` is the minimum + number of samples for each node. + + .. versionchanged:: 0.18 + Added float values for fractions. + + min_weight_fraction_leaf : float, default=0.0 + The minimum weighted fraction of the sum total of weights (of all + the input samples) required to be at a leaf node. Samples have + equal weight when sample_weight is not provided. + + max_leaf_nodes : int, default=None + Grow trees with ``max_leaf_nodes`` in best-first fashion. + Best nodes are defined as relative reduction in impurity. + If None then unlimited number of leaf nodes. + + min_impurity_decrease : float, default=0.0 + A node will be split if this split induces a decrease of the impurity + greater than or equal to this value. + + The weighted impurity decrease equation is the following:: + + N_t / N * (impurity - N_t_R / N_t * right_impurity + - N_t_L / N_t * left_impurity) + + where ``N`` is the total number of samples, ``N_t`` is the number of + samples at the current node, ``N_t_L`` is the number of samples in the + left child, and ``N_t_R`` is the number of samples in the right child. + + ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, + if ``sample_weight`` is passed. + + .. versionadded:: 0.19 + + sparse_output : bool, default=True + Whether or not to return a sparse CSR matrix, as default behavior, + or to return a dense array compatible with dense pipeline operators. + + n_jobs : int, default=None + The number of jobs to run in parallel. :meth:`fit`, :meth:`transform`, + :meth:`decision_path` and :meth:`apply` are all parallelized over the + trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` + context. ``-1`` means using all processors. See :term:`Glossary + ` for more details. + + random_state : int, RandomState instance or None, default=None + Controls the generation of the random `y` used to fit the trees + and the draw of the splits for each feature at the trees' nodes. + See :term:`Glossary ` for details. + + verbose : int, default=0 + Controls the verbosity when fitting and predicting. + + warm_start : bool, default=False + When set to ``True``, reuse the solution of the previous call to fit + and add more estimators to the ensemble, otherwise, just fit a whole + new forest. See :term:`Glossary ` and + :ref:`tree_ensemble_warm_start` for details. + + Attributes + ---------- + estimator_ : :class:`~sklearn.tree.ExtraTreeRegressor` instance + The child estimator template used to create the collection of fitted + sub-estimators. + + .. versionadded:: 1.2 + `base_estimator_` was renamed to `estimator_`. + + estimators_ : list of :class:`~sklearn.tree.ExtraTreeRegressor` instances + The collection of fitted sub-estimators. + + feature_importances_ : ndarray of shape (n_features,) + The feature importances (the higher, the more important the feature). + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_outputs_ : int + The number of outputs when ``fit`` is performed. + + one_hot_encoder_ : OneHotEncoder instance + One-hot encoder used to create the sparse embedding. + + estimators_samples_ : list of arrays + The subset of drawn samples (i.e., the in-bag samples) for each base + estimator. 
Each subset is defined by an array of the indices selected. + + .. versionadded:: 1.4 + + See Also + -------- + ExtraTreesClassifier : An extra-trees classifier. + ExtraTreesRegressor : An extra-trees regressor. + RandomForestClassifier : A random forest classifier. + RandomForestRegressor : A random forest regressor. + sklearn.tree.ExtraTreeClassifier: An extremely randomized + tree classifier. + sklearn.tree.ExtraTreeRegressor : An extremely randomized + tree regressor. + + References + ---------- + .. [1] P. Geurts, D. Ernst., and L. Wehenkel, "Extremely randomized trees", + Machine Learning, 63(1), 3-42, 2006. + .. [2] Moosmann, F. and Triggs, B. and Jurie, F. "Fast discriminative + visual codebooks using randomized clustering forests" + NIPS 2007 + + Examples + -------- + >>> from sklearn.ensemble import RandomTreesEmbedding + >>> X = [[0,0], [1,0], [0,1], [-1,0], [0,-1]] + >>> random_trees = RandomTreesEmbedding( + ... n_estimators=5, random_state=0, max_depth=1).fit(X) + >>> X_sparse_embedding = random_trees.transform(X) + >>> X_sparse_embedding.toarray() + array([[0., 1., 1., 0., 1., 0., 0., 1., 1., 0.], + [0., 1., 1., 0., 1., 0., 0., 1., 1., 0.], + [0., 1., 0., 1., 0., 1., 0., 1., 0., 1.], + [1., 0., 1., 0., 1., 0., 1., 0., 1., 0.], + [0., 1., 1., 0., 1., 0., 0., 1., 1., 0.]]) + """ + + _parameter_constraints: dict = { + "n_estimators": [Interval(Integral, 1, None, closed="left")], + "n_jobs": [Integral, None], + "verbose": ["verbose"], + "warm_start": ["boolean"], + **BaseDecisionTree._parameter_constraints, + "sparse_output": ["boolean"], + } + for param in ("max_features", "ccp_alpha", "splitter", "monotonic_cst"): + _parameter_constraints.pop(param) + + criterion = "squared_error" + max_features = 1 + + def __init__( + self, + n_estimators=100, + *, + max_depth=5, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_leaf_nodes=None, + min_impurity_decrease=0.0, + sparse_output=True, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + ): + super().__init__( + estimator=ExtraTreeRegressor(), + n_estimators=n_estimators, + estimator_params=( + "criterion", + "max_depth", + "min_samples_split", + "min_samples_leaf", + "min_weight_fraction_leaf", + "max_features", + "max_leaf_nodes", + "min_impurity_decrease", + "random_state", + ), + bootstrap=False, + oob_score=False, + n_jobs=n_jobs, + random_state=random_state, + verbose=verbose, + warm_start=warm_start, + max_samples=None, + ) + + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.min_weight_fraction_leaf = min_weight_fraction_leaf + self.max_leaf_nodes = max_leaf_nodes + self.min_impurity_decrease = min_impurity_decrease + self.sparse_output = sparse_output + + def _set_oob_score_and_attributes(self, X, y, scoring_function=None): + raise NotImplementedError("OOB score not supported by tree embedding") + + def fit(self, X, y=None, sample_weight=None): + """ + Fit estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Use ``dtype=np.float32`` for maximum + efficiency. Sparse matrices are also supported, use sparse + ``csc_matrix`` for maximum efficiency. + + y : Ignored + Not used, present for API consistency by convention. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. 
Splits + that would create child nodes with net zero or negative weight are + ignored while searching for a split in each node. In the case of + classification, splits are also ignored if they would result in any + single class carrying a negative weight in either child node. + + Returns + ------- + self : object + Returns the instance itself. + """ + # Parameters are validated in fit_transform + self.fit_transform(X, y, sample_weight=sample_weight) + return self + + @_fit_context(prefer_skip_nested_validation=True) + def fit_transform(self, X, y=None, sample_weight=None): + """ + Fit estimator and transform dataset. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data used to build forests. Use ``dtype=np.float32`` for + maximum efficiency. + + y : Ignored + Not used, present for API consistency by convention. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. Splits + that would create child nodes with net zero or negative weight are + ignored while searching for a split in each node. In the case of + classification, splits are also ignored if they would result in any + single class carrying a negative weight in either child node. + + Returns + ------- + X_transformed : sparse matrix of shape (n_samples, n_out) + Transformed dataset. + """ + rnd = check_random_state(self.random_state) + y = rnd.uniform(size=_num_samples(X)) + super().fit(X, y, sample_weight=sample_weight) + + self.one_hot_encoder_ = OneHotEncoder(sparse_output=self.sparse_output) + output = self.one_hot_encoder_.fit_transform(self.apply(X)) + self._n_features_out = output.shape[1] + return output + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Only used to validate feature names with the names seen in :meth:`fit`. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names, in the format of + `randomtreesembedding_{tree}_{leaf}`, where `tree` is the tree used + to generate the leaf and `leaf` is the index of a leaf node + in that tree. Note that the node indexing scheme is used to + index both nodes with children (split nodes) and leaf nodes. + Only the latter can be present as output features. + As a consequence, there are missing indices in the output + feature names. + """ + check_is_fitted(self, "_n_features_out") + _check_feature_names_in( + self, input_features=input_features, generate_names=False + ) + + feature_names = [ + f"randomtreesembedding_{tree}_{leaf}" + for tree in range(self.n_estimators) + for leaf in self.one_hot_encoder_.categories_[tree] + ] + return np.asarray(feature_names, dtype=object) + + def transform(self, X): + """ + Transform dataset. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data to be transformed. Use ``dtype=np.float32`` for maximum + efficiency. Sparse matrices are also supported, use sparse + ``csr_matrix`` for maximum efficiency. + + Returns + ------- + X_transformed : sparse matrix of shape (n_samples, n_out) + Transformed dataset. 
+        """
+        check_is_fitted(self)
+        return self.one_hot_encoder_.transform(self.apply(X))
+
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        tags.input_tags.sparse = True
+        return tags
diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_gb.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_gb.py
new file mode 100644
index 0000000000000000000000000000000000000000..55c8e79e062dfd41be23162d9bdb90afc71b4381
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_gb.py
@@ -0,0 +1,2196 @@
+"""Gradient Boosted Regression Trees.
+
+This module contains methods for fitting gradient boosted regression trees for
+both classification and regression.
+
+The module structure is the following:
+
+- The ``BaseGradientBoosting`` base class implements a common ``fit`` method
+  for all the estimators in the module. Regression and classification
+  only differ in the concrete ``LossFunction`` used.
+
+- ``GradientBoostingClassifier`` implements gradient boosting for
+  classification problems.
+
+- ``GradientBoostingRegressor`` implements gradient boosting for
+  regression problems.
+"""
+
+# Authors: The scikit-learn developers
+# SPDX-License-Identifier: BSD-3-Clause
+
+import math
+import warnings
+from abc import ABCMeta, abstractmethod
+from numbers import Integral, Real
+from time import time
+
+import numpy as np
+from scipy.sparse import csc_matrix, csr_matrix, issparse
+
+from .._loss.loss import (
+    _LOSSES,
+    AbsoluteError,
+    ExponentialLoss,
+    HalfBinomialLoss,
+    HalfMultinomialLoss,
+    HalfSquaredError,
+    HuberLoss,
+    PinballLoss,
+)
+from ..base import ClassifierMixin, RegressorMixin, _fit_context, is_classifier
+from ..dummy import DummyClassifier, DummyRegressor
+from ..exceptions import NotFittedError
+from ..model_selection import train_test_split
+from ..preprocessing import LabelEncoder
+from ..tree import DecisionTreeRegressor
+from ..tree._tree import DOUBLE, DTYPE, TREE_LEAF
+from ..utils import check_array, check_random_state, column_or_1d
+from ..utils._param_validation import HasMethods, Interval, StrOptions
+from ..utils.multiclass import check_classification_targets
+from ..utils.stats import _weighted_percentile
+from ..utils.validation import _check_sample_weight, check_is_fitted, validate_data
+from ._base import BaseEnsemble
+from ._gradient_boosting import _random_sample_mask, predict_stage, predict_stages
+
+_LOSSES = _LOSSES.copy()
+_LOSSES.update(
+    {
+        "quantile": PinballLoss,
+        "huber": HuberLoss,
+    }
+)
+
+
+def _safe_divide(numerator, denominator):
+    """Prevents overflow and division by zero."""
+    # This is used for classifiers where the denominator might become zero exactly.
+    # For instance for log loss, HalfBinomialLoss, if proba=0 or proba=1 exactly, then
+    # denominator = hessian = 0, and we should set the node value in the line search to
+    # zero as there is no improvement of the loss possible.
+    # For numerical safety, we do this already for extremely tiny values.
+    if abs(denominator) < 1e-150:
+        return 0.0
+    else:
+        # Cast to Python float to trigger a ZeroDivisionError without relying
+        # on `np.errstate` that is not supported by Pyodide.
+ result = float(numerator) / float(denominator) + if math.isinf(result): + warnings.warn("overflow encountered in _safe_divide", RuntimeWarning) + return result + + +def _init_raw_predictions(X, estimator, loss, use_predict_proba): + """Return the initial raw predictions. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + The data array. + estimator : object + The estimator to use to compute the predictions. + loss : BaseLoss + An instance of a loss function class. + use_predict_proba : bool + Whether estimator.predict_proba is used instead of estimator.predict. + + Returns + ------- + raw_predictions : ndarray of shape (n_samples, K) + The initial raw predictions. K is equal to 1 for binary + classification and regression, and equal to the number of classes + for multiclass classification. ``raw_predictions`` is casted + into float64. + """ + # TODO: Use loss.fit_intercept_only where appropriate instead of + # DummyRegressor which is the default given by the `init` parameter, + # see also _init_state. + if use_predict_proba: + # Our parameter validation, set via _fit_context and _parameter_constraints + # already guarantees that estimator has a predict_proba method. + predictions = estimator.predict_proba(X) + if not loss.is_multiclass: + predictions = predictions[:, 1] # probability of positive class + eps = np.finfo(np.float32).eps # FIXME: This is quite large! + predictions = np.clip(predictions, eps, 1 - eps, dtype=np.float64) + else: + predictions = estimator.predict(X).astype(np.float64) + + if predictions.ndim == 1: + return loss.link.link(predictions).reshape(-1, 1) + else: + return loss.link.link(predictions) + + +def _update_terminal_regions( + loss, + tree, + X, + y, + neg_gradient, + raw_prediction, + sample_weight, + sample_mask, + learning_rate=0.1, + k=0, +): + """Update the leaf values to be predicted by the tree and raw_prediction. + + The current raw predictions of the model (of this stage) are updated. + + Additionally, the terminal regions (=leaves) of the given tree are updated as well. + This corresponds to the line search step in "Greedy Function Approximation" by + Friedman, Algorithm 1 step 5. + + Update equals: + argmin_{x} loss(y_true, raw_prediction_old + x * tree.value) + + For non-trivial cases like the Binomial loss, the update has no closed formula and + is an approximation, again, see the Friedman paper. + + Also note that the update formula for the SquaredError is the identity. Therefore, + in this case, the leaf values don't need an update and only the raw_predictions are + updated (with the learning rate included). + + Parameters + ---------- + loss : BaseLoss + tree : tree.Tree + The tree object. + X : ndarray of shape (n_samples, n_features) + The data array. + y : ndarray of shape (n_samples,) + The target labels. + neg_gradient : ndarray of shape (n_samples,) + The negative gradient. + raw_prediction : ndarray of shape (n_samples, n_trees_per_iteration) + The raw predictions (i.e. values from the tree leaves) of the + tree ensemble at iteration ``i - 1``. + sample_weight : ndarray of shape (n_samples,) + The weight of each sample. + sample_mask : ndarray of shape (n_samples,) + The sample mask to be used. + learning_rate : float, default=0.1 + Learning rate shrinks the contribution of each tree by + ``learning_rate``. + k : int, default=0 + The index of the estimator being updated. + """ + # compute leaf for each sample in ``X``. 
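+    # `tree.apply(X)` returns, for every sample, the id of the leaf node it falls
+    # into; these ids are used below to index `tree.value` when the leaf values are
+    # re-estimated by the line search, and to group the in-bag samples per leaf.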
+ terminal_regions = tree.apply(X) + + if not isinstance(loss, HalfSquaredError): + # mask all which are not in sample mask. + masked_terminal_regions = terminal_regions.copy() + masked_terminal_regions[~sample_mask] = -1 + + if isinstance(loss, HalfBinomialLoss): + + def compute_update(y_, indices, neg_gradient, raw_prediction, k): + # Make a single Newton-Raphson step, see "Additive Logistic Regression: + # A Statistical View of Boosting" FHT00 and note that we use a slightly + # different version (factor 2) of "F" with proba=expit(raw_prediction). + # Our node estimate is given by: + # sum(w * (y - prob)) / sum(w * prob * (1 - prob)) + # we take advantage that: y - prob = neg_gradient + neg_g = neg_gradient.take(indices, axis=0) + prob = y_ - neg_g + # numerator = negative gradient = y - prob + numerator = np.average(neg_g, weights=sw) + # denominator = hessian = prob * (1 - prob) + denominator = np.average(prob * (1 - prob), weights=sw) + return _safe_divide(numerator, denominator) + + elif isinstance(loss, HalfMultinomialLoss): + + def compute_update(y_, indices, neg_gradient, raw_prediction, k): + # we take advantage that: y - prob = neg_gradient + neg_g = neg_gradient.take(indices, axis=0) + prob = y_ - neg_g + K = loss.n_classes + # numerator = negative gradient * (k - 1) / k + # Note: The factor (k - 1)/k appears in the original papers "Greedy + # Function Approximation" by Friedman and "Additive Logistic + # Regression" by Friedman, Hastie, Tibshirani. This factor is, however, + # wrong or at least arbitrary as it directly multiplies the + # learning_rate. We keep it for backward compatibility. + numerator = np.average(neg_g, weights=sw) + numerator *= (K - 1) / K + # denominator = (diagonal) hessian = prob * (1 - prob) + denominator = np.average(prob * (1 - prob), weights=sw) + return _safe_divide(numerator, denominator) + + elif isinstance(loss, ExponentialLoss): + + def compute_update(y_, indices, neg_gradient, raw_prediction, k): + neg_g = neg_gradient.take(indices, axis=0) + # numerator = negative gradient = y * exp(-raw) - (1-y) * exp(raw) + numerator = np.average(neg_g, weights=sw) + # denominator = hessian = y * exp(-raw) + (1-y) * exp(raw) + # if y=0: hessian = exp(raw) = -neg_g + # y=1: hessian = exp(-raw) = neg_g + hessian = neg_g.copy() + hessian[y_ == 0] *= -1 + denominator = np.average(hessian, weights=sw) + return _safe_divide(numerator, denominator) + + else: + + def compute_update(y_, indices, neg_gradient, raw_prediction, k): + return loss.fit_intercept_only( + y_true=y_ - raw_prediction[indices, k], + sample_weight=sw, + ) + + # update each leaf (= perform line search) + for leaf in np.nonzero(tree.children_left == TREE_LEAF)[0]: + indices = np.nonzero(masked_terminal_regions == leaf)[ + 0 + ] # of terminal regions + y_ = y.take(indices, axis=0) + sw = None if sample_weight is None else sample_weight[indices] + update = compute_update(y_, indices, neg_gradient, raw_prediction, k) + + # TODO: Multiply here by learning rate instead of everywhere else. + tree.value[leaf, 0, 0] = update + + # update predictions (both in-bag and out-of-bag) + raw_prediction[:, k] += learning_rate * tree.value[:, 0, 0].take( + terminal_regions, axis=0 + ) + + +def set_huber_delta(loss, y_true, raw_prediction, sample_weight=None): + """Calculate and set self.closs.delta based on self.quantile.""" + abserr = np.abs(y_true - raw_prediction.squeeze()) + # sample_weight is always a ndarray, never None. 
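+    # `delta` is the threshold at which the Huber loss switches from its quadratic
+    # to its linear branch; here it is estimated from the current absolute residuals
+    # as their (100 * loss.quantile)-th weighted percentile, so the largest residuals
+    # fall into the outlier-robust linear regime.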
+ delta = _weighted_percentile(abserr, sample_weight, 100 * loss.quantile) + loss.closs.delta = float(delta) + + +class VerboseReporter: + """Reports verbose output to stdout. + + Parameters + ---------- + verbose : int + Verbosity level. If ``verbose==1`` output is printed once in a while + (when iteration mod verbose_mod is zero).; if larger than 1 then output + is printed for each update. + """ + + def __init__(self, verbose): + self.verbose = verbose + + def init(self, est, begin_at_stage=0): + """Initialize reporter + + Parameters + ---------- + est : Estimator + The estimator + + begin_at_stage : int, default=0 + stage at which to begin reporting + """ + # header fields and line format str + header_fields = ["Iter", "Train Loss"] + verbose_fmt = ["{iter:>10d}", "{train_score:>16.4f}"] + # do oob? + if est.subsample < 1: + header_fields.append("OOB Improve") + verbose_fmt.append("{oob_impr:>16.4f}") + header_fields.append("Remaining Time") + verbose_fmt.append("{remaining_time:>16s}") + + # print the header line + print(("%10s " + "%16s " * (len(header_fields) - 1)) % tuple(header_fields)) + + self.verbose_fmt = " ".join(verbose_fmt) + # plot verbose info each time i % verbose_mod == 0 + self.verbose_mod = 1 + self.start_time = time() + self.begin_at_stage = begin_at_stage + + def update(self, j, est): + """Update reporter with new iteration. + + Parameters + ---------- + j : int + The new iteration. + est : Estimator + The estimator. + """ + do_oob = est.subsample < 1 + # we need to take into account if we fit additional estimators. + i = j - self.begin_at_stage # iteration relative to the start iter + if (i + 1) % self.verbose_mod == 0: + oob_impr = est.oob_improvement_[j] if do_oob else 0 + remaining_time = ( + (est.n_estimators - (j + 1)) * (time() - self.start_time) / float(i + 1) + ) + if remaining_time > 60: + remaining_time = "{0:.2f}m".format(remaining_time / 60.0) + else: + remaining_time = "{0:.2f}s".format(remaining_time) + print( + self.verbose_fmt.format( + iter=j + 1, + train_score=est.train_score_[j], + oob_impr=oob_impr, + remaining_time=remaining_time, + ) + ) + if self.verbose == 1 and ((i + 1) // (self.verbose_mod * 10) > 0): + # adjust verbose frequency (powers of 10) + self.verbose_mod *= 10 + + +class BaseGradientBoosting(BaseEnsemble, metaclass=ABCMeta): + """Abstract base class for Gradient Boosting.""" + + _parameter_constraints: dict = { + **DecisionTreeRegressor._parameter_constraints, + "learning_rate": [Interval(Real, 0.0, None, closed="left")], + "n_estimators": [Interval(Integral, 1, None, closed="left")], + "criterion": [StrOptions({"friedman_mse", "squared_error"})], + "subsample": [Interval(Real, 0.0, 1.0, closed="right")], + "verbose": ["verbose"], + "warm_start": ["boolean"], + "validation_fraction": [Interval(Real, 0.0, 1.0, closed="neither")], + "n_iter_no_change": [Interval(Integral, 1, None, closed="left"), None], + "tol": [Interval(Real, 0.0, None, closed="left")], + } + _parameter_constraints.pop("splitter") + _parameter_constraints.pop("monotonic_cst") + + @abstractmethod + def __init__( + self, + *, + loss, + learning_rate, + n_estimators, + criterion, + min_samples_split, + min_samples_leaf, + min_weight_fraction_leaf, + max_depth, + min_impurity_decrease, + init, + subsample, + max_features, + ccp_alpha, + random_state, + alpha=0.9, + verbose=0, + max_leaf_nodes=None, + warm_start=False, + validation_fraction=0.1, + n_iter_no_change=None, + tol=1e-4, + ): + self.n_estimators = n_estimators + self.learning_rate = learning_rate + self.loss 
= loss + self.criterion = criterion + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.min_weight_fraction_leaf = min_weight_fraction_leaf + self.subsample = subsample + self.max_features = max_features + self.max_depth = max_depth + self.min_impurity_decrease = min_impurity_decrease + self.ccp_alpha = ccp_alpha + self.init = init + self.random_state = random_state + self.alpha = alpha + self.verbose = verbose + self.max_leaf_nodes = max_leaf_nodes + self.warm_start = warm_start + self.validation_fraction = validation_fraction + self.n_iter_no_change = n_iter_no_change + self.tol = tol + + @abstractmethod + def _encode_y(self, y=None, sample_weight=None): + """Called by fit to validate and encode y.""" + + @abstractmethod + def _get_loss(self, sample_weight): + """Get loss object from sklearn._loss.loss.""" + + def _fit_stage( + self, + i, + X, + y, + raw_predictions, + sample_weight, + sample_mask, + random_state, + X_csc=None, + X_csr=None, + ): + """Fit another stage of ``n_trees_per_iteration_`` trees.""" + original_y = y + + if isinstance(self._loss, HuberLoss): + set_huber_delta( + loss=self._loss, + y_true=y, + raw_prediction=raw_predictions, + sample_weight=sample_weight, + ) + # TODO: Without oob, i.e. with self.subsample = 1.0, we could call + # self._loss.loss_gradient and use it to set train_score_. + # But note that train_score_[i] is the score AFTER fitting the i-th tree. + # Note: We need the negative gradient! + neg_gradient = -self._loss.gradient( + y_true=y, + raw_prediction=raw_predictions, + sample_weight=None, # We pass sample_weights to the tree directly. + ) + # 2-d views of shape (n_samples, n_trees_per_iteration_) or (n_samples, 1) + # on neg_gradient to simplify the loop over n_trees_per_iteration_. + if neg_gradient.ndim == 1: + neg_g_view = neg_gradient.reshape((-1, 1)) + else: + neg_g_view = neg_gradient + + for k in range(self.n_trees_per_iteration_): + if self._loss.is_multiclass: + y = np.array(original_y == k, dtype=np.float64) + + # induce regression tree on the negative gradient + tree = DecisionTreeRegressor( + criterion=self.criterion, + splitter="best", + max_depth=self.max_depth, + min_samples_split=self.min_samples_split, + min_samples_leaf=self.min_samples_leaf, + min_weight_fraction_leaf=self.min_weight_fraction_leaf, + min_impurity_decrease=self.min_impurity_decrease, + max_features=self.max_features, + max_leaf_nodes=self.max_leaf_nodes, + random_state=random_state, + ccp_alpha=self.ccp_alpha, + ) + + if self.subsample < 1.0: + # no inplace multiplication! 
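+ # (an in-place update would zero the out-of-bag weights in the caller's
+ # array, which is reused across boosting stages; the product below makes
+ # a fresh per-stage copy with out-of-bag samples weighted zero)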
+ sample_weight = sample_weight * sample_mask.astype(np.float64) + + X = X_csc if X_csc is not None else X + tree.fit( + X, neg_g_view[:, k], sample_weight=sample_weight, check_input=False + ) + + # update tree leaves + X_for_tree_update = X_csr if X_csr is not None else X + _update_terminal_regions( + self._loss, + tree.tree_, + X_for_tree_update, + y, + neg_g_view[:, k], + raw_predictions, + sample_weight, + sample_mask, + learning_rate=self.learning_rate, + k=k, + ) + + # add tree to ensemble + self.estimators_[i, k] = tree + + return raw_predictions + + def _set_max_features(self): + """Set self.max_features_.""" + if isinstance(self.max_features, str): + if self.max_features == "auto": + if is_classifier(self): + max_features = max(1, int(np.sqrt(self.n_features_in_))) + else: + max_features = self.n_features_in_ + elif self.max_features == "sqrt": + max_features = max(1, int(np.sqrt(self.n_features_in_))) + else: # self.max_features == "log2" + max_features = max(1, int(np.log2(self.n_features_in_))) + elif self.max_features is None: + max_features = self.n_features_in_ + elif isinstance(self.max_features, Integral): + max_features = self.max_features + else: # float + max_features = max(1, int(self.max_features * self.n_features_in_)) + + self.max_features_ = max_features + + def _init_state(self): + """Initialize model state and allocate model state data structures.""" + + self.init_ = self.init + if self.init_ is None: + if is_classifier(self): + self.init_ = DummyClassifier(strategy="prior") + elif isinstance(self._loss, (AbsoluteError, HuberLoss)): + self.init_ = DummyRegressor(strategy="quantile", quantile=0.5) + elif isinstance(self._loss, PinballLoss): + self.init_ = DummyRegressor(strategy="quantile", quantile=self.alpha) + else: + self.init_ = DummyRegressor(strategy="mean") + + self.estimators_ = np.empty( + (self.n_estimators, self.n_trees_per_iteration_), dtype=object + ) + self.train_score_ = np.zeros((self.n_estimators,), dtype=np.float64) + # do oob? 
+ if self.subsample < 1.0: + self.oob_improvement_ = np.zeros((self.n_estimators), dtype=np.float64) + self.oob_scores_ = np.zeros((self.n_estimators), dtype=np.float64) + self.oob_score_ = np.nan + + def _clear_state(self): + """Clear the state of the gradient boosting model.""" + if hasattr(self, "estimators_"): + self.estimators_ = np.empty((0, 0), dtype=object) + if hasattr(self, "train_score_"): + del self.train_score_ + if hasattr(self, "oob_improvement_"): + del self.oob_improvement_ + if hasattr(self, "oob_scores_"): + del self.oob_scores_ + if hasattr(self, "oob_score_"): + del self.oob_score_ + if hasattr(self, "init_"): + del self.init_ + if hasattr(self, "_rng"): + del self._rng + + def _resize_state(self): + """Add additional ``n_estimators`` entries to all attributes.""" + # self.n_estimators is the number of additional est to fit + total_n_estimators = self.n_estimators + if total_n_estimators < self.estimators_.shape[0]: + raise ValueError( + "resize with smaller n_estimators %d < %d" + % (total_n_estimators, self.estimators_[0]) + ) + + self.estimators_ = np.resize( + self.estimators_, (total_n_estimators, self.n_trees_per_iteration_) + ) + self.train_score_ = np.resize(self.train_score_, total_n_estimators) + if self.subsample < 1 or hasattr(self, "oob_improvement_"): + # if do oob resize arrays or create new if not available + if hasattr(self, "oob_improvement_"): + self.oob_improvement_ = np.resize( + self.oob_improvement_, total_n_estimators + ) + self.oob_scores_ = np.resize(self.oob_scores_, total_n_estimators) + self.oob_score_ = np.nan + else: + self.oob_improvement_ = np.zeros( + (total_n_estimators,), dtype=np.float64 + ) + self.oob_scores_ = np.zeros((total_n_estimators,), dtype=np.float64) + self.oob_score_ = np.nan + + def _is_fitted(self): + return len(getattr(self, "estimators_", [])) > 0 + + def _check_initialized(self): + """Check that the estimator is initialized, raising an error if not.""" + check_is_fitted(self) + + @_fit_context( + # GradientBoosting*.init is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y, sample_weight=None, monitor=None): + """Fit the gradient boosting model. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csr_matrix``. + + y : array-like of shape (n_samples,) + Target values (strings or integers in classification, real numbers + in regression) + For classification, labels must correspond to classes. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. Splits + that would create child nodes with net zero or negative weight are + ignored while searching for a split in each node. In the case of + classification, splits are also ignored if they would result in any + single class carrying a negative weight in either child node. + + monitor : callable, default=None + The monitor is called after each iteration with the current + iteration, a reference to the estimator and the local variables of + ``_fit_stages`` as keyword arguments ``callable(i, self, + locals())``. If the callable returns ``True`` the fitting procedure + is stopped. The monitor can be used for various things such as + computing held-out estimates, early stopping, model introspect, and + snapshotting. + + Returns + ------- + self : object + Fitted estimator. 
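+ 
+ A minimal ``monitor`` sketch (illustrative only; the callback name is
+ arbitrary and ``oob_improvement_`` is only populated when ``subsample < 1``)
+ that stops fitting once the out-of-bag improvement turns negative::
+ 
+ def stop_on_oob_decline(i, est, locals_dict):
+     return i > 0 and est.oob_improvement_[i] < 0
+ 
+ clf = GradientBoostingClassifier(subsample=0.5)
+ clf.fit(X, y, monitor=stop_on_oob_decline)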
+ """ + if not self.warm_start: + self._clear_state() + + # Check input + # Since check_array converts both X and y to the same dtype, but the + # trees use different types for X and y, checking them separately. + + X, y = validate_data( + self, + X, + y, + accept_sparse=["csr", "csc", "coo"], + dtype=DTYPE, + multi_output=True, + ) + sample_weight_is_none = sample_weight is None + sample_weight = _check_sample_weight(sample_weight, X) + if sample_weight_is_none: + y = self._encode_y(y=y, sample_weight=None) + else: + y = self._encode_y(y=y, sample_weight=sample_weight) + y = column_or_1d(y, warn=True) # TODO: Is this still required? + + self._set_max_features() + + # self.loss is guaranteed to be a string + self._loss = self._get_loss(sample_weight=sample_weight) + + if self.n_iter_no_change is not None: + stratify = y if is_classifier(self) else None + ( + X_train, + X_val, + y_train, + y_val, + sample_weight_train, + sample_weight_val, + ) = train_test_split( + X, + y, + sample_weight, + random_state=self.random_state, + test_size=self.validation_fraction, + stratify=stratify, + ) + if is_classifier(self): + if self.n_classes_ != np.unique(y_train).shape[0]: + # We choose to error here. The problem is that the init + # estimator would be trained on y, which has some missing + # classes now, so its predictions would not have the + # correct shape. + raise ValueError( + "The training data after the early stopping split " + "is missing some classes. Try using another random " + "seed." + ) + else: + X_train, y_train, sample_weight_train = X, y, sample_weight + X_val = y_val = sample_weight_val = None + + n_samples = X_train.shape[0] + + # First time calling fit. + if not self._is_fitted(): + # init state + self._init_state() + + # fit initial model and initialize raw predictions + if self.init_ == "zero": + raw_predictions = np.zeros( + shape=(n_samples, self.n_trees_per_iteration_), + dtype=np.float64, + ) + else: + # XXX clean this once we have a support_sample_weight tag + if sample_weight_is_none: + self.init_.fit(X_train, y_train) + else: + msg = ( + "The initial estimator {} does not support sample " + "weights.".format(self.init_.__class__.__name__) + ) + try: + self.init_.fit( + X_train, y_train, sample_weight=sample_weight_train + ) + except TypeError as e: + if "unexpected keyword argument 'sample_weight'" in str(e): + # regular estimator without SW support + raise ValueError(msg) from e + else: # regular estimator whose input checking failed + raise + except ValueError as e: + if ( + "pass parameters to specific steps of " + "your pipeline using the " + "stepname__parameter" in str(e) + ): # pipeline + raise ValueError(msg) from e + else: # regular estimator whose input checking failed + raise + + raw_predictions = _init_raw_predictions( + X_train, self.init_, self._loss, is_classifier(self) + ) + + begin_at_stage = 0 + + # The rng state must be preserved if warm_start is True + self._rng = check_random_state(self.random_state) + + # warm start: this is not the first time fit was called + else: + # add more estimators to fitted model + # invariant: warm_start = True + if self.n_estimators < self.estimators_.shape[0]: + raise ValueError( + "n_estimators=%d must be larger or equal to " + "estimators_.shape[0]=%d when " + "warm_start==True" % (self.n_estimators, self.estimators_.shape[0]) + ) + begin_at_stage = self.estimators_.shape[0] + # The requirements of _raw_predict + # are more constrained than fit. It accepts only CSR + # matrices. 
Finite values have already been checked in _validate_data. + X_train = check_array( + X_train, + dtype=DTYPE, + order="C", + accept_sparse="csr", + ensure_all_finite=False, + ) + raw_predictions = self._raw_predict(X_train) + self._resize_state() + + # fit the boosting stages + n_stages = self._fit_stages( + X_train, + y_train, + raw_predictions, + sample_weight_train, + self._rng, + X_val, + y_val, + sample_weight_val, + begin_at_stage, + monitor, + ) + + # change shape of arrays after fit (early-stopping or additional ests) + if n_stages != self.estimators_.shape[0]: + self.estimators_ = self.estimators_[:n_stages] + self.train_score_ = self.train_score_[:n_stages] + if hasattr(self, "oob_improvement_"): + # OOB scores were computed + self.oob_improvement_ = self.oob_improvement_[:n_stages] + self.oob_scores_ = self.oob_scores_[:n_stages] + self.oob_score_ = self.oob_scores_[-1] + self.n_estimators_ = n_stages + return self + + def _fit_stages( + self, + X, + y, + raw_predictions, + sample_weight, + random_state, + X_val, + y_val, + sample_weight_val, + begin_at_stage=0, + monitor=None, + ): + """Iteratively fits the stages. + + For each stage it computes the progress (OOB, train score) + and delegates to ``_fit_stage``. + Returns the number of stages fit; might differ from ``n_estimators`` + due to early stopping. + """ + n_samples = X.shape[0] + do_oob = self.subsample < 1.0 + sample_mask = np.ones((n_samples,), dtype=bool) + n_inbag = max(1, int(self.subsample * n_samples)) + + if self.verbose: + verbose_reporter = VerboseReporter(verbose=self.verbose) + verbose_reporter.init(self, begin_at_stage) + + X_csc = csc_matrix(X) if issparse(X) else None + X_csr = csr_matrix(X) if issparse(X) else None + + if self.n_iter_no_change is not None: + loss_history = np.full(self.n_iter_no_change, np.inf) + # We create a generator to get the predictions for X_val after + # the addition of each successive stage + y_val_pred_iter = self._staged_raw_predict(X_val, check_input=False) + + # Older versions of GBT had its own loss functions. With the new common + # private loss function submodule _loss, we often are a factor of 2 + # away from the old version. Here we keep backward compatibility for + # oob_scores_ and oob_improvement_, even if the old way is quite + # inconsistent (sometimes the gradient is half the gradient, sometimes + # not). 
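+ # (i.e. factor == 2 for the "half" losses below, so that train_score_ and
+ # the oob_* attributes keep the historic deviance / squared error scale)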
+ if isinstance( + self._loss, + ( + HalfSquaredError, + HalfBinomialLoss, + ), + ): + factor = 2 + else: + factor = 1 + + # perform boosting iterations + i = begin_at_stage + for i in range(begin_at_stage, self.n_estimators): + # subsampling + if do_oob: + sample_mask = _random_sample_mask(n_samples, n_inbag, random_state) + y_oob_masked = y[~sample_mask] + sample_weight_oob_masked = sample_weight[~sample_mask] + if i == 0: # store the initial loss to compute the OOB score + initial_loss = factor * self._loss( + y_true=y_oob_masked, + raw_prediction=raw_predictions[~sample_mask], + sample_weight=sample_weight_oob_masked, + ) + + # fit next stage of trees + raw_predictions = self._fit_stage( + i, + X, + y, + raw_predictions, + sample_weight, + sample_mask, + random_state, + X_csc=X_csc, + X_csr=X_csr, + ) + + # track loss + if do_oob: + self.train_score_[i] = factor * self._loss( + y_true=y[sample_mask], + raw_prediction=raw_predictions[sample_mask], + sample_weight=sample_weight[sample_mask], + ) + self.oob_scores_[i] = factor * self._loss( + y_true=y_oob_masked, + raw_prediction=raw_predictions[~sample_mask], + sample_weight=sample_weight_oob_masked, + ) + previous_loss = initial_loss if i == 0 else self.oob_scores_[i - 1] + self.oob_improvement_[i] = previous_loss - self.oob_scores_[i] + self.oob_score_ = self.oob_scores_[-1] + else: + # no need to fancy index w/ no subsampling + self.train_score_[i] = factor * self._loss( + y_true=y, + raw_prediction=raw_predictions, + sample_weight=sample_weight, + ) + + if self.verbose > 0: + verbose_reporter.update(i, self) + + if monitor is not None: + early_stopping = monitor(i, self, locals()) + if early_stopping: + break + + # We also provide an early stopping based on the score from + # validation set (X_val, y_val), if n_iter_no_change is set + if self.n_iter_no_change is not None: + # By calling next(y_val_pred_iter), we get the predictions + # for X_val after the addition of the current stage + validation_loss = factor * self._loss( + y_val, next(y_val_pred_iter), sample_weight_val + ) + + # Require validation_score to be better (less) than at least + # one of the last n_iter_no_change evaluations + if np.any(validation_loss + self.tol < loss_history): + loss_history[i % len(loss_history)] = validation_loss + else: + break + + return i + 1 + + def _make_estimator(self, append=True): + # we don't need _make_estimator + raise NotImplementedError() + + def _raw_predict_init(self, X): + """Check input and compute raw predictions of the init estimator.""" + self._check_initialized() + X = self.estimators_[0, 0]._validate_X_predict(X, check_input=True) + if self.init_ == "zero": + raw_predictions = np.zeros( + shape=(X.shape[0], self.n_trees_per_iteration_), dtype=np.float64 + ) + else: + raw_predictions = _init_raw_predictions( + X, self.init_, self._loss, is_classifier(self) + ) + return raw_predictions + + def _raw_predict(self, X): + """Return the sum of the trees raw predictions (+ init estimator).""" + check_is_fitted(self) + raw_predictions = self._raw_predict_init(X) + predict_stages(self.estimators_, X, self.learning_rate, raw_predictions) + return raw_predictions + + def _staged_raw_predict(self, X, check_input=True): + """Compute raw predictions of ``X`` for each iteration. + + This method allows monitoring (i.e. determine error on testing set) + after each stage. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. 
Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csr_matrix``. + + check_input : bool, default=True + If False, the input arrays X will not be checked. + + Returns + ------- + raw_predictions : generator of ndarray of shape (n_samples, k) + The raw predictions of the input samples. The order of the + classes corresponds to that in the attribute :term:`classes_`. + Regression and binary classification are special cases with + ``k == 1``, otherwise ``k==n_classes``. + """ + if check_input: + X = validate_data( + self, X, dtype=DTYPE, order="C", accept_sparse="csr", reset=False + ) + raw_predictions = self._raw_predict_init(X) + for i in range(self.estimators_.shape[0]): + predict_stage(self.estimators_, i, X, self.learning_rate, raw_predictions) + yield raw_predictions.copy() + + @property + def feature_importances_(self): + """The impurity-based feature importances. + + The higher, the more important the feature. + The importance of a feature is computed as the (normalized) + total reduction of the criterion brought by that feature. It is also + known as the Gini importance. + + Warning: impurity-based feature importances can be misleading for + high cardinality features (many unique values). See + :func:`sklearn.inspection.permutation_importance` as an alternative. + + Returns + ------- + feature_importances_ : ndarray of shape (n_features,) + The values of this array sum to 1, unless all trees are single node + trees consisting of only the root node, in which case it will be an + array of zeros. + """ + self._check_initialized() + + relevant_trees = [ + tree + for stage in self.estimators_ + for tree in stage + if tree.tree_.node_count > 1 + ] + if not relevant_trees: + # degenerate case where all trees have only one node + return np.zeros(shape=self.n_features_in_, dtype=np.float64) + + relevant_feature_importances = [ + tree.tree_.compute_feature_importances(normalize=False) + for tree in relevant_trees + ] + avg_feature_importances = np.mean( + relevant_feature_importances, axis=0, dtype=np.float64 + ) + return avg_feature_importances / np.sum(avg_feature_importances) + + def _compute_partial_dependence_recursion(self, grid, target_features): + """Fast partial dependence computation. + + Parameters + ---------- + grid : ndarray of shape (n_samples, n_target_features), dtype=np.float32 + The grid points on which the partial dependence should be + evaluated. + target_features : ndarray of shape (n_target_features,), dtype=np.intp + The set of target features for which the partial dependence + should be evaluated. + + Returns + ------- + averaged_predictions : ndarray of shape \ + (n_trees_per_iteration_, n_samples) + The value of the partial dependence function on each grid point. + """ + if self.init is not None: + warnings.warn( + "Using recursion method with a non-constant init predictor " + "will lead to incorrect partial dependence values. " + "Got init=%s." 
% self.init, + UserWarning, + ) + grid = np.asarray(grid, dtype=DTYPE, order="C") + n_estimators, n_trees_per_stage = self.estimators_.shape + averaged_predictions = np.zeros( + (n_trees_per_stage, grid.shape[0]), dtype=np.float64, order="C" + ) + target_features = np.asarray(target_features, dtype=np.intp, order="C") + + for stage in range(n_estimators): + for k in range(n_trees_per_stage): + tree = self.estimators_[stage, k].tree_ + tree.compute_partial_dependence( + grid, target_features, averaged_predictions[k] + ) + averaged_predictions *= self.learning_rate + + return averaged_predictions + + def apply(self, X): + """Apply trees in the ensemble to X, return leaf indices. + + .. versionadded:: 0.17 + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, its dtype will be converted to + ``dtype=np.float32``. If a sparse matrix is provided, it will + be converted to a sparse ``csr_matrix``. + + Returns + ------- + X_leaves : array-like of shape (n_samples, n_estimators, n_classes) + For each datapoint x in X and for each tree in the ensemble, + return the index of the leaf x ends up in each estimator. + In the case of binary classification n_classes is 1. + """ + + self._check_initialized() + X = self.estimators_[0, 0]._validate_X_predict(X, check_input=True) + + # n_classes will be equal to 1 in the binary classification or the + # regression case. + n_estimators, n_classes = self.estimators_.shape + leaves = np.zeros((X.shape[0], n_estimators, n_classes)) + + for i in range(n_estimators): + for j in range(n_classes): + estimator = self.estimators_[i, j] + leaves[:, i, j] = estimator.apply(X, check_input=False) + + return leaves + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + + +class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting): + """Gradient Boosting for classification. + + This algorithm builds an additive model in a forward stage-wise fashion; it + allows for the optimization of arbitrary differentiable loss functions. In + each stage ``n_classes_`` regression trees are fit on the negative gradient + of the loss function, e.g. binary or multiclass log loss. Binary + classification is a special case where only a single regression tree is + induced. + + :class:`~sklearn.ensemble.HistGradientBoostingClassifier` is a much faster variant + of this algorithm for intermediate and large datasets (`n_samples >= 10_000`) and + supports monotonic constraints. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + loss : {'log_loss', 'exponential'}, default='log_loss' + The loss function to be optimized. 'log_loss' refers to binomial and + multinomial deviance, the same as used in logistic regression. + It is a good choice for classification with probabilistic outputs. + For loss 'exponential', gradient boosting recovers the AdaBoost algorithm. + + learning_rate : float, default=0.1 + Learning rate shrinks the contribution of each tree by `learning_rate`. + There is a trade-off between learning_rate and n_estimators. + Values must be in the range `[0.0, inf)`. + + For an example of the effects of this parameter and its interaction with + ``subsample``, see + :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_regularization.py`. + + n_estimators : int, default=100 + The number of boosting stages to perform. 
Gradient boosting + is fairly robust to over-fitting so a large number usually + results in better performance. + Values must be in the range `[1, inf)`. + + subsample : float, default=1.0 + The fraction of samples to be used for fitting the individual base + learners. If smaller than 1.0 this results in Stochastic Gradient + Boosting. `subsample` interacts with the parameter `n_estimators`. + Choosing `subsample < 1.0` leads to a reduction of variance + and an increase in bias. + Values must be in the range `(0.0, 1.0]`. + + criterion : {'friedman_mse', 'squared_error'}, default='friedman_mse' + The function to measure the quality of a split. Supported criteria are + 'friedman_mse' for the mean squared error with improvement score by + Friedman, 'squared_error' for mean squared error. The default value of + 'friedman_mse' is generally the best as it can provide a better + approximation in some cases. + + .. versionadded:: 0.18 + + min_samples_split : int or float, default=2 + The minimum number of samples required to split an internal node: + + - If int, values must be in the range `[2, inf)`. + - If float, values must be in the range `(0.0, 1.0]` and `min_samples_split` + will be `ceil(min_samples_split * n_samples)`. + + .. versionchanged:: 0.18 + Added float values for fractions. + + min_samples_leaf : int or float, default=1 + The minimum number of samples required to be at a leaf node. + A split point at any depth will only be considered if it leaves at + least ``min_samples_leaf`` training samples in each of the left and + right branches. This may have the effect of smoothing the model, + especially in regression. + + - If int, values must be in the range `[1, inf)`. + - If float, values must be in the range `(0.0, 1.0)` and `min_samples_leaf` + will be `ceil(min_samples_leaf * n_samples)`. + + .. versionchanged:: 0.18 + Added float values for fractions. + + min_weight_fraction_leaf : float, default=0.0 + The minimum weighted fraction of the sum total of weights (of all + the input samples) required to be at a leaf node. Samples have + equal weight when sample_weight is not provided. + Values must be in the range `[0.0, 0.5]`. + + max_depth : int or None, default=3 + Maximum depth of the individual regression estimators. The maximum + depth limits the number of nodes in the tree. Tune this parameter + for best performance; the best value depends on the interaction + of the input variables. If None, then nodes are expanded until + all leaves are pure or until all leaves contain less than + min_samples_split samples. + If int, values must be in the range `[1, inf)`. + + min_impurity_decrease : float, default=0.0 + A node will be split if this split induces a decrease of the impurity + greater than or equal to this value. + Values must be in the range `[0.0, inf)`. + + The weighted impurity decrease equation is the following:: + + N_t / N * (impurity - N_t_R / N_t * right_impurity + - N_t_L / N_t * left_impurity) + + where ``N`` is the total number of samples, ``N_t`` is the number of + samples at the current node, ``N_t_L`` is the number of samples in the + left child, and ``N_t_R`` is the number of samples in the right child. + + ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, + if ``sample_weight`` is passed. + + .. versionadded:: 0.19 + + init : estimator or 'zero', default=None + An estimator object that is used to compute the initial predictions. + ``init`` has to provide :term:`fit` and :term:`predict_proba`. 
If + 'zero', the initial raw predictions are set to zero. By default, a + ``DummyEstimator`` predicting the classes priors is used. + + random_state : int, RandomState instance or None, default=None + Controls the random seed given to each Tree estimator at each + boosting iteration. + In addition, it controls the random permutation of the features at + each split (see Notes for more details). + It also controls the random splitting of the training data to obtain a + validation set if `n_iter_no_change` is not None. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + max_features : {'sqrt', 'log2'}, int or float, default=None + The number of features to consider when looking for the best split: + + - If int, values must be in the range `[1, inf)`. + - If float, values must be in the range `(0.0, 1.0]` and the features + considered at each split will be `max(1, int(max_features * n_features_in_))`. + - If 'sqrt', then `max_features=sqrt(n_features)`. + - If 'log2', then `max_features=log2(n_features)`. + - If None, then `max_features=n_features`. + + Choosing `max_features < n_features` leads to a reduction of variance + and an increase in bias. + + Note: the search for a split does not stop until at least one + valid partition of the node samples is found, even if it requires to + effectively inspect more than ``max_features`` features. + + verbose : int, default=0 + Enable verbose output. If 1 then it prints progress and performance + once in a while (the more trees the lower the frequency). If greater + than 1 then it prints progress and performance for every tree. + Values must be in the range `[0, inf)`. + + max_leaf_nodes : int, default=None + Grow trees with ``max_leaf_nodes`` in best-first fashion. + Best nodes are defined as relative reduction in impurity. + Values must be in the range `[2, inf)`. + If `None`, then unlimited number of leaf nodes. + + warm_start : bool, default=False + When set to ``True``, reuse the solution of the previous call to fit + and add more estimators to the ensemble, otherwise, just erase the + previous solution. See :term:`the Glossary `. + + validation_fraction : float, default=0.1 + The proportion of training data to set aside as validation set for + early stopping. Values must be in the range `(0.0, 1.0)`. + Only used if ``n_iter_no_change`` is set to an integer. + + .. versionadded:: 0.20 + + n_iter_no_change : int, default=None + ``n_iter_no_change`` is used to decide if early stopping will be used + to terminate training when validation score is not improving. By + default it is set to None to disable early stopping. If set to a + number, it will set aside ``validation_fraction`` size of the training + data as validation and terminate training when validation score is not + improving in all of the previous ``n_iter_no_change`` numbers of + iterations. The split is stratified. + Values must be in the range `[1, inf)`. + See + :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_early_stopping.py`. + + .. versionadded:: 0.20 + + tol : float, default=1e-4 + Tolerance for the early stopping. When the loss is not improving + by at least tol for ``n_iter_no_change`` iterations (if set to a + number), the training stops. + Values must be in the range `[0.0, inf)`. + + .. versionadded:: 0.20 + + ccp_alpha : non-negative float, default=0.0 + Complexity parameter used for Minimal Cost-Complexity Pruning. The + subtree with the largest cost complexity that is smaller than + ``ccp_alpha`` will be chosen. 
By default, no pruning is performed. + Values must be in the range `[0.0, inf)`. + See :ref:`minimal_cost_complexity_pruning` for details. See + :ref:`sphx_glr_auto_examples_tree_plot_cost_complexity_pruning.py` + for an example of such pruning. + + .. versionadded:: 0.22 + + Attributes + ---------- + n_estimators_ : int + The number of estimators as selected by early stopping (if + ``n_iter_no_change`` is specified). Otherwise it is set to + ``n_estimators``. + + .. versionadded:: 0.20 + + n_trees_per_iteration_ : int + The number of trees that are built at each iteration. For binary classifiers, + this is always 1. + + .. versionadded:: 1.4.0 + + feature_importances_ : ndarray of shape (n_features,) + The impurity-based feature importances. + The higher, the more important the feature. + The importance of a feature is computed as the (normalized) + total reduction of the criterion brought by that feature. It is also + known as the Gini importance. + + Warning: impurity-based feature importances can be misleading for + high cardinality features (many unique values). See + :func:`sklearn.inspection.permutation_importance` as an alternative. + + oob_improvement_ : ndarray of shape (n_estimators,) + The improvement in loss on the out-of-bag samples + relative to the previous iteration. + ``oob_improvement_[0]`` is the improvement in + loss of the first stage over the ``init`` estimator. + Only available if ``subsample < 1.0``. + + oob_scores_ : ndarray of shape (n_estimators,) + The full history of the loss values on the out-of-bag + samples. Only available if `subsample < 1.0`. + + .. versionadded:: 1.3 + + oob_score_ : float + The last value of the loss on the out-of-bag samples. It is + the same as `oob_scores_[-1]`. Only available if `subsample < 1.0`. + + .. versionadded:: 1.3 + + train_score_ : ndarray of shape (n_estimators,) + The i-th score ``train_score_[i]`` is the loss of the + model at iteration ``i`` on the in-bag sample. + If ``subsample == 1`` this is the loss on the training data. + + init_ : estimator + The estimator that provides the initial predictions. Set via the ``init`` + argument. + + estimators_ : ndarray of DecisionTreeRegressor of \ + shape (n_estimators, ``n_trees_per_iteration_``) + The collection of fitted sub-estimators. ``n_trees_per_iteration_`` is 1 for + binary classification, otherwise ``n_classes``. + + classes_ : ndarray of shape (n_classes,) + The classes labels. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_classes_ : int + The number of classes. + + max_features_ : int + The inferred value of max_features. + + See Also + -------- + HistGradientBoostingClassifier : Histogram-based Gradient Boosting + Classification Tree. + sklearn.tree.DecisionTreeClassifier : A decision tree classifier. + RandomForestClassifier : A meta-estimator that fits a number of decision + tree classifiers on various sub-samples of the dataset and uses + averaging to improve the predictive accuracy and control over-fitting. + AdaBoostClassifier : A meta-estimator that begins by fitting a classifier + on the original dataset and then fits additional copies of the + classifier on the same dataset where the weights of incorrectly + classified instances are adjusted such that subsequent classifiers + focus more on difficult cases. 
+ + Notes + ----- + The features are always randomly permuted at each split. Therefore, + the best found split may vary, even with the same training data and + ``max_features=n_features``, if the improvement of the criterion is + identical for several splits enumerated during the search of the best + split. To obtain a deterministic behaviour during fitting, + ``random_state`` has to be fixed. + + References + ---------- + J. Friedman, Greedy Function Approximation: A Gradient Boosting + Machine, The Annals of Statistics, Vol. 29, No. 5, 2001. + + J. Friedman, Stochastic Gradient Boosting, 1999 + + T. Hastie, R. Tibshirani and J. Friedman. + Elements of Statistical Learning Ed. 2, Springer, 2009. + + Examples + -------- + The following example shows how to fit a gradient boosting classifier with + 100 decision stumps as weak learners. + + >>> from sklearn.datasets import make_hastie_10_2 + >>> from sklearn.ensemble import GradientBoostingClassifier + + >>> X, y = make_hastie_10_2(random_state=0) + >>> X_train, X_test = X[:2000], X[2000:] + >>> y_train, y_test = y[:2000], y[2000:] + + >>> clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, + ... max_depth=1, random_state=0).fit(X_train, y_train) + >>> clf.score(X_test, y_test) + 0.913 + """ + + _parameter_constraints: dict = { + **BaseGradientBoosting._parameter_constraints, + "loss": [StrOptions({"log_loss", "exponential"})], + "init": [StrOptions({"zero"}), None, HasMethods(["fit", "predict_proba"])], + } + + def __init__( + self, + *, + loss="log_loss", + learning_rate=0.1, + n_estimators=100, + subsample=1.0, + criterion="friedman_mse", + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_depth=3, + min_impurity_decrease=0.0, + init=None, + random_state=None, + max_features=None, + verbose=0, + max_leaf_nodes=None, + warm_start=False, + validation_fraction=0.1, + n_iter_no_change=None, + tol=1e-4, + ccp_alpha=0.0, + ): + super().__init__( + loss=loss, + learning_rate=learning_rate, + n_estimators=n_estimators, + criterion=criterion, + min_samples_split=min_samples_split, + min_samples_leaf=min_samples_leaf, + min_weight_fraction_leaf=min_weight_fraction_leaf, + max_depth=max_depth, + init=init, + subsample=subsample, + max_features=max_features, + random_state=random_state, + verbose=verbose, + max_leaf_nodes=max_leaf_nodes, + min_impurity_decrease=min_impurity_decrease, + warm_start=warm_start, + validation_fraction=validation_fraction, + n_iter_no_change=n_iter_no_change, + tol=tol, + ccp_alpha=ccp_alpha, + ) + + def _encode_y(self, y, sample_weight): + # encode classes into 0 ... n_classes - 1 and sets attributes classes_ + # and n_trees_per_iteration_ + check_classification_targets(y) + + label_encoder = LabelEncoder() + encoded_y_int = label_encoder.fit_transform(y) + self.classes_ = label_encoder.classes_ + n_classes = self.classes_.shape[0] + # only 1 tree for binary classification. For multiclass classification, + # we build 1 tree per class. + self.n_trees_per_iteration_ = 1 if n_classes <= 2 else n_classes + encoded_y = encoded_y_int.astype(float, copy=False) + + # From here on, it is additional to the HGBT case. 
+ # expose n_classes_ attribute + self.n_classes_ = n_classes + if sample_weight is None: + n_trim_classes = n_classes + else: + n_trim_classes = np.count_nonzero(np.bincount(encoded_y_int, sample_weight)) + + if n_trim_classes < 2: + raise ValueError( + "y contains %d class after sample_weight " + "trimmed classes with zero weights, while a " + "minimum of 2 classes are required." % n_trim_classes + ) + return encoded_y + + def _get_loss(self, sample_weight): + if self.loss == "log_loss": + if self.n_classes_ == 2: + return HalfBinomialLoss(sample_weight=sample_weight) + else: + return HalfMultinomialLoss( + sample_weight=sample_weight, n_classes=self.n_classes_ + ) + elif self.loss == "exponential": + if self.n_classes_ > 2: + raise ValueError( + f"loss='{self.loss}' is only suitable for a binary classification " + f"problem, you have n_classes={self.n_classes_}. " + "Please use loss='log_loss' instead." + ) + else: + return ExponentialLoss(sample_weight=sample_weight) + + def decision_function(self, X): + """Compute the decision function of ``X``. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csr_matrix``. + + Returns + ------- + score : ndarray of shape (n_samples, n_classes) or (n_samples,) + The decision function of the input samples, which corresponds to + the raw values predicted from the trees of the ensemble . The + order of the classes corresponds to that in the attribute + :term:`classes_`. Regression and binary classification produce an + array of shape (n_samples,). + """ + X = validate_data( + self, X, dtype=DTYPE, order="C", accept_sparse="csr", reset=False + ) + raw_predictions = self._raw_predict(X) + if raw_predictions.shape[1] == 1: + return raw_predictions.ravel() + return raw_predictions + + def staged_decision_function(self, X): + """Compute decision function of ``X`` for each iteration. + + This method allows monitoring (i.e. determine error on testing set) + after each stage. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csr_matrix``. + + Yields + ------ + score : generator of ndarray of shape (n_samples, k) + The decision function of the input samples, which corresponds to + the raw values predicted from the trees of the ensemble . The + classes corresponds to that in the attribute :term:`classes_`. + Regression and binary classification are special cases with + ``k == 1``, otherwise ``k==n_classes``. + """ + yield from self._staged_raw_predict(X) + + def predict(self, X): + """Predict class for X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csr_matrix``. + + Returns + ------- + y : ndarray of shape (n_samples,) + The predicted values. + """ + raw_predictions = self.decision_function(X) + if raw_predictions.ndim == 1: # decision_function already squeezed it + encoded_classes = (raw_predictions >= 0).astype(int) + else: + encoded_classes = np.argmax(raw_predictions, axis=1) + return self.classes_[encoded_classes] + + def staged_predict(self, X): + """Predict class at each stage for X. + + This method allows monitoring (i.e. 
determine error on testing set) + after each stage. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csr_matrix``. + + Yields + ------ + y : generator of ndarray of shape (n_samples,) + The predicted value of the input samples. + """ + if self.n_classes_ == 2: # n_trees_per_iteration_ = 1 + for raw_predictions in self._staged_raw_predict(X): + encoded_classes = (raw_predictions.squeeze() >= 0).astype(int) + yield self.classes_.take(encoded_classes, axis=0) + else: + for raw_predictions in self._staged_raw_predict(X): + encoded_classes = np.argmax(raw_predictions, axis=1) + yield self.classes_.take(encoded_classes, axis=0) + + def predict_proba(self, X): + """Predict class probabilities for X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csr_matrix``. + + Returns + ------- + p : ndarray of shape (n_samples, n_classes) + The class probabilities of the input samples. The order of the + classes corresponds to that in the attribute :term:`classes_`. + + Raises + ------ + AttributeError + If the ``loss`` does not support probabilities. + """ + raw_predictions = self.decision_function(X) + return self._loss.predict_proba(raw_predictions) + + def predict_log_proba(self, X): + """Predict class log-probabilities for X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csr_matrix``. + + Returns + ------- + p : ndarray of shape (n_samples, n_classes) + The class log-probabilities of the input samples. The order of the + classes corresponds to that in the attribute :term:`classes_`. + + Raises + ------ + AttributeError + If the ``loss`` does not support probabilities. + """ + proba = self.predict_proba(X) + return np.log(proba) + + def staged_predict_proba(self, X): + """Predict class probabilities at each stage for X. + + This method allows monitoring (i.e. determine error on testing set) + after each stage. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csr_matrix``. + + Yields + ------ + y : generator of ndarray of shape (n_samples,) + The predicted value of the input samples. + """ + try: + for raw_predictions in self._staged_raw_predict(X): + yield self._loss.predict_proba(raw_predictions) + except NotFittedError: + raise + except AttributeError as e: + raise AttributeError( + "loss=%r does not support predict_proba" % self.loss + ) from e + + +class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting): + """Gradient Boosting for regression. + + This estimator builds an additive model in a forward stage-wise fashion; it + allows for the optimization of arbitrary differentiable loss functions. In + each stage a regression tree is fit on the negative gradient of the given + loss function. + + :class:`~sklearn.ensemble.HistGradientBoostingRegressor` is a much faster variant + of this algorithm for intermediate and large datasets (`n_samples >= 10_000`) and + supports monotonic constraints. 
+ + Read more in the :ref:`User Guide `. + + Parameters + ---------- + loss : {'squared_error', 'absolute_error', 'huber', 'quantile'}, \ + default='squared_error' + Loss function to be optimized. 'squared_error' refers to the squared + error for regression. 'absolute_error' refers to the absolute error of + regression and is a robust loss function. 'huber' is a + combination of the two. 'quantile' allows quantile regression (use + `alpha` to specify the quantile). + See + :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_quantile.py` + for an example that demonstrates quantile regression for creating + prediction intervals with `loss='quantile'`. + + learning_rate : float, default=0.1 + Learning rate shrinks the contribution of each tree by `learning_rate`. + There is a trade-off between learning_rate and n_estimators. + Values must be in the range `[0.0, inf)`. + + n_estimators : int, default=100 + The number of boosting stages to perform. Gradient boosting + is fairly robust to over-fitting so a large number usually + results in better performance. + Values must be in the range `[1, inf)`. + + subsample : float, default=1.0 + The fraction of samples to be used for fitting the individual base + learners. If smaller than 1.0 this results in Stochastic Gradient + Boosting. `subsample` interacts with the parameter `n_estimators`. + Choosing `subsample < 1.0` leads to a reduction of variance + and an increase in bias. + Values must be in the range `(0.0, 1.0]`. + + criterion : {'friedman_mse', 'squared_error'}, default='friedman_mse' + The function to measure the quality of a split. Supported criteria are + "friedman_mse" for the mean squared error with improvement score by + Friedman, "squared_error" for mean squared error. The default value of + "friedman_mse" is generally the best as it can provide a better + approximation in some cases. + + .. versionadded:: 0.18 + + min_samples_split : int or float, default=2 + The minimum number of samples required to split an internal node: + + - If int, values must be in the range `[2, inf)`. + - If float, values must be in the range `(0.0, 1.0]` and `min_samples_split` + will be `ceil(min_samples_split * n_samples)`. + + .. versionchanged:: 0.18 + Added float values for fractions. + + min_samples_leaf : int or float, default=1 + The minimum number of samples required to be at a leaf node. + A split point at any depth will only be considered if it leaves at + least ``min_samples_leaf`` training samples in each of the left and + right branches. This may have the effect of smoothing the model, + especially in regression. + + - If int, values must be in the range `[1, inf)`. + - If float, values must be in the range `(0.0, 1.0)` and `min_samples_leaf` + will be `ceil(min_samples_leaf * n_samples)`. + + .. versionchanged:: 0.18 + Added float values for fractions. + + min_weight_fraction_leaf : float, default=0.0 + The minimum weighted fraction of the sum total of weights (of all + the input samples) required to be at a leaf node. Samples have + equal weight when sample_weight is not provided. + Values must be in the range `[0.0, 0.5]`. + + max_depth : int or None, default=3 + Maximum depth of the individual regression estimators. The maximum + depth limits the number of nodes in the tree. Tune this parameter + for best performance; the best value depends on the interaction + of the input variables. If None, then nodes are expanded until + all leaves are pure or until all leaves contain less than + min_samples_split samples. 
+ If int, values must be in the range `[1, inf)`. + + min_impurity_decrease : float, default=0.0 + A node will be split if this split induces a decrease of the impurity + greater than or equal to this value. + Values must be in the range `[0.0, inf)`. + + The weighted impurity decrease equation is the following:: + + N_t / N * (impurity - N_t_R / N_t * right_impurity + - N_t_L / N_t * left_impurity) + + where ``N`` is the total number of samples, ``N_t`` is the number of + samples at the current node, ``N_t_L`` is the number of samples in the + left child, and ``N_t_R`` is the number of samples in the right child. + + ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, + if ``sample_weight`` is passed. + + .. versionadded:: 0.19 + + init : estimator or 'zero', default=None + An estimator object that is used to compute the initial predictions. + ``init`` has to provide :term:`fit` and :term:`predict`. If 'zero', the + initial raw predictions are set to zero. By default a + ``DummyEstimator`` is used, predicting either the average target value + (for loss='squared_error'), or a quantile for the other losses. + + random_state : int, RandomState instance or None, default=None + Controls the random seed given to each Tree estimator at each + boosting iteration. + In addition, it controls the random permutation of the features at + each split (see Notes for more details). + It also controls the random splitting of the training data to obtain a + validation set if `n_iter_no_change` is not None. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + max_features : {'sqrt', 'log2'}, int or float, default=None + The number of features to consider when looking for the best split: + + - If int, values must be in the range `[1, inf)`. + - If float, values must be in the range `(0.0, 1.0]` and the features + considered at each split will be `max(1, int(max_features * n_features_in_))`. + - If "sqrt", then `max_features=sqrt(n_features)`. + - If "log2", then `max_features=log2(n_features)`. + - If None, then `max_features=n_features`. + + Choosing `max_features < n_features` leads to a reduction of variance + and an increase in bias. + + Note: the search for a split does not stop until at least one + valid partition of the node samples is found, even if it requires to + effectively inspect more than ``max_features`` features. + + alpha : float, default=0.9 + The alpha-quantile of the huber loss function and the quantile + loss function. Only if ``loss='huber'`` or ``loss='quantile'``. + Values must be in the range `(0.0, 1.0)`. + + verbose : int, default=0 + Enable verbose output. If 1 then it prints progress and performance + once in a while (the more trees the lower the frequency). If greater + than 1 then it prints progress and performance for every tree. + Values must be in the range `[0, inf)`. + + max_leaf_nodes : int, default=None + Grow trees with ``max_leaf_nodes`` in best-first fashion. + Best nodes are defined as relative reduction in impurity. + Values must be in the range `[2, inf)`. + If None, then unlimited number of leaf nodes. + + warm_start : bool, default=False + When set to ``True``, reuse the solution of the previous call to fit + and add more estimators to the ensemble, otherwise, just erase the + previous solution. See :term:`the Glossary `. + + validation_fraction : float, default=0.1 + The proportion of training data to set aside as validation set for + early stopping. Values must be in the range `(0.0, 1.0)`. 
+ Only used if ``n_iter_no_change`` is set to an integer. + + .. versionadded:: 0.20 + + n_iter_no_change : int, default=None + ``n_iter_no_change`` is used to decide if early stopping will be used + to terminate training when validation score is not improving. By + default it is set to None to disable early stopping. If set to a + number, it will set aside ``validation_fraction`` size of the training + data as validation and terminate training when validation score is not + improving in all of the previous ``n_iter_no_change`` numbers of + iterations. + Values must be in the range `[1, inf)`. + See + :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_early_stopping.py`. + + .. versionadded:: 0.20 + + tol : float, default=1e-4 + Tolerance for the early stopping. When the loss is not improving + by at least tol for ``n_iter_no_change`` iterations (if set to a + number), the training stops. + Values must be in the range `[0.0, inf)`. + + .. versionadded:: 0.20 + + ccp_alpha : non-negative float, default=0.0 + Complexity parameter used for Minimal Cost-Complexity Pruning. The + subtree with the largest cost complexity that is smaller than + ``ccp_alpha`` will be chosen. By default, no pruning is performed. + Values must be in the range `[0.0, inf)`. + See :ref:`minimal_cost_complexity_pruning` for details. See + :ref:`sphx_glr_auto_examples_tree_plot_cost_complexity_pruning.py` + for an example of such pruning. + + .. versionadded:: 0.22 + + Attributes + ---------- + n_estimators_ : int + The number of estimators as selected by early stopping (if + ``n_iter_no_change`` is specified). Otherwise it is set to + ``n_estimators``. + + n_trees_per_iteration_ : int + The number of trees that are built at each iteration. For regressors, this is + always 1. + + .. versionadded:: 1.4.0 + + feature_importances_ : ndarray of shape (n_features,) + The impurity-based feature importances. + The higher, the more important the feature. + The importance of a feature is computed as the (normalized) + total reduction of the criterion brought by that feature. It is also + known as the Gini importance. + + Warning: impurity-based feature importances can be misleading for + high cardinality features (many unique values). See + :func:`sklearn.inspection.permutation_importance` as an alternative. + + oob_improvement_ : ndarray of shape (n_estimators,) + The improvement in loss on the out-of-bag samples + relative to the previous iteration. + ``oob_improvement_[0]`` is the improvement in + loss of the first stage over the ``init`` estimator. + Only available if ``subsample < 1.0``. + + oob_scores_ : ndarray of shape (n_estimators,) + The full history of the loss values on the out-of-bag + samples. Only available if `subsample < 1.0`. + + .. versionadded:: 1.3 + + oob_score_ : float + The last value of the loss on the out-of-bag samples. It is + the same as `oob_scores_[-1]`. Only available if `subsample < 1.0`. + + .. versionadded:: 1.3 + + train_score_ : ndarray of shape (n_estimators,) + The i-th score ``train_score_[i]`` is the loss of the + model at iteration ``i`` on the in-bag sample. + If ``subsample == 1`` this is the loss on the training data. + + init_ : estimator + The estimator that provides the initial predictions. Set via the ``init`` + argument. + + estimators_ : ndarray of DecisionTreeRegressor of shape (n_estimators, 1) + The collection of fitted sub-estimators. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. 
versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + max_features_ : int + The inferred value of max_features. + + See Also + -------- + HistGradientBoostingRegressor : Histogram-based Gradient Boosting + Classification Tree. + sklearn.tree.DecisionTreeRegressor : A decision tree regressor. + sklearn.ensemble.RandomForestRegressor : A random forest regressor. + + Notes + ----- + The features are always randomly permuted at each split. Therefore, + the best found split may vary, even with the same training data and + ``max_features=n_features``, if the improvement of the criterion is + identical for several splits enumerated during the search of the best + split. To obtain a deterministic behaviour during fitting, + ``random_state`` has to be fixed. + + References + ---------- + J. Friedman, Greedy Function Approximation: A Gradient Boosting + Machine, The Annals of Statistics, Vol. 29, No. 5, 2001. + + J. Friedman, Stochastic Gradient Boosting, 1999 + + T. Hastie, R. Tibshirani and J. Friedman. + Elements of Statistical Learning Ed. 2, Springer, 2009. + + Examples + -------- + >>> from sklearn.datasets import make_regression + >>> from sklearn.ensemble import GradientBoostingRegressor + >>> from sklearn.model_selection import train_test_split + >>> X, y = make_regression(random_state=0) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, random_state=0) + >>> reg = GradientBoostingRegressor(random_state=0) + >>> reg.fit(X_train, y_train) + GradientBoostingRegressor(random_state=0) + >>> reg.predict(X_test[1:2]) + array([-61.1]) + >>> reg.score(X_test, y_test) + 0.4... + + For a detailed example of utilizing + :class:`~sklearn.ensemble.GradientBoostingRegressor` + to fit an ensemble of weak predictive models, please refer to + :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_regression.py`. 
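    A minimal early-stopping sketch (illustrative only, reusing ``X_train`` and
    ``y_train`` from the example above; the parameter values are arbitrary):

    >>> reg_es = GradientBoostingRegressor(
    ...     n_estimators=500, n_iter_no_change=5, validation_fraction=0.1,
    ...     tol=1e-4, random_state=0)
    >>> reg_es = reg_es.fit(X_train, y_train)
    >>> reg_es.n_estimators_ <= 500
    True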
+ """ + + _parameter_constraints: dict = { + **BaseGradientBoosting._parameter_constraints, + "loss": [StrOptions({"squared_error", "absolute_error", "huber", "quantile"})], + "init": [StrOptions({"zero"}), None, HasMethods(["fit", "predict"])], + "alpha": [Interval(Real, 0.0, 1.0, closed="neither")], + } + + def __init__( + self, + *, + loss="squared_error", + learning_rate=0.1, + n_estimators=100, + subsample=1.0, + criterion="friedman_mse", + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_depth=3, + min_impurity_decrease=0.0, + init=None, + random_state=None, + max_features=None, + alpha=0.9, + verbose=0, + max_leaf_nodes=None, + warm_start=False, + validation_fraction=0.1, + n_iter_no_change=None, + tol=1e-4, + ccp_alpha=0.0, + ): + super().__init__( + loss=loss, + learning_rate=learning_rate, + n_estimators=n_estimators, + criterion=criterion, + min_samples_split=min_samples_split, + min_samples_leaf=min_samples_leaf, + min_weight_fraction_leaf=min_weight_fraction_leaf, + max_depth=max_depth, + init=init, + subsample=subsample, + max_features=max_features, + min_impurity_decrease=min_impurity_decrease, + random_state=random_state, + alpha=alpha, + verbose=verbose, + max_leaf_nodes=max_leaf_nodes, + warm_start=warm_start, + validation_fraction=validation_fraction, + n_iter_no_change=n_iter_no_change, + tol=tol, + ccp_alpha=ccp_alpha, + ) + + def _encode_y(self, y=None, sample_weight=None): + # Just convert y to the expected dtype + self.n_trees_per_iteration_ = 1 + y = y.astype(DOUBLE, copy=False) + return y + + def _get_loss(self, sample_weight): + if self.loss in ("quantile", "huber"): + return _LOSSES[self.loss](sample_weight=sample_weight, quantile=self.alpha) + else: + return _LOSSES[self.loss](sample_weight=sample_weight) + + def predict(self, X): + """Predict regression target for X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csr_matrix``. + + Returns + ------- + y : ndarray of shape (n_samples,) + The predicted values. + """ + X = validate_data( + self, X, dtype=DTYPE, order="C", accept_sparse="csr", reset=False + ) + # In regression we can directly return the raw value from the trees. + return self._raw_predict(X).ravel() + + def staged_predict(self, X): + """Predict regression target at each stage for X. + + This method allows monitoring (i.e. determine error on testing set) + after each stage. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csr_matrix``. + + Yields + ------ + y : generator of ndarray of shape (n_samples,) + The predicted value of the input samples. + """ + for raw_predictions in self._staged_raw_predict(X): + yield raw_predictions.ravel() + + def apply(self, X): + """Apply trees in the ensemble to X, return leaf indices. + + .. versionadded:: 0.17 + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, its dtype will be converted to + ``dtype=np.float32``. If a sparse matrix is provided, it will + be converted to a sparse ``csr_matrix``. 
+ + Returns + ------- + X_leaves : array-like of shape (n_samples, n_estimators) + For each datapoint x in X and for each tree in the ensemble, + return the index of the leaf x ends up in each estimator. + """ + + leaves = super().apply(X) + leaves = leaves.reshape(X.shape[0], self.estimators_.shape[0]) + return leaves diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_gradient_boosting.pyx b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_gradient_boosting.pyx new file mode 100644 index 0000000000000000000000000000000000000000..cd9845a217c7d505ff227637ec7c3f092a432849 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_gradient_boosting.pyx @@ -0,0 +1,262 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from libc.stdlib cimport free +from libc.string cimport memset + +import numpy as np +from scipy.sparse import issparse + +from ..utils._typedefs cimport float32_t, float64_t, intp_t, int32_t, uint8_t +# Note: _tree uses cimport numpy, cnp.import_array, so we need to include +# numpy headers in the build configuration of this extension +from ..tree._tree cimport Node +from ..tree._tree cimport Tree +from ..tree._utils cimport safe_realloc + + +# no namespace lookup for numpy dtype and array creation +from numpy import zeros as np_zeros + + +# constant to mark tree leafs +cdef intp_t TREE_LEAF = -1 + +cdef void _predict_regression_tree_inplace_fast_dense( + const float32_t[:, ::1] X, + Node* root_node, + double *value, + double scale, + Py_ssize_t k, + float64_t[:, :] out +) noexcept nogil: + """Predicts output for regression tree and stores it in ``out[i, k]``. + + This function operates directly on the data arrays of the tree + data structures. This is 5x faster than the variant above because + it allows us to avoid buffer validation. + + The function assumes that the ndarray that wraps ``X`` is + c-continuous. + + Parameters + ---------- + X : float32_t 2d memory view + The memory view on the data ndarray of the input ``X``. + Assumes that the array is c-continuous. + root_node : tree Node pointer + Pointer to the main node array of the :class:``sklearn.tree.Tree``. + value : np.float64_t pointer + The pointer to the data array of the ``value`` array attribute + of the :class:``sklearn.tree.Tree``. + scale : double + A constant to scale the predictions. + k : int + The index of the tree output to be predicted. Must satisfy + 0 <= ``k`` < ``K``. + out : memory view on array of type np.float64_t + The data array where the predictions are stored. + ``out`` is assumed to be a two-dimensional array of + shape ``(n_samples, K)``. + """ + cdef intp_t n_samples = X.shape[0] + cdef Py_ssize_t i + cdef Node *node + for i in range(n_samples): + node = root_node + # While node not a leaf + while node.left_child != TREE_LEAF: + if X[i, node.feature] <= node.threshold: + node = root_node + node.left_child + else: + node = root_node + node.right_child + out[i, k] += scale * value[node - root_node] + + +def _predict_regression_tree_stages_sparse( + object[:, :] estimators, + object X, + double scale, + float64_t[:, :] out +): + """Predicts output for regression tree inplace and adds scaled value to ``out[i, k]``. + + The function assumes that the ndarray that wraps ``X`` is csr_matrix. 
+ """ + cdef const float32_t[::1] X_data = X.data + cdef const int32_t[::1] X_indices = X.indices + cdef const int32_t[::1] X_indptr = X.indptr + + cdef intp_t n_samples = X.shape[0] + cdef intp_t n_features = X.shape[1] + cdef intp_t n_stages = estimators.shape[0] + cdef intp_t n_outputs = estimators.shape[1] + + # Indices and temporary variables + cdef intp_t sample_i + cdef intp_t feature_i + cdef intp_t stage_i + cdef intp_t output_i + cdef Node *root_node = NULL + cdef Node *node = NULL + cdef double *value = NULL + + cdef Tree tree + cdef Node** nodes = NULL + cdef double** values = NULL + safe_realloc(&nodes, n_stages * n_outputs) + safe_realloc(&values, n_stages * n_outputs) + for stage_i in range(n_stages): + for output_i in range(n_outputs): + tree = estimators[stage_i, output_i].tree_ + nodes[stage_i * n_outputs + output_i] = tree.nodes + values[stage_i * n_outputs + output_i] = tree.value + + # Initialize auxiliary data-structure + cdef float32_t feature_value = 0. + cdef float32_t* X_sample = NULL + + # feature_to_sample as a data structure records the last seen sample + # for each feature; functionally, it is an efficient way to identify + # which features are nonzero in the present sample. + cdef intp_t* feature_to_sample = NULL + + safe_realloc(&X_sample, n_features) + safe_realloc(&feature_to_sample, n_features) + + memset(feature_to_sample, -1, n_features * sizeof(intp_t)) + + # Cycle through all samples + for sample_i in range(n_samples): + for feature_i in range(X_indptr[sample_i], X_indptr[sample_i + 1]): + feature_to_sample[X_indices[feature_i]] = sample_i + X_sample[X_indices[feature_i]] = X_data[feature_i] + + # Cycle through all stages + for stage_i in range(n_stages): + # Cycle through all trees + for output_i in range(n_outputs): + root_node = nodes[stage_i * n_outputs + output_i] + value = values[stage_i * n_outputs + output_i] + node = root_node + + # While node not a leaf + while node.left_child != TREE_LEAF: + # ... and node.right_child != TREE_LEAF: + if feature_to_sample[node.feature] == sample_i: + feature_value = X_sample[node.feature] + else: + feature_value = 0. + + if feature_value <= node.threshold: + node = root_node + node.left_child + else: + node = root_node + node.right_child + out[sample_i, output_i] += scale * value[node - root_node] + + # Free auxiliary arrays + free(X_sample) + free(feature_to_sample) + free(nodes) + free(values) + + +def predict_stages( + object[:, :] estimators, + object X, + double scale, + float64_t[:, :] out +): + """Add predictions of ``estimators`` to ``out``. + + Each estimator is scaled by ``scale`` before its prediction + is added to ``out``. 
+ """ + cdef Py_ssize_t i + cdef Py_ssize_t k + cdef Py_ssize_t n_estimators = estimators.shape[0] + cdef Py_ssize_t K = estimators.shape[1] + cdef Tree tree + + if issparse(X): + if X.format != 'csr': + raise ValueError("When X is a sparse matrix, a CSR format is" + " expected, got {!r}".format(type(X))) + _predict_regression_tree_stages_sparse( + estimators=estimators, X=X, scale=scale, out=out + ) + else: + if not isinstance(X, np.ndarray) or np.isfortran(X): + raise ValueError(f"X should be C-ordered np.ndarray, got {type(X)}") + + for i in range(n_estimators): + for k in range(K): + tree = estimators[i, k].tree_ + + # avoid buffer validation by casting to ndarray + # and get data pointer + # need brackets because of casting operator priority + _predict_regression_tree_inplace_fast_dense( + X=X, + root_node=tree.nodes, + value=tree.value, + scale=scale, + k=k, + out=out + ) + # out[:, k] += scale * tree.predict(X).ravel() + + +def predict_stage( + object[:, :] estimators, + int stage, + object X, + double scale, + float64_t[:, :] out +): + """Add predictions of ``estimators[stage]`` to ``out``. + + Each estimator in the stage is scaled by ``scale`` before + its prediction is added to ``out``. + """ + return predict_stages( + estimators=estimators[stage:stage + 1], X=X, scale=scale, out=out + ) + + +def _random_sample_mask( + intp_t n_total_samples, + intp_t n_total_in_bag, + random_state +): + """Create a random sample mask where ``n_total_in_bag`` elements are set. + + Parameters + ---------- + n_total_samples : int + The length of the resulting mask. + + n_total_in_bag : int + The number of elements in the sample mask which are set to 1. + + random_state : RandomState + A numpy ``RandomState`` object. + + Returns + ------- + sample_mask : np.ndarray, shape=[n_total_samples] + An ndarray where ``n_total_in_bag`` elements are set to ``True`` + the others are ``False``. + """ + cdef float64_t[::1] rand = random_state.uniform(size=n_total_samples) + cdef uint8_t[::1] sample_mask = np_zeros((n_total_samples,), dtype=bool) + + cdef intp_t n_bagged = 0 + cdef intp_t i = 0 + + for i in range(n_total_samples): + if rand[i] * (n_total_samples - i) < (n_total_in_bag - n_bagged): + sample_mask[i] = 1 + n_bagged += 1 + + return sample_mask.base diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5939d83c8483812187c39d373e425630a9e44fe5 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/__init__.py @@ -0,0 +1,8 @@ +"""This module implements histogram-based gradient boosting estimators. + +The implementation is a port from pygbm which is itself strongly inspired +from LightGBM. 
+""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/_binning.cpython-312-x86_64-linux-gnu.so b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/_binning.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..34ab20733dfd9310b157edfc130f774bf063959c Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/_binning.cpython-312-x86_64-linux-gnu.so differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx new file mode 100644 index 0000000000000000000000000000000000000000..f343ada64cdd0c0923184b5bafbaf1f0a9526a12 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx @@ -0,0 +1,85 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from cython.parallel import prange +from libc.math cimport isnan + +from .common cimport X_DTYPE_C, X_BINNED_DTYPE_C +from ...utils._typedefs cimport uint8_t + + +def _map_to_bins(const X_DTYPE_C [:, :] data, + list binning_thresholds, + const uint8_t[::1] is_categorical, + const uint8_t missing_values_bin_idx, + int n_threads, + X_BINNED_DTYPE_C [::1, :] binned): + """Bin continuous and categorical values to discrete integer-coded levels. + + A given value x is mapped into bin value i iff + thresholds[i - 1] < x <= thresholds[i] + + Parameters + ---------- + data : ndarray, shape (n_samples, n_features) + The data to bin. + binning_thresholds : list of arrays + For each feature, stores the increasing numeric values that are + used to separate the bins. + is_categorical : ndarray of uint8_t of shape (n_features,) + Indicates categorical features. + n_threads : int + Number of OpenMP threads to use. + binned : ndarray, shape (n_samples, n_features) + Output array, must be fortran aligned. + """ + cdef: + int feature_idx + + for feature_idx in range(data.shape[1]): + _map_col_to_bins( + data[:, feature_idx], + binning_thresholds[feature_idx], + is_categorical[feature_idx], + missing_values_bin_idx, + n_threads, + binned[:, feature_idx] + ) + + +cdef void _map_col_to_bins( + const X_DTYPE_C [:] data, + const X_DTYPE_C [:] binning_thresholds, + const uint8_t is_categorical, + const uint8_t missing_values_bin_idx, + int n_threads, + X_BINNED_DTYPE_C [:] binned +): + """Binary search to find the bin index for each value in the data.""" + cdef: + int i + int left + int right + int middle + + for i in prange(data.shape[0], schedule='static', nogil=True, + num_threads=n_threads): + if ( + isnan(data[i]) or + # To follow LightGBM's conventions, negative values for + # categorical features are considered as missing values. 
+ (is_categorical and data[i] < 0) + ): + binned[i] = missing_values_bin_idx + else: + # for known values, use binary search + left, right = 0, binning_thresholds.shape[0] + while left < right: + # equal to (right + left - 1) // 2 but avoids overflow + middle = left + (right - left - 1) // 2 + if data[i] <= binning_thresholds[middle]: + right = middle + else: + left = middle + 1 + + binned[i] = left diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/_bitset.cpython-312-x86_64-linux-gnu.so b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/_bitset.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..d68213df8f07d204ee46bfd482025551796bf324 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/_bitset.cpython-312-x86_64-linux-gnu.so differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/_bitset.pxd b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/_bitset.pxd new file mode 100644 index 0000000000000000000000000000000000000000..c44477cfa2300620c457152d86f8053ef44cf720 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/_bitset.pxd @@ -0,0 +1,20 @@ +from .common cimport X_BINNED_DTYPE_C +from .common cimport BITSET_DTYPE_C +from .common cimport BITSET_INNER_DTYPE_C +from .common cimport X_DTYPE_C +from ...utils._typedefs cimport uint8_t + + +cdef void init_bitset(BITSET_DTYPE_C bitset) noexcept nogil + +cdef void set_bitset(BITSET_DTYPE_C bitset, X_BINNED_DTYPE_C val) noexcept nogil + +cdef uint8_t in_bitset(BITSET_DTYPE_C bitset, X_BINNED_DTYPE_C val) noexcept nogil + +cpdef uint8_t in_bitset_memoryview(const BITSET_INNER_DTYPE_C[:] bitset, + X_BINNED_DTYPE_C val) noexcept nogil + +cdef uint8_t in_bitset_2d_memoryview( + const BITSET_INNER_DTYPE_C[:, :] bitset, + X_BINNED_DTYPE_C val, + unsigned int row) noexcept nogil diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/_bitset.pyx b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/_bitset.pyx new file mode 100644 index 0000000000000000000000000000000000000000..cab20f7d5af05242102379d9e61ab7ca9a0e91f3 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/_bitset.pyx @@ -0,0 +1,65 @@ +from .common cimport BITSET_INNER_DTYPE_C +from .common cimport BITSET_DTYPE_C +from .common cimport X_DTYPE_C +from .common cimport X_BINNED_DTYPE_C +from ...utils._typedefs cimport uint8_t + + +# A bitset is a data structure used to represent sets of integers in [0, n]. We +# use them to represent sets of features indices (e.g. features that go to the +# left child, or features that are categorical). 
For familiarity with bitsets +# and bitwise operations: +# https://en.wikipedia.org/wiki/Bit_array +# https://en.wikipedia.org/wiki/Bitwise_operation + + +cdef inline void init_bitset(BITSET_DTYPE_C bitset) noexcept nogil: # OUT + cdef: + unsigned int i + + for i in range(8): + bitset[i] = 0 + + +cdef inline void set_bitset(BITSET_DTYPE_C bitset, # OUT + X_BINNED_DTYPE_C val) noexcept nogil: + bitset[val // 32] |= (1 << (val % 32)) + + +cdef inline uint8_t in_bitset(BITSET_DTYPE_C bitset, + X_BINNED_DTYPE_C val) noexcept nogil: + return (bitset[val // 32] >> (val % 32)) & 1 + + +cpdef inline uint8_t in_bitset_memoryview(const BITSET_INNER_DTYPE_C[:] bitset, + X_BINNED_DTYPE_C val) noexcept nogil: + return (bitset[val // 32] >> (val % 32)) & 1 + + +cdef inline uint8_t in_bitset_2d_memoryview(const BITSET_INNER_DTYPE_C[:, :] bitset, + X_BINNED_DTYPE_C val, + unsigned int row) noexcept nogil: + # Same as above but works on 2d memory views to avoid the creation of 1d + # memory views. See https://github.com/scikit-learn/scikit-learn/issues/17299 + return (bitset[row, val // 32] >> (val % 32)) & 1 + + +cpdef inline void set_bitset_memoryview(BITSET_INNER_DTYPE_C[:] bitset, # OUT + X_BINNED_DTYPE_C val): + bitset[val // 32] |= (1 << (val % 32)) + + +def set_raw_bitset_from_binned_bitset(BITSET_INNER_DTYPE_C[:] raw_bitset, # OUT + BITSET_INNER_DTYPE_C[:] binned_bitset, + X_DTYPE_C[:] categories): + """Set the raw_bitset from the values of the binned bitset + + categories is a mapping from binned category value to raw category value. + """ + cdef: + int binned_cat_value + X_DTYPE_C raw_cat_value + + for binned_cat_value, raw_cat_value in enumerate(categories): + if in_bitset_memoryview(binned_bitset, binned_cat_value): + set_bitset_memoryview(raw_bitset, raw_cat_value) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx new file mode 100644 index 0000000000000000000000000000000000000000..dcbbf733ebb51cf6f3ca426bd8d2955a20af3e50 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx @@ -0,0 +1,59 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from cython.parallel import prange +import numpy as np + +from .common import Y_DTYPE +from .common cimport Y_DTYPE_C + + +def _update_raw_predictions( + Y_DTYPE_C [::1] raw_predictions, # OUT + grower, + n_threads, +): + """Update raw_predictions with the predictions of the newest tree. + + This is equivalent to (and much faster than): + raw_predictions += last_estimator.predict(X_train) + + It's only possible for data X_train that is used to train the trees (it + isn't usable for e.g. X_val). 
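    A rough pure-Python sketch of the leaf-wise update performed below
    (illustrative only; it uses the same grower attributes as the code)::

        for leaf in grower.finalized_leaves:
            rows = grower.splitter.partition[leaf.partition_start:leaf.partition_stop]
            raw_predictions[rows] += leaf.value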
+ """ + cdef: + unsigned int [::1] starts # start of each leaf in partition + unsigned int [::1] stops # end of each leaf in partition + Y_DTYPE_C [::1] values # value of each leaf + const unsigned int [::1] partition = grower.splitter.partition + list leaves + + leaves = grower.finalized_leaves + starts = np.array([leaf.partition_start for leaf in leaves], + dtype=np.uint32) + stops = np.array([leaf.partition_stop for leaf in leaves], + dtype=np.uint32) + values = np.array([leaf.value for leaf in leaves], dtype=Y_DTYPE) + + _update_raw_predictions_helper(raw_predictions, starts, stops, partition, + values, n_threads) + + +cdef inline void _update_raw_predictions_helper( + Y_DTYPE_C [::1] raw_predictions, # OUT + const unsigned int [::1] starts, + const unsigned int [::1] stops, + const unsigned int [::1] partition, + const Y_DTYPE_C [::1] values, + int n_threads, +): + + cdef: + unsigned int position + int leaf_idx + int n_leaves = starts.shape[0] + + for leaf_idx in prange(n_leaves, schedule='static', nogil=True, + num_threads=n_threads): + for position in range(starts[leaf_idx], stops[leaf_idx]): + raw_predictions[partition[position]] += values[leaf_idx] diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx new file mode 100644 index 0000000000000000000000000000000000000000..8257fa974c4a00115180aaa7815389051f1d9db2 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx @@ -0,0 +1,256 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from cython.parallel import prange +from libc.math cimport isnan +import numpy as np + +from ...utils._typedefs cimport intp_t, uint8_t +from .common cimport X_DTYPE_C +from .common cimport Y_DTYPE_C +from .common import Y_DTYPE +from .common cimport X_BINNED_DTYPE_C +from .common cimport BITSET_INNER_DTYPE_C +from .common cimport node_struct +from ._bitset cimport in_bitset_2d_memoryview + + +def _predict_from_raw_data( # raw data = non-binned data + const node_struct [:] nodes, + const X_DTYPE_C [:, :] numeric_data, + const BITSET_INNER_DTYPE_C [:, ::1] raw_left_cat_bitsets, + const BITSET_INNER_DTYPE_C [:, ::1] known_cat_bitsets, + const unsigned int [::1] f_idx_map, + int n_threads, + Y_DTYPE_C [:] out): + + cdef: + int i + + for i in prange(numeric_data.shape[0], schedule='static', nogil=True, + num_threads=n_threads): + out[i] = _predict_one_from_raw_data( + nodes, numeric_data, raw_left_cat_bitsets, + known_cat_bitsets, + f_idx_map, i) + + +cdef inline Y_DTYPE_C _predict_one_from_raw_data( + const node_struct [:] nodes, + const X_DTYPE_C [:, :] numeric_data, + const BITSET_INNER_DTYPE_C [:, ::1] raw_left_cat_bitsets, + const BITSET_INNER_DTYPE_C [:, ::1] known_cat_bitsets, + const unsigned int [::1] f_idx_map, + const int row) noexcept nogil: + # Need to pass the whole array and the row index, else prange won't work. 
+ # See issue Cython #2798 + + cdef: + node_struct node = nodes[0] + unsigned int node_idx = 0 + X_DTYPE_C data_val + + while True: + if node.is_leaf: + return node.value + + data_val = numeric_data[row, node.feature_idx] + + if isnan(data_val): + if node.missing_go_to_left: + node_idx = node.left + else: + node_idx = node.right + elif node.is_categorical: + if data_val < 0: + # data_val is not in the accepted range, so it is treated as missing value + node_idx = node.left if node.missing_go_to_left else node.right + elif in_bitset_2d_memoryview( + raw_left_cat_bitsets, + data_val, + node.bitset_idx): + node_idx = node.left + elif in_bitset_2d_memoryview( + known_cat_bitsets, + data_val, + f_idx_map[node.feature_idx]): + node_idx = node.right + else: + # Treat unknown categories as missing. + node_idx = node.left if node.missing_go_to_left else node.right + else: + if data_val <= node.num_threshold: + node_idx = node.left + else: + node_idx = node.right + node = nodes[node_idx] + + +def _predict_from_binned_data( + node_struct [:] nodes, + const X_BINNED_DTYPE_C [:, :] binned_data, + BITSET_INNER_DTYPE_C [:, :] binned_left_cat_bitsets, + const uint8_t missing_values_bin_idx, + int n_threads, + Y_DTYPE_C [:] out): + + cdef: + int i + + for i in prange(binned_data.shape[0], schedule='static', nogil=True, + num_threads=n_threads): + out[i] = _predict_one_from_binned_data(nodes, + binned_data, + binned_left_cat_bitsets, i, + missing_values_bin_idx) + + +cdef inline Y_DTYPE_C _predict_one_from_binned_data( + node_struct [:] nodes, + const X_BINNED_DTYPE_C [:, :] binned_data, + const BITSET_INNER_DTYPE_C [:, :] binned_left_cat_bitsets, + const int row, + const uint8_t missing_values_bin_idx) noexcept nogil: + # Need to pass the whole array and the row index, else prange won't work. + # See issue Cython #2798 + + cdef: + node_struct node = nodes[0] + unsigned int node_idx = 0 + X_BINNED_DTYPE_C data_val + + while True: + if node.is_leaf: + return node.value + + data_val = binned_data[row, node.feature_idx] + + if data_val == missing_values_bin_idx: + if node.missing_go_to_left: + node_idx = node.left + else: + node_idx = node.right + elif node.is_categorical: + if in_bitset_2d_memoryview( + binned_left_cat_bitsets, + data_val, + node.bitset_idx): + node_idx = node.left + else: + node_idx = node.right + else: + if data_val <= node.bin_threshold: + node_idx = node.left + else: + node_idx = node.right + node = nodes[node_idx] + + +def _compute_partial_dependence( + node_struct [:] nodes, + const X_DTYPE_C [:, ::1] X, + const intp_t [:] target_features, + Y_DTYPE_C [:] out +): + """Partial dependence of the response on the ``target_features`` set. + + For each sample in ``X`` a tree traversal is performed. + Each traversal starts from the root with weight 1.0. + + At each non-leaf node that splits on a target feature, either + the left child or the right child is visited based on the feature + value of the current sample, and the weight is not modified. + At each non-leaf node that splits on a complementary feature, + both children are visited and the weight is multiplied by the fraction + of training samples which went to each child. + + At each leaf, the value of the node is multiplied by the current + weight (weights sum to 1 for all visited terminal nodes). + + Parameters + ---------- + nodes : view on array of PREDICTOR_RECORD_DTYPE, shape (n_nodes) + The array representing the predictor tree. 
+ X : view on 2d ndarray, shape (n_samples, n_target_features) + The grid points on which the partial dependence should be + evaluated. + target_features : view on 1d ndarray of intp_t, shape (n_target_features) + The set of target features for which the partial dependence + should be evaluated. + out : view on 1d ndarray, shape (n_samples) + The value of the partial dependence function on each grid + point. + """ + + cdef: + unsigned int current_node_idx + unsigned int [:] node_idx_stack = np.zeros(shape=nodes.shape[0], + dtype=np.uint32) + Y_DTYPE_C [::1] weight_stack = np.zeros(shape=nodes.shape[0], + dtype=Y_DTYPE) + node_struct * current_node # pointer to avoid copying attributes + + unsigned int sample_idx + intp_t feature_idx + unsigned stack_size + Y_DTYPE_C left_sample_frac + Y_DTYPE_C current_weight + Y_DTYPE_C total_weight # used for sanity check only + bint is_target_feature + + for sample_idx in range(X.shape[0]): + # init stacks for current sample + stack_size = 1 + node_idx_stack[0] = 0 # root node + weight_stack[0] = 1 # all the samples are in the root node + total_weight = 0 + + while stack_size > 0: + + # pop the stack + stack_size -= 1 + current_node_idx = node_idx_stack[stack_size] + current_node = &nodes[current_node_idx] + + if current_node.is_leaf: + out[sample_idx] += (weight_stack[stack_size] * + current_node.value) + total_weight += weight_stack[stack_size] + else: + # determine if the split feature is a target feature + is_target_feature = False + for feature_idx in range(target_features.shape[0]): + if target_features[feature_idx] == current_node.feature_idx: + is_target_feature = True + break + + if is_target_feature: + # In this case, we push left or right child on stack + if X[sample_idx, feature_idx] <= current_node.num_threshold: + node_idx_stack[stack_size] = current_node.left + else: + node_idx_stack[stack_size] = current_node.right + stack_size += 1 + else: + # In this case, we push both children onto the stack, + # and give a weight proportional to the number of + # samples going through each branch. + + # push left child + node_idx_stack[stack_size] = current_node.left + left_sample_frac = ( + nodes[current_node.left].count / + current_node.count) + current_weight = weight_stack[stack_size] + weight_stack[stack_size] = current_weight * left_sample_frac + stack_size += 1 + + # push right child + node_idx_stack[stack_size] = current_node.right + weight_stack[stack_size] = ( + current_weight * (1 - left_sample_frac)) + stack_size += 1 + + # Sanity check. Should never happen. + if not (0.999 < total_weight < 1.001): + raise ValueError("Total weight should be 1.0 but was %.9f" %total_weight) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/binning.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/binning.py new file mode 100644 index 0000000000000000000000000000000000000000..eee26e68842b7925922340bf79badeaae19e4ae6 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -0,0 +1,333 @@ +""" +This module contains the BinMapper class. + +BinMapper is used for mapping a real-valued dataset into integer-valued bins. +Bin thresholds are computed with the quantiles so that each bin contains +approximately the same number of samples. 
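A conceptual NumPy sketch of the quantile binning described above (illustrative
only; the feature values are hypothetical)::

    import numpy as np

    col = np.asarray([0.1, 0.4, 0.4, 0.7, 1.5, 2.0])        # one feature column
    max_bins = 4
    qs = np.linspace(0, 100, max_bins + 1)[1:-1]             # interior quantiles
    thresholds = np.percentile(col, qs, method="midpoint")   # max_bins - 1 thresholds
    binned = np.searchsorted(thresholds, col, side="left")   # bin index per value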
+""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numpy as np + +from ...base import BaseEstimator, TransformerMixin +from ...utils import check_array, check_random_state +from ...utils._openmp_helpers import _openmp_effective_n_threads +from ...utils.parallel import Parallel, delayed +from ...utils.validation import check_is_fitted +from ._binning import _map_to_bins +from ._bitset import set_bitset_memoryview +from .common import ALMOST_INF, X_BINNED_DTYPE, X_BITSET_INNER_DTYPE, X_DTYPE + + +def _find_binning_thresholds(col_data, max_bins): + """Extract quantiles from a continuous feature. + + Missing values are ignored for finding the thresholds. + + Parameters + ---------- + col_data : array-like, shape (n_samples,) + The continuous feature to bin. + max_bins: int + The maximum number of bins to use for non-missing values. If for a + given feature the number of unique values is less than ``max_bins``, + then those unique values will be used to compute the bin thresholds, + instead of the quantiles + + Return + ------ + binning_thresholds : ndarray of shape(min(max_bins, n_unique_values) - 1,) + The increasing numeric values that can be used to separate the bins. + A given value x will be mapped into bin value i iff + bining_thresholds[i - 1] < x <= binning_thresholds[i] + """ + # ignore missing values when computing bin thresholds + missing_mask = np.isnan(col_data) + if missing_mask.any(): + col_data = col_data[~missing_mask] + # The data will be sorted anyway in np.unique and again in percentile, so we do it + # here. Sorting also returns a contiguous array. + col_data = np.sort(col_data) + distinct_values = np.unique(col_data).astype(X_DTYPE) + if len(distinct_values) <= max_bins: + midpoints = distinct_values[:-1] + distinct_values[1:] + midpoints *= 0.5 + else: + # We could compute approximate midpoint percentiles using the output of + # np.unique(col_data, return_counts) instead but this is more + # work and the performance benefit will be limited because we + # work on a fixed-size subsample of the full data. + percentiles = np.linspace(0, 100, num=max_bins + 1) + percentiles = percentiles[1:-1] + midpoints = np.percentile(col_data, percentiles, method="midpoint").astype( + X_DTYPE + ) + assert midpoints.shape[0] == max_bins - 1 + + # We avoid having +inf thresholds: +inf thresholds are only allowed in + # a "split on nan" situation. + np.clip(midpoints, a_min=None, a_max=ALMOST_INF, out=midpoints) + return midpoints + + +class _BinMapper(TransformerMixin, BaseEstimator): + """Transformer that maps a dataset into integer-valued bins. + + For continuous features, the bins are created in a feature-wise fashion, + using quantiles so that each bins contains approximately the same number + of samples. For large datasets, quantiles are computed on a subset of the + data to speed-up the binning, but the quantiles should remain stable. + + For categorical features, the raw categorical values are expected to be + in [0, 254] (this is not validated here though) and each category + corresponds to a bin. All categorical values must be known at + initialization: transform() doesn't know how to bin unknown categorical + values. Note that transform() is only used on non-training data in the + case of early stopping. + + Features with a small number of values may be binned into less than + ``n_bins`` bins. The last bin (at index ``n_bins - 1``) is always reserved + for missing values. 
+ + Parameters + ---------- + n_bins : int, default=256 + The maximum number of bins to use (including the bin for missing + values). Should be in [3, 256]. Non-missing values are binned on + ``max_bins = n_bins - 1`` bins. The last bin is always reserved for + missing values. If for a given feature the number of unique values is + less than ``max_bins``, then those unique values will be used to + compute the bin thresholds, instead of the quantiles. For categorical + features indicated by ``is_categorical``, the docstring for + ``is_categorical`` details on this procedure. + subsample : int or None, default=2e5 + If ``n_samples > subsample``, then ``sub_samples`` samples will be + randomly chosen to compute the quantiles. If ``None``, the whole data + is used. + is_categorical : ndarray of bool of shape (n_features,), default=None + Indicates categorical features. By default, all features are + considered continuous. + known_categories : list of {ndarray, None} of shape (n_features,), \ + default=none + For each categorical feature, the array indicates the set of unique + categorical values. These should be the possible values over all the + data, not just the training data. For continuous features, the + corresponding entry should be None. + random_state: int, RandomState instance or None, default=None + Pseudo-random number generator to control the random sub-sampling. + Pass an int for reproducible output across multiple + function calls. + See :term:`Glossary `. + n_threads : int, default=None + Number of OpenMP threads to use. `_openmp_effective_n_threads` is called + to determine the effective number of threads use, which takes cgroups CPU + quotes into account. See the docstring of `_openmp_effective_n_threads` + for details. + + Attributes + ---------- + bin_thresholds_ : list of ndarray + For each feature, each array indicates how to map a feature into a + binned feature. The semantic and size depends on the nature of the + feature: + - for real-valued features, the array corresponds to the real-valued + bin thresholds (the upper bound of each bin). There are ``max_bins + - 1`` thresholds, where ``max_bins = n_bins - 1`` is the number of + bins used for non-missing values. + - for categorical features, the array is a map from a binned category + value to the raw category value. The size of the array is equal to + ``min(max_bins, category_cardinality)`` where we ignore missing + values in the cardinality. + n_bins_non_missing_ : ndarray, dtype=np.uint32 + For each feature, gives the number of bins actually used for + non-missing values. For features with a lot of unique values, this is + equal to ``n_bins - 1``. + is_categorical_ : ndarray of shape (n_features,), dtype=np.uint8 + Indicator for categorical features. + missing_values_bin_idx_ : np.uint8 + The index of the bin where missing values are mapped. This is a + constant across all features. This corresponds to the last bin, and + it is always equal to ``n_bins - 1``. Note that if ``n_bins_non_missing_`` + is less than ``n_bins - 1`` for a given feature, then there are + empty (and unused) bins. 
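    Minimal usage sketch of this private helper (illustrative only)::

        import numpy as np

        X = np.array([[0.1], [0.2], [0.5], [np.nan]])
        mapper = _BinMapper(n_bins=4, random_state=0)
        X_binned = mapper.fit_transform(X)   # np.nan maps to bin n_bins - 1 == 3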
+ """ + + def __init__( + self, + n_bins=256, + subsample=int(2e5), + is_categorical=None, + known_categories=None, + random_state=None, + n_threads=None, + ): + self.n_bins = n_bins + self.subsample = subsample + self.is_categorical = is_categorical + self.known_categories = known_categories + self.random_state = random_state + self.n_threads = n_threads + + def fit(self, X, y=None): + """Fit data X by computing the binning thresholds. + + The last bin is reserved for missing values, whether missing values + are present in the data or not. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to bin. + y: None + Ignored. + + Returns + ------- + self : object + """ + if not (3 <= self.n_bins <= 256): + # min is 3: at least 2 distinct bins and a missing values bin + raise ValueError( + "n_bins={} should be no smaller than 3 and no larger than 256.".format( + self.n_bins + ) + ) + + X = check_array(X, dtype=[X_DTYPE], ensure_all_finite=False) + max_bins = self.n_bins - 1 + + rng = check_random_state(self.random_state) + if self.subsample is not None and X.shape[0] > self.subsample: + subset = rng.choice(X.shape[0], self.subsample, replace=False) + X = X.take(subset, axis=0) + + if self.is_categorical is None: + self.is_categorical_ = np.zeros(X.shape[1], dtype=np.uint8) + else: + self.is_categorical_ = np.asarray(self.is_categorical, dtype=np.uint8) + + n_features = X.shape[1] + known_categories = self.known_categories + if known_categories is None: + known_categories = [None] * n_features + + # validate is_categorical and known_categories parameters + for f_idx in range(n_features): + is_categorical = self.is_categorical_[f_idx] + known_cats = known_categories[f_idx] + if is_categorical and known_cats is None: + raise ValueError( + f"Known categories for feature {f_idx} must be provided." + ) + if not is_categorical and known_cats is not None: + raise ValueError( + f"Feature {f_idx} isn't marked as a categorical feature, " + "but categories were passed." + ) + + self.missing_values_bin_idx_ = self.n_bins - 1 + + self.bin_thresholds_ = [None] * n_features + n_bins_non_missing = [None] * n_features + + non_cat_thresholds = Parallel(n_jobs=self.n_threads, backend="threading")( + delayed(_find_binning_thresholds)(X[:, f_idx], max_bins) + for f_idx in range(n_features) + if not self.is_categorical_[f_idx] + ) + + non_cat_idx = 0 + for f_idx in range(n_features): + if self.is_categorical_[f_idx]: + # Since categories are assumed to be encoded in + # [0, n_cats] and since n_cats <= max_bins, + # the thresholds *are* the unique categorical values. This will + # lead to the correct mapping in transform() + thresholds = known_categories[f_idx] + n_bins_non_missing[f_idx] = thresholds.shape[0] + self.bin_thresholds_[f_idx] = thresholds + else: + self.bin_thresholds_[f_idx] = non_cat_thresholds[non_cat_idx] + n_bins_non_missing[f_idx] = self.bin_thresholds_[f_idx].shape[0] + 1 + non_cat_idx += 1 + + self.n_bins_non_missing_ = np.array(n_bins_non_missing, dtype=np.uint32) + return self + + def transform(self, X): + """Bin data X. + + Missing values will be mapped to the last bin. + + For categorical features, the mapping will be incorrect for unknown + categories. Since the BinMapper is given known_categories of the + entire training data (i.e. before the call to train_test_split() in + case of early-stopping), this never happens. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to bin. 
+ + Returns + ------- + X_binned : array-like of shape (n_samples, n_features) + The binned data (fortran-aligned). + """ + X = check_array(X, dtype=[X_DTYPE], ensure_all_finite=False) + check_is_fitted(self) + if X.shape[1] != self.n_bins_non_missing_.shape[0]: + raise ValueError( + "This estimator was fitted with {} features but {} got passed " + "to transform()".format(self.n_bins_non_missing_.shape[0], X.shape[1]) + ) + + n_threads = _openmp_effective_n_threads(self.n_threads) + binned = np.zeros_like(X, dtype=X_BINNED_DTYPE, order="F") + _map_to_bins( + X, + self.bin_thresholds_, + self.is_categorical_, + self.missing_values_bin_idx_, + n_threads, + binned, + ) + return binned + + def make_known_categories_bitsets(self): + """Create bitsets of known categories. + + Returns + ------- + - known_cat_bitsets : ndarray of shape (n_categorical_features, 8) + Array of bitsets of known categories, for each categorical feature. + - f_idx_map : ndarray of shape (n_features,) + Map from original feature index to the corresponding index in the + known_cat_bitsets array. + """ + + categorical_features_indices = np.flatnonzero(self.is_categorical_) + + n_features = self.is_categorical_.size + n_categorical_features = categorical_features_indices.size + + f_idx_map = np.zeros(n_features, dtype=np.uint32) + f_idx_map[categorical_features_indices] = np.arange( + n_categorical_features, dtype=np.uint32 + ) + + known_categories = self.bin_thresholds_ + + known_cat_bitsets = np.zeros( + (n_categorical_features, 8), dtype=X_BITSET_INNER_DTYPE + ) + + # TODO: complexity is O(n_categorical_features * 255). Maybe this is + # worth cythonizing + for mapped_f_idx, f_idx in enumerate(categorical_features_indices): + for raw_cat_val in known_categories[f_idx]: + set_bitset_memoryview(known_cat_bitsets[mapped_f_idx], raw_cat_val) + + return known_cat_bitsets, f_idx_map diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/common.cpython-312-x86_64-linux-gnu.so b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/common.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..7ce6502fa120ecd189de882e537f440844f48021 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/common.cpython-312-x86_64-linux-gnu.so differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/common.pxd b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/common.pxd new file mode 100644 index 0000000000000000000000000000000000000000..9ff9fc89800d7bcd04a0a9d202d828a2079a6f28 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/common.pxd @@ -0,0 +1,43 @@ +from ...utils._typedefs cimport float32_t, float64_t, intp_t, uint8_t, uint32_t + + +ctypedef float64_t X_DTYPE_C +ctypedef uint8_t X_BINNED_DTYPE_C +ctypedef float64_t Y_DTYPE_C +ctypedef float32_t G_H_DTYPE_C +ctypedef uint32_t BITSET_INNER_DTYPE_C +ctypedef BITSET_INNER_DTYPE_C[8] BITSET_DTYPE_C + + +cdef packed struct hist_struct: + # Same as histogram dtype but we need a struct to declare views. It needs + # to be packed since by default numpy dtypes aren't aligned + Y_DTYPE_C sum_gradients + Y_DTYPE_C sum_hessians + unsigned int count + + +cdef packed struct node_struct: + # Equivalent struct to PREDICTOR_RECORD_DTYPE to use in memory views. 
It + # needs to be packed since by default numpy dtypes aren't aligned + Y_DTYPE_C value + unsigned int count + intp_t feature_idx + X_DTYPE_C num_threshold + uint8_t missing_go_to_left + unsigned int left + unsigned int right + Y_DTYPE_C gain + unsigned int depth + uint8_t is_leaf + X_BINNED_DTYPE_C bin_threshold + uint8_t is_categorical + # The index of the corresponding bitsets in the Predictor's bitset arrays. + # Only used if is_categorical is True + unsigned int bitset_idx + + +cpdef enum MonotonicConstraint: + NO_CST = 0 + POS = 1 + NEG = -1 diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/common.pyx b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/common.pyx new file mode 100644 index 0000000000000000000000000000000000000000..6b20e32813d5b88e533936e7ace1693ac4e5d7ec --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/common.pyx @@ -0,0 +1,44 @@ +import numpy as np + +# Y_DYTPE is the dtype to which the targets y are converted to. This is also +# dtype for leaf values, gains, and sums of gradients / hessians. The gradients +# and hessians arrays are stored as floats to avoid using too much memory. +Y_DTYPE = np.float64 +X_DTYPE = np.float64 +X_BINNED_DTYPE = np.uint8 # hence max_bins == 256 +# dtype for gradients and hessians arrays +G_H_DTYPE = np.float32 +X_BITSET_INNER_DTYPE = np.uint32 + +# Note that we use Y_DTYPE=float64 to avoid issues with floating point precision when +# summing gradients and hessians (both float32). Those are difficult to protect via +# tools like (Kahan-) Neumaier summation as in CPython, see +# https://github.com/python/cpython/issues/100425, or pairwise summation as numpy, see +# https://github.com/numpy/numpy/pull/3685, due to the way histograms are summed +# (number of additions per bin is not known in advance). See also comment in +# _subtract_histograms. +HISTOGRAM_DTYPE = np.dtype([ + ('sum_gradients', Y_DTYPE), # sum of sample gradients in bin + ('sum_hessians', Y_DTYPE), # sum of sample hessians in bin + ('count', np.uint32), # number of samples in bin +]) + +PREDICTOR_RECORD_DTYPE = np.dtype([ + ('value', Y_DTYPE), + ('count', np.uint32), + ('feature_idx', np.intp), + ('num_threshold', X_DTYPE), + ('missing_go_to_left', np.uint8), + ('left', np.uint32), + ('right', np.uint32), + ('gain', Y_DTYPE), + ('depth', np.uint32), + ('is_leaf', np.uint8), + ('bin_threshold', X_BINNED_DTYPE), + ('is_categorical', np.uint8), + # The index of the corresponding bitsets in the Predictor's bitset arrays. 
+ # Only used if is_categorical is True + ('bitset_idx', np.uint32) +]) + +ALMOST_INF = 1e300 # see LightGBM AvoidInf() diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py new file mode 100644 index 0000000000000000000000000000000000000000..064391abab24d88d3c410bbdacde7ebc4db616d8 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -0,0 +1,2371 @@ +"""Fast Gradient Boosting decision trees for classification and regression.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import itertools +from abc import ABC, abstractmethod +from contextlib import contextmanager, nullcontext, suppress +from functools import partial +from numbers import Integral, Real +from time import time + +import numpy as np + +from ..._loss.loss import ( + _LOSSES, + BaseLoss, + HalfBinomialLoss, + HalfGammaLoss, + HalfMultinomialLoss, + HalfPoissonLoss, + PinballLoss, +) +from ...base import ( + BaseEstimator, + ClassifierMixin, + RegressorMixin, + _fit_context, + is_classifier, +) +from ...compose import ColumnTransformer +from ...metrics import check_scoring +from ...metrics._scorer import _SCORERS +from ...model_selection import train_test_split +from ...preprocessing import FunctionTransformer, LabelEncoder, OrdinalEncoder +from ...utils import check_random_state, compute_sample_weight, resample +from ...utils._missing import is_scalar_nan +from ...utils._openmp_helpers import _openmp_effective_n_threads +from ...utils._param_validation import Interval, RealNotInt, StrOptions +from ...utils.multiclass import check_classification_targets +from ...utils.validation import ( + _check_monotonic_cst, + _check_sample_weight, + _check_y, + _is_pandas_df, + check_array, + check_consistent_length, + check_is_fitted, + validate_data, +) +from ._gradient_boosting import _update_raw_predictions +from .binning import _BinMapper +from .common import G_H_DTYPE, X_DTYPE, Y_DTYPE +from .grower import TreeGrower + +_LOSSES = _LOSSES.copy() +_LOSSES.update( + { + "poisson": HalfPoissonLoss, + "gamma": HalfGammaLoss, + "quantile": PinballLoss, + } +) + + +def _update_leaves_values(loss, grower, y_true, raw_prediction, sample_weight): + """Update the leaf values to be predicted by the tree. + + Update equals: + loss.fit_intercept_only(y_true - raw_prediction) + + This is only applied if loss.differentiable is False. + Note: It only works, if the loss is a function of the residual, as is the + case for AbsoluteError and PinballLoss. Otherwise, one would need to get + the minimum of loss(y_true, raw_prediction + x) in x. A few examples: + - AbsoluteError: median(y_true - raw_prediction). + - PinballLoss: quantile(y_true - raw_prediction). + + More background: + For the standard gradient descent method according to "Greedy Function + Approximation: A Gradient Boosting Machine" by Friedman, all loss functions but the + squared loss need a line search step. BaseHistGradientBoosting, however, implements + a so called Newton boosting where the trees are fitted to a 2nd order + approximations of the loss in terms of gradients and hessians. In this case, the + line search step is only necessary if the loss is not smooth, i.e. not + differentiable, which renders the 2nd order approximation invalid. 
In fact, + non-smooth losses arbitrarily set hessians to 1 and effectively use the standard + gradient descent method with line search. + """ + # TODO: Ideally this should be computed in parallel over the leaves using something + # similar to _update_raw_predictions(), but this requires a cython version of + # median(). + for leaf in grower.finalized_leaves: + indices = leaf.sample_indices + if sample_weight is None: + sw = None + else: + sw = sample_weight[indices] + update = loss.fit_intercept_only( + y_true=y_true[indices] - raw_prediction[indices], + sample_weight=sw, + ) + leaf.value = grower.shrinkage * update + # Note that the regularization is ignored here + + +@contextmanager +def _patch_raw_predict(estimator, raw_predictions): + """Context manager that patches _raw_predict to return raw_predictions. + + `raw_predictions` is typically a precomputed array to avoid redundant + state-wise computations fitting with early stopping enabled: in this case + `raw_predictions` is incrementally updated whenever we add a tree to the + boosted ensemble. + + Note: this makes fitting HistGradientBoosting* models inherently non thread + safe at fit time. However thread-safety at fit time was never guaranteed nor + enforced for scikit-learn estimators in general. + + Thread-safety at prediction/transform time is another matter as those + operations are typically side-effect free and therefore often thread-safe by + default for most scikit-learn models and would like to keep it that way. + Therefore this context manager should only be used at fit time. + + TODO: in the future, we could explore the possibility to extend the scorer + public API to expose a way to compute vales from raw predictions. That would + probably require also making the scorer aware of the inverse link function + used by the estimator which is typically private API for now, hence the need + for this patching mechanism. 
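    A typical fit-time usage sketch (the names are illustrative, not the exact
    call sites)::

        with _patch_raw_predict(estimator, cached_raw_predictions):
            score = scorer(estimator, X_small_train, y_small_train)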
+ """ + orig_raw_predict = estimator._raw_predict + + def _patched_raw_predicts(*args, **kwargs): + return raw_predictions + + estimator._raw_predict = _patched_raw_predicts + yield estimator + estimator._raw_predict = orig_raw_predict + + +class BaseHistGradientBoosting(BaseEstimator, ABC): + """Base class for histogram-based gradient boosting estimators.""" + + _parameter_constraints: dict = { + "loss": [BaseLoss], + "learning_rate": [Interval(Real, 0, None, closed="neither")], + "max_iter": [Interval(Integral, 1, None, closed="left")], + "max_leaf_nodes": [Interval(Integral, 2, None, closed="left"), None], + "max_depth": [Interval(Integral, 1, None, closed="left"), None], + "min_samples_leaf": [Interval(Integral, 1, None, closed="left")], + "l2_regularization": [Interval(Real, 0, None, closed="left")], + "max_features": [Interval(RealNotInt, 0, 1, closed="right")], + "monotonic_cst": ["array-like", dict, None], + "interaction_cst": [ + list, + tuple, + StrOptions({"pairwise", "no_interactions"}), + None, + ], + "n_iter_no_change": [Interval(Integral, 1, None, closed="left")], + "validation_fraction": [ + Interval(RealNotInt, 0, 1, closed="neither"), + Interval(Integral, 1, None, closed="left"), + None, + ], + "tol": [Interval(Real, 0, None, closed="left")], + "max_bins": [Interval(Integral, 2, 255, closed="both")], + "categorical_features": ["array-like", StrOptions({"from_dtype"}), None], + "warm_start": ["boolean"], + "early_stopping": [StrOptions({"auto"}), "boolean"], + "scoring": [str, callable, None], + "verbose": ["verbose"], + "random_state": ["random_state"], + } + + @abstractmethod + def __init__( + self, + loss, + *, + learning_rate, + max_iter, + max_leaf_nodes, + max_depth, + min_samples_leaf, + l2_regularization, + max_features, + max_bins, + categorical_features, + monotonic_cst, + interaction_cst, + warm_start, + early_stopping, + scoring, + validation_fraction, + n_iter_no_change, + tol, + verbose, + random_state, + ): + self.loss = loss + self.learning_rate = learning_rate + self.max_iter = max_iter + self.max_leaf_nodes = max_leaf_nodes + self.max_depth = max_depth + self.min_samples_leaf = min_samples_leaf + self.l2_regularization = l2_regularization + self.max_features = max_features + self.max_bins = max_bins + self.monotonic_cst = monotonic_cst + self.interaction_cst = interaction_cst + self.categorical_features = categorical_features + self.warm_start = warm_start + self.early_stopping = early_stopping + self.scoring = scoring + self.validation_fraction = validation_fraction + self.n_iter_no_change = n_iter_no_change + self.tol = tol + self.verbose = verbose + self.random_state = random_state + + def _validate_parameters(self): + """Validate parameters passed to __init__. + + The parameters that are directly passed to the grower are checked in + TreeGrower.""" + if self.monotonic_cst is not None and self.n_trees_per_iteration_ != 1: + raise ValueError( + "monotonic constraints are not supported for multiclass classification." + ) + + def _finalize_sample_weight(self, sample_weight, y): + """Finalize sample weight. + + Used by subclasses to adjust sample_weights. This is useful for implementing + class weights. + """ + return sample_weight + + def _preprocess_X(self, X, *, reset): + """Preprocess and validate X. + + Parameters + ---------- + X : {array-like, pandas DataFrame} of shape (n_samples, n_features) + Input data. + + reset : bool + Whether to reset the `n_features_in_` and `feature_names_in_ attributes. 
+ + Returns + ------- + X : ndarray of shape (n_samples, n_features) + Validated input data. + + known_categories : list of ndarray of shape (n_categories,) + List of known categories for each categorical feature. + """ + # If there is a preprocessor, we let the preprocessor handle the validation. + # Otherwise, we validate the data ourselves. + check_X_kwargs = dict(dtype=[X_DTYPE], ensure_all_finite=False) + if not reset: + if self._preprocessor is None: + return validate_data(self, X, reset=False, **check_X_kwargs) + return self._preprocessor.transform(X) + + # At this point, reset is True, i.e. we are processing the training data during `fit`. + self.is_categorical_ = self._check_categorical_features(X) + + if self.is_categorical_ is None: + self._preprocessor = None + self._is_categorical_remapped = None + + X = validate_data(self, X, **check_X_kwargs) + return X, None + + n_features = X.shape[1] + ordinal_encoder = OrdinalEncoder( + categories="auto", + handle_unknown="use_encoded_value", + unknown_value=np.nan, + encoded_missing_value=np.nan, + dtype=X_DTYPE, + ) + + check_X = partial(check_array, **check_X_kwargs) + numerical_preprocessor = FunctionTransformer(check_X) + self._preprocessor = ColumnTransformer( + [ + ("encoder", ordinal_encoder, self.is_categorical_), + ("numerical", numerical_preprocessor, ~self.is_categorical_), + ] + ) + self._preprocessor.set_output(transform="default") + X = self._preprocessor.fit_transform(X) + # check categories found by the OrdinalEncoder and get their encoded values + known_categories = self._check_categories() + self.n_features_in_ = self._preprocessor.n_features_in_ + with suppress(AttributeError): + self.feature_names_in_ = self._preprocessor.feature_names_in_ + + # The ColumnTransformer's output places the categorical features at the + # beginning + categorical_remapped = np.zeros(n_features, dtype=bool) + categorical_remapped[self._preprocessor.output_indices_["encoder"]] = True + self._is_categorical_remapped = categorical_remapped + + return X, known_categories + + def _check_categories(self): + """Check categories found by the preprocessor and return their encoded values. + + Returns a list of length ``self.n_features_in_``, with one entry per + input feature. + + For non-categorical features, the corresponding entry is ``None``. + + For categorical features, the corresponding entry is an array + containing the categories as encoded by the preprocessor (an + ``OrdinalEncoder``), excluding missing values. The entry is therefore + ``np.arange(n_categories)`` where ``n_categories`` is the number of + unique values in the considered feature column, after removing missing + values. + + If ``n_categories > self.max_bins`` for any feature, a ``ValueError`` + is raised. + """ + encoder = self._preprocessor.named_transformers_["encoder"] + known_categories = [None] * self._preprocessor.n_features_in_ + categorical_column_indices = np.arange(self._preprocessor.n_features_in_)[ + self._preprocessor.output_indices_["encoder"] + ] + for feature_idx, categories in zip( + categorical_column_indices, encoder.categories_ + ): + # OrdinalEncoder always puts np.nan as the last category if the + # training data has missing values. Here we remove it because it is + # already added by the _BinMapper.
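+ # For example, a feature observed with values {"a", "b"} plus missing entries + # would typically yield categories_ == ["a", "b", nan] for that column; only + # ["a", "b"] are kept below and remapped to np.arange(2).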
+ if len(categories) and is_scalar_nan(categories[-1]): + categories = categories[:-1] + if categories.size > self.max_bins: + try: + feature_name = repr(encoder.feature_names_in_[feature_idx]) + except AttributeError: + feature_name = f"at index {feature_idx}" + raise ValueError( + f"Categorical feature {feature_name} is expected to " + f"have a cardinality <= {self.max_bins} but actually " + f"has a cardinality of {categories.size}." + ) + known_categories[feature_idx] = np.arange(len(categories), dtype=X_DTYPE) + return known_categories + + def _check_categorical_features(self, X): + """Check and validate categorical features in X + + Parameters + ---------- + X : {array-like, pandas DataFrame} of shape (n_samples, n_features) + Input data. + + Return + ------ + is_categorical : ndarray of shape (n_features,) or None, dtype=bool + Indicates whether a feature is categorical. If no feature is + categorical, this is None. + """ + # Special code for pandas because of a bug in recent pandas, which is + # fixed in main and maybe included in 2.2.1, see + # https://github.com/pandas-dev/pandas/pull/57173. + # Also pandas versions < 1.5.1 do not support the dataframe interchange + if _is_pandas_df(X): + X_is_dataframe = True + categorical_columns_mask = np.asarray(X.dtypes == "category") + elif hasattr(X, "__dataframe__"): + X_is_dataframe = True + categorical_columns_mask = np.asarray( + [ + c.dtype[0].name == "CATEGORICAL" + for c in X.__dataframe__().get_columns() + ] + ) + else: + X_is_dataframe = False + categorical_columns_mask = None + + categorical_features = self.categorical_features + + categorical_by_dtype = ( + isinstance(categorical_features, str) + and categorical_features == "from_dtype" + ) + no_categorical_dtype = categorical_features is None or ( + categorical_by_dtype and not X_is_dataframe + ) + + if no_categorical_dtype: + return None + + use_pandas_categorical = categorical_by_dtype and X_is_dataframe + if use_pandas_categorical: + categorical_features = categorical_columns_mask + else: + categorical_features = np.asarray(categorical_features) + + if categorical_features.size == 0: + return None + + if categorical_features.dtype.kind not in ("i", "b", "U", "O"): + raise ValueError( + "categorical_features must be an array-like of bool, int or " + f"str, got: {categorical_features.dtype.name}." + ) + + if categorical_features.dtype.kind == "O": + types = set(type(f) for f in categorical_features) + if types != {str}: + raise ValueError( + "categorical_features must be an array-like of bool, int or " + f"str, got: {', '.join(sorted(t.__name__ for t in types))}." + ) + + n_features = X.shape[1] + # At this point `validate_data` was not called yet because we use the original + # dtypes to discover the categorical features. Thus `feature_names_in_` + # is not defined yet. + feature_names_in_ = getattr(X, "columns", None) + + if categorical_features.dtype.kind in ("U", "O"): + # check for feature names + if feature_names_in_ is None: + raise ValueError( + "categorical_features should be passed as an array of " + "integers or as a boolean mask when the model is fitted " + "on data without feature names." + ) + is_categorical = np.zeros(n_features, dtype=bool) + feature_names = list(feature_names_in_) + for feature_name in categorical_features: + try: + is_categorical[feature_names.index(feature_name)] = True + except ValueError as e: + raise ValueError( + f"categorical_features has a item value '{feature_name}' " + "which is not a valid feature name of the training " + f"data. 
Observed feature names: {feature_names}" + ) from e + elif categorical_features.dtype.kind == "i": + # check for categorical features as indices + if ( + np.max(categorical_features) >= n_features + or np.min(categorical_features) < 0 + ): + raise ValueError( + "categorical_features set as integer " + "indices must be in [0, n_features - 1]" + ) + is_categorical = np.zeros(n_features, dtype=bool) + is_categorical[categorical_features] = True + else: + if categorical_features.shape[0] != n_features: + raise ValueError( + "categorical_features set as a boolean mask " + "must have shape (n_features,), got: " + f"{categorical_features.shape}" + ) + is_categorical = categorical_features + + if not np.any(is_categorical): + return None + return is_categorical + + def _check_interaction_cst(self, n_features): + """Check and validation for interaction constraints.""" + if self.interaction_cst is None: + return None + + if self.interaction_cst == "no_interactions": + interaction_cst = [[i] for i in range(n_features)] + elif self.interaction_cst == "pairwise": + interaction_cst = itertools.combinations(range(n_features), 2) + else: + interaction_cst = self.interaction_cst + + try: + constraints = [set(group) for group in interaction_cst] + except TypeError: + raise ValueError( + "Interaction constraints must be a sequence of tuples or lists, got:" + f" {self.interaction_cst!r}." + ) + + for group in constraints: + for x in group: + if not (isinstance(x, Integral) and 0 <= x < n_features): + raise ValueError( + "Interaction constraints must consist of integer indices in" + f" [0, n_features - 1] = [0, {n_features - 1}], specifying the" + " position of features, got invalid indices:" + f" {group!r}" + ) + + # Add all not listed features as own group by default. + rest = set(range(n_features)) - set().union(*constraints) + if len(rest) > 0: + constraints.append(rest) + + return constraints + + @_fit_context(prefer_skip_nested_validation=True) + def fit( + self, + X, + y, + sample_weight=None, + *, + X_val=None, + y_val=None, + sample_weight_val=None, + ): + """Fit the gradient boosting model. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The input samples. + + y : array-like of shape (n_samples,) + Target values. + + sample_weight : array-like of shape (n_samples,) default=None + Weights of training data. + + .. versionadded:: 0.23 + + X_val : array-like of shape (n_val, n_features) + Additional sample of features for validation used in early stopping. + In a `Pipeline`, `X_val` can be transformed the same way as `X` with + `Pipeline(..., transform_input=["X_val"])`. + + .. versionadded:: 1.7 + + y_val : array-like of shape (n_samples,) + Additional sample of target values for validation used in early stopping. + + .. versionadded:: 1.7 + + sample_weight_val : array-like of shape (n_samples,) default=None + Additional weights for validation used in early stopping. + + .. versionadded:: 1.7 + + Returns + ------- + self : object + Fitted estimator. 
+ """ + fit_start_time = time() + acc_find_split_time = 0.0 # time spent finding the best splits + acc_apply_split_time = 0.0 # time spent splitting nodes + acc_compute_hist_time = 0.0 # time spent computing histograms + # time spent predicting X for gradient and hessians update + acc_prediction_time = 0.0 + X, known_categories = self._preprocess_X(X, reset=True) + y = _check_y(y, estimator=self) + y = self._encode_y(y) + check_consistent_length(X, y) + # Do not create unit sample weights by default to later skip some + # computation + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X, dtype=np.float64) + # TODO: remove when PDP supports sample weights + self._fitted_with_sw = True + + sample_weight = self._finalize_sample_weight(sample_weight, y) + + validation_data_provided = X_val is not None or y_val is not None + if validation_data_provided: + if y_val is None: + raise ValueError("X_val is provided, but y_val was not provided.") + if X_val is None: + raise ValueError("y_val is provided, but X_val was not provided.") + X_val = self._preprocess_X(X_val, reset=False) + y_val = _check_y(y_val, estimator=self) + y_val = self._encode_y_val(y_val) + check_consistent_length(X_val, y_val) + if sample_weight_val is not None: + sample_weight_val = _check_sample_weight( + sample_weight_val, X_val, dtype=np.float64 + ) + if self.early_stopping is False: + raise ValueError( + "X_val and y_val are passed to fit while at the same time " + "early_stopping is False. When passing X_val and y_val to fit," + "early_stopping should be set to either 'auto' or True." + ) + + # Note: At this point, we could delete self._label_encoder if it exists. + # But we don't to keep the code even simpler. + + rng = check_random_state(self.random_state) + + # When warm starting, we want to reuse the same seed that was used + # the first time fit was called (e.g. train/val split). + # For feature subsampling, we want to continue with the rng we started with. + if not self.warm_start or not self._is_fitted(): + self._random_seed = rng.randint(np.iinfo(np.uint32).max, dtype="u8") + feature_subsample_seed = rng.randint(np.iinfo(np.uint32).max, dtype="u8") + self._feature_subsample_rng = np.random.default_rng(feature_subsample_seed) + + self._validate_parameters() + monotonic_cst = _check_monotonic_cst(self, self.monotonic_cst) + # _preprocess_X places the categorical features at the beginning, + # change the order of monotonic_cst accordingly + if self.is_categorical_ is not None: + monotonic_cst_remapped = np.concatenate( + ( + monotonic_cst[self.is_categorical_], + monotonic_cst[~self.is_categorical_], + ) + ) + else: + monotonic_cst_remapped = monotonic_cst + + # used for validation in predict + n_samples, self._n_features = X.shape + + # Encode constraints into a list of sets of features indices (integers). + interaction_cst = self._check_interaction_cst(self._n_features) + + # we need this stateful variable to tell raw_predict() that it was + # called from fit() (this current method), and that the data it has + # received is pre-binned. + # predicting is faster on pre-binned data, so we want early stopping + # predictions to be made on pre-binned data. Unfortunately the _scorer + # can only call predict() or predict_proba(), not raw_predict(), and + # there's no way to tell the scorer that it needs to predict binned + # data. 
+ self._in_fit = True + + # `_openmp_effective_n_threads` is used to take cgroups CPU quotes + # into account when determine the maximum number of threads to use. + n_threads = _openmp_effective_n_threads() + + if isinstance(self.loss, str): + self._loss = self._get_loss(sample_weight=sample_weight) + elif isinstance(self.loss, BaseLoss): + self._loss = self.loss + + if self.early_stopping == "auto": + self.do_early_stopping_ = n_samples > 10_000 + else: + self.do_early_stopping_ = self.early_stopping + + # create validation data if needed + self._use_validation_data = ( + self.validation_fraction is not None or validation_data_provided + ) + if ( + self.do_early_stopping_ + and self._use_validation_data + and not validation_data_provided + ): + # stratify for classification + # instead of checking predict_proba, loss.n_classes >= 2 would also work + stratify = y if hasattr(self._loss, "predict_proba") else None + + # Save the state of the RNG for the training and validation split. + # This is needed in order to have the same split when using + # warm starting. + + if sample_weight is None: + X_train, X_val, y_train, y_val = train_test_split( + X, + y, + test_size=self.validation_fraction, + stratify=stratify, + random_state=self._random_seed, + ) + sample_weight_train = sample_weight_val = None + else: + # TODO: incorporate sample_weight in sampling here, as well as + # stratify + ( + X_train, + X_val, + y_train, + y_val, + sample_weight_train, + sample_weight_val, + ) = train_test_split( + X, + y, + sample_weight, + test_size=self.validation_fraction, + stratify=stratify, + random_state=self._random_seed, + ) + else: + X_train, y_train, sample_weight_train = X, y, sample_weight + if not validation_data_provided: + X_val = y_val = sample_weight_val = None + + # Bin the data + # For ease of use of the API, the user-facing GBDT classes accept the + # parameter max_bins, which doesn't take into account the bin for + # missing values (which is always allocated). However, since max_bins + # isn't the true maximal number of bins, all other private classes + # (binmapper, histbuilder...) accept n_bins instead, which is the + # actual total number of bins. Everywhere in the code, the + # convention is that n_bins == max_bins + 1 + n_bins = self.max_bins + 1 # + 1 for missing values + self._bin_mapper = _BinMapper( + n_bins=n_bins, + is_categorical=self._is_categorical_remapped, + known_categories=known_categories, + random_state=self._random_seed, + n_threads=n_threads, + ) + X_binned_train = self._bin_data(X_train, is_training_data=True) + if X_val is not None: + X_binned_val = self._bin_data(X_val, is_training_data=False) + else: + X_binned_val = None + + # Uses binned data to check for missing values + has_missing_values = ( + (X_binned_train == self._bin_mapper.missing_values_bin_idx_) + .any(axis=0) + .astype(np.uint8) + ) + + if self.verbose: + print("Fitting gradient boosted rounds:") + + n_samples = X_binned_train.shape[0] + scoring_is_predefined_string = self.scoring in _SCORERS + need_raw_predictions_val = X_binned_val is not None and ( + scoring_is_predefined_string or self.scoring == "loss" + ) + # First time calling fit, or no warm start + if not (self._is_fitted() and self.warm_start): + # Clear random state and score attributes + self._clear_state() + + # initialize raw_predictions: those are the accumulated values + # predicted by the trees for the training data. 
raw_predictions has + # shape (n_samples, n_trees_per_iteration) where + # n_trees_per_iterations is n_classes in multiclass classification, + # else 1. + # self._baseline_prediction has shape (1, n_trees_per_iteration) + self._baseline_prediction = self._loss.fit_intercept_only( + y_true=y_train, sample_weight=sample_weight_train + ).reshape((1, -1)) + raw_predictions = np.zeros( + shape=(n_samples, self.n_trees_per_iteration_), + dtype=self._baseline_prediction.dtype, + order="F", + ) + raw_predictions += self._baseline_prediction + + # predictors is a matrix (list of lists) of TreePredictor objects + # with shape (n_iter_, n_trees_per_iteration) + self._predictors = predictors = [] + + # Initialize structures and attributes related to early stopping + self._scorer = None # set if scoring != loss + raw_predictions_val = None # set if use val and scoring is a string + self.train_score_ = [] + self.validation_score_ = [] + + if self.do_early_stopping_: + # populate train_score and validation_score with the + # predictions of the initial model (before the first tree) + + # Create raw_predictions_val for storing the raw predictions of + # the validation data. + if need_raw_predictions_val: + raw_predictions_val = np.zeros( + shape=(X_binned_val.shape[0], self.n_trees_per_iteration_), + dtype=self._baseline_prediction.dtype, + order="F", + ) + + raw_predictions_val += self._baseline_prediction + + if self.scoring == "loss": + # we're going to compute scoring w.r.t the loss. As losses + # take raw predictions as input (unlike the scorers), we + # can optimize a bit and avoid repeating computing the + # predictions of the previous trees. We'll reuse + # raw_predictions (as it's needed for training anyway) for + # evaluating the training loss. + + self._check_early_stopping_loss( + raw_predictions=raw_predictions, + y_train=y_train, + sample_weight_train=sample_weight_train, + raw_predictions_val=raw_predictions_val, + y_val=y_val, + sample_weight_val=sample_weight_val, + n_threads=n_threads, + ) + else: + self._scorer = check_scoring(self, self.scoring) + # _scorer is a callable with signature (est, X, y) and + # calls est.predict() or est.predict_proba() depending on + # its nature. + # Unfortunately, each call to _scorer() will compute + # the predictions of all the trees. So we use a subset of + # the training set to compute train scores. + + # Compute the subsample set + ( + X_binned_small_train, + y_small_train, + sample_weight_small_train, + indices_small_train, + ) = self._get_small_trainset( + X_binned_train, + y_train, + sample_weight_train, + self._random_seed, + ) + + # If the scorer is a predefined string, then we optimize + # the evaluation by reusing the incrementally updated raw + # predictions. 
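+ # (predefined string scorers call predict()/predict_proba(), which rely on + # _raw_predict(); _score_with_raw_predictions then uses _patch_raw_predict so + # the cached raw predictions are returned instead of re-traversing every tree.)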
+ if scoring_is_predefined_string: + raw_predictions_small_train = raw_predictions[ + indices_small_train + ] + else: + raw_predictions_small_train = None + + self._check_early_stopping_scorer( + X_binned_small_train, + y_small_train, + sample_weight_small_train, + X_binned_val, + y_val, + sample_weight_val, + raw_predictions_small_train=raw_predictions_small_train, + raw_predictions_val=raw_predictions_val, + ) + begin_at_stage = 0 + + # warm start: this is not the first time fit was called + else: + # Check that the maximum number of iterations is not smaller + # than the number of iterations from the previous fit + if self.max_iter < self.n_iter_: + raise ValueError( + "max_iter=%d must be larger than or equal to " + "n_iter_=%d when warm_start==True" % (self.max_iter, self.n_iter_) + ) + + # Convert array attributes to lists + self.train_score_ = self.train_score_.tolist() + self.validation_score_ = self.validation_score_.tolist() + + # Compute raw predictions + raw_predictions = self._raw_predict(X_binned_train, n_threads=n_threads) + if self.do_early_stopping_ and need_raw_predictions_val: + raw_predictions_val = self._raw_predict( + X_binned_val, n_threads=n_threads + ) + else: + raw_predictions_val = None + + if self.do_early_stopping_ and self.scoring != "loss": + # Compute the subsample set + ( + X_binned_small_train, + y_small_train, + sample_weight_small_train, + indices_small_train, + ) = self._get_small_trainset( + X_binned_train, y_train, sample_weight_train, self._random_seed + ) + + # Get the predictors from the previous fit + predictors = self._predictors + + begin_at_stage = self.n_iter_ + + # initialize gradients and hessians (empty arrays). + # shape = (n_samples, n_trees_per_iteration). + gradient, hessian = self._loss.init_gradient_and_hessian( + n_samples=n_samples, dtype=G_H_DTYPE, order="F" + ) + + for iteration in range(begin_at_stage, self.max_iter): + if self.verbose >= 2: + iteration_start_time = time() + print( + "[{}/{}] ".format(iteration + 1, self.max_iter), end="", flush=True + ) + + # Update gradients and hessians, inplace + # Note that self._loss expects shape (n_samples,) for + # n_trees_per_iteration = 1 else shape (n_samples, n_trees_per_iteration). + if self._loss.constant_hessian: + self._loss.gradient( + y_true=y_train, + raw_prediction=raw_predictions, + sample_weight=sample_weight_train, + gradient_out=gradient, + n_threads=n_threads, + ) + else: + self._loss.gradient_hessian( + y_true=y_train, + raw_prediction=raw_predictions, + sample_weight=sample_weight_train, + gradient_out=gradient, + hessian_out=hessian, + n_threads=n_threads, + ) + + # Append a list since there may be more than 1 predictor per iter + predictors.append([]) + + # 2-d views of shape (n_samples, n_trees_per_iteration_) or (n_samples, 1) + # on gradient and hessian to simplify the loop over n_trees_per_iteration_. + if gradient.ndim == 1: + g_view = gradient.reshape((-1, 1)) + h_view = hessian.reshape((-1, 1)) + else: + g_view = gradient + h_view = hessian + + # Build `n_trees_per_iteration` trees. 
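+ # For instance, a 3-class classification problem grows 3 trees per boosting + # round, one per class, each fitted on column k of the gradient/hessian views; + # regression and binary classification grow a single tree per round.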
+ for k in range(self.n_trees_per_iteration_): + grower = TreeGrower( + X_binned=X_binned_train, + gradients=g_view[:, k], + hessians=h_view[:, k], + n_bins=n_bins, + n_bins_non_missing=self._bin_mapper.n_bins_non_missing_, + has_missing_values=has_missing_values, + is_categorical=self._is_categorical_remapped, + monotonic_cst=monotonic_cst_remapped, + interaction_cst=interaction_cst, + max_leaf_nodes=self.max_leaf_nodes, + max_depth=self.max_depth, + min_samples_leaf=self.min_samples_leaf, + l2_regularization=self.l2_regularization, + feature_fraction_per_split=self.max_features, + rng=self._feature_subsample_rng, + shrinkage=self.learning_rate, + n_threads=n_threads, + ) + grower.grow() + + acc_apply_split_time += grower.total_apply_split_time + acc_find_split_time += grower.total_find_split_time + acc_compute_hist_time += grower.total_compute_hist_time + + if not self._loss.differentiable: + _update_leaves_values( + loss=self._loss, + grower=grower, + y_true=y_train, + raw_prediction=raw_predictions[:, k], + sample_weight=sample_weight_train, + ) + + predictor = grower.make_predictor( + binning_thresholds=self._bin_mapper.bin_thresholds_ + ) + predictors[-1].append(predictor) + + # Update raw_predictions with the predictions of the newly + # created tree. + tic_pred = time() + _update_raw_predictions(raw_predictions[:, k], grower, n_threads) + toc_pred = time() + acc_prediction_time += toc_pred - tic_pred + + should_early_stop = False + if self.do_early_stopping_: + # Update raw_predictions_val with the newest tree(s) + if need_raw_predictions_val: + for k, pred in enumerate(self._predictors[-1]): + raw_predictions_val[:, k] += pred.predict_binned( + X_binned_val, + self._bin_mapper.missing_values_bin_idx_, + n_threads, + ) + + if self.scoring == "loss": + should_early_stop = self._check_early_stopping_loss( + raw_predictions=raw_predictions, + y_train=y_train, + sample_weight_train=sample_weight_train, + raw_predictions_val=raw_predictions_val, + y_val=y_val, + sample_weight_val=sample_weight_val, + n_threads=n_threads, + ) + + else: + # If the scorer is a predefined string, then we optimize the + # evaluation by reusing the incrementally computed raw predictions. + if scoring_is_predefined_string: + raw_predictions_small_train = raw_predictions[ + indices_small_train + ] + else: + raw_predictions_small_train = None + + should_early_stop = self._check_early_stopping_scorer( + X_binned_small_train, + y_small_train, + sample_weight_small_train, + X_binned_val, + y_val, + sample_weight_val, + raw_predictions_small_train=raw_predictions_small_train, + raw_predictions_val=raw_predictions_val, + ) + + if self.verbose >= 2: + self._print_iteration_stats(iteration_start_time) + + # maybe we could also early stop if all the trees are stumps? 
+ if should_early_stop: + break + + if self.verbose: + duration = time() - fit_start_time + n_total_leaves = sum( + predictor.get_n_leaf_nodes() + for predictors_at_ith_iteration in self._predictors + for predictor in predictors_at_ith_iteration + ) + n_predictors = sum( + len(predictors_at_ith_iteration) + for predictors_at_ith_iteration in self._predictors + ) + print( + "Fit {} trees in {:.3f} s, ({} total leaves)".format( + n_predictors, duration, n_total_leaves + ) + ) + print( + "{:<32} {:.3f}s".format( + "Time spent computing histograms:", acc_compute_hist_time + ) + ) + print( + "{:<32} {:.3f}s".format( + "Time spent finding best splits:", acc_find_split_time + ) + ) + print( + "{:<32} {:.3f}s".format( + "Time spent applying splits:", acc_apply_split_time + ) + ) + print( + "{:<32} {:.3f}s".format("Time spent predicting:", acc_prediction_time) + ) + + self.train_score_ = np.asarray(self.train_score_) + self.validation_score_ = np.asarray(self.validation_score_) + del self._in_fit # hard delete so we're sure it can't be used anymore + return self + + def _is_fitted(self): + return len(getattr(self, "_predictors", [])) > 0 + + def _clear_state(self): + """Clear the state of the gradient boosting model.""" + for var in ("train_score_", "validation_score_"): + if hasattr(self, var): + delattr(self, var) + + def _get_small_trainset(self, X_binned_train, y_train, sample_weight_train, seed): + """Compute the indices of the subsample set and return this set. + + For efficiency, we need to subsample the training set to compute scores + with scorers. + """ + # TODO: incorporate sample_weights here in `resample` + subsample_size = 10000 + if X_binned_train.shape[0] > subsample_size: + indices = np.arange(X_binned_train.shape[0]) + stratify = y_train if is_classifier(self) else None + indices = resample( + indices, + n_samples=subsample_size, + replace=False, + random_state=seed, + stratify=stratify, + ) + X_binned_small_train = X_binned_train[indices] + y_small_train = y_train[indices] + if sample_weight_train is not None: + sample_weight_small_train = sample_weight_train[indices] + else: + sample_weight_small_train = None + X_binned_small_train = np.ascontiguousarray(X_binned_small_train) + return ( + X_binned_small_train, + y_small_train, + sample_weight_small_train, + indices, + ) + else: + return X_binned_train, y_train, sample_weight_train, slice(None) + + def _check_early_stopping_scorer( + self, + X_binned_small_train, + y_small_train, + sample_weight_small_train, + X_binned_val, + y_val, + sample_weight_val, + raw_predictions_small_train=None, + raw_predictions_val=None, + ): + """Check if fitting should be early-stopped based on scorer. + + Scores are computed on validation data or on training data. 
+ """ + if is_classifier(self): + y_small_train = self.classes_[y_small_train.astype(int)] + + self.train_score_.append( + self._score_with_raw_predictions( + X_binned_small_train, + y_small_train, + sample_weight_small_train, + raw_predictions_small_train, + ) + ) + + if self._use_validation_data: + if is_classifier(self): + y_val = self.classes_[y_val.astype(int)] + self.validation_score_.append( + self._score_with_raw_predictions( + X_binned_val, y_val, sample_weight_val, raw_predictions_val + ) + ) + return self._should_stop(self.validation_score_) + else: + return self._should_stop(self.train_score_) + + def _score_with_raw_predictions(self, X, y, sample_weight, raw_predictions=None): + if raw_predictions is None: + patcher_raw_predict = nullcontext() + else: + patcher_raw_predict = _patch_raw_predict(self, raw_predictions) + + with patcher_raw_predict: + if sample_weight is None: + return self._scorer(self, X, y) + else: + return self._scorer(self, X, y, sample_weight=sample_weight) + + def _check_early_stopping_loss( + self, + raw_predictions, + y_train, + sample_weight_train, + raw_predictions_val, + y_val, + sample_weight_val, + n_threads=1, + ): + """Check if fitting should be early-stopped based on loss. + + Scores are computed on validation data or on training data. + """ + self.train_score_.append( + -self._loss( + y_true=y_train, + raw_prediction=raw_predictions, + sample_weight=sample_weight_train, + n_threads=n_threads, + ) + ) + + if self._use_validation_data: + self.validation_score_.append( + -self._loss( + y_true=y_val, + raw_prediction=raw_predictions_val, + sample_weight=sample_weight_val, + n_threads=n_threads, + ) + ) + return self._should_stop(self.validation_score_) + else: + return self._should_stop(self.train_score_) + + def _should_stop(self, scores): + """ + Return True (do early stopping) if the last n scores aren't better + than the (n-1)th-to-last score, up to some tolerance. + """ + reference_position = self.n_iter_no_change + 1 + if len(scores) < reference_position: + return False + + # A higher score is always better. Higher tol means that it will be + # harder for subsequent iteration to be considered an improvement upon + # the reference score, and therefore it is more likely to early stop + # because of the lack of significant improvement. + reference_score = scores[-reference_position] + self.tol + recent_scores = scores[-reference_position + 1 :] + recent_improvements = [score > reference_score for score in recent_scores] + return not any(recent_improvements) + + def _bin_data(self, X, is_training_data): + """Bin data X. + + If is_training_data, then fit the _bin_mapper attribute. + Else, the binned data is converted to a C-contiguous array. 
+ """ + + description = "training" if is_training_data else "validation" + if self.verbose: + print( + "Binning {:.3f} GB of {} data: ".format(X.nbytes / 1e9, description), + end="", + flush=True, + ) + tic = time() + if is_training_data: + X_binned = self._bin_mapper.fit_transform(X) # F-aligned array + else: + X_binned = self._bin_mapper.transform(X) # F-aligned array + # We convert the array to C-contiguous since predicting is faster + # with this layout (training is faster on F-arrays though) + X_binned = np.ascontiguousarray(X_binned) + toc = time() + if self.verbose: + duration = toc - tic + print("{:.3f} s".format(duration)) + + return X_binned + + def _print_iteration_stats(self, iteration_start_time): + """Print info about the current fitting iteration.""" + log_msg = "" + + predictors_of_ith_iteration = [ + predictors_list + for predictors_list in self._predictors[-1] + if predictors_list + ] + n_trees = len(predictors_of_ith_iteration) + max_depth = max( + predictor.get_max_depth() for predictor in predictors_of_ith_iteration + ) + n_leaves = sum( + predictor.get_n_leaf_nodes() for predictor in predictors_of_ith_iteration + ) + + if n_trees == 1: + log_msg += "{} tree, {} leaves, ".format(n_trees, n_leaves) + else: + log_msg += "{} trees, {} leaves ".format(n_trees, n_leaves) + log_msg += "({} on avg), ".format(int(n_leaves / n_trees)) + + log_msg += "max depth = {}, ".format(max_depth) + + if self.do_early_stopping_: + if self.scoring == "loss": + factor = -1 # score_ arrays contain the negative loss + name = "loss" + else: + factor = 1 + name = "score" + log_msg += "train {}: {:.5f}, ".format(name, factor * self.train_score_[-1]) + if self._use_validation_data: + log_msg += "val {}: {:.5f}, ".format( + name, factor * self.validation_score_[-1] + ) + + iteration_time = time() - iteration_start_time + log_msg += "in {:0.3f}s".format(iteration_time) + + print(log_msg) + + def _raw_predict(self, X, n_threads=None): + """Return the sum of the leaves values over all predictors. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The input samples. + n_threads : int, default=None + Number of OpenMP threads to use. `_openmp_effective_n_threads` is called + to determine the effective number of threads use, which takes cgroups CPU + quotes into account. See the docstring of `_openmp_effective_n_threads` + for details. + + Returns + ------- + raw_predictions : array, shape (n_samples, n_trees_per_iteration) + The raw predicted values. + """ + check_is_fitted(self) + is_binned = getattr(self, "_in_fit", False) + if not is_binned: + X = self._preprocess_X(X, reset=False) + + n_samples = X.shape[0] + raw_predictions = np.zeros( + shape=(n_samples, self.n_trees_per_iteration_), + dtype=self._baseline_prediction.dtype, + order="F", + ) + raw_predictions += self._baseline_prediction + + # We intentionally decouple the number of threads used at prediction + # time from the number of threads used at fit time because the model + # can be deployed on a different machine for prediction purposes. 
+ n_threads = _openmp_effective_n_threads(n_threads) + self._predict_iterations( + X, self._predictors, raw_predictions, is_binned, n_threads + ) + return raw_predictions + + def _predict_iterations(self, X, predictors, raw_predictions, is_binned, n_threads): + """Add the predictions of the predictors to raw_predictions.""" + if not is_binned: + ( + known_cat_bitsets, + f_idx_map, + ) = self._bin_mapper.make_known_categories_bitsets() + + for predictors_of_ith_iteration in predictors: + for k, predictor in enumerate(predictors_of_ith_iteration): + if is_binned: + predict = partial( + predictor.predict_binned, + missing_values_bin_idx=self._bin_mapper.missing_values_bin_idx_, + n_threads=n_threads, + ) + else: + predict = partial( + predictor.predict, + known_cat_bitsets=known_cat_bitsets, + f_idx_map=f_idx_map, + n_threads=n_threads, + ) + raw_predictions[:, k] += predict(X) + + def _staged_raw_predict(self, X): + """Compute raw predictions of ``X`` for each iteration. + + This method allows monitoring (i.e. determine error on testing set) + after each stage. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The input samples. + + Yields + ------ + raw_predictions : generator of ndarray of shape \ + (n_samples, n_trees_per_iteration) + The raw predictions of the input samples. The order of the + classes corresponds to that in the attribute :term:`classes_`. + """ + check_is_fitted(self) + X = self._preprocess_X(X, reset=False) + if X.shape[1] != self._n_features: + raise ValueError( + "X has {} features but this estimator was trained with " + "{} features.".format(X.shape[1], self._n_features) + ) + n_samples = X.shape[0] + raw_predictions = np.zeros( + shape=(n_samples, self.n_trees_per_iteration_), + dtype=self._baseline_prediction.dtype, + order="F", + ) + raw_predictions += self._baseline_prediction + + # We intentionally decouple the number of threads used at prediction + # time from the number of threads used at fit time because the model + # can be deployed on a different machine for prediction purposes. + n_threads = _openmp_effective_n_threads() + for iteration in range(len(self._predictors)): + self._predict_iterations( + X, + self._predictors[iteration : iteration + 1], + raw_predictions, + is_binned=False, + n_threads=n_threads, + ) + yield raw_predictions.copy() + + def _compute_partial_dependence_recursion(self, grid, target_features): + """Fast partial dependence computation. + + Parameters + ---------- + grid : ndarray, shape (n_samples, n_target_features), dtype=np.float32 + The grid points on which the partial dependence should be + evaluated. + target_features : ndarray, shape (n_target_features), dtype=np.intp + The set of target features for which the partial dependence + should be evaluated. + + Returns + ------- + averaged_predictions : ndarray, shape \ + (n_trees_per_iteration, n_samples) + The value of the partial dependence function on each grid point. 
+ """ + + if getattr(self, "_fitted_with_sw", False): + raise NotImplementedError( + "{} does not support partial dependence " + "plots with the 'recursion' method when " + "sample weights were given during fit " + "time.".format(self.__class__.__name__) + ) + + grid = np.asarray(grid, dtype=X_DTYPE, order="C") + averaged_predictions = np.zeros( + (self.n_trees_per_iteration_, grid.shape[0]), dtype=Y_DTYPE + ) + target_features = np.asarray(target_features, dtype=np.intp, order="C") + + for predictors_of_ith_iteration in self._predictors: + for k, predictor in enumerate(predictors_of_ith_iteration): + predictor.compute_partial_dependence( + grid, target_features, averaged_predictions[k] + ) + # Note that the learning rate is already accounted for in the leaves + # values. + + return averaged_predictions + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + return tags + + @abstractmethod + def _get_loss(self, sample_weight): + pass + + @abstractmethod + def _encode_y(self, y=None): + pass # pragma: no cover + + @abstractmethod + def _encode_y_val(self, y=None): + pass # pragma: no cover + + @property + def n_iter_(self): + """Number of iterations of the boosting process.""" + check_is_fitted(self) + return len(self._predictors) + + +class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): + """Histogram-based Gradient Boosting Regression Tree. + + This estimator is much faster than + :class:`GradientBoostingRegressor` + for big datasets (n_samples >= 10 000). + + This estimator has native support for missing values (NaNs). During + training, the tree grower learns at each split point whether samples + with missing values should go to the left or right child, based on the + potential gain. When predicting, samples with missing values are + assigned to the left or right child consequently. If no missing values + were encountered for a given feature during training, then samples with + missing values are mapped to whichever child has the most samples. + See :ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py` for a + usecase example of this feature. + + This implementation is inspired by + `LightGBM `_. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.21 + + Parameters + ---------- + loss : {'squared_error', 'absolute_error', 'gamma', 'poisson', 'quantile'}, \ + default='squared_error' + The loss function to use in the boosting process. Note that the + "squared error", "gamma" and "poisson" losses actually implement + "half least squares loss", "half gamma deviance" and "half poisson + deviance" to simplify the computation of the gradient. Furthermore, + "gamma" and "poisson" losses internally use a log-link, "gamma" + requires ``y > 0`` and "poisson" requires ``y >= 0``. + "quantile" uses the pinball loss. + + .. versionchanged:: 0.23 + Added option 'poisson'. + + .. versionchanged:: 1.1 + Added option 'quantile'. + + .. versionchanged:: 1.3 + Added option 'gamma'. + + quantile : float, default=None + If loss is "quantile", this parameter specifies which quantile to be estimated + and must be between 0 and 1. + learning_rate : float, default=0.1 + The learning rate, also known as *shrinkage*. This is used as a + multiplicative factor for the leaves values. Use ``1`` for no + shrinkage. + max_iter : int, default=100 + The maximum number of iterations of the boosting process, i.e. the + maximum number of trees. 
+ max_leaf_nodes : int or None, default=31 + The maximum number of leaves for each tree. Must be strictly greater + than 1. If None, there is no maximum limit. + max_depth : int or None, default=None + The maximum depth of each tree. The depth of a tree is the number of + edges to go from the root to the deepest leaf. + Depth isn't constrained by default. + min_samples_leaf : int, default=20 + The minimum number of samples per leaf. For small datasets with less + than a few hundred samples, it is recommended to lower this value + since only very shallow trees would be built. + l2_regularization : float, default=0 + The L2 regularization parameter penalizing leaves with small hessians. + Use ``0`` for no regularization (default). + max_features : float, default=1.0 + Proportion of randomly chosen features in each and every node split. + This is a form of regularization, smaller values make the trees weaker + learners and might prevent overfitting. + If interaction constraints from `interaction_cst` are present, only allowed + features are taken into account for the subsampling. + + .. versionadded:: 1.4 + + max_bins : int, default=255 + The maximum number of bins to use for non-missing values. Before + training, each feature of the input array `X` is binned into + integer-valued bins, which allows for a much faster training stage. + Features with a small number of unique values may use less than + ``max_bins`` bins. In addition to the ``max_bins`` bins, one more bin + is always reserved for missing values. Must be no larger than 255. + categorical_features : array-like of {bool, int, str} of shape (n_features) \ + or shape (n_categorical_features,), default='from_dtype' + Indicates the categorical features. + + - None : no feature will be considered categorical. + - boolean array-like : boolean mask indicating categorical features. + - integer array-like : integer indices indicating categorical + features. + - str array-like: names of categorical features (assuming the training + data has feature names). + - `"from_dtype"`: dataframe columns with dtype "category" are + considered to be categorical features. The input must be an object + exposing a ``__dataframe__`` method such as pandas or polars + DataFrames to use this feature. + + For each categorical feature, there must be at most `max_bins` unique + categories. Negative values for categorical features encoded as numeric + dtypes are treated as missing values. All categorical values are + converted to floating point numbers. This means that categorical values + of 1.0 and 1 are treated as the same category. + + Read more in the :ref:`User Guide ` and + :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`. + + .. versionadded:: 0.24 + + .. versionchanged:: 1.2 + Added support for feature names. + + .. versionchanged:: 1.4 + Added `"from_dtype"` option. + + .. versionchanged:: 1.6 + The default value changed from `None` to `"from_dtype"`. + + monotonic_cst : array-like of int of shape (n_features) or dict, default=None + Monotonic constraint to enforce on each feature are specified using the + following integer values: + + - 1: monotonic increase + - 0: no constraint + - -1: monotonic decrease + + If a dict with str keys, map feature to monotonic constraints by name. + If an array, the features are mapped to constraints by position. See + :ref:`monotonic_cst_features_names` for a usage example. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.23 + + .. 
versionchanged:: 1.2 + Accept dict of constraints with feature names as keys. + + interaction_cst : {"pairwise", "no_interactions"} or sequence of lists/tuples/sets \ + of int, default=None + Specify interaction constraints, the sets of features which can + interact with each other in child node splits. + + Each item specifies the set of feature indices that are allowed + to interact with each other. If there are more features than + specified in these constraints, they are treated as if they were + specified as an additional set. + + The strings "pairwise" and "no_interactions" are shorthands for + allowing only pairwise or no interactions, respectively. + + For instance, with 5 features in total, `interaction_cst=[{0, 1}]` + is equivalent to `interaction_cst=[{0, 1}, {2, 3, 4}]`, + and specifies that each branch of a tree will either only split + on features 0 and 1 or only split on features 2, 3 and 4. + + See :ref:`this example` on how to use `interaction_cst`. + + .. versionadded:: 1.2 + + warm_start : bool, default=False + When set to ``True``, reuse the solution of the previous call to fit + and add more estimators to the ensemble. For results to be valid, the + estimator should be re-trained on the same data only. + See :term:`the Glossary `. + early_stopping : 'auto' or bool, default='auto' + If 'auto', early stopping is enabled if the sample size is larger than + 10000 or if `X_val` and `y_val` are passed to `fit`. If True, early stopping + is enabled, otherwise early stopping is disabled. + + .. versionadded:: 0.23 + + scoring : str or callable or None, default='loss' + Scoring method to use for early stopping. Only used if `early_stopping` + is enabled. Options: + + - str: see :ref:`scoring_string_names` for options. + - callable: a scorer callable object (e.g., function) with signature + ``scorer(estimator, X, y)``. See :ref:`scoring_callable` for details. + - `None`: the :ref:`coefficient of determination ` + (:math:`R^2`) is used. + - 'loss': early stopping is checked w.r.t the loss value. + + validation_fraction : int or float or None, default=0.1 + Proportion (or absolute size) of training data to set aside as + validation data for early stopping. If None, early stopping is done on + the training data. + The value is ignored if either early stopping is not performed, e.g. + `early_stopping=False`, or if `X_val` and `y_val` are passed to fit. + n_iter_no_change : int, default=10 + Used to determine when to "early stop". The fitting process is + stopped when none of the last ``n_iter_no_change`` scores are better + than the ``n_iter_no_change - 1`` -th-to-last one, up to some + tolerance. Only used if early stopping is performed. + tol : float, default=1e-7 + The absolute tolerance to use when comparing scores during early + stopping. The higher the tolerance, the more likely we are to early + stop: higher tolerance means that it will be harder for subsequent + iterations to be considered an improvement upon the reference score. + verbose : int, default=0 + The verbosity level. If not zero, print some information about the + fitting process. ``1`` prints only summary info, ``2`` prints info per + iteration. + random_state : int, RandomState instance or None, default=None + Pseudo-random number generator to control the subsampling in the + binning process, and the train/validation data split if early stopping + is enabled. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. 
+ + Attributes + ---------- + do_early_stopping_ : bool + Indicates whether early stopping is used during training. + n_iter_ : int + The number of iterations as selected by early stopping, depending on + the `early_stopping` parameter. Otherwise it corresponds to max_iter. + n_trees_per_iteration_ : int + The number of trees that are built at each iteration. For regressors, + this is always 1. + train_score_ : ndarray, shape (n_iter_+1,) + The scores at each iteration on the training data. The first entry + is the score of the ensemble before the first iteration. Scores are + computed according to the ``scoring`` parameter. If ``scoring`` is + not 'loss', scores are computed on a subset of at most 10 000 + samples. Empty if no early stopping. + validation_score_ : ndarray, shape (n_iter_+1,) + The scores at each iteration on the held-out validation data. The + first entry is the score of the ensemble before the first iteration. + Scores are computed according to the ``scoring`` parameter. Empty if + no early stopping or if ``validation_fraction`` is None. + is_categorical_ : ndarray, shape (n_features, ) or None + Boolean mask for the categorical features. ``None`` if there are no + categorical features. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + GradientBoostingRegressor : Exact gradient boosting method that does not + scale as well on datasets with a large number of samples. + sklearn.tree.DecisionTreeRegressor : A decision tree regressor. + RandomForestRegressor : A meta-estimator that fits a number of decision + tree regressors on various sub-samples of the dataset and uses + averaging to improve the statistical performance and control + over-fitting. + AdaBoostRegressor : A meta-estimator that begins by fitting a regressor + on the original dataset and then fits additional copies of the + regressor on the same dataset but where the weights of instances are + adjusted according to the error of the current prediction. As such, + subsequent regressors focus more on difficult cases. + + Examples + -------- + >>> from sklearn.ensemble import HistGradientBoostingRegressor + >>> from sklearn.datasets import load_diabetes + >>> X, y = load_diabetes(return_X_y=True) + >>> est = HistGradientBoostingRegressor().fit(X, y) + >>> est.score(X, y) + 0.92...
+ """ + + _parameter_constraints: dict = { + **BaseHistGradientBoosting._parameter_constraints, + "loss": [ + StrOptions( + { + "squared_error", + "absolute_error", + "poisson", + "gamma", + "quantile", + } + ), + BaseLoss, + ], + "quantile": [Interval(Real, 0, 1, closed="both"), None], + } + + def __init__( + self, + loss="squared_error", + *, + quantile=None, + learning_rate=0.1, + max_iter=100, + max_leaf_nodes=31, + max_depth=None, + min_samples_leaf=20, + l2_regularization=0.0, + max_features=1.0, + max_bins=255, + categorical_features="from_dtype", + monotonic_cst=None, + interaction_cst=None, + warm_start=False, + early_stopping="auto", + scoring="loss", + validation_fraction=0.1, + n_iter_no_change=10, + tol=1e-7, + verbose=0, + random_state=None, + ): + super().__init__( + loss=loss, + learning_rate=learning_rate, + max_iter=max_iter, + max_leaf_nodes=max_leaf_nodes, + max_depth=max_depth, + min_samples_leaf=min_samples_leaf, + l2_regularization=l2_regularization, + max_features=max_features, + max_bins=max_bins, + monotonic_cst=monotonic_cst, + interaction_cst=interaction_cst, + categorical_features=categorical_features, + early_stopping=early_stopping, + warm_start=warm_start, + scoring=scoring, + validation_fraction=validation_fraction, + n_iter_no_change=n_iter_no_change, + tol=tol, + verbose=verbose, + random_state=random_state, + ) + self.quantile = quantile + + def predict(self, X): + """Predict values for X. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + The input samples. + + Returns + ------- + y : ndarray, shape (n_samples,) + The predicted values. + """ + check_is_fitted(self) + # Return inverse link of raw predictions after converting + # shape (n_samples, 1) to (n_samples,) + return self._loss.link.inverse(self._raw_predict(X).ravel()) + + def staged_predict(self, X): + """Predict regression target for each iteration. + + This method allows monitoring (i.e. determine error on testing set) + after each stage. + + .. versionadded:: 0.24 + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The input samples. + + Yields + ------ + y : generator of ndarray of shape (n_samples,) + The predicted values of the input samples, for each iteration. + """ + for raw_predictions in self._staged_raw_predict(X): + yield self._loss.link.inverse(raw_predictions.ravel()) + + def _encode_y(self, y): + # Just convert y to the expected dtype + self.n_trees_per_iteration_ = 1 + y = y.astype(Y_DTYPE, copy=False) + if self.loss == "gamma": + # Ensure y > 0 + if not np.all(y > 0): + raise ValueError("loss='gamma' requires strictly positive y.") + elif self.loss == "poisson": + # Ensure y >= 0 and sum(y) > 0 + if not (np.all(y >= 0) and np.sum(y) > 0): + raise ValueError( + "loss='poisson' requires non-negative y and sum(y) > 0." + ) + return y + + def _encode_y_val(self, y=None): + return self._encode_y(y) + + def _get_loss(self, sample_weight): + if self.loss == "quantile": + return _LOSSES[self.loss]( + sample_weight=sample_weight, quantile=self.quantile + ) + else: + return _LOSSES[self.loss](sample_weight=sample_weight) + + +class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting): + """Histogram-based Gradient Boosting Classification Tree. + + This estimator is much faster than + :class:`GradientBoostingClassifier` + for big datasets (n_samples >= 10 000). + + This estimator has native support for missing values (NaNs). 
During + training, the tree grower learns at each split point whether samples + with missing values should go to the left or right child, based on the + potential gain. When predicting, samples with missing values are + assigned to the left or right child consequently. If no missing values + were encountered for a given feature during training, then samples with + missing values are mapped to whichever child has the most samples. + + This implementation is inspired by + `LightGBM `_. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.21 + + Parameters + ---------- + loss : {'log_loss'}, default='log_loss' + The loss function to use in the boosting process. + + For binary classification problems, 'log_loss' is also known as logistic loss, + binomial deviance or binary crossentropy. Internally, the model fits one tree + per boosting iteration and uses the logistic sigmoid function (expit) as + inverse link function to compute the predicted positive class probability. + + For multiclass classification problems, 'log_loss' is also known as multinomial + deviance or categorical crossentropy. Internally, the model fits one tree per + boosting iteration and per class and uses the softmax function as inverse link + function to compute the predicted probabilities of the classes. + + learning_rate : float, default=0.1 + The learning rate, also known as *shrinkage*. This is used as a + multiplicative factor for the leaves values. Use ``1`` for no + shrinkage. + max_iter : int, default=100 + The maximum number of iterations of the boosting process, i.e. the + maximum number of trees for binary classification. For multiclass + classification, `n_classes` trees per iteration are built. + max_leaf_nodes : int or None, default=31 + The maximum number of leaves for each tree. Must be strictly greater + than 1. If None, there is no maximum limit. + max_depth : int or None, default=None + The maximum depth of each tree. The depth of a tree is the number of + edges to go from the root to the deepest leaf. + Depth isn't constrained by default. + min_samples_leaf : int, default=20 + The minimum number of samples per leaf. For small datasets with less + than a few hundred samples, it is recommended to lower this value + since only very shallow trees would be built. + l2_regularization : float, default=0 + The L2 regularization parameter penalizing leaves with small hessians. + Use ``0`` for no regularization (default). + max_features : float, default=1.0 + Proportion of randomly chosen features in each and every node split. + This is a form of regularization, smaller values make the trees weaker + learners and might prevent overfitting. + If interaction constraints from `interaction_cst` are present, only allowed + features are taken into account for the subsampling. + + .. versionadded:: 1.4 + + max_bins : int, default=255 + The maximum number of bins to use for non-missing values. Before + training, each feature of the input array `X` is binned into + integer-valued bins, which allows for a much faster training stage. + Features with a small number of unique values may use less than + ``max_bins`` bins. In addition to the ``max_bins`` bins, one more bin + is always reserved for missing values. Must be no larger than 255. + categorical_features : array-like of {bool, int, str} of shape (n_features) \ + or shape (n_categorical_features,), default='from_dtype' + Indicates the categorical features. + + - None : no feature will be considered categorical. 
+ - boolean array-like : boolean mask indicating categorical features. + - integer array-like : integer indices indicating categorical + features. + - str array-like: names of categorical features (assuming the training + data has feature names). + - `"from_dtype"`: dataframe columns with dtype "category" are + considered to be categorical features. The input must be an object + exposing a ``__dataframe__`` method such as pandas or polars + DataFrames to use this feature. + + For each categorical feature, there must be at most `max_bins` unique + categories. Negative values for categorical features encoded as numeric + dtypes are treated as missing values. All categorical values are + converted to floating point numbers. This means that categorical values + of 1.0 and 1 are treated as the same category. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.24 + + .. versionchanged:: 1.2 + Added support for feature names. + + .. versionchanged:: 1.4 + Added `"from_dtype"` option. + + .. versionchanged:: 1.6 + The default value changed from `None` to `"from_dtype"`. + + monotonic_cst : array-like of int of shape (n_features) or dict, default=None + Monotonic constraint to enforce on each feature are specified using the + following integer values: + + - 1: monotonic increase + - 0: no constraint + - -1: monotonic decrease + + If a dict with str keys, map feature to monotonic constraints by name. + If an array, the features are mapped to constraints by position. See + :ref:`monotonic_cst_features_names` for a usage example. + + The constraints are only valid for binary classifications and hold + over the probability of the positive class. + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.23 + + .. versionchanged:: 1.2 + Accept dict of constraints with feature names as keys. + + interaction_cst : {"pairwise", "no_interactions"} or sequence of lists/tuples/sets \ + of int, default=None + Specify interaction constraints, the sets of features which can + interact with each other in child node splits. + + Each item specifies the set of feature indices that are allowed + to interact with each other. If there are more features than + specified in these constraints, they are treated as if they were + specified as an additional set. + + The strings "pairwise" and "no_interactions" are shorthands for + allowing only pairwise or no interactions, respectively. + + For instance, with 5 features in total, `interaction_cst=[{0, 1}]` + is equivalent to `interaction_cst=[{0, 1}, {2, 3, 4}]`, + and specifies that each branch of a tree will either only split + on features 0 and 1 or only split on features 2, 3 and 4. + + See :ref:`this example` on how to use `interaction_cst`. + + .. versionadded:: 1.2 + + warm_start : bool, default=False + When set to ``True``, reuse the solution of the previous call to fit + and add more estimators to the ensemble. For results to be valid, the + estimator should be re-trained on the same data only. + See :term:`the Glossary `. + early_stopping : 'auto' or bool, default='auto' + If 'auto', early stopping is enabled if the sample size is larger than + 10000 or if `X_val` and `y_val` are passed to `fit`. If True, early stopping + is enabled, otherwise early stopping is disabled. + + .. versionadded:: 0.23 + + scoring : str or callable or None, default='loss' + Scoring method to use for early stopping. Only used if `early_stopping` + is enabled. Options: + + - str: see :ref:`scoring_string_names` for options. 
+ - callable: a scorer callable object (e.g., function) with signature + ``scorer(estimator, X, y)``. See :ref:`scoring_callable` for details. + - `None`: :ref:`accuracy ` is used. + - 'loss': early stopping is checked w.r.t the loss value. + + validation_fraction : int or float or None, default=0.1 + Proportion (or absolute size) of training data to set aside as + validation data for early stopping. If None, early stopping is done on + the training data. + The value is ignored if either early stopping is not performed, e.g. + `early_stopping=False`, or if `X_val` and `y_val` are passed to fit. + n_iter_no_change : int, default=10 + Used to determine when to "early stop". The fitting process is + stopped when none of the last ``n_iter_no_change`` scores are better + than the ``n_iter_no_change - 1`` -th-to-last one, up to some + tolerance. Only used if early stopping is performed. + tol : float, default=1e-7 + The absolute tolerance to use when comparing scores. The higher the + tolerance, the more likely we are to early stop: higher tolerance + means that it will be harder for subsequent iterations to be + considered an improvement upon the reference score. + verbose : int, default=0 + The verbosity level. If not zero, print some information about the + fitting process. ``1`` prints only summary info, ``2`` prints info per + iteration. + random_state : int, RandomState instance or None, default=None + Pseudo-random number generator to control the subsampling in the + binning process, and the train/validation data split if early stopping + is enabled. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + class_weight : dict or 'balanced', default=None + Weights associated with classes in the form `{class_label: weight}`. + If not given, all classes are supposed to have weight one. + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as `n_samples / (n_classes * np.bincount(y))`. + Note that these weights will be multiplied with sample_weight (passed + through the fit method) if `sample_weight` is specified. + + .. versionadded:: 1.2 + + Attributes + ---------- + classes_ : array, shape = (n_classes,) + Class labels. + do_early_stopping_ : bool + Indicates whether early stopping is used during training. + n_iter_ : int + The number of iterations as selected by early stopping, depending on + the `early_stopping` parameter. Otherwise it corresponds to max_iter. + n_trees_per_iteration_ : int + The number of tree that are built at each iteration. This is equal to 1 + for binary classification, and to ``n_classes`` for multiclass + classification. + train_score_ : ndarray, shape (n_iter_+1,) + The scores at each iteration on the training data. The first entry + is the score of the ensemble before the first iteration. Scores are + computed according to the ``scoring`` parameter. If ``scoring`` is + not 'loss', scores are computed on a subset of at most 10 000 + samples. Empty if no early stopping. + validation_score_ : ndarray, shape (n_iter_+1,) + The scores at each iteration on the held-out validation data. The + first entry is the score of the ensemble before the first iteration. + Scores are computed according to the ``scoring`` parameter. Empty if + no early stopping or if ``validation_fraction`` is None. + is_categorical_ : ndarray, shape (n_features, ) or None + Boolean mask for the categorical features. ``None`` if there are no + categorical features. 
+ n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + GradientBoostingClassifier : Exact gradient boosting method that does not + scale as good on datasets with a large number of samples. + sklearn.tree.DecisionTreeClassifier : A decision tree classifier. + RandomForestClassifier : A meta-estimator that fits a number of decision + tree classifiers on various sub-samples of the dataset and uses + averaging to improve the predictive accuracy and control over-fitting. + AdaBoostClassifier : A meta-estimator that begins by fitting a classifier + on the original dataset and then fits additional copies of the + classifier on the same dataset where the weights of incorrectly + classified instances are adjusted such that subsequent classifiers + focus more on difficult cases. + + Examples + -------- + >>> from sklearn.ensemble import HistGradientBoostingClassifier + >>> from sklearn.datasets import load_iris + >>> X, y = load_iris(return_X_y=True) + >>> clf = HistGradientBoostingClassifier().fit(X, y) + >>> clf.score(X, y) + 1.0 + """ + + _parameter_constraints: dict = { + **BaseHistGradientBoosting._parameter_constraints, + "loss": [StrOptions({"log_loss"}), BaseLoss], + "class_weight": [dict, StrOptions({"balanced"}), None], + } + + def __init__( + self, + loss="log_loss", + *, + learning_rate=0.1, + max_iter=100, + max_leaf_nodes=31, + max_depth=None, + min_samples_leaf=20, + l2_regularization=0.0, + max_features=1.0, + max_bins=255, + categorical_features="from_dtype", + monotonic_cst=None, + interaction_cst=None, + warm_start=False, + early_stopping="auto", + scoring="loss", + validation_fraction=0.1, + n_iter_no_change=10, + tol=1e-7, + verbose=0, + random_state=None, + class_weight=None, + ): + super().__init__( + loss=loss, + learning_rate=learning_rate, + max_iter=max_iter, + max_leaf_nodes=max_leaf_nodes, + max_depth=max_depth, + min_samples_leaf=min_samples_leaf, + l2_regularization=l2_regularization, + max_features=max_features, + max_bins=max_bins, + categorical_features=categorical_features, + monotonic_cst=monotonic_cst, + interaction_cst=interaction_cst, + warm_start=warm_start, + early_stopping=early_stopping, + scoring=scoring, + validation_fraction=validation_fraction, + n_iter_no_change=n_iter_no_change, + tol=tol, + verbose=verbose, + random_state=random_state, + ) + self.class_weight = class_weight + + def _finalize_sample_weight(self, sample_weight, y): + """Adjust sample_weights with class_weights.""" + if self.class_weight is None: + return sample_weight + + expanded_class_weight = compute_sample_weight(self.class_weight, y) + + if sample_weight is not None: + return sample_weight * expanded_class_weight + else: + return expanded_class_weight + + def predict(self, X): + """Predict classes for X. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + The input samples. + + Returns + ------- + y : ndarray, shape (n_samples,) + The predicted classes. + """ + # TODO: This could be done in parallel + raw_predictions = self._raw_predict(X) + if raw_predictions.shape[1] == 1: + # np.argmax([0.5, 0.5]) is 0, not 1. Therefore "> 0" not ">= 0" to be + # consistent with the multiclass case. 
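+            # For binary problems, the single raw prediction column holds the
+            # log-odds of the positive class, so "> 0" is equivalent to a
+            # predicted positive class probability strictly above 0.5.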
+ encoded_classes = (raw_predictions.ravel() > 0).astype(int) + else: + encoded_classes = np.argmax(raw_predictions, axis=1) + return self.classes_[encoded_classes] + + def staged_predict(self, X): + """Predict classes at each iteration. + + This method allows monitoring (i.e. determine error on testing set) + after each stage. + + .. versionadded:: 0.24 + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The input samples. + + Yields + ------ + y : generator of ndarray of shape (n_samples,) + The predicted classes of the input samples, for each iteration. + """ + for raw_predictions in self._staged_raw_predict(X): + if raw_predictions.shape[1] == 1: + # np.argmax([0, 0]) is 0, not 1, therefore "> 0" not ">= 0" + encoded_classes = (raw_predictions.ravel() > 0).astype(int) + else: + encoded_classes = np.argmax(raw_predictions, axis=1) + yield self.classes_.take(encoded_classes, axis=0) + + def predict_proba(self, X): + """Predict class probabilities for X. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + The input samples. + + Returns + ------- + p : ndarray, shape (n_samples, n_classes) + The class probabilities of the input samples. + """ + raw_predictions = self._raw_predict(X) + return self._loss.predict_proba(raw_predictions) + + def staged_predict_proba(self, X): + """Predict class probabilities at each iteration. + + This method allows monitoring (i.e. determine error on testing set) + after each stage. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The input samples. + + Yields + ------ + y : generator of ndarray of shape (n_samples,) + The predicted class probabilities of the input samples, + for each iteration. + """ + for raw_predictions in self._staged_raw_predict(X): + yield self._loss.predict_proba(raw_predictions) + + def decision_function(self, X): + """Compute the decision function of ``X``. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + The input samples. + + Returns + ------- + decision : ndarray, shape (n_samples,) or \ + (n_samples, n_trees_per_iteration) + The raw predicted values (i.e. the sum of the trees leaves) for + each sample. n_trees_per_iteration is equal to the number of + classes in multiclass classification. + """ + decision = self._raw_predict(X) + if decision.shape[1] == 1: + decision = decision.ravel() + return decision + + def staged_decision_function(self, X): + """Compute decision function of ``X`` for each iteration. + + This method allows monitoring (i.e. determine error on testing set) + after each stage. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The input samples. + + Yields + ------ + decision : generator of ndarray of shape (n_samples,) or \ + (n_samples, n_trees_per_iteration) + The decision function of the input samples, which corresponds to + the raw values predicted from the trees of the ensemble . The + classes corresponds to that in the attribute :term:`classes_`. + """ + for staged_decision in self._staged_raw_predict(X): + if staged_decision.shape[1] == 1: + staged_decision = staged_decision.ravel() + yield staged_decision + + def _encode_y(self, y): + """Create self._label_encoder and encode y correspondingly.""" + # encode classes into 0 ... n_classes - 1 and sets attributes classes_ + # and n_trees_per_iteration_ + check_classification_targets(y) + + # We need to store the label encoder in case y_val needs to be label encoded, + # too. 
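+        # For example, y = ["spam", "ham", "spam"] is encoded as [1, 0, 1] with
+        # classes_ == ["ham", "spam"], since LabelEncoder sorts the labels.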
+ self._label_encoder = LabelEncoder() + encoded_y = self._label_encoder.fit_transform(y) + self.classes_ = self._label_encoder.classes_ + n_classes = self.classes_.shape[0] + # only 1 tree for binary classification. For multiclass classification, + # we build 1 tree per class. + self.n_trees_per_iteration_ = 1 if n_classes <= 2 else n_classes + encoded_y = encoded_y.astype(Y_DTYPE, copy=False) + return encoded_y + + def _encode_y_val(self, y): + encoded_y = self._label_encoder.transform(y) + return encoded_y.astype(Y_DTYPE, copy=False) + + def _get_loss(self, sample_weight): + # At this point self.loss == "log_loss" + if self.n_trees_per_iteration_ == 1: + return HalfBinomialLoss(sample_weight=sample_weight) + else: + return HalfMultinomialLoss( + sample_weight=sample_weight, n_classes=self.n_trees_per_iteration_ + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/grower.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/grower.py new file mode 100644 index 0000000000000000000000000000000000000000..c3dbbe7d82948412cf0c567cbc672cb018ee9817 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -0,0 +1,821 @@ +""" +This module contains the TreeGrower class. + +TreeGrower builds a regression tree fitting a Newton-Raphson step, based on +the gradients and hessians of the training data. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numbers +from heapq import heappop, heappush +from timeit import default_timer as time + +import numpy as np + +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads + +from ._bitset import set_raw_bitset_from_binned_bitset +from .common import ( + PREDICTOR_RECORD_DTYPE, + X_BITSET_INNER_DTYPE, + MonotonicConstraint, +) +from .histogram import HistogramBuilder +from .predictor import TreePredictor +from .splitting import Splitter + + +class TreeNode: + """Tree Node class used in TreeGrower. + + This isn't used for prediction purposes, only for training (see + TreePredictor). + + Parameters + ---------- + depth : int + The depth of the node, i.e. its distance from the root. + sample_indices : ndarray of shape (n_samples_at_node,), dtype=np.uint32 + The indices of the samples at the node. + partition_start : int + start position of the node's sample_indices in splitter.partition. + partition_stop : int + stop position of the node's sample_indices in splitter.partition. + sum_gradients : float + The sum of the gradients of the samples at the node. + sum_hessians : float + The sum of the hessians of the samples at the node. + + Attributes + ---------- + depth : int + The depth of the node, i.e. its distance from the root. + sample_indices : ndarray of shape (n_samples_at_node,), dtype=np.uint32 + The indices of the samples at the node. + sum_gradients : float + The sum of the gradients of the samples at the node. + sum_hessians : float + The sum of the hessians of the samples at the node. + split_info : SplitInfo or None + The result of the split evaluation. + is_leaf : bool + True if node is a leaf + left_child : TreeNode or None + The left child of the node. None for leaves. + right_child : TreeNode or None + The right child of the node. None for leaves. + value : float or None + The value of the leaf, as computed in finalize_leaf(). None for + non-leaf nodes. + partition_start : int + start position of the node's sample_indices in splitter.partition. 
+ partition_stop : int + stop position of the node's sample_indices in splitter.partition. + allowed_features : None or ndarray, dtype=int + Indices of features allowed to split for children. + interaction_cst_indices : None or list of ints + Indices of the interaction sets that have to be applied on splits of + child nodes. The fewer sets the stronger the constraint as fewer sets + contain fewer features. + children_lower_bound : float + children_upper_bound : float + """ + + def __init__( + self, + *, + depth, + sample_indices, + partition_start, + partition_stop, + sum_gradients, + sum_hessians, + value=None, + ): + self.depth = depth + self.sample_indices = sample_indices + self.n_samples = sample_indices.shape[0] + self.sum_gradients = sum_gradients + self.sum_hessians = sum_hessians + self.value = value + self.is_leaf = False + self.allowed_features = None + self.interaction_cst_indices = None + self.set_children_bounds(float("-inf"), float("+inf")) + self.split_info = None + self.left_child = None + self.right_child = None + self.histograms = None + # start and stop indices of the node in the splitter.partition + # array. Concretely, + # self.sample_indices = view(self.splitter.partition[start:stop]) + # Please see the comments about splitter.partition and + # splitter.split_indices for more info about this design. + # These 2 attributes are only used in _update_raw_prediction, because we + # need to iterate over the leaves and I don't know how to efficiently + # store the sample_indices views because they're all of different sizes. + self.partition_start = partition_start + self.partition_stop = partition_stop + + def set_children_bounds(self, lower, upper): + """Set children values bounds to respect monotonic constraints.""" + + # These are bounds for the node's *children* values, not the node's + # value. The bounds are used in the splitter when considering potential + # left and right child. + self.children_lower_bound = lower + self.children_upper_bound = upper + + def __lt__(self, other_node): + """Comparison for priority queue. + + Nodes with high gain are higher priority than nodes with low gain. + + heapq.heappush only need the '<' operator. + heapq.heappop take the smallest item first (smaller is higher + priority). + + Parameters + ---------- + other_node : TreeNode + The node to compare with. + """ + return self.split_info.gain > other_node.split_info.gain + + +class TreeGrower: + """Tree grower class used to build a tree. + + The tree is fitted to predict the values of a Newton-Raphson step. The + splits are considered in a best-first fashion, and the quality of a + split is defined in splitting._split_gain. + + Parameters + ---------- + X_binned : ndarray of shape (n_samples, n_features), dtype=np.uint8 + The binned input samples. Must be Fortran-aligned. + gradients : ndarray of shape (n_samples,) + The gradients of each training sample. Those are the gradients of the + loss w.r.t the predictions, evaluated at iteration ``i - 1``. + hessians : ndarray of shape (n_samples,) + The hessians of each training sample. Those are the hessians of the + loss w.r.t the predictions, evaluated at iteration ``i - 1``. + max_leaf_nodes : int, default=None + The maximum number of leaves for each tree. If None, there is no + maximum limit. + max_depth : int, default=None + The maximum depth of each tree. The depth of a tree is the number of + edges to go from the root to the deepest leaf. + Depth isn't constrained by default. 
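+        For instance, a tree made of a single split of the root has depth 1.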
+ min_samples_leaf : int, default=20 + The minimum number of samples per leaf. + min_gain_to_split : float, default=0. + The minimum gain needed to split a node. Splits with lower gain will + be ignored. + min_hessian_to_split : float, default=1e-3 + The minimum sum of hessians needed in each node. Splits that result in + at least one child having a sum of hessians less than + ``min_hessian_to_split`` are discarded. + n_bins : int, default=256 + The total number of bins, including the bin for missing values. Used + to define the shape of the histograms. + n_bins_non_missing : ndarray, dtype=np.uint32, default=None + For each feature, gives the number of bins actually used for + non-missing values. For features with a lot of unique values, this + is equal to ``n_bins - 1``. If it's an int, all features are + considered to have the same number of bins. If None, all features + are considered to have ``n_bins - 1`` bins. + has_missing_values : bool or ndarray, dtype=bool, default=False + Whether each feature contains missing values (in the training data). + If it's a bool, the same value is used for all features. + is_categorical : ndarray of bool of shape (n_features,), default=None + Indicates categorical features. + monotonic_cst : array-like of int of shape (n_features,), dtype=int, default=None + Indicates the monotonic constraint to enforce on each feature. + - 1: monotonic increase + - 0: no constraint + - -1: monotonic decrease + + Read more in the :ref:`User Guide `. + interaction_cst : list of sets of integers, default=None + List of interaction constraints. + l2_regularization : float, default=0. + The L2 regularization parameter penalizing leaves with small hessians. + Use ``0`` for no regularization (default). + feature_fraction_per_split : float, default=1 + Proportion of randomly chosen features in each and every node split. + This is a form of regularization, smaller values make the trees weaker + learners and might prevent overfitting. + rng : Generator + Numpy random Generator used for feature subsampling. + shrinkage : float, default=1. + The shrinkage parameter to apply to the leaves values, also known as + learning rate. + n_threads : int, default=None + Number of OpenMP threads to use. `_openmp_effective_n_threads` is called + to determine the effective number of threads use, which takes cgroups CPU + quotes into account. See the docstring of `_openmp_effective_n_threads` + for details. + + Attributes + ---------- + histogram_builder : HistogramBuilder + splitter : Splitter + root : TreeNode + finalized_leaves : list of TreeNode + splittable_nodes : list of TreeNode + missing_values_bin_idx : int + Equals n_bins - 1 + n_categorical_splits : int + n_features : int + n_nodes : int + total_find_split_time : float + Time spent finding the best splits + total_compute_hist_time : float + Time spent computing histograms + total_apply_split_time : float + Time spent splitting nodes + with_monotonic_cst : bool + Whether there are monotonic constraints that apply. False iff monotonic_cst is + None. 
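+
+    Notes
+    -----
+    A minimal sketch of the intended flow, assuming ``X_binned``, ``gradients``
+    and ``hessians`` have already been prepared by the caller::
+
+        grower = TreeGrower(X_binned, gradients, hessians, shrinkage=0.1)
+        grower.grow()
+        predictor = grower.make_predictor(binning_thresholds)
+
+    where ``binning_thresholds`` corresponds to the ``bin_thresholds_``
+    attribute of the bin mapper.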
+ """ + + def __init__( + self, + X_binned, + gradients, + hessians, + max_leaf_nodes=None, + max_depth=None, + min_samples_leaf=20, + min_gain_to_split=0.0, + min_hessian_to_split=1e-3, + n_bins=256, + n_bins_non_missing=None, + has_missing_values=False, + is_categorical=None, + monotonic_cst=None, + interaction_cst=None, + l2_regularization=0.0, + feature_fraction_per_split=1.0, + rng=np.random.default_rng(), + shrinkage=1.0, + n_threads=None, + ): + self._validate_parameters( + X_binned, + min_gain_to_split, + min_hessian_to_split, + ) + n_threads = _openmp_effective_n_threads(n_threads) + + if n_bins_non_missing is None: + n_bins_non_missing = n_bins - 1 + + if isinstance(n_bins_non_missing, numbers.Integral): + n_bins_non_missing = np.array( + [n_bins_non_missing] * X_binned.shape[1], dtype=np.uint32 + ) + else: + n_bins_non_missing = np.asarray(n_bins_non_missing, dtype=np.uint32) + + if isinstance(has_missing_values, bool): + has_missing_values = [has_missing_values] * X_binned.shape[1] + has_missing_values = np.asarray(has_missing_values, dtype=np.uint8) + + # `monotonic_cst` validation is done in _validate_monotonic_cst + # at the estimator level and therefore the following should not be + # needed when using the public API. + if monotonic_cst is None: + monotonic_cst = np.full( + shape=X_binned.shape[1], + fill_value=MonotonicConstraint.NO_CST, + dtype=np.int8, + ) + else: + monotonic_cst = np.asarray(monotonic_cst, dtype=np.int8) + self.with_monotonic_cst = np.any(monotonic_cst != MonotonicConstraint.NO_CST) + + if is_categorical is None: + is_categorical = np.zeros(shape=X_binned.shape[1], dtype=np.uint8) + else: + is_categorical = np.asarray(is_categorical, dtype=np.uint8) + + if np.any( + np.logical_and( + is_categorical == 1, monotonic_cst != MonotonicConstraint.NO_CST + ) + ): + raise ValueError("Categorical features cannot have monotonic constraints.") + + hessians_are_constant = hessians.shape[0] == 1 + self.histogram_builder = HistogramBuilder( + X_binned, n_bins, gradients, hessians, hessians_are_constant, n_threads + ) + missing_values_bin_idx = n_bins - 1 + self.splitter = Splitter( + X_binned=X_binned, + n_bins_non_missing=n_bins_non_missing, + missing_values_bin_idx=missing_values_bin_idx, + has_missing_values=has_missing_values, + is_categorical=is_categorical, + monotonic_cst=monotonic_cst, + l2_regularization=l2_regularization, + min_hessian_to_split=min_hessian_to_split, + min_samples_leaf=min_samples_leaf, + min_gain_to_split=min_gain_to_split, + hessians_are_constant=hessians_are_constant, + feature_fraction_per_split=feature_fraction_per_split, + rng=rng, + n_threads=n_threads, + ) + self.X_binned = X_binned + self.max_leaf_nodes = max_leaf_nodes + self.max_depth = max_depth + self.min_samples_leaf = min_samples_leaf + self.min_gain_to_split = min_gain_to_split + self.n_bins_non_missing = n_bins_non_missing + self.missing_values_bin_idx = missing_values_bin_idx + self.has_missing_values = has_missing_values + self.is_categorical = is_categorical + self.monotonic_cst = monotonic_cst + self.interaction_cst = interaction_cst + self.l2_regularization = l2_regularization + self.shrinkage = shrinkage + self.n_features = X_binned.shape[1] + self.n_threads = n_threads + self.splittable_nodes = [] + self.finalized_leaves = [] + self.total_find_split_time = 0.0 # time spent finding the best splits + self.total_compute_hist_time = 0.0 # time spent computing histograms + self.total_apply_split_time = 0.0 # time spent splitting nodes + self.n_categorical_splits = 0 + 
self._initialize_root() + self.n_nodes = 1 + + def _validate_parameters( + self, + X_binned, + min_gain_to_split, + min_hessian_to_split, + ): + """Validate parameters passed to __init__. + + Also validate parameters passed to splitter. + """ + if X_binned.dtype != np.uint8: + raise NotImplementedError("X_binned must be of type uint8.") + if not X_binned.flags.f_contiguous: + raise ValueError( + "X_binned should be passed as Fortran contiguous " + "array for maximum efficiency." + ) + if min_gain_to_split < 0: + raise ValueError( + "min_gain_to_split={} must be positive.".format(min_gain_to_split) + ) + if min_hessian_to_split < 0: + raise ValueError( + "min_hessian_to_split={} must be positive.".format(min_hessian_to_split) + ) + + def grow(self): + """Grow the tree, from root to leaves.""" + while self.splittable_nodes: + self.split_next() + + self._apply_shrinkage() + + def _apply_shrinkage(self): + """Multiply leaves values by shrinkage parameter. + + This must be done at the very end of the growing process. If this were + done during the growing process e.g. in finalize_leaf(), then a leaf + would be shrunk but its sibling would potentially not be (if it's a + non-leaf), which would lead to a wrong computation of the 'middle' + value needed to enforce the monotonic constraints. + """ + for leaf in self.finalized_leaves: + leaf.value *= self.shrinkage + + def _initialize_root(self): + """Initialize root node and finalize it if needed.""" + tic = time() + if self.interaction_cst is not None: + allowed_features = set().union(*self.interaction_cst) + allowed_features = np.fromiter( + allowed_features, dtype=np.uint32, count=len(allowed_features) + ) + arbitrary_feature = allowed_features[0] + else: + allowed_features = None + arbitrary_feature = 0 + + # TreeNode init needs the total sum of gradients and hessians. Therefore, we + # first compute the histograms and then compute the total grad/hess on an + # arbitrary feature histogram. This way we replace a loop over n_samples by a + # loop over n_bins. + histograms = self.histogram_builder.compute_histograms_brute( + self.splitter.partition, # =self.root.sample_indices + allowed_features, + ) + self.total_compute_hist_time += time() - tic + + tic = time() + n_samples = self.X_binned.shape[0] + depth = 0 + histogram_array = np.asarray(histograms[arbitrary_feature]) + sum_gradients = histogram_array["sum_gradients"].sum() + if self.histogram_builder.hessians_are_constant: + sum_hessians = self.histogram_builder.hessians[0] * n_samples + else: + sum_hessians = histogram_array["sum_hessians"].sum() + self.root = TreeNode( + depth=depth, + sample_indices=self.splitter.partition, + partition_start=0, + partition_stop=n_samples, + sum_gradients=sum_gradients, + sum_hessians=sum_hessians, + value=0, + ) + + if self.root.n_samples < 2 * self.min_samples_leaf: + # Do not even bother computing any splitting statistics. + self._finalize_leaf(self.root) + return + if sum_hessians < self.splitter.min_hessian_to_split: + self._finalize_leaf(self.root) + return + + if self.interaction_cst is not None: + self.root.interaction_cst_indices = range(len(self.interaction_cst)) + self.root.allowed_features = allowed_features + + self.root.histograms = histograms + + self._compute_best_split_and_push(self.root) + self.total_find_split_time += time() - tic + + def _compute_best_split_and_push(self, node): + """Compute the best possible split (SplitInfo) of a given node. + + Also push it in the heap of splittable nodes if gain isn't zero. 
+ The gain of a node is 0 if either all the leaves are pure + (best gain = 0), or if no split would satisfy the constraints, + (min_hessians_to_split, min_gain_to_split, min_samples_leaf) + """ + + node.split_info = self.splitter.find_node_split( + n_samples=node.n_samples, + histograms=node.histograms, + sum_gradients=node.sum_gradients, + sum_hessians=node.sum_hessians, + value=node.value, + lower_bound=node.children_lower_bound, + upper_bound=node.children_upper_bound, + allowed_features=node.allowed_features, + ) + + if node.split_info.gain <= 0: # no valid split + self._finalize_leaf(node) + else: + heappush(self.splittable_nodes, node) + + def split_next(self): + """Split the node with highest potential gain. + + Returns + ------- + left : TreeNode + The resulting left child. + right : TreeNode + The resulting right child. + """ + # Consider the node with the highest loss reduction (a.k.a. gain) + node = heappop(self.splittable_nodes) + + tic = time() + ( + sample_indices_left, + sample_indices_right, + right_child_pos, + ) = self.splitter.split_indices(node.split_info, node.sample_indices) + self.total_apply_split_time += time() - tic + + depth = node.depth + 1 + n_leaf_nodes = len(self.finalized_leaves) + len(self.splittable_nodes) + n_leaf_nodes += 2 + + left_child_node = TreeNode( + depth=depth, + sample_indices=sample_indices_left, + partition_start=node.partition_start, + partition_stop=node.partition_start + right_child_pos, + sum_gradients=node.split_info.sum_gradient_left, + sum_hessians=node.split_info.sum_hessian_left, + value=node.split_info.value_left, + ) + right_child_node = TreeNode( + depth=depth, + sample_indices=sample_indices_right, + partition_start=left_child_node.partition_stop, + partition_stop=node.partition_stop, + sum_gradients=node.split_info.sum_gradient_right, + sum_hessians=node.split_info.sum_hessian_right, + value=node.split_info.value_right, + ) + + node.right_child = right_child_node + node.left_child = left_child_node + + # set interaction constraints (the indices of the constraints sets) + if self.interaction_cst is not None: + # Calculate allowed_features and interaction_cst_indices only once. Child + # nodes inherit them before they get split. + ( + left_child_node.allowed_features, + left_child_node.interaction_cst_indices, + ) = self._compute_interactions(node) + right_child_node.interaction_cst_indices = ( + left_child_node.interaction_cst_indices + ) + right_child_node.allowed_features = left_child_node.allowed_features + + if not self.has_missing_values[node.split_info.feature_idx]: + # If no missing values are encountered at fit time, then samples + # with missing values during predict() will go to whichever child + # has the most samples. 
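+            # For example, if the left child receives 70 samples and the right
+            # child 30, missing values at predict time are sent to the left
+            # child.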
+ node.split_info.missing_go_to_left = ( + left_child_node.n_samples > right_child_node.n_samples + ) + + self.n_nodes += 2 + self.n_categorical_splits += node.split_info.is_categorical + + if self.max_leaf_nodes is not None and n_leaf_nodes == self.max_leaf_nodes: + self._finalize_leaf(left_child_node) + self._finalize_leaf(right_child_node) + self._finalize_splittable_nodes() + return left_child_node, right_child_node + + if self.max_depth is not None and depth == self.max_depth: + self._finalize_leaf(left_child_node) + self._finalize_leaf(right_child_node) + return left_child_node, right_child_node + + if left_child_node.n_samples < self.min_samples_leaf * 2: + self._finalize_leaf(left_child_node) + if right_child_node.n_samples < self.min_samples_leaf * 2: + self._finalize_leaf(right_child_node) + + if self.with_monotonic_cst: + # Set value bounds for respecting monotonic constraints + # See test_nodes_values() for details + if ( + self.monotonic_cst[node.split_info.feature_idx] + == MonotonicConstraint.NO_CST + ): + lower_left = lower_right = node.children_lower_bound + upper_left = upper_right = node.children_upper_bound + else: + mid = (left_child_node.value + right_child_node.value) / 2 + if ( + self.monotonic_cst[node.split_info.feature_idx] + == MonotonicConstraint.POS + ): + lower_left, upper_left = node.children_lower_bound, mid + lower_right, upper_right = mid, node.children_upper_bound + else: # NEG + lower_left, upper_left = mid, node.children_upper_bound + lower_right, upper_right = node.children_lower_bound, mid + left_child_node.set_children_bounds(lower_left, upper_left) + right_child_node.set_children_bounds(lower_right, upper_right) + + # Compute histograms of children, and compute their best possible split + # (if needed) + should_split_left = not left_child_node.is_leaf + should_split_right = not right_child_node.is_leaf + if should_split_left or should_split_right: + # We will compute the histograms of both nodes even if one of them + # is a leaf, since computing the second histogram is very cheap + # (using histogram subtraction). + n_samples_left = left_child_node.sample_indices.shape[0] + n_samples_right = right_child_node.sample_indices.shape[0] + if n_samples_left < n_samples_right: + smallest_child = left_child_node + largest_child = right_child_node + else: + smallest_child = right_child_node + largest_child = left_child_node + + # We use the brute O(n_samples) method on the child that has the + # smallest number of samples, and the subtraction trick O(n_bins) + # on the other one. + # Note that both left and right child have the same allowed_features. + tic = time() + smallest_child.histograms = self.histogram_builder.compute_histograms_brute( + smallest_child.sample_indices, smallest_child.allowed_features + ) + largest_child.histograms = ( + self.histogram_builder.compute_histograms_subtraction( + node.histograms, + smallest_child.histograms, + smallest_child.allowed_features, + ) + ) + # node.histograms is reused in largest_child.histograms. To break cyclic + # memory references and help garbage collection, we set it to None. + node.histograms = None + self.total_compute_hist_time += time() - tic + + tic = time() + if should_split_left: + self._compute_best_split_and_push(left_child_node) + if should_split_right: + self._compute_best_split_and_push(right_child_node) + self.total_find_split_time += time() - tic + + # Release memory used by histograms as they are no longer needed + # for leaf nodes since they won't be split. 
+ for child in (left_child_node, right_child_node): + if child.is_leaf: + del child.histograms + + # Release memory used by histograms as they are no longer needed for + # internal nodes once children histograms have been computed. + del node.histograms + + return left_child_node, right_child_node + + def _compute_interactions(self, node): + r"""Compute features allowed by interactions to be inherited by child nodes. + + Example: Assume constraints [{0, 1}, {1, 2}]. + 1 <- Both constraint groups could be applied from now on + / \ + 1 2 <- Left split still fulfills both constraint groups. + / \ / \ Right split at feature 2 has only group {1, 2} from now on. + + LightGBM uses the same logic for overlapping groups. See + https://github.com/microsoft/LightGBM/issues/4481 for details. + + Parameters: + ---------- + node : TreeNode + A node that might have children. Based on its feature_idx, the interaction + constraints for possible child nodes are computed. + + Returns + ------- + allowed_features : ndarray, dtype=uint32 + Indices of features allowed to split for children. + interaction_cst_indices : list of ints + Indices of the interaction sets that have to be applied on splits of + child nodes. The fewer sets the stronger the constraint as fewer sets + contain fewer features. + """ + # Note: + # - Case of no interactions is already captured before function call. + # - This is for nodes that are already split and have a + # node.split_info.feature_idx. + allowed_features = set() + interaction_cst_indices = [] + for i in node.interaction_cst_indices: + if node.split_info.feature_idx in self.interaction_cst[i]: + interaction_cst_indices.append(i) + allowed_features.update(self.interaction_cst[i]) + return ( + np.fromiter(allowed_features, dtype=np.uint32, count=len(allowed_features)), + interaction_cst_indices, + ) + + def _finalize_leaf(self, node): + """Make node a leaf of the tree being grown.""" + + node.is_leaf = True + self.finalized_leaves.append(node) + + def _finalize_splittable_nodes(self): + """Transform all splittable nodes into leaves. + + Used when some constraint is met e.g. maximum number of leaves or + maximum depth.""" + while len(self.splittable_nodes) > 0: + node = self.splittable_nodes.pop() + self._finalize_leaf(node) + + def make_predictor(self, binning_thresholds): + """Make a TreePredictor object out of the current tree. + + Parameters + ---------- + binning_thresholds : array-like of floats + Corresponds to the bin_thresholds_ attribute of the BinMapper. + For each feature, this stores: + + - the bin frontiers for continuous features + - the unique raw category values for categorical features + + Returns + ------- + A TreePredictor object. 
+ """ + predictor_nodes = np.zeros(self.n_nodes, dtype=PREDICTOR_RECORD_DTYPE) + binned_left_cat_bitsets = np.zeros( + (self.n_categorical_splits, 8), dtype=X_BITSET_INNER_DTYPE + ) + raw_left_cat_bitsets = np.zeros( + (self.n_categorical_splits, 8), dtype=X_BITSET_INNER_DTYPE + ) + _fill_predictor_arrays( + predictor_nodes, + binned_left_cat_bitsets, + raw_left_cat_bitsets, + self.root, + binning_thresholds, + self.n_bins_non_missing, + ) + return TreePredictor( + predictor_nodes, binned_left_cat_bitsets, raw_left_cat_bitsets + ) + + +def _fill_predictor_arrays( + predictor_nodes, + binned_left_cat_bitsets, + raw_left_cat_bitsets, + grower_node, + binning_thresholds, + n_bins_non_missing, + next_free_node_idx=0, + next_free_bitset_idx=0, +): + """Helper used in make_predictor to set the TreePredictor fields.""" + node = predictor_nodes[next_free_node_idx] + node["count"] = grower_node.n_samples + node["depth"] = grower_node.depth + if grower_node.split_info is not None: + node["gain"] = grower_node.split_info.gain + else: + node["gain"] = -1 + + node["value"] = grower_node.value + + if grower_node.is_leaf: + # Leaf node + node["is_leaf"] = True + return next_free_node_idx + 1, next_free_bitset_idx + + split_info = grower_node.split_info + feature_idx, bin_idx = split_info.feature_idx, split_info.bin_idx + node["feature_idx"] = feature_idx + node["bin_threshold"] = bin_idx + node["missing_go_to_left"] = split_info.missing_go_to_left + node["is_categorical"] = split_info.is_categorical + + if split_info.bin_idx == n_bins_non_missing[feature_idx] - 1: + # Split is on the last non-missing bin: it's a "split on nans". + # All nans go to the right, the rest go to the left. + # Note: for categorical splits, bin_idx is 0 and we rely on the bitset + node["num_threshold"] = np.inf + elif split_info.is_categorical: + categories = binning_thresholds[feature_idx] + node["bitset_idx"] = next_free_bitset_idx + binned_left_cat_bitsets[next_free_bitset_idx] = split_info.left_cat_bitset + set_raw_bitset_from_binned_bitset( + raw_left_cat_bitsets[next_free_bitset_idx], + split_info.left_cat_bitset, + categories, + ) + next_free_bitset_idx += 1 + else: + node["num_threshold"] = binning_thresholds[feature_idx][bin_idx] + + next_free_node_idx += 1 + + node["left"] = next_free_node_idx + next_free_node_idx, next_free_bitset_idx = _fill_predictor_arrays( + predictor_nodes, + binned_left_cat_bitsets, + raw_left_cat_bitsets, + grower_node.left_child, + binning_thresholds=binning_thresholds, + n_bins_non_missing=n_bins_non_missing, + next_free_node_idx=next_free_node_idx, + next_free_bitset_idx=next_free_bitset_idx, + ) + + node["right"] = next_free_node_idx + return _fill_predictor_arrays( + predictor_nodes, + binned_left_cat_bitsets, + raw_left_cat_bitsets, + grower_node.right_child, + binning_thresholds=binning_thresholds, + n_bins_non_missing=n_bins_non_missing, + next_free_node_idx=next_free_node_idx, + next_free_bitset_idx=next_free_bitset_idx, + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx new file mode 100644 index 0000000000000000000000000000000000000000..e204eec6b97850f696ef61898562bb65bc908ed6 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx @@ -0,0 +1,520 @@ +"""This module contains routines for building histograms.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: 
BSD-3-Clause + +cimport cython +from cython.parallel import prange +from libc.string cimport memset + +import numpy as np + +from .common import HISTOGRAM_DTYPE +from .common cimport hist_struct +from .common cimport X_BINNED_DTYPE_C +from .common cimport G_H_DTYPE_C +from ...utils._typedefs cimport uint8_t + + +# Notes: +# - IN views are read-only, OUT views are write-only +# - In a lot of functions here, we pass feature_idx and the whole 2d +# histograms arrays instead of just histograms[feature_idx]. This is because +# Cython generated C code will have strange Python interactions (likely +# related to the GIL release and the custom histogram dtype) when using 1d +# histogram arrays that come from 2d arrays. +# - The for loops are un-wrapped, for example: +# +# for i in range(n): +# array[i] = i +# +# will become +# +# for i in range(n // 4): +# array[i] = i +# array[i + 1] = i + 1 +# array[i + 2] = i + 2 +# array[i + 3] = i + 3 +# +# This is to hint gcc that it can auto-vectorize these 4 operations and +# perform them all at once. + + +@cython.final +cdef class HistogramBuilder: + """A Histogram builder... used to build histograms. + + A histogram is an array with n_bins entries of type HISTOGRAM_DTYPE. Each + feature has its own histogram. A histogram contains the sum of gradients + and hessians of all the samples belonging to each bin. + + There are different ways to build a histogram: + - by subtraction: hist(child) = hist(parent) - hist(sibling) + - from scratch. In this case we have routines that update the hessians + or not (not useful when hessians are constant for some losses e.g. + least squares). Also, there's a special case for the root which + contains all the samples, leading to some possible optimizations. + Overall all the implementations look the same, and are optimized for + cache hit. + + Parameters + ---------- + X_binned : ndarray of int, shape (n_samples, n_features) + The binned input samples. Must be Fortran-aligned. + n_bins : int + The total number of bins, including the bin for missing values. Used + to define the shape of the histograms. + gradients : ndarray, shape (n_samples,) + The gradients of each training sample. Those are the gradients of the + loss w.r.t the predictions, evaluated at iteration i - 1. + hessians : ndarray, shape (n_samples,) + The hessians of each training sample. Those are the hessians of the + loss w.r.t the predictions, evaluated at iteration i - 1. + hessians_are_constant : bool + Whether hessians are constant. + """ + cdef public: + const X_BINNED_DTYPE_C [::1, :] X_binned + unsigned int n_features + unsigned int n_bins + G_H_DTYPE_C [::1] gradients + G_H_DTYPE_C [::1] hessians + G_H_DTYPE_C [::1] ordered_gradients + G_H_DTYPE_C [::1] ordered_hessians + uint8_t hessians_are_constant + int n_threads + + def __init__(self, const X_BINNED_DTYPE_C [::1, :] X_binned, + unsigned int n_bins, G_H_DTYPE_C [::1] gradients, + G_H_DTYPE_C [::1] hessians, + uint8_t hessians_are_constant, + int n_threads): + + self.X_binned = X_binned + self.n_features = X_binned.shape[1] + # Note: all histograms will have bins, but some of the + # bins may be unused if a feature has a small number of unique values. 
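+        # The last bin (index n_bins - 1) is reserved for missing values.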
+ self.n_bins = n_bins + self.gradients = gradients + self.hessians = hessians + # for root node, gradients and hessians are already ordered + self.ordered_gradients = gradients.copy() + self.ordered_hessians = hessians.copy() + self.hessians_are_constant = hessians_are_constant + self.n_threads = n_threads + + def compute_histograms_brute( + HistogramBuilder self, + const unsigned int [::1] sample_indices, # IN + const unsigned int [:] allowed_features=None, # IN + ): + """Compute the histograms of the node by scanning through all the data. + + For a given feature, the complexity is O(n_samples) + + Parameters + ---------- + sample_indices : array of int, shape (n_samples_at_node,) + The indices of the samples at the node to split. + + allowed_features : None or ndarray, dtype=np.uint32 + Indices of the features that are allowed by interaction constraints to be + split. + + Returns + ------- + histograms : ndarray of HISTOGRAM_DTYPE, shape (n_features, n_bins) + The computed histograms of the current node. + """ + cdef: + int n_samples + int feature_idx + int f_idx + int i + # need local views to avoid python interactions + uint8_t hessians_are_constant = self.hessians_are_constant + int n_allowed_features = self.n_features + G_H_DTYPE_C [::1] ordered_gradients = self.ordered_gradients + G_H_DTYPE_C [::1] gradients = self.gradients + G_H_DTYPE_C [::1] ordered_hessians = self.ordered_hessians + G_H_DTYPE_C [::1] hessians = self.hessians + # Histograms will be initialized to zero later within a prange + hist_struct [:, ::1] histograms = np.empty( + shape=(self.n_features, self.n_bins), + dtype=HISTOGRAM_DTYPE + ) + bint has_interaction_cst = allowed_features is not None + int n_threads = self.n_threads + + if has_interaction_cst: + n_allowed_features = allowed_features.shape[0] + + with nogil: + n_samples = sample_indices.shape[0] + + # Populate ordered_gradients and ordered_hessians. (Already done + # for root) Ordering the gradients and hessians helps to improve + # cache hit. + if sample_indices.shape[0] != gradients.shape[0]: + if hessians_are_constant: + for i in prange(n_samples, schedule='static', + num_threads=n_threads): + ordered_gradients[i] = gradients[sample_indices[i]] + else: + for i in prange(n_samples, schedule='static', + num_threads=n_threads): + ordered_gradients[i] = gradients[sample_indices[i]] + ordered_hessians[i] = hessians[sample_indices[i]] + + # Compute histogram of each feature + for f_idx in prange( + n_allowed_features, schedule='static', num_threads=n_threads + ): + if has_interaction_cst: + feature_idx = allowed_features[f_idx] + else: + feature_idx = f_idx + + self._compute_histogram_brute_single_feature( + feature_idx, sample_indices, histograms + ) + + return histograms + + cdef void _compute_histogram_brute_single_feature( + HistogramBuilder self, + const int feature_idx, + const unsigned int [::1] sample_indices, # IN + hist_struct [:, ::1] histograms) noexcept nogil: # OUT + """Compute the histogram for a given feature.""" + + cdef: + unsigned int n_samples = sample_indices.shape[0] + const X_BINNED_DTYPE_C [::1] X_binned = \ + self.X_binned[:, feature_idx] + unsigned int root_node = X_binned.shape[0] == n_samples + G_H_DTYPE_C [::1] ordered_gradients = \ + self.ordered_gradients[:n_samples] + G_H_DTYPE_C [::1] ordered_hessians = \ + self.ordered_hessians[:n_samples] + uint8_t hessians_are_constant = \ + self.hessians_are_constant + + # Set histograms to zero. 
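+        # Only this feature's row of the 2d histograms array is cleared here.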
+ memset(&histograms[feature_idx, 0], 0, self.n_bins * sizeof(hist_struct)) + + if root_node: + if hessians_are_constant: + _build_histogram_root_no_hessian(feature_idx, X_binned, + ordered_gradients, + histograms) + else: + _build_histogram_root(feature_idx, X_binned, + ordered_gradients, ordered_hessians, + histograms) + else: + if hessians_are_constant: + _build_histogram_no_hessian(feature_idx, + sample_indices, X_binned, + ordered_gradients, histograms) + else: + _build_histogram(feature_idx, sample_indices, + X_binned, ordered_gradients, + ordered_hessians, histograms) + + def compute_histograms_subtraction( + HistogramBuilder self, + hist_struct [:, ::1] parent_histograms, # IN and OUT + hist_struct [:, ::1] sibling_histograms, # IN + const unsigned int [:] allowed_features=None, # IN + ): + """Compute the histograms of the node using the subtraction trick. + + hist(parent) = hist(left_child) + hist(right_child) + + For a given feature, the complexity is O(n_bins). This is much more + efficient than compute_histograms_brute, but it's only possible for one + of the siblings. + + Parameters + ---------- + parent_histograms : ndarray of HISTOGRAM_DTYPE, \ + shape (n_features, n_bins) + The histograms of the parent. + sibling_histograms : ndarray of HISTOGRAM_DTYPE, \ + shape (n_features, n_bins) + The histograms of the sibling. + allowed_features : None or ndarray, dtype=np.uint32 + Indices of the features that are allowed by interaction constraints to be + split. + + Returns + ------- + histograms : ndarray of HISTOGRAM_DTYPE, shape(n_features, n_bins) + The computed histograms of the current node. + We repurpose parent_histograms for this and don't need to allocate new + memory. + """ + + cdef: + int feature_idx + int f_idx + int n_allowed_features = self.n_features + bint has_interaction_cst = allowed_features is not None + int n_threads = self.n_threads + + if has_interaction_cst: + n_allowed_features = allowed_features.shape[0] + + # Compute histogram of each feature + for f_idx in prange(n_allowed_features, schedule='static', nogil=True, + num_threads=n_threads): + if has_interaction_cst: + feature_idx = allowed_features[f_idx] + else: + feature_idx = f_idx + + _subtract_histograms( + feature_idx, + self.n_bins, + parent_histograms, + sibling_histograms, + ) + return parent_histograms + + +cpdef void _build_histogram_naive( + const int feature_idx, + unsigned int [:] sample_indices, # IN + X_BINNED_DTYPE_C [:] binned_feature, # IN + G_H_DTYPE_C [:] ordered_gradients, # IN + G_H_DTYPE_C [:] ordered_hessians, # IN + hist_struct [:, :] out) noexcept nogil: # OUT + """Build histogram in a naive way, without optimizing for cache hit. + + Used in tests to compare with the optimized version.""" + cdef: + unsigned int i + unsigned int n_samples = sample_indices.shape[0] + unsigned int sample_idx + unsigned int bin_idx + + for i in range(n_samples): + sample_idx = sample_indices[i] + bin_idx = binned_feature[sample_idx] + out[feature_idx, bin_idx].sum_gradients += ordered_gradients[i] + out[feature_idx, bin_idx].sum_hessians += ordered_hessians[i] + out[feature_idx, bin_idx].count += 1 + + +cpdef void _subtract_histograms( + const int feature_idx, + unsigned int n_bins, + hist_struct [:, ::1] hist_a, # IN and OUT + hist_struct [:, ::1] hist_b, # IN +) noexcept nogil: # OUT + """compute hist_a = hist_a - hist_b""" + # Note that subtraction of large sums of floating point numbers, as we have here, + # can exhibit catastrophic cancallation. 
This is in particular true for gradients + # as they can be positive and negative, while hessians are non-negative. + # Remember that gradients and hessians are originally computed in + # G_H_DTYPE_C = float32 precision. Therefore, if sum_gradients and sum_hessians are + # float64, we don't loose precision. But if we also used float32 for summation, we + # would need to take care of floating point errors. + # + # Note that we could protect for negative hessians by setting: + # sum_hessians = max(0, sum_hessians) + # But as we use float64 for summing float32, that's veeeery unlikely. + cdef: + unsigned int i = 0 + for i in range(n_bins): + hist_a[feature_idx, i].sum_gradients -= hist_b[feature_idx, i].sum_gradients + hist_a[feature_idx, i].sum_hessians -= hist_b[feature_idx, i].sum_hessians + hist_a[feature_idx, i].count -= hist_b[feature_idx, i].count + + +cpdef void _build_histogram( + const int feature_idx, + const unsigned int [::1] sample_indices, # IN + const X_BINNED_DTYPE_C [::1] binned_feature, # IN + const G_H_DTYPE_C [::1] ordered_gradients, # IN + const G_H_DTYPE_C [::1] ordered_hessians, # IN + hist_struct [:, ::1] out) noexcept nogil: # OUT + """Return histogram for a given feature.""" + cdef: + unsigned int i = 0 + unsigned int n_node_samples = sample_indices.shape[0] + unsigned int unrolled_upper = (n_node_samples // 4) * 4 + + unsigned int bin_0 + unsigned int bin_1 + unsigned int bin_2 + unsigned int bin_3 + unsigned int bin_idx + + for i in range(0, unrolled_upper, 4): + bin_0 = binned_feature[sample_indices[i]] + bin_1 = binned_feature[sample_indices[i + 1]] + bin_2 = binned_feature[sample_indices[i + 2]] + bin_3 = binned_feature[sample_indices[i + 3]] + + out[feature_idx, bin_0].sum_gradients += ordered_gradients[i] + out[feature_idx, bin_1].sum_gradients += ordered_gradients[i + 1] + out[feature_idx, bin_2].sum_gradients += ordered_gradients[i + 2] + out[feature_idx, bin_3].sum_gradients += ordered_gradients[i + 3] + + out[feature_idx, bin_0].sum_hessians += ordered_hessians[i] + out[feature_idx, bin_1].sum_hessians += ordered_hessians[i + 1] + out[feature_idx, bin_2].sum_hessians += ordered_hessians[i + 2] + out[feature_idx, bin_3].sum_hessians += ordered_hessians[i + 3] + + out[feature_idx, bin_0].count += 1 + out[feature_idx, bin_1].count += 1 + out[feature_idx, bin_2].count += 1 + out[feature_idx, bin_3].count += 1 + + for i in range(unrolled_upper, n_node_samples): + bin_idx = binned_feature[sample_indices[i]] + out[feature_idx, bin_idx].sum_gradients += ordered_gradients[i] + out[feature_idx, bin_idx].sum_hessians += ordered_hessians[i] + out[feature_idx, bin_idx].count += 1 + + +cpdef void _build_histogram_no_hessian( + const int feature_idx, + const unsigned int [::1] sample_indices, # IN + const X_BINNED_DTYPE_C [::1] binned_feature, # IN + const G_H_DTYPE_C [::1] ordered_gradients, # IN + hist_struct [:, ::1] out) noexcept nogil: # OUT + """Return histogram for a given feature, not updating hessians. + + Used when the hessians of the loss are constant (typically LS loss). 
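+    In that case every sample contributes the same constant hessian, so only
+    the gradients and the per-bin counts need to be accumulated.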
+ """ + + cdef: + unsigned int i = 0 + unsigned int n_node_samples = sample_indices.shape[0] + unsigned int unrolled_upper = (n_node_samples // 4) * 4 + + unsigned int bin_0 + unsigned int bin_1 + unsigned int bin_2 + unsigned int bin_3 + unsigned int bin_idx + + for i in range(0, unrolled_upper, 4): + bin_0 = binned_feature[sample_indices[i]] + bin_1 = binned_feature[sample_indices[i + 1]] + bin_2 = binned_feature[sample_indices[i + 2]] + bin_3 = binned_feature[sample_indices[i + 3]] + + out[feature_idx, bin_0].sum_gradients += ordered_gradients[i] + out[feature_idx, bin_1].sum_gradients += ordered_gradients[i + 1] + out[feature_idx, bin_2].sum_gradients += ordered_gradients[i + 2] + out[feature_idx, bin_3].sum_gradients += ordered_gradients[i + 3] + + out[feature_idx, bin_0].count += 1 + out[feature_idx, bin_1].count += 1 + out[feature_idx, bin_2].count += 1 + out[feature_idx, bin_3].count += 1 + + for i in range(unrolled_upper, n_node_samples): + bin_idx = binned_feature[sample_indices[i]] + out[feature_idx, bin_idx].sum_gradients += ordered_gradients[i] + out[feature_idx, bin_idx].count += 1 + + +cpdef void _build_histogram_root( + const int feature_idx, + const X_BINNED_DTYPE_C [::1] binned_feature, # IN + const G_H_DTYPE_C [::1] all_gradients, # IN + const G_H_DTYPE_C [::1] all_hessians, # IN + hist_struct [:, ::1] out) noexcept nogil: # OUT + """Compute histogram of the root node. + + Unlike other nodes, the root node has to find the split among *all* the + samples from the training set. binned_feature and all_gradients / + all_hessians already have a consistent ordering. + """ + + cdef: + unsigned int i = 0 + unsigned int n_samples = binned_feature.shape[0] + unsigned int unrolled_upper = (n_samples // 4) * 4 + + unsigned int bin_0 + unsigned int bin_1 + unsigned int bin_2 + unsigned int bin_3 + unsigned int bin_idx + + for i in range(0, unrolled_upper, 4): + + bin_0 = binned_feature[i] + bin_1 = binned_feature[i + 1] + bin_2 = binned_feature[i + 2] + bin_3 = binned_feature[i + 3] + + out[feature_idx, bin_0].sum_gradients += all_gradients[i] + out[feature_idx, bin_1].sum_gradients += all_gradients[i + 1] + out[feature_idx, bin_2].sum_gradients += all_gradients[i + 2] + out[feature_idx, bin_3].sum_gradients += all_gradients[i + 3] + + out[feature_idx, bin_0].sum_hessians += all_hessians[i] + out[feature_idx, bin_1].sum_hessians += all_hessians[i + 1] + out[feature_idx, bin_2].sum_hessians += all_hessians[i + 2] + out[feature_idx, bin_3].sum_hessians += all_hessians[i + 3] + + out[feature_idx, bin_0].count += 1 + out[feature_idx, bin_1].count += 1 + out[feature_idx, bin_2].count += 1 + out[feature_idx, bin_3].count += 1 + + for i in range(unrolled_upper, n_samples): + bin_idx = binned_feature[i] + out[feature_idx, bin_idx].sum_gradients += all_gradients[i] + out[feature_idx, bin_idx].sum_hessians += all_hessians[i] + out[feature_idx, bin_idx].count += 1 + + +cpdef void _build_histogram_root_no_hessian( + const int feature_idx, + const X_BINNED_DTYPE_C [::1] binned_feature, # IN + const G_H_DTYPE_C [::1] all_gradients, # IN + hist_struct [:, ::1] out) noexcept nogil: # OUT + """Compute histogram of the root node, not updating hessians. + + Used when the hessians of the loss are constant (typically LS loss). 
+ """ + + cdef: + unsigned int i = 0 + unsigned int n_samples = binned_feature.shape[0] + unsigned int unrolled_upper = (n_samples // 4) * 4 + + unsigned int bin_0 + unsigned int bin_1 + unsigned int bin_2 + unsigned int bin_3 + unsigned int bin_idx + + for i in range(0, unrolled_upper, 4): + bin_0 = binned_feature[i] + bin_1 = binned_feature[i + 1] + bin_2 = binned_feature[i + 2] + bin_3 = binned_feature[i + 3] + + out[feature_idx, bin_0].sum_gradients += all_gradients[i] + out[feature_idx, bin_1].sum_gradients += all_gradients[i + 1] + out[feature_idx, bin_2].sum_gradients += all_gradients[i + 2] + out[feature_idx, bin_3].sum_gradients += all_gradients[i + 3] + + out[feature_idx, bin_0].count += 1 + out[feature_idx, bin_1].count += 1 + out[feature_idx, bin_2].count += 1 + out[feature_idx, bin_3].count += 1 + + for i in range(unrolled_upper, n_samples): + bin_idx = binned_feature[i] + out[feature_idx, bin_idx].sum_gradients += all_gradients[i] + out[feature_idx, bin_idx].count += 1 diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/meson.build b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..122a2102800f38f111af48d9ce009505dd5308ee --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/meson.build @@ -0,0 +1,20 @@ +hist_gradient_boosting_extension_metadata = { + '_gradient_boosting': {'sources': [cython_gen.process('_gradient_boosting.pyx')], + 'dependencies': [openmp_dep]}, + 'histogram': {'sources': [cython_gen.process('histogram.pyx')], 'dependencies': [openmp_dep]}, + 'splitting': {'sources': [cython_gen.process('splitting.pyx')], 'dependencies': [openmp_dep]}, + '_binning': {'sources': [cython_gen.process('_binning.pyx')], 'dependencies': [openmp_dep]}, + '_predictor': {'sources': [cython_gen.process('_predictor.pyx')], 'dependencies': [openmp_dep]}, + '_bitset': {'sources': [cython_gen.process('_bitset.pyx')]}, + 'common': {'sources': [cython_gen.process('common.pyx')]}, +} + +foreach ext_name, ext_dict : hist_gradient_boosting_extension_metadata + py.extension_module( + ext_name, + ext_dict.get('sources'), + dependencies: ext_dict.get('dependencies', []), + subdir: 'sklearn/ensemble/_hist_gradient_boosting', + install: true + ) +endforeach diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/predictor.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/predictor.py new file mode 100644 index 0000000000000000000000000000000000000000..59bb6499c450114db3171342d7bb97111db64b81 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/predictor.py @@ -0,0 +1,146 @@ +""" +This module contains the TreePredictor class which is used for prediction. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numpy as np + +from ._predictor import ( + _compute_partial_dependence, + _predict_from_binned_data, + _predict_from_raw_data, +) +from .common import PREDICTOR_RECORD_DTYPE, Y_DTYPE + + +class TreePredictor: + """Tree class used for predictions. + + Parameters + ---------- + nodes : ndarray of PREDICTOR_RECORD_DTYPE + The nodes of the tree. + binned_left_cat_bitsets : ndarray of shape (n_categorical_splits, 8), dtype=uint32 + Array of bitsets for binned categories used in predict_binned when a + split is categorical. 
+ raw_left_cat_bitsets : ndarray of shape (n_categorical_splits, 8), dtype=uint32 + Array of bitsets for raw categories used in predict when a split is + categorical. + """ + + def __init__(self, nodes, binned_left_cat_bitsets, raw_left_cat_bitsets): + self.nodes = nodes + self.binned_left_cat_bitsets = binned_left_cat_bitsets + self.raw_left_cat_bitsets = raw_left_cat_bitsets + + def get_n_leaf_nodes(self): + """Return number of leaves.""" + return int(self.nodes["is_leaf"].sum()) + + def get_max_depth(self): + """Return maximum depth among all leaves.""" + return int(self.nodes["depth"].max()) + + def predict(self, X, known_cat_bitsets, f_idx_map, n_threads): + """Predict raw values for non-binned data. + + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + The input samples. + + known_cat_bitsets : ndarray of shape (n_categorical_features, 8) + Array of bitsets of known categories, for each categorical feature. + + f_idx_map : ndarray of shape (n_features,) + Map from original feature index to the corresponding index in the + known_cat_bitsets array. + + n_threads : int + Number of OpenMP threads to use. + + Returns + ------- + y : ndarray, shape (n_samples,) + The raw predicted values. + """ + out = np.empty(X.shape[0], dtype=Y_DTYPE) + + _predict_from_raw_data( + self.nodes, + X, + self.raw_left_cat_bitsets, + known_cat_bitsets, + f_idx_map, + n_threads, + out, + ) + return out + + def predict_binned(self, X, missing_values_bin_idx, n_threads): + """Predict raw values for binned data. + + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + The input samples. + missing_values_bin_idx : uint8 + Index of the bin that is used for missing values. This is the + index of the last bin and is always equal to max_bins (as passed + to the GBDT classes), or equivalently to n_bins - 1. + n_threads : int + Number of OpenMP threads to use. + + Returns + ------- + y : ndarray, shape (n_samples,) + The raw predicted values. + """ + out = np.empty(X.shape[0], dtype=Y_DTYPE) + _predict_from_binned_data( + self.nodes, + X, + self.binned_left_cat_bitsets, + missing_values_bin_idx, + n_threads, + out, + ) + return out + + def compute_partial_dependence(self, grid, target_features, out): + """Fast partial dependence computation. + + Parameters + ---------- + grid : ndarray, shape (n_samples, n_target_features) + The grid points on which the partial dependence should be + evaluated. + target_features : ndarray, shape (n_target_features) + The set of target features for which the partial dependence + should be evaluated. + out : ndarray, shape (n_samples) + The value of the partial dependence function on each grid + point. + """ + _compute_partial_dependence(self.nodes, grid, target_features, out) + + def __setstate__(self, state): + try: + super().__setstate__(state) + except AttributeError: + self.__dict__.update(state) + + # The dtype of feature_idx is np.intp which is platform dependent. Here, we + # make sure that saving and loading on different bitness systems works without + # errors. For instance, on a 64 bit Python runtime, np.intp = np.int64, + # while on 32 bit np.intp = np.int32. + # + # TODO: consider always using platform agnostic dtypes for fitted + # estimator attributes. For this particular estimator, this would + # mean replacing the intp field of PREDICTOR_RECORD_DTYPE by an int32 + # field. Ideally this should be done consistently throughout + # scikit-learn along with a common test. 
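+        # Illustration (hypothetical scenario): a predictor pickled on a
+        # 64-bit build stores the intp-based field as int64; when that pickle
+        # is loaded on a 32-bit build, PREDICTOR_RECORD_DTYPE uses int32, the
+        # dtype comparison below fails, and the nodes array is converted to
+        # the local dtype.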
+ if self.nodes.dtype != PREDICTOR_RECORD_DTYPE: + self.nodes = self.nodes.astype(PREDICTOR_RECORD_DTYPE, casting="same_kind") diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx new file mode 100644 index 0000000000000000000000000000000000000000..c4cb22067cf376294e16067fe6c62dfbe215fb90 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -0,0 +1,1201 @@ +"""This module contains routines and data structures to: + +- Find the best possible split of a node. For a given node, a split is + characterized by a feature and a bin. +- Apply a split to a node, i.e. split the indices of the samples at the node + into the newly created left and right children. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +cimport cython +from cython.parallel import prange +import numpy as np +from libc.math cimport INFINITY, ceil +from libc.stdlib cimport malloc, free, qsort +from libc.string cimport memcpy + +from ...utils._typedefs cimport uint8_t +from .common cimport X_BINNED_DTYPE_C +from .common cimport Y_DTYPE_C +from .common cimport hist_struct +from .common cimport BITSET_INNER_DTYPE_C +from .common cimport BITSET_DTYPE_C +from .common cimport MonotonicConstraint +from ._bitset cimport init_bitset +from ._bitset cimport set_bitset +from ._bitset cimport in_bitset + + +cdef struct split_info_struct: + # Same as the SplitInfo class, but we need a C struct to use it in the + # nogil sections and to use in arrays. + Y_DTYPE_C gain + int feature_idx + unsigned int bin_idx + uint8_t missing_go_to_left + Y_DTYPE_C sum_gradient_left + Y_DTYPE_C sum_gradient_right + Y_DTYPE_C sum_hessian_left + Y_DTYPE_C sum_hessian_right + unsigned int n_samples_left + unsigned int n_samples_right + Y_DTYPE_C value_left + Y_DTYPE_C value_right + uint8_t is_categorical + BITSET_DTYPE_C left_cat_bitset + + +# used in categorical splits for sorting categories by increasing values of +# sum_gradients / sum_hessians +cdef struct categorical_info: + X_BINNED_DTYPE_C bin_idx + Y_DTYPE_C value + + +class SplitInfo: + """Pure data class to store information about a potential split. + + Parameters + ---------- + gain : float + The gain of the split. + feature_idx : int + The index of the feature to be split. + bin_idx : int + The index of the bin on which the split is made. Should be ignored if + `is_categorical` is True: `left_cat_bitset` will be used to determine + the split. + missing_go_to_left : bool + Whether missing values should go to the left child. This is used + whether the split is categorical or not. + sum_gradient_left : float + The sum of the gradients of all the samples in the left child. + sum_hessian_left : float + The sum of the hessians of all the samples in the left child. + sum_gradient_right : float + The sum of the gradients of all the samples in the right child. + sum_hessian_right : float + The sum of the hessians of all the samples in the right child. + n_samples_left : int, default=0 + The number of samples in the left child. + n_samples_right : int + The number of samples in the right child. + is_categorical : bool + Whether the split is done on a categorical feature. + left_cat_bitset : ndarray of shape=(8,), dtype=uint32 or None + Bitset representing the categories that go to the left. This is used + only when `is_categorical` is True. 
+ Note that missing values are part of that bitset if there are missing + values in the training data. For missing values, we rely on that + bitset for splitting, but at prediction time, we rely on + missing_go_to_left. + """ + def __init__(self, gain, feature_idx, bin_idx, + missing_go_to_left, sum_gradient_left, sum_hessian_left, + sum_gradient_right, sum_hessian_right, n_samples_left, + n_samples_right, value_left, value_right, + is_categorical, left_cat_bitset): + self.gain = gain + self.feature_idx = feature_idx + self.bin_idx = bin_idx + self.missing_go_to_left = missing_go_to_left + self.sum_gradient_left = sum_gradient_left + self.sum_hessian_left = sum_hessian_left + self.sum_gradient_right = sum_gradient_right + self.sum_hessian_right = sum_hessian_right + self.n_samples_left = n_samples_left + self.n_samples_right = n_samples_right + self.value_left = value_left + self.value_right = value_right + self.is_categorical = is_categorical + self.left_cat_bitset = left_cat_bitset + + +@cython.final +cdef class Splitter: + """Splitter used to find the best possible split at each node. + + A split (see SplitInfo) is characterized by a feature and a bin. + + The Splitter is also responsible for partitioning the samples among the + leaves of the tree (see split_indices() and the partition attribute). + + Parameters + ---------- + X_binned : ndarray of int, shape (n_samples, n_features) + The binned input samples. Must be Fortran-aligned. + n_bins_non_missing : ndarray, shape (n_features,) + For each feature, gives the number of bins actually used for + non-missing values. + missing_values_bin_idx : uint8 + Index of the bin that is used for missing values. This is the index of + the last bin and is always equal to max_bins (as passed to the GBDT + classes), or equivalently to n_bins - 1. + has_missing_values : ndarray, shape (n_features,) + Whether missing values were observed in the training data, for each + feature. + is_categorical : ndarray of bool of shape (n_features,) + Indicates categorical features. + monotonic_cst : ndarray of int of shape (n_features,), dtype=int + Indicates the monotonic constraint to enforce on each feature. + - 1: monotonic increase + - 0: no constraint + - -1: monotonic decrease + + Read more in the :ref:`User Guide `. + l2_regularization : float + The L2 regularization parameter. + min_hessian_to_split : float, default=1e-3 + The minimum sum of hessians needed in each node. Splits that result in + at least one child having a sum of hessians less than + min_hessian_to_split are discarded. + min_samples_leaf : int, default=20 + The minimum number of samples per leaf. + min_gain_to_split : float, default=0.0 + The minimum gain needed to split a node. Splits with lower gain will + be ignored. + hessians_are_constant: bool, default is False + Whether hessians are constant. + feature_fraction_per_split : float, default=1 + Proportion of randomly chosen features in each and every node split. + This is a form of regularization, smaller values make the trees weaker + learners and might prevent overfitting. + rng : Generator + n_threads : int, default=1 + Number of OpenMP threads to use. 
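+
+    A minimal sketch of how the splitter is driven by the tree grower
+    (illustration only; the variable names are placeholders)::
+
+        split_info = splitter.find_node_split(
+            n_samples, histograms, sum_gradients, sum_hessians, value)
+        if split_info.gain > 0:
+            left, right, right_pos = splitter.split_indices(
+                split_info, sample_indices)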
+ """ + cdef public: + const X_BINNED_DTYPE_C [::1, :] X_binned + unsigned int n_features + const unsigned int [::1] n_bins_non_missing + uint8_t missing_values_bin_idx + const uint8_t [::1] has_missing_values + const uint8_t [::1] is_categorical + const signed char [::1] monotonic_cst + uint8_t hessians_are_constant + Y_DTYPE_C l2_regularization + Y_DTYPE_C min_hessian_to_split + unsigned int min_samples_leaf + Y_DTYPE_C min_gain_to_split + Y_DTYPE_C feature_fraction_per_split + rng + + unsigned int [::1] partition + unsigned int [::1] left_indices_buffer + unsigned int [::1] right_indices_buffer + int n_threads + + def __init__(self, + const X_BINNED_DTYPE_C [::1, :] X_binned, + const unsigned int [::1] n_bins_non_missing, + const uint8_t missing_values_bin_idx, + const uint8_t [::1] has_missing_values, + const uint8_t [::1] is_categorical, + const signed char [::1] monotonic_cst, + Y_DTYPE_C l2_regularization, + Y_DTYPE_C min_hessian_to_split=1e-3, + unsigned int min_samples_leaf=20, + Y_DTYPE_C min_gain_to_split=0., + uint8_t hessians_are_constant=False, + Y_DTYPE_C feature_fraction_per_split=1.0, + rng=np.random.RandomState(), + unsigned int n_threads=1): + + self.X_binned = X_binned + self.n_features = X_binned.shape[1] + self.n_bins_non_missing = n_bins_non_missing + self.missing_values_bin_idx = missing_values_bin_idx + self.has_missing_values = has_missing_values + self.is_categorical = is_categorical + self.monotonic_cst = monotonic_cst + self.l2_regularization = l2_regularization + self.min_hessian_to_split = min_hessian_to_split + self.min_samples_leaf = min_samples_leaf + self.min_gain_to_split = min_gain_to_split + self.hessians_are_constant = hessians_are_constant + self.feature_fraction_per_split = feature_fraction_per_split + self.rng = rng + self.n_threads = n_threads + + # The partition array maps each sample index into the leaves of the + # tree (a leaf in this context is a node that isn't split yet, not + # necessarily a 'finalized' leaf). Initially, the root contains all + # the indices, e.g.: + # partition = [abcdefghijkl] + # After a call to split_indices, it may look e.g. like this: + # partition = [cef|abdghijkl] + # we have 2 leaves, the left one is at position 0 and the second one at + # position 3. The order of the samples is irrelevant. + self.partition = np.arange(X_binned.shape[0], dtype=np.uint32) + # buffers used in split_indices to support parallel splitting. + self.left_indices_buffer = np.empty_like(self.partition) + self.right_indices_buffer = np.empty_like(self.partition) + + def split_indices(Splitter self, split_info, unsigned int [::1] + sample_indices): + """Split samples into left and right arrays. + + The split is performed according to the best possible split + (split_info). + + Ultimately, this is nothing but a partition of the sample_indices + array with a given pivot, exactly like a quicksort subroutine. + + Parameters + ---------- + split_info : SplitInfo + The SplitInfo of the node to split. + sample_indices : ndarray of unsigned int, shape (n_samples_at_node,) + The indices of the samples at the node to split. This is a view + on self.partition, and it is modified inplace by placing the + indices of the left child at the beginning, and the indices of + the right child at the end. + + Returns + ------- + left_indices : ndarray of int, shape (n_left_samples,) + The indices of the samples in the left child. This is a view on + self.partition. 
+ right_indices : ndarray of int, shape (n_right_samples,) + The indices of the samples in the right child. This is a view on + self.partition. + right_child_position : int + The position of the right child in ``sample_indices``. + """ + # This is a multi-threaded implementation inspired by lightgbm. Here + # is a quick break down. Let's suppose we want to split a node with 24 + # samples named from a to x. self.partition looks like this (the * are + # indices in other leaves that we don't care about): + # partition = [*************abcdefghijklmnopqrstuvwx****************] + # ^ ^ + # node_position node_position + node.n_samples + + # Ultimately, we want to reorder the samples inside the boundaries of + # the leaf (which becomes a node) to now represent the samples in its + # left and right child. For example: + # partition = [*************abefilmnopqrtuxcdghjksvw*****************] + # ^ ^ + # left_child_pos right_child_pos + # Note that left_child_pos always takes the value of node_position, + # and right_child_pos = left_child_pos + left_child.n_samples. The + # order of the samples inside a leaf is irrelevant. + + # 1. sample_indices is a view on this region a..x. We conceptually + # divide it into n_threads regions. Each thread will be responsible + # for its own region. Here is an example with 4 threads: + # sample_indices = [abcdef|ghijkl|mnopqr|stuvwx] + # 2. Each thread processes 6 = 24 // 4 entries and maps them into + # left_indices_buffer or right_indices_buffer. For example, we could + # have the following mapping ('.' denotes an undefined entry): + # - left_indices_buffer = [abef..|il....|mnopqr|tux...] + # - right_indices_buffer = [cd....|ghjk..|......|svw...] + # 3. We keep track of the start positions of the regions (the '|') in + # ``offset_in_buffers`` as well as the size of each region. We also + # keep track of the number of samples put into the left/right child + # by each thread. Concretely: + # - left_counts = [4, 2, 6, 3] + # - right_counts = [2, 4, 0, 3] + # 4. Finally, we put left/right_indices_buffer back into the + # sample_indices, without any undefined entries and the partition + # looks as expected + # partition = [*************abefilmnopqrtuxcdghjksvw***************] + + # Note: We here show left/right_indices_buffer as being the same size + # as sample_indices for simplicity, but in reality they are of the + # same size as partition. 
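+
+        # Single-threaded reference (illustration only): up to the order of
+        # samples inside each child, which is irrelevant here, the net effect
+        # on sample_indices is equivalent to
+        #
+        #     left = [i for i in sample_indices if goes_left(i)]
+        #     right = [i for i in sample_indices if not goes_left(i)]
+        #     sample_indices[:] = left + right
+        #     right_child_position = len(left)
+        #
+        # where goes_left() stands for the sample_goes_left() helper defined
+        # at the end of this module.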
+ + cdef: + int n_samples = sample_indices.shape[0] + X_BINNED_DTYPE_C bin_idx = split_info.bin_idx + uint8_t missing_go_to_left = split_info.missing_go_to_left + uint8_t missing_values_bin_idx = self.missing_values_bin_idx + int feature_idx = split_info.feature_idx + const X_BINNED_DTYPE_C [::1] X_binned = \ + self.X_binned[:, feature_idx] + unsigned int [::1] left_indices_buffer = self.left_indices_buffer + unsigned int [::1] right_indices_buffer = self.right_indices_buffer + uint8_t is_categorical = split_info.is_categorical + # Cython is unhappy if we set left_cat_bitset to + # split_info.left_cat_bitset directly, so we need a tmp var + BITSET_INNER_DTYPE_C [:] cat_bitset_tmp = split_info.left_cat_bitset + BITSET_DTYPE_C left_cat_bitset + int n_threads = self.n_threads + + int [:] sizes = np.full(n_threads, n_samples // n_threads, + dtype=np.int32) + int [:] offset_in_buffers = np.zeros(n_threads, dtype=np.int32) + int [:] left_counts = np.empty(n_threads, dtype=np.int32) + int [:] right_counts = np.empty(n_threads, dtype=np.int32) + int left_count + int right_count + int start + int stop + int i + int thread_idx + int sample_idx + int right_child_position + uint8_t turn_left + int [:] left_offset = np.zeros(n_threads, dtype=np.int32) + int [:] right_offset = np.zeros(n_threads, dtype=np.int32) + + # only set left_cat_bitset when is_categorical is True + if is_categorical: + left_cat_bitset = &cat_bitset_tmp[0] + + with nogil: + for thread_idx in range(n_samples % n_threads): + sizes[thread_idx] += 1 + + for thread_idx in range(1, n_threads): + offset_in_buffers[thread_idx] = \ + offset_in_buffers[thread_idx - 1] + sizes[thread_idx - 1] + + # map indices from sample_indices to left/right_indices_buffer + for thread_idx in prange(n_threads, schedule='static', + chunksize=1, num_threads=n_threads): + left_count = 0 + right_count = 0 + + start = offset_in_buffers[thread_idx] + stop = start + sizes[thread_idx] + for i in range(start, stop): + sample_idx = sample_indices[i] + turn_left = sample_goes_left( + missing_go_to_left, + missing_values_bin_idx, bin_idx, + X_binned[sample_idx], is_categorical, + left_cat_bitset) + + if turn_left: + left_indices_buffer[start + left_count] = sample_idx + left_count = left_count + 1 + else: + right_indices_buffer[start + right_count] = sample_idx + right_count = right_count + 1 + + left_counts[thread_idx] = left_count + right_counts[thread_idx] = right_count + + # position of right child = just after the left child + right_child_position = 0 + for thread_idx in range(n_threads): + right_child_position += left_counts[thread_idx] + + # offset of each thread in sample_indices for left and right + # child, i.e. where each thread will start to write. + right_offset[0] = right_child_position + for thread_idx in range(1, n_threads): + left_offset[thread_idx] = \ + left_offset[thread_idx - 1] + left_counts[thread_idx - 1] + right_offset[thread_idx] = \ + right_offset[thread_idx - 1] + right_counts[thread_idx - 1] + + # map indices in left/right_indices_buffer back into + # sample_indices. This also updates self.partition since + # sample_indices is a view. + for thread_idx in prange(n_threads, schedule='static', + chunksize=1, num_threads=n_threads): + memcpy( + &sample_indices[left_offset[thread_idx]], + &left_indices_buffer[offset_in_buffers[thread_idx]], + sizeof(unsigned int) * left_counts[thread_idx] + ) + if right_counts[thread_idx] > 0: + # If we're splitting the rightmost node of the tree, i.e. 
the + # rightmost node in the partition array, and if n_threads >= 2, one + # might have right_counts[-1] = 0 and right_offset[-1] = len(sample_indices) + # leading to evaluating + # + # &sample_indices[right_offset[-1]] = &samples_indices[n_samples_at_node] + # = &partition[n_samples_in_tree] + # + # which is an out-of-bounds read access that can cause a segmentation fault. + # When boundscheck=True, removing this check produces this exception: + # + # IndexError: Out of bounds on buffer access + # + memcpy( + &sample_indices[right_offset[thread_idx]], + &right_indices_buffer[offset_in_buffers[thread_idx]], + sizeof(unsigned int) * right_counts[thread_idx] + ) + + return (sample_indices[:right_child_position], + sample_indices[right_child_position:], + right_child_position) + + def find_node_split( + Splitter self, + unsigned int n_samples, + hist_struct [:, ::1] histograms, # IN + const Y_DTYPE_C sum_gradients, + const Y_DTYPE_C sum_hessians, + const Y_DTYPE_C value, + const Y_DTYPE_C lower_bound=-INFINITY, + const Y_DTYPE_C upper_bound=INFINITY, + const unsigned int [:] allowed_features=None, + ): + """For each feature, find the best bin to split on at a given node. + + Return the best split info among all features. + + Parameters + ---------- + n_samples : int + The number of samples at the node. + histograms : ndarray of HISTOGRAM_DTYPE of \ + shape (n_features, max_bins) + The histograms of the current node. + sum_gradients : float + The sum of the gradients for each sample at the node. + sum_hessians : float + The sum of the hessians for each sample at the node. + value : float + The bounded value of the current node. We directly pass the value + instead of re-computing it from sum_gradients and sum_hessians, + because we need to compute the loss and the gain based on the + *bounded* value: computing the value from + sum_gradients / sum_hessians would give the unbounded value, and + the interaction with min_gain_to_split would not be correct + anymore. Side note: we can't use the lower_bound / upper_bound + parameters either because these refer to the bounds of the + children, not the bounds of the current node. + lower_bound : float + Lower bound for the children values for respecting the monotonic + constraints. + upper_bound : float + Upper bound for the children values for respecting the monotonic + constraints. + allowed_features : None or ndarray, dtype=np.uint32 + Indices of the features that are allowed by interaction constraints to be + split. + + Returns + ------- + best_split_info : SplitInfo + The info about the best possible split among all features. + """ + cdef: + int feature_idx + int split_info_idx + int best_split_info_idx + int n_allowed_features + split_info_struct split_info + split_info_struct * split_infos + const uint8_t [::1] has_missing_values = self.has_missing_values + const uint8_t [::1] is_categorical = self.is_categorical + const signed char [::1] monotonic_cst = self.monotonic_cst + int n_threads = self.n_threads + bint has_interaction_cst = False + Y_DTYPE_C feature_fraction_per_split = self.feature_fraction_per_split + uint8_t [:] subsample_mask # same as npy_bool + int n_subsampled_features + + has_interaction_cst = allowed_features is not None + if has_interaction_cst: + n_allowed_features = allowed_features.shape[0] + else: + n_allowed_features = self.n_features + + if feature_fraction_per_split < 1.0: + # We do all random sampling before the nogil and make sure that we sample + # exactly n_subsampled_features >= 1 features. 
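+            # Worked example (illustration): with n_allowed_features = 5 and
+            # feature_fraction_per_split = 0.5, ceil(0.5 * 5) = 3, so exactly
+            # 3 entries of subsample_mask are True after the shuffle, e.g.
+            # [True, False, True, True, False].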
+ n_subsampled_features = max( + 1, + int(ceil(feature_fraction_per_split * n_allowed_features)), + ) + subsample_mask_arr = np.full(n_allowed_features, False) + subsample_mask_arr[:n_subsampled_features] = True + self.rng.shuffle(subsample_mask_arr) + # https://github.com/numpy/numpy/issues/18273 + subsample_mask = subsample_mask_arr + + with nogil: + + split_infos = malloc( + n_allowed_features * sizeof(split_info_struct)) + + # split_info_idx is index of split_infos of size n_allowed_features. + # features_idx is the index of the feature column in X. + for split_info_idx in prange(n_allowed_features, schedule='static', + num_threads=n_threads): + if has_interaction_cst: + feature_idx = allowed_features[split_info_idx] + else: + feature_idx = split_info_idx + + split_infos[split_info_idx].feature_idx = feature_idx + + # For each feature, find best bin to split on + # Start with a gain of -1 if no better split is found, that + # means one of the constraints isn't respected + # (min_samples_leaf, etc.) and the grower will later turn the + # node into a leaf. + split_infos[split_info_idx].gain = -1 + split_infos[split_info_idx].is_categorical = is_categorical[feature_idx] + + # Note that subsample_mask is indexed by split_info_idx and not by + # feature_idx because we only need to exclude the same features again + # and again. We do NOT need to access the features directly by using + # allowed_features. + if feature_fraction_per_split < 1.0 and not subsample_mask[split_info_idx]: + continue + + if is_categorical[feature_idx]: + self._find_best_bin_to_split_category( + feature_idx, has_missing_values[feature_idx], + histograms, n_samples, sum_gradients, sum_hessians, + value, monotonic_cst[feature_idx], lower_bound, + upper_bound, &split_infos[split_info_idx]) + else: + # We will scan bins from left to right (in all cases), and + # if there are any missing values, we will also scan bins + # from right to left. This way, we can consider whichever + # case yields the best gain: either missing values go to + # the right (left to right scan) or to the left (right to + # left case). See algo 3 from the XGBoost paper + # https://arxiv.org/abs/1603.02754 + # Note: for the categorical features above, this isn't + # needed since missing values are considered a native + # category. 
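+                    # Small example (illustration): with non-missing bins
+                    # b0, b1, b2 and a missing-values bin, the left-to-right
+                    # scan considers {b0 | b1 b2 nan}, {b0 b1 | b2 nan} and
+                    # {b0 b1 b2 | nan}, while the right-to-left scan adds
+                    # {nan b0 b1 | b2} and {nan b0 | b1 b2}, so both possible
+                    # destinations for the missing values are evaluated.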
+ self._find_best_bin_to_split_left_to_right( + feature_idx, has_missing_values[feature_idx], + histograms, n_samples, sum_gradients, sum_hessians, + value, monotonic_cst[feature_idx], + lower_bound, upper_bound, &split_infos[split_info_idx]) + + if has_missing_values[feature_idx]: + # We need to explore both directions to check whether + # sending the nans to the left child would lead to a higher + # gain + self._find_best_bin_to_split_right_to_left( + feature_idx, histograms, n_samples, + sum_gradients, sum_hessians, + value, monotonic_cst[feature_idx], + lower_bound, upper_bound, &split_infos[split_info_idx]) + + # then compute best possible split among all features + # split_info is set to the best of split_infos + best_split_info_idx = self._find_best_feature_to_split_helper( + split_infos, n_allowed_features + ) + split_info = split_infos[best_split_info_idx] + + out = SplitInfo( + split_info.gain, + split_info.feature_idx, + split_info.bin_idx, + split_info.missing_go_to_left, + split_info.sum_gradient_left, + split_info.sum_hessian_left, + split_info.sum_gradient_right, + split_info.sum_hessian_right, + split_info.n_samples_left, + split_info.n_samples_right, + split_info.value_left, + split_info.value_right, + split_info.is_categorical, + None, # left_cat_bitset will only be set if the split is categorical + ) + # Only set bitset if the split is categorical + if split_info.is_categorical: + out.left_cat_bitset = np.asarray(split_info.left_cat_bitset, dtype=np.uint32) + + free(split_infos) + return out + + cdef int _find_best_feature_to_split_helper( + self, + split_info_struct * split_infos, # IN + int n_allowed_features, + ) noexcept nogil: + """Return the index of split_infos with the best feature split.""" + cdef: + int split_info_idx + int best_split_info_idx = 0 + + for split_info_idx in range(1, n_allowed_features): + if (split_infos[split_info_idx].gain > split_infos[best_split_info_idx].gain): + best_split_info_idx = split_info_idx + return best_split_info_idx + + cdef void _find_best_bin_to_split_left_to_right( + Splitter self, + unsigned int feature_idx, + uint8_t has_missing_values, + const hist_struct [:, ::1] histograms, # IN + unsigned int n_samples, + Y_DTYPE_C sum_gradients, + Y_DTYPE_C sum_hessians, + Y_DTYPE_C value, + signed char monotonic_cst, + Y_DTYPE_C lower_bound, + Y_DTYPE_C upper_bound, + split_info_struct * split_info) noexcept nogil: # OUT + """Find best bin to split on for a given feature. + + Splits that do not satisfy the splitting constraints + (min_gain_to_split, etc.) are discarded here. + + We scan node from left to right. This version is called whether there + are missing values or not. If any, missing values are assigned to the + right node. + """ + cdef: + unsigned int bin_idx + unsigned int n_samples_left + unsigned int n_samples_right + unsigned int n_samples_ = n_samples + # We set the 'end' variable such that the last non-missing-values + # bin never goes to the left child (which would result in and + # empty right child), unless there are missing values, since these + # would go to the right child. 
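+            # Concrete illustration: with n_bins_non_missing = 5 and no
+            # missing values, end = 4, so the last non-missing bin can never
+            # be sent to the left child; with missing values, end = 5 and
+            # that bin may go left because the missing-values bin still
+            # feeds the right child.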
+ unsigned int end = \ + self.n_bins_non_missing[feature_idx] - 1 + has_missing_values + Y_DTYPE_C sum_hessian_left + Y_DTYPE_C sum_hessian_right + Y_DTYPE_C sum_gradient_left + Y_DTYPE_C sum_gradient_right + Y_DTYPE_C loss_current_node + Y_DTYPE_C gain + uint8_t found_better_split = False + + Y_DTYPE_C best_sum_hessian_left + Y_DTYPE_C best_sum_gradient_left + unsigned int best_bin_idx + unsigned int best_n_samples_left + Y_DTYPE_C best_gain = -1 + hist_struct hist + + sum_gradient_left, sum_hessian_left = 0., 0. + n_samples_left = 0 + + loss_current_node = _loss_from_value(value, sum_gradients) + + for bin_idx in range(end): + hist = histograms[feature_idx, bin_idx] + n_samples_left += hist.count + n_samples_right = n_samples_ - n_samples_left + + if self.hessians_are_constant: + sum_hessian_left += hist.count + else: + sum_hessian_left += \ + hist.sum_hessians + sum_hessian_right = sum_hessians - sum_hessian_left + + sum_gradient_left += hist.sum_gradients + sum_gradient_right = sum_gradients - sum_gradient_left + + if n_samples_left < self.min_samples_leaf: + continue + if n_samples_right < self.min_samples_leaf: + # won't get any better + break + + if sum_hessian_left < self.min_hessian_to_split: + continue + if sum_hessian_right < self.min_hessian_to_split: + # won't get any better (hessians are > 0 since loss is convex) + break + + gain = _split_gain(sum_gradient_left, sum_hessian_left, + sum_gradient_right, sum_hessian_right, + loss_current_node, + monotonic_cst, + lower_bound, + upper_bound, + self.l2_regularization) + + if gain > best_gain and gain > self.min_gain_to_split: + found_better_split = True + best_gain = gain + best_bin_idx = bin_idx + best_sum_gradient_left = sum_gradient_left + best_sum_hessian_left = sum_hessian_left + best_n_samples_left = n_samples_left + + if found_better_split: + split_info.gain = best_gain + split_info.bin_idx = best_bin_idx + # we scan from left to right so missing values go to the right + split_info.missing_go_to_left = False + split_info.sum_gradient_left = best_sum_gradient_left + split_info.sum_gradient_right = sum_gradients - best_sum_gradient_left + split_info.sum_hessian_left = best_sum_hessian_left + split_info.sum_hessian_right = sum_hessians - best_sum_hessian_left + split_info.n_samples_left = best_n_samples_left + split_info.n_samples_right = n_samples - best_n_samples_left + + # We recompute best values here but it's cheap + split_info.value_left = compute_node_value( + split_info.sum_gradient_left, split_info.sum_hessian_left, + lower_bound, upper_bound, self.l2_regularization) + + split_info.value_right = compute_node_value( + split_info.sum_gradient_right, split_info.sum_hessian_right, + lower_bound, upper_bound, self.l2_regularization) + + cdef void _find_best_bin_to_split_right_to_left( + self, + unsigned int feature_idx, + const hist_struct [:, ::1] histograms, # IN + unsigned int n_samples, + Y_DTYPE_C sum_gradients, + Y_DTYPE_C sum_hessians, + Y_DTYPE_C value, + signed char monotonic_cst, + Y_DTYPE_C lower_bound, + Y_DTYPE_C upper_bound, + split_info_struct * split_info) noexcept nogil: # OUT + """Find best bin to split on for a given feature. + + Splits that do not satisfy the splitting constraints + (min_gain_to_split, etc.) are discarded here. + + We scan node from right to left. This version is only called when + there are missing values. Missing values are assigned to the left + child. 
+ + If no missing value are present in the data this method isn't called + since only calling _find_best_bin_to_split_left_to_right is enough. + """ + + cdef: + unsigned int bin_idx + unsigned int n_samples_left + unsigned int n_samples_right + unsigned int n_samples_ = n_samples + Y_DTYPE_C sum_hessian_left + Y_DTYPE_C sum_hessian_right + Y_DTYPE_C sum_gradient_left + Y_DTYPE_C sum_gradient_right + Y_DTYPE_C loss_current_node + Y_DTYPE_C gain + unsigned int start = self.n_bins_non_missing[feature_idx] - 2 + uint8_t found_better_split = False + + Y_DTYPE_C best_sum_hessian_left + Y_DTYPE_C best_sum_gradient_left + unsigned int best_bin_idx + unsigned int best_n_samples_left + Y_DTYPE_C best_gain = split_info.gain # computed during previous scan + hist_struct hist + + sum_gradient_right, sum_hessian_right = 0., 0. + n_samples_right = 0 + + loss_current_node = _loss_from_value(value, sum_gradients) + + for bin_idx in range(start, -1, -1): + hist = histograms[feature_idx, bin_idx + 1] + n_samples_right += hist.count + n_samples_left = n_samples_ - n_samples_right + + if self.hessians_are_constant: + sum_hessian_right += hist.count + else: + sum_hessian_right += \ + hist.sum_hessians + sum_hessian_left = sum_hessians - sum_hessian_right + + sum_gradient_right += \ + hist.sum_gradients + sum_gradient_left = sum_gradients - sum_gradient_right + + if n_samples_right < self.min_samples_leaf: + continue + if n_samples_left < self.min_samples_leaf: + # won't get any better + break + + if sum_hessian_right < self.min_hessian_to_split: + continue + if sum_hessian_left < self.min_hessian_to_split: + # won't get any better (hessians are > 0 since loss is convex) + break + + gain = _split_gain(sum_gradient_left, sum_hessian_left, + sum_gradient_right, sum_hessian_right, + loss_current_node, + monotonic_cst, + lower_bound, + upper_bound, + self.l2_regularization) + + if gain > best_gain and gain > self.min_gain_to_split: + found_better_split = True + best_gain = gain + best_bin_idx = bin_idx + best_sum_gradient_left = sum_gradient_left + best_sum_hessian_left = sum_hessian_left + best_n_samples_left = n_samples_left + + if found_better_split: + split_info.gain = best_gain + split_info.bin_idx = best_bin_idx + # we scan from right to left so missing values go to the left + split_info.missing_go_to_left = True + split_info.sum_gradient_left = best_sum_gradient_left + split_info.sum_gradient_right = sum_gradients - best_sum_gradient_left + split_info.sum_hessian_left = best_sum_hessian_left + split_info.sum_hessian_right = sum_hessians - best_sum_hessian_left + split_info.n_samples_left = best_n_samples_left + split_info.n_samples_right = n_samples - best_n_samples_left + + # We recompute best values here but it's cheap + split_info.value_left = compute_node_value( + split_info.sum_gradient_left, split_info.sum_hessian_left, + lower_bound, upper_bound, self.l2_regularization) + + split_info.value_right = compute_node_value( + split_info.sum_gradient_right, split_info.sum_hessian_right, + lower_bound, upper_bound, self.l2_regularization) + + cdef void _find_best_bin_to_split_category( + self, + unsigned int feature_idx, + uint8_t has_missing_values, + const hist_struct [:, ::1] histograms, # IN + unsigned int n_samples, + Y_DTYPE_C sum_gradients, + Y_DTYPE_C sum_hessians, + Y_DTYPE_C value, + char monotonic_cst, + Y_DTYPE_C lower_bound, + Y_DTYPE_C upper_bound, + split_info_struct * split_info) noexcept nogil: # OUT + """Find best split for categorical features. 
+ + Categories are first sorted according to their variance, and then + a scan is performed as if categories were ordered quantities. + + Ref: "On Grouping for Maximum Homogeneity", Walter D. Fisher + """ + + cdef: + unsigned int bin_idx + unsigned int n_bins_non_missing = self.n_bins_non_missing[feature_idx] + unsigned int missing_values_bin_idx = self.missing_values_bin_idx + categorical_info * cat_infos + unsigned int sorted_cat_idx + unsigned int n_used_bins = 0 + int [2] scan_direction + int direction = 0 + int best_direction = 0 + unsigned int middle + unsigned int i + const hist_struct[::1] feature_hist = histograms[feature_idx, :] + hist_struct hist + Y_DTYPE_C sum_gradients_bin + Y_DTYPE_C sum_hessians_bin + Y_DTYPE_C loss_current_node + Y_DTYPE_C sum_gradient_left, sum_hessian_left + Y_DTYPE_C sum_gradient_right, sum_hessian_right + unsigned int n_samples_left, n_samples_right + Y_DTYPE_C gain + Y_DTYPE_C best_gain = -1.0 + uint8_t found_better_split = False + Y_DTYPE_C best_sum_hessian_left + Y_DTYPE_C best_sum_gradient_left + unsigned int best_n_samples_left + unsigned int best_cat_infos_thresh + # Reduces the effect of noises in categorical features, + # especially for categories with few data. Called cat_smooth in + # LightGBM. TODO: Make this user adjustable? + Y_DTYPE_C MIN_CAT_SUPPORT = 10. + # this is equal to 1 for losses where hessians are constant + Y_DTYPE_C support_factor = n_samples / sum_hessians + + # Details on the split finding: + # We first order categories by their sum_gradients / sum_hessians + # values, and we exclude categories that don't respect MIN_CAT_SUPPORT + # from this sorted array. Missing values are treated just like any + # other category. The low-support categories will always be mapped to + # the right child. We scan the sorted categories array from left to + # right and from right to left, and we stop at the middle. + + # Considering ordered categories A B C D, with E being a low-support + # category: A B C D + # ^ + # midpoint + # The scans will consider the following split-points: + # * left to right: + # A - B C D E + # A B - C D E + # * right to left: + # D - A B C E + # C D - A B E + + # Note that since we stop at the middle and since low-support + # categories (E) are always mapped to the right, the following splits + # aren't considered: + # A E - B C D + # D E - A B C + # Basically, we're forcing E to always be mapped to the child that has + # *at least half of the categories* (and this child is always the right + # child, by convention). + + # Also note that if we scanned in only one direction (e.g. 
left to + # right), we would only consider the following splits: + # A - B C D E + # A B - C D E + # A B C - D E + # and thus we would be missing on D - A B C E and on C D - A B E + + cat_infos = malloc( + (n_bins_non_missing + has_missing_values) * sizeof(categorical_info)) + + # fill cat_infos while filtering out categories based on MIN_CAT_SUPPORT + for bin_idx in range(n_bins_non_missing): + hist = feature_hist[bin_idx] + if self.hessians_are_constant: + sum_hessians_bin = hist.count + else: + sum_hessians_bin = hist.sum_hessians + if sum_hessians_bin * support_factor >= MIN_CAT_SUPPORT: + cat_infos[n_used_bins].bin_idx = bin_idx + sum_gradients_bin = hist.sum_gradients + + cat_infos[n_used_bins].value = ( + sum_gradients_bin / (sum_hessians_bin + MIN_CAT_SUPPORT) + ) + n_used_bins += 1 + + # Also add missing values bin so that nans are considered as a category + if has_missing_values: + hist = feature_hist[missing_values_bin_idx] + if self.hessians_are_constant: + sum_hessians_bin = hist.count + else: + sum_hessians_bin = hist.sum_hessians + if sum_hessians_bin * support_factor >= MIN_CAT_SUPPORT: + cat_infos[n_used_bins].bin_idx = missing_values_bin_idx + sum_gradients_bin = ( + hist.sum_gradients + ) + + cat_infos[n_used_bins].value = ( + sum_gradients_bin / (sum_hessians_bin + MIN_CAT_SUPPORT) + ) + n_used_bins += 1 + + # not enough categories to form a split + if n_used_bins <= 1: + free(cat_infos) + return + + qsort(cat_infos, n_used_bins, sizeof(categorical_info), + compare_cat_infos) + + loss_current_node = _loss_from_value(value, sum_gradients) + + scan_direction[0], scan_direction[1] = 1, -1 + for direction in scan_direction: + if direction == 1: + middle = (n_used_bins + 1) // 2 + else: + middle = (n_used_bins + 1) // 2 - 1 + + # The categories we'll consider will go to the left child + sum_gradient_left, sum_hessian_left = 0., 0. 
+ n_samples_left = 0 + + for i in range(middle): + sorted_cat_idx = i if direction == 1 else n_used_bins - 1 - i + bin_idx = cat_infos[sorted_cat_idx].bin_idx + hist = feature_hist[bin_idx] + + n_samples_left += hist.count + n_samples_right = n_samples - n_samples_left + + if self.hessians_are_constant: + sum_hessian_left += hist.count + else: + sum_hessian_left += hist.sum_hessians + sum_hessian_right = sum_hessians - sum_hessian_left + + sum_gradient_left += hist.sum_gradients + sum_gradient_right = sum_gradients - sum_gradient_left + + if ( + n_samples_left < self.min_samples_leaf or + sum_hessian_left < self.min_hessian_to_split + ): + continue + if ( + n_samples_right < self.min_samples_leaf or + sum_hessian_right < self.min_hessian_to_split + ): + break + + gain = _split_gain(sum_gradient_left, sum_hessian_left, + sum_gradient_right, sum_hessian_right, + loss_current_node, monotonic_cst, + lower_bound, upper_bound, + self.l2_regularization) + if gain > best_gain and gain > self.min_gain_to_split: + found_better_split = True + best_gain = gain + best_cat_infos_thresh = sorted_cat_idx + best_sum_gradient_left = sum_gradient_left + best_sum_hessian_left = sum_hessian_left + best_n_samples_left = n_samples_left + best_direction = direction + + if found_better_split: + split_info.gain = best_gain + + # split_info.bin_idx is unused for categorical splits: left_cat_bitset + # is used instead and set below + split_info.bin_idx = 0 + + split_info.sum_gradient_left = best_sum_gradient_left + split_info.sum_gradient_right = sum_gradients - best_sum_gradient_left + split_info.sum_hessian_left = best_sum_hessian_left + split_info.sum_hessian_right = sum_hessians - best_sum_hessian_left + split_info.n_samples_left = best_n_samples_left + split_info.n_samples_right = n_samples - best_n_samples_left + + # We recompute best values here but it's cheap + split_info.value_left = compute_node_value( + split_info.sum_gradient_left, split_info.sum_hessian_left, + lower_bound, upper_bound, self.l2_regularization) + + split_info.value_right = compute_node_value( + split_info.sum_gradient_right, split_info.sum_hessian_right, + lower_bound, upper_bound, self.l2_regularization) + + # create bitset with values from best_cat_infos_thresh + init_bitset(split_info.left_cat_bitset) + if best_direction == 1: + for sorted_cat_idx in range(best_cat_infos_thresh + 1): + bin_idx = cat_infos[sorted_cat_idx].bin_idx + set_bitset(split_info.left_cat_bitset, bin_idx) + else: + for sorted_cat_idx in range(n_used_bins - 1, best_cat_infos_thresh - 1, -1): + bin_idx = cat_infos[sorted_cat_idx].bin_idx + set_bitset(split_info.left_cat_bitset, bin_idx) + + if has_missing_values: + split_info.missing_go_to_left = in_bitset( + split_info.left_cat_bitset, missing_values_bin_idx) + + free(cat_infos) + + +cdef int compare_cat_infos(const void * a, const void * b) noexcept nogil: + return -1 if (a).value < (b).value else 1 + +cdef inline Y_DTYPE_C _split_gain( + Y_DTYPE_C sum_gradient_left, + Y_DTYPE_C sum_hessian_left, + Y_DTYPE_C sum_gradient_right, + Y_DTYPE_C sum_hessian_right, + Y_DTYPE_C loss_current_node, + signed char monotonic_cst, + Y_DTYPE_C lower_bound, + Y_DTYPE_C upper_bound, + Y_DTYPE_C l2_regularization) noexcept nogil: + """Loss reduction + + Compute the reduction in loss after taking a split, compared to keeping + the node a leaf of the tree. + + See Equation 7 of: + :arxiv:`T. Chen, C. 
Guestrin, (2016) XGBoost: A Scalable Tree Boosting System, + <1603.02754>.` + """ + cdef: + Y_DTYPE_C gain + Y_DTYPE_C value_left + Y_DTYPE_C value_right + + # Compute values of potential left and right children + value_left = compute_node_value(sum_gradient_left, sum_hessian_left, + lower_bound, upper_bound, + l2_regularization) + value_right = compute_node_value(sum_gradient_right, sum_hessian_right, + lower_bound, upper_bound, + l2_regularization) + + if ((monotonic_cst == MonotonicConstraint.POS and value_left > value_right) or + (monotonic_cst == MonotonicConstraint.NEG and value_left < value_right)): + # don't consider this split since it does not respect the monotonic + # constraints. Note that these comparisons need to be done on values + # that have already been clipped to take the monotonic constraints into + # account (if any). + return -1 + + gain = loss_current_node + gain -= _loss_from_value(value_left, sum_gradient_left) + gain -= _loss_from_value(value_right, sum_gradient_right) + # Note that for the gain to be correct (and for min_gain_to_split to work + # as expected), we need all values to be bounded (current node, left child + # and right child). + + return gain + +cdef inline Y_DTYPE_C _loss_from_value( + Y_DTYPE_C value, + Y_DTYPE_C sum_gradient) noexcept nogil: + """Return loss of a node from its (bounded) value + + See Equation 6 of: + :arxiv:`T. Chen, C. Guestrin, (2016) XGBoost: A Scalable Tree Boosting System, + <1603.02754>.` + """ + return sum_gradient * value + +cdef inline uint8_t sample_goes_left( + uint8_t missing_go_to_left, + uint8_t missing_values_bin_idx, + X_BINNED_DTYPE_C split_bin_idx, + X_BINNED_DTYPE_C bin_value, + uint8_t is_categorical, + BITSET_DTYPE_C left_cat_bitset) noexcept nogil: + """Helper to decide whether sample should go to left or right child.""" + + if is_categorical: + # note: if any, missing values are encoded in left_cat_bitset + return in_bitset(left_cat_bitset, bin_value) + else: + return ( + ( + missing_go_to_left and + bin_value == missing_values_bin_idx + ) + or ( + bin_value <= split_bin_idx + )) + + +cpdef inline Y_DTYPE_C compute_node_value( + Y_DTYPE_C sum_gradient, + Y_DTYPE_C sum_hessian, + Y_DTYPE_C lower_bound, + Y_DTYPE_C upper_bound, + Y_DTYPE_C l2_regularization) noexcept nogil: + """Compute a node's value. + + The value is capped in the [lower_bound, upper_bound] interval to respect + monotonic constraints. Shrinkage is ignored. + + See Equation 5 of: + :arxiv:`T. Chen, C. 
Guestrin, (2016) XGBoost: A Scalable Tree Boosting System, + <1603.02754>.` + """ + + cdef: + Y_DTYPE_C value + + value = -sum_gradient / (sum_hessian + l2_regularization + 1e-15) + + if value < lower_bound: + value = lower_bound + elif value > upper_bound: + value = upper_bound + + return value diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py new file mode 100644 index 0000000000000000000000000000000000000000..6f9fcd0057141a398611ff94d528b1317ba4a0fc --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py @@ -0,0 +1,489 @@ +import numpy as np +import pytest +from numpy.testing import assert_allclose, assert_array_equal + +from sklearn.ensemble._hist_gradient_boosting.binning import ( + _BinMapper, + _find_binning_thresholds, + _map_to_bins, +) +from sklearn.ensemble._hist_gradient_boosting.common import ( + ALMOST_INF, + X_BINNED_DTYPE, + X_DTYPE, +) +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads + +n_threads = _openmp_effective_n_threads() + + +DATA = ( + np.random.RandomState(42) + .normal(loc=[0, 10], scale=[1, 0.01], size=(int(1e6), 2)) + .astype(X_DTYPE) +) + + +def test_find_binning_thresholds_regular_data(): + data = np.linspace(0, 10, 1001) + bin_thresholds = _find_binning_thresholds(data, max_bins=10) + assert_allclose(bin_thresholds, [1, 2, 3, 4, 5, 6, 7, 8, 9]) + + bin_thresholds = _find_binning_thresholds(data, max_bins=5) + assert_allclose(bin_thresholds, [2, 4, 6, 8]) + + +def test_find_binning_thresholds_small_regular_data(): + data = np.linspace(0, 10, 11) + + bin_thresholds = _find_binning_thresholds(data, max_bins=5) + assert_allclose(bin_thresholds, [2, 4, 6, 8]) + + bin_thresholds = _find_binning_thresholds(data, max_bins=10) + assert_allclose(bin_thresholds, [1, 2, 3, 4, 5, 6, 7, 8, 9]) + + bin_thresholds = _find_binning_thresholds(data, max_bins=11) + assert_allclose(bin_thresholds, np.arange(10) + 0.5) + + bin_thresholds = _find_binning_thresholds(data, max_bins=255) + assert_allclose(bin_thresholds, np.arange(10) + 0.5) + + +def test_find_binning_thresholds_random_data(): + bin_thresholds = [ + _find_binning_thresholds(DATA[:, i], max_bins=255) for i in range(2) + ] + for i in range(len(bin_thresholds)): + assert bin_thresholds[i].shape == (254,) # 255 - 1 + assert bin_thresholds[i].dtype == DATA.dtype + + assert_allclose( + bin_thresholds[0][[64, 128, 192]], np.array([-0.7, 0.0, 0.7]), atol=1e-1 + ) + + assert_allclose( + bin_thresholds[1][[64, 128, 192]], np.array([9.99, 10.00, 10.01]), atol=1e-2 + ) + + +def test_find_binning_thresholds_low_n_bins(): + bin_thresholds = [ + _find_binning_thresholds(DATA[:, i], max_bins=128) for i in range(2) + ] + for i in range(len(bin_thresholds)): + assert bin_thresholds[i].shape == (127,) # 128 - 1 + assert bin_thresholds[i].dtype == DATA.dtype + + +@pytest.mark.parametrize("n_bins", (2, 257)) +def test_invalid_n_bins(n_bins): + err_msg = "n_bins={} should be no smaller than 3 and no larger than 256".format( + n_bins + ) + with pytest.raises(ValueError, match=err_msg): + 
_BinMapper(n_bins=n_bins).fit(DATA) + + +def test_bin_mapper_n_features_transform(): + mapper = _BinMapper(n_bins=42, random_state=42).fit(DATA) + err_msg = "This estimator was fitted with 2 features but 4 got passed" + with pytest.raises(ValueError, match=err_msg): + mapper.transform(np.repeat(DATA, 2, axis=1)) + + +@pytest.mark.parametrize("max_bins", [16, 128, 255]) +def test_map_to_bins(max_bins): + bin_thresholds = [ + _find_binning_thresholds(DATA[:, i], max_bins=max_bins) for i in range(2) + ] + binned = np.zeros_like(DATA, dtype=X_BINNED_DTYPE, order="F") + is_categorical = np.zeros(2, dtype=np.uint8) + last_bin_idx = max_bins + _map_to_bins(DATA, bin_thresholds, is_categorical, last_bin_idx, n_threads, binned) + assert binned.shape == DATA.shape + assert binned.dtype == np.uint8 + assert binned.flags.f_contiguous + + min_indices = DATA.argmin(axis=0) + max_indices = DATA.argmax(axis=0) + + for feature_idx, min_idx in enumerate(min_indices): + assert binned[min_idx, feature_idx] == 0 + for feature_idx, max_idx in enumerate(max_indices): + assert binned[max_idx, feature_idx] == max_bins - 1 + + +@pytest.mark.parametrize("max_bins", [5, 10, 42]) +def test_bin_mapper_random_data(max_bins): + n_samples, n_features = DATA.shape + + expected_count_per_bin = n_samples // max_bins + tol = int(0.05 * expected_count_per_bin) + + # max_bins is the number of bins for non-missing values + n_bins = max_bins + 1 + mapper = _BinMapper(n_bins=n_bins, random_state=42).fit(DATA) + binned = mapper.transform(DATA) + + assert binned.shape == (n_samples, n_features) + assert binned.dtype == np.uint8 + assert_array_equal(binned.min(axis=0), np.array([0, 0])) + assert_array_equal(binned.max(axis=0), np.array([max_bins - 1, max_bins - 1])) + assert len(mapper.bin_thresholds_) == n_features + for bin_thresholds_feature in mapper.bin_thresholds_: + assert bin_thresholds_feature.shape == (max_bins - 1,) + assert bin_thresholds_feature.dtype == DATA.dtype + assert np.all(mapper.n_bins_non_missing_ == max_bins) + + # Check that the binned data is approximately balanced across bins. 
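+    # For instance, with n_samples = 1e6 and max_bins = 42, each bin is
+    # expected to hold about 23809 samples, and the check tolerates a
+    # deviation of up to int(0.05 * 23809) = 1190 samples.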
+ for feature_idx in range(n_features): + for bin_idx in range(max_bins): + count = (binned[:, feature_idx] == bin_idx).sum() + assert abs(count - expected_count_per_bin) < tol + + +@pytest.mark.parametrize("n_samples, max_bins", [(5, 5), (5, 10), (5, 11), (42, 255)]) +def test_bin_mapper_small_random_data(n_samples, max_bins): + data = np.random.RandomState(42).normal(size=n_samples).reshape(-1, 1) + assert len(np.unique(data)) == n_samples + + # max_bins is the number of bins for non-missing values + n_bins = max_bins + 1 + mapper = _BinMapper(n_bins=n_bins, random_state=42) + binned = mapper.fit_transform(data) + + assert binned.shape == data.shape + assert binned.dtype == np.uint8 + assert_array_equal(binned.ravel()[np.argsort(data.ravel())], np.arange(n_samples)) + + +@pytest.mark.parametrize( + "max_bins, n_distinct, multiplier", + [ + (5, 5, 1), + (5, 5, 3), + (255, 12, 42), + ], +) +def test_bin_mapper_identity_repeated_values(max_bins, n_distinct, multiplier): + data = np.array(list(range(n_distinct)) * multiplier).reshape(-1, 1) + # max_bins is the number of bins for non-missing values + n_bins = max_bins + 1 + binned = _BinMapper(n_bins=n_bins).fit_transform(data) + assert_array_equal(data, binned) + + +@pytest.mark.parametrize("n_distinct", [2, 7, 42]) +def test_bin_mapper_repeated_values_invariance(n_distinct): + rng = np.random.RandomState(42) + distinct_values = rng.normal(size=n_distinct) + assert len(np.unique(distinct_values)) == n_distinct + + repeated_indices = rng.randint(low=0, high=n_distinct, size=1000) + data = distinct_values[repeated_indices] + rng.shuffle(data) + assert_array_equal(np.unique(data), np.sort(distinct_values)) + + data = data.reshape(-1, 1) + + mapper_1 = _BinMapper(n_bins=n_distinct + 1) + binned_1 = mapper_1.fit_transform(data) + assert_array_equal(np.unique(binned_1[:, 0]), np.arange(n_distinct)) + + # Adding more bins to the mapper yields the same results (same thresholds) + mapper_2 = _BinMapper(n_bins=min(256, n_distinct * 3) + 1) + binned_2 = mapper_2.fit_transform(data) + + assert_allclose(mapper_1.bin_thresholds_[0], mapper_2.bin_thresholds_[0]) + assert_array_equal(binned_1, binned_2) + + +@pytest.mark.parametrize( + "max_bins, scale, offset", + [ + (3, 2, -1), + (42, 1, 0), + (255, 0.3, 42), + ], +) +def test_bin_mapper_identity_small(max_bins, scale, offset): + data = np.arange(max_bins).reshape(-1, 1) * scale + offset + # max_bins is the number of bins for non-missing values + n_bins = max_bins + 1 + binned = _BinMapper(n_bins=n_bins).fit_transform(data) + assert_array_equal(binned, np.arange(max_bins).reshape(-1, 1)) + + +@pytest.mark.parametrize( + "max_bins_small, max_bins_large", + [ + (2, 2), + (3, 3), + (4, 4), + (42, 42), + (255, 255), + (5, 17), + (42, 255), + ], +) +def test_bin_mapper_idempotence(max_bins_small, max_bins_large): + assert max_bins_large >= max_bins_small + data = np.random.RandomState(42).normal(size=30000).reshape(-1, 1) + mapper_small = _BinMapper(n_bins=max_bins_small + 1) + mapper_large = _BinMapper(n_bins=max_bins_small + 1) + binned_small = mapper_small.fit_transform(data) + binned_large = mapper_large.fit_transform(binned_small) + assert_array_equal(binned_small, binned_large) + + +@pytest.mark.parametrize("n_bins", [10, 100, 256]) +@pytest.mark.parametrize("diff", [-5, 0, 5]) +def test_n_bins_non_missing(n_bins, diff): + # Check that n_bins_non_missing is n_unique_values when + # there are not a lot of unique values, else n_bins - 1. 
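+    # For example, with n_bins = 10 and diff = -5 there are 5 unique values,
+    # so n_bins_non_missing_ == 5; with diff = +5 there are 15 unique values
+    # and n_bins_non_missing_ is capped at n_bins - 1 == 9.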
+ + n_unique_values = n_bins + diff + X = list(range(n_unique_values)) * 2 + X = np.array(X).reshape(-1, 1) + mapper = _BinMapper(n_bins=n_bins).fit(X) + assert np.all(mapper.n_bins_non_missing_ == min(n_bins - 1, n_unique_values)) + + +def test_subsample(): + # Make sure bin thresholds are different when applying subsampling + mapper_no_subsample = _BinMapper(subsample=None, random_state=0).fit(DATA) + mapper_subsample = _BinMapper(subsample=256, random_state=0).fit(DATA) + + for feature in range(DATA.shape[1]): + assert not np.allclose( + mapper_no_subsample.bin_thresholds_[feature], + mapper_subsample.bin_thresholds_[feature], + rtol=1e-4, + ) + + +@pytest.mark.parametrize( + "n_bins, n_bins_non_missing, X_trans_expected", + [ + ( + 256, + [4, 2, 2], + [ + [0, 0, 0], # 255 <=> missing value + [255, 255, 0], + [1, 0, 0], + [255, 1, 1], + [2, 1, 1], + [3, 0, 0], + ], + ), + ( + 3, + [2, 2, 2], + [ + [0, 0, 0], # 2 <=> missing value + [2, 2, 0], + [0, 0, 0], + [2, 1, 1], + [1, 1, 1], + [1, 0, 0], + ], + ), + ], +) +def test_missing_values_support(n_bins, n_bins_non_missing, X_trans_expected): + # check for missing values: make sure nans are mapped to the last bin + # and that the _BinMapper attributes are correct + + X = [ + [1, 1, 0], + [np.nan, np.nan, 0], + [2, 1, 0], + [np.nan, 2, 1], + [3, 2, 1], + [4, 1, 0], + ] + + X = np.array(X) + + mapper = _BinMapper(n_bins=n_bins) + mapper.fit(X) + + assert_array_equal(mapper.n_bins_non_missing_, n_bins_non_missing) + + for feature_idx in range(X.shape[1]): + assert ( + len(mapper.bin_thresholds_[feature_idx]) + == n_bins_non_missing[feature_idx] - 1 + ) + + assert mapper.missing_values_bin_idx_ == n_bins - 1 + + X_trans = mapper.transform(X) + assert_array_equal(X_trans, X_trans_expected) + + +def test_infinite_values(): + # Make sure infinite values are properly handled. + bin_mapper = _BinMapper() + + X = np.array([-np.inf, 0, 1, np.inf]).reshape(-1, 1) + + bin_mapper.fit(X) + assert_allclose(bin_mapper.bin_thresholds_[0], [-np.inf, 0.5, ALMOST_INF]) + assert bin_mapper.n_bins_non_missing_ == [4] + + expected_binned_X = np.array([0, 1, 2, 3]).reshape(-1, 1) + assert_array_equal(bin_mapper.transform(X), expected_binned_X) + + +@pytest.mark.parametrize("n_bins", [15, 256]) +def test_categorical_feature(n_bins): + # Basic test for categorical features + # we make sure that categories are mapped into [0, n_categories - 1] and + # that nans are mapped to the last bin + X = np.array( + [[4] * 500 + [1] * 3 + [10] * 4 + [0] * 4 + [13] + [7] * 5 + [np.nan] * 2], + dtype=X_DTYPE, + ).T + known_categories = [np.unique(X[~np.isnan(X)])] + + bin_mapper = _BinMapper( + n_bins=n_bins, + is_categorical=np.array([True]), + known_categories=known_categories, + ).fit(X) + assert bin_mapper.n_bins_non_missing_ == [6] + assert_array_equal(bin_mapper.bin_thresholds_[0], [0, 1, 4, 7, 10, 13]) + + X = np.array([[0, 1, 4, np.nan, 7, 10, 13]], dtype=X_DTYPE).T + expected_trans = np.array([[0, 1, 2, n_bins - 1, 3, 4, 5]]).T + assert_array_equal(bin_mapper.transform(X), expected_trans) + + # Negative categories are mapped to the missing values' bin + # (i.e. the bin of index `missing_values_bin_idx_ == n_bins - 1). + # Unknown positive categories does not happen in practice and tested + # for illustration purpose. 
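+    # In the example below, -4 and -1 therefore map to bin n_bins - 1, while
+    # the unknown positive value 100 maps to bin 6, one past the bin of the
+    # largest known category (13, which is bin 5).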
+ X = np.array([[-4, -1, 100]], dtype=X_DTYPE).T + expected_trans = np.array([[n_bins - 1, n_bins - 1, 6]]).T + assert_array_equal(bin_mapper.transform(X), expected_trans) + + +def test_categorical_feature_negative_missing(): + """Make sure bin mapper treats negative categories as missing values.""" + X = np.array( + [[4] * 500 + [1] * 3 + [5] * 10 + [-1] * 3 + [np.nan] * 4], dtype=X_DTYPE + ).T + bin_mapper = _BinMapper( + n_bins=4, + is_categorical=np.array([True]), + known_categories=[np.array([1, 4, 5], dtype=X_DTYPE)], + ).fit(X) + + assert bin_mapper.n_bins_non_missing_ == [3] + + X = np.array([[-1, 1, 3, 5, np.nan]], dtype=X_DTYPE).T + + # Negative values for categorical features are considered as missing values. + # They are mapped to the bin of index `bin_mapper.missing_values_bin_idx_`, + # which is 3 here. + assert bin_mapper.missing_values_bin_idx_ == 3 + expected_trans = np.array([[3, 0, 1, 2, 3]]).T + assert_array_equal(bin_mapper.transform(X), expected_trans) + + +@pytest.mark.parametrize("n_bins", (128, 256)) +def test_categorical_with_numerical_features(n_bins): + # basic check for binmapper with mixed data + X1 = np.arange(10, 20).reshape(-1, 1) # numerical + X2 = np.arange(10, 15).reshape(-1, 1) # categorical + X2 = np.r_[X2, X2] + X = np.c_[X1, X2] + known_categories = [None, np.unique(X2).astype(X_DTYPE)] + + bin_mapper = _BinMapper( + n_bins=n_bins, + is_categorical=np.array([False, True]), + known_categories=known_categories, + ).fit(X) + + assert_array_equal(bin_mapper.n_bins_non_missing_, [10, 5]) + + bin_thresholds = bin_mapper.bin_thresholds_ + assert len(bin_thresholds) == 2 + assert_array_equal(bin_thresholds[1], np.arange(10, 15)) + + expected_X_trans = [ + [0, 0], + [1, 1], + [2, 2], + [3, 3], + [4, 4], + [5, 0], + [6, 1], + [7, 2], + [8, 3], + [9, 4], + ] + assert_array_equal(bin_mapper.transform(X), expected_X_trans) + + +def test_make_known_categories_bitsets(): + # Check the output of make_known_categories_bitsets + X = np.array( + [[14, 2, 30], [30, 4, 70], [40, 10, 180], [40, 240, 180]], dtype=X_DTYPE + ) + + bin_mapper = _BinMapper( + n_bins=256, + is_categorical=np.array([False, True, True]), + known_categories=[None, X[:, 1], X[:, 2]], + ) + bin_mapper.fit(X) + + known_cat_bitsets, f_idx_map = bin_mapper.make_known_categories_bitsets() + + # Note that for non-categorical features, values are left to 0 + expected_f_idx_map = np.array([0, 0, 1], dtype=np.uint8) + assert_allclose(expected_f_idx_map, f_idx_map) + + expected_cat_bitset = np.zeros((2, 8), dtype=np.uint32) + + # first categorical feature: [2, 4, 10, 240] + f_idx = 1 + mapped_f_idx = f_idx_map[f_idx] + expected_cat_bitset[mapped_f_idx, 0] = 2**2 + 2**4 + 2**10 + # 240 = 32**7 + 16, therefore the 16th bit of the 7th array is 1. 
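Editor's note: the expected bitset built above follows the packing rule that a raw category value v occupies bit v % 32 of 32-bit word v // 32, so 240 = 32 * 7 + 16 lands on bit 16 of word 7. A pure-Python sketch of that rule (not the library's Cython implementation):

    import numpy as np

    def set_raw_category(bitset, value):
        # value v occupies bit (v % 32) of 32-bit word (v // 32)
        word_idx, bit_idx = divmod(int(value), 32)
        bitset[word_idx] |= np.uint32(1 << bit_idx)

    bitset = np.zeros(8, dtype=np.uint32)
    for v in (2, 4, 10, 240):
        set_raw_category(bitset, v)
    assert bitset[0] == 2**2 + 2**4 + 2**10
    assert bitset[7] == 2**16   # 240 == 32 * 7 + 16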
+ expected_cat_bitset[mapped_f_idx, 7] = 2**16 + + # second categorical feature [30, 70, 180] + f_idx = 2 + mapped_f_idx = f_idx_map[f_idx] + expected_cat_bitset[mapped_f_idx, 0] = 2**30 + expected_cat_bitset[mapped_f_idx, 2] = 2**6 + expected_cat_bitset[mapped_f_idx, 5] = 2**20 + + assert_allclose(expected_cat_bitset, known_cat_bitsets) + + +@pytest.mark.parametrize( + "is_categorical, known_categories, match", + [ + (np.array([True]), [None], "Known categories for feature 0 must be provided"), + ( + np.array([False]), + np.array([1, 2, 3]), + "isn't marked as a categorical feature, but categories were passed", + ), + ], +) +def test_categorical_parameters(is_categorical, known_categories, match): + # test the validation of the is_categorical and known_categories parameters + + X = np.array([[1, 2, 3]], dtype=X_DTYPE) + + bin_mapper = _BinMapper( + is_categorical=is_categorical, known_categories=known_categories + ) + with pytest.raises(ValueError, match=match): + bin_mapper.fit(X) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_bitset.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_bitset.py new file mode 100644 index 0000000000000000000000000000000000000000..c02d66b666f80216088c691db39a55c055aa8d83 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_bitset.py @@ -0,0 +1,64 @@ +import numpy as np +import pytest +from numpy.testing import assert_allclose + +from sklearn.ensemble._hist_gradient_boosting._bitset import ( + in_bitset_memoryview, + set_bitset_memoryview, + set_raw_bitset_from_binned_bitset, +) +from sklearn.ensemble._hist_gradient_boosting.common import X_DTYPE + + +@pytest.mark.parametrize( + "values_to_insert, expected_bitset", + [ + ([0, 4, 33], np.array([2**0 + 2**4, 2**1, 0], dtype=np.uint32)), + ( + [31, 32, 33, 79], + np.array([2**31, 2**0 + 2**1, 2**15], dtype=np.uint32), + ), + ], +) +def test_set_get_bitset(values_to_insert, expected_bitset): + n_32bits_ints = 3 + bitset = np.zeros(n_32bits_ints, dtype=np.uint32) + for value in values_to_insert: + set_bitset_memoryview(bitset, value) + assert_allclose(expected_bitset, bitset) + for value in range(32 * n_32bits_ints): + if value in values_to_insert: + assert in_bitset_memoryview(bitset, value) + else: + assert not in_bitset_memoryview(bitset, value) + + +@pytest.mark.parametrize( + "raw_categories, binned_cat_to_insert, expected_raw_bitset", + [ + ( + [3, 4, 5, 10, 31, 32, 43], + [0, 2, 4, 5, 6], + [2**3 + 2**5 + 2**31, 2**0 + 2**11], + ), + ([3, 33, 50, 52], [1, 3], [0, 2**1 + 2**20]), + ], +) +def test_raw_bitset_from_binned_bitset( + raw_categories, binned_cat_to_insert, expected_raw_bitset +): + binned_bitset = np.zeros(2, dtype=np.uint32) + raw_bitset = np.zeros(2, dtype=np.uint32) + raw_categories = np.asarray(raw_categories, dtype=X_DTYPE) + + for val in binned_cat_to_insert: + set_bitset_memoryview(binned_bitset, val) + + set_raw_bitset_from_binned_bitset(raw_bitset, binned_bitset, raw_categories) + + assert_allclose(expected_raw_bitset, raw_bitset) + for binned_cat_val, raw_cat_val in enumerate(raw_categories): + if binned_cat_val in binned_cat_to_insert: + assert in_bitset_memoryview(raw_bitset, raw_cat_val) + else: + assert not in_bitset_memoryview(raw_bitset, raw_cat_val) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py 
b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py new file mode 100644 index 0000000000000000000000000000000000000000..24b5b02aa0696c5cad5701ea23acba601b307f09 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py @@ -0,0 +1,291 @@ +import numpy as np +import pytest + +from sklearn.datasets import make_classification, make_regression +from sklearn.ensemble import ( + HistGradientBoostingClassifier, + HistGradientBoostingRegressor, +) +from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper +from sklearn.ensemble._hist_gradient_boosting.utils import get_equivalent_estimator +from sklearn.metrics import accuracy_score +from sklearn.model_selection import train_test_split + + +# TODO(1.8) remove the filterwarnings decorator +@pytest.mark.filterwarnings( + "ignore:'force_all_finite' was renamed to 'ensure_all_finite':FutureWarning" +) +@pytest.mark.parametrize("seed", range(5)) +@pytest.mark.parametrize( + "loss", + [ + "squared_error", + "poisson", + pytest.param( + "gamma", + marks=pytest.mark.skip("LightGBM with gamma loss has larger deviation."), + ), + ], +) +@pytest.mark.parametrize("min_samples_leaf", (1, 20)) +@pytest.mark.parametrize( + "n_samples, max_leaf_nodes", + [ + (255, 4096), + (1000, 8), + ], +) +def test_same_predictions_regression( + seed, loss, min_samples_leaf, n_samples, max_leaf_nodes +): + # Make sure sklearn has the same predictions as lightgbm for easy targets. + # + # In particular when the size of the trees are bound and the number of + # samples is large enough, the structure of the prediction trees found by + # LightGBM and sklearn should be exactly identical. + # + # Notes: + # - Several candidate splits may have equal gains when the number of + # samples in a node is low (and because of float errors). Therefore the + # predictions on the test set might differ if the structure of the tree + # is not exactly the same. To avoid this issue we only compare the + # predictions on the test set when the number of samples is large enough + # and max_leaf_nodes is low enough. + # - To ignore discrepancies caused by small differences in the binning + # strategy, data is pre-binned if n_samples > 255. + # - We don't check the absolute_error loss here. This is because + # LightGBM's computation of the median (used for the initial value of + # raw_prediction) is a bit off (they'll e.g. return midpoints when there + # is no need to.). Since these tests only run 1 iteration, the + # discrepancy between the initial values leads to biggish differences in + # the predictions. These differences are much smaller with more + # iterations. 
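Editor's note: the pre-binning trick mentioned in the notes above amounts to quantising X once with _BinMapper and casting the integer codes back to float32, so that both libraries re-bin already-quantised data and recover identical bin edges. A short sketch of the idea, with illustrative sizes:

    import numpy as np
    from sklearn.datasets import make_regression
    from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper

    X, y = make_regression(n_samples=1000, n_features=5, random_state=0)
    max_bins = 255
    X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)
    # Each feature now takes at most max_bins distinct values, so any binning
    # strategy rebuilds the same bins and cannot cause prediction differences.
    assert all(len(np.unique(X[:, j])) <= max_bins for j in range(X.shape[1]))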
+ pytest.importorskip("lightgbm") + + rng = np.random.RandomState(seed=seed) + max_iter = 1 + max_bins = 255 + + X, y = make_regression( + n_samples=n_samples, n_features=5, n_informative=5, random_state=0 + ) + + if loss in ("gamma", "poisson"): + # make the target positive + y = np.abs(y) + np.mean(np.abs(y)) + + if n_samples > 255: + # bin data and convert it to float32 so that the estimator doesn't + # treat it as pre-binned + X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32) + + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) + + est_sklearn = HistGradientBoostingRegressor( + loss=loss, + max_iter=max_iter, + max_bins=max_bins, + learning_rate=1, + early_stopping=False, + min_samples_leaf=min_samples_leaf, + max_leaf_nodes=max_leaf_nodes, + ) + est_lightgbm = get_equivalent_estimator(est_sklearn, lib="lightgbm") + est_lightgbm.set_params(min_sum_hessian_in_leaf=0) + + est_lightgbm.fit(X_train, y_train) + est_sklearn.fit(X_train, y_train) + + # We need X to be treated an numerical data, not pre-binned data. + X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32) + + pred_lightgbm = est_lightgbm.predict(X_train) + pred_sklearn = est_sklearn.predict(X_train) + if loss in ("gamma", "poisson"): + # More than 65% of the predictions must be close up to the 2nd decimal. + # TODO: We are not entirely satisfied with this lax comparison, but the root + # cause is not clear, maybe algorithmic differences. One such example is the + # poisson_max_delta_step parameter of LightGBM which does not exist in HGBT. + assert ( + np.mean(np.isclose(pred_lightgbm, pred_sklearn, rtol=1e-2, atol=1e-2)) + > 0.65 + ) + else: + # Less than 1% of the predictions may deviate more than 1e-3 in relative terms. + assert np.mean(np.isclose(pred_lightgbm, pred_sklearn, rtol=1e-3)) > 1 - 0.01 + + if max_leaf_nodes < 10 and n_samples >= 1000 and loss in ("squared_error",): + pred_lightgbm = est_lightgbm.predict(X_test) + pred_sklearn = est_sklearn.predict(X_test) + # Less than 1% of the predictions may deviate more than 1e-4 in relative terms. 
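Editor's note: these comparisons tolerate a small fraction of diverging predictions rather than demanding exact agreement; the pattern is to assert on the fraction of element-wise close values. A generic sketch with made-up arrays:

    import numpy as np

    pred_a = np.array([1.0, 2.0, 3.0, 4.0])
    pred_b = np.array([1.0001, 2.0001, 3.5, 4.0001])   # one clear outlier

    frac_close = np.mean(np.isclose(pred_a, pred_b, rtol=1e-3))
    assert frac_close == 0.75   # 3 of the 4 predictions agree within rtol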
+ assert np.mean(np.isclose(pred_lightgbm, pred_sklearn, rtol=1e-4)) > 1 - 0.01 + + +# TODO(1.8) remove the filterwarnings decorator +@pytest.mark.filterwarnings( + "ignore:'force_all_finite' was renamed to 'ensure_all_finite':FutureWarning" +) +@pytest.mark.parametrize("seed", range(5)) +@pytest.mark.parametrize("min_samples_leaf", (1, 20)) +@pytest.mark.parametrize( + "n_samples, max_leaf_nodes", + [ + (255, 4096), + (1000, 8), + ], +) +def test_same_predictions_classification( + seed, min_samples_leaf, n_samples, max_leaf_nodes +): + # Same as test_same_predictions_regression but for classification + pytest.importorskip("lightgbm") + + rng = np.random.RandomState(seed=seed) + max_iter = 1 + n_classes = 2 + max_bins = 255 + + X, y = make_classification( + n_samples=n_samples, + n_classes=n_classes, + n_features=5, + n_informative=5, + n_redundant=0, + random_state=0, + ) + + if n_samples > 255: + # bin data and convert it to float32 so that the estimator doesn't + # treat it as pre-binned + X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32) + + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) + + est_sklearn = HistGradientBoostingClassifier( + loss="log_loss", + max_iter=max_iter, + max_bins=max_bins, + learning_rate=1, + early_stopping=False, + min_samples_leaf=min_samples_leaf, + max_leaf_nodes=max_leaf_nodes, + ) + est_lightgbm = get_equivalent_estimator( + est_sklearn, lib="lightgbm", n_classes=n_classes + ) + + est_lightgbm.fit(X_train, y_train) + est_sklearn.fit(X_train, y_train) + + # We need X to be treated an numerical data, not pre-binned data. + X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32) + + pred_lightgbm = est_lightgbm.predict(X_train) + pred_sklearn = est_sklearn.predict(X_train) + assert np.mean(pred_sklearn == pred_lightgbm) > 0.89 + + acc_lightgbm = accuracy_score(y_train, pred_lightgbm) + acc_sklearn = accuracy_score(y_train, pred_sklearn) + np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn) + + if max_leaf_nodes < 10 and n_samples >= 1000: + pred_lightgbm = est_lightgbm.predict(X_test) + pred_sklearn = est_sklearn.predict(X_test) + assert np.mean(pred_sklearn == pred_lightgbm) > 0.89 + + acc_lightgbm = accuracy_score(y_test, pred_lightgbm) + acc_sklearn = accuracy_score(y_test, pred_sklearn) + np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2) + + +# TODO(1.8) remove the filterwarnings decorator +@pytest.mark.filterwarnings( + "ignore:'force_all_finite' was renamed to 'ensure_all_finite':FutureWarning" +) +@pytest.mark.parametrize("seed", range(5)) +@pytest.mark.parametrize("min_samples_leaf", (1, 20)) +@pytest.mark.parametrize( + "n_samples, max_leaf_nodes", + [ + (255, 4096), + (10000, 8), + ], +) +def test_same_predictions_multiclass_classification( + seed, min_samples_leaf, n_samples, max_leaf_nodes +): + # Same as test_same_predictions_regression but for classification + pytest.importorskip("lightgbm") + + rng = np.random.RandomState(seed=seed) + n_classes = 3 + max_iter = 1 + max_bins = 255 + lr = 1 + + X, y = make_classification( + n_samples=n_samples, + n_classes=n_classes, + n_features=5, + n_informative=5, + n_redundant=0, + n_clusters_per_class=1, + random_state=0, + ) + + if n_samples > 255: + # bin data and convert it to float32 so that the estimator doesn't + # treat it as pre-binned + X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32) + + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) + + est_sklearn = 
HistGradientBoostingClassifier( + loss="log_loss", + max_iter=max_iter, + max_bins=max_bins, + learning_rate=lr, + early_stopping=False, + min_samples_leaf=min_samples_leaf, + max_leaf_nodes=max_leaf_nodes, + ) + est_lightgbm = get_equivalent_estimator( + est_sklearn, lib="lightgbm", n_classes=n_classes + ) + + est_lightgbm.fit(X_train, y_train) + est_sklearn.fit(X_train, y_train) + + # We need X to be treated an numerical data, not pre-binned data. + X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32) + + pred_lightgbm = est_lightgbm.predict(X_train) + pred_sklearn = est_sklearn.predict(X_train) + assert np.mean(pred_sklearn == pred_lightgbm) > 0.89 + + proba_lightgbm = est_lightgbm.predict_proba(X_train) + proba_sklearn = est_sklearn.predict_proba(X_train) + # assert more than 75% of the predicted probabilities are the same up to + # the second decimal + assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > 0.75 + + acc_lightgbm = accuracy_score(y_train, pred_lightgbm) + acc_sklearn = accuracy_score(y_train, pred_sklearn) + + np.testing.assert_allclose(acc_lightgbm, acc_sklearn, rtol=0, atol=5e-2) + + if max_leaf_nodes < 10 and n_samples >= 1000: + pred_lightgbm = est_lightgbm.predict(X_test) + pred_sklearn = est_sklearn.predict(X_test) + assert np.mean(pred_sklearn == pred_lightgbm) > 0.89 + + proba_lightgbm = est_lightgbm.predict_proba(X_train) + proba_sklearn = est_sklearn.predict_proba(X_train) + # assert more than 75% of the predicted probabilities are the same up + # to the second decimal + assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > 0.75 + + acc_lightgbm = accuracy_score(y_test, pred_lightgbm) + acc_sklearn = accuracy_score(y_test, pred_sklearn) + np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py new file mode 100644 index 0000000000000000000000000000000000000000..7dde25f3d22dfc8e3a037e1d284d6e924a03986c --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -0,0 +1,1760 @@ +import copyreg +import io +import pickle +import re +import warnings +from unittest.mock import Mock + +import joblib +import numpy as np +import pytest +from joblib.numpy_pickle import NumpyPickler +from numpy.testing import assert_allclose, assert_array_equal + +import sklearn +from sklearn._loss.loss import ( + AbsoluteError, + HalfBinomialLoss, + HalfSquaredError, + PinballLoss, +) +from sklearn.base import BaseEstimator, TransformerMixin, clone, is_regressor +from sklearn.compose import make_column_transformer +from sklearn.datasets import make_classification, make_low_rank_matrix, make_regression +from sklearn.dummy import DummyRegressor +from sklearn.ensemble import ( + HistGradientBoostingClassifier, + HistGradientBoostingRegressor, +) +from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper +from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE +from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower +from sklearn.ensemble._hist_gradient_boosting.predictor import TreePredictor +from sklearn.exceptions import NotFittedError +from sklearn.metrics import get_scorer, mean_gamma_deviance, mean_poisson_deviance +from sklearn.model_selection import cross_val_score, train_test_split +from 
sklearn.pipeline import make_pipeline +from sklearn.preprocessing import KBinsDiscretizer, MinMaxScaler, OneHotEncoder +from sklearn.utils import check_random_state, shuffle +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads +from sklearn.utils._testing import _convert_container +from sklearn.utils.fixes import _IS_32BIT + +n_threads = _openmp_effective_n_threads() + +X_classification, y_classification = make_classification(random_state=0) +X_regression, y_regression = make_regression(random_state=0) +X_multi_classification, y_multi_classification = make_classification( + n_classes=3, n_informative=3, random_state=0 +) + + +def _make_dumb_dataset(n_samples): + """Make a dumb dataset to test early stopping.""" + rng = np.random.RandomState(42) + X_dumb = rng.randn(n_samples, 1) + y_dumb = (X_dumb[:, 0] > 0).astype("int64") + return X_dumb, y_dumb + + +@pytest.mark.parametrize( + "GradientBoosting, X, y", + [ + (HistGradientBoostingClassifier, X_classification, y_classification), + (HistGradientBoostingRegressor, X_regression, y_regression), + ], +) +@pytest.mark.parametrize( + "params, err_msg", + [ + ( + {"interaction_cst": [0, 1]}, + "Interaction constraints must be a sequence of tuples or lists", + ), + ( + {"interaction_cst": [{0, 9999}]}, + r"Interaction constraints must consist of integer indices in \[0," + r" n_features - 1\] = \[.*\], specifying the position of features,", + ), + ( + {"interaction_cst": [{-1, 0}]}, + r"Interaction constraints must consist of integer indices in \[0," + r" n_features - 1\] = \[.*\], specifying the position of features,", + ), + ( + {"interaction_cst": [{0.5}]}, + r"Interaction constraints must consist of integer indices in \[0," + r" n_features - 1\] = \[.*\], specifying the position of features,", + ), + ], +) +def test_init_parameters_validation(GradientBoosting, X, y, params, err_msg): + with pytest.raises(ValueError, match=err_msg): + GradientBoosting(**params).fit(X, y) + + +@pytest.mark.parametrize( + "scoring, validation_fraction, early_stopping, n_iter_no_change, tol", + [ + ("neg_mean_squared_error", 0.1, True, 5, 1e-7), # use scorer + ("neg_mean_squared_error", None, True, 5, 1e-1), # use scorer on train + (None, 0.1, True, 5, 1e-7), # same with default scorer + (None, None, True, 5, 1e-1), + ("loss", 0.1, True, 5, 1e-7), # use loss + ("loss", None, True, 5, 1e-1), # use loss on training data + (None, None, False, 5, 0.0), # no early stopping + ], +) +def test_early_stopping_regression( + scoring, validation_fraction, early_stopping, n_iter_no_change, tol +): + max_iter = 200 + + X, y = make_regression(n_samples=50, random_state=0) + + gb = HistGradientBoostingRegressor( + verbose=1, # just for coverage + min_samples_leaf=5, # easier to overfit fast + scoring=scoring, + tol=tol, + early_stopping=early_stopping, + validation_fraction=validation_fraction, + max_iter=max_iter, + n_iter_no_change=n_iter_no_change, + random_state=0, + ) + gb.fit(X, y) + + if early_stopping: + assert n_iter_no_change <= gb.n_iter_ < max_iter + else: + assert gb.n_iter_ == max_iter + + +@pytest.mark.parametrize( + "data", + ( + make_classification(n_samples=30, random_state=0), + make_classification( + n_samples=30, n_classes=3, n_clusters_per_class=1, random_state=0 + ), + ), +) +@pytest.mark.parametrize( + "scoring, validation_fraction, early_stopping, n_iter_no_change, tol", + [ + ("accuracy", 0.1, True, 5, 1e-7), # use scorer + ("accuracy", None, True, 5, 1e-1), # use scorer on training data + (None, 0.1, True, 5, 1e-7), # same with default 
scorer + (None, None, True, 5, 1e-1), + ("loss", 0.1, True, 5, 1e-7), # use loss + ("loss", None, True, 5, 1e-1), # use loss on training data + (None, None, False, 5, 0.0), # no early stopping + ], +) +def test_early_stopping_classification( + data, scoring, validation_fraction, early_stopping, n_iter_no_change, tol +): + max_iter = 50 + + X, y = data + + gb = HistGradientBoostingClassifier( + verbose=2, # just for coverage + min_samples_leaf=5, # easier to overfit fast + scoring=scoring, + tol=tol, + early_stopping=early_stopping, + validation_fraction=validation_fraction, + max_iter=max_iter, + n_iter_no_change=n_iter_no_change, + random_state=0, + ) + gb.fit(X, y) + + if early_stopping is True: + assert n_iter_no_change <= gb.n_iter_ < max_iter + else: + assert gb.n_iter_ == max_iter + + +@pytest.mark.parametrize( + "GradientBoosting, X, y", + [ + (HistGradientBoostingClassifier, *_make_dumb_dataset(10000)), + (HistGradientBoostingClassifier, *_make_dumb_dataset(10001)), + (HistGradientBoostingRegressor, *_make_dumb_dataset(10000)), + (HistGradientBoostingRegressor, *_make_dumb_dataset(10001)), + ], +) +def test_early_stopping_default(GradientBoosting, X, y): + # Test that early stopping is enabled by default if and only if there + # are more than 10000 samples + gb = GradientBoosting(max_iter=10, n_iter_no_change=2, tol=1e-1) + gb.fit(X, y) + if X.shape[0] > 10000: + assert gb.n_iter_ < gb.max_iter + else: + assert gb.n_iter_ == gb.max_iter + + +@pytest.mark.parametrize( + "scores, n_iter_no_change, tol, stopping", + [ + ([], 1, 0.001, False), # not enough iterations + ([1, 1, 1], 5, 0.001, False), # not enough iterations + ([1, 1, 1, 1, 1], 5, 0.001, False), # not enough iterations + ([1, 2, 3, 4, 5, 6], 5, 0.001, False), # significant improvement + ([1, 2, 3, 4, 5, 6], 5, 0.0, False), # significant improvement + ([1, 2, 3, 4, 5, 6], 5, 0.999, False), # significant improvement + ([1, 2, 3, 4, 5, 6], 5, 5 - 1e-5, False), # significant improvement + ([1] * 6, 5, 0.0, True), # no significant improvement + ([1] * 6, 5, 0.001, True), # no significant improvement + ([1] * 6, 5, 5, True), # no significant improvement + ], +) +def test_should_stop(scores, n_iter_no_change, tol, stopping): + gbdt = HistGradientBoostingClassifier(n_iter_no_change=n_iter_no_change, tol=tol) + assert gbdt._should_stop(scores) == stopping + + +def test_absolute_error(): + # For coverage only. + X, y = make_regression(n_samples=500, random_state=0) + gbdt = HistGradientBoostingRegressor(loss="absolute_error", random_state=0) + gbdt.fit(X, y) + assert gbdt.score(X, y) > 0.9 + + +def test_absolute_error_sample_weight(): + # non regression test for issue #19400 + # make sure no error is thrown during fit of + # HistGradientBoostingRegressor with absolute_error loss function + # and passing sample_weight + rng = np.random.RandomState(0) + n_samples = 100 + X = rng.uniform(-1, 1, size=(n_samples, 2)) + y = rng.uniform(-1, 1, size=n_samples) + sample_weight = rng.uniform(0, 1, size=n_samples) + gbdt = HistGradientBoostingRegressor(loss="absolute_error") + gbdt.fit(X, y, sample_weight=sample_weight) + + +@pytest.mark.parametrize("y", [([1.0, -2.0, 0.0]), ([0.0, 1.0, 2.0])]) +def test_gamma_y_positive(y): + # Test that ValueError is raised if any y_i <= 0. + err_msg = r"loss='gamma' requires strictly positive y." 
+ gbdt = HistGradientBoostingRegressor(loss="gamma", random_state=0) + with pytest.raises(ValueError, match=err_msg): + gbdt.fit(np.zeros(shape=(len(y), 1)), y) + + +def test_gamma(): + # For a Gamma distributed target, we expect an HGBT trained with the Gamma deviance + # (loss) to give better results than an HGBT with any other loss function, measured + # in out-of-sample Gamma deviance as metric/score. + # Note that squared error could potentially predict negative values which is + # invalid (np.inf) for the Gamma deviance. A Poisson HGBT (having a log link) + # does not have that defect. + # Important note: It seems that a Poisson HGBT almost always has better + # out-of-sample performance than the Gamma HGBT, measured in Gamma deviance. + # LightGBM shows the same behaviour. Hence, we only compare to a squared error + # HGBT, but not to a Poisson deviance HGBT. + rng = np.random.RandomState(42) + n_train, n_test, n_features = 500, 100, 20 + X = make_low_rank_matrix( + n_samples=n_train + n_test, + n_features=n_features, + random_state=rng, + ) + # We create a log-linear Gamma model. This gives y.min ~ 1e-2, y.max ~ 1e2 + coef = rng.uniform(low=-10, high=20, size=n_features) + # Numpy parametrizes gamma(shape=k, scale=theta) with mean = k * theta and + # variance = k * theta^2. We parametrize it instead with mean = exp(X @ coef) + # and variance = dispersion * mean^2 by setting k = 1 / dispersion, + # theta = dispersion * mean. + dispersion = 0.5 + y = rng.gamma(shape=1 / dispersion, scale=dispersion * np.exp(X @ coef)) + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=n_test, random_state=rng + ) + gbdt_gamma = HistGradientBoostingRegressor(loss="gamma", random_state=123) + gbdt_mse = HistGradientBoostingRegressor(loss="squared_error", random_state=123) + dummy = DummyRegressor(strategy="mean") + for model in (gbdt_gamma, gbdt_mse, dummy): + model.fit(X_train, y_train) + + for X, y in [(X_train, y_train), (X_test, y_test)]: + loss_gbdt_gamma = mean_gamma_deviance(y, gbdt_gamma.predict(X)) + # We restrict the squared error HGBT to predict at least the minimum seen y at + # train time to make it strictly positive. + loss_gbdt_mse = mean_gamma_deviance( + y, np.maximum(np.min(y_train), gbdt_mse.predict(X)) + ) + loss_dummy = mean_gamma_deviance(y, dummy.predict(X)) + assert loss_gbdt_gamma < loss_dummy + assert loss_gbdt_gamma < loss_gbdt_mse + + +@pytest.mark.parametrize("quantile", [0.2, 0.5, 0.8]) +def test_quantile_asymmetric_error(quantile): + """Test quantile regression for asymmetric distributed targets.""" + n_samples = 10_000 + rng = np.random.RandomState(42) + # take care that X @ coef + intercept > 0 + X = np.concatenate( + ( + np.abs(rng.randn(n_samples)[:, None]), + -rng.randint(2, size=(n_samples, 1)), + ), + axis=1, + ) + intercept = 1.23 + coef = np.array([0.5, -2]) + # For an exponential distribution with rate lambda, e.g. 
exp(-lambda * x), + # the quantile at level q is: + # quantile(q) = - log(1 - q) / lambda + # scale = 1/lambda = -quantile(q) / log(1-q) + y = rng.exponential( + scale=-(X @ coef + intercept) / np.log(1 - quantile), size=n_samples + ) + model = HistGradientBoostingRegressor( + loss="quantile", + quantile=quantile, + max_iter=25, + random_state=0, + max_leaf_nodes=10, + ).fit(X, y) + assert_allclose(np.mean(model.predict(X) > y), quantile, rtol=1e-2) + + pinball_loss = PinballLoss(quantile=quantile) + loss_true_quantile = pinball_loss(y, X @ coef + intercept) + loss_pred_quantile = pinball_loss(y, model.predict(X)) + # we are overfitting + assert loss_pred_quantile <= loss_true_quantile + + +@pytest.mark.parametrize("y", [([1.0, -2.0, 0.0]), ([0.0, 0.0, 0.0])]) +def test_poisson_y_positive(y): + # Test that ValueError is raised if either one y_i < 0 or sum(y_i) <= 0. + err_msg = r"loss='poisson' requires non-negative y and sum\(y\) > 0." + gbdt = HistGradientBoostingRegressor(loss="poisson", random_state=0) + with pytest.raises(ValueError, match=err_msg): + gbdt.fit(np.zeros(shape=(len(y), 1)), y) + + +def test_poisson(): + # For Poisson distributed target, Poisson loss should give better results + # than least squares measured in Poisson deviance as metric. + rng = np.random.RandomState(42) + n_train, n_test, n_features = 500, 100, 100 + X = make_low_rank_matrix( + n_samples=n_train + n_test, n_features=n_features, random_state=rng + ) + # We create a log-linear Poisson model and downscale coef as it will get + # exponentiated. + coef = rng.uniform(low=-2, high=2, size=n_features) / np.max(X, axis=0) + y = rng.poisson(lam=np.exp(X @ coef)) + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=n_test, random_state=rng + ) + gbdt_pois = HistGradientBoostingRegressor(loss="poisson", random_state=rng) + gbdt_ls = HistGradientBoostingRegressor(loss="squared_error", random_state=rng) + gbdt_pois.fit(X_train, y_train) + gbdt_ls.fit(X_train, y_train) + dummy = DummyRegressor(strategy="mean").fit(X_train, y_train) + + for X, y in [(X_train, y_train), (X_test, y_test)]: + metric_pois = mean_poisson_deviance(y, gbdt_pois.predict(X)) + # squared_error might produce non-positive predictions => clip + metric_ls = mean_poisson_deviance(y, np.clip(gbdt_ls.predict(X), 1e-15, None)) + metric_dummy = mean_poisson_deviance(y, dummy.predict(X)) + assert metric_pois < metric_ls + assert metric_pois < metric_dummy + + +def test_binning_train_validation_are_separated(): + # Make sure training and validation data are binned separately. + # See issue 13926 + + rng = np.random.RandomState(0) + validation_fraction = 0.2 + gb = HistGradientBoostingClassifier( + early_stopping=True, validation_fraction=validation_fraction, random_state=rng + ) + gb.fit(X_classification, y_classification) + mapper_training_data = gb._bin_mapper + + # Note that since the data is small there is no subsampling and the + # random_state doesn't matter + mapper_whole_data = _BinMapper(random_state=0) + mapper_whole_data.fit(X_classification) + + n_samples = X_classification.shape[0] + assert np.all( + mapper_training_data.n_bins_non_missing_ + == int((1 - validation_fraction) * n_samples) + ) + assert np.all( + mapper_training_data.n_bins_non_missing_ + != mapper_whole_data.n_bins_non_missing_ + ) + + +def test_missing_values_trivial(): + # sanity check for missing values support. With only one feature and + # y == isnan(X), the gbdt is supposed to reach perfect accuracy on the + # training set. 
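Editor's note: the clipping applied to the squared-error model above is needed because the Poisson deviance is only defined for strictly positive predictions, while a squared-error model can predict negative values. A small illustration with made-up numbers:

    import numpy as np
    from sklearn.metrics import mean_poisson_deviance

    y_true = np.array([0.0, 1.0, 3.0])
    y_pred = np.array([-0.2, 1.1, 2.5])       # a squared-error model may go negative
    y_pred = np.clip(y_pred, 1e-15, None)     # make the predictions valid
    print(mean_poisson_deviance(y_true, y_pred))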
+ + n_samples = 100 + n_features = 1 + rng = np.random.RandomState(0) + + X = rng.normal(size=(n_samples, n_features)) + mask = rng.binomial(1, 0.5, size=X.shape).astype(bool) + X[mask] = np.nan + y = mask.ravel() + gb = HistGradientBoostingClassifier() + gb.fit(X, y) + + assert gb.score(X, y) == pytest.approx(1) + + +@pytest.mark.parametrize("problem", ("classification", "regression")) +@pytest.mark.parametrize( + ( + "missing_proportion, expected_min_score_classification, " + "expected_min_score_regression" + ), + [(0.1, 0.97, 0.89), (0.2, 0.93, 0.81), (0.5, 0.79, 0.52)], +) +def test_missing_values_resilience( + problem, + missing_proportion, + expected_min_score_classification, + expected_min_score_regression, +): + # Make sure the estimators can deal with missing values and still yield + # decent predictions + + rng = np.random.RandomState(0) + n_samples = 1000 + n_features = 2 + if problem == "regression": + X, y = make_regression( + n_samples=n_samples, + n_features=n_features, + n_informative=n_features, + random_state=rng, + ) + gb = HistGradientBoostingRegressor() + expected_min_score = expected_min_score_regression + else: + X, y = make_classification( + n_samples=n_samples, + n_features=n_features, + n_informative=n_features, + n_redundant=0, + n_repeated=0, + random_state=rng, + ) + gb = HistGradientBoostingClassifier() + expected_min_score = expected_min_score_classification + + mask = rng.binomial(1, missing_proportion, size=X.shape).astype(bool) + X[mask] = np.nan + + gb.fit(X, y) + + assert gb.score(X, y) > expected_min_score + + +@pytest.mark.parametrize( + "data", + [ + make_classification(random_state=0, n_classes=2), + make_classification(random_state=0, n_classes=3, n_informative=3), + ], + ids=["binary_log_loss", "multiclass_log_loss"], +) +def test_zero_division_hessians(data): + # non regression test for issue #14018 + # make sure we avoid zero division errors when computing the leaves values. + + # If the learning rate is too high, the raw predictions are bad and will + # saturate the softmax (or sigmoid in binary classif). This leads to + # probabilities being exactly 0 or 1, gradients being constant, and + # hessians being zero. 
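Editor's note: the saturation effect described just above can be made concrete: for the binomial log loss the per-sample hessian is p * (1 - p) with p = sigmoid(raw_prediction), so once raw predictions are pushed far from zero the hessians underflow to zero. A small numeric illustration:

    import numpy as np

    def sigmoid(raw):
        return 1.0 / (1.0 + np.exp(-raw))

    raw_predictions = np.array([0.0, 5.0, 50.0])   # increasingly saturated
    p = sigmoid(raw_predictions)
    print(p * (1.0 - p))   # ~[2.5e-01, 6.6e-03, 1.9e-22]: vanishing hessians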
+ X, y = data + gb = HistGradientBoostingClassifier(learning_rate=100, max_iter=10) + gb.fit(X, y) + + +def test_small_trainset(): + # Make sure that the small trainset is stratified and has the expected + # length (10k samples) + n_samples = 20000 + original_distrib = {0: 0.1, 1: 0.2, 2: 0.3, 3: 0.4} + rng = np.random.RandomState(42) + X = rng.randn(n_samples).reshape(n_samples, 1) + y = [ + [class_] * int(prop * n_samples) for (class_, prop) in original_distrib.items() + ] + y = shuffle(np.concatenate(y)) + gb = HistGradientBoostingClassifier() + + # Compute the small training set + X_small, y_small, *_ = gb._get_small_trainset( + X, y, seed=42, sample_weight_train=None + ) + + # Compute the class distribution in the small training set + unique, counts = np.unique(y_small, return_counts=True) + small_distrib = {class_: count / 10000 for (class_, count) in zip(unique, counts)} + + # Test that the small training set has the expected length + assert X_small.shape[0] == 10000 + assert y_small.shape[0] == 10000 + + # Test that the class distributions in the whole dataset and in the small + # training set are identical + assert small_distrib == pytest.approx(original_distrib) + + +def test_missing_values_minmax_imputation(): + # Compare the buit-in missing value handling of Histogram GBC with an + # a-priori missing value imputation strategy that should yield the same + # results in terms of decision function. + # + # Each feature (containing NaNs) is replaced by 2 features: + # - one where the nans are replaced by min(feature) - 1 + # - one where the nans are replaced by max(feature) + 1 + # A split where nans go to the left has an equivalent split in the + # first (min) feature, and a split where nans go to the right has an + # equivalent split in the second (max) feature. + # + # Assuming the data is such that there is never a tie to select the best + # feature to split on during training, the learned decision trees should be + # strictly equivalent (learn a sequence of splits that encode the same + # decision function). 
+ # + # The MinMaxImputer transformer is meant to be a toy implementation of the + # "Missing In Attributes" (MIA) missing value handling for decision trees + # https://www.sciencedirect.com/science/article/abs/pii/S0167865508000305 + # The implementation of MIA as an imputation transformer was suggested by + # "Remark 3" in :arxiv:'<1902.06931>` + + class MinMaxImputer(TransformerMixin, BaseEstimator): + def fit(self, X, y=None): + mm = MinMaxScaler().fit(X) + self.data_min_ = mm.data_min_ + self.data_max_ = mm.data_max_ + return self + + def transform(self, X): + X_min, X_max = X.copy(), X.copy() + + for feature_idx in range(X.shape[1]): + nan_mask = np.isnan(X[:, feature_idx]) + X_min[nan_mask, feature_idx] = self.data_min_[feature_idx] - 1 + X_max[nan_mask, feature_idx] = self.data_max_[feature_idx] + 1 + + return np.concatenate([X_min, X_max], axis=1) + + def make_missing_value_data(n_samples=int(1e4), seed=0): + rng = np.random.RandomState(seed) + X, y = make_regression(n_samples=n_samples, n_features=4, random_state=rng) + + # Pre-bin the data to ensure a deterministic handling by the 2 + # strategies and also make it easier to insert np.nan in a structured + # way: + X = KBinsDiscretizer( + n_bins=42, encode="ordinal", quantile_method="averaged_inverted_cdf" + ).fit_transform(X) + + # First feature has missing values completely at random: + rnd_mask = rng.rand(X.shape[0]) > 0.9 + X[rnd_mask, 0] = np.nan + + # Second and third features have missing values for extreme values + # (censoring missingness): + low_mask = X[:, 1] == 0 + X[low_mask, 1] = np.nan + + high_mask = X[:, 2] == X[:, 2].max() + X[high_mask, 2] = np.nan + + # Make the last feature nan pattern very informative: + y_max = np.percentile(y, 70) + y_max_mask = y >= y_max + y[y_max_mask] = y_max + X[y_max_mask, 3] = np.nan + + # Check that there is at least one missing value in each feature: + for feature_idx in range(X.shape[1]): + assert any(np.isnan(X[:, feature_idx])) + + # Let's use a test set to check that the learned decision function is + # the same as evaluated on unseen data. Otherwise it could just be the + # case that we find two independent ways to overfit the training set. + return train_test_split(X, y, random_state=rng) + + # n_samples need to be large enough to minimize the likelihood of having + # several candidate splits with the same gain value in a given tree. + X_train, X_test, y_train, y_test = make_missing_value_data( + n_samples=int(1e4), seed=0 + ) + + # Use a small number of leaf nodes and iterations so as to keep + # under-fitting models to minimize the likelihood of ties when training the + # model. + gbm1 = HistGradientBoostingRegressor(max_iter=100, max_leaf_nodes=5, random_state=0) + gbm1.fit(X_train, y_train) + + gbm2 = make_pipeline(MinMaxImputer(), clone(gbm1)) + gbm2.fit(X_train, y_train) + + # Check that the model reach the same score: + assert gbm1.score(X_train, y_train) == pytest.approx(gbm2.score(X_train, y_train)) + + assert gbm1.score(X_test, y_test) == pytest.approx(gbm2.score(X_test, y_test)) + + # Check the individual prediction match as a finer grained + # decision function check. 
+ assert_allclose(gbm1.predict(X_train), gbm2.predict(X_train)) + assert_allclose(gbm1.predict(X_test), gbm2.predict(X_test)) + + +def test_infinite_values(): + # Basic test for infinite values + + X = np.array([-np.inf, 0, 1, np.inf]).reshape(-1, 1) + y = np.array([0, 0, 1, 1]) + + gbdt = HistGradientBoostingRegressor(min_samples_leaf=1) + gbdt.fit(X, y) + np.testing.assert_allclose(gbdt.predict(X), y, atol=1e-4) + + +def test_consistent_lengths(): + X = np.array([-np.inf, 0, 1, np.inf]).reshape(-1, 1) + y = np.array([0, 0, 1, 1]) + sample_weight = np.array([0.1, 0.3, 0.1]) + gbdt = HistGradientBoostingRegressor() + with pytest.raises(ValueError, match=r"sample_weight.shape == \(3,\), expected"): + gbdt.fit(X, y, sample_weight) + + with pytest.raises( + ValueError, match="Found input variables with inconsistent number" + ): + gbdt.fit(X, y[1:]) + + +def test_infinite_values_missing_values(): + # High level test making sure that inf and nan values are properly handled + # when both are present. This is similar to + # test_split_on_nan_with_infinite_values() in test_grower.py, though we + # cannot check the predictions for binned values here. + + X = np.asarray([-np.inf, 0, 1, np.inf, np.nan]).reshape(-1, 1) + y_isnan = np.isnan(X.ravel()) + y_isinf = X.ravel() == np.inf + + stump_clf = HistGradientBoostingClassifier( + min_samples_leaf=1, max_iter=1, learning_rate=1, max_depth=2 + ) + + assert stump_clf.fit(X, y_isinf).score(X, y_isinf) == 1 + assert stump_clf.fit(X, y_isnan).score(X, y_isnan) == 1 + + +@pytest.mark.parametrize("scoring", [None, "loss"]) +def test_string_target_early_stopping(scoring): + # Regression tests for #14709 where the targets need to be encoded before + # to compute the score + rng = np.random.RandomState(42) + X = rng.randn(100, 10) + y = np.array(["x"] * 50 + ["y"] * 50, dtype=object) + gbrt = HistGradientBoostingClassifier(n_iter_no_change=10, scoring=scoring) + gbrt.fit(X, y) + + +def test_zero_sample_weights_regression(): + # Make sure setting a SW to zero amounts to ignoring the corresponding + # sample + + X = [[1, 0], [1, 0], [1, 0], [0, 1]] + y = [0, 0, 1, 0] + # ignore the first 2 training samples by setting their weight to 0 + sample_weight = [0, 0, 1, 1] + gb = HistGradientBoostingRegressor(min_samples_leaf=1) + gb.fit(X, y, sample_weight=sample_weight) + assert gb.predict([[1, 0]])[0] > 0.5 + + +def test_zero_sample_weights_classification(): + # Make sure setting a SW to zero amounts to ignoring the corresponding + # sample + + X = [[1, 0], [1, 0], [1, 0], [0, 1]] + y = [0, 0, 1, 0] + # ignore the first 2 training samples by setting their weight to 0 + sample_weight = [0, 0, 1, 1] + gb = HistGradientBoostingClassifier(loss="log_loss", min_samples_leaf=1) + gb.fit(X, y, sample_weight=sample_weight) + assert_array_equal(gb.predict([[1, 0]]), [1]) + + X = [[1, 0], [1, 0], [1, 0], [0, 1], [1, 1]] + y = [0, 0, 1, 0, 2] + # ignore the first 2 training samples by setting their weight to 0 + sample_weight = [0, 0, 1, 1, 1] + gb = HistGradientBoostingClassifier(loss="log_loss", min_samples_leaf=1) + gb.fit(X, y, sample_weight=sample_weight) + assert_array_equal(gb.predict([[1, 0]]), [1]) + + +@pytest.mark.parametrize( + "problem", ("regression", "binary_classification", "multiclass_classification") +) +@pytest.mark.parametrize("duplication", ("half", "all")) +def test_sample_weight_effect(problem, duplication): + # High level test to make sure that duplicating a sample is equivalent to + # giving it weight of 2. 
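Editor's note: the equivalence exercised here, a sample with weight 2 behaving like the same sample duplicated, also holds on the public API as long as the dataset is small enough that binning is unaffected. A toy sketch, with data and parameters chosen purely for illustration:

    import numpy as np
    from sklearn.ensemble import HistGradientBoostingRegressor

    X = np.array([[0.0], [1.0], [2.0], [3.0]])
    y = np.array([0.0, 0.0, 1.0, 1.0])

    est_weighted = HistGradientBoostingRegressor(min_samples_leaf=1, random_state=0)
    est_weighted.fit(X, y, sample_weight=[2, 1, 1, 1])

    est_duplicated = HistGradientBoostingRegressor(min_samples_leaf=1, random_state=0)
    est_duplicated.fit(np.r_[X, X[:1]], np.r_[y, y[:1]])

    assert np.allclose(est_weighted.predict(X), est_duplicated.predict(X))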
+ + # fails for n_samples > 255 because binning does not take sample weights + # into account. Keeping n_samples <= 255 makes + # sure only unique values are used so SW have no effect on binning. + n_samples = 255 + n_features = 2 + if problem == "regression": + X, y = make_regression( + n_samples=n_samples, + n_features=n_features, + n_informative=n_features, + random_state=0, + ) + Klass = HistGradientBoostingRegressor + else: + n_classes = 2 if problem == "binary_classification" else 3 + X, y = make_classification( + n_samples=n_samples, + n_features=n_features, + n_informative=n_features, + n_redundant=0, + n_clusters_per_class=1, + n_classes=n_classes, + random_state=0, + ) + Klass = HistGradientBoostingClassifier + + # This test can't pass if min_samples_leaf > 1 because that would force 2 + # samples to be in the same node in est_sw, while these samples would be + # free to be separate in est_dup: est_dup would just group together the + # duplicated samples. + est = Klass(min_samples_leaf=1) + + # Create dataset with duplicate and corresponding sample weights + if duplication == "half": + lim = n_samples // 2 + else: + lim = n_samples + X_dup = np.r_[X, X[:lim]] + y_dup = np.r_[y, y[:lim]] + sample_weight = np.ones(shape=(n_samples)) + sample_weight[:lim] = 2 + + est_sw = clone(est).fit(X, y, sample_weight=sample_weight) + est_dup = clone(est).fit(X_dup, y_dup) + + # checking raw_predict is stricter than just predict for classification + assert np.allclose(est_sw._raw_predict(X_dup), est_dup._raw_predict(X_dup)) + + +@pytest.mark.parametrize("Loss", (HalfSquaredError, AbsoluteError)) +def test_sum_hessians_are_sample_weight(Loss): + # For losses with constant hessians, the sum_hessians field of the + # histograms must be equal to the sum of the sample weight of samples at + # the corresponding bin. + + rng = np.random.RandomState(0) + n_samples = 1000 + n_features = 2 + X, y = make_regression(n_samples=n_samples, n_features=n_features, random_state=rng) + bin_mapper = _BinMapper() + X_binned = bin_mapper.fit_transform(X) + + # While sample weights are supposed to be positive, this still works. + sample_weight = rng.normal(size=n_samples) + + loss = Loss(sample_weight=sample_weight) + gradients, hessians = loss.init_gradient_and_hessian( + n_samples=n_samples, dtype=G_H_DTYPE + ) + gradients, hessians = gradients.reshape((-1, 1)), hessians.reshape((-1, 1)) + raw_predictions = rng.normal(size=(n_samples, 1)) + loss.gradient_hessian( + y_true=y, + raw_prediction=raw_predictions, + sample_weight=sample_weight, + gradient_out=gradients, + hessian_out=hessians, + n_threads=n_threads, + ) + + # build sum_sample_weight which contains the sum of the sample weights at + # each bin (for each feature). 
This must be equal to the sum_hessians + # field of the corresponding histogram + sum_sw = np.zeros(shape=(n_features, bin_mapper.n_bins)) + for feature_idx in range(n_features): + for sample_idx in range(n_samples): + sum_sw[feature_idx, X_binned[sample_idx, feature_idx]] += sample_weight[ + sample_idx + ] + + # Build histogram + grower = TreeGrower( + X_binned, gradients[:, 0], hessians[:, 0], n_bins=bin_mapper.n_bins + ) + histograms = grower.histogram_builder.compute_histograms_brute( + grower.root.sample_indices + ) + + for feature_idx in range(n_features): + for bin_idx in range(bin_mapper.n_bins): + assert histograms[feature_idx, bin_idx]["sum_hessians"] == ( + pytest.approx(sum_sw[feature_idx, bin_idx], rel=1e-5) + ) + + +def test_max_depth_max_leaf_nodes(): + # Non regression test for + # https://github.com/scikit-learn/scikit-learn/issues/16179 + # there was a bug when the max_depth and the max_leaf_nodes criteria were + # met at the same time, which would lead to max_leaf_nodes not being + # respected. + X, y = make_classification(random_state=0) + est = HistGradientBoostingClassifier(max_depth=2, max_leaf_nodes=3, max_iter=1).fit( + X, y + ) + tree = est._predictors[0][0] + assert tree.get_max_depth() == 2 + assert tree.get_n_leaf_nodes() == 3 # would be 4 prior to bug fix + + +def test_early_stopping_on_test_set_with_warm_start(): + # Non regression test for #16661 where second fit fails with + # warm_start=True, early_stopping is on, and no validation set + X, y = make_classification(random_state=0) + gb = HistGradientBoostingClassifier( + max_iter=1, + scoring="loss", + warm_start=True, + early_stopping=True, + n_iter_no_change=1, + validation_fraction=None, + ) + + gb.fit(X, y) + # does not raise on second call + gb.set_params(max_iter=2) + gb.fit(X, y) + + +def test_early_stopping_with_sample_weights(monkeypatch): + """Check that sample weights is passed in to the scorer and _raw_predict is not + called.""" + + mock_scorer = Mock(side_effect=get_scorer("neg_median_absolute_error")) + + def mock_check_scoring(estimator, scoring): + assert scoring == "neg_median_absolute_error" + return mock_scorer + + monkeypatch.setattr( + sklearn.ensemble._hist_gradient_boosting.gradient_boosting, + "check_scoring", + mock_check_scoring, + ) + + X, y = make_regression(random_state=0) + sample_weight = np.ones_like(y) + hist = HistGradientBoostingRegressor( + max_iter=2, + early_stopping=True, + random_state=0, + scoring="neg_median_absolute_error", + ) + mock_raw_predict = Mock(side_effect=hist._raw_predict) + hist._raw_predict = mock_raw_predict + hist.fit(X, y, sample_weight=sample_weight) + + # _raw_predict should never be called with scoring as a string + assert mock_raw_predict.call_count == 0 + + # For scorer is called twice (train and val) for the baseline score, and twice + # per iteration (train and val) after that. So 6 times in total for `max_iter=2`. 
+ assert mock_scorer.call_count == 6 + for arg_list in mock_scorer.call_args_list: + assert "sample_weight" in arg_list[1] + + +def test_raw_predict_is_called_with_custom_scorer(): + """Custom scorer will still call _raw_predict.""" + + mock_scorer = Mock(side_effect=get_scorer("neg_median_absolute_error")) + + X, y = make_regression(random_state=0) + hist = HistGradientBoostingRegressor( + max_iter=2, + early_stopping=True, + random_state=0, + scoring=mock_scorer, + ) + mock_raw_predict = Mock(side_effect=hist._raw_predict) + hist._raw_predict = mock_raw_predict + hist.fit(X, y) + + # `_raw_predict` and scorer is called twice (train and val) for the baseline score, + # and twice per iteration (train and val) after that. So 6 times in total for + # `max_iter=2`. + assert mock_raw_predict.call_count == 6 + assert mock_scorer.call_count == 6 + + +@pytest.mark.parametrize( + "Est", (HistGradientBoostingClassifier, HistGradientBoostingRegressor) +) +def test_single_node_trees(Est): + # Make sure it's still possible to build single-node trees. In that case + # the value of the root is set to 0. That's a correct value: if the tree is + # single-node that's because min_gain_to_split is not respected right from + # the root, so we don't want the tree to have any impact on the + # predictions. + + X, y = make_classification(random_state=0) + y[:] = 1 # constant target will lead to a single root node + + est = Est(max_iter=20) + est.fit(X, y) + + assert all(len(predictor[0].nodes) == 1 for predictor in est._predictors) + assert all(predictor[0].nodes[0]["value"] == 0 for predictor in est._predictors) + # Still gives correct predictions thanks to the baseline prediction + assert_allclose(est.predict(X), y) + + +@pytest.mark.parametrize( + "Est, loss, X, y", + [ + ( + HistGradientBoostingClassifier, + HalfBinomialLoss(sample_weight=None), + X_classification, + y_classification, + ), + ( + HistGradientBoostingRegressor, + HalfSquaredError(sample_weight=None), + X_regression, + y_regression, + ), + ], +) +def test_custom_loss(Est, loss, X, y): + est = Est(loss=loss, max_iter=20) + est.fit(X, y) + + +@pytest.mark.parametrize( + "HistGradientBoosting, X, y", + [ + (HistGradientBoostingClassifier, X_classification, y_classification), + (HistGradientBoostingRegressor, X_regression, y_regression), + ( + HistGradientBoostingClassifier, + X_multi_classification, + y_multi_classification, + ), + ], +) +def test_staged_predict(HistGradientBoosting, X, y): + # Test whether staged predictor eventually gives + # the same prediction. + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.5, random_state=0 + ) + gb = HistGradientBoosting(max_iter=10) + + # test raise NotFittedError if not fitted + with pytest.raises(NotFittedError): + next(gb.staged_predict(X_test)) + + gb.fit(X_train, y_train) + + # test if the staged predictions of each iteration + # are equal to the corresponding predictions of the same estimator + # trained from scratch. 
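Editor's note: the property checked in the loop that follows is that staged_predict yields one prediction array per completed boosting iteration, the last of which matches the final predict. A minimal usage sketch on the public API:

    import numpy as np
    from sklearn.datasets import make_regression
    from sklearn.ensemble import HistGradientBoostingRegressor

    X, y = make_regression(n_samples=100, random_state=0)
    gb = HistGradientBoostingRegressor(max_iter=5, random_state=0).fit(X, y)

    staged = list(gb.staged_predict(X))
    assert len(staged) == gb.n_iter_                 # one array per iteration
    assert np.allclose(staged[-1], gb.predict(X))    # last stage == final model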
+ # this also test limit case when max_iter = 1 + method_names = ( + ["predict"] + if is_regressor(gb) + else ["predict", "predict_proba", "decision_function"] + ) + for method_name in method_names: + staged_method = getattr(gb, "staged_" + method_name) + staged_predictions = list(staged_method(X_test)) + assert len(staged_predictions) == gb.n_iter_ + for n_iter, staged_predictions in enumerate(staged_method(X_test), 1): + aux = HistGradientBoosting(max_iter=n_iter) + aux.fit(X_train, y_train) + pred_aux = getattr(aux, method_name)(X_test) + + assert_allclose(staged_predictions, pred_aux) + assert staged_predictions.shape == pred_aux.shape + + +@pytest.mark.parametrize("insert_missing", [False, True]) +@pytest.mark.parametrize( + "Est", (HistGradientBoostingRegressor, HistGradientBoostingClassifier) +) +@pytest.mark.parametrize("bool_categorical_parameter", [True, False]) +@pytest.mark.parametrize("missing_value", [np.nan, -1]) +def test_unknown_categories_nan( + insert_missing, Est, bool_categorical_parameter, missing_value +): + # Make sure no error is raised at predict if a category wasn't seen during + # fit. We also make sure they're treated as nans. + + rng = np.random.RandomState(0) + n_samples = 1000 + f1 = rng.rand(n_samples) + f2 = rng.randint(4, size=n_samples) + X = np.c_[f1, f2] + y = np.zeros(shape=n_samples) + y[X[:, 1] % 2 == 0] = 1 + + if bool_categorical_parameter: + categorical_features = [False, True] + else: + categorical_features = [1] + + if insert_missing: + mask = rng.binomial(1, 0.01, size=X.shape).astype(bool) + assert mask.sum() > 0 + X[mask] = missing_value + + est = Est(max_iter=20, categorical_features=categorical_features).fit(X, y) + assert_array_equal(est.is_categorical_, [False, True]) + + # Make sure no error is raised on unknown categories and nans + # unknown categories will be treated as nans + X_test = np.zeros((10, X.shape[1]), dtype=float) + X_test[:5, 1] = 30 + X_test[5:, 1] = missing_value + assert len(np.unique(est.predict(X_test))) == 1 + + +def test_categorical_encoding_strategies(): + # Check native categorical handling vs different encoding strategies. We + # make sure that native encoding needs only 1 split to achieve a perfect + # prediction on a simple dataset. In contrast, OneHotEncoded data needs + # more depth / splits, and treating categories as ordered (just using + # OrdinalEncoder) requires even more depth. + + # dataset with one random continuous feature, and one categorical feature + # with values in [0, 5], e.g. from an OrdinalEncoder. 
+ # class == 1 iff categorical value in {0, 2, 4} + rng = np.random.RandomState(0) + n_samples = 10_000 + f1 = rng.rand(n_samples) + f2 = rng.randint(6, size=n_samples) + X = np.c_[f1, f2] + y = np.zeros(shape=n_samples) + y[X[:, 1] % 2 == 0] = 1 + + # make sure dataset is balanced so that the baseline_prediction doesn't + # influence predictions too much with max_iter = 1 + assert 0.49 < y.mean() < 0.51 + + native_cat_specs = [ + [False, True], + [1], + ] + try: + import pandas as pd + + X = pd.DataFrame(X, columns=["f_0", "f_1"]) + native_cat_specs.append(["f_1"]) + except ImportError: + pass + + for native_cat_spec in native_cat_specs: + clf_cat = HistGradientBoostingClassifier( + max_iter=1, max_depth=1, categorical_features=native_cat_spec + ) + clf_cat.fit(X, y) + + # Using native categorical encoding, we get perfect predictions with just + # one split + assert cross_val_score(clf_cat, X, y).mean() == 1 + + # quick sanity check for the bitset: 0, 2, 4 = 2**0 + 2**2 + 2**4 = 21 + expected_left_bitset = [21, 0, 0, 0, 0, 0, 0, 0] + left_bitset = clf_cat.fit(X, y)._predictors[0][0].raw_left_cat_bitsets[0] + assert_array_equal(left_bitset, expected_left_bitset) + + # Treating categories as ordered, we need more depth / more splits to get + # the same predictions + clf_no_cat = HistGradientBoostingClassifier( + max_iter=1, max_depth=4, categorical_features=None + ) + assert cross_val_score(clf_no_cat, X, y).mean() < 0.9 + + clf_no_cat.set_params(max_depth=5) + assert cross_val_score(clf_no_cat, X, y).mean() == 1 + + # Using OHEd data, we need less splits than with pure OEd data, but we + # still need more splits than with the native categorical splits + ct = make_column_transformer( + (OneHotEncoder(sparse_output=False), [1]), remainder="passthrough" + ) + X_ohe = ct.fit_transform(X) + clf_no_cat.set_params(max_depth=2) + assert cross_val_score(clf_no_cat, X_ohe, y).mean() < 0.9 + + clf_no_cat.set_params(max_depth=3) + assert cross_val_score(clf_no_cat, X_ohe, y).mean() == 1 + + +@pytest.mark.parametrize( + "Est", (HistGradientBoostingClassifier, HistGradientBoostingRegressor) +) +@pytest.mark.parametrize( + "categorical_features, monotonic_cst, expected_msg", + [ + ( + [b"hello", b"world"], + None, + re.escape( + "categorical_features must be an array-like of bool, int or str, " + "got: bytes40." + ), + ), + ( + np.array([b"hello", 1.3], dtype=object), + None, + re.escape( + "categorical_features must be an array-like of bool, int or str, " + "got: bytes, float." 
+ ), + ), + ( + [0, -1], + None, + re.escape( + "categorical_features set as integer indices must be in " + "[0, n_features - 1]" + ), + ), + ( + [True, True, False, False, True], + None, + re.escape( + "categorical_features set as a boolean mask must have shape " + "(n_features,)" + ), + ), + ( + [True, True, False, False], + [0, -1, 0, 1], + "Categorical features cannot have monotonic constraints", + ), + ], +) +def test_categorical_spec_errors( + Est, categorical_features, monotonic_cst, expected_msg +): + # Test errors when categories are specified incorrectly + n_samples = 100 + X, y = make_classification(random_state=0, n_features=4, n_samples=n_samples) + rng = np.random.RandomState(0) + X[:, 0] = rng.randint(0, 10, size=n_samples) + X[:, 1] = rng.randint(0, 10, size=n_samples) + est = Est(categorical_features=categorical_features, monotonic_cst=monotonic_cst) + + with pytest.raises(ValueError, match=expected_msg): + est.fit(X, y) + + +@pytest.mark.parametrize( + "Est", (HistGradientBoostingClassifier, HistGradientBoostingRegressor) +) +def test_categorical_spec_errors_with_feature_names(Est): + pd = pytest.importorskip("pandas") + n_samples = 10 + X = pd.DataFrame( + { + "f0": range(n_samples), + "f1": range(n_samples), + "f2": [1.0] * n_samples, + } + ) + y = [0, 1] * (n_samples // 2) + + est = Est(categorical_features=["f0", "f1", "f3"]) + expected_msg = re.escape( + "categorical_features has a item value 'f3' which is not a valid " + "feature name of the training data." + ) + with pytest.raises(ValueError, match=expected_msg): + est.fit(X, y) + + est = Est(categorical_features=["f0", "f1"]) + expected_msg = re.escape( + "categorical_features should be passed as an array of integers or " + "as a boolean mask when the model is fitted on data without feature " + "names." + ) + with pytest.raises(ValueError, match=expected_msg): + est.fit(X.to_numpy(), y) + + +@pytest.mark.parametrize( + "Est", (HistGradientBoostingClassifier, HistGradientBoostingRegressor) +) +@pytest.mark.parametrize("categorical_features", ([False, False], [])) +@pytest.mark.parametrize("as_array", (True, False)) +def test_categorical_spec_no_categories(Est, categorical_features, as_array): + # Make sure we can properly detect that no categorical features are present + # even if the categorical_features parameter is not None + X = np.arange(10).reshape(5, 2) + y = np.arange(5) + if as_array: + categorical_features = np.asarray(categorical_features) + est = Est(categorical_features=categorical_features).fit(X, y) + assert est.is_categorical_ is None + + +@pytest.mark.parametrize( + "Est", (HistGradientBoostingClassifier, HistGradientBoostingRegressor) +) +@pytest.mark.parametrize( + "use_pandas, feature_name", [(False, "at index 0"), (True, "'f0'")] +) +def test_categorical_bad_encoding_errors(Est, use_pandas, feature_name): + # Test errors when categories are encoded incorrectly + + gb = Est(categorical_features=[True], max_bins=2) + + if use_pandas: + pd = pytest.importorskip("pandas") + X = pd.DataFrame({"f0": [0, 1, 2]}) + else: + X = np.array([[0, 1, 2]]).T + y = np.arange(3) + msg = ( + f"Categorical feature {feature_name} is expected to have a " + "cardinality <= 2 but actually has a cardinality of 3." 
+ ) + with pytest.raises(ValueError, match=msg): + gb.fit(X, y) + + # nans are ignored in the counts + X = np.array([[0, 1, np.nan]]).T + y = np.arange(3) + gb.fit(X, y) + + +@pytest.mark.parametrize( + "Est", (HistGradientBoostingClassifier, HistGradientBoostingRegressor) +) +def test_uint8_predict(Est): + # Non regression test for + # https://github.com/scikit-learn/scikit-learn/issues/18408 + # Make sure X can be of dtype uint8 (i.e. X_BINNED_DTYPE) in predict. It + # will be converted to X_DTYPE. + + rng = np.random.RandomState(0) + + X = rng.randint(0, 100, size=(10, 2)).astype(np.uint8) + y = rng.randint(0, 2, size=10).astype(np.uint8) + est = Est() + est.fit(X, y) + est.predict(X) + + +@pytest.mark.parametrize( + "interaction_cst, n_features, result", + [ + (None, 931, None), + ([{0, 1}], 2, [{0, 1}]), + ("pairwise", 2, [{0, 1}]), + ("pairwise", 4, [{0, 1}, {0, 2}, {0, 3}, {1, 2}, {1, 3}, {2, 3}]), + ("no_interactions", 2, [{0}, {1}]), + ("no_interactions", 4, [{0}, {1}, {2}, {3}]), + ([(1, 0), [5, 1]], 6, [{0, 1}, {1, 5}, {2, 3, 4}]), + ], +) +def test_check_interaction_cst(interaction_cst, n_features, result): + """Check that _check_interaction_cst returns the expected list of sets""" + est = HistGradientBoostingRegressor() + est.set_params(interaction_cst=interaction_cst) + assert est._check_interaction_cst(n_features) == result + + +def test_interaction_cst_numerically(): + """Check that interaction constraints have no forbidden interactions.""" + rng = np.random.RandomState(42) + n_samples = 1000 + X = rng.uniform(size=(n_samples, 2)) + # Construct y with a strong interaction term + # y = x0 + x1 + 5 * x0 * x1 + y = np.hstack((X, 5 * X[:, [0]] * X[:, [1]])).sum(axis=1) + + est = HistGradientBoostingRegressor(random_state=42) + est.fit(X, y) + est_no_interactions = HistGradientBoostingRegressor( + interaction_cst=[{0}, {1}], random_state=42 + ) + est_no_interactions.fit(X, y) + + delta = 0.25 + # Make sure we do not extrapolate out of the training set as tree-based estimators + # are very bad in doing so. + X_test = X[(X[:, 0] < 1 - delta) & (X[:, 1] < 1 - delta)] + X_delta_d_0 = X_test + [delta, 0] + X_delta_0_d = X_test + [0, delta] + X_delta_d_d = X_test + [delta, delta] + + # Note: For the y from above as a function of x0 and x1, we have + # y(x0+d, x1+d) = y(x0, x1) + 5 * d * (2/5 + x0 + x1) + 5 * d**2 + # y(x0+d, x1) = y(x0, x1) + 5 * d * (1/5 + x1) + # y(x0, x1+d) = y(x0, x1) + 5 * d * (1/5 + x0) + # Without interaction constraints, we would expect a result of 5 * d**2 for the + # following expression, but zero with constraints in place. + assert_allclose( + est_no_interactions.predict(X_delta_d_d) + + est_no_interactions.predict(X_test) + - est_no_interactions.predict(X_delta_d_0) + - est_no_interactions.predict(X_delta_0_d), + 0, + atol=1e-12, + ) + + # Correct result of the expressions is 5 * delta**2. But this is hard to achieve by + # a fitted tree-based model. However, with 100 iterations the expression should + # at least be positive! + assert np.all( + est.predict(X_delta_d_d) + + est.predict(X_test) + - est.predict(X_delta_d_0) + - est.predict(X_delta_0_d) + > 0.01 + ) + + +def test_no_user_warning_with_scoring(): + """Check that no UserWarning is raised when scoring is set. + + Non-regression test for #22907. 
+ """ + pd = pytest.importorskip("pandas") + X, y = make_regression(n_samples=50, random_state=0) + X_df = pd.DataFrame(X, columns=[f"col{i}" for i in range(X.shape[1])]) + + est = HistGradientBoostingRegressor( + random_state=0, scoring="neg_mean_absolute_error", early_stopping=True + ) + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + est.fit(X_df, y) + + +def test_class_weights(): + """High level test to check class_weights.""" + n_samples = 255 + n_features = 2 + + X, y = make_classification( + n_samples=n_samples, + n_features=n_features, + n_informative=n_features, + n_redundant=0, + n_clusters_per_class=1, + n_classes=2, + random_state=0, + ) + y_is_1 = y == 1 + + # class_weight is the same as sample weights with the corresponding class + clf = HistGradientBoostingClassifier( + min_samples_leaf=2, random_state=0, max_depth=2 + ) + sample_weight = np.ones(shape=(n_samples)) + sample_weight[y_is_1] = 3.0 + clf.fit(X, y, sample_weight=sample_weight) + + class_weight = {0: 1.0, 1: 3.0} + clf_class_weighted = clone(clf).set_params(class_weight=class_weight) + clf_class_weighted.fit(X, y) + + assert_allclose(clf.decision_function(X), clf_class_weighted.decision_function(X)) + + # Check that sample_weight and class_weight are multiplicative + clf.fit(X, y, sample_weight=sample_weight**2) + clf_class_weighted.fit(X, y, sample_weight=sample_weight) + assert_allclose(clf.decision_function(X), clf_class_weighted.decision_function(X)) + + # Make imbalanced dataset + X_imb = np.concatenate((X[~y_is_1], X[y_is_1][:10])) + y_imb = np.concatenate((y[~y_is_1], y[y_is_1][:10])) + + # class_weight="balanced" is the same as sample_weights to be + # inversely proportional to n_samples / (n_classes * np.bincount(y)) + clf_balanced = clone(clf).set_params(class_weight="balanced") + clf_balanced.fit(X_imb, y_imb) + + class_weight = y_imb.shape[0] / (2 * np.bincount(y_imb)) + sample_weight = class_weight[y_imb] + clf_sample_weight = clone(clf).set_params(class_weight=None) + clf_sample_weight.fit(X_imb, y_imb, sample_weight=sample_weight) + + assert_allclose( + clf_balanced.decision_function(X_imb), + clf_sample_weight.decision_function(X_imb), + ) + + +def test_unknown_category_that_are_negative(): + """Check that unknown categories that are negative does not error. + + Non-regression test for #24274. 
+ """ + rng = np.random.RandomState(42) + n_samples = 1000 + X = np.c_[rng.rand(n_samples), rng.randint(4, size=n_samples)] + y = np.zeros(shape=n_samples) + y[X[:, 1] % 2 == 0] = 1 + + hist = HistGradientBoostingRegressor( + random_state=0, + categorical_features=[False, True], + max_iter=10, + ).fit(X, y) + + # Check that negative values from the second column are treated like a + # missing category + X_test_neg = np.asarray([[1, -2], [3, -4]]) + X_test_nan = np.asarray([[1, np.nan], [3, np.nan]]) + + assert_allclose(hist.predict(X_test_neg), hist.predict(X_test_nan)) + + +@pytest.mark.parametrize( + ("GradientBoosting", "make_X_y"), + [ + (HistGradientBoostingClassifier, make_classification), + (HistGradientBoostingRegressor, make_regression), + ], +) +@pytest.mark.parametrize("sample_weight", [False, True]) +def test_X_val_in_fit(GradientBoosting, make_X_y, sample_weight, global_random_seed): + """Test that passing X_val, y_val in fit is same as validation fraction.""" + rng = np.random.RandomState(42) + n_samples = 100 + X, y = make_X_y(n_samples=n_samples, random_state=rng) + if sample_weight: + sample_weight = np.abs(rng.normal(size=n_samples)) + data = (X, y, sample_weight) + else: + sample_weight = None + data = (X, y) + rng_seed = global_random_seed + + # Fit with validation fraction and early stopping. + m1 = GradientBoosting( + early_stopping=True, + validation_fraction=0.5, + random_state=rng_seed, + ) + m1.fit(X, y, sample_weight) + + # Do train-test split ourselves. + rng = check_random_state(rng_seed) + # We do the same as in the fit method. + stratify = y if isinstance(m1, HistGradientBoostingClassifier) else None + random_seed = rng.randint(np.iinfo(np.uint32).max, dtype="u8") + X_train, X_val, y_train, y_val, *sw = train_test_split( + *data, + test_size=0.5, + stratify=stratify, + random_state=random_seed, + ) + if sample_weight is not None: + sample_weight_train = sw[0] + sample_weight_val = sw[1] + else: + sample_weight_train = None + sample_weight_val = None + m2 = GradientBoosting( + early_stopping=True, + random_state=rng_seed, + ) + m2.fit( + X_train, + y_train, + sample_weight=sample_weight_train, + X_val=X_val, + y_val=y_val, + sample_weight_val=sample_weight_val, + ) + + assert_allclose(m2.n_iter_, m1.n_iter_) + assert_allclose(m2.predict(X), m1.predict(X)) + + +def test_X_val_raises_missing_y_val(): + """Test that an error is raised if X_val given but y_val None.""" + X, y = make_classification(n_samples=4) + X, X_val = X[:2], X[2:] + y, y_val = y[:2], y[2:] + with pytest.raises( + ValueError, + match="X_val is provided, but y_val was not provided", + ): + HistGradientBoostingClassifier().fit(X, y, X_val=X_val) + with pytest.raises( + ValueError, + match="y_val is provided, but X_val was not provided", + ): + HistGradientBoostingClassifier().fit(X, y, y_val=y_val) + + +def test_X_val_raises_with_early_stopping_false(): + """Test that an error is raised if X_val given but early_stopping is False.""" + X, y = make_regression(n_samples=4) + X, X_val = X[:2], X[2:] + y, y_val = y[:2], y[2:] + with pytest.raises( + ValueError, + match="X_val and y_val are passed to fit while at the same time", + ): + HistGradientBoostingRegressor(early_stopping=False).fit( + X, y, X_val=X_val, y_val=y_val + ) + + +@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"]) +@pytest.mark.parametrize( + "HistGradientBoosting", + [HistGradientBoostingClassifier, HistGradientBoostingRegressor], +) +def test_dataframe_categorical_results_same_as_ndarray( + dataframe_lib, 
HistGradientBoosting +): + """Check that pandas categorical give the same results as ndarray.""" + pytest.importorskip(dataframe_lib) + + rng = np.random.RandomState(42) + n_samples = 5_000 + n_cardinality = 50 + max_bins = 100 + f_num = rng.rand(n_samples) + f_cat = rng.randint(n_cardinality, size=n_samples) + + # Make f_cat an informative feature + y = (f_cat % 3 == 0) & (f_num > 0.2) + + X = np.c_[f_num, f_cat] + f_cat = [f"cat{c:0>3}" for c in f_cat] + X_df = _convert_container( + np.asarray([f_num, f_cat]).T, + dataframe_lib, + ["f_num", "f_cat"], + categorical_feature_names=["f_cat"], + ) + + X_train, X_test, X_train_df, X_test_df, y_train, y_test = train_test_split( + X, X_df, y, random_state=0 + ) + + hist_kwargs = dict(max_iter=10, max_bins=max_bins, random_state=0) + hist_np = HistGradientBoosting(categorical_features=[False, True], **hist_kwargs) + hist_np.fit(X_train, y_train) + + hist_pd = HistGradientBoosting(categorical_features="from_dtype", **hist_kwargs) + hist_pd.fit(X_train_df, y_train) + + # Check categories are correct and sorted + categories = hist_pd._preprocessor.named_transformers_["encoder"].categories_[0] + assert_array_equal(categories, np.unique(f_cat)) + + assert len(hist_np._predictors) == len(hist_pd._predictors) + for predictor_1, predictor_2 in zip(hist_np._predictors, hist_pd._predictors): + assert len(predictor_1[0].nodes) == len(predictor_2[0].nodes) + + score_np = hist_np.score(X_test, y_test) + score_pd = hist_pd.score(X_test_df, y_test) + assert score_np == pytest.approx(score_pd) + assert_allclose(hist_np.predict(X_test), hist_pd.predict(X_test_df)) + + +@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"]) +@pytest.mark.parametrize( + "HistGradientBoosting", + [HistGradientBoostingClassifier, HistGradientBoostingRegressor], +) +def test_dataframe_categorical_errors(dataframe_lib, HistGradientBoosting): + """Check error cases for pandas categorical feature.""" + pytest.importorskip(dataframe_lib) + msg = "Categorical feature 'f_cat' is expected to have a cardinality <= 16" + hist = HistGradientBoosting(categorical_features="from_dtype", max_bins=16) + + rng = np.random.RandomState(42) + f_cat = rng.randint(0, high=100, size=100).astype(str) + X_df = _convert_container( + f_cat[:, None], dataframe_lib, ["f_cat"], categorical_feature_names=["f_cat"] + ) + y = rng.randint(0, high=2, size=100) + + with pytest.raises(ValueError, match=msg): + hist.fit(X_df, y) + + +@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"]) +def test_categorical_different_order_same_model(dataframe_lib): + """Check that the order of the categorical gives same model.""" + pytest.importorskip(dataframe_lib) + rng = np.random.RandomState(42) + n_samples = 1_000 + f_ints = rng.randint(low=0, high=2, size=n_samples) + + # Construct a target with some noise + y = f_ints.copy() + flipped = rng.choice([True, False], size=n_samples, p=[0.1, 0.9]) + y[flipped] = 1 - y[flipped] + + # Construct categorical where 0 -> A and 1 -> B and 1 -> A and 0 -> B + f_cat_a_b = np.asarray(["A", "B"])[f_ints] + f_cat_b_a = np.asarray(["B", "A"])[f_ints] + df_a_b = _convert_container( + f_cat_a_b[:, None], + dataframe_lib, + ["f_cat"], + categorical_feature_names=["f_cat"], + ) + df_b_a = _convert_container( + f_cat_b_a[:, None], + dataframe_lib, + ["f_cat"], + categorical_feature_names=["f_cat"], + ) + + hist_a_b = HistGradientBoostingClassifier( + categorical_features="from_dtype", random_state=0 + ) + hist_b_a = HistGradientBoostingClassifier( + categorical_features="from_dtype", 
random_state=0 + ) + + hist_a_b.fit(df_a_b, y) + hist_b_a.fit(df_b_a, y) + + assert len(hist_a_b._predictors) == len(hist_b_a._predictors) + for predictor_1, predictor_2 in zip(hist_a_b._predictors, hist_b_a._predictors): + assert len(predictor_1[0].nodes) == len(predictor_2[0].nodes) + + +def get_different_bitness_node_ndarray(node_ndarray): + new_dtype_for_indexing_fields = np.int64 if _IS_32BIT else np.int32 + + # field names in Node struct with np.intp types (see + # sklearn/ensemble/_hist_gradient_boosting/common.pyx) + indexing_field_names = ["feature_idx"] + + new_dtype_dict = { + name: dtype for name, (dtype, _) in node_ndarray.dtype.fields.items() + } + for name in indexing_field_names: + new_dtype_dict[name] = new_dtype_for_indexing_fields + + new_dtype = np.dtype( + {"names": list(new_dtype_dict.keys()), "formats": list(new_dtype_dict.values())} + ) + return node_ndarray.astype(new_dtype, casting="same_kind") + + +def reduce_predictor_with_different_bitness(predictor): + cls, args, state = predictor.__reduce__() + + new_state = state.copy() + new_state["nodes"] = get_different_bitness_node_ndarray(new_state["nodes"]) + + return (cls, args, new_state) + + +def test_different_bitness_pickle(): + X, y = make_classification(random_state=0) + + clf = HistGradientBoostingClassifier(random_state=0, max_depth=3) + clf.fit(X, y) + score = clf.score(X, y) + + def pickle_dump_with_different_bitness(): + f = io.BytesIO() + p = pickle.Pickler(f) + p.dispatch_table = copyreg.dispatch_table.copy() + p.dispatch_table[TreePredictor] = reduce_predictor_with_different_bitness + + p.dump(clf) + f.seek(0) + return f + + # Simulate loading a pickle of the same model trained on a platform with different + # bitness that than the platform it will be used to make predictions on: + new_clf = pickle.load(pickle_dump_with_different_bitness()) + new_score = new_clf.score(X, y) + assert score == pytest.approx(new_score) + + +def test_different_bitness_joblib_pickle(): + # Make sure that a platform specific pickle generated on a 64 bit + # platform can be converted at pickle load time into an estimator + # with Cython code that works with the host's native integer precision + # to index nodes in the tree data structure when the host is a 32 bit + # platform (and vice versa). + # + # This is in particular useful to be able to train a model on a 64 bit Linux + # server and deploy the model as part of a (32 bit) WASM in-browser + # application using pyodide. 
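+    # (The helpers above rewrite the dtype of the 'feature_idx' field of the
+    # predictor's nodes array to the other bitness before dumping, emulating a
+    # pickle produced on a platform with a different native intp size.)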
+ X, y = make_classification(random_state=0) + + clf = HistGradientBoostingClassifier(random_state=0, max_depth=3) + clf.fit(X, y) + score = clf.score(X, y) + + def joblib_dump_with_different_bitness(): + f = io.BytesIO() + p = NumpyPickler(f) + p.dispatch_table = copyreg.dispatch_table.copy() + p.dispatch_table[TreePredictor] = reduce_predictor_with_different_bitness + + p.dump(clf) + f.seek(0) + return f + + new_clf = joblib.load(joblib_dump_with_different_bitness()) + new_score = new_clf.score(X, y) + assert score == pytest.approx(new_score) + + +def test_pandas_nullable_dtype(): + # Non regression test for https://github.com/scikit-learn/scikit-learn/issues/28317 + pd = pytest.importorskip("pandas") + + rng = np.random.default_rng(0) + X = pd.DataFrame({"a": rng.integers(10, size=100)}).astype(pd.Int64Dtype()) + y = rng.integers(2, size=100) + + clf = HistGradientBoostingClassifier() + clf.fit(X, y) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py new file mode 100644 index 0000000000000000000000000000000000000000..a55cb871e3c72ea04325b0b72f7aabc419285921 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py @@ -0,0 +1,650 @@ +import numpy as np +import pytest +from numpy.testing import assert_allclose, assert_array_equal +from pytest import approx + +from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper +from sklearn.ensemble._hist_gradient_boosting.common import ( + G_H_DTYPE, + X_BINNED_DTYPE, + X_BITSET_INNER_DTYPE, + X_DTYPE, + Y_DTYPE, +) +from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower +from sklearn.preprocessing import OneHotEncoder +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads + +n_threads = _openmp_effective_n_threads() + + +def _make_training_data(n_bins=256, constant_hessian=True): + rng = np.random.RandomState(42) + n_samples = 10000 + + # Generate some test data directly binned so as to test the grower code + # independently of the binning logic. + X_binned = rng.randint(0, n_bins - 1, size=(n_samples, 2), dtype=X_BINNED_DTYPE) + X_binned = np.asfortranarray(X_binned) + + def true_decision_function(input_features): + """Ground truth decision function + + This is a very simple yet asymmetric decision tree. Therefore the + grower code should have no trouble recovering the decision function + from 10000 training samples. 
+ """ + if input_features[0] <= n_bins // 2: + return -1 + else: + return -1 if input_features[1] <= n_bins // 3 else 1 + + target = np.array([true_decision_function(x) for x in X_binned], dtype=Y_DTYPE) + + # Assume a square loss applied to an initial model that always predicts 0 + # (hardcoded for this test): + all_gradients = target.astype(G_H_DTYPE) + shape_hessians = 1 if constant_hessian else all_gradients.shape + all_hessians = np.ones(shape=shape_hessians, dtype=G_H_DTYPE) + + return X_binned, all_gradients, all_hessians + + +def _check_children_consistency(parent, left, right): + # Make sure the samples are correctly dispatched from a parent to its + # children + assert parent.left_child is left + assert parent.right_child is right + + # each sample from the parent is propagated to one of the two children + assert len(left.sample_indices) + len(right.sample_indices) == len( + parent.sample_indices + ) + + assert set(left.sample_indices).union(set(right.sample_indices)) == set( + parent.sample_indices + ) + + # samples are sent either to the left or the right node, never to both + assert set(left.sample_indices).intersection(set(right.sample_indices)) == set() + + +@pytest.mark.parametrize( + "n_bins, constant_hessian, stopping_param, shrinkage", + [ + (11, True, "min_gain_to_split", 0.5), + (11, False, "min_gain_to_split", 1.0), + (11, True, "max_leaf_nodes", 1.0), + (11, False, "max_leaf_nodes", 0.1), + (42, True, "max_leaf_nodes", 0.01), + (42, False, "max_leaf_nodes", 1.0), + (256, True, "min_gain_to_split", 1.0), + (256, True, "max_leaf_nodes", 0.1), + ], +) +def test_grow_tree(n_bins, constant_hessian, stopping_param, shrinkage): + X_binned, all_gradients, all_hessians = _make_training_data( + n_bins=n_bins, constant_hessian=constant_hessian + ) + n_samples = X_binned.shape[0] + + if stopping_param == "max_leaf_nodes": + stopping_param = {"max_leaf_nodes": 3} + else: + stopping_param = {"min_gain_to_split": 0.01} + + grower = TreeGrower( + X_binned, + all_gradients, + all_hessians, + n_bins=n_bins, + shrinkage=shrinkage, + min_samples_leaf=1, + **stopping_param, + ) + + # The root node is not yet split, but the best possible split has + # already been evaluated: + assert grower.root.left_child is None + assert grower.root.right_child is None + + root_split = grower.root.split_info + assert root_split.feature_idx == 0 + assert root_split.bin_idx == n_bins // 2 + assert len(grower.splittable_nodes) == 1 + + # Calling split next applies the next split and computes the best split + # for each of the two newly introduced children nodes. + left_node, right_node = grower.split_next() + + # All training samples have ben split in the two nodes, approximately + # 50%/50% + _check_children_consistency(grower.root, left_node, right_node) + assert len(left_node.sample_indices) > 0.4 * n_samples + assert len(left_node.sample_indices) < 0.6 * n_samples + + if grower.min_gain_to_split > 0: + # The left node is too pure: there is no gain to split it further. + assert left_node.split_info.gain < grower.min_gain_to_split + assert left_node in grower.finalized_leaves + + # The right node can still be split further, this time on feature #1 + split_info = right_node.split_info + assert split_info.gain > 1.0 + assert split_info.feature_idx == 1 + assert split_info.bin_idx == n_bins // 3 + assert right_node.left_child is None + assert right_node.right_child is None + + # The right split has not been applied yet. 
Let's do it now: + assert len(grower.splittable_nodes) == 1 + right_left_node, right_right_node = grower.split_next() + _check_children_consistency(right_node, right_left_node, right_right_node) + assert len(right_left_node.sample_indices) > 0.1 * n_samples + assert len(right_left_node.sample_indices) < 0.2 * n_samples + + assert len(right_right_node.sample_indices) > 0.2 * n_samples + assert len(right_right_node.sample_indices) < 0.4 * n_samples + + # All the leafs are pure, it is not possible to split any further: + assert not grower.splittable_nodes + + grower._apply_shrinkage() + + # Check the values of the leaves: + assert grower.root.left_child.value == approx(shrinkage) + assert grower.root.right_child.left_child.value == approx(shrinkage) + assert grower.root.right_child.right_child.value == approx(-shrinkage, rel=1e-3) + + +def test_predictor_from_grower(): + # Build a tree on the toy 3-leaf dataset to extract the predictor. + n_bins = 256 + X_binned, all_gradients, all_hessians = _make_training_data(n_bins=n_bins) + grower = TreeGrower( + X_binned, + all_gradients, + all_hessians, + n_bins=n_bins, + shrinkage=1.0, + max_leaf_nodes=3, + min_samples_leaf=5, + ) + grower.grow() + assert grower.n_nodes == 5 # (2 decision nodes + 3 leaves) + + # Check that the node structure can be converted into a predictor + # object to perform predictions at scale + # We pass undefined binning_thresholds because we won't use predict anyway + predictor = grower.make_predictor( + binning_thresholds=np.zeros((X_binned.shape[1], n_bins)) + ) + assert predictor.nodes.shape[0] == 5 + assert predictor.nodes["is_leaf"].sum() == 3 + + # Probe some predictions for each leaf of the tree + # each group of 3 samples corresponds to a condition in _make_training_data + input_data = np.array( + [ + [0, 0], + [42, 99], + [128, 254], + [129, 0], + [129, 85], + [254, 85], + [129, 86], + [129, 254], + [242, 100], + ], + dtype=np.uint8, + ) + missing_values_bin_idx = n_bins - 1 + predictions = predictor.predict_binned( + input_data, missing_values_bin_idx, n_threads + ) + expected_targets = [1, 1, 1, 1, 1, 1, -1, -1, -1] + assert np.allclose(predictions, expected_targets) + + # Check that training set can be recovered exactly: + predictions = predictor.predict_binned(X_binned, missing_values_bin_idx, n_threads) + assert np.allclose(predictions, -all_gradients) + + +@pytest.mark.parametrize( + "n_samples, min_samples_leaf, n_bins, constant_hessian, noise", + [ + (11, 10, 7, True, 0), + (13, 10, 42, False, 0), + (56, 10, 255, True, 0.1), + (101, 3, 7, True, 0), + (200, 42, 42, False, 0), + (300, 55, 255, True, 0.1), + (300, 301, 255, True, 0.1), + ], +) +def test_min_samples_leaf(n_samples, min_samples_leaf, n_bins, constant_hessian, noise): + rng = np.random.RandomState(seed=0) + # data = linear target, 3 features, 1 irrelevant. 
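+    # (y depends only on the first two features; the third one is the
+    # irrelevant feature.)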
+ X = rng.normal(size=(n_samples, 3)) + y = X[:, 0] - X[:, 1] + if noise: + y_scale = y.std() + y += rng.normal(scale=noise, size=n_samples) * y_scale + mapper = _BinMapper(n_bins=n_bins) + X = mapper.fit_transform(X) + + all_gradients = y.astype(G_H_DTYPE) + shape_hessian = 1 if constant_hessian else all_gradients.shape + all_hessians = np.ones(shape=shape_hessian, dtype=G_H_DTYPE) + grower = TreeGrower( + X, + all_gradients, + all_hessians, + n_bins=n_bins, + shrinkage=1.0, + min_samples_leaf=min_samples_leaf, + max_leaf_nodes=n_samples, + ) + grower.grow() + predictor = grower.make_predictor(binning_thresholds=mapper.bin_thresholds_) + + if n_samples >= min_samples_leaf: + for node in predictor.nodes: + if node["is_leaf"]: + assert node["count"] >= min_samples_leaf + else: + assert predictor.nodes.shape[0] == 1 + assert predictor.nodes[0]["is_leaf"] + assert predictor.nodes[0]["count"] == n_samples + + +@pytest.mark.parametrize("n_samples, min_samples_leaf", [(99, 50), (100, 50)]) +def test_min_samples_leaf_root(n_samples, min_samples_leaf): + # Make sure root node isn't split if n_samples is not at least twice + # min_samples_leaf + rng = np.random.RandomState(seed=0) + + n_bins = 256 + + # data = linear target, 3 features, 1 irrelevant. + X = rng.normal(size=(n_samples, 3)) + y = X[:, 0] - X[:, 1] + mapper = _BinMapper(n_bins=n_bins) + X = mapper.fit_transform(X) + + all_gradients = y.astype(G_H_DTYPE) + all_hessians = np.ones(shape=1, dtype=G_H_DTYPE) + grower = TreeGrower( + X, + all_gradients, + all_hessians, + n_bins=n_bins, + shrinkage=1.0, + min_samples_leaf=min_samples_leaf, + max_leaf_nodes=n_samples, + ) + grower.grow() + if n_samples >= min_samples_leaf * 2: + assert len(grower.finalized_leaves) >= 2 + else: + assert len(grower.finalized_leaves) == 1 + + +def assert_is_stump(grower): + # To assert that stumps are created when max_depth=1 + for leaf in (grower.root.left_child, grower.root.right_child): + assert leaf.left_child is None + assert leaf.right_child is None + + +@pytest.mark.parametrize("max_depth", [1, 2, 3]) +def test_max_depth(max_depth): + # Make sure max_depth parameter works as expected + rng = np.random.RandomState(seed=0) + + n_bins = 256 + n_samples = 1000 + + # data = linear target, 3 features, 1 irrelevant. 
+ X = rng.normal(size=(n_samples, 3)) + y = X[:, 0] - X[:, 1] + mapper = _BinMapper(n_bins=n_bins) + X = mapper.fit_transform(X) + + all_gradients = y.astype(G_H_DTYPE) + all_hessians = np.ones(shape=1, dtype=G_H_DTYPE) + grower = TreeGrower(X, all_gradients, all_hessians, max_depth=max_depth) + grower.grow() + + depth = max(leaf.depth for leaf in grower.finalized_leaves) + assert depth == max_depth + + if max_depth == 1: + assert_is_stump(grower) + + +def test_input_validation(): + X_binned, all_gradients, all_hessians = _make_training_data() + + X_binned_float = X_binned.astype(np.float32) + with pytest.raises(NotImplementedError, match="X_binned must be of type uint8"): + TreeGrower(X_binned_float, all_gradients, all_hessians) + + X_binned_C_array = np.ascontiguousarray(X_binned) + with pytest.raises( + ValueError, match="X_binned should be passed as Fortran contiguous array" + ): + TreeGrower(X_binned_C_array, all_gradients, all_hessians) + + +def test_init_parameters_validation(): + X_binned, all_gradients, all_hessians = _make_training_data() + with pytest.raises(ValueError, match="min_gain_to_split=-1 must be positive"): + TreeGrower(X_binned, all_gradients, all_hessians, min_gain_to_split=-1) + + with pytest.raises(ValueError, match="min_hessian_to_split=-1 must be positive"): + TreeGrower(X_binned, all_gradients, all_hessians, min_hessian_to_split=-1) + + +def test_missing_value_predict_only(): + # Make sure that missing values are supported at predict time even if they + # were not encountered in the training data: the missing values are + # assigned to whichever child has the most samples. + + rng = np.random.RandomState(0) + n_samples = 100 + X_binned = rng.randint(0, 256, size=(n_samples, 1), dtype=np.uint8) + X_binned = np.asfortranarray(X_binned) + + gradients = rng.normal(size=n_samples).astype(G_H_DTYPE) + hessians = np.ones(shape=1, dtype=G_H_DTYPE) + + grower = TreeGrower( + X_binned, gradients, hessians, min_samples_leaf=5, has_missing_values=False + ) + grower.grow() + + # We pass undefined binning_thresholds because we won't use predict anyway + predictor = grower.make_predictor( + binning_thresholds=np.zeros((X_binned.shape[1], X_binned.max() + 1)) + ) + + # go from root to a leaf, always following node with the most samples. + # That's the path nans are supposed to take + node = predictor.nodes[0] + while not node["is_leaf"]: + left = predictor.nodes[node["left"]] + right = predictor.nodes[node["right"]] + node = left if left["count"] > right["count"] else right + + prediction_main_path = node["value"] + + # now build X_test with only nans, and make sure all predictions are equal + # to prediction_main_path + all_nans = np.full(shape=(n_samples, 1), fill_value=np.nan) + known_cat_bitsets = np.zeros((0, 8), dtype=X_BITSET_INNER_DTYPE) + f_idx_map = np.zeros(0, dtype=np.uint32) + + y_pred = predictor.predict(all_nans, known_cat_bitsets, f_idx_map, n_threads) + assert np.all(y_pred == prediction_main_path) + + +def test_split_on_nan_with_infinite_values(): + # Make sure the split on nan situations are respected even when there are + # samples with +inf values (we set the threshold to +inf when we have a + # split on nan so this test makes sure this does not introduce edge-case + # bugs). We need to use the private API so that we can also test + # predict_binned(). 
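+    # ("Split on nan" means the best split separates the non-missing values
+    # from the missing ones; it is encoded with a numerical threshold of +inf,
+    # which is checked on the root node below.)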
+ + X = np.array([0, 1, np.inf, np.nan, np.nan]).reshape(-1, 1) + # the gradient values will force a split on nan situation + gradients = np.array([0, 0, 0, 100, 100], dtype=G_H_DTYPE) + hessians = np.ones(shape=1, dtype=G_H_DTYPE) + + bin_mapper = _BinMapper() + X_binned = bin_mapper.fit_transform(X) + + n_bins_non_missing = 3 + has_missing_values = True + grower = TreeGrower( + X_binned, + gradients, + hessians, + n_bins_non_missing=n_bins_non_missing, + has_missing_values=has_missing_values, + min_samples_leaf=1, + n_threads=n_threads, + ) + + grower.grow() + + predictor = grower.make_predictor(binning_thresholds=bin_mapper.bin_thresholds_) + + # sanity check: this was a split on nan + assert predictor.nodes[0]["num_threshold"] == np.inf + assert predictor.nodes[0]["bin_threshold"] == n_bins_non_missing - 1 + + known_cat_bitsets, f_idx_map = bin_mapper.make_known_categories_bitsets() + + # Make sure in particular that the +inf sample is mapped to the left child + # Note that lightgbm "fails" here and will assign the inf sample to the + # right child, even though it's a "split on nan" situation. + predictions = predictor.predict(X, known_cat_bitsets, f_idx_map, n_threads) + predictions_binned = predictor.predict_binned( + X_binned, + missing_values_bin_idx=bin_mapper.missing_values_bin_idx_, + n_threads=n_threads, + ) + np.testing.assert_allclose(predictions, -gradients) + np.testing.assert_allclose(predictions_binned, -gradients) + + +def test_grow_tree_categories(): + # Check that the grower produces the right predictor tree when a split is + # categorical + X_binned = np.array([[0, 1] * 11 + [1]], dtype=X_BINNED_DTYPE).T + X_binned = np.asfortranarray(X_binned) + + all_gradients = np.array([10, 1] * 11 + [1], dtype=G_H_DTYPE) + all_hessians = np.ones(1, dtype=G_H_DTYPE) + is_categorical = np.ones(1, dtype=np.uint8) + + grower = TreeGrower( + X_binned, + all_gradients, + all_hessians, + n_bins=4, + shrinkage=1.0, + min_samples_leaf=1, + is_categorical=is_categorical, + n_threads=n_threads, + ) + grower.grow() + assert grower.n_nodes == 3 + + categories = [np.array([4, 9], dtype=X_DTYPE)] + predictor = grower.make_predictor(binning_thresholds=categories) + root = predictor.nodes[0] + assert root["count"] == 23 + assert root["depth"] == 0 + assert root["is_categorical"] + + left, right = predictor.nodes[root["left"]], predictor.nodes[root["right"]] + + # arbitrary validation, but this means ones go to the left. + assert left["count"] >= right["count"] + + # check binned category value (1) + expected_binned_cat_bitset = [2**1] + [0] * 7 + binned_cat_bitset = predictor.binned_left_cat_bitsets + assert_array_equal(binned_cat_bitset[0], expected_binned_cat_bitset) + + # check raw category value (9) + expected_raw_cat_bitsets = [2**9] + [0] * 7 + raw_cat_bitsets = predictor.raw_left_cat_bitsets + assert_array_equal(raw_cat_bitsets[0], expected_raw_cat_bitsets) + + # Note that since there was no missing values during training, the missing + # values aren't part of the bitsets. However, we expect the missing values + # to go to the biggest child (i.e. the left one). + # The left child has a value of -1 = negative gradient. 
+ assert root["missing_go_to_left"] + + # make sure binned missing values are mapped to the left child during + # prediction + prediction_binned = predictor.predict_binned( + np.asarray([[6]]).astype(X_BINNED_DTYPE), + missing_values_bin_idx=6, + n_threads=n_threads, + ) + assert_allclose(prediction_binned, [-1]) # negative gradient + + # make sure raw missing values are mapped to the left child during + # prediction + known_cat_bitsets = np.zeros((1, 8), dtype=np.uint32) # ignored anyway + f_idx_map = np.array([0], dtype=np.uint32) + prediction = predictor.predict( + np.array([[np.nan]]), known_cat_bitsets, f_idx_map, n_threads + ) + assert_allclose(prediction, [-1]) + + +@pytest.mark.parametrize("min_samples_leaf", (1, 20)) +@pytest.mark.parametrize("n_unique_categories", (2, 10, 100)) +@pytest.mark.parametrize("target", ("binary", "random", "equal")) +def test_ohe_equivalence(min_samples_leaf, n_unique_categories, target): + # Make sure that native categorical splits are equivalent to using a OHE, + # when given enough depth + + rng = np.random.RandomState(0) + n_samples = 10_000 + X_binned = rng.randint(0, n_unique_categories, size=(n_samples, 1), dtype=np.uint8) + + X_ohe = OneHotEncoder(sparse_output=False).fit_transform(X_binned) + X_ohe = np.asfortranarray(X_ohe).astype(np.uint8) + + if target == "equal": + gradients = X_binned.reshape(-1) + elif target == "binary": + gradients = (X_binned % 2).reshape(-1) + else: + gradients = rng.randn(n_samples) + gradients = gradients.astype(G_H_DTYPE) + + hessians = np.ones(shape=1, dtype=G_H_DTYPE) + + grower_params = { + "min_samples_leaf": min_samples_leaf, + "max_depth": None, + "max_leaf_nodes": None, + } + + grower = TreeGrower( + X_binned, gradients, hessians, is_categorical=[True], **grower_params + ) + grower.grow() + # we pass undefined bin_thresholds because we won't use predict() + predictor = grower.make_predictor( + binning_thresholds=np.zeros((1, n_unique_categories)) + ) + preds = predictor.predict_binned( + X_binned, missing_values_bin_idx=255, n_threads=n_threads + ) + + grower_ohe = TreeGrower(X_ohe, gradients, hessians, **grower_params) + grower_ohe.grow() + predictor_ohe = grower_ohe.make_predictor( + binning_thresholds=np.zeros((X_ohe.shape[1], n_unique_categories)) + ) + preds_ohe = predictor_ohe.predict_binned( + X_ohe, missing_values_bin_idx=255, n_threads=n_threads + ) + + assert predictor.get_max_depth() <= predictor_ohe.get_max_depth() + if target == "binary" and n_unique_categories > 2: + # OHE needs more splits to achieve the same predictions + assert predictor.get_max_depth() < predictor_ohe.get_max_depth() + + np.testing.assert_allclose(preds, preds_ohe) + + +def test_grower_interaction_constraints(): + """Check that grower respects interaction constraints.""" + n_features = 6 + interaction_cst = [{0, 1}, {1, 2}, {3, 4, 5}] + n_samples = 10 + n_bins = 6 + root_feature_splits = [] + + def get_all_children(node): + res = [] + if node.is_leaf: + return res + for n in [node.left_child, node.right_child]: + res.append(n) + res.extend(get_all_children(n)) + return res + + for seed in range(20): + rng = np.random.RandomState(seed) + + X_binned = rng.randint( + 0, n_bins - 1, size=(n_samples, n_features), dtype=X_BINNED_DTYPE + ) + X_binned = np.asfortranarray(X_binned) + gradients = rng.normal(size=n_samples).astype(G_H_DTYPE) + hessians = np.ones(shape=1, dtype=G_H_DTYPE) + + grower = TreeGrower( + X_binned, + gradients, + hessians, + n_bins=n_bins, + min_samples_leaf=1, + interaction_cst=interaction_cst, + 
n_threads=n_threads, + ) + grower.grow() + + root_feature_idx = grower.root.split_info.feature_idx + root_feature_splits.append(root_feature_idx) + + feature_idx_to_constraint_set = { + 0: {0, 1}, + 1: {0, 1, 2}, + 2: {1, 2}, + 3: {3, 4, 5}, + 4: {3, 4, 5}, + 5: {3, 4, 5}, + } + + root_constraint_set = feature_idx_to_constraint_set[root_feature_idx] + for node in (grower.root.left_child, grower.root.right_child): + # Root's children's allowed_features must be the root's constraints set. + assert_array_equal(node.allowed_features, list(root_constraint_set)) + for node in get_all_children(grower.root): + if node.is_leaf: + continue + # Ensure that each node uses a subset of features of its parent node. + parent_interaction_cst_indices = set(node.interaction_cst_indices) + right_interactions_cst_indices = set( + node.right_child.interaction_cst_indices + ) + left_interactions_cst_indices = set(node.left_child.interaction_cst_indices) + + assert right_interactions_cst_indices.issubset( + parent_interaction_cst_indices + ) + assert left_interactions_cst_indices.issubset( + parent_interaction_cst_indices + ) + # The features used for split must have been present in the root's + # constraint set. + assert node.split_info.feature_idx in root_constraint_set + + # Make sure that every feature is used at least once as split for the root node. + assert ( + len(set(root_feature_splits)) + == len(set().union(*interaction_cst)) + == n_features + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_histogram.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_histogram.py new file mode 100644 index 0000000000000000000000000000000000000000..22375c7d4ea2c378bf7a45ad619f92c187d40984 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_histogram.py @@ -0,0 +1,239 @@ +import numpy as np +import pytest +from numpy.testing import assert_allclose, assert_array_equal + +from sklearn.ensemble._hist_gradient_boosting.common import ( + G_H_DTYPE, + HISTOGRAM_DTYPE, + X_BINNED_DTYPE, +) +from sklearn.ensemble._hist_gradient_boosting.histogram import ( + _build_histogram, + _build_histogram_naive, + _build_histogram_no_hessian, + _build_histogram_root, + _build_histogram_root_no_hessian, + _subtract_histograms, +) + + +@pytest.mark.parametrize("build_func", [_build_histogram_naive, _build_histogram]) +def test_build_histogram(build_func): + binned_feature = np.array([0, 2, 0, 1, 2, 0, 2, 1], dtype=X_BINNED_DTYPE) + + # Small sample_indices (below unrolling threshold) + ordered_gradients = np.array([0, 1, 3], dtype=G_H_DTYPE) + ordered_hessians = np.array([1, 1, 2], dtype=G_H_DTYPE) + + sample_indices = np.array([0, 2, 3], dtype=np.uint32) + hist = np.zeros((1, 3), dtype=HISTOGRAM_DTYPE) + build_func( + 0, sample_indices, binned_feature, ordered_gradients, ordered_hessians, hist + ) + hist = hist[0] + assert_array_equal(hist["count"], [2, 1, 0]) + assert_allclose(hist["sum_gradients"], [1, 3, 0]) + assert_allclose(hist["sum_hessians"], [2, 2, 0]) + + # Larger sample_indices (above unrolling threshold) + sample_indices = np.array([0, 2, 3, 6, 7], dtype=np.uint32) + ordered_gradients = np.array([0, 1, 3, 0, 1], dtype=G_H_DTYPE) + ordered_hessians = np.array([1, 1, 2, 1, 0], dtype=G_H_DTYPE) + + hist = np.zeros((1, 3), dtype=HISTOGRAM_DTYPE) + build_func( + 0, sample_indices, binned_feature, ordered_gradients, ordered_hessians, hist + ) + hist = hist[0] + 
assert_array_equal(hist["count"], [2, 2, 1]) + assert_allclose(hist["sum_gradients"], [1, 4, 0]) + assert_allclose(hist["sum_hessians"], [2, 2, 1]) + + +def test_histogram_sample_order_independence(): + # Make sure the order of the samples has no impact on the histogram + # computations + rng = np.random.RandomState(42) + n_sub_samples = 100 + n_samples = 1000 + n_bins = 256 + + binned_feature = rng.randint(0, n_bins - 1, size=n_samples, dtype=X_BINNED_DTYPE) + sample_indices = rng.choice( + np.arange(n_samples, dtype=np.uint32), n_sub_samples, replace=False + ) + ordered_gradients = rng.randn(n_sub_samples).astype(G_H_DTYPE) + hist_gc = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) + _build_histogram_no_hessian( + 0, sample_indices, binned_feature, ordered_gradients, hist_gc + ) + + ordered_hessians = rng.exponential(size=n_sub_samples).astype(G_H_DTYPE) + hist_ghc = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) + _build_histogram( + 0, sample_indices, binned_feature, ordered_gradients, ordered_hessians, hist_ghc + ) + + permutation = rng.permutation(n_sub_samples) + hist_gc_perm = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) + _build_histogram_no_hessian( + 0, + sample_indices[permutation], + binned_feature, + ordered_gradients[permutation], + hist_gc_perm, + ) + + hist_ghc_perm = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) + _build_histogram( + 0, + sample_indices[permutation], + binned_feature, + ordered_gradients[permutation], + ordered_hessians[permutation], + hist_ghc_perm, + ) + + hist_gc = hist_gc[0] + hist_ghc = hist_ghc[0] + hist_gc_perm = hist_gc_perm[0] + hist_ghc_perm = hist_ghc_perm[0] + + assert_allclose(hist_gc["sum_gradients"], hist_gc_perm["sum_gradients"]) + assert_array_equal(hist_gc["count"], hist_gc_perm["count"]) + + assert_allclose(hist_ghc["sum_gradients"], hist_ghc_perm["sum_gradients"]) + assert_allclose(hist_ghc["sum_hessians"], hist_ghc_perm["sum_hessians"]) + assert_array_equal(hist_ghc["count"], hist_ghc_perm["count"]) + + +@pytest.mark.parametrize("constant_hessian", [True, False]) +def test_unrolled_equivalent_to_naive(constant_hessian): + # Make sure the different unrolled histogram computations give the same + # results as the naive one. 
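+    # Four variants are compared against _build_histogram_naive: the root
+    # builders (which iterate over all samples instead of sample_indices) and
+    # the regular builders, each with and without per-sample hessians.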
+ rng = np.random.RandomState(42) + n_samples = 10 + n_bins = 5 + sample_indices = np.arange(n_samples).astype(np.uint32) + binned_feature = rng.randint(0, n_bins - 1, size=n_samples, dtype=np.uint8) + ordered_gradients = rng.randn(n_samples).astype(G_H_DTYPE) + if constant_hessian: + ordered_hessians = np.ones(n_samples, dtype=G_H_DTYPE) + else: + ordered_hessians = rng.lognormal(size=n_samples).astype(G_H_DTYPE) + + hist_gc_root = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) + hist_ghc_root = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) + hist_gc = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) + hist_ghc = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) + hist_naive = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) + + _build_histogram_root_no_hessian(0, binned_feature, ordered_gradients, hist_gc_root) + _build_histogram_root( + 0, binned_feature, ordered_gradients, ordered_hessians, hist_ghc_root + ) + _build_histogram_no_hessian( + 0, sample_indices, binned_feature, ordered_gradients, hist_gc + ) + _build_histogram( + 0, sample_indices, binned_feature, ordered_gradients, ordered_hessians, hist_ghc + ) + _build_histogram_naive( + 0, + sample_indices, + binned_feature, + ordered_gradients, + ordered_hessians, + hist_naive, + ) + + hist_naive = hist_naive[0] + hist_gc_root = hist_gc_root[0] + hist_ghc_root = hist_ghc_root[0] + hist_gc = hist_gc[0] + hist_ghc = hist_ghc[0] + for hist in (hist_gc_root, hist_ghc_root, hist_gc, hist_ghc): + assert_array_equal(hist["count"], hist_naive["count"]) + assert_allclose(hist["sum_gradients"], hist_naive["sum_gradients"]) + for hist in (hist_ghc_root, hist_ghc): + assert_allclose(hist["sum_hessians"], hist_naive["sum_hessians"]) + for hist in (hist_gc_root, hist_gc): + assert_array_equal(hist["sum_hessians"], np.zeros(n_bins)) + + +@pytest.mark.parametrize("constant_hessian", [True, False]) +def test_hist_subtraction(constant_hessian): + # Make sure the histogram subtraction trick gives the same result as the + # classical method. 
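+    # The trick: each child histogram can be obtained as parent minus sibling,
+    # bin by bin, for "count", "sum_gradients" and "sum_hessians", instead of
+    # rebuilding it from the child's samples.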
+ rng = np.random.RandomState(42) + n_samples = 10 + n_bins = 5 + sample_indices = np.arange(n_samples).astype(np.uint32) + binned_feature = rng.randint(0, n_bins - 1, size=n_samples, dtype=np.uint8) + ordered_gradients = rng.randn(n_samples).astype(G_H_DTYPE) + if constant_hessian: + ordered_hessians = np.ones(n_samples, dtype=G_H_DTYPE) + else: + ordered_hessians = rng.lognormal(size=n_samples).astype(G_H_DTYPE) + + hist_parent = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) + if constant_hessian: + _build_histogram_no_hessian( + 0, sample_indices, binned_feature, ordered_gradients, hist_parent + ) + else: + _build_histogram( + 0, + sample_indices, + binned_feature, + ordered_gradients, + ordered_hessians, + hist_parent, + ) + + mask = rng.randint(0, 2, n_samples).astype(bool) + + sample_indices_left = sample_indices[mask] + ordered_gradients_left = ordered_gradients[mask] + ordered_hessians_left = ordered_hessians[mask] + hist_left = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) + if constant_hessian: + _build_histogram_no_hessian( + 0, sample_indices_left, binned_feature, ordered_gradients_left, hist_left + ) + else: + _build_histogram( + 0, + sample_indices_left, + binned_feature, + ordered_gradients_left, + ordered_hessians_left, + hist_left, + ) + + sample_indices_right = sample_indices[~mask] + ordered_gradients_right = ordered_gradients[~mask] + ordered_hessians_right = ordered_hessians[~mask] + hist_right = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) + if constant_hessian: + _build_histogram_no_hessian( + 0, sample_indices_right, binned_feature, ordered_gradients_right, hist_right + ) + else: + _build_histogram( + 0, + sample_indices_right, + binned_feature, + ordered_gradients_right, + ordered_hessians_right, + hist_right, + ) + + hist_left_sub = np.copy(hist_parent) + hist_right_sub = np.copy(hist_parent) + _subtract_histograms(0, n_bins, hist_left_sub, hist_right) + _subtract_histograms(0, n_bins, hist_right_sub, hist_left) + + for key in ("count", "sum_hessians", "sum_gradients"): + assert_allclose(hist_left[key], hist_left_sub[key], rtol=1e-6) + assert_allclose(hist_right[key], hist_right_sub[key], rtol=1e-6) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_monotonic_constraints.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_monotonic_constraints.py new file mode 100644 index 0000000000000000000000000000000000000000..56b6068d794e8c96c24ee0ef18dbad3f66ad64b0 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_monotonic_constraints.py @@ -0,0 +1,446 @@ +import re + +import numpy as np +import pytest + +from sklearn.ensemble import ( + HistGradientBoostingClassifier, + HistGradientBoostingRegressor, +) +from sklearn.ensemble._hist_gradient_boosting.common import ( + G_H_DTYPE, + X_BINNED_DTYPE, + MonotonicConstraint, +) +from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower +from sklearn.ensemble._hist_gradient_boosting.histogram import HistogramBuilder +from sklearn.ensemble._hist_gradient_boosting.splitting import ( + Splitter, + compute_node_value, +) +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads +from sklearn.utils._testing import _convert_container + +n_threads = _openmp_effective_n_threads() + + +def is_increasing(a): + return (np.diff(a) >= 0.0).all() + + +def is_decreasing(a): + return (np.diff(a) <= 0.0).all() + + +def assert_leaves_values_monotonic(predictor, monotonic_cst): + # 
make sure leaves values (from left to right) are either all increasing + # or all decreasing (or neither) depending on the monotonic constraint. + nodes = predictor.nodes + + def get_leaves_values(): + """get leaves values from left to right""" + values = [] + + def depth_first_collect_leaf_values(node_idx): + node = nodes[node_idx] + if node["is_leaf"]: + values.append(node["value"]) + return + depth_first_collect_leaf_values(node["left"]) + depth_first_collect_leaf_values(node["right"]) + + depth_first_collect_leaf_values(0) # start at root (0) + return values + + values = get_leaves_values() + + if monotonic_cst == MonotonicConstraint.NO_CST: + # some increasing, some decreasing + assert not is_increasing(values) and not is_decreasing(values) + elif monotonic_cst == MonotonicConstraint.POS: + # all increasing + assert is_increasing(values) + else: # NEG + # all decreasing + assert is_decreasing(values) + + +def assert_children_values_monotonic(predictor, monotonic_cst): + # Make sure siblings values respect the monotonic constraints. Left should + # be lower (resp greater) than right child if constraint is POS (resp. + # NEG). + # Note that this property alone isn't enough to ensure full monotonicity, + # since we also need to guanrantee that all the descendents of the left + # child won't be greater (resp. lower) than the right child, or its + # descendents. That's why we need to bound the predicted values (this is + # tested in assert_children_values_bounded) + nodes = predictor.nodes + left_lower = [] + left_greater = [] + for node in nodes: + if node["is_leaf"]: + continue + + left_idx = node["left"] + right_idx = node["right"] + + if nodes[left_idx]["value"] < nodes[right_idx]["value"]: + left_lower.append(node) + elif nodes[left_idx]["value"] > nodes[right_idx]["value"]: + left_greater.append(node) + + if monotonic_cst == MonotonicConstraint.NO_CST: + assert left_lower and left_greater + elif monotonic_cst == MonotonicConstraint.POS: + assert left_lower and not left_greater + else: # NEG + assert not left_lower and left_greater + + +def assert_children_values_bounded(grower, monotonic_cst): + # Make sure that the values of the children of a node are bounded by the + # middle value between that node and its sibling (if there is a monotonic + # constraint). 
+ # As a bonus, we also check that the siblings values are properly ordered + # which is slightly redundant with assert_children_values_monotonic (but + # this check is done on the grower nodes whereas + # assert_children_values_monotonic is done on the predictor nodes) + + if monotonic_cst == MonotonicConstraint.NO_CST: + return + + def recursively_check_children_node_values(node, right_sibling=None): + if node.is_leaf: + return + if right_sibling is not None: + middle = (node.value + right_sibling.value) / 2 + if monotonic_cst == MonotonicConstraint.POS: + assert node.left_child.value <= node.right_child.value <= middle + if not right_sibling.is_leaf: + assert ( + middle + <= right_sibling.left_child.value + <= right_sibling.right_child.value + ) + else: # NEG + assert node.left_child.value >= node.right_child.value >= middle + if not right_sibling.is_leaf: + assert ( + middle + >= right_sibling.left_child.value + >= right_sibling.right_child.value + ) + + recursively_check_children_node_values( + node.left_child, right_sibling=node.right_child + ) + recursively_check_children_node_values(node.right_child) + + recursively_check_children_node_values(grower.root) + + +@pytest.mark.parametrize("seed", range(3)) +@pytest.mark.parametrize( + "monotonic_cst", + ( + MonotonicConstraint.NO_CST, + MonotonicConstraint.POS, + MonotonicConstraint.NEG, + ), +) +def test_nodes_values(monotonic_cst, seed): + # Build a single tree with only one feature, and make sure the nodes + # values respect the monotonic constraints. + + # Considering the following tree with a monotonic POS constraint, we + # should have: + # + # root + # / \ + # 5 10 # middle = 7.5 + # / \ / \ + # a b c d + # + # a <= b and c <= d (assert_children_values_monotonic) + # a, b <= middle <= c, d (assert_children_values_bounded) + # a <= b <= c <= d (assert_leaves_values_monotonic) + # + # The last one is a consequence of the others, but can't hurt to check + + rng = np.random.RandomState(seed) + n_samples = 1000 + n_features = 1 + X_binned = rng.randint(0, 255, size=(n_samples, n_features), dtype=np.uint8) + X_binned = np.asfortranarray(X_binned) + + gradients = rng.normal(size=n_samples).astype(G_H_DTYPE) + hessians = np.ones(shape=1, dtype=G_H_DTYPE) + + grower = TreeGrower( + X_binned, gradients, hessians, monotonic_cst=[monotonic_cst], shrinkage=0.1 + ) + grower.grow() + + # grow() will shrink the leaves values at the very end. For our comparison + # tests, we need to revert the shrinkage of the leaves, else we would + # compare the value of a leaf (shrunk) with a node (not shrunk) and the + # test would not be correct. + for leave in grower.finalized_leaves: + leave.value /= grower.shrinkage + + # We pass undefined binning_thresholds because we won't use predict anyway + predictor = grower.make_predictor( + binning_thresholds=np.zeros((X_binned.shape[1], X_binned.max() + 1)) + ) + + # The consistency of the bounds can only be checked on the tree grower + # as the node bounds are not copied into the predictor tree. The + # consistency checks on the values of node children and leaves can be + # done either on the grower tree or on the predictor tree. We only + # do those checks on the predictor tree as the latter is derived from + # the former. 
+ assert_children_values_monotonic(predictor, monotonic_cst) + assert_children_values_bounded(grower, monotonic_cst) + assert_leaves_values_monotonic(predictor, monotonic_cst) + + +@pytest.mark.parametrize("use_feature_names", (True, False)) +def test_predictions(global_random_seed, use_feature_names): + # Train a model with a POS constraint on the first non-categorical feature + # and a NEG constraint on the second non-categorical feature, and make sure + # the constraints are respected by checking the predictions. + # test adapted from lightgbm's test_monotone_constraint(), itself inspired + # by https://xgboost.readthedocs.io/en/latest/tutorials/monotonic.html + + rng = np.random.RandomState(global_random_seed) + + n_samples = 1000 + f_0 = rng.rand(n_samples) # positive correlation with y + f_1 = rng.rand(n_samples) # negative correlation with y + + # extra categorical features, no correlation with y, + # to check the correctness of monotonicity constraint remapping, see issue #28898 + f_a = rng.randint(low=0, high=9, size=n_samples) + f_b = rng.randint(low=0, high=9, size=n_samples) + f_c = rng.randint(low=0, high=9, size=n_samples) + + X = np.c_[f_a, f_0, f_b, f_1, f_c] + columns_name = ["f_a", "f_0", "f_b", "f_1", "f_c"] + constructor_name = "dataframe" if use_feature_names else "array" + X = _convert_container(X, constructor_name, columns_name=columns_name) + + noise = rng.normal(loc=0.0, scale=0.01, size=n_samples) + y = 5 * f_0 + np.sin(10 * np.pi * f_0) - 5 * f_1 - np.cos(10 * np.pi * f_1) + noise + + if use_feature_names: + monotonic_cst = {"f_0": +1, "f_1": -1} + categorical_features = ["f_a", "f_b", "f_c"] + else: + monotonic_cst = [0, +1, 0, -1, 0] + categorical_features = [0, 2, 4] + + gbdt = HistGradientBoostingRegressor( + monotonic_cst=monotonic_cst, categorical_features=categorical_features + ) + gbdt.fit(X, y) + + linspace = np.linspace(0, 1, 100) + sin = np.sin(linspace) + constant = np.full_like(linspace, fill_value=0.5) + + # We now assert the predictions properly respect the constraints, on each + # feature. When testing for a feature we need to set the other one to a + # constant, because the monotonic constraints are only a "all else being + # equal" type of constraints: + # a constraint on the first feature only means that + # x0 < x0' => f(x0, x1) < f(x0', x1) + # while x1 stays constant. 
+    # The constraint does not guarantee that
+    # x0 < x0' => f(x0, x1) < f(x0', x1')
+
+    # First non-categorical feature (POS)
+    # assert pred is all increasing when f_0 is all increasing
+    X = np.c_[constant, linspace, constant, constant, constant]
+    X = _convert_container(X, constructor_name, columns_name=columns_name)
+    pred = gbdt.predict(X)
+    assert is_increasing(pred)
+    # assert pred actually follows the variations of f_0
+    X = np.c_[constant, sin, constant, constant, constant]
+    X = _convert_container(X, constructor_name, columns_name=columns_name)
+    pred = gbdt.predict(X)
+    assert np.all((np.diff(pred) >= 0) == (np.diff(sin) >= 0))
+
+    # Second non-categorical feature (NEG)
+    # assert pred is all decreasing when f_1 is all increasing
+    X = np.c_[constant, constant, constant, linspace, constant]
+    X = _convert_container(X, constructor_name, columns_name=columns_name)
+    pred = gbdt.predict(X)
+    assert is_decreasing(pred)
+    # assert pred actually follows the inverse variations of f_1
+    X = np.c_[constant, constant, constant, sin, constant]
+    X = _convert_container(X, constructor_name, columns_name=columns_name)
+    pred = gbdt.predict(X)
+    assert ((np.diff(pred) <= 0) == (np.diff(sin) >= 0)).all()
+
+
+def test_input_error():
+    X = [[1, 2], [2, 3], [3, 4]]
+    y = [0, 1, 2]
+
+    gbdt = HistGradientBoostingRegressor(monotonic_cst=[1, 0, -1])
+    with pytest.raises(
+        ValueError, match=re.escape("monotonic_cst has shape (3,) but the input data")
+    ):
+        gbdt.fit(X, y)
+
+    for monotonic_cst in ([1, 3], [1, -3], [0.3, -0.7]):
+        gbdt = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst)
+        expected_msg = re.escape(
+            "must be an array-like of -1, 0 or 1. Observed values:"
+        )
+        with pytest.raises(ValueError, match=expected_msg):
+            gbdt.fit(X, y)
+
+    gbdt = HistGradientBoostingClassifier(monotonic_cst=[0, 1])
+    with pytest.raises(
+        ValueError,
+        match="monotonic constraints are not supported for multiclass classification",
+    ):
+        gbdt.fit(X, y)
+
+
+def test_input_error_related_to_feature_names():
+    pd = pytest.importorskip("pandas")
+    X = pd.DataFrame({"a": [0, 1, 2], "b": [0, 1, 2]})
+    y = np.array([0, 1, 0])
+
+    monotonic_cst = {"d": 1, "a": 1, "c": -1}
+    gbdt = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst)
+    expected_msg = re.escape(
+        "monotonic_cst contains 2 unexpected feature names: ['c', 'd']."
+    )
+    with pytest.raises(ValueError, match=expected_msg):
+        gbdt.fit(X, y)
+
+    monotonic_cst = {k: 1 for k in "abcdefghijklmnopqrstuvwxyz"}
+    gbdt = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst)
+    expected_msg = re.escape(
+        "monotonic_cst contains 24 unexpected feature names: "
+        "['c', 'd', 'e', 'f', 'g', '...']."
+    )
+    with pytest.raises(ValueError, match=expected_msg):
+        gbdt.fit(X, y)
+
+    monotonic_cst = {"a": 1}
+    gbdt = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst)
+    expected_msg = re.escape(
+        "HistGradientBoostingRegressor was not fitted on data with feature "
+        "names. Pass monotonic_cst as an integer array instead."
+    )
+    with pytest.raises(ValueError, match=expected_msg):
+        gbdt.fit(X.values, y)
+
+    monotonic_cst = {"b": -1, "a": "+"}
+    gbdt = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst)
+    expected_msg = re.escape("monotonic_cst['a'] must be either -1, 0 or 1. Got '+'.")
+    with pytest.raises(ValueError, match=expected_msg):
+        gbdt.fit(X, y)
+
+
+def test_bounded_value_min_gain_to_split():
+    # The purpose of this test is to show that when computing the gain at a
+    # given split, the value of the current node should be properly bounded to
+    # respect the monotonic constraints, because it strongly interacts with
+    # min_gain_to_split. We build a simple example where gradients are
+    # [1, 1, 100, 1, 1] (hessians are all ones). The best split happens on the
+    # 3rd bin, and depending on whether the value of the node is bounded or
+    # not, the min_gain_to_split constraint is or isn't satisfied.
+    l2_regularization = 0
+    min_hessian_to_split = 0
+    min_samples_leaf = 1
+    n_bins = n_samples = 5
+    X_binned = np.arange(n_samples).reshape(-1, 1).astype(X_BINNED_DTYPE)
+    sample_indices = np.arange(n_samples, dtype=np.uint32)
+    all_hessians = np.ones(n_samples, dtype=G_H_DTYPE)
+    all_gradients = np.array([1, 1, 100, 1, 1], dtype=G_H_DTYPE)
+    sum_gradients = all_gradients.sum()
+    sum_hessians = all_hessians.sum()
+    hessians_are_constant = False
+
+    builder = HistogramBuilder(
+        X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant, n_threads
+    )
+    n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1], dtype=np.uint32)
+    has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8)
+    monotonic_cst = np.array(
+        [MonotonicConstraint.NO_CST] * X_binned.shape[1], dtype=np.int8
+    )
+    is_categorical = np.zeros_like(monotonic_cst, dtype=np.uint8)
+    missing_values_bin_idx = n_bins - 1
+    children_lower_bound, children_upper_bound = -np.inf, np.inf
+
+    min_gain_to_split = 2000
+    splitter = Splitter(
+        X_binned,
+        n_bins_non_missing,
+        missing_values_bin_idx,
+        has_missing_values,
+        is_categorical,
+        monotonic_cst,
+        l2_regularization,
+        min_hessian_to_split,
+        min_samples_leaf,
+        min_gain_to_split,
+        hessians_are_constant,
+    )
+
+    histograms = builder.compute_histograms_brute(sample_indices)
+
+    # Since the gradient array is [1, 1, 100, 1, 1],
+    # the max possible gain happens on the 3rd bin (or equivalently on the 2nd)
+    # and is equal to about 1307, which is less than min_gain_to_split = 2000,
+    # so the node is considered unsplittable (gain = -1)
+    current_lower_bound, current_upper_bound = -np.inf, np.inf
+    value = compute_node_value(
+        sum_gradients,
+        sum_hessians,
+        current_lower_bound,
+        current_upper_bound,
+        l2_regularization,
+    )
+    # the unbounded value is equal to -sum_gradients / sum_hessians
+    assert value == pytest.approx(-104 / 5)
+    split_info = splitter.find_node_split(
+        n_samples,
+        histograms,
+        sum_gradients,
+        sum_hessians,
+        value,
+        lower_bound=children_lower_bound,
+        upper_bound=children_upper_bound,
+    )
+    assert split_info.gain == -1  # min_gain_to_split not respected
+
+    # here again the max possible gain is on the 3rd bin but we now cap the
+    # value of the node into [-10, inf].
+    # This means the gain is now about 2430, which is more than the
+    # min_gain_to_split constraint.
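# A rough sanity check of where the "about 1307" and "about 2430" figures come
# from, assuming the splitter scores a candidate split as
#   gain = loss(parent) - loss(left) - loss(right)
# with loss(node) = sum_gradients * value and value = clip(-G / (H + l2), bounds).
# This is only a sketch of the bookkeeping, not the exact Cython implementation:
import numpy as np

g = np.array([1.0, 1.0, 100.0, 1.0, 1.0])  # per-sample gradients
h = np.ones_like(g)                        # unit hessians, l2 = 0

def node_loss(G, H, lo=-np.inf, hi=np.inf):
    value = np.clip(-G / H, lo, hi)  # (possibly clipped) Newton step
    return G * value

# best candidate split: bins {0, 1, 2} go left, bins {3, 4} go right
loss_left = node_loss(g[:3].sum(), h[:3].sum())
loss_right = node_loss(g[3:].sum(), h[3:].sum())

gain_unbounded = node_loss(g.sum(), h.sum()) - loss_left - loss_right       # ~1306.8 < 2000
gain_bounded = node_loss(g.sum(), h.sum(), lo=-10) - loss_left - loss_right  # ~2430.0 > 2000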
+ current_lower_bound, current_upper_bound = -10, np.inf + value = compute_node_value( + sum_gradients, + sum_hessians, + current_lower_bound, + current_upper_bound, + l2_regularization, + ) + assert value == -10 + split_info = splitter.find_node_split( + n_samples, + histograms, + sum_gradients, + sum_hessians, + value, + lower_bound=children_lower_bound, + upper_bound=children_upper_bound, + ) + assert split_info.gain > min_gain_to_split diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py new file mode 100644 index 0000000000000000000000000000000000000000..3c3c9ae81bac2d498c460bfb5f2173f8c48693d1 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py @@ -0,0 +1,187 @@ +import numpy as np +import pytest +from numpy.testing import assert_allclose + +from sklearn.datasets import make_regression +from sklearn.ensemble._hist_gradient_boosting._bitset import ( + set_bitset_memoryview, + set_raw_bitset_from_binned_bitset, +) +from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper +from sklearn.ensemble._hist_gradient_boosting.common import ( + ALMOST_INF, + G_H_DTYPE, + PREDICTOR_RECORD_DTYPE, + X_BINNED_DTYPE, + X_BITSET_INNER_DTYPE, + X_DTYPE, +) +from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower +from sklearn.ensemble._hist_gradient_boosting.predictor import TreePredictor +from sklearn.metrics import r2_score +from sklearn.model_selection import train_test_split +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads + +n_threads = _openmp_effective_n_threads() + + +@pytest.mark.parametrize("n_bins", [200, 256]) +def test_regression_dataset(n_bins): + X, y = make_regression( + n_samples=500, n_features=10, n_informative=5, random_state=42 + ) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + + mapper = _BinMapper(n_bins=n_bins, random_state=42) + X_train_binned = mapper.fit_transform(X_train) + + # Init gradients and hessians to that of least squares loss + gradients = -y_train.astype(G_H_DTYPE) + hessians = np.ones(1, dtype=G_H_DTYPE) + + min_samples_leaf = 10 + max_leaf_nodes = 30 + grower = TreeGrower( + X_train_binned, + gradients, + hessians, + min_samples_leaf=min_samples_leaf, + max_leaf_nodes=max_leaf_nodes, + n_bins=n_bins, + n_bins_non_missing=mapper.n_bins_non_missing_, + ) + grower.grow() + + predictor = grower.make_predictor(binning_thresholds=mapper.bin_thresholds_) + + known_cat_bitsets = np.zeros((0, 8), dtype=X_BITSET_INNER_DTYPE) + f_idx_map = np.zeros(0, dtype=np.uint32) + + y_pred_train = predictor.predict(X_train, known_cat_bitsets, f_idx_map, n_threads) + assert r2_score(y_train, y_pred_train) > 0.82 + + y_pred_test = predictor.predict(X_test, known_cat_bitsets, f_idx_map, n_threads) + assert r2_score(y_test, y_pred_test) > 0.67 + + +@pytest.mark.parametrize( + "num_threshold, expected_predictions", + [ + (-np.inf, [0, 1, 1, 1]), + (10, [0, 0, 1, 1]), + (20, [0, 0, 0, 1]), + (ALMOST_INF, [0, 0, 0, 1]), + (np.inf, [0, 0, 0, 0]), + ], +) +def test_infinite_values_and_thresholds(num_threshold, expected_predictions): + # Make sure infinite values and infinite thresholds are handled properly. + # In particular, if a value is +inf and the threshold is ALMOST_INF the + # sample should go to the right child. 
If the threshold is inf (split on + # nan), the +inf sample will go to the left child. + + X = np.array([-np.inf, 10, 20, np.inf]).reshape(-1, 1) + nodes = np.zeros(3, dtype=PREDICTOR_RECORD_DTYPE) + + # We just construct a simple tree with 1 root and 2 children + # parent node + nodes[0]["left"] = 1 + nodes[0]["right"] = 2 + nodes[0]["feature_idx"] = 0 + nodes[0]["num_threshold"] = num_threshold + + # left child + nodes[1]["is_leaf"] = True + nodes[1]["value"] = 0 + + # right child + nodes[2]["is_leaf"] = True + nodes[2]["value"] = 1 + + binned_cat_bitsets = np.zeros((0, 8), dtype=X_BITSET_INNER_DTYPE) + raw_categorical_bitsets = np.zeros((0, 8), dtype=X_BITSET_INNER_DTYPE) + known_cat_bitset = np.zeros((0, 8), dtype=X_BITSET_INNER_DTYPE) + f_idx_map = np.zeros(0, dtype=np.uint32) + + predictor = TreePredictor(nodes, binned_cat_bitsets, raw_categorical_bitsets) + predictions = predictor.predict(X, known_cat_bitset, f_idx_map, n_threads) + + assert np.all(predictions == expected_predictions) + + +@pytest.mark.parametrize( + "bins_go_left, expected_predictions", + [ + ([0, 3, 4, 6], [1, 0, 0, 1, 1, 0]), + ([0, 1, 2, 6], [1, 1, 1, 0, 0, 0]), + ([3, 5, 6], [0, 0, 0, 1, 0, 1]), + ], +) +def test_categorical_predictor(bins_go_left, expected_predictions): + # Test predictor outputs are correct with categorical features + + X_binned = np.array([[0, 1, 2, 3, 4, 5]], dtype=X_BINNED_DTYPE).T + categories = np.array([2, 5, 6, 8, 10, 15], dtype=X_DTYPE) + + bins_go_left = np.array(bins_go_left, dtype=X_BINNED_DTYPE) + + # We just construct a simple tree with 1 root and 2 children + # parent node + nodes = np.zeros(3, dtype=PREDICTOR_RECORD_DTYPE) + nodes[0]["left"] = 1 + nodes[0]["right"] = 2 + nodes[0]["feature_idx"] = 0 + nodes[0]["is_categorical"] = True + nodes[0]["missing_go_to_left"] = True + + # left child + nodes[1]["is_leaf"] = True + nodes[1]["value"] = 1 + + # right child + nodes[2]["is_leaf"] = True + nodes[2]["value"] = 0 + + binned_cat_bitsets = np.zeros((1, 8), dtype=X_BITSET_INNER_DTYPE) + raw_categorical_bitsets = np.zeros((1, 8), dtype=X_BITSET_INNER_DTYPE) + for go_left in bins_go_left: + set_bitset_memoryview(binned_cat_bitsets[0], go_left) + + set_raw_bitset_from_binned_bitset( + raw_categorical_bitsets[0], binned_cat_bitsets[0], categories + ) + + predictor = TreePredictor(nodes, binned_cat_bitsets, raw_categorical_bitsets) + + # Check binned data gives correct predictions + prediction_binned = predictor.predict_binned( + X_binned, missing_values_bin_idx=6, n_threads=n_threads + ) + assert_allclose(prediction_binned, expected_predictions) + + # manually construct bitset + known_cat_bitsets = np.zeros((1, 8), dtype=np.uint32) + known_cat_bitsets[0, 0] = np.sum(2**categories, dtype=np.uint32) + f_idx_map = np.array([0], dtype=np.uint32) + + # Check with un-binned data + predictions = predictor.predict( + categories.reshape(-1, 1), known_cat_bitsets, f_idx_map, n_threads + ) + assert_allclose(predictions, expected_predictions) + + # Check missing goes left because missing_values_bin_idx=6 + X_binned_missing = np.array([[6]], dtype=X_BINNED_DTYPE).T + predictions = predictor.predict_binned( + X_binned_missing, missing_values_bin_idx=6, n_threads=n_threads + ) + assert_allclose(predictions, [1]) + + # missing and unknown go left + predictions = predictor.predict( + np.array([[np.nan, 17]], dtype=X_DTYPE).T, + known_cat_bitsets, + f_idx_map, + n_threads, + ) + assert_allclose(predictions, [1, 1]) diff --git 
a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py new file mode 100644 index 0000000000000000000000000000000000000000..388697340e08b545be766c6d46cf7362371bc258 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py @@ -0,0 +1,1070 @@ +import numpy as np +import pytest +from numpy.testing import assert_array_equal + +from sklearn.ensemble._hist_gradient_boosting.common import ( + G_H_DTYPE, + HISTOGRAM_DTYPE, + X_BINNED_DTYPE, + MonotonicConstraint, +) +from sklearn.ensemble._hist_gradient_boosting.histogram import HistogramBuilder +from sklearn.ensemble._hist_gradient_boosting.splitting import ( + Splitter, + compute_node_value, +) +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads +from sklearn.utils._testing import skip_if_32bit + +n_threads = _openmp_effective_n_threads() + + +@pytest.mark.parametrize("n_bins", [3, 32, 256]) +def test_histogram_split(n_bins): + rng = np.random.RandomState(42) + feature_idx = 0 + l2_regularization = 0 + min_hessian_to_split = 1e-3 + min_samples_leaf = 1 + min_gain_to_split = 0.0 + X_binned = np.asfortranarray( + rng.randint(0, n_bins - 1, size=(int(1e4), 1)), dtype=X_BINNED_DTYPE + ) + binned_feature = X_binned.T[feature_idx] + sample_indices = np.arange(binned_feature.shape[0], dtype=np.uint32) + ordered_hessians = np.ones_like(binned_feature, dtype=G_H_DTYPE) + all_hessians = ordered_hessians + sum_hessians = all_hessians.sum() + hessians_are_constant = False + + for true_bin in range(1, n_bins - 2): + for sign in [-1, 1]: + ordered_gradients = np.full_like(binned_feature, sign, dtype=G_H_DTYPE) + ordered_gradients[binned_feature <= true_bin] *= -1 + all_gradients = ordered_gradients + sum_gradients = all_gradients.sum() + + builder = HistogramBuilder( + X_binned, + n_bins, + all_gradients, + all_hessians, + hessians_are_constant, + n_threads, + ) + n_bins_non_missing = np.array( + [n_bins - 1] * X_binned.shape[1], dtype=np.uint32 + ) + has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) + monotonic_cst = np.array( + [MonotonicConstraint.NO_CST] * X_binned.shape[1], dtype=np.int8 + ) + is_categorical = np.zeros_like(monotonic_cst, dtype=np.uint8) + missing_values_bin_idx = n_bins - 1 + splitter = Splitter( + X_binned, + n_bins_non_missing, + missing_values_bin_idx, + has_missing_values, + is_categorical, + monotonic_cst, + l2_regularization, + min_hessian_to_split, + min_samples_leaf, + min_gain_to_split, + hessians_are_constant, + ) + + histograms = builder.compute_histograms_brute(sample_indices) + value = compute_node_value( + sum_gradients, sum_hessians, -np.inf, np.inf, l2_regularization + ) + split_info = splitter.find_node_split( + sample_indices.shape[0], histograms, sum_gradients, sum_hessians, value + ) + + assert split_info.bin_idx == true_bin + assert split_info.gain >= 0 + assert split_info.feature_idx == feature_idx + assert ( + split_info.n_samples_left + split_info.n_samples_right + == sample_indices.shape[0] + ) + # Constant hessian: 1. per sample. 
+ assert split_info.n_samples_left == split_info.sum_hessian_left + + +@skip_if_32bit +@pytest.mark.parametrize("constant_hessian", [True, False]) +def test_gradient_and_hessian_sanity(constant_hessian): + # This test checks that the values of gradients and hessians are + # consistent in different places: + # - in split_info: si.sum_gradient_left + si.sum_gradient_right must be + # equal to the gradient at the node. Same for hessians. + # - in the histograms: summing 'sum_gradients' over the bins must be + # constant across all features, and those sums must be equal to the + # node's gradient. Same for hessians. + + rng = np.random.RandomState(42) + + n_bins = 10 + n_features = 20 + n_samples = 500 + l2_regularization = 0.0 + min_hessian_to_split = 1e-3 + min_samples_leaf = 1 + min_gain_to_split = 0.0 + + X_binned = rng.randint( + 0, n_bins, size=(n_samples, n_features), dtype=X_BINNED_DTYPE + ) + X_binned = np.asfortranarray(X_binned) + sample_indices = np.arange(n_samples, dtype=np.uint32) + all_gradients = rng.randn(n_samples).astype(G_H_DTYPE) + sum_gradients = all_gradients.sum() + if constant_hessian: + all_hessians = np.ones(1, dtype=G_H_DTYPE) + sum_hessians = 1 * n_samples + else: + all_hessians = rng.lognormal(size=n_samples).astype(G_H_DTYPE) + sum_hessians = all_hessians.sum() + + builder = HistogramBuilder( + X_binned, n_bins, all_gradients, all_hessians, constant_hessian, n_threads + ) + n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1], dtype=np.uint32) + has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) + monotonic_cst = np.array( + [MonotonicConstraint.NO_CST] * X_binned.shape[1], dtype=np.int8 + ) + is_categorical = np.zeros_like(monotonic_cst, dtype=np.uint8) + missing_values_bin_idx = n_bins - 1 + splitter = Splitter( + X_binned, + n_bins_non_missing, + missing_values_bin_idx, + has_missing_values, + is_categorical, + monotonic_cst, + l2_regularization, + min_hessian_to_split, + min_samples_leaf, + min_gain_to_split, + constant_hessian, + ) + + hists_parent = builder.compute_histograms_brute(sample_indices) + value_parent = compute_node_value( + sum_gradients, sum_hessians, -np.inf, np.inf, l2_regularization + ) + si_parent = splitter.find_node_split( + n_samples, hists_parent, sum_gradients, sum_hessians, value_parent + ) + sample_indices_left, sample_indices_right, _ = splitter.split_indices( + si_parent, sample_indices + ) + + hists_left = builder.compute_histograms_brute(sample_indices_left) + value_left = compute_node_value( + si_parent.sum_gradient_left, + si_parent.sum_hessian_left, + -np.inf, + np.inf, + l2_regularization, + ) + hists_right = builder.compute_histograms_brute(sample_indices_right) + value_right = compute_node_value( + si_parent.sum_gradient_right, + si_parent.sum_hessian_right, + -np.inf, + np.inf, + l2_regularization, + ) + si_left = splitter.find_node_split( + n_samples, + hists_left, + si_parent.sum_gradient_left, + si_parent.sum_hessian_left, + value_left, + ) + si_right = splitter.find_node_split( + n_samples, + hists_right, + si_parent.sum_gradient_right, + si_parent.sum_hessian_right, + value_right, + ) + + # make sure that si.sum_gradient_left + si.sum_gradient_right have their + # expected value, same for hessians + for si, indices in ( + (si_parent, sample_indices), + (si_left, sample_indices_left), + (si_right, sample_indices_right), + ): + gradient = si.sum_gradient_right + si.sum_gradient_left + expected_gradient = all_gradients[indices].sum() + hessian = si.sum_hessian_right + 
si.sum_hessian_left + if constant_hessian: + expected_hessian = indices.shape[0] * all_hessians[0] + else: + expected_hessian = all_hessians[indices].sum() + + assert np.isclose(gradient, expected_gradient) + assert np.isclose(hessian, expected_hessian) + + # make sure sum of gradients in histograms are the same for all features, + # and make sure they're equal to their expected value + hists_parent = np.asarray(hists_parent, dtype=HISTOGRAM_DTYPE) + hists_left = np.asarray(hists_left, dtype=HISTOGRAM_DTYPE) + hists_right = np.asarray(hists_right, dtype=HISTOGRAM_DTYPE) + for hists, indices in ( + (hists_parent, sample_indices), + (hists_left, sample_indices_left), + (hists_right, sample_indices_right), + ): + # note: gradients and hessians have shape (n_features,), + # we're comparing them to *scalars*. This has the benefit of also + # making sure that all the entries are equal across features. + gradients = hists["sum_gradients"].sum(axis=1) # shape = (n_features,) + expected_gradient = all_gradients[indices].sum() # scalar + hessians = hists["sum_hessians"].sum(axis=1) + if constant_hessian: + # 0 is not the actual hessian, but it's not computed in this case + expected_hessian = 0.0 + else: + expected_hessian = all_hessians[indices].sum() + + assert np.allclose(gradients, expected_gradient) + assert np.allclose(hessians, expected_hessian) + + +def test_split_indices(): + # Check that split_indices returns the correct splits and that + # splitter.partition is consistent with what is returned. + rng = np.random.RandomState(421) + + n_bins = 5 + n_samples = 10 + l2_regularization = 0.0 + min_hessian_to_split = 1e-3 + min_samples_leaf = 1 + min_gain_to_split = 0.0 + + # split will happen on feature 1 and on bin 3 + X_binned = [ + [0, 0], + [0, 3], + [0, 4], + [0, 0], + [0, 0], + [0, 0], + [0, 0], + [0, 4], + [0, 0], + [0, 4], + ] + X_binned = np.asfortranarray(X_binned, dtype=X_BINNED_DTYPE) + sample_indices = np.arange(n_samples, dtype=np.uint32) + all_gradients = rng.randn(n_samples).astype(G_H_DTYPE) + all_hessians = np.ones(1, dtype=G_H_DTYPE) + sum_gradients = all_gradients.sum() + sum_hessians = 1 * n_samples + hessians_are_constant = True + + builder = HistogramBuilder( + X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant, n_threads + ) + n_bins_non_missing = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) + has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) + monotonic_cst = np.array( + [MonotonicConstraint.NO_CST] * X_binned.shape[1], dtype=np.int8 + ) + is_categorical = np.zeros_like(monotonic_cst, dtype=np.uint8) + missing_values_bin_idx = n_bins - 1 + splitter = Splitter( + X_binned, + n_bins_non_missing, + missing_values_bin_idx, + has_missing_values, + is_categorical, + monotonic_cst, + l2_regularization, + min_hessian_to_split, + min_samples_leaf, + min_gain_to_split, + hessians_are_constant, + ) + + assert np.all(sample_indices == splitter.partition) + + histograms = builder.compute_histograms_brute(sample_indices) + value = compute_node_value( + sum_gradients, sum_hessians, -np.inf, np.inf, l2_regularization + ) + si_root = splitter.find_node_split( + n_samples, histograms, sum_gradients, sum_hessians, value + ) + + # sanity checks for best split + assert si_root.feature_idx == 1 + assert si_root.bin_idx == 3 + + samples_left, samples_right, position_right = splitter.split_indices( + si_root, splitter.partition + ) + assert set(samples_left) == set([0, 1, 3, 4, 5, 6, 8]) + assert set(samples_right) == set([2, 7, 9]) + + 
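# For reference, a plain-numpy picture of what split_indices is expected to do
# here, assuming it simply reorders splitter.partition so that the samples
# going to the left child (feature 1 binned value <= 3) occupy the first
# positions; samples_left and samples_right are then just the two halves of
# that partition, which the next assertions check:
import numpy as np

feature_1 = np.array([0, 3, 4, 0, 0, 0, 0, 4, 0, 4])
goes_left = feature_1 <= 3
partition_after = np.concatenate(
    [np.flatnonzero(goes_left), np.flatnonzero(~goes_left)]
)
# partition_after[:7] -> {0, 1, 3, 4, 5, 6, 8}, partition_after[7:] -> {2, 7, 9}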
assert list(samples_left) == list(splitter.partition[:position_right]) + assert list(samples_right) == list(splitter.partition[position_right:]) + + # Check that the resulting split indices sizes are consistent with the + # count statistics anticipated when looking for the best split. + assert samples_left.shape[0] == si_root.n_samples_left + assert samples_right.shape[0] == si_root.n_samples_right + + +def test_min_gain_to_split(): + # Try to split a pure node (all gradients are equal, same for hessians) + # with min_gain_to_split = 0 and make sure that the node is not split (best + # possible gain = -1). Note: before the strict inequality comparison, this + # test would fail because the node would be split with a gain of 0. + rng = np.random.RandomState(42) + l2_regularization = 0 + min_hessian_to_split = 0 + min_samples_leaf = 1 + min_gain_to_split = 0.0 + n_bins = 255 + n_samples = 100 + X_binned = np.asfortranarray( + rng.randint(0, n_bins, size=(n_samples, 1)), dtype=X_BINNED_DTYPE + ) + binned_feature = X_binned[:, 0] + sample_indices = np.arange(n_samples, dtype=np.uint32) + all_hessians = np.ones_like(binned_feature, dtype=G_H_DTYPE) + all_gradients = np.ones_like(binned_feature, dtype=G_H_DTYPE) + sum_gradients = all_gradients.sum() + sum_hessians = all_hessians.sum() + hessians_are_constant = False + + builder = HistogramBuilder( + X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant, n_threads + ) + n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1], dtype=np.uint32) + has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) + monotonic_cst = np.array( + [MonotonicConstraint.NO_CST] * X_binned.shape[1], dtype=np.int8 + ) + is_categorical = np.zeros_like(monotonic_cst, dtype=np.uint8) + missing_values_bin_idx = n_bins - 1 + splitter = Splitter( + X_binned, + n_bins_non_missing, + missing_values_bin_idx, + has_missing_values, + is_categorical, + monotonic_cst, + l2_regularization, + min_hessian_to_split, + min_samples_leaf, + min_gain_to_split, + hessians_are_constant, + ) + + histograms = builder.compute_histograms_brute(sample_indices) + value = compute_node_value( + sum_gradients, sum_hessians, -np.inf, np.inf, l2_regularization + ) + split_info = splitter.find_node_split( + n_samples, histograms, sum_gradients, sum_hessians, value + ) + assert split_info.gain == -1 + + +@pytest.mark.parametrize( + ( + "X_binned, all_gradients, has_missing_values, n_bins_non_missing, " + " expected_split_on_nan, expected_bin_idx, expected_go_to_left" + ), + [ + # basic sanity check with no missing values: given the gradient + # values, the split must occur on bin_idx=3 + ( + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], # X_binned + [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], # gradients + False, # no missing values + 10, # n_bins_non_missing + False, # don't split on nans + 3, # expected_bin_idx + "not_applicable", + ), + # We replace 2 samples by NaNs (bin_idx=8) + # These 2 samples were mapped to the left node before, so they should + # be mapped to left node again + # Notice how the bin_idx threshold changes from 3 to 1. 
+ ( + [8, 0, 1, 8, 2, 3, 4, 5, 6, 7], # 8 <=> missing + [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], + True, # missing values + 8, # n_bins_non_missing + False, # don't split on nans + 1, # cut on bin_idx=1 + True, + ), # missing values go to left + # same as above, but with non-consecutive missing_values_bin + ( + [9, 0, 1, 9, 2, 3, 4, 5, 6, 7], # 9 <=> missing + [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], + True, # missing values + 8, # n_bins_non_missing + False, # don't split on nans + 1, # cut on bin_idx=1 + True, + ), # missing values go to left + # this time replacing 2 samples that were on the right. + ( + [0, 1, 2, 3, 8, 4, 8, 5, 6, 7], # 8 <=> missing + [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], + True, # missing values + 8, # n_bins_non_missing + False, # don't split on nans + 3, # cut on bin_idx=3 (like in first case) + False, + ), # missing values go to right + # same as above, but with non-consecutive missing_values_bin + ( + [0, 1, 2, 3, 9, 4, 9, 5, 6, 7], # 9 <=> missing + [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], + True, # missing values + 8, # n_bins_non_missing + False, # don't split on nans + 3, # cut on bin_idx=3 (like in first case) + False, + ), # missing values go to right + # For the following cases, split_on_nans is True (we replace all of + # the samples with nans, instead of just 2). + ( + [0, 1, 2, 3, 4, 4, 4, 4, 4, 4], # 4 <=> missing + [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], + True, # missing values + 4, # n_bins_non_missing + True, # split on nans + 3, # cut on bin_idx=3 + False, + ), # missing values go to right + # same as above, but with non-consecutive missing_values_bin + ( + [0, 1, 2, 3, 9, 9, 9, 9, 9, 9], # 9 <=> missing + [1, 1, 1, 1, 1, 1, 5, 5, 5, 5], + True, # missing values + 4, # n_bins_non_missing + True, # split on nans + 3, # cut on bin_idx=3 + False, + ), # missing values go to right + ( + [6, 6, 6, 6, 0, 1, 2, 3, 4, 5], # 6 <=> missing + [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], + True, # missing values + 6, # n_bins_non_missing + True, # split on nans + 5, # cut on bin_idx=5 + False, + ), # missing values go to right + # same as above, but with non-consecutive missing_values_bin + ( + [9, 9, 9, 9, 0, 1, 2, 3, 4, 5], # 9 <=> missing + [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], + True, # missing values + 6, # n_bins_non_missing + True, # split on nans + 5, # cut on bin_idx=5 + False, + ), # missing values go to right + ], +) +def test_splitting_missing_values( + X_binned, + all_gradients, + has_missing_values, + n_bins_non_missing, + expected_split_on_nan, + expected_bin_idx, + expected_go_to_left, +): + # Make sure missing values are properly supported. + # we build an artificial example with gradients such that the best split + # is on bin_idx=3, when there are no missing values. 
+ # Then we introduce missing values and: + # - make sure the chosen bin is correct (find_best_bin()): it's + # still the same split, even though the index of the bin may change + # - make sure the missing values are mapped to the correct child + # (split_indices()) + + n_bins = max(X_binned) + 1 + n_samples = len(X_binned) + l2_regularization = 0.0 + min_hessian_to_split = 1e-3 + min_samples_leaf = 1 + min_gain_to_split = 0.0 + + sample_indices = np.arange(n_samples, dtype=np.uint32) + X_binned = np.array(X_binned, dtype=X_BINNED_DTYPE).reshape(-1, 1) + X_binned = np.asfortranarray(X_binned) + all_gradients = np.array(all_gradients, dtype=G_H_DTYPE) + has_missing_values = np.array([has_missing_values], dtype=np.uint8) + all_hessians = np.ones(1, dtype=G_H_DTYPE) + sum_gradients = all_gradients.sum() + sum_hessians = 1 * n_samples + hessians_are_constant = True + + builder = HistogramBuilder( + X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant, n_threads + ) + + n_bins_non_missing = np.array([n_bins_non_missing], dtype=np.uint32) + monotonic_cst = np.array( + [MonotonicConstraint.NO_CST] * X_binned.shape[1], dtype=np.int8 + ) + is_categorical = np.zeros_like(monotonic_cst, dtype=np.uint8) + missing_values_bin_idx = n_bins - 1 + splitter = Splitter( + X_binned, + n_bins_non_missing, + missing_values_bin_idx, + has_missing_values, + is_categorical, + monotonic_cst, + l2_regularization, + min_hessian_to_split, + min_samples_leaf, + min_gain_to_split, + hessians_are_constant, + ) + + histograms = builder.compute_histograms_brute(sample_indices) + value = compute_node_value( + sum_gradients, sum_hessians, -np.inf, np.inf, l2_regularization + ) + split_info = splitter.find_node_split( + n_samples, histograms, sum_gradients, sum_hessians, value + ) + + assert split_info.bin_idx == expected_bin_idx + if has_missing_values: + assert split_info.missing_go_to_left == expected_go_to_left + + split_on_nan = split_info.bin_idx == n_bins_non_missing[0] - 1 + assert split_on_nan == expected_split_on_nan + + # Make sure the split is properly computed. + # This also make sure missing values are properly assigned to the correct + # child in split_indices() + samples_left, samples_right, _ = splitter.split_indices( + split_info, splitter.partition + ) + + if not expected_split_on_nan: + # When we don't split on nans, the split should always be the same. + assert set(samples_left) == set([0, 1, 2, 3]) + assert set(samples_right) == set([4, 5, 6, 7, 8, 9]) + else: + # When we split on nans, samples with missing values are always mapped + # to the right child. 
+ missing_samples_indices = np.flatnonzero( + np.array(X_binned) == missing_values_bin_idx + ) + non_missing_samples_indices = np.flatnonzero( + np.array(X_binned) != missing_values_bin_idx + ) + + assert set(samples_right) == set(missing_samples_indices) + assert set(samples_left) == set(non_missing_samples_indices) + + +@pytest.mark.parametrize( + "X_binned, has_missing_values, n_bins_non_missing, ", + [ + # one category + ([0] * 20, False, 1), + # all categories appear less than MIN_CAT_SUPPORT (hardcoded to 10) + ([0] * 9 + [1] * 8, False, 2), + # only one category appears more than MIN_CAT_SUPPORT + ([0] * 12 + [1] * 8, False, 2), + # missing values + category appear less than MIN_CAT_SUPPORT + # 9 is missing + ([0] * 9 + [1] * 8 + [9] * 4, True, 2), + # no non-missing category + ([9] * 11, True, 0), + ], +) +def test_splitting_categorical_cat_smooth( + X_binned, has_missing_values, n_bins_non_missing +): + # Checks categorical splits are correct when the MIN_CAT_SUPPORT constraint + # isn't respected: there are no splits + + n_bins = max(X_binned) + 1 + n_samples = len(X_binned) + X_binned = np.array([X_binned], dtype=X_BINNED_DTYPE).T + X_binned = np.asfortranarray(X_binned) + + l2_regularization = 0.0 + min_hessian_to_split = 1e-3 + min_samples_leaf = 1 + min_gain_to_split = 0.0 + + sample_indices = np.arange(n_samples, dtype=np.uint32) + all_gradients = np.ones(n_samples, dtype=G_H_DTYPE) + has_missing_values = np.array([has_missing_values], dtype=np.uint8) + all_hessians = np.ones(1, dtype=G_H_DTYPE) + sum_gradients = all_gradients.sum() + sum_hessians = n_samples + hessians_are_constant = True + + builder = HistogramBuilder( + X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant, n_threads + ) + + n_bins_non_missing = np.array([n_bins_non_missing], dtype=np.uint32) + monotonic_cst = np.array( + [MonotonicConstraint.NO_CST] * X_binned.shape[1], dtype=np.int8 + ) + is_categorical = np.ones_like(monotonic_cst, dtype=np.uint8) + missing_values_bin_idx = n_bins - 1 + + splitter = Splitter( + X_binned, + n_bins_non_missing, + missing_values_bin_idx, + has_missing_values, + is_categorical, + monotonic_cst, + l2_regularization, + min_hessian_to_split, + min_samples_leaf, + min_gain_to_split, + hessians_are_constant, + ) + + histograms = builder.compute_histograms_brute(sample_indices) + value = compute_node_value( + sum_gradients, sum_hessians, -np.inf, np.inf, l2_regularization + ) + split_info = splitter.find_node_split( + n_samples, histograms, sum_gradients, sum_hessians, value + ) + + # no split found + assert split_info.gain == -1 + + +def _assert_categories_equals_bitset(categories, bitset): + # assert that the bitset exactly corresponds to the categories + # bitset is assumed to be an array of 8 uint32 elements + + # form bitset from threshold + expected_bitset = np.zeros(8, dtype=np.uint32) + for cat in categories: + idx = cat // 32 + shift = cat % 32 + expected_bitset[idx] |= 1 << shift + + # check for equality + assert_array_equal(expected_bitset, bitset) + + +@pytest.mark.parametrize( + ( + "X_binned, all_gradients, expected_categories_left, n_bins_non_missing," + "missing_values_bin_idx, has_missing_values, expected_missing_go_to_left" + ), + [ + # 4 categories + ( + [0, 1, 2, 3] * 11, # X_binned + [10, 1, 10, 10] * 11, # all_gradients + [1], # expected_categories_left + 4, # n_bins_non_missing + 4, # missing_values_bin_idx + False, # has_missing_values + None, + ), # expected_missing_go_to_left, unchecked + # Make sure that the categories that are on the 
right (second half) of + # the sorted categories array can still go in the left child. In this + # case, the best split was found when scanning from right to left. + ( + [0, 1, 2, 3] * 11, # X_binned + [10, 10, 10, 1] * 11, # all_gradients + [3], # expected_categories_left + 4, # n_bins_non_missing + 4, # missing_values_bin_idx + False, # has_missing_values + None, + ), # expected_missing_go_to_left, unchecked + # categories that don't respect MIN_CAT_SUPPORT (cat 4) are always + # mapped to the right child + ( + [0, 1, 2, 3] * 11 + [4] * 5, # X_binned + [10, 10, 10, 1] * 11 + [10] * 5, # all_gradients + [3], # expected_categories_left + 4, # n_bins_non_missing + 4, # missing_values_bin_idx + False, # has_missing_values + None, + ), # expected_missing_go_to_left, unchecked + # categories that don't respect MIN_CAT_SUPPORT are always mapped to + # the right child: in this case a more sensible split could have been + # 3, 4 - 0, 1, 2 + # But the split is still 3 - 0, 1, 2, 4. this is because we only scan + # up to the middle of the sorted category array (0, 1, 2, 3), and + # because we exclude cat 4 in this array. + ( + [0, 1, 2, 3] * 11 + [4] * 5, # X_binned + [10, 10, 10, 1] * 11 + [1] * 5, # all_gradients + [3], # expected_categories_left + 4, # n_bins_non_missing + 4, # missing_values_bin_idx + False, # has_missing_values + None, + ), # expected_missing_go_to_left, unchecked + # 4 categories with missing values that go to the right + ( + [0, 1, 2] * 11 + [9] * 11, # X_binned + [10, 1, 10] * 11 + [10] * 11, # all_gradients + [1], # expected_categories_left + 3, # n_bins_non_missing + 9, # missing_values_bin_idx + True, # has_missing_values + False, + ), # expected_missing_go_to_left + # 4 categories with missing values that go to the left + ( + [0, 1, 2] * 11 + [9] * 11, # X_binned + [10, 1, 10] * 11 + [1] * 11, # all_gradients + [1, 9], # expected_categories_left + 3, # n_bins_non_missing + 9, # missing_values_bin_idx + True, # has_missing_values + True, + ), # expected_missing_go_to_left + # split is on the missing value + ( + [0, 1, 2, 3, 4] * 11 + [255] * 12, # X_binned + [10, 10, 10, 10, 10] * 11 + [1] * 12, # all_gradients + [255], # expected_categories_left + 5, # n_bins_non_missing + 255, # missing_values_bin_idx + True, # has_missing_values + True, + ), # expected_missing_go_to_left + # split on even categories + ( + list(range(60)) * 12, # X_binned + [10, 1] * 360, # all_gradients + list(range(1, 60, 2)), # expected_categories_left + 59, # n_bins_non_missing + 59, # missing_values_bin_idx + True, # has_missing_values + True, + ), # expected_missing_go_to_left + # split on every 8 categories + ( + list(range(256)) * 12, # X_binned + [10, 10, 10, 10, 10, 10, 10, 1] * 384, # all_gradients + list(range(7, 256, 8)), # expected_categories_left + 255, # n_bins_non_missing + 255, # missing_values_bin_idx + True, # has_missing_values + True, + ), # expected_missing_go_to_left + ], +) +def test_splitting_categorical_sanity( + X_binned, + all_gradients, + expected_categories_left, + n_bins_non_missing, + missing_values_bin_idx, + has_missing_values, + expected_missing_go_to_left, +): + # Tests various combinations of categorical splits + + n_samples = len(X_binned) + n_bins = max(X_binned) + 1 + + X_binned = np.array(X_binned, dtype=X_BINNED_DTYPE).reshape(-1, 1) + X_binned = np.asfortranarray(X_binned) + + l2_regularization = 0.0 + min_hessian_to_split = 1e-3 + min_samples_leaf = 1 + min_gain_to_split = 0.0 + + sample_indices = np.arange(n_samples, dtype=np.uint32) + all_gradients = 
np.array(all_gradients, dtype=G_H_DTYPE) + all_hessians = np.ones(1, dtype=G_H_DTYPE) + has_missing_values = np.array([has_missing_values], dtype=np.uint8) + sum_gradients = all_gradients.sum() + sum_hessians = n_samples + hessians_are_constant = True + + builder = HistogramBuilder( + X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant, n_threads + ) + + n_bins_non_missing = np.array([n_bins_non_missing], dtype=np.uint32) + monotonic_cst = np.array( + [MonotonicConstraint.NO_CST] * X_binned.shape[1], dtype=np.int8 + ) + is_categorical = np.ones_like(monotonic_cst, dtype=np.uint8) + + splitter = Splitter( + X_binned, + n_bins_non_missing, + missing_values_bin_idx, + has_missing_values, + is_categorical, + monotonic_cst, + l2_regularization, + min_hessian_to_split, + min_samples_leaf, + min_gain_to_split, + hessians_are_constant, + ) + + histograms = builder.compute_histograms_brute(sample_indices) + + value = compute_node_value( + sum_gradients, sum_hessians, -np.inf, np.inf, l2_regularization + ) + split_info = splitter.find_node_split( + n_samples, histograms, sum_gradients, sum_hessians, value + ) + + assert split_info.is_categorical + assert split_info.gain > 0 + _assert_categories_equals_bitset( + expected_categories_left, split_info.left_cat_bitset + ) + if has_missing_values: + assert split_info.missing_go_to_left == expected_missing_go_to_left + # If there is no missing value during training, the flag missing_go_to_left + # is set later in the grower. + + # make sure samples are split correctly + samples_left, samples_right, _ = splitter.split_indices( + split_info, splitter.partition + ) + + left_mask = np.isin(X_binned.ravel(), expected_categories_left) + assert_array_equal(sample_indices[left_mask], samples_left) + assert_array_equal(sample_indices[~left_mask], samples_right) + + +def test_split_interaction_constraints(): + """Check that allowed_features are respected.""" + n_features = 4 + # features 1 and 2 are not allowed to be split on + allowed_features = np.array([0, 3], dtype=np.uint32) + n_bins = 5 + n_samples = 10 + l2_regularization = 0.0 + min_hessian_to_split = 1e-3 + min_samples_leaf = 1 + min_gain_to_split = 0.0 + + sample_indices = np.arange(n_samples, dtype=np.uint32) + all_hessians = np.ones(1, dtype=G_H_DTYPE) + sum_hessians = n_samples + hessians_are_constant = True + + split_features = [] + + # The loop is to ensure that we split at least once on each allowed feature (0, 3). + # This is tracked by split_features and checked at the end. 
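# The allowed_features plumbing exercised in this test is what backs the
# estimator-level interaction-constraint option; a minimal end-to-end sketch,
# assuming the public parameter is `interaction_cst` and takes groups of
# feature indices that are allowed to interact in child-node splits:
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor

demo_rng = np.random.RandomState(0)
X_demo = demo_rng.randn(200, 4)
y_demo = 10 * X_demo[:, 1] + demo_rng.randn(200)

est = HistGradientBoostingRegressor(interaction_cst=[{0, 3}, {1, 2}], random_state=0)
est.fit(X_demo, y_demo)  # within any branch, only features from one group interact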
+ for i in range(10): + rng = np.random.RandomState(919 + i) + X_binned = np.asfortranarray( + rng.randint(0, n_bins - 1, size=(n_samples, n_features)), + dtype=X_BINNED_DTYPE, + ) + X_binned = np.asfortranarray(X_binned, dtype=X_BINNED_DTYPE) + + # Make feature 1 very important + all_gradients = (10 * X_binned[:, 1] + rng.randn(n_samples)).astype(G_H_DTYPE) + sum_gradients = all_gradients.sum() + + builder = HistogramBuilder( + X_binned, + n_bins, + all_gradients, + all_hessians, + hessians_are_constant, + n_threads, + ) + n_bins_non_missing = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) + has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) + monotonic_cst = np.array( + [MonotonicConstraint.NO_CST] * X_binned.shape[1], dtype=np.int8 + ) + is_categorical = np.zeros_like(monotonic_cst, dtype=np.uint8) + missing_values_bin_idx = n_bins - 1 + splitter = Splitter( + X_binned, + n_bins_non_missing, + missing_values_bin_idx, + has_missing_values, + is_categorical, + monotonic_cst, + l2_regularization, + min_hessian_to_split, + min_samples_leaf, + min_gain_to_split, + hessians_are_constant, + ) + + assert np.all(sample_indices == splitter.partition) + + histograms = builder.compute_histograms_brute(sample_indices) + value = compute_node_value( + sum_gradients, sum_hessians, -np.inf, np.inf, l2_regularization + ) + + # with all features allowed, feature 1 should be split on as it is the most + # important one by construction of the gradients + si_root = splitter.find_node_split( + n_samples, + histograms, + sum_gradients, + sum_hessians, + value, + allowed_features=None, + ) + assert si_root.feature_idx == 1 + + # only features 0 and 3 are allowed to be split on + si_root = splitter.find_node_split( + n_samples, + histograms, + sum_gradients, + sum_hessians, + value, + allowed_features=allowed_features, + ) + split_features.append(si_root.feature_idx) + assert si_root.feature_idx in allowed_features + + # make sure feature 0 and feature 3 are split on in the constraint setting + assert set(allowed_features) == set(split_features) + + +@pytest.mark.parametrize("forbidden_features", [set(), {1, 3}]) +def test_split_feature_fraction_per_split(forbidden_features): + """Check that feature_fraction_per_split is respected. + + Because we set `n_features = 4` and `feature_fraction_per_split = 0.25`, it means + that calling `splitter.find_node_split` will be allowed to select a split for a + single completely random feature at each call. So if we iterate enough, we should + cover all the allowed features, irrespective of the values of the gradients and + Hessians of the objective. 
+ """ + n_features = 4 + allowed_features = np.array( + list(set(range(n_features)) - forbidden_features), dtype=np.uint32 + ) + n_bins = 5 + n_samples = 40 + l2_regularization = 0.0 + min_hessian_to_split = 1e-3 + min_samples_leaf = 1 + min_gain_to_split = 0.0 + rng = np.random.default_rng(42) + + sample_indices = np.arange(n_samples, dtype=np.uint32) + all_gradients = rng.uniform(low=0.5, high=1, size=n_samples).astype(G_H_DTYPE) + sum_gradients = all_gradients.sum() + all_hessians = np.ones(1, dtype=G_H_DTYPE) + sum_hessians = n_samples + hessians_are_constant = True + + X_binned = np.asfortranarray( + rng.integers(low=0, high=n_bins - 1, size=(n_samples, n_features)), + dtype=X_BINNED_DTYPE, + ) + X_binned = np.asfortranarray(X_binned, dtype=X_BINNED_DTYPE) + builder = HistogramBuilder( + X_binned, + n_bins, + all_gradients, + all_hessians, + hessians_are_constant, + n_threads, + ) + histograms = builder.compute_histograms_brute(sample_indices) + value = compute_node_value( + sum_gradients, sum_hessians, -np.inf, np.inf, l2_regularization + ) + n_bins_non_missing = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) + has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) + monotonic_cst = np.array( + [MonotonicConstraint.NO_CST] * X_binned.shape[1], dtype=np.int8 + ) + is_categorical = np.zeros_like(monotonic_cst, dtype=np.uint8) + missing_values_bin_idx = n_bins - 1 + + params = dict( + X_binned=X_binned, + n_bins_non_missing=n_bins_non_missing, + missing_values_bin_idx=missing_values_bin_idx, + has_missing_values=has_missing_values, + is_categorical=is_categorical, + monotonic_cst=monotonic_cst, + l2_regularization=l2_regularization, + min_hessian_to_split=min_hessian_to_split, + min_samples_leaf=min_samples_leaf, + min_gain_to_split=min_gain_to_split, + hessians_are_constant=hessians_are_constant, + rng=rng, + ) + splitter_subsample = Splitter( + feature_fraction_per_split=0.25, # THIS is the important setting here. + **params, + ) + splitter_all_features = Splitter(feature_fraction_per_split=1.0, **params) + + assert np.all(sample_indices == splitter_subsample.partition) + + split_features_subsample = [] + split_features_all = [] + # The loop is to ensure that we split at least once on each feature. + # This is tracked by split_features and checked at the end. + for i in range(20): + si_root = splitter_subsample.find_node_split( + n_samples, + histograms, + sum_gradients, + sum_hessians, + value, + allowed_features=allowed_features, + ) + split_features_subsample.append(si_root.feature_idx) + + # This second splitter is our "counterfactual". + si_root = splitter_all_features.find_node_split( + n_samples, + histograms, + sum_gradients, + sum_hessians, + value, + allowed_features=allowed_features, + ) + split_features_all.append(si_root.feature_idx) + + # Make sure all features are split on. + assert set(split_features_subsample) == set(allowed_features) + + # Make sure, our counterfactual always splits on same feature. 
+ assert len(set(split_features_all)) == 1 diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py new file mode 100644 index 0000000000000000000000000000000000000000..03a2720b36127108e06537a3f4a85c5b9d4e7701 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py @@ -0,0 +1,231 @@ +import numpy as np +import pytest +from numpy.testing import assert_allclose, assert_array_equal + +from sklearn.base import clone +from sklearn.datasets import make_classification, make_regression +from sklearn.ensemble import ( + HistGradientBoostingClassifier, + HistGradientBoostingRegressor, +) +from sklearn.metrics import check_scoring + +X_classification, y_classification = make_classification(random_state=0) +X_regression, y_regression = make_regression(random_state=0) + + +def _assert_predictor_equal(gb_1, gb_2, X): + """Assert that two HistGBM instances are identical.""" + # Check identical nodes for each tree + for pred_ith_1, pred_ith_2 in zip(gb_1._predictors, gb_2._predictors): + for predictor_1, predictor_2 in zip(pred_ith_1, pred_ith_2): + assert_array_equal(predictor_1.nodes, predictor_2.nodes) + + # Check identical predictions + assert_allclose(gb_1.predict(X), gb_2.predict(X)) + + +@pytest.mark.parametrize( + "GradientBoosting, X, y", + [ + (HistGradientBoostingClassifier, X_classification, y_classification), + (HistGradientBoostingRegressor, X_regression, y_regression), + ], +) +def test_max_iter_with_warm_start_validation(GradientBoosting, X, y): + # Check that a ValueError is raised when the maximum number of iterations + # is smaller than the number of iterations from the previous fit when warm + # start is True. + + estimator = GradientBoosting(max_iter=10, early_stopping=False, warm_start=True) + estimator.fit(X, y) + estimator.set_params(max_iter=5) + err_msg = ( + "max_iter=5 must be larger than or equal to n_iter_=10 when warm_start==True" + ) + with pytest.raises(ValueError, match=err_msg): + estimator.fit(X, y) + + +@pytest.mark.parametrize( + "GradientBoosting, X, y", + [ + (HistGradientBoostingClassifier, X_classification, y_classification), + (HistGradientBoostingRegressor, X_regression, y_regression), + ], +) +def test_warm_start_yields_identical_results(GradientBoosting, X, y): + # Make sure that fitting 50 iterations and then 25 with warm start is + # equivalent to fitting 75 iterations. + + rng = 42 + gb_warm_start = GradientBoosting( + n_iter_no_change=100, max_iter=50, random_state=rng, warm_start=True + ) + gb_warm_start.fit(X, y).set_params(max_iter=75).fit(X, y) + + gb_no_warm_start = GradientBoosting( + n_iter_no_change=100, max_iter=75, random_state=rng, warm_start=False + ) + gb_no_warm_start.fit(X, y) + + # Check that both predictors are equal + _assert_predictor_equal(gb_warm_start, gb_no_warm_start, X) + + +@pytest.mark.parametrize( + "GradientBoosting, X, y", + [ + (HistGradientBoostingClassifier, X_classification, y_classification), + (HistGradientBoostingRegressor, X_regression, y_regression), + ], +) +def test_warm_start_max_depth(GradientBoosting, X, y): + # Test if possible to fit trees of different depth in ensemble. 
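# Context for the checks below: with warm_start=True, a second call to fit()
# keeps the previously grown predictors and only adds the missing
# (max_iter - n_iter_) iterations, so parameters changed between the two fits
# (here max_depth) are only expected to affect the newly grown trees.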
+ gb = GradientBoosting( + max_iter=20, + min_samples_leaf=1, + warm_start=True, + max_depth=2, + early_stopping=False, + ) + gb.fit(X, y) + gb.set_params(max_iter=30, max_depth=3, n_iter_no_change=110) + gb.fit(X, y) + + # First 20 trees have max_depth == 2 + for i in range(20): + assert gb._predictors[i][0].get_max_depth() == 2 + # Last 10 trees have max_depth == 3 + for i in range(1, 11): + assert gb._predictors[-i][0].get_max_depth() == 3 + + +@pytest.mark.parametrize( + "GradientBoosting, X, y", + [ + (HistGradientBoostingClassifier, X_classification, y_classification), + (HistGradientBoostingRegressor, X_regression, y_regression), + ], +) +@pytest.mark.parametrize("scoring", (None, "loss")) +def test_warm_start_early_stopping(GradientBoosting, X, y, scoring): + # Make sure that early stopping occurs after a small number of iterations + # when fitting a second time with warm starting. + + n_iter_no_change = 5 + gb = GradientBoosting( + n_iter_no_change=n_iter_no_change, + max_iter=10000, + early_stopping=True, + random_state=42, + warm_start=True, + tol=1e-3, + scoring=scoring, + ) + gb.fit(X, y) + n_iter_first_fit = gb.n_iter_ + gb.fit(X, y) + n_iter_second_fit = gb.n_iter_ + assert 0 < n_iter_second_fit - n_iter_first_fit < n_iter_no_change + + +@pytest.mark.parametrize( + "GradientBoosting, X, y", + [ + (HistGradientBoostingClassifier, X_classification, y_classification), + (HistGradientBoostingRegressor, X_regression, y_regression), + ], +) +def test_warm_start_equal_n_estimators(GradientBoosting, X, y): + # Test if warm start with equal n_estimators does nothing + gb_1 = GradientBoosting(max_depth=2, early_stopping=False) + gb_1.fit(X, y) + + gb_2 = clone(gb_1) + gb_2.set_params(max_iter=gb_1.max_iter, warm_start=True, n_iter_no_change=5) + gb_2.fit(X, y) + + # Check that both predictors are equal + _assert_predictor_equal(gb_1, gb_2, X) + + +@pytest.mark.parametrize( + "GradientBoosting, X, y", + [ + (HistGradientBoostingClassifier, X_classification, y_classification), + (HistGradientBoostingRegressor, X_regression, y_regression), + ], +) +def test_warm_start_clear(GradientBoosting, X, y): + # Test if fit clears state. + gb_1 = GradientBoosting(n_iter_no_change=5, random_state=42) + gb_1.fit(X, y) + + gb_2 = GradientBoosting(n_iter_no_change=5, random_state=42, warm_start=True) + gb_2.fit(X, y) # inits state + gb_2.set_params(warm_start=False) + gb_2.fit(X, y) # clears old state and equals est + + # Check that both predictors have the same train_score_ and + # validation_score_ attributes + assert_allclose(gb_1.train_score_, gb_2.train_score_) + assert_allclose(gb_1.validation_score_, gb_2.validation_score_) + + # Check that both predictors are equal + _assert_predictor_equal(gb_1, gb_2, X) + + +@pytest.mark.parametrize( + "GradientBoosting, X, y", + [ + (HistGradientBoostingClassifier, X_classification, y_classification), + (HistGradientBoostingRegressor, X_regression, y_regression), + ], +) +@pytest.mark.parametrize("rng_type", ("none", "int", "instance")) +def test_random_seeds_warm_start(GradientBoosting, X, y, rng_type): + # Make sure the seeds for train/val split and small trainset subsampling + # are correctly set in a warm start context. 
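# The assertions below rely on how the estimator derives its internal seed.
# Roughly, and only as a sketch (assuming a single integer seed is drawn from
# random_state at the start of every non-warm-started fit and reused on
# warm-started refits), a hypothetical helper capturing that policy:
import numpy as np
from sklearn.utils import check_random_state

def fit_seed(random_state, warm_start, previous_seed=None):
    if warm_start and previous_seed is not None:
        return previous_seed  # warm start: reuse the seed drawn by the first fit
    rng = check_random_state(random_state)
    # None -> global RNG (new draw each fit); int -> same draw each cold fit;
    # RandomState instance -> its state advances, so repeated cold fits differ
    return rng.randint(np.iinfo(np.uint32).max)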
+ def _get_rng(rng_type): + # Helper to avoid consuming rngs + if rng_type == "none": + return None + elif rng_type == "int": + return 42 + else: + return np.random.RandomState(0) + + random_state = _get_rng(rng_type) + gb_1 = GradientBoosting(early_stopping=True, max_iter=2, random_state=random_state) + gb_1.set_params(scoring=check_scoring(gb_1)) + gb_1.fit(X, y) + random_seed_1_1 = gb_1._random_seed + + gb_1.fit(X, y) + random_seed_1_2 = gb_1._random_seed # clear the old state, different seed + + random_state = _get_rng(rng_type) + gb_2 = GradientBoosting( + early_stopping=True, max_iter=2, random_state=random_state, warm_start=True + ) + gb_2.set_params(scoring=check_scoring(gb_2)) + gb_2.fit(X, y) # inits state + random_seed_2_1 = gb_2._random_seed + gb_2.fit(X, y) # clears old state and equals est + random_seed_2_2 = gb_2._random_seed + + # Without warm starting, the seeds should be + # * all different if random state is None + # * all equal if random state is an integer + # * different when refitting and equal with a new estimator (because + # the random state is mutated) + if rng_type == "none": + assert random_seed_1_1 != random_seed_1_2 != random_seed_2_1 + elif rng_type == "int": + assert random_seed_1_1 == random_seed_1_2 == random_seed_2_1 + else: + assert random_seed_1_1 == random_seed_2_1 != random_seed_1_2 + + # With warm starting, the seeds must be equal + assert random_seed_2_1 == random_seed_2_2 diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/utils.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..429fbed611c22952154d1083152a3af69ba1ca07 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/utils.py @@ -0,0 +1,149 @@ +"""This module contains utility routines.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ...base import is_classifier +from .binning import _BinMapper + + +def get_equivalent_estimator(estimator, lib="lightgbm", n_classes=None): + """Return an unfitted estimator from another lib with matching hyperparams. + + This utility function takes care of renaming the sklearn parameters into + their LightGBM, XGBoost or CatBoost equivalent parameters. + + # unmapped XGB parameters: + # - min_samples_leaf + # - min_data_in_bin + # - min_split_gain (there is min_split_loss though?) + + # unmapped Catboost parameters: + # max_leaves + # min_* + """ + + if lib not in ("lightgbm", "xgboost", "catboost"): + raise ValueError( + "accepted libs are lightgbm, xgboost, and catboost. got {}".format(lib) + ) + + sklearn_params = estimator.get_params() + + if sklearn_params["loss"] == "auto": + raise ValueError( + "auto loss is not accepted. We need to know if " + "the problem is binary or multiclass classification." 
+ ) + if sklearn_params["early_stopping"]: + raise NotImplementedError("Early stopping should be deactivated.") + + lightgbm_loss_mapping = { + "squared_error": "regression_l2", + "absolute_error": "regression_l1", + "log_loss": "binary" if n_classes == 2 else "multiclass", + "gamma": "gamma", + "poisson": "poisson", + } + + lightgbm_params = { + "objective": lightgbm_loss_mapping[sklearn_params["loss"]], + "learning_rate": sklearn_params["learning_rate"], + "n_estimators": sklearn_params["max_iter"], + "num_leaves": sklearn_params["max_leaf_nodes"], + "max_depth": sklearn_params["max_depth"], + "min_data_in_leaf": sklearn_params["min_samples_leaf"], + "reg_lambda": sklearn_params["l2_regularization"], + "max_bin": sklearn_params["max_bins"], + "min_data_in_bin": 1, + "min_sum_hessian_in_leaf": 1e-3, + "min_split_gain": 0, + "verbosity": 10 if sklearn_params["verbose"] else -10, + "boost_from_average": True, + "enable_bundle": False, # also makes feature order consistent + "subsample_for_bin": _BinMapper().subsample, + "poisson_max_delta_step": 1e-12, + "feature_fraction_bynode": sklearn_params["max_features"], + } + + if sklearn_params["loss"] == "log_loss" and n_classes > 2: + # LightGBM multiplies hessians by 2 in multiclass loss. + lightgbm_params["min_sum_hessian_in_leaf"] *= 2 + # LightGBM 3.0 introduced a different scaling of the hessian for the multiclass + # case. + # It is equivalent of scaling the learning rate. + # See https://github.com/microsoft/LightGBM/pull/3256. + if n_classes is not None: + lightgbm_params["learning_rate"] *= n_classes / (n_classes - 1) + + # XGB + xgboost_loss_mapping = { + "squared_error": "reg:linear", + "absolute_error": "LEAST_ABSOLUTE_DEV_NOT_SUPPORTED", + "log_loss": "reg:logistic" if n_classes == 2 else "multi:softmax", + "gamma": "reg:gamma", + "poisson": "count:poisson", + } + + xgboost_params = { + "tree_method": "hist", + "grow_policy": "lossguide", # so that we can set max_leaves + "objective": xgboost_loss_mapping[sklearn_params["loss"]], + "learning_rate": sklearn_params["learning_rate"], + "n_estimators": sklearn_params["max_iter"], + "max_leaves": sklearn_params["max_leaf_nodes"], + "max_depth": sklearn_params["max_depth"] or 0, + "lambda": sklearn_params["l2_regularization"], + "max_bin": sklearn_params["max_bins"], + "min_child_weight": 1e-3, + "verbosity": 2 if sklearn_params["verbose"] else 0, + "silent": sklearn_params["verbose"] == 0, + "n_jobs": -1, + "colsample_bynode": sklearn_params["max_features"], + } + + # Catboost + catboost_loss_mapping = { + "squared_error": "RMSE", + # catboost does not support MAE when leaf_estimation_method is Newton + "absolute_error": "LEAST_ASBOLUTE_DEV_NOT_SUPPORTED", + "log_loss": "Logloss" if n_classes == 2 else "MultiClass", + "gamma": None, + "poisson": "Poisson", + } + + catboost_params = { + "loss_function": catboost_loss_mapping[sklearn_params["loss"]], + "learning_rate": sklearn_params["learning_rate"], + "iterations": sklearn_params["max_iter"], + "depth": sklearn_params["max_depth"], + "reg_lambda": sklearn_params["l2_regularization"], + "max_bin": sklearn_params["max_bins"], + "feature_border_type": "Median", + "leaf_estimation_method": "Newton", + "verbose": bool(sklearn_params["verbose"]), + } + + if lib == "lightgbm": + from lightgbm import LGBMClassifier, LGBMRegressor + + if is_classifier(estimator): + return LGBMClassifier(**lightgbm_params) + else: + return LGBMRegressor(**lightgbm_params) + + elif lib == "xgboost": + from xgboost import XGBClassifier, XGBRegressor + + if 
is_classifier(estimator): + return XGBClassifier(**xgboost_params) + else: + return XGBRegressor(**xgboost_params) + + else: + from catboost import CatBoostClassifier, CatBoostRegressor + + if is_classifier(estimator): + return CatBoostClassifier(**catboost_params) + else: + return CatBoostRegressor(**catboost_params) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_iforest.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_iforest.py new file mode 100644 index 0000000000000000000000000000000000000000..4e5287af7f699992d21c1881beb039dfb0f6dcc0 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_iforest.py @@ -0,0 +1,673 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numbers +import threading +from numbers import Integral, Real +from warnings import warn + +import numpy as np +from scipy.sparse import issparse + +from ..base import OutlierMixin, _fit_context +from ..tree import ExtraTreeRegressor +from ..tree._tree import DTYPE as tree_dtype +from ..utils import ( + check_array, + check_random_state, + gen_batches, +) +from ..utils._chunking import get_chunk_n_rows +from ..utils._param_validation import Interval, RealNotInt, StrOptions +from ..utils.parallel import Parallel, delayed +from ..utils.validation import _num_samples, check_is_fitted, validate_data +from ._bagging import BaseBagging + +__all__ = ["IsolationForest"] + + +def _parallel_compute_tree_depths( + tree, + X, + features, + tree_decision_path_lengths, + tree_avg_path_lengths, + depths, + lock, +): + """Parallel computation of isolation tree depth.""" + if features is None: + X_subset = X + else: + X_subset = X[:, features] + + leaves_index = tree.apply(X_subset, check_input=False) + + with lock: + depths += ( + tree_decision_path_lengths[leaves_index] + + tree_avg_path_lengths[leaves_index] + - 1.0 + ) + + +class IsolationForest(OutlierMixin, BaseBagging): + """ + Isolation Forest Algorithm. + + Return the anomaly score of each sample using the IsolationForest algorithm + + The IsolationForest 'isolates' observations by randomly selecting a feature + and then randomly selecting a split value between the maximum and minimum + values of the selected feature. + + Since recursive partitioning can be represented by a tree structure, the + number of splittings required to isolate a sample is equivalent to the path + length from the root node to the terminating node. + + This path length, averaged over a forest of such random trees, is a + measure of normality and our decision function. + + Random partitioning produces noticeably shorter paths for anomalies. + Hence, when a forest of random trees collectively produce shorter path + lengths for particular samples, they are highly likely to be anomalies. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.18 + + Parameters + ---------- + n_estimators : int, default=100 + The number of base estimators in the ensemble. + + max_samples : "auto", int or float, default="auto" + The number of samples to draw from X to train each base estimator. + + - If int, then draw `max_samples` samples. + - If float, then draw `max_samples * X.shape[0]` samples. + - If "auto", then `max_samples=min(256, n_samples)`. + + If max_samples is larger than the number of samples provided, + all samples will be used for all trees (no sampling). + + contamination : 'auto' or float, default='auto' + The amount of contamination of the data set, i.e. the proportion + of outliers in the data set. 
Used when fitting to define the threshold + on the scores of the samples. + + - If 'auto', the threshold is determined as in the + original paper. + - If float, the contamination should be in the range (0, 0.5]. + + .. versionchanged:: 0.22 + The default value of ``contamination`` changed from 0.1 + to ``'auto'``. + + max_features : int or float, default=1.0 + The number of features to draw from X to train each base estimator. + + - If int, then draw `max_features` features. + - If float, then draw `max(1, int(max_features * n_features_in_))` features. + + Note: using a float number less than 1.0 or integer less than number of + features will enable feature subsampling and leads to a longer runtime. + + bootstrap : bool, default=False + If True, individual trees are fit on random subsets of the training + data sampled with replacement. If False, sampling without replacement + is performed. + + n_jobs : int, default=None + The number of jobs to run in parallel for :meth:`fit`. ``None`` means 1 + unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using + all processors. See :term:`Glossary ` for more details. + + random_state : int, RandomState instance or None, default=None + Controls the pseudo-randomness of the selection of the feature + and split values for each branching step and each tree in the forest. + + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + verbose : int, default=0 + Controls the verbosity of the tree building process. + + warm_start : bool, default=False + When set to ``True``, reuse the solution of the previous call to fit + and add more estimators to the ensemble, otherwise, just fit a whole + new forest. See :term:`the Glossary `. + + .. versionadded:: 0.21 + + Attributes + ---------- + estimator_ : :class:`~sklearn.tree.ExtraTreeRegressor` instance + The child estimator template used to create the collection of + fitted sub-estimators. + + .. versionadded:: 1.2 + `base_estimator_` was renamed to `estimator_`. + + estimators_ : list of ExtraTreeRegressor instances + The collection of fitted sub-estimators. + + estimators_features_ : list of ndarray + The subset of drawn features for each base estimator. + + estimators_samples_ : list of ndarray + The subset of drawn samples (i.e., the in-bag samples) for each base + estimator. + + max_samples_ : int + The actual number of samples. + + offset_ : float + Offset used to define the decision function from the raw scores. We + have the relation: ``decision_function = score_samples - offset_``. + ``offset_`` is defined as follows. When the contamination parameter is + set to "auto", the offset is equal to -0.5 as the scores of inliers are + close to 0 and the scores of outliers are close to -1. When a + contamination parameter different than "auto" is provided, the offset + is defined in such a way we obtain the expected number of outliers + (samples with decision function < 0) in training. + + .. versionadded:: 0.20 + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + sklearn.covariance.EllipticEnvelope : An object for detecting outliers in a + Gaussian distributed dataset. + sklearn.svm.OneClassSVM : Unsupervised Outlier Detection. + Estimate the support of a high-dimensional distribution. 
+ The implementation is based on libsvm. + sklearn.neighbors.LocalOutlierFactor : Unsupervised Outlier Detection + using Local Outlier Factor (LOF). + + Notes + ----- + The implementation is based on an ensemble of ExtraTreeRegressor. The + maximum depth of each tree is set to ``ceil(log_2(n))`` where + :math:`n` is the number of samples used to build the tree + (see (Liu et al., 2008) for more details). + + References + ---------- + .. [1] Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua. "Isolation forest." + Data Mining, 2008. ICDM'08. Eighth IEEE International Conference on. + .. [2] Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua. "Isolation-based + anomaly detection." ACM Transactions on Knowledge Discovery from + Data (TKDD) 6.1 (2012): 3. + + Examples + -------- + >>> from sklearn.ensemble import IsolationForest + >>> X = [[-1.1], [0.3], [0.5], [100]] + >>> clf = IsolationForest(random_state=0).fit(X) + >>> clf.predict([[0.1], [0], [90]]) + array([ 1, 1, -1]) + + For an example of using isolation forest for anomaly detection see + :ref:`sphx_glr_auto_examples_ensemble_plot_isolation_forest.py`. + """ + + _parameter_constraints: dict = { + "n_estimators": [Interval(Integral, 1, None, closed="left")], + "max_samples": [ + StrOptions({"auto"}), + Interval(Integral, 1, None, closed="left"), + Interval(RealNotInt, 0, 1, closed="right"), + ], + "contamination": [ + StrOptions({"auto"}), + Interval(Real, 0, 0.5, closed="right"), + ], + "max_features": [ + Integral, + Interval(Real, 0, 1, closed="right"), + ], + "bootstrap": ["boolean"], + "n_jobs": [Integral, None], + "random_state": ["random_state"], + "verbose": ["verbose"], + "warm_start": ["boolean"], + } + + def __init__( + self, + *, + n_estimators=100, + max_samples="auto", + contamination="auto", + max_features=1.0, + bootstrap=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + ): + super().__init__( + estimator=None, + # here above max_features has no links with self.max_features + bootstrap=bootstrap, + bootstrap_features=False, + n_estimators=n_estimators, + max_samples=max_samples, + max_features=max_features, + warm_start=warm_start, + n_jobs=n_jobs, + random_state=random_state, + verbose=verbose, + ) + + self.contamination = contamination + + def _get_estimator(self): + return ExtraTreeRegressor( + # here max_features has no links with self.max_features + max_features=1, + splitter="random", + random_state=self.random_state, + ) + + def _set_oob_score(self, X, y): + raise NotImplementedError("OOB score not supported by iforest") + + def _parallel_args(self): + # ExtraTreeRegressor releases the GIL, so it's more efficient to use + # a thread-based backend rather than a process-based backend so as + # to avoid suffering from communication overhead and extra memory + # copies. This is only used in the fit method. + return {"prefer": "threads"} + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None, sample_weight=None): + """ + Fit estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Use ``dtype=np.float32`` for maximum + efficiency. Sparse matrices are also supported, use sparse + ``csc_matrix`` for maximum efficiency. + + y : Ignored + Not used, present for API consistency by convention. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. + + Returns + ------- + self : object + Fitted estimator. 
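# A worked instance of the sampling logic implemented in this method (illustrative,
# with an assumed training-set size): for the default ``max_samples="auto"`` and
# 10_000 training rows, ``max_samples_`` becomes min(256, 10_000) == 256 and each
# isolation tree is grown with ``max_depth = ceil(log2(256)) == 8``, matching the
# ``ceil(log_2(n))`` bound quoted in the Notes section of the class docstring.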
+ """ + X = validate_data( + self, X, accept_sparse=["csc"], dtype=tree_dtype, ensure_all_finite=False + ) + if issparse(X): + # Pre-sort indices to avoid that each individual tree of the + # ensemble sorts the indices. + X.sort_indices() + + rnd = check_random_state(self.random_state) + y = rnd.uniform(size=X.shape[0]) + + # ensure that max_sample is in [1, n_samples]: + n_samples = X.shape[0] + + if isinstance(self.max_samples, str) and self.max_samples == "auto": + max_samples = min(256, n_samples) + + elif isinstance(self.max_samples, numbers.Integral): + if self.max_samples > n_samples: + warn( + "max_samples (%s) is greater than the " + "total number of samples (%s). max_samples " + "will be set to n_samples for estimation." + % (self.max_samples, n_samples) + ) + max_samples = n_samples + else: + max_samples = self.max_samples + else: # max_samples is float + max_samples = int(self.max_samples * X.shape[0]) + + self.max_samples_ = max_samples + max_depth = int(np.ceil(np.log2(max(max_samples, 2)))) + super()._fit( + X, + y, + max_samples, + max_depth=max_depth, + sample_weight=sample_weight, + check_input=False, + ) + + self._average_path_length_per_tree, self._decision_path_lengths = zip( + *[ + ( + _average_path_length(tree.tree_.n_node_samples), + tree.tree_.compute_node_depths(), + ) + for tree in self.estimators_ + ] + ) + + if self.contamination == "auto": + # 0.5 plays a special role as described in the original paper. + # we take the opposite as we consider the opposite of their score. + self.offset_ = -0.5 + return self + + # Else, define offset_ wrt contamination parameter + # To avoid performing input validation a second time we call + # _score_samples rather than score_samples. + # _score_samples expects a CSR matrix, so we convert if necessary. + if issparse(X): + X = X.tocsr() + self.offset_ = np.percentile(self._score_samples(X), 100.0 * self.contamination) + + return self + + def predict(self, X): + """ + Predict if a particular sample is an outlier or not. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csr_matrix``. + + Returns + ------- + is_inlier : ndarray of shape (n_samples,) + For each observation, tells whether or not (+1 or -1) it should + be considered as an inlier according to the fitted model. + + Notes + ----- + The predict method can be parallelized by setting a joblib context. This + inherently does NOT use the ``n_jobs`` parameter initialized in the class, + which is used during ``fit``. This is because, predict may actually be faster + without parallelization for a small number of samples, + such as for 1000 samples or less. The user can set the + number of jobs in the joblib context to control the number of parallel jobs. + + .. code-block:: python + + from joblib import parallel_backend + + # Note, we use threading here as the predict method is not CPU bound. + with parallel_backend("threading", n_jobs=4): + model.predict(X) + """ + check_is_fitted(self) + decision_func = self.decision_function(X) + is_inlier = np.ones_like(decision_func, dtype=int) + is_inlier[decision_func < 0] = -1 + return is_inlier + + def decision_function(self, X): + """ + Average anomaly score of X of the base classifiers. + + The anomaly score of an input sample is computed as + the mean anomaly score of the trees in the forest. 
+ + The measure of normality of an observation given a tree is the depth + of the leaf containing this observation, which is equivalent to + the number of splittings required to isolate this point. In case of + several observations n_left in the leaf, the average path length of + a n_left samples isolation tree is added. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csr_matrix``. + + Returns + ------- + scores : ndarray of shape (n_samples,) + The anomaly score of the input samples. + The lower, the more abnormal. Negative scores represent outliers, + positive scores represent inliers. + + Notes + ----- + The decision_function method can be parallelized by setting a joblib context. + This inherently does NOT use the ``n_jobs`` parameter initialized in the class, + which is used during ``fit``. This is because, calculating the score may + actually be faster without parallelization for a small number of samples, + such as for 1000 samples or less. + The user can set the number of jobs in the joblib context to control the + number of parallel jobs. + + .. code-block:: python + + from joblib import parallel_backend + + # Note, we use threading here as the decision_function method is + # not CPU bound. + with parallel_backend("threading", n_jobs=4): + model.decision_function(X) + """ + # We subtract self.offset_ to make 0 be the threshold value for being + # an outlier: + + return self.score_samples(X) - self.offset_ + + def score_samples(self, X): + """ + Opposite of the anomaly score defined in the original paper. + + The anomaly score of an input sample is computed as + the mean anomaly score of the trees in the forest. + + The measure of normality of an observation given a tree is the depth + of the leaf containing this observation, which is equivalent to + the number of splittings required to isolate this point. In case of + several observations n_left in the leaf, the average path length of + a n_left samples isolation tree is added. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. + + Returns + ------- + scores : ndarray of shape (n_samples,) + The anomaly score of the input samples. + The lower, the more abnormal. + + Notes + ----- + The score function method can be parallelized by setting a joblib context. This + inherently does NOT use the ``n_jobs`` parameter initialized in the class, + which is used during ``fit``. This is because, calculating the score may + actually be faster without parallelization for a small number of samples, + such as for 1000 samples or less. + The user can set the number of jobs in the joblib context to control the + number of parallel jobs. + + .. code-block:: python + + from joblib import parallel_backend + + # Note, we use threading here as the score_samples method is not CPU bound. + with parallel_backend("threading", n_jobs=4): + model.score(X) + """ + # Check data + X = validate_data( + self, + X, + accept_sparse="csr", + dtype=tree_dtype, + reset=False, + ensure_all_finite=False, + ) + + return self._score_samples(X) + + def _score_samples(self, X): + """Private version of score_samples without input validation. + + Input validation would remove feature names, so we disable it. 
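# Minimal sketch of how the scoring methods above relate (assumes ``X_train`` and
# ``X_test`` arrays are available from the surrounding context):
#
#     import numpy as np
#     from sklearn.ensemble import IsolationForest
#     iso = IsolationForest(random_state=0).fit(X_train)
#     np.allclose(iso.decision_function(X_test),
#                 iso.score_samples(X_test) - iso.offset_)   # True by construction
#
# With ``contamination="auto"`` the offset is -0.5; with a float contamination it is
# the corresponding percentile of the training scores, so that the expected share of
# training points receives a negative decision value.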
+ """ + # Code structure from ForestClassifier/predict_proba + + check_is_fitted(self) + + # Take the opposite of the scores as bigger is better (here less abnormal) + return -self._compute_chunked_score_samples(X) + + def _compute_chunked_score_samples(self, X): + n_samples = _num_samples(X) + + if self._max_features == X.shape[1]: + subsample_features = False + else: + subsample_features = True + + # We get as many rows as possible within our working_memory budget + # (defined by sklearn.get_config()['working_memory']) to store + # self._max_features in each row during computation. + # + # Note: + # - this will get at least 1 row, even if 1 row of score will + # exceed working_memory. + # - this does only account for temporary memory usage while loading + # the data needed to compute the scores -- the returned scores + # themselves are 1D. + + chunk_n_rows = get_chunk_n_rows( + row_bytes=16 * self._max_features, max_n_rows=n_samples + ) + slices = gen_batches(n_samples, chunk_n_rows) + + scores = np.zeros(n_samples, order="f") + + for sl in slices: + # compute score on the slices of test samples: + scores[sl] = self._compute_score_samples(X[sl], subsample_features) + + return scores + + def _compute_score_samples(self, X, subsample_features): + """ + Compute the score of each samples in X going through the extra trees. + + Parameters + ---------- + X : array-like or sparse matrix + Data matrix. + + subsample_features : bool + Whether features should be subsampled. + + Returns + ------- + scores : ndarray of shape (n_samples,) + The score of each sample in X. + """ + n_samples = X.shape[0] + + depths = np.zeros(n_samples, order="f") + + average_path_length_max_samples = _average_path_length([self._max_samples]) + + # Note: we use default n_jobs value, i.e. sequential computation, which + # we expect to be more performant that parallelizing for small number + # of samples, e.g. < 1k samples. Default n_jobs value can be overridden + # by using joblib.parallel_backend context manager around + # ._compute_score_samples. Using a higher n_jobs may speed up the + # computation of the scores, e.g. for > 1k samples. See + # https://github.com/scikit-learn/scikit-learn/pull/28622 for more + # details. + lock = threading.Lock() + Parallel( + verbose=self.verbose, + require="sharedmem", + )( + delayed(_parallel_compute_tree_depths)( + tree, + X, + features if subsample_features else None, + self._decision_path_lengths[tree_idx], + self._average_path_length_per_tree[tree_idx], + depths, + lock, + ) + for tree_idx, (tree, features) in enumerate( + zip(self.estimators_, self.estimators_features_) + ) + ) + + denominator = len(self.estimators_) * average_path_length_max_samples + scores = 2 ** ( + # For a single training sample, denominator and depth are 0. + # Therefore, we set the score manually to 1. + -np.divide( + depths, denominator, out=np.ones_like(depths), where=denominator != 0 + ) + ) + return scores + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + return tags + + +def _average_path_length(n_samples_leaf): + """ + The average path length in a n_samples iTree, which is equal to + the average path length of an unsuccessful BST search since the + latter has the same structure as an isolation tree. + Parameters + ---------- + n_samples_leaf : array-like of shape (n_samples,) + The number of training samples in each test sample leaf, for + each estimators. 
+ + Returns + ------- + average_path_length : ndarray of shape (n_samples,) + """ + + n_samples_leaf = check_array(n_samples_leaf, ensure_2d=False) + + n_samples_leaf_shape = n_samples_leaf.shape + n_samples_leaf = n_samples_leaf.reshape((1, -1)) + average_path_length = np.zeros(n_samples_leaf.shape) + + mask_1 = n_samples_leaf <= 1 + mask_2 = n_samples_leaf == 2 + not_mask = ~np.logical_or(mask_1, mask_2) + + average_path_length[mask_1] = 0.0 + average_path_length[mask_2] = 1.0 + average_path_length[not_mask] = ( + 2.0 * (np.log(n_samples_leaf[not_mask] - 1.0) + np.euler_gamma) + - 2.0 * (n_samples_leaf[not_mask] - 1.0) / n_samples_leaf[not_mask] + ) + + return average_path_length.reshape(n_samples_leaf_shape) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_stacking.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_stacking.py new file mode 100644 index 0000000000000000000000000000000000000000..2894d8f174c13a7f54607e4b56717381464cf94f --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_stacking.py @@ -0,0 +1,1145 @@ +"""Stacking classifier and regressor.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from abc import ABCMeta, abstractmethod +from copy import deepcopy +from numbers import Integral + +import numpy as np +import scipy.sparse as sparse + +from ..base import ( + ClassifierMixin, + RegressorMixin, + TransformerMixin, + _fit_context, + clone, + is_classifier, + is_regressor, +) +from ..exceptions import NotFittedError +from ..linear_model import LogisticRegression, RidgeCV +from ..model_selection import check_cv, cross_val_predict +from ..preprocessing import LabelEncoder +from ..utils import Bunch +from ..utils._param_validation import HasMethods, StrOptions +from ..utils._repr_html.estimator import _VisualBlock +from ..utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + _raise_for_params, + _routing_enabled, + process_routing, +) +from ..utils.metaestimators import available_if +from ..utils.multiclass import check_classification_targets, type_of_target +from ..utils.parallel import Parallel, delayed +from ..utils.validation import ( + _check_feature_names_in, + _check_response_method, + _estimator_has, + check_is_fitted, + column_or_1d, +) +from ._base import _BaseHeterogeneousEnsemble, _fit_single_estimator + + +class _BaseStacking(TransformerMixin, _BaseHeterogeneousEnsemble, metaclass=ABCMeta): + """Base class for stacking method.""" + + _parameter_constraints: dict = { + "estimators": [list], + "final_estimator": [None, HasMethods("fit")], + "cv": ["cv_object", StrOptions({"prefit"})], + "n_jobs": [None, Integral], + "passthrough": ["boolean"], + "verbose": ["verbose"], + } + + @abstractmethod + def __init__( + self, + estimators, + final_estimator=None, + *, + cv=None, + stack_method="auto", + n_jobs=None, + verbose=0, + passthrough=False, + ): + super().__init__(estimators=estimators) + self.final_estimator = final_estimator + self.cv = cv + self.stack_method = stack_method + self.n_jobs = n_jobs + self.verbose = verbose + self.passthrough = passthrough + + def _clone_final_estimator(self, default): + if self.final_estimator is not None: + self.final_estimator_ = clone(self.final_estimator) + else: + self.final_estimator_ = clone(default) + + def _concatenate_predictions(self, X, predictions): + """Concatenate the predictions of each first layer learner and + possibly the input dataset `X`. 
+ + If `X` is sparse and `self.passthrough` is False, the output of + `transform` will be dense (the predictions). If `X` is sparse + and `self.passthrough` is True, the output of `transform` will + be sparse. + + This helper is in charge of ensuring the predictions are 2D arrays and + it will drop one of the probability column when using probabilities + in the binary case. Indeed, the p(y|c=0) = 1 - p(y|c=1) + + When `y` type is `"multilabel-indicator"`` and the method used is + `predict_proba`, `preds` can be either a `ndarray` of shape + `(n_samples, n_class)` or for some estimators a list of `ndarray`. + This function will drop one of the probability column in this situation as well. + """ + X_meta = [] + for est_idx, preds in enumerate(predictions): + if isinstance(preds, list): + # `preds` is here a list of `n_targets` 2D ndarrays of + # `n_classes` columns. The k-th column contains the + # probabilities of the samples belonging the k-th class. + # + # Since those probabilities must sum to one for each sample, + # we can work with probabilities of `n_classes - 1` classes. + # Hence we drop the first column. + for pred in preds: + X_meta.append(pred[:, 1:]) + elif preds.ndim == 1: + # Some estimator return a 1D array for predictions + # which must be 2-dimensional arrays. + X_meta.append(preds.reshape(-1, 1)) + elif ( + self.stack_method_[est_idx] == "predict_proba" + and len(self.classes_) == 2 + ): + # Remove the first column when using probabilities in + # binary classification because both features `preds` are perfectly + # collinear. + X_meta.append(preds[:, 1:]) + else: + X_meta.append(preds) + + self._n_feature_outs = [pred.shape[1] for pred in X_meta] + if self.passthrough: + X_meta.append(X) + if sparse.issparse(X): + return sparse.hstack(X_meta, format=X.format) + + return np.hstack(X_meta) + + @staticmethod + def _method_name(name, estimator, method): + if estimator == "drop": + return None + if method == "auto": + method = ["predict_proba", "decision_function", "predict"] + try: + method_name = _check_response_method(estimator, method).__name__ + except AttributeError as e: + raise ValueError( + f"Underlying estimator {name} does not implement the method {method}." + ) from e + + return method_name + + @_fit_context( + # estimators in Stacking*.estimators are not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y, **fit_params): + """Fit the estimators. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) + Target values. + + **fit_params : dict + Dict of metadata, potentially containing sample_weight as a + key-value pair. If sample_weight is not present, then samples are + equally weighted. Note that sample_weight is supported only if all + underlying estimators support sample weights. + + .. versionadded:: 1.6 + + Returns + ------- + self : object + """ + # all_estimators contains all estimators, the one to be fitted and the + # 'drop' string. 
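# Shape sketch for the meta-features built by ``_concatenate_predictions`` above
# (hypothetical arrays ``proba_rf`` / ``proba_svc`` of shape (n_samples, 2)):
#
#     X_meta = np.hstack([proba_rf[:, 1:], proba_svc[:, 1:]])   # (n_samples, 2)
#
# i.e. for a binary problem only p(y = classes_[1]) is kept per estimator, and with
# ``passthrough=True`` the original ``X`` columns are appended after these.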
+ names, all_estimators = self._validate_estimators() + self._validate_final_estimator() + + stack_method = [self.stack_method] * len(all_estimators) + + if _routing_enabled(): + routed_params = process_routing(self, "fit", **fit_params) + else: + routed_params = Bunch() + for name in names: + routed_params[name] = Bunch(fit={}) + if "sample_weight" in fit_params: + routed_params[name].fit["sample_weight"] = fit_params[ + "sample_weight" + ] + + if self.cv == "prefit": + self.estimators_ = [] + for estimator in all_estimators: + if estimator != "drop": + check_is_fitted(estimator) + self.estimators_.append(estimator) + else: + # Fit the base estimators on the whole training data. Those + # base estimators will be used in transform, predict, and + # predict_proba. They are exposed publicly. + self.estimators_ = Parallel(n_jobs=self.n_jobs)( + delayed(_fit_single_estimator)( + clone(est), X, y, routed_params[name]["fit"] + ) + for name, est in zip(names, all_estimators) + if est != "drop" + ) + + self.named_estimators_ = Bunch() + est_fitted_idx = 0 + for name_est, org_est in zip(names, all_estimators): + if org_est != "drop": + current_estimator = self.estimators_[est_fitted_idx] + self.named_estimators_[name_est] = current_estimator + est_fitted_idx += 1 + if hasattr(current_estimator, "feature_names_in_"): + self.feature_names_in_ = current_estimator.feature_names_in_ + else: + self.named_estimators_[name_est] = "drop" + + self.stack_method_ = [ + self._method_name(name, est, meth) + for name, est, meth in zip(names, all_estimators, stack_method) + ] + + if self.cv == "prefit": + # Generate predictions from prefit models + predictions = [ + getattr(estimator, predict_method)(X) + for estimator, predict_method in zip(all_estimators, self.stack_method_) + if estimator != "drop" + ] + else: + # To train the meta-classifier using the most data as possible, we use + # a cross-validation to obtain the output of the stacked estimators. + # To ensure that the data provided to each estimator are the same, + # we need to set the random state of the cv if there is one and we + # need to take a copy. + cv = check_cv(self.cv, y=y, classifier=is_classifier(self)) + if hasattr(cv, "random_state") and cv.random_state is None: + cv.random_state = np.random.RandomState() + + predictions = Parallel(n_jobs=self.n_jobs)( + delayed(cross_val_predict)( + clone(est), + X, + y, + cv=deepcopy(cv), + method=meth, + n_jobs=self.n_jobs, + params=routed_params[name]["fit"], + verbose=self.verbose, + ) + for name, est, meth in zip(names, all_estimators, self.stack_method_) + if est != "drop" + ) + + # Only not None or not 'drop' estimators will be used in transform. + # Remove the None from the method as well. 
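# Conceptual equivalent of the cross-validated branch above, written out with the
# public API (hypothetical ``base_estimators`` list of probabilistic classifiers;
# the real code additionally copies the CV splitter, skips "drop" entries, trims
# redundant probability columns and routes fit parameters):
#
#     oof = [cross_val_predict(clone(est), X, y, cv=5, method="predict_proba")
#            for est in base_estimators]
#     X_meta = np.hstack(oof)
#     final_estimator.fit(X_meta, y)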
+ self.stack_method_ = [ + meth + for (meth, est) in zip(self.stack_method_, all_estimators) + if est != "drop" + ] + + X_meta = self._concatenate_predictions(X, predictions) + _fit_single_estimator(self.final_estimator_, X_meta, y, fit_params=fit_params) + + return self + + @property + def n_features_in_(self): + """Number of features seen during :term:`fit`.""" + try: + check_is_fitted(self) + except NotFittedError as nfe: + raise AttributeError( + f"{self.__class__.__name__} object has no attribute n_features_in_" + ) from nfe + return self.estimators_[0].n_features_in_ + + def _transform(self, X): + """Concatenate and return the predictions of the estimators.""" + check_is_fitted(self) + predictions = [ + getattr(est, meth)(X) + for est, meth in zip(self.estimators_, self.stack_method_) + if est != "drop" + ] + return self._concatenate_predictions(X, predictions) + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. The input feature names are only used when `passthrough` is + `True`. + + - If `input_features` is `None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then names are generated: `[x0, x1, ..., x(n_features_in_ - 1)]`. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. + + If `passthrough` is `False`, then only the names of `estimators` are used + to generate the output feature names. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. + """ + check_is_fitted(self, "n_features_in_") + input_features = _check_feature_names_in( + self, input_features, generate_names=self.passthrough + ) + + class_name = self.__class__.__name__.lower() + non_dropped_estimators = ( + name for name, est in self.estimators if est != "drop" + ) + meta_names = [] + for est, n_features_out in zip(non_dropped_estimators, self._n_feature_outs): + if n_features_out == 1: + meta_names.append(f"{class_name}_{est}") + else: + meta_names.extend( + f"{class_name}_{est}{i}" for i in range(n_features_out) + ) + + if self.passthrough: + return np.concatenate((meta_names, input_features)) + + return np.asarray(meta_names, dtype=object) + + @available_if( + _estimator_has("predict", delegates=("final_estimator_", "final_estimator")) + ) + def predict(self, X, **predict_params): + """Predict target for X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + **predict_params : dict of str -> obj + Parameters to the `predict` called by the `final_estimator`. Note + that this may be used to return uncertainties from some estimators + with `return_std` or `return_cov`. Be aware that it will only + account for uncertainty in the final estimator. + + Returns + ------- + y_pred : ndarray of shape (n_samples,) or (n_samples, n_output) + Predicted targets. 
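# Example of the naming scheme produced by ``get_feature_names_out`` above, for a
# hypothetical ``StackingClassifier`` with estimators named "rf" and "svc":
#
#     binary problem  -> ["stackingclassifier_rf", "stackingclassifier_svc"]
#     3-class problem -> ["stackingclassifier_rf0", ..., "stackingclassifier_svc2"]
#
# With ``passthrough=True`` the input feature names are appended after these.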
+ """ + + check_is_fitted(self) + return self.final_estimator_.predict(self.transform(X), **predict_params) + + def _sk_visual_block_with_final_estimator(self, final_estimator): + names, estimators = zip(*self.estimators) + parallel = _VisualBlock("parallel", estimators, names=names, dash_wrapped=False) + + # final estimator is wrapped in a parallel block to show the label: + # 'final_estimator' in the html repr + final_block = _VisualBlock( + "parallel", [final_estimator], names=["final_estimator"], dash_wrapped=False + ) + return _VisualBlock("serial", (parallel, final_block), dash_wrapped=False) + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.6 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. + """ + router = MetadataRouter(owner=self.__class__.__name__) + + # `self.estimators` is a list of (name, est) tuples + for name, estimator in self.estimators: + router.add( + **{name: estimator}, + method_mapping=MethodMapping().add(callee="fit", caller="fit"), + ) + + try: + final_estimator_ = self.final_estimator_ + except AttributeError: + final_estimator_ = self.final_estimator + + router.add( + final_estimator_=final_estimator_, + method_mapping=MethodMapping().add(caller="predict", callee="predict"), + ) + + return router + + +class StackingClassifier(ClassifierMixin, _BaseStacking): + """Stack of estimators with a final classifier. + + Stacked generalization consists in stacking the output of individual + estimator and use a classifier to compute the final prediction. Stacking + allows to use the strength of each individual estimator by using their + output as input of a final estimator. + + Note that `estimators_` are fitted on the full `X` while `final_estimator_` + is trained using cross-validated predictions of the base estimators using + `cross_val_predict`. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.22 + + Parameters + ---------- + estimators : list of (str, estimator) + Base estimators which will be stacked together. Each element of the + list is defined as a tuple of string (i.e. name) and an estimator + instance. An estimator can be set to 'drop' using `set_params`. + + The type of estimator is generally expected to be a classifier. + However, one can pass a regressor for some use case (e.g. ordinal + regression). + + final_estimator : estimator, default=None + A classifier which will be used to combine the base estimators. + The default classifier is a + :class:`~sklearn.linear_model.LogisticRegression`. + + cv : int, cross-validation generator, iterable, or "prefit", default=None + Determines the cross-validation splitting strategy used in + `cross_val_predict` to train `final_estimator`. Possible inputs for + cv are: + + * None, to use the default 5-fold cross validation, + * integer, to specify the number of folds in a (Stratified) KFold, + * An object to be used as a cross-validation generator, + * An iterable yielding train, test splits, + * `"prefit"`, to assume the `estimators` are prefit. In this case, the + estimators will not be refitted. + + For integer/None inputs, if the estimator is a classifier and y is + either binary or multiclass, + :class:`~sklearn.model_selection.StratifiedKFold` is used. + In all other cases, :class:`~sklearn.model_selection.KFold` is used. 
+ These splitters are instantiated with `shuffle=False` so the splits + will be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + If "prefit" is passed, it is assumed that all `estimators` have + been fitted already. The `final_estimator_` is trained on the `estimators` + predictions on the full training set and are **not** cross validated + predictions. Please note that if the models have been trained on the same + data to train the stacking model, there is a very high risk of overfitting. + + .. versionadded:: 1.1 + The 'prefit' option was added in 1.1 + + .. note:: + A larger number of split will provide no benefits if the number + of training samples is large enough. Indeed, the training time + will increase. ``cv`` is not used for model evaluation but for + prediction. + + stack_method : {'auto', 'predict_proba', 'decision_function', 'predict'}, \ + default='auto' + Methods called for each base estimator. It can be: + + * if 'auto', it will try to invoke, for each estimator, + `'predict_proba'`, `'decision_function'` or `'predict'` in that + order. + * otherwise, one of `'predict_proba'`, `'decision_function'` or + `'predict'`. If the method is not implemented by the estimator, it + will raise an error. + + n_jobs : int, default=None + The number of jobs to run in parallel for `fit` of all `estimators`. + `None` means 1 unless in a `joblib.parallel_backend` context. -1 means + using all processors. See :term:`Glossary ` for more details. + + passthrough : bool, default=False + When False, only the predictions of estimators will be used as + training data for `final_estimator`. When True, the + `final_estimator` is trained on the predictions as well as the + original training data. + + verbose : int, default=0 + Verbosity level. + + Attributes + ---------- + classes_ : ndarray of shape (n_classes,) or list of ndarray if `y` \ + is of type `"multilabel-indicator"`. + Class labels. + + estimators_ : list of estimators + The elements of the `estimators` parameter, having been fitted on the + training data. If an estimator has been set to `'drop'`, it + will not appear in `estimators_`. When `cv="prefit"`, `estimators_` + is set to `estimators` and is not fitted again. + + named_estimators_ : :class:`~sklearn.utils.Bunch` + Attribute to access any fitted sub-estimators by name. + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying estimator exposes such an attribute when fit. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Only defined if the + underlying estimators expose such an attribute when fit. + + .. versionadded:: 1.0 + + final_estimator_ : estimator + The classifier fit on the output of `estimators_` and responsible for + final predictions. + + stack_method_ : list of str + The method used by each base estimator. + + See Also + -------- + StackingRegressor : Stack of estimators with a final regressor. + + Notes + ----- + When `predict_proba` is used by each estimator (i.e. most of the time for + `stack_method='auto'` or specifically for `stack_method='predict_proba'`), + the first column predicted by each estimator will be dropped in the case + of a binary classification problem. Indeed, both feature will be perfectly + collinear. + + In some cases (e.g. ordinal regression), one can pass regressors as the + first layer of the :class:`StackingClassifier`. 
However, note that `y` will + be internally encoded in a numerically increasing order or lexicographic + order. If this ordering is not adequate, one should manually numerically + encode the classes in the desired order. + + References + ---------- + .. [1] Wolpert, David H. "Stacked generalization." Neural networks 5.2 + (1992): 241-259. + + Examples + -------- + >>> from sklearn.datasets import load_iris + >>> from sklearn.ensemble import RandomForestClassifier + >>> from sklearn.svm import LinearSVC + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.preprocessing import StandardScaler + >>> from sklearn.pipeline import make_pipeline + >>> from sklearn.ensemble import StackingClassifier + >>> X, y = load_iris(return_X_y=True) + >>> estimators = [ + ... ('rf', RandomForestClassifier(n_estimators=10, random_state=42)), + ... ('svr', make_pipeline(StandardScaler(), + ... LinearSVC(random_state=42))) + ... ] + >>> clf = StackingClassifier( + ... estimators=estimators, final_estimator=LogisticRegression() + ... ) + >>> from sklearn.model_selection import train_test_split + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, stratify=y, random_state=42 + ... ) + >>> clf.fit(X_train, y_train).score(X_test, y_test) + 0.9... + """ + + _parameter_constraints: dict = { + **_BaseStacking._parameter_constraints, + "stack_method": [ + StrOptions({"auto", "predict_proba", "decision_function", "predict"}) + ], + } + + def __init__( + self, + estimators, + final_estimator=None, + *, + cv=None, + stack_method="auto", + n_jobs=None, + passthrough=False, + verbose=0, + ): + super().__init__( + estimators=estimators, + final_estimator=final_estimator, + cv=cv, + stack_method=stack_method, + n_jobs=n_jobs, + passthrough=passthrough, + verbose=verbose, + ) + + def _validate_final_estimator(self): + self._clone_final_estimator(default=LogisticRegression()) + if not is_classifier(self.final_estimator_): + raise ValueError( + "'final_estimator' parameter should be a classifier. Got {}".format( + self.final_estimator_ + ) + ) + + def _validate_estimators(self): + """Overload the method of `_BaseHeterogeneousEnsemble` to be more + lenient towards the type of `estimators`. + + Regressors can be accepted for some cases such as ordinal regression. + """ + if len(self.estimators) == 0: + raise ValueError( + "Invalid 'estimators' attribute, 'estimators' should be a " + "non-empty list of (string, estimator) tuples." + ) + names, estimators = zip(*self.estimators) + self._validate_names(names) + + has_estimator = any(est != "drop" for est in estimators) + if not has_estimator: + raise ValueError( + "All estimators are dropped. At least one is required " + "to be an estimator." + ) + + return names, estimators + + def fit(self, X, y, **fit_params): + """Fit the estimators. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) + Target values. Note that `y` will be internally encoded in + numerically increasing order or lexicographic order. If the order + matter (e.g. for ordinal regression), one should numerically encode + the target `y` before calling :term:`fit`. + + **fit_params : dict + Parameters to pass to the underlying estimators. + + .. 
versionadded:: 1.6 + + Only available if `enable_metadata_routing=True`, which can be + set by using ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + Returns a fitted instance of estimator. + """ + _raise_for_params(fit_params, self, "fit", allow=["sample_weight"]) + + check_classification_targets(y) + if type_of_target(y) == "multilabel-indicator": + self._label_encoder = [LabelEncoder().fit(yk) for yk in y.T] + self.classes_ = [le.classes_ for le in self._label_encoder] + y_encoded = np.array( + [ + self._label_encoder[target_idx].transform(target) + for target_idx, target in enumerate(y.T) + ] + ).T + else: + self._label_encoder = LabelEncoder().fit(y) + self.classes_ = self._label_encoder.classes_ + y_encoded = self._label_encoder.transform(y) + + return super().fit(X, y_encoded, **fit_params) + + @available_if( + _estimator_has("predict", delegates=("final_estimator_", "final_estimator")) + ) + def predict(self, X, **predict_params): + """Predict target for X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + **predict_params : dict of str -> obj + Parameters to the `predict` called by the `final_estimator`. Note + that this may be used to return uncertainties from some estimators + with `return_std` or `return_cov`. Be aware that it will only + account for uncertainty in the final estimator. + + - If `enable_metadata_routing=False` (default): + Parameters directly passed to the `predict` method of the + `final_estimator`. + + - If `enable_metadata_routing=True`: Parameters safely routed to + the `predict` method of the `final_estimator`. See :ref:`Metadata + Routing User Guide ` for more details. + + .. versionchanged:: 1.6 + `**predict_params` can be routed via metadata routing API. + + Returns + ------- + y_pred : ndarray of shape (n_samples,) or (n_samples, n_output) + Predicted targets. + """ + if _routing_enabled(): + routed_params = process_routing(self, "predict", **predict_params) + else: + # TODO(SLEP6): remove when metadata routing cannot be disabled. + routed_params = Bunch() + routed_params.final_estimator_ = Bunch(predict={}) + routed_params.final_estimator_.predict = predict_params + + y_pred = super().predict(X, **routed_params.final_estimator_["predict"]) + if isinstance(self._label_encoder, list): + # Handle the multilabel-indicator case + y_pred = np.array( + [ + self._label_encoder[target_idx].inverse_transform(target) + for target_idx, target in enumerate(y_pred.T) + ] + ).T + else: + y_pred = self._label_encoder.inverse_transform(y_pred) + return y_pred + + @available_if( + _estimator_has( + "predict_proba", delegates=("final_estimator_", "final_estimator") + ) + ) + def predict_proba(self, X): + """Predict class probabilities for `X` using the final estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + probabilities : ndarray of shape (n_samples, n_classes) or \ + list of ndarray of shape (n_output,) + The class probabilities of the input samples. 
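# Sketch of the label round-trip performed by ``fit`` and ``predict`` above
# (assumes a hypothetical unfitted ``clf`` and feature matrix ``X``):
#
#     clf.fit(X, np.array(["ham", "spam", "ham"]))   # targets encoded to [0, 1, 0]
#     clf.classes_                                   # array(['ham', 'spam'])
#     clf.predict(X)                                 # string labels restored via
#                                                    # LabelEncoder.inverse_transform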
+ """ + check_is_fitted(self) + y_pred = self.final_estimator_.predict_proba(self.transform(X)) + + if isinstance(self._label_encoder, list): + # Handle the multilabel-indicator cases + y_pred = np.array([preds[:, 0] for preds in y_pred]).T + return y_pred + + @available_if( + _estimator_has( + "decision_function", delegates=("final_estimator_", "final_estimator") + ) + ) + def decision_function(self, X): + """Decision function for samples in `X` using the final estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + decisions : ndarray of shape (n_samples,), (n_samples, n_classes), \ + or (n_samples, n_classes * (n_classes-1) / 2) + The decision function computed the final estimator. + """ + check_is_fitted(self) + return self.final_estimator_.decision_function(self.transform(X)) + + def transform(self, X): + """Return class labels or probabilities for X for each estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + y_preds : ndarray of shape (n_samples, n_estimators) or \ + (n_samples, n_classes * n_estimators) + Prediction outputs for each estimator. + """ + return self._transform(X) + + def _sk_visual_block_(self): + # If final_estimator's default changes then this should be + # updated. + if self.final_estimator is None: + final_estimator = LogisticRegression() + else: + final_estimator = self.final_estimator + return super()._sk_visual_block_with_final_estimator(final_estimator) + + +class StackingRegressor(RegressorMixin, _BaseStacking): + """Stack of estimators with a final regressor. + + Stacked generalization consists in stacking the output of individual + estimator and use a regressor to compute the final prediction. Stacking + allows to use the strength of each individual estimator by using their + output as input of a final estimator. + + Note that `estimators_` are fitted on the full `X` while `final_estimator_` + is trained using cross-validated predictions of the base estimators using + `cross_val_predict`. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.22 + + Parameters + ---------- + estimators : list of (str, estimator) + Base estimators which will be stacked together. Each element of the + list is defined as a tuple of string (i.e. name) and an estimator + instance. An estimator can be set to 'drop' using `set_params`. + + final_estimator : estimator, default=None + A regressor which will be used to combine the base estimators. + The default regressor is a :class:`~sklearn.linear_model.RidgeCV`. + + cv : int, cross-validation generator, iterable, or "prefit", default=None + Determines the cross-validation splitting strategy used in + `cross_val_predict` to train `final_estimator`. Possible inputs for + cv are: + + * None, to use the default 5-fold cross validation, + * integer, to specify the number of folds in a (Stratified) KFold, + * An object to be used as a cross-validation generator, + * An iterable yielding train, test splits, + * `"prefit"`, to assume the `estimators` are prefit. In this case, the + estimators will not be refitted. 
+ + For integer/None inputs, if the estimator is a classifier and y is + either binary or multiclass, + :class:`~sklearn.model_selection.StratifiedKFold` is used. + In all other cases, :class:`~sklearn.model_selection.KFold` is used. + These splitters are instantiated with `shuffle=False` so the splits + will be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + If "prefit" is passed, it is assumed that all `estimators` have + been fitted already. The `final_estimator_` is trained on the `estimators` + predictions on the full training set and are **not** cross validated + predictions. Please note that if the models have been trained on the same + data to train the stacking model, there is a very high risk of overfitting. + + .. versionadded:: 1.1 + The 'prefit' option was added in 1.1 + + .. note:: + A larger number of split will provide no benefits if the number + of training samples is large enough. Indeed, the training time + will increase. ``cv`` is not used for model evaluation but for + prediction. + + n_jobs : int, default=None + The number of jobs to run in parallel for `fit` of all `estimators`. + `None` means 1 unless in a `joblib.parallel_backend` context. -1 means + using all processors. See :term:`Glossary ` for more details. + + passthrough : bool, default=False + When False, only the predictions of estimators will be used as + training data for `final_estimator`. When True, the + `final_estimator` is trained on the predictions as well as the + original training data. + + verbose : int, default=0 + Verbosity level. + + Attributes + ---------- + estimators_ : list of estimators + The elements of the `estimators` parameter, having been fitted on the + training data. If an estimator has been set to `'drop'`, it + will not appear in `estimators_`. When `cv="prefit"`, `estimators_` + is set to `estimators` and is not fitted again. + + named_estimators_ : :class:`~sklearn.utils.Bunch` + Attribute to access any fitted sub-estimators by name. + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying estimator exposes such an attribute when fit. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Only defined if the + underlying estimators expose such an attribute when fit. + + .. versionadded:: 1.0 + + final_estimator_ : estimator + The regressor fit on the output of `estimators_` and responsible for + final predictions. + + stack_method_ : list of str + The method used by each base estimator. + + See Also + -------- + StackingClassifier : Stack of estimators with a final classifier. + + References + ---------- + .. [1] Wolpert, David H. "Stacked generalization." Neural networks 5.2 + (1992): 241-259. + + Examples + -------- + >>> from sklearn.datasets import load_diabetes + >>> from sklearn.linear_model import RidgeCV + >>> from sklearn.svm import LinearSVR + >>> from sklearn.ensemble import RandomForestRegressor + >>> from sklearn.ensemble import StackingRegressor + >>> X, y = load_diabetes(return_X_y=True) + >>> estimators = [ + ... ('lr', RidgeCV()), + ... ('svr', LinearSVR(random_state=42)) + ... ] + >>> reg = StackingRegressor( + ... estimators=estimators, + ... final_estimator=RandomForestRegressor(n_estimators=10, + ... random_state=42) + ... ) + >>> from sklearn.model_selection import train_test_split + >>> X_train, X_test, y_train, y_test = train_test_split( + ... 
X, y, random_state=42 + ... ) + >>> reg.fit(X_train, y_train).score(X_test, y_test) + 0.3... + """ + + def __init__( + self, + estimators, + final_estimator=None, + *, + cv=None, + n_jobs=None, + passthrough=False, + verbose=0, + ): + super().__init__( + estimators=estimators, + final_estimator=final_estimator, + cv=cv, + stack_method="predict", + n_jobs=n_jobs, + passthrough=passthrough, + verbose=verbose, + ) + + def _validate_final_estimator(self): + self._clone_final_estimator(default=RidgeCV()) + if not is_regressor(self.final_estimator_): + raise ValueError( + "'final_estimator' parameter should be a regressor. Got {}".format( + self.final_estimator_ + ) + ) + + def fit(self, X, y, **fit_params): + """Fit the estimators. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) + Target values. + + **fit_params : dict + Parameters to pass to the underlying estimators. + + .. versionadded:: 1.6 + + Only available if `enable_metadata_routing=True`, which can be + set by using ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + Returns a fitted instance. + """ + _raise_for_params(fit_params, self, "fit", allow=["sample_weight"]) + + y = column_or_1d(y, warn=True) + + return super().fit(X, y, **fit_params) + + def transform(self, X): + """Return the predictions for X for each estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + y_preds : ndarray of shape (n_samples, n_estimators) + Prediction outputs for each estimator. + """ + return self._transform(X) + + def fit_transform(self, X, y, **fit_params): + """Fit the estimators and return the predictions for X for each estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) + Target values. + + **fit_params : dict + Parameters to pass to the underlying estimators. + + .. versionadded:: 1.6 + + Only available if `enable_metadata_routing=True`, which can be + set by using ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + y_preds : ndarray of shape (n_samples, n_estimators) + Prediction outputs for each estimator. + """ + _raise_for_params(fit_params, self, "fit", allow=["sample_weight"]) + + return super().fit_transform(X, y, **fit_params) + + @available_if( + _estimator_has("predict", delegates=("final_estimator_", "final_estimator")) + ) + def predict(self, X, **predict_params): + """Predict target for X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + **predict_params : dict of str -> obj + Parameters to the `predict` called by the `final_estimator`. Note + that this may be used to return uncertainties from some estimators + with `return_std` or `return_cov`. 
Be aware that it will only + account for uncertainty in the final estimator. + + - If `enable_metadata_routing=False` (default): + Parameters directly passed to the `predict` method of the + `final_estimator`. + + - If `enable_metadata_routing=True`: Parameters safely routed to + the `predict` method of the `final_estimator`. See :ref:`Metadata + Routing User Guide ` for more details. + + .. versionchanged:: 1.6 + `**predict_params` can be routed via metadata routing API. + + Returns + ------- + y_pred : ndarray of shape (n_samples,) or (n_samples, n_output) + Predicted targets. + """ + if _routing_enabled(): + routed_params = process_routing(self, "predict", **predict_params) + else: + # TODO(SLEP6): remove when metadata routing cannot be disabled. + routed_params = Bunch() + routed_params.final_estimator_ = Bunch(predict={}) + routed_params.final_estimator_.predict = predict_params + + y_pred = super().predict(X, **routed_params.final_estimator_["predict"]) + + return y_pred + + def _sk_visual_block_(self): + # If final_estimator's default changes then this should be + # updated. + if self.final_estimator is None: + final_estimator = RidgeCV() + else: + final_estimator = self.final_estimator + return super()._sk_visual_block_with_final_estimator(final_estimator) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_voting.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_voting.py new file mode 100644 index 0000000000000000000000000000000000000000..369d3f0f5553ee2c22b930d7f6a43e132dbe2596 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_voting.py @@ -0,0 +1,734 @@ +""" +Soft Voting/Majority Rule classifier and Voting regressor. + +This module contains: + - A Soft Voting/Majority Rule classifier for classification estimators. + - A Voting regressor for regression estimators. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from abc import abstractmethod +from numbers import Integral + +import numpy as np + +from ..base import ( + ClassifierMixin, + RegressorMixin, + TransformerMixin, + _fit_context, + clone, +) +from ..exceptions import NotFittedError +from ..preprocessing import LabelEncoder +from ..utils import Bunch +from ..utils._param_validation import StrOptions +from ..utils._repr_html.estimator import _VisualBlock +from ..utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + _raise_for_params, + _routing_enabled, + process_routing, +) +from ..utils.metaestimators import available_if +from ..utils.multiclass import type_of_target +from ..utils.parallel import Parallel, delayed +from ..utils.validation import ( + _check_feature_names_in, + check_is_fitted, + column_or_1d, +) +from ._base import _BaseHeterogeneousEnsemble, _fit_single_estimator + + +class _BaseVoting(TransformerMixin, _BaseHeterogeneousEnsemble): + """Base class for voting. + + Warning: This class should not be used directly. Use derived classes + instead. 
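# Editor's note: a minimal usage sketch (not part of the library source) of the
# `cv="prefit"` behaviour documented for StackingRegressor above: the base estimators
# are assumed to be fitted already, and only the final estimator is trained on their
# (non cross-validated) predictions. Dataset and estimator choices are illustrative.
from sklearn.datasets import load_diabetes
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

X, y = load_diabetes(return_X_y=True)
# Fit the base estimators on one split to limit the overfitting risk mentioned above.
X_base, X_stack, y_base, y_stack = train_test_split(X, y, random_state=0)
ridge = RidgeCV().fit(X_base, y_base)
tree = DecisionTreeRegressor(max_depth=3, random_state=0).fit(X_base, y_base)

stack = StackingRegressor(
    estimators=[("ridge", ridge), ("tree", tree)],
    final_estimator=RidgeCV(),
    cv="prefit",  # base estimators are reused as-is; only the final estimator is fitted
)
stack.fit(X_stack, y_stack)
print(stack.score(X_stack, y_stack))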
+ """ + + _parameter_constraints: dict = { + "estimators": [list], + "weights": ["array-like", None], + "n_jobs": [None, Integral], + "verbose": ["verbose"], + } + + def _log_message(self, name, idx, total): + if not self.verbose: + return None + return f"({idx} of {total}) Processing {name}" + + @property + def _weights_not_none(self): + """Get the weights of not `None` estimators.""" + if self.weights is None: + return None + return [w for est, w in zip(self.estimators, self.weights) if est[1] != "drop"] + + def _predict(self, X): + """Collect results from clf.predict calls.""" + return np.asarray([est.predict(X) for est in self.estimators_]).T + + @abstractmethod + def fit(self, X, y, **fit_params): + """Get common fit operations.""" + names, clfs = self._validate_estimators() + + if self.weights is not None and len(self.weights) != len(self.estimators): + raise ValueError( + "Number of `estimators` and weights must be equal; got" + f" {len(self.weights)} weights, {len(self.estimators)} estimators" + ) + + if _routing_enabled(): + routed_params = process_routing(self, "fit", **fit_params) + else: + routed_params = Bunch() + for name in names: + routed_params[name] = Bunch(fit={}) + if "sample_weight" in fit_params: + routed_params[name].fit["sample_weight"] = fit_params[ + "sample_weight" + ] + + self.estimators_ = Parallel(n_jobs=self.n_jobs)( + delayed(_fit_single_estimator)( + clone(clf), + X, + y, + fit_params=routed_params[name]["fit"], + message_clsname="Voting", + message=self._log_message(name, idx + 1, len(clfs)), + ) + for idx, (name, clf) in enumerate(zip(names, clfs)) + if clf != "drop" + ) + + self.named_estimators_ = Bunch() + + # Uses 'drop' as placeholder for dropped estimators + est_iter = iter(self.estimators_) + for name, est in self.estimators: + current_est = est if est == "drop" else next(est_iter) + self.named_estimators_[name] = current_est + + if hasattr(current_est, "feature_names_in_"): + self.feature_names_in_ = current_est.feature_names_in_ + + return self + + def fit_transform(self, X, y=None, **fit_params): + """Return class labels or probabilities for each estimator. + + Return predictions for X for each estimator. + + Parameters + ---------- + X : {array-like, sparse matrix, dataframe} of shape \ + (n_samples, n_features) + Input samples. + + y : ndarray of shape (n_samples,), default=None + Target values (None for unsupervised transformations). + + **fit_params : dict + Additional fit parameters. + + Returns + ------- + X_new : ndarray array of shape (n_samples, n_features_new) + Transformed array. + """ + return super().fit_transform(X, y, **fit_params) + + @property + def n_features_in_(self): + """Number of features seen during :term:`fit`.""" + # For consistency with other estimators we raise a AttributeError so + # that hasattr() fails if the estimator isn't fitted. + try: + check_is_fitted(self) + except NotFittedError as nfe: + raise AttributeError( + "{} object has no n_features_in_ attribute.".format( + self.__class__.__name__ + ) + ) from nfe + + return self.estimators_[0].n_features_in_ + + def _sk_visual_block_(self): + names, estimators = zip(*self.estimators) + return _VisualBlock("parallel", estimators, names=names) + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.5 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. 
+ """ + router = MetadataRouter(owner=self.__class__.__name__) + + # `self.estimators` is a list of (name, est) tuples + for name, estimator in self.estimators: + router.add( + **{name: estimator}, + method_mapping=MethodMapping().add(callee="fit", caller="fit"), + ) + return router + + +class VotingClassifier(ClassifierMixin, _BaseVoting): + """Soft Voting/Majority Rule classifier for unfitted estimators. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.17 + + Parameters + ---------- + estimators : list of (str, estimator) tuples + Invoking the ``fit`` method on the ``VotingClassifier`` will fit clones + of those original estimators that will be stored in the class attribute + ``self.estimators_``. An estimator can be set to ``'drop'`` using + :meth:`set_params`. + + .. versionchanged:: 0.21 + ``'drop'`` is accepted. Using None was deprecated in 0.22 and + support was removed in 0.24. + + voting : {'hard', 'soft'}, default='hard' + If 'hard', uses predicted class labels for majority rule voting. + Else if 'soft', predicts the class label based on the argmax of + the sums of the predicted probabilities, which is recommended for + an ensemble of well-calibrated classifiers. + + weights : array-like of shape (n_classifiers,), default=None + Sequence of weights (`float` or `int`) to weight the occurrences of + predicted class labels (`hard` voting) or class probabilities + before averaging (`soft` voting). Uses uniform weights if `None`. + + n_jobs : int, default=None + The number of jobs to run in parallel for ``fit``. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + .. versionadded:: 0.18 + + flatten_transform : bool, default=True + Affects shape of transform output only when voting='soft' + If voting='soft' and flatten_transform=True, transform method returns + matrix with shape (n_samples, n_classifiers * n_classes). If + flatten_transform=False, it returns + (n_classifiers, n_samples, n_classes). + + verbose : bool, default=False + If True, the time elapsed while fitting will be printed as it + is completed. + + .. versionadded:: 0.23 + + Attributes + ---------- + estimators_ : list of classifiers + The collection of fitted sub-estimators as defined in ``estimators`` + that are not 'drop'. + + named_estimators_ : :class:`~sklearn.utils.Bunch` + Attribute to access any fitted sub-estimators by name. + + .. versionadded:: 0.20 + + le_ : :class:`~sklearn.preprocessing.LabelEncoder` + Transformer used to encode the labels during fit and decode during + prediction. + + classes_ : ndarray of shape (n_classes,) + The classes labels. + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying classifier exposes such an attribute when fit. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Only defined if the + underlying estimators expose such an attribute when fit. + + .. versionadded:: 1.0 + + See Also + -------- + VotingRegressor : Prediction voting regressor. 
+ + Examples + -------- + >>> import numpy as np + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.naive_bayes import GaussianNB + >>> from sklearn.ensemble import RandomForestClassifier, VotingClassifier + >>> clf1 = LogisticRegression(random_state=1) + >>> clf2 = RandomForestClassifier(n_estimators=50, random_state=1) + >>> clf3 = GaussianNB() + >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]) + >>> y = np.array([1, 1, 1, 2, 2, 2]) + >>> eclf1 = VotingClassifier(estimators=[ + ... ('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard') + >>> eclf1 = eclf1.fit(X, y) + >>> print(eclf1.predict(X)) + [1 1 1 2 2 2] + >>> np.array_equal(eclf1.named_estimators_.lr.predict(X), + ... eclf1.named_estimators_['lr'].predict(X)) + True + >>> eclf2 = VotingClassifier(estimators=[ + ... ('lr', clf1), ('rf', clf2), ('gnb', clf3)], + ... voting='soft') + >>> eclf2 = eclf2.fit(X, y) + >>> print(eclf2.predict(X)) + [1 1 1 2 2 2] + + To drop an estimator, :meth:`set_params` can be used to remove it. Here we + dropped one of the estimators, resulting in 2 fitted estimators: + + >>> eclf2 = eclf2.set_params(lr='drop') + >>> eclf2 = eclf2.fit(X, y) + >>> len(eclf2.estimators_) + 2 + + Setting `flatten_transform=True` with `voting='soft'` flattens output shape of + `transform`: + + >>> eclf3 = VotingClassifier(estimators=[ + ... ('lr', clf1), ('rf', clf2), ('gnb', clf3)], + ... voting='soft', weights=[2,1,1], + ... flatten_transform=True) + >>> eclf3 = eclf3.fit(X, y) + >>> print(eclf3.predict(X)) + [1 1 1 2 2 2] + >>> print(eclf3.transform(X).shape) + (6, 6) + """ + + _parameter_constraints: dict = { + **_BaseVoting._parameter_constraints, + "voting": [StrOptions({"hard", "soft"})], + "flatten_transform": ["boolean"], + } + + def __init__( + self, + estimators, + *, + voting="hard", + weights=None, + n_jobs=None, + flatten_transform=True, + verbose=False, + ): + super().__init__(estimators=estimators) + self.voting = voting + self.weights = weights + self.n_jobs = n_jobs + self.flatten_transform = flatten_transform + self.verbose = verbose + + @_fit_context( + # estimators in VotingClassifier.estimators are not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y, **fit_params): + """Fit the estimators. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) + Target values. + + **fit_params : dict + Parameters to pass to the underlying estimators. + + .. versionadded:: 1.5 + + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + Returns the instance itself. + """ + _raise_for_params(fit_params, self, "fit", allow=["sample_weight"]) + + y_type = type_of_target(y, input_name="y") + if y_type in ("unknown", "continuous"): + # raise a specific ValueError for non-classification tasks + raise ValueError( + f"Unknown label type: {y_type}. Maybe you are trying to fit a " + "classifier, which expects discrete classes on a " + "regression target with continuous values." 
+ ) + elif y_type not in ("binary", "multiclass"): + # raise a NotImplementedError for backward compatibility for non-supported + # classification tasks + raise NotImplementedError( + f"{self.__class__.__name__} only supports binary or multiclass " + "classification. Multilabel and multi-output classification are not " + "supported." + ) + + self.le_ = LabelEncoder().fit(y) + self.classes_ = self.le_.classes_ + transformed_y = self.le_.transform(y) + + return super().fit(X, transformed_y, **fit_params) + + def predict(self, X): + """Predict class labels for X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. + + Returns + ------- + maj : array-like of shape (n_samples,) + Predicted class labels. + """ + check_is_fitted(self) + if self.voting == "soft": + maj = np.argmax(self.predict_proba(X), axis=1) + + else: # 'hard' voting + predictions = self._predict(X) + maj = np.apply_along_axis( + lambda x: np.argmax(np.bincount(x, weights=self._weights_not_none)), + axis=1, + arr=predictions, + ) + + maj = self.le_.inverse_transform(maj) + + return maj + + def _collect_probas(self, X): + """Collect results from clf.predict calls.""" + return np.asarray([clf.predict_proba(X) for clf in self.estimators_]) + + def _check_voting(self): + if self.voting == "hard": + raise AttributeError( + f"predict_proba is not available when voting={self.voting!r}" + ) + return True + + @available_if(_check_voting) + def predict_proba(self, X): + """Compute probabilities of possible outcomes for samples in X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. + + Returns + ------- + avg : array-like of shape (n_samples, n_classes) + Weighted average probability for each class per sample. + """ + check_is_fitted(self) + avg = np.average( + self._collect_probas(X), axis=0, weights=self._weights_not_none + ) + return avg + + def transform(self, X): + """Return class labels or probabilities for X for each estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + probabilities_or_labels + If `voting='soft'` and `flatten_transform=True`: + returns ndarray of shape (n_samples, n_classifiers * n_classes), + being class probabilities calculated by each classifier. + If `voting='soft' and `flatten_transform=False`: + ndarray of shape (n_classifiers, n_samples, n_classes) + If `voting='hard'`: + ndarray of shape (n_samples, n_classifiers), being + class labels predicted by each classifier. + """ + check_is_fitted(self) + + if self.voting == "soft": + probas = self._collect_probas(X) + if not self.flatten_transform: + return probas + return np.hstack(probas) + + else: + return self._predict(X) + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. 
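# Editor's note: an illustrative sketch (not part of the library source) of the
# `transform` output shapes documented above. With `voting="soft"` and the default
# `flatten_transform=True` the output is (n_samples, n_classifiers * n_classes);
# with `voting="hard"` it is one predicted-label column per classifier.
import numpy as np
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

X = np.array([[-2.0, -1.0], [-1.0, -1.0], [1.0, 1.0], [2.0, 1.0]])
y = np.array([0, 0, 1, 1])
estimators = [("lr", LogisticRegression()), ("gnb", GaussianNB())]

soft = VotingClassifier(estimators, voting="soft").fit(X, y)
hard = VotingClassifier(estimators, voting="hard").fit(X, y)

print(soft.transform(X).shape)  # (4, 4): 2 classifiers * 2 classes, flattened
print(hard.transform(X).shape)  # (4, 2): one label column per classifier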
+ """ + check_is_fitted(self, "n_features_in_") + if self.voting == "soft" and not self.flatten_transform: + raise ValueError( + "get_feature_names_out is not supported when `voting='soft'` and " + "`flatten_transform=False`" + ) + + _check_feature_names_in(self, input_features, generate_names=False) + class_name = self.__class__.__name__.lower() + + active_names = [name for name, est in self.estimators if est != "drop"] + + if self.voting == "hard": + return np.asarray( + [f"{class_name}_{name}" for name in active_names], dtype=object + ) + + # voting == "soft" + n_classes = len(self.classes_) + names_out = [ + f"{class_name}_{name}{i}" for name in active_names for i in range(n_classes) + ] + return np.asarray(names_out, dtype=object) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.transformer_tags.preserves_dtype = [] + return tags + + +class VotingRegressor(RegressorMixin, _BaseVoting): + """Prediction voting regressor for unfitted estimators. + + A voting regressor is an ensemble meta-estimator that fits several base + regressors, each on the whole dataset. Then it averages the individual + predictions to form a final prediction. + + For a detailed example, refer to + :ref:`sphx_glr_auto_examples_ensemble_plot_voting_regressor.py`. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.21 + + Parameters + ---------- + estimators : list of (str, estimator) tuples + Invoking the ``fit`` method on the ``VotingRegressor`` will fit clones + of those original estimators that will be stored in the class attribute + ``self.estimators_``. An estimator can be set to ``'drop'`` using + :meth:`set_params`. + + .. versionchanged:: 0.21 + ``'drop'`` is accepted. Using None was deprecated in 0.22 and + support was removed in 0.24. + + weights : array-like of shape (n_regressors,), default=None + Sequence of weights (`float` or `int`) to weight the occurrences of + predicted values before averaging. Uses uniform weights if `None`. + + n_jobs : int, default=None + The number of jobs to run in parallel for ``fit``. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + verbose : bool, default=False + If True, the time elapsed while fitting will be printed as it + is completed. + + .. versionadded:: 0.23 + + Attributes + ---------- + estimators_ : list of regressors + The collection of fitted sub-estimators as defined in ``estimators`` + that are not 'drop'. + + named_estimators_ : :class:`~sklearn.utils.Bunch` + Attribute to access any fitted sub-estimators by name. + + .. versionadded:: 0.20 + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying regressor exposes such an attribute when fit. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Only defined if the + underlying estimators expose such an attribute when fit. + + .. versionadded:: 1.0 + + See Also + -------- + VotingClassifier : Soft Voting/Majority Rule classifier. 
+ + Examples + -------- + >>> import numpy as np + >>> from sklearn.linear_model import LinearRegression + >>> from sklearn.ensemble import RandomForestRegressor + >>> from sklearn.ensemble import VotingRegressor + >>> from sklearn.neighbors import KNeighborsRegressor + >>> r1 = LinearRegression() + >>> r2 = RandomForestRegressor(n_estimators=10, random_state=1) + >>> r3 = KNeighborsRegressor() + >>> X = np.array([[1, 1], [2, 4], [3, 9], [4, 16], [5, 25], [6, 36]]) + >>> y = np.array([2, 6, 12, 20, 30, 42]) + >>> er = VotingRegressor([('lr', r1), ('rf', r2), ('r3', r3)]) + >>> print(er.fit(X, y).predict(X)) + [ 6.8 8.4 12.5 17.8 26 34] + + In the following example, we drop the `'lr'` estimator with + :meth:`~VotingRegressor.set_params` and fit the remaining two estimators: + + >>> er = er.set_params(lr='drop') + >>> er = er.fit(X, y) + >>> len(er.estimators_) + 2 + """ + + def __init__(self, estimators, *, weights=None, n_jobs=None, verbose=False): + super().__init__(estimators=estimators) + self.weights = weights + self.n_jobs = n_jobs + self.verbose = verbose + + @_fit_context( + # estimators in VotingRegressor.estimators are not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y, **fit_params): + """Fit the estimators. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) + Target values. + + **fit_params : dict + Parameters to pass to the underlying estimators. + + .. versionadded:: 1.5 + + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + Fitted estimator. + """ + _raise_for_params(fit_params, self, "fit", allow=["sample_weight"]) + + y = column_or_1d(y, warn=True) + + return super().fit(X, y, **fit_params) + + def predict(self, X): + """Predict regression target for X. + + The predicted regression target of an input sample is computed as the + mean predicted regression targets of the estimators in the ensemble. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. + + Returns + ------- + y : ndarray of shape (n_samples,) + The predicted values. + """ + check_is_fitted(self) + return np.average(self._predict(X), axis=1, weights=self._weights_not_none) + + def transform(self, X): + """Return predictions for X for each estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. + + Returns + ------- + predictions : ndarray of shape (n_samples, n_classifiers) + Values predicted by each regressor. + """ + check_is_fitted(self) + return self._predict(X) + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. 
+ """ + check_is_fitted(self, "n_features_in_") + _check_feature_names_in(self, input_features, generate_names=False) + class_name = self.__class__.__name__.lower() + return np.asarray( + [f"{class_name}_{name}" for name, est in self.estimators if est != "drop"], + dtype=object, + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_weight_boosting.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_weight_boosting.py new file mode 100644 index 0000000000000000000000000000000000000000..37c6468a5ebf6d8e22d927b3528604f57b9e0676 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_weight_boosting.py @@ -0,0 +1,1173 @@ +"""Weight Boosting. + +This module contains weight boosting estimators for both classification and +regression. + +The module structure is the following: + +- The `BaseWeightBoosting` base class implements a common ``fit`` method + for all the estimators in the module. Regression and classification + only differ from each other in the loss function that is optimized. + +- :class:`~sklearn.ensemble.AdaBoostClassifier` implements adaptive boosting + (AdaBoost-SAMME) for classification problems. + +- :class:`~sklearn.ensemble.AdaBoostRegressor` implements adaptive boosting + (AdaBoost.R2) for regression problems. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from abc import ABCMeta, abstractmethod +from numbers import Integral, Real + +import numpy as np + +from ..base import ( + ClassifierMixin, + RegressorMixin, + _fit_context, + is_classifier, + is_regressor, +) +from ..metrics import accuracy_score, r2_score +from ..tree import DecisionTreeClassifier, DecisionTreeRegressor +from ..utils import _safe_indexing, check_random_state +from ..utils._param_validation import HasMethods, Hidden, Interval, StrOptions +from ..utils.extmath import softmax, stable_cumsum +from ..utils.metadata_routing import ( + _raise_for_unsupported_routing, + _RoutingNotSupportedMixin, +) +from ..utils.validation import ( + _check_sample_weight, + _num_samples, + check_is_fitted, + has_fit_parameter, + validate_data, +) +from ._base import BaseEnsemble + +__all__ = [ + "AdaBoostClassifier", + "AdaBoostRegressor", +] + + +class BaseWeightBoosting(BaseEnsemble, metaclass=ABCMeta): + """Base class for AdaBoost estimators. + + Warning: This class should not be used directly. Use derived classes + instead. + """ + + _parameter_constraints: dict = { + "estimator": [HasMethods(["fit", "predict"]), None], + "n_estimators": [Interval(Integral, 1, None, closed="left")], + "learning_rate": [Interval(Real, 0, None, closed="neither")], + "random_state": ["random_state"], + } + + @abstractmethod + def __init__( + self, + estimator=None, + *, + n_estimators=50, + estimator_params=tuple(), + learning_rate=1.0, + random_state=None, + ): + super().__init__( + estimator=estimator, + n_estimators=n_estimators, + estimator_params=estimator_params, + ) + + self.learning_rate = learning_rate + self.random_state = random_state + + def _check_X(self, X): + # Only called to validate X in non-fit methods, therefore reset=False + return validate_data( + self, + X, + accept_sparse=["csr", "csc"], + ensure_2d=True, + allow_nd=True, + dtype=None, + reset=False, + ) + + @_fit_context( + # AdaBoost*.estimator is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y, sample_weight=None): + """Build a boosted classifier/regressor from the training set (X, y). 
+ + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrix can be CSC, CSR, COO, + DOK, or LIL. COO, DOK, and LIL are converted to CSR. + + y : array-like of shape (n_samples,) + The target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, the sample weights are initialized to + 1 / n_samples. + + Returns + ------- + self : object + Fitted estimator. + """ + _raise_for_unsupported_routing(self, "fit", sample_weight=sample_weight) + X, y = validate_data( + self, + X, + y, + accept_sparse=["csr", "csc"], + ensure_2d=True, + allow_nd=True, + dtype=None, + y_numeric=is_regressor(self), + ) + + sample_weight = _check_sample_weight( + sample_weight, X, dtype=np.float64, copy=True, ensure_non_negative=True + ) + sample_weight /= sample_weight.sum() + + # Check parameters + self._validate_estimator() + + # Clear any previous fit results + self.estimators_ = [] + self.estimator_weights_ = np.zeros(self.n_estimators, dtype=np.float64) + self.estimator_errors_ = np.ones(self.n_estimators, dtype=np.float64) + + # Initialization of the random number instance that will be used to + # generate a seed at each iteration + random_state = check_random_state(self.random_state) + epsilon = np.finfo(sample_weight.dtype).eps + + zero_weight_mask = sample_weight == 0.0 + for iboost in range(self.n_estimators): + # avoid extremely small sample weight, for details see issue #20320 + sample_weight = np.clip(sample_weight, a_min=epsilon, a_max=None) + # do not clip sample weights that were exactly zero originally + sample_weight[zero_weight_mask] = 0.0 + + # Boosting step + sample_weight, estimator_weight, estimator_error = self._boost( + iboost, X, y, sample_weight, random_state + ) + + # Early termination + if sample_weight is None: + break + self.estimator_weights_[iboost] = estimator_weight + self.estimator_errors_[iboost] = estimator_error + + # Stop if error is zero + if estimator_error == 0: + break + + sample_weight_sum = np.sum(sample_weight) + + if not np.isfinite(sample_weight_sum): + warnings.warn( + ( + "Sample weights have reached infinite values," + f" at iteration {iboost}, causing overflow. " + "Iterations stopped. Try lowering the learning rate." + ), + stacklevel=2, + ) + break + + # Stop if the sum of sample weights has become non-positive + if sample_weight_sum <= 0: + break + + if iboost < self.n_estimators - 1: + # Normalize + sample_weight /= sample_weight_sum + + return self + + @abstractmethod + def _boost(self, iboost, X, y, sample_weight, random_state): + """Implement a single boost. + + Warning: This method needs to be overridden by subclasses. + + Parameters + ---------- + iboost : int + The index of the current boost iteration. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrix can be CSC, CSR, COO, + DOK, or LIL. COO, DOK, and LIL are converted to CSR. + + y : array-like of shape (n_samples,) + The target values (class labels). + + sample_weight : array-like of shape (n_samples,) + The current sample weights. + + random_state : RandomState + The current random number generator + + Returns + ------- + sample_weight : array-like of shape (n_samples,) or None + The reweighted sample weights. + If None then boosting has terminated early. + + estimator_weight : float + The weight for the current boost. + If None then boosting has terminated early. 
+ + error : float + The classification error for the current boost. + If None then boosting has terminated early. + """ + pass + + def staged_score(self, X, y, sample_weight=None): + """Return staged scores for X, y. + + This generator method yields the ensemble score after each iteration of + boosting and therefore allows monitoring, such as to determine the + score on a test set after each boost. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrix can be CSC, CSR, COO, + DOK, or LIL. COO, DOK, and LIL are converted to CSR. + + y : array-like of shape (n_samples,) + Labels for X. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Yields + ------ + z : float + """ + X = self._check_X(X) + + for y_pred in self.staged_predict(X): + if is_classifier(self): + yield accuracy_score(y, y_pred, sample_weight=sample_weight) + else: + yield r2_score(y, y_pred, sample_weight=sample_weight) + + @property + def feature_importances_(self): + """The impurity-based feature importances. + + The higher, the more important the feature. + The importance of a feature is computed as the (normalized) + total reduction of the criterion brought by that feature. It is also + known as the Gini importance. + + Warning: impurity-based feature importances can be misleading for + high cardinality features (many unique values). See + :func:`sklearn.inspection.permutation_importance` as an alternative. + + Returns + ------- + feature_importances_ : ndarray of shape (n_features,) + The feature importances. + """ + if self.estimators_ is None or len(self.estimators_) == 0: + raise ValueError( + "Estimator not fitted, call `fit` before `feature_importances_`." + ) + + try: + norm = self.estimator_weights_.sum() + return ( + sum( + weight * clf.feature_importances_ + for weight, clf in zip(self.estimator_weights_, self.estimators_) + ) + / norm + ) + + except AttributeError as e: + raise AttributeError( + "Unable to compute feature importances " + "since estimator does not have a " + "feature_importances_ attribute" + ) from e + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + + +def _samme_proba(estimator, n_classes, X): + """Calculate algorithm 4, step 2, equation c) of Zhu et al [1]. + + References + ---------- + .. [1] J. Zhu, H. Zou, S. Rosset, T. Hastie, "Multi-class AdaBoost", 2009. + + """ + proba = estimator.predict_proba(X) + + # Displace zero probabilities so the log is defined. + # Also fix negative elements which may occur with + # negative sample weights. + np.clip(proba, np.finfo(proba.dtype).eps, None, out=proba) + log_proba = np.log(proba) + + return (n_classes - 1) * ( + log_proba - (1.0 / n_classes) * log_proba.sum(axis=1)[:, np.newaxis] + ) + + +class AdaBoostClassifier( + _RoutingNotSupportedMixin, ClassifierMixin, BaseWeightBoosting +): + """An AdaBoost classifier. + + An AdaBoost [1]_ classifier is a meta-estimator that begins by fitting a + classifier on the original dataset and then fits additional copies of the + classifier on the same dataset but where the weights of incorrectly + classified instances are adjusted such that subsequent classifiers focus + more on difficult cases. + + This class implements the algorithm based on [2]_. + + Read more in the :ref:`User Guide `. + + .. 
versionadded:: 0.14 + + Parameters + ---------- + estimator : object, default=None + The base estimator from which the boosted ensemble is built. + Support for sample weighting is required, as well as proper + ``classes_`` and ``n_classes_`` attributes. If ``None``, then + the base estimator is :class:`~sklearn.tree.DecisionTreeClassifier` + initialized with `max_depth=1`. + + .. versionadded:: 1.2 + `base_estimator` was renamed to `estimator`. + + n_estimators : int, default=50 + The maximum number of estimators at which boosting is terminated. + In case of perfect fit, the learning procedure is stopped early. + Values must be in the range `[1, inf)`. + + learning_rate : float, default=1.0 + Weight applied to each classifier at each boosting iteration. A higher + learning rate increases the contribution of each classifier. There is + a trade-off between the `learning_rate` and `n_estimators` parameters. + Values must be in the range `(0.0, inf)`. + + algorithm : {'SAMME'}, default='SAMME' + Use the SAMME discrete boosting algorithm. + + .. deprecated:: 1.6 + `algorithm` is deprecated and will be removed in version 1.8. This + estimator only implements the 'SAMME' algorithm. + + random_state : int, RandomState instance or None, default=None + Controls the random seed given at each `estimator` at each + boosting iteration. + Thus, it is only used when `estimator` exposes a `random_state`. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Attributes + ---------- + estimator_ : estimator + The base estimator from which the ensemble is grown. + + .. versionadded:: 1.2 + `base_estimator_` was renamed to `estimator_`. + + estimators_ : list of classifiers + The collection of fitted sub-estimators. + + classes_ : ndarray of shape (n_classes,) + The classes labels. + + n_classes_ : int + The number of classes. + + estimator_weights_ : ndarray of floats + Weights for each estimator in the boosted ensemble. + + estimator_errors_ : ndarray of floats + Classification error for each estimator in the boosted + ensemble. + + feature_importances_ : ndarray of shape (n_features,) + The impurity-based feature importances if supported by the + ``estimator`` (when based on decision trees). + + Warning: impurity-based feature importances can be misleading for + high cardinality features (many unique values). See + :func:`sklearn.inspection.permutation_importance` as an alternative. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + AdaBoostRegressor : An AdaBoost regressor that begins by fitting a + regressor on the original dataset and then fits additional copies of + the regressor on the same dataset but where the weights of instances + are adjusted according to the error of the current prediction. + + GradientBoostingClassifier : GB builds an additive model in a forward + stage-wise fashion. Regression trees are fit on the negative gradient + of the binomial or multinomial deviance loss function. Binary + classification is a special case where only a single regression tree is + induced. + + sklearn.tree.DecisionTreeClassifier : A non-parametric supervised learning + method used for classification. 
+ Creates a model that predicts the value of a target variable by + learning simple decision rules inferred from the data features. + + References + ---------- + .. [1] Y. Freund, R. Schapire, "A Decision-Theoretic Generalization of + on-Line Learning and an Application to Boosting", 1995. + + .. [2] :doi:`J. Zhu, H. Zou, S. Rosset, T. Hastie, "Multi-class adaboost." + Statistics and its Interface 2.3 (2009): 349-360. + <10.4310/SII.2009.v2.n3.a8>` + + Examples + -------- + >>> from sklearn.ensemble import AdaBoostClassifier + >>> from sklearn.datasets import make_classification + >>> X, y = make_classification(n_samples=1000, n_features=4, + ... n_informative=2, n_redundant=0, + ... random_state=0, shuffle=False) + >>> clf = AdaBoostClassifier(n_estimators=100, random_state=0) + >>> clf.fit(X, y) + AdaBoostClassifier(n_estimators=100, random_state=0) + >>> clf.predict([[0, 0, 0, 0]]) + array([1]) + >>> clf.score(X, y) + 0.96 + + For a detailed example of using AdaBoost to fit a sequence of DecisionTrees + as weaklearners, please refer to + :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_multiclass.py`. + + For a detailed example of using AdaBoost to fit a non-linearly separable + classification dataset composed of two Gaussian quantiles clusters, please + refer to :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_twoclass.py`. + """ + + # TODO(1.8): remove "algorithm" entry + _parameter_constraints: dict = { + **BaseWeightBoosting._parameter_constraints, + "algorithm": [StrOptions({"SAMME"}), Hidden(StrOptions({"deprecated"}))], + } + + def __init__( + self, + estimator=None, + *, + n_estimators=50, + learning_rate=1.0, + algorithm="deprecated", + random_state=None, + ): + super().__init__( + estimator=estimator, + n_estimators=n_estimators, + learning_rate=learning_rate, + random_state=random_state, + ) + + self.algorithm = algorithm + + def _validate_estimator(self): + """Check the estimator and set the estimator_ attribute.""" + super()._validate_estimator(default=DecisionTreeClassifier(max_depth=1)) + + if self.algorithm != "deprecated": + warnings.warn( + "The parameter 'algorithm' is deprecated in 1.6 and has no effect. " + "It will be removed in version 1.8.", + FutureWarning, + ) + + if not has_fit_parameter(self.estimator_, "sample_weight"): + raise ValueError( + f"{self.estimator.__class__.__name__} doesn't support sample_weight." + ) + + def _boost(self, iboost, X, y, sample_weight, random_state): + """Implement a single boost. + + Perform a single boost according to the discrete SAMME algorithm and return the + updated sample weights. + + Parameters + ---------- + iboost : int + The index of the current boost iteration. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. + + y : array-like of shape (n_samples,) + The target values (class labels). + + sample_weight : array-like of shape (n_samples,) + The current sample weights. + + random_state : RandomState instance + The RandomState instance used if the base estimator accepts a + `random_state` attribute. + + Returns + ------- + sample_weight : array-like of shape (n_samples,) or None + The reweighted sample weights. + If None then boosting has terminated early. + + estimator_weight : float + The weight for the current boost. + If None then boosting has terminated early. + + estimator_error : float + The classification error for the current boost. + If None then boosting has terminated early. 
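# Editor's note: an illustrative sketch (not part of the library source) of the staged
# monitoring described above: `staged_score` yields the ensemble score after each
# boosting iteration, which can be used to pick a good number of estimators on a
# held-out set. The dataset and sizes are illustrative.
from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

clf = AdaBoostClassifier(n_estimators=50, random_state=0).fit(X_train, y_train)
test_scores = list(clf.staged_score(X_test, y_test))
best_n = max(range(len(test_scores)), key=test_scores.__getitem__) + 1
print(f"best number of boosting iterations on the held-out set: {best_n}")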
+ """ + estimator = self._make_estimator(random_state=random_state) + + estimator.fit(X, y, sample_weight=sample_weight) + + y_predict = estimator.predict(X) + + if iboost == 0: + self.classes_ = getattr(estimator, "classes_", None) + self.n_classes_ = len(self.classes_) + + # Instances incorrectly classified + incorrect = y_predict != y + + # Error fraction + estimator_error = np.mean(np.average(incorrect, weights=sample_weight, axis=0)) + + # Stop if classification is perfect + if estimator_error <= 0: + return sample_weight, 1.0, 0.0 + + n_classes = self.n_classes_ + + # Stop if the error is at least as bad as random guessing + if estimator_error >= 1.0 - (1.0 / n_classes): + self.estimators_.pop(-1) + if len(self.estimators_) == 0: + raise ValueError( + "BaseClassifier in AdaBoostClassifier " + "ensemble is worse than random, ensemble " + "can not be fit." + ) + return None, None, None + + # Boost weight using multi-class AdaBoost SAMME alg + estimator_weight = self.learning_rate * ( + np.log((1.0 - estimator_error) / estimator_error) + np.log(n_classes - 1.0) + ) + + # Only boost the weights if it will fit again + if not iboost == self.n_estimators - 1: + # Only boost positive weights + sample_weight = np.exp( + np.log(sample_weight) + + estimator_weight * incorrect * (sample_weight > 0) + ) + + return sample_weight, estimator_weight, estimator_error + + def predict(self, X): + """Predict classes for X. + + The predicted class of an input sample is computed as the weighted mean + prediction of the classifiers in the ensemble. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrix can be CSC, CSR, COO, + DOK, or LIL. COO, DOK, and LIL are converted to CSR. + + Returns + ------- + y : ndarray of shape (n_samples,) + The predicted classes. + """ + pred = self.decision_function(X) + + if self.n_classes_ == 2: + return self.classes_.take(pred > 0, axis=0) + + return self.classes_.take(np.argmax(pred, axis=1), axis=0) + + def staged_predict(self, X): + """Return staged predictions for X. + + The predicted class of an input sample is computed as the weighted mean + prediction of the classifiers in the ensemble. + + This generator method yields the ensemble prediction after each + iteration of boosting and therefore allows monitoring, such as to + determine the prediction on a test set after each boost. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The input samples. Sparse matrix can be CSC, CSR, COO, + DOK, or LIL. COO, DOK, and LIL are converted to CSR. + + Yields + ------ + y : generator of ndarray of shape (n_samples,) + The predicted classes. + """ + X = self._check_X(X) + + n_classes = self.n_classes_ + classes = self.classes_ + + if n_classes == 2: + for pred in self.staged_decision_function(X): + yield np.array(classes.take(pred > 0, axis=0)) + + else: + for pred in self.staged_decision_function(X): + yield np.array(classes.take(np.argmax(pred, axis=1), axis=0)) + + def decision_function(self, X): + """Compute the decision function of ``X``. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrix can be CSC, CSR, COO, + DOK, or LIL. COO, DOK, and LIL are converted to CSR. + + Returns + ------- + score : ndarray of shape of (n_samples, k) + The decision function of the input samples. The order of + outputs is the same as that of the :term:`classes_` attribute. 
+ Binary classification is a special cases with ``k == 1``, + otherwise ``k==n_classes``. For binary classification, + values closer to -1 or 1 mean more like the first or second + class in ``classes_``, respectively. + """ + check_is_fitted(self) + X = self._check_X(X) + + n_classes = self.n_classes_ + classes = self.classes_[:, np.newaxis] + + if n_classes == 1: + return np.zeros_like(X, shape=(X.shape[0], 1)) + + pred = sum( + np.where( + (estimator.predict(X) == classes).T, + w, + -1 / (n_classes - 1) * w, + ) + for estimator, w in zip(self.estimators_, self.estimator_weights_) + ) + + pred /= self.estimator_weights_.sum() + if n_classes == 2: + pred[:, 0] *= -1 + return pred.sum(axis=1) + return pred + + def staged_decision_function(self, X): + """Compute decision function of ``X`` for each boosting iteration. + + This method allows monitoring (i.e. determine error on testing set) + after each boosting iteration. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrix can be CSC, CSR, COO, + DOK, or LIL. COO, DOK, and LIL are converted to CSR. + + Yields + ------ + score : generator of ndarray of shape (n_samples, k) + The decision function of the input samples. The order of + outputs is the same of that of the :term:`classes_` attribute. + Binary classification is a special cases with ``k == 1``, + otherwise ``k==n_classes``. For binary classification, + values closer to -1 or 1 mean more like the first or second + class in ``classes_``, respectively. + """ + check_is_fitted(self) + X = self._check_X(X) + + n_classes = self.n_classes_ + classes = self.classes_[:, np.newaxis] + pred = None + norm = 0.0 + + for weight, estimator in zip(self.estimator_weights_, self.estimators_): + norm += weight + + current_pred = np.where( + (estimator.predict(X) == classes).T, + weight, + -1 / (n_classes - 1) * weight, + ) + + if pred is None: + pred = current_pred + else: + pred += current_pred + + if n_classes == 2: + tmp_pred = np.copy(pred) + tmp_pred[:, 0] *= -1 + yield (tmp_pred / norm).sum(axis=1) + else: + yield pred / norm + + @staticmethod + def _compute_proba_from_decision(decision, n_classes): + """Compute probabilities from the decision function. + + This is based eq. (15) of [1] where: + p(y=c|X) = exp((1 / K-1) f_c(X)) / sum_k(exp((1 / K-1) f_k(X))) + = softmax((1 / K-1) * f(X)) + + References + ---------- + .. [1] J. Zhu, H. Zou, S. Rosset, T. Hastie, "Multi-class AdaBoost", + 2009. + """ + if n_classes == 2: + decision = np.vstack([-decision, decision]).T / 2 + else: + decision /= n_classes - 1 + return softmax(decision, copy=False) + + def predict_proba(self, X): + """Predict class probabilities for X. + + The predicted class probabilities of an input sample is computed as + the weighted mean predicted class probabilities of the classifiers + in the ensemble. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrix can be CSC, CSR, COO, + DOK, or LIL. COO, DOK, and LIL are converted to CSR. + + Returns + ------- + p : ndarray of shape (n_samples, n_classes) + The class probabilities of the input samples. The order of + outputs is the same of that of the :term:`classes_` attribute. 
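# Editor's note: a numerical check (not part of the library source) of the relation
# documented above for the multiclass case: predict_proba is the softmax of the
# decision function scaled by 1 / (n_classes - 1) (eq. 15 of Zhu et al.). The dataset
# and number of estimators are illustrative.
import numpy as np
from scipy.special import softmax
from sklearn.datasets import load_iris
from sklearn.ensemble import AdaBoostClassifier

X, y = load_iris(return_X_y=True)
clf = AdaBoostClassifier(n_estimators=10, random_state=0).fit(X, y)

decision = clf.decision_function(X)                      # shape (n_samples, 3)
proba = softmax(decision / (clf.n_classes_ - 1), axis=1)
assert np.allclose(proba, clf.predict_proba(X))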
+ """ + check_is_fitted(self) + n_classes = self.n_classes_ + + if n_classes == 1: + return np.ones((_num_samples(X), 1)) + + decision = self.decision_function(X) + return self._compute_proba_from_decision(decision, n_classes) + + def staged_predict_proba(self, X): + """Predict class probabilities for X. + + The predicted class probabilities of an input sample is computed as + the weighted mean predicted class probabilities of the classifiers + in the ensemble. + + This generator method yields the ensemble predicted class probabilities + after each iteration of boosting and therefore allows monitoring, such + as to determine the predicted class probabilities on a test set after + each boost. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrix can be CSC, CSR, COO, + DOK, or LIL. COO, DOK, and LIL are converted to CSR. + + Yields + ------ + p : generator of ndarray of shape (n_samples,) + The class probabilities of the input samples. The order of + outputs is the same of that of the :term:`classes_` attribute. + """ + + n_classes = self.n_classes_ + + for decision in self.staged_decision_function(X): + yield self._compute_proba_from_decision(decision, n_classes) + + def predict_log_proba(self, X): + """Predict class log-probabilities for X. + + The predicted class log-probabilities of an input sample is computed as + the weighted mean predicted class log-probabilities of the classifiers + in the ensemble. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrix can be CSC, CSR, COO, + DOK, or LIL. COO, DOK, and LIL are converted to CSR. + + Returns + ------- + p : ndarray of shape (n_samples, n_classes) + The class probabilities of the input samples. The order of + outputs is the same of that of the :term:`classes_` attribute. + """ + return np.log(self.predict_proba(X)) + + +class AdaBoostRegressor(_RoutingNotSupportedMixin, RegressorMixin, BaseWeightBoosting): + """An AdaBoost regressor. + + An AdaBoost [1] regressor is a meta-estimator that begins by fitting a + regressor on the original dataset and then fits additional copies of the + regressor on the same dataset but where the weights of instances are + adjusted according to the error of the current prediction. As such, + subsequent regressors focus more on difficult cases. + + This class implements the algorithm known as AdaBoost.R2 [2]. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.14 + + Parameters + ---------- + estimator : object, default=None + The base estimator from which the boosted ensemble is built. + If ``None``, then the base estimator is + :class:`~sklearn.tree.DecisionTreeRegressor` initialized with + `max_depth=3`. + + .. versionadded:: 1.2 + `base_estimator` was renamed to `estimator`. + + n_estimators : int, default=50 + The maximum number of estimators at which boosting is terminated. + In case of perfect fit, the learning procedure is stopped early. + Values must be in the range `[1, inf)`. + + learning_rate : float, default=1.0 + Weight applied to each regressor at each boosting iteration. A higher + learning rate increases the contribution of each regressor. There is + a trade-off between the `learning_rate` and `n_estimators` parameters. + Values must be in the range `(0.0, inf)`. + + loss : {'linear', 'square', 'exponential'}, default='linear' + The loss function to use when updating the weights after each + boosting iteration. 
+ + random_state : int, RandomState instance or None, default=None + Controls the random seed given at each `estimator` at each + boosting iteration. + Thus, it is only used when `estimator` exposes a `random_state`. + In addition, it controls the bootstrap of the weights used to train the + `estimator` at each boosting iteration. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Attributes + ---------- + estimator_ : estimator + The base estimator from which the ensemble is grown. + + .. versionadded:: 1.2 + `base_estimator_` was renamed to `estimator_`. + + estimators_ : list of regressors + The collection of fitted sub-estimators. + + estimator_weights_ : ndarray of floats + Weights for each estimator in the boosted ensemble. + + estimator_errors_ : ndarray of floats + Regression error for each estimator in the boosted ensemble. + + feature_importances_ : ndarray of shape (n_features,) + The impurity-based feature importances if supported by the + ``estimator`` (when based on decision trees). + + Warning: impurity-based feature importances can be misleading for + high cardinality features (many unique values). See + :func:`sklearn.inspection.permutation_importance` as an alternative. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + AdaBoostClassifier : An AdaBoost classifier. + GradientBoostingRegressor : Gradient Boosting Classification Tree. + sklearn.tree.DecisionTreeRegressor : A decision tree regressor. + + References + ---------- + .. [1] Y. Freund, R. Schapire, "A Decision-Theoretic Generalization of + on-Line Learning and an Application to Boosting", 1995. + + .. [2] H. Drucker, "Improving Regressors using Boosting Techniques", 1997. + + Examples + -------- + >>> from sklearn.ensemble import AdaBoostRegressor + >>> from sklearn.datasets import make_regression + >>> X, y = make_regression(n_features=4, n_informative=2, + ... random_state=0, shuffle=False) + >>> regr = AdaBoostRegressor(random_state=0, n_estimators=100) + >>> regr.fit(X, y) + AdaBoostRegressor(n_estimators=100, random_state=0) + >>> regr.predict([[0, 0, 0, 0]]) + array([4.7972]) + >>> regr.score(X, y) + 0.9771 + + For a detailed example of utilizing :class:`~sklearn.ensemble.AdaBoostRegressor` + to fit a sequence of decision trees as weak learners, please refer to + :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_regression.py`. + """ + + _parameter_constraints: dict = { + **BaseWeightBoosting._parameter_constraints, + "loss": [StrOptions({"linear", "square", "exponential"})], + } + + def __init__( + self, + estimator=None, + *, + n_estimators=50, + learning_rate=1.0, + loss="linear", + random_state=None, + ): + super().__init__( + estimator=estimator, + n_estimators=n_estimators, + learning_rate=learning_rate, + random_state=random_state, + ) + + self.loss = loss + self.random_state = random_state + + def _validate_estimator(self): + """Check the estimator and set the estimator_ attribute.""" + super()._validate_estimator(default=DecisionTreeRegressor(max_depth=3)) + + def _boost(self, iboost, X, y, sample_weight, random_state): + """Implement a single boost for regression + + Perform a single boost according to the AdaBoost.R2 algorithm and + return the updated sample weights. 
+ + Parameters + ---------- + iboost : int + The index of the current boost iteration. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. + + y : array-like of shape (n_samples,) + The target values (class labels in classification, real numbers in + regression). + + sample_weight : array-like of shape (n_samples,) + The current sample weights. + + random_state : RandomState + The RandomState instance used if the base estimator accepts a + `random_state` attribute. + Controls also the bootstrap of the weights used to train the weak + learner. + + Returns + ------- + sample_weight : array-like of shape (n_samples,) or None + The reweighted sample weights. + If None then boosting has terminated early. + + estimator_weight : float + The weight for the current boost. + If None then boosting has terminated early. + + estimator_error : float + The regression error for the current boost. + If None then boosting has terminated early. + """ + estimator = self._make_estimator(random_state=random_state) + + # Weighted sampling of the training set with replacement + bootstrap_idx = random_state.choice( + np.arange(_num_samples(X)), + size=_num_samples(X), + replace=True, + p=sample_weight, + ) + + # Fit on the bootstrapped sample and obtain a prediction + # for all samples in the training set + X_ = _safe_indexing(X, bootstrap_idx) + y_ = _safe_indexing(y, bootstrap_idx) + estimator.fit(X_, y_) + y_predict = estimator.predict(X) + + error_vect = np.abs(y_predict - y) + sample_mask = sample_weight > 0 + masked_sample_weight = sample_weight[sample_mask] + masked_error_vector = error_vect[sample_mask] + + error_max = masked_error_vector.max() + if error_max != 0: + masked_error_vector /= error_max + + if self.loss == "square": + masked_error_vector **= 2 + elif self.loss == "exponential": + masked_error_vector = 1.0 - np.exp(-masked_error_vector) + + # Calculate the average loss + estimator_error = (masked_sample_weight * masked_error_vector).sum() + + if estimator_error <= 0: + # Stop if fit is perfect + return sample_weight, 1.0, 0.0 + + elif estimator_error >= 0.5: + # Discard current estimator only if it isn't the only one + if len(self.estimators_) > 1: + self.estimators_.pop(-1) + return None, None, None + + beta = estimator_error / (1.0 - estimator_error) + + # Boost weight using AdaBoost.R2 alg + estimator_weight = self.learning_rate * np.log(1.0 / beta) + + if not iboost == self.n_estimators - 1: + sample_weight[sample_mask] *= np.power( + beta, (1.0 - masked_error_vector) * self.learning_rate + ) + + return sample_weight, estimator_weight, estimator_error + + def _get_median_predict(self, X, limit): + # Evaluate predictions of all estimators + predictions = np.array([est.predict(X) for est in self.estimators_[:limit]]).T + + # Sort the predictions + sorted_idx = np.argsort(predictions, axis=1) + + # Find index of median prediction for each sample + weight_cdf = stable_cumsum(self.estimator_weights_[sorted_idx], axis=1) + median_or_above = weight_cdf >= 0.5 * weight_cdf[:, -1][:, np.newaxis] + median_idx = median_or_above.argmax(axis=1) + + median_estimators = sorted_idx[np.arange(_num_samples(X)), median_idx] + + # Return median predictions + return predictions[np.arange(_num_samples(X)), median_estimators] + + def predict(self, X): + """Predict regression value for X. + + The predicted regression value of an input sample is computed + as the weighted median prediction of the regressors in the ensemble. 
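# Editor's note: a standalone sketch (not part of the library source) of the weighted
# median rule implemented in `_get_median_predict` above: sort each sample's base
# predictions, take the cumulative sum of the corresponding estimator weights, and
# pick the first prediction whose cumulative weight reaches half of the total.
import numpy as np

def weighted_median(predictions, weights):
    """predictions: (n_samples, n_estimators); weights: (n_estimators,)."""
    order = np.argsort(predictions, axis=1)
    weight_cdf = np.cumsum(weights[order], axis=1)
    above = weight_cdf >= 0.5 * weight_cdf[:, -1][:, np.newaxis]
    median_idx = above.argmax(axis=1)
    median_estimators = order[np.arange(predictions.shape[0]), median_idx]
    return predictions[np.arange(predictions.shape[0]), median_estimators]

preds = np.array([[1.0, 2.0, 10.0], [0.0, 5.0, 6.0]])
w = np.array([1.0, 1.0, 1.0])
print(weighted_median(preds, w))  # [2., 5.] -- the per-sample (weighted) medians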
+ + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrix can be CSC, CSR, COO, + DOK, or LIL. COO, DOK, and LIL are converted to CSR. + + Returns + ------- + y : ndarray of shape (n_samples,) + The predicted regression values. + """ + check_is_fitted(self) + X = self._check_X(X) + + return self._get_median_predict(X, len(self.estimators_)) + + def staged_predict(self, X): + """Return staged predictions for X. + + The predicted regression value of an input sample is computed + as the weighted median prediction of the regressors in the ensemble. + + This generator method yields the ensemble prediction after each + iteration of boosting and therefore allows monitoring, such as to + determine the prediction on a test set after each boost. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. + + Yields + ------ + y : generator of ndarray of shape (n_samples,) + The predicted regression values. + """ + check_is_fitted(self) + X = self._check_X(X) + + for i, _ in enumerate(self.estimators_, 1): + yield self._get_median_predict(X, limit=i) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/meson.build b/.venv/lib/python3.12/site-packages/sklearn/ensemble/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..893a4eb1a510aeea0eecb26b38087eb35bbe93d8 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/meson.build @@ -0,0 +1,9 @@ +py.extension_module( + '_gradient_boosting', + [cython_gen.process('_gradient_boosting.pyx')] + utils_cython_tree, + dependencies: [np_dep], + subdir: 'sklearn/ensemble', + install: true +) + +subdir('_hist_gradient_boosting') diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_bagging.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_bagging.py new file mode 100644 index 0000000000000000000000000000000000000000..2cb9336bfd759321da57745beecacf3d46154551 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_bagging.py @@ -0,0 +1,1043 @@ +""" +Testing for the bagging ensemble module (sklearn.ensemble.bagging). 
+""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from itertools import cycle, product + +import joblib +import numpy as np +import pytest + +from sklearn import config_context +from sklearn.base import BaseEstimator +from sklearn.datasets import load_diabetes, load_iris, make_hastie_10_2 +from sklearn.dummy import DummyClassifier, DummyRegressor +from sklearn.ensemble import ( + AdaBoostClassifier, + AdaBoostRegressor, + BaggingClassifier, + BaggingRegressor, + HistGradientBoostingClassifier, + HistGradientBoostingRegressor, + RandomForestClassifier, + RandomForestRegressor, +) +from sklearn.feature_selection import SelectKBest +from sklearn.linear_model import LogisticRegression, Perceptron +from sklearn.model_selection import GridSearchCV, ParameterGrid, train_test_split +from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import FunctionTransformer, scale +from sklearn.random_projection import SparseRandomProjection +from sklearn.svm import SVC, SVR +from sklearn.tests.metadata_routing_common import ( + ConsumingClassifierWithOnlyPredict, + ConsumingClassifierWithoutPredictLogProba, + ConsumingClassifierWithoutPredictProba, + _Registry, + check_recorded_metadata, +) +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor +from sklearn.utils import check_random_state +from sklearn.utils._testing import assert_array_almost_equal, assert_array_equal +from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS + +rng = check_random_state(0) + +# also load the iris dataset +# and randomly permute it +iris = load_iris() +perm = rng.permutation(iris.target.size) +iris.data = iris.data[perm] +iris.target = iris.target[perm] + +# also load the diabetes dataset +# and randomly permute it +diabetes = load_diabetes() +perm = rng.permutation(diabetes.target.size) +diabetes.data = diabetes.data[perm] +diabetes.target = diabetes.target[perm] + + +def test_classification(): + # Check classification for various parameter settings. + rng = check_random_state(0) + X_train, X_test, y_train, y_test = train_test_split( + iris.data, iris.target, random_state=rng + ) + grid = ParameterGrid( + { + "max_samples": [0.5, 1.0], + "max_features": [1, 4], + "bootstrap": [True, False], + "bootstrap_features": [True, False], + } + ) + estimators = [ + None, + DummyClassifier(), + Perceptron(max_iter=20), + DecisionTreeClassifier(max_depth=2), + KNeighborsClassifier(), + SVC(), + ] + # Try different parameter settings with different base classifiers without + # doing the full cartesian product to keep the test durations low. 
+ for params, estimator in zip(grid, cycle(estimators)): + BaggingClassifier( + estimator=estimator, + random_state=rng, + n_estimators=2, + **params, + ).fit(X_train, y_train).predict(X_test) + + +@pytest.mark.parametrize( + "sparse_container, params, method", + product( + CSR_CONTAINERS + CSC_CONTAINERS, + [ + { + "max_samples": 0.5, + "max_features": 2, + "bootstrap": True, + "bootstrap_features": True, + }, + { + "max_samples": 1.0, + "max_features": 4, + "bootstrap": True, + "bootstrap_features": True, + }, + {"max_features": 2, "bootstrap": False, "bootstrap_features": True}, + {"max_samples": 0.5, "bootstrap": True, "bootstrap_features": False}, + ], + ["predict", "predict_proba", "predict_log_proba", "decision_function"], + ), +) +def test_sparse_classification(sparse_container, params, method): + # Check classification for various parameter settings on sparse input. + + class CustomSVC(SVC): + """SVC variant that records the nature of the training set""" + + def fit(self, X, y): + super().fit(X, y) + self.data_type_ = type(X) + return self + + rng = check_random_state(0) + X_train, X_test, y_train, y_test = train_test_split( + scale(iris.data), iris.target, random_state=rng + ) + + X_train_sparse = sparse_container(X_train) + X_test_sparse = sparse_container(X_test) + # Trained on sparse format + sparse_classifier = BaggingClassifier( + estimator=CustomSVC(kernel="linear", decision_function_shape="ovr"), + random_state=1, + **params, + ).fit(X_train_sparse, y_train) + sparse_results = getattr(sparse_classifier, method)(X_test_sparse) + + # Trained on dense format + dense_classifier = BaggingClassifier( + estimator=CustomSVC(kernel="linear", decision_function_shape="ovr"), + random_state=1, + **params, + ).fit(X_train, y_train) + dense_results = getattr(dense_classifier, method)(X_test) + assert_array_almost_equal(sparse_results, dense_results) + + sparse_type = type(X_train_sparse) + types = [i.data_type_ for i in sparse_classifier.estimators_] + + assert all([t == sparse_type for t in types]) + + +def test_regression(): + # Check regression for various parameter settings. + rng = check_random_state(0) + X_train, X_test, y_train, y_test = train_test_split( + diabetes.data[:50], diabetes.target[:50], random_state=rng + ) + grid = ParameterGrid( + { + "max_samples": [0.5, 1.0], + "max_features": [0.5, 1.0], + "bootstrap": [True, False], + "bootstrap_features": [True, False], + } + ) + + for estimator in [ + None, + DummyRegressor(), + DecisionTreeRegressor(), + KNeighborsRegressor(), + SVR(), + ]: + for params in grid: + BaggingRegressor(estimator=estimator, random_state=rng, **params).fit( + X_train, y_train + ).predict(X_test) + + +@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS) +def test_sparse_regression(sparse_container): + # Check regression for various parameter settings on sparse input. 
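+    # For each parameter setting below, an identical BaggingRegressor is
+    # fitted on a sparse copy and on the dense original of the same data;
+    # their predictions must match and every base estimator must have
+    # received sparse input.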
+ rng = check_random_state(0) + X_train, X_test, y_train, y_test = train_test_split( + diabetes.data[:50], diabetes.target[:50], random_state=rng + ) + + class CustomSVR(SVR): + """SVC variant that records the nature of the training set""" + + def fit(self, X, y): + super().fit(X, y) + self.data_type_ = type(X) + return self + + parameter_sets = [ + { + "max_samples": 0.5, + "max_features": 2, + "bootstrap": True, + "bootstrap_features": True, + }, + { + "max_samples": 1.0, + "max_features": 4, + "bootstrap": True, + "bootstrap_features": True, + }, + {"max_features": 2, "bootstrap": False, "bootstrap_features": True}, + {"max_samples": 0.5, "bootstrap": True, "bootstrap_features": False}, + ] + + X_train_sparse = sparse_container(X_train) + X_test_sparse = sparse_container(X_test) + for params in parameter_sets: + # Trained on sparse format + sparse_classifier = BaggingRegressor( + estimator=CustomSVR(), random_state=1, **params + ).fit(X_train_sparse, y_train) + sparse_results = sparse_classifier.predict(X_test_sparse) + + # Trained on dense format + dense_results = ( + BaggingRegressor(estimator=CustomSVR(), random_state=1, **params) + .fit(X_train, y_train) + .predict(X_test) + ) + + sparse_type = type(X_train_sparse) + types = [i.data_type_ for i in sparse_classifier.estimators_] + + assert_array_almost_equal(sparse_results, dense_results) + assert all([t == sparse_type for t in types]) + assert_array_almost_equal(sparse_results, dense_results) + + +class DummySizeEstimator(BaseEstimator): + def fit(self, X, y): + self.training_size_ = X.shape[0] + self.training_hash_ = joblib.hash(X) + + def predict(self, X): + return np.ones(X.shape[0]) + + +def test_bootstrap_samples(): + # Test that bootstrapping samples generate non-perfect base estimators. + rng = check_random_state(0) + X_train, X_test, y_train, y_test = train_test_split( + diabetes.data, diabetes.target, random_state=rng + ) + + estimator = DecisionTreeRegressor().fit(X_train, y_train) + + # without bootstrap, all trees are perfect on the training set + ensemble = BaggingRegressor( + estimator=DecisionTreeRegressor(), + max_samples=1.0, + bootstrap=False, + random_state=rng, + ).fit(X_train, y_train) + + assert estimator.score(X_train, y_train) == ensemble.score(X_train, y_train) + + # with bootstrap, trees are no longer perfect on the training set + ensemble = BaggingRegressor( + estimator=DecisionTreeRegressor(), + max_samples=1.0, + bootstrap=True, + random_state=rng, + ).fit(X_train, y_train) + + assert estimator.score(X_train, y_train) > ensemble.score(X_train, y_train) + + # check that each sampling correspond to a complete bootstrap resample. + # the size of each bootstrap should be the same as the input data but + # the data should be different (checked using the hash of the data). + ensemble = BaggingRegressor(estimator=DummySizeEstimator(), bootstrap=True).fit( + X_train, y_train + ) + training_hash = [] + for estimator in ensemble.estimators_: + assert estimator.training_size_ == X_train.shape[0] + training_hash.append(estimator.training_hash_) + assert len(set(training_hash)) == len(training_hash) + + +def test_bootstrap_features(): + # Test that bootstrapping features may generate duplicate features. 
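+    # With `bootstrap_features=False` each estimator draws every feature
+    # exactly once, so the number of unique sampled features equals
+    # n_features; with `bootstrap_features=True` features are drawn with
+    # replacement, so duplicates occur and fewer unique features appear.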
+ rng = check_random_state(0) + X_train, X_test, y_train, y_test = train_test_split( + diabetes.data, diabetes.target, random_state=rng + ) + + ensemble = BaggingRegressor( + estimator=DecisionTreeRegressor(), + max_features=1.0, + bootstrap_features=False, + random_state=rng, + ).fit(X_train, y_train) + + for features in ensemble.estimators_features_: + assert diabetes.data.shape[1] == np.unique(features).shape[0] + + ensemble = BaggingRegressor( + estimator=DecisionTreeRegressor(), + max_features=1.0, + bootstrap_features=True, + random_state=rng, + ).fit(X_train, y_train) + + for features in ensemble.estimators_features_: + assert diabetes.data.shape[1] > np.unique(features).shape[0] + + +def test_probability(): + # Predict probabilities. + rng = check_random_state(0) + X_train, X_test, y_train, y_test = train_test_split( + iris.data, iris.target, random_state=rng + ) + + with np.errstate(divide="ignore", invalid="ignore"): + # Normal case + ensemble = BaggingClassifier( + estimator=DecisionTreeClassifier(), random_state=rng + ).fit(X_train, y_train) + + assert_array_almost_equal( + np.sum(ensemble.predict_proba(X_test), axis=1), np.ones(len(X_test)) + ) + + assert_array_almost_equal( + ensemble.predict_proba(X_test), np.exp(ensemble.predict_log_proba(X_test)) + ) + + # Degenerate case, where some classes are missing + ensemble = BaggingClassifier( + estimator=LogisticRegression(), random_state=rng, max_samples=5 + ).fit(X_train, y_train) + + assert_array_almost_equal( + np.sum(ensemble.predict_proba(X_test), axis=1), np.ones(len(X_test)) + ) + + assert_array_almost_equal( + ensemble.predict_proba(X_test), np.exp(ensemble.predict_log_proba(X_test)) + ) + + +def test_oob_score_classification(): + # Check that oob prediction is a good estimation of the generalization + # error. + rng = check_random_state(0) + X_train, X_test, y_train, y_test = train_test_split( + iris.data, iris.target, random_state=rng + ) + + for estimator in [DecisionTreeClassifier(), SVC()]: + clf = BaggingClassifier( + estimator=estimator, + n_estimators=100, + bootstrap=True, + oob_score=True, + random_state=rng, + ).fit(X_train, y_train) + + test_score = clf.score(X_test, y_test) + + assert abs(test_score - clf.oob_score_) < 0.1 + + # Test with few estimators + warn_msg = ( + "Some inputs do not have OOB scores. This probably means too few " + "estimators were used to compute any reliable oob estimates." + ) + with pytest.warns(UserWarning, match=warn_msg): + clf = BaggingClassifier( + estimator=estimator, + n_estimators=1, + bootstrap=True, + oob_score=True, + random_state=rng, + ) + clf.fit(X_train, y_train) + + +def test_oob_score_regression(): + # Check that oob prediction is a good estimation of the generalization + # error. + rng = check_random_state(0) + X_train, X_test, y_train, y_test = train_test_split( + diabetes.data, diabetes.target, random_state=rng + ) + + clf = BaggingRegressor( + estimator=DecisionTreeRegressor(), + n_estimators=50, + bootstrap=True, + oob_score=True, + random_state=rng, + ).fit(X_train, y_train) + + test_score = clf.score(X_test, y_test) + + assert abs(test_score - clf.oob_score_) < 0.1 + + # Test with few estimators + warn_msg = ( + "Some inputs do not have OOB scores. This probably means too few " + "estimators were used to compute any reliable oob estimates." 
+ ) + with pytest.warns(UserWarning, match=warn_msg): + regr = BaggingRegressor( + estimator=DecisionTreeRegressor(), + n_estimators=1, + bootstrap=True, + oob_score=True, + random_state=rng, + ) + regr.fit(X_train, y_train) + + +def test_single_estimator(): + # Check singleton ensembles. + rng = check_random_state(0) + X_train, X_test, y_train, y_test = train_test_split( + diabetes.data, diabetes.target, random_state=rng + ) + + clf1 = BaggingRegressor( + estimator=KNeighborsRegressor(), + n_estimators=1, + bootstrap=False, + bootstrap_features=False, + random_state=rng, + ).fit(X_train, y_train) + + clf2 = KNeighborsRegressor().fit(X_train, y_train) + + assert_array_almost_equal(clf1.predict(X_test), clf2.predict(X_test)) + + +def test_error(): + # Test support of decision_function + X, y = iris.data, iris.target + base = DecisionTreeClassifier() + assert not hasattr(BaggingClassifier(base).fit(X, y), "decision_function") + + +def test_parallel_classification(): + # Check parallel classification. + X_train, X_test, y_train, y_test = train_test_split( + iris.data, iris.target, random_state=0 + ) + + ensemble = BaggingClassifier( + DecisionTreeClassifier(), n_jobs=3, random_state=0 + ).fit(X_train, y_train) + + # predict_proba + y1 = ensemble.predict_proba(X_test) + ensemble.set_params(n_jobs=1) + y2 = ensemble.predict_proba(X_test) + assert_array_almost_equal(y1, y2) + + ensemble = BaggingClassifier( + DecisionTreeClassifier(), n_jobs=1, random_state=0 + ).fit(X_train, y_train) + + y3 = ensemble.predict_proba(X_test) + assert_array_almost_equal(y1, y3) + + # decision_function + ensemble = BaggingClassifier( + SVC(decision_function_shape="ovr"), n_jobs=3, random_state=0 + ).fit(X_train, y_train) + + decisions1 = ensemble.decision_function(X_test) + ensemble.set_params(n_jobs=1) + decisions2 = ensemble.decision_function(X_test) + assert_array_almost_equal(decisions1, decisions2) + + ensemble = BaggingClassifier( + SVC(decision_function_shape="ovr"), n_jobs=1, random_state=0 + ).fit(X_train, y_train) + + decisions3 = ensemble.decision_function(X_test) + assert_array_almost_equal(decisions1, decisions3) + + +def test_parallel_regression(): + # Check parallel regression. + rng = check_random_state(0) + + X_train, X_test, y_train, y_test = train_test_split( + diabetes.data, diabetes.target, random_state=rng + ) + + ensemble = BaggingRegressor(DecisionTreeRegressor(), n_jobs=3, random_state=0).fit( + X_train, y_train + ) + + ensemble.set_params(n_jobs=1) + y1 = ensemble.predict(X_test) + ensemble.set_params(n_jobs=2) + y2 = ensemble.predict(X_test) + assert_array_almost_equal(y1, y2) + + ensemble = BaggingRegressor(DecisionTreeRegressor(), n_jobs=1, random_state=0).fit( + X_train, y_train + ) + + y3 = ensemble.predict(X_test) + assert_array_almost_equal(y1, y3) + + +def test_gridsearch(): + # Check that bagging ensembles can be grid-searched. + # Transform iris into a binary classification task + X, y = iris.data, iris.target + y[y == 2] = 1 + + # Grid search with scoring based on decision_function + parameters = {"n_estimators": (1, 2), "estimator__C": (1, 2)} + + GridSearchCV(BaggingClassifier(SVC()), parameters, scoring="roc_auc").fit(X, y) + + +def test_estimator(): + # Check estimator and its default values. 
+ rng = check_random_state(0) + + # Classification + X_train, X_test, y_train, y_test = train_test_split( + iris.data, iris.target, random_state=rng + ) + + ensemble = BaggingClassifier(None, n_jobs=3, random_state=0).fit(X_train, y_train) + + assert isinstance(ensemble.estimator_, DecisionTreeClassifier) + + ensemble = BaggingClassifier( + DecisionTreeClassifier(), n_jobs=3, random_state=0 + ).fit(X_train, y_train) + + assert isinstance(ensemble.estimator_, DecisionTreeClassifier) + + ensemble = BaggingClassifier(Perceptron(), n_jobs=3, random_state=0).fit( + X_train, y_train + ) + + assert isinstance(ensemble.estimator_, Perceptron) + + # Regression + X_train, X_test, y_train, y_test = train_test_split( + diabetes.data, diabetes.target, random_state=rng + ) + + ensemble = BaggingRegressor(None, n_jobs=3, random_state=0).fit(X_train, y_train) + + assert isinstance(ensemble.estimator_, DecisionTreeRegressor) + + ensemble = BaggingRegressor(DecisionTreeRegressor(), n_jobs=3, random_state=0).fit( + X_train, y_train + ) + + assert isinstance(ensemble.estimator_, DecisionTreeRegressor) + + ensemble = BaggingRegressor(SVR(), n_jobs=3, random_state=0).fit(X_train, y_train) + assert isinstance(ensemble.estimator_, SVR) + + +def test_bagging_with_pipeline(): + estimator = BaggingClassifier( + make_pipeline(SelectKBest(k=1), DecisionTreeClassifier()), max_features=2 + ) + estimator.fit(iris.data, iris.target) + assert isinstance(estimator[0].steps[-1][1].random_state, int) + + +class DummyZeroEstimator(BaseEstimator): + def fit(self, X, y): + self.classes_ = np.unique(y) + return self + + def predict(self, X): + return self.classes_[np.zeros(X.shape[0], dtype=int)] + + +def test_bagging_sample_weight_unsupported_but_passed(): + estimator = BaggingClassifier(DummyZeroEstimator()) + rng = check_random_state(0) + + estimator.fit(iris.data, iris.target).predict(iris.data) + with pytest.raises(ValueError): + estimator.fit( + iris.data, + iris.target, + sample_weight=rng.randint(10, size=(iris.data.shape[0])), + ) + + +def test_warm_start(random_state=42): + # Test if fitting incrementally with warm start gives a forest of the + # right size and the same results as a normal fit. + X, y = make_hastie_10_2(n_samples=20, random_state=1) + + clf_ws = None + for n_estimators in [5, 10]: + if clf_ws is None: + clf_ws = BaggingClassifier( + n_estimators=n_estimators, random_state=random_state, warm_start=True + ) + else: + clf_ws.set_params(n_estimators=n_estimators) + clf_ws.fit(X, y) + assert len(clf_ws) == n_estimators + + clf_no_ws = BaggingClassifier( + n_estimators=10, random_state=random_state, warm_start=False + ) + clf_no_ws.fit(X, y) + + assert set([tree.random_state for tree in clf_ws]) == set( + [tree.random_state for tree in clf_no_ws] + ) + + +def test_warm_start_smaller_n_estimators(): + # Test if warm start'ed second fit with smaller n_estimators raises error. 
+ X, y = make_hastie_10_2(n_samples=20, random_state=1) + clf = BaggingClassifier(n_estimators=5, warm_start=True) + clf.fit(X, y) + clf.set_params(n_estimators=4) + with pytest.raises(ValueError): + clf.fit(X, y) + + +def test_warm_start_equal_n_estimators(): + # Test that nothing happens when fitting without increasing n_estimators + X, y = make_hastie_10_2(n_samples=20, random_state=1) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43) + + clf = BaggingClassifier(n_estimators=5, warm_start=True, random_state=83) + clf.fit(X_train, y_train) + + y_pred = clf.predict(X_test) + # modify X to nonsense values, this should not change anything + X_train += 1.0 + + warn_msg = "Warm-start fitting without increasing n_estimators does not" + with pytest.warns(UserWarning, match=warn_msg): + clf.fit(X_train, y_train) + assert_array_equal(y_pred, clf.predict(X_test)) + + +def test_warm_start_equivalence(): + # warm started classifier with 5+5 estimators should be equivalent to + # one classifier with 10 estimators + X, y = make_hastie_10_2(n_samples=20, random_state=1) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43) + + clf_ws = BaggingClassifier(n_estimators=5, warm_start=True, random_state=3141) + clf_ws.fit(X_train, y_train) + clf_ws.set_params(n_estimators=10) + clf_ws.fit(X_train, y_train) + y1 = clf_ws.predict(X_test) + + clf = BaggingClassifier(n_estimators=10, warm_start=False, random_state=3141) + clf.fit(X_train, y_train) + y2 = clf.predict(X_test) + + assert_array_almost_equal(y1, y2) + + +def test_warm_start_with_oob_score_fails(): + # Check using oob_score and warm_start simultaneously fails + X, y = make_hastie_10_2(n_samples=20, random_state=1) + clf = BaggingClassifier(n_estimators=5, warm_start=True, oob_score=True) + with pytest.raises(ValueError): + clf.fit(X, y) + + +def test_oob_score_removed_on_warm_start(): + X, y = make_hastie_10_2(n_samples=100, random_state=1) + + clf = BaggingClassifier(n_estimators=5, oob_score=True) + clf.fit(X, y) + + clf.set_params(warm_start=True, oob_score=False, n_estimators=10) + clf.fit(X, y) + + with pytest.raises(AttributeError): + getattr(clf, "oob_score_") + + +def test_oob_score_consistency(): + # Make sure OOB scores are identical when random_state, estimator, and + # training data are fixed and fitting is done twice + X, y = make_hastie_10_2(n_samples=200, random_state=1) + bagging = BaggingClassifier( + KNeighborsClassifier(), + max_samples=0.5, + max_features=0.5, + oob_score=True, + random_state=1, + ) + assert bagging.fit(X, y).oob_score_ == bagging.fit(X, y).oob_score_ + + +def test_estimators_samples(): + # Check that format of estimators_samples_ is correct and that results + # generated at fit time can be identically reproduced at a later time + # using data saved in object attributes. 
+ X, y = make_hastie_10_2(n_samples=200, random_state=1) + bagging = BaggingClassifier( + LogisticRegression(), + max_samples=0.5, + max_features=0.5, + random_state=1, + bootstrap=False, + ) + bagging.fit(X, y) + + # Get relevant attributes + estimators_samples = bagging.estimators_samples_ + estimators_features = bagging.estimators_features_ + estimators = bagging.estimators_ + + # Test for correct formatting + assert len(estimators_samples) == len(estimators) + assert len(estimators_samples[0]) == len(X) // 2 + assert estimators_samples[0].dtype.kind == "i" + + # Re-fit single estimator to test for consistent sampling + estimator_index = 0 + estimator_samples = estimators_samples[estimator_index] + estimator_features = estimators_features[estimator_index] + estimator = estimators[estimator_index] + + X_train = (X[estimator_samples])[:, estimator_features] + y_train = y[estimator_samples] + + orig_coefs = estimator.coef_ + estimator.fit(X_train, y_train) + new_coefs = estimator.coef_ + + assert_array_almost_equal(orig_coefs, new_coefs) + + +def test_estimators_samples_deterministic(): + # This test is a regression test to check that with a random step + # (e.g. SparseRandomProjection) and a given random state, the results + # generated at fit time can be identically reproduced at a later time using + # data saved in object attributes. Check issue #9524 for full discussion. + + iris = load_iris() + X, y = iris.data, iris.target + + base_pipeline = make_pipeline( + SparseRandomProjection(n_components=2), LogisticRegression() + ) + clf = BaggingClassifier(estimator=base_pipeline, max_samples=0.5, random_state=0) + clf.fit(X, y) + pipeline_estimator_coef = clf.estimators_[0].steps[-1][1].coef_.copy() + + estimator = clf.estimators_[0] + estimator_sample = clf.estimators_samples_[0] + estimator_feature = clf.estimators_features_[0] + + X_train = (X[estimator_sample])[:, estimator_feature] + y_train = y[estimator_sample] + + estimator.fit(X_train, y_train) + assert_array_equal(estimator.steps[-1][1].coef_, pipeline_estimator_coef) + + +def test_max_samples_consistency(): + # Make sure validated max_samples and original max_samples are identical + # when valid integer max_samples supplied by user + max_samples = 100 + X, y = make_hastie_10_2(n_samples=2 * max_samples, random_state=1) + bagging = BaggingClassifier( + KNeighborsClassifier(), + max_samples=max_samples, + max_features=0.5, + random_state=1, + ) + bagging.fit(X, y) + assert bagging._max_samples == max_samples + + +def test_set_oob_score_label_encoding(): + # Make sure the oob_score doesn't change when the labels change + # See: https://github.com/scikit-learn/scikit-learn/issues/8933 + random_state = 5 + X = [[-1], [0], [1]] * 5 + Y1 = ["A", "B", "C"] * 5 + Y2 = [-1, 0, 1] * 5 + Y3 = [0, 1, 2] * 5 + x1 = ( + BaggingClassifier(oob_score=True, random_state=random_state) + .fit(X, Y1) + .oob_score_ + ) + x2 = ( + BaggingClassifier(oob_score=True, random_state=random_state) + .fit(X, Y2) + .oob_score_ + ) + x3 = ( + BaggingClassifier(oob_score=True, random_state=random_state) + .fit(X, Y3) + .oob_score_ + ) + assert [x1, x2] == [x3, x3] + + +def replace(X): + X = X.astype("float", copy=True) + X[~np.isfinite(X)] = 0 + return X + + +def test_bagging_regressor_with_missing_inputs(): + # Check that BaggingRegressor can accept X with missing/infinite data + X = np.array( + [ + [1, 3, 5], + [2, None, 6], + [2, np.nan, 6], + [2, np.inf, 6], + [2, -np.inf, 6], + ] + ) + y_values = [ + np.array([2, 3, 3, 3, 3]), + np.array( + [ + [2, 1, 9], + 
[3, 6, 8], + [3, 6, 8], + [3, 6, 8], + [3, 6, 8], + ] + ), + ] + for y in y_values: + regressor = DecisionTreeRegressor() + pipeline = make_pipeline(FunctionTransformer(replace), regressor) + pipeline.fit(X, y).predict(X) + bagging_regressor = BaggingRegressor(pipeline) + y_hat = bagging_regressor.fit(X, y).predict(X) + assert y.shape == y_hat.shape + + # Verify that exceptions can be raised by wrapper regressor + regressor = DecisionTreeRegressor() + pipeline = make_pipeline(regressor) + with pytest.raises(ValueError): + pipeline.fit(X, y) + bagging_regressor = BaggingRegressor(pipeline) + with pytest.raises(ValueError): + bagging_regressor.fit(X, y) + + +def test_bagging_classifier_with_missing_inputs(): + # Check that BaggingClassifier can accept X with missing/infinite data + X = np.array( + [ + [1, 3, 5], + [2, None, 6], + [2, np.nan, 6], + [2, np.inf, 6], + [2, -np.inf, 6], + ] + ) + y = np.array([3, 6, 6, 6, 6]) + classifier = DecisionTreeClassifier() + pipeline = make_pipeline(FunctionTransformer(replace), classifier) + pipeline.fit(X, y).predict(X) + bagging_classifier = BaggingClassifier(pipeline) + bagging_classifier.fit(X, y) + y_hat = bagging_classifier.predict(X) + assert y.shape == y_hat.shape + bagging_classifier.predict_log_proba(X) + bagging_classifier.predict_proba(X) + + # Verify that exceptions can be raised by wrapper classifier + classifier = DecisionTreeClassifier() + pipeline = make_pipeline(classifier) + with pytest.raises(ValueError): + pipeline.fit(X, y) + bagging_classifier = BaggingClassifier(pipeline) + with pytest.raises(ValueError): + bagging_classifier.fit(X, y) + + +def test_bagging_small_max_features(): + # Check that Bagging estimator can accept low fractional max_features + + X = np.array([[1, 2], [3, 4]]) + y = np.array([1, 0]) + + bagging = BaggingClassifier(LogisticRegression(), max_features=0.3, random_state=1) + bagging.fit(X, y) + + +def test_bagging_get_estimators_indices(global_random_seed): + # Check that Bagging estimator can generate sample indices properly + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/16436 + + rng = np.random.RandomState(global_random_seed) + X = rng.randn(13, 4) + y = np.arange(13) + + class MyEstimator(DecisionTreeRegressor): + """An estimator which stores y indices information at fit.""" + + def fit(self, X, y): + self._sample_indices = y + + clf = BaggingRegressor(estimator=MyEstimator(), n_estimators=1, random_state=0) + clf.fit(X, y) + + assert_array_equal(clf.estimators_[0]._sample_indices, clf.estimators_samples_[0]) + + +@pytest.mark.parametrize( + "bagging, expected_allow_nan", + [ + (BaggingClassifier(HistGradientBoostingClassifier(max_iter=1)), True), + (BaggingRegressor(HistGradientBoostingRegressor(max_iter=1)), True), + (BaggingClassifier(LogisticRegression()), False), + (BaggingRegressor(SVR()), False), + ], +) +def test_bagging_allow_nan_tag(bagging, expected_allow_nan): + """Check that bagging inherits allow_nan tag.""" + assert bagging.__sklearn_tags__().input_tags.allow_nan == expected_allow_nan + + +# Metadata Routing Tests +# ====================== + + +@config_context(enable_metadata_routing=True) +@pytest.mark.parametrize( + "model", + [ + BaggingClassifier( + estimator=RandomForestClassifier(n_estimators=1), n_estimators=1 + ), + BaggingRegressor( + estimator=RandomForestRegressor(n_estimators=1), n_estimators=1 + ), + ], +) +def test_bagging_with_metadata_routing(model): + """Make sure that metadata routing works with non-default estimator.""" + 
model.fit(iris.data, iris.target) + + +@pytest.mark.parametrize( + "sub_estimator, caller, callee", + [ + (ConsumingClassifierWithoutPredictProba, "predict", "predict"), + ( + ConsumingClassifierWithoutPredictLogProba, + "predict_log_proba", + "predict_proba", + ), + (ConsumingClassifierWithOnlyPredict, "predict_log_proba", "predict"), + ], +) +@config_context(enable_metadata_routing=True) +def test_metadata_routing_with_dynamic_method_selection(sub_estimator, caller, callee): + """Test that metadata routing works in `BaggingClassifier` with dynamic selection of + the sub-estimator's methods. Here we test only specific test cases, where + sub-estimator methods are not present and are not tested with `ConsumingClassifier` + (which possesses all the methods) in + sklearn/tests/test_metaestimators_metadata_routing.py: `BaggingClassifier.predict()` + dynamically routes to `predict` if the sub-estimator doesn't have `predict_proba` + and `BaggingClassifier.predict_log_proba()` dynamically routes to `predict_proba` if + the sub-estimator doesn't have `predict_log_proba`, or to `predict`, if it doesn't + have it. + """ + X = np.array([[0, 2], [1, 4], [2, 6]]) + y = [1, 2, 3] + sample_weight, metadata = [1], "a" + registry = _Registry() + estimator = sub_estimator(registry=registry) + set_callee_request = "set_" + callee + "_request" + getattr(estimator, set_callee_request)(sample_weight=True, metadata=True) + + bagging = BaggingClassifier(estimator=estimator) + bagging.fit(X, y) + getattr(bagging, caller)( + X=np.array([[1, 1], [1, 3], [0, 2]]), + sample_weight=sample_weight, + metadata=metadata, + ) + + assert len(registry) + for estimator in registry: + check_recorded_metadata( + obj=estimator, + method=callee, + parent=caller, + sample_weight=sample_weight, + metadata=metadata, + ) + + +# End of Metadata Routing Tests +# ============================= + + +@pytest.mark.parametrize( + "model", + [ + BaggingClassifier( + estimator=AdaBoostClassifier(n_estimators=1), + n_estimators=1, + ), + BaggingRegressor(estimator=AdaBoostRegressor(n_estimators=1), n_estimators=1), + ], +) +def test_bagging_without_support_metadata_routing(model): + """Make sure that we still can use an estimator that does not implement the + metadata routing.""" + model.fit(iris.data, iris.target) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_base.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_base.py new file mode 100644 index 0000000000000000000000000000000000000000..95a852b8a7cc50e3b4440461e7ed5f5facde3e69 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_base.py @@ -0,0 +1,109 @@ +""" +Testing for the base module (sklearn.ensemble.base). +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from collections import OrderedDict + +import numpy as np + +from sklearn.datasets import load_iris +from sklearn.discriminant_analysis import LinearDiscriminantAnalysis +from sklearn.ensemble import BaggingClassifier +from sklearn.ensemble._base import _set_random_states +from sklearn.feature_selection import SelectFromModel +from sklearn.linear_model import Perceptron +from sklearn.pipeline import Pipeline + + +def test_base(): + # Check BaseEnsemble methods. 
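+    # `_make_estimator` clones the base estimator and appends the clone to
+    # `estimators_` (unless `append=False`); when a RandomState instance is
+    # passed, each clone receives a freshly drawn integer `random_state`.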
+ ensemble = BaggingClassifier( + estimator=Perceptron(random_state=None), n_estimators=3 + ) + + iris = load_iris() + ensemble.fit(iris.data, iris.target) + ensemble.estimators_ = [] # empty the list and create estimators manually + + ensemble._make_estimator() + random_state = np.random.RandomState(3) + ensemble._make_estimator(random_state=random_state) + ensemble._make_estimator(random_state=random_state) + ensemble._make_estimator(append=False) + + assert 3 == len(ensemble) + assert 3 == len(ensemble.estimators_) + + assert isinstance(ensemble[0], Perceptron) + assert ensemble[0].random_state is None + assert isinstance(ensemble[1].random_state, int) + assert isinstance(ensemble[2].random_state, int) + assert ensemble[1].random_state != ensemble[2].random_state + + np_int_ensemble = BaggingClassifier( + estimator=Perceptron(), n_estimators=np.int32(3) + ) + np_int_ensemble.fit(iris.data, iris.target) + + +def test_set_random_states(): + # Linear Discriminant Analysis doesn't have random state: smoke test + _set_random_states(LinearDiscriminantAnalysis(), random_state=17) + + clf1 = Perceptron(random_state=None) + assert clf1.random_state is None + # check random_state is None still sets + _set_random_states(clf1, None) + assert isinstance(clf1.random_state, int) + + # check random_state fixes results in consistent initialisation + _set_random_states(clf1, 3) + assert isinstance(clf1.random_state, int) + clf2 = Perceptron(random_state=None) + _set_random_states(clf2, 3) + assert clf1.random_state == clf2.random_state + + # nested random_state + + def make_steps(): + return [ + ("sel", SelectFromModel(Perceptron(random_state=None))), + ("clf", Perceptron(random_state=None)), + ] + + est1 = Pipeline(make_steps()) + _set_random_states(est1, 3) + assert isinstance(est1.steps[0][1].estimator.random_state, int) + assert isinstance(est1.steps[1][1].random_state, int) + assert ( + est1.get_params()["sel__estimator__random_state"] + != est1.get_params()["clf__random_state"] + ) + + # ensure multiple random_state parameters are invariant to get_params() + # iteration order + + class AlphaParamPipeline(Pipeline): + def get_params(self, *args, **kwargs): + params = Pipeline.get_params(self, *args, **kwargs).items() + return OrderedDict(sorted(params)) + + class RevParamPipeline(Pipeline): + def get_params(self, *args, **kwargs): + params = Pipeline.get_params(self, *args, **kwargs).items() + return OrderedDict(sorted(params, reverse=True)) + + for cls in [AlphaParamPipeline, RevParamPipeline]: + est2 = cls(make_steps()) + _set_random_states(est2, 3) + assert ( + est1.get_params()["sel__estimator__random_state"] + == est2.get_params()["sel__estimator__random_state"] + ) + assert ( + est1.get_params()["clf__random_state"] + == est2.get_params()["clf__random_state"] + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_common.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_common.py new file mode 100644 index 0000000000000000000000000000000000000000..6e83512ccd1d673951655c4572ac294fdda52af2 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_common.py @@ -0,0 +1,262 @@ +import numpy as np +import pytest + +from sklearn.base import ClassifierMixin, clone, is_classifier +from sklearn.datasets import ( + load_diabetes, + load_iris, + make_classification, + make_regression, +) +from sklearn.ensemble import ( + RandomForestClassifier, + RandomForestRegressor, + StackingClassifier, + StackingRegressor, + VotingClassifier, + 
VotingRegressor, +) +from sklearn.impute import SimpleImputer +from sklearn.linear_model import LinearRegression, LogisticRegression +from sklearn.pipeline import make_pipeline +from sklearn.svm import SVC, SVR, LinearSVC, LinearSVR + +X, y = load_iris(return_X_y=True) + +X_r, y_r = load_diabetes(return_X_y=True) + + +@pytest.mark.parametrize( + "X, y, estimator", + [ + ( + *make_classification(n_samples=10), + StackingClassifier( + estimators=[ + ("lr", LogisticRegression()), + ("svm", LinearSVC()), + ("rf", RandomForestClassifier(n_estimators=5, max_depth=3)), + ], + cv=2, + ), + ), + ( + *make_classification(n_samples=10), + VotingClassifier( + estimators=[ + ("lr", LogisticRegression()), + ("svm", LinearSVC()), + ("rf", RandomForestClassifier(n_estimators=5, max_depth=3)), + ] + ), + ), + ( + *make_regression(n_samples=10), + StackingRegressor( + estimators=[ + ("lr", LinearRegression()), + ("svm", LinearSVR()), + ("rf", RandomForestRegressor(n_estimators=5, max_depth=3)), + ], + cv=2, + ), + ), + ( + *make_regression(n_samples=10), + VotingRegressor( + estimators=[ + ("lr", LinearRegression()), + ("svm", LinearSVR()), + ("rf", RandomForestRegressor(n_estimators=5, max_depth=3)), + ] + ), + ), + ], + ids=[ + "stacking-classifier", + "voting-classifier", + "stacking-regressor", + "voting-regressor", + ], +) +def test_ensemble_heterogeneous_estimators_behavior(X, y, estimator): + # check that the behavior of `estimators`, `estimators_`, + # `named_estimators`, `named_estimators_` is consistent across all + # ensemble classes and when using `set_params()`. + + # before fit + assert "svm" in estimator.named_estimators + assert estimator.named_estimators.svm is estimator.estimators[1][1] + assert estimator.named_estimators.svm is estimator.named_estimators["svm"] + + # check fitted attributes + estimator.fit(X, y) + assert len(estimator.named_estimators) == 3 + assert len(estimator.named_estimators_) == 3 + assert sorted(list(estimator.named_estimators_.keys())) == sorted( + ["lr", "svm", "rf"] + ) + + # check that set_params() does not add a new attribute + estimator_new_params = clone(estimator) + svm_estimator = SVC() if is_classifier(estimator) else SVR() + estimator_new_params.set_params(svm=svm_estimator).fit(X, y) + assert not hasattr(estimator_new_params, "svm") + assert ( + estimator_new_params.named_estimators.lr.get_params() + == estimator.named_estimators.lr.get_params() + ) + assert ( + estimator_new_params.named_estimators.rf.get_params() + == estimator.named_estimators.rf.get_params() + ) + + # check the behavior when setting an dropping an estimator + estimator_dropped = clone(estimator) + estimator_dropped.set_params(svm="drop") + estimator_dropped.fit(X, y) + assert len(estimator_dropped.named_estimators) == 3 + assert estimator_dropped.named_estimators.svm == "drop" + assert len(estimator_dropped.named_estimators_) == 3 + assert sorted(list(estimator_dropped.named_estimators_.keys())) == sorted( + ["lr", "svm", "rf"] + ) + for sub_est in estimator_dropped.named_estimators_: + # check that the correspondence is correct + assert not isinstance(sub_est, type(estimator.named_estimators.svm)) + + # check that we can set the parameters of the underlying classifier + estimator.set_params(svm__C=10.0) + estimator.set_params(rf__max_depth=5) + assert ( + estimator.get_params()["svm__C"] + == estimator.get_params()["svm"].get_params()["C"] + ) + assert ( + estimator.get_params()["rf__max_depth"] + == estimator.get_params()["rf"].get_params()["max_depth"] + ) + + 
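+
+
+# A minimal usage sketch rather than an actual scikit-learn test (the helper
+# name below is made up for illustration): it restates the public behaviour
+# verified above -- estimator names double as `set_params` prefixes for
+# nested parameters, and assigning "drop" removes a member at fit time.
+def _example_heterogeneous_ensemble_params():
+    X, y = make_classification(n_samples=10, random_state=0)
+    clf = VotingClassifier(
+        estimators=[("lr", LogisticRegression()), ("svm", LinearSVC())]
+    )
+    clf.set_params(svm__C=10.0)  # reaches the nested LinearSVC's `C`
+    clf.set_params(svm="drop")  # the SVM member is skipped when fitting
+    clf.fit(X, y)
+    # dropped members stay listed, mapped to the string "drop"
+    assert clf.named_estimators_["svm"] == "drop"
+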
+@pytest.mark.parametrize( + "Ensemble", + [VotingClassifier, StackingRegressor, VotingRegressor], +) +def test_ensemble_heterogeneous_estimators_type(Ensemble): + # check that ensemble will fail during validation if the underlying + # estimators are not of the same type (i.e. classifier or regressor) + # StackingClassifier can have an underlying regresor so it's not checked + if issubclass(Ensemble, ClassifierMixin): + X, y = make_classification(n_samples=10) + estimators = [("lr", LinearRegression())] + ensemble_type = "classifier" + else: + X, y = make_regression(n_samples=10) + estimators = [("lr", LogisticRegression())] + ensemble_type = "regressor" + ensemble = Ensemble(estimators=estimators) + + err_msg = "should be a {}".format(ensemble_type) + with pytest.raises(ValueError, match=err_msg): + ensemble.fit(X, y) + + +@pytest.mark.parametrize( + "X, y, Ensemble", + [ + (*make_classification(n_samples=10), StackingClassifier), + (*make_classification(n_samples=10), VotingClassifier), + (*make_regression(n_samples=10), StackingRegressor), + (*make_regression(n_samples=10), VotingRegressor), + ], +) +def test_ensemble_heterogeneous_estimators_name_validation(X, y, Ensemble): + # raise an error when the name contains dunder + if issubclass(Ensemble, ClassifierMixin): + estimators = [("lr__", LogisticRegression())] + else: + estimators = [("lr__", LinearRegression())] + ensemble = Ensemble(estimators=estimators) + + err_msg = r"Estimator names must not contain __: got \['lr__'\]" + with pytest.raises(ValueError, match=err_msg): + ensemble.fit(X, y) + + # raise an error when the name is not unique + if issubclass(Ensemble, ClassifierMixin): + estimators = [("lr", LogisticRegression()), ("lr", LogisticRegression())] + else: + estimators = [("lr", LinearRegression()), ("lr", LinearRegression())] + ensemble = Ensemble(estimators=estimators) + + err_msg = r"Names provided are not unique: \['lr', 'lr'\]" + with pytest.raises(ValueError, match=err_msg): + ensemble.fit(X, y) + + # raise an error when the name conflicts with the parameters + if issubclass(Ensemble, ClassifierMixin): + estimators = [("estimators", LogisticRegression())] + else: + estimators = [("estimators", LinearRegression())] + ensemble = Ensemble(estimators=estimators) + + err_msg = "Estimator names conflict with constructor arguments" + with pytest.raises(ValueError, match=err_msg): + ensemble.fit(X, y) + + +@pytest.mark.parametrize( + "X, y, estimator", + [ + ( + *make_classification(n_samples=10), + StackingClassifier(estimators=[("lr", LogisticRegression())]), + ), + ( + *make_classification(n_samples=10), + VotingClassifier(estimators=[("lr", LogisticRegression())]), + ), + ( + *make_regression(n_samples=10), + StackingRegressor(estimators=[("lr", LinearRegression())]), + ), + ( + *make_regression(n_samples=10), + VotingRegressor(estimators=[("lr", LinearRegression())]), + ), + ], + ids=[ + "stacking-classifier", + "voting-classifier", + "stacking-regressor", + "voting-regressor", + ], +) +def test_ensemble_heterogeneous_estimators_all_dropped(X, y, estimator): + # check that we raise a consistent error when all estimators are + # dropped + estimator.set_params(lr="drop") + with pytest.raises(ValueError, match="All estimators are dropped."): + estimator.fit(X, y) + + +@pytest.mark.parametrize( + "Ensemble, Estimator, X, y", + [ + (StackingClassifier, LogisticRegression, X, y), + (StackingRegressor, LinearRegression, X_r, y_r), + (VotingClassifier, LogisticRegression, X, y), + (VotingRegressor, LinearRegression, X_r, y_r), 
+ ], +) +# FIXME: we should move this test in `estimator_checks` once we are able +# to construct meta-estimator instances +def test_heterogeneous_ensemble_support_missing_values(Ensemble, Estimator, X, y): + # check that Voting and Stacking predictor delegate the missing values + # validation to the underlying estimator. + X = X.copy() + mask = np.random.choice([1, 0], X.shape, p=[0.1, 0.9]).astype(bool) + X[mask] = np.nan + pipe = make_pipeline(SimpleImputer(), Estimator()) + ensemble = Ensemble(estimators=[("pipe1", pipe), ("pipe2", pipe)]) + ensemble.fit(X, y).score(X, y) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_forest.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_forest.py new file mode 100644 index 0000000000000000000000000000000000000000..5dec5c7ab90b2635aa58f0735859a6fdfea7d0ca --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_forest.py @@ -0,0 +1,1865 @@ +""" +Testing for the forest module (sklearn.ensemble.forest). +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import itertools +import math +import pickle +from collections import defaultdict +from functools import partial +from itertools import combinations, product +from typing import Any, Dict +from unittest.mock import patch + +import joblib +import numpy as np +import pytest +from scipy.special import comb + +import sklearn +from sklearn import clone, datasets +from sklearn.datasets import make_classification, make_hastie_10_2 +from sklearn.decomposition import TruncatedSVD +from sklearn.dummy import DummyRegressor +from sklearn.ensemble import ( + ExtraTreesClassifier, + ExtraTreesRegressor, + RandomForestClassifier, + RandomForestRegressor, + RandomTreesEmbedding, +) +from sklearn.ensemble._forest import ( + _generate_unsampled_indices, + _get_n_samples_bootstrap, +) +from sklearn.exceptions import NotFittedError +from sklearn.metrics import ( + explained_variance_score, + f1_score, + mean_poisson_deviance, + mean_squared_error, +) +from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split +from sklearn.svm import LinearSVC +from sklearn.tree._classes import SPARSE_SPLITTERS +from sklearn.utils._testing import ( + _convert_container, + assert_allclose, + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, + ignore_warnings, + skip_if_no_parallel, +) +from sklearn.utils.fixes import COO_CONTAINERS, CSC_CONTAINERS, CSR_CONTAINERS +from sklearn.utils.multiclass import type_of_target +from sklearn.utils.parallel import Parallel +from sklearn.utils.validation import check_random_state + +# toy sample +X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] +y = [-1, -1, -1, 1, 1, 1] +T = [[-1, -1], [2, 2], [3, 2]] +true_result = [-1, 1, 1] + +# Larger classification sample used for testing feature importances +X_large, y_large = datasets.make_classification( + n_samples=500, + n_features=10, + n_informative=3, + n_redundant=0, + n_repeated=0, + shuffle=False, + random_state=0, +) + +# also load the iris dataset +# and randomly permute it +iris = datasets.load_iris() +rng = check_random_state(0) +perm = rng.permutation(iris.target.size) +iris.data = iris.data[perm] +iris.target = iris.target[perm] + +# Make regression dataset +X_reg, y_reg = datasets.make_regression(n_samples=500, n_features=10, random_state=1) + +# also make a hastie_10_2 dataset +hastie_X, hastie_y = datasets.make_hastie_10_2(n_samples=20, random_state=1) +hastie_X = 
hastie_X.astype(np.float32) + +# Get the default backend in joblib to test parallelism and interaction with +# different backends +DEFAULT_JOBLIB_BACKEND = joblib.parallel.get_active_backend()[0].__class__ + +FOREST_CLASSIFIERS = { + "ExtraTreesClassifier": ExtraTreesClassifier, + "RandomForestClassifier": RandomForestClassifier, +} + +FOREST_REGRESSORS = { + "ExtraTreesRegressor": ExtraTreesRegressor, + "RandomForestRegressor": RandomForestRegressor, +} + +FOREST_TRANSFORMERS = { + "RandomTreesEmbedding": RandomTreesEmbedding, +} + +FOREST_ESTIMATORS: Dict[str, Any] = dict() +FOREST_ESTIMATORS.update(FOREST_CLASSIFIERS) +FOREST_ESTIMATORS.update(FOREST_REGRESSORS) +FOREST_ESTIMATORS.update(FOREST_TRANSFORMERS) + +FOREST_CLASSIFIERS_REGRESSORS: Dict[str, Any] = FOREST_CLASSIFIERS.copy() +FOREST_CLASSIFIERS_REGRESSORS.update(FOREST_REGRESSORS) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS) +def test_classification_toy(name): + """Check classification on a toy dataset.""" + ForestClassifier = FOREST_CLASSIFIERS[name] + + clf = ForestClassifier(n_estimators=10, random_state=1) + clf.fit(X, y) + assert_array_equal(clf.predict(T), true_result) + assert 10 == len(clf) + + clf = ForestClassifier(n_estimators=10, max_features=1, random_state=1) + clf.fit(X, y) + assert_array_equal(clf.predict(T), true_result) + assert 10 == len(clf) + + # also test apply + leaf_indices = clf.apply(X) + assert leaf_indices.shape == (len(X), clf.n_estimators) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS) +@pytest.mark.parametrize("criterion", ("gini", "log_loss")) +def test_iris_criterion(name, criterion): + # Check consistency on dataset iris. + ForestClassifier = FOREST_CLASSIFIERS[name] + + clf = ForestClassifier(n_estimators=10, criterion=criterion, random_state=1) + clf.fit(iris.data, iris.target) + score = clf.score(iris.data, iris.target) + assert score > 0.9, "Failed with criterion %s and score = %f" % (criterion, score) + + clf = ForestClassifier( + n_estimators=10, criterion=criterion, max_features=2, random_state=1 + ) + clf.fit(iris.data, iris.target) + score = clf.score(iris.data, iris.target) + assert score > 0.5, "Failed with criterion %s and score = %f" % (criterion, score) + + +@pytest.mark.parametrize("name", FOREST_REGRESSORS) +@pytest.mark.parametrize( + "criterion", ("squared_error", "absolute_error", "friedman_mse") +) +def test_regression_criterion(name, criterion): + # Check consistency on regression dataset. + ForestRegressor = FOREST_REGRESSORS[name] + + reg = ForestRegressor(n_estimators=5, criterion=criterion, random_state=1) + reg.fit(X_reg, y_reg) + score = reg.score(X_reg, y_reg) + assert score > 0.93, ( + "Failed with max_features=None, criterion %s and score = %f" + % ( + criterion, + score, + ) + ) + + reg = ForestRegressor( + n_estimators=5, criterion=criterion, max_features=6, random_state=1 + ) + reg.fit(X_reg, y_reg) + score = reg.score(X_reg, y_reg) + assert score > 0.92, "Failed with max_features=6, criterion %s and score = %f" % ( + criterion, + score, + ) + + +def test_poisson_vs_mse(): + """Test that random forest with poisson criterion performs better than + mse for a poisson target. + + There is a similar test for DecisionTreeRegressor. + """ + rng = np.random.RandomState(42) + n_train, n_test, n_features = 500, 500, 10 + X = datasets.make_low_rank_matrix( + n_samples=n_train + n_test, n_features=n_features, random_state=rng + ) + # We create a log-linear Poisson model and downscale coef as it will get + # exponentiated. 
+ coef = rng.uniform(low=-2, high=2, size=n_features) / np.max(X, axis=0) + y = rng.poisson(lam=np.exp(X @ coef)) + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=n_test, random_state=rng + ) + # We prevent some overfitting by setting min_samples_split=10. + forest_poi = RandomForestRegressor( + criterion="poisson", min_samples_leaf=10, max_features="sqrt", random_state=rng + ) + forest_mse = RandomForestRegressor( + criterion="squared_error", + min_samples_leaf=10, + max_features="sqrt", + random_state=rng, + ) + + forest_poi.fit(X_train, y_train) + forest_mse.fit(X_train, y_train) + dummy = DummyRegressor(strategy="mean").fit(X_train, y_train) + + for X, y, data_name in [(X_train, y_train, "train"), (X_test, y_test, "test")]: + metric_poi = mean_poisson_deviance(y, forest_poi.predict(X)) + # squared_error forest might produce non-positive predictions => clip + # If y = 0 for those, the poisson deviance gets too good. + # If we drew more samples, we would eventually get y > 0 and the + # poisson deviance would explode, i.e. be undefined. Therefore, we do + # not clip to a tiny value like 1e-15, but to 1e-6. This acts like a + # small penalty to the non-positive predictions. + metric_mse = mean_poisson_deviance( + y, np.clip(forest_mse.predict(X), 1e-6, None) + ) + metric_dummy = mean_poisson_deviance(y, dummy.predict(X)) + # As squared_error might correctly predict 0 in train set, its train + # score can be better than Poisson. This is no longer the case for the + # test set. But keep the above comment for clipping in mind. + if data_name == "test": + assert metric_poi < metric_mse + assert metric_poi < 0.8 * metric_dummy + + +@pytest.mark.parametrize("criterion", ("poisson", "squared_error")) +def test_balance_property_random_forest(criterion): + """ "Test that sum(y_pred)==sum(y_true) on the training set.""" + rng = np.random.RandomState(42) + n_train, n_test, n_features = 500, 500, 10 + X = datasets.make_low_rank_matrix( + n_samples=n_train + n_test, n_features=n_features, random_state=rng + ) + + coef = rng.uniform(low=-2, high=2, size=n_features) / np.max(X, axis=0) + y = rng.poisson(lam=np.exp(X @ coef)) + + reg = RandomForestRegressor( + criterion=criterion, n_estimators=10, bootstrap=False, random_state=rng + ) + reg.fit(X, y) + + assert np.sum(reg.predict(X)) == pytest.approx(np.sum(y)) + + +@pytest.mark.parametrize("name", FOREST_REGRESSORS) +def test_regressor_attributes(name): + # Regression models should not have a classes_ attribute. + r = FOREST_REGRESSORS[name](random_state=0) + assert not hasattr(r, "classes_") + assert not hasattr(r, "n_classes_") + + r.fit([[1, 2, 3], [4, 5, 6]], [1, 2]) + assert not hasattr(r, "classes_") + assert not hasattr(r, "n_classes_") + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS) +def test_probability(name): + # Predict probabilities. 
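+    # The class probabilities must sum to one for every sample and must be
+    # consistent with the log-probabilities returned by predict_log_proba.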
+ ForestClassifier = FOREST_CLASSIFIERS[name] + with np.errstate(divide="ignore"): + clf = ForestClassifier( + n_estimators=10, random_state=1, max_features=1, max_depth=1 + ) + clf.fit(iris.data, iris.target) + assert_array_almost_equal( + np.sum(clf.predict_proba(iris.data), axis=1), np.ones(iris.data.shape[0]) + ) + assert_array_almost_equal( + clf.predict_proba(iris.data), np.exp(clf.predict_log_proba(iris.data)) + ) + + +@pytest.mark.parametrize("dtype", (np.float64, np.float32)) +@pytest.mark.parametrize( + "name, criterion", + itertools.chain( + product(FOREST_CLASSIFIERS, ["gini", "log_loss"]), + product(FOREST_REGRESSORS, ["squared_error", "friedman_mse", "absolute_error"]), + ), +) +def test_importances(dtype, name, criterion): + tolerance = 0.01 + if name in FOREST_REGRESSORS and criterion == "absolute_error": + tolerance = 0.05 + + # cast as dtype + X = X_large.astype(dtype, copy=False) + y = y_large.astype(dtype, copy=False) + + ForestEstimator = FOREST_ESTIMATORS[name] + + est = ForestEstimator(n_estimators=10, criterion=criterion, random_state=0) + est.fit(X, y) + importances = est.feature_importances_ + + # The forest estimator can detect that only the first 3 features of the + # dataset are informative: + n_important = np.sum(importances > 0.1) + assert importances.shape[0] == 10 + assert n_important == 3 + assert np.all(importances[:3] > 0.1) + + # Check with parallel + importances = est.feature_importances_ + est.set_params(n_jobs=2) + importances_parallel = est.feature_importances_ + assert_array_almost_equal(importances, importances_parallel) + + # Check with sample weights + sample_weight = check_random_state(0).randint(1, 10, len(X)) + est = ForestEstimator(n_estimators=10, random_state=0, criterion=criterion) + est.fit(X, y, sample_weight=sample_weight) + importances = est.feature_importances_ + assert np.all(importances >= 0.0) + + for scale in [0.5, 100]: + est = ForestEstimator(n_estimators=10, random_state=0, criterion=criterion) + est.fit(X, y, sample_weight=scale * sample_weight) + importances_bis = est.feature_importances_ + assert np.abs(importances - importances_bis).mean() < tolerance + + +def test_importances_asymptotic(): + # Check whether variable importances of totally randomized trees + # converge towards their theoretical values (See Louppe et al, + # Understanding variable importances in forests of randomized trees, 2013). 
+ + def binomial(k, n): + return 0 if k < 0 or k > n else comb(int(n), int(k), exact=True) + + def entropy(samples): + n_samples = len(samples) + entropy = 0.0 + + for count in np.bincount(samples): + p = 1.0 * count / n_samples + if p > 0: + entropy -= p * np.log2(p) + + return entropy + + def mdi_importance(X_m, X, y): + n_samples, n_features = X.shape + + features = list(range(n_features)) + features.pop(X_m) + values = [np.unique(X[:, i]) for i in range(n_features)] + + imp = 0.0 + + for k in range(n_features): + # Weight of each B of size k + coef = 1.0 / (binomial(k, n_features) * (n_features - k)) + + # For all B of size k + for B in combinations(features, k): + # For all values B=b + for b in product(*[values[B[j]] for j in range(k)]): + mask_b = np.ones(n_samples, dtype=bool) + + for j in range(k): + mask_b &= X[:, B[j]] == b[j] + + X_, y_ = X[mask_b, :], y[mask_b] + n_samples_b = len(X_) + + if n_samples_b > 0: + children = [] + + for xi in values[X_m]: + mask_xi = X_[:, X_m] == xi + children.append(y_[mask_xi]) + + imp += ( + coef + * (1.0 * n_samples_b / n_samples) # P(B=b) + * ( + entropy(y_) + - sum( + [ + entropy(c) * len(c) / n_samples_b + for c in children + ] + ) + ) + ) + + return imp + + data = np.array( + [ + [0, 0, 1, 0, 0, 1, 0, 1], + [1, 0, 1, 1, 1, 0, 1, 2], + [1, 0, 1, 1, 0, 1, 1, 3], + [0, 1, 1, 1, 0, 1, 0, 4], + [1, 1, 0, 1, 0, 1, 1, 5], + [1, 1, 0, 1, 1, 1, 1, 6], + [1, 0, 1, 0, 0, 1, 0, 7], + [1, 1, 1, 1, 1, 1, 1, 8], + [1, 1, 1, 1, 0, 1, 1, 9], + [1, 1, 1, 0, 1, 1, 1, 0], + ] + ) + + X, y = np.array(data[:, :7], dtype=bool), data[:, 7] + n_features = X.shape[1] + + # Compute true importances + true_importances = np.zeros(n_features) + + for i in range(n_features): + true_importances[i] = mdi_importance(i, X, y) + + # Estimate importances with totally randomized trees + clf = ExtraTreesClassifier( + n_estimators=500, max_features=1, criterion="log_loss", random_state=0 + ).fit(X, y) + + importances = ( + sum( + tree.tree_.compute_feature_importances(normalize=False) + for tree in clf.estimators_ + ) + / clf.n_estimators + ) + + # Check correctness + assert_almost_equal(entropy(y), sum(importances)) + assert np.abs(true_importances - importances).mean() < 0.01 + + +@pytest.mark.parametrize("name", FOREST_ESTIMATORS) +def test_unfitted_feature_importances(name): + err_msg = ( + "This {} instance is not fitted yet. 
Call 'fit' with " + "appropriate arguments before using this estimator.".format(name) + ) + with pytest.raises(NotFittedError, match=err_msg): + getattr(FOREST_ESTIMATORS[name](), "feature_importances_") + + +@pytest.mark.parametrize("ForestClassifier", FOREST_CLASSIFIERS.values()) +@pytest.mark.parametrize("X_type", ["array", "sparse_csr", "sparse_csc"]) +@pytest.mark.parametrize( + "X, y, lower_bound_accuracy", + [ + ( + *datasets.make_classification(n_samples=300, n_classes=2, random_state=0), + 0.9, + ), + ( + *datasets.make_classification( + n_samples=1000, n_classes=3, n_informative=6, random_state=0 + ), + 0.65, + ), + ( + iris.data, + iris.target * 2 + 1, + 0.65, + ), + ( + *datasets.make_multilabel_classification(n_samples=300, random_state=0), + 0.18, + ), + ], +) +@pytest.mark.parametrize("oob_score", [True, partial(f1_score, average="micro")]) +def test_forest_classifier_oob( + ForestClassifier, X, y, X_type, lower_bound_accuracy, oob_score +): + """Check that OOB score is close to score on a test set.""" + X = _convert_container(X, constructor_name=X_type) + X_train, X_test, y_train, y_test = train_test_split( + X, + y, + test_size=0.5, + random_state=0, + ) + classifier = ForestClassifier( + n_estimators=40, + bootstrap=True, + oob_score=oob_score, + random_state=0, + ) + + assert not hasattr(classifier, "oob_score_") + assert not hasattr(classifier, "oob_decision_function_") + + classifier.fit(X_train, y_train) + if callable(oob_score): + test_score = oob_score(y_test, classifier.predict(X_test)) + else: + test_score = classifier.score(X_test, y_test) + assert classifier.oob_score_ >= lower_bound_accuracy + + abs_diff = abs(test_score - classifier.oob_score_) + assert abs_diff <= 0.11, f"{abs_diff=} is greater than 0.11" + + assert hasattr(classifier, "oob_score_") + assert not hasattr(classifier, "oob_prediction_") + assert hasattr(classifier, "oob_decision_function_") + + if y.ndim == 1: + expected_shape = (X_train.shape[0], len(set(y))) + else: + expected_shape = (X_train.shape[0], len(set(y[:, 0])), y.shape[1]) + assert classifier.oob_decision_function_.shape == expected_shape + + +@pytest.mark.parametrize("ForestRegressor", FOREST_REGRESSORS.values()) +@pytest.mark.parametrize("X_type", ["array", "sparse_csr", "sparse_csc"]) +@pytest.mark.parametrize( + "X, y, lower_bound_r2", + [ + ( + *datasets.make_regression( + n_samples=500, n_features=10, n_targets=1, random_state=0 + ), + 0.7, + ), + ( + *datasets.make_regression( + n_samples=500, n_features=10, n_targets=2, random_state=0 + ), + 0.55, + ), + ], +) +@pytest.mark.parametrize("oob_score", [True, explained_variance_score]) +def test_forest_regressor_oob(ForestRegressor, X, y, X_type, lower_bound_r2, oob_score): + """Check that forest-based regressor provide an OOB score close to the + score on a test set.""" + X = _convert_container(X, constructor_name=X_type) + X_train, X_test, y_train, y_test = train_test_split( + X, + y, + test_size=0.5, + random_state=0, + ) + regressor = ForestRegressor( + n_estimators=50, + bootstrap=True, + oob_score=oob_score, + random_state=0, + ) + + assert not hasattr(regressor, "oob_score_") + assert not hasattr(regressor, "oob_prediction_") + + regressor.fit(X_train, y_train) + if callable(oob_score): + test_score = oob_score(y_test, regressor.predict(X_test)) + else: + test_score = regressor.score(X_test, y_test) + assert regressor.oob_score_ >= lower_bound_r2 + + assert abs(test_score - regressor.oob_score_) <= 0.1 + + assert hasattr(regressor, "oob_score_") + assert 
hasattr(regressor, "oob_prediction_")
+ assert not hasattr(regressor, "oob_decision_function_")
+
+ if y.ndim == 1:
+ expected_shape = (X_train.shape[0],)
+ else:
+ expected_shape = (X_train.shape[0], y.ndim)
+ assert regressor.oob_prediction_.shape == expected_shape
+
+
+@pytest.mark.parametrize("ForestEstimator", FOREST_CLASSIFIERS_REGRESSORS.values())
+def test_forest_oob_warning(ForestEstimator):
+ """Check that a warning is raised when there are not enough estimators and
+ the OOB estimates will be inaccurate."""
+ estimator = ForestEstimator(
+ n_estimators=1,
+ oob_score=True,
+ bootstrap=True,
+ random_state=0,
+ )
+ with pytest.warns(UserWarning, match="Some inputs do not have OOB scores"):
+ estimator.fit(iris.data, iris.target)
+
+
+@pytest.mark.parametrize("ForestEstimator", FOREST_CLASSIFIERS_REGRESSORS.values())
+def test_forest_oob_score_requires_bootstrap(ForestEstimator):
+ """Check that we raise an error if OOB score is requested without
+ activating bootstrapping.
+ """
+ X = iris.data
+ y = iris.target
+ err_msg = "Out of bag estimation only available if bootstrap=True"
+ estimator = ForestEstimator(oob_score=True, bootstrap=False)
+ with pytest.raises(ValueError, match=err_msg):
+ estimator.fit(X, y)
+
+
+@pytest.mark.parametrize("ForestClassifier", FOREST_CLASSIFIERS.values())
+def test_classifier_error_oob_score_multiclass_multioutput(ForestClassifier):
+ """Check that we raise an error when requesting an OOB score with a
+ multiclass-multioutput classification target.
+ """
+ rng = np.random.RandomState(42)
+ X = iris.data
+ y = rng.randint(low=0, high=5, size=(iris.data.shape[0], 2))
+ y_type = type_of_target(y)
+ assert y_type == "multiclass-multioutput"
+ estimator = ForestClassifier(oob_score=True, bootstrap=True)
+ err_msg = "The type of target cannot be used to compute OOB estimates"
+ with pytest.raises(ValueError, match=err_msg):
+ estimator.fit(X, y)
+
+
+@pytest.mark.parametrize("ForestRegressor", FOREST_REGRESSORS.values())
+def test_forest_multioutput_integral_regression_target(ForestRegressor):
+ """Check that multioutput regression with integral values is not interpreted
+ as a multiclass-multioutput target and OOB score can be computed.
+ """ + rng = np.random.RandomState(42) + X = iris.data + y = rng.randint(low=0, high=10, size=(iris.data.shape[0], 2)) + estimator = ForestRegressor( + n_estimators=30, oob_score=True, bootstrap=True, random_state=0 + ) + estimator.fit(X, y) + + n_samples_bootstrap = _get_n_samples_bootstrap(len(X), estimator.max_samples) + n_samples_test = X.shape[0] // 4 + oob_pred = np.zeros([n_samples_test, 2]) + for sample_idx, sample in enumerate(X[:n_samples_test]): + n_samples_oob = 0 + oob_pred_sample = np.zeros(2) + for tree in estimator.estimators_: + oob_unsampled_indices = _generate_unsampled_indices( + tree.random_state, len(X), n_samples_bootstrap + ) + if sample_idx in oob_unsampled_indices: + n_samples_oob += 1 + oob_pred_sample += tree.predict(sample.reshape(1, -1)).squeeze() + oob_pred[sample_idx] = oob_pred_sample / n_samples_oob + assert_allclose(oob_pred, estimator.oob_prediction_[:n_samples_test]) + + +@pytest.mark.parametrize("oob_score", [True, False]) +def test_random_trees_embedding_raise_error_oob(oob_score): + with pytest.raises(TypeError, match="got an unexpected keyword argument"): + RandomTreesEmbedding(oob_score=oob_score) + with pytest.raises(NotImplementedError, match="OOB score not supported"): + RandomTreesEmbedding()._set_oob_score_and_attributes(X, y) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS) +def test_gridsearch(name): + # Check that base trees can be grid-searched. + forest = FOREST_CLASSIFIERS[name]() + clf = GridSearchCV(forest, {"n_estimators": (1, 2), "max_depth": (1, 2)}) + clf.fit(iris.data, iris.target) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS) +def test_parallel(name): + """Check parallel computations in classification""" + if name in FOREST_CLASSIFIERS: + X = iris.data + y = iris.target + elif name in FOREST_REGRESSORS: + X = X_reg + y = y_reg + + ForestEstimator = FOREST_ESTIMATORS[name] + forest = ForestEstimator(n_estimators=10, n_jobs=3, random_state=0) + + forest.fit(X, y) + assert len(forest) == 10 + + forest.set_params(n_jobs=1) + y1 = forest.predict(X) + forest.set_params(n_jobs=2) + y2 = forest.predict(X) + assert_array_almost_equal(y1, y2, 3) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS) +def test_pickle(name): + # Check pickability. + if name in FOREST_CLASSIFIERS: + X = iris.data[::2] + y = iris.target[::2] + elif name in FOREST_REGRESSORS: + X = X_reg[::2] + y = y_reg[::2] + + ForestEstimator = FOREST_ESTIMATORS[name] + obj = ForestEstimator(random_state=0) + obj.fit(X, y) + score = obj.score(X, y) + pickle_object = pickle.dumps(obj) + + obj2 = pickle.loads(pickle_object) + assert type(obj2) == obj.__class__ + score2 = obj2.score(X, y) + assert score == score2 + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS) +def test_multioutput(name): + # Check estimators on multi-output problems. 
+ + X_train = [ + [-2, -1], + [-1, -1], + [-1, -2], + [1, 1], + [1, 2], + [2, 1], + [-2, 1], + [-1, 1], + [-1, 2], + [2, -1], + [1, -1], + [1, -2], + ] + y_train = [ + [-1, 0], + [-1, 0], + [-1, 0], + [1, 1], + [1, 1], + [1, 1], + [-1, 2], + [-1, 2], + [-1, 2], + [1, 3], + [1, 3], + [1, 3], + ] + X_test = [[-1, -1], [1, 1], [-1, 1], [1, -1]] + y_test = [[-1, 0], [1, 1], [-1, 2], [1, 3]] + + est = FOREST_ESTIMATORS[name](random_state=0, bootstrap=False) + y_pred = est.fit(X_train, y_train).predict(X_test) + assert_array_almost_equal(y_pred, y_test) + + if name in FOREST_CLASSIFIERS: + with np.errstate(divide="ignore"): + proba = est.predict_proba(X_test) + assert len(proba) == 2 + assert proba[0].shape == (4, 2) + assert proba[1].shape == (4, 4) + + log_proba = est.predict_log_proba(X_test) + assert len(log_proba) == 2 + assert log_proba[0].shape == (4, 2) + assert log_proba[1].shape == (4, 4) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS) +def test_multioutput_string(name): + # Check estimators on multi-output problems with string outputs. + + X_train = [ + [-2, -1], + [-1, -1], + [-1, -2], + [1, 1], + [1, 2], + [2, 1], + [-2, 1], + [-1, 1], + [-1, 2], + [2, -1], + [1, -1], + [1, -2], + ] + y_train = [ + ["red", "blue"], + ["red", "blue"], + ["red", "blue"], + ["green", "green"], + ["green", "green"], + ["green", "green"], + ["red", "purple"], + ["red", "purple"], + ["red", "purple"], + ["green", "yellow"], + ["green", "yellow"], + ["green", "yellow"], + ] + X_test = [[-1, -1], [1, 1], [-1, 1], [1, -1]] + y_test = [ + ["red", "blue"], + ["green", "green"], + ["red", "purple"], + ["green", "yellow"], + ] + + est = FOREST_ESTIMATORS[name](random_state=0, bootstrap=False) + y_pred = est.fit(X_train, y_train).predict(X_test) + assert_array_equal(y_pred, y_test) + + with np.errstate(divide="ignore"): + proba = est.predict_proba(X_test) + assert len(proba) == 2 + assert proba[0].shape == (4, 2) + assert proba[1].shape == (4, 4) + + log_proba = est.predict_log_proba(X_test) + assert len(log_proba) == 2 + assert log_proba[0].shape == (4, 2) + assert log_proba[1].shape == (4, 4) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS) +def test_classes_shape(name): + # Test that n_classes_ and classes_ have proper shape. + ForestClassifier = FOREST_CLASSIFIERS[name] + + # Classification, single output + clf = ForestClassifier(random_state=0).fit(X, y) + + assert clf.n_classes_ == 2 + assert_array_equal(clf.classes_, [-1, 1]) + + # Classification, multi-output + _y = np.vstack((y, np.array(y) * 2)).T + clf = ForestClassifier(random_state=0).fit(X, _y) + + assert_array_equal(clf.n_classes_, [2, 2]) + assert_array_equal(clf.classes_, [[-1, 1], [-2, 2]]) + + +def test_random_trees_dense_type(): + # Test that the `sparse_output` parameter of RandomTreesEmbedding + # works by returning a dense array. + + # Create the RTE with sparse=False + hasher = RandomTreesEmbedding(n_estimators=10, sparse_output=False) + X, y = datasets.make_circles(factor=0.5) + X_transformed = hasher.fit_transform(X) + + # Assert that type is ndarray, not scipy.sparse.csr_matrix + assert isinstance(X_transformed, np.ndarray) + + +def test_random_trees_dense_equal(): + # Test that the `sparse_output` parameter of RandomTreesEmbedding + # works by returning the same array for both argument values. 
+ + # Create the RTEs + hasher_dense = RandomTreesEmbedding( + n_estimators=10, sparse_output=False, random_state=0 + ) + hasher_sparse = RandomTreesEmbedding( + n_estimators=10, sparse_output=True, random_state=0 + ) + X, y = datasets.make_circles(factor=0.5) + X_transformed_dense = hasher_dense.fit_transform(X) + X_transformed_sparse = hasher_sparse.fit_transform(X) + + # Assert that dense and sparse hashers have same array. + assert_array_equal(X_transformed_sparse.toarray(), X_transformed_dense) + + +def test_random_hasher(): + # test random forest hashing on circles dataset + # make sure that it is linearly separable. + # even after projected to two SVD dimensions + # Note: Not all random_states produce perfect results. + hasher = RandomTreesEmbedding(n_estimators=30, random_state=1) + X, y = datasets.make_circles(factor=0.5) + X_transformed = hasher.fit_transform(X) + + # test fit and transform: + hasher = RandomTreesEmbedding(n_estimators=30, random_state=1) + assert_array_equal(hasher.fit(X).transform(X).toarray(), X_transformed.toarray()) + + # one leaf active per data point per forest + assert X_transformed.shape[0] == X.shape[0] + assert_array_equal(X_transformed.sum(axis=1), hasher.n_estimators) + svd = TruncatedSVD(n_components=2) + X_reduced = svd.fit_transform(X_transformed) + linear_clf = LinearSVC() + linear_clf.fit(X_reduced, y) + assert linear_clf.score(X_reduced, y) == 1.0 + + +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_random_hasher_sparse_data(csc_container): + X, y = datasets.make_multilabel_classification(random_state=0) + hasher = RandomTreesEmbedding(n_estimators=30, random_state=1) + X_transformed = hasher.fit_transform(X) + X_transformed_sparse = hasher.fit_transform(csc_container(X)) + assert_array_equal(X_transformed_sparse.toarray(), X_transformed.toarray()) + + +def test_parallel_train(): + rng = check_random_state(12321) + n_samples, n_features = 80, 30 + X_train = rng.randn(n_samples, n_features) + y_train = rng.randint(0, 2, n_samples) + + clfs = [ + RandomForestClassifier(n_estimators=20, n_jobs=n_jobs, random_state=12345).fit( + X_train, y_train + ) + for n_jobs in [1, 2, 3, 8, 16, 32] + ] + + X_test = rng.randn(n_samples, n_features) + probas = [clf.predict_proba(X_test) for clf in clfs] + for proba1, proba2 in itertools.pairwise(probas): + assert_array_almost_equal(proba1, proba2) + + +def test_distribution(): + rng = check_random_state(12321) + + # Single variable with 4 values + X = rng.randint(0, 4, size=(1000, 1)) + y = rng.rand(1000) + n_trees = 500 + + reg = ExtraTreesRegressor(n_estimators=n_trees, random_state=42).fit(X, y) + + uniques = defaultdict(int) + for tree in reg.estimators_: + tree = "".join( + ("%d,%d/" % (f, int(t)) if f >= 0 else "-") + for f, t in zip(tree.tree_.feature, tree.tree_.threshold) + ) + + uniques[tree] += 1 + + uniques = sorted([(1.0 * count / n_trees, tree) for tree, count in uniques.items()]) + + # On a single variable problem where X_0 has 4 equiprobable values, there + # are 5 ways to build a random tree. The more compact (0,1/0,0/--0,2/--) of + # them has probability 1/3 while the 4 others have probability 1/6. + + assert len(uniques) == 5 + assert 0.20 > uniques[0][0] # Rough approximation of 1/6. 
+ assert 0.20 > uniques[1][0] + assert 0.20 > uniques[2][0] + assert 0.20 > uniques[3][0] + assert uniques[4][0] > 0.3 + assert uniques[4][1] == "0,1/0,0/--0,2/--" + + # Two variables, one with 2 values, one with 3 values + X = np.empty((1000, 2)) + X[:, 0] = np.random.randint(0, 2, 1000) + X[:, 1] = np.random.randint(0, 3, 1000) + y = rng.rand(1000) + + reg = ExtraTreesRegressor(max_features=1, random_state=1).fit(X, y) + + uniques = defaultdict(int) + for tree in reg.estimators_: + tree = "".join( + ("%d,%d/" % (f, int(t)) if f >= 0 else "-") + for f, t in zip(tree.tree_.feature, tree.tree_.threshold) + ) + + uniques[tree] += 1 + + uniques = [(count, tree) for tree, count in uniques.items()] + assert len(uniques) == 8 + + +@pytest.mark.parametrize("name", FOREST_ESTIMATORS) +def test_max_leaf_nodes_max_depth(name): + X, y = hastie_X, hastie_y + + # Test precedence of max_leaf_nodes over max_depth. + ForestEstimator = FOREST_ESTIMATORS[name] + est = ForestEstimator( + max_depth=1, max_leaf_nodes=4, n_estimators=1, random_state=0 + ).fit(X, y) + assert est.estimators_[0].get_depth() == 1 + + est = ForestEstimator(max_depth=1, n_estimators=1, random_state=0).fit(X, y) + assert est.estimators_[0].get_depth() == 1 + + +@pytest.mark.parametrize("name", FOREST_ESTIMATORS) +def test_min_samples_split(name): + X, y = hastie_X, hastie_y + ForestEstimator = FOREST_ESTIMATORS[name] + + est = ForestEstimator(min_samples_split=10, n_estimators=1, random_state=0) + est.fit(X, y) + node_idx = est.estimators_[0].tree_.children_left != -1 + node_samples = est.estimators_[0].tree_.n_node_samples[node_idx] + + assert np.min(node_samples) > len(X) * 0.5 - 1, "Failed with {0}".format(name) + + est = ForestEstimator(min_samples_split=0.5, n_estimators=1, random_state=0) + est.fit(X, y) + node_idx = est.estimators_[0].tree_.children_left != -1 + node_samples = est.estimators_[0].tree_.n_node_samples[node_idx] + + assert np.min(node_samples) > len(X) * 0.5 - 1, "Failed with {0}".format(name) + + +@pytest.mark.parametrize("name", FOREST_ESTIMATORS) +def test_min_samples_leaf(name): + X, y = hastie_X, hastie_y + + # Test if leaves contain more than leaf_count training examples + ForestEstimator = FOREST_ESTIMATORS[name] + + est = ForestEstimator(min_samples_leaf=5, n_estimators=1, random_state=0) + est.fit(X, y) + out = est.estimators_[0].tree_.apply(X) + node_counts = np.bincount(out) + # drop inner nodes + leaf_count = node_counts[node_counts != 0] + assert np.min(leaf_count) > 4, "Failed with {0}".format(name) + + est = ForestEstimator(min_samples_leaf=0.25, n_estimators=1, random_state=0) + est.fit(X, y) + out = est.estimators_[0].tree_.apply(X) + node_counts = np.bincount(out) + # drop inner nodes + leaf_count = node_counts[node_counts != 0] + assert np.min(leaf_count) > len(X) * 0.25 - 1, "Failed with {0}".format(name) + + +@pytest.mark.parametrize("name", FOREST_ESTIMATORS) +def test_min_weight_fraction_leaf(name): + X, y = hastie_X, hastie_y + + # Test if leaves contain at least min_weight_fraction_leaf of the + # training set + ForestEstimator = FOREST_ESTIMATORS[name] + rng = np.random.RandomState(0) + weights = rng.rand(X.shape[0]) + total_weight = np.sum(weights) + + # test both DepthFirstTreeBuilder and BestFirstTreeBuilder + # by setting max_leaf_nodes + for frac in np.linspace(0, 0.5, 6): + est = ForestEstimator( + min_weight_fraction_leaf=frac, n_estimators=1, random_state=0 + ) + if "RandomForest" in name: + est.bootstrap = False + + est.fit(X, y, sample_weight=weights) + out = 
est.estimators_[0].tree_.apply(X) + node_weights = np.bincount(out, weights=weights) + # drop inner nodes + leaf_weights = node_weights[node_weights != 0] + assert np.min(leaf_weights) >= total_weight * est.min_weight_fraction_leaf, ( + "Failed with {0} min_weight_fraction_leaf={1}".format( + name, est.min_weight_fraction_leaf + ) + ) + + +@pytest.mark.parametrize("name", FOREST_ESTIMATORS) +@pytest.mark.parametrize( + "sparse_container", COO_CONTAINERS + CSC_CONTAINERS + CSR_CONTAINERS +) +def test_sparse_input(name, sparse_container): + X, y = datasets.make_multilabel_classification(random_state=0, n_samples=50) + + ForestEstimator = FOREST_ESTIMATORS[name] + + dense = ForestEstimator(random_state=0, max_depth=2).fit(X, y) + sparse = ForestEstimator(random_state=0, max_depth=2).fit(sparse_container(X), y) + + assert_array_almost_equal(sparse.apply(X), dense.apply(X)) + + if name in FOREST_CLASSIFIERS or name in FOREST_REGRESSORS: + assert_array_almost_equal(sparse.predict(X), dense.predict(X)) + assert_array_almost_equal( + sparse.feature_importances_, dense.feature_importances_ + ) + + if name in FOREST_CLASSIFIERS: + assert_array_almost_equal(sparse.predict_proba(X), dense.predict_proba(X)) + assert_array_almost_equal( + sparse.predict_log_proba(X), dense.predict_log_proba(X) + ) + + if name in FOREST_TRANSFORMERS: + assert_array_almost_equal( + sparse.transform(X).toarray(), dense.transform(X).toarray() + ) + assert_array_almost_equal( + sparse.fit_transform(X).toarray(), dense.fit_transform(X).toarray() + ) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS) +@pytest.mark.parametrize("dtype", (np.float64, np.float32)) +def test_memory_layout(name, dtype): + # Test that it works no matter the memory layout + est = FOREST_ESTIMATORS[name](random_state=0, bootstrap=False) + + # Dense + for container, kwargs in ( + (np.asarray, {}), # Nothing + (np.asarray, {"order": "C"}), # C-order + (np.asarray, {"order": "F"}), # F-order + (np.ascontiguousarray, {}), # Contiguous + ): + X = container(iris.data, dtype=dtype, **kwargs) + y = iris.target + assert_array_almost_equal(est.fit(X, y).predict(X), y) + + # Sparse (if applicable) + if est.estimator.splitter in SPARSE_SPLITTERS: + for sparse_container in COO_CONTAINERS + CSC_CONTAINERS + CSR_CONTAINERS: + X = sparse_container(iris.data, dtype=dtype) + y = iris.target + assert_array_almost_equal(est.fit(X, y).predict(X), y) + + # Strided + X = np.asarray(iris.data[::3], dtype=dtype) + y = iris.target[::3] + assert_array_almost_equal(est.fit(X, y).predict(X), y) + + +@pytest.mark.parametrize("name", FOREST_ESTIMATORS) +def test_1d_input(name): + X = iris.data[:, 0] + X_2d = iris.data[:, 0].reshape((-1, 1)) + y = iris.target + + with ignore_warnings(): + ForestEstimator = FOREST_ESTIMATORS[name] + with pytest.raises(ValueError): + ForestEstimator(n_estimators=1, random_state=0).fit(X, y) + + est = ForestEstimator(random_state=0) + est.fit(X_2d, y) + + if name in FOREST_CLASSIFIERS or name in FOREST_REGRESSORS: + with pytest.raises(ValueError): + est.predict(X) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS) +def test_class_weights(name): + # Check class_weights resemble sample_weights behavior. 
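+ # With class_weight="balanced", the class weights are
+ # n_samples / (n_classes * np.bincount(y)), so on a target with equally
+ # frequent classes (like iris) they reduce to uniform weights and should
+ # not change the fitted forest.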
+ ForestClassifier = FOREST_CLASSIFIERS[name] + + # Iris is balanced, so no effect expected for using 'balanced' weights + clf1 = ForestClassifier(random_state=0) + clf1.fit(iris.data, iris.target) + clf2 = ForestClassifier(class_weight="balanced", random_state=0) + clf2.fit(iris.data, iris.target) + assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_) + + # Make a multi-output problem with three copies of Iris + iris_multi = np.vstack((iris.target, iris.target, iris.target)).T + # Create user-defined weights that should balance over the outputs + clf3 = ForestClassifier( + class_weight=[ + {0: 2.0, 1: 2.0, 2: 1.0}, + {0: 2.0, 1: 1.0, 2: 2.0}, + {0: 1.0, 1: 2.0, 2: 2.0}, + ], + random_state=0, + ) + clf3.fit(iris.data, iris_multi) + assert_almost_equal(clf2.feature_importances_, clf3.feature_importances_) + # Check against multi-output "balanced" which should also have no effect + clf4 = ForestClassifier(class_weight="balanced", random_state=0) + clf4.fit(iris.data, iris_multi) + assert_almost_equal(clf3.feature_importances_, clf4.feature_importances_) + + # Inflate importance of class 1, check against user-defined weights + sample_weight = np.ones(iris.target.shape) + sample_weight[iris.target == 1] *= 100 + class_weight = {0: 1.0, 1: 100.0, 2: 1.0} + clf1 = ForestClassifier(random_state=0) + clf1.fit(iris.data, iris.target, sample_weight) + clf2 = ForestClassifier(class_weight=class_weight, random_state=0) + clf2.fit(iris.data, iris.target) + assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_) + + # Check that sample_weight and class_weight are multiplicative + clf1 = ForestClassifier(random_state=0) + clf1.fit(iris.data, iris.target, sample_weight**2) + clf2 = ForestClassifier(class_weight=class_weight, random_state=0) + clf2.fit(iris.data, iris.target, sample_weight) + assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS) +def test_class_weight_balanced_and_bootstrap_multi_output(name): + # Test class_weight works for multi-output""" + ForestClassifier = FOREST_CLASSIFIERS[name] + _y = np.vstack((y, np.array(y) * 2)).T + clf = ForestClassifier(class_weight="balanced", random_state=0) + clf.fit(X, _y) + clf = ForestClassifier( + class_weight=[{-1: 0.5, 1: 1.0}, {-2: 1.0, 2: 1.0}], random_state=0 + ) + clf.fit(X, _y) + # smoke test for balanced subsample + clf = ForestClassifier(class_weight="balanced_subsample", random_state=0) + clf.fit(X, _y) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS) +def test_class_weight_errors(name): + # Test if class_weight raises errors and warnings when expected. + ForestClassifier = FOREST_CLASSIFIERS[name] + _y = np.vstack((y, np.array(y) * 2)).T + + # Warning warm_start with preset + clf = ForestClassifier(class_weight="balanced", warm_start=True, random_state=0) + clf.fit(X, y) + + warn_msg = ( + "Warm-start fitting without increasing n_estimators does not fit new trees." + ) + with pytest.warns(UserWarning, match=warn_msg): + clf.fit(X, _y) + + # Incorrect length list for multi-output + clf = ForestClassifier(class_weight=[{-1: 0.5, 1: 1.0}], random_state=0) + with pytest.raises(ValueError): + clf.fit(X, _y) + + +@pytest.mark.parametrize("name", FOREST_ESTIMATORS) +def test_warm_start(name): + # Test if fitting incrementally with warm start gives a forest of the + # right size and the same results as a normal fit. 
+ X, y = hastie_X, hastie_y + ForestEstimator = FOREST_ESTIMATORS[name] + est_ws = None + for n_estimators in [5, 10]: + if est_ws is None: + est_ws = ForestEstimator( + n_estimators=n_estimators, random_state=42, warm_start=True + ) + else: + est_ws.set_params(n_estimators=n_estimators) + est_ws.fit(X, y) + assert len(est_ws) == n_estimators + + est_no_ws = ForestEstimator(n_estimators=10, random_state=42, warm_start=False) + est_no_ws.fit(X, y) + + assert set([tree.random_state for tree in est_ws]) == set( + [tree.random_state for tree in est_no_ws] + ) + + assert_array_equal( + est_ws.apply(X), est_no_ws.apply(X), err_msg="Failed with {0}".format(name) + ) + + +@pytest.mark.parametrize("name", FOREST_ESTIMATORS) +def test_warm_start_clear(name): + # Test if fit clears state and grows a new forest when warm_start==False. + X, y = hastie_X, hastie_y + ForestEstimator = FOREST_ESTIMATORS[name] + est = ForestEstimator(n_estimators=5, max_depth=1, warm_start=False, random_state=1) + est.fit(X, y) + + est_2 = ForestEstimator( + n_estimators=5, max_depth=1, warm_start=True, random_state=2 + ) + est_2.fit(X, y) # inits state + est_2.set_params(warm_start=False, random_state=1) + est_2.fit(X, y) # clears old state and equals est + + assert_array_almost_equal(est_2.apply(X), est.apply(X)) + + +@pytest.mark.parametrize("name", FOREST_ESTIMATORS) +def test_warm_start_smaller_n_estimators(name): + # Test if warm start second fit with smaller n_estimators raises error. + X, y = hastie_X, hastie_y + ForestEstimator = FOREST_ESTIMATORS[name] + est = ForestEstimator(n_estimators=5, max_depth=1, warm_start=True) + est.fit(X, y) + est.set_params(n_estimators=4) + with pytest.raises(ValueError): + est.fit(X, y) + + +@pytest.mark.parametrize("name", FOREST_ESTIMATORS) +def test_warm_start_equal_n_estimators(name): + # Test if warm start with equal n_estimators does nothing and returns the + # same forest and raises a warning. + X, y = hastie_X, hastie_y + ForestEstimator = FOREST_ESTIMATORS[name] + est = ForestEstimator(n_estimators=5, max_depth=3, warm_start=True, random_state=1) + est.fit(X, y) + + est_2 = ForestEstimator( + n_estimators=5, max_depth=3, warm_start=True, random_state=1 + ) + est_2.fit(X, y) + # Now est_2 equals est. + + est_2.set_params(random_state=2) + warn_msg = ( + "Warm-start fitting without increasing n_estimators does not fit new trees." + ) + with pytest.warns(UserWarning, match=warn_msg): + est_2.fit(X, y) + # If we had fit the trees again we would have got a different forest as we + # changed the random state. + assert_array_equal(est.apply(X), est_2.apply(X)) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS) +def test_warm_start_oob(name): + # Test that the warm start computes oob score when asked. + X, y = hastie_X, hastie_y + ForestEstimator = FOREST_ESTIMATORS[name] + # Use 15 estimators to avoid 'some inputs do not have OOB scores' warning. + est = ForestEstimator( + n_estimators=15, + max_depth=3, + warm_start=False, + random_state=1, + bootstrap=True, + oob_score=True, + ) + est.fit(X, y) + + est_2 = ForestEstimator( + n_estimators=5, + max_depth=3, + warm_start=False, + random_state=1, + bootstrap=True, + oob_score=False, + ) + est_2.fit(X, y) + + est_2.set_params(warm_start=True, oob_score=True, n_estimators=15) + est_2.fit(X, y) + + assert hasattr(est_2, "oob_score_") + assert est.oob_score_ == est_2.oob_score_ + + # Test that oob_score is computed even if we don't need to train + # additional trees. 
+ est_3 = ForestEstimator( + n_estimators=15, + max_depth=3, + warm_start=True, + random_state=1, + bootstrap=True, + oob_score=False, + ) + est_3.fit(X, y) + assert not hasattr(est_3, "oob_score_") + + est_3.set_params(oob_score=True) + ignore_warnings(est_3.fit)(X, y) + + assert est.oob_score_ == est_3.oob_score_ + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS) +def test_oob_not_computed_twice(name): + # Check that oob_score is not computed twice when warm_start=True. + X, y = hastie_X, hastie_y + ForestEstimator = FOREST_ESTIMATORS[name] + + est = ForestEstimator( + n_estimators=10, warm_start=True, bootstrap=True, oob_score=True + ) + + with patch.object( + est, "_set_oob_score_and_attributes", wraps=est._set_oob_score_and_attributes + ) as mock_set_oob_score_and_attributes: + est.fit(X, y) + + with pytest.warns(UserWarning, match="Warm-start fitting without increasing"): + est.fit(X, y) + + mock_set_oob_score_and_attributes.assert_called_once() + + +def test_dtype_convert(n_classes=15): + classifier = RandomForestClassifier(random_state=0, bootstrap=False) + + X = np.eye(n_classes) + y = [ch for ch in "ABCDEFGHIJKLMNOPQRSTU"[:n_classes]] + + result = classifier.fit(X, y).predict(X) + assert_array_equal(classifier.classes_, y) + assert_array_equal(result, y) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS) +def test_decision_path(name): + X, y = hastie_X, hastie_y + n_samples = X.shape[0] + ForestEstimator = FOREST_ESTIMATORS[name] + est = ForestEstimator(n_estimators=5, max_depth=1, warm_start=False, random_state=1) + est.fit(X, y) + indicator, n_nodes_ptr = est.decision_path(X) + + assert indicator.shape[1] == n_nodes_ptr[-1] + assert indicator.shape[0] == n_samples + assert_array_equal( + np.diff(n_nodes_ptr), [e.tree_.node_count for e in est.estimators_] + ) + + # Assert that leaves index are correct + leaves = est.apply(X) + for est_id in range(leaves.shape[1]): + leave_indicator = [ + indicator[i, n_nodes_ptr[est_id] + j] + for i, j in enumerate(leaves[:, est_id]) + ] + assert_array_almost_equal(leave_indicator, np.ones(shape=n_samples)) + + +def test_min_impurity_decrease(): + X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) + all_estimators = [ + RandomForestClassifier, + RandomForestRegressor, + ExtraTreesClassifier, + ExtraTreesRegressor, + ] + + for Estimator in all_estimators: + est = Estimator(min_impurity_decrease=0.1) + est.fit(X, y) + for tree in est.estimators_: + # Simply check if the parameter is passed on correctly. Tree tests + # will suffice for the actual working of this param + assert tree.min_impurity_decrease == 0.1 + + +def test_poisson_y_positive_check(): + est = RandomForestRegressor(criterion="poisson") + X = np.zeros((3, 3)) + + y = [-1, 1, 3] + err_msg = ( + r"Some value\(s\) of y are negative which is " + r"not allowed for Poisson regression." + ) + with pytest.raises(ValueError, match=err_msg): + est.fit(X, y) + + y = [0, 0, 0] + err_msg = ( + r"Sum of y is not strictly positive which " + r"is necessary for Poisson regression." 
+ ) + with pytest.raises(ValueError, match=err_msg): + est.fit(X, y) + + +# mypy error: Variable "DEFAULT_JOBLIB_BACKEND" is not valid type +class MyBackend(DEFAULT_JOBLIB_BACKEND): # type: ignore[valid-type,misc] + def __init__(self, *args, **kwargs): + self.count = 0 + super().__init__(*args, **kwargs) + + def start_call(self): + self.count += 1 + return super().start_call() + + +joblib.register_parallel_backend("testing", MyBackend) + + +@skip_if_no_parallel +def test_backend_respected(): + clf = RandomForestClassifier(n_estimators=10, n_jobs=2) + + with joblib.parallel_backend("testing") as (ba, n_jobs): + clf.fit(X, y) + + assert ba.count > 0 + + # predict_proba requires shared memory. Ensure that's honored. + with joblib.parallel_backend("testing") as (ba, _): + clf.predict_proba(X) + + assert ba.count == 0 + + +def test_forest_feature_importances_sum(): + X, y = make_classification( + n_samples=15, n_informative=3, random_state=1, n_classes=3 + ) + clf = RandomForestClassifier( + min_samples_leaf=5, random_state=42, n_estimators=200 + ).fit(X, y) + assert math.isclose(1, clf.feature_importances_.sum(), abs_tol=1e-7) + + +def test_forest_degenerate_feature_importances(): + # build a forest of single node trees. See #13636 + X = np.zeros((10, 10)) + y = np.ones((10,)) + gbr = RandomForestRegressor(n_estimators=10).fit(X, y) + assert_array_equal(gbr.feature_importances_, np.zeros(10, dtype=np.float64)) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS) +def test_max_samples_bootstrap(name): + # Check invalid `max_samples` values + est = FOREST_CLASSIFIERS_REGRESSORS[name](bootstrap=False, max_samples=0.5) + err_msg = ( + r"`max_sample` cannot be set if `bootstrap=False`. " + r"Either switch to `bootstrap=True` or set " + r"`max_sample=None`." 
+ ) + with pytest.raises(ValueError, match=err_msg): + est.fit(X, y) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS) +def test_large_max_samples_exception(name): + # Check invalid `max_samples` + est = FOREST_CLASSIFIERS_REGRESSORS[name](bootstrap=True, max_samples=int(1e9)) + match = "`max_samples` must be <= n_samples=6 but got value 1000000000" + with pytest.raises(ValueError, match=match): + est.fit(X, y) + + +@pytest.mark.parametrize("name", FOREST_REGRESSORS) +def test_max_samples_boundary_regressors(name): + X_train, X_test, y_train, y_test = train_test_split( + X_reg, y_reg, train_size=0.7, test_size=0.3, random_state=0 + ) + + ms_1_model = FOREST_REGRESSORS[name]( + bootstrap=True, max_samples=1.0, random_state=0 + ) + ms_1_predict = ms_1_model.fit(X_train, y_train).predict(X_test) + + ms_None_model = FOREST_REGRESSORS[name]( + bootstrap=True, max_samples=None, random_state=0 + ) + ms_None_predict = ms_None_model.fit(X_train, y_train).predict(X_test) + + ms_1_ms = mean_squared_error(ms_1_predict, y_test) + ms_None_ms = mean_squared_error(ms_None_predict, y_test) + + assert ms_1_ms == pytest.approx(ms_None_ms) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS) +def test_max_samples_boundary_classifiers(name): + X_train, X_test, y_train, _ = train_test_split( + X_large, y_large, random_state=0, stratify=y_large + ) + + ms_1_model = FOREST_CLASSIFIERS[name]( + bootstrap=True, max_samples=1.0, random_state=0 + ) + ms_1_proba = ms_1_model.fit(X_train, y_train).predict_proba(X_test) + + ms_None_model = FOREST_CLASSIFIERS[name]( + bootstrap=True, max_samples=None, random_state=0 + ) + ms_None_proba = ms_None_model.fit(X_train, y_train).predict_proba(X_test) + + np.testing.assert_allclose(ms_1_proba, ms_None_proba) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_forest_y_sparse(csr_container): + X = [[1, 2, 3]] + y = csr_container([[4, 5, 6]]) + est = RandomForestClassifier() + msg = "sparse multilabel-indicator for y is not supported." + with pytest.raises(ValueError, match=msg): + est.fit(X, y) + + +@pytest.mark.parametrize("ForestClass", [RandomForestClassifier, RandomForestRegressor]) +def test_little_tree_with_small_max_samples(ForestClass): + rng = np.random.RandomState(1) + + X = rng.randn(10000, 2) + y = rng.randn(10000) > 0 + + # First fit with no restriction on max samples + est1 = ForestClass( + n_estimators=1, + random_state=rng, + max_samples=None, + ) + + # Second fit with max samples restricted to just 2 + est2 = ForestClass( + n_estimators=1, + random_state=rng, + max_samples=2, + ) + + est1.fit(X, y) + est2.fit(X, y) + + tree1 = est1.estimators_[0].tree_ + tree2 = est2.estimators_[0].tree_ + + msg = "Tree without `max_samples` restriction should have more nodes" + assert tree1.node_count > tree2.node_count, msg + + +@pytest.mark.parametrize("Forest", FOREST_REGRESSORS) +def test_mse_criterion_object_segfault_smoke_test(Forest): + # This is a smoke test to ensure that passing a mutable criterion + # does not cause a segfault when fitting with concurrent threads. 
+ # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/12623 + from sklearn.tree._criterion import MSE + + y = y_reg.reshape(-1, 1) + n_samples, n_outputs = y.shape + mse_criterion = MSE(n_outputs, n_samples) + est = FOREST_REGRESSORS[Forest](n_estimators=2, n_jobs=2, criterion=mse_criterion) + + est.fit(X_reg, y) + + +def test_random_trees_embedding_feature_names_out(): + """Check feature names out for Random Trees Embedding.""" + random_state = np.random.RandomState(0) + X = np.abs(random_state.randn(100, 4)) + hasher = RandomTreesEmbedding( + n_estimators=2, max_depth=2, sparse_output=False, random_state=0 + ).fit(X) + names = hasher.get_feature_names_out() + expected_names = [ + f"randomtreesembedding_{tree}_{leaf}" + # Note: nodes with indices 0, 1 and 4 are internal split nodes and + # therefore do not appear in the expected output feature names. + for tree, leaf in [ + (0, 2), + (0, 3), + (0, 5), + (0, 6), + (1, 2), + (1, 3), + (1, 5), + (1, 6), + ] + ] + assert_array_equal(expected_names, names) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_read_only_buffer(csr_container, monkeypatch): + """RandomForestClassifier must work on readonly sparse data. + + Non-regression test for: https://github.com/scikit-learn/scikit-learn/issues/25333 + """ + monkeypatch.setattr( + sklearn.ensemble._forest, + "Parallel", + partial(Parallel, max_nbytes=100), + ) + rng = np.random.RandomState(seed=0) + + X, y = make_classification(n_samples=100, n_features=200, random_state=rng) + X = csr_container(X, copy=True) + + clf = RandomForestClassifier(n_jobs=2, random_state=rng) + cross_val_score(clf, X, y, cv=2) + + +@pytest.mark.parametrize("class_weight", ["balanced_subsample", None]) +def test_round_samples_to_one_when_samples_too_low(class_weight): + """Check low max_samples works and is rounded to one. + + Non-regression test for gh-24037. + """ + X, y = datasets.load_wine(return_X_y=True) + forest = RandomForestClassifier( + n_estimators=10, max_samples=1e-4, class_weight=class_weight, random_state=0 + ) + forest.fit(X, y) + + +@pytest.mark.parametrize("seed", [None, 1]) +@pytest.mark.parametrize("bootstrap", [True, False]) +@pytest.mark.parametrize("ForestClass", FOREST_CLASSIFIERS_REGRESSORS.values()) +def test_estimators_samples(ForestClass, bootstrap, seed): + """Estimators_samples_ property should be consistent. + + Tests consistency across fits and whether or not the seed for the random generator + is set. 
+ """ + X, y = make_hastie_10_2(n_samples=200, random_state=1) + + if bootstrap: + max_samples = 0.5 + else: + max_samples = None + est = ForestClass( + n_estimators=10, + max_samples=max_samples, + max_features=0.5, + random_state=seed, + bootstrap=bootstrap, + ) + est.fit(X, y) + + estimators_samples = est.estimators_samples_.copy() + + # Test repeated calls result in same set of indices + assert_array_equal(estimators_samples, est.estimators_samples_) + estimators = est.estimators_ + + assert isinstance(estimators_samples, list) + assert len(estimators_samples) == len(estimators) + assert estimators_samples[0].dtype == np.int32 + + for i in range(len(estimators)): + if bootstrap: + assert len(estimators_samples[i]) == len(X) // 2 + + # the bootstrap should be a resampling with replacement + assert len(np.unique(estimators_samples[i])) < len(estimators_samples[i]) + else: + assert len(set(estimators_samples[i])) == len(X) + + estimator_index = 0 + estimator_samples = estimators_samples[estimator_index] + estimator = estimators[estimator_index] + + X_train = X[estimator_samples] + y_train = y[estimator_samples] + + orig_tree_values = estimator.tree_.value + estimator = clone(estimator) + estimator.fit(X_train, y_train) + new_tree_values = estimator.tree_.value + assert_allclose(orig_tree_values, new_tree_values) + + +@pytest.mark.parametrize( + "make_data, Forest", + [ + (datasets.make_regression, RandomForestRegressor), + (datasets.make_classification, RandomForestClassifier), + (datasets.make_regression, ExtraTreesRegressor), + (datasets.make_classification, ExtraTreesClassifier), + ], +) +def test_missing_values_is_resilient(make_data, Forest): + """Check that forest can deal with missing values and has decent performance.""" + + rng = np.random.RandomState(0) + n_samples, n_features = 1000, 10 + X, y = make_data(n_samples=n_samples, n_features=n_features, random_state=rng) + + # Create dataset with missing values + X_missing = X.copy() + X_missing[rng.choice([False, True], size=X.shape, p=[0.95, 0.05])] = np.nan + assert np.isnan(X_missing).any() + + X_missing_train, X_missing_test, y_train, y_test = train_test_split( + X_missing, y, random_state=0 + ) + + # Train forest with missing values + forest_with_missing = Forest(random_state=rng, n_estimators=50) + forest_with_missing.fit(X_missing_train, y_train) + score_with_missing = forest_with_missing.score(X_missing_test, y_test) + + # Train forest without missing values + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + forest = Forest(random_state=rng, n_estimators=50) + forest.fit(X_train, y_train) + score_without_missing = forest.score(X_test, y_test) + + # Score is still 80 percent of the forest's score that had no missing values + assert score_with_missing >= 0.80 * score_without_missing + + +@pytest.mark.parametrize( + "Forest", + [ + RandomForestClassifier, + RandomForestRegressor, + ExtraTreesRegressor, + ExtraTreesClassifier, + ], +) +def test_missing_value_is_predictive(Forest): + """Check that the forest learns when missing values are only present for + a predictive feature.""" + rng = np.random.RandomState(0) + n_samples = 300 + expected_score = 0.75 + + X_non_predictive = rng.standard_normal(size=(n_samples, 10)) + y = rng.randint(0, high=2, size=n_samples) + + # Create a predictive feature using `y` and with some noise + X_random_mask = rng.choice([False, True], size=n_samples, p=[0.95, 0.05]) + y_mask = y.astype(bool) + y_mask[X_random_mask] = ~y_mask[X_random_mask] + + predictive_feature = 
rng.standard_normal(size=n_samples) + predictive_feature[y_mask] = np.nan + assert np.isnan(predictive_feature).any() + + X_predictive = X_non_predictive.copy() + X_predictive[:, 5] = predictive_feature + + ( + X_predictive_train, + X_predictive_test, + X_non_predictive_train, + X_non_predictive_test, + y_train, + y_test, + ) = train_test_split(X_predictive, X_non_predictive, y, random_state=0) + forest_predictive = Forest(random_state=0).fit(X_predictive_train, y_train) + forest_non_predictive = Forest(random_state=0).fit(X_non_predictive_train, y_train) + + predictive_test_score = forest_predictive.score(X_predictive_test, y_test) + + assert predictive_test_score >= expected_score + assert predictive_test_score >= forest_non_predictive.score( + X_non_predictive_test, y_test + ) + + +@pytest.mark.parametrize("Forest", FOREST_REGRESSORS.values()) +def test_non_supported_criterion_raises_error_with_missing_values(Forest): + """Raise error for unsupported criterion when there are missing values.""" + X = np.array([[0, 1, 2], [np.nan, 0, 2.0]]) + y = [0.5, 1.0] + + forest = Forest(criterion="absolute_error") + + msg = ".*does not accept missing values" + with pytest.raises(ValueError, match=msg): + forest.fit(X, y) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_gradient_boosting.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_gradient_boosting.py new file mode 100644 index 0000000000000000000000000000000000000000..f799d51eec25cd908b9dfcda3704a0ab8b8d381a --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_gradient_boosting.py @@ -0,0 +1,1711 @@ +""" +Testing for the gradient boosting module (sklearn.ensemble.gradient_boosting). +""" + +import re +import warnings + +import numpy as np +import pytest +from numpy.testing import assert_allclose + +from sklearn import datasets +from sklearn.base import clone +from sklearn.datasets import make_classification, make_regression +from sklearn.dummy import DummyClassifier, DummyRegressor +from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor +from sklearn.ensemble._gb import _safe_divide +from sklearn.ensemble._gradient_boosting import predict_stages +from sklearn.exceptions import DataConversionWarning, NotFittedError +from sklearn.linear_model import LinearRegression +from sklearn.metrics import mean_squared_error +from sklearn.model_selection import train_test_split +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import scale +from sklearn.svm import NuSVR +from sklearn.utils import check_random_state +from sklearn.utils._mocking import NoSampleWeightWrapper +from sklearn.utils._param_validation import InvalidParameterError +from sklearn.utils._testing import ( + assert_array_almost_equal, + assert_array_equal, + skip_if_32bit, +) +from sklearn.utils.fixes import COO_CONTAINERS, CSC_CONTAINERS, CSR_CONTAINERS + +GRADIENT_BOOSTING_ESTIMATORS = [GradientBoostingClassifier, GradientBoostingRegressor] + +# toy sample +X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] +y = [-1, -1, -1, 1, 1, 1] +T = [[-1, -1], [2, 2], [3, 2]] +true_result = [-1, 1, 1] + +# also make regression dataset +X_reg, y_reg = make_regression( + n_samples=100, n_features=4, n_informative=8, noise=10, random_state=7 +) +y_reg = scale(y_reg) + +rng = np.random.RandomState(0) +# also load the iris dataset +# and randomly permute it +iris = datasets.load_iris() +perm = rng.permutation(iris.target.size) +iris.data = iris.data[perm] +iris.target = 
iris.target[perm] + + +def test_exponential_n_classes_gt_2(): + """Test exponential loss raises for n_classes > 2.""" + clf = GradientBoostingClassifier(loss="exponential") + msg = "loss='exponential' is only suitable for a binary classification" + with pytest.raises(ValueError, match=msg): + clf.fit(iris.data, iris.target) + + +def test_raise_if_init_has_no_predict_proba(): + """Test raise if init_ has no predict_proba method.""" + clf = GradientBoostingClassifier(init=GradientBoostingRegressor) + msg = ( + "The 'init' parameter of GradientBoostingClassifier must be a str among " + "{'zero'}, None or an object implementing 'fit' and 'predict_proba'." + ) + with pytest.raises(ValueError, match=msg): + clf.fit(X, y) + + +@pytest.mark.parametrize("loss", ("log_loss", "exponential")) +def test_classification_toy(loss, global_random_seed): + # Check classification on a toy dataset. + clf = GradientBoostingClassifier( + loss=loss, n_estimators=10, random_state=global_random_seed + ) + + with pytest.raises(ValueError): + clf.predict(T) + + clf.fit(X, y) + assert_array_equal(clf.predict(T), true_result) + assert 10 == len(clf.estimators_) + + log_loss_decrease = clf.train_score_[:-1] - clf.train_score_[1:] + assert np.any(log_loss_decrease >= 0.0) + + leaves = clf.apply(X) + assert leaves.shape == (6, 10, 1) + + +@pytest.mark.parametrize("loss", ("log_loss", "exponential")) +def test_classification_synthetic(loss, global_random_seed): + # Test GradientBoostingClassifier on synthetic dataset used by + # Hastie et al. in ESLII - Figure 10.9 + # Note that Figure 10.9 reuses the dataset generated for figure 10.2 + # and should have 2_000 train data points and 10_000 test data points. + # Here we intentionally use a smaller variant to make the test run faster, + # but the conclusions are still the same, despite the smaller datasets. + X, y = datasets.make_hastie_10_2(n_samples=2000, random_state=global_random_seed) + + split_idx = 500 + X_train, X_test = X[:split_idx], X[split_idx:] + y_train, y_test = y[:split_idx], y[split_idx:] + + # Increasing the number of trees should decrease the test error + common_params = { + "max_depth": 1, + "learning_rate": 1.0, + "loss": loss, + "random_state": global_random_seed, + } + gbrt_10_stumps = GradientBoostingClassifier(n_estimators=10, **common_params) + gbrt_10_stumps.fit(X_train, y_train) + + gbrt_50_stumps = GradientBoostingClassifier(n_estimators=50, **common_params) + gbrt_50_stumps.fit(X_train, y_train) + + assert gbrt_10_stumps.score(X_test, y_test) < gbrt_50_stumps.score(X_test, y_test) + + # Decision stumps are better suited for this dataset with a large number of + # estimators. + common_params = { + "n_estimators": 200, + "learning_rate": 1.0, + "loss": loss, + "random_state": global_random_seed, + } + gbrt_stumps = GradientBoostingClassifier(max_depth=1, **common_params) + gbrt_stumps.fit(X_train, y_train) + + gbrt_10_nodes = GradientBoostingClassifier(max_leaf_nodes=10, **common_params) + gbrt_10_nodes.fit(X_train, y_train) + + assert gbrt_stumps.score(X_test, y_test) > gbrt_10_nodes.score(X_test, y_test) + + +@pytest.mark.parametrize("loss", ("squared_error", "absolute_error", "huber")) +@pytest.mark.parametrize("subsample", (1.0, 0.5)) +def test_regression_dataset(loss, subsample, global_random_seed): + # Check consistency on regression dataset with least squares + # and least absolute deviation. 
+ ones = np.ones(len(y_reg))
+ last_y_pred = None
+ for sample_weight in [None, ones, 2 * ones]:
+ # learning_rate, max_depth and n_estimators were adjusted to get a model
+ # that is accurate enough to reach a low MSE on the training set while
+ # keeping the resources used to execute this test low enough.
+ reg = GradientBoostingRegressor(
+ n_estimators=30,
+ loss=loss,
+ max_depth=4,
+ subsample=subsample,
+ min_samples_split=2,
+ random_state=global_random_seed,
+ learning_rate=0.5,
+ )
+
+ reg.fit(X_reg, y_reg, sample_weight=sample_weight)
+ leaves = reg.apply(X_reg)
+ assert leaves.shape == (100, 30)
+
+ y_pred = reg.predict(X_reg)
+ mse = mean_squared_error(y_reg, y_pred)
+ assert mse < 0.05
+
+ if last_y_pred is not None:
+ # FIXME: We temporarily bypass this test. This is due to the fact
+ # that GBRT with and without `sample_weight` do not use the same
+ # implementation of the median during the initialization with the
+ # `DummyRegressor`. In the future, we should make sure that both
+ # implementations are the same. See PR #17377 for more.
+ # assert_allclose(last_y_pred, y_pred)
+ pass
+
+ last_y_pred = y_pred
+
+
+@pytest.mark.parametrize("subsample", (1.0, 0.5))
+@pytest.mark.parametrize("sample_weight", (None, 1))
+def test_iris(subsample, sample_weight, global_random_seed):
+ if sample_weight == 1:
+ sample_weight = np.ones(len(iris.target))
+ # Check consistency on dataset iris.
+ clf = GradientBoostingClassifier(
+ n_estimators=100,
+ loss="log_loss",
+ random_state=global_random_seed,
+ subsample=subsample,
+ )
+ clf.fit(iris.data, iris.target, sample_weight=sample_weight)
+ score = clf.score(iris.data, iris.target)
+ assert score > 0.9
+
+ leaves = clf.apply(iris.data)
+ assert leaves.shape == (150, 100, 3)
+
+
+def test_regression_synthetic(global_random_seed):
+ # Test on synthetic regression datasets used in Leo Breiman,
+ # `Bagging Predictors`. Machine Learning 24(2): 123-140 (1996).
+ random_state = check_random_state(global_random_seed) + regression_params = { + "n_estimators": 100, + "max_depth": 4, + "min_samples_split": 2, + "learning_rate": 0.1, + "loss": "squared_error", + "random_state": global_random_seed, + } + + # Friedman1 + X, y = datasets.make_friedman1(n_samples=1200, random_state=random_state, noise=1.0) + X_train, y_train = X[:200], y[:200] + X_test, y_test = X[200:], y[200:] + + clf = GradientBoostingRegressor(**regression_params) + clf.fit(X_train, y_train) + mse = mean_squared_error(y_test, clf.predict(X_test)) + assert mse < 6.5 + + # Friedman2 + X, y = datasets.make_friedman2(n_samples=1200, random_state=random_state) + X_train, y_train = X[:200], y[:200] + X_test, y_test = X[200:], y[200:] + + clf = GradientBoostingRegressor(**regression_params) + clf.fit(X_train, y_train) + mse = mean_squared_error(y_test, clf.predict(X_test)) + assert mse < 2500.0 + + # Friedman3 + X, y = datasets.make_friedman3(n_samples=1200, random_state=random_state) + X_train, y_train = X[:200], y[:200] + X_test, y_test = X[200:], y[200:] + + clf = GradientBoostingRegressor(**regression_params) + clf.fit(X_train, y_train) + mse = mean_squared_error(y_test, clf.predict(X_test)) + assert mse < 0.025 + + +@pytest.mark.parametrize( + "GradientBoosting, X, y", + [ + (GradientBoostingRegressor, X_reg, y_reg), + (GradientBoostingClassifier, iris.data, iris.target), + ], +) +def test_feature_importances(GradientBoosting, X, y): + # smoke test to check that the gradient boosting expose an attribute + # feature_importances_ + gbdt = GradientBoosting() + assert not hasattr(gbdt, "feature_importances_") + gbdt.fit(X, y) + assert hasattr(gbdt, "feature_importances_") + + +def test_probability_log(global_random_seed): + # Predict probabilities. + clf = GradientBoostingClassifier(n_estimators=100, random_state=global_random_seed) + + with pytest.raises(ValueError): + clf.predict_proba(T) + + clf.fit(X, y) + assert_array_equal(clf.predict(T), true_result) + + # check if probabilities are in [0, 1]. + y_proba = clf.predict_proba(T) + assert np.all(y_proba >= 0.0) + assert np.all(y_proba <= 1.0) + + # derive predictions from probabilities + y_pred = clf.classes_.take(y_proba.argmax(axis=1), axis=0) + assert_array_equal(y_pred, true_result) + + +def test_single_class_with_sample_weight(): + sample_weight = [0, 0, 0, 1, 1, 1] + clf = GradientBoostingClassifier(n_estimators=100, random_state=1) + msg = ( + "y contains 1 class after sample_weight trimmed classes with " + "zero weights, while a minimum of 2 classes are required." 
+ ) + with pytest.raises(ValueError, match=msg): + clf.fit(X, y, sample_weight=sample_weight) + + +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_check_inputs_predict_stages(csc_container): + # check that predict_stages through an error if the type of X is not + # supported + x, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) + x_sparse_csc = csc_container(x) + clf = GradientBoostingClassifier(n_estimators=100, random_state=1) + clf.fit(x, y) + score = np.zeros((y.shape)).reshape(-1, 1) + err_msg = "When X is a sparse matrix, a CSR format is expected" + with pytest.raises(ValueError, match=err_msg): + predict_stages(clf.estimators_, x_sparse_csc, clf.learning_rate, score) + x_fortran = np.asfortranarray(x) + with pytest.raises(ValueError, match="X should be C-ordered np.ndarray"): + predict_stages(clf.estimators_, x_fortran, clf.learning_rate, score) + + +def test_max_feature_regression(global_random_seed): + # Test to make sure random state is set properly. + X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=global_random_seed) + + X_train, X_test = X[:2000], X[2000:] + y_train, y_test = y[:2000], y[2000:] + + gbrt = GradientBoostingClassifier( + n_estimators=100, + min_samples_split=5, + max_depth=2, + learning_rate=0.1, + max_features=2, + random_state=global_random_seed, + ) + gbrt.fit(X_train, y_train) + log_loss = gbrt._loss(y_test, gbrt.decision_function(X_test)) + assert log_loss < 0.5, "GB failed with deviance %.4f" % log_loss + + +def test_feature_importance_regression( + fetch_california_housing_fxt, global_random_seed +): + """Test that Gini importance is calculated correctly. + + This test follows the example from [1]_ (pg. 373). + + .. [1] Friedman, J., Hastie, T., & Tibshirani, R. (2001). The elements + of statistical learning. New York: Springer series in statistics. + """ + california = fetch_california_housing_fxt() + X, y = california.data, california.target + X_train, X_test, y_train, y_test = train_test_split( + X, y, random_state=global_random_seed + ) + + reg = GradientBoostingRegressor( + loss="huber", + learning_rate=0.1, + max_leaf_nodes=6, + n_estimators=100, + random_state=global_random_seed, + ) + reg.fit(X_train, y_train) + sorted_idx = np.argsort(reg.feature_importances_)[::-1] + sorted_features = [california.feature_names[s] for s in sorted_idx] + + # The most important feature is the median income by far. + assert sorted_features[0] == "MedInc" + + # The three subsequent features are the following. Their relative ordering + # might change a bit depending on the randomness of the trees and the + # train / test split. + assert set(sorted_features[1:4]) == {"Longitude", "AveOccup", "Latitude"} + + +def test_max_features(): + # Test if max features is set properly for floats and str. 
+ X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1) + _, n_features = X.shape + + X_train = X[:2000] + y_train = y[:2000] + + gbrt = GradientBoostingClassifier(n_estimators=1, max_features=None) + gbrt.fit(X_train, y_train) + assert gbrt.max_features_ == n_features + + gbrt = GradientBoostingRegressor(n_estimators=1, max_features=None) + gbrt.fit(X_train, y_train) + assert gbrt.max_features_ == n_features + + gbrt = GradientBoostingRegressor(n_estimators=1, max_features=0.3) + gbrt.fit(X_train, y_train) + assert gbrt.max_features_ == int(n_features * 0.3) + + gbrt = GradientBoostingRegressor(n_estimators=1, max_features="sqrt") + gbrt.fit(X_train, y_train) + assert gbrt.max_features_ == int(np.sqrt(n_features)) + + gbrt = GradientBoostingRegressor(n_estimators=1, max_features="log2") + gbrt.fit(X_train, y_train) + assert gbrt.max_features_ == int(np.log2(n_features)) + + gbrt = GradientBoostingRegressor(n_estimators=1, max_features=0.01 / X.shape[1]) + gbrt.fit(X_train, y_train) + assert gbrt.max_features_ == 1 + + +def test_staged_predict(): + # Test whether staged decision function eventually gives + # the same prediction. + X, y = datasets.make_friedman1(n_samples=1200, random_state=1, noise=1.0) + X_train, y_train = X[:200], y[:200] + X_test = X[200:] + clf = GradientBoostingRegressor() + # test raise ValueError if not fitted + with pytest.raises(ValueError): + np.fromiter(clf.staged_predict(X_test), dtype=np.float64) + + clf.fit(X_train, y_train) + y_pred = clf.predict(X_test) + + # test if prediction for last stage equals ``predict`` + for y in clf.staged_predict(X_test): + assert y.shape == y_pred.shape + + assert_array_almost_equal(y_pred, y) + + +def test_staged_predict_proba(): + # Test whether staged predict proba eventually gives + # the same prediction. + X, y = datasets.make_hastie_10_2(n_samples=1200, random_state=1) + X_train, y_train = X[:200], y[:200] + X_test, y_test = X[200:], y[200:] + clf = GradientBoostingClassifier(n_estimators=20) + # test raise NotFittedError if not + with pytest.raises(NotFittedError): + np.fromiter(clf.staged_predict_proba(X_test), dtype=np.float64) + + clf.fit(X_train, y_train) + + # test if prediction for last stage equals ``predict`` + for y_pred in clf.staged_predict(X_test): + assert y_test.shape == y_pred.shape + + assert_array_equal(clf.predict(X_test), y_pred) + + # test if prediction for last stage equals ``predict_proba`` + for staged_proba in clf.staged_predict_proba(X_test): + assert y_test.shape[0] == staged_proba.shape[0] + assert 2 == staged_proba.shape[1] + + assert_array_almost_equal(clf.predict_proba(X_test), staged_proba) + + +@pytest.mark.parametrize("Estimator", GRADIENT_BOOSTING_ESTIMATORS) +def test_staged_functions_defensive(Estimator, global_random_seed): + # test that staged_functions make defensive copies + rng = np.random.RandomState(global_random_seed) + X = rng.uniform(size=(10, 3)) + y = (4 * X[:, 0]).astype(int) + 1 # don't predict zeros + estimator = Estimator() + estimator.fit(X, y) + for func in ["predict", "decision_function", "predict_proba"]: + staged_func = getattr(estimator, "staged_" + func, None) + if staged_func is None: + # regressor has no staged_predict_proba + continue + with warnings.catch_warnings(record=True): + staged_result = list(staged_func(X)) + staged_result[1][:] = 0 + assert np.all(staged_result[0] != 0) + + +def test_serialization(): + # Check model serialization. 
+ clf = GradientBoostingClassifier(n_estimators=100, random_state=1) + + clf.fit(X, y) + assert_array_equal(clf.predict(T), true_result) + assert 100 == len(clf.estimators_) + + try: + import cPickle as pickle + except ImportError: + import pickle + + serialized_clf = pickle.dumps(clf, protocol=pickle.HIGHEST_PROTOCOL) + clf = None + clf = pickle.loads(serialized_clf) + assert_array_equal(clf.predict(T), true_result) + assert 100 == len(clf.estimators_) + + +def test_degenerate_targets(): + # Check if we can fit even though all targets are equal. + clf = GradientBoostingClassifier(n_estimators=100, random_state=1) + + # classifier should raise exception + with pytest.raises(ValueError): + clf.fit(X, np.ones(len(X))) + + clf = GradientBoostingRegressor(n_estimators=100, random_state=1) + clf.fit(X, np.ones(len(X))) + clf.predict([rng.rand(2)]) + assert_array_equal(np.ones((1,), dtype=np.float64), clf.predict([rng.rand(2)])) + + +def test_quantile_loss(global_random_seed): + # Check if quantile loss with alpha=0.5 equals absolute_error. + clf_quantile = GradientBoostingRegressor( + n_estimators=100, + loss="quantile", + max_depth=4, + alpha=0.5, + random_state=global_random_seed, + ) + + clf_quantile.fit(X_reg, y_reg) + y_quantile = clf_quantile.predict(X_reg) + + clf_ae = GradientBoostingRegressor( + n_estimators=100, + loss="absolute_error", + max_depth=4, + random_state=global_random_seed, + ) + + clf_ae.fit(X_reg, y_reg) + y_ae = clf_ae.predict(X_reg) + assert_allclose(y_quantile, y_ae) + + +def test_symbol_labels(): + # Test with non-integer class labels. + clf = GradientBoostingClassifier(n_estimators=100, random_state=1) + + symbol_y = list(map(str, y)) + + clf.fit(X, symbol_y) + assert_array_equal(clf.predict(T), list(map(str, true_result))) + assert 100 == len(clf.estimators_) + + +def test_float_class_labels(): + # Test with float class labels. + clf = GradientBoostingClassifier(n_estimators=100, random_state=1) + + float_y = np.asarray(y, dtype=np.float32) + + clf.fit(X, float_y) + assert_array_equal(clf.predict(T), np.asarray(true_result, dtype=np.float32)) + assert 100 == len(clf.estimators_) + + +def test_shape_y(): + # Test with float class labels. + clf = GradientBoostingClassifier(n_estimators=100, random_state=1) + + y_ = np.asarray(y, dtype=np.int32) + y_ = y_[:, np.newaxis] + + # This will raise a DataConversionWarning that we want to + # "always" raise, elsewhere the warnings gets ignored in the + # later tests, and the tests that check for this warning fail + warn_msg = ( + "A column-vector y was passed when a 1d array was expected. " + "Please change the shape of y to \\(n_samples, \\), for " + "example using ravel()." 
+ ) + with pytest.warns(DataConversionWarning, match=warn_msg): + clf.fit(X, y_) + assert_array_equal(clf.predict(T), true_result) + assert 100 == len(clf.estimators_) + + +def test_mem_layout(): + # Test with different memory layouts of X and y + X_ = np.asfortranarray(X) + clf = GradientBoostingClassifier(n_estimators=100, random_state=1) + clf.fit(X_, y) + assert_array_equal(clf.predict(T), true_result) + assert 100 == len(clf.estimators_) + + X_ = np.ascontiguousarray(X) + clf = GradientBoostingClassifier(n_estimators=100, random_state=1) + clf.fit(X_, y) + assert_array_equal(clf.predict(T), true_result) + assert 100 == len(clf.estimators_) + + y_ = np.asarray(y, dtype=np.int32) + y_ = np.ascontiguousarray(y_) + clf = GradientBoostingClassifier(n_estimators=100, random_state=1) + clf.fit(X, y_) + assert_array_equal(clf.predict(T), true_result) + assert 100 == len(clf.estimators_) + + y_ = np.asarray(y, dtype=np.int32) + y_ = np.asfortranarray(y_) + clf = GradientBoostingClassifier(n_estimators=100, random_state=1) + clf.fit(X, y_) + assert_array_equal(clf.predict(T), true_result) + assert 100 == len(clf.estimators_) + + +@pytest.mark.parametrize("GradientBoostingEstimator", GRADIENT_BOOSTING_ESTIMATORS) +def test_oob_improvement(GradientBoostingEstimator): + # Test if oob improvement has correct shape and regression test. + estimator = GradientBoostingEstimator( + n_estimators=100, random_state=1, subsample=0.5 + ) + estimator.fit(X, y) + assert estimator.oob_improvement_.shape[0] == 100 + # hard-coded regression test - change if modification in OOB computation + assert_array_almost_equal( + estimator.oob_improvement_[:5], + np.array([0.19, 0.15, 0.12, -0.11, 0.11]), + decimal=2, + ) + + +@pytest.mark.parametrize("GradientBoostingEstimator", GRADIENT_BOOSTING_ESTIMATORS) +def test_oob_scores(GradientBoostingEstimator): + # Test if oob scores has correct shape and regression test. + X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) + estimator = GradientBoostingEstimator( + n_estimators=100, random_state=1, subsample=0.5 + ) + estimator.fit(X, y) + assert estimator.oob_scores_.shape[0] == 100 + assert estimator.oob_scores_[-1] == pytest.approx(estimator.oob_score_) + + estimator = GradientBoostingEstimator( + n_estimators=100, + random_state=1, + subsample=0.5, + n_iter_no_change=5, + ) + estimator.fit(X, y) + assert estimator.oob_scores_.shape[0] < 100 + assert estimator.oob_scores_[-1] == pytest.approx(estimator.oob_score_) + + +@pytest.mark.parametrize( + "GradientBoostingEstimator, oob_attribute", + [ + (GradientBoostingClassifier, "oob_improvement_"), + (GradientBoostingClassifier, "oob_scores_"), + (GradientBoostingClassifier, "oob_score_"), + (GradientBoostingRegressor, "oob_improvement_"), + (GradientBoostingRegressor, "oob_scores_"), + (GradientBoostingRegressor, "oob_score_"), + ], +) +def test_oob_attributes_error(GradientBoostingEstimator, oob_attribute): + """ + Check that we raise an AttributeError when the OOB statistics were not computed. + """ + X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) + estimator = GradientBoostingEstimator( + n_estimators=100, + random_state=1, + subsample=1.0, + ) + estimator.fit(X, y) + with pytest.raises(AttributeError): + estimator.oob_attribute + + +def test_oob_multilcass_iris(): + # Check OOB improvement on multi-class dataset. 
+ estimator = GradientBoostingClassifier( + n_estimators=100, loss="log_loss", random_state=1, subsample=0.5 + ) + estimator.fit(iris.data, iris.target) + score = estimator.score(iris.data, iris.target) + assert score > 0.9 + assert estimator.oob_improvement_.shape[0] == estimator.n_estimators + assert estimator.oob_scores_.shape[0] == estimator.n_estimators + assert estimator.oob_scores_[-1] == pytest.approx(estimator.oob_score_) + + estimator = GradientBoostingClassifier( + n_estimators=100, + loss="log_loss", + random_state=1, + subsample=0.5, + n_iter_no_change=5, + ) + estimator.fit(iris.data, iris.target) + score = estimator.score(iris.data, iris.target) + assert estimator.oob_improvement_.shape[0] < estimator.n_estimators + assert estimator.oob_scores_.shape[0] < estimator.n_estimators + assert estimator.oob_scores_[-1] == pytest.approx(estimator.oob_score_) + + # hard-coded regression test - change if modification in OOB computation + # FIXME: the following snippet does not yield the same results on 32 bits + # assert_array_almost_equal(estimator.oob_improvement_[:5], + # np.array([12.68, 10.45, 8.18, 6.43, 5.13]), + # decimal=2) + + +def test_verbose_output(): + # Check verbose=1 does not cause error. + import sys + from io import StringIO + + old_stdout = sys.stdout + sys.stdout = StringIO() + clf = GradientBoostingClassifier( + n_estimators=100, random_state=1, verbose=1, subsample=0.8 + ) + clf.fit(X, y) + verbose_output = sys.stdout + sys.stdout = old_stdout + + # check output + verbose_output.seek(0) + header = verbose_output.readline().rstrip() + # with OOB + true_header = " ".join(["%10s"] + ["%16s"] * 3) % ( + "Iter", + "Train Loss", + "OOB Improve", + "Remaining Time", + ) + assert true_header == header + + n_lines = sum(1 for l in verbose_output.readlines()) + # one for 1-10 and then 9 for 20-100 + assert 10 + 9 == n_lines + + +def test_more_verbose_output(): + # Check verbose=2 does not cause error. + import sys + from io import StringIO + + old_stdout = sys.stdout + sys.stdout = StringIO() + clf = GradientBoostingClassifier(n_estimators=100, random_state=1, verbose=2) + clf.fit(X, y) + verbose_output = sys.stdout + sys.stdout = old_stdout + + # check output + verbose_output.seek(0) + header = verbose_output.readline().rstrip() + # no OOB + true_header = " ".join(["%10s"] + ["%16s"] * 2) % ( + "Iter", + "Train Loss", + "Remaining Time", + ) + assert true_header == header + + n_lines = sum(1 for l in verbose_output.readlines()) + # 100 lines for n_estimators==100 + assert 100 == n_lines + + +@pytest.mark.parametrize("Cls", GRADIENT_BOOSTING_ESTIMATORS) +def test_warm_start(Cls, global_random_seed): + # Test if warm start equals fit. + X, y = datasets.make_hastie_10_2(n_samples=100, random_state=global_random_seed) + est = Cls(n_estimators=200, max_depth=1, random_state=global_random_seed) + est.fit(X, y) + + est_ws = Cls( + n_estimators=100, max_depth=1, warm_start=True, random_state=global_random_seed + ) + est_ws.fit(X, y) + est_ws.set_params(n_estimators=200) + est_ws.fit(X, y) + + if Cls is GradientBoostingRegressor: + assert_allclose(est_ws.predict(X), est.predict(X)) + else: + # Random state is preserved and hence predict_proba must also be + # same + assert_array_equal(est_ws.predict(X), est.predict(X)) + assert_allclose(est_ws.predict_proba(X), est.predict_proba(X)) + + +@pytest.mark.parametrize("Cls", GRADIENT_BOOSTING_ESTIMATORS) +def test_warm_start_n_estimators(Cls, global_random_seed): + # Test if warm start equals fit - set n_estimators. 
+ X, y = datasets.make_hastie_10_2(n_samples=100, random_state=global_random_seed) + est = Cls(n_estimators=300, max_depth=1, random_state=global_random_seed) + est.fit(X, y) + + est_ws = Cls( + n_estimators=100, max_depth=1, warm_start=True, random_state=global_random_seed + ) + est_ws.fit(X, y) + est_ws.set_params(n_estimators=300) + est_ws.fit(X, y) + + assert_allclose(est_ws.predict(X), est.predict(X)) + + +@pytest.mark.parametrize("Cls", GRADIENT_BOOSTING_ESTIMATORS) +def test_warm_start_max_depth(Cls): + # Test if possible to fit trees of different depth in ensemble. + X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) + est = Cls(n_estimators=100, max_depth=1, warm_start=True) + est.fit(X, y) + est.set_params(n_estimators=110, max_depth=2) + est.fit(X, y) + + # last 10 trees have different depth + assert est.estimators_[0, 0].max_depth == 1 + for i in range(1, 11): + assert est.estimators_[-i, 0].max_depth == 2 + + +@pytest.mark.parametrize("Cls", GRADIENT_BOOSTING_ESTIMATORS) +def test_warm_start_clear(Cls): + # Test if fit clears state. + X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) + est = Cls(n_estimators=100, max_depth=1) + est.fit(X, y) + + est_2 = Cls(n_estimators=100, max_depth=1, warm_start=True) + est_2.fit(X, y) # inits state + est_2.set_params(warm_start=False) + est_2.fit(X, y) # clears old state and equals est + + assert_array_almost_equal(est_2.predict(X), est.predict(X)) + + +@pytest.mark.parametrize("GradientBoosting", GRADIENT_BOOSTING_ESTIMATORS) +def test_warm_start_state_oob_scores(GradientBoosting): + """ + Check that the states of the OOB scores are cleared when used with `warm_start`. + """ + X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) + n_estimators = 100 + estimator = GradientBoosting( + n_estimators=n_estimators, + max_depth=1, + subsample=0.5, + warm_start=True, + random_state=1, + ) + estimator.fit(X, y) + oob_scores, oob_score = estimator.oob_scores_, estimator.oob_score_ + assert len(oob_scores) == n_estimators + assert oob_scores[-1] == pytest.approx(oob_score) + + n_more_estimators = 200 + estimator.set_params(n_estimators=n_more_estimators).fit(X, y) + assert len(estimator.oob_scores_) == n_more_estimators + assert_allclose(estimator.oob_scores_[:n_estimators], oob_scores) + + estimator.set_params(n_estimators=n_estimators, warm_start=False).fit(X, y) + assert estimator.oob_scores_ is not oob_scores + assert estimator.oob_score_ is not oob_score + assert_allclose(estimator.oob_scores_, oob_scores) + assert estimator.oob_score_ == pytest.approx(oob_score) + assert oob_scores[-1] == pytest.approx(oob_score) + + +@pytest.mark.parametrize("Cls", GRADIENT_BOOSTING_ESTIMATORS) +def test_warm_start_smaller_n_estimators(Cls): + # Test if warm start with smaller n_estimators raises error + X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) + est = Cls(n_estimators=100, max_depth=1, warm_start=True) + est.fit(X, y) + est.set_params(n_estimators=99) + with pytest.raises(ValueError): + est.fit(X, y) + + +@pytest.mark.parametrize("Cls", GRADIENT_BOOSTING_ESTIMATORS) +def test_warm_start_equal_n_estimators(Cls): + # Test if warm start with equal n_estimators does nothing + X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) + est = Cls(n_estimators=100, max_depth=1) + est.fit(X, y) + + est2 = clone(est) + est2.set_params(n_estimators=est.n_estimators, warm_start=True) + est2.fit(X, y) + + assert_array_almost_equal(est2.predict(X), est.predict(X)) + + +@pytest.mark.parametrize("Cls", 
GRADIENT_BOOSTING_ESTIMATORS) +def test_warm_start_oob_switch(Cls): + # Test if oob can be turned on during warm start. + X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) + est = Cls(n_estimators=100, max_depth=1, warm_start=True) + est.fit(X, y) + est.set_params(n_estimators=110, subsample=0.5) + est.fit(X, y) + + assert_array_equal(est.oob_improvement_[:100], np.zeros(100)) + assert_array_equal(est.oob_scores_[:100], np.zeros(100)) + + # the last 10 are not zeros + assert (est.oob_improvement_[-10:] != 0.0).all() + assert (est.oob_scores_[-10:] != 0.0).all() + + assert est.oob_scores_[-1] == pytest.approx(est.oob_score_) + + +@pytest.mark.parametrize("Cls", GRADIENT_BOOSTING_ESTIMATORS) +def test_warm_start_oob(Cls): + # Test if warm start OOB equals fit. + X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) + est = Cls(n_estimators=200, max_depth=1, subsample=0.5, random_state=1) + est.fit(X, y) + + est_ws = Cls( + n_estimators=100, max_depth=1, subsample=0.5, random_state=1, warm_start=True + ) + est_ws.fit(X, y) + est_ws.set_params(n_estimators=200) + est_ws.fit(X, y) + + assert_array_almost_equal(est_ws.oob_improvement_[:100], est.oob_improvement_[:100]) + assert_array_almost_equal(est_ws.oob_scores_[:100], est.oob_scores_[:100]) + assert est.oob_scores_[-1] == pytest.approx(est.oob_score_) + assert est_ws.oob_scores_[-1] == pytest.approx(est_ws.oob_score_) + + +@pytest.mark.parametrize("Cls", GRADIENT_BOOSTING_ESTIMATORS) +@pytest.mark.parametrize( + "sparse_container", COO_CONTAINERS + CSC_CONTAINERS + CSR_CONTAINERS +) +def test_warm_start_sparse(Cls, sparse_container): + # Test that all sparse matrix types are supported + X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) + est_dense = Cls( + n_estimators=100, max_depth=1, subsample=0.5, random_state=1, warm_start=True + ) + est_dense.fit(X, y) + est_dense.predict(X) + est_dense.set_params(n_estimators=200) + est_dense.fit(X, y) + y_pred_dense = est_dense.predict(X) + + X_sparse = sparse_container(X) + + est_sparse = Cls( + n_estimators=100, + max_depth=1, + subsample=0.5, + random_state=1, + warm_start=True, + ) + est_sparse.fit(X_sparse, y) + est_sparse.predict(X) + est_sparse.set_params(n_estimators=200) + est_sparse.fit(X_sparse, y) + y_pred_sparse = est_sparse.predict(X) + + assert_array_almost_equal( + est_dense.oob_improvement_[:100], est_sparse.oob_improvement_[:100] + ) + assert est_dense.oob_scores_[-1] == pytest.approx(est_dense.oob_score_) + assert_array_almost_equal(est_dense.oob_scores_[:100], est_sparse.oob_scores_[:100]) + assert est_sparse.oob_scores_[-1] == pytest.approx(est_sparse.oob_score_) + assert_array_almost_equal(y_pred_dense, y_pred_sparse) + + +@pytest.mark.parametrize("Cls", GRADIENT_BOOSTING_ESTIMATORS) +def test_warm_start_fortran(Cls, global_random_seed): + # Test that feeding a X in Fortran-ordered is giving the same results as + # in C-ordered + X, y = datasets.make_hastie_10_2(n_samples=100, random_state=global_random_seed) + est_c = Cls(n_estimators=1, random_state=global_random_seed, warm_start=True) + est_fortran = Cls(n_estimators=1, random_state=global_random_seed, warm_start=True) + + est_c.fit(X, y) + est_c.set_params(n_estimators=11) + est_c.fit(X, y) + + X_fortran = np.asfortranarray(X) + est_fortran.fit(X_fortran, y) + est_fortran.set_params(n_estimators=11) + est_fortran.fit(X_fortran, y) + + assert_allclose(est_c.predict(X), est_fortran.predict(X)) + + +def early_stopping_monitor(i, est, locals): + """Returns True on the 10th iteration.""" + if 
i == 9: + return True + else: + return False + + +@pytest.mark.parametrize("Cls", GRADIENT_BOOSTING_ESTIMATORS) +def test_monitor_early_stopping(Cls): + # Test if monitor return value works. + X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) + + est = Cls(n_estimators=20, max_depth=1, random_state=1, subsample=0.5) + est.fit(X, y, monitor=early_stopping_monitor) + assert est.n_estimators == 20 # this is not altered + assert est.estimators_.shape[0] == 10 + assert est.train_score_.shape[0] == 10 + assert est.oob_improvement_.shape[0] == 10 + assert est.oob_scores_.shape[0] == 10 + assert est.oob_scores_[-1] == pytest.approx(est.oob_score_) + + # try refit + est.set_params(n_estimators=30) + est.fit(X, y) + assert est.n_estimators == 30 + assert est.estimators_.shape[0] == 30 + assert est.train_score_.shape[0] == 30 + assert est.oob_improvement_.shape[0] == 30 + assert est.oob_scores_.shape[0] == 30 + assert est.oob_scores_[-1] == pytest.approx(est.oob_score_) + + est = Cls( + n_estimators=20, max_depth=1, random_state=1, subsample=0.5, warm_start=True + ) + est.fit(X, y, monitor=early_stopping_monitor) + assert est.n_estimators == 20 + assert est.estimators_.shape[0] == 10 + assert est.train_score_.shape[0] == 10 + assert est.oob_improvement_.shape[0] == 10 + assert est.oob_scores_.shape[0] == 10 + assert est.oob_scores_[-1] == pytest.approx(est.oob_score_) + + # try refit + est.set_params(n_estimators=30, warm_start=False) + est.fit(X, y) + assert est.n_estimators == 30 + assert est.train_score_.shape[0] == 30 + assert est.estimators_.shape[0] == 30 + assert est.oob_improvement_.shape[0] == 30 + assert est.oob_scores_.shape[0] == 30 + assert est.oob_scores_[-1] == pytest.approx(est.oob_score_) + + +def test_complete_classification(): + # Test greedy trees with max_depth + 1 leafs. + from sklearn.tree._tree import TREE_LEAF + + X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) + k = 4 + + est = GradientBoostingClassifier( + n_estimators=20, max_depth=None, random_state=1, max_leaf_nodes=k + 1 + ) + est.fit(X, y) + + tree = est.estimators_[0, 0].tree_ + assert tree.max_depth == k + assert tree.children_left[tree.children_left == TREE_LEAF].shape[0] == k + 1 + + +def test_complete_regression(): + # Test greedy trees with max_depth + 1 leafs. + from sklearn.tree._tree import TREE_LEAF + + k = 4 + + est = GradientBoostingRegressor( + n_estimators=20, max_depth=None, random_state=1, max_leaf_nodes=k + 1 + ) + est.fit(X_reg, y_reg) + + tree = est.estimators_[-1, 0].tree_ + assert tree.children_left[tree.children_left == TREE_LEAF].shape[0] == k + 1 + + +def test_zero_estimator_reg(global_random_seed): + # Test if init='zero' works for regression by checking that it is better + # than a simple baseline. + + baseline = DummyRegressor(strategy="mean").fit(X_reg, y_reg) + mse_baseline = mean_squared_error(baseline.predict(X_reg), y_reg) + est = GradientBoostingRegressor( + n_estimators=5, + max_depth=1, + random_state=global_random_seed, + init="zero", + learning_rate=0.5, + ) + est.fit(X_reg, y_reg) + y_pred = est.predict(X_reg) + mse_gbdt = mean_squared_error(y_reg, y_pred) + assert mse_gbdt < mse_baseline + + +def test_zero_estimator_clf(global_random_seed): + # Test if init='zero' works for classification. 
+ X = iris.data + y = np.array(iris.target) + + est = GradientBoostingClassifier( + n_estimators=20, max_depth=1, random_state=global_random_seed, init="zero" + ) + est.fit(X, y) + + assert est.score(X, y) > 0.96 + + # binary clf + mask = y != 0 + y[mask] = 1 + y[~mask] = 0 + est = GradientBoostingClassifier( + n_estimators=20, max_depth=1, random_state=global_random_seed, init="zero" + ) + est.fit(X, y) + assert est.score(X, y) > 0.96 + + +@pytest.mark.parametrize("GBEstimator", GRADIENT_BOOSTING_ESTIMATORS) +def test_max_leaf_nodes_max_depth(GBEstimator): + # Test precedence of max_leaf_nodes over max_depth. + X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) + + k = 4 + + est = GBEstimator(max_depth=1, max_leaf_nodes=k).fit(X, y) + tree = est.estimators_[0, 0].tree_ + assert tree.max_depth == 1 + + est = GBEstimator(max_depth=1).fit(X, y) + tree = est.estimators_[0, 0].tree_ + assert tree.max_depth == 1 + + +@pytest.mark.parametrize("GBEstimator", GRADIENT_BOOSTING_ESTIMATORS) +def test_min_impurity_decrease(GBEstimator): + X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) + + est = GBEstimator(min_impurity_decrease=0.1) + est.fit(X, y) + for tree in est.estimators_.flat: + # Simply check if the parameter is passed on correctly. Tree tests + # will suffice for the actual working of this param + assert tree.min_impurity_decrease == 0.1 + + +def test_warm_start_wo_nestimators_change(): + # Test if warm_start does nothing if n_estimators is not changed. + # Regression test for #3513. + clf = GradientBoostingClassifier(n_estimators=10, warm_start=True) + clf.fit([[0, 1], [2, 3]], [0, 1]) + assert clf.estimators_.shape[0] == 10 + clf.fit([[0, 1], [2, 3]], [0, 1]) + assert clf.estimators_.shape[0] == 10 + + +@pytest.mark.parametrize( + ("loss", "value"), + [ + ("squared_error", 0.5), + ("absolute_error", 0.0), + ("huber", 0.5), + ("quantile", 0.5), + ], +) +def test_non_uniform_weights_toy_edge_case_reg(loss, value): + X = [[1, 0], [1, 0], [1, 0], [0, 1]] + y = [0, 0, 1, 0] + # ignore the first 2 training samples by setting their weight to 0 + sample_weight = [0, 0, 1, 1] + gb = GradientBoostingRegressor(learning_rate=1.0, n_estimators=2, loss=loss) + gb.fit(X, y, sample_weight=sample_weight) + assert gb.predict([[1, 0]])[0] >= value + + +def test_non_uniform_weights_toy_edge_case_clf(): + X = [[1, 0], [1, 0], [1, 0], [0, 1]] + y = [0, 0, 1, 0] + # ignore the first 2 training samples by setting their weight to 0 + sample_weight = [0, 0, 1, 1] + for loss in ("log_loss", "exponential"): + gb = GradientBoostingClassifier(n_estimators=5, loss=loss) + gb.fit(X, y, sample_weight=sample_weight) + assert_array_equal(gb.predict([[1, 0]]), [1]) + + +@skip_if_32bit +@pytest.mark.parametrize( + "EstimatorClass", (GradientBoostingClassifier, GradientBoostingRegressor) +) +@pytest.mark.parametrize( + "sparse_container", COO_CONTAINERS + CSC_CONTAINERS + CSR_CONTAINERS +) +def test_sparse_input(EstimatorClass, sparse_container): + y, X = datasets.make_multilabel_classification( + random_state=0, n_samples=50, n_features=1, n_classes=20 + ) + y = y[:, 0] + X_sparse = sparse_container(X) + + dense = EstimatorClass( + n_estimators=10, random_state=0, max_depth=2, min_impurity_decrease=1e-7 + ).fit(X, y) + sparse = EstimatorClass( + n_estimators=10, random_state=0, max_depth=2, min_impurity_decrease=1e-7 + ).fit(X_sparse, y) + + assert_array_almost_equal(sparse.apply(X), dense.apply(X)) + assert_array_almost_equal(sparse.predict(X), dense.predict(X)) + 
assert_array_almost_equal(sparse.feature_importances_, dense.feature_importances_) + + assert_array_almost_equal(sparse.predict(X_sparse), dense.predict(X)) + assert_array_almost_equal(dense.predict(X_sparse), sparse.predict(X)) + + if issubclass(EstimatorClass, GradientBoostingClassifier): + assert_array_almost_equal(sparse.predict_proba(X), dense.predict_proba(X)) + assert_array_almost_equal( + sparse.predict_log_proba(X), dense.predict_log_proba(X) + ) + + assert_array_almost_equal( + sparse.decision_function(X_sparse), sparse.decision_function(X) + ) + assert_array_almost_equal( + dense.decision_function(X_sparse), sparse.decision_function(X) + ) + for res_sparse, res in zip( + sparse.staged_decision_function(X_sparse), + sparse.staged_decision_function(X), + ): + assert_array_almost_equal(res_sparse, res) + + +@pytest.mark.parametrize( + "GradientBoostingEstimator", [GradientBoostingClassifier, GradientBoostingRegressor] +) +def test_gradient_boosting_early_stopping(GradientBoostingEstimator): + # Check if early stopping works as expected, that is empirically check that the + # number of trained estimators is increasing when the tolerance decreases. + + X, y = make_classification(n_samples=1000, random_state=0) + n_estimators = 1000 + + gb_large_tol = GradientBoostingEstimator( + n_estimators=n_estimators, + n_iter_no_change=10, + learning_rate=0.1, + max_depth=3, + random_state=42, + tol=1e-1, + ) + + gb_small_tol = GradientBoostingEstimator( + n_estimators=n_estimators, + n_iter_no_change=10, + learning_rate=0.1, + max_depth=3, + random_state=42, + tol=1e-3, + ) + + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + gb_large_tol.fit(X_train, y_train) + gb_small_tol.fit(X_train, y_train) + + assert gb_large_tol.n_estimators_ < gb_small_tol.n_estimators_ < n_estimators + + assert gb_large_tol.score(X_test, y_test) > 0.7 + assert gb_small_tol.score(X_test, y_test) > 0.7 + + +def test_gradient_boosting_without_early_stopping(): + # When early stopping is not used, the number of trained estimators + # must be the one specified. + X, y = make_classification(n_samples=1000, random_state=0) + + gbc = GradientBoostingClassifier( + n_estimators=50, learning_rate=0.1, max_depth=3, random_state=42 + ) + gbc.fit(X, y) + gbr = GradientBoostingRegressor( + n_estimators=30, learning_rate=0.1, max_depth=3, random_state=42 + ) + gbr.fit(X, y) + + # The number of trained estimators must be the one specified. 
+ assert gbc.n_estimators_ == 50 + assert gbr.n_estimators_ == 30 + + +def test_gradient_boosting_validation_fraction(): + X, y = make_classification(n_samples=1000, random_state=0) + + gbc = GradientBoostingClassifier( + n_estimators=100, + n_iter_no_change=10, + validation_fraction=0.1, + learning_rate=0.1, + max_depth=3, + random_state=42, + ) + gbc2 = clone(gbc).set_params(validation_fraction=0.3) + gbc3 = clone(gbc).set_params(n_iter_no_change=20) + + gbr = GradientBoostingRegressor( + n_estimators=100, + n_iter_no_change=10, + learning_rate=0.1, + max_depth=3, + validation_fraction=0.1, + random_state=42, + ) + gbr2 = clone(gbr).set_params(validation_fraction=0.3) + gbr3 = clone(gbr).set_params(n_iter_no_change=20) + + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + # Check if validation_fraction has an effect + gbc.fit(X_train, y_train) + gbc2.fit(X_train, y_train) + assert gbc.n_estimators_ != gbc2.n_estimators_ + + gbr.fit(X_train, y_train) + gbr2.fit(X_train, y_train) + assert gbr.n_estimators_ != gbr2.n_estimators_ + + # Check if n_estimators_ increase monotonically with n_iter_no_change + # Set validation + gbc3.fit(X_train, y_train) + gbr3.fit(X_train, y_train) + assert gbr.n_estimators_ < gbr3.n_estimators_ + assert gbc.n_estimators_ < gbc3.n_estimators_ + + +def test_early_stopping_stratified(): + # Make sure data splitting for early stopping is stratified + X = [[1, 2], [2, 3], [3, 4], [4, 5]] + y = [0, 0, 0, 1] + + gbc = GradientBoostingClassifier(n_iter_no_change=5) + with pytest.raises( + ValueError, match="The least populated class in y has only 1 member" + ): + gbc.fit(X, y) + + +def _make_multiclass(): + return make_classification(n_classes=3, n_clusters_per_class=1) + + +@pytest.mark.parametrize( + "gb, dataset_maker, init_estimator", + [ + (GradientBoostingClassifier, make_classification, DummyClassifier), + (GradientBoostingClassifier, _make_multiclass, DummyClassifier), + (GradientBoostingRegressor, make_regression, DummyRegressor), + ], + ids=["binary classification", "multiclass classification", "regression"], +) +def test_gradient_boosting_with_init( + gb, dataset_maker, init_estimator, global_random_seed +): + # Check that GradientBoostingRegressor works when init is a sklearn + # estimator. + # Check that an error is raised if trying to fit with sample weight but + # initial estimator does not support sample weight + + X, y = dataset_maker() + sample_weight = np.random.RandomState(global_random_seed).rand(100) + + # init supports sample weights + init_est = init_estimator() + gb(init=init_est).fit(X, y, sample_weight=sample_weight) + + # init does not support sample weights + init_est = NoSampleWeightWrapper(init_estimator()) + gb(init=init_est).fit(X, y) # ok no sample weights + with pytest.raises(ValueError, match="estimator.*does not support sample weights"): + gb(init=init_est).fit(X, y, sample_weight=sample_weight) + + +def test_gradient_boosting_with_init_pipeline(): + # Check that the init estimator can be a pipeline (see issue #13466) + + X, y = make_regression(random_state=0) + init = make_pipeline(LinearRegression()) + gb = GradientBoostingRegressor(init=init) + gb.fit(X, y) # pipeline without sample_weight works fine + + with pytest.raises( + ValueError, + match="The initial estimator Pipeline does not support sample weights", + ): + gb.fit(X, y, sample_weight=np.ones(X.shape[0])) + + # Passing sample_weight to a pipeline raises a ValueError. 
This test makes + # sure we make the distinction between ValueError raised by a pipeline that + # was passed sample_weight, and a InvalidParameterError raised by a regular + # estimator whose input checking failed. + invalid_nu = 1.5 + err_msg = ( + "The 'nu' parameter of NuSVR must be a float in the" + f" range (0.0, 1.0]. Got {invalid_nu} instead." + ) + with pytest.raises(InvalidParameterError, match=re.escape(err_msg)): + # Note that NuSVR properly supports sample_weight + init = NuSVR(gamma="auto", nu=invalid_nu) + gb = GradientBoostingRegressor(init=init) + gb.fit(X, y, sample_weight=np.ones(X.shape[0])) + + +def test_early_stopping_n_classes(): + # when doing early stopping (_, , y_train, _ = train_test_split(X, y)) + # there might be classes in y that are missing in y_train. As the init + # estimator will be trained on y_train, we need to raise an error if this + # happens. + + X = [[1]] * 10 + y = [0, 0] + [1] * 8 # only 2 negative class over 10 samples + gb = GradientBoostingClassifier( + n_iter_no_change=5, random_state=0, validation_fraction=0.8 + ) + with pytest.raises( + ValueError, match="The training data after the early stopping split" + ): + gb.fit(X, y) + + # No error if we let training data be big enough + gb = GradientBoostingClassifier( + n_iter_no_change=5, random_state=0, validation_fraction=0.4 + ) + + +def test_gbr_degenerate_feature_importances(): + # growing an ensemble of single node trees. See #13620 + X = np.zeros((10, 10)) + y = np.ones((10,)) + gbr = GradientBoostingRegressor().fit(X, y) + assert_array_equal(gbr.feature_importances_, np.zeros(10, dtype=np.float64)) + + +def test_huber_vs_mean_and_median(): + """Check that huber lies between absolute and squared error.""" + n_rep = 100 + n_samples = 10 + y = np.tile(np.arange(n_samples), n_rep) + x1 = np.minimum(y, n_samples / 2) + x2 = np.minimum(-y, -n_samples / 2) + X = np.c_[x1, x2] + + rng = np.random.RandomState(42) + # We want an asymmetric distribution. + y = y + rng.exponential(scale=1, size=y.shape) + + gbt_absolute_error = GradientBoostingRegressor(loss="absolute_error").fit(X, y) + gbt_huber = GradientBoostingRegressor(loss="huber").fit(X, y) + gbt_squared_error = GradientBoostingRegressor().fit(X, y) + + gbt_huber_predictions = gbt_huber.predict(X) + assert np.all(gbt_absolute_error.predict(X) <= gbt_huber_predictions) + assert np.all(gbt_huber_predictions <= gbt_squared_error.predict(X)) + + +def test_safe_divide(): + """Test that _safe_divide handles division by zero.""" + with warnings.catch_warnings(): + warnings.simplefilter("error") + assert _safe_divide(np.float64(1e300), 0) == 0 + assert _safe_divide(np.float64(0.0), np.float64(0.0)) == 0 + with pytest.warns(RuntimeWarning, match="overflow"): + # np.finfo(float).max = 1.7976931348623157e+308 + _safe_divide(np.float64(1e300), 1e-10) + + +def test_squared_error_exact_backward_compat(): + """Test squared error GBT backward compat on a simple dataset. + + The results to compare against are taken from scikit-learn v1.2.0. 
+ """ + n_samples = 10 + y = np.arange(n_samples) + x1 = np.minimum(y, n_samples / 2) + x2 = np.minimum(-y, -n_samples / 2) + X = np.c_[x1, x2] + gbt = GradientBoostingRegressor(loss="squared_error", n_estimators=100).fit(X, y) + + pred_result = np.array( + [ + 1.39245726e-04, + 1.00010468e00, + 2.00007043e00, + 3.00004051e00, + 4.00000802e00, + 4.99998972e00, + 5.99996312e00, + 6.99993395e00, + 7.99989372e00, + 8.99985660e00, + ] + ) + assert_allclose(gbt.predict(X), pred_result, rtol=1e-8) + + train_score = np.array( + [ + 4.87246390e-08, + 3.95590036e-08, + 3.21267865e-08, + 2.60970300e-08, + 2.11820178e-08, + 1.71995782e-08, + 1.39695549e-08, + 1.13391770e-08, + 9.19931587e-09, + 7.47000575e-09, + ] + ) + assert_allclose(gbt.train_score_[-10:], train_score, rtol=1e-8) + + # Same but with sample_weights + sample_weights = np.tile([1, 10], n_samples // 2) + gbt = GradientBoostingRegressor(loss="squared_error", n_estimators=100).fit( + X, y, sample_weight=sample_weights + ) + + pred_result = np.array( + [ + 1.52391462e-04, + 1.00011168e00, + 2.00007724e00, + 3.00004638e00, + 4.00001302e00, + 4.99999873e00, + 5.99997093e00, + 6.99994329e00, + 7.99991290e00, + 8.99988727e00, + ] + ) + assert_allclose(gbt.predict(X), pred_result, rtol=1e-6, atol=1e-5) + + train_score = np.array( + [ + 4.12445296e-08, + 3.34418322e-08, + 2.71151383e-08, + 2.19782469e-08, + 1.78173649e-08, + 1.44461976e-08, + 1.17120123e-08, + 9.49485678e-09, + 7.69772505e-09, + 6.24155316e-09, + ] + ) + assert_allclose(gbt.train_score_[-10:], train_score, rtol=1e-3, atol=1e-11) + + +@skip_if_32bit +def test_huber_exact_backward_compat(): + """Test huber GBT backward compat on a simple dataset. + + The results to compare against are taken from scikit-learn v1.2.0. + """ + n_samples = 10 + y = np.arange(n_samples) + x1 = np.minimum(y, n_samples / 2) + x2 = np.minimum(-y, -n_samples / 2) + X = np.c_[x1, x2] + gbt = GradientBoostingRegressor(loss="huber", n_estimators=100, alpha=0.8).fit(X, y) + + assert_allclose(gbt._loss.closs.delta, 0.0001655688041282133) + + pred_result = np.array( + [ + 1.48120765e-04, + 9.99949174e-01, + 2.00116957e00, + 2.99986716e00, + 4.00012064e00, + 5.00002462e00, + 5.99998898e00, + 6.99692549e00, + 8.00006356e00, + 8.99985099e00, + ] + ) + assert_allclose(gbt.predict(X), pred_result, rtol=1e-8) + + train_score = np.array( + [ + 2.59484709e-07, + 2.19165900e-07, + 1.89644782e-07, + 1.64556454e-07, + 1.38705110e-07, + 1.20373736e-07, + 1.04746082e-07, + 9.13835687e-08, + 8.20245756e-08, + 7.17122188e-08, + ] + ) + assert_allclose(gbt.train_score_[-10:], train_score, rtol=1e-8) + + +def test_binomial_error_exact_backward_compat(): + """Test binary log_loss GBT backward compat on a simple dataset. + + The results to compare against are taken from scikit-learn v1.2.0. 
+ """ + n_samples = 10 + y = np.arange(n_samples) % 2 + x1 = np.minimum(y, n_samples / 2) + x2 = np.minimum(-y, -n_samples / 2) + X = np.c_[x1, x2] + gbt = GradientBoostingClassifier(loss="log_loss", n_estimators=100).fit(X, y) + + pred_result = np.array( + [ + [9.99978098e-01, 2.19017313e-05], + [2.19017313e-05, 9.99978098e-01], + [9.99978098e-01, 2.19017313e-05], + [2.19017313e-05, 9.99978098e-01], + [9.99978098e-01, 2.19017313e-05], + [2.19017313e-05, 9.99978098e-01], + [9.99978098e-01, 2.19017313e-05], + [2.19017313e-05, 9.99978098e-01], + [9.99978098e-01, 2.19017313e-05], + [2.19017313e-05, 9.99978098e-01], + ] + ) + assert_allclose(gbt.predict_proba(X), pred_result, rtol=1e-8) + + train_score = np.array( + [ + 1.07742210e-04, + 9.74889078e-05, + 8.82113863e-05, + 7.98167784e-05, + 7.22210566e-05, + 6.53481907e-05, + 5.91293869e-05, + 5.35023988e-05, + 4.84109045e-05, + 4.38039423e-05, + ] + ) + assert_allclose(gbt.train_score_[-10:], train_score, rtol=1e-8) + + +def test_multinomial_error_exact_backward_compat(): + """Test multiclass log_loss GBT backward compat on a simple dataset. + + The results to compare against are taken from scikit-learn v1.2.0. + """ + n_samples = 10 + y = np.arange(n_samples) % 4 + x1 = np.minimum(y, n_samples / 2) + x2 = np.minimum(-y, -n_samples / 2) + X = np.c_[x1, x2] + gbt = GradientBoostingClassifier(loss="log_loss", n_estimators=100).fit(X, y) + + pred_result = np.array( + [ + [9.99999727e-01, 1.11956255e-07, 8.04921671e-08, 8.04921668e-08], + [1.11956254e-07, 9.99999727e-01, 8.04921671e-08, 8.04921668e-08], + [1.19417637e-07, 1.19417637e-07, 9.99999675e-01, 8.60526098e-08], + [1.19417637e-07, 1.19417637e-07, 8.60526088e-08, 9.99999675e-01], + [9.99999727e-01, 1.11956255e-07, 8.04921671e-08, 8.04921668e-08], + [1.11956254e-07, 9.99999727e-01, 8.04921671e-08, 8.04921668e-08], + [1.19417637e-07, 1.19417637e-07, 9.99999675e-01, 8.60526098e-08], + [1.19417637e-07, 1.19417637e-07, 8.60526088e-08, 9.99999675e-01], + [9.99999727e-01, 1.11956255e-07, 8.04921671e-08, 8.04921668e-08], + [1.11956254e-07, 9.99999727e-01, 8.04921671e-08, 8.04921668e-08], + ] + ) + assert_allclose(gbt.predict_proba(X), pred_result, rtol=1e-8) + + train_score = np.array( + [ + 1.13300150e-06, + 9.75183397e-07, + 8.39348103e-07, + 7.22433588e-07, + 6.21804338e-07, + 5.35191943e-07, + 4.60643966e-07, + 3.96479930e-07, + 3.41253434e-07, + 2.93719550e-07, + ] + ) + assert_allclose(gbt.train_score_[-10:], train_score, rtol=1e-8) + + +def test_gb_denominator_zero(global_random_seed): + """Test _update_terminal_regions denominator is not zero. + + For instance for log loss based binary classification, the line search step might + become nan/inf as denominator = hessian = prob * (1 - prob) and prob = 0 or 1 can + happen. + Here, we create a situation were this happens (at least with roughly 80%) based + on the random seed. 
+ """ + X, y = datasets.make_hastie_10_2(n_samples=100, random_state=20) + + params = { + "learning_rate": 1.0, + "subsample": 0.5, + "n_estimators": 100, + "max_leaf_nodes": 4, + "max_depth": None, + "random_state": global_random_seed, + "min_samples_leaf": 2, + } + + clf = GradientBoostingClassifier(**params) + # _safe_devide would raise a RuntimeWarning + with warnings.catch_warnings(): + warnings.simplefilter("error") + clf.fit(X, y) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_iforest.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_iforest.py new file mode 100644 index 0000000000000000000000000000000000000000..19e34bbf51808931fd29b650a527ac0bc668dd9c --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_iforest.py @@ -0,0 +1,393 @@ +""" +Testing for Isolation Forest algorithm (sklearn.ensemble.iforest). +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from unittest.mock import Mock, patch + +import numpy as np +import pytest +from joblib import parallel_backend + +from sklearn.datasets import load_diabetes, load_iris, make_classification +from sklearn.ensemble import IsolationForest +from sklearn.ensemble._iforest import _average_path_length +from sklearn.metrics import roc_auc_score +from sklearn.model_selection import ParameterGrid, train_test_split +from sklearn.utils import check_random_state +from sklearn.utils._testing import ( + assert_allclose, + assert_array_almost_equal, + assert_array_equal, + ignore_warnings, +) +from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS + +# load iris & diabetes dataset +iris = load_iris() +diabetes = load_diabetes() + + +def test_iforest(global_random_seed): + """Check Isolation Forest for various parameter settings.""" + X_train = np.array([[0, 1], [1, 2]]) + X_test = np.array([[2, 1], [1, 1]]) + + grid = ParameterGrid( + {"n_estimators": [3], "max_samples": [0.5, 1.0, 3], "bootstrap": [True, False]} + ) + + with ignore_warnings(): + for params in grid: + IsolationForest(random_state=global_random_seed, **params).fit( + X_train + ).predict(X_test) + + +@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS) +def test_iforest_sparse(global_random_seed, sparse_container): + """Check IForest for various parameter settings on sparse input.""" + rng = check_random_state(global_random_seed) + X_train, X_test = train_test_split(diabetes.data[:50], random_state=rng) + grid = ParameterGrid({"max_samples": [0.5, 1.0], "bootstrap": [True, False]}) + + X_train_sparse = sparse_container(X_train) + X_test_sparse = sparse_container(X_test) + + for params in grid: + # Trained on sparse format + sparse_classifier = IsolationForest( + n_estimators=10, random_state=global_random_seed, **params + ).fit(X_train_sparse) + sparse_results = sparse_classifier.predict(X_test_sparse) + + # Trained on dense format + dense_classifier = IsolationForest( + n_estimators=10, random_state=global_random_seed, **params + ).fit(X_train) + dense_results = dense_classifier.predict(X_test) + + assert_array_equal(sparse_results, dense_results) + + +def test_iforest_error(): + """Test that it gives proper exception on deficient input.""" + X = iris.data + + # The dataset has less than 256 samples, explicitly setting + # max_samples > n_samples should result in a warning. 
If not set + # explicitly there should be no warning + warn_msg = "max_samples will be set to n_samples for estimation" + with pytest.warns(UserWarning, match=warn_msg): + IsolationForest(max_samples=1000).fit(X) + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + IsolationForest(max_samples="auto").fit(X) + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + IsolationForest(max_samples=np.int64(2)).fit(X) + + # test X_test n_features match X_train one: + with pytest.raises(ValueError): + IsolationForest().fit(X).predict(X[:, 1:]) + + +def test_recalculate_max_depth(): + """Check max_depth recalculation when max_samples is reset to n_samples""" + X = iris.data + clf = IsolationForest().fit(X) + for est in clf.estimators_: + assert est.max_depth == int(np.ceil(np.log2(X.shape[0]))) + + +def test_max_samples_attribute(): + X = iris.data + clf = IsolationForest().fit(X) + assert clf.max_samples_ == X.shape[0] + + clf = IsolationForest(max_samples=500) + warn_msg = "max_samples will be set to n_samples for estimation" + with pytest.warns(UserWarning, match=warn_msg): + clf.fit(X) + assert clf.max_samples_ == X.shape[0] + + clf = IsolationForest(max_samples=0.4).fit(X) + assert clf.max_samples_ == 0.4 * X.shape[0] + + +def test_iforest_parallel_regression(global_random_seed): + """Check parallel regression.""" + rng = check_random_state(global_random_seed) + + X_train, X_test = train_test_split(diabetes.data, random_state=rng) + + ensemble = IsolationForest(n_jobs=3, random_state=global_random_seed).fit(X_train) + + ensemble.set_params(n_jobs=1) + y1 = ensemble.predict(X_test) + ensemble.set_params(n_jobs=2) + y2 = ensemble.predict(X_test) + assert_array_almost_equal(y1, y2) + + ensemble = IsolationForest(n_jobs=1, random_state=global_random_seed).fit(X_train) + + y3 = ensemble.predict(X_test) + assert_array_almost_equal(y1, y3) + + +def test_iforest_performance(global_random_seed): + """Test Isolation Forest performs well""" + + # Generate train/test data + rng = check_random_state(global_random_seed) + X = 0.3 * rng.randn(600, 2) + X = rng.permutation(np.vstack((X + 2, X - 2))) + X_train = X[:1000] + + # Generate some abnormal novel observations + X_outliers = rng.uniform(low=-1, high=1, size=(200, 2)) + X_test = np.vstack((X[1000:], X_outliers)) + y_test = np.array([0] * 200 + [1] * 200) + + # fit the model + clf = IsolationForest(max_samples=100, random_state=rng).fit(X_train) + + # predict scores (the lower, the more normal) + y_pred = -clf.decision_function(X_test) + + # check that there is at most 6 errors (false positive or false negative) + assert roc_auc_score(y_test, y_pred) > 0.98 + + +@pytest.mark.parametrize("contamination", [0.25, "auto"]) +def test_iforest_works(contamination, global_random_seed): + # toy sample (the last two samples are outliers) + X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [7, 4], [-5, 9]] + + # Test IsolationForest + clf = IsolationForest(random_state=global_random_seed, contamination=contamination) + clf.fit(X) + decision_func = -clf.decision_function(X) + pred = clf.predict(X) + # assert detect outliers: + assert np.min(decision_func[-2:]) > np.max(decision_func[:-2]) + assert_array_equal(pred, 6 * [1] + 2 * [-1]) + + +def test_max_samples_consistency(): + # Make sure validated max_samples in iforest and BaseBagging are identical + X = iris.data + clf = IsolationForest().fit(X) + assert clf.max_samples_ == clf._max_samples + + +def test_iforest_subsampled_features(): + # It tests 
non-regression for #5732 which failed at predict. + rng = check_random_state(0) + X_train, X_test, y_train, y_test = train_test_split( + diabetes.data[:50], diabetes.target[:50], random_state=rng + ) + clf = IsolationForest(max_features=0.8) + clf.fit(X_train, y_train) + clf.predict(X_test) + + +def test_iforest_average_path_length(): + # It tests non-regression for #8549 which used the wrong formula + # for average path length, strictly for the integer case + # Updated to check average path length when input is <= 2 (issue #11839) + result_one = 2.0 * (np.log(4.0) + np.euler_gamma) - 2.0 * 4.0 / 5.0 + result_two = 2.0 * (np.log(998.0) + np.euler_gamma) - 2.0 * 998.0 / 999.0 + assert_allclose(_average_path_length([0]), [0.0]) + assert_allclose(_average_path_length([1]), [0.0]) + assert_allclose(_average_path_length([2]), [1.0]) + assert_allclose(_average_path_length([5]), [result_one]) + assert_allclose(_average_path_length([999]), [result_two]) + assert_allclose( + _average_path_length(np.array([1, 2, 5, 999])), + [0.0, 1.0, result_one, result_two], + ) + # _average_path_length is increasing + avg_path_length = _average_path_length(np.arange(5)) + assert_array_equal(avg_path_length, np.sort(avg_path_length)) + + +def test_score_samples(): + X_train = [[1, 1], [1, 2], [2, 1]] + clf1 = IsolationForest(contamination=0.1).fit(X_train) + clf2 = IsolationForest().fit(X_train) + assert_array_equal( + clf1.score_samples([[2.0, 2.0]]), + clf1.decision_function([[2.0, 2.0]]) + clf1.offset_, + ) + assert_array_equal( + clf2.score_samples([[2.0, 2.0]]), + clf2.decision_function([[2.0, 2.0]]) + clf2.offset_, + ) + assert_array_equal( + clf1.score_samples([[2.0, 2.0]]), clf2.score_samples([[2.0, 2.0]]) + ) + + +def test_iforest_warm_start(): + """Test iterative addition of iTrees to an iForest""" + + rng = check_random_state(0) + X = rng.randn(20, 2) + + # fit first 10 trees + clf = IsolationForest( + n_estimators=10, max_samples=20, random_state=rng, warm_start=True + ) + clf.fit(X) + # remember the 1st tree + tree_1 = clf.estimators_[0] + # fit another 10 trees + clf.set_params(n_estimators=20) + clf.fit(X) + # expecting 20 fitted trees and no overwritten trees + assert len(clf.estimators_) == 20 + assert clf.estimators_[0] is tree_1 + + +# mock get_chunk_n_rows to actually test more than one chunk (here one +# chunk has 3 rows): +@patch( + "sklearn.ensemble._iforest.get_chunk_n_rows", + side_effect=Mock(**{"return_value": 3}), +) +@pytest.mark.parametrize("contamination, n_predict_calls", [(0.25, 3), ("auto", 2)]) +def test_iforest_chunks_works1( + mocked_get_chunk, contamination, n_predict_calls, global_random_seed +): + test_iforest_works(contamination, global_random_seed) + assert mocked_get_chunk.call_count == n_predict_calls + + +# idem with chunk_size = 10 rows +@patch( + "sklearn.ensemble._iforest.get_chunk_n_rows", + side_effect=Mock(**{"return_value": 10}), +) +@pytest.mark.parametrize("contamination, n_predict_calls", [(0.25, 3), ("auto", 2)]) +def test_iforest_chunks_works2( + mocked_get_chunk, contamination, n_predict_calls, global_random_seed +): + test_iforest_works(contamination, global_random_seed) + assert mocked_get_chunk.call_count == n_predict_calls + + +def test_iforest_with_uniform_data(): + """Test whether iforest predicts inliers when using uniform data""" + + # 2-d array of all 1s + X = np.ones((100, 10)) + iforest = IsolationForest() + iforest.fit(X) + + rng = np.random.RandomState(0) + + assert all(iforest.predict(X) == 1) + assert all(iforest.predict(rng.randn(100, 10)) 
== 1) + assert all(iforest.predict(X + 1) == 1) + assert all(iforest.predict(X - 1) == 1) + + # 2-d array where columns contain the same value across rows + X = np.repeat(rng.randn(1, 10), 100, 0) + iforest = IsolationForest() + iforest.fit(X) + + assert all(iforest.predict(X) == 1) + assert all(iforest.predict(rng.randn(100, 10)) == 1) + assert all(iforest.predict(np.ones((100, 10))) == 1) + + # Single row + X = rng.randn(1, 10) + iforest = IsolationForest() + iforest.fit(X) + + assert all(iforest.predict(X) == 1) + assert all(iforest.predict(rng.randn(100, 10)) == 1) + assert all(iforest.predict(np.ones((100, 10))) == 1) + + +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_iforest_with_n_jobs_does_not_segfault(csc_container): + """Check that Isolation Forest does not segfault with n_jobs=2 + + Non-regression test for #23252 + """ + X, _ = make_classification(n_samples=85_000, n_features=100, random_state=0) + X = csc_container(X) + IsolationForest(n_estimators=10, max_samples=256, n_jobs=2).fit(X) + + +def test_iforest_preserve_feature_names(): + """Check that feature names are preserved when contamination is not "auto". + + Feature names are required for consistency checks during scoring. + + Non-regression test for Issue #25844 + """ + pd = pytest.importorskip("pandas") + rng = np.random.RandomState(0) + + X = pd.DataFrame(data=rng.randn(4), columns=["a"]) + model = IsolationForest(random_state=0, contamination=0.05) + + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + model.fit(X) + + +@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS) +def test_iforest_sparse_input_float_contamination(sparse_container): + """Check that `IsolationForest` accepts sparse matrix input and float value for + contamination. 
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/27626 + """ + X, _ = make_classification(n_samples=50, n_features=4, random_state=0) + X = sparse_container(X) + X.sort_indices() + contamination = 0.1 + iforest = IsolationForest( + n_estimators=5, contamination=contamination, random_state=0 + ).fit(X) + + X_decision = iforest.decision_function(X) + assert (X_decision < 0).sum() / X.shape[0] == pytest.approx(contamination) + + +@pytest.mark.parametrize("n_jobs", [1, 2]) +@pytest.mark.parametrize("contamination", [0.25, "auto"]) +def test_iforest_predict_parallel(global_random_seed, contamination, n_jobs): + """Check that `IsolationForest.predict` is parallelized.""" + # toy sample (the last two samples are outliers) + X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [7, 4], [-5, 9]] + + # Test IsolationForest + clf = IsolationForest( + random_state=global_random_seed, contamination=contamination, n_jobs=None + ) + clf.fit(X) + decision_func = -clf.decision_function(X) + pred = clf.predict(X) + + # assert detect outliers: + assert np.min(decision_func[-2:]) > np.max(decision_func[:-2]) + assert_array_equal(pred, 6 * [1] + 2 * [-1]) + + clf_parallel = IsolationForest( + random_state=global_random_seed, contamination=contamination, n_jobs=-1 + ) + clf_parallel.fit(X) + with parallel_backend("threading", n_jobs=n_jobs): + pred_paralell = clf_parallel.predict(X) + + # assert the same results as non-parallel + assert_array_equal(pred, pred_paralell) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_stacking.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_stacking.py new file mode 100644 index 0000000000000000000000000000000000000000..e944ecc4abb528c9bffb1cf23674831fcd0fb7ca --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_stacking.py @@ -0,0 +1,1019 @@ +"""Test the stacking classifier and regressor.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import re +from unittest.mock import Mock + +import numpy as np +import pytest +from numpy.testing import assert_array_equal +from scipy import sparse + +from sklearn import config_context +from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, clone +from sklearn.datasets import ( + load_breast_cancer, + load_diabetes, + load_iris, + make_classification, + make_multilabel_classification, + make_regression, +) +from sklearn.dummy import DummyClassifier, DummyRegressor +from sklearn.ensemble import ( + RandomForestClassifier, + RandomForestRegressor, + StackingClassifier, + StackingRegressor, +) +from sklearn.exceptions import ConvergenceWarning, NotFittedError +from sklearn.linear_model import ( + LinearRegression, + LogisticRegression, + Ridge, + RidgeClassifier, +) +from sklearn.model_selection import KFold, StratifiedKFold, train_test_split +from sklearn.neighbors import KNeighborsClassifier +from sklearn.neural_network import MLPClassifier +from sklearn.preprocessing import scale +from sklearn.svm import SVC, LinearSVC, LinearSVR +from sklearn.tests.metadata_routing_common import ( + ConsumingClassifier, + ConsumingRegressor, + _Registry, + check_recorded_metadata, +) +from sklearn.utils._mocking import CheckingClassifier +from sklearn.utils._testing import ( + assert_allclose, + assert_allclose_dense_sparse, + ignore_warnings, +) +from sklearn.utils.fixes import COO_CONTAINERS, CSC_CONTAINERS, CSR_CONTAINERS + +diabetes = load_diabetes() +X_diabetes, y_diabetes = 
diabetes.data, diabetes.target +iris = load_iris() +X_iris, y_iris = iris.data, iris.target +X_multilabel, y_multilabel = make_multilabel_classification( + n_classes=3, random_state=42 +) +X_binary, y_binary = make_classification(n_classes=2, random_state=42) + + +@pytest.mark.parametrize( + "cv", [3, StratifiedKFold(n_splits=3, shuffle=True, random_state=42)] +) +@pytest.mark.parametrize( + "final_estimator", [None, RandomForestClassifier(random_state=42)] +) +@pytest.mark.parametrize("passthrough", [False, True]) +def test_stacking_classifier_iris(cv, final_estimator, passthrough): + # prescale the data to avoid convergence warning without using a pipeline + # for later assert + X_train, X_test, y_train, y_test = train_test_split( + scale(X_iris), y_iris, stratify=y_iris, random_state=42 + ) + estimators = [("lr", LogisticRegression()), ("svc", LinearSVC())] + clf = StackingClassifier( + estimators=estimators, + final_estimator=final_estimator, + cv=cv, + passthrough=passthrough, + ) + clf.fit(X_train, y_train) + clf.predict(X_test) + clf.predict_proba(X_test) + assert clf.score(X_test, y_test) > 0.8 + + X_trans = clf.transform(X_test) + expected_column_count = 10 if passthrough else 6 + assert X_trans.shape[1] == expected_column_count + if passthrough: + assert_allclose(X_test, X_trans[:, -4:]) + + clf.set_params(lr="drop") + clf.fit(X_train, y_train) + clf.predict(X_test) + clf.predict_proba(X_test) + if final_estimator is None: + # LogisticRegression has decision_function method + clf.decision_function(X_test) + + X_trans = clf.transform(X_test) + expected_column_count_drop = 7 if passthrough else 3 + assert X_trans.shape[1] == expected_column_count_drop + if passthrough: + assert_allclose(X_test, X_trans[:, -4:]) + + +def test_stacking_classifier_drop_column_binary_classification(): + # check that a column is dropped in binary classification + X, y = load_breast_cancer(return_X_y=True) + X_train, X_test, y_train, _ = train_test_split( + scale(X), y, stratify=y, random_state=42 + ) + + # both classifiers implement 'predict_proba' and will both drop one column + estimators = [ + ("lr", LogisticRegression()), + ("rf", RandomForestClassifier(random_state=42)), + ] + clf = StackingClassifier(estimators=estimators, cv=3) + + clf.fit(X_train, y_train) + X_trans = clf.transform(X_test) + assert X_trans.shape[1] == 2 + + # LinearSVC does not implement 'predict_proba' and will not drop one column + estimators = [("lr", LogisticRegression()), ("svc", LinearSVC())] + clf.set_params(estimators=estimators) + + clf.fit(X_train, y_train) + X_trans = clf.transform(X_test) + assert X_trans.shape[1] == 2 + + +def test_stacking_classifier_drop_estimator(): + # prescale the data to avoid convergence warning without using a pipeline + # for later assert + X_train, X_test, y_train, _ = train_test_split( + scale(X_iris), y_iris, stratify=y_iris, random_state=42 + ) + estimators = [("lr", "drop"), ("svc", LinearSVC(random_state=0))] + rf = RandomForestClassifier(n_estimators=10, random_state=42) + clf = StackingClassifier( + estimators=[("svc", LinearSVC(random_state=0))], + final_estimator=rf, + cv=5, + ) + clf_drop = StackingClassifier(estimators=estimators, final_estimator=rf, cv=5) + + clf.fit(X_train, y_train) + clf_drop.fit(X_train, y_train) + assert_allclose(clf.predict(X_test), clf_drop.predict(X_test)) + assert_allclose(clf.predict_proba(X_test), clf_drop.predict_proba(X_test)) + assert_allclose(clf.transform(X_test), clf_drop.transform(X_test)) + + +def test_stacking_regressor_drop_estimator(): + # 
prescale the data to avoid convergence warning without using a pipeline + # for later assert + X_train, X_test, y_train, _ = train_test_split( + scale(X_diabetes), y_diabetes, random_state=42 + ) + estimators = [("lr", "drop"), ("svr", LinearSVR(random_state=0))] + rf = RandomForestRegressor(n_estimators=10, random_state=42) + reg = StackingRegressor( + estimators=[("svr", LinearSVR(random_state=0))], + final_estimator=rf, + cv=5, + ) + reg_drop = StackingRegressor(estimators=estimators, final_estimator=rf, cv=5) + + reg.fit(X_train, y_train) + reg_drop.fit(X_train, y_train) + assert_allclose(reg.predict(X_test), reg_drop.predict(X_test)) + assert_allclose(reg.transform(X_test), reg_drop.transform(X_test)) + + +@pytest.mark.parametrize("cv", [3, KFold(n_splits=3, shuffle=True, random_state=42)]) +@pytest.mark.parametrize( + "final_estimator, predict_params", + [ + (None, {}), + (RandomForestRegressor(random_state=42), {}), + (DummyRegressor(), {"return_std": True}), + ], +) +@pytest.mark.parametrize("passthrough", [False, True]) +def test_stacking_regressor_diabetes(cv, final_estimator, predict_params, passthrough): + # prescale the data to avoid convergence warning without using a pipeline + # for later assert + X_train, X_test, y_train, _ = train_test_split( + scale(X_diabetes), y_diabetes, random_state=42 + ) + estimators = [("lr", LinearRegression()), ("svr", LinearSVR())] + reg = StackingRegressor( + estimators=estimators, + final_estimator=final_estimator, + cv=cv, + passthrough=passthrough, + ) + reg.fit(X_train, y_train) + result = reg.predict(X_test, **predict_params) + expected_result_length = 2 if predict_params else 1 + if predict_params: + assert len(result) == expected_result_length + + X_trans = reg.transform(X_test) + expected_column_count = 12 if passthrough else 2 + assert X_trans.shape[1] == expected_column_count + if passthrough: + assert_allclose(X_test, X_trans[:, -10:]) + + reg.set_params(lr="drop") + reg.fit(X_train, y_train) + reg.predict(X_test) + + X_trans = reg.transform(X_test) + expected_column_count_drop = 11 if passthrough else 1 + assert X_trans.shape[1] == expected_column_count_drop + if passthrough: + assert_allclose(X_test, X_trans[:, -10:]) + + +@pytest.mark.parametrize( + "sparse_container", COO_CONTAINERS + CSC_CONTAINERS + CSR_CONTAINERS +) +def test_stacking_regressor_sparse_passthrough(sparse_container): + # Check passthrough behavior on a sparse X matrix + X_train, X_test, y_train, _ = train_test_split( + sparse_container(scale(X_diabetes)), y_diabetes, random_state=42 + ) + estimators = [("lr", LinearRegression()), ("svr", LinearSVR())] + rf = RandomForestRegressor(n_estimators=10, random_state=42) + clf = StackingRegressor( + estimators=estimators, final_estimator=rf, cv=5, passthrough=True + ) + clf.fit(X_train, y_train) + X_trans = clf.transform(X_test) + assert_allclose_dense_sparse(X_test, X_trans[:, -10:]) + assert sparse.issparse(X_trans) + assert X_test.format == X_trans.format + + +@pytest.mark.parametrize( + "sparse_container", COO_CONTAINERS + CSC_CONTAINERS + CSR_CONTAINERS +) +def test_stacking_classifier_sparse_passthrough(sparse_container): + # Check passthrough behavior on a sparse X matrix + X_train, X_test, y_train, _ = train_test_split( + sparse_container(scale(X_iris)), y_iris, random_state=42 + ) + estimators = [("lr", LogisticRegression()), ("svc", LinearSVC())] + rf = RandomForestClassifier(n_estimators=10, random_state=42) + clf = StackingClassifier( + estimators=estimators, final_estimator=rf, cv=5, passthrough=True + ) + 
clf.fit(X_train, y_train) + X_trans = clf.transform(X_test) + assert_allclose_dense_sparse(X_test, X_trans[:, -4:]) + assert sparse.issparse(X_trans) + assert X_test.format == X_trans.format + + +def test_stacking_classifier_drop_binary_prob(): + # check that classifier will drop one of the probability column for + # binary classification problem + + # Select only the 2 first classes + X_, y_ = scale(X_iris[:100]), y_iris[:100] + + estimators = [("lr", LogisticRegression()), ("rf", RandomForestClassifier())] + clf = StackingClassifier(estimators=estimators) + clf.fit(X_, y_) + X_meta = clf.transform(X_) + assert X_meta.shape[1] == 2 + + +class NoWeightRegressor(RegressorMixin, BaseEstimator): + def fit(self, X, y): + self.reg = DummyRegressor() + return self.reg.fit(X, y) + + def predict(self, X): + return np.ones(X.shape[0]) + + +class NoWeightClassifier(ClassifierMixin, BaseEstimator): + def fit(self, X, y): + self.clf = DummyClassifier(strategy="stratified") + return self.clf.fit(X, y) + + +@pytest.mark.parametrize( + "y, params, type_err, msg_err", + [ + (y_iris, {"estimators": []}, ValueError, "Invalid 'estimators' attribute,"), + ( + y_iris, + { + "estimators": [ + ("lr", LogisticRegression()), + ("svm", SVC(max_iter=50_000)), + ], + "stack_method": "predict_proba", + }, + ValueError, + "does not implement the method predict_proba", + ), + ( + y_iris, + { + "estimators": [ + ("lr", LogisticRegression()), + ("cor", NoWeightClassifier()), + ] + }, + TypeError, + "does not support sample weight", + ), + ( + y_iris, + { + "estimators": [ + ("lr", LogisticRegression()), + ("cor", LinearSVC(max_iter=50_000)), + ], + "final_estimator": NoWeightClassifier(), + }, + TypeError, + "does not support sample weight", + ), + ], +) +def test_stacking_classifier_error(y, params, type_err, msg_err): + with pytest.raises(type_err, match=msg_err): + clf = StackingClassifier(**params, cv=3) + clf.fit(scale(X_iris), y, sample_weight=np.ones(X_iris.shape[0])) + + +@pytest.mark.parametrize( + "y, params, type_err, msg_err", + [ + (y_diabetes, {"estimators": []}, ValueError, "Invalid 'estimators' attribute,"), + ( + y_diabetes, + {"estimators": [("lr", LinearRegression()), ("cor", NoWeightRegressor())]}, + TypeError, + "does not support sample weight", + ), + ( + y_diabetes, + { + "estimators": [ + ("lr", LinearRegression()), + ("cor", LinearSVR()), + ], + "final_estimator": NoWeightRegressor(), + }, + TypeError, + "does not support sample weight", + ), + ], +) +def test_stacking_regressor_error(y, params, type_err, msg_err): + with pytest.raises(type_err, match=msg_err): + reg = StackingRegressor(**params, cv=3) + reg.fit(scale(X_diabetes), y, sample_weight=np.ones(X_diabetes.shape[0])) + + +@pytest.mark.parametrize( + "estimator, X, y", + [ + ( + StackingClassifier( + estimators=[ + ("lr", LogisticRegression(random_state=0)), + ("svm", LinearSVC(random_state=0)), + ] + ), + X_iris[:100], + y_iris[:100], + ), # keep only classes 0 and 1 + ( + StackingRegressor( + estimators=[ + ("lr", LinearRegression()), + ("svm", LinearSVR(random_state=0)), + ] + ), + X_diabetes, + y_diabetes, + ), + ], + ids=["StackingClassifier", "StackingRegressor"], +) +def test_stacking_randomness(estimator, X, y): + # checking that fixing the random state of the CV will lead to the same + # results + estimator_full = clone(estimator) + estimator_full.set_params( + cv=KFold(shuffle=True, random_state=np.random.RandomState(0)) + ) + + estimator_drop = clone(estimator) + estimator_drop.set_params(lr="drop") + estimator_drop.set_params( 
+ cv=KFold(shuffle=True, random_state=np.random.RandomState(0)) + ) + + assert_allclose( + estimator_full.fit(X, y).transform(X)[:, 1:], + estimator_drop.fit(X, y).transform(X), + ) + + +def test_stacking_classifier_stratify_default(): + # check that we stratify the classes for the default CV + clf = StackingClassifier( + estimators=[ + ("lr", LogisticRegression(max_iter=10_000)), + ("svm", LinearSVC(max_iter=10_000)), + ] + ) + # since iris is not shuffled, a simple k-fold would not contain the + # 3 classes during training + clf.fit(X_iris, y_iris) + + +@pytest.mark.parametrize( + "stacker, X, y", + [ + ( + StackingClassifier( + estimators=[ + ("lr", LogisticRegression()), + ("svm", LinearSVC(random_state=42)), + ], + final_estimator=LogisticRegression(), + cv=KFold(shuffle=True, random_state=42), + ), + *load_breast_cancer(return_X_y=True), + ), + ( + StackingRegressor( + estimators=[ + ("lr", LinearRegression()), + ("svm", LinearSVR(random_state=42)), + ], + final_estimator=LinearRegression(), + cv=KFold(shuffle=True, random_state=42), + ), + X_diabetes, + y_diabetes, + ), + ], + ids=["StackingClassifier", "StackingRegressor"], +) +def test_stacking_with_sample_weight(stacker, X, y): + # check that sample weights have an influence on the fitting + # note: ConvergenceWarnings are caught since we are not worried about + # convergence here + n_half_samples = len(y) // 2 + total_sample_weight = np.array( + [0.1] * n_half_samples + [0.9] * (len(y) - n_half_samples) + ) + X_train, X_test, y_train, _, sample_weight_train, _ = train_test_split( + X, y, total_sample_weight, random_state=42 + ) + + with ignore_warnings(category=ConvergenceWarning): + stacker.fit(X_train, y_train) + y_pred_no_weight = stacker.predict(X_test) + + with ignore_warnings(category=ConvergenceWarning): + stacker.fit(X_train, y_train, sample_weight=np.ones(y_train.shape)) + y_pred_unit_weight = stacker.predict(X_test) + + assert_allclose(y_pred_no_weight, y_pred_unit_weight) + + with ignore_warnings(category=ConvergenceWarning): + stacker.fit(X_train, y_train, sample_weight=sample_weight_train) + y_pred_biased = stacker.predict(X_test) + + assert np.abs(y_pred_no_weight - y_pred_biased).sum() > 0 + + +def test_stacking_classifier_sample_weight_fit_param(): + # check sample_weight is passed to all invocations of fit + stacker = StackingClassifier( + estimators=[("lr", CheckingClassifier(expected_sample_weight=True))], + final_estimator=CheckingClassifier(expected_sample_weight=True), + ) + stacker.fit(X_iris, y_iris, sample_weight=np.ones(X_iris.shape[0])) + + +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +@pytest.mark.parametrize( + "stacker, X, y", + [ + ( + StackingClassifier( + estimators=[ + ("lr", LogisticRegression()), + ("svm", LinearSVC(random_state=42)), + ], + final_estimator=LogisticRegression(), + ), + *load_breast_cancer(return_X_y=True), + ), + ( + StackingRegressor( + estimators=[ + ("lr", LinearRegression()), + ("svm", LinearSVR(random_state=42)), + ], + final_estimator=LinearRegression(), + ), + X_diabetes, + y_diabetes, + ), + ], + ids=["StackingClassifier", "StackingRegressor"], +) +def test_stacking_cv_influence(stacker, X, y): + # check that the stacking affects the fit of the final estimator but not + # the fit of the base estimators + # note: ConvergenceWarnings are caught since we are not worried about + # convergence here + stacker_cv_3 = clone(stacker) + stacker_cv_5 = clone(stacker) + + stacker_cv_3.set_params(cv=3) + stacker_cv_5.set_params(cv=5) + + 
stacker_cv_3.fit(X, y) + stacker_cv_5.fit(X, y) + + # the base estimators should be identical + for est_cv_3, est_cv_5 in zip(stacker_cv_3.estimators_, stacker_cv_5.estimators_): + assert_allclose(est_cv_3.coef_, est_cv_5.coef_) + + # the final estimator should be different + with pytest.raises(AssertionError, match="Not equal"): + assert_allclose( + stacker_cv_3.final_estimator_.coef_, stacker_cv_5.final_estimator_.coef_ + ) + + +@pytest.mark.parametrize( + "Stacker, Estimator, stack_method, final_estimator, X, y", + [ + ( + StackingClassifier, + DummyClassifier, + "predict_proba", + LogisticRegression(random_state=42), + X_iris, + y_iris, + ), + ( + StackingRegressor, + DummyRegressor, + "predict", + LinearRegression(), + X_diabetes, + y_diabetes, + ), + ], +) +def test_stacking_prefit(Stacker, Estimator, stack_method, final_estimator, X, y): + """Check the behaviour of stacking when `cv='prefit'`""" + X_train1, X_train2, y_train1, y_train2 = train_test_split( + X, y, random_state=42, test_size=0.5 + ) + estimators = [ + ("d0", Estimator().fit(X_train1, y_train1)), + ("d1", Estimator().fit(X_train1, y_train1)), + ] + + # mock out fit and stack_method to be asserted later + for _, estimator in estimators: + estimator.fit = Mock(name="fit") + stack_func = getattr(estimator, stack_method) + predict_method_mocked = Mock(side_effect=stack_func) + # Mocking a method will not provide a `__name__` while Python methods + # do and we are using it in `_get_response_method`. + predict_method_mocked.__name__ = stack_method + setattr(estimator, stack_method, predict_method_mocked) + + stacker = Stacker( + estimators=estimators, cv="prefit", final_estimator=final_estimator + ) + stacker.fit(X_train2, y_train2) + + assert stacker.estimators_ == [estimator for _, estimator in estimators] + # fit was not called again + assert all(estimator.fit.call_count == 0 for estimator in stacker.estimators_) + + # stack method is called with the proper inputs + for estimator in stacker.estimators_: + stack_func_mock = getattr(estimator, stack_method) + stack_func_mock.assert_called_with(X_train2) + + +@pytest.mark.parametrize( + "stacker, X, y", + [ + ( + StackingClassifier( + estimators=[("lr", LogisticRegression()), ("svm", SVC())], + cv="prefit", + ), + X_iris, + y_iris, + ), + ( + StackingRegressor( + estimators=[ + ("lr", LinearRegression()), + ("svm", LinearSVR()), + ], + cv="prefit", + ), + X_diabetes, + y_diabetes, + ), + ], +) +def test_stacking_prefit_error(stacker, X, y): + # check that NotFittedError is raised + # if base estimators are not fitted when cv="prefit" + with pytest.raises(NotFittedError): + stacker.fit(X, y) + + +@pytest.mark.parametrize( + "make_dataset, Stacking, Estimator", + [ + (make_classification, StackingClassifier, LogisticRegression), + (make_regression, StackingRegressor, LinearRegression), + ], +) +def test_stacking_without_n_features_in(make_dataset, Stacking, Estimator): + # Stacking supports estimators without `n_features_in_`. 
Regression test + # for #17353 + + class MyEstimator(Estimator): + """Estimator without n_features_in_""" + + def fit(self, X, y): + super().fit(X, y) + del self.n_features_in_ + + X, y = make_dataset(random_state=0, n_samples=100) + stacker = Stacking(estimators=[("lr", MyEstimator())]) + + msg = f"{Stacking.__name__} object has no attribute n_features_in_" + with pytest.raises(AttributeError, match=msg): + stacker.n_features_in_ + + # Does not raise + stacker.fit(X, y) + + msg = "'MyEstimator' object has no attribute 'n_features_in_'" + with pytest.raises(AttributeError, match=msg): + stacker.n_features_in_ + + +@pytest.mark.parametrize( + "estimator", + [ + # output a 2D array of the probability of the positive class for each output + MLPClassifier(random_state=42), + # output a list of 2D array containing the probability of each class + # for each output + RandomForestClassifier(random_state=42), + ], + ids=["MLPClassifier", "RandomForestClassifier"], +) +def test_stacking_classifier_multilabel_predict_proba(estimator): + """Check the behaviour for the multilabel classification case and the + `predict_proba` stacking method. + + Estimators are not consistent with the output arrays and we need to ensure that + we handle all cases. + """ + X_train, X_test, y_train, y_test = train_test_split( + X_multilabel, y_multilabel, stratify=y_multilabel, random_state=42 + ) + n_outputs = 3 + + estimators = [("est", estimator)] + stacker = StackingClassifier( + estimators=estimators, + final_estimator=KNeighborsClassifier(), + stack_method="predict_proba", + ).fit(X_train, y_train) + + X_trans = stacker.transform(X_test) + assert X_trans.shape == (X_test.shape[0], n_outputs) + # we should not have any collinear classes and thus nothing should sum to 1 + assert not any(np.isclose(X_trans.sum(axis=1), 1.0)) + + y_pred = stacker.predict(X_test) + assert y_pred.shape == y_test.shape + + +def test_stacking_classifier_multilabel_decision_function(): + """Check the behaviour for the multilabel classification case and the + `decision_function` stacking method. Only `RidgeClassifier` supports this + case. + """ + X_train, X_test, y_train, y_test = train_test_split( + X_multilabel, y_multilabel, stratify=y_multilabel, random_state=42 + ) + n_outputs = 3 + + estimators = [("est", RidgeClassifier())] + stacker = StackingClassifier( + estimators=estimators, + final_estimator=KNeighborsClassifier(), + stack_method="decision_function", + ).fit(X_train, y_train) + + X_trans = stacker.transform(X_test) + assert X_trans.shape == (X_test.shape[0], n_outputs) + + y_pred = stacker.predict(X_test) + assert y_pred.shape == y_test.shape + + +@pytest.mark.parametrize("stack_method", ["auto", "predict"]) +@pytest.mark.parametrize("passthrough", [False, True]) +def test_stacking_classifier_multilabel_auto_predict(stack_method, passthrough): + """Check the behaviour for the multilabel classification case for stack methods + supported for all estimators or automatically picked up. 
+ """ + X_train, X_test, y_train, y_test = train_test_split( + X_multilabel, y_multilabel, stratify=y_multilabel, random_state=42 + ) + y_train_before_fit = y_train.copy() + n_outputs = 3 + + estimators = [ + ("mlp", MLPClassifier(random_state=42)), + ("rf", RandomForestClassifier(random_state=42)), + ("ridge", RidgeClassifier()), + ] + final_estimator = KNeighborsClassifier() + + clf = StackingClassifier( + estimators=estimators, + final_estimator=final_estimator, + passthrough=passthrough, + stack_method=stack_method, + ).fit(X_train, y_train) + + # make sure we don't change `y_train` inplace + assert_array_equal(y_train_before_fit, y_train) + + y_pred = clf.predict(X_test) + assert y_pred.shape == y_test.shape + + if stack_method == "auto": + expected_stack_methods = ["predict_proba", "predict_proba", "decision_function"] + else: + expected_stack_methods = ["predict"] * len(estimators) + assert clf.stack_method_ == expected_stack_methods + + n_features_X_trans = n_outputs * len(estimators) + if passthrough: + n_features_X_trans += X_train.shape[1] + X_trans = clf.transform(X_test) + assert X_trans.shape == (X_test.shape[0], n_features_X_trans) + + assert_array_equal(clf.classes_, [np.array([0, 1])] * n_outputs) + + +@pytest.mark.parametrize( + "stacker, feature_names, X, y, expected_names", + [ + ( + StackingClassifier( + estimators=[ + ("lr", LogisticRegression(random_state=0)), + ("svm", LinearSVC(random_state=0)), + ] + ), + iris.feature_names, + X_iris, + y_iris, + [ + "stackingclassifier_lr0", + "stackingclassifier_lr1", + "stackingclassifier_lr2", + "stackingclassifier_svm0", + "stackingclassifier_svm1", + "stackingclassifier_svm2", + ], + ), + ( + StackingClassifier( + estimators=[ + ("lr", LogisticRegression(random_state=0)), + ("other", "drop"), + ("svm", LinearSVC(random_state=0)), + ] + ), + iris.feature_names, + X_iris[:100], + y_iris[:100], # keep only classes 0 and 1 + [ + "stackingclassifier_lr", + "stackingclassifier_svm", + ], + ), + ( + StackingRegressor( + estimators=[ + ("lr", LinearRegression()), + ("svm", LinearSVR(random_state=0)), + ] + ), + diabetes.feature_names, + X_diabetes, + y_diabetes, + [ + "stackingregressor_lr", + "stackingregressor_svm", + ], + ), + ], + ids=[ + "StackingClassifier_multiclass", + "StackingClassifier_binary", + "StackingRegressor", + ], +) +@pytest.mark.parametrize("passthrough", [True, False]) +def test_get_feature_names_out( + stacker, feature_names, X, y, expected_names, passthrough +): + """Check get_feature_names_out works for stacking.""" + + stacker.set_params(passthrough=passthrough) + stacker.fit(scale(X), y) + + if passthrough: + expected_names = np.concatenate((expected_names, feature_names)) + + names_out = stacker.get_feature_names_out(feature_names) + assert_array_equal(names_out, expected_names) + + +def test_stacking_classifier_base_regressor(): + """Check that a regressor can be used as the first layer in `StackingClassifier`.""" + X_train, X_test, y_train, y_test = train_test_split( + scale(X_iris), y_iris, stratify=y_iris, random_state=42 + ) + clf = StackingClassifier(estimators=[("ridge", Ridge())]) + clf.fit(X_train, y_train) + clf.predict(X_test) + clf.predict_proba(X_test) + assert clf.score(X_test, y_test) > 0.8 + + +def test_stacking_final_estimator_attribute_error(): + """Check that we raise the proper AttributeError when the final estimator + does not implement the `decision_function` method, which is decorated with + `available_if`. 
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/28108 + """ + X, y = make_classification(random_state=42) + + estimators = [ + ("lr", LogisticRegression()), + ("rf", RandomForestClassifier(n_estimators=2, random_state=42)), + ] + # RandomForestClassifier does not implement 'decision_function' and should raise + # an AttributeError + final_estimator = RandomForestClassifier(n_estimators=2, random_state=42) + clf = StackingClassifier( + estimators=estimators, final_estimator=final_estimator, cv=3 + ) + + outer_msg = "This 'StackingClassifier' has no attribute 'decision_function'" + inner_msg = "'RandomForestClassifier' object has no attribute 'decision_function'" + with pytest.raises(AttributeError, match=outer_msg) as exec_info: + clf.fit(X, y).decision_function(X) + assert isinstance(exec_info.value.__cause__, AttributeError) + assert inner_msg in str(exec_info.value.__cause__) + + +# Metadata Routing Tests +# ====================== + + +@pytest.mark.parametrize( + "Estimator, Child", + [ + (StackingClassifier, ConsumingClassifier), + (StackingRegressor, ConsumingRegressor), + ], +) +def test_routing_passed_metadata_not_supported(Estimator, Child): + """Test that the right error message is raised when metadata is passed while + not supported when `enable_metadata_routing=False`.""" + + with pytest.raises( + ValueError, match="is only supported if enable_metadata_routing=True" + ): + Estimator(["clf", Child()]).fit( + X_iris, y_iris, sample_weight=[1, 1, 1, 1, 1], metadata="a" + ) + + +@pytest.mark.parametrize( + "Estimator, Child", + [ + (StackingClassifier, ConsumingClassifier), + (StackingRegressor, ConsumingRegressor), + ], +) +@config_context(enable_metadata_routing=True) +def test_get_metadata_routing_without_fit(Estimator, Child): + # Test that metadata_routing() doesn't raise when called before fit. 
+ est = Estimator([("sub_est", Child())]) + est.get_metadata_routing() + + +@pytest.mark.parametrize( + "Estimator, Child", + [ + (StackingClassifier, ConsumingClassifier), + (StackingRegressor, ConsumingRegressor), + ], +) +@pytest.mark.parametrize( + "prop, prop_value", [("sample_weight", np.ones(X_iris.shape[0])), ("metadata", "a")] +) +@config_context(enable_metadata_routing=True) +def test_metadata_routing_for_stacking_estimators(Estimator, Child, prop, prop_value): + """Test that metadata is routed correctly for Stacking*.""" + + est = Estimator( + [ + ( + "sub_est1", + Child(registry=_Registry()).set_fit_request(**{prop: True}), + ), + ( + "sub_est2", + Child(registry=_Registry()).set_fit_request(**{prop: True}), + ), + ], + final_estimator=Child(registry=_Registry()).set_predict_request(**{prop: True}), + ) + + est.fit(X_iris, y_iris, **{prop: prop_value}) + est.fit_transform(X_iris, y_iris, **{prop: prop_value}) + + est.predict(X_iris, **{prop: prop_value}) + + for estimator in est.estimators: + # access sub-estimator in (name, est) with estimator[1]: + registry = estimator[1].registry + assert len(registry) + for sub_est in registry: + check_recorded_metadata( + obj=sub_est, + method="fit", + parent="fit", + split_params=(prop), + **{prop: prop_value}, + ) + # access final_estimator: + registry = est.final_estimator_.registry + assert len(registry) + check_recorded_metadata( + obj=registry[-1], + method="predict", + parent="predict", + split_params=(prop), + **{prop: prop_value}, + ) + + +@pytest.mark.parametrize( + "Estimator, Child", + [ + (StackingClassifier, ConsumingClassifier), + (StackingRegressor, ConsumingRegressor), + ], +) +@config_context(enable_metadata_routing=True) +def test_metadata_routing_error_for_stacking_estimators(Estimator, Child): + """Test that the right error is raised when metadata is not requested.""" + sample_weight, metadata = np.ones(X_iris.shape[0]), "a" + + est = Estimator([("sub_est", Child())]) + + error_message = ( + "[sample_weight, metadata] are passed but are not explicitly set as requested" + f" or not requested for {Child.__name__}.fit" + ) + + with pytest.raises(ValueError, match=re.escape(error_message)): + est.fit(X_iris, y_iris, sample_weight=sample_weight, metadata=metadata) + + +# End of Metadata Routing Tests +# ============================= diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_voting.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_voting.py new file mode 100644 index 0000000000000000000000000000000000000000..fc3fc82c2bee8a29d9b3da95a3cb231c86c3c71d --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_voting.py @@ -0,0 +1,793 @@ +"""Testing for the VotingClassifier and VotingRegressor""" + +import re + +import numpy as np +import pytest + +from sklearn import config_context, datasets +from sklearn.base import BaseEstimator, ClassifierMixin, clone +from sklearn.datasets import make_multilabel_classification +from sklearn.dummy import DummyRegressor +from sklearn.ensemble import ( + RandomForestClassifier, + RandomForestRegressor, + VotingClassifier, + VotingRegressor, +) +from sklearn.exceptions import NotFittedError +from sklearn.linear_model import LinearRegression, LogisticRegression +from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split +from sklearn.multiclass import OneVsRestClassifier +from sklearn.naive_bayes import GaussianNB +from sklearn.neighbors import KNeighborsClassifier +from sklearn.preprocessing 
import StandardScaler +from sklearn.svm import SVC +from sklearn.tests.metadata_routing_common import ( + ConsumingClassifier, + ConsumingRegressor, + _Registry, + check_recorded_metadata, +) +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor +from sklearn.utils._testing import ( + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) + +# Load datasets +iris = datasets.load_iris() +X, y = iris.data[:, 1:3], iris.target +# Scaled to solve ConvergenceWarning throw by Logistic Regression +X_scaled = StandardScaler().fit_transform(X) + +X_r, y_r = datasets.load_diabetes(return_X_y=True) + + +@pytest.mark.parametrize( + "params, err_msg", + [ + ( + {"estimators": []}, + "Invalid 'estimators' attribute, 'estimators' should be a non-empty list", + ), + ( + {"estimators": [LogisticRegression()]}, + "Invalid 'estimators' attribute, 'estimators' should be a non-empty list", + ), + ( + {"estimators": [(213, LogisticRegression())]}, + "Invalid 'estimators' attribute, 'estimators' should be a non-empty list", + ), + ( + {"estimators": [("lr", LogisticRegression())], "weights": [1, 2]}, + "Number of `estimators` and weights must be equal", + ), + ], +) +def test_voting_classifier_estimator_init(params, err_msg): + ensemble = VotingClassifier(**params) + with pytest.raises(ValueError, match=err_msg): + ensemble.fit(X, y) + + +def test_predictproba_hardvoting(): + eclf = VotingClassifier( + estimators=[("lr1", LogisticRegression()), ("lr2", LogisticRegression())], + voting="hard", + ) + + inner_msg = "predict_proba is not available when voting='hard'" + outer_msg = "'VotingClassifier' has no attribute 'predict_proba'" + with pytest.raises(AttributeError, match=outer_msg) as exec_info: + eclf.predict_proba + assert isinstance(exec_info.value.__cause__, AttributeError) + assert inner_msg in str(exec_info.value.__cause__) + + assert not hasattr(eclf, "predict_proba") + eclf.fit(X_scaled, y) + assert not hasattr(eclf, "predict_proba") + + +def test_notfitted(): + eclf = VotingClassifier( + estimators=[("lr1", LogisticRegression()), ("lr2", LogisticRegression())], + voting="soft", + ) + ereg = VotingRegressor([("dr", DummyRegressor())]) + msg = ( + "This %s instance is not fitted yet. Call 'fit'" + " with appropriate arguments before using this estimator." 
+ ) + with pytest.raises(NotFittedError, match=msg % "VotingClassifier"): + eclf.predict(X) + with pytest.raises(NotFittedError, match=msg % "VotingClassifier"): + eclf.predict_proba(X) + with pytest.raises(NotFittedError, match=msg % "VotingClassifier"): + eclf.transform(X) + with pytest.raises(NotFittedError, match=msg % "VotingRegressor"): + ereg.predict(X_r) + with pytest.raises(NotFittedError, match=msg % "VotingRegressor"): + ereg.transform(X_r) + + +def test_majority_label_iris(global_random_seed): + """Check classification by majority label on dataset iris.""" + clf1 = LogisticRegression(random_state=global_random_seed) + clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed) + clf3 = GaussianNB() + eclf = VotingClassifier( + estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="hard" + ) + scores = cross_val_score(eclf, X, y, scoring="accuracy") + + assert scores.mean() >= 0.9 + + +def test_tie_situation(): + """Check voting classifier selects smaller class label in tie situation.""" + clf1 = LogisticRegression(random_state=123) + clf2 = RandomForestClassifier(random_state=123) + eclf = VotingClassifier(estimators=[("lr", clf1), ("rf", clf2)], voting="hard") + assert clf1.fit(X, y).predict(X)[52] == 2 + assert clf2.fit(X, y).predict(X)[52] == 1 + assert eclf.fit(X, y).predict(X)[52] == 1 + + +def test_weights_iris(global_random_seed): + """Check classification by average probabilities on dataset iris.""" + clf1 = LogisticRegression(random_state=global_random_seed) + clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed) + clf3 = GaussianNB() + eclf = VotingClassifier( + estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], + voting="soft", + weights=[1, 2, 10], + ) + scores = cross_val_score(eclf, X_scaled, y, scoring="accuracy") + assert scores.mean() >= 0.9 + + +def test_weights_regressor(): + """Check weighted average regression prediction on diabetes dataset.""" + reg1 = DummyRegressor(strategy="mean") + reg2 = DummyRegressor(strategy="median") + reg3 = DummyRegressor(strategy="quantile", quantile=0.2) + ereg = VotingRegressor( + [("mean", reg1), ("median", reg2), ("quantile", reg3)], weights=[1, 2, 10] + ) + + X_r_train, X_r_test, y_r_train, y_r_test = train_test_split( + X_r, y_r, test_size=0.25 + ) + + reg1_pred = reg1.fit(X_r_train, y_r_train).predict(X_r_test) + reg2_pred = reg2.fit(X_r_train, y_r_train).predict(X_r_test) + reg3_pred = reg3.fit(X_r_train, y_r_train).predict(X_r_test) + ereg_pred = ereg.fit(X_r_train, y_r_train).predict(X_r_test) + + avg = np.average( + np.asarray([reg1_pred, reg2_pred, reg3_pred]), axis=0, weights=[1, 2, 10] + ) + assert_almost_equal(ereg_pred, avg, decimal=2) + + ereg_weights_none = VotingRegressor( + [("mean", reg1), ("median", reg2), ("quantile", reg3)], weights=None + ) + ereg_weights_equal = VotingRegressor( + [("mean", reg1), ("median", reg2), ("quantile", reg3)], weights=[1, 1, 1] + ) + ereg_weights_none.fit(X_r_train, y_r_train) + ereg_weights_equal.fit(X_r_train, y_r_train) + ereg_none_pred = ereg_weights_none.predict(X_r_test) + ereg_equal_pred = ereg_weights_equal.predict(X_r_test) + assert_almost_equal(ereg_none_pred, ereg_equal_pred, decimal=2) + + +def test_predict_on_toy_problem(global_random_seed): + """Manually check predicted class labels for toy dataset.""" + clf1 = LogisticRegression(random_state=global_random_seed) + clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed) + clf3 = GaussianNB() + + X = np.array( + [[-1.1, -1.5], 
[-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2], [2.1, 1.4], [3.1, 2.3]] + ) + + y = np.array([1, 1, 1, 2, 2, 2]) + + assert_array_equal(clf1.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2]) + assert_array_equal(clf2.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2]) + assert_array_equal(clf3.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2]) + + eclf = VotingClassifier( + estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], + voting="hard", + weights=[1, 1, 1], + ) + assert_array_equal(eclf.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2]) + + eclf = VotingClassifier( + estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], + voting="soft", + weights=[1, 1, 1], + ) + assert_array_equal(eclf.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2]) + + +def test_predict_proba_on_toy_problem(): + """Calculate predicted probabilities on toy dataset.""" + clf1 = LogisticRegression(random_state=123) + clf2 = RandomForestClassifier(random_state=123) + clf3 = GaussianNB() + X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]]) + y = np.array([1, 1, 2, 2]) + + clf1_res = np.array( + [ + [0.59790391, 0.40209609], + [0.57622162, 0.42377838], + [0.50728456, 0.49271544], + [0.40241774, 0.59758226], + ] + ) + + clf2_res = np.array([[0.8, 0.2], [0.8, 0.2], [0.2, 0.8], [0.3, 0.7]]) + + clf3_res = np.array( + [[0.9985082, 0.0014918], [0.99845843, 0.00154157], [0.0, 1.0], [0.0, 1.0]] + ) + + t00 = (2 * clf1_res[0][0] + clf2_res[0][0] + clf3_res[0][0]) / 4 + t11 = (2 * clf1_res[1][1] + clf2_res[1][1] + clf3_res[1][1]) / 4 + t21 = (2 * clf1_res[2][1] + clf2_res[2][1] + clf3_res[2][1]) / 4 + t31 = (2 * clf1_res[3][1] + clf2_res[3][1] + clf3_res[3][1]) / 4 + + eclf = VotingClassifier( + estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], + voting="soft", + weights=[2, 1, 1], + ) + eclf_res = eclf.fit(X, y).predict_proba(X) + + assert_almost_equal(t00, eclf_res[0][0], decimal=1) + assert_almost_equal(t11, eclf_res[1][1], decimal=1) + assert_almost_equal(t21, eclf_res[2][1], decimal=1) + assert_almost_equal(t31, eclf_res[3][1], decimal=1) + + inner_msg = "predict_proba is not available when voting='hard'" + outer_msg = "'VotingClassifier' has no attribute 'predict_proba'" + with pytest.raises(AttributeError, match=outer_msg) as exec_info: + eclf = VotingClassifier( + estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="hard" + ) + eclf.fit(X, y).predict_proba(X) + + assert isinstance(exec_info.value.__cause__, AttributeError) + assert inner_msg in str(exec_info.value.__cause__) + + +def test_multilabel(): + """Check if error is raised for multilabel classification.""" + X, y = make_multilabel_classification( + n_classes=2, n_labels=1, allow_unlabeled=False, random_state=123 + ) + clf = OneVsRestClassifier(SVC(kernel="linear")) + + eclf = VotingClassifier(estimators=[("ovr", clf)], voting="hard") + + try: + eclf.fit(X, y) + except NotImplementedError: + return + + +def test_gridsearch(): + """Check GridSearch support.""" + clf1 = LogisticRegression(random_state=1) + clf2 = RandomForestClassifier(random_state=1, n_estimators=3) + clf3 = GaussianNB() + eclf = VotingClassifier( + estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="soft" + ) + + params = { + "lr__C": [1.0, 100.0], + "voting": ["soft", "hard"], + "weights": [[0.5, 0.5, 0.5], [1.0, 0.5, 0.5]], + } + + grid = GridSearchCV(estimator=eclf, param_grid=params, cv=2) + grid.fit(X_scaled, y) + + +def test_parallel_fit(global_random_seed): + """Check parallel backend of VotingClassifier on toy dataset.""" + clf1 = LogisticRegression(random_state=global_random_seed) + 
clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed) + clf3 = GaussianNB() + X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]]) + y = np.array([1, 1, 2, 2]) + + eclf1 = VotingClassifier( + estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="soft", n_jobs=1 + ).fit(X, y) + eclf2 = VotingClassifier( + estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="soft", n_jobs=2 + ).fit(X, y) + + assert_array_equal(eclf1.predict(X), eclf2.predict(X)) + assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X)) + + +def test_sample_weight(global_random_seed): + """Tests sample_weight parameter of VotingClassifier""" + clf1 = LogisticRegression(random_state=global_random_seed) + clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed) + clf3 = SVC(probability=True, random_state=global_random_seed) + eclf1 = VotingClassifier( + estimators=[("lr", clf1), ("rf", clf2), ("svc", clf3)], voting="soft" + ).fit(X_scaled, y, sample_weight=np.ones((len(y),))) + eclf2 = VotingClassifier( + estimators=[("lr", clf1), ("rf", clf2), ("svc", clf3)], voting="soft" + ).fit(X_scaled, y) + assert_array_equal(eclf1.predict(X_scaled), eclf2.predict(X_scaled)) + assert_array_almost_equal( + eclf1.predict_proba(X_scaled), eclf2.predict_proba(X_scaled) + ) + sample_weight = np.random.RandomState(global_random_seed).uniform(size=(len(y),)) + eclf3 = VotingClassifier(estimators=[("lr", clf1)], voting="soft") + eclf3.fit(X_scaled, y, sample_weight=sample_weight) + clf1.fit(X_scaled, y, sample_weight) + assert_array_equal(eclf3.predict(X_scaled), clf1.predict(X_scaled)) + assert_array_almost_equal( + eclf3.predict_proba(X_scaled), clf1.predict_proba(X_scaled) + ) + + # check that an error is raised and indicative if sample_weight is not + # supported. + clf4 = KNeighborsClassifier() + eclf3 = VotingClassifier( + estimators=[("lr", clf1), ("svc", clf3), ("knn", clf4)], voting="soft" + ) + msg = "Underlying estimator KNeighborsClassifier does not support sample weights." + with pytest.raises(TypeError, match=msg): + eclf3.fit(X_scaled, y, sample_weight=sample_weight) + + # check that _fit_single_estimator will raise the right error + # it should raise the original error if this is not linked to sample_weight + class ClassifierErrorFit(ClassifierMixin, BaseEstimator): + def fit(self, X_scaled, y, sample_weight): + raise TypeError("Error unrelated to sample_weight.") + + clf = ClassifierErrorFit() + with pytest.raises(TypeError, match="Error unrelated to sample_weight"): + clf.fit(X_scaled, y, sample_weight=sample_weight) + + +def test_sample_weight_kwargs(): + """Check that VotingClassifier passes sample_weight as kwargs""" + + class MockClassifier(ClassifierMixin, BaseEstimator): + """Mock Classifier to check that sample_weight is received as kwargs""" + + def fit(self, X, y, *args, **sample_weight): + assert "sample_weight" in sample_weight + + clf = MockClassifier() + eclf = VotingClassifier(estimators=[("mock", clf)], voting="soft") + + # Should not raise an error. 
+ eclf.fit(X, y, sample_weight=np.ones((len(y),))) + + +def test_voting_classifier_set_params(global_random_seed): + # check equivalence in the output when setting underlying estimators + clf1 = LogisticRegression(random_state=global_random_seed) + clf2 = RandomForestClassifier( + n_estimators=10, random_state=global_random_seed, max_depth=None + ) + clf3 = GaussianNB() + + eclf1 = VotingClassifier( + [("lr", clf1), ("rf", clf2)], voting="soft", weights=[1, 2] + ).fit(X_scaled, y) + eclf2 = VotingClassifier( + [("lr", clf1), ("nb", clf3)], voting="soft", weights=[1, 2] + ) + eclf2.set_params(nb=clf2).fit(X_scaled, y) + + assert_array_equal(eclf1.predict(X_scaled), eclf2.predict(X_scaled)) + assert_array_almost_equal( + eclf1.predict_proba(X_scaled), eclf2.predict_proba(X_scaled) + ) + assert eclf2.estimators[0][1].get_params() == clf1.get_params() + assert eclf2.estimators[1][1].get_params() == clf2.get_params() + + +def test_set_estimator_drop(): + # VotingClassifier set_params should be able to set estimators as drop + # Test predict + clf1 = LogisticRegression(random_state=123) + clf2 = RandomForestClassifier(n_estimators=10, random_state=123) + clf3 = GaussianNB() + eclf1 = VotingClassifier( + estimators=[("lr", clf1), ("rf", clf2), ("nb", clf3)], + voting="hard", + weights=[1, 0, 0.5], + ).fit(X, y) + + eclf2 = VotingClassifier( + estimators=[("lr", clf1), ("rf", clf2), ("nb", clf3)], + voting="hard", + weights=[1, 1, 0.5], + ) + eclf2.set_params(rf="drop").fit(X, y) + + assert_array_equal(eclf1.predict(X), eclf2.predict(X)) + + assert dict(eclf2.estimators)["rf"] == "drop" + assert len(eclf2.estimators_) == 2 + assert all( + isinstance(est, (LogisticRegression, GaussianNB)) for est in eclf2.estimators_ + ) + assert eclf2.get_params()["rf"] == "drop" + + eclf1.set_params(voting="soft").fit(X, y) + eclf2.set_params(voting="soft").fit(X, y) + + assert_array_equal(eclf1.predict(X), eclf2.predict(X)) + assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X)) + msg = "All estimators are dropped. 
At least one is required" + with pytest.raises(ValueError, match=msg): + eclf2.set_params(lr="drop", rf="drop", nb="drop").fit(X, y) + + # Test soft voting transform + X1 = np.array([[1], [2]]) + y1 = np.array([1, 2]) + eclf1 = VotingClassifier( + estimators=[("rf", clf2), ("nb", clf3)], + voting="soft", + weights=[0, 0.5], + flatten_transform=False, + ).fit(X1, y1) + + eclf2 = VotingClassifier( + estimators=[("rf", clf2), ("nb", clf3)], + voting="soft", + weights=[1, 0.5], + flatten_transform=False, + ) + eclf2.set_params(rf="drop").fit(X1, y1) + assert_array_almost_equal( + eclf1.transform(X1), + np.array([[[0.7, 0.3], [0.3, 0.7]], [[1.0, 0.0], [0.0, 1.0]]]), + ) + assert_array_almost_equal(eclf2.transform(X1), np.array([[[1.0, 0.0], [0.0, 1.0]]])) + eclf1.set_params(voting="hard") + eclf2.set_params(voting="hard") + assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]])) + assert_array_equal(eclf2.transform(X1), np.array([[0], [1]])) + + +def test_estimator_weights_format(global_random_seed): + # Test estimator weights inputs as list and array + clf1 = LogisticRegression(random_state=global_random_seed) + clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed) + eclf1 = VotingClassifier( + estimators=[("lr", clf1), ("rf", clf2)], weights=[1, 2], voting="soft" + ) + eclf2 = VotingClassifier( + estimators=[("lr", clf1), ("rf", clf2)], weights=np.array((1, 2)), voting="soft" + ) + eclf1.fit(X_scaled, y) + eclf2.fit(X_scaled, y) + assert_array_almost_equal( + eclf1.predict_proba(X_scaled), eclf2.predict_proba(X_scaled) + ) + + +def test_transform(global_random_seed): + """Check transform method of VotingClassifier on toy dataset.""" + clf1 = LogisticRegression(random_state=global_random_seed) + clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed) + clf3 = GaussianNB() + X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]]) + y = np.array([1, 1, 2, 2]) + + eclf1 = VotingClassifier( + estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="soft" + ).fit(X, y) + eclf2 = VotingClassifier( + estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], + voting="soft", + flatten_transform=True, + ).fit(X, y) + eclf3 = VotingClassifier( + estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], + voting="soft", + flatten_transform=False, + ).fit(X, y) + + assert_array_equal(eclf1.transform(X).shape, (4, 6)) + assert_array_equal(eclf2.transform(X).shape, (4, 6)) + assert_array_equal(eclf3.transform(X).shape, (3, 4, 2)) + assert_array_almost_equal(eclf1.transform(X), eclf2.transform(X)) + assert_array_almost_equal( + eclf3.transform(X).swapaxes(0, 1).reshape((4, 6)), eclf2.transform(X) + ) + + +@pytest.mark.parametrize( + "X, y, voter", + [ + ( + X, + y, + VotingClassifier( + [ + ("lr", LogisticRegression()), + ("rf", RandomForestClassifier(n_estimators=5)), + ] + ), + ), + ( + X_r, + y_r, + VotingRegressor( + [ + ("lr", LinearRegression()), + ("rf", RandomForestRegressor(n_estimators=5)), + ] + ), + ), + ], +) +def test_none_estimator_with_weights(X, y, voter): + # check that an estimator can be set to 'drop' and passing some weight + # regression test for + # https://github.com/scikit-learn/scikit-learn/issues/13777 + voter = clone(voter) + # Scaled to solve ConvergenceWarning throw by Logistic Regression + X_scaled = StandardScaler().fit_transform(X) + voter.fit(X_scaled, y, sample_weight=np.ones(y.shape)) + voter.set_params(lr="drop") + voter.fit(X_scaled, y, sample_weight=np.ones(y.shape)) + y_pred = 
voter.predict(X_scaled) + assert y_pred.shape == y.shape + + +@pytest.mark.parametrize( + "est", + [ + VotingRegressor( + estimators=[ + ("lr", LinearRegression()), + ("tree", DecisionTreeRegressor(random_state=0)), + ] + ), + VotingClassifier( + estimators=[ + ("lr", LogisticRegression(random_state=0)), + ("tree", DecisionTreeClassifier(random_state=0)), + ] + ), + ], + ids=["VotingRegressor", "VotingClassifier"], +) +def test_n_features_in(est): + X = [[1, 2], [3, 4], [5, 6]] + y = [0, 1, 2] + + assert not hasattr(est, "n_features_in_") + est.fit(X, y) + assert est.n_features_in_ == 2 + + +@pytest.mark.parametrize( + "estimator", + [ + VotingRegressor( + estimators=[ + ("lr", LinearRegression()), + ("rf", RandomForestRegressor(random_state=123)), + ], + verbose=True, + ), + VotingClassifier( + estimators=[ + ("lr", LogisticRegression(random_state=123)), + ("rf", RandomForestClassifier(random_state=123)), + ], + verbose=True, + ), + ], +) +def test_voting_verbose(estimator, capsys): + X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]]) + y = np.array([1, 1, 2, 2]) + + pattern = ( + r"\[Voting\].*\(1 of 2\) Processing lr, total=.*\n" + r"\[Voting\].*\(2 of 2\) Processing rf, total=.*\n$" + ) + clone(estimator).fit(X, y) + assert re.match(pattern, capsys.readouterr()[0]) + + +def test_get_features_names_out_regressor(): + """Check get_feature_names_out output for regressor.""" + + X = [[1, 2], [3, 4], [5, 6]] + y = [0, 1, 2] + + voting = VotingRegressor( + estimators=[ + ("lr", LinearRegression()), + ("tree", DecisionTreeRegressor(random_state=0)), + ("ignore", "drop"), + ] + ) + voting.fit(X, y) + + names_out = voting.get_feature_names_out() + expected_names = ["votingregressor_lr", "votingregressor_tree"] + assert_array_equal(names_out, expected_names) + + +@pytest.mark.parametrize( + "kwargs, expected_names", + [ + ( + {"voting": "soft", "flatten_transform": True}, + [ + "votingclassifier_lr0", + "votingclassifier_lr1", + "votingclassifier_lr2", + "votingclassifier_tree0", + "votingclassifier_tree1", + "votingclassifier_tree2", + ], + ), + ({"voting": "hard"}, ["votingclassifier_lr", "votingclassifier_tree"]), + ], +) +def test_get_features_names_out_classifier(kwargs, expected_names): + """Check get_feature_names_out for classifier for different settings.""" + X = [[1, 2], [3, 4], [5, 6], [1, 1.2]] + y = [0, 1, 2, 0] + + voting = VotingClassifier( + estimators=[ + ("lr", LogisticRegression(random_state=0)), + ("tree", DecisionTreeClassifier(random_state=0)), + ], + **kwargs, + ) + voting.fit(X, y) + X_trans = voting.transform(X) + names_out = voting.get_feature_names_out() + + assert X_trans.shape[1] == len(expected_names) + assert_array_equal(names_out, expected_names) + + +def test_get_features_names_out_classifier_error(): + """Check that error is raised when voting="soft" and flatten_transform=False.""" + X = [[1, 2], [3, 4], [5, 6]] + y = [0, 1, 2] + + voting = VotingClassifier( + estimators=[ + ("lr", LogisticRegression(random_state=0)), + ("tree", DecisionTreeClassifier(random_state=0)), + ], + voting="soft", + flatten_transform=False, + ) + voting.fit(X, y) + + msg = ( + "get_feature_names_out is not supported when `voting='soft'` and " + "`flatten_transform=False`" + ) + with pytest.raises(ValueError, match=msg): + voting.get_feature_names_out() + + +# Metadata Routing Tests +# ====================== + + +@pytest.mark.parametrize( + "Estimator, Child", + [(VotingClassifier, ConsumingClassifier), (VotingRegressor, ConsumingRegressor)], +) +def 
test_routing_passed_metadata_not_supported(Estimator, Child): + """Test that the right error message is raised when metadata is passed while + not supported when `enable_metadata_routing=False`.""" + + X = np.array([[0, 1], [2, 2], [4, 6]]) + y = [1, 2, 3] + + with pytest.raises( + ValueError, match="is only supported if enable_metadata_routing=True" + ): + Estimator(["clf", Child()]).fit(X, y, sample_weight=[1, 1, 1], metadata="a") + + +@pytest.mark.parametrize( + "Estimator, Child", + [(VotingClassifier, ConsumingClassifier), (VotingRegressor, ConsumingRegressor)], +) +@config_context(enable_metadata_routing=True) +def test_get_metadata_routing_without_fit(Estimator, Child): + # Test that metadata_routing() doesn't raise when called before fit. + est = Estimator([("sub_est", Child())]) + est.get_metadata_routing() + + +@pytest.mark.parametrize( + "Estimator, Child", + [(VotingClassifier, ConsumingClassifier), (VotingRegressor, ConsumingRegressor)], +) +@pytest.mark.parametrize("prop", ["sample_weight", "metadata"]) +@config_context(enable_metadata_routing=True) +def test_metadata_routing_for_voting_estimators(Estimator, Child, prop): + """Test that metadata is routed correctly for Voting*.""" + X = np.array([[0, 1], [2, 2], [4, 6]]) + y = [1, 2, 3] + sample_weight, metadata = [1, 1, 1], "a" + + est = Estimator( + [ + ( + "sub_est1", + Child(registry=_Registry()).set_fit_request(**{prop: True}), + ), + ( + "sub_est2", + Child(registry=_Registry()).set_fit_request(**{prop: True}), + ), + ] + ) + + est.fit(X, y, **{prop: sample_weight if prop == "sample_weight" else metadata}) + + for estimator in est.estimators: + if prop == "sample_weight": + kwargs = {prop: sample_weight} + else: + kwargs = {prop: metadata} + # access sub-estimator in (name, est) with estimator[1] + registry = estimator[1].registry + assert len(registry) + for sub_est in registry: + check_recorded_metadata(obj=sub_est, method="fit", parent="fit", **kwargs) + + +@pytest.mark.parametrize( + "Estimator, Child", + [(VotingClassifier, ConsumingClassifier), (VotingRegressor, ConsumingRegressor)], +) +@config_context(enable_metadata_routing=True) +def test_metadata_routing_error_for_voting_estimators(Estimator, Child): + """Test that the right error is raised when metadata is not requested.""" + X = np.array([[0, 1], [2, 2], [4, 6]]) + y = [1, 2, 3] + sample_weight, metadata = [1, 1, 1], "a" + + est = Estimator([("sub_est", Child())]) + + error_message = ( + "[sample_weight, metadata] are passed but are not explicitly set as requested" + f" or not requested for {Child.__name__}.fit" + ) + + with pytest.raises(ValueError, match=re.escape(error_message)): + est.fit(X, y, sample_weight=sample_weight, metadata=metadata) + + +# End of Metadata Routing Tests +# ============================= diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_weight_boosting.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_weight_boosting.py new file mode 100644 index 0000000000000000000000000000000000000000..55825c438d76b29b74d8108970f72e3ebaa5e745 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_weight_boosting.py @@ -0,0 +1,639 @@ +"""Testing for the boost module (sklearn.ensemble.boost).""" + +import re + +import numpy as np +import pytest + +from sklearn import datasets +from sklearn.base import BaseEstimator, clone +from sklearn.dummy import DummyClassifier, DummyRegressor +from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor +from 
sklearn.ensemble._weight_boosting import _samme_proba +from sklearn.linear_model import LinearRegression +from sklearn.model_selection import GridSearchCV, train_test_split +from sklearn.svm import SVC, SVR +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor +from sklearn.utils import shuffle +from sklearn.utils._mocking import NoSampleWeightWrapper +from sklearn.utils._testing import ( + assert_allclose, + assert_array_almost_equal, + assert_array_equal, +) +from sklearn.utils.fixes import ( + COO_CONTAINERS, + CSC_CONTAINERS, + CSR_CONTAINERS, + DOK_CONTAINERS, + LIL_CONTAINERS, +) + +# Common random state +rng = np.random.RandomState(0) + +# Toy sample +X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] +y_class = ["foo", "foo", "foo", 1, 1, 1] # test string class labels +y_regr = [-1, -1, -1, 1, 1, 1] +T = [[-1, -1], [2, 2], [3, 2]] +y_t_class = ["foo", 1, 1] +y_t_regr = [-1, 1, 1] + +# Load the iris dataset and randomly permute it +iris = datasets.load_iris() +perm = rng.permutation(iris.target.size) +iris.data, iris.target = shuffle(iris.data, iris.target, random_state=rng) + +# Load the diabetes dataset and randomly permute it +diabetes = datasets.load_diabetes() +diabetes.data, diabetes.target = shuffle( + diabetes.data, diabetes.target, random_state=rng +) + + +def test_samme_proba(): + # Test the `_samme_proba` helper function. + + # Define some example (bad) `predict_proba` output. + probs = np.array( + [[1, 1e-6, 0], [0.19, 0.6, 0.2], [-999, 0.51, 0.5], [1e-6, 1, 1e-9]] + ) + probs /= np.abs(probs.sum(axis=1))[:, np.newaxis] + + # _samme_proba calls estimator.predict_proba. + # Make a mock object so I can control what gets returned. + class MockEstimator: + def predict_proba(self, X): + assert_array_equal(X.shape, probs.shape) + return probs + + mock = MockEstimator() + + samme_proba = _samme_proba(mock, 3, np.ones_like(probs)) + + assert_array_equal(samme_proba.shape, probs.shape) + assert np.isfinite(samme_proba).all() + + # Make sure that the correct elements come out as smallest -- + # `_samme_proba` should preserve the ordering in each example. + assert_array_equal(np.argmin(samme_proba, axis=1), [2, 0, 0, 2]) + assert_array_equal(np.argmax(samme_proba, axis=1), [0, 1, 1, 1]) + + +def test_oneclass_adaboost_proba(): + # Test predict_proba robustness for one class label input. + # In response to issue #7501 + # https://github.com/scikit-learn/scikit-learn/issues/7501 + y_t = np.ones(len(X)) + clf = AdaBoostClassifier().fit(X, y_t) + assert_array_almost_equal(clf.predict_proba(X), np.ones((len(X), 1))) + + +def test_classification_toy(): + # Check classification on a toy dataset. + clf = AdaBoostClassifier(random_state=0) + clf.fit(X, y_class) + assert_array_equal(clf.predict(T), y_t_class) + assert_array_equal(np.unique(np.asarray(y_t_class)), clf.classes_) + assert clf.predict_proba(T).shape == (len(T), 2) + assert clf.decision_function(T).shape == (len(T),) + + +def test_regression_toy(): + # Check regression on a toy dataset. + clf = AdaBoostRegressor(random_state=0) + clf.fit(X, y_regr) + assert_array_equal(clf.predict(T), y_t_regr) + + +def test_iris(): + # Check consistency on dataset iris.
+ classes = np.unique(iris.target) + + clf = AdaBoostClassifier() + clf.fit(iris.data, iris.target) + + assert_array_equal(classes, clf.classes_) + proba = clf.predict_proba(iris.data) + + assert proba.shape[1] == len(classes) + assert clf.decision_function(iris.data).shape[1] == len(classes) + + score = clf.score(iris.data, iris.target) + assert score > 0.9, f"Failed with {score = }" + + # Check we used multiple estimators + assert len(clf.estimators_) > 1 + # Check for distinct random states (see issue #7408) + assert len(set(est.random_state for est in clf.estimators_)) == len(clf.estimators_) + + +@pytest.mark.parametrize("loss", ["linear", "square", "exponential"]) +def test_diabetes(loss): + # Check consistency on dataset diabetes. + reg = AdaBoostRegressor(loss=loss, random_state=0) + reg.fit(diabetes.data, diabetes.target) + score = reg.score(diabetes.data, diabetes.target) + assert score > 0.55 + + # Check we used multiple estimators + assert len(reg.estimators_) > 1 + # Check for distinct random states (see issue #7408) + assert len(set(est.random_state for est in reg.estimators_)) == len(reg.estimators_) + + +def test_staged_predict(): + # Check staged predictions. + rng = np.random.RandomState(0) + iris_weights = rng.randint(10, size=iris.target.shape) + diabetes_weights = rng.randint(10, size=diabetes.target.shape) + + clf = AdaBoostClassifier(n_estimators=10) + clf.fit(iris.data, iris.target, sample_weight=iris_weights) + + predictions = clf.predict(iris.data) + staged_predictions = [p for p in clf.staged_predict(iris.data)] + proba = clf.predict_proba(iris.data) + staged_probas = [p for p in clf.staged_predict_proba(iris.data)] + score = clf.score(iris.data, iris.target, sample_weight=iris_weights) + staged_scores = [ + s for s in clf.staged_score(iris.data, iris.target, sample_weight=iris_weights) + ] + + assert len(staged_predictions) == 10 + assert_array_almost_equal(predictions, staged_predictions[-1]) + assert len(staged_probas) == 10 + assert_array_almost_equal(proba, staged_probas[-1]) + assert len(staged_scores) == 10 + assert_array_almost_equal(score, staged_scores[-1]) + + # AdaBoost regression + clf = AdaBoostRegressor(n_estimators=10, random_state=0) + clf.fit(diabetes.data, diabetes.target, sample_weight=diabetes_weights) + + predictions = clf.predict(diabetes.data) + staged_predictions = [p for p in clf.staged_predict(diabetes.data)] + score = clf.score(diabetes.data, diabetes.target, sample_weight=diabetes_weights) + staged_scores = [ + s + for s in clf.staged_score( + diabetes.data, diabetes.target, sample_weight=diabetes_weights + ) + ] + + assert len(staged_predictions) == 10 + assert_array_almost_equal(predictions, staged_predictions[-1]) + assert len(staged_scores) == 10 + assert_array_almost_equal(score, staged_scores[-1]) + + +def test_gridsearch(): + # Check that base trees can be grid-searched. + # AdaBoost classification + boost = AdaBoostClassifier(estimator=DecisionTreeClassifier()) + parameters = { + "n_estimators": (1, 2), + "estimator__max_depth": (1, 2), + } + clf = GridSearchCV(boost, parameters) + clf.fit(iris.data, iris.target) + + # AdaBoost regression + boost = AdaBoostRegressor(estimator=DecisionTreeRegressor(), random_state=0) + parameters = {"n_estimators": (1, 2), "estimator__max_depth": (1, 2)} + clf = GridSearchCV(boost, parameters) + clf.fit(diabetes.data, diabetes.target) + + +def test_pickle(): + # Check pickability. 
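+ # Round-trip fitted AdaBoost estimators through pickle.dumps/pickle.loads
+ # and check that the reloaded object has the same class and reproduces the
+ # exact training score.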
+ import pickle + + # Adaboost classifier + obj = AdaBoostClassifier() + obj.fit(iris.data, iris.target) + score = obj.score(iris.data, iris.target) + s = pickle.dumps(obj) + + obj2 = pickle.loads(s) + assert type(obj2) == obj.__class__ + score2 = obj2.score(iris.data, iris.target) + assert score == score2 + + # Adaboost regressor + obj = AdaBoostRegressor(random_state=0) + obj.fit(diabetes.data, diabetes.target) + score = obj.score(diabetes.data, diabetes.target) + s = pickle.dumps(obj) + + obj2 = pickle.loads(s) + assert type(obj2) == obj.__class__ + score2 = obj2.score(diabetes.data, diabetes.target) + assert score == score2 + + +def test_importances(): + # Check variable importances. + X, y = datasets.make_classification( + n_samples=2000, + n_features=10, + n_informative=3, + n_redundant=0, + n_repeated=0, + shuffle=False, + random_state=1, + ) + + clf = AdaBoostClassifier() + + clf.fit(X, y) + importances = clf.feature_importances_ + + assert importances.shape[0] == 10 + assert (importances[:3, np.newaxis] >= importances[3:]).all() + + +def test_adaboost_classifier_sample_weight_error(): + # Test that it gives proper exception on incorrect sample weight. + clf = AdaBoostClassifier() + msg = re.escape("sample_weight.shape == (1,), expected (6,)") + with pytest.raises(ValueError, match=msg): + clf.fit(X, y_class, sample_weight=np.asarray([-1])) + + +def test_estimator(): + # Test different estimators. + from sklearn.ensemble import RandomForestClassifier + + # XXX doesn't work with y_class because RF doesn't support classes_ + # Shouldn't AdaBoost run a LabelBinarizer? + clf = AdaBoostClassifier(RandomForestClassifier()) + clf.fit(X, y_regr) + + clf = AdaBoostClassifier(SVC()) + clf.fit(X, y_class) + + from sklearn.ensemble import RandomForestRegressor + + clf = AdaBoostRegressor(RandomForestRegressor(), random_state=0) + clf.fit(X, y_regr) + + clf = AdaBoostRegressor(SVR(), random_state=0) + clf.fit(X, y_regr) + + # Check that an empty discrete ensemble fails in fit, not predict. + X_fail = [[1, 1], [1, 1], [1, 1], [1, 1]] + y_fail = ["foo", "bar", 1, 2] + clf = AdaBoostClassifier(SVC()) + with pytest.raises(ValueError, match="worse than random"): + clf.fit(X_fail, y_fail) + + +def test_sample_weights_infinite(): + msg = "Sample weights have reached infinite values" + clf = AdaBoostClassifier(n_estimators=30, learning_rate=23.0) + with pytest.warns(UserWarning, match=msg): + clf.fit(iris.data, iris.target) + + +@pytest.mark.parametrize( + "sparse_container, expected_internal_type", + zip( + [ + *CSC_CONTAINERS, + *CSR_CONTAINERS, + *LIL_CONTAINERS, + *COO_CONTAINERS, + *DOK_CONTAINERS, + ], + CSC_CONTAINERS + 4 * CSR_CONTAINERS, + ), +) +def test_sparse_classification(sparse_container, expected_internal_type): + # Check classification with sparse input. 
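+ # Strategy: a small SVC subclass records type(X) seen during fit; two
+ # AdaBoost models trained on sparse vs. dense inputs are then compared
+ # across predict, decision_function, predict_proba, score and their
+ # staged_* counterparts, and the recorded types confirm which sparse
+ # format the boosting loop actually used.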
+ + class CustomSVC(SVC): + """SVC variant that records the nature of the training set.""" + + def fit(self, X, y, sample_weight=None): + """Modification on fit caries data type for later verification.""" + super().fit(X, y, sample_weight=sample_weight) + self.data_type_ = type(X) + return self + + X, y = datasets.make_multilabel_classification( + n_classes=1, n_samples=15, n_features=5, random_state=42 + ) + # Flatten y to a 1d array + y = np.ravel(y) + + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + + X_train_sparse = sparse_container(X_train) + X_test_sparse = sparse_container(X_test) + + # Trained on sparse format + sparse_classifier = AdaBoostClassifier( + estimator=CustomSVC(probability=True), + random_state=1, + ).fit(X_train_sparse, y_train) + + # Trained on dense format + dense_classifier = AdaBoostClassifier( + estimator=CustomSVC(probability=True), + random_state=1, + ).fit(X_train, y_train) + + # predict + sparse_clf_results = sparse_classifier.predict(X_test_sparse) + dense_clf_results = dense_classifier.predict(X_test) + assert_array_equal(sparse_clf_results, dense_clf_results) + + # decision_function + sparse_clf_results = sparse_classifier.decision_function(X_test_sparse) + dense_clf_results = dense_classifier.decision_function(X_test) + assert_array_almost_equal(sparse_clf_results, dense_clf_results) + + # predict_log_proba + sparse_clf_results = sparse_classifier.predict_log_proba(X_test_sparse) + dense_clf_results = dense_classifier.predict_log_proba(X_test) + assert_array_almost_equal(sparse_clf_results, dense_clf_results) + + # predict_proba + sparse_clf_results = sparse_classifier.predict_proba(X_test_sparse) + dense_clf_results = dense_classifier.predict_proba(X_test) + assert_array_almost_equal(sparse_clf_results, dense_clf_results) + + # score + sparse_clf_results = sparse_classifier.score(X_test_sparse, y_test) + dense_clf_results = dense_classifier.score(X_test, y_test) + assert_array_almost_equal(sparse_clf_results, dense_clf_results) + + # staged_decision_function + sparse_clf_results = sparse_classifier.staged_decision_function(X_test_sparse) + dense_clf_results = dense_classifier.staged_decision_function(X_test) + for sparse_clf_res, dense_clf_res in zip(sparse_clf_results, dense_clf_results): + assert_array_almost_equal(sparse_clf_res, dense_clf_res) + + # staged_predict + sparse_clf_results = sparse_classifier.staged_predict(X_test_sparse) + dense_clf_results = dense_classifier.staged_predict(X_test) + for sparse_clf_res, dense_clf_res in zip(sparse_clf_results, dense_clf_results): + assert_array_equal(sparse_clf_res, dense_clf_res) + + # staged_predict_proba + sparse_clf_results = sparse_classifier.staged_predict_proba(X_test_sparse) + dense_clf_results = dense_classifier.staged_predict_proba(X_test) + for sparse_clf_res, dense_clf_res in zip(sparse_clf_results, dense_clf_results): + assert_array_almost_equal(sparse_clf_res, dense_clf_res) + + # staged_score + sparse_clf_results = sparse_classifier.staged_score(X_test_sparse, y_test) + dense_clf_results = dense_classifier.staged_score(X_test, y_test) + for sparse_clf_res, dense_clf_res in zip(sparse_clf_results, dense_clf_results): + assert_array_equal(sparse_clf_res, dense_clf_res) + + # Verify sparsity of data is maintained during training + types = [i.data_type_ for i in sparse_classifier.estimators_] + + assert all([t == expected_internal_type for t in types]) + + +@pytest.mark.parametrize( + "sparse_container, expected_internal_type", + zip( + [ + *CSC_CONTAINERS, + 
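+ # The enclosing zip pairs each input container with the format the
+ # boosting code is expected to see internally: CSC input stays CSC,
+ # while CSR, LIL, COO and DOK inputs are all converted to CSR (hence
+ # CSC_CONTAINERS + 4 * CSR_CONTAINERS on the other side of the zip).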
*CSR_CONTAINERS, + *LIL_CONTAINERS, + *COO_CONTAINERS, + *DOK_CONTAINERS, + ], + CSC_CONTAINERS + 4 * CSR_CONTAINERS, + ), +) +def test_sparse_regression(sparse_container, expected_internal_type): + # Check regression with sparse input. + + class CustomSVR(SVR): + """SVR variant that records the nature of the training set.""" + + def fit(self, X, y, sample_weight=None): + """Modification on fit caries data type for later verification.""" + super().fit(X, y, sample_weight=sample_weight) + self.data_type_ = type(X) + return self + + X, y = datasets.make_regression( + n_samples=15, n_features=50, n_targets=1, random_state=42 + ) + + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + + X_train_sparse = sparse_container(X_train) + X_test_sparse = sparse_container(X_test) + + # Trained on sparse format + sparse_regressor = AdaBoostRegressor(estimator=CustomSVR(), random_state=1).fit( + X_train_sparse, y_train + ) + + # Trained on dense format + dense_regressor = AdaBoostRegressor(estimator=CustomSVR(), random_state=1).fit( + X_train, y_train + ) + + # predict + sparse_regr_results = sparse_regressor.predict(X_test_sparse) + dense_regr_results = dense_regressor.predict(X_test) + assert_array_almost_equal(sparse_regr_results, dense_regr_results) + + # staged_predict + sparse_regr_results = sparse_regressor.staged_predict(X_test_sparse) + dense_regr_results = dense_regressor.staged_predict(X_test) + for sparse_regr_res, dense_regr_res in zip(sparse_regr_results, dense_regr_results): + assert_array_almost_equal(sparse_regr_res, dense_regr_res) + + types = [i.data_type_ for i in sparse_regressor.estimators_] + + assert all([t == expected_internal_type for t in types]) + + +def test_sample_weight_adaboost_regressor(): + """ + AdaBoostRegressor should work without sample_weights in the base estimator + The random weighted sampling is done internally in the _boost method in + AdaBoostRegressor. 
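+ The DummyEstimator used below therefore only needs to implement fit and
+ predict, without accepting a sample_weight argument.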
+ """ + + class DummyEstimator(BaseEstimator): + def fit(self, X, y): + pass + + def predict(self, X): + return np.zeros(X.shape[0]) + + boost = AdaBoostRegressor(DummyEstimator(), n_estimators=3) + boost.fit(X, y_regr) + assert len(boost.estimator_weights_) == len(boost.estimator_errors_) + + +def test_multidimensional_X(): + """ + Check that the AdaBoost estimators can work with n-dimensional + data matrix + """ + rng = np.random.RandomState(0) + + X = rng.randn(51, 3, 3) + yc = rng.choice([0, 1], 51) + yr = rng.randn(51) + + boost = AdaBoostClassifier(DummyClassifier(strategy="most_frequent")) + boost.fit(X, yc) + boost.predict(X) + boost.predict_proba(X) + + boost = AdaBoostRegressor(DummyRegressor()) + boost.fit(X, yr) + boost.predict(X) + + +def test_adaboostclassifier_without_sample_weight(): + X, y = iris.data, iris.target + estimator = NoSampleWeightWrapper(DummyClassifier()) + clf = AdaBoostClassifier(estimator=estimator) + err_msg = "{} doesn't support sample_weight".format(estimator.__class__.__name__) + with pytest.raises(ValueError, match=err_msg): + clf.fit(X, y) + + +def test_adaboostregressor_sample_weight(): + # check that giving weight will have an influence on the error computed + # for a weak learner + rng = np.random.RandomState(42) + X = np.linspace(0, 100, num=1000) + y = (0.8 * X + 0.2) + (rng.rand(X.shape[0]) * 0.0001) + X = X.reshape(-1, 1) + + # add an arbitrary outlier + X[-1] *= 10 + y[-1] = 10000 + + # random_state=0 ensure that the underlying bootstrap will use the outlier + regr_no_outlier = AdaBoostRegressor( + estimator=LinearRegression(), n_estimators=1, random_state=0 + ) + regr_with_weight = clone(regr_no_outlier) + regr_with_outlier = clone(regr_no_outlier) + + # fit 3 models: + # - a model containing the outlier + # - a model without the outlier + # - a model containing the outlier but with a null sample-weight + regr_with_outlier.fit(X, y) + regr_no_outlier.fit(X[:-1], y[:-1]) + sample_weight = np.ones_like(y) + sample_weight[-1] = 0 + regr_with_weight.fit(X, y, sample_weight=sample_weight) + + score_with_outlier = regr_with_outlier.score(X[:-1], y[:-1]) + score_no_outlier = regr_no_outlier.score(X[:-1], y[:-1]) + score_with_weight = regr_with_weight.score(X[:-1], y[:-1]) + + assert score_with_outlier < score_no_outlier + assert score_with_outlier < score_with_weight + assert score_no_outlier == pytest.approx(score_with_weight) + + +def test_adaboost_consistent_predict(): + # check that predict_proba and predict give consistent results + # regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/14084 + X_train, X_test, y_train, y_test = train_test_split( + *datasets.load_digits(return_X_y=True), random_state=42 + ) + model = AdaBoostClassifier(random_state=42) + model.fit(X_train, y_train) + + assert_array_equal( + np.argmax(model.predict_proba(X_test), axis=1), model.predict(X_test) + ) + + +@pytest.mark.parametrize( + "model, X, y", + [ + (AdaBoostClassifier(), iris.data, iris.target), + (AdaBoostRegressor(), diabetes.data, diabetes.target), + ], +) +def test_adaboost_negative_weight_error(model, X, y): + sample_weight = np.ones_like(y) + sample_weight[-1] = -10 + + err_msg = "Negative values in data passed to `sample_weight`" + with pytest.raises(ValueError, match=err_msg): + model.fit(X, y, sample_weight=sample_weight) + + +def test_adaboost_numerically_stable_feature_importance_with_small_weights(): + """Check that we don't create NaN feature importance with numerically + instable inputs. 
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/20320 + """ + rng = np.random.RandomState(42) + X = rng.normal(size=(1000, 10)) + y = rng.choice([0, 1], size=1000) + sample_weight = np.ones_like(y) * 1e-263 + tree = DecisionTreeClassifier(max_depth=10, random_state=12) + ada_model = AdaBoostClassifier(estimator=tree, n_estimators=20, random_state=12) + ada_model.fit(X, y, sample_weight=sample_weight) + assert np.isnan(ada_model.feature_importances_).sum() == 0 + + +def test_adaboost_decision_function(global_random_seed): + """Check that the decision function respects the symmetric constraint for weak + learners. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/26520 + """ + n_classes = 3 + X, y = datasets.make_classification( + n_classes=n_classes, n_clusters_per_class=1, random_state=global_random_seed + ) + clf = AdaBoostClassifier(n_estimators=1, random_state=global_random_seed).fit(X, y) + + y_score = clf.decision_function(X) + assert_allclose(y_score.sum(axis=1), 0, atol=1e-8) + + # With a single learner, we expect to have a decision function in + # {1, - 1 / (n_classes - 1)}. + assert set(np.unique(y_score)) == {1, -1 / (n_classes - 1)} + + # We can assert the same for staged_decision_function since we have a single learner + for y_score in clf.staged_decision_function(X): + assert_allclose(y_score.sum(axis=1), 0, atol=1e-8) + + # With a single learner, we expect to have a decision function in + # {1, - 1 / (n_classes - 1)}. + assert set(np.unique(y_score)) == {1, -1 / (n_classes - 1)} + + clf.set_params(n_estimators=5).fit(X, y) + + y_score = clf.decision_function(X) + assert_allclose(y_score.sum(axis=1), 0, atol=1e-8) + + for y_score in clf.staged_decision_function(X): + assert_allclose(y_score.sum(axis=1), 0, atol=1e-8) + + +# TODO(1.8): remove +def test_deprecated_algorithm(): + adaboost_clf = AdaBoostClassifier(n_estimators=1, algorithm="SAMME") + with pytest.warns(FutureWarning, match="The parameter 'algorithm' is deprecated"): + adaboost_clf.fit(X, y_class) diff --git a/.venv/lib/python3.12/site-packages/sklearn/experimental/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/experimental/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..593d247e5bc403056808dafa8fba9d511457fbd0 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/experimental/__init__.py @@ -0,0 +1,10 @@ +"""Importable modules that enable the use of experimental features or estimators. + +.. warning:: + + The features and estimators that are experimental aren't subject to + deprecation cycles. Use them at your own risks! +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause diff --git a/.venv/lib/python3.12/site-packages/sklearn/experimental/enable_halving_search_cv.py b/.venv/lib/python3.12/site-packages/sklearn/experimental/enable_halving_search_cv.py new file mode 100644 index 0000000000000000000000000000000000000000..85f93b26459d0c5e154dc9e7000e81d586cb701e --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/experimental/enable_halving_search_cv.py @@ -0,0 +1,35 @@ +"""Enables Successive Halving search-estimators + +The API and results of these estimators might change without any deprecation +cycle. 
+ +Importing this file dynamically sets the +:class:`~sklearn.model_selection.HalvingRandomSearchCV` and +:class:`~sklearn.model_selection.HalvingGridSearchCV` as attributes of the +`model_selection` module:: + + >>> # explicitly require this experimental feature + >>> from sklearn.experimental import enable_halving_search_cv # noqa + >>> # now you can import normally from model_selection + >>> from sklearn.model_selection import HalvingRandomSearchCV + >>> from sklearn.model_selection import HalvingGridSearchCV + + +The ``# noqa`` comment comment can be removed: it just tells linters like +flake8 to ignore the import, which appears as unused. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from .. import model_selection +from ..model_selection._search_successive_halving import ( + HalvingGridSearchCV, + HalvingRandomSearchCV, +) + +# use settattr to avoid mypy errors when monkeypatching +setattr(model_selection, "HalvingRandomSearchCV", HalvingRandomSearchCV) +setattr(model_selection, "HalvingGridSearchCV", HalvingGridSearchCV) + +model_selection.__all__ += ["HalvingRandomSearchCV", "HalvingGridSearchCV"] diff --git a/.venv/lib/python3.12/site-packages/sklearn/experimental/enable_hist_gradient_boosting.py b/.venv/lib/python3.12/site-packages/sklearn/experimental/enable_hist_gradient_boosting.py new file mode 100644 index 0000000000000000000000000000000000000000..589348fe9bc21de2ae642d51be152de7958be0b1 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/experimental/enable_hist_gradient_boosting.py @@ -0,0 +1,23 @@ +"""This is now a no-op and can be safely removed from your code. + +It used to enable the use of +:class:`~sklearn.ensemble.HistGradientBoostingClassifier` and +:class:`~sklearn.ensemble.HistGradientBoostingRegressor` when they were still +:term:`experimental`, but these estimators are now stable and can be imported +normally from `sklearn.ensemble`. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +# Don't remove this file, we don't want to break users code just because the +# feature isn't experimental anymore. + +import warnings + +warnings.warn( + "Since version 1.0, " + "it is not needed to import enable_hist_gradient_boosting anymore. " + "HistGradientBoostingClassifier and HistGradientBoostingRegressor are now " + "stable and can be normally imported from sklearn.ensemble." +) diff --git a/.venv/lib/python3.12/site-packages/sklearn/experimental/enable_iterative_imputer.py b/.venv/lib/python3.12/site-packages/sklearn/experimental/enable_iterative_imputer.py new file mode 100644 index 0000000000000000000000000000000000000000..544e0d60eea2863a4516da89d5af475d94f4aba3 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/experimental/enable_iterative_imputer.py @@ -0,0 +1,23 @@ +"""Enables IterativeImputer + +The API and results of this estimator might change without any deprecation +cycle. + +Importing this file dynamically sets :class:`~sklearn.impute.IterativeImputer` +as an attribute of the impute module:: + + >>> # explicitly require this experimental feature + >>> from sklearn.experimental import enable_iterative_imputer # noqa + >>> # now you can import normally from impute + >>> from sklearn.impute import IterativeImputer +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from .. 
import impute +from ..impute._iterative import IterativeImputer + +# use settattr to avoid mypy errors when monkeypatching +setattr(impute, "IterativeImputer", IterativeImputer) +impute.__all__ += ["IterativeImputer"] diff --git a/.venv/lib/python3.12/site-packages/sklearn/experimental/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/experimental/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py b/.venv/lib/python3.12/site-packages/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py new file mode 100644 index 0000000000000000000000000000000000000000..a247bfd3f64280cc338825c7695da9f9cb7688e0 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py @@ -0,0 +1,19 @@ +"""Tests for making sure experimental imports work as expected.""" + +import textwrap + +import pytest + +from sklearn.utils._testing import assert_run_python_script_without_output +from sklearn.utils.fixes import _IS_WASM + + +@pytest.mark.xfail(_IS_WASM, reason="cannot start subprocess") +def test_import_raises_warning(): + code = """ + import pytest + with pytest.warns(UserWarning, match="it is not needed to import"): + from sklearn.experimental import enable_hist_gradient_boosting # noqa + """ + pattern = "it is not needed to import enable_hist_gradient_boosting anymore" + assert_run_python_script_without_output(textwrap.dedent(code), pattern=pattern) diff --git a/.venv/lib/python3.12/site-packages/sklearn/experimental/tests/test_enable_iterative_imputer.py b/.venv/lib/python3.12/site-packages/sklearn/experimental/tests/test_enable_iterative_imputer.py new file mode 100644 index 0000000000000000000000000000000000000000..17e9dfa0d037612d639a0e070fff8fd432b526a2 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/experimental/tests/test_enable_iterative_imputer.py @@ -0,0 +1,51 @@ +"""Tests for making sure experimental imports work as expected.""" + +import textwrap + +import pytest + +from sklearn.utils._testing import assert_run_python_script_without_output +from sklearn.utils.fixes import _IS_WASM + + +@pytest.mark.xfail(_IS_WASM, reason="cannot start subprocess") +def test_imports_strategies(): + # Make sure different import strategies work or fail as expected. + + # Since Python caches the imported modules, we need to run a child process + # for every test case. Else, the tests would not be independent + # (manually removing the imports from the cache (sys.modules) is not + # recommended and can lead to many complications). 
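+ # Three scenarios are exercised below: importing the enabler and then
+ # IterativeImputer (works), importing sklearn.ensemble first and then the
+ # enabler (still works), and importing IterativeImputer without the
+ # enabler, which must raise an ImportError mentioning that the feature is
+ # experimental.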
+ pattern = "IterativeImputer is experimental" + good_import = """ + from sklearn.experimental import enable_iterative_imputer + from sklearn.impute import IterativeImputer + """ + assert_run_python_script_without_output( + textwrap.dedent(good_import), pattern=pattern + ) + + good_import_with_ensemble_first = """ + import sklearn.ensemble + from sklearn.experimental import enable_iterative_imputer + from sklearn.impute import IterativeImputer + """ + assert_run_python_script_without_output( + textwrap.dedent(good_import_with_ensemble_first), + pattern=pattern, + ) + + bad_imports = f""" + import pytest + + with pytest.raises(ImportError, match={pattern!r}): + from sklearn.impute import IterativeImputer + + import sklearn.experimental + with pytest.raises(ImportError, match={pattern!r}): + from sklearn.impute import IterativeImputer + """ + assert_run_python_script_without_output( + textwrap.dedent(bad_imports), + pattern=pattern, + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/experimental/tests/test_enable_successive_halving.py b/.venv/lib/python3.12/site-packages/sklearn/experimental/tests/test_enable_successive_halving.py new file mode 100644 index 0000000000000000000000000000000000000000..0ba273f94cc496550ab6b9a4d2b688a88d6fc43b --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/experimental/tests/test_enable_successive_halving.py @@ -0,0 +1,53 @@ +"""Tests for making sure experimental imports work as expected.""" + +import textwrap + +import pytest + +from sklearn.utils._testing import assert_run_python_script_without_output +from sklearn.utils.fixes import _IS_WASM + + +@pytest.mark.xfail(_IS_WASM, reason="cannot start subprocess") +def test_imports_strategies(): + # Make sure different import strategies work or fail as expected. + + # Since Python caches the imported modules, we need to run a child process + # for every test case. Else, the tests would not be independent + # (manually removing the imports from the cache (sys.modules) is not + # recommended and can lead to many complications). 
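+ # Same structure as the IterativeImputer test: the Halving*SearchCV
+ # classes are importable from sklearn.model_selection only after the
+ # enable_halving_search_cv import; otherwise an ImportError pointing to
+ # the experimental status is expected.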
+ pattern = "Halving(Grid|Random)SearchCV is experimental" + good_import = """ + from sklearn.experimental import enable_halving_search_cv + from sklearn.model_selection import HalvingGridSearchCV + from sklearn.model_selection import HalvingRandomSearchCV + """ + assert_run_python_script_without_output( + textwrap.dedent(good_import), pattern=pattern + ) + + good_import_with_model_selection_first = """ + import sklearn.model_selection + from sklearn.experimental import enable_halving_search_cv + from sklearn.model_selection import HalvingGridSearchCV + from sklearn.model_selection import HalvingRandomSearchCV + """ + assert_run_python_script_without_output( + textwrap.dedent(good_import_with_model_selection_first), + pattern=pattern, + ) + + bad_imports = f""" + import pytest + + with pytest.raises(ImportError, match={pattern!r}): + from sklearn.model_selection import HalvingGridSearchCV + + import sklearn.experimental + with pytest.raises(ImportError, match={pattern!r}): + from sklearn.model_selection import HalvingRandomSearchCV + """ + assert_run_python_script_without_output( + textwrap.dedent(bad_imports), + pattern=pattern, + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/externals/README b/.venv/lib/python3.12/site-packages/sklearn/externals/README new file mode 100644 index 0000000000000000000000000000000000000000..eef7ba7dd652e73413dad8ed1c6096dc4066d214 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/externals/README @@ -0,0 +1,7 @@ +This directory contains bundled external dependencies that are updated +every once in a while. + +Note for distribution packagers: if you want to remove the duplicated +code and depend on a packaged version, we suggest that you simply do a +symbolic link in this directory. + diff --git a/.venv/lib/python3.12/site-packages/sklearn/externals/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/externals/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..97cda1858d5655b4179183372d271299298c62be --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/externals/__init__.py @@ -0,0 +1,5 @@ + +""" +External, bundled dependencies. + +""" diff --git a/.venv/lib/python3.12/site-packages/sklearn/externals/_arff.py b/.venv/lib/python3.12/site-packages/sklearn/externals/_arff.py new file mode 100644 index 0000000000000000000000000000000000000000..7c9d51d0702ff5cbe70b80d405747e37a5e6cb1d --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/externals/_arff.py @@ -0,0 +1,1107 @@ +# ============================================================================= +# Federal University of Rio Grande do Sul (UFRGS) +# Connectionist Artificial Intelligence Laboratory (LIAC) +# Renato de Pontes Pereira - rppereira@inf.ufrgs.br +# ============================================================================= +# Copyright (c) 2011 Renato de Pontes Pereira, renato.ppontes at gmail dot com +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# ============================================================================= + +''' +The liac-arff module implements functions to read and write ARFF files in +Python. It was created in the Connectionist Artificial Intelligence Laboratory +(LIAC), which takes place at the Federal University of Rio Grande do Sul +(UFRGS), in Brazil. + +ARFF (Attribute-Relation File Format) is an file format specially created for +describe datasets which are commonly used for machine learning experiments and +software. This file format was created to be used in Weka, the best +representative software for machine learning automated experiments. + +An ARFF file can be divided into two sections: header and data. The Header +describes the metadata of the dataset, including a general description of the +dataset, its name and its attributes. The source below is an example of a +header section in a XOR dataset:: + + % + % XOR Dataset + % + % Created by Renato Pereira + % rppereira@inf.ufrgs.br + % http://inf.ufrgs.br/~rppereira + % + % + @RELATION XOR + + @ATTRIBUTE input1 REAL + @ATTRIBUTE input2 REAL + @ATTRIBUTE y REAL + +The Data section of an ARFF file describes the observations of the dataset, in +the case of XOR dataset:: + + @DATA + 0.0,0.0,0.0 + 0.0,1.0,1.0 + 1.0,0.0,1.0 + 1.0,1.0,0.0 + % + % + % + +Notice that several lines are starting with an ``%`` symbol, denoting a +comment, thus, lines with ``%`` at the beginning will be ignored, except by the +description part at the beginning of the file. The declarations ``@RELATION``, +``@ATTRIBUTE``, and ``@DATA`` are all case insensitive and obligatory. + +For more information and details about the ARFF file description, consult +http://www.cs.waikato.ac.nz/~ml/weka/arff.html + + +ARFF Files in Python +~~~~~~~~~~~~~~~~~~~~ + +This module uses built-ins python objects to represent a deserialized ARFF +file. A dictionary is used as the container of the data and metadata of ARFF, +and have the following keys: + +- **description**: (OPTIONAL) a string with the description of the dataset. +- **relation**: (OBLIGATORY) a string with the name of the dataset. +- **attributes**: (OBLIGATORY) a list of attributes with the following + template:: + + (attribute_name, attribute_type) + + the attribute_name is a string, and attribute_type must be an string + or a list of strings. +- **data**: (OBLIGATORY) a list of data instances. Each data instance must be + a list with values, depending on the attributes. + +The above keys must follow the case which were described, i.e., the keys are +case sensitive. The attribute type ``attribute_type`` must be one of these +strings (they are not case sensitive): ``NUMERIC``, ``INTEGER``, ``REAL`` or +``STRING``. For nominal attributes, the ``atribute_type`` must be a list of +strings. 
+ +In this format, the XOR dataset presented above can be represented as a python +object as:: + + xor_dataset = { + 'description': 'XOR Dataset', + 'relation': 'XOR', + 'attributes': [ + ('input1', 'REAL'), + ('input2', 'REAL'), + ('y', 'REAL'), + ], + 'data': [ + [0.0, 0.0, 0.0], + [0.0, 1.0, 1.0], + [1.0, 0.0, 1.0], + [1.0, 1.0, 0.0] + ] + } + + +Features +~~~~~~~~ + +This module provides several features, including: + +- Read and write ARFF files using python built-in structures, such dictionaries + and lists; +- Supports `scipy.sparse.coo `_ + and lists of dictionaries as used by SVMLight +- Supports the following attribute types: NUMERIC, REAL, INTEGER, STRING, and + NOMINAL; +- Has an interface similar to other built-in modules such as ``json``, or + ``zipfile``; +- Supports read and write the descriptions of files; +- Supports missing values and names with spaces; +- Supports unicode values and names; +- Fully compatible with Python 2.7+, Python 3.5+, pypy and pypy3; +- Under `MIT License `_ + +''' +__author__ = 'Renato de Pontes Pereira, Matthias Feurer, Joel Nothman' +__author_email__ = ('renato.ppontes@gmail.com, ' + 'feurerm@informatik.uni-freiburg.de, ' + 'joel.nothman@gmail.com') +__version__ = '2.4.0' + +import re +import csv +from typing import TYPE_CHECKING +from typing import Optional, List, Dict, Any, Iterator, Union, Tuple + +# CONSTANTS =================================================================== +_SIMPLE_TYPES = ['NUMERIC', 'REAL', 'INTEGER', 'STRING'] + +_TK_DESCRIPTION = '%' +_TK_COMMENT = '%' +_TK_RELATION = '@RELATION' +_TK_ATTRIBUTE = '@ATTRIBUTE' +_TK_DATA = '@DATA' + +_RE_RELATION = re.compile(r'^([^\{\}%,\s]*|\".*\"|\'.*\')$', re.UNICODE) +_RE_ATTRIBUTE = re.compile(r'^(\".*\"|\'.*\'|[^\{\}%,\s]*)\s+(.+)$', re.UNICODE) +_RE_QUOTE_CHARS = re.compile(r'["\'\\\s%,\000-\031]', re.UNICODE) +_RE_ESCAPE_CHARS = re.compile(r'(?=["\'\\%])|[\n\r\t\000-\031]') +_RE_SPARSE_LINE = re.compile(r'^\s*\{.*\}\s*$', re.UNICODE) +_RE_NONTRIVIAL_DATA = re.compile('["\'{}\\s]', re.UNICODE) + +ArffDenseDataType = Iterator[List] +ArffSparseDataType = Tuple[List, ...] + + +if TYPE_CHECKING: + # typing_extensions is available when mypy is installed + from typing_extensions import TypedDict + + class ArffContainerType(TypedDict): + description: str + relation: str + attributes: List + data: Union[ArffDenseDataType, ArffSparseDataType] + +else: + ArffContainerType = Dict[str, Any] + + +def _build_re_values(): + quoted_re = r''' + " # open quote followed by zero or more of: + (?: + (?= len(conversors): + raise BadDataFormat(row) + # XXX: int 0 is used for implicit values, not '0' + values = [values[i] if i in values else 0 for i in + range(len(conversors))] + else: + if len(values) != len(conversors): + raise BadDataFormat(row) + + yield self._decode_values(values, conversors) + + @staticmethod + def _decode_values(values, conversors): + try: + values = [None if value is None else conversor(value) + for conversor, value + in zip(conversors, values)] + except ValueError as exc: + if 'float: ' in str(exc): + raise BadNumericalValue() + return values + + def encode_data(self, data, attributes): + '''(INTERNAL) Encodes a line of data. + + Data instances follow the csv format, i.e, attribute values are + delimited by commas. After converted from csv. + + :param data: a list of values. + :param attributes: a list of attributes. Used to check if data is valid. + :return: a string with the encoded data line. 
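+ Missing values (None, an empty string or NaN) are encoded as '?'.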
+ ''' + current_row = 0 + + for inst in data: + if len(inst) != len(attributes): + raise BadObject( + 'Instance %d has %d attributes, expected %d' % + (current_row, len(inst), len(attributes)) + ) + + new_data = [] + for value in inst: + if value is None or value == '' or value != value: + s = '?' + else: + s = encode_string(str(value)) + new_data.append(s) + + current_row += 1 + yield ','.join(new_data) + + +class _DataListMixin: + """Mixin to return a list from decode_rows instead of a generator""" + def decode_rows(self, stream, conversors): + return list(super().decode_rows(stream, conversors)) + + +class Data(_DataListMixin, DenseGeneratorData): + pass + + +class COOData: + def decode_rows(self, stream, conversors): + data, rows, cols = [], [], [] + for i, row in enumerate(stream): + values = _parse_values(row) + if not isinstance(values, dict): + raise BadLayout() + if not values: + continue + row_cols, values = zip(*sorted(values.items())) + try: + values = [value if value is None else conversors[key](value) + for key, value in zip(row_cols, values)] + except ValueError as exc: + if 'float: ' in str(exc): + raise BadNumericalValue() + raise + except IndexError: + # conversor out of range + raise BadDataFormat(row) + + data.extend(values) + rows.extend([i] * len(values)) + cols.extend(row_cols) + + return data, rows, cols + + def encode_data(self, data, attributes): + num_attributes = len(attributes) + new_data = [] + current_row = 0 + + row = data.row + col = data.col + data = data.data + + # Check if the rows are sorted + if not all(row[i] <= row[i + 1] for i in range(len(row) - 1)): + raise ValueError("liac-arff can only output COO matrices with " + "sorted rows.") + + for v, col, row in zip(data, col, row): + if row > current_row: + # Add empty rows if necessary + while current_row < row: + yield " ".join(["{", ','.join(new_data), "}"]) + new_data = [] + current_row += 1 + + if col >= num_attributes: + raise BadObject( + 'Instance %d has at least %d attributes, expected %d' % + (current_row, col + 1, num_attributes) + ) + + if v is None or v == '' or v != v: + s = '?' + else: + s = encode_string(str(v)) + new_data.append("%d %s" % (col, s)) + + yield " ".join(["{", ','.join(new_data), "}"]) + +class LODGeneratorData: + def decode_rows(self, stream, conversors): + for row in stream: + values = _parse_values(row) + + if not isinstance(values, dict): + raise BadLayout() + try: + yield {key: None if value is None else conversors[key](value) + for key, value in values.items()} + except ValueError as exc: + if 'float: ' in str(exc): + raise BadNumericalValue() + raise + except IndexError: + # conversor out of range + raise BadDataFormat(row) + + def encode_data(self, data, attributes): + current_row = 0 + + num_attributes = len(attributes) + for row in data: + new_data = [] + + if len(row) > 0 and max(row) >= num_attributes: + raise BadObject( + 'Instance %d has %d attributes, expected %d' % + (current_row, max(row) + 1, num_attributes) + ) + + for col in sorted(row): + v = row[col] + if v is None or v == '' or v != v: + s = '?' 
+ else: + s = encode_string(str(v)) + new_data.append("%d %s" % (col, s)) + + current_row += 1 + yield " ".join(["{", ','.join(new_data), "}"]) + +class LODData(_DataListMixin, LODGeneratorData): + pass + + +def _get_data_object_for_decoding(matrix_type): + if matrix_type == DENSE: + return Data() + elif matrix_type == COO: + return COOData() + elif matrix_type == LOD: + return LODData() + elif matrix_type == DENSE_GEN: + return DenseGeneratorData() + elif matrix_type == LOD_GEN: + return LODGeneratorData() + else: + raise ValueError("Matrix type %s not supported." % str(matrix_type)) + +def _get_data_object_for_encoding(matrix): + # Probably a scipy.sparse + if hasattr(matrix, 'format'): + if matrix.format == 'coo': + return COOData() + else: + raise ValueError('Cannot guess matrix format!') + elif isinstance(matrix[0], dict): + return LODData() + else: + return Data() + +# ============================================================================= + +# ADVANCED INTERFACE ========================================================== +class ArffDecoder: + '''An ARFF decoder.''' + + def __init__(self): + '''Constructor.''' + self._conversors = [] + self._current_line = 0 + + def _decode_comment(self, s): + '''(INTERNAL) Decodes a comment line. + + Comments are single line strings starting, obligatorily, with the ``%`` + character, and can have any symbol, including whitespaces or special + characters. + + This method must receive a normalized string, i.e., a string without + padding, including the "\r\n" characters. + + :param s: a normalized string. + :return: a string with the decoded comment. + ''' + res = re.sub(r'^\%( )?', '', s) + return res + + def _decode_relation(self, s): + '''(INTERNAL) Decodes a relation line. + + The relation declaration is a line with the format ``@RELATION + ``, where ``relation-name`` is a string. The string must + start with alphabetic character and must be quoted if the name includes + spaces, otherwise this method will raise a `BadRelationFormat` exception. + + This method must receive a normalized string, i.e., a string without + padding, including the "\r\n" characters. + + :param s: a normalized string. + :return: a string with the decoded relation name. + ''' + _, v = s.split(' ', 1) + v = v.strip() + + if not _RE_RELATION.match(v): + raise BadRelationFormat() + + res = str(v.strip('"\'')) + return res + + def _decode_attribute(self, s): + '''(INTERNAL) Decodes an attribute line. + + The attribute is the most complex declaration in an arff file. All + attributes must follow the template:: + + @attribute + + where ``attribute-name`` is a string, quoted if the name contains any + whitespace, and ``datatype`` can be: + + - Numerical attributes as ``NUMERIC``, ``INTEGER`` or ``REAL``. + - Strings as ``STRING``. + - Dates (NOT IMPLEMENTED). + - Nominal attributes with format: + + {, , , ...} + + The nominal names follow the rules for the attribute names, i.e., they + must be quoted if the name contains whitespaces. + + This method must receive a normalized string, i.e., a string without + padding, including the "\r\n" characters. + + :param s: a normalized string. + :return: a tuple (ATTRIBUTE_NAME, TYPE_OR_VALUES). 
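+ For simple types the second element is the upper-cased type name
+ (e.g. 'NUMERIC'); for nominal attributes it is the list of allowed values.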
+ ''' + _, v = s.split(' ', 1) + v = v.strip() + + # Verify the general structure of declaration + m = _RE_ATTRIBUTE.match(v) + if not m: + raise BadAttributeFormat() + + # Extracts the raw name and type + name, type_ = m.groups() + + # Extracts the final name + name = str(name.strip('"\'')) + + # Extracts the final type + if type_[:1] == "{" and type_[-1:] == "}": + try: + type_ = _parse_values(type_.strip('{} ')) + except Exception: + raise BadAttributeType() + if isinstance(type_, dict): + raise BadAttributeType() + + else: + # If not nominal, verify the type name + type_ = str(type_).upper() + if type_ not in ['NUMERIC', 'REAL', 'INTEGER', 'STRING']: + raise BadAttributeType() + + return (name, type_) + + def _decode(self, s, encode_nominal=False, matrix_type=DENSE): + '''Do the job the ``encode``.''' + + # Make sure this method is idempotent + self._current_line = 0 + + # If string, convert to a list of lines + if isinstance(s, str): + s = s.strip('\r\n ').replace('\r\n', '\n').split('\n') + + # Create the return object + obj: ArffContainerType = { + 'description': '', + 'relation': '', + 'attributes': [], + 'data': [] + } + attribute_names = {} + + # Create the data helper object + data = _get_data_object_for_decoding(matrix_type) + + # Read all lines + STATE = _TK_DESCRIPTION + s = iter(s) + for row in s: + self._current_line += 1 + # Ignore empty lines + row = row.strip(' \r\n') + if not row: continue + + u_row = row.upper() + + # DESCRIPTION ----------------------------------------------------- + if u_row.startswith(_TK_DESCRIPTION) and STATE == _TK_DESCRIPTION: + obj['description'] += self._decode_comment(row) + '\n' + # ----------------------------------------------------------------- + + # RELATION -------------------------------------------------------- + elif u_row.startswith(_TK_RELATION): + if STATE != _TK_DESCRIPTION: + raise BadLayout() + + STATE = _TK_RELATION + obj['relation'] = self._decode_relation(row) + # ----------------------------------------------------------------- + + # ATTRIBUTE ------------------------------------------------------- + elif u_row.startswith(_TK_ATTRIBUTE): + if STATE != _TK_RELATION and STATE != _TK_ATTRIBUTE: + raise BadLayout() + + STATE = _TK_ATTRIBUTE + + attr = self._decode_attribute(row) + if attr[0] in attribute_names: + raise BadAttributeName(attr[0], attribute_names[attr[0]]) + else: + attribute_names[attr[0]] = self._current_line + obj['attributes'].append(attr) + + if isinstance(attr[1], (list, tuple)): + if encode_nominal: + conversor = EncodedNominalConversor(attr[1]) + else: + conversor = NominalConversor(attr[1]) + else: + CONVERSOR_MAP = {'STRING': str, + 'INTEGER': lambda x: int(float(x)), + 'NUMERIC': float, + 'REAL': float} + conversor = CONVERSOR_MAP[attr[1]] + + self._conversors.append(conversor) + # ----------------------------------------------------------------- + + # DATA ------------------------------------------------------------ + elif u_row.startswith(_TK_DATA): + if STATE != _TK_ATTRIBUTE: + raise BadLayout() + + break + # ----------------------------------------------------------------- + + # COMMENT --------------------------------------------------------- + elif u_row.startswith(_TK_COMMENT): + pass + # ----------------------------------------------------------------- + else: + # Never found @DATA + raise BadLayout() + + def stream(): + for row in s: + self._current_line += 1 + row = row.strip() + # Ignore empty lines and comment lines. 
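+ # The generator keeps consuming the same iterator as the header parsing
+ # loop above, so self._current_line stays accurate for error reporting
+ # in decode().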
+ if row and not row.startswith(_TK_COMMENT): + yield row + + # Alter the data object + obj['data'] = data.decode_rows(stream(), self._conversors) + if obj['description'].endswith('\n'): + obj['description'] = obj['description'][:-1] + + return obj + + def decode(self, s, encode_nominal=False, return_type=DENSE): + '''Returns the Python representation of a given ARFF file. + + When a file object is passed as an argument, this method reads lines + iteratively, avoiding to load unnecessary information to the memory. + + :param s: a string or file object with the ARFF file. + :param encode_nominal: boolean, if True perform a label encoding + while reading the .arff file. + :param return_type: determines the data structure used to store the + dataset. Can be one of `arff.DENSE`, `arff.COO`, `arff.LOD`, + `arff.DENSE_GEN` or `arff.LOD_GEN`. + Consult the sections on `working with sparse data`_ and `loading + progressively`_. + ''' + try: + return self._decode(s, encode_nominal=encode_nominal, + matrix_type=return_type) + except ArffException as e: + e.line = self._current_line + raise e + + +class ArffEncoder: + '''An ARFF encoder.''' + + def _encode_comment(self, s=''): + '''(INTERNAL) Encodes a comment line. + + Comments are single line strings starting, obligatorily, with the ``%`` + character, and can have any symbol, including whitespaces or special + characters. + + If ``s`` is None, this method will simply return an empty comment. + + :param s: (OPTIONAL) string. + :return: a string with the encoded comment line. + ''' + if s: + return '%s %s'%(_TK_COMMENT, s) + else: + return '%s' % _TK_COMMENT + + def _encode_relation(self, name): + '''(INTERNAL) Decodes a relation line. + + The relation declaration is a line with the format ``@RELATION + ``, where ``relation-name`` is a string. + + :param name: a string. + :return: a string with the encoded relation declaration. + ''' + for char in ' %{},': + if char in name: + name = '"%s"'%name + break + + return '%s %s'%(_TK_RELATION, name) + + def _encode_attribute(self, name, type_): + '''(INTERNAL) Encodes an attribute line. + + The attribute follow the template:: + + @attribute + + where ``attribute-name`` is a string, and ``datatype`` can be: + + - Numerical attributes as ``NUMERIC``, ``INTEGER`` or ``REAL``. + - Strings as ``STRING``. + - Dates (NOT IMPLEMENTED). + - Nominal attributes with format: + + {, , , ...} + + This method must receive a the name of the attribute and its type, if + the attribute type is nominal, ``type`` must be a list of values. + + :param name: a string. + :param type_: a string or a list of string. + :return: a string with the encoded attribute declaration. + ''' + for char in ' %{},': + if char in name: + name = '"%s"'%name + break + + if isinstance(type_, (tuple, list)): + type_tmp = ['%s' % encode_string(type_k) for type_k in type_] + type_ = '{%s}'%(', '.join(type_tmp)) + + return '%s %s %s'%(_TK_ATTRIBUTE, name, type_) + + def encode(self, obj): + '''Encodes a given object to an ARFF file. + + :param obj: the object containing the ARFF information. + :return: the ARFF file as an string. + ''' + data = [row for row in self.iter_encode(obj)] + + return '\n'.join(data) + + def iter_encode(self, obj): + '''The iterative version of `arff.ArffEncoder.encode`. + + This encodes iteratively a given object and return, one-by-one, the + lines of the ARFF file. + + :param obj: the object containing the ARFF information. + :return: (yields) the ARFF file as strings. 
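+ Lines are yielded in ARFF order: the description comments, the
+ @RELATION declaration, one @ATTRIBUTE line per attribute, the @DATA
+ marker and finally one encoded line per data row.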
+ ''' + # DESCRIPTION + if obj.get('description', None): + for row in obj['description'].split('\n'): + yield self._encode_comment(row) + + # RELATION + if not obj.get('relation'): + raise BadObject('Relation name not found or with invalid value.') + + yield self._encode_relation(obj['relation']) + yield '' + + # ATTRIBUTES + if not obj.get('attributes'): + raise BadObject('Attributes not found.') + + attribute_names = set() + for attr in obj['attributes']: + # Verify for bad object format + if not isinstance(attr, (tuple, list)) or \ + len(attr) != 2 or \ + not isinstance(attr[0], str): + raise BadObject('Invalid attribute declaration "%s"'%str(attr)) + + if isinstance(attr[1], str): + # Verify for invalid types + if attr[1] not in _SIMPLE_TYPES: + raise BadObject('Invalid attribute type "%s"'%str(attr)) + + # Verify for bad object format + elif not isinstance(attr[1], (tuple, list)): + raise BadObject('Invalid attribute type "%s"'%str(attr)) + + # Verify attribute name is not used twice + if attr[0] in attribute_names: + raise BadObject('Trying to use attribute name "%s" for the ' + 'second time.' % str(attr[0])) + else: + attribute_names.add(attr[0]) + + yield self._encode_attribute(attr[0], attr[1]) + yield '' + attributes = obj['attributes'] + + # DATA + yield _TK_DATA + if 'data' in obj: + data = _get_data_object_for_encoding(obj.get('data')) + yield from data.encode_data(obj.get('data'), attributes) + + yield '' + +# ============================================================================= + +# BASIC INTERFACE ============================================================= +def load(fp, encode_nominal=False, return_type=DENSE): + '''Load a file-like object containing the ARFF document and convert it into + a Python object. + + :param fp: a file-like object. + :param encode_nominal: boolean, if True perform a label encoding + while reading the .arff file. + :param return_type: determines the data structure used to store the + dataset. Can be one of `arff.DENSE`, `arff.COO`, `arff.LOD`, + `arff.DENSE_GEN` or `arff.LOD_GEN`. + Consult the sections on `working with sparse data`_ and `loading + progressively`_. + :return: a dictionary. + ''' + decoder = ArffDecoder() + return decoder.decode(fp, encode_nominal=encode_nominal, + return_type=return_type) + +def loads(s, encode_nominal=False, return_type=DENSE): + '''Convert a string instance containing the ARFF document into a Python + object. + + :param s: a string object. + :param encode_nominal: boolean, if True perform a label encoding + while reading the .arff file. + :param return_type: determines the data structure used to store the + dataset. Can be one of `arff.DENSE`, `arff.COO`, `arff.LOD`, + `arff.DENSE_GEN` or `arff.LOD_GEN`. + Consult the sections on `working with sparse data`_ and `loading + progressively`_. + :return: a dictionary. + ''' + decoder = ArffDecoder() + return decoder.decode(s, encode_nominal=encode_nominal, + return_type=return_type) + +def dump(obj, fp): + '''Serialize an object representing the ARFF document to a given file-like + object. + + :param obj: a dictionary. + :param fp: a file-like object. + ''' + encoder = ArffEncoder() + generator = encoder.iter_encode(obj) + + last_row = next(generator) + for row in generator: + fp.write(last_row + '\n') + last_row = row + fp.write(last_row) + + return fp + +def dumps(obj): + '''Serialize an object representing the ARFF document, returning a string. + + :param obj: a dictionary. + :return: a string with the ARFF document. 
+ ''' + encoder = ArffEncoder() + return encoder.encode(obj) +# ============================================================================= diff --git a/.venv/lib/python3.12/site-packages/sklearn/externals/_array_api_compat_vendor.py b/.venv/lib/python3.12/site-packages/sklearn/externals/_array_api_compat_vendor.py new file mode 100644 index 0000000000000000000000000000000000000000..38cefd2fe6f3f51cb76caa0137eef1af927b9e45 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/externals/_array_api_compat_vendor.py @@ -0,0 +1,5 @@ +# DO NOT RENAME THIS FILE +# This is a hook for array_api_extra/_lib/_compat.py +# to co-vendor array_api_compat and potentially override its functions. + +from .array_api_compat import * # noqa: F403 diff --git a/.venv/lib/python3.12/site-packages/sklearn/externals/conftest.py b/.venv/lib/python3.12/site-packages/sklearn/externals/conftest.py new file mode 100644 index 0000000000000000000000000000000000000000..c763d9761a438dca43e5856d6eaf9747cdeed2bb --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/externals/conftest.py @@ -0,0 +1,6 @@ +# Do not collect any tests in externals. This is more robust than using +# --ignore because --ignore needs a path and it is not convenient to pass in +# the externals path (very long install-dependent path in site-packages) when +# using --pyargs +def pytest_ignore_collect(collection_path, config): + return True diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0f8c53b4ffb6b5c0784743e414d6053ca0ddfa65 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/__init__.py @@ -0,0 +1,18 @@ +"""Feature extraction from raw data.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from . import image, text +from ._dict_vectorizer import DictVectorizer +from ._hash import FeatureHasher +from .image import grid_to_graph, img_to_graph + +__all__ = [ + "DictVectorizer", + "FeatureHasher", + "grid_to_graph", + "image", + "img_to_graph", + "text", +] diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/_dict_vectorizer.py b/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/_dict_vectorizer.py new file mode 100644 index 0000000000000000000000000000000000000000..689146bd229d83b463511a6578a4dab9bec7fa72 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/_dict_vectorizer.py @@ -0,0 +1,459 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from array import array +from collections.abc import Iterable, Mapping +from numbers import Number +from operator import itemgetter + +import numpy as np +import scipy.sparse as sp + +from sklearn.utils import metadata_routing + +from ..base import BaseEstimator, TransformerMixin, _fit_context +from ..utils import check_array +from ..utils.validation import check_is_fitted + + +class DictVectorizer(TransformerMixin, BaseEstimator): + """Transforms lists of feature-value mappings to vectors. + + This transformer turns lists of mappings (dict-like objects) of feature + names to feature values into Numpy arrays or scipy.sparse matrices for use + with scikit-learn estimators. 
+ + When feature values are strings, this transformer will do a binary one-hot + (aka one-of-K) coding: one boolean-valued feature is constructed for each + of the possible string values that the feature can take on. For instance, + a feature "f" that can take on the values "ham" and "spam" will become two + features in the output, one signifying "f=ham", the other "f=spam". + + If a feature value is a sequence or set of strings, this transformer + will iterate over the values and will count the occurrences of each string + value. + + However, note that this transformer will only do a binary one-hot encoding + when feature values are of type string. If categorical features are + represented as numeric values such as int or iterables of strings, the + DictVectorizer can be followed by + :class:`~sklearn.preprocessing.OneHotEncoder` to complete + binary one-hot encoding. + + Features that do not occur in a sample (mapping) will have a zero value + in the resulting array/matrix. + + For an efficiency comparison of the different feature extractors, see + :ref:`sphx_glr_auto_examples_text_plot_hashing_vs_dict_vectorizer.py`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + dtype : dtype, default=np.float64 + The type of feature values. Passed to Numpy array/scipy.sparse matrix + constructors as the dtype argument. + separator : str, default="=" + Separator string used when constructing new features for one-hot + coding. + sparse : bool, default=True + Whether transform should produce scipy.sparse matrices. + sort : bool, default=True + Whether ``feature_names_`` and ``vocabulary_`` should be + sorted when fitting. + + Attributes + ---------- + vocabulary_ : dict + A dictionary mapping feature names to feature indices. + + feature_names_ : list + A list of length n_features containing the feature names (e.g., "f=ham" + and "f=spam"). + + See Also + -------- + FeatureHasher : Performs vectorization using only a hash function. + sklearn.preprocessing.OrdinalEncoder : Handles nominal/categorical + features encoded as columns of arbitrary data types. + + Examples + -------- + >>> from sklearn.feature_extraction import DictVectorizer + >>> v = DictVectorizer(sparse=False) + >>> D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}] + >>> X = v.fit_transform(D) + >>> X + array([[2., 0., 1.], + [0., 1., 3.]]) + >>> v.inverse_transform(X) == [{'bar': 2.0, 'foo': 1.0}, + ... {'baz': 1.0, 'foo': 3.0}] + True + >>> v.transform({'foo': 4, 'unseen_feature': 3}) + array([[0., 0., 4.]]) + """ + + # This isn't something that people should be routing / using in a pipeline. + __metadata_request__inverse_transform = {"dict_type": metadata_routing.UNUSED} + + _parameter_constraints: dict = { + "dtype": "no_validation", # validation delegated to numpy, + "separator": [str], + "sparse": ["boolean"], + "sort": ["boolean"], + } + + def __init__(self, *, dtype=np.float64, separator="=", sparse=True, sort=True): + self.dtype = dtype + self.separator = separator + self.sparse = sparse + self.sort = sort + + def _add_iterable_element( + self, + f, + v, + feature_names, + vocab, + *, + fitting=True, + transforming=False, + indices=None, + values=None, + ): + """Add feature names for iterable of strings""" + for vv in v: + if isinstance(vv, str): + feature_name = "%s%s%s" % (f, self.separator, vv) + vv = 1 + else: + raise TypeError( + f"Unsupported type {type(vv)} in iterable " + "value. Only iterables of string are " + "supported." 
+ ) + if fitting and feature_name not in vocab: + vocab[feature_name] = len(feature_names) + feature_names.append(feature_name) + + if transforming and feature_name in vocab: + indices.append(vocab[feature_name]) + values.append(self.dtype(vv)) + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Learn a list of feature name -> indices mappings. + + Parameters + ---------- + X : Mapping or iterable over Mappings + Dict(s) or Mapping(s) from feature names (arbitrary Python + objects) to feature values (strings or convertible to dtype). + + .. versionchanged:: 0.24 + Accepts multiple string values for one categorical feature. + + y : (ignored) + Ignored parameter. + + Returns + ------- + self : object + DictVectorizer class instance. + """ + feature_names = [] + vocab = {} + + for x in X: + for f, v in x.items(): + if isinstance(v, str): + feature_name = "%s%s%s" % (f, self.separator, v) + elif isinstance(v, Number) or (v is None): + feature_name = f + elif isinstance(v, Mapping): + raise TypeError( + f"Unsupported value type {type(v)} " + f"for {f}: {v}.\n" + "Mapping objects are not supported." + ) + elif isinstance(v, Iterable): + feature_name = None + self._add_iterable_element(f, v, feature_names, vocab) + + if feature_name is not None: + if feature_name not in vocab: + vocab[feature_name] = len(feature_names) + feature_names.append(feature_name) + + if self.sort: + feature_names.sort() + vocab = {f: i for i, f in enumerate(feature_names)} + + self.feature_names_ = feature_names + self.vocabulary_ = vocab + + return self + + def _transform(self, X, fitting): + # Sanity check: Python's array has no way of explicitly requesting the + # signed 32-bit integers that scipy.sparse needs, so we use the next + # best thing: typecode "i" (int). However, if that gives larger or + # smaller integers than 32-bit ones, np.frombuffer screws up. + assert array("i").itemsize == 4, ( + "sizeof(int) != 4 on your platform; please report this at" + " https://github.com/scikit-learn/scikit-learn/issues and" + " include the output from platform.platform() in your bug report" + ) + + dtype = self.dtype + if fitting: + feature_names = [] + vocab = {} + else: + feature_names = self.feature_names_ + vocab = self.vocabulary_ + + transforming = True + + # Process everything as sparse regardless of setting + X = [X] if isinstance(X, Mapping) else X + + indices = array("i") + indptr = [0] + # XXX we could change values to an array.array as well, but it + # would require (heuristic) conversion of dtype to typecode... + values = [] + + # collect all the possible feature names and build sparse matrix at + # same time + for x in X: + for f, v in x.items(): + if isinstance(v, str): + feature_name = "%s%s%s" % (f, self.separator, v) + v = 1 + elif isinstance(v, Number) or (v is None): + feature_name = f + elif not isinstance(v, Mapping) and isinstance(v, Iterable): + feature_name = None + self._add_iterable_element( + f, + v, + feature_names, + vocab, + fitting=fitting, + transforming=transforming, + indices=indices, + values=values, + ) + else: + raise TypeError( + f"Unsupported value Type {type(v)} " + f"for {f}: {v}.\n" + f"{type(v)} objects are not supported." 
+ ) + + if feature_name is not None: + if fitting and feature_name not in vocab: + vocab[feature_name] = len(feature_names) + feature_names.append(feature_name) + + if feature_name in vocab: + indices.append(vocab[feature_name]) + values.append(self.dtype(v)) + + indptr.append(len(indices)) + + if len(indptr) == 1: + raise ValueError("Sample sequence X is empty.") + + indices = np.frombuffer(indices, dtype=np.intc) + shape = (len(indptr) - 1, len(vocab)) + + result_matrix = sp.csr_matrix( + (values, indices, indptr), shape=shape, dtype=dtype + ) + + # Sort everything if asked + if fitting and self.sort: + feature_names.sort() + map_index = np.empty(len(feature_names), dtype=np.int32) + for new_val, f in enumerate(feature_names): + map_index[new_val] = vocab[f] + vocab[f] = new_val + result_matrix = result_matrix[:, map_index] + + if self.sparse: + result_matrix.sort_indices() + else: + result_matrix = result_matrix.toarray() + + if fitting: + self.feature_names_ = feature_names + self.vocabulary_ = vocab + + return result_matrix + + @_fit_context(prefer_skip_nested_validation=True) + def fit_transform(self, X, y=None): + """Learn a list of feature name -> indices mappings and transform X. + + Like fit(X) followed by transform(X), but does not require + materializing X in memory. + + Parameters + ---------- + X : Mapping or iterable over Mappings + Dict(s) or Mapping(s) from feature names (arbitrary Python + objects) to feature values (strings or convertible to dtype). + + .. versionchanged:: 0.24 + Accepts multiple string values for one categorical feature. + + y : (ignored) + Ignored parameter. + + Returns + ------- + Xa : {array, sparse matrix} + Feature vectors; always 2-d. + """ + return self._transform(X, fitting=True) + + def inverse_transform(self, X, dict_type=dict): + """Transform array or sparse matrix X back to feature mappings. + + X must have been produced by this DictVectorizer's transform or + fit_transform method; it may only have passed through transformers + that preserve the number of features and their order. + + In the case of one-hot/one-of-K coding, the constructed feature + names and values are returned rather than the original ones. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Sample matrix. + dict_type : type, default=dict + Constructor for feature mappings. Must conform to the + collections.Mapping API. + + Returns + ------- + X_original : list of dict_type objects of shape (n_samples,) + Feature mappings for the samples in X. + """ + check_is_fitted(self, "feature_names_") + + # COO matrix is not subscriptable + X = check_array(X, accept_sparse=["csr", "csc"]) + n_samples = X.shape[0] + + names = self.feature_names_ + dicts = [dict_type() for _ in range(n_samples)] + + if sp.issparse(X): + for i, j in zip(*X.nonzero()): + dicts[i][names[j]] = X[i, j] + else: + for i, d in enumerate(dicts): + for j, v in enumerate(X[i, :]): + if v != 0: + d[names[j]] = X[i, j] + + return dicts + + def transform(self, X): + """Transform feature->value dicts to array or sparse matrix. + + Named features not encountered during fit or fit_transform will be + silently ignored. + + Parameters + ---------- + X : Mapping or iterable over Mappings of shape (n_samples,) + Dict(s) or Mapping(s) from feature names (arbitrary Python + objects) to feature values (strings or convertible to dtype). + + Returns + ------- + Xa : {array, sparse matrix} + Feature vectors; always 2-d. 
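The `_transform` assembly above is plain CSR construction: per-sample column indices and values are appended to flat buffers while `indptr` records where each row starts and ends. A stripped-down sketch of the same pattern (a toy vocabulary of my own, not scikit-learn internals), which also shows why features missing from the fitted vocabulary are silently dropped at transform time:

```python
import scipy.sparse as sp

vocab = {"bar": 0, "baz": 1, "foo": 2}                     # fitted feature -> column
samples = [{"foo": 1, "bar": 2},
           {"foo": 3, "baz": 1, "unseen": 7}]              # "unseen" is not in vocab

indices, values, indptr = [], [], [0]
for x in samples:
    for f, v in x.items():
        if f in vocab:                                     # unseen features are silently skipped
            indices.append(vocab[f])
            values.append(float(v))
    indptr.append(len(indices))                            # close the current row

X = sp.csr_matrix((values, indices, indptr), shape=(len(samples), len(vocab)))
print(X.toarray())
# [[2. 0. 1.]
#  [0. 1. 3.]]
```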
+ """ + check_is_fitted(self, ["feature_names_", "vocabulary_"]) + return self._transform(X, fitting=False) + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. + """ + check_is_fitted(self, "feature_names_") + if any(not isinstance(name, str) for name in self.feature_names_): + feature_names = [str(name) for name in self.feature_names_] + else: + feature_names = self.feature_names_ + return np.asarray(feature_names, dtype=object) + + def restrict(self, support, indices=False): + """Restrict the features to those in support using feature selection. + + This function modifies the estimator in-place. + + Parameters + ---------- + support : array-like + Boolean mask or list of indices (as returned by the get_support + member of feature selectors). + indices : bool, default=False + Whether support is a list of indices. + + Returns + ------- + self : object + DictVectorizer class instance. + + Examples + -------- + >>> from sklearn.feature_extraction import DictVectorizer + >>> from sklearn.feature_selection import SelectKBest, chi2 + >>> v = DictVectorizer() + >>> D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}] + >>> X = v.fit_transform(D) + >>> support = SelectKBest(chi2, k=2).fit(X, [0, 1]) + >>> v.get_feature_names_out() + array(['bar', 'baz', 'foo'], ...) + >>> v.restrict(support.get_support()) + DictVectorizer() + >>> v.get_feature_names_out() + array(['bar', 'foo'], ...) + """ + check_is_fitted(self, "feature_names_") + + if not indices: + support = np.where(support)[0] + + names = self.feature_names_ + new_vocab = {} + for i in support: + new_vocab[names[i]] = len(new_vocab) + + self.vocabulary_ = new_vocab + self.feature_names_ = [ + f for f, i in sorted(new_vocab.items(), key=itemgetter(1)) + ] + + return self + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.dict = True + tags.input_tags.two_d_array = False + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/_hash.py b/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/_hash.py new file mode 100644 index 0000000000000000000000000000000000000000..34756fa06eb4e701cd1f0364d604e6a432ebea68 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/_hash.py @@ -0,0 +1,209 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from itertools import chain +from numbers import Integral + +import numpy as np +import scipy.sparse as sp + +from sklearn.utils import metadata_routing + +from ..base import BaseEstimator, TransformerMixin, _fit_context +from ..utils._param_validation import Interval, StrOptions +from ._hashing_fast import transform as _hashing_transform + + +def _iteritems(d): + """Like d.iteritems, but accepts any collections.Mapping.""" + return d.iteritems() if hasattr(d, "iteritems") else d.items() + + +class FeatureHasher(TransformerMixin, BaseEstimator): + """Implements feature hashing, aka the hashing trick. + + This class turns sequences of symbolic feature names (strings) into + scipy.sparse matrices, using a hash function to compute the matrix column + corresponding to a name. The hash function employed is the signed 32-bit + version of Murmurhash3. 
+ + Feature names of type byte string are used as-is. Unicode strings are + converted to UTF-8 first, but no Unicode normalization is done. + Feature values must be (finite) numbers. + + This class is a low-memory alternative to DictVectorizer and + CountVectorizer, intended for large-scale (online) learning and situations + where memory is tight, e.g. when running prediction code on embedded + devices. + + For an efficiency comparison of the different feature extractors, see + :ref:`sphx_glr_auto_examples_text_plot_hashing_vs_dict_vectorizer.py`. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.13 + + Parameters + ---------- + n_features : int, default=2**20 + The number of features (columns) in the output matrices. Small numbers + of features are likely to cause hash collisions, but large numbers + will cause larger coefficient dimensions in linear learners. + input_type : str, default='dict' + Choose a string from {'dict', 'pair', 'string'}. + Either "dict" (the default) to accept dictionaries over + (feature_name, value); "pair" to accept pairs of (feature_name, value); + or "string" to accept single strings. + feature_name should be a string, while value should be a number. + In the case of "string", a value of 1 is implied. + The feature_name is hashed to find the appropriate column for the + feature. The value's sign might be flipped in the output (but see + non_negative, below). + dtype : numpy dtype, default=np.float64 + The type of feature values. Passed to scipy.sparse matrix constructors + as the dtype argument. Do not set this to bool, np.boolean or any + unsigned integer type. + alternate_sign : bool, default=True + When True, an alternating sign is added to the features as to + approximately conserve the inner product in the hashed space even for + small n_features. This approach is similar to sparse random projection. + + .. versionchanged:: 0.19 + ``alternate_sign`` replaces the now deprecated ``non_negative`` + parameter. + + See Also + -------- + DictVectorizer : Vectorizes string-valued features using a hash table. + sklearn.preprocessing.OneHotEncoder : Handles nominal/categorical features. + + Notes + ----- + This estimator is :term:`stateless` and does not need to be fitted. + However, we recommend to call :meth:`fit_transform` instead of + :meth:`transform`, as parameter validation is only performed in + :meth:`fit`. 
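The column-index computation the docstring describes (signed 32-bit MurmurHash3 of the feature name, taken modulo `n_features`, with `alternate_sign` flipping the value's sign) can be reproduced with the public `sklearn.utils.murmurhash3_32` helper. A rough sketch, not the compiled implementation; the helper name `hashed_column` is mine:

```python
import numpy as np
from sklearn.utils import murmurhash3_32

def hashed_column(feature_name, n_features=2**20, alternate_sign=True, seed=0):
    # signed 32-bit MurmurHash3 of the UTF-8 encoded feature name
    h = np.int32(murmurhash3_32(feature_name.encode("utf-8"), seed=seed, positive=False))
    if h == np.iinfo(np.int32).min:
        # abs(-2**31) does not fit in int32; special-cased like the compiled helper
        col = (np.iinfo(np.int32).max - (n_features - 1)) % n_features
    else:
        col = abs(int(h)) % n_features
    sign = -1 if (alternate_sign and h < 0) else 1
    return col, sign

print(hashed_column("dog", n_features=10))   # (column, sign) for this feature name
```

The compiled `_hashing_fast` extension included further down applies the same computation per (feature, value) pair while filling the CSR buffers.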
+ + Examples + -------- + >>> from sklearn.feature_extraction import FeatureHasher + >>> h = FeatureHasher(n_features=10) + >>> D = [{'dog': 1, 'cat':2, 'elephant':4},{'dog': 2, 'run': 5}] + >>> f = h.transform(D) + >>> f.toarray() + array([[ 0., 0., -4., -1., 0., 0., 0., 0., 0., 2.], + [ 0., 0., 0., -2., -5., 0., 0., 0., 0., 0.]]) + + With `input_type="string"`, the input must be an iterable over iterables of + strings: + + >>> h = FeatureHasher(n_features=8, input_type="string") + >>> raw_X = [["dog", "cat", "snake"], ["snake", "dog"], ["cat", "bird"]] + >>> f = h.transform(raw_X) + >>> f.toarray() + array([[ 0., 0., 0., -1., 0., -1., 0., 1.], + [ 0., 0., 0., -1., 0., -1., 0., 0.], + [ 0., -1., 0., 0., 0., 0., 0., 1.]]) + """ + + # raw_X should have been called X + __metadata_request__transform = {"raw_X": metadata_routing.UNUSED} + + _parameter_constraints: dict = { + "n_features": [Interval(Integral, 1, np.iinfo(np.int32).max, closed="both")], + "input_type": [StrOptions({"dict", "pair", "string"})], + "dtype": "no_validation", # delegate to numpy + "alternate_sign": ["boolean"], + } + + def __init__( + self, + n_features=(2**20), + *, + input_type="dict", + dtype=np.float64, + alternate_sign=True, + ): + self.dtype = dtype + self.input_type = input_type + self.n_features = n_features + self.alternate_sign = alternate_sign + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X=None, y=None): + """Only validates estimator's parameters. + + This method allows to: (i) validate the estimator's parameters and + (ii) be consistent with the scikit-learn transformer API. + + Parameters + ---------- + X : Ignored + Not used, present here for API consistency by convention. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + self : object + FeatureHasher class instance. + """ + return self + + def transform(self, raw_X): + """Transform a sequence of instances to a scipy.sparse matrix. + + Parameters + ---------- + raw_X : iterable over iterable over raw features, length = n_samples + Samples. Each sample must be iterable an (e.g., a list or tuple) + containing/generating feature names (and optionally values, see + the input_type constructor argument) which will be hashed. + raw_X need not support the len function, so it can be the result + of a generator; n_samples is determined on the fly. + + Returns + ------- + X : sparse matrix of shape (n_samples, n_features) + Feature matrix, for use with estimators or further transformers. + """ + raw_X = iter(raw_X) + if self.input_type == "dict": + raw_X = (_iteritems(d) for d in raw_X) + elif self.input_type == "string": + first_raw_X = next(raw_X) + if isinstance(first_raw_X, str): + raise ValueError( + "Samples can not be a single string. The input must be an iterable" + " over iterables of strings." 
+ ) + raw_X_ = chain([first_raw_X], raw_X) + raw_X = (((f, 1) for f in x) for x in raw_X_) + + indices, indptr, values = _hashing_transform( + raw_X, self.n_features, self.dtype, self.alternate_sign, seed=0 + ) + n_samples = indptr.shape[0] - 1 + + if n_samples == 0: + raise ValueError("Cannot vectorize empty sequence.") + + X = sp.csr_matrix( + (values, indices, indptr), + dtype=self.dtype, + shape=(n_samples, self.n_features), + ) + X.sum_duplicates() # also sorts the indices + + return X + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.two_d_array = False + if self.input_type == "string": + tags.input_tags.string = True + elif self.input_type == "dict": + tags.input_tags.dict = True + tags.requires_fit = False + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/_hashing_fast.cpython-312-x86_64-linux-gnu.so b/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/_hashing_fast.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..56d8ee4948c7d16355c73ecf22d7c43d93e6b2e9 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/_hashing_fast.cpython-312-x86_64-linux-gnu.so differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/_hashing_fast.pyx b/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/_hashing_fast.pyx new file mode 100644 index 0000000000000000000000000000000000000000..5069d555d60eae0ccc4cbfb04c03fbfce78b87bc --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/_hashing_fast.pyx @@ -0,0 +1,89 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from libc.stdlib cimport abs +from libcpp.vector cimport vector + +cimport numpy as cnp +import numpy as np +from ..utils._typedefs cimport int32_t, int64_t +from ..utils.murmurhash cimport murmurhash3_bytes_s32 +from ..utils._vector_sentinel cimport vector_to_nd_array + +cnp.import_array() + + +def transform(raw_X, Py_ssize_t n_features, dtype, + bint alternate_sign=1, unsigned int seed=0): + """Guts of FeatureHasher.transform. + + Returns + ------- + n_samples : integer + indices, indptr, values : lists + For constructing a scipy.sparse.csr_matrix. + + """ + cdef int32_t h + cdef double value + + cdef vector[int32_t] indices + cdef vector[int64_t] indptr + indptr.push_back(0) + + # Since Python array does not understand Numpy dtypes, we grow the indices + # and values arrays ourselves. Use a Py_ssize_t capacity for safety. + cdef Py_ssize_t capacity = 8192 # arbitrary + cdef int64_t size = 0 + cdef cnp.ndarray values = np.empty(capacity, dtype=dtype) + + for x in raw_X: + for f, v in x: + if isinstance(v, (str, unicode)): + f = "%s%s%s" % (f, '=', v) + value = 1 + else: + value = v + + if value == 0: + continue + + if isinstance(f, unicode): + f = (f).encode("utf-8") + # Need explicit type check because Murmurhash does not propagate + # all exceptions. Add "except *" there? 
+ elif not isinstance(f, bytes): + raise TypeError("feature names must be strings") + + h = murmurhash3_bytes_s32(f, seed) + + if h == - 2147483648: + # abs(-2**31) is undefined behavior because h is a `np.int32` + # The following is defined such that it is equal to: abs(-2**31) % n_features + indices.push_back((2147483647 - (n_features - 1)) % n_features) + else: + indices.push_back(abs(h) % n_features) + # improve inner product preservation in the hashed space + if alternate_sign: + value *= (h >= 0) * 2 - 1 + values[size] = value + size += 1 + + if size == capacity: + capacity *= 2 + # can't use resize member because there might be multiple + # references to the arrays due to Cython's error checking + values = np.resize(values, capacity) + + indptr.push_back(size) + + indices_array = vector_to_nd_array(&indices) + indptr_array = vector_to_nd_array(&indptr) + + if indptr_array[indptr_array.shape[0]-1] > np.iinfo(np.int32).max: # = 2**31 - 1 + # both indices and indptr have the same dtype in CSR arrays + indices_array = indices_array.astype(np.int64, copy=False) + else: + indptr_array = indptr_array.astype(np.int32, copy=False) + + return (indices_array, indptr_array, values[:size]) diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/_stop_words.py b/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/_stop_words.py new file mode 100644 index 0000000000000000000000000000000000000000..6bc8e6d2f37dc06cf834cb42b363594901a86d1f --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/_stop_words.py @@ -0,0 +1,328 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +# This list of English stop words is taken from the "Glasgow Information +# Retrieval Group". The original list can be found at +# http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words +ENGLISH_STOP_WORDS = frozenset( + [ + "a", + "about", + "above", + "across", + "after", + "afterwards", + "again", + "against", + "all", + "almost", + "alone", + "along", + "already", + "also", + "although", + "always", + "am", + "among", + "amongst", + "amoungst", + "amount", + "an", + "and", + "another", + "any", + "anyhow", + "anyone", + "anything", + "anyway", + "anywhere", + "are", + "around", + "as", + "at", + "back", + "be", + "became", + "because", + "become", + "becomes", + "becoming", + "been", + "before", + "beforehand", + "behind", + "being", + "below", + "beside", + "besides", + "between", + "beyond", + "bill", + "both", + "bottom", + "but", + "by", + "call", + "can", + "cannot", + "cant", + "co", + "con", + "could", + "couldnt", + "cry", + "de", + "describe", + "detail", + "do", + "done", + "down", + "due", + "during", + "each", + "eg", + "eight", + "either", + "eleven", + "else", + "elsewhere", + "empty", + "enough", + "etc", + "even", + "ever", + "every", + "everyone", + "everything", + "everywhere", + "except", + "few", + "fifteen", + "fifty", + "fill", + "find", + "fire", + "first", + "five", + "for", + "former", + "formerly", + "forty", + "found", + "four", + "from", + "front", + "full", + "further", + "get", + "give", + "go", + "had", + "has", + "hasnt", + "have", + "he", + "hence", + "her", + "here", + "hereafter", + "hereby", + "herein", + "hereupon", + "hers", + "herself", + "him", + "himself", + "his", + "how", + "however", + "hundred", + "i", + "ie", + "if", + "in", + "inc", + "indeed", + "interest", + "into", + "is", + "it", + "its", + "itself", + "keep", + "last", + "latter", + "latterly", + "least", + "less", + "ltd", + 
"made", + "many", + "may", + "me", + "meanwhile", + "might", + "mill", + "mine", + "more", + "moreover", + "most", + "mostly", + "move", + "much", + "must", + "my", + "myself", + "name", + "namely", + "neither", + "never", + "nevertheless", + "next", + "nine", + "no", + "nobody", + "none", + "noone", + "nor", + "not", + "nothing", + "now", + "nowhere", + "of", + "off", + "often", + "on", + "once", + "one", + "only", + "onto", + "or", + "other", + "others", + "otherwise", + "our", + "ours", + "ourselves", + "out", + "over", + "own", + "part", + "per", + "perhaps", + "please", + "put", + "rather", + "re", + "same", + "see", + "seem", + "seemed", + "seeming", + "seems", + "serious", + "several", + "she", + "should", + "show", + "side", + "since", + "sincere", + "six", + "sixty", + "so", + "some", + "somehow", + "someone", + "something", + "sometime", + "sometimes", + "somewhere", + "still", + "such", + "system", + "take", + "ten", + "than", + "that", + "the", + "their", + "them", + "themselves", + "then", + "thence", + "there", + "thereafter", + "thereby", + "therefore", + "therein", + "thereupon", + "these", + "they", + "thick", + "thin", + "third", + "this", + "those", + "though", + "three", + "through", + "throughout", + "thru", + "thus", + "to", + "together", + "too", + "top", + "toward", + "towards", + "twelve", + "twenty", + "two", + "un", + "under", + "until", + "up", + "upon", + "us", + "very", + "via", + "was", + "we", + "well", + "were", + "what", + "whatever", + "when", + "whence", + "whenever", + "where", + "whereafter", + "whereas", + "whereby", + "wherein", + "whereupon", + "wherever", + "whether", + "which", + "while", + "whither", + "who", + "whoever", + "whole", + "whom", + "whose", + "why", + "will", + "with", + "within", + "without", + "would", + "yet", + "you", + "your", + "yours", + "yourself", + "yourselves", + ] +) diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/image.py b/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/image.py new file mode 100644 index 0000000000000000000000000000000000000000..b571215de47be973d81ae1b4dbab517b4de571c6 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/image.py @@ -0,0 +1,687 @@ +"""Utilities to extract features from images.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from itertools import product +from numbers import Integral, Number, Real + +import numpy as np +from numpy.lib.stride_tricks import as_strided +from scipy import sparse + +from ..base import BaseEstimator, TransformerMixin, _fit_context +from ..utils import check_array, check_random_state +from ..utils._param_validation import Hidden, Interval, RealNotInt, validate_params + +__all__ = [ + "PatchExtractor", + "extract_patches_2d", + "grid_to_graph", + "img_to_graph", + "reconstruct_from_patches_2d", +] + +from ..utils.validation import validate_data + +############################################################################### +# From an image to a graph + + +def _make_edges_3d(n_x, n_y, n_z=1): + """Returns a list of edges for a 3D image. + + Parameters + ---------- + n_x : int + The size of the grid in the x direction. + n_y : int + The size of the grid in the y direction. 
+ n_z : integer, default=1 + The size of the grid in the z direction, defaults to 1 + """ + vertices = np.arange(n_x * n_y * n_z).reshape((n_x, n_y, n_z)) + edges_deep = np.vstack((vertices[:, :, :-1].ravel(), vertices[:, :, 1:].ravel())) + edges_right = np.vstack((vertices[:, :-1].ravel(), vertices[:, 1:].ravel())) + edges_down = np.vstack((vertices[:-1].ravel(), vertices[1:].ravel())) + edges = np.hstack((edges_deep, edges_right, edges_down)) + return edges + + +def _compute_gradient_3d(edges, img): + _, n_y, n_z = img.shape + gradient = np.abs( + img[ + edges[0] // (n_y * n_z), + (edges[0] % (n_y * n_z)) // n_z, + (edges[0] % (n_y * n_z)) % n_z, + ] + - img[ + edges[1] // (n_y * n_z), + (edges[1] % (n_y * n_z)) // n_z, + (edges[1] % (n_y * n_z)) % n_z, + ] + ) + return gradient + + +# XXX: Why mask the image after computing the weights? + + +def _mask_edges_weights(mask, edges, weights=None): + """Apply a mask to edges (weighted or not)""" + inds = np.arange(mask.size) + inds = inds[mask.ravel()] + ind_mask = np.logical_and(np.isin(edges[0], inds), np.isin(edges[1], inds)) + edges = edges[:, ind_mask] + if weights is not None: + weights = weights[ind_mask] + if len(edges.ravel()): + maxval = edges.max() + else: + maxval = 0 + order = np.searchsorted(np.flatnonzero(mask), np.arange(maxval + 1)) + edges = order[edges] + if weights is None: + return edges + else: + return edges, weights + + +def _to_graph( + n_x, n_y, n_z, mask=None, img=None, return_as=sparse.coo_matrix, dtype=None +): + """Auxiliary function for img_to_graph and grid_to_graph""" + edges = _make_edges_3d(n_x, n_y, n_z) + + if dtype is None: # To not overwrite input dtype + if img is None: + dtype = int + else: + dtype = img.dtype + + if img is not None: + img = np.atleast_3d(img) + weights = _compute_gradient_3d(edges, img) + if mask is not None: + edges, weights = _mask_edges_weights(mask, edges, weights) + diag = img.squeeze()[mask] + else: + diag = img.ravel() + n_voxels = diag.size + else: + if mask is not None: + mask = mask.astype(dtype=bool, copy=False) + edges = _mask_edges_weights(mask, edges) + n_voxels = np.sum(mask) + else: + n_voxels = n_x * n_y * n_z + weights = np.ones(edges.shape[1], dtype=dtype) + diag = np.ones(n_voxels, dtype=dtype) + + diag_idx = np.arange(n_voxels) + i_idx = np.hstack((edges[0], edges[1])) + j_idx = np.hstack((edges[1], edges[0])) + graph = sparse.coo_matrix( + ( + np.hstack((weights, weights, diag)), + (np.hstack((i_idx, diag_idx)), np.hstack((j_idx, diag_idx))), + ), + (n_voxels, n_voxels), + dtype=dtype, + ) + if return_as is np.ndarray: + return graph.toarray() + return return_as(graph) + + +@validate_params( + { + "img": ["array-like"], + "mask": [None, np.ndarray], + "return_as": [type], + "dtype": "no_validation", # validation delegated to numpy + }, + prefer_skip_nested_validation=True, +) +def img_to_graph(img, *, mask=None, return_as=sparse.coo_matrix, dtype=None): + """Graph of the pixel-to-pixel gradient connections. + + Edges are weighted with the gradient values. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + img : array-like of shape (height, width) or (height, width, channel) + 2D or 3D image. + mask : ndarray of shape (height, width) or \ + (height, width, channel), dtype=bool, default=None + An optional mask of the image, to consider only part of the + pixels. + return_as : np.ndarray or a sparse matrix class, \ + default=sparse.coo_matrix + The class to use to build the returned adjacency matrix. 
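`_make_edges_3d` and `_to_graph` above build the pixel adjacency by wiring each voxel to its neighbour along every axis. For a 2x2 grid (vertices numbered 0..3 in row-major order) this yields the edges (0, 1), (2, 3) along one axis and (0, 2), (1, 3) along the other, which the public `grid_to_graph` wrapper (defined just below) turns into a symmetric adjacency matrix with a unit diagonal:

```python
import numpy as np
from sklearn.feature_extraction.image import grid_to_graph

A = grid_to_graph(n_x=2, n_y=2, return_as=np.ndarray)
print(A)
# [[1 1 1 0]
#  [1 1 0 1]
#  [1 0 1 1]
#  [0 1 1 1]]
```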
+ dtype : dtype, default=None + The data of the returned sparse matrix. By default it is the + dtype of img. + + Returns + ------- + graph : ndarray or a sparse matrix class + The computed adjacency matrix. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.feature_extraction.image import img_to_graph + >>> img = np.array([[0, 0], [0, 1]]) + >>> img_to_graph(img, return_as=np.ndarray) + array([[0, 0, 0, 0], + [0, 0, 0, 1], + [0, 0, 0, 1], + [0, 1, 1, 1]]) + """ + img = np.atleast_3d(img) + n_x, n_y, n_z = img.shape + return _to_graph(n_x, n_y, n_z, mask, img, return_as, dtype) + + +@validate_params( + { + "n_x": [Interval(Integral, left=1, right=None, closed="left")], + "n_y": [Interval(Integral, left=1, right=None, closed="left")], + "n_z": [Interval(Integral, left=1, right=None, closed="left")], + "mask": [None, np.ndarray], + "return_as": [type], + "dtype": "no_validation", # validation delegated to numpy + }, + prefer_skip_nested_validation=True, +) +def grid_to_graph( + n_x, n_y, n_z=1, *, mask=None, return_as=sparse.coo_matrix, dtype=int +): + """Graph of the pixel-to-pixel connections. + + Edges exist if 2 voxels are connected. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_x : int + Dimension in x axis. + n_y : int + Dimension in y axis. + n_z : int, default=1 + Dimension in z axis. + mask : ndarray of shape (n_x, n_y, n_z), dtype=bool, default=None + An optional mask of the image, to consider only part of the + pixels. + return_as : np.ndarray or a sparse matrix class, \ + default=sparse.coo_matrix + The class to use to build the returned adjacency matrix. + dtype : dtype, default=int + The data of the returned sparse matrix. By default it is int. + + Returns + ------- + graph : np.ndarray or a sparse matrix class + The computed adjacency matrix. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.feature_extraction.image import grid_to_graph + >>> shape_img = (4, 4, 1) + >>> mask = np.zeros(shape=shape_img, dtype=bool) + >>> mask[[1, 2], [1, 2], :] = True + >>> graph = grid_to_graph(*shape_img, mask=mask) + >>> print(graph) + + Coords Values + (0, 0) 1 + (1, 1) 1 + """ + return _to_graph(n_x, n_y, n_z, mask=mask, return_as=return_as, dtype=dtype) + + +############################################################################### +# From an image to a set of small image patches + + +def _compute_n_patches(i_h, i_w, p_h, p_w, max_patches=None): + """Compute the number of patches that will be extracted in an image. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + i_h : int + The image height + i_w : int + The image with + p_h : int + The height of a patch + p_w : int + The width of a patch + max_patches : int or float, default=None + The maximum number of patches to extract. If `max_patches` is a float + between 0 and 1, it is taken to be a proportion of the total number + of patches. If `max_patches` is None, all possible patches are extracted. 
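As implemented just below in `_compute_n_patches`, the patch count is simply the number of valid top-left corners, `(i_h - p_h + 1) * (i_w - p_w + 1)`, optionally capped by `max_patches` (an absolute count or a fraction of that total). A worked example matching the 427x640 sample image used in the docstrings further down:

```python
i_h, i_w = 427, 640     # image height and width
p_h, p_w = 2, 2         # patch height and width
n_patches = (i_h - p_h + 1) * (i_w - p_w + 1)
print(n_patches)        # 272214, the count reported by extract_patches_2d below
```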
+ """ + n_h = i_h - p_h + 1 + n_w = i_w - p_w + 1 + all_patches = n_h * n_w + + if max_patches: + if isinstance(max_patches, (Integral)) and max_patches < all_patches: + return max_patches + elif isinstance(max_patches, (Integral)) and max_patches >= all_patches: + return all_patches + elif isinstance(max_patches, (Real)) and 0 < max_patches < 1: + return int(max_patches * all_patches) + else: + raise ValueError("Invalid value for max_patches: %r" % max_patches) + else: + return all_patches + + +def _extract_patches(arr, patch_shape=8, extraction_step=1): + """Extracts patches of any n-dimensional array in place using strides. + + Given an n-dimensional array it will return a 2n-dimensional array with + the first n dimensions indexing patch position and the last n indexing + the patch content. This operation is immediate (O(1)). A reshape + performed on the first n dimensions will cause numpy to copy data, leading + to a list of extracted patches. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + arr : ndarray + n-dimensional array of which patches are to be extracted + + patch_shape : int or tuple of length arr.ndim.default=8 + Indicates the shape of the patches to be extracted. If an + integer is given, the shape will be a hypercube of + sidelength given by its value. + + extraction_step : int or tuple of length arr.ndim, default=1 + Indicates step size at which extraction shall be performed. + If integer is given, then the step is uniform in all dimensions. + + + Returns + ------- + patches : strided ndarray + 2n-dimensional array indexing patches on first n dimensions and + containing patches on the last n dimensions. These dimensions + are fake, but this way no data is copied. A simple reshape invokes + a copying operation to obtain a list of patches: + result.reshape([-1] + list(patch_shape)) + """ + + arr_ndim = arr.ndim + + if isinstance(patch_shape, Number): + patch_shape = tuple([patch_shape] * arr_ndim) + if isinstance(extraction_step, Number): + extraction_step = tuple([extraction_step] * arr_ndim) + + patch_strides = arr.strides + + slices = tuple(slice(None, None, st) for st in extraction_step) + indexing_strides = arr[slices].strides + + patch_indices_shape = ( + (np.array(arr.shape) - np.array(patch_shape)) // np.array(extraction_step) + ) + 1 + + shape = tuple(list(patch_indices_shape) + list(patch_shape)) + strides = tuple(list(indexing_strides) + list(patch_strides)) + + patches = as_strided(arr, shape=shape, strides=strides) + return patches + + +@validate_params( + { + "image": [np.ndarray], + "patch_size": [tuple, list], + "max_patches": [ + Interval(RealNotInt, 0, 1, closed="neither"), + Interval(Integral, 1, None, closed="left"), + None, + ], + "random_state": ["random_state"], + }, + prefer_skip_nested_validation=True, +) +def extract_patches_2d(image, patch_size, *, max_patches=None, random_state=None): + """Reshape a 2D image into a collection of patches. + + The resulting patches are allocated in a dedicated array. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + image : ndarray of shape (image_height, image_width) or \ + (image_height, image_width, n_channels) + The original image data. For color images, the last dimension specifies + the channel: a RGB image would have `n_channels=3`. + + patch_size : tuple of int (patch_height, patch_width) + The dimensions of one patch. + + max_patches : int or float, default=None + The maximum number of patches to extract. 
If `max_patches` is a float + between 0 and 1, it is taken to be a proportion of the total number + of patches. If `max_patches` is None it corresponds to the total number + of patches that can be extracted. + + random_state : int, RandomState instance, default=None + Determines the random number generator used for random sampling when + `max_patches` is not None. Use an int to make the randomness + deterministic. + See :term:`Glossary `. + + Returns + ------- + patches : array of shape (n_patches, patch_height, patch_width) or \ + (n_patches, patch_height, patch_width, n_channels) + The collection of patches extracted from the image, where `n_patches` + is either `max_patches` or the total number of patches that can be + extracted. + + Examples + -------- + >>> from sklearn.datasets import load_sample_image + >>> from sklearn.feature_extraction import image + >>> # Use the array data from the first image in this dataset: + >>> one_image = load_sample_image("china.jpg") + >>> print('Image shape: {}'.format(one_image.shape)) + Image shape: (427, 640, 3) + >>> patches = image.extract_patches_2d(one_image, (2, 2)) + >>> print('Patches shape: {}'.format(patches.shape)) + Patches shape: (272214, 2, 2, 3) + >>> # Here are just two of these patches: + >>> print(patches[1]) + [[[174 201 231] + [174 201 231]] + [[173 200 230] + [173 200 230]]] + >>> print(patches[800]) + [[[187 214 243] + [188 215 244]] + [[187 214 243] + [188 215 244]]] + """ + i_h, i_w = image.shape[:2] + p_h, p_w = patch_size + + if p_h > i_h: + raise ValueError( + "Height of the patch should be less than the height of the image." + ) + + if p_w > i_w: + raise ValueError( + "Width of the patch should be less than the width of the image." + ) + + image = check_array(image, allow_nd=True) + image = image.reshape((i_h, i_w, -1)) + n_colors = image.shape[-1] + + extracted_patches = _extract_patches( + image, patch_shape=(p_h, p_w, n_colors), extraction_step=1 + ) + + n_patches = _compute_n_patches(i_h, i_w, p_h, p_w, max_patches) + if max_patches: + rng = check_random_state(random_state) + i_s = rng.randint(i_h - p_h + 1, size=n_patches) + j_s = rng.randint(i_w - p_w + 1, size=n_patches) + patches = extracted_patches[i_s, j_s, 0] + else: + patches = extracted_patches + + patches = patches.reshape(-1, p_h, p_w, n_colors) + # remove the color dimension if useless + if patches.shape[-1] == 1: + return patches.reshape((n_patches, p_h, p_w)) + else: + return patches + + +@validate_params( + {"patches": [np.ndarray], "image_size": [tuple, Hidden(list)]}, + prefer_skip_nested_validation=True, +) +def reconstruct_from_patches_2d(patches, image_size): + """Reconstruct the image from all of its patches. + + Patches are assumed to overlap and the image is constructed by filling in + the patches from left to right, top to bottom, averaging the overlapping + regions. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + patches : ndarray of shape (n_patches, patch_height, patch_width) or \ + (n_patches, patch_height, patch_width, n_channels) + The complete set of patches. If the patches contain colour information, + channels are indexed along the last dimension: RGB patches would + have `n_channels=3`. + + image_size : tuple of int (image_height, image_width) or \ + (image_height, image_width, n_channels) + The size of the image that will be reconstructed. + + Returns + ------- + image : ndarray of shape image_size + The reconstructed image. 
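A minimal round-trip sketch of the averaging reconstruction described above: when every overlapping patch is kept, each pixel's accumulated sum divided by the number of patches covering it recovers the original image exactly (up to floating point).

```python
import numpy as np
from sklearn.feature_extraction.image import (
    extract_patches_2d, reconstruct_from_patches_2d)

img = np.arange(25, dtype=float).reshape(5, 5)
patches = extract_patches_2d(img, (3, 3))          # all (5-3+1)**2 = 9 patches
print(patches.shape)                                # (9, 3, 3)

rec = reconstruct_from_patches_2d(patches, img.shape)
print(np.allclose(rec, img))                        # True: exact reconstruction
```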
+ + Examples + -------- + >>> from sklearn.datasets import load_sample_image + >>> from sklearn.feature_extraction import image + >>> one_image = load_sample_image("china.jpg") + >>> print('Image shape: {}'.format(one_image.shape)) + Image shape: (427, 640, 3) + >>> image_patches = image.extract_patches_2d(image=one_image, patch_size=(10, 10)) + >>> print('Patches shape: {}'.format(image_patches.shape)) + Patches shape: (263758, 10, 10, 3) + >>> image_reconstructed = image.reconstruct_from_patches_2d( + ... patches=image_patches, + ... image_size=one_image.shape + ... ) + >>> print(f"Reconstructed shape: {image_reconstructed.shape}") + Reconstructed shape: (427, 640, 3) + """ + i_h, i_w = image_size[:2] + p_h, p_w = patches.shape[1:3] + img = np.zeros(image_size) + # compute the dimensions of the patches array + n_h = i_h - p_h + 1 + n_w = i_w - p_w + 1 + for p, (i, j) in zip(patches, product(range(n_h), range(n_w))): + img[i : i + p_h, j : j + p_w] += p + + for i in range(i_h): + for j in range(i_w): + # divide by the amount of overlap + # XXX: is this the most efficient way? memory-wise yes, cpu wise? + img[i, j] /= float(min(i + 1, p_h, i_h - i) * min(j + 1, p_w, i_w - j)) + return img + + +class PatchExtractor(TransformerMixin, BaseEstimator): + """Extracts patches from a collection of images. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.9 + + Parameters + ---------- + patch_size : tuple of int (patch_height, patch_width), default=None + The dimensions of one patch. If set to None, the patch size will be + automatically set to `(img_height // 10, img_width // 10)`, where + `img_height` and `img_width` are the dimensions of the input images. + + max_patches : int or float, default=None + The maximum number of patches per image to extract. If `max_patches` is + a float in (0, 1), it is taken to mean a proportion of the total number + of patches. If set to None, extract all possible patches. + + random_state : int, RandomState instance, default=None + Determines the random number generator used for random sampling when + `max_patches is not None`. Use an int to make the randomness + deterministic. + See :term:`Glossary `. + + See Also + -------- + reconstruct_from_patches_2d : Reconstruct image from all of its patches. + + Notes + ----- + This estimator is stateless and does not need to be fitted. However, we + recommend to call :meth:`fit_transform` instead of :meth:`transform`, as + parameter validation is only performed in :meth:`fit`. + + Examples + -------- + >>> from sklearn.datasets import load_sample_images + >>> from sklearn.feature_extraction import image + >>> # Use the array data from the second image in this dataset: + >>> X = load_sample_images().images[1] + >>> X = X[None, ...] 
+ >>> print(f"Image shape: {X.shape}") + Image shape: (1, 427, 640, 3) + >>> pe = image.PatchExtractor(patch_size=(10, 10)) + >>> pe_trans = pe.transform(X) + >>> print(f"Patches shape: {pe_trans.shape}") + Patches shape: (263758, 10, 10, 3) + >>> X_reconstructed = image.reconstruct_from_patches_2d(pe_trans, X.shape[1:]) + >>> print(f"Reconstructed shape: {X_reconstructed.shape}") + Reconstructed shape: (427, 640, 3) + """ + + _parameter_constraints: dict = { + "patch_size": [tuple, None], + "max_patches": [ + None, + Interval(RealNotInt, 0, 1, closed="neither"), + Interval(Integral, 1, None, closed="left"), + ], + "random_state": ["random_state"], + } + + def __init__(self, *, patch_size=None, max_patches=None, random_state=None): + self.patch_size = patch_size + self.max_patches = max_patches + self.random_state = random_state + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Only validate the parameters of the estimator. + + This method allows to: (i) validate the parameters of the estimator and + (ii) be consistent with the scikit-learn transformer API. + + Parameters + ---------- + X : ndarray of shape (n_samples, image_height, image_width) or \ + (n_samples, image_height, image_width, n_channels) + Array of images from which to extract patches. For color images, + the last dimension specifies the channel: a RGB image would have + `n_channels=3`. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Returns the instance itself. + """ + return self + + def transform(self, X): + """Transform the image samples in `X` into a matrix of patch data. + + Parameters + ---------- + X : ndarray of shape (n_samples, image_height, image_width) or \ + (n_samples, image_height, image_width, n_channels) + Array of images from which to extract patches. For color images, + the last dimension specifies the channel: a RGB image would have + `n_channels=3`. + + Returns + ------- + patches : array of shape (n_patches, patch_height, patch_width) or \ + (n_patches, patch_height, patch_width, n_channels) + The collection of patches extracted from the images, where + `n_patches` is either `n_samples * max_patches` or the total + number of patches that can be extracted. + """ + X = validate_data( + self, + X=X, + ensure_2d=False, + allow_nd=True, + ensure_min_samples=1, + ensure_min_features=1, + reset=False, + ) + random_state = check_random_state(self.random_state) + n_imgs, img_height, img_width = X.shape[:3] + if self.patch_size is None: + patch_size = img_height // 10, img_width // 10 + else: + if len(self.patch_size) != 2: + raise ValueError( + "patch_size must be a tuple of two integers. Got" + f" {self.patch_size} instead." 
+ ) + patch_size = self.patch_size + + n_imgs, img_height, img_width = X.shape[:3] + X = np.reshape(X, (n_imgs, img_height, img_width, -1)) + n_channels = X.shape[-1] + + # compute the dimensions of the patches array + patch_height, patch_width = patch_size + n_patches = _compute_n_patches( + img_height, img_width, patch_height, patch_width, self.max_patches + ) + patches_shape = (n_imgs * n_patches,) + patch_size + if n_channels > 1: + patches_shape += (n_channels,) + + # extract the patches + patches = np.empty(patches_shape) + for ii, image in enumerate(X): + patches[ii * n_patches : (ii + 1) * n_patches] = extract_patches_2d( + image, + patch_size, + max_patches=self.max_patches, + random_state=random_state, + ) + return patches + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.two_d_array = False + tags.input_tags.three_d_array = True + tags.requires_fit = False + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/meson.build b/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..f810d7b28576c82945ac4f285b55ab4ffc6c8fe9 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/meson.build @@ -0,0 +1,7 @@ +py.extension_module( + '_hashing_fast', + [cython_gen_cpp.process('_hashing_fast.pyx'), utils_cython_tree], + dependencies: [np_dep], + subdir: 'sklearn/feature_extraction', + install: true +) diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/text.py b/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/text.py new file mode 100644 index 0000000000000000000000000000000000000000..d32248978a97ae85b7f3feee71fa234cbdeab9c6 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/text.py @@ -0,0 +1,2137 @@ +"""Utilities to build feature vectors from text documents.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import array +import re +import unicodedata +import warnings +from collections import defaultdict +from collections.abc import Mapping +from functools import partial +from numbers import Integral +from operator import itemgetter + +import numpy as np +import scipy.sparse as sp + +from sklearn.utils import metadata_routing + +from ..base import BaseEstimator, OneToOneFeatureMixin, TransformerMixin, _fit_context +from ..exceptions import NotFittedError +from ..preprocessing import normalize +from ..utils._param_validation import HasMethods, Interval, RealNotInt, StrOptions +from ..utils.fixes import _IS_32BIT +from ..utils.validation import FLOAT_DTYPES, check_array, check_is_fitted, validate_data +from ._hash import FeatureHasher +from ._stop_words import ENGLISH_STOP_WORDS + +__all__ = [ + "ENGLISH_STOP_WORDS", + "CountVectorizer", + "HashingVectorizer", + "TfidfTransformer", + "TfidfVectorizer", + "strip_accents_ascii", + "strip_accents_unicode", + "strip_tags", +] + + +def _preprocess(doc, accent_function=None, lower=False): + """Chain together an optional series of text preprocessing steps to + apply to a document. + + Parameters + ---------- + doc: str + The string to preprocess + accent_function: callable, default=None + Function for handling accented characters. Common strategies include + normalizing and removing. 
+ lower: bool, default=False + Whether to use str.lower to lowercase all of the text + + Returns + ------- + doc: str + preprocessed string + """ + if lower: + doc = doc.lower() + if accent_function is not None: + doc = accent_function(doc) + return doc + + +def _analyze( + doc, + analyzer=None, + tokenizer=None, + ngrams=None, + preprocessor=None, + decoder=None, + stop_words=None, +): + """Chain together an optional series of text processing steps to go from + a single document to ngrams, with or without tokenizing or preprocessing. + + If analyzer is used, only the decoder argument is used, as the analyzer is + intended to replace the preprocessor, tokenizer, and ngrams steps. + + Parameters + ---------- + analyzer: callable, default=None + tokenizer: callable, default=None + ngrams: callable, default=None + preprocessor: callable, default=None + decoder: callable, default=None + stop_words: list, default=None + + Returns + ------- + ngrams: list + A sequence of tokens, possibly with pairs, triples, etc. + """ + + if decoder is not None: + doc = decoder(doc) + if analyzer is not None: + doc = analyzer(doc) + else: + if preprocessor is not None: + doc = preprocessor(doc) + if tokenizer is not None: + doc = tokenizer(doc) + if ngrams is not None: + if stop_words is not None: + doc = ngrams(doc, stop_words) + else: + doc = ngrams(doc) + return doc + + +def strip_accents_unicode(s): + """Transform accentuated unicode symbols into their simple counterpart. + + Warning: the python-level loop and join operations make this + implementation 20 times slower than the strip_accents_ascii basic + normalization. + + Parameters + ---------- + s : str + The string to strip. + + Returns + ------- + s : str + The stripped string. + + See Also + -------- + strip_accents_ascii : Remove accentuated char for any unicode symbol that + has a direct ASCII equivalent. + """ + try: + # If `s` is ASCII-compatible, then it does not contain any accented + # characters and we can avoid an expensive list comprehension + s.encode("ASCII", errors="strict") + return s + except UnicodeEncodeError: + normalized = unicodedata.normalize("NFKD", s) + return "".join([c for c in normalized if not unicodedata.combining(c)]) + + +def strip_accents_ascii(s): + """Transform accentuated unicode symbols into ascii or nothing. + + Warning: this solution is only suited for languages that have a direct + transliteration to ASCII symbols. + + Parameters + ---------- + s : str + The string to strip. + + Returns + ------- + s : str + The stripped string. + + See Also + -------- + strip_accents_unicode : Remove accentuated char for any unicode symbol. + """ + nkfd_form = unicodedata.normalize("NFKD", s) + return nkfd_form.encode("ASCII", "ignore").decode("ASCII") + + +def strip_tags(s): + """Basic regexp based HTML / XML tag stripper function. + + For serious HTML/XML preprocessing you should rather use an external + library such as lxml or BeautifulSoup. + + Parameters + ---------- + s : str + The string to strip. + + Returns + ------- + s : str + The stripped string. 
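A quick illustration of the difference between the two accent-stripping helpers above: `strip_accents_unicode` removes combining marks but keeps characters that have no ASCII counterpart, while `strip_accents_ascii` drops anything that cannot be transliterated.

```python
from sklearn.feature_extraction.text import strip_accents_ascii, strip_accents_unicode

print(strip_accents_unicode("àéî"))    # 'aei'
print(strip_accents_ascii("àéî"))      # 'aei'
print(strip_accents_unicode("これ"))    # 'これ'  (kept: no combining marks to strip)
print(strip_accents_ascii("これ"))      # ''     (dropped: no ASCII transliteration)
```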
+ """ + return re.compile(r"<([^>]+)>", flags=re.UNICODE).sub(" ", s) + + +def _check_stop_list(stop): + if stop == "english": + return ENGLISH_STOP_WORDS + elif isinstance(stop, str): + raise ValueError("not a built-in stop list: %s" % stop) + elif stop is None: + return None + else: # assume it's a collection + return frozenset(stop) + + +class _VectorizerMixin: + """Provides common code for text vectorizers (tokenization logic).""" + + _white_spaces = re.compile(r"\s\s+") + + def decode(self, doc): + """Decode the input into a string of unicode symbols. + + The decoding strategy depends on the vectorizer parameters. + + Parameters + ---------- + doc : bytes or str + The string to decode. + + Returns + ------- + doc: str + A string of unicode symbols. + """ + if self.input == "filename": + with open(doc, "rb") as fh: + doc = fh.read() + + elif self.input == "file": + doc = doc.read() + + if isinstance(doc, bytes): + doc = doc.decode(self.encoding, self.decode_error) + + if doc is np.nan: + raise ValueError( + "np.nan is an invalid document, expected byte or unicode string." + ) + + return doc + + def _word_ngrams(self, tokens, stop_words=None): + """Turn tokens into a sequence of n-grams after stop words filtering""" + # handle stop words + if stop_words is not None: + tokens = [w for w in tokens if w not in stop_words] + + # handle token n-grams + min_n, max_n = self.ngram_range + if max_n != 1: + original_tokens = tokens + if min_n == 1: + # no need to do any slicing for unigrams + # just iterate through the original tokens + tokens = list(original_tokens) + min_n += 1 + else: + tokens = [] + + n_original_tokens = len(original_tokens) + + # bind method outside of loop to reduce overhead + tokens_append = tokens.append + space_join = " ".join + + for n in range(min_n, min(max_n + 1, n_original_tokens + 1)): + for i in range(n_original_tokens - n + 1): + tokens_append(space_join(original_tokens[i : i + n])) + + return tokens + + def _char_ngrams(self, text_document): + """Tokenize text_document into a sequence of character n-grams""" + # normalize white spaces + text_document = self._white_spaces.sub(" ", text_document) + + text_len = len(text_document) + min_n, max_n = self.ngram_range + if min_n == 1: + # no need to do any slicing for unigrams + # iterate through the string + ngrams = list(text_document) + min_n += 1 + else: + ngrams = [] + + # bind method outside of loop to reduce overhead + ngrams_append = ngrams.append + + for n in range(min_n, min(max_n + 1, text_len + 1)): + for i in range(text_len - n + 1): + ngrams_append(text_document[i : i + n]) + return ngrams + + def _char_wb_ngrams(self, text_document): + """Whitespace sensitive char-n-gram tokenization. + + Tokenize text_document into a sequence of character n-grams + operating only inside word boundaries. n-grams at the edges + of words are padded with space.""" + # normalize white spaces + text_document = self._white_spaces.sub(" ", text_document) + + min_n, max_n = self.ngram_range + ngrams = [] + + # bind method outside of loop to reduce overhead + ngrams_append = ngrams.append + + for w in text_document.split(): + w = " " + w + " " + w_len = len(w) + for n in range(min_n, max_n + 1): + offset = 0 + ngrams_append(w[offset : offset + n]) + while offset + n < w_len: + offset += 1 + ngrams_append(w[offset : offset + n]) + if offset == 0: # count a short word (w_len < n) only once + break + return ngrams + + def build_preprocessor(self): + """Return a function to preprocess the text before tokenization. 
+ + Returns + ------- + preprocessor: callable + A function to preprocess the text before tokenization. + """ + if self.preprocessor is not None: + return self.preprocessor + + # accent stripping + if not self.strip_accents: + strip_accents = None + elif callable(self.strip_accents): + strip_accents = self.strip_accents + elif self.strip_accents == "ascii": + strip_accents = strip_accents_ascii + elif self.strip_accents == "unicode": + strip_accents = strip_accents_unicode + else: + raise ValueError( + 'Invalid value for "strip_accents": %s' % self.strip_accents + ) + + return partial(_preprocess, accent_function=strip_accents, lower=self.lowercase) + + def build_tokenizer(self): + """Return a function that splits a string into a sequence of tokens. + + Returns + ------- + tokenizer: callable + A function to split a string into a sequence of tokens. + """ + if self.tokenizer is not None: + return self.tokenizer + token_pattern = re.compile(self.token_pattern) + + if token_pattern.groups > 1: + raise ValueError( + "More than 1 capturing group in token pattern. Only a single " + "group should be captured." + ) + + return token_pattern.findall + + def get_stop_words(self): + """Build or fetch the effective stop words list. + + Returns + ------- + stop_words: list or None + A list of stop words. + """ + return _check_stop_list(self.stop_words) + + def _check_stop_words_consistency(self, stop_words, preprocess, tokenize): + """Check if stop words are consistent + + Returns + ------- + is_consistent : True if stop words are consistent with the preprocessor + and tokenizer, False if they are not, None if the check + was previously performed, "error" if it could not be + performed (e.g. because of the use of a custom + preprocessor / tokenizer) + """ + if id(self.stop_words) == getattr(self, "_stop_words_id", None): + # Stop words are were previously validated + return None + + # NB: stop_words is validated, unlike self.stop_words + try: + inconsistent = set() + for w in stop_words or (): + tokens = list(tokenize(preprocess(w))) + for token in tokens: + if token not in stop_words: + inconsistent.add(token) + self._stop_words_id = id(self.stop_words) + + if inconsistent: + warnings.warn( + "Your stop_words may be inconsistent with " + "your preprocessing. Tokenizing the stop " + "words generated tokens %r not in " + "stop_words." % sorted(inconsistent) + ) + return not inconsistent + except Exception: + # Failed to check stop words consistency (e.g. because a custom + # preprocessor or tokenizer was used) + self._stop_words_id = id(self.stop_words) + return "error" + + def build_analyzer(self): + """Return a callable to process input data. + + The callable handles preprocessing, tokenization, and n-grams generation. + + Returns + ------- + analyzer: callable + A function to handle preprocessing, tokenization + and n-grams generation. 
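As a quick sketch of the analyzer machinery above (word n-grams from _word_ngrams, whitespace-padded character n-grams from _char_wb_ngrams), the callable returned by build_analyzer can be applied directly to a string; the expected outputs in the comments assume the default token pattern and lowercasing:

from sklearn.feature_extraction.text import CountVectorizer

# Word analyzer with unigrams + bigrams.
analyze = CountVectorizer(ngram_range=(1, 2)).build_analyzer()
analyze("The quick fox")
# ['the', 'quick', 'fox', 'the quick', 'quick fox']

# 'char_wb' builds character n-grams only inside word boundaries and pads
# the word edges with spaces.
analyze_wb = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3)).build_analyzer()
analyze_wb("fox")
# [' fo', 'fox', 'ox ']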
+ """ + + if callable(self.analyzer): + return partial(_analyze, analyzer=self.analyzer, decoder=self.decode) + + preprocess = self.build_preprocessor() + + if self.analyzer == "char": + return partial( + _analyze, + ngrams=self._char_ngrams, + preprocessor=preprocess, + decoder=self.decode, + ) + + elif self.analyzer == "char_wb": + return partial( + _analyze, + ngrams=self._char_wb_ngrams, + preprocessor=preprocess, + decoder=self.decode, + ) + + elif self.analyzer == "word": + stop_words = self.get_stop_words() + tokenize = self.build_tokenizer() + self._check_stop_words_consistency(stop_words, preprocess, tokenize) + return partial( + _analyze, + ngrams=self._word_ngrams, + tokenizer=tokenize, + preprocessor=preprocess, + decoder=self.decode, + stop_words=stop_words, + ) + + else: + raise ValueError( + "%s is not a valid tokenization scheme/analyzer" % self.analyzer + ) + + def _validate_vocabulary(self): + vocabulary = self.vocabulary + if vocabulary is not None: + if isinstance(vocabulary, set): + vocabulary = sorted(vocabulary) + if not isinstance(vocabulary, Mapping): + vocab = {} + for i, t in enumerate(vocabulary): + if vocab.setdefault(t, i) != i: + msg = "Duplicate term in vocabulary: %r" % t + raise ValueError(msg) + vocabulary = vocab + else: + indices = set(vocabulary.values()) + if len(indices) != len(vocabulary): + raise ValueError("Vocabulary contains repeated indices.") + for i in range(len(vocabulary)): + if i not in indices: + msg = "Vocabulary of size %d doesn't contain index %d." % ( + len(vocabulary), + i, + ) + raise ValueError(msg) + if not vocabulary: + raise ValueError("empty vocabulary passed to fit") + self.fixed_vocabulary_ = True + self.vocabulary_ = dict(vocabulary) + else: + self.fixed_vocabulary_ = False + + def _check_vocabulary(self): + """Check if vocabulary is empty or missing (not fitted)""" + if not hasattr(self, "vocabulary_"): + self._validate_vocabulary() + if not self.fixed_vocabulary_: + raise NotFittedError("Vocabulary not fitted or provided") + + if len(self.vocabulary_) == 0: + raise ValueError("Vocabulary is empty") + + def _validate_ngram_range(self): + """Check validity of ngram_range parameter""" + min_n, max_m = self.ngram_range + if min_n > max_m: + raise ValueError( + "Invalid value for ngram_range=%s " + "lower boundary larger than the upper boundary." 
% str(self.ngram_range) + ) + + def _warn_for_unused_params(self): + if self.tokenizer is not None and self.token_pattern is not None: + warnings.warn( + "The parameter 'token_pattern' will not be used" + " since 'tokenizer' is not None'" + ) + + if self.preprocessor is not None and callable(self.analyzer): + warnings.warn( + "The parameter 'preprocessor' will not be used" + " since 'analyzer' is callable'" + ) + + if ( + self.ngram_range != (1, 1) + and self.ngram_range is not None + and callable(self.analyzer) + ): + warnings.warn( + "The parameter 'ngram_range' will not be used" + " since 'analyzer' is callable'" + ) + if self.analyzer != "word" or callable(self.analyzer): + if self.stop_words is not None: + warnings.warn( + "The parameter 'stop_words' will not be used" + " since 'analyzer' != 'word'" + ) + if ( + self.token_pattern is not None + and self.token_pattern != r"(?u)\b\w\w+\b" + ): + warnings.warn( + "The parameter 'token_pattern' will not be used" + " since 'analyzer' != 'word'" + ) + if self.tokenizer is not None: + warnings.warn( + "The parameter 'tokenizer' will not be used" + " since 'analyzer' != 'word'" + ) + + +class HashingVectorizer( + TransformerMixin, _VectorizerMixin, BaseEstimator, auto_wrap_output_keys=None +): + r"""Convert a collection of text documents to a matrix of token occurrences. + + It turns a collection of text documents into a scipy.sparse matrix holding + token occurrence counts (or binary occurrence information), possibly + normalized as token frequencies if norm='l1' or projected on the euclidean + unit sphere if norm='l2'. + + This text vectorizer implementation uses the hashing trick to find the + token string name to feature integer index mapping. + + This strategy has several advantages: + + - it is very low memory scalable to large datasets as there is no need to + store a vocabulary dictionary in memory. + + - it is fast to pickle and un-pickle as it holds no state besides the + constructor parameters. + + - it can be used in a streaming (partial fit) or parallel pipeline as there + is no state computed during fit. + + There are also a couple of cons (vs using a CountVectorizer with an + in-memory vocabulary): + + - there is no way to compute the inverse transform (from feature indices to + string feature names) which can be a problem when trying to introspect + which features are most important to a model. + + - there can be collisions: distinct tokens can be mapped to the same + feature index. However in practice this is rarely an issue if n_features + is large enough (e.g. 2 ** 18 for text classification problems). + + - no IDF weighting as this would render the transformer stateful. + + The hash function employed is the signed 32-bit version of Murmurhash3. + + For an efficiency comparison of the different feature extractors, see + :ref:`sphx_glr_auto_examples_text_plot_hashing_vs_dict_vectorizer.py`. + + For an example of document clustering and comparison with + :class:`~sklearn.feature_extraction.text.TfidfVectorizer`, see + :ref:`sphx_glr_auto_examples_text_plot_document_clustering.py`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + input : {'filename', 'file', 'content'}, default='content' + - If `'filename'`, the sequence passed as an argument to fit is + expected to be a list of filenames that need reading to fetch + the raw content to analyze. + + - If `'file'`, the sequence items must have a 'read' method (file-like + object) that is called to fetch the bytes in memory. 
+ + - If `'content'`, the input is expected to be a sequence of items that + can be of type string or byte. + + encoding : str, default='utf-8' + If bytes or files are given to analyze, this encoding is used to + decode. + + decode_error : {'strict', 'ignore', 'replace'}, default='strict' + Instruction on what to do if a byte sequence is given to analyze that + contains characters not of the given `encoding`. By default, it is + 'strict', meaning that a UnicodeDecodeError will be raised. Other + values are 'ignore' and 'replace'. + + strip_accents : {'ascii', 'unicode'} or callable, default=None + Remove accents and perform other character normalization + during the preprocessing step. + 'ascii' is a fast method that only works on characters that have + a direct ASCII mapping. + 'unicode' is a slightly slower method that works on any character. + None (default) means no character normalization is performed. + + Both 'ascii' and 'unicode' use NFKD normalization from + :func:`unicodedata.normalize`. + + lowercase : bool, default=True + Convert all characters to lowercase before tokenizing. + + preprocessor : callable, default=None + Override the preprocessing (string transformation) stage while + preserving the tokenizing and n-grams generation steps. + Only applies if ``analyzer`` is not callable. + + tokenizer : callable, default=None + Override the string tokenization step while preserving the + preprocessing and n-grams generation steps. + Only applies if ``analyzer == 'word'``. + + stop_words : {'english'}, list, default=None + If 'english', a built-in stop word list for English is used. + There are several known issues with 'english' and you should + consider an alternative (see :ref:`stop_words`). + + If a list, that list is assumed to contain stop words, all of which + will be removed from the resulting tokens. + Only applies if ``analyzer == 'word'``. + + token_pattern : str or None, default=r"(?u)\\b\\w\\w+\\b" + Regular expression denoting what constitutes a "token", only used + if ``analyzer == 'word'``. The default regexp selects tokens of 2 + or more alphanumeric characters (punctuation is completely ignored + and always treated as a token separator). + + If there is a capturing group in token_pattern then the + captured group content, not the entire match, becomes the token. + At most one capturing group is permitted. + + ngram_range : tuple (min_n, max_n), default=(1, 1) + The lower and upper boundary of the range of n-values for different + n-grams to be extracted. All values of n such that min_n <= n <= max_n + will be used. For example an ``ngram_range`` of ``(1, 1)`` means only + unigrams, ``(1, 2)`` means unigrams and bigrams, and ``(2, 2)`` means + only bigrams. + Only applies if ``analyzer`` is not callable. + + analyzer : {'word', 'char', 'char_wb'} or callable, default='word' + Whether the feature should be made of word or character n-grams. + Option 'char_wb' creates character n-grams only from text inside + word boundaries; n-grams at the edges of words are padded with space. + + If a callable is passed it is used to extract the sequence of features + out of the raw, unprocessed input. + + .. versionchanged:: 0.21 + Since v0.21, if ``input`` is ``'filename'`` or ``'file'``, the data + is first read from the file and then passed to the given callable + analyzer. + + n_features : int, default=(2 ** 20) + The number of features (columns) in the output matrices. 
Small numbers + of features are likely to cause hash collisions, but large numbers + will cause larger coefficient dimensions in linear learners. + + binary : bool, default=False + If True, all non zero counts are set to 1. This is useful for discrete + probabilistic models that model binary events rather than integer + counts. + + norm : {'l1', 'l2'}, default='l2' + Norm used to normalize term vectors. None for no normalization. + + alternate_sign : bool, default=True + When True, an alternating sign is added to the features as to + approximately conserve the inner product in the hashed space even for + small n_features. This approach is similar to sparse random projection. + + .. versionadded:: 0.19 + + dtype : type, default=np.float64 + Type of the matrix returned by fit_transform() or transform(). + + See Also + -------- + CountVectorizer : Convert a collection of text documents to a matrix of + token counts. + TfidfVectorizer : Convert a collection of raw documents to a matrix of + TF-IDF features. + + Notes + ----- + This estimator is :term:`stateless` and does not need to be fitted. + However, we recommend to call :meth:`fit_transform` instead of + :meth:`transform`, as parameter validation is only performed in + :meth:`fit`. + + Examples + -------- + >>> from sklearn.feature_extraction.text import HashingVectorizer + >>> corpus = [ + ... 'This is the first document.', + ... 'This document is the second document.', + ... 'And this is the third one.', + ... 'Is this the first document?', + ... ] + >>> vectorizer = HashingVectorizer(n_features=2**4) + >>> X = vectorizer.fit_transform(corpus) + >>> print(X.shape) + (4, 16) + """ + + _parameter_constraints: dict = { + "input": [StrOptions({"filename", "file", "content"})], + "encoding": [str], + "decode_error": [StrOptions({"strict", "ignore", "replace"})], + "strip_accents": [StrOptions({"ascii", "unicode"}), None, callable], + "lowercase": ["boolean"], + "preprocessor": [callable, None], + "tokenizer": [callable, None], + "stop_words": [StrOptions({"english"}), list, None], + "token_pattern": [str, None], + "ngram_range": [tuple], + "analyzer": [StrOptions({"word", "char", "char_wb"}), callable], + "n_features": [Interval(Integral, 1, np.iinfo(np.int32).max, closed="left")], + "binary": ["boolean"], + "norm": [StrOptions({"l1", "l2"}), None], + "alternate_sign": ["boolean"], + "dtype": "no_validation", # delegate to numpy + } + + def __init__( + self, + *, + input="content", + encoding="utf-8", + decode_error="strict", + strip_accents=None, + lowercase=True, + preprocessor=None, + tokenizer=None, + stop_words=None, + token_pattern=r"(?u)\b\w\w+\b", + ngram_range=(1, 1), + analyzer="word", + n_features=(2**20), + binary=False, + norm="l2", + alternate_sign=True, + dtype=np.float64, + ): + self.input = input + self.encoding = encoding + self.decode_error = decode_error + self.strip_accents = strip_accents + self.preprocessor = preprocessor + self.tokenizer = tokenizer + self.analyzer = analyzer + self.lowercase = lowercase + self.token_pattern = token_pattern + self.stop_words = stop_words + self.n_features = n_features + self.ngram_range = ngram_range + self.binary = binary + self.norm = norm + self.alternate_sign = alternate_sign + self.dtype = dtype + + @_fit_context(prefer_skip_nested_validation=True) + def partial_fit(self, X, y=None): + """Only validates estimator's parameters. + + This method allows to: (i) validate the estimator's parameters and + (ii) be consistent with the scikit-learn transformer API. 
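A minimal sketch of the statelessness described above: because the token-to-column mapping is a hash function rather than a learned vocabulary, transform can be called without fitting, and with alternate_sign=False and norm=None the matrix simply holds raw occurrence counts:

from sklearn.feature_extraction.text import HashingVectorizer

docs = ["the cat sat", "the dog sat"]
vec = HashingVectorizer(n_features=2**8, norm=None, alternate_sign=False)
X = vec.transform(docs)   # no fit needed; the estimator holds no learned state
X.shape                   # (2, 256)
X.sum()                   # 6.0 -- six token occurrences in total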
+ + Parameters + ---------- + X : ndarray of shape [n_samples, n_features] + Training data. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + HashingVectorizer instance. + """ + return self + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Only validates estimator's parameters. + + This method allows to: (i) validate the estimator's parameters and + (ii) be consistent with the scikit-learn transformer API. + + Parameters + ---------- + X : ndarray of shape [n_samples, n_features] + Training data. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + HashingVectorizer instance. + """ + # triggers a parameter validation + if isinstance(X, str): + raise ValueError( + "Iterable over raw text documents expected, string object received." + ) + + self._warn_for_unused_params() + self._validate_ngram_range() + + self._get_hasher().fit(X, y=y) + return self + + def transform(self, X): + """Transform a sequence of documents to a document-term matrix. + + Parameters + ---------- + X : iterable over raw text documents, length = n_samples + Samples. Each sample must be a text document (either bytes or + unicode strings, file name or file object depending on the + constructor argument) which will be tokenized and hashed. + + Returns + ------- + X : sparse matrix of shape (n_samples, n_features) + Document-term matrix. + """ + if isinstance(X, str): + raise ValueError( + "Iterable over raw text documents expected, string object received." + ) + + self._validate_ngram_range() + + analyzer = self.build_analyzer() + X = self._get_hasher().transform(analyzer(doc) for doc in X) + if self.binary: + X.data.fill(1) + if self.norm is not None: + X = normalize(X, norm=self.norm, copy=False) + return X + + def fit_transform(self, X, y=None): + """Transform a sequence of documents to a document-term matrix. + + Parameters + ---------- + X : iterable over raw text documents, length = n_samples + Samples. Each sample must be a text document (either bytes or + unicode strings, file name or file object depending on the + constructor argument) which will be tokenized and hashed. + y : any + Ignored. This parameter exists only for compatibility with + sklearn.pipeline.Pipeline. + + Returns + ------- + X : sparse matrix of shape (n_samples, n_features) + Document-term matrix. + """ + return self.fit(X, y).transform(X) + + def _get_hasher(self): + return FeatureHasher( + n_features=self.n_features, + input_type="string", + dtype=self.dtype, + alternate_sign=self.alternate_sign, + ) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.string = True + tags.input_tags.two_d_array = False + tags.requires_fit = False + return tags + + +def _document_frequency(X): + """Count the number of non-zero values for each feature in sparse X.""" + if sp.issparse(X) and X.format == "csr": + return np.bincount(X.indices, minlength=X.shape[1]) + else: + return np.diff(X.indptr) + + +class CountVectorizer(_VectorizerMixin, BaseEstimator): + r"""Convert a collection of text documents to a matrix of token counts. + + This implementation produces a sparse representation of the counts using + scipy.sparse.csr_matrix. + + If you do not provide an a-priori dictionary and you do not use an analyzer + that does some kind of feature selection then the number of features will + be equal to the vocabulary size found by analyzing the data. 
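To illustrate the point above, a small sketch: with no a-priori vocabulary, the number of columns equals the number of distinct terms found by the default word analyzer (expected values in comments):

from sklearn.feature_extraction.text import CountVectorizer

docs = ["to be or not to be", "that is the question"]
vec = CountVectorizer()
X = vec.fit_transform(docs)
len(vec.vocabulary_)   # 8 distinct terms of two or more characters
X.shape                # (2, 8) -- one column per vocabulary entry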
+ + For an efficiency comparison of the different feature extractors, see + :ref:`sphx_glr_auto_examples_text_plot_hashing_vs_dict_vectorizer.py`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + input : {'filename', 'file', 'content'}, default='content' + - If `'filename'`, the sequence passed as an argument to fit is + expected to be a list of filenames that need reading to fetch + the raw content to analyze. + + - If `'file'`, the sequence items must have a 'read' method (file-like + object) that is called to fetch the bytes in memory. + + - If `'content'`, the input is expected to be a sequence of items that + can be of type string or byte. + + encoding : str, default='utf-8' + If bytes or files are given to analyze, this encoding is used to + decode. + + decode_error : {'strict', 'ignore', 'replace'}, default='strict' + Instruction on what to do if a byte sequence is given to analyze that + contains characters not of the given `encoding`. By default, it is + 'strict', meaning that a UnicodeDecodeError will be raised. Other + values are 'ignore' and 'replace'. + + strip_accents : {'ascii', 'unicode'} or callable, default=None + Remove accents and perform other character normalization + during the preprocessing step. + 'ascii' is a fast method that only works on characters that have + a direct ASCII mapping. + 'unicode' is a slightly slower method that works on any characters. + None (default) means no character normalization is performed. + + Both 'ascii' and 'unicode' use NFKD normalization from + :func:`unicodedata.normalize`. + + lowercase : bool, default=True + Convert all characters to lowercase before tokenizing. + + preprocessor : callable, default=None + Override the preprocessing (strip_accents and lowercase) stage while + preserving the tokenizing and n-grams generation steps. + Only applies if ``analyzer`` is not callable. + + tokenizer : callable, default=None + Override the string tokenization step while preserving the + preprocessing and n-grams generation steps. + Only applies if ``analyzer == 'word'``. + + stop_words : {'english'}, list, default=None + If 'english', a built-in stop word list for English is used. + There are several known issues with 'english' and you should + consider an alternative (see :ref:`stop_words`). + + If a list, that list is assumed to contain stop words, all of which + will be removed from the resulting tokens. + Only applies if ``analyzer == 'word'``. + + If None, no stop words will be used. In this case, setting `max_df` + to a higher value, such as in the range (0.7, 1.0), can automatically detect + and filter stop words based on intra corpus document frequency of terms. + + token_pattern : str or None, default=r"(?u)\\b\\w\\w+\\b" + Regular expression denoting what constitutes a "token", only used + if ``analyzer == 'word'``. The default regexp select tokens of 2 + or more alphanumeric characters (punctuation is completely ignored + and always treated as a token separator). + + If there is a capturing group in token_pattern then the + captured group content, not the entire match, becomes the token. + At most one capturing group is permitted. + + ngram_range : tuple (min_n, max_n), default=(1, 1) + The lower and upper boundary of the range of n-values for different + word n-grams or char n-grams to be extracted. All values of n such + such that min_n <= n <= max_n will be used. 
For example an + ``ngram_range`` of ``(1, 1)`` means only unigrams, ``(1, 2)`` means + unigrams and bigrams, and ``(2, 2)`` means only bigrams. + Only applies if ``analyzer`` is not callable. + + analyzer : {'word', 'char', 'char_wb'} or callable, default='word' + Whether the feature should be made of word n-gram or character + n-grams. + Option 'char_wb' creates character n-grams only from text inside + word boundaries; n-grams at the edges of words are padded with space. + + If a callable is passed it is used to extract the sequence of features + out of the raw, unprocessed input. + + .. versionchanged:: 0.21 + + Since v0.21, if ``input`` is ``filename`` or ``file``, the data is + first read from the file and then passed to the given callable + analyzer. + + max_df : float in range [0.0, 1.0] or int, default=1.0 + When building the vocabulary ignore terms that have a document + frequency strictly higher than the given threshold (corpus-specific + stop words). + If float, the parameter represents a proportion of documents, integer + absolute counts. + This parameter is ignored if vocabulary is not None. + + min_df : float in range [0.0, 1.0] or int, default=1 + When building the vocabulary ignore terms that have a document + frequency strictly lower than the given threshold. This value is also + called cut-off in the literature. + If float, the parameter represents a proportion of documents, integer + absolute counts. + This parameter is ignored if vocabulary is not None. + + max_features : int, default=None + If not None, build a vocabulary that only consider the top + `max_features` ordered by term frequency across the corpus. + Otherwise, all features are used. + + This parameter is ignored if vocabulary is not None. + + vocabulary : Mapping or iterable, default=None + Either a Mapping (e.g., a dict) where keys are terms and values are + indices in the feature matrix, or an iterable over terms. If not + given, a vocabulary is determined from the input documents. Indices + in the mapping should not be repeated and should not have any gap + between 0 and the largest index. + + binary : bool, default=False + If True, all non zero counts are set to 1. This is useful for discrete + probabilistic models that model binary events rather than integer + counts. + + dtype : dtype, default=np.int64 + Type of the matrix returned by fit_transform() or transform(). + + Attributes + ---------- + vocabulary_ : dict + A mapping of terms to feature indices. + + fixed_vocabulary_ : bool + True if a fixed vocabulary of term to indices mapping + is provided by the user. + + See Also + -------- + HashingVectorizer : Convert a collection of text documents to a + matrix of token counts. + + TfidfVectorizer : Convert a collection of raw documents to a matrix + of TF-IDF features. + + Examples + -------- + >>> from sklearn.feature_extraction.text import CountVectorizer + >>> corpus = [ + ... 'This is the first document.', + ... 'This document is the second document.', + ... 'And this is the third one.', + ... 'Is this the first document?', + ... ] + >>> vectorizer = CountVectorizer() + >>> X = vectorizer.fit_transform(corpus) + >>> vectorizer.get_feature_names_out() + array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', + 'this'], ...) 
+ >>> print(X.toarray()) + [[0 1 1 1 0 0 1 0 1] + [0 2 0 1 0 1 1 0 1] + [1 0 0 1 1 0 1 1 1] + [0 1 1 1 0 0 1 0 1]] + >>> vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(2, 2)) + >>> X2 = vectorizer2.fit_transform(corpus) + >>> vectorizer2.get_feature_names_out() + array(['and this', 'document is', 'first document', 'is the', 'is this', + 'second document', 'the first', 'the second', 'the third', 'third one', + 'this document', 'this is', 'this the'], ...) + >>> print(X2.toarray()) + [[0 0 1 1 0 0 1 0 0 0 0 1 0] + [0 1 0 1 0 1 0 1 0 0 1 0 0] + [1 0 0 1 0 0 0 0 1 1 0 1 0] + [0 0 1 0 1 0 1 0 0 0 0 0 1]] + """ + + # raw_documents should not be in the routing mechanism. It should have been + # called X in the first place. + __metadata_request__fit = {"raw_documents": metadata_routing.UNUSED} + __metadata_request__transform = {"raw_documents": metadata_routing.UNUSED} + + _parameter_constraints: dict = { + "input": [StrOptions({"filename", "file", "content"})], + "encoding": [str], + "decode_error": [StrOptions({"strict", "ignore", "replace"})], + "strip_accents": [StrOptions({"ascii", "unicode"}), None, callable], + "lowercase": ["boolean"], + "preprocessor": [callable, None], + "tokenizer": [callable, None], + "stop_words": [StrOptions({"english"}), list, None], + "token_pattern": [str, None], + "ngram_range": [tuple], + "analyzer": [StrOptions({"word", "char", "char_wb"}), callable], + "max_df": [ + Interval(RealNotInt, 0, 1, closed="both"), + Interval(Integral, 1, None, closed="left"), + ], + "min_df": [ + Interval(RealNotInt, 0, 1, closed="both"), + Interval(Integral, 1, None, closed="left"), + ], + "max_features": [Interval(Integral, 1, None, closed="left"), None], + "vocabulary": [Mapping, HasMethods("__iter__"), None], + "binary": ["boolean"], + "dtype": "no_validation", # delegate to numpy + } + + def __init__( + self, + *, + input="content", + encoding="utf-8", + decode_error="strict", + strip_accents=None, + lowercase=True, + preprocessor=None, + tokenizer=None, + stop_words=None, + token_pattern=r"(?u)\b\w\w+\b", + ngram_range=(1, 1), + analyzer="word", + max_df=1.0, + min_df=1, + max_features=None, + vocabulary=None, + binary=False, + dtype=np.int64, + ): + self.input = input + self.encoding = encoding + self.decode_error = decode_error + self.strip_accents = strip_accents + self.preprocessor = preprocessor + self.tokenizer = tokenizer + self.analyzer = analyzer + self.lowercase = lowercase + self.token_pattern = token_pattern + self.stop_words = stop_words + self.max_df = max_df + self.min_df = min_df + self.max_features = max_features + self.ngram_range = ngram_range + self.vocabulary = vocabulary + self.binary = binary + self.dtype = dtype + + def _sort_features(self, X, vocabulary): + """Sort features by name + + Returns a reordered matrix and modifies the vocabulary in place + """ + sorted_features = sorted(vocabulary.items()) + map_index = np.empty(len(sorted_features), dtype=X.indices.dtype) + for new_val, (term, old_val) in enumerate(sorted_features): + vocabulary[term] = new_val + map_index[old_val] = new_val + + X.indices = map_index.take(X.indices, mode="clip") + return X + + def _limit_features(self, X, vocabulary, high=None, low=None, limit=None): + """Remove too rare or too common features. + + Prune features that are non zero in more samples than high or less + documents than low, modifying the vocabulary, and restricting it to + at most the limit most frequent. + + This does not prune samples with zero features. 
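A small sketch of the document-frequency pruning that _limit_features implements, driven by the min_df / max_df parameters described earlier (expected output in the comment):

from sklearn.feature_extraction.text import CountVectorizer

docs = ["red apple", "red banana", "red cherry", "blue grape"]
vec = CountVectorizer(min_df=2, max_df=3).fit(docs)
# Only 'red' appears in at least 2 and at most 3 of the 4 documents;
# every other term is pruned from the vocabulary.
vec.get_feature_names_out()   # array(['red'], ...)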
+ """ + if high is None and low is None and limit is None: + return X, set() + + # Calculate a mask based on document frequencies + dfs = _document_frequency(X) + mask = np.ones(len(dfs), dtype=bool) + if high is not None: + mask &= dfs <= high + if low is not None: + mask &= dfs >= low + if limit is not None and mask.sum() > limit: + tfs = np.asarray(X.sum(axis=0)).ravel() + mask_inds = (-tfs[mask]).argsort()[:limit] + new_mask = np.zeros(len(dfs), dtype=bool) + new_mask[np.where(mask)[0][mask_inds]] = True + mask = new_mask + + new_indices = np.cumsum(mask) - 1 # maps old indices to new + for term, old_index in list(vocabulary.items()): + if mask[old_index]: + vocabulary[term] = new_indices[old_index] + else: + del vocabulary[term] + kept_indices = np.where(mask)[0] + if len(kept_indices) == 0: + raise ValueError( + "After pruning, no terms remain. Try a lower min_df or a higher max_df." + ) + return X[:, kept_indices] + + def _count_vocab(self, raw_documents, fixed_vocab): + """Create sparse feature matrix, and vocabulary where fixed_vocab=False""" + if fixed_vocab: + vocabulary = self.vocabulary_ + else: + # Add a new value when a new vocabulary item is seen + vocabulary = defaultdict() + vocabulary.default_factory = vocabulary.__len__ + + analyze = self.build_analyzer() + j_indices = [] + indptr = [] + + values = _make_int_array() + indptr.append(0) + for doc in raw_documents: + feature_counter = {} + for feature in analyze(doc): + try: + feature_idx = vocabulary[feature] + if feature_idx not in feature_counter: + feature_counter[feature_idx] = 1 + else: + feature_counter[feature_idx] += 1 + except KeyError: + # Ignore out-of-vocabulary items for fixed_vocab=True + continue + + j_indices.extend(feature_counter.keys()) + values.extend(feature_counter.values()) + indptr.append(len(j_indices)) + + if not fixed_vocab: + # disable defaultdict behaviour + vocabulary = dict(vocabulary) + if not vocabulary: + raise ValueError( + "empty vocabulary; perhaps the documents only contain stop words" + ) + + if indptr[-1] > np.iinfo(np.int32).max: # = 2**31 - 1 + if _IS_32BIT: + raise ValueError( + ( + "sparse CSR array has {} non-zero " + "elements and requires 64 bit indexing, " + "which is unsupported with 32 bit Python." + ).format(indptr[-1]) + ) + indices_dtype = np.int64 + + else: + indices_dtype = np.int32 + j_indices = np.asarray(j_indices, dtype=indices_dtype) + indptr = np.asarray(indptr, dtype=indices_dtype) + values = np.frombuffer(values, dtype=np.intc) + + X = sp.csr_matrix( + (values, j_indices, indptr), + shape=(len(indptr) - 1, len(vocabulary)), + dtype=self.dtype, + ) + X.sort_indices() + return vocabulary, X + + def fit(self, raw_documents, y=None): + """Learn a vocabulary dictionary of all tokens in the raw documents. + + Parameters + ---------- + raw_documents : iterable + An iterable which generates either str, unicode or file objects. + + y : None + This parameter is ignored. + + Returns + ------- + self : object + Fitted vectorizer. + """ + self.fit_transform(raw_documents) + return self + + @_fit_context(prefer_skip_nested_validation=True) + def fit_transform(self, raw_documents, y=None): + """Learn the vocabulary dictionary and return document-term matrix. + + This is equivalent to fit followed by transform, but more efficiently + implemented. + + Parameters + ---------- + raw_documents : iterable + An iterable which generates either str, unicode or file objects. + + y : None + This parameter is ignored. 
+ + Returns + ------- + X : array of shape (n_samples, n_features) + Document-term matrix. + """ + # We intentionally don't call the transform method to make + # fit_transform overridable without unwanted side effects in + # TfidfVectorizer. + if isinstance(raw_documents, str): + raise ValueError( + "Iterable over raw text documents expected, string object received." + ) + + self._validate_ngram_range() + self._warn_for_unused_params() + self._validate_vocabulary() + max_df = self.max_df + min_df = self.min_df + max_features = self.max_features + + if self.fixed_vocabulary_ and self.lowercase: + for term in self.vocabulary: + if any(map(str.isupper, term)): + warnings.warn( + "Upper case characters found in" + " vocabulary while 'lowercase'" + " is True. These entries will not" + " be matched with any documents" + ) + break + + vocabulary, X = self._count_vocab(raw_documents, self.fixed_vocabulary_) + + if self.binary: + X.data.fill(1) + + if not self.fixed_vocabulary_: + n_doc = X.shape[0] + max_doc_count = max_df if isinstance(max_df, Integral) else max_df * n_doc + min_doc_count = min_df if isinstance(min_df, Integral) else min_df * n_doc + if max_doc_count < min_doc_count: + raise ValueError("max_df corresponds to < documents than min_df") + if max_features is not None: + X = self._sort_features(X, vocabulary) + X = self._limit_features( + X, vocabulary, max_doc_count, min_doc_count, max_features + ) + if max_features is None: + X = self._sort_features(X, vocabulary) + self.vocabulary_ = vocabulary + + return X + + def transform(self, raw_documents): + """Transform documents to document-term matrix. + + Extract token counts out of raw text documents using the vocabulary + fitted with fit or the one provided to the constructor. + + Parameters + ---------- + raw_documents : iterable + An iterable which generates either str, unicode or file objects. + + Returns + ------- + X : sparse matrix of shape (n_samples, n_features) + Document-term matrix. + """ + if isinstance(raw_documents, str): + raise ValueError( + "Iterable over raw text documents expected, string object received." + ) + self._check_vocabulary() + + # use the same matrix-building strategy as fit_transform + _, X = self._count_vocab(raw_documents, fixed_vocab=True) + if self.binary: + X.data.fill(1) + return X + + def inverse_transform(self, X): + """Return terms per document with nonzero entries in X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Document-term matrix. + + Returns + ------- + X_original : list of arrays of shape (n_samples,) + List of arrays of terms. + """ + self._check_vocabulary() + # We need CSR format for fast row manipulations. + X = check_array(X, accept_sparse="csr") + n_samples = X.shape[0] + + terms = np.array(list(self.vocabulary_.keys())) + indices = np.array(list(self.vocabulary_.values())) + inverse_vocabulary = terms[np.argsort(indices)] + + if sp.issparse(X): + return [ + inverse_vocabulary[X[i, :].nonzero()[1]].ravel() + for i in range(n_samples) + ] + else: + return [ + inverse_vocabulary[np.flatnonzero(X[i, :])].ravel() + for i in range(n_samples) + ] + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. 
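A short sketch of the two introspection helpers above on a fitted CountVectorizer; get_feature_names_out returns terms in column order, and inverse_transform recovers the distinct terms present in each row (expected outputs in comments):

from sklearn.feature_extraction.text import CountVectorizer

docs = ["cat dog", "dog bird"]
vec = CountVectorizer()
X = vec.fit_transform(docs)
vec.get_feature_names_out()   # array(['bird', 'cat', 'dog'], ...)
vec.inverse_transform(X)      # [array(['cat', 'dog'], ...), array(['bird', 'dog'], ...)]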
+ """ + self._check_vocabulary() + return np.asarray( + [t for t, i in sorted(self.vocabulary_.items(), key=itemgetter(1))], + dtype=object, + ) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.string = True + tags.input_tags.two_d_array = False + return tags + + +def _make_int_array(): + """Construct an array.array of a type suitable for scipy.sparse indices.""" + return array.array(str("i")) + + +class TfidfTransformer( + OneToOneFeatureMixin, TransformerMixin, BaseEstimator, auto_wrap_output_keys=None +): + """Transform a count matrix to a normalized tf or tf-idf representation. + + Tf means term-frequency while tf-idf means term-frequency times inverse + document-frequency. This is a common term weighting scheme in information + retrieval, that has also found good use in document classification. + + The goal of using tf-idf instead of the raw frequencies of occurrence of a + token in a given document is to scale down the impact of tokens that occur + very frequently in a given corpus and that are hence empirically less + informative than features that occur in a small fraction of the training + corpus. + + The formula that is used to compute the tf-idf for a term t of a document d + in a document set is tf-idf(t, d) = tf(t, d) * idf(t), and the idf is + computed as idf(t) = log [ n / df(t) ] + 1 (if ``smooth_idf=False``), where + n is the total number of documents in the document set and df(t) is the + document frequency of t; the document frequency is the number of documents + in the document set that contain the term t. The effect of adding "1" to + the idf in the equation above is that terms with zero idf, i.e., terms + that occur in all documents in a training set, will not be entirely + ignored. + (Note that the idf formula above differs from the standard textbook + notation that defines the idf as + idf(t) = log [ n / (df(t) + 1) ]). + + If ``smooth_idf=True`` (the default), the constant "1" is added to the + numerator and denominator of the idf as if an extra document was seen + containing every term in the collection exactly once, which prevents + zero divisions: idf(t) = log [ (1 + n) / (1 + df(t)) ] + 1. + + Furthermore, the formulas used to compute tf and idf depend + on parameter settings that correspond to the SMART notation used in IR + as follows: + + Tf is "n" (natural) by default, "l" (logarithmic) when + ``sublinear_tf=True``. + Idf is "t" when use_idf is given, "n" (none) otherwise. + Normalization is "c" (cosine) when ``norm='l2'``, "n" (none) + when ``norm=None``. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + norm : {'l1', 'l2'} or None, default='l2' + Each output row will have unit norm, either: + + - 'l2': Sum of squares of vector elements is 1. The cosine + similarity between two vectors is their dot product when l2 norm has + been applied. + - 'l1': Sum of absolute values of vector elements is 1. + See :func:`~sklearn.preprocessing.normalize`. + - None: No normalization. + + use_idf : bool, default=True + Enable inverse-document-frequency reweighting. If False, idf(t) = 1. + + smooth_idf : bool, default=True + Smooth idf weights by adding one to document frequencies, as if an + extra document was seen containing every term in the collection + exactly once. Prevents zero divisions. + + sublinear_tf : bool, default=False + Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf). 
+ + Attributes + ---------- + idf_ : array of shape (n_features) + The inverse document frequency (IDF) vector; only defined + if ``use_idf`` is True. + + .. versionadded:: 0.20 + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 1.0 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + CountVectorizer : Transforms text into a sparse matrix of n-gram counts. + + TfidfVectorizer : Convert a collection of raw documents to a matrix of + TF-IDF features. + + HashingVectorizer : Convert a collection of text documents to a matrix + of token occurrences. + + References + ---------- + .. [Yates2011] R. Baeza-Yates and B. Ribeiro-Neto (2011). Modern + Information Retrieval. Addison Wesley, pp. 68-74. + + .. [MRS2008] C.D. Manning, P. Raghavan and H. Schütze (2008). + Introduction to Information Retrieval. Cambridge University + Press, pp. 118-120. + + Examples + -------- + >>> from sklearn.feature_extraction.text import TfidfTransformer + >>> from sklearn.feature_extraction.text import CountVectorizer + >>> from sklearn.pipeline import Pipeline + >>> corpus = ['this is the first document', + ... 'this document is the second document', + ... 'and this is the third one', + ... 'is this the first document'] + >>> vocabulary = ['this', 'document', 'first', 'is', 'second', 'the', + ... 'and', 'one'] + >>> pipe = Pipeline([('count', CountVectorizer(vocabulary=vocabulary)), + ... ('tfid', TfidfTransformer())]).fit(corpus) + >>> pipe['count'].transform(corpus).toarray() + array([[1, 1, 1, 1, 0, 1, 0, 0], + [1, 2, 0, 1, 1, 1, 0, 0], + [1, 0, 0, 1, 0, 1, 1, 1], + [1, 1, 1, 1, 0, 1, 0, 0]]) + >>> pipe['tfid'].idf_ + array([1. , 1.22314355, 1.51082562, 1. , 1.91629073, + 1. , 1.91629073, 1.91629073]) + >>> pipe.transform(corpus).shape + (4, 8) + """ + + _parameter_constraints: dict = { + "norm": [StrOptions({"l1", "l2"}), None], + "use_idf": ["boolean"], + "smooth_idf": ["boolean"], + "sublinear_tf": ["boolean"], + } + + def __init__(self, *, norm="l2", use_idf=True, smooth_idf=True, sublinear_tf=False): + self.norm = norm + self.use_idf = use_idf + self.smooth_idf = smooth_idf + self.sublinear_tf = sublinear_tf + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Learn the idf vector (global term weights). + + Parameters + ---------- + X : sparse matrix of shape (n_samples, n_features) + A matrix of term/token counts. + + y : None + This parameter is not needed to compute tf-idf. + + Returns + ------- + self : object + Fitted transformer. + """ + # large sparse data is not supported for 32bit platforms because + # _document_frequency uses np.bincount which works on arrays of + # dtype NPY_INTP which is int32 for 32bit platforms. See #20923 + X = validate_data( + self, X, accept_sparse=("csr", "csc"), accept_large_sparse=not _IS_32BIT + ) + if not sp.issparse(X): + X = sp.csr_matrix(X) + dtype = X.dtype if X.dtype in (np.float64, np.float32) else np.float64 + + if self.use_idf: + n_samples, _ = X.shape + df = _document_frequency(X) + df = df.astype(dtype, copy=False) + + # perform idf smoothing if required + df += float(self.smooth_idf) + n_samples += int(self.smooth_idf) + + # log+1 instead of log makes sure terms with zero idf don't get + # suppressed entirely. + # Force the dtype of `idf_` to be the same as `df`. 
In NumPy < 2, the dtype + # was depending on the value of `n_samples`. + self.idf_ = np.full_like(df, fill_value=n_samples, dtype=dtype) + self.idf_ /= df + # `np.log` preserves the dtype of `df` and thus `dtype`. + np.log(self.idf_, out=self.idf_) + self.idf_ += 1.0 + + return self + + def transform(self, X, copy=True): + """Transform a count matrix to a tf or tf-idf representation. + + Parameters + ---------- + X : sparse matrix of (n_samples, n_features) + A matrix of term/token counts. + + copy : bool, default=True + Whether to copy X and operate on the copy or perform in-place + operations. `copy=False` will only be effective with CSR sparse matrix. + + Returns + ------- + vectors : sparse matrix of shape (n_samples, n_features) + Tf-idf-weighted document-term matrix. + """ + check_is_fitted(self) + X = validate_data( + self, + X, + accept_sparse="csr", + dtype=[np.float64, np.float32], + copy=copy, + reset=False, + ) + if not sp.issparse(X): + X = sp.csr_matrix(X, dtype=X.dtype) + + if self.sublinear_tf: + np.log(X.data, X.data) + X.data += 1.0 + + if hasattr(self, "idf_"): + # the columns of X (CSR matrix) can be accessed with `X.indices `and + # multiplied with the corresponding `idf` value + X.data *= self.idf_[X.indices] + + if self.norm is not None: + X = normalize(X, norm=self.norm, copy=False) + + return X + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + # FIXME: np.float16 could be preserved if _inplace_csr_row_normalize_l2 + # accepted it. + tags.transformer_tags.preserves_dtype = ["float64", "float32"] + return tags + + +class TfidfVectorizer(CountVectorizer): + r"""Convert a collection of raw documents to a matrix of TF-IDF features. + + Equivalent to :class:`CountVectorizer` followed by + :class:`TfidfTransformer`. + + For an example of usage, see + :ref:`sphx_glr_auto_examples_text_plot_document_classification_20newsgroups.py`. + + For an efficiency comparison of the different feature extractors, see + :ref:`sphx_glr_auto_examples_text_plot_hashing_vs_dict_vectorizer.py`. + + For an example of document clustering and comparison with + :class:`~sklearn.feature_extraction.text.HashingVectorizer`, see + :ref:`sphx_glr_auto_examples_text_plot_document_clustering.py`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + input : {'filename', 'file', 'content'}, default='content' + - If `'filename'`, the sequence passed as an argument to fit is + expected to be a list of filenames that need reading to fetch + the raw content to analyze. + + - If `'file'`, the sequence items must have a 'read' method (file-like + object) that is called to fetch the bytes in memory. + + - If `'content'`, the input is expected to be a sequence of items that + can be of type string or byte. + + encoding : str, default='utf-8' + If bytes or files are given to analyze, this encoding is used to + decode. + + decode_error : {'strict', 'ignore', 'replace'}, default='strict' + Instruction on what to do if a byte sequence is given to analyze that + contains characters not of the given `encoding`. By default, it is + 'strict', meaning that a UnicodeDecodeError will be raised. Other + values are 'ignore' and 'replace'. + + strip_accents : {'ascii', 'unicode'} or callable, default=None + Remove accents and perform other character normalization + during the preprocessing step. + 'ascii' is a fast method that only works on characters that have + a direct ASCII mapping. + 'unicode' is a slightly slower method that works on any characters. 
+ None (default) means no character normalization is performed. + + Both 'ascii' and 'unicode' use NFKD normalization from + :func:`unicodedata.normalize`. + + lowercase : bool, default=True + Convert all characters to lowercase before tokenizing. + + preprocessor : callable, default=None + Override the preprocessing (string transformation) stage while + preserving the tokenizing and n-grams generation steps. + Only applies if ``analyzer`` is not callable. + + tokenizer : callable, default=None + Override the string tokenization step while preserving the + preprocessing and n-grams generation steps. + Only applies if ``analyzer == 'word'``. + + analyzer : {'word', 'char', 'char_wb'} or callable, default='word' + Whether the feature should be made of word or character n-grams. + Option 'char_wb' creates character n-grams only from text inside + word boundaries; n-grams at the edges of words are padded with space. + + If a callable is passed it is used to extract the sequence of features + out of the raw, unprocessed input. + + .. versionchanged:: 0.21 + Since v0.21, if ``input`` is ``'filename'`` or ``'file'``, the data + is first read from the file and then passed to the given callable + analyzer. + + stop_words : {'english'}, list, default=None + If a string, it is passed to _check_stop_list and the appropriate stop + list is returned. 'english' is currently the only supported string + value. + There are several known issues with 'english' and you should + consider an alternative (see :ref:`stop_words`). + + If a list, that list is assumed to contain stop words, all of which + will be removed from the resulting tokens. + Only applies if ``analyzer == 'word'``. + + If None, no stop words will be used. In this case, setting `max_df` + to a higher value, such as in the range (0.7, 1.0), can automatically detect + and filter stop words based on intra corpus document frequency of terms. + + token_pattern : str, default=r"(?u)\\b\\w\\w+\\b" + Regular expression denoting what constitutes a "token", only used + if ``analyzer == 'word'``. The default regexp selects tokens of 2 + or more alphanumeric characters (punctuation is completely ignored + and always treated as a token separator). + + If there is a capturing group in token_pattern then the + captured group content, not the entire match, becomes the token. + At most one capturing group is permitted. + + ngram_range : tuple (min_n, max_n), default=(1, 1) + The lower and upper boundary of the range of n-values for different + n-grams to be extracted. All values of n such that min_n <= n <= max_n + will be used. For example an ``ngram_range`` of ``(1, 1)`` means only + unigrams, ``(1, 2)`` means unigrams and bigrams, and ``(2, 2)`` means + only bigrams. + Only applies if ``analyzer`` is not callable. + + max_df : float or int, default=1.0 + When building the vocabulary ignore terms that have a document + frequency strictly higher than the given threshold (corpus-specific + stop words). + If float in range [0.0, 1.0], the parameter represents a proportion of + documents, integer absolute counts. + This parameter is ignored if vocabulary is not None. + + min_df : float or int, default=1 + When building the vocabulary ignore terms that have a document + frequency strictly lower than the given threshold. This value is also + called cut-off in the literature. + If float in range of [0.0, 1.0], the parameter represents a proportion + of documents, integer absolute counts. + This parameter is ignored if vocabulary is not None. 
+ + max_features : int, default=None + If not None, build a vocabulary that only consider the top + `max_features` ordered by term frequency across the corpus. + Otherwise, all features are used. + + This parameter is ignored if vocabulary is not None. + + vocabulary : Mapping or iterable, default=None + Either a Mapping (e.g., a dict) where keys are terms and values are + indices in the feature matrix, or an iterable over terms. If not + given, a vocabulary is determined from the input documents. + + binary : bool, default=False + If True, all non-zero term counts are set to 1. This does not mean + outputs will have only 0/1 values, only that the tf term in tf-idf + is binary. (Set `binary` to True, `use_idf` to False and + `norm` to None to get 0/1 outputs). + + dtype : dtype, default=float64 + Type of the matrix returned by fit_transform() or transform(). + + norm : {'l1', 'l2'} or None, default='l2' + Each output row will have unit norm, either: + + - 'l2': Sum of squares of vector elements is 1. The cosine + similarity between two vectors is their dot product when l2 norm has + been applied. + - 'l1': Sum of absolute values of vector elements is 1. + See :func:`~sklearn.preprocessing.normalize`. + - None: No normalization. + + use_idf : bool, default=True + Enable inverse-document-frequency reweighting. If False, idf(t) = 1. + + smooth_idf : bool, default=True + Smooth idf weights by adding one to document frequencies, as if an + extra document was seen containing every term in the collection + exactly once. Prevents zero divisions. + + sublinear_tf : bool, default=False + Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf). + + Attributes + ---------- + vocabulary_ : dict + A mapping of terms to feature indices. + + fixed_vocabulary_ : bool + True if a fixed vocabulary of term to indices mapping + is provided by the user. + + idf_ : array of shape (n_features,) + The inverse document frequency (IDF) vector; only defined + if ``use_idf`` is True. + + See Also + -------- + CountVectorizer : Transforms text into a sparse matrix of n-gram counts. + + TfidfTransformer : Performs the TF-IDF transformation from a provided + matrix of counts. + + Examples + -------- + >>> from sklearn.feature_extraction.text import TfidfVectorizer + >>> corpus = [ + ... 'This is the first document.', + ... 'This document is the second document.', + ... 'And this is the third one.', + ... 'Is this the first document?', + ... ] + >>> vectorizer = TfidfVectorizer() + >>> X = vectorizer.fit_transform(corpus) + >>> vectorizer.get_feature_names_out() + array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', + 'this'], ...) 
+ >>> print(X.shape) + (4, 9) + """ + + _parameter_constraints: dict = {**CountVectorizer._parameter_constraints} + _parameter_constraints.update( + { + "norm": [StrOptions({"l1", "l2"}), None], + "use_idf": ["boolean"], + "smooth_idf": ["boolean"], + "sublinear_tf": ["boolean"], + } + ) + + def __init__( + self, + *, + input="content", + encoding="utf-8", + decode_error="strict", + strip_accents=None, + lowercase=True, + preprocessor=None, + tokenizer=None, + analyzer="word", + stop_words=None, + token_pattern=r"(?u)\b\w\w+\b", + ngram_range=(1, 1), + max_df=1.0, + min_df=1, + max_features=None, + vocabulary=None, + binary=False, + dtype=np.float64, + norm="l2", + use_idf=True, + smooth_idf=True, + sublinear_tf=False, + ): + super().__init__( + input=input, + encoding=encoding, + decode_error=decode_error, + strip_accents=strip_accents, + lowercase=lowercase, + preprocessor=preprocessor, + tokenizer=tokenizer, + analyzer=analyzer, + stop_words=stop_words, + token_pattern=token_pattern, + ngram_range=ngram_range, + max_df=max_df, + min_df=min_df, + max_features=max_features, + vocabulary=vocabulary, + binary=binary, + dtype=dtype, + ) + self.norm = norm + self.use_idf = use_idf + self.smooth_idf = smooth_idf + self.sublinear_tf = sublinear_tf + + # Broadcast the TF-IDF parameters to the underlying transformer instance + # for easy grid search and repr + + @property + def idf_(self): + """Inverse document frequency vector, only defined if `use_idf=True`. + + Returns + ------- + ndarray of shape (n_features,) + """ + if not hasattr(self, "_tfidf"): + raise NotFittedError( + f"{self.__class__.__name__} is not fitted yet. Call 'fit' with " + "appropriate arguments before using this attribute." + ) + return self._tfidf.idf_ + + @idf_.setter + def idf_(self, value): + if not self.use_idf: + raise ValueError("`idf_` cannot be set when `user_idf=False`.") + if not hasattr(self, "_tfidf"): + # We should support transferring `idf_` from another `TfidfTransformer` + # and therefore, we need to create the transformer instance it does not + # exist yet. + self._tfidf = TfidfTransformer( + norm=self.norm, + use_idf=self.use_idf, + smooth_idf=self.smooth_idf, + sublinear_tf=self.sublinear_tf, + ) + self._validate_vocabulary() + if hasattr(self, "vocabulary_"): + if len(self.vocabulary_) != len(value): + raise ValueError( + "idf length = %d must be equal to vocabulary size = %d" + % (len(value), len(self.vocabulary)) + ) + self._tfidf.idf_ = value + + def _check_params(self): + if self.dtype not in FLOAT_DTYPES: + warnings.warn( + "Only {} 'dtype' should be used. {} 'dtype' will " + "be converted to np.float64.".format(FLOAT_DTYPES, self.dtype), + UserWarning, + ) + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, raw_documents, y=None): + """Learn vocabulary and idf from training set. + + Parameters + ---------- + raw_documents : iterable + An iterable which generates either str, unicode or file objects. + + y : None + This parameter is not needed to compute tfidf. + + Returns + ------- + self : object + Fitted vectorizer. + """ + self._check_params() + self._warn_for_unused_params() + self._tfidf = TfidfTransformer( + norm=self.norm, + use_idf=self.use_idf, + smooth_idf=self.smooth_idf, + sublinear_tf=self.sublinear_tf, + ) + X = super().fit_transform(raw_documents) + self._tfidf.fit(X) + return self + + def fit_transform(self, raw_documents, y=None): + """Learn vocabulary and idf, return document-term matrix. 
+ + This is equivalent to fit followed by transform, but more efficiently + implemented. + + Parameters + ---------- + raw_documents : iterable + An iterable which generates either str, unicode or file objects. + + y : None + This parameter is ignored. + + Returns + ------- + X : sparse matrix of (n_samples, n_features) + Tf-idf-weighted document-term matrix. + """ + self._check_params() + self._tfidf = TfidfTransformer( + norm=self.norm, + use_idf=self.use_idf, + smooth_idf=self.smooth_idf, + sublinear_tf=self.sublinear_tf, + ) + X = super().fit_transform(raw_documents) + self._tfidf.fit(X) + # X is already a transformed view of raw_documents so + # we set copy to False + return self._tfidf.transform(X, copy=False) + + def transform(self, raw_documents): + """Transform documents to document-term matrix. + + Uses the vocabulary and document frequencies (df) learned by fit (or + fit_transform). + + Parameters + ---------- + raw_documents : iterable + An iterable which generates either str, unicode or file objects. + + Returns + ------- + X : sparse matrix of (n_samples, n_features) + Tf-idf-weighted document-term matrix. + """ + check_is_fitted(self, msg="The TF-IDF vectorizer is not fitted") + + X = super().transform(raw_documents) + return self._tfidf.transform(X, copy=False) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.string = True + tags.input_tags.two_d_array = False + tags._skip_test = True + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_selection/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d0d2dcee909f4741d7ba79093812118dd14459d8 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/__init__.py @@ -0,0 +1,50 @@ +"""Feature selection algorithms. + +These include univariate filter selection methods and the recursive feature elimination +algorithm. 
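As a minimal sketch of the univariate filter methods this module exposes (using the public SelectKBest and f_classif imported below; the iris dataset is only an illustrative choice):

from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, f_classif

X, y = load_iris(return_X_y=True)
# Keep the two features with the highest ANOVA F-statistic.
SelectKBest(f_classif, k=2).fit_transform(X, y).shape   # (150, 2)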
+""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ._base import SelectorMixin +from ._from_model import SelectFromModel +from ._mutual_info import mutual_info_classif, mutual_info_regression +from ._rfe import RFE, RFECV +from ._sequential import SequentialFeatureSelector +from ._univariate_selection import ( + GenericUnivariateSelect, + SelectFdr, + SelectFpr, + SelectFwe, + SelectKBest, + SelectPercentile, + chi2, + f_classif, + f_oneway, + f_regression, + r_regression, +) +from ._variance_threshold import VarianceThreshold + +__all__ = [ + "RFE", + "RFECV", + "GenericUnivariateSelect", + "SelectFdr", + "SelectFpr", + "SelectFromModel", + "SelectFwe", + "SelectKBest", + "SelectPercentile", + "SelectorMixin", + "SequentialFeatureSelector", + "VarianceThreshold", + "chi2", + "f_classif", + "f_oneway", + "f_regression", + "mutual_info_classif", + "mutual_info_regression", + "r_regression", +] diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_selection/_base.py b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/_base.py new file mode 100644 index 0000000000000000000000000000000000000000..56e50e49ca30c6970366b1c7799dcca46deef859 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/_base.py @@ -0,0 +1,267 @@ +"""Generic feature selection mixin""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from abc import ABCMeta, abstractmethod +from operator import attrgetter + +import numpy as np +from scipy.sparse import csc_matrix, issparse + +from ..base import TransformerMixin +from ..utils import _safe_indexing, check_array, safe_sqr +from ..utils._set_output import _get_output_config +from ..utils._tags import get_tags +from ..utils.validation import ( + _check_feature_names_in, + _is_pandas_df, + check_is_fitted, + validate_data, +) + + +class SelectorMixin(TransformerMixin, metaclass=ABCMeta): + """ + Transformer mixin that performs feature selection given a support mask + + This mixin provides a feature selector implementation with `transform` and + `inverse_transform` functionality given an implementation of + `_get_support_mask`. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.datasets import load_iris + >>> from sklearn.base import BaseEstimator + >>> from sklearn.feature_selection import SelectorMixin + >>> class FeatureSelector(SelectorMixin, BaseEstimator): + ... def fit(self, X, y=None): + ... self.n_features_in_ = X.shape[1] + ... return self + ... def _get_support_mask(self): + ... mask = np.zeros(self.n_features_in_, dtype=bool) + ... mask[:2] = True # select the first two features + ... return mask + >>> X, y = load_iris(return_X_y=True) + >>> FeatureSelector().fit_transform(X, y).shape + (150, 2) + """ + + def get_support(self, indices=False): + """ + Get a mask, or integer index, of the features selected. + + Parameters + ---------- + indices : bool, default=False + If True, the return value will be an array of integers, rather + than a boolean mask. + + Returns + ------- + support : array + An index that selects the retained features from a feature vector. + If `indices` is False, this is a boolean array of shape + [# input features], in which an element is True iff its + corresponding feature is selected for retention. If `indices` is + True, this is an integer array of shape [# output features] whose + values are indices into the input feature vector. 
+ """ + mask = self._get_support_mask() + return mask if not indices else np.nonzero(mask)[0] + + @abstractmethod + def _get_support_mask(self): + """ + Get the boolean mask indicating which features are selected + + Returns + ------- + support : boolean array of shape [# input features] + An element is True iff its corresponding feature is selected for + retention. + """ + + def transform(self, X): + """Reduce X to the selected features. + + Parameters + ---------- + X : array of shape [n_samples, n_features] + The input samples. + + Returns + ------- + X_r : array of shape [n_samples, n_selected_features] + The input samples with only the selected features. + """ + # Preserve X when X is a dataframe and the output is configured to + # be pandas. + output_config_dense = _get_output_config("transform", estimator=self)["dense"] + preserve_X = output_config_dense != "default" and _is_pandas_df(X) + + # note: we use get_tags instead of __sklearn_tags__ because this is a + # public Mixin. + X = validate_data( + self, + X, + dtype=None, + accept_sparse="csr", + ensure_all_finite=not get_tags(self).input_tags.allow_nan, + skip_check_array=preserve_X, + reset=False, + ) + return self._transform(X) + + def _transform(self, X): + """Reduce X to the selected features.""" + mask = self.get_support() + if not mask.any(): + warnings.warn( + ( + "No features were selected: either the data is" + " too noisy or the selection test too strict." + ), + UserWarning, + ) + if hasattr(X, "iloc"): + return X.iloc[:, :0] + return np.empty(0, dtype=X.dtype).reshape((X.shape[0], 0)) + return _safe_indexing(X, mask, axis=1) + + def inverse_transform(self, X): + """Reverse the transformation operation. + + Parameters + ---------- + X : array of shape [n_samples, n_selected_features] + The input samples. + + Returns + ------- + X_original : array of shape [n_samples, n_original_features] + `X` with columns of zeros inserted where features would have + been removed by :meth:`transform`. + """ + if issparse(X): + X = X.tocsc() + # insert additional entries in indptr: + # e.g. if transform changed indptr from [0 2 6 7] to [0 2 3] + # col_nonzeros here will be [2 0 1] so indptr becomes [0 2 2 3] + it = self.inverse_transform(np.diff(X.indptr).reshape(1, -1)) + col_nonzeros = it.ravel() + indptr = np.concatenate([[0], np.cumsum(col_nonzeros)]) + Xt = csc_matrix( + (X.data, X.indices, indptr), + shape=(X.shape[0], len(indptr) - 1), + dtype=X.dtype, + ) + return Xt + + support = self.get_support() + X = check_array(X, dtype=None) + if support.sum() != X.shape[1]: + raise ValueError("X has a different shape than during fitting.") + + if X.ndim == 1: + X = X[None, :] + Xt = np.zeros((X.shape[0], support.size), dtype=X.dtype) + Xt[:, support] = X + return Xt + + def get_feature_names_out(self, input_features=None): + """Mask feature names according to selected features. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. + + - If `input_features` is `None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then the following input feature names are generated: + `["x0", "x1", ..., "x(n_features_in_ - 1)"]`. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. 
+ """ + check_is_fitted(self) + input_features = _check_feature_names_in(self, input_features) + return input_features[self.get_support()] + + +def _get_feature_importances(estimator, getter, transform_func=None, norm_order=1): + """ + Retrieve and aggregate (ndim > 1) the feature importances + from an estimator. Also optionally applies transformation. + + Parameters + ---------- + estimator : estimator + A scikit-learn estimator from which we want to get the feature + importances. + + getter : "auto", str or callable + An attribute or a callable to get the feature importance. If `"auto"`, + `estimator` is expected to expose `coef_` or `feature_importances`. + + transform_func : {"norm", "square"}, default=None + The transform to apply to the feature importances. By default (`None`) + no transformation is applied. + + norm_order : int, default=1 + The norm order to apply when `transform_func="norm"`. Only applied + when `importances.ndim > 1`. + + Returns + ------- + importances : ndarray of shape (n_features,) + The features importances, optionally transformed. + """ + if isinstance(getter, str): + if getter == "auto": + if hasattr(estimator, "coef_"): + getter = attrgetter("coef_") + elif hasattr(estimator, "feature_importances_"): + getter = attrgetter("feature_importances_") + else: + raise ValueError( + "when `importance_getter=='auto'`, the underlying " + f"estimator {estimator.__class__.__name__} should have " + "`coef_` or `feature_importances_` attribute. Either " + "pass a fitted estimator to feature selector or call fit " + "before calling transform." + ) + else: + getter = attrgetter(getter) + elif not callable(getter): + raise ValueError("`importance_getter` has to be a string or `callable`") + + importances = getter(estimator) + + if transform_func is None: + return importances + elif transform_func == "norm": + if importances.ndim == 1: + importances = np.abs(importances) + else: + importances = np.linalg.norm(importances, axis=0, ord=norm_order) + elif transform_func == "square": + if importances.ndim == 1: + importances = safe_sqr(importances) + else: + importances = safe_sqr(importances).sum(axis=0) + else: + raise ValueError( + "Valid values for `transform_func` are " + "None, 'norm' and 'square'. 
Those two " + "transformation are only supported now" + ) + + return importances diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_selection/_from_model.py b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/_from_model.py new file mode 100644 index 0000000000000000000000000000000000000000..3b2c73c6cbfaeeca449af4b0c04388dbe10be8b7 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/_from_model.py @@ -0,0 +1,513 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from copy import deepcopy +from numbers import Integral, Real + +import numpy as np + +from ..base import BaseEstimator, MetaEstimatorMixin, _fit_context, clone +from ..exceptions import NotFittedError +from ..utils._param_validation import HasMethods, Interval, Options +from ..utils._tags import get_tags +from ..utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + _routing_enabled, + process_routing, +) +from ..utils.metaestimators import available_if +from ..utils.validation import ( + _check_feature_names, + _estimator_has, + _num_features, + check_is_fitted, + check_scalar, +) +from ._base import SelectorMixin, _get_feature_importances + + +def _calculate_threshold(estimator, importances, threshold): + """Interpret the threshold value""" + + if threshold is None: + # determine default from estimator + est_name = estimator.__class__.__name__ + is_l1_penalized = hasattr(estimator, "penalty") and estimator.penalty == "l1" + is_lasso = "Lasso" in est_name + is_elasticnet_l1_penalized = est_name == "ElasticNet" and ( + hasattr(estimator, "l1_ratio") and np.isclose(estimator.l1_ratio, 1.0) + ) + is_elasticnetcv_l1_penalized = est_name == "ElasticNetCV" and ( + hasattr(estimator, "l1_ratio_") and np.isclose(estimator.l1_ratio_, 1.0) + ) + if ( + is_l1_penalized + or is_lasso + or is_elasticnet_l1_penalized + or is_elasticnetcv_l1_penalized + ): + # the natural default threshold is 0 when l1 penalty was used + threshold = 1e-5 + else: + threshold = "mean" + + if isinstance(threshold, str): + if "*" in threshold: + scale, reference = threshold.split("*") + scale = float(scale.strip()) + reference = reference.strip() + + if reference == "median": + reference = np.median(importances) + elif reference == "mean": + reference = np.mean(importances) + else: + raise ValueError("Unknown reference: " + reference) + + threshold = scale * reference + + elif threshold == "median": + threshold = np.median(importances) + + elif threshold == "mean": + threshold = np.mean(importances) + + else: + raise ValueError( + "Expected threshold='mean' or threshold='median' got %s" % threshold + ) + + else: + threshold = float(threshold) + + return threshold + + +class SelectFromModel(MetaEstimatorMixin, SelectorMixin, BaseEstimator): + """Meta-transformer for selecting features based on importance weights. + + .. versionadded:: 0.17 + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimator : object + The base estimator from which the transformer is built. + This can be both a fitted (if ``prefit`` is set to True) + or a non-fitted estimator. The estimator should have a + ``feature_importances_`` or ``coef_`` attribute after fitting. + Otherwise, the ``importance_getter`` parameter should be used. + + threshold : str or float, default=None + The threshold value to use for feature selection. Features whose + absolute importance value is greater or equal are kept while the others + are discarded. If "median" (resp. 
"mean"), then the ``threshold`` value + is the median (resp. the mean) of the feature importances. A scaling + factor (e.g., "1.25*mean") may also be used. If None and if the + estimator has a parameter penalty set to l1, either explicitly + or implicitly (e.g, Lasso), the threshold used is 1e-5. + Otherwise, "mean" is used by default. + + prefit : bool, default=False + Whether a prefit model is expected to be passed into the constructor + directly or not. + If `True`, `estimator` must be a fitted estimator. + If `False`, `estimator` is fitted and updated by calling + `fit` and `partial_fit`, respectively. + + norm_order : non-zero int, inf, -inf, default=1 + Order of the norm used to filter the vectors of coefficients below + ``threshold`` in the case where the ``coef_`` attribute of the + estimator is of dimension 2. + + max_features : int, callable, default=None + The maximum number of features to select. + + - If an integer, then it specifies the maximum number of features to + allow. + - If a callable, then it specifies how to calculate the maximum number of + features allowed by using the output of `max_features(X)`. + - If `None`, then all features are kept. + + To only select based on ``max_features``, set ``threshold=-np.inf``. + + .. versionadded:: 0.20 + .. versionchanged:: 1.1 + `max_features` accepts a callable. + + importance_getter : str or callable, default='auto' + If 'auto', uses the feature importance either through a ``coef_`` + attribute or ``feature_importances_`` attribute of estimator. + + Also accepts a string that specifies an attribute name/path + for extracting feature importance (implemented with `attrgetter`). + For example, give `regressor_.coef_` in case of + :class:`~sklearn.compose.TransformedTargetRegressor` or + `named_steps.clf.feature_importances_` in case of + :class:`~sklearn.pipeline.Pipeline` with its last step named `clf`. + + If `callable`, overrides the default feature importance getter. + The callable is passed with the fitted estimator and it should + return importance for each feature. + + .. versionadded:: 0.24 + + Attributes + ---------- + estimator_ : estimator + The base estimator from which the transformer is built. This attribute + exist only when `fit` has been called. + + - If `prefit=True`, it is a deep copy of `estimator`. + - If `prefit=False`, it is a clone of `estimator` and fit on the data + passed to `fit` or `partial_fit`. + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying estimator exposes such an attribute when fit. + + .. versionadded:: 0.24 + + max_features_ : int + Maximum number of features calculated during :term:`fit`. Only defined + if the ``max_features`` is not `None`. + + - If `max_features` is an `int`, then `max_features_ = max_features`. + - If `max_features` is a callable, then `max_features_ = max_features(X)`. + + .. versionadded:: 1.1 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + threshold_ : float + The threshold value used for feature selection. + + See Also + -------- + RFE : Recursive feature elimination based on importance weights. + RFECV : Recursive feature elimination with built-in cross-validated + selection of the best number of features. + SequentialFeatureSelector : Sequential cross-validation based feature + selection. Does not rely on importance weights. 
+ + Notes + ----- + Allows NaN/Inf in the input if the underlying estimator does as well. + + Examples + -------- + >>> from sklearn.feature_selection import SelectFromModel + >>> from sklearn.linear_model import LogisticRegression + >>> X = [[ 0.87, -1.34, 0.31 ], + ... [-2.79, -0.02, -0.85 ], + ... [-1.34, -0.48, -2.55 ], + ... [ 1.92, 1.48, 0.65 ]] + >>> y = [0, 1, 0, 1] + >>> selector = SelectFromModel(estimator=LogisticRegression()).fit(X, y) + >>> selector.estimator_.coef_ + array([[-0.3252, 0.8345, 0.4976]]) + >>> selector.threshold_ + np.float64(0.55249) + >>> selector.get_support() + array([False, True, False]) + >>> selector.transform(X) + array([[-1.34], + [-0.02], + [-0.48], + [ 1.48]]) + + Using a callable to create a selector that can use no more than half + of the input features. + + >>> def half_callable(X): + ... return round(len(X[0]) / 2) + >>> half_selector = SelectFromModel(estimator=LogisticRegression(), + ... max_features=half_callable) + >>> _ = half_selector.fit(X, y) + >>> half_selector.max_features_ + 2 + """ + + _parameter_constraints: dict = { + "estimator": [HasMethods("fit")], + "threshold": [Interval(Real, None, None, closed="both"), str, None], + "prefit": ["boolean"], + "norm_order": [ + Interval(Integral, None, -1, closed="right"), + Interval(Integral, 1, None, closed="left"), + Options(Real, {np.inf, -np.inf}), + ], + "max_features": [Interval(Integral, 0, None, closed="left"), callable, None], + "importance_getter": [str, callable], + } + + def __init__( + self, + estimator, + *, + threshold=None, + prefit=False, + norm_order=1, + max_features=None, + importance_getter="auto", + ): + self.estimator = estimator + self.threshold = threshold + self.prefit = prefit + self.importance_getter = importance_getter + self.norm_order = norm_order + self.max_features = max_features + + def _get_support_mask(self): + estimator = getattr(self, "estimator_", self.estimator) + max_features = getattr(self, "max_features_", self.max_features) + + if self.prefit: + try: + check_is_fitted(self.estimator) + except NotFittedError as exc: + raise NotFittedError( + "When `prefit=True`, `estimator` is expected to be a fitted " + "estimator." + ) from exc + if callable(max_features): + # This branch is executed when `transform` is called directly and thus + # `max_features_` is not set and we fallback using `self.max_features` + # that is not validated + raise NotFittedError( + "When `prefit=True` and `max_features` is a callable, call `fit` " + "before calling `transform`." + ) + elif max_features is not None and not isinstance(max_features, Integral): + raise ValueError( + f"`max_features` must be an integer. Got `max_features={max_features}` " + "instead." 
+ ) + + scores = _get_feature_importances( + estimator=estimator, + getter=self.importance_getter, + transform_func="norm", + norm_order=self.norm_order, + ) + threshold = _calculate_threshold(estimator, scores, self.threshold) + if self.max_features is not None: + mask = np.zeros_like(scores, dtype=bool) + candidate_indices = np.argsort(-scores, kind="mergesort")[:max_features] + mask[candidate_indices] = True + else: + mask = np.ones_like(scores, dtype=bool) + mask[scores < threshold] = False + return mask + + def _check_max_features(self, X): + if self.max_features is not None: + n_features = _num_features(X) + + if callable(self.max_features): + max_features = self.max_features(X) + else: # int + max_features = self.max_features + + check_scalar( + max_features, + "max_features", + Integral, + min_val=0, + max_val=n_features, + ) + self.max_features_ = max_features + + @_fit_context( + # SelectFromModel.estimator is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y=None, **fit_params): + """Fit the SelectFromModel meta-transformer. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The training input samples. + + y : array-like of shape (n_samples,), default=None + The target values (integers that correspond to classes in + classification, real numbers in regression). + + **fit_params : dict + - If `enable_metadata_routing=False` (default): Parameters directly passed + to the `fit` method of the sub-estimator. They are ignored if + `prefit=True`. + + - If `enable_metadata_routing=True`: Parameters safely routed to the `fit` + method of the sub-estimator. They are ignored if `prefit=True`. + + .. versionchanged:: 1.4 + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + Fitted estimator. + """ + self._check_max_features(X) + + if self.prefit: + try: + check_is_fitted(self.estimator) + except NotFittedError as exc: + raise NotFittedError( + "When `prefit=True`, `estimator` is expected to be a fitted " + "estimator." + ) from exc + self.estimator_ = deepcopy(self.estimator) + else: + if _routing_enabled(): + routed_params = process_routing(self, "fit", **fit_params) + self.estimator_ = clone(self.estimator) + self.estimator_.fit(X, y, **routed_params.estimator.fit) + else: + # TODO(SLEP6): remove when metadata routing cannot be disabled. + self.estimator_ = clone(self.estimator) + self.estimator_.fit(X, y, **fit_params) + + if hasattr(self.estimator_, "feature_names_in_"): + self.feature_names_in_ = self.estimator_.feature_names_in_ + else: + _check_feature_names(self, X, reset=True) + + return self + + @property + def threshold_(self): + """Threshold value used for feature selection.""" + scores = _get_feature_importances( + estimator=self.estimator_, + getter=self.importance_getter, + transform_func="norm", + norm_order=self.norm_order, + ) + return _calculate_threshold(self.estimator, scores, self.threshold) + + @available_if(_estimator_has("partial_fit")) + @_fit_context( + # SelectFromModel.estimator is not validated yet + prefer_skip_nested_validation=False + ) + def partial_fit(self, X, y=None, **partial_fit_params): + """Fit the SelectFromModel meta-transformer only once. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The training input samples. + + y : array-like of shape (n_samples,), default=None + The target values (integers that correspond to classes in + classification, real numbers in regression). 
+ + **partial_fit_params : dict + - If `enable_metadata_routing=False` (default): Parameters directly passed + to the `partial_fit` method of the sub-estimator. + + - If `enable_metadata_routing=True`: Parameters passed to the `partial_fit` + method of the sub-estimator. They are ignored if `prefit=True`. + + .. versionchanged:: 1.4 + + `**partial_fit_params` are routed to the sub-estimator, if + `enable_metadata_routing=True` is set via + :func:`~sklearn.set_config`, which allows for aliasing. + + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + Fitted estimator. + """ + first_call = not hasattr(self, "estimator_") + + if first_call: + self._check_max_features(X) + + if self.prefit: + if first_call: + try: + check_is_fitted(self.estimator) + except NotFittedError as exc: + raise NotFittedError( + "When `prefit=True`, `estimator` is expected to be a fitted " + "estimator." + ) from exc + self.estimator_ = deepcopy(self.estimator) + return self + + if first_call: + self.estimator_ = clone(self.estimator) + if _routing_enabled(): + routed_params = process_routing(self, "partial_fit", **partial_fit_params) + self.estimator_ = clone(self.estimator) + self.estimator_.partial_fit(X, y, **routed_params.estimator.partial_fit) + else: + # TODO(SLEP6): remove when metadata routing cannot be disabled. + self.estimator_.partial_fit(X, y, **partial_fit_params) + + if hasattr(self.estimator_, "feature_names_in_"): + self.feature_names_in_ = self.estimator_.feature_names_in_ + else: + _check_feature_names(self, X, reset=first_call) + + return self + + @property + def n_features_in_(self): + """Number of features seen during `fit`.""" + # For consistency with other estimators we raise a AttributeError so + # that hasattr() fails if the estimator isn't fitted. + try: + check_is_fitted(self) + except NotFittedError as nfe: + raise AttributeError( + "{} object has no n_features_in_ attribute.".format( + self.__class__.__name__ + ) + ) from nfe + + return self.estimator_.n_features_in_ + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.4 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. 
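+
+        Examples
+        --------
+        A sketch of how routing is typically used (editor's illustration; assumes
+        metadata routing is enabled and that the sub-estimator's ``fit`` accepts
+        ``sample_weight``):
+
+        >>> import sklearn
+        >>> from sklearn.feature_selection import SelectFromModel
+        >>> from sklearn.linear_model import LogisticRegression
+        >>> with sklearn.config_context(enable_metadata_routing=True):
+        ...     log_reg = LogisticRegression().set_fit_request(sample_weight=True)
+        ...     selector = SelectFromModel(log_reg)
+        ...     # selector.fit(X, y, sample_weight=w) would now route
+        ...     # `sample_weight` to LogisticRegression.fit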
+ """ + router = MetadataRouter(owner=self.__class__.__name__).add( + estimator=self.estimator, + method_mapping=MethodMapping() + .add(caller="partial_fit", callee="partial_fit") + .add(caller="fit", callee="fit"), + ) + return router + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = get_tags(self.estimator).input_tags.sparse + tags.input_tags.allow_nan = get_tags(self.estimator).input_tags.allow_nan + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_selection/_mutual_info.py b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/_mutual_info.py new file mode 100644 index 0000000000000000000000000000000000000000..aef9097879fcaf02efa50f7c5e3d33f492e14495 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/_mutual_info.py @@ -0,0 +1,580 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from numbers import Integral + +import numpy as np +from scipy.sparse import issparse +from scipy.special import digamma + +from ..metrics.cluster import mutual_info_score +from ..neighbors import KDTree, NearestNeighbors +from ..preprocessing import scale +from ..utils import check_random_state +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.multiclass import check_classification_targets +from ..utils.parallel import Parallel, delayed +from ..utils.validation import check_array, check_X_y + + +def _compute_mi_cc(x, y, n_neighbors): + """Compute mutual information between two continuous variables. + + Parameters + ---------- + x, y : ndarray, shape (n_samples,) + Samples of two continuous random variables, must have an identical + shape. + + n_neighbors : int + Number of nearest neighbors to search for each point, see [1]_. + + Returns + ------- + mi : float + Estimated mutual information in nat units. If it turned out to be + negative it is replaced by 0. + + Notes + ----- + True mutual information can't be negative. If its estimate by a numerical + method is negative, it means (providing the method is adequate) that the + mutual information is close to 0 and replacing it by 0 is a reasonable + strategy. + + References + ---------- + .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual + information". Phys. Rev. E 69, 2004. + """ + n_samples = x.size + + x = x.reshape((-1, 1)) + y = y.reshape((-1, 1)) + xy = np.hstack((x, y)) + + # Here we rely on NearestNeighbors to select the fastest algorithm. + nn = NearestNeighbors(metric="chebyshev", n_neighbors=n_neighbors) + + nn.fit(xy) + radius = nn.kneighbors()[0] + radius = np.nextafter(radius[:, -1], 0) + + # KDTree is explicitly fit to allow for the querying of number of + # neighbors within a specified radius + kd = KDTree(x, metric="chebyshev") + nx = kd.query_radius(x, radius, count_only=True, return_distance=False) + nx = np.array(nx) - 1.0 + + kd = KDTree(y, metric="chebyshev") + ny = kd.query_radius(y, radius, count_only=True, return_distance=False) + ny = np.array(ny) - 1.0 + + mi = ( + digamma(n_samples) + + digamma(n_neighbors) + - np.mean(digamma(nx + 1)) + - np.mean(digamma(ny + 1)) + ) + + return max(0, mi) + + +def _compute_mi_cd(c, d, n_neighbors): + """Compute mutual information between continuous and discrete variables. + + Parameters + ---------- + c : ndarray, shape (n_samples,) + Samples of a continuous random variable. + + d : ndarray, shape (n_samples,) + Samples of a discrete random variable. 
+ + n_neighbors : int + Number of nearest neighbors to search for each point, see [1]_. + + Returns + ------- + mi : float + Estimated mutual information in nat units. If it turned out to be + negative it is replaced by 0. + + Notes + ----- + True mutual information can't be negative. If its estimate by a numerical + method is negative, it means (providing the method is adequate) that the + mutual information is close to 0 and replacing it by 0 is a reasonable + strategy. + + References + ---------- + .. [1] B. C. Ross "Mutual Information between Discrete and Continuous + Data Sets". PLoS ONE 9(2), 2014. + """ + n_samples = c.shape[0] + c = c.reshape((-1, 1)) + + radius = np.empty(n_samples) + label_counts = np.empty(n_samples) + k_all = np.empty(n_samples) + nn = NearestNeighbors() + for label in np.unique(d): + mask = d == label + count = np.sum(mask) + if count > 1: + k = min(n_neighbors, count - 1) + nn.set_params(n_neighbors=k) + nn.fit(c[mask]) + r = nn.kneighbors()[0] + radius[mask] = np.nextafter(r[:, -1], 0) + k_all[mask] = k + label_counts[mask] = count + + # Ignore points with unique labels. + mask = label_counts > 1 + n_samples = np.sum(mask) + label_counts = label_counts[mask] + k_all = k_all[mask] + c = c[mask] + radius = radius[mask] + + kd = KDTree(c) + m_all = kd.query_radius(c, radius, count_only=True, return_distance=False) + m_all = np.array(m_all) + + mi = ( + digamma(n_samples) + + np.mean(digamma(k_all)) + - np.mean(digamma(label_counts)) + - np.mean(digamma(m_all)) + ) + + return max(0, mi) + + +def _compute_mi(x, y, x_discrete, y_discrete, n_neighbors=3): + """Compute mutual information between two variables. + + This is a simple wrapper which selects a proper function to call based on + whether `x` and `y` are discrete or not. + """ + if x_discrete and y_discrete: + return mutual_info_score(x, y) + elif x_discrete and not y_discrete: + return _compute_mi_cd(y, x, n_neighbors) + elif not x_discrete and y_discrete: + return _compute_mi_cd(x, y, n_neighbors) + else: + return _compute_mi_cc(x, y, n_neighbors) + + +def _iterate_columns(X, columns=None): + """Iterate over columns of a matrix. + + Parameters + ---------- + X : ndarray or csc_matrix, shape (n_samples, n_features) + Matrix over which to iterate. + + columns : iterable or None, default=None + Indices of columns to iterate over. If None, iterate over all columns. + + Yields + ------ + x : ndarray, shape (n_samples,) + Columns of `X` in dense format. + """ + if columns is None: + columns = range(X.shape[1]) + + if issparse(X): + for i in columns: + x = np.zeros(X.shape[0]) + start_ptr, end_ptr = X.indptr[i], X.indptr[i + 1] + x[X.indices[start_ptr:end_ptr]] = X.data[start_ptr:end_ptr] + yield x + else: + for i in columns: + yield X[:, i] + + +def _estimate_mi( + X, + y, + *, + discrete_features="auto", + discrete_target=False, + n_neighbors=3, + copy=True, + random_state=None, + n_jobs=None, +): + """Estimate mutual information between the features and the target. + + Parameters + ---------- + X : array-like or sparse matrix, shape (n_samples, n_features) + Feature matrix. + + y : array-like of shape (n_samples,) + Target vector. + + discrete_features : {'auto', bool, array-like}, default='auto' + If bool, then determines whether to consider all features discrete + or continuous. If array, then it should be either a boolean mask + with shape (n_features,) or array with indices of discrete features. + If 'auto', it is assigned to False for dense `X` and to True for + sparse `X`. 
+ + discrete_target : bool, default=False + Whether to consider `y` as a discrete variable. + + n_neighbors : int, default=3 + Number of neighbors to use for MI estimation for continuous variables, + see [1]_ and [2]_. Higher values reduce variance of the estimation, but + could introduce a bias. + + copy : bool, default=True + Whether to make a copy of the given data. If set to False, the initial + data will be overwritten. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for adding small noise to + continuous variables in order to remove repeated values. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + n_jobs : int, default=None + The number of jobs to use for computing the mutual information. + The parallelization is done on the columns of `X`. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + .. versionadded:: 1.5 + + + Returns + ------- + mi : ndarray, shape (n_features,) + Estimated mutual information between each feature and the target in + nat units. A negative value will be replaced by 0. + + References + ---------- + .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual + information". Phys. Rev. E 69, 2004. + .. [2] B. C. Ross "Mutual Information between Discrete and Continuous + Data Sets". PLoS ONE 9(2), 2014. + """ + X, y = check_X_y(X, y, accept_sparse="csc", y_numeric=not discrete_target) + n_samples, n_features = X.shape + + if isinstance(discrete_features, (str, bool)): + if isinstance(discrete_features, str): + if discrete_features == "auto": + discrete_features = issparse(X) + else: + raise ValueError("Invalid string value for discrete_features.") + discrete_mask = np.empty(n_features, dtype=bool) + discrete_mask.fill(discrete_features) + else: + discrete_features = check_array(discrete_features, ensure_2d=False) + if discrete_features.dtype != "bool": + discrete_mask = np.zeros(n_features, dtype=bool) + discrete_mask[discrete_features] = True + else: + discrete_mask = discrete_features + + continuous_mask = ~discrete_mask + if np.any(continuous_mask) and issparse(X): + raise ValueError("Sparse matrix `X` can't have continuous features.") + + rng = check_random_state(random_state) + if np.any(continuous_mask): + X = X.astype(np.float64, copy=copy) + X[:, continuous_mask] = scale( + X[:, continuous_mask], with_mean=False, copy=False + ) + + # Add small noise to continuous features as advised in Kraskov et. al. 
+ means = np.maximum(1, np.mean(np.abs(X[:, continuous_mask]), axis=0)) + X[:, continuous_mask] += ( + 1e-10 + * means + * rng.standard_normal(size=(n_samples, np.sum(continuous_mask))) + ) + + if not discrete_target: + y = scale(y, with_mean=False) + y += ( + 1e-10 + * np.maximum(1, np.mean(np.abs(y))) + * rng.standard_normal(size=n_samples) + ) + + mi = Parallel(n_jobs=n_jobs)( + delayed(_compute_mi)(x, y, discrete_feature, discrete_target, n_neighbors) + for x, discrete_feature in zip(_iterate_columns(X), discrete_mask) + ) + + return np.array(mi) + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "y": ["array-like"], + "discrete_features": [StrOptions({"auto"}), "boolean", "array-like"], + "n_neighbors": [Interval(Integral, 1, None, closed="left")], + "copy": ["boolean"], + "random_state": ["random_state"], + "n_jobs": [Integral, None], + }, + prefer_skip_nested_validation=True, +) +def mutual_info_regression( + X, + y, + *, + discrete_features="auto", + n_neighbors=3, + copy=True, + random_state=None, + n_jobs=None, +): + """Estimate mutual information for a continuous target variable. + + Mutual information (MI) [1]_ between two random variables is a non-negative + value, which measures the dependency between the variables. It is equal + to zero if and only if two random variables are independent, and higher + values mean higher dependency. + + The function relies on nonparametric methods based on entropy estimation + from k-nearest neighbors distances as described in [2]_ and [3]_. Both + methods are based on the idea originally proposed in [4]_. + + It can be used for univariate features selection, read more in the + :ref:`User Guide `. + + Parameters + ---------- + X : array-like or sparse matrix, shape (n_samples, n_features) + Feature matrix. + + y : array-like of shape (n_samples,) + Target vector. + + discrete_features : {'auto', bool, array-like}, default='auto' + If bool, then determines whether to consider all features discrete + or continuous. If array, then it should be either a boolean mask + with shape (n_features,) or array with indices of discrete features. + If 'auto', it is assigned to False for dense `X` and to True for + sparse `X`. + + n_neighbors : int, default=3 + Number of neighbors to use for MI estimation for continuous variables, + see [2]_ and [3]_. Higher values reduce variance of the estimation, but + could introduce a bias. + + copy : bool, default=True + Whether to make a copy of the given data. If set to False, the initial + data will be overwritten. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for adding small noise to + continuous variables in order to remove repeated values. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + n_jobs : int, default=None + The number of jobs to use for computing the mutual information. + The parallelization is done on the columns of `X`. + + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + .. versionadded:: 1.5 + + Returns + ------- + mi : ndarray, shape (n_features,) + Estimated mutual information between each feature and the target in + nat units. + + Notes + ----- + 1. The term "discrete features" is used instead of naming them + "categorical", because it describes the essence more accurately. 
+ For example, pixel intensities of an image are discrete features + (but hardly categorical) and you will get better results if mark them + as such. Also note, that treating a continuous variable as discrete and + vice versa will usually give incorrect results, so be attentive about + that. + 2. True mutual information can't be negative. If its estimate turns out + to be negative, it is replaced by zero. + + References + ---------- + .. [1] `Mutual Information + `_ + on Wikipedia. + .. [2] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual + information". Phys. Rev. E 69, 2004. + .. [3] B. C. Ross "Mutual Information between Discrete and Continuous + Data Sets". PLoS ONE 9(2), 2014. + .. [4] L. F. Kozachenko, N. N. Leonenko, "Sample Estimate of the Entropy + of a Random Vector", Probl. Peredachi Inf., 23:2 (1987), 9-16 + + Examples + -------- + >>> from sklearn.datasets import make_regression + >>> from sklearn.feature_selection import mutual_info_regression + >>> X, y = make_regression( + ... n_samples=50, n_features=3, n_informative=1, noise=1e-4, random_state=42 + ... ) + >>> mutual_info_regression(X, y) + array([0.117, 2.645, 0.0287]) + """ + return _estimate_mi( + X, + y, + discrete_features=discrete_features, + discrete_target=False, + n_neighbors=n_neighbors, + copy=copy, + random_state=random_state, + n_jobs=n_jobs, + ) + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "y": ["array-like"], + "discrete_features": [StrOptions({"auto"}), "boolean", "array-like"], + "n_neighbors": [Interval(Integral, 1, None, closed="left")], + "copy": ["boolean"], + "random_state": ["random_state"], + "n_jobs": [Integral, None], + }, + prefer_skip_nested_validation=True, +) +def mutual_info_classif( + X, + y, + *, + discrete_features="auto", + n_neighbors=3, + copy=True, + random_state=None, + n_jobs=None, +): + """Estimate mutual information for a discrete target variable. + + Mutual information (MI) [1]_ between two random variables is a non-negative + value, which measures the dependency between the variables. It is equal + to zero if and only if two random variables are independent, and higher + values mean higher dependency. + + The function relies on nonparametric methods based on entropy estimation + from k-nearest neighbors distances as described in [2]_ and [3]_. Both + methods are based on the idea originally proposed in [4]_. + + It can be used for univariate features selection, read more in the + :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Feature matrix. + + y : array-like of shape (n_samples,) + Target vector. + + discrete_features : 'auto', bool or array-like, default='auto' + If bool, then determines whether to consider all features discrete + or continuous. If array, then it should be either a boolean mask + with shape (n_features,) or array with indices of discrete features. + If 'auto', it is assigned to False for dense `X` and to True for + sparse `X`. + + n_neighbors : int, default=3 + Number of neighbors to use for MI estimation for continuous variables, + see [2]_ and [3]_. Higher values reduce variance of the estimation, but + could introduce a bias. + + copy : bool, default=True + Whether to make a copy of the given data. If set to False, the initial + data will be overwritten. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for adding small noise to + continuous variables in order to remove repeated values. 
+ Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + n_jobs : int, default=None + The number of jobs to use for computing the mutual information. + The parallelization is done on the columns of `X`. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + .. versionadded:: 1.5 + + Returns + ------- + mi : ndarray, shape (n_features,) + Estimated mutual information between each feature and the target in + nat units. + + Notes + ----- + 1. The term "discrete features" is used instead of naming them + "categorical", because it describes the essence more accurately. + For example, pixel intensities of an image are discrete features + (but hardly categorical) and you will get better results if mark them + as such. Also note, that treating a continuous variable as discrete and + vice versa will usually give incorrect results, so be attentive about + that. + 2. True mutual information can't be negative. If its estimate turns out + to be negative, it is replaced by zero. + + References + ---------- + .. [1] `Mutual Information + `_ + on Wikipedia. + .. [2] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual + information". Phys. Rev. E 69, 2004. + .. [3] B. C. Ross "Mutual Information between Discrete and Continuous + Data Sets". PLoS ONE 9(2), 2014. + .. [4] L. F. Kozachenko, N. N. Leonenko, "Sample Estimate of the Entropy + of a Random Vector:, Probl. Peredachi Inf., 23:2 (1987), 9-16 + + Examples + -------- + >>> from sklearn.datasets import make_classification + >>> from sklearn.feature_selection import mutual_info_classif + >>> X, y = make_classification( + ... n_samples=100, n_features=10, n_informative=2, n_clusters_per_class=1, + ... shuffle=False, random_state=42 + ... ) + >>> mutual_info_classif(X, y) + array([0.589, 0.107, 0.196, 0.0968 , 0., + 0. , 0. , 0. , 0. 
, 0.]) + """ + check_classification_targets(y) + return _estimate_mi( + X, + y, + discrete_features=discrete_features, + discrete_target=True, + n_neighbors=n_neighbors, + copy=copy, + random_state=random_state, + n_jobs=n_jobs, + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_selection/_rfe.py b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/_rfe.py new file mode 100644 index 0000000000000000000000000000000000000000..d647ad0ca19b10d36bcf4bb9f5ccf698f506f24b --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/_rfe.py @@ -0,0 +1,1025 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +"""Recursive feature elimination for feature ranking""" + +import warnings +from copy import deepcopy +from numbers import Integral + +import numpy as np +from joblib import effective_n_jobs + +from ..base import BaseEstimator, MetaEstimatorMixin, _fit_context, clone, is_classifier +from ..metrics import get_scorer +from ..model_selection import check_cv +from ..model_selection._validation import _score +from ..utils import Bunch, metadata_routing +from ..utils._metadata_requests import ( + MetadataRouter, + MethodMapping, + _raise_for_params, + _routing_enabled, + process_routing, +) +from ..utils._param_validation import HasMethods, Interval, RealNotInt +from ..utils._tags import get_tags +from ..utils.metaestimators import _safe_split, available_if +from ..utils.parallel import Parallel, delayed +from ..utils.validation import ( + _check_method_params, + _deprecate_positional_args, + _estimator_has, + check_is_fitted, + validate_data, +) +from ._base import SelectorMixin, _get_feature_importances + + +def _rfe_single_fit(rfe, estimator, X, y, train, test, scorer, routed_params): + """ + Return the score and n_features per step for a fit across one fold. + """ + X_train, y_train = _safe_split(estimator, X, y, train) + X_test, y_test = _safe_split(estimator, X, y, test, train) + fit_params = _check_method_params( + X, params=routed_params.estimator.fit, indices=train + ) + score_params = _check_method_params( + X=X, params=routed_params.scorer.score, indices=test + ) + + rfe._fit( + X_train, + y_train, + lambda estimator, features: _score( + estimator, + X_test[:, features], + y_test, + scorer, + score_params=score_params, + ), + **fit_params, + ) + + return rfe.step_scores_, rfe.step_support_, rfe.step_ranking_, rfe.step_n_features_ + + +class RFE(SelectorMixin, MetaEstimatorMixin, BaseEstimator): + """Feature ranking with recursive feature elimination. + + Given an external estimator that assigns weights to features (e.g., the + coefficients of a linear model), the goal of recursive feature elimination + (RFE) is to select features by recursively considering smaller and smaller + sets of features. First, the estimator is trained on the initial set of + features and the importance of each feature is obtained either through + any specific attribute or callable. + Then, the least important features are pruned from current set of features. + That procedure is recursively repeated on the pruned set until the desired + number of features to select is eventually reached. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimator : ``Estimator`` instance + A supervised learning estimator with a ``fit`` method that provides + information about feature importance + (e.g. `coef_`, `feature_importances_`). + + n_features_to_select : int or float, default=None + The number of features to select. 
If `None`, half of the features are + selected. If integer, the parameter is the absolute number of features + to select. If float between 0 and 1, it is the fraction of features to + select. + + .. versionchanged:: 0.24 + Added float values for fractions. + + step : int or float, default=1 + If greater than or equal to 1, then ``step`` corresponds to the + (integer) number of features to remove at each iteration. + If within (0.0, 1.0), then ``step`` corresponds to the percentage + (rounded down) of features to remove at each iteration. + + verbose : int, default=0 + Controls verbosity of output. + + importance_getter : str or callable, default='auto' + If 'auto', uses the feature importance either through a `coef_` + or `feature_importances_` attributes of estimator. + + Also accepts a string that specifies an attribute name/path + for extracting feature importance (implemented with `attrgetter`). + For example, give `regressor_.coef_` in case of + :class:`~sklearn.compose.TransformedTargetRegressor` or + `named_steps.clf.feature_importances_` in case of + class:`~sklearn.pipeline.Pipeline` with its last step named `clf`. + + If `callable`, overrides the default feature importance getter. + The callable is passed with the fitted estimator and it should + return importance for each feature. + + .. versionadded:: 0.24 + + Attributes + ---------- + classes_ : ndarray of shape (n_classes,) + The classes labels. Only available when `estimator` is a classifier. + + estimator_ : ``Estimator`` instance + The fitted estimator used to select features. + + n_features_ : int + The number of selected features. + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying estimator exposes such an attribute when fit. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + ranking_ : ndarray of shape (n_features,) + The feature ranking, such that ``ranking_[i]`` corresponds to the + ranking position of the i-th feature. Selected (i.e., estimated + best) features are assigned rank 1. + + support_ : ndarray of shape (n_features,) + The mask of selected features. + + See Also + -------- + RFECV : Recursive feature elimination with built-in cross-validated + selection of the best number of features. + SelectFromModel : Feature selection based on thresholds of importance + weights. + SequentialFeatureSelector : Sequential cross-validation based feature + selection. Does not rely on importance weights. + + Notes + ----- + Allows NaN/Inf in the input if the underlying estimator does as well. + + References + ---------- + + .. [1] Guyon, I., Weston, J., Barnhill, S., & Vapnik, V., "Gene selection + for cancer classification using support vector machines", + Mach. Learn., 46(1-3), 389--422, 2002. + + Examples + -------- + The following example shows how to retrieve the 5 most informative + features in the Friedman #1 dataset. 
+ + >>> from sklearn.datasets import make_friedman1 + >>> from sklearn.feature_selection import RFE + >>> from sklearn.svm import SVR + >>> X, y = make_friedman1(n_samples=50, n_features=10, random_state=0) + >>> estimator = SVR(kernel="linear") + >>> selector = RFE(estimator, n_features_to_select=5, step=1) + >>> selector = selector.fit(X, y) + >>> selector.support_ + array([ True, True, True, True, True, False, False, False, False, + False]) + >>> selector.ranking_ + array([1, 1, 1, 1, 1, 6, 4, 3, 2, 5]) + """ + + _parameter_constraints: dict = { + "estimator": [HasMethods(["fit"])], + "n_features_to_select": [ + None, + Interval(RealNotInt, 0, 1, closed="right"), + Interval(Integral, 0, None, closed="neither"), + ], + "step": [ + Interval(Integral, 0, None, closed="neither"), + Interval(RealNotInt, 0, 1, closed="neither"), + ], + "verbose": ["verbose"], + "importance_getter": [str, callable], + } + + def __init__( + self, + estimator, + *, + n_features_to_select=None, + step=1, + verbose=0, + importance_getter="auto", + ): + self.estimator = estimator + self.n_features_to_select = n_features_to_select + self.step = step + self.importance_getter = importance_getter + self.verbose = verbose + + # TODO(1.8) remove this property + @property + def _estimator_type(self): + return self.estimator._estimator_type + + @property + def classes_(self): + """Classes labels available when `estimator` is a classifier. + + Returns + ------- + ndarray of shape (n_classes,) + """ + return self.estimator_.classes_ + + @_fit_context( + # RFE.estimator is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y, **fit_params): + """Fit the RFE model and then the underlying estimator on the selected features. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. + + y : array-like of shape (n_samples,) + The target values. + + **fit_params : dict + - If `enable_metadata_routing=False` (default): Parameters directly passed + to the ``fit`` method of the underlying estimator. + + - If `enable_metadata_routing=True`: Parameters safely routed to the ``fit`` + method of the underlying estimator. + + .. versionchanged:: 1.6 + See :ref:`Metadata Routing User Guide ` + for more details. + + Returns + ------- + self : object + Fitted estimator. + """ + if _routing_enabled(): + routed_params = process_routing(self, "fit", **fit_params) + else: + routed_params = Bunch(estimator=Bunch(fit=fit_params)) + + return self._fit(X, y, **routed_params.estimator.fit) + + def _fit(self, X, y, step_score=None, **fit_params): + # Parameter step_score controls the calculation of self.step_scores_ + # step_score is not exposed to users and is used when implementing RFECV + # self.step_scores_ will not be calculated when calling _fit through fit + + X, y = validate_data( + self, + X, + y, + accept_sparse="csc", + ensure_min_features=2, + ensure_all_finite=False, + multi_output=True, + ) + + # Initialization + n_features = X.shape[1] + if self.n_features_to_select is None: + n_features_to_select = n_features // 2 + elif isinstance(self.n_features_to_select, Integral): # int + n_features_to_select = self.n_features_to_select + if n_features_to_select > n_features: + warnings.warn( + ( + f"Found {n_features_to_select=} > {n_features=}. There will be" + " no feature selection and all features will be kept." 
+ ), + UserWarning, + ) + else: # float + n_features_to_select = int(n_features * self.n_features_to_select) + + if 0.0 < self.step < 1.0: + step = int(max(1, self.step * n_features)) + else: + step = int(self.step) + + support_ = np.ones(n_features, dtype=bool) + ranking_ = np.ones(n_features, dtype=int) + + if step_score: + self.step_n_features_ = [] + self.step_scores_ = [] + self.step_support_ = [] + self.step_ranking_ = [] + + # Elimination + while np.sum(support_) > n_features_to_select: + # Remaining features + features = np.arange(n_features)[support_] + + # Rank the remaining features + estimator = clone(self.estimator) + if self.verbose > 0: + print("Fitting estimator with %d features." % np.sum(support_)) + + estimator.fit(X[:, features], y, **fit_params) + + # Compute step values on the previous selection iteration because + # 'estimator' must use features that have not been eliminated yet + if step_score: + self.step_n_features_.append(len(features)) + self.step_scores_.append(step_score(estimator, features)) + self.step_support_.append(list(support_)) + self.step_ranking_.append(list(ranking_)) + + # Get importance and rank them + importances = _get_feature_importances( + estimator, + self.importance_getter, + transform_func="square", + ) + ranks = np.argsort(importances) + + # for sparse case ranks is matrix + ranks = np.ravel(ranks) + + # Eliminate the worse features + threshold = min(step, np.sum(support_) - n_features_to_select) + + support_[features[ranks][:threshold]] = False + ranking_[np.logical_not(support_)] += 1 + + # Set final attributes + features = np.arange(n_features)[support_] + self.estimator_ = clone(self.estimator) + self.estimator_.fit(X[:, features], y, **fit_params) + + # Compute step values when only n_features_to_select features left + if step_score: + self.step_n_features_.append(len(features)) + self.step_scores_.append(step_score(self.estimator_, features)) + self.step_support_.append(support_) + self.step_ranking_.append(ranking_) + self.n_features_ = support_.sum() + self.support_ = support_ + self.ranking_ = ranking_ + + return self + + @available_if(_estimator_has("predict")) + def predict(self, X, **predict_params): + """Reduce X to the selected features and predict using the estimator. + + Parameters + ---------- + X : array of shape [n_samples, n_features] + The input samples. + + **predict_params : dict + Parameters to route to the ``predict`` method of the + underlying estimator. + + .. versionadded:: 1.6 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` + for more details. + + Returns + ------- + y : array of shape [n_samples] + The predicted target values. + """ + _raise_for_params(predict_params, self, "predict") + check_is_fitted(self) + if _routing_enabled(): + routed_params = process_routing(self, "predict", **predict_params) + else: + routed_params = Bunch(estimator=Bunch(predict={})) + + return self.estimator_.predict( + self.transform(X), **routed_params.estimator.predict + ) + + @available_if(_estimator_has("score")) + def score(self, X, y, **score_params): + """Reduce X to the selected features and return the score of the estimator. + + Parameters + ---------- + X : array of shape [n_samples, n_features] + The input samples. + + y : array of shape [n_samples] + The target values. 
+ + **score_params : dict + - If `enable_metadata_routing=False` (default): Parameters directly passed + to the ``score`` method of the underlying estimator. + + - If `enable_metadata_routing=True`: Parameters safely routed to the `score` + method of the underlying estimator. + + .. versionadded:: 1.0 + + .. versionchanged:: 1.6 + See :ref:`Metadata Routing User Guide ` + for more details. + + Returns + ------- + score : float + Score of the underlying base estimator computed with the selected + features returned by `rfe.transform(X)` and `y`. + """ + check_is_fitted(self) + if _routing_enabled(): + routed_params = process_routing(self, "score", **score_params) + else: + routed_params = Bunch(estimator=Bunch(score=score_params)) + + return self.estimator_.score( + self.transform(X), y, **routed_params.estimator.score + ) + + def _get_support_mask(self): + check_is_fitted(self) + return self.support_ + + @available_if(_estimator_has("decision_function")) + def decision_function(self, X): + """Compute the decision function of ``X``. + + Parameters + ---------- + X : {array-like or sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csr_matrix``. + + Returns + ------- + score : array, shape = [n_samples, n_classes] or [n_samples] + The decision function of the input samples. The order of the + classes corresponds to that in the attribute :term:`classes_`. + Regression and binary classification produce an array of shape + [n_samples]. + """ + check_is_fitted(self) + return self.estimator_.decision_function(self.transform(X)) + + @available_if(_estimator_has("predict_proba")) + def predict_proba(self, X): + """Predict class probabilities for X. + + Parameters + ---------- + X : {array-like or sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csr_matrix``. + + Returns + ------- + p : array of shape (n_samples, n_classes) + The class probabilities of the input samples. The order of the + classes corresponds to that in the attribute :term:`classes_`. + """ + check_is_fitted(self) + return self.estimator_.predict_proba(self.transform(X)) + + @available_if(_estimator_has("predict_log_proba")) + def predict_log_proba(self, X): + """Predict class log-probabilities for X. + + Parameters + ---------- + X : array of shape [n_samples, n_features] + The input samples. + + Returns + ------- + p : array of shape (n_samples, n_classes) + The class log-probabilities of the input samples. The order of the + classes corresponds to that in the attribute :term:`classes_`. 
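[Editorial aside] The delegating methods above simply reduce ``X`` to the selected columns and forward the call to the fitted sub-estimator; a minimal sketch of that equivalence (dataset and estimator are illustrative only):

    >>> import numpy as np
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.linear_model import LogisticRegression
    >>> from sklearn.feature_selection import RFE
    >>> X, y = load_breast_cancer(return_X_y=True)
    >>> rfe = RFE(LogisticRegression(max_iter=5000), n_features_to_select=10).fit(X, y)
    >>> np.allclose(rfe.predict_proba(X),
    ...             rfe.estimator_.predict_proba(rfe.transform(X)))
    True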
+ """ + check_is_fitted(self) + return self.estimator_.predict_log_proba(self.transform(X)) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + sub_estimator_tags = get_tags(self.estimator) + tags.estimator_type = sub_estimator_tags.estimator_type + tags.classifier_tags = deepcopy(sub_estimator_tags.classifier_tags) + tags.regressor_tags = deepcopy(sub_estimator_tags.regressor_tags) + if tags.classifier_tags is not None: + tags.classifier_tags.poor_score = True + if tags.regressor_tags is not None: + tags.regressor_tags.poor_score = True + tags.target_tags.required = True + tags.input_tags.sparse = sub_estimator_tags.input_tags.sparse + tags.input_tags.allow_nan = sub_estimator_tags.input_tags.allow_nan + return tags + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.6 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. + """ + router = MetadataRouter(owner=self.__class__.__name__).add( + estimator=self.estimator, + method_mapping=MethodMapping() + .add(caller="fit", callee="fit") + .add(caller="predict", callee="predict") + .add(caller="score", callee="score"), + ) + return router + + +class RFECV(RFE): + """Recursive feature elimination with cross-validation to select features. + + The number of features selected is tuned automatically by fitting an :class:`RFE` + selector on the different cross-validation splits (provided by the `cv` parameter). + The performance of each :class:`RFE` selector is evaluated using `scoring` for + different numbers of selected features and aggregated together. Finally, the scores + are averaged across folds and the number of features selected is set to the number + of features that maximize the cross-validation score. + + See glossary entry for :term:`cross-validation estimator`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimator : ``Estimator`` instance + A supervised learning estimator with a ``fit`` method that provides + information about feature importance either through a ``coef_`` + attribute or through a ``feature_importances_`` attribute. + + step : int or float, default=1 + If greater than or equal to 1, then ``step`` corresponds to the + (integer) number of features to remove at each iteration. + If within (0.0, 1.0), then ``step`` corresponds to the percentage + (rounded down) of features to remove at each iteration. + Note that the last iteration may remove fewer than ``step`` features in + order to reach ``min_features_to_select``. + + min_features_to_select : int, default=1 + The minimum number of features to be selected. This number of features + will always be scored, even if the difference between the original + feature count and ``min_features_to_select`` isn't divisible by + ``step``. + + .. versionadded:: 0.20 + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross-validation, + - integer, to specify the number of folds. + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if ``y`` is binary or multiclass, + :class:`~sklearn.model_selection.StratifiedKFold` is used. 
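[Editorial aside] A sketch of the routing declared above, assuming metadata routing is enabled globally and the wrapped estimator explicitly requests ``sample_weight`` (the weights below are arbitrary and purely illustrative):

    >>> import numpy as np
    >>> import sklearn
    >>> from sklearn.datasets import make_classification
    >>> from sklearn.linear_model import LogisticRegression
    >>> from sklearn.feature_selection import RFE
    >>> sklearn.set_config(enable_metadata_routing=True)
    >>> X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    >>> weights = np.where(y == 1, 2.0, 1.0)
    >>> est = LogisticRegression().set_fit_request(sample_weight=True)
    >>> selector = RFE(est, n_features_to_select=3).fit(X, y, sample_weight=weights)
    >>> sklearn.set_config(enable_metadata_routing=False)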
If the + estimator is not a classifier or if ``y`` is neither binary nor multiclass, + :class:`~sklearn.model_selection.KFold` is used. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + ``cv`` default value of None changed from 3-fold to 5-fold. + + scoring : str or callable, default=None + Scoring method to evaluate the :class:`RFE` selectors' performance. Options: + + - str: see :ref:`scoring_string_names` for options. + - callable: a scorer callable object (e.g., function) with signature + ``scorer(estimator, X, y)``. See :ref:`scoring_callable` for details. + - `None`: the `estimator`'s + :ref:`default evaluation criterion ` is used. + + verbose : int, default=0 + Controls verbosity of output. + + n_jobs : int or None, default=None + Number of cores to run in parallel while fitting across folds. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + .. versionadded:: 0.18 + + importance_getter : str or callable, default='auto' + If 'auto', uses the feature importance either through a `coef_` + or `feature_importances_` attributes of estimator. + + Also accepts a string that specifies an attribute name/path + for extracting feature importance. + For example, give `regressor_.coef_` in case of + :class:`~sklearn.compose.TransformedTargetRegressor` or + `named_steps.clf.feature_importances_` in case of + :class:`~sklearn.pipeline.Pipeline` with its last step named `clf`. + + If `callable`, overrides the default feature importance getter. + The callable is passed with the fitted estimator and it should + return importance for each feature. + + .. versionadded:: 0.24 + + Attributes + ---------- + classes_ : ndarray of shape (n_classes,) + The classes labels. Only available when `estimator` is a classifier. + + estimator_ : ``Estimator`` instance + The fitted estimator used to select features. + + cv_results_ : dict of ndarrays + All arrays (values of the dictionary) are sorted in ascending order + by the number of features used (i.e., the first element of the array + represents the models that used the least number of features, while the + last element represents the models that used all available features). + + .. versionadded:: 1.0 + + This dictionary contains the following keys: + + split(k)_test_score : ndarray of shape (n_subsets_of_features,) + The cross-validation scores across (k)th fold. + + mean_test_score : ndarray of shape (n_subsets_of_features,) + Mean of scores over the folds. + + std_test_score : ndarray of shape (n_subsets_of_features,) + Standard deviation of scores over the folds. + + n_features : ndarray of shape (n_subsets_of_features,) + Number of features used at each step. + + .. versionadded:: 1.5 + + split(k)_ranking : ndarray of shape (n_subsets_of_features,) + The cross-validation rankings across (k)th fold. + Selected (i.e., estimated best) features are assigned rank 1. + Illustration in + :ref:`sphx_glr_auto_examples_feature_selection_plot_rfe_with_cross_validation.py` + + .. versionadded:: 1.7 + + split(k)_support : ndarray of shape (n_subsets_of_features,) + The cross-validation supports across (k)th fold. The support + is the mask of selected features. + + .. versionadded:: 1.7 + + n_features_ : int + The number of selected features with cross-validation. + + n_features_in_ : int + Number of features seen during :term:`fit`. 
Only defined if the + underlying estimator exposes such an attribute when fit. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + ranking_ : narray of shape (n_features,) + The feature ranking, such that `ranking_[i]` + corresponds to the ranking + position of the i-th feature. + Selected (i.e., estimated best) + features are assigned rank 1. + + support_ : ndarray of shape (n_features,) + The mask of selected features. + + See Also + -------- + RFE : Recursive feature elimination. + + Notes + ----- + The size of all values in ``cv_results_`` is equal to + ``ceil((n_features - min_features_to_select) / step) + 1``, + where step is the number of features removed at each iteration. + + Allows NaN/Inf in the input if the underlying estimator does as well. + + References + ---------- + + .. [1] Guyon, I., Weston, J., Barnhill, S., & Vapnik, V., "Gene selection + for cancer classification using support vector machines", + Mach. Learn., 46(1-3), 389--422, 2002. + + Examples + -------- + The following example shows how to retrieve the a-priori not known 5 + informative features in the Friedman #1 dataset. + + >>> from sklearn.datasets import make_friedman1 + >>> from sklearn.feature_selection import RFECV + >>> from sklearn.svm import SVR + >>> X, y = make_friedman1(n_samples=50, n_features=10, random_state=0) + >>> estimator = SVR(kernel="linear") + >>> selector = RFECV(estimator, step=1, cv=5) + >>> selector = selector.fit(X, y) + >>> selector.support_ + array([ True, True, True, True, True, False, False, False, False, + False]) + >>> selector.ranking_ + array([1, 1, 1, 1, 1, 6, 4, 3, 2, 5]) + + For a detailed example of using RFECV to select features when training a + :class:`~sklearn.linear_model.LogisticRegression`, see + :ref:`sphx_glr_auto_examples_feature_selection_plot_rfe_with_cross_validation.py`. + """ + + _parameter_constraints: dict = { + **RFE._parameter_constraints, + "min_features_to_select": [Interval(Integral, 0, None, closed="neither")], + "cv": ["cv_object"], + "scoring": [None, str, callable], + "n_jobs": [None, Integral], + } + _parameter_constraints.pop("n_features_to_select") + __metadata_request__fit = {"groups": metadata_routing.UNUSED} + + def __init__( + self, + estimator, + *, + step=1, + min_features_to_select=1, + cv=None, + scoring=None, + verbose=0, + n_jobs=None, + importance_getter="auto", + ): + self.estimator = estimator + self.step = step + self.importance_getter = importance_getter + self.cv = cv + self.scoring = scoring + self.verbose = verbose + self.n_jobs = n_jobs + self.min_features_to_select = min_features_to_select + + # TODO(1.8): remove `groups` from the signature after deprecation cycle. + @_deprecate_positional_args(version="1.8") + @_fit_context( + # RFECV.estimator is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y, *, groups=None, **params): + """Fit the RFE model and automatically tune the number of selected features. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the total number of features. + + y : array-like of shape (n_samples,) + Target values (integers for classification, real numbers for + regression). 
+ + groups : array-like of shape (n_samples,) or None, default=None + Group labels for the samples used while splitting the dataset into + train/test set. Only used in conjunction with a "Group" :term:`cv` + instance (e.g., :class:`~sklearn.model_selection.GroupKFold`). + + .. versionadded:: 0.20 + + **params : dict of str -> object + Parameters passed to the ``fit`` method of the estimator, + the scorer, and the CV splitter. + + .. versionadded:: 1.6 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` + for more details. + + Returns + ------- + self : object + Fitted estimator. + """ + _raise_for_params(params, self, "fit") + X, y = validate_data( + self, + X, + y, + accept_sparse="csr", + ensure_min_features=2, + ensure_all_finite=False, + multi_output=True, + ) + + if _routing_enabled(): + if groups is not None: + params.update({"groups": groups}) + routed_params = process_routing(self, "fit", **params) + else: + routed_params = Bunch( + estimator=Bunch(fit={}), + splitter=Bunch(split={"groups": groups}), + scorer=Bunch(score={}), + ) + + # Initialization + cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator)) + scorer = self._get_scorer() + + # Build an RFE object, which will evaluate and score each possible + # feature count, down to self.min_features_to_select + n_features = X.shape[1] + if self.min_features_to_select > n_features: + warnings.warn( + ( + f"Found min_features_to_select={self.min_features_to_select} > " + f"{n_features=}. There will be no feature selection and all " + "features will be kept." + ), + UserWarning, + ) + rfe = RFE( + estimator=self.estimator, + n_features_to_select=min(self.min_features_to_select, n_features), + importance_getter=self.importance_getter, + step=self.step, + verbose=self.verbose, + ) + + # Determine the number of subsets of features by fitting across + # the train folds and choosing the "features_to_select" parameter + # that gives the least averaged error across all folds. + + # Note that joblib raises a non-picklable error for bound methods + # even if n_jobs is set to 1 with the default multiprocessing + # backend. + # This branching is done so that to + # make sure that user code that sets n_jobs to 1 + # and provides bound methods as scorers is not broken with the + # addition of n_jobs parameter in version 0.18. + + if effective_n_jobs(self.n_jobs) == 1: + parallel, func = list, _rfe_single_fit + else: + parallel = Parallel(n_jobs=self.n_jobs) + func = delayed(_rfe_single_fit) + + step_results = parallel( + func(clone(rfe), self.estimator, X, y, train, test, scorer, routed_params) + for train, test in cv.split(X, y, **routed_params.splitter.split) + ) + scores, supports, rankings, step_n_features = zip(*step_results) + + step_n_features_rev = np.array(step_n_features[0])[::-1] + scores = np.array(scores) + rankings = np.array(rankings) + supports = np.array(supports) + + # Reverse order such that lowest number of features is selected in case of tie. 
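        # Editorial note (illustrative values, not part of the original code):
        # `scores` has shape (n_folds, n_feature_subsets) and the subsets are
        # visited from the full feature set down to `min_features_to_select`,
        # so the columns are ordered by *decreasing* feature count.  Reversing
        # below reorders everything by increasing feature count, and
        # `np.argmax` then returns the *first* maximum, i.e. the smallest
        # subset among the tied best ones.  For example, with
        # step_n_features_rev = [4, 6, 8, 10] and summed scores
        # [2.5, 2.9, 2.9, 2.7], argmax picks index 1, so 6 features are
        # selected rather than 8.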
+ scores_sum_rev = np.sum(scores, axis=0)[::-1] + n_features_to_select = step_n_features_rev[np.argmax(scores_sum_rev)] + + # Re-execute an elimination with best_k over the whole set + rfe = RFE( + estimator=self.estimator, + n_features_to_select=n_features_to_select, + step=self.step, + importance_getter=self.importance_getter, + verbose=self.verbose, + ) + + rfe.fit(X, y, **routed_params.estimator.fit) + + # Set final attributes + self.support_ = rfe.support_ + self.n_features_ = rfe.n_features_ + self.ranking_ = rfe.ranking_ + self.estimator_ = clone(self.estimator) + self.estimator_.fit(self._transform(X), y, **routed_params.estimator.fit) + + # reverse to stay consistent with before + scores_rev = scores[:, ::-1] + supports_rev = supports[:, ::-1] + rankings_rev = rankings[:, ::-1] + self.cv_results_ = { + "mean_test_score": np.mean(scores_rev, axis=0), + "std_test_score": np.std(scores_rev, axis=0), + **{f"split{i}_test_score": scores_rev[i] for i in range(scores.shape[0])}, + **{f"split{i}_ranking": rankings_rev[i] for i in range(rankings.shape[0])}, + **{f"split{i}_support": supports_rev[i] for i in range(supports.shape[0])}, + "n_features": step_n_features_rev, + } + return self + + def score(self, X, y, **score_params): + """Score using the `scoring` option on the given test data and labels. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Test samples. + + y : array-like of shape (n_samples,) + True labels for X. + + **score_params : dict + Parameters to pass to the `score` method of the underlying scorer. + + .. versionadded:: 1.6 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` + for more details. + + Returns + ------- + score : float + Score of self.predict(X) w.r.t. y defined by `scoring`. + """ + _raise_for_params(score_params, self, "score") + scoring = self._get_scorer() + if _routing_enabled(): + routed_params = process_routing(self, "score", **score_params) + else: + routed_params = Bunch() + routed_params.scorer = Bunch(score={}) + + return scoring(self, X, y, **routed_params.scorer.score) + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.6 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. 
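[Editorial aside] To make the shape of the ``cv_results_`` dict assembled in ``fit`` above concrete, a short sketch reusing the Friedman #1 toy data (purely illustrative):

    >>> from sklearn.datasets import make_friedman1
    >>> from sklearn.svm import SVR
    >>> from sklearn.feature_selection import RFECV
    >>> X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)
    >>> selector = RFECV(SVR(kernel="linear"), step=1, cv=5).fit(X, y)
    >>> sorted(k for k in selector.cv_results_ if not k.startswith("split"))
    ['mean_test_score', 'n_features', 'std_test_score']
    >>> best = selector.cv_results_["mean_test_score"].argmax()
    >>> selector.cv_results_["n_features"][best]   # equals selector.n_features_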
+ """ + router = MetadataRouter(owner=self.__class__.__name__) + router.add( + estimator=self.estimator, + method_mapping=MethodMapping().add(caller="fit", callee="fit"), + ) + router.add( + splitter=check_cv(self.cv), + method_mapping=MethodMapping().add( + caller="fit", + callee="split", + ), + ) + router.add( + scorer=self._get_scorer(), + method_mapping=MethodMapping() + .add(caller="fit", callee="score") + .add(caller="score", callee="score"), + ) + + return router + + def _get_scorer(self): + if self.scoring is None: + scoring = "accuracy" if is_classifier(self.estimator) else "r2" + else: + scoring = self.scoring + return get_scorer(scoring) diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_selection/_sequential.py b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/_sequential.py new file mode 100644 index 0000000000000000000000000000000000000000..c6d6ed9e2e72e278bee29638945bc9a2456826f6 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/_sequential.py @@ -0,0 +1,363 @@ +""" +Sequential feature selection +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from numbers import Integral, Real + +import numpy as np + +from ..base import BaseEstimator, MetaEstimatorMixin, _fit_context, clone, is_classifier +from ..metrics import check_scoring, get_scorer_names +from ..model_selection import check_cv, cross_val_score +from ..utils._metadata_requests import ( + MetadataRouter, + MethodMapping, + _raise_for_params, + _routing_enabled, + process_routing, +) +from ..utils._param_validation import HasMethods, Interval, RealNotInt, StrOptions +from ..utils._tags import get_tags +from ..utils.validation import check_is_fitted, validate_data +from ._base import SelectorMixin + + +class SequentialFeatureSelector(SelectorMixin, MetaEstimatorMixin, BaseEstimator): + """Transformer that performs Sequential Feature Selection. + + This Sequential Feature Selector adds (forward selection) or + removes (backward selection) features to form a feature subset in a + greedy fashion. At each stage, this estimator chooses the best feature to + add or remove based on the cross-validation score of an estimator. In + the case of unsupervised learning, this Sequential Feature Selector + looks only at the features (X), not the desired outputs (y). + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.24 + + Parameters + ---------- + estimator : estimator instance + An unfitted estimator. + + n_features_to_select : "auto", int or float, default="auto" + If `"auto"`, the behaviour depends on the `tol` parameter: + + - if `tol` is not `None`, then features are selected while the score + change does not exceed `tol`. + - otherwise, half of the features are selected. + + If integer, the parameter is the absolute number of features to select. + If float between 0 and 1, it is the fraction of features to select. + + .. versionadded:: 1.1 + The option `"auto"` was added in version 1.1. + + .. versionchanged:: 1.3 + The default changed from `"warn"` to `"auto"` in 1.3. + + tol : float, default=None + If the score is not incremented by at least `tol` between two + consecutive feature additions or removals, stop adding or removing. + + `tol` can be negative when removing features using `direction="backward"`. + `tol` is required to be strictly positive when doing forward selection. + It can be useful to reduce the number of features at the cost of a small + decrease in the score. 
+ + `tol` is enabled only when `n_features_to_select` is `"auto"`. + + .. versionadded:: 1.1 + + direction : {'forward', 'backward'}, default='forward' + Whether to perform forward selection or backward selection. + + scoring : str or callable, default=None + Scoring method to use for cross-validation. Options: + + - str: see :ref:`scoring_string_names` for options. + - callable: a scorer callable object (e.g., function) with signature + ``scorer(estimator, X, y)`` that returns a single value. + See :ref:`scoring_callable` for details. + - `None`: the `estimator`'s + :ref:`default evaluation criterion ` is used. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross validation, + - integer, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if the estimator is a classifier and ``y`` is + either binary or multiclass, + :class:`~sklearn.model_selection.StratifiedKFold` is used. In all other + cases, :class:`~sklearn.model_selection.KFold` is used. These splitters + are instantiated with `shuffle=False` so the splits will be the same + across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + n_jobs : int, default=None + Number of jobs to run in parallel. When evaluating a new feature to + add or remove, the cross-validation procedure is parallel over the + folds. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Attributes + ---------- + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying estimator exposes such an attribute when fit. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_features_to_select_ : int + The number of features that were selected. + + support_ : ndarray of shape (n_features,), dtype=bool + The mask of selected features. + + See Also + -------- + GenericUnivariateSelect : Univariate feature selector with configurable + strategy. + RFE : Recursive feature elimination based on importance weights. + RFECV : Recursive feature elimination based on importance weights, with + automatic selection of the number of features. + SelectFromModel : Feature selection based on thresholds of importance + weights. 
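[Editorial aside] As a complement to the example below, a hedged sketch of the ``tol``-based stopping described above (backward direction; the estimator and threshold are illustrative only):

    >>> from sklearn.datasets import load_iris
    >>> from sklearn.linear_model import LogisticRegression
    >>> from sklearn.feature_selection import SequentialFeatureSelector
    >>> X, y = load_iris(return_X_y=True)
    >>> sfs = SequentialFeatureSelector(
    ...     LogisticRegression(max_iter=1000),
    ...     n_features_to_select="auto", tol=0.01, direction="backward")
    >>> sfs = sfs.fit(X, y)
    >>> sfs.n_features_to_select_   # determined by the tol criterion, not fixed a priori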
+ + Examples + -------- + >>> from sklearn.feature_selection import SequentialFeatureSelector + >>> from sklearn.neighbors import KNeighborsClassifier + >>> from sklearn.datasets import load_iris + >>> X, y = load_iris(return_X_y=True) + >>> knn = KNeighborsClassifier(n_neighbors=3) + >>> sfs = SequentialFeatureSelector(knn, n_features_to_select=3) + >>> sfs.fit(X, y) + SequentialFeatureSelector(estimator=KNeighborsClassifier(n_neighbors=3), + n_features_to_select=3) + >>> sfs.get_support() + array([ True, False, True, True]) + >>> sfs.transform(X).shape + (150, 3) + """ + + _parameter_constraints: dict = { + "estimator": [HasMethods(["fit"])], + "n_features_to_select": [ + StrOptions({"auto"}), + Interval(RealNotInt, 0, 1, closed="right"), + Interval(Integral, 0, None, closed="neither"), + ], + "tol": [None, Interval(Real, None, None, closed="neither")], + "direction": [StrOptions({"forward", "backward"})], + "scoring": [None, StrOptions(set(get_scorer_names())), callable], + "cv": ["cv_object"], + "n_jobs": [None, Integral], + } + + def __init__( + self, + estimator, + *, + n_features_to_select="auto", + tol=None, + direction="forward", + scoring=None, + cv=5, + n_jobs=None, + ): + self.estimator = estimator + self.n_features_to_select = n_features_to_select + self.tol = tol + self.direction = direction + self.scoring = scoring + self.cv = cv + self.n_jobs = n_jobs + + @_fit_context( + # SequentialFeatureSelector.estimator is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y=None, **params): + """Learn the features to select from X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of predictors. + + y : array-like of shape (n_samples,), default=None + Target values. This parameter may be ignored for + unsupervised learning. + + **params : dict, default=None + Parameters to be passed to the underlying `estimator`, `cv` + and `scorer` objects. + + .. versionadded:: 1.6 + + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + Returns the instance itself. + """ + _raise_for_params(params, self, "fit") + tags = self.__sklearn_tags__() + X = validate_data( + self, + X, + accept_sparse="csc", + ensure_min_features=2, + ensure_all_finite=not tags.input_tags.allow_nan, + ) + n_features = X.shape[1] + + if self.n_features_to_select == "auto": + if self.tol is not None: + # With auto feature selection, `n_features_to_select_` will be updated + # to `support_.sum()` after features are selected. 
+ self.n_features_to_select_ = n_features - 1 + else: + self.n_features_to_select_ = n_features // 2 + elif isinstance(self.n_features_to_select, Integral): + if self.n_features_to_select >= n_features: + raise ValueError("n_features_to_select must be < n_features.") + self.n_features_to_select_ = self.n_features_to_select + elif isinstance(self.n_features_to_select, Real): + self.n_features_to_select_ = int(n_features * self.n_features_to_select) + + if self.tol is not None and self.tol < 0 and self.direction == "forward": + raise ValueError( + "tol must be strictly positive when doing forward selection" + ) + + cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator)) + + cloned_estimator = clone(self.estimator) + + # the current mask corresponds to the set of features: + # - that we have already *selected* if we do forward selection + # - that we have already *excluded* if we do backward selection + current_mask = np.zeros(shape=n_features, dtype=bool) + n_iterations = ( + self.n_features_to_select_ + if self.n_features_to_select == "auto" or self.direction == "forward" + else n_features - self.n_features_to_select_ + ) + + old_score = -np.inf + is_auto_select = self.tol is not None and self.n_features_to_select == "auto" + + # We only need to verify the routing here and not use the routed params + # because internally the actual routing will also take place inside the + # `cross_val_score` function. + if _routing_enabled(): + process_routing(self, "fit", **params) + for _ in range(n_iterations): + new_feature_idx, new_score = self._get_best_new_feature_score( + cloned_estimator, X, y, cv, current_mask, **params + ) + if is_auto_select and ((new_score - old_score) < self.tol): + break + + old_score = new_score + current_mask[new_feature_idx] = True + + if self.direction == "backward": + current_mask = ~current_mask + + self.support_ = current_mask + self.n_features_to_select_ = self.support_.sum() + + return self + + def _get_best_new_feature_score(self, estimator, X, y, cv, current_mask, **params): + # Return the best new feature and its score to add to the current_mask, + # i.e. return the best new feature and its score to add (resp. remove) + # when doing forward selection (resp. backward selection). + # Feature will be added if the current score and past score are greater + # than tol when n_feature is auto, + candidate_feature_indices = np.flatnonzero(~current_mask) + scores = {} + for feature_idx in candidate_feature_indices: + candidate_mask = current_mask.copy() + candidate_mask[feature_idx] = True + if self.direction == "backward": + candidate_mask = ~candidate_mask + X_new = X[:, candidate_mask] + scores[feature_idx] = cross_val_score( + estimator, + X_new, + y, + cv=cv, + scoring=self.scoring, + n_jobs=self.n_jobs, + params=params, + ).mean() + new_feature_idx = max(scores, key=lambda feature_idx: scores[feature_idx]) + return new_feature_idx, scores[new_feature_idx] + + def _get_support_mask(self): + check_is_fitted(self) + return self.support_ + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = get_tags(self.estimator).input_tags.allow_nan + tags.input_tags.sparse = get_tags(self.estimator).input_tags.sparse + return tags + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. 
versionadded:: 1.6 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. + """ + router = MetadataRouter(owner=self.__class__.__name__) + router.add( + estimator=self.estimator, + method_mapping=MethodMapping().add(caller="fit", callee="fit"), + ) + router.add( + splitter=check_cv(self.cv, classifier=is_classifier(self.estimator)), + method_mapping=MethodMapping().add(caller="fit", callee="split"), + ) + router.add( + scorer=check_scoring(self.estimator, scoring=self.scoring), + method_mapping=MethodMapping().add(caller="fit", callee="score"), + ) + return router diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_selection/_univariate_selection.py b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/_univariate_selection.py new file mode 100644 index 0000000000000000000000000000000000000000..7671a7ad7921d618cfdb98ba6baa60f24e3a9316 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/_univariate_selection.py @@ -0,0 +1,1171 @@ +"""Univariate features selection.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from numbers import Integral, Real + +import numpy as np +from scipy import special, stats +from scipy.sparse import issparse + +from ..base import BaseEstimator, _fit_context +from ..preprocessing import LabelBinarizer +from ..utils import as_float_array, check_array, check_X_y, safe_mask, safe_sqr +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.extmath import row_norms, safe_sparse_dot +from ..utils.validation import check_is_fitted, validate_data +from ._base import SelectorMixin + + +def _clean_nans(scores): + """ + Fixes Issue #1240: NaNs can't be properly compared, so change them to the + smallest value of scores's dtype. -inf seems to be unreliable. + """ + # XXX where should this function be called? fit? scoring functions + # themselves? + scores = as_float_array(scores, copy=True) + scores[np.isnan(scores)] = np.finfo(scores.dtype).min + return scores + + +###################################################################### +# Scoring functions + + +# The following function is a rewriting of scipy.stats.f_oneway +# Contrary to the scipy.stats.f_oneway implementation it does not +# copy the data while keeping the inputs unchanged. +def f_oneway(*args): + """Perform a 1-way ANOVA. + + The one-way ANOVA tests the null hypothesis that 2 or more groups have + the same population mean. The test is applied to samples from two or + more groups, possibly with differing sizes. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + *args : {array-like, sparse matrix} + Sample1, sample2... The sample measurements should be given as + arguments. + + Returns + ------- + f_statistic : float + The computed F-value of the test. + p_value : float + The associated p-value from the F-distribution. + + Notes + ----- + The ANOVA test has important assumptions that must be satisfied in order + for the associated p-value to be valid. + + 1. The samples are independent + 2. Each sample is from a normally distributed population + 3. The population standard deviations of the groups are all equal. This + property is known as homoscedasticity. + + If these assumptions are not true for a given set of data, it may still be + possible to use the Kruskal-Wallis H-test (`scipy.stats.kruskal`_) although + with some loss of power. 
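[Editorial aside] A brief illustration of the ANOVA statistic computed by this helper, via the public :func:`f_classif` wrapper defined below (synthetic data; values are illustrative only):

    >>> import numpy as np
    >>> from sklearn.feature_selection import f_classif
    >>> rng = np.random.RandomState(0)
    >>> X = rng.normal(size=(60, 3))
    >>> y = np.repeat([0, 1, 2], 20)
    >>> X[:, 0] += 2.0 * y               # only the first feature varies with the class
    >>> F, pval = f_classif(X, y)
    >>> int(F.argmax())                  # the class-dependent feature gets the largest F
    0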
+ + The algorithm is from Heiman[2], pp.394-7. + + See ``scipy.stats.f_oneway`` that should give the same results while + being less efficient. + + References + ---------- + .. [1] Lowry, Richard. "Concepts and Applications of Inferential + Statistics". Chapter 14. + http://vassarstats.net/textbook + + .. [2] Heiman, G.W. Research Methods in Statistics. 2002. + """ + n_classes = len(args) + args = [as_float_array(a) for a in args] + n_samples_per_class = np.array([a.shape[0] for a in args]) + n_samples = np.sum(n_samples_per_class) + ss_alldata = sum(safe_sqr(a).sum(axis=0) for a in args) + sums_args = [np.asarray(a.sum(axis=0)) for a in args] + square_of_sums_alldata = sum(sums_args) ** 2 + square_of_sums_args = [s**2 for s in sums_args] + sstot = ss_alldata - square_of_sums_alldata / float(n_samples) + ssbn = 0.0 + for k, _ in enumerate(args): + ssbn += square_of_sums_args[k] / n_samples_per_class[k] + ssbn -= square_of_sums_alldata / float(n_samples) + sswn = sstot - ssbn + dfbn = n_classes - 1 + dfwn = n_samples - n_classes + msb = ssbn / float(dfbn) + msw = sswn / float(dfwn) + constant_features_idx = np.where(msw == 0.0)[0] + if np.nonzero(msb)[0].size != msb.size and constant_features_idx.size: + warnings.warn("Features %s are constant." % constant_features_idx, UserWarning) + f = msb / msw + # flatten matrix to vector in sparse case + f = np.asarray(f).ravel() + prob = special.fdtrc(dfbn, dfwn, f) + return f, prob + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "y": ["array-like"], + }, + prefer_skip_nested_validation=True, +) +def f_classif(X, y): + """Compute the ANOVA F-value for the provided sample. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The set of regressors that will be tested sequentially. + + y : array-like of shape (n_samples,) + The target vector. + + Returns + ------- + f_statistic : ndarray of shape (n_features,) + F-statistic for each feature. + + p_values : ndarray of shape (n_features,) + P-values associated with the F-statistic. + + See Also + -------- + chi2 : Chi-squared stats of non-negative features for classification tasks. + f_regression : F-value between label/feature for regression tasks. + + Examples + -------- + >>> from sklearn.datasets import make_classification + >>> from sklearn.feature_selection import f_classif + >>> X, y = make_classification( + ... n_samples=100, n_features=10, n_informative=2, n_clusters_per_class=1, + ... shuffle=False, random_state=42 + ... ) + >>> f_statistic, p_values = f_classif(X, y) + >>> f_statistic + array([2.21e+02, 7.02e-01, 1.70e+00, 9.31e-01, + 5.41e+00, 3.25e-01, 4.71e-02, 5.72e-01, + 7.54e-01, 8.90e-02]) + >>> p_values + array([7.14e-27, 4.04e-01, 1.96e-01, 3.37e-01, + 2.21e-02, 5.70e-01, 8.29e-01, 4.51e-01, + 3.87e-01, 7.66e-01]) + """ + X, y = check_X_y(X, y, accept_sparse=["csr", "csc", "coo"]) + args = [X[safe_mask(X, y == k)] for k in np.unique(y)] + return f_oneway(*args) + + +def _chisquare(f_obs, f_exp): + """Fast replacement for scipy.stats.chisquare. + + Version from https://github.com/scipy/scipy/pull/2525 with additional + optimizations. 
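[Editorial aside] The :func:`chi2` scorer defined next expects non-negative, count-like features; continuous inputs can be discretised first, for example with :class:`~sklearn.preprocessing.KBinsDiscretizer` (a sketch; the binning settings are arbitrary):

    >>> from sklearn.datasets import load_iris
    >>> from sklearn.preprocessing import KBinsDiscretizer
    >>> from sklearn.feature_selection import chi2
    >>> X, y = load_iris(return_X_y=True)
    >>> binner = KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="uniform")
    >>> X_binned = binner.fit_transform(X)   # ordinal codes 0..4, valid chi2 input
    >>> chi2_stats, p_values = chi2(X_binned, y)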
+ """ + f_obs = np.asarray(f_obs, dtype=np.float64) + + k = len(f_obs) + # Reuse f_obs for chi-squared statistics + chisq = f_obs + chisq -= f_exp + chisq **= 2 + with np.errstate(invalid="ignore"): + chisq /= f_exp + chisq = chisq.sum(axis=0) + return chisq, special.chdtrc(k - 1, chisq) + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "y": ["array-like"], + }, + prefer_skip_nested_validation=True, +) +def chi2(X, y): + """Compute chi-squared stats between each non-negative feature and class. + + This score can be used to select the `n_features` features with the + highest values for the test chi-squared statistic from X, which must + contain only **non-negative integer feature values** such as booleans or frequencies + (e.g., term counts in document classification), relative to the classes. + + If some of your features are continuous, you need to bin them, for + example by using :class:`~sklearn.preprocessing.KBinsDiscretizer`. + + Recall that the chi-square test measures dependence between stochastic + variables, so using this function "weeds out" the features that are the + most likely to be independent of class and therefore irrelevant for + classification. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Sample vectors. + + y : array-like of shape (n_samples,) + Target vector (class labels). + + Returns + ------- + chi2 : ndarray of shape (n_features,) + Chi2 statistics for each feature. + + p_values : ndarray of shape (n_features,) + P-values for each feature. + + See Also + -------- + f_classif : ANOVA F-value between label/feature for classification tasks. + f_regression : F-value between label/feature for regression tasks. + + Notes + ----- + Complexity of this algorithm is O(n_classes * n_features). + + Examples + -------- + >>> import numpy as np + >>> from sklearn.feature_selection import chi2 + >>> X = np.array([[1, 1, 3], + ... [0, 1, 5], + ... [5, 4, 1], + ... [6, 6, 2], + ... [1, 4, 0], + ... [0, 0, 0]]) + >>> y = np.array([1, 1, 0, 0, 2, 2]) + >>> chi2_stats, p_values = chi2(X, y) + >>> chi2_stats + array([15.3, 6.5 , 8.9]) + >>> p_values + array([0.000456, 0.0387, 0.0116 ]) + """ + + # XXX: we might want to do some of the following in logspace instead for + # numerical stability. + # Converting X to float allows getting better performance for the + # safe_sparse_dot call made below. + X = check_array(X, accept_sparse="csr", dtype=(np.float64, np.float32)) + if np.any((X.data if issparse(X) else X) < 0): + raise ValueError("Input X must be non-negative.") + + # Use a sparse representation for Y by default to reduce memory usage when + # y has many unique classes. + Y = LabelBinarizer(sparse_output=True).fit_transform(y) + if Y.shape[1] == 1: + Y = Y.toarray() + Y = np.append(1 - Y, Y, axis=1) + + observed = safe_sparse_dot(Y.T, X) # n_classes * n_features + + if issparse(observed): + # convert back to a dense array before calling _chisquare + # XXX: could _chisquare be reimplement to accept sparse matrices for + # cases where both n_classes and n_features are large (and X is + # sparse)? 
+ observed = observed.toarray() + + feature_count = X.sum(axis=0).reshape(1, -1) + class_prob = Y.mean(axis=0).reshape(1, -1) + expected = np.dot(class_prob.T, feature_count) + + return _chisquare(observed, expected) + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "y": ["array-like"], + "center": ["boolean"], + "force_finite": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def r_regression(X, y, *, center=True, force_finite=True): + """Compute Pearson's r for each features and the target. + + Pearson's r is also known as the Pearson correlation coefficient. + + Linear model for testing the individual effect of each of many regressors. + This is a scoring function to be used in a feature selection procedure, not + a free standing feature selection procedure. + + The cross correlation between each regressor and the target is computed + as:: + + E[(X[:, i] - mean(X[:, i])) * (y - mean(y))] / (std(X[:, i]) * std(y)) + + For more on usage see the :ref:`User Guide `. + + .. versionadded:: 1.0 + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data matrix. + + y : array-like of shape (n_samples,) + The target vector. + + center : bool, default=True + Whether or not to center the data matrix `X` and the target vector `y`. + By default, `X` and `y` will be centered. + + force_finite : bool, default=True + Whether or not to force the Pearson's R correlation to be finite. + In the particular case where some features in `X` or the target `y` + are constant, the Pearson's R correlation is not defined. When + `force_finite=False`, a correlation of `np.nan` is returned to + acknowledge this case. When `force_finite=True`, this value will be + forced to a minimal correlation of `0.0`. + + .. versionadded:: 1.1 + + Returns + ------- + correlation_coefficient : ndarray of shape (n_features,) + Pearson's R correlation coefficients of features. + + See Also + -------- + f_regression: Univariate linear regression tests returning f-statistic + and p-values. + mutual_info_regression: Mutual information for a continuous target. + f_classif: ANOVA F-value between label/feature for classification tasks. + chi2: Chi-squared stats of non-negative features for classification tasks. + + Examples + -------- + >>> from sklearn.datasets import make_regression + >>> from sklearn.feature_selection import r_regression + >>> X, y = make_regression( + ... n_samples=50, n_features=3, n_informative=1, noise=1e-4, random_state=42 + ... ) + >>> r_regression(X, y) + array([-0.157, 1. , -0.229]) + """ + X, y = check_X_y(X, y, accept_sparse=["csr", "csc", "coo"], dtype=np.float64) + n_samples = X.shape[0] + + # Compute centered values + # Note that E[(x - mean(x))*(y - mean(y))] = E[x*(y - mean(y))], so we + # need not center X + if center: + y = y - np.mean(y) + # TODO: for Scipy <= 1.10, `isspmatrix(X)` returns `True` for sparse arrays. + # Here, we check the output of the `.mean` operation that returns a `np.matrix` + # for sparse matrices while a `np.array` for dense and sparse arrays. 
+ # We can reconsider using `isspmatrix` when the minimum version is + # SciPy >= 1.11 + X_means = X.mean(axis=0) + X_means = X_means.getA1() if isinstance(X_means, np.matrix) else X_means + # Compute the scaled standard deviations via moments + X_norms = np.sqrt(row_norms(X.T, squared=True) - n_samples * X_means**2) + else: + X_norms = row_norms(X.T) + + correlation_coefficient = safe_sparse_dot(y, X) + with np.errstate(divide="ignore", invalid="ignore"): + correlation_coefficient /= X_norms + correlation_coefficient /= np.linalg.norm(y) + + if force_finite and not np.isfinite(correlation_coefficient).all(): + # case where the target or some features are constant + # the correlation coefficient(s) is/are set to the minimum (i.e. 0.0) + nan_mask = np.isnan(correlation_coefficient) + correlation_coefficient[nan_mask] = 0.0 + return correlation_coefficient + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "y": ["array-like"], + "center": ["boolean"], + "force_finite": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def f_regression(X, y, *, center=True, force_finite=True): + """Univariate linear regression tests returning F-statistic and p-values. + + Quick linear model for testing the effect of a single regressor, + sequentially for many regressors. + + This is done in 2 steps: + + 1. The cross correlation between each regressor and the target is computed + using :func:`r_regression` as:: + + E[(X[:, i] - mean(X[:, i])) * (y - mean(y))] / (std(X[:, i]) * std(y)) + + 2. It is converted to an F score and then to a p-value. + + :func:`f_regression` is derived from :func:`r_regression` and will rank + features in the same order if all the features are positively correlated + with the target. + + Note however that contrary to :func:`f_regression`, :func:`r_regression` + values lie in [-1, 1] and can thus be negative. :func:`f_regression` is + therefore recommended as a feature selection criterion to identify + potentially predictive feature for a downstream classifier, irrespective of + the sign of the association with the target variable. + + Furthermore :func:`f_regression` returns p-values while + :func:`r_regression` does not. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data matrix. + + y : array-like of shape (n_samples,) + The target vector. + + center : bool, default=True + Whether or not to center the data matrix `X` and the target vector `y`. + By default, `X` and `y` will be centered. + + force_finite : bool, default=True + Whether or not to force the F-statistics and associated p-values to + be finite. There are two cases where the F-statistic is expected to not + be finite: + + - when the target `y` or some features in `X` are constant. In this + case, the Pearson's R correlation is not defined leading to obtain + `np.nan` values in the F-statistic and p-value. When + `force_finite=True`, the F-statistic is set to `0.0` and the + associated p-value is set to `1.0`. + - when a feature in `X` is perfectly correlated (or + anti-correlated) with the target `y`. In this case, the F-statistic + is expected to be `np.inf`. When `force_finite=True`, the F-statistic + is set to `np.finfo(dtype).max` and the associated p-value is set to + `0.0`. + + .. versionadded:: 1.1 + + Returns + ------- + f_statistic : ndarray of shape (n_features,) + F-statistic for each feature. + + p_values : ndarray of shape (n_features,) + P-values associated with the F-statistic. 
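[Editorial aside] The relation between the two functions spelled out above can be checked directly; a sketch on moderately noisy synthetic data, chosen so that no correlation is degenerate:

    >>> import numpy as np
    >>> from sklearn.datasets import make_regression
    >>> from sklearn.feature_selection import f_regression, r_regression
    >>> X, y = make_regression(n_samples=100, n_features=3, n_informative=3,
    ...                        noise=10.0, random_state=0)
    >>> r = r_regression(X, y)
    >>> F, p = f_regression(X, y)
    >>> deg_of_freedom = y.size - 2          # centered case
    >>> bool(np.allclose(F, r**2 / (1 - r**2) * deg_of_freedom))
    True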
+ + See Also + -------- + r_regression: Pearson's R between label/feature for regression tasks. + f_classif: ANOVA F-value between label/feature for classification tasks. + chi2: Chi-squared stats of non-negative features for classification tasks. + SelectKBest: Select features based on the k highest scores. + SelectFpr: Select features based on a false positive rate test. + SelectFdr: Select features based on an estimated false discovery rate. + SelectFwe: Select features based on family-wise error rate. + SelectPercentile: Select features based on percentile of the highest + scores. + + Examples + -------- + >>> from sklearn.datasets import make_regression + >>> from sklearn.feature_selection import f_regression + >>> X, y = make_regression( + ... n_samples=50, n_features=3, n_informative=1, noise=1e-4, random_state=42 + ... ) + >>> f_statistic, p_values = f_regression(X, y) + >>> f_statistic + array([1.21, 2.67e13, 2.66]) + >>> p_values + array([0.276, 1.54e-283, 0.11]) + """ + correlation_coefficient = r_regression( + X, y, center=center, force_finite=force_finite + ) + deg_of_freedom = y.size - (2 if center else 1) + + corr_coef_squared = correlation_coefficient**2 + + with np.errstate(divide="ignore", invalid="ignore"): + f_statistic = corr_coef_squared / (1 - corr_coef_squared) * deg_of_freedom + p_values = stats.f.sf(f_statistic, 1, deg_of_freedom) + + if force_finite and not np.isfinite(f_statistic).all(): + # case where there is a perfect (anti-)correlation + # f-statistics can be set to the maximum and p-values to zero + mask_inf = np.isinf(f_statistic) + f_statistic[mask_inf] = np.finfo(f_statistic.dtype).max + # case where the target or some features are constant + # f-statistics would be minimum and thus p-values large + mask_nan = np.isnan(f_statistic) + f_statistic[mask_nan] = 0.0 + p_values[mask_nan] = 1.0 + return f_statistic, p_values + + +###################################################################### +# Base classes + + +class _BaseFilter(SelectorMixin, BaseEstimator): + """Initialize the univariate feature selection. + + Parameters + ---------- + score_func : callable + Function taking two arrays X and y, and returning a pair of arrays + (scores, pvalues) or a single array with scores. + """ + + _parameter_constraints: dict = {"score_func": [callable]} + + def __init__(self, score_func): + self.score_func = score_func + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Run score function on (X, y) and get the appropriate features. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The training input samples. + + y : array-like of shape (n_samples,) or None + The target values (class labels in classification, real numbers in + regression). If the selector is unsupervised then `y` can be set to `None`. + + Returns + ------- + self : object + Returns the instance itself. 
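[Editorial aside] A note on the two accepted ``score_func`` signatures handled above: when the callable returns a single array of scores, ``pvalues_`` is left as ``None``. For instance with mutual information and the :class:`SelectKBest` filter defined below (sketch):

    >>> from sklearn.datasets import load_iris
    >>> from sklearn.feature_selection import SelectKBest, mutual_info_classif
    >>> X, y = load_iris(return_X_y=True)
    >>> selector = SelectKBest(mutual_info_classif, k=2).fit(X, y)
    >>> selector.pvalues_ is None            # mutual information yields scores only
    True
    >>> selector.scores_.shape
    (4,)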
+ """ + if y is None: + X = validate_data(self, X, accept_sparse=["csr", "csc"]) + else: + X, y = validate_data( + self, X, y, accept_sparse=["csr", "csc"], multi_output=True + ) + + self._check_params(X, y) + score_func_ret = self.score_func(X, y) + if isinstance(score_func_ret, (list, tuple)): + self.scores_, self.pvalues_ = score_func_ret + self.pvalues_ = np.asarray(self.pvalues_) + else: + self.scores_ = score_func_ret + self.pvalues_ = None + + self.scores_ = np.asarray(self.scores_) + + return self + + def _check_params(self, X, y): + pass + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.target_tags.required = True + tags.input_tags.sparse = True + return tags + + +###################################################################### +# Specific filters +###################################################################### +class SelectPercentile(_BaseFilter): + """Select features according to a percentile of the highest scores. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + score_func : callable, default=f_classif + Function taking two arrays X and y, and returning a pair of arrays + (scores, pvalues) or a single array with scores. + Default is f_classif (see below "See Also"). The default function only + works with classification tasks. + + .. versionadded:: 0.18 + + percentile : int, default=10 + Percent of features to keep. + + Attributes + ---------- + scores_ : array-like of shape (n_features,) + Scores of features. + + pvalues_ : array-like of shape (n_features,) + p-values of feature scores, None if `score_func` returned only scores. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + f_classif : ANOVA F-value between label/feature for classification tasks. + mutual_info_classif : Mutual information for a discrete target. + chi2 : Chi-squared stats of non-negative features for classification tasks. + f_regression : F-value between label/feature for regression tasks. + mutual_info_regression : Mutual information for a continuous target. + SelectKBest : Select features based on the k highest scores. + SelectFpr : Select features based on a false positive rate test. + SelectFdr : Select features based on an estimated false discovery rate. + SelectFwe : Select features based on family-wise error rate. + GenericUnivariateSelect : Univariate feature selector with configurable + mode. + + Notes + ----- + Ties between features with equal scores will be broken in an unspecified + way. + + This filter supports unsupervised feature selection that only requests `X` for + computing the scores. 
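[Editorial aside] To make the percentile threshold concrete: with continuous, distinct scores the selector keeps ``int(n_features * percentile / 100)`` features (a sketch; data and settings are illustrative):

    >>> from sklearn.datasets import make_classification
    >>> from sklearn.feature_selection import SelectPercentile, f_classif
    >>> X, y = make_classification(n_samples=200, n_features=20, n_informative=4,
    ...                            random_state=0)
    >>> selector = SelectPercentile(f_classif, percentile=25).fit(X, y)
    >>> int(selector.get_support().sum())    # 25% of 20 features
    5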
+ + Examples + -------- + >>> from sklearn.datasets import load_digits + >>> from sklearn.feature_selection import SelectPercentile, chi2 + >>> X, y = load_digits(return_X_y=True) + >>> X.shape + (1797, 64) + >>> X_new = SelectPercentile(chi2, percentile=10).fit_transform(X, y) + >>> X_new.shape + (1797, 7) + """ + + _parameter_constraints: dict = { + **_BaseFilter._parameter_constraints, + "percentile": [Interval(Real, 0, 100, closed="both")], + } + + def __init__(self, score_func=f_classif, *, percentile=10): + super().__init__(score_func=score_func) + self.percentile = percentile + + def _get_support_mask(self): + check_is_fitted(self) + + # Cater for NaNs + if self.percentile == 100: + return np.ones(len(self.scores_), dtype=bool) + elif self.percentile == 0: + return np.zeros(len(self.scores_), dtype=bool) + + scores = _clean_nans(self.scores_) + threshold = np.percentile(scores, 100 - self.percentile) + mask = scores > threshold + ties = np.where(scores == threshold)[0] + if len(ties): + max_feats = int(len(scores) * self.percentile / 100) + kept_ties = ties[: max_feats - mask.sum()] + mask[kept_ties] = True + return mask + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.target_tags.required = False + return tags + + +class SelectKBest(_BaseFilter): + """Select features according to the k highest scores. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + score_func : callable, default=f_classif + Function taking two arrays X and y, and returning a pair of arrays + (scores, pvalues) or a single array with scores. + Default is f_classif (see below "See Also"). The default function only + works with classification tasks. + + .. versionadded:: 0.18 + + k : int or "all", default=10 + Number of top features to select. + The "all" option bypasses selection, for use in a parameter search. + + Attributes + ---------- + scores_ : array-like of shape (n_features,) + Scores of features. + + pvalues_ : array-like of shape (n_features,) + p-values of feature scores, None if `score_func` returned only scores. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + f_classif: ANOVA F-value between label/feature for classification tasks. + mutual_info_classif: Mutual information for a discrete target. + chi2: Chi-squared stats of non-negative features for classification tasks. + f_regression: F-value between label/feature for regression tasks. + mutual_info_regression: Mutual information for a continuous target. + SelectPercentile: Select features based on percentile of the highest + scores. + SelectFpr : Select features based on a false positive rate test. + SelectFdr : Select features based on an estimated false discovery rate. + SelectFwe : Select features based on family-wise error rate. + GenericUnivariateSelect : Univariate feature selector with configurable + mode. + + Notes + ----- + Ties between features with equal scores will be broken in an unspecified + way. + + This filter supports unsupervised feature selection that only requests `X` for + computing the scores. 
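[Editorial aside] A sketch of the ``k="all"`` bypass mentioned above, used so that a parameter search can also try "no selection" (the pipeline and grid are illustrative only):

    >>> from sklearn.datasets import load_iris
    >>> from sklearn.feature_selection import SelectKBest, f_classif
    >>> from sklearn.linear_model import LogisticRegression
    >>> from sklearn.model_selection import GridSearchCV
    >>> from sklearn.pipeline import Pipeline
    >>> X, y = load_iris(return_X_y=True)
    >>> pipe = Pipeline([("select", SelectKBest(f_classif)),
    ...                  ("clf", LogisticRegression(max_iter=1000))])
    >>> search = GridSearchCV(pipe, {"select__k": [1, 2, 3, "all"]}, cv=3).fit(X, y)
    >>> search.best_params_["select__k"]     # may legitimately be "all"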
+ + Examples + -------- + >>> from sklearn.datasets import load_digits + >>> from sklearn.feature_selection import SelectKBest, chi2 + >>> X, y = load_digits(return_X_y=True) + >>> X.shape + (1797, 64) + >>> X_new = SelectKBest(chi2, k=20).fit_transform(X, y) + >>> X_new.shape + (1797, 20) + """ + + _parameter_constraints: dict = { + **_BaseFilter._parameter_constraints, + "k": [StrOptions({"all"}), Interval(Integral, 0, None, closed="left")], + } + + def __init__(self, score_func=f_classif, *, k=10): + super().__init__(score_func=score_func) + self.k = k + + def _check_params(self, X, y): + if not isinstance(self.k, str) and self.k > X.shape[1]: + warnings.warn( + f"k={self.k} is greater than n_features={X.shape[1]}. " + "All the features will be returned." + ) + + def _get_support_mask(self): + check_is_fitted(self) + + if self.k == "all": + return np.ones(self.scores_.shape, dtype=bool) + elif self.k == 0: + return np.zeros(self.scores_.shape, dtype=bool) + else: + scores = _clean_nans(self.scores_) + mask = np.zeros(scores.shape, dtype=bool) + + # Request a stable sort. Mergesort takes more memory (~40MB per + # megafeature on x86-64). + mask[np.argsort(scores, kind="mergesort")[-self.k :]] = 1 + return mask + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.target_tags.required = False + return tags + + +class SelectFpr(_BaseFilter): + """Filter: Select the pvalues below alpha based on a FPR test. + + FPR test stands for False Positive Rate test. It controls the total + amount of false detections. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + score_func : callable, default=f_classif + Function taking two arrays X and y, and returning a pair of arrays + (scores, pvalues). + Default is f_classif (see below "See Also"). The default function only + works with classification tasks. + + alpha : float, default=5e-2 + Features with p-values less than `alpha` are selected. + + Attributes + ---------- + scores_ : array-like of shape (n_features,) + Scores of features. + + pvalues_ : array-like of shape (n_features,) + p-values of feature scores. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + f_classif : ANOVA F-value between label/feature for classification tasks. + chi2 : Chi-squared stats of non-negative features for classification tasks. + mutual_info_classif: Mutual information for a discrete target. + f_regression : F-value between label/feature for regression tasks. + mutual_info_regression : Mutual information for a continuous target. + SelectPercentile : Select features based on percentile of the highest + scores. + SelectKBest : Select features based on the k highest scores. + SelectFdr : Select features based on an estimated false discovery rate. + SelectFwe : Select features based on family-wise error rate. + GenericUnivariateSelect : Univariate feature selector with configurable + mode. 
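[Editorial aside] To illustrate what controlling the false positive rate means here: on purely uninformative features, roughly an ``alpha`` fraction passes the test (a sketch on random data; the exact fraction varies from run to run):

    >>> import numpy as np
    >>> from sklearn.feature_selection import SelectFpr, f_classif
    >>> rng = np.random.RandomState(0)
    >>> X = rng.normal(size=(100, 1000))     # 1000 pure-noise features
    >>> y = rng.randint(0, 2, size=100)
    >>> selected = SelectFpr(f_classif, alpha=0.05).fit(X, y).get_support()
    >>> selected.mean()                      # close to alpha = 0.05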
+ + Examples + -------- + >>> from sklearn.datasets import load_breast_cancer + >>> from sklearn.feature_selection import SelectFpr, chi2 + >>> X, y = load_breast_cancer(return_X_y=True) + >>> X.shape + (569, 30) + >>> X_new = SelectFpr(chi2, alpha=0.01).fit_transform(X, y) + >>> X_new.shape + (569, 16) + """ + + _parameter_constraints: dict = { + **_BaseFilter._parameter_constraints, + "alpha": [Interval(Real, 0, 1, closed="both")], + } + + def __init__(self, score_func=f_classif, *, alpha=5e-2): + super().__init__(score_func=score_func) + self.alpha = alpha + + def _get_support_mask(self): + check_is_fitted(self) + + return self.pvalues_ < self.alpha + + +class SelectFdr(_BaseFilter): + """Filter: Select the p-values for an estimated false discovery rate. + + This uses the Benjamini-Hochberg procedure. ``alpha`` is an upper bound + on the expected false discovery rate. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + score_func : callable, default=f_classif + Function taking two arrays X and y, and returning a pair of arrays + (scores, pvalues). + Default is f_classif (see below "See Also"). The default function only + works with classification tasks. + + alpha : float, default=5e-2 + The highest uncorrected p-value for features to keep. + + Attributes + ---------- + scores_ : array-like of shape (n_features,) + Scores of features. + + pvalues_ : array-like of shape (n_features,) + p-values of feature scores. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + f_classif : ANOVA F-value between label/feature for classification tasks. + mutual_info_classif : Mutual information for a discrete target. + chi2 : Chi-squared stats of non-negative features for classification tasks. + f_regression : F-value between label/feature for regression tasks. + mutual_info_regression : Mutual information for a continuous target. + SelectPercentile : Select features based on percentile of the highest + scores. + SelectKBest : Select features based on the k highest scores. + SelectFpr : Select features based on a false positive rate test. + SelectFwe : Select features based on family-wise error rate. + GenericUnivariateSelect : Univariate feature selector with configurable + mode. 
+ + References + ---------- + https://en.wikipedia.org/wiki/False_discovery_rate + + Examples + -------- + >>> from sklearn.datasets import load_breast_cancer + >>> from sklearn.feature_selection import SelectFdr, chi2 + >>> X, y = load_breast_cancer(return_X_y=True) + >>> X.shape + (569, 30) + >>> X_new = SelectFdr(chi2, alpha=0.01).fit_transform(X, y) + >>> X_new.shape + (569, 16) + """ + + _parameter_constraints: dict = { + **_BaseFilter._parameter_constraints, + "alpha": [Interval(Real, 0, 1, closed="both")], + } + + def __init__(self, score_func=f_classif, *, alpha=5e-2): + super().__init__(score_func=score_func) + self.alpha = alpha + + def _get_support_mask(self): + check_is_fitted(self) + + n_features = len(self.pvalues_) + sv = np.sort(self.pvalues_) + selected = sv[ + sv <= float(self.alpha) / n_features * np.arange(1, n_features + 1) + ] + if selected.size == 0: + return np.zeros_like(self.pvalues_, dtype=bool) + return self.pvalues_ <= selected.max() + + +class SelectFwe(_BaseFilter): + """Filter: Select the p-values corresponding to Family-wise error rate. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + score_func : callable, default=f_classif + Function taking two arrays X and y, and returning a pair of arrays + (scores, pvalues). + Default is f_classif (see below "See Also"). The default function only + works with classification tasks. + + alpha : float, default=5e-2 + The highest uncorrected p-value for features to keep. + + Attributes + ---------- + scores_ : array-like of shape (n_features,) + Scores of features. + + pvalues_ : array-like of shape (n_features,) + p-values of feature scores. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + f_classif : ANOVA F-value between label/feature for classification tasks. + chi2 : Chi-squared stats of non-negative features for classification tasks. + f_regression : F-value between label/feature for regression tasks. + SelectPercentile : Select features based on percentile of the highest + scores. + SelectKBest : Select features based on the k highest scores. + SelectFpr : Select features based on a false positive rate test. + SelectFdr : Select features based on an estimated false discovery rate. + GenericUnivariateSelect : Univariate feature selector with configurable + mode. + + Examples + -------- + >>> from sklearn.datasets import load_breast_cancer + >>> from sklearn.feature_selection import SelectFwe, chi2 + >>> X, y = load_breast_cancer(return_X_y=True) + >>> X.shape + (569, 30) + >>> X_new = SelectFwe(chi2, alpha=0.01).fit_transform(X, y) + >>> X_new.shape + (569, 15) + """ + + _parameter_constraints: dict = { + **_BaseFilter._parameter_constraints, + "alpha": [Interval(Real, 0, 1, closed="both")], + } + + def __init__(self, score_func=f_classif, *, alpha=5e-2): + super().__init__(score_func=score_func) + self.alpha = alpha + + def _get_support_mask(self): + check_is_fitted(self) + + return self.pvalues_ < self.alpha / len(self.pvalues_) + + +###################################################################### +# Generic filter +###################################################################### + + +# TODO this class should fit on either p-values or scores, +# depending on the mode. 
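# Illustrative sketch (editorial addition, not part of the upstream scikit-learn
# sources): contrast the three p-value cutoffs implemented above on a toy array
# of p-values. SelectFpr keeps p < alpha, SelectFwe applies the Bonferroni-style
# cut p < alpha / n_features, and SelectFdr applies the Benjamini-Hochberg
# step-up rule used in SelectFdr._get_support_mask. Reuses this module's
# ``import numpy as np``.
def _pvalue_cutoffs_demo(alpha=0.05):
    pvalues = np.array([0.001, 0.015, 0.025, 0.045, 0.70])
    n = len(pvalues)

    fpr_mask = pvalues < alpha        # [ True,  True,  True,  True, False]
    fwe_mask = pvalues < alpha / n    # [ True, False, False, False, False]

    # Benjamini-Hochberg: the largest sorted p-value lying under the step-up
    # line alpha * rank / n becomes the effective threshold (0.025 here).
    sv = np.sort(pvalues)
    below = sv[sv <= alpha / n * np.arange(1, n + 1)]
    fdr_mask = pvalues <= below.max()  # [ True,  True,  True, False, False]
    return fpr_mask, fdr_mask, fwe_mask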
+class GenericUnivariateSelect(_BaseFilter): + """Univariate feature selector with configurable strategy. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + score_func : callable, default=f_classif + Function taking two arrays X and y, and returning a pair of arrays + (scores, pvalues). For modes 'percentile' or 'kbest' it can return + a single array scores. + + mode : {'percentile', 'k_best', 'fpr', 'fdr', 'fwe'}, default='percentile' + Feature selection mode. Note that the `'percentile'` and `'kbest'` + modes are supporting unsupervised feature selection (when `y` is `None`). + + param : "all", float or int, default=1e-5 + Parameter of the corresponding mode. + + Attributes + ---------- + scores_ : array-like of shape (n_features,) + Scores of features. + + pvalues_ : array-like of shape (n_features,) + p-values of feature scores, None if `score_func` returned scores only. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + f_classif : ANOVA F-value between label/feature for classification tasks. + mutual_info_classif : Mutual information for a discrete target. + chi2 : Chi-squared stats of non-negative features for classification tasks. + f_regression : F-value between label/feature for regression tasks. + mutual_info_regression : Mutual information for a continuous target. + SelectPercentile : Select features based on percentile of the highest + scores. + SelectKBest : Select features based on the k highest scores. + SelectFpr : Select features based on a false positive rate test. + SelectFdr : Select features based on an estimated false discovery rate. + SelectFwe : Select features based on family-wise error rate. 
+ + Examples + -------- + >>> from sklearn.datasets import load_breast_cancer + >>> from sklearn.feature_selection import GenericUnivariateSelect, chi2 + >>> X, y = load_breast_cancer(return_X_y=True) + >>> X.shape + (569, 30) + >>> transformer = GenericUnivariateSelect(chi2, mode='k_best', param=20) + >>> X_new = transformer.fit_transform(X, y) + >>> X_new.shape + (569, 20) + """ + + _selection_modes: dict = { + "percentile": SelectPercentile, + "k_best": SelectKBest, + "fpr": SelectFpr, + "fdr": SelectFdr, + "fwe": SelectFwe, + } + + _parameter_constraints: dict = { + **_BaseFilter._parameter_constraints, + "mode": [StrOptions(set(_selection_modes.keys()))], + "param": [Interval(Real, 0, None, closed="left"), StrOptions({"all"})], + } + + def __init__(self, score_func=f_classif, *, mode="percentile", param=1e-5): + super().__init__(score_func=score_func) + self.mode = mode + self.param = param + + def _make_selector(self): + selector = self._selection_modes[self.mode](score_func=self.score_func) + + # Now perform some acrobatics to set the right named parameter in + # the selector + possible_params = selector._get_param_names() + possible_params.remove("score_func") + selector.set_params(**{possible_params[0]: self.param}) + + return selector + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.transformer_tags.preserves_dtype = ["float64", "float32"] + return tags + + def _check_params(self, X, y): + self._make_selector()._check_params(X, y) + + def _get_support_mask(self): + check_is_fitted(self) + + selector = self._make_selector() + selector.pvalues_ = self.pvalues_ + selector.scores_ = self.scores_ + return selector._get_support_mask() diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_selection/_variance_threshold.py b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/_variance_threshold.py new file mode 100644 index 0000000000000000000000000000000000000000..f26d70ecf8f82ab317103ba73b52f85b3af5e524 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/_variance_threshold.py @@ -0,0 +1,141 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from numbers import Real + +import numpy as np + +from ..base import BaseEstimator, _fit_context +from ..utils._param_validation import Interval +from ..utils.sparsefuncs import mean_variance_axis, min_max_axis +from ..utils.validation import check_is_fitted, validate_data +from ._base import SelectorMixin + + +class VarianceThreshold(SelectorMixin, BaseEstimator): + """Feature selector that removes all low-variance features. + + This feature selection algorithm looks only at the features (X), not the + desired outputs (y), and can thus be used for unsupervised learning. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + threshold : float, default=0 + Features with a training-set variance lower than this threshold will + be removed. The default is to keep all features with non-zero variance, + i.e. remove the features that have the same value in all samples. + + Attributes + ---------- + variances_ : array, shape (n_features,) + Variances of individual features. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. 
versionadded:: 1.0 + + See Also + -------- + SelectFromModel: Meta-transformer for selecting features based on + importance weights. + SelectPercentile : Select features according to a percentile of the highest + scores. + SequentialFeatureSelector : Transformer that performs Sequential Feature + Selection. + + Notes + ----- + Allows NaN in the input. + Raises ValueError if no feature in X meets the variance threshold. + + Examples + -------- + The following dataset has integer features, two of which are the same + in every sample. These are removed with the default setting for threshold:: + + >>> from sklearn.feature_selection import VarianceThreshold + >>> X = [[0, 2, 0, 3], [0, 1, 4, 3], [0, 1, 1, 3]] + >>> selector = VarianceThreshold() + >>> selector.fit_transform(X) + array([[2, 0], + [1, 4], + [1, 1]]) + """ + + _parameter_constraints: dict = { + "threshold": [Interval(Real, 0, None, closed="left")] + } + + def __init__(self, threshold=0.0): + self.threshold = threshold + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Learn empirical variances from X. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Data from which to compute variances, where `n_samples` is + the number of samples and `n_features` is the number of features. + + y : any, default=None + Ignored. This parameter exists only for compatibility with + sklearn.pipeline.Pipeline. + + Returns + ------- + self : object + Returns the instance itself. + """ + X = validate_data( + self, + X, + accept_sparse=("csr", "csc"), + dtype=np.float64, + ensure_all_finite="allow-nan", + ) + + if hasattr(X, "toarray"): # sparse matrix + _, self.variances_ = mean_variance_axis(X, axis=0) + if self.threshold == 0: + mins, maxes = min_max_axis(X, axis=0) + peak_to_peaks = maxes - mins + else: + self.variances_ = np.nanvar(X, axis=0) + if self.threshold == 0: + peak_to_peaks = np.ptp(X, axis=0) + + if self.threshold == 0: + # Use peak-to-peak to avoid numeric precision issues + # for constant features + compare_arr = np.array([self.variances_, peak_to_peaks]) + self.variances_ = np.nanmin(compare_arr, axis=0) + + if np.all(~np.isfinite(self.variances_) | (self.variances_ <= self.threshold)): + msg = "No feature in X meets the variance threshold {0:.5f}" + if X.shape[0] == 1: + msg += " (X contains only one sample)" + raise ValueError(msg.format(self.threshold)) + + return self + + def _get_support_mask(self): + check_is_fitted(self) + + return self.variances_ > self.threshold + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + tags.input_tags.sparse = True + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_base.py b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_base.py new file mode 100644 index 0000000000000000000000000000000000000000..0bf51a80f01baa1a4340335ce397aeb0ca3e4b5f --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_base.py @@ -0,0 +1,154 @@ +import numpy as np +import pytest +from numpy.testing import assert_array_equal + +from sklearn.base import BaseEstimator +from sklearn.feature_selection._base import 
SelectorMixin +from sklearn.utils.fixes import CSC_CONTAINERS +from sklearn.utils.validation import validate_data + + +class StepSelector(SelectorMixin, BaseEstimator): + """Retain every `step` features (beginning with 0). + + If `step < 1`, then no features are selected. + """ + + def __init__(self, step=2): + self.step = step + + def fit(self, X, y=None): + X = validate_data(self, X, accept_sparse="csc") + return self + + def _get_support_mask(self): + mask = np.zeros(self.n_features_in_, dtype=bool) + if self.step >= 1: + mask[:: self.step] = True + return mask + + +support = [True, False] * 5 +support_inds = [0, 2, 4, 6, 8] +X = np.arange(20).reshape(2, 10) +Xt = np.arange(0, 20, 2).reshape(2, 5) +Xinv = X.copy() +Xinv[:, 1::2] = 0 +y = [0, 1] +feature_names = list("ABCDEFGHIJ") +feature_names_t = feature_names[::2] +feature_names_inv = np.array(feature_names) +feature_names_inv[1::2] = "" + + +def test_transform_dense(): + sel = StepSelector() + Xt_actual = sel.fit(X, y).transform(X) + Xt_actual2 = StepSelector().fit_transform(X, y) + assert_array_equal(Xt, Xt_actual) + assert_array_equal(Xt, Xt_actual2) + + # Check dtype matches + assert np.int32 == sel.transform(X.astype(np.int32)).dtype + assert np.float32 == sel.transform(X.astype(np.float32)).dtype + + # Check 1d list and other dtype: + names_t_actual = sel.transform([feature_names]) + assert_array_equal(feature_names_t, names_t_actual.ravel()) + + # Check wrong shape raises error + with pytest.raises(ValueError): + sel.transform(np.array([[1], [2]])) + + +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_transform_sparse(csc_container): + X_sp = csc_container(X) + sel = StepSelector() + Xt_actual = sel.fit(X_sp).transform(X_sp) + Xt_actual2 = sel.fit_transform(X_sp) + assert_array_equal(Xt, Xt_actual.toarray()) + assert_array_equal(Xt, Xt_actual2.toarray()) + + # Check dtype matches + assert np.int32 == sel.transform(X_sp.astype(np.int32)).dtype + assert np.float32 == sel.transform(X_sp.astype(np.float32)).dtype + + # Check wrong shape raises error + with pytest.raises(ValueError): + sel.transform(np.array([[1], [2]])) + + +def test_inverse_transform_dense(): + sel = StepSelector() + Xinv_actual = sel.fit(X, y).inverse_transform(Xt) + assert_array_equal(Xinv, Xinv_actual) + + # Check dtype matches + assert np.int32 == sel.inverse_transform(Xt.astype(np.int32)).dtype + assert np.float32 == sel.inverse_transform(Xt.astype(np.float32)).dtype + + # Check 1d list and other dtype: + names_inv_actual = sel.inverse_transform([feature_names_t]) + assert_array_equal(feature_names_inv, names_inv_actual.ravel()) + + # Check wrong shape raises error + with pytest.raises(ValueError): + sel.inverse_transform(np.array([[1], [2]])) + + +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_inverse_transform_sparse(csc_container): + X_sp = csc_container(X) + Xt_sp = csc_container(Xt) + sel = StepSelector() + Xinv_actual = sel.fit(X_sp).inverse_transform(Xt_sp) + assert_array_equal(Xinv, Xinv_actual.toarray()) + + # Check dtype matches + assert np.int32 == sel.inverse_transform(Xt_sp.astype(np.int32)).dtype + assert np.float32 == sel.inverse_transform(Xt_sp.astype(np.float32)).dtype + + # Check wrong shape raises error + with pytest.raises(ValueError): + sel.inverse_transform(np.array([[1], [2]])) + + +def test_get_support(): + sel = StepSelector() + sel.fit(X, y) + assert_array_equal(support, sel.get_support()) + assert_array_equal(support_inds, sel.get_support(indices=True)) + + +def test_output_dataframe(): + 
"""Check output dtypes for dataframes is consistent with the input dtypes.""" + pd = pytest.importorskip("pandas") + + X = pd.DataFrame( + { + "a": pd.Series([1.0, 2.4, 4.5], dtype=np.float32), + "b": pd.Series(["a", "b", "a"], dtype="category"), + "c": pd.Series(["j", "b", "b"], dtype="category"), + "d": pd.Series([3.0, 2.4, 1.2], dtype=np.float64), + } + ) + + for step in [2, 3]: + sel = StepSelector(step=step).set_output(transform="pandas") + sel.fit(X) + + output = sel.transform(X) + for name, dtype in output.dtypes.items(): + assert dtype == X.dtypes[name] + + # step=0 will select nothing + sel0 = StepSelector(step=0).set_output(transform="pandas") + sel0.fit(X, y) + + msg = "No features were selected" + with pytest.warns(UserWarning, match=msg): + output0 = sel0.transform(X) + + assert_array_equal(output0.index, X.index) + assert output0.shape == (X.shape[0], 0) diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_chi2.py b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_chi2.py new file mode 100644 index 0000000000000000000000000000000000000000..c50def36f1b6c281e6c96019355b901bf4326a38 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_chi2.py @@ -0,0 +1,93 @@ +""" +Tests for chi2, currently the only feature selection function designed +specifically to work with sparse matrices. +""" + +import warnings + +import numpy as np +import pytest +import scipy.stats + +from sklearn.feature_selection import SelectKBest, chi2 +from sklearn.feature_selection._univariate_selection import _chisquare +from sklearn.utils._testing import assert_array_almost_equal, assert_array_equal +from sklearn.utils.fixes import COO_CONTAINERS, CSR_CONTAINERS + +# Feature 0 is highly informative for class 1; +# feature 1 is the same everywhere; +# feature 2 is a bit informative for class 2. +X = [[2, 1, 2], [9, 1, 1], [6, 1, 2], [0, 1, 2]] +y = [0, 1, 2, 2] + + +def mkchi2(k): + """Make k-best chi2 selector""" + return SelectKBest(chi2, k=k) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_chi2(csr_container): + # Test Chi2 feature extraction + + chi2 = mkchi2(k=1).fit(X, y) + chi2 = mkchi2(k=1).fit(X, y) + assert_array_equal(chi2.get_support(indices=True), [0]) + assert_array_equal(chi2.transform(X), np.array(X)[:, [0]]) + + chi2 = mkchi2(k=2).fit(X, y) + assert_array_equal(sorted(chi2.get_support(indices=True)), [0, 2]) + + Xsp = csr_container(X, dtype=np.float64) + chi2 = mkchi2(k=2).fit(Xsp, y) + assert_array_equal(sorted(chi2.get_support(indices=True)), [0, 2]) + Xtrans = chi2.transform(Xsp) + assert_array_equal(Xtrans.shape, [Xsp.shape[0], 2]) + + # == doesn't work on scipy.sparse matrices + Xtrans = Xtrans.toarray() + Xtrans2 = mkchi2(k=2).fit_transform(Xsp, y).toarray() + assert_array_almost_equal(Xtrans, Xtrans2) + + +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +def test_chi2_coo(coo_container): + # Check that chi2 works with a COO matrix + # (as returned by CountVectorizer, DictVectorizer) + Xcoo = coo_container(X) + mkchi2(k=2).fit_transform(Xcoo, y) + # if we got here without an exception, we're safe + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_chi2_negative(csr_container): + # Check for proper error on negative numbers in the input X. 
+ X, y = [[0, 1], [-1e-20, 1]], [0, 1] + for X in (X, np.array(X), csr_container(X)): + with pytest.raises(ValueError): + chi2(X, y) + + +def test_chi2_unused_feature(): + # Unused feature should evaluate to NaN + # and should issue no runtime warning + with warnings.catch_warnings(record=True) as warned: + warnings.simplefilter("always") + chi, p = chi2([[1, 0], [0, 0]], [1, 0]) + for w in warned: + if "divide by zero" in repr(w): + raise AssertionError("Found unexpected warning %s" % w) + assert_array_equal(chi, [1, np.nan]) + assert_array_equal(p[1], np.nan) + + +def test_chisquare(): + # Test replacement for scipy.stats.chisquare against the original. + obs = np.array([[2.0, 2.0], [1.0, 1.0]]) + exp = np.array([[1.5, 1.5], [1.5, 1.5]]) + # call SciPy first because our version overwrites obs + chi_scp, p_scp = scipy.stats.chisquare(obs, exp) + chi_our, p_our = _chisquare(obs, exp) + + assert_array_almost_equal(chi_scp, chi_our) + assert_array_almost_equal(p_scp, p_our) diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_feature_select.py b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_feature_select.py new file mode 100644 index 0000000000000000000000000000000000000000..d7bffec5159bfc7ba8faf452a218d5147906419c --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_feature_select.py @@ -0,0 +1,1018 @@ +""" +Todo: cross-check the F-value with stats model +""" + +import itertools +import warnings + +import numpy as np +import pytest +from numpy.testing import assert_allclose +from scipy import sparse, stats + +from sklearn.datasets import load_iris, make_classification, make_regression +from sklearn.feature_selection import ( + GenericUnivariateSelect, + SelectFdr, + SelectFpr, + SelectFwe, + SelectKBest, + SelectPercentile, + chi2, + f_classif, + f_oneway, + f_regression, + mutual_info_classif, + mutual_info_regression, + r_regression, +) +from sklearn.utils import safe_mask +from sklearn.utils._testing import ( + _convert_container, + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, + ignore_warnings, +) +from sklearn.utils.fixes import CSR_CONTAINERS + +############################################################################## +# Test the score functions + + +def test_f_oneway_vs_scipy_stats(): + # Test that our f_oneway gives the same result as scipy.stats + rng = np.random.RandomState(0) + X1 = rng.randn(10, 3) + X2 = 1 + rng.randn(10, 3) + f, pv = stats.f_oneway(X1, X2) + f2, pv2 = f_oneway(X1, X2) + assert np.allclose(f, f2) + assert np.allclose(pv, pv2) + + +def test_f_oneway_ints(): + # Smoke test f_oneway on integers: that it does raise casting errors + # with recent numpys + rng = np.random.RandomState(0) + X = rng.randint(10, size=(10, 10)) + y = np.arange(10) + fint, pint = f_oneway(X, y) + + # test that is gives the same result as with float + f, p = f_oneway(X.astype(float), y) + assert_array_almost_equal(f, fint, decimal=4) + assert_array_almost_equal(p, pint, decimal=4) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_f_classif(csr_container): + # Test whether the F test yields meaningful results + # on a simple simulated classification problem + X, y = make_classification( + n_samples=200, + n_features=20, + n_informative=3, + n_redundant=2, + n_repeated=0, + n_classes=8, + n_clusters_per_class=1, + flip_y=0.0, + class_sep=10, + shuffle=False, + random_state=0, + ) + + F, pv = f_classif(X, y) + F_sparse, pv_sparse = 
f_classif(csr_container(X), y) + assert (F > 0).all() + assert (pv > 0).all() + assert (pv < 1).all() + assert (pv[:5] < 0.05).all() + assert (pv[5:] > 1.0e-4).all() + assert_array_almost_equal(F_sparse, F) + assert_array_almost_equal(pv_sparse, pv) + + +@pytest.mark.parametrize("center", [True, False]) +def test_r_regression(center): + X, y = make_regression( + n_samples=2000, n_features=20, n_informative=5, shuffle=False, random_state=0 + ) + + corr_coeffs = r_regression(X, y, center=center) + assert (-1 < corr_coeffs).all() + assert (corr_coeffs < 1).all() + + sparse_X = _convert_container(X, "sparse") + + sparse_corr_coeffs = r_regression(sparse_X, y, center=center) + assert_allclose(sparse_corr_coeffs, corr_coeffs) + + # Testing against numpy for reference + Z = np.hstack((X, y[:, np.newaxis])) + correlation_matrix = np.corrcoef(Z, rowvar=False) + np_corr_coeffs = correlation_matrix[:-1, -1] + assert_array_almost_equal(np_corr_coeffs, corr_coeffs, decimal=3) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_f_regression(csr_container): + # Test whether the F test yields meaningful results + # on a simple simulated regression problem + X, y = make_regression( + n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0 + ) + + F, pv = f_regression(X, y) + assert (F > 0).all() + assert (pv > 0).all() + assert (pv < 1).all() + assert (pv[:5] < 0.05).all() + assert (pv[5:] > 1.0e-4).all() + + # with centering, compare with sparse + F, pv = f_regression(X, y, center=True) + F_sparse, pv_sparse = f_regression(csr_container(X), y, center=True) + assert_allclose(F_sparse, F) + assert_allclose(pv_sparse, pv) + + # again without centering, compare with sparse + F, pv = f_regression(X, y, center=False) + F_sparse, pv_sparse = f_regression(csr_container(X), y, center=False) + assert_allclose(F_sparse, F) + assert_allclose(pv_sparse, pv) + + +def test_f_regression_input_dtype(): + # Test whether f_regression returns the same value + # for any numeric data_type + rng = np.random.RandomState(0) + X = rng.rand(10, 20) + y = np.arange(10).astype(int) + + F1, pv1 = f_regression(X, y) + F2, pv2 = f_regression(X, y.astype(float)) + assert_allclose(F1, F2, 5) + assert_allclose(pv1, pv2, 5) + + +def test_f_regression_center(): + # Test whether f_regression preserves dof according to 'center' argument + # We use two centered variates so we have a simple relationship between + # F-score with variates centering and F-score without variates centering. 
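    # Editorial note: centering removes one extra residual degree of freedom
    # (n - 2 instead of n - 1). On this zero-mean toy data the correlation is
    # unchanged, so the two F-scores differ exactly by the factor
    # (n - 1) / (n - 2), which is what the assertion below verifies.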
+ # Create toy example + X = np.arange(-5, 6).reshape(-1, 1) # X has zero mean + n_samples = X.size + Y = np.ones(n_samples) + Y[::2] *= -1.0 + Y[0] = 0.0 # have Y mean being null + + F1, _ = f_regression(X, Y, center=True) + F2, _ = f_regression(X, Y, center=False) + assert_allclose(F1 * (n_samples - 1.0) / (n_samples - 2.0), F2) + assert_almost_equal(F2[0], 0.232558139) # value from statsmodels OLS + + +@pytest.mark.parametrize( + "X, y, expected_corr_coef, force_finite", + [ + ( + # A feature in X is constant - forcing finite + np.array([[2, 1], [2, 0], [2, 10], [2, 4]]), + np.array([0, 1, 1, 0]), + np.array([0.0, 0.32075]), + True, + ), + ( + # The target y is constant - forcing finite + np.array([[5, 1], [3, 0], [2, 10], [8, 4]]), + np.array([0, 0, 0, 0]), + np.array([0.0, 0.0]), + True, + ), + ( + # A feature in X is constant - not forcing finite + np.array([[2, 1], [2, 0], [2, 10], [2, 4]]), + np.array([0, 1, 1, 0]), + np.array([np.nan, 0.32075]), + False, + ), + ( + # The target y is constant - not forcing finite + np.array([[5, 1], [3, 0], [2, 10], [8, 4]]), + np.array([0, 0, 0, 0]), + np.array([np.nan, np.nan]), + False, + ), + ], +) +def test_r_regression_force_finite(X, y, expected_corr_coef, force_finite): + """Check the behaviour of `force_finite` for some corner cases with `r_regression`. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/15672 + """ + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + corr_coef = r_regression(X, y, force_finite=force_finite) + np.testing.assert_array_almost_equal(corr_coef, expected_corr_coef) + + +@pytest.mark.parametrize( + "X, y, expected_f_statistic, expected_p_values, force_finite", + [ + ( + # A feature in X is constant - forcing finite + np.array([[2, 1], [2, 0], [2, 10], [2, 4]]), + np.array([0, 1, 1, 0]), + np.array([0.0, 0.2293578]), + np.array([1.0, 0.67924985]), + True, + ), + ( + # The target y is constant - forcing finite + np.array([[5, 1], [3, 0], [2, 10], [8, 4]]), + np.array([0, 0, 0, 0]), + np.array([0.0, 0.0]), + np.array([1.0, 1.0]), + True, + ), + ( + # Feature in X correlated with y - forcing finite + np.array([[0, 1], [1, 0], [2, 10], [3, 4]]), + np.array([0, 1, 2, 3]), + np.array([np.finfo(np.float64).max, 0.845433]), + np.array([0.0, 0.454913]), + True, + ), + ( + # Feature in X anti-correlated with y - forcing finite + np.array([[3, 1], [2, 0], [1, 10], [0, 4]]), + np.array([0, 1, 2, 3]), + np.array([np.finfo(np.float64).max, 0.845433]), + np.array([0.0, 0.454913]), + True, + ), + ( + # A feature in X is constant - not forcing finite + np.array([[2, 1], [2, 0], [2, 10], [2, 4]]), + np.array([0, 1, 1, 0]), + np.array([np.nan, 0.2293578]), + np.array([np.nan, 0.67924985]), + False, + ), + ( + # The target y is constant - not forcing finite + np.array([[5, 1], [3, 0], [2, 10], [8, 4]]), + np.array([0, 0, 0, 0]), + np.array([np.nan, np.nan]), + np.array([np.nan, np.nan]), + False, + ), + ( + # Feature in X correlated with y - not forcing finite + np.array([[0, 1], [1, 0], [2, 10], [3, 4]]), + np.array([0, 1, 2, 3]), + np.array([np.inf, 0.845433]), + np.array([0.0, 0.454913]), + False, + ), + ( + # Feature in X anti-correlated with y - not forcing finite + np.array([[3, 1], [2, 0], [1, 10], [0, 4]]), + np.array([0, 1, 2, 3]), + np.array([np.inf, 0.845433]), + np.array([0.0, 0.454913]), + False, + ), + ], +) +def test_f_regression_corner_case( + X, y, expected_f_statistic, expected_p_values, force_finite +): + """Check the behaviour of `force_finite` for 
some corner cases with `f_regression`. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/15672 + """ + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + f_statistic, p_values = f_regression(X, y, force_finite=force_finite) + np.testing.assert_array_almost_equal(f_statistic, expected_f_statistic) + np.testing.assert_array_almost_equal(p_values, expected_p_values) + + +def test_f_classif_multi_class(): + # Test whether the F test yields meaningful results + # on a simple simulated classification problem + X, y = make_classification( + n_samples=200, + n_features=20, + n_informative=3, + n_redundant=2, + n_repeated=0, + n_classes=8, + n_clusters_per_class=1, + flip_y=0.0, + class_sep=10, + shuffle=False, + random_state=0, + ) + + F, pv = f_classif(X, y) + assert (F > 0).all() + assert (pv > 0).all() + assert (pv < 1).all() + assert (pv[:5] < 0.05).all() + assert (pv[5:] > 1.0e-4).all() + + +def test_select_percentile_classif(): + # Test whether the relative univariate feature selection + # gets the correct items in a simple classification problem + # with the percentile heuristic + X, y = make_classification( + n_samples=200, + n_features=20, + n_informative=3, + n_redundant=2, + n_repeated=0, + n_classes=8, + n_clusters_per_class=1, + flip_y=0.0, + class_sep=10, + shuffle=False, + random_state=0, + ) + + univariate_filter = SelectPercentile(f_classif, percentile=25) + X_r = univariate_filter.fit(X, y).transform(X) + X_r2 = ( + GenericUnivariateSelect(f_classif, mode="percentile", param=25) + .fit(X, y) + .transform(X) + ) + assert_array_equal(X_r, X_r2) + support = univariate_filter.get_support() + gtruth = np.zeros(20) + gtruth[:5] = 1 + assert_array_equal(support, gtruth) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_select_percentile_classif_sparse(csr_container): + # Test whether the relative univariate feature selection + # gets the correct items in a simple classification problem + # with the percentile heuristic + X, y = make_classification( + n_samples=200, + n_features=20, + n_informative=3, + n_redundant=2, + n_repeated=0, + n_classes=8, + n_clusters_per_class=1, + flip_y=0.0, + class_sep=10, + shuffle=False, + random_state=0, + ) + X = csr_container(X) + univariate_filter = SelectPercentile(f_classif, percentile=25) + X_r = univariate_filter.fit(X, y).transform(X) + X_r2 = ( + GenericUnivariateSelect(f_classif, mode="percentile", param=25) + .fit(X, y) + .transform(X) + ) + assert_array_equal(X_r.toarray(), X_r2.toarray()) + support = univariate_filter.get_support() + gtruth = np.zeros(20) + gtruth[:5] = 1 + assert_array_equal(support, gtruth) + + X_r2inv = univariate_filter.inverse_transform(X_r2) + assert sparse.issparse(X_r2inv) + support_mask = safe_mask(X_r2inv, support) + assert X_r2inv.shape == X.shape + assert_array_equal(X_r2inv[:, support_mask].toarray(), X_r.toarray()) + # Check other columns are empty + assert X_r2inv.nnz == X_r.nnz + + +############################################################################## +# Test univariate selection in classification settings + + +def test_select_kbest_classif(): + # Test whether the relative univariate feature selection + # gets the correct items in a simple classification problem + # with the k best heuristic + X, y = make_classification( + n_samples=200, + n_features=20, + n_informative=3, + n_redundant=2, + n_repeated=0, + n_classes=8, + n_clusters_per_class=1, + flip_y=0.0, + class_sep=10, + shuffle=False, + random_state=0, + ) 
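    # Editorial note: with shuffle=False the 3 informative + 2 redundant
    # columns are the first five features, so the k=5 filter below is expected
    # to select exactly columns 0-4 (the gtruth mask).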
+ + univariate_filter = SelectKBest(f_classif, k=5) + X_r = univariate_filter.fit(X, y).transform(X) + X_r2 = ( + GenericUnivariateSelect(f_classif, mode="k_best", param=5) + .fit(X, y) + .transform(X) + ) + assert_array_equal(X_r, X_r2) + support = univariate_filter.get_support() + gtruth = np.zeros(20) + gtruth[:5] = 1 + assert_array_equal(support, gtruth) + + +def test_select_kbest_all(): + # Test whether k="all" correctly returns all features. + X, y = make_classification( + n_samples=20, n_features=10, shuffle=False, random_state=0 + ) + + univariate_filter = SelectKBest(f_classif, k="all") + X_r = univariate_filter.fit(X, y).transform(X) + assert_array_equal(X, X_r) + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/24949 + X_r2 = ( + GenericUnivariateSelect(f_classif, mode="k_best", param="all") + .fit(X, y) + .transform(X) + ) + assert_array_equal(X_r, X_r2) + + +@pytest.mark.parametrize("dtype_in", [np.float32, np.float64]) +def test_select_kbest_zero(dtype_in): + # Test whether k=0 correctly returns no features. + X, y = make_classification( + n_samples=20, n_features=10, shuffle=False, random_state=0 + ) + X = X.astype(dtype_in) + + univariate_filter = SelectKBest(f_classif, k=0) + univariate_filter.fit(X, y) + support = univariate_filter.get_support() + gtruth = np.zeros(10, dtype=bool) + assert_array_equal(support, gtruth) + with pytest.warns(UserWarning, match="No features were selected"): + X_selected = univariate_filter.transform(X) + assert X_selected.shape == (20, 0) + assert X_selected.dtype == dtype_in + + +def test_select_heuristics_classif(): + # Test whether the relative univariate feature selection + # gets the correct items in a simple classification problem + # with the fdr, fwe and fpr heuristics + X, y = make_classification( + n_samples=200, + n_features=20, + n_informative=3, + n_redundant=2, + n_repeated=0, + n_classes=8, + n_clusters_per_class=1, + flip_y=0.0, + class_sep=10, + shuffle=False, + random_state=0, + ) + + univariate_filter = SelectFwe(f_classif, alpha=0.01) + X_r = univariate_filter.fit(X, y).transform(X) + gtruth = np.zeros(20) + gtruth[:5] = 1 + for mode in ["fdr", "fpr", "fwe"]: + X_r2 = ( + GenericUnivariateSelect(f_classif, mode=mode, param=0.01) + .fit(X, y) + .transform(X) + ) + assert_array_equal(X_r, X_r2) + support = univariate_filter.get_support() + assert_allclose(support, gtruth) + + +############################################################################## +# Test univariate selection in regression settings + + +def assert_best_scores_kept(score_filter): + scores = score_filter.scores_ + support = score_filter.get_support() + assert_allclose(np.sort(scores[support]), np.sort(scores)[-support.sum() :]) + + +def test_select_percentile_regression(): + # Test whether the relative univariate feature selection + # gets the correct items in a simple regression problem + # with the percentile heuristic + X, y = make_regression( + n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0 + ) + + univariate_filter = SelectPercentile(f_regression, percentile=25) + X_r = univariate_filter.fit(X, y).transform(X) + assert_best_scores_kept(univariate_filter) + X_r2 = ( + GenericUnivariateSelect(f_regression, mode="percentile", param=25) + .fit(X, y) + .transform(X) + ) + assert_array_equal(X_r, X_r2) + support = univariate_filter.get_support() + gtruth = np.zeros(20) + gtruth[:5] = 1 + assert_array_equal(support, gtruth) + X_2 = X.copy() + X_2[:, np.logical_not(support)] = 0 + 
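    # Editorial note: inverse_transform maps the reduced matrix back to the
    # original feature space, filling the dropped columns with zeros, so it
    # should equal X with the unselected columns zeroed out (X_2 above).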
assert_array_equal(X_2, univariate_filter.inverse_transform(X_r)) + # Check inverse_transform respects dtype + assert_array_equal( + X_2.astype(bool), univariate_filter.inverse_transform(X_r.astype(bool)) + ) + + +def test_select_percentile_regression_full(): + # Test whether the relative univariate feature selection + # selects all features when '100%' is asked. + X, y = make_regression( + n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0 + ) + + univariate_filter = SelectPercentile(f_regression, percentile=100) + X_r = univariate_filter.fit(X, y).transform(X) + assert_best_scores_kept(univariate_filter) + X_r2 = ( + GenericUnivariateSelect(f_regression, mode="percentile", param=100) + .fit(X, y) + .transform(X) + ) + assert_array_equal(X_r, X_r2) + support = univariate_filter.get_support() + gtruth = np.ones(20) + assert_array_equal(support, gtruth) + + +def test_select_kbest_regression(): + # Test whether the relative univariate feature selection + # gets the correct items in a simple regression problem + # with the k best heuristic + X, y = make_regression( + n_samples=200, + n_features=20, + n_informative=5, + shuffle=False, + random_state=0, + noise=10, + ) + + univariate_filter = SelectKBest(f_regression, k=5) + X_r = univariate_filter.fit(X, y).transform(X) + assert_best_scores_kept(univariate_filter) + X_r2 = ( + GenericUnivariateSelect(f_regression, mode="k_best", param=5) + .fit(X, y) + .transform(X) + ) + assert_array_equal(X_r, X_r2) + support = univariate_filter.get_support() + gtruth = np.zeros(20) + gtruth[:5] = 1 + assert_array_equal(support, gtruth) + + +def test_select_heuristics_regression(): + # Test whether the relative univariate feature selection + # gets the correct items in a simple regression problem + # with the fpr, fdr or fwe heuristics + X, y = make_regression( + n_samples=200, + n_features=20, + n_informative=5, + shuffle=False, + random_state=0, + noise=10, + ) + + univariate_filter = SelectFpr(f_regression, alpha=0.01) + X_r = univariate_filter.fit(X, y).transform(X) + gtruth = np.zeros(20) + gtruth[:5] = 1 + for mode in ["fdr", "fpr", "fwe"]: + X_r2 = ( + GenericUnivariateSelect(f_regression, mode=mode, param=0.01) + .fit(X, y) + .transform(X) + ) + assert_array_equal(X_r, X_r2) + support = univariate_filter.get_support() + assert_array_equal(support[:5], np.ones((5,), dtype=bool)) + assert np.sum(support[5:] == 1) < 3 + + +def test_boundary_case_ch2(): + # Test boundary case, and always aim to select 1 feature. 
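    # Editorial note: for these toy counts chi2 returns scores of roughly
    # (4.0, 0.71) and p-values of roughly (0.046, 0.398), so every selector
    # configured below (fdr, k-best, percentile, fpr, fwe) should keep only
    # the first feature.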
+ X = np.array([[10, 20], [20, 20], [20, 30]]) + y = np.array([[1], [0], [0]]) + scores, pvalues = chi2(X, y) + assert_array_almost_equal(scores, np.array([4.0, 0.71428571])) + assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472])) + + filter_fdr = SelectFdr(chi2, alpha=0.1) + filter_fdr.fit(X, y) + support_fdr = filter_fdr.get_support() + assert_array_equal(support_fdr, np.array([True, False])) + + filter_kbest = SelectKBest(chi2, k=1) + filter_kbest.fit(X, y) + support_kbest = filter_kbest.get_support() + assert_array_equal(support_kbest, np.array([True, False])) + + filter_percentile = SelectPercentile(chi2, percentile=50) + filter_percentile.fit(X, y) + support_percentile = filter_percentile.get_support() + assert_array_equal(support_percentile, np.array([True, False])) + + filter_fpr = SelectFpr(chi2, alpha=0.1) + filter_fpr.fit(X, y) + support_fpr = filter_fpr.get_support() + assert_array_equal(support_fpr, np.array([True, False])) + + filter_fwe = SelectFwe(chi2, alpha=0.1) + filter_fwe.fit(X, y) + support_fwe = filter_fwe.get_support() + assert_array_equal(support_fwe, np.array([True, False])) + + +@pytest.mark.parametrize("alpha", [0.001, 0.01, 0.1]) +@pytest.mark.parametrize("n_informative", [1, 5, 10]) +def test_select_fdr_regression(alpha, n_informative): + # Test that fdr heuristic actually has low FDR. + def single_fdr(alpha, n_informative, random_state): + X, y = make_regression( + n_samples=150, + n_features=20, + n_informative=n_informative, + shuffle=False, + random_state=random_state, + noise=10, + ) + + with warnings.catch_warnings(record=True): + # Warnings can be raised when no features are selected + # (low alpha or very noisy data) + univariate_filter = SelectFdr(f_regression, alpha=alpha) + X_r = univariate_filter.fit(X, y).transform(X) + X_r2 = ( + GenericUnivariateSelect(f_regression, mode="fdr", param=alpha) + .fit(X, y) + .transform(X) + ) + + assert_array_equal(X_r, X_r2) + support = univariate_filter.get_support() + num_false_positives = np.sum(support[n_informative:] == 1) + num_true_positives = np.sum(support[:n_informative] == 1) + + if num_false_positives == 0: + return 0.0 + false_discovery_rate = num_false_positives / ( + num_true_positives + num_false_positives + ) + return false_discovery_rate + + # As per Benjamini-Hochberg, the expected false discovery rate + # should be lower than alpha: + # FDR = E(FP / (TP + FP)) <= alpha + false_discovery_rate = np.mean( + [single_fdr(alpha, n_informative, random_state) for random_state in range(100)] + ) + assert alpha >= false_discovery_rate + + # Make sure that the empirical false discovery rate increases + # with alpha: + if false_discovery_rate != 0: + assert false_discovery_rate > alpha / 10 + + +def test_select_fwe_regression(): + # Test whether the relative univariate feature selection + # gets the correct items in a simple regression problem + # with the fwe heuristic + X, y = make_regression( + n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0 + ) + + univariate_filter = SelectFwe(f_regression, alpha=0.01) + X_r = univariate_filter.fit(X, y).transform(X) + X_r2 = ( + GenericUnivariateSelect(f_regression, mode="fwe", param=0.01) + .fit(X, y) + .transform(X) + ) + assert_array_equal(X_r, X_r2) + support = univariate_filter.get_support() + gtruth = np.zeros(20) + gtruth[:5] = 1 + assert_array_equal(support[:5], np.ones((5,), dtype=bool)) + assert np.sum(support[5:] == 1) < 2 + + +def test_selectkbest_tiebreaking(): + # Test whether SelectKBest actually selects k 
features in case of ties. + # Prior to 0.11, SelectKBest would return more features than requested. + Xs = [[0, 1, 1], [0, 0, 1], [1, 0, 0], [1, 1, 0]] + y = [1] + dummy_score = lambda X, y: (X[0], X[0]) + for X in Xs: + sel = SelectKBest(dummy_score, k=1) + X1 = ignore_warnings(sel.fit_transform)([X], y) + assert X1.shape[1] == 1 + assert_best_scores_kept(sel) + + sel = SelectKBest(dummy_score, k=2) + X2 = ignore_warnings(sel.fit_transform)([X], y) + assert X2.shape[1] == 2 + assert_best_scores_kept(sel) + + +def test_selectpercentile_tiebreaking(): + # Test if SelectPercentile selects the right n_features in case of ties. + Xs = [[0, 1, 1], [0, 0, 1], [1, 0, 0], [1, 1, 0]] + y = [1] + dummy_score = lambda X, y: (X[0], X[0]) + for X in Xs: + sel = SelectPercentile(dummy_score, percentile=34) + X1 = ignore_warnings(sel.fit_transform)([X], y) + assert X1.shape[1] == 1 + assert_best_scores_kept(sel) + + sel = SelectPercentile(dummy_score, percentile=67) + X2 = ignore_warnings(sel.fit_transform)([X], y) + assert X2.shape[1] == 2 + assert_best_scores_kept(sel) + + +def test_tied_pvalues(): + # Test whether k-best and percentiles work with tied pvalues from chi2. + # chi2 will return the same p-values for the following features, but it + # will return different scores. + X0 = np.array([[10000, 9999, 9998], [1, 1, 1]]) + y = [0, 1] + + for perm in itertools.permutations((0, 1, 2)): + X = X0[:, perm] + Xt = SelectKBest(chi2, k=2).fit_transform(X, y) + assert Xt.shape == (2, 2) + assert 9998 not in Xt + + Xt = SelectPercentile(chi2, percentile=67).fit_transform(X, y) + assert Xt.shape == (2, 2) + assert 9998 not in Xt + + +def test_scorefunc_multilabel(): + # Test whether k-best and percentiles works with multilabels with chi2. + + X = np.array([[10000, 9999, 0], [100, 9999, 0], [1000, 99, 0]]) + y = [[1, 1], [0, 1], [1, 0]] + + Xt = SelectKBest(chi2, k=2).fit_transform(X, y) + assert Xt.shape == (3, 2) + assert 0 not in Xt + + Xt = SelectPercentile(chi2, percentile=67).fit_transform(X, y) + assert Xt.shape == (3, 2) + assert 0 not in Xt + + +def test_tied_scores(): + # Test for stable sorting in k-best with tied scores. + X_train = np.array([[0, 0, 0], [1, 1, 1]]) + y_train = [0, 1] + + for n_features in [1, 2, 3]: + sel = SelectKBest(chi2, k=n_features).fit(X_train, y_train) + X_test = sel.transform([[0, 1, 2]]) + assert_array_equal(X_test[0], np.arange(3)[-n_features:]) + + +def test_nans(): + # Assert that SelectKBest and SelectPercentile can handle NaNs. + # First feature has zero variance to confuse f_classif (ANOVA) and + # make it return a NaN. + X = [[0, 1, 0], [0, -1, -1], [0, 0.5, 0.5]] + y = [1, 0, 1] + + for select in ( + SelectKBest(f_classif, k=2), + SelectPercentile(f_classif, percentile=67), + ): + ignore_warnings(select.fit)(X, y) + assert_array_equal(select.get_support(indices=True), np.array([1, 2])) + + +def test_invalid_k(): + X = [[0, 1, 0], [0, -1, -1], [0, 0.5, 0.5]] + y = [1, 0, 1] + + msg = "k=4 is greater than n_features=3. All the features will be returned." + with pytest.warns(UserWarning, match=msg): + SelectKBest(k=4).fit(X, y) + with pytest.warns(UserWarning, match=msg): + GenericUnivariateSelect(mode="k_best", param=4).fit(X, y) + + +def test_f_classif_constant_feature(): + # Test that f_classif warns if a feature is constant throughout. 
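    # Editorial note: a constant column has zero variance both within and
    # between classes, so the ANOVA F-statistic is ill-defined for it and
    # f_classif is expected to emit a UserWarning about the constant feature.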
+ + X, y = make_classification(n_samples=10, n_features=5) + X[:, 0] = 2.0 + with pytest.warns(UserWarning): + f_classif(X, y) + + +def test_no_feature_selected(): + rng = np.random.RandomState(0) + + # Generate random uncorrelated data: a strict univariate test should + # rejects all the features + X = rng.rand(40, 10) + y = rng.randint(0, 4, size=40) + strict_selectors = [ + SelectFwe(alpha=0.01).fit(X, y), + SelectFdr(alpha=0.01).fit(X, y), + SelectFpr(alpha=0.01).fit(X, y), + SelectPercentile(percentile=0).fit(X, y), + SelectKBest(k=0).fit(X, y), + ] + for selector in strict_selectors: + assert_array_equal(selector.get_support(), np.zeros(10)) + with pytest.warns(UserWarning, match="No features were selected"): + X_selected = selector.transform(X) + assert X_selected.shape == (40, 0) + + +def test_mutual_info_classif(): + X, y = make_classification( + n_samples=100, + n_features=5, + n_informative=1, + n_redundant=1, + n_repeated=0, + n_classes=2, + n_clusters_per_class=1, + flip_y=0.0, + class_sep=10, + shuffle=False, + random_state=0, + ) + + # Test in KBest mode. + univariate_filter = SelectKBest(mutual_info_classif, k=2) + X_r = univariate_filter.fit(X, y).transform(X) + X_r2 = ( + GenericUnivariateSelect(mutual_info_classif, mode="k_best", param=2) + .fit(X, y) + .transform(X) + ) + assert_array_equal(X_r, X_r2) + support = univariate_filter.get_support() + gtruth = np.zeros(5) + gtruth[:2] = 1 + assert_array_equal(support, gtruth) + + # Test in Percentile mode. + univariate_filter = SelectPercentile(mutual_info_classif, percentile=40) + X_r = univariate_filter.fit(X, y).transform(X) + X_r2 = ( + GenericUnivariateSelect(mutual_info_classif, mode="percentile", param=40) + .fit(X, y) + .transform(X) + ) + assert_array_equal(X_r, X_r2) + support = univariate_filter.get_support() + gtruth = np.zeros(5) + gtruth[:2] = 1 + assert_array_equal(support, gtruth) + + +def test_mutual_info_regression(): + X, y = make_regression( + n_samples=100, + n_features=10, + n_informative=2, + shuffle=False, + random_state=0, + noise=10, + ) + + # Test in KBest mode. + univariate_filter = SelectKBest(mutual_info_regression, k=2) + X_r = univariate_filter.fit(X, y).transform(X) + assert_best_scores_kept(univariate_filter) + X_r2 = ( + GenericUnivariateSelect(mutual_info_regression, mode="k_best", param=2) + .fit(X, y) + .transform(X) + ) + assert_array_equal(X_r, X_r2) + support = univariate_filter.get_support() + gtruth = np.zeros(10) + gtruth[:2] = 1 + assert_array_equal(support, gtruth) + + # Test in Percentile mode. + univariate_filter = SelectPercentile(mutual_info_regression, percentile=20) + X_r = univariate_filter.fit(X, y).transform(X) + X_r2 = ( + GenericUnivariateSelect(mutual_info_regression, mode="percentile", param=20) + .fit(X, y) + .transform(X) + ) + assert_array_equal(X_r, X_r2) + support = univariate_filter.get_support() + gtruth = np.zeros(10) + gtruth[:2] = 1 + assert_array_equal(support, gtruth) + + +def test_dataframe_output_dtypes(): + """Check that the output datafarme dtypes are the same as the input. + + Non-regression test for gh-24860. 
+ """ + pd = pytest.importorskip("pandas") + + X, y = load_iris(return_X_y=True, as_frame=True) + X = X.astype( + { + "petal length (cm)": np.float32, + "petal width (cm)": np.float64, + } + ) + X["petal_width_binned"] = pd.cut(X["petal width (cm)"], bins=10) + + column_order = X.columns + + def selector(X, y): + ranking = { + "sepal length (cm)": 1, + "sepal width (cm)": 2, + "petal length (cm)": 3, + "petal width (cm)": 4, + "petal_width_binned": 5, + } + return np.asarray([ranking[name] for name in column_order]) + + univariate_filter = SelectKBest(selector, k=3).set_output(transform="pandas") + output = univariate_filter.fit_transform(X, y) + + assert_array_equal( + output.columns, ["petal length (cm)", "petal width (cm)", "petal_width_binned"] + ) + for name, dtype in output.dtypes.items(): + assert dtype == X.dtypes[name] + + +@pytest.mark.parametrize( + "selector", + [ + SelectKBest(k=4), + SelectPercentile(percentile=80), + GenericUnivariateSelect(mode="k_best", param=4), + GenericUnivariateSelect(mode="percentile", param=80), + ], +) +def test_unsupervised_filter(selector): + """Check support for unsupervised feature selection for the filter that could + require only `X`. + """ + rng = np.random.RandomState(0) + X = rng.randn(10, 5) + + def score_func(X, y=None): + return np.array([1, 1, 1, 1, 0]) + + selector.set_params(score_func=score_func) + selector.fit(X) + X_trans = selector.transform(X) + assert_allclose(X_trans, X[:, :4]) + X_trans = selector.fit_transform(X) + assert_allclose(X_trans, X[:, :4]) diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_from_model.py b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_from_model.py new file mode 100644 index 0000000000000000000000000000000000000000..17bedf44748fbcf9e65e9b2aee1a94621d1b709e --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_from_model.py @@ -0,0 +1,704 @@ +import re +import warnings +from unittest.mock import Mock + +import numpy as np +import pytest + +from sklearn import datasets +from sklearn.base import BaseEstimator +from sklearn.cross_decomposition import CCA, PLSCanonical, PLSRegression +from sklearn.datasets import make_friedman1, make_regression +from sklearn.decomposition import PCA +from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier +from sklearn.exceptions import NotFittedError +from sklearn.feature_selection import SelectFromModel +from sklearn.linear_model import ( + ElasticNet, + ElasticNetCV, + Lasso, + LassoCV, + LinearRegression, + LogisticRegression, + PassiveAggressiveClassifier, + SGDClassifier, +) +from sklearn.pipeline import make_pipeline +from sklearn.svm import LinearSVC +from sklearn.utils._testing import ( + MinimalClassifier, + assert_allclose, + assert_array_almost_equal, + assert_array_equal, + skip_if_32bit, +) + + +class NaNTag(BaseEstimator): + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + return tags + + +class NoNaNTag(BaseEstimator): + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = False + return tags + + +class NaNTagRandomForest(RandomForestClassifier): + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + return tags + + +iris = datasets.load_iris() +data, y = iris.data, iris.target + + +def test_invalid_input(): + clf = SGDClassifier( + alpha=0.1, max_iter=10, shuffle=True, random_state=None, 
tol=None + ) + for threshold in ["gobbledigook", ".5 * gobbledigook"]: + model = SelectFromModel(clf, threshold=threshold) + model.fit(data, y) + with pytest.raises(ValueError): + model.transform(data) + + +def test_input_estimator_unchanged(): + # Test that SelectFromModel fits on a clone of the estimator. + est = RandomForestClassifier() + transformer = SelectFromModel(estimator=est) + transformer.fit(data, y) + assert transformer.estimator is est + + +@pytest.mark.parametrize( + "max_features, err_type, err_msg", + [ + ( + data.shape[1] + 1, + ValueError, + "max_features ==", + ), + ( + lambda X: 1.5, + TypeError, + "max_features must be an instance of int, not float.", + ), + ( + lambda X: data.shape[1] + 1, + ValueError, + "max_features ==", + ), + ( + lambda X: -1, + ValueError, + "max_features ==", + ), + ], +) +def test_max_features_error(max_features, err_type, err_msg): + err_msg = re.escape(err_msg) + clf = RandomForestClassifier(n_estimators=5, random_state=0) + + transformer = SelectFromModel( + estimator=clf, max_features=max_features, threshold=-np.inf + ) + with pytest.raises(err_type, match=err_msg): + transformer.fit(data, y) + + +@pytest.mark.parametrize("max_features", [0, 2, data.shape[1], None]) +def test_inferred_max_features_integer(max_features): + """Check max_features_ and output shape for integer max_features.""" + clf = RandomForestClassifier(n_estimators=5, random_state=0) + transformer = SelectFromModel( + estimator=clf, max_features=max_features, threshold=-np.inf + ) + X_trans = transformer.fit_transform(data, y) + if max_features is not None: + assert transformer.max_features_ == max_features + assert X_trans.shape[1] == transformer.max_features_ + else: + assert not hasattr(transformer, "max_features_") + assert X_trans.shape[1] == data.shape[1] + + +@pytest.mark.parametrize( + "max_features", + [lambda X: 1, lambda X: X.shape[1], lambda X: min(X.shape[1], 10000)], +) +def test_inferred_max_features_callable(max_features): + """Check max_features_ and output shape for callable max_features.""" + clf = RandomForestClassifier(n_estimators=5, random_state=0) + transformer = SelectFromModel( + estimator=clf, max_features=max_features, threshold=-np.inf + ) + X_trans = transformer.fit_transform(data, y) + assert transformer.max_features_ == max_features(data) + assert X_trans.shape[1] == transformer.max_features_ + + +@pytest.mark.parametrize("max_features", [lambda X: round(len(X[0]) / 2), 2]) +def test_max_features_array_like(max_features): + X = [ + [0.87, -1.34, 0.31], + [-2.79, -0.02, -0.85], + [-1.34, -0.48, -2.55], + [1.92, 1.48, 0.65], + ] + y = [0, 1, 0, 1] + + clf = RandomForestClassifier(n_estimators=5, random_state=0) + transformer = SelectFromModel( + estimator=clf, max_features=max_features, threshold=-np.inf + ) + X_trans = transformer.fit_transform(X, y) + assert X_trans.shape[1] == transformer.max_features_ + + +@pytest.mark.parametrize( + "max_features", + [lambda X: min(X.shape[1], 10000), lambda X: X.shape[1], lambda X: 1], +) +def test_max_features_callable_data(max_features): + """Tests that the callable passed to `fit` is called on X.""" + clf = RandomForestClassifier(n_estimators=50, random_state=0) + m = Mock(side_effect=max_features) + transformer = SelectFromModel(estimator=clf, max_features=m, threshold=-np.inf) + transformer.fit_transform(data, y) + m.assert_called_with(data) + + +class FixedImportanceEstimator(BaseEstimator): + def __init__(self, importances): + self.importances = importances + + def fit(self, X, y=None): + 
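        # Editorial note: expose the fixed importances as feature_importances_
        # so that SelectFromModel can rank features deterministically without
        # doing any real fitting.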
self.feature_importances_ = np.array(self.importances) + + +def test_max_features(): + # Test max_features parameter using various values + X, y = datasets.make_classification( + n_samples=1000, + n_features=10, + n_informative=3, + n_redundant=0, + n_repeated=0, + shuffle=False, + random_state=0, + ) + max_features = X.shape[1] + est = RandomForestClassifier(n_estimators=50, random_state=0) + + transformer1 = SelectFromModel(estimator=est, threshold=-np.inf) + transformer2 = SelectFromModel( + estimator=est, max_features=max_features, threshold=-np.inf + ) + X_new1 = transformer1.fit_transform(X, y) + X_new2 = transformer2.fit_transform(X, y) + assert_allclose(X_new1, X_new2) + + # Test max_features against actual model. + transformer1 = SelectFromModel(estimator=Lasso(alpha=0.025, random_state=42)) + X_new1 = transformer1.fit_transform(X, y) + scores1 = np.abs(transformer1.estimator_.coef_) + candidate_indices1 = np.argsort(-scores1, kind="mergesort") + + for n_features in range(1, X_new1.shape[1] + 1): + transformer2 = SelectFromModel( + estimator=Lasso(alpha=0.025, random_state=42), + max_features=n_features, + threshold=-np.inf, + ) + X_new2 = transformer2.fit_transform(X, y) + scores2 = np.abs(transformer2.estimator_.coef_) + candidate_indices2 = np.argsort(-scores2, kind="mergesort") + assert_allclose( + X[:, candidate_indices1[:n_features]], X[:, candidate_indices2[:n_features]] + ) + assert_allclose(transformer1.estimator_.coef_, transformer2.estimator_.coef_) + + +def test_max_features_tiebreak(): + # Test if max_features can break tie among feature importance + X, y = datasets.make_classification( + n_samples=1000, + n_features=10, + n_informative=3, + n_redundant=0, + n_repeated=0, + shuffle=False, + random_state=0, + ) + max_features = X.shape[1] + + feature_importances = np.array([4, 4, 4, 4, 3, 3, 3, 2, 2, 1]) + for n_features in range(1, max_features + 1): + transformer = SelectFromModel( + FixedImportanceEstimator(feature_importances), + max_features=n_features, + threshold=-np.inf, + ) + X_new = transformer.fit_transform(X, y) + selected_feature_indices = np.where(transformer._get_support_mask())[0] + assert_array_equal(selected_feature_indices, np.arange(n_features)) + assert X_new.shape[1] == n_features + + +def test_threshold_and_max_features(): + X, y = datasets.make_classification( + n_samples=1000, + n_features=10, + n_informative=3, + n_redundant=0, + n_repeated=0, + shuffle=False, + random_state=0, + ) + est = RandomForestClassifier(n_estimators=50, random_state=0) + + transformer1 = SelectFromModel(estimator=est, max_features=3, threshold=-np.inf) + X_new1 = transformer1.fit_transform(X, y) + + transformer2 = SelectFromModel(estimator=est, threshold=0.04) + X_new2 = transformer2.fit_transform(X, y) + + transformer3 = SelectFromModel(estimator=est, max_features=3, threshold=0.04) + X_new3 = transformer3.fit_transform(X, y) + assert X_new3.shape[1] == min(X_new1.shape[1], X_new2.shape[1]) + selected_indices = transformer3.transform(np.arange(X.shape[1])[np.newaxis, :]) + assert_allclose(X_new3, X[:, selected_indices[0]]) + + +@skip_if_32bit +def test_feature_importances(): + X, y = datasets.make_classification( + n_samples=1000, + n_features=10, + n_informative=3, + n_redundant=0, + n_repeated=0, + shuffle=False, + random_state=0, + ) + + est = RandomForestClassifier(n_estimators=50, random_state=0) + for threshold, func in zip(["mean", "median"], [np.mean, np.median]): + transformer = SelectFromModel(estimator=est, threshold=threshold) + transformer.fit(X, y) + 
assert hasattr(transformer.estimator_, "feature_importances_") + + X_new = transformer.transform(X) + assert X_new.shape[1] < X.shape[1] + importances = transformer.estimator_.feature_importances_ + + feature_mask = np.abs(importances) > func(importances) + assert_array_almost_equal(X_new, X[:, feature_mask]) + + +def test_sample_weight(): + # Ensure sample weights are passed to underlying estimator + X, y = datasets.make_classification( + n_samples=100, + n_features=10, + n_informative=3, + n_redundant=0, + n_repeated=0, + shuffle=False, + random_state=0, + ) + + # Check with sample weights + sample_weight = np.ones(y.shape) + sample_weight[y == 1] *= 100 + + est = LogisticRegression(random_state=0, fit_intercept=False) + transformer = SelectFromModel(estimator=est) + transformer.fit(X, y, sample_weight=None) + mask = transformer._get_support_mask() + transformer.fit(X, y, sample_weight=sample_weight) + weighted_mask = transformer._get_support_mask() + assert not np.all(weighted_mask == mask) + transformer.fit(X, y, sample_weight=3 * sample_weight) + reweighted_mask = transformer._get_support_mask() + assert np.all(weighted_mask == reweighted_mask) + + +@pytest.mark.parametrize( + "estimator", + [ + Lasso(alpha=0.1, random_state=42), + LassoCV(random_state=42), + ElasticNet(l1_ratio=1, random_state=42), + ElasticNetCV(l1_ratio=[1], random_state=42), + ], +) +def test_coef_default_threshold(estimator): + X, y = datasets.make_classification( + n_samples=100, + n_features=10, + n_informative=3, + n_redundant=0, + n_repeated=0, + shuffle=False, + random_state=0, + ) + + # For the Lasso and related models, the threshold defaults to 1e-5 + transformer = SelectFromModel(estimator=estimator) + transformer.fit(X, y) + X_new = transformer.transform(X) + mask = np.abs(transformer.estimator_.coef_) > 1e-5 + assert_array_almost_equal(X_new, X[:, mask]) + + +@skip_if_32bit +def test_2d_coef(): + X, y = datasets.make_classification( + n_samples=1000, + n_features=10, + n_informative=3, + n_redundant=0, + n_repeated=0, + shuffle=False, + random_state=0, + n_classes=4, + ) + + est = LogisticRegression() + for threshold, func in zip(["mean", "median"], [np.mean, np.median]): + for order in [1, 2, np.inf]: + # Fit SelectFromModel a multi-class problem + transformer = SelectFromModel( + estimator=LogisticRegression(), threshold=threshold, norm_order=order + ) + transformer.fit(X, y) + assert hasattr(transformer.estimator_, "coef_") + X_new = transformer.transform(X) + assert X_new.shape[1] < X.shape[1] + + # Manually check that the norm is correctly performed + est.fit(X, y) + importances = np.linalg.norm(est.coef_, axis=0, ord=order) + feature_mask = importances > func(importances) + assert_array_almost_equal(X_new, X[:, feature_mask]) + + +def test_partial_fit(): + est = PassiveAggressiveClassifier( + random_state=0, shuffle=False, max_iter=5, tol=None + ) + transformer = SelectFromModel(estimator=est) + transformer.partial_fit(data, y, classes=np.unique(y)) + old_model = transformer.estimator_ + transformer.partial_fit(data, y, classes=np.unique(y)) + new_model = transformer.estimator_ + assert old_model is new_model + + X_transform = transformer.transform(data) + transformer.fit(np.vstack((data, data)), np.concatenate((y, y))) + assert_array_almost_equal(X_transform, transformer.transform(data)) + + # check that if est doesn't have partial_fit, neither does SelectFromModel + transformer = SelectFromModel(estimator=RandomForestClassifier()) + assert not hasattr(transformer, "partial_fit") + + +def 
test_calling_fit_reinitializes(): + est = LinearSVC(random_state=0) + transformer = SelectFromModel(estimator=est) + transformer.fit(data, y) + transformer.set_params(estimator__C=100) + transformer.fit(data, y) + assert transformer.estimator_.C == 100 + + +def test_prefit(): + # Test all possible combinations of the prefit parameter. + + # Passing a prefit parameter with the selected model + # and fitting a unfit model with prefit=False should give same results. + clf = SGDClassifier(alpha=0.1, max_iter=10, shuffle=True, random_state=0, tol=None) + model = SelectFromModel(clf) + model.fit(data, y) + X_transform = model.transform(data) + clf.fit(data, y) + model = SelectFromModel(clf, prefit=True) + assert_array_almost_equal(model.transform(data), X_transform) + model.fit(data, y) + assert model.estimator_ is not clf + + # Check that the model is rewritten if prefit=False and a fitted model is + # passed + model = SelectFromModel(clf, prefit=False) + model.fit(data, y) + assert_array_almost_equal(model.transform(data), X_transform) + + # Check that passing an unfitted estimator with `prefit=True` raises a + # `ValueError` + clf = SGDClassifier(alpha=0.1, max_iter=10, shuffle=True, random_state=0, tol=None) + model = SelectFromModel(clf, prefit=True) + err_msg = "When `prefit=True`, `estimator` is expected to be a fitted estimator." + with pytest.raises(NotFittedError, match=err_msg): + model.fit(data, y) + with pytest.raises(NotFittedError, match=err_msg): + model.partial_fit(data, y) + with pytest.raises(NotFittedError, match=err_msg): + model.transform(data) + + # Check that the internal parameters of prefitted model are not changed + # when calling `fit` or `partial_fit` with `prefit=True` + clf = SGDClassifier(alpha=0.1, max_iter=10, shuffle=True, tol=None).fit(data, y) + model = SelectFromModel(clf, prefit=True) + model.fit(data, y) + assert_allclose(model.estimator_.coef_, clf.coef_) + model.partial_fit(data, y) + assert_allclose(model.estimator_.coef_, clf.coef_) + + +def test_prefit_max_features(): + """Check the interaction between `prefit` and `max_features`.""" + # case 1: an error should be raised at `transform` if `fit` was not called to + # validate the attributes + estimator = RandomForestClassifier(n_estimators=5, random_state=0) + estimator.fit(data, y) + model = SelectFromModel(estimator, prefit=True, max_features=lambda X: X.shape[1]) + + err_msg = ( + "When `prefit=True` and `max_features` is a callable, call `fit` " + "before calling `transform`." + ) + with pytest.raises(NotFittedError, match=err_msg): + model.transform(data) + + # case 2: `max_features` is not validated and different from an integer + # FIXME: we cannot validate the upper bound of the attribute at transform + # and we should force calling `fit` if we intend to force the attribute + # to have such an upper bound. + max_features = 2.5 + model.set_params(max_features=max_features) + with pytest.raises(ValueError, match="`max_features` must be an integer"): + model.transform(data) + + +def test_get_feature_names_out_elasticnetcv(): + """Check if ElasticNetCV works with a list of floats. 
+ + Non-regression test for #30936.""" + X, y = make_regression(n_features=5, n_informative=3, random_state=0) + estimator = ElasticNetCV(l1_ratio=[0.25, 0.5, 0.75], random_state=0) + selector = SelectFromModel(estimator=estimator) + selector.fit(X, y) + + names_out = selector.get_feature_names_out() + mask = selector.get_support() + expected = np.array([f"x{i}" for i in range(X.shape[1])])[mask] + assert_array_equal(names_out, expected) + + +def test_prefit_get_feature_names_out(): + """Check the interaction between prefit and the feature names.""" + clf = RandomForestClassifier(n_estimators=2, random_state=0) + clf.fit(data, y) + model = SelectFromModel(clf, prefit=True, max_features=1) + + name = type(model).__name__ + err_msg = ( + f"This {name} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this estimator." + ) + with pytest.raises(NotFittedError, match=err_msg): + model.get_feature_names_out() + + model.fit(data, y) + feature_names = model.get_feature_names_out() + assert feature_names == ["x3"] + + +def test_threshold_string(): + est = RandomForestClassifier(n_estimators=50, random_state=0) + model = SelectFromModel(est, threshold="0.5*mean") + model.fit(data, y) + X_transform = model.transform(data) + + # Calculate the threshold from the estimator directly. + est.fit(data, y) + threshold = 0.5 * np.mean(est.feature_importances_) + mask = est.feature_importances_ > threshold + assert_array_almost_equal(X_transform, data[:, mask]) + + +def test_threshold_without_refitting(): + # Test that the threshold can be set without refitting the model. + clf = SGDClassifier(alpha=0.1, max_iter=10, shuffle=True, random_state=0, tol=None) + model = SelectFromModel(clf, threshold="0.1 * mean") + model.fit(data, y) + X_transform = model.transform(data) + + # Set a higher threshold to filter out more features. + model.threshold = "1.0 * mean" + assert X_transform.shape[1] > model.transform(data).shape[1] + + +def test_fit_accepts_nan_inf(): + # Test that fit doesn't check for np.inf and np.nan values. + clf = HistGradientBoostingClassifier(random_state=0) + + model = SelectFromModel(estimator=clf) + + nan_data = data.copy() + nan_data[0] = np.nan + nan_data[1] = np.inf + + model.fit(data, y) + + +def test_transform_accepts_nan_inf(): + # Test that transform doesn't check for np.inf and np.nan values. 
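The string thresholds exercised just above ("0.5*mean", "0.1 * mean", "1.0 * mean") are parsed by SelectFromModel as a scaling factor applied to the mean (or median) of the fitted feature importances. A minimal usage sketch of that behaviour, on assumed iris data and with placeholder names such as X_demo (not taken from these tests):

    from sklearn.datasets import load_iris
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_selection import SelectFromModel

    X_demo, y_demo = load_iris(return_X_y=True)
    sfm = SelectFromModel(
        RandomForestClassifier(n_estimators=10, random_state=0),
        threshold="0.5*mean",
    ).fit(X_demo, y_demo)
    sfm.get_support()            # mask of features whose importance clears 0.5 * mean importance
    X_sel = sfm.transform(X_demo)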
+ clf = NaNTagRandomForest(n_estimators=100, random_state=0) + nan_data = data.copy() + + model = SelectFromModel(estimator=clf) + model.fit(nan_data, y) + + nan_data[0] = np.nan + nan_data[1] = np.inf + + model.transform(nan_data) + + +def test_allow_nan_tag_comes_from_estimator(): + allow_nan_est = NaNTag() + model = SelectFromModel(estimator=allow_nan_est) + assert model.__sklearn_tags__().input_tags.allow_nan is True + + no_nan_est = NoNaNTag() + model = SelectFromModel(estimator=no_nan_est) + assert model.__sklearn_tags__().input_tags.allow_nan is False + + +def _pca_importances(pca_estimator): + return np.abs(pca_estimator.explained_variance_) + + +@pytest.mark.parametrize( + "estimator, importance_getter", + [ + ( + make_pipeline(PCA(random_state=0), LogisticRegression()), + "named_steps.logisticregression.coef_", + ), + (PCA(random_state=0), _pca_importances), + ], +) +def test_importance_getter(estimator, importance_getter): + selector = SelectFromModel( + estimator, threshold="mean", importance_getter=importance_getter + ) + selector.fit(data, y) + assert selector.transform(data).shape[1] == 1 + + +@pytest.mark.parametrize("PLSEstimator", [CCA, PLSCanonical, PLSRegression]) +def test_select_from_model_pls(PLSEstimator): + """Check the behaviour of SelectFromModel with PLS estimators. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/12410 + """ + X, y = make_friedman1(n_samples=50, n_features=10, random_state=0) + estimator = PLSEstimator(n_components=1) + model = make_pipeline(SelectFromModel(estimator), estimator).fit(X, y) + assert model.score(X, y) > 0.5 + + +def test_estimator_does_not_support_feature_names(): + """SelectFromModel works with estimators that do not support feature_names_in_. + + Non-regression test for #21949. 
+ """ + pytest.importorskip("pandas") + X, y = datasets.load_iris(as_frame=True, return_X_y=True) + all_feature_names = set(X.columns) + + def importance_getter(estimator): + return np.arange(X.shape[1]) + + selector = SelectFromModel( + MinimalClassifier(), importance_getter=importance_getter + ).fit(X, y) + + # selector learns the feature names itself + assert_array_equal(selector.feature_names_in_, X.columns) + + feature_names_out = set(selector.get_feature_names_out()) + assert feature_names_out < all_feature_names + + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + + selector.transform(X.iloc[1:3]) + + +@pytest.mark.parametrize( + "error, err_msg, max_features", + ( + [ValueError, "max_features == 10, must be <= 4", 10], + [ValueError, "max_features == 5, must be <= 4", lambda x: x.shape[1] + 1], + ), +) +def test_partial_fit_validate_max_features(error, err_msg, max_features): + """Test that partial_fit from SelectFromModel validates `max_features`.""" + X, y = datasets.make_classification( + n_samples=100, + n_features=4, + random_state=0, + ) + + with pytest.raises(error, match=err_msg): + SelectFromModel( + estimator=SGDClassifier(), max_features=max_features + ).partial_fit(X, y, classes=[0, 1]) + + +@pytest.mark.parametrize("as_frame", [True, False]) +def test_partial_fit_validate_feature_names(as_frame): + """Test that partial_fit from SelectFromModel validates `feature_names_in_`.""" + pytest.importorskip("pandas") + X, y = datasets.load_iris(as_frame=as_frame, return_X_y=True) + + selector = SelectFromModel(estimator=SGDClassifier(), max_features=4).partial_fit( + X, y, classes=[0, 1, 2] + ) + if as_frame: + assert_array_equal(selector.feature_names_in_, X.columns) + else: + assert not hasattr(selector, "feature_names_in_") + + +def test_from_model_estimator_attribute_error(): + """Check that we raise the proper AttributeError when the estimator + does not implement the `partial_fit` method, which is decorated with + `available_if`. 
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/28108 + """ + # `LinearRegression` does not implement 'partial_fit' and should raise an + # AttributeError + from_model = SelectFromModel(estimator=LinearRegression()) + + outer_msg = "This 'SelectFromModel' has no attribute 'partial_fit'" + inner_msg = "'LinearRegression' object has no attribute 'partial_fit'" + with pytest.raises(AttributeError, match=outer_msg) as exec_info: + from_model.fit(data, y).partial_fit(data) + assert isinstance(exec_info.value.__cause__, AttributeError) + assert inner_msg in str(exec_info.value.__cause__) diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_mutual_info.py b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_mutual_info.py new file mode 100644 index 0000000000000000000000000000000000000000..4922b7e4e57b352456e8295d7dba44feb4eef535 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_mutual_info.py @@ -0,0 +1,270 @@ +import numpy as np +import pytest + +from sklearn.datasets import make_classification, make_regression +from sklearn.feature_selection import mutual_info_classif, mutual_info_regression +from sklearn.feature_selection._mutual_info import _compute_mi +from sklearn.utils import check_random_state +from sklearn.utils._testing import ( + assert_allclose, + assert_array_equal, +) +from sklearn.utils.fixes import CSR_CONTAINERS + + +def test_compute_mi_dd(): + # In discrete case computations are straightforward and can be done + # by hand on given vectors. + x = np.array([0, 1, 1, 0, 0]) + y = np.array([1, 0, 0, 0, 1]) + + H_x = H_y = -(3 / 5) * np.log(3 / 5) - (2 / 5) * np.log(2 / 5) + H_xy = -1 / 5 * np.log(1 / 5) - 2 / 5 * np.log(2 / 5) - 2 / 5 * np.log(2 / 5) + I_xy = H_x + H_y - H_xy + + assert_allclose(_compute_mi(x, y, x_discrete=True, y_discrete=True), I_xy) + + +def test_compute_mi_cc(global_dtype): + # For two continuous variables a good approach is to test on bivariate + # normal distribution, where mutual information is known. + + # Mean of the distribution, irrelevant for mutual information. + mean = np.zeros(2) + + # Setup covariance matrix with correlation coeff. equal 0.5. + sigma_1 = 1 + sigma_2 = 10 + corr = 0.5 + cov = np.array( + [ + [sigma_1**2, corr * sigma_1 * sigma_2], + [corr * sigma_1 * sigma_2, sigma_2**2], + ] + ) + + # True theoretical mutual information. + I_theory = np.log(sigma_1) + np.log(sigma_2) - 0.5 * np.log(np.linalg.det(cov)) + + rng = check_random_state(0) + Z = rng.multivariate_normal(mean, cov, size=1000).astype(global_dtype, copy=False) + + x, y = Z[:, 0], Z[:, 1] + + # Theory and computed values won't be very close + # We here check with a large relative tolerance + for n_neighbors in [3, 5, 7]: + I_computed = _compute_mi( + x, y, x_discrete=False, y_discrete=False, n_neighbors=n_neighbors + ) + assert_allclose(I_computed, I_theory, rtol=1e-1) + + +def test_compute_mi_cd(global_dtype): + # To test define a joint distribution as follows: + # p(x, y) = p(x) p(y | x) + # X ~ Bernoulli(p) + # (Y | x = 0) ~ Uniform(-1, 1) + # (Y | x = 1) ~ Uniform(0, 2) + + # Use the following formula for mutual information: + # I(X; Y) = H(Y) - H(Y | X) + # Two entropies can be computed by hand: + # H(Y) = -(1-p)/2 * ln((1-p)/2) - p/2*log(p/2) - 1/2*log(1/2) + # H(Y | X) = ln(2) + + # Now we need to implement sampling from out distribution, which is + # done easily using conditional distribution logic. 
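For concreteness, the closed form above can be evaluated by hand at p = 0.5: H(Y) = -0.5 * (0.5*ln(1/4) + 0.5*ln(1/4) + ln(1/2)) = 1.5 * ln(2) and H(Y | X) = ln(2), so I(X; Y) = 0.5 * ln(2), about 0.347 nats. A small illustrative check of that arithmetic:

    import numpy as np

    p = 0.5
    I_theory = -0.5 * (
        (1 - p) * np.log(0.5 * (1 - p)) + p * np.log(0.5 * p) + np.log(0.5)
    ) - np.log(2)
    assert np.isclose(I_theory, 0.5 * np.log(2))   # approximately 0.3466 nats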
+ + n_samples = 1000 + rng = check_random_state(0) + + for p in [0.3, 0.5, 0.7]: + x = rng.uniform(size=n_samples) > p + + y = np.empty(n_samples, global_dtype) + mask = x == 0 + y[mask] = rng.uniform(-1, 1, size=np.sum(mask)) + y[~mask] = rng.uniform(0, 2, size=np.sum(~mask)) + + I_theory = -0.5 * ( + (1 - p) * np.log(0.5 * (1 - p)) + p * np.log(0.5 * p) + np.log(0.5) + ) - np.log(2) + + # Assert the same tolerance. + for n_neighbors in [3, 5, 7]: + I_computed = _compute_mi( + x, y, x_discrete=True, y_discrete=False, n_neighbors=n_neighbors + ) + assert_allclose(I_computed, I_theory, rtol=1e-1) + + +def test_compute_mi_cd_unique_label(global_dtype): + # Test that adding unique label doesn't change MI. + n_samples = 100 + x = np.random.uniform(size=n_samples) > 0.5 + + y = np.empty(n_samples, global_dtype) + mask = x == 0 + y[mask] = np.random.uniform(-1, 1, size=np.sum(mask)) + y[~mask] = np.random.uniform(0, 2, size=np.sum(~mask)) + + mi_1 = _compute_mi(x, y, x_discrete=True, y_discrete=False) + + x = np.hstack((x, 2)) + y = np.hstack((y, 10)) + mi_2 = _compute_mi(x, y, x_discrete=True, y_discrete=False) + + assert_allclose(mi_1, mi_2) + + +# We are going test that feature ordering by MI matches our expectations. +def test_mutual_info_classif_discrete(global_dtype): + X = np.array( + [[0, 0, 0], [1, 1, 0], [2, 0, 1], [2, 0, 1], [2, 0, 1]], dtype=global_dtype + ) + y = np.array([0, 1, 2, 2, 1]) + + # Here X[:, 0] is the most informative feature, and X[:, 1] is weakly + # informative. + mi = mutual_info_classif(X, y, discrete_features=True) + assert_array_equal(np.argsort(-mi), np.array([0, 2, 1])) + + +def test_mutual_info_regression(global_dtype): + # We generate sample from multivariate normal distribution, using + # transformation from initially uncorrelated variables. The zero + # variables after transformation is selected as the target vector, + # it has the strongest correlation with the variable 2, and + # the weakest correlation with the variable 1. + T = np.array([[1, 0.5, 2, 1], [0, 1, 0.1, 0.0], [0, 0.1, 1, 0.1], [0, 0.1, 0.1, 1]]) + cov = T.dot(T.T) + mean = np.zeros(4) + + rng = check_random_state(0) + Z = rng.multivariate_normal(mean, cov, size=1000).astype(global_dtype, copy=False) + X = Z[:, 1:] + y = Z[:, 0] + + mi = mutual_info_regression(X, y, random_state=0) + assert_array_equal(np.argsort(-mi), np.array([1, 2, 0])) + # XXX: should mutual_info_regression be fixed to avoid + # up-casting float32 inputs to float64? + assert mi.dtype == np.float64 + + +def test_mutual_info_classif_mixed(global_dtype): + # Here the target is discrete and there are two continuous and one + # discrete feature. The idea of this test is clear from the code. 
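The private helper _compute_mi tested above underlies the public estimators; a usage sketch of the public API with one discrete column, on synthetic data with placeholder names (X_demo, y_demo are assumptions, not from these tests):

    import numpy as np
    from sklearn.feature_selection import mutual_info_classif

    rng = np.random.RandomState(0)
    X_demo = rng.rand(200, 3)
    X_demo[:, 2] = (X_demo[:, 2] > 0.5).astype(float)          # treat column 2 as discrete
    y_demo = (X_demo[:, 0] + X_demo[:, 2] > 1.0).astype(int)
    mi = mutual_info_classif(
        X_demo, y_demo, discrete_features=[2], n_neighbors=3, random_state=0
    )
    mi.argsort()[::-1]   # feature indices ordered from most to least informative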
+ rng = check_random_state(0) + X = rng.rand(1000, 3).astype(global_dtype, copy=False) + X[:, 1] += X[:, 0] + y = ((0.5 * X[:, 0] + X[:, 2]) > 0.5).astype(int) + X[:, 2] = X[:, 2] > 0.5 + + mi = mutual_info_classif(X, y, discrete_features=[2], n_neighbors=3, random_state=0) + assert_array_equal(np.argsort(-mi), [2, 0, 1]) + for n_neighbors in [5, 7, 9]: + mi_nn = mutual_info_classif( + X, y, discrete_features=[2], n_neighbors=n_neighbors, random_state=0 + ) + # Check that the continuous values have an higher MI with greater + # n_neighbors + assert mi_nn[0] > mi[0] + assert mi_nn[1] > mi[1] + # The n_neighbors should not have any effect on the discrete value + # The MI should be the same + assert mi_nn[2] == mi[2] + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_mutual_info_options(global_dtype, csr_container): + X = np.array( + [[0, 0, 0], [1, 1, 0], [2, 0, 1], [2, 0, 1], [2, 0, 1]], dtype=global_dtype + ) + y = np.array([0, 1, 2, 2, 1], dtype=global_dtype) + X_csr = csr_container(X) + + for mutual_info in (mutual_info_regression, mutual_info_classif): + with pytest.raises(ValueError): + mutual_info(X_csr, y, discrete_features=False) + with pytest.raises(ValueError): + mutual_info(X, y, discrete_features="manual") + with pytest.raises(ValueError): + mutual_info(X_csr, y, discrete_features=[True, False, True]) + with pytest.raises(IndexError): + mutual_info(X, y, discrete_features=[True, False, True, False]) + with pytest.raises(IndexError): + mutual_info(X, y, discrete_features=[1, 4]) + + mi_1 = mutual_info(X, y, discrete_features="auto", random_state=0) + mi_2 = mutual_info(X, y, discrete_features=False, random_state=0) + mi_3 = mutual_info(X_csr, y, discrete_features="auto", random_state=0) + mi_4 = mutual_info(X_csr, y, discrete_features=True, random_state=0) + mi_5 = mutual_info(X, y, discrete_features=[True, False, True], random_state=0) + mi_6 = mutual_info(X, y, discrete_features=[0, 2], random_state=0) + + assert_allclose(mi_1, mi_2) + assert_allclose(mi_3, mi_4) + assert_allclose(mi_5, mi_6) + + assert not np.allclose(mi_1, mi_3) + + +@pytest.mark.parametrize("correlated", [True, False]) +def test_mutual_information_symmetry_classif_regression(correlated, global_random_seed): + """Check that `mutual_info_classif` and `mutual_info_regression` are + symmetric by switching the target `y` as `feature` in `X` and vice + versa. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/23720 + """ + rng = np.random.RandomState(global_random_seed) + n = 100 + d = rng.randint(10, size=n) + + if correlated: + c = d.astype(np.float64) + else: + c = rng.normal(0, 1, size=n) + + mi_classif = mutual_info_classif( + c[:, None], d, discrete_features=[False], random_state=global_random_seed + ) + + mi_regression = mutual_info_regression( + d[:, None], c, discrete_features=[True], random_state=global_random_seed + ) + + assert mi_classif == pytest.approx(mi_regression) + + +def test_mutual_info_regression_X_int_dtype(global_random_seed): + """Check that results agree when X is integer dtype and float dtype. + + Non-regression test for Issue #26696. 
+ """ + rng = np.random.RandomState(global_random_seed) + X = rng.randint(100, size=(100, 10)) + X_float = X.astype(np.float64, copy=True) + y = rng.randint(100, size=100) + + expected = mutual_info_regression(X_float, y, random_state=global_random_seed) + result = mutual_info_regression(X, y, random_state=global_random_seed) + assert_allclose(result, expected) + + +@pytest.mark.parametrize( + "mutual_info_func, data_generator", + [ + (mutual_info_regression, make_regression), + (mutual_info_classif, make_classification), + ], +) +def test_mutual_info_n_jobs(global_random_seed, mutual_info_func, data_generator): + """Check that results are consistent with different `n_jobs`.""" + X, y = data_generator(random_state=global_random_seed) + single_job = mutual_info_func(X, y, random_state=global_random_seed, n_jobs=1) + multi_job = mutual_info_func(X, y, random_state=global_random_seed, n_jobs=2) + assert_allclose(single_job, multi_job) diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_rfe.py b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_rfe.py new file mode 100644 index 0000000000000000000000000000000000000000..1f5672545874c057847a5d135f1c29a7211647e0 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_rfe.py @@ -0,0 +1,755 @@ +""" +Testing Recursive feature elimination +""" + +import re +from operator import attrgetter + +import numpy as np +import pytest +from joblib import parallel_backend +from numpy.testing import assert_allclose, assert_array_almost_equal, assert_array_equal + +from sklearn.base import BaseEstimator, ClassifierMixin, is_classifier +from sklearn.compose import TransformedTargetRegressor +from sklearn.cross_decomposition import CCA, PLSCanonical, PLSRegression +from sklearn.datasets import load_iris, make_classification, make_friedman1 +from sklearn.ensemble import RandomForestClassifier +from sklearn.feature_selection import RFE, RFECV +from sklearn.impute import SimpleImputer +from sklearn.linear_model import LinearRegression, LogisticRegression +from sklearn.metrics import get_scorer, make_scorer, zero_one_loss +from sklearn.model_selection import GroupKFold, cross_val_score +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.svm import SVC, SVR, LinearSVR +from sklearn.utils import check_random_state +from sklearn.utils._testing import ignore_warnings +from sklearn.utils.fixes import CSR_CONTAINERS + + +class MockClassifier(ClassifierMixin, BaseEstimator): + """ + Dummy classifier to test recursive feature elimination + """ + + def __init__(self, foo_param=0): + self.foo_param = foo_param + + def fit(self, X, y): + assert len(X) == len(y) + self.coef_ = np.ones(X.shape[1], dtype=np.float64) + self.classes_ = sorted(set(y)) + return self + + def predict(self, T): + return np.ones(T.shape[0]) + + predict_proba = predict + decision_function = predict + transform = predict + + def score(self, X=None, y=None): + return 0.0 + + def get_params(self, deep=True): + return {"foo_param": self.foo_param} + + def set_params(self, **params): + return self + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + return tags + + +def test_rfe_features_importance(): + generator = check_random_state(0) + iris = load_iris() + # Add some irrelevant features. Random seed is set to make sure that + # irrelevant features are always irrelevant. 
+ X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))] + y = iris.target + + clf = RandomForestClassifier(n_estimators=20, random_state=generator, max_depth=2) + rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1) + rfe.fit(X, y) + assert len(rfe.ranking_) == X.shape[1] + + clf_svc = SVC(kernel="linear") + rfe_svc = RFE(estimator=clf_svc, n_features_to_select=4, step=0.1) + rfe_svc.fit(X, y) + + # Check if the supports are equal + assert_array_equal(rfe.get_support(), rfe_svc.get_support()) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_rfe(csr_container): + generator = check_random_state(0) + iris = load_iris() + # Add some irrelevant features. Random seed is set to make sure that + # irrelevant features are always irrelevant. + X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))] + X_sparse = csr_container(X) + y = iris.target + + # dense model + clf = SVC(kernel="linear") + rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1) + rfe.fit(X, y) + X_r = rfe.transform(X) + clf.fit(X_r, y) + assert len(rfe.ranking_) == X.shape[1] + + # sparse model + clf_sparse = SVC(kernel="linear") + rfe_sparse = RFE(estimator=clf_sparse, n_features_to_select=4, step=0.1) + rfe_sparse.fit(X_sparse, y) + X_r_sparse = rfe_sparse.transform(X_sparse) + + assert X_r.shape == iris.data.shape + assert_array_almost_equal(X_r[:10], iris.data[:10]) + + assert_array_almost_equal(rfe.predict(X), clf.predict(iris.data)) + assert rfe.score(X, y) == clf.score(iris.data, iris.target) + assert_array_almost_equal(X_r, X_r_sparse.toarray()) + + +def test_RFE_fit_score_params(): + # Make sure RFE passes the metadata down to fit and score methods of the + # underlying estimator + class TestEstimator(BaseEstimator, ClassifierMixin): + def fit(self, X, y, prop=None): + if prop is None: + raise ValueError("fit: prop cannot be None") + self.svc_ = SVC(kernel="linear").fit(X, y) + self.coef_ = self.svc_.coef_ + return self + + def score(self, X, y, prop=None): + if prop is None: + raise ValueError("score: prop cannot be None") + return self.svc_.score(X, y) + + X, y = load_iris(return_X_y=True) + with pytest.raises(ValueError, match="fit: prop cannot be None"): + RFE(estimator=TestEstimator()).fit(X, y) + with pytest.raises(ValueError, match="score: prop cannot be None"): + RFE(estimator=TestEstimator()).fit(X, y, prop="foo").score(X, y) + + RFE(estimator=TestEstimator()).fit(X, y, prop="foo").score(X, y, prop="foo") + + +def test_rfe_percent_n_features(): + # test that the results are the same + generator = check_random_state(0) + iris = load_iris() + # Add some irrelevant features. Random seed is set to make sure that + # irrelevant features are always irrelevant. + X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))] + y = iris.target + # there are 10 features in the data. We select 40%. + clf = SVC(kernel="linear") + rfe_num = RFE(estimator=clf, n_features_to_select=4, step=0.1) + rfe_num.fit(X, y) + + rfe_perc = RFE(estimator=clf, n_features_to_select=0.4, step=0.1) + rfe_perc.fit(X, y) + + assert_array_equal(rfe_perc.ranking_, rfe_num.ranking_) + assert_array_equal(rfe_perc.support_, rfe_num.support_) + + +def test_rfe_mockclassifier(): + generator = check_random_state(0) + iris = load_iris() + # Add some irrelevant features. Random seed is set to make sure that + # irrelevant features are always irrelevant. 
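test_rfe_percent_n_features above relies on float parameters being read as fractions: n_features_to_select=0.4 keeps 40% of the columns, and a float step removes roughly that fraction of the features (at least one) per elimination round. A short sketch under those assumptions, with placeholder data:

    from sklearn.datasets import make_classification
    from sklearn.feature_selection import RFE
    from sklearn.svm import SVC

    X_demo, y_demo = make_classification(n_features=10, random_state=0)
    rfe_frac = RFE(SVC(kernel="linear"), n_features_to_select=0.4, step=0.1).fit(X_demo, y_demo)
    rfe_frac.n_features_   # 4, i.e. 40% of the 10 input features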
+ X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))] + y = iris.target + + # dense model + clf = MockClassifier() + rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1) + rfe.fit(X, y) + X_r = rfe.transform(X) + clf.fit(X_r, y) + assert len(rfe.ranking_) == X.shape[1] + assert X_r.shape == iris.data.shape + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_rfecv(csr_container): + generator = check_random_state(0) + iris = load_iris() + # Add some irrelevant features. Random seed is set to make sure that + # irrelevant features are always irrelevant. + X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))] + y = list(iris.target) # regression test: list should be supported + + # Test using the score function + rfecv = RFECV(estimator=SVC(kernel="linear"), step=1) + rfecv.fit(X, y) + # non-regression test for missing worst feature: + + for key in rfecv.cv_results_.keys(): + assert len(rfecv.cv_results_[key]) == X.shape[1] + + assert len(rfecv.ranking_) == X.shape[1] + X_r = rfecv.transform(X) + + # All the noisy variable were filtered out + assert_array_equal(X_r, iris.data) + + # same in sparse + rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=1) + X_sparse = csr_container(X) + rfecv_sparse.fit(X_sparse, y) + X_r_sparse = rfecv_sparse.transform(X_sparse) + assert_array_equal(X_r_sparse.toarray(), iris.data) + + # Test using a customized loss function + scoring = make_scorer(zero_one_loss, greater_is_better=False) + rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, scoring=scoring) + ignore_warnings(rfecv.fit)(X, y) + X_r = rfecv.transform(X) + assert_array_equal(X_r, iris.data) + + # Test using a scorer + scorer = get_scorer("accuracy") + rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, scoring=scorer) + rfecv.fit(X, y) + X_r = rfecv.transform(X) + assert_array_equal(X_r, iris.data) + + # Test fix on cv_results_ + def test_scorer(estimator, X, y): + return 1.0 + + rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, scoring=test_scorer) + rfecv.fit(X, y) + + # In the event of cross validation score ties, the expected behavior of + # RFECV is to return the FEWEST features that maximize the CV score. + # Because test_scorer always returns 1.0 in this example, RFECV should + # reduce the dimensionality to a single feature (i.e. n_features_ = 1) + assert rfecv.n_features_ == 1 + + # Same as the first two tests, but with step=2 + rfecv = RFECV(estimator=SVC(kernel="linear"), step=2) + rfecv.fit(X, y) + + for key in rfecv.cv_results_.keys(): + assert len(rfecv.cv_results_[key]) == 6 + + assert len(rfecv.ranking_) == X.shape[1] + X_r = rfecv.transform(X) + assert_array_equal(X_r, iris.data) + + rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=2) + X_sparse = csr_container(X) + rfecv_sparse.fit(X_sparse, y) + X_r_sparse = rfecv_sparse.transform(X_sparse) + assert_array_equal(X_r_sparse.toarray(), iris.data) + + # Verifying that steps < 1 don't blow up. 
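The tie-breaking behaviour checked above (RFECV keeps the fewest features that maximise the cross-validation score) is visible directly in cv_results_; a usage sketch with assumed synthetic data:

    from sklearn.datasets import make_classification
    from sklearn.feature_selection import RFECV
    from sklearn.svm import SVC

    X_demo, y_demo = make_classification(n_features=10, n_informative=3, random_state=0)
    rfecv_demo = RFECV(
        SVC(kernel="linear"), step=1, cv=5, min_features_to_select=1
    ).fit(X_demo, y_demo)
    rfecv_demo.n_features_                      # size of the best-scoring feature subset
    rfecv_demo.cv_results_["n_features"]        # candidate subset sizes, smallest first
    rfecv_demo.cv_results_["mean_test_score"]   # mean CV score for each candidate size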
+ rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=0.2) + X_sparse = csr_container(X) + rfecv_sparse.fit(X_sparse, y) + X_r_sparse = rfecv_sparse.transform(X_sparse) + assert_array_equal(X_r_sparse.toarray(), iris.data) + + +def test_rfecv_mockclassifier(): + generator = check_random_state(0) + iris = load_iris() + X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))] + y = list(iris.target) # regression test: list should be supported + + # Test using the score function + rfecv = RFECV(estimator=MockClassifier(), step=1) + rfecv.fit(X, y) + # non-regression test for missing worst feature: + + for key in rfecv.cv_results_.keys(): + assert len(rfecv.cv_results_[key]) == X.shape[1] + + assert len(rfecv.ranking_) == X.shape[1] + + +def test_rfecv_verbose_output(): + # Check verbose=1 is producing an output. + import sys + from io import StringIO + + sys.stdout = StringIO() + + generator = check_random_state(0) + iris = load_iris() + X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))] + y = list(iris.target) + + rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, verbose=1) + rfecv.fit(X, y) + + verbose_output = sys.stdout + verbose_output.seek(0) + assert len(verbose_output.readline()) > 0 + + +def test_rfecv_cv_results_size(global_random_seed): + generator = check_random_state(global_random_seed) + iris = load_iris() + X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))] + y = list(iris.target) # regression test: list should be supported + + # Non-regression test for varying combinations of step and + # min_features_to_select. + for step, min_features_to_select in [[2, 1], [2, 2], [3, 3]]: + rfecv = RFECV( + estimator=MockClassifier(), + step=step, + min_features_to_select=min_features_to_select, + ) + rfecv.fit(X, y) + + score_len = np.ceil((X.shape[1] - min_features_to_select) / step) + 1 + + for key in rfecv.cv_results_.keys(): + assert len(rfecv.cv_results_[key]) == score_len + + assert len(rfecv.ranking_) == X.shape[1] + assert rfecv.n_features_ >= min_features_to_select + + +def test_rfe_estimator_tags(): + rfe = RFE(SVC(kernel="linear")) + assert is_classifier(rfe) + # make sure that cross-validation is stratified + iris = load_iris() + score = cross_val_score(rfe, iris.data, iris.target) + assert score.min() > 0.7 + + +def test_rfe_min_step(global_random_seed): + n_features = 10 + X, y = make_friedman1( + n_samples=50, n_features=n_features, random_state=global_random_seed + ) + n_samples, n_features = X.shape + estimator = SVR(kernel="linear") + + # Test when floor(step * n_features) <= 0 + selector = RFE(estimator, step=0.01) + sel = selector.fit(X, y) + assert sel.support_.sum() == n_features // 2 + + # Test when step is between (0,1) and floor(step * n_features) > 0 + selector = RFE(estimator, step=0.20) + sel = selector.fit(X, y) + assert sel.support_.sum() == n_features // 2 + + # Test when step is an integer + selector = RFE(estimator, step=5) + sel = selector.fit(X, y) + assert sel.support_.sum() == n_features // 2 + + +def test_number_of_subsets_of_features(global_random_seed): + # In RFE, 'number_of_subsets_of_features' + # = the number of iterations in '_fit' + # = max(ranking_) + # = 1 + (n_features + step - n_features_to_select - 1) // step + # After optimization #4534, this number + # = 1 + np.ceil((n_features - n_features_to_select) / float(step)) + # This test case is to test their equivalence, refer to #4534 and #3824 + + def formula1(n_features, n_features_to_select, step): + return 1 + ((n_features + step - 
n_features_to_select - 1) // step) + + def formula2(n_features, n_features_to_select, step): + return 1 + np.ceil((n_features - n_features_to_select) / float(step)) + + # RFE + # Case 1, n_features - n_features_to_select is divisible by step + # Case 2, n_features - n_features_to_select is not divisible by step + n_features_list = [11, 11] + n_features_to_select_list = [3, 3] + step_list = [2, 3] + for n_features, n_features_to_select, step in zip( + n_features_list, n_features_to_select_list, step_list + ): + generator = check_random_state(global_random_seed) + X = generator.normal(size=(100, n_features)) + y = generator.rand(100).round() + rfe = RFE( + estimator=SVC(kernel="linear"), + n_features_to_select=n_features_to_select, + step=step, + ) + rfe.fit(X, y) + # this number also equals to the maximum of ranking_ + assert np.max(rfe.ranking_) == formula1(n_features, n_features_to_select, step) + assert np.max(rfe.ranking_) == formula2(n_features, n_features_to_select, step) + + # In RFECV, 'fit' calls 'RFE._fit' + # 'number_of_subsets_of_features' of RFE + # = the size of each score in 'cv_results_' of RFECV + # = the number of iterations of the for loop before optimization #4534 + + # RFECV, n_features_to_select = 1 + # Case 1, n_features - 1 is divisible by step + # Case 2, n_features - 1 is not divisible by step + + n_features_to_select = 1 + n_features_list = [11, 10] + step_list = [2, 2] + for n_features, step in zip(n_features_list, step_list): + generator = check_random_state(global_random_seed) + X = generator.normal(size=(100, n_features)) + y = generator.rand(100).round() + rfecv = RFECV(estimator=SVC(kernel="linear"), step=step) + rfecv.fit(X, y) + + for key in rfecv.cv_results_.keys(): + assert len(rfecv.cv_results_[key]) == formula1( + n_features, n_features_to_select, step + ) + assert len(rfecv.cv_results_[key]) == formula2( + n_features, n_features_to_select, step + ) + + +def test_rfe_cv_n_jobs(global_random_seed): + generator = check_random_state(global_random_seed) + iris = load_iris() + X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))] + y = iris.target + + rfecv = RFECV(estimator=SVC(kernel="linear")) + rfecv.fit(X, y) + rfecv_ranking = rfecv.ranking_ + + rfecv_cv_results_ = rfecv.cv_results_ + + rfecv.set_params(n_jobs=2) + rfecv.fit(X, y) + assert_array_almost_equal(rfecv.ranking_, rfecv_ranking) + + assert rfecv_cv_results_.keys() == rfecv.cv_results_.keys() + for key in rfecv_cv_results_.keys(): + assert rfecv_cv_results_[key] == pytest.approx(rfecv.cv_results_[key]) + + +def test_rfe_cv_groups(): + generator = check_random_state(0) + iris = load_iris() + number_groups = 4 + groups = np.floor(np.linspace(0, number_groups, len(iris.target))) + X = iris.data + y = (iris.target > 0).astype(int) + + est_groups = RFECV( + estimator=RandomForestClassifier(random_state=generator), + step=1, + scoring="accuracy", + cv=GroupKFold(n_splits=2), + ) + est_groups.fit(X, y, groups=groups) + assert est_groups.n_features_ > 0 + + +@pytest.mark.parametrize( + "importance_getter", [attrgetter("regressor_.coef_"), "regressor_.coef_"] +) +@pytest.mark.parametrize("selector, expected_n_features", [(RFE, 5), (RFECV, 4)]) +def test_rfe_wrapped_estimator(importance_getter, selector, expected_n_features): + # Non-regression test for + # https://github.com/scikit-learn/scikit-learn/issues/15312 + X, y = make_friedman1(n_samples=50, n_features=10, random_state=0) + estimator = LinearSVR(random_state=0) + + log_estimator = TransformedTargetRegressor( + regressor=estimator, 
func=np.log, inverse_func=np.exp + ) + + selector = selector(log_estimator, importance_getter=importance_getter) + sel = selector.fit(X, y) + assert sel.support_.sum() == expected_n_features + + +@pytest.mark.parametrize( + "importance_getter, err_type", + [ + ("auto", ValueError), + ("random", AttributeError), + (lambda x: x.importance, AttributeError), + ], +) +@pytest.mark.parametrize("Selector", [RFE, RFECV]) +def test_rfe_importance_getter_validation(importance_getter, err_type, Selector): + X, y = make_friedman1(n_samples=50, n_features=10, random_state=42) + estimator = LinearSVR() + log_estimator = TransformedTargetRegressor( + regressor=estimator, func=np.log, inverse_func=np.exp + ) + + with pytest.raises(err_type): + model = Selector(log_estimator, importance_getter=importance_getter) + model.fit(X, y) + + +@pytest.mark.parametrize("cv", [None, 5]) +def test_rfe_allow_nan_inf_in_x(cv): + iris = load_iris() + X = iris.data + y = iris.target + + # add nan and inf value to X + X[0][0] = np.nan + X[0][1] = np.inf + + clf = MockClassifier() + if cv is not None: + rfe = RFECV(estimator=clf, cv=cv) + else: + rfe = RFE(estimator=clf) + rfe.fit(X, y) + rfe.transform(X) + + +def test_w_pipeline_2d_coef_(): + pipeline = make_pipeline(StandardScaler(), LogisticRegression()) + + data, y = load_iris(return_X_y=True) + sfm = RFE( + pipeline, + n_features_to_select=2, + importance_getter="named_steps.logisticregression.coef_", + ) + + sfm.fit(data, y) + assert sfm.transform(data).shape[1] == 2 + + +def test_rfecv_std_and_mean(global_random_seed): + generator = check_random_state(global_random_seed) + iris = load_iris() + X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))] + y = iris.target + + rfecv = RFECV(estimator=SVC(kernel="linear")) + rfecv.fit(X, y) + split_keys = [ + key + for key in rfecv.cv_results_.keys() + if re.search(r"split\d+_test_score", key) + ] + cv_scores = np.asarray([rfecv.cv_results_[key] for key in split_keys]) + expected_mean = np.mean(cv_scores, axis=0) + expected_std = np.std(cv_scores, axis=0) + + assert_allclose(rfecv.cv_results_["mean_test_score"], expected_mean) + assert_allclose(rfecv.cv_results_["std_test_score"], expected_std) + + +@pytest.mark.parametrize( + ["min_features_to_select", "n_features", "step", "cv_results_n_features"], + [ + [1, 4, 1, np.array([1, 2, 3, 4])], + [1, 5, 1, np.array([1, 2, 3, 4, 5])], + [1, 4, 2, np.array([1, 2, 4])], + [1, 5, 2, np.array([1, 3, 5])], + [1, 4, 3, np.array([1, 4])], + [1, 5, 3, np.array([1, 2, 5])], + [1, 4, 4, np.array([1, 4])], + [1, 5, 4, np.array([1, 5])], + [4, 4, 2, np.array([4])], + [4, 5, 1, np.array([4, 5])], + [4, 5, 2, np.array([4, 5])], + ], +) +def test_rfecv_cv_results_n_features( + min_features_to_select, + n_features, + step, + cv_results_n_features, +): + X, y = make_classification( + n_samples=20, n_features=n_features, n_informative=n_features, n_redundant=0 + ) + rfecv = RFECV( + estimator=SVC(kernel="linear"), + step=step, + min_features_to_select=min_features_to_select, + ) + rfecv.fit(X, y) + assert_array_equal(rfecv.cv_results_["n_features"], cv_results_n_features) + assert all( + len(value) == len(rfecv.cv_results_["n_features"]) + for value in rfecv.cv_results_.values() + ) + + +@pytest.mark.parametrize("ClsRFE", [RFE, RFECV]) +def test_multioutput(ClsRFE): + X = np.random.normal(size=(10, 3)) + y = np.random.randint(2, size=(10, 2)) + clf = RandomForestClassifier(n_estimators=5) + rfe_test = ClsRFE(clf) + rfe_test.fit(X, y) + + +@pytest.mark.parametrize("ClsRFE", [RFE, RFECV]) 
+def test_pipeline_with_nans(ClsRFE): + """Check that RFE works with pipeline that accept nans. + + Non-regression test for gh-21743. + """ + X, y = load_iris(return_X_y=True) + X[0, 0] = np.nan + + pipe = make_pipeline( + SimpleImputer(), + StandardScaler(), + LogisticRegression(), + ) + + fs = ClsRFE( + estimator=pipe, + importance_getter="named_steps.logisticregression.coef_", + ) + fs.fit(X, y) + + +@pytest.mark.parametrize("ClsRFE", [RFE, RFECV]) +@pytest.mark.parametrize("PLSEstimator", [CCA, PLSCanonical, PLSRegression]) +def test_rfe_pls(ClsRFE, PLSEstimator): + """Check the behaviour of RFE with PLS estimators. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/12410 + """ + X, y = make_friedman1(n_samples=50, n_features=10, random_state=0) + estimator = PLSEstimator(n_components=1) + selector = ClsRFE(estimator, step=1).fit(X, y) + assert selector.score(X, y) > 0.5 + + +def test_rfe_estimator_attribute_error(): + """Check that we raise the proper AttributeError when the estimator + does not implement the `decision_function` method, which is decorated with + `available_if`. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/28108 + """ + iris = load_iris() + + # `LinearRegression` does not implement 'decision_function' and should raise an + # AttributeError + rfe = RFE(estimator=LinearRegression()) + + outer_msg = "This 'RFE' has no attribute 'decision_function'" + inner_msg = "'LinearRegression' object has no attribute 'decision_function'" + with pytest.raises(AttributeError, match=outer_msg) as exec_info: + rfe.fit(iris.data, iris.target).decision_function(iris.data) + assert isinstance(exec_info.value.__cause__, AttributeError) + assert inner_msg in str(exec_info.value.__cause__) + + +@pytest.mark.parametrize( + "ClsRFE, param", [(RFE, "n_features_to_select"), (RFECV, "min_features_to_select")] +) +def test_rfe_n_features_to_select_warning(ClsRFE, param): + """Check if the correct warning is raised when trying to initialize a RFE + object with a n_features_to_select attribute larger than the number of + features present in the X variable that is passed to the fit method + """ + X, y = make_classification(n_features=20, random_state=0) + + with pytest.warns(UserWarning, match=f"{param}=21 > n_features=20"): + # Create RFE/RFECV with n_features_to_select/min_features_to_select + # larger than the number of features present in the X variable + clsrfe = ClsRFE(estimator=LogisticRegression(), **{param: 21}) + clsrfe.fit(X, y) + + +def test_rfe_with_sample_weight(): + """Test that `RFE` works correctly with sample weights.""" + X, y = make_classification(random_state=0) + n_samples = X.shape[0] + + # Assign the first half of the samples with twice the weight + sample_weight = np.ones_like(y) + sample_weight[: n_samples // 2] = 2 + + # Duplicate the first half of the data samples to replicate the effect + # of sample weights for comparison + X2 = np.concatenate([X, X[: n_samples // 2]], axis=0) + y2 = np.concatenate([y, y[: n_samples // 2]]) + + estimator = SVC(kernel="linear") + + rfe_sw = RFE(estimator=estimator, step=0.1) + rfe_sw.fit(X, y, sample_weight=sample_weight) + + rfe = RFE(estimator=estimator, step=0.1) + rfe.fit(X2, y2) + + assert_array_equal(rfe_sw.ranking_, rfe.ranking_) + + # Also verify that when sample weights are not doubled the results + # are different from the duplicated data + rfe_sw_2 = RFE(estimator=estimator, step=0.1) + sample_weight_2 = np.ones_like(y) + rfe_sw_2.fit(X, y, 
sample_weight=sample_weight_2) + + assert not np.array_equal(rfe_sw_2.ranking_, rfe.ranking_) + + +def test_rfe_with_joblib_threading_backend(global_random_seed): + X, y = make_classification(random_state=global_random_seed) + + clf = LogisticRegression() + rfe = RFECV( + estimator=clf, + n_jobs=2, + ) + + rfe.fit(X, y) + ranking_ref = rfe.ranking_ + + with parallel_backend("threading"): + rfe.fit(X, y) + + assert_array_equal(ranking_ref, rfe.ranking_) + + +def test_results_per_cv_in_rfecv(global_random_seed): + """ + Test that the results of RFECV are consistent across the different folds + in terms of length of the arrays. + """ + X, y = make_classification(random_state=global_random_seed) + + clf = LogisticRegression() + rfecv = RFECV( + estimator=clf, + n_jobs=2, + cv=5, + ) + + rfecv.fit(X, y) + + assert len(rfecv.cv_results_["split1_test_score"]) == len( + rfecv.cv_results_["split2_test_score"] + ) + assert len(rfecv.cv_results_["split1_support"]) == len( + rfecv.cv_results_["split2_support"] + ) + assert len(rfecv.cv_results_["split1_ranking"]) == len( + rfecv.cv_results_["split2_ranking"] + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_sequential.py b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_sequential.py new file mode 100644 index 0000000000000000000000000000000000000000..b98d5b400b84eaa68440c0dbc3891b99372444a2 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_sequential.py @@ -0,0 +1,332 @@ +import numpy as np +import pytest +from numpy.testing import assert_array_equal + +from sklearn.cluster import KMeans +from sklearn.datasets import make_blobs, make_classification, make_regression +from sklearn.ensemble import HistGradientBoostingRegressor +from sklearn.feature_selection import SequentialFeatureSelector +from sklearn.linear_model import LinearRegression +from sklearn.model_selection import LeaveOneGroupOut, cross_val_score +from sklearn.neighbors import KNeighborsClassifier +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.utils.fixes import CSR_CONTAINERS + + +def test_bad_n_features_to_select(): + n_features = 5 + X, y = make_regression(n_features=n_features) + sfs = SequentialFeatureSelector(LinearRegression(), n_features_to_select=n_features) + with pytest.raises(ValueError, match="n_features_to_select must be < n_features"): + sfs.fit(X, y) + + +@pytest.mark.parametrize("direction", ("forward", "backward")) +@pytest.mark.parametrize("n_features_to_select", (1, 5, 9, "auto")) +def test_n_features_to_select(direction, n_features_to_select): + # Make sure n_features_to_select is respected + + n_features = 10 + X, y = make_regression(n_features=n_features, random_state=0) + sfs = SequentialFeatureSelector( + LinearRegression(), + n_features_to_select=n_features_to_select, + direction=direction, + cv=2, + ) + sfs.fit(X, y) + + if n_features_to_select == "auto": + n_features_to_select = n_features // 2 + + assert sfs.get_support(indices=True).shape[0] == n_features_to_select + assert sfs.n_features_to_select_ == n_features_to_select + assert sfs.transform(X).shape[1] == n_features_to_select + + +@pytest.mark.parametrize("direction", ("forward", "backward")) +def test_n_features_to_select_auto(direction): + """Check the behaviour of `n_features_to_select="auto"` with different + values for the parameter `tol`. 
+ """ + + n_features = 10 + tol = 1e-3 + X, y = make_regression(n_features=n_features, random_state=0) + sfs = SequentialFeatureSelector( + LinearRegression(), + n_features_to_select="auto", + tol=tol, + direction=direction, + cv=2, + ) + sfs.fit(X, y) + + max_features_to_select = n_features - 1 + + assert sfs.get_support(indices=True).shape[0] <= max_features_to_select + assert sfs.n_features_to_select_ <= max_features_to_select + assert sfs.transform(X).shape[1] <= max_features_to_select + assert sfs.get_support(indices=True).shape[0] == sfs.n_features_to_select_ + + +@pytest.mark.parametrize("direction", ("forward", "backward")) +def test_n_features_to_select_stopping_criterion(direction): + """Check the behaviour stopping criterion for feature selection + depending on the values of `n_features_to_select` and `tol`. + + When `direction` is `'forward'`, select a new features at random + among those not currently selected in selector.support_, + build a new version of the data that includes all the features + in selector.support_ + this newly selected feature. + And check that the cross-validation score of the model trained on + this new dataset variant is lower than the model with + the selected forward selected features or at least does not improve + by more than the tol margin. + + When `direction` is `'backward'`, instead of adding a new feature + to selector.support_, try to remove one of those selected features at random + And check that the cross-validation score is either decreasing or + not improving by more than the tol margin. + """ + + X, y = make_regression(n_features=50, n_informative=10, random_state=0) + + tol = 1e-3 + + sfs = SequentialFeatureSelector( + LinearRegression(), + n_features_to_select="auto", + tol=tol, + direction=direction, + cv=2, + ) + sfs.fit(X, y) + selected_X = sfs.transform(X) + + rng = np.random.RandomState(0) + + added_candidates = list(set(range(X.shape[1])) - set(sfs.get_support(indices=True))) + added_X = np.hstack( + [ + selected_X, + (X[:, rng.choice(added_candidates)])[:, np.newaxis], + ] + ) + + removed_candidate = rng.choice(list(range(sfs.n_features_to_select_))) + removed_X = np.delete(selected_X, removed_candidate, axis=1) + + plain_cv_score = cross_val_score(LinearRegression(), X, y, cv=2).mean() + sfs_cv_score = cross_val_score(LinearRegression(), selected_X, y, cv=2).mean() + added_cv_score = cross_val_score(LinearRegression(), added_X, y, cv=2).mean() + removed_cv_score = cross_val_score(LinearRegression(), removed_X, y, cv=2).mean() + + assert sfs_cv_score >= plain_cv_score + + if direction == "forward": + assert (sfs_cv_score - added_cv_score) <= tol + assert (sfs_cv_score - removed_cv_score) >= tol + else: + assert (added_cv_score - sfs_cv_score) <= tol + assert (removed_cv_score - sfs_cv_score) <= tol + + +@pytest.mark.parametrize("direction", ("forward", "backward")) +@pytest.mark.parametrize( + "n_features_to_select, expected", + ( + (0.1, 1), + (1.0, 10), + (0.5, 5), + ), +) +def test_n_features_to_select_float(direction, n_features_to_select, expected): + # Test passing a float as n_features_to_select + X, y = make_regression(n_features=10) + sfs = SequentialFeatureSelector( + LinearRegression(), + n_features_to_select=n_features_to_select, + direction=direction, + cv=2, + ) + sfs.fit(X, y) + assert sfs.n_features_to_select_ == expected + + +@pytest.mark.parametrize("seed", range(10)) +@pytest.mark.parametrize("direction", ("forward", "backward")) +@pytest.mark.parametrize( + "n_features_to_select, expected_selected_features", 
+ [ + (2, [0, 2]), # f1 is dropped since it has no predictive power + (1, [2]), # f2 is more predictive than f0 so it's kept + ], +) +def test_sanity(seed, direction, n_features_to_select, expected_selected_features): + # Basic sanity check: 3 features, only f0 and f2 are correlated with the + # target, f2 having a stronger correlation than f0. We expect f1 to be + # dropped, and f2 to always be selected. + + rng = np.random.RandomState(seed) + n_samples = 100 + X = rng.randn(n_samples, 3) + y = 3 * X[:, 0] - 10 * X[:, 2] + + sfs = SequentialFeatureSelector( + LinearRegression(), + n_features_to_select=n_features_to_select, + direction=direction, + cv=2, + ) + sfs.fit(X, y) + assert_array_equal(sfs.get_support(indices=True), expected_selected_features) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sparse_support(csr_container): + # Make sure sparse data is supported + + X, y = make_regression(n_features=10) + X = csr_container(X) + sfs = SequentialFeatureSelector( + LinearRegression(), n_features_to_select="auto", cv=2 + ) + sfs.fit(X, y) + sfs.transform(X) + + +def test_nan_support(): + # Make sure nans are OK if the underlying estimator supports nans + + rng = np.random.RandomState(0) + n_samples, n_features = 40, 4 + X, y = make_regression(n_samples, n_features, random_state=0) + nan_mask = rng.randint(0, 2, size=(n_samples, n_features), dtype=bool) + X[nan_mask] = np.nan + sfs = SequentialFeatureSelector( + HistGradientBoostingRegressor(), n_features_to_select="auto", cv=2 + ) + sfs.fit(X, y) + sfs.transform(X) + + with pytest.raises(ValueError, match="Input X contains NaN"): + # LinearRegression does not support nans + SequentialFeatureSelector( + LinearRegression(), n_features_to_select="auto", cv=2 + ).fit(X, y) + + +def test_pipeline_support(): + # Make sure that pipelines can be passed into SFS and that SFS can be + # passed into a pipeline + + n_samples, n_features = 50, 3 + X, y = make_regression(n_samples, n_features, random_state=0) + + # pipeline in SFS + pipe = make_pipeline(StandardScaler(), LinearRegression()) + sfs = SequentialFeatureSelector(pipe, n_features_to_select="auto", cv=2) + sfs.fit(X, y) + sfs.transform(X) + + # SFS in pipeline + sfs = SequentialFeatureSelector( + LinearRegression(), n_features_to_select="auto", cv=2 + ) + pipe = make_pipeline(StandardScaler(), sfs) + pipe.fit(X, y) + pipe.transform(X) + + +@pytest.mark.parametrize("n_features_to_select", (2, 3)) +def test_unsupervised_model_fit(n_features_to_select): + # Make sure that models without classification labels are not being + # validated + + X, y = make_blobs(n_features=4) + sfs = SequentialFeatureSelector( + KMeans(n_init=1), + n_features_to_select=n_features_to_select, + ) + sfs.fit(X) + assert sfs.transform(X).shape[1] == n_features_to_select + + +@pytest.mark.parametrize("y", ("no_validation", 1j, 99.9, np.nan, 3)) +def test_no_y_validation_model_fit(y): + # Make sure that other non-conventional y labels are not accepted + + X, clusters = make_blobs(n_features=6) + sfs = SequentialFeatureSelector( + KMeans(), + n_features_to_select=3, + ) + + with pytest.raises((TypeError, ValueError)): + sfs.fit(X, y) + + +def test_forward_neg_tol_error(): + """Check that we raise an error when tol<0 and direction='forward'""" + X, y = make_regression(n_features=10, random_state=0) + sfs = SequentialFeatureSelector( + LinearRegression(), + n_features_to_select="auto", + direction="forward", + tol=-1e-3, + ) + + with pytest.raises(ValueError, match="tol must be strictly positive"): + 
sfs.fit(X, y) + + +def test_backward_neg_tol(): + """Check that SequentialFeatureSelector works with negative tol + + non-regression test for #25525 + """ + X, y = make_regression(n_features=10, random_state=0) + lr = LinearRegression() + initial_score = lr.fit(X, y).score(X, y) + + sfs = SequentialFeatureSelector( + lr, + n_features_to_select="auto", + direction="backward", + tol=-1e-3, + ) + Xr = sfs.fit_transform(X, y) + new_score = lr.fit(Xr, y).score(Xr, y) + + assert 0 < sfs.get_support().sum() < X.shape[1] + assert new_score < initial_score + + +def test_cv_generator_support(): + """Check that no exception is raised when cv is a generator + + non-regression test for #25957 + """ + X, y = make_classification(random_state=0) + + groups = np.zeros_like(y, dtype=int) + groups[y.size // 2 :] = 1 + + cv = LeaveOneGroupOut() + splits = cv.split(X, y, groups=groups) + + knc = KNeighborsClassifier(n_neighbors=5) + + sfs = SequentialFeatureSelector(knc, n_features_to_select=5, cv=splits) + sfs.fit(X, y) + + +def test_fit_rejects_params_with_no_routing_enabled(): + X, y = make_classification(random_state=42) + est = LinearRegression() + sfs = SequentialFeatureSelector(estimator=est) + + with pytest.raises(ValueError, match="is only supported if"): + sfs.fit(X, y, sample_weight=np.ones_like(y)) diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_variance_threshold.py b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_variance_threshold.py new file mode 100644 index 0000000000000000000000000000000000000000..45e66cb338a4b7a5a410db669a13f6f9213451dc --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_variance_threshold.py @@ -0,0 +1,72 @@ +import numpy as np +import pytest + +from sklearn.feature_selection import VarianceThreshold +from sklearn.utils._testing import assert_array_equal +from sklearn.utils.fixes import BSR_CONTAINERS, CSC_CONTAINERS, CSR_CONTAINERS + +data = [[0, 1, 2, 3, 4], [0, 2, 2, 3, 5], [1, 1, 2, 4, 0]] + +data2 = [[-0.13725701]] * 10 + + +@pytest.mark.parametrize( + "sparse_container", [None] + BSR_CONTAINERS + CSC_CONTAINERS + CSR_CONTAINERS +) +def test_zero_variance(sparse_container): + # Test VarianceThreshold with default setting, zero variance. + X = data if sparse_container is None else sparse_container(data) + sel = VarianceThreshold().fit(X) + assert_array_equal([0, 1, 3, 4], sel.get_support(indices=True)) + + +def test_zero_variance_value_error(): + # Test VarianceThreshold with default setting, zero variance, error cases. + with pytest.raises(ValueError): + VarianceThreshold().fit([[0, 1, 2, 3]]) + with pytest.raises(ValueError): + VarianceThreshold().fit([[0, 1], [0, 1]]) + + +@pytest.mark.parametrize("sparse_container", [None] + CSR_CONTAINERS) +def test_variance_threshold(sparse_container): + # Test VarianceThreshold with custom variance. + X = data if sparse_container is None else sparse_container(data) + X = VarianceThreshold(threshold=0.4).fit_transform(X) + assert (len(data), 1) == X.shape + + +@pytest.mark.skipif( + np.var(data2) == 0, + reason=( + "This test is not valid for this platform, " + "as it relies on numerical instabilities."
+ ), +) +@pytest.mark.parametrize( + "sparse_container", [None] + BSR_CONTAINERS + CSC_CONTAINERS + CSR_CONTAINERS +) +def test_zero_variance_floating_point_error(sparse_container): + # Test that VarianceThreshold(0.0).fit eliminates features that have + # the same value in every sample, even when floating point errors + # cause np.var not to be 0 for the feature. + # See #13691 + X = data2 if sparse_container is None else sparse_container(data2) + msg = "No feature in X meets the variance threshold 0.00000" + with pytest.raises(ValueError, match=msg): + VarianceThreshold().fit(X) + + +@pytest.mark.parametrize( + "sparse_container", [None] + BSR_CONTAINERS + CSC_CONTAINERS + CSR_CONTAINERS +) +def test_variance_nan(sparse_container): + arr = np.array(data, dtype=np.float64) + # add single NaN and feature should still be included + arr[0, 0] = np.nan + # make all values in feature NaN and feature should be rejected + arr[:, 1] = np.nan + + X = arr if sparse_container is None else sparse_container(arr) + sel = VarianceThreshold().fit(X) + assert_array_equal([0, 3, 4], sel.get_support(indices=True)) diff --git a/.venv/lib/python3.12/site-packages/sklearn/frozen/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/frozen/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8ca540b79229c87447f40eed6717fe59202885f0 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/frozen/__init__.py @@ -0,0 +1,6 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ._frozen import FrozenEstimator + +__all__ = ["FrozenEstimator"] diff --git a/.venv/lib/python3.12/site-packages/sklearn/frozen/_frozen.py b/.venv/lib/python3.12/site-packages/sklearn/frozen/_frozen.py new file mode 100644 index 0000000000000000000000000000000000000000..7585ea2597b5995a5e7ffcaf8f7f9b78fd676e6e --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/frozen/_frozen.py @@ -0,0 +1,166 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from copy import deepcopy + +from ..base import BaseEstimator +from ..exceptions import NotFittedError +from ..utils import get_tags +from ..utils.metaestimators import available_if +from ..utils.validation import check_is_fitted + + +def _estimator_has(attr): + """Check that final_estimator has `attr`. + + Used together with `available_if`. + """ + + def check(self): + # raise original `AttributeError` if `attr` does not exist + getattr(self.estimator, attr) + return True + + return check + + +class FrozenEstimator(BaseEstimator): + """Estimator that wraps a fitted estimator to prevent re-fitting. + + This meta-estimator takes an estimator and freezes it, in the sense that calling + `fit` on it has no effect. `fit_predict` and `fit_transform` are also disabled. + All other methods are delegated to the original estimator and original estimator's + attributes are accessible as well. + + This is particularly useful when you have a fitted or a pre-trained model as a + transformer in a pipeline, and you'd like `pipeline.fit` to have no effect on this + step. + + Parameters + ---------- + estimator : estimator + The estimator which is to be kept frozen. + + See Also + -------- + None: No similar entry in the scikit-learn documentation. 
+ + Examples + -------- + >>> from sklearn.datasets import make_classification + >>> from sklearn.frozen import FrozenEstimator + >>> from sklearn.linear_model import LogisticRegression + >>> X, y = make_classification(random_state=0) + >>> clf = LogisticRegression(random_state=0).fit(X, y) + >>> frozen_clf = FrozenEstimator(clf) + >>> frozen_clf.fit(X, y) # No-op + FrozenEstimator(estimator=LogisticRegression(random_state=0)) + >>> frozen_clf.predict(X) # Predictions from `clf.predict` + array(...) + """ + + def __init__(self, estimator): + self.estimator = estimator + + @available_if(_estimator_has("__getitem__")) + def __getitem__(self, *args, **kwargs): + """__getitem__ is defined in :class:`~sklearn.pipeline.Pipeline` and \ + :class:`~sklearn.compose.ColumnTransformer`. + """ + return self.estimator.__getitem__(*args, **kwargs) + + def __getattr__(self, name): + # `estimator`'s attributes are now accessible except `fit_predict` and + # `fit_transform` + if name in ["fit_predict", "fit_transform"]: + raise AttributeError(f"{name} is not available for frozen estimators.") + return getattr(self.estimator, name) + + def __sklearn_clone__(self): + return self + + def __sklearn_is_fitted__(self): + try: + check_is_fitted(self.estimator) + return True + except NotFittedError: + return False + + def fit(self, X, y, *args, **kwargs): + """No-op. + + As a frozen estimator, calling `fit` has no effect. + + Parameters + ---------- + X : object + Ignored. + + y : object + Ignored. + + *args : tuple + Additional positional arguments. Ignored, but present for API compatibility + with `self.estimator`. + + **kwargs : dict + Additional keyword arguments. Ignored, but present for API compatibility + with `self.estimator`. + + Returns + ------- + self : object + Returns the instance itself. + """ + check_is_fitted(self.estimator) + return self + + def set_params(self, **kwargs): + """Set the parameters of this estimator. + + The only valid key here is `estimator`. You cannot set the parameters of the + inner estimator. + + Parameters + ---------- + **kwargs : dict + Estimator parameters. + + Returns + ------- + self : FrozenEstimator + This estimator. + """ + estimator = kwargs.pop("estimator", None) + if estimator is not None: + self.estimator = estimator + if kwargs: + raise ValueError( + "You cannot set parameters of the inner estimator in a frozen " + "estimator since calling `fit` has no effect. You can use " + "`frozenestimator.estimator.set_params` to set parameters of the inner " + "estimator." + ) + + def get_params(self, deep=True): + """Get parameters for this estimator. + + Returns a `{"estimator": estimator}` dict. The parameters of the inner + estimator are not included. + + Parameters + ---------- + deep : bool, default=True + Ignored. + + Returns + ------- + params : dict + Parameter names mapped to their values. 
+ """ + return {"estimator": self.estimator} + + def __sklearn_tags__(self): + tags = deepcopy(get_tags(self.estimator)) + tags._skip_test = True + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/frozen/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/frozen/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/frozen/tests/test_frozen.py b/.venv/lib/python3.12/site-packages/sklearn/frozen/tests/test_frozen.py new file mode 100644 index 0000000000000000000000000000000000000000..b304d3ac0aa2c32d6b494351ef0c0d0209866b71 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/frozen/tests/test_frozen.py @@ -0,0 +1,223 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import re + +import numpy as np +import pytest +from numpy.testing import assert_array_equal + +from sklearn import config_context +from sklearn.base import ( + BaseEstimator, + clone, + is_classifier, + is_clusterer, + is_outlier_detector, + is_regressor, +) +from sklearn.cluster import KMeans +from sklearn.compose import make_column_transformer +from sklearn.datasets import make_classification, make_regression +from sklearn.exceptions import NotFittedError, UnsetMetadataPassedError +from sklearn.frozen import FrozenEstimator +from sklearn.linear_model import LinearRegression, LogisticRegression +from sklearn.neighbors import LocalOutlierFactor +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import RobustScaler, StandardScaler +from sklearn.utils._testing import set_random_state +from sklearn.utils.validation import check_is_fitted + + +@pytest.fixture +def regression_dataset(): + return make_regression() + + +@pytest.fixture +def classification_dataset(): + return make_classification() + + +@pytest.mark.parametrize( + "estimator, dataset", + [ + (LinearRegression(), "regression_dataset"), + (LogisticRegression(), "classification_dataset"), + (make_pipeline(StandardScaler(), LinearRegression()), "regression_dataset"), + ( + make_pipeline(StandardScaler(), LogisticRegression()), + "classification_dataset", + ), + (StandardScaler(), "regression_dataset"), + (KMeans(), "regression_dataset"), + (LocalOutlierFactor(), "regression_dataset"), + ( + make_column_transformer( + (StandardScaler(), [0]), + (RobustScaler(), [1]), + ), + "regression_dataset", + ), + ], +) +@pytest.mark.parametrize( + "method", + ["predict", "predict_proba", "predict_log_proba", "decision_function", "transform"], +) +def test_frozen_methods(estimator, dataset, request, method): + """Test that frozen.fit doesn't do anything, and that all other methods are + exposed by the frozen estimator and return the same values as the estimator. 
+ """ + X, y = request.getfixturevalue(dataset) + set_random_state(estimator) + estimator.fit(X, y) + frozen = FrozenEstimator(estimator) + # this should be no-op + frozen.fit([[1]], [1]) + + if hasattr(estimator, method): + assert_array_equal(getattr(estimator, method)(X), getattr(frozen, method)(X)) + + assert is_classifier(estimator) == is_classifier(frozen) + assert is_regressor(estimator) == is_regressor(frozen) + assert is_clusterer(estimator) == is_clusterer(frozen) + assert is_outlier_detector(estimator) == is_outlier_detector(frozen) + + +@config_context(enable_metadata_routing=True) +def test_frozen_metadata_routing(regression_dataset): + """Test that metadata routing works with frozen estimators.""" + + class ConsumesMetadata(BaseEstimator): + def __init__(self, on_fit=None, on_predict=None): + self.on_fit = on_fit + self.on_predict = on_predict + + def fit(self, X, y, metadata=None): + if self.on_fit: + assert metadata is not None + self.fitted_ = True + return self + + def predict(self, X, metadata=None): + if self.on_predict: + assert metadata is not None + return np.ones(len(X)) + + X, y = regression_dataset + pipeline = make_pipeline( + ConsumesMetadata(on_fit=True, on_predict=True) + .set_fit_request(metadata=True) + .set_predict_request(metadata=True) + ) + + pipeline.fit(X, y, metadata="test") + frozen = FrozenEstimator(pipeline) + pipeline.predict(X, metadata="test") + frozen.predict(X, metadata="test") + + frozen["consumesmetadata"].set_predict_request(metadata=False) + with pytest.raises( + TypeError, + match=re.escape( + "Pipeline.predict got unexpected argument(s) {'metadata'}, which are not " + "routed to any object." + ), + ): + frozen.predict(X, metadata="test") + + frozen["consumesmetadata"].set_predict_request(metadata=None) + with pytest.raises(UnsetMetadataPassedError): + frozen.predict(X, metadata="test") + + +def test_composite_fit(classification_dataset): + """Test that calling fit_transform and fit_predict doesn't call fit.""" + + class Estimator(BaseEstimator): + def fit(self, X, y): + try: + self._fit_counter += 1 + except AttributeError: + self._fit_counter = 1 + return self + + def fit_transform(self, X, y=None): + # only here to test that it doesn't get called + ... # pragma: no cover + + def fit_predict(self, X, y=None): + # only here to test that it doesn't get called + ... 
# pragma: no cover + + X, y = classification_dataset + est = Estimator().fit(X, y) + frozen = FrozenEstimator(est) + + with pytest.raises(AttributeError): + frozen.fit_predict(X, y) + with pytest.raises(AttributeError): + frozen.fit_transform(X, y) + + assert frozen._fit_counter == 1 + + +def test_clone_frozen(regression_dataset): + """Test that cloning a frozen estimator keeps the frozen state.""" + X, y = regression_dataset + estimator = LinearRegression().fit(X, y) + frozen = FrozenEstimator(estimator) + cloned = clone(frozen) + assert cloned.estimator is estimator + + +def test_check_is_fitted(regression_dataset): + """Test that check_is_fitted works on frozen estimators.""" + X, y = regression_dataset + + estimator = LinearRegression() + frozen = FrozenEstimator(estimator) + with pytest.raises(NotFittedError): + check_is_fitted(frozen) + + estimator = LinearRegression().fit(X, y) + frozen = FrozenEstimator(estimator) + check_is_fitted(frozen) + + +def test_frozen_tags(): + """Test that frozen estimators have the same tags as the original estimator + except for the skip_test tag.""" + + class Estimator(BaseEstimator): + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.categorical = True + return tags + + estimator = Estimator() + frozen = FrozenEstimator(estimator) + frozen_tags = frozen.__sklearn_tags__() + estimator_tags = estimator.__sklearn_tags__() + + assert frozen_tags._skip_test is True + assert estimator_tags._skip_test is False + + assert estimator_tags.input_tags.categorical is True + assert frozen_tags.input_tags.categorical is True + + +def test_frozen_params(): + """Test that FrozenEstimator only exposes the estimator parameter.""" + est = LogisticRegression() + frozen = FrozenEstimator(est) + + with pytest.raises(ValueError, match="You cannot set parameters of the inner"): + frozen.set_params(estimator__C=1) + + assert frozen.get_params() == {"estimator": est} + + other_est = LocalOutlierFactor() + frozen.set_params(estimator=other_est) + assert frozen.get_params() == {"estimator": other_est} diff --git a/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9fafaf67e4ed042a95058e294f2395ea0dffb55d --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/__init__.py @@ -0,0 +1,10 @@ +"""Gaussian process based regression and classification.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from . 
import kernels +from ._gpc import GaussianProcessClassifier +from ._gpr import GaussianProcessRegressor + +__all__ = ["GaussianProcessClassifier", "GaussianProcessRegressor", "kernels"] diff --git a/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/_gpc.py b/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/_gpc.py new file mode 100644 index 0000000000000000000000000000000000000000..0ecceb47de9058643daee84faaab1e9927919c26 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/_gpc.py @@ -0,0 +1,973 @@ +"""Gaussian processes classification.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from numbers import Integral +from operator import itemgetter + +import numpy as np +import scipy.optimize +from scipy.linalg import cho_solve, cholesky, solve +from scipy.special import erf, expit + +from ..base import BaseEstimator, ClassifierMixin, _fit_context, clone +from ..multiclass import OneVsOneClassifier, OneVsRestClassifier +from ..preprocessing import LabelEncoder +from ..utils import check_random_state +from ..utils._param_validation import Interval, StrOptions +from ..utils.optimize import _check_optimize_result +from ..utils.validation import check_is_fitted, validate_data +from .kernels import RBF, CompoundKernel, Kernel +from .kernels import ConstantKernel as C + +# Values required for approximating the logistic sigmoid by +# error functions. coefs are obtained via: +# x = np.array([0, 0.6, 2, 3.5, 4.5, np.inf]) +# b = logistic(x) +# A = (erf(np.dot(x, self.lambdas)) + 1) / 2 +# coefs = lstsq(A, b)[0] +LAMBDAS = np.array([0.41, 0.4, 0.37, 0.44, 0.39])[:, np.newaxis] +COEFS = np.array( + [-1854.8214151, 3516.89893646, 221.29346712, 128.12323805, -2010.49422654] +)[:, np.newaxis] + + +class _BinaryGaussianProcessClassifierLaplace(BaseEstimator): + """Binary Gaussian process classification based on Laplace approximation. + + The implementation is based on Algorithm 3.1, 3.2, and 5.1 from [RW2006]_. + + Internally, the Laplace approximation is used for approximating the + non-Gaussian posterior by a Gaussian. + + Currently, the implementation is restricted to using the logistic link + function. + + .. versionadded:: 0.18 + + Parameters + ---------- + kernel : kernel instance, default=None + The kernel specifying the covariance function of the GP. If None is + passed, the kernel "1.0 * RBF(1.0)" is used as default. Note that + the kernel's hyperparameters are optimized during fitting. + + optimizer : 'fmin_l_bfgs_b' or callable, default='fmin_l_bfgs_b' + Can either be one of the internally supported optimizers for optimizing + the kernel's parameters, specified by a string, or an externally + defined optimizer passed as a callable. If a callable is passed, it + must have the signature:: + + def optimizer(obj_func, initial_theta, bounds): + # * 'obj_func' is the objective function to be maximized, which + # takes the hyperparameters theta as parameter and an + # optional flag eval_gradient, which determines if the + # gradient is returned additionally to the function value + # * 'initial_theta': the initial value for theta, which can be + # used by local optimizers + # * 'bounds': the bounds on the values of theta + .... + # Returned are the best found hyperparameters theta and + # the corresponding value of the target function. + return theta_opt, func_min + + Per default, the 'L-BFGS-B' algorithm from scipy.optimize.minimize + is used. If None is passed, the kernel's parameters are kept fixed. 
+ Available internal optimizers are:: + + 'fmin_l_bfgs_b' + + n_restarts_optimizer : int, default=0 + The number of restarts of the optimizer for finding the kernel's + parameters which maximize the log-marginal likelihood. The first run + of the optimizer is performed from the kernel's initial parameters, + the remaining ones (if any) from thetas sampled log-uniform randomly + from the space of allowed theta-values. If greater than 0, all bounds + must be finite. Note that n_restarts_optimizer=0 implies that one + run is performed. + + max_iter_predict : int, default=100 + The maximum number of iterations in Newton's method for approximating + the posterior during predict. Smaller values will reduce computation + time at the cost of worse results. + + warm_start : bool, default=False + If warm-starts are enabled, the solution of the last Newton iteration + on the Laplace approximation of the posterior mode is used as + initialization for the next call of _posterior_mode(). This can speed + up convergence when _posterior_mode is called several times on similar + problems as in hyperparameter optimization. See :term:`the Glossary + `. + + copy_X_train : bool, default=True + If True, a persistent copy of the training data is stored in the + object. Otherwise, just a reference to the training data is stored, + which might cause predictions to change if the data is modified + externally. + + random_state : int, RandomState instance or None, default=None + Determines random number generation used to initialize the centers. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + Attributes + ---------- + X_train_ : array-like of shape (n_samples, n_features) or list of object + Feature vectors or other representations of training data (also + required for prediction). + + y_train_ : array-like of shape (n_samples,) + Target values in training data (also required for prediction) + + classes_ : array-like of shape (n_classes,) + Unique class labels. + + kernel_ : kernel instance + The kernel used for prediction. The structure of the kernel is the + same as the one passed as parameter but with optimized hyperparameters + + L_ : array-like of shape (n_samples, n_samples) + Lower-triangular Cholesky decomposition of the kernel in X_train_ + + pi_ : array-like of shape (n_samples,) + The probabilities of the positive class for the training points + X_train_ + + W_sr_ : array-like of shape (n_samples,) + Square root of W, the Hessian of log-likelihood of the latent function + values for the observed labels. Since W is diagonal, only the diagonal + of sqrt(W) is stored. + + log_marginal_likelihood_value_ : float + The log-marginal-likelihood of ``self.kernel_.theta`` + + References + ---------- + .. [RW2006] `Carl E. Rasmussen and Christopher K.I. Williams, + "Gaussian Processes for Machine Learning", + MIT Press 2006 `_ + """ + + def __init__( + self, + kernel=None, + *, + optimizer="fmin_l_bfgs_b", + n_restarts_optimizer=0, + max_iter_predict=100, + warm_start=False, + copy_X_train=True, + random_state=None, + ): + self.kernel = kernel + self.optimizer = optimizer + self.n_restarts_optimizer = n_restarts_optimizer + self.max_iter_predict = max_iter_predict + self.warm_start = warm_start + self.copy_X_train = copy_X_train + self.random_state = random_state + + def fit(self, X, y): + """Fit Gaussian process classification model.
+ + Parameters + ---------- + X : array-like of shape (n_samples, n_features) or list of object + Feature vectors or other representations of training data. + + y : array-like of shape (n_samples,) + Target values, must be binary. + + Returns + ------- + self : returns an instance of self. + """ + if self.kernel is None: # Use an RBF kernel as default + self.kernel_ = C(1.0, constant_value_bounds="fixed") * RBF( + 1.0, length_scale_bounds="fixed" + ) + else: + self.kernel_ = clone(self.kernel) + + self.rng = check_random_state(self.random_state) + + self.X_train_ = np.copy(X) if self.copy_X_train else X + + # Encode class labels and check that it is a binary classification + # problem + label_encoder = LabelEncoder() + self.y_train_ = label_encoder.fit_transform(y) + self.classes_ = label_encoder.classes_ + if self.classes_.size > 2: + raise ValueError( + "%s supports only binary classification. y contains classes %s" + % (self.__class__.__name__, self.classes_) + ) + elif self.classes_.size == 1: + raise ValueError( + "{0:s} requires 2 classes; got {1:d} class".format( + self.__class__.__name__, self.classes_.size + ) + ) + + if self.optimizer is not None and self.kernel_.n_dims > 0: + # Choose hyperparameters based on maximizing the log-marginal + # likelihood (potentially starting from several initial values) + def obj_func(theta, eval_gradient=True): + if eval_gradient: + lml, grad = self.log_marginal_likelihood( + theta, eval_gradient=True, clone_kernel=False + ) + return -lml, -grad + else: + return -self.log_marginal_likelihood(theta, clone_kernel=False) + + # First optimize starting from theta specified in kernel + optima = [ + self._constrained_optimization( + obj_func, self.kernel_.theta, self.kernel_.bounds + ) + ] + + # Additional runs are performed from log-uniform chosen initial + # theta + if self.n_restarts_optimizer > 0: + if not np.isfinite(self.kernel_.bounds).all(): + raise ValueError( + "Multiple optimizer restarts (n_restarts_optimizer>0) " + "requires that all bounds are finite." + ) + bounds = self.kernel_.bounds + for iteration in range(self.n_restarts_optimizer): + theta_initial = np.exp(self.rng.uniform(bounds[:, 0], bounds[:, 1])) + optima.append( + self._constrained_optimization(obj_func, theta_initial, bounds) + ) + # Select result from run with minimal (negative) log-marginal + # likelihood + lml_values = list(map(itemgetter(1), optima)) + self.kernel_.theta = optima[np.argmin(lml_values)][0] + self.kernel_._check_bounds_params() + + self.log_marginal_likelihood_value_ = -np.min(lml_values) + else: + self.log_marginal_likelihood_value_ = self.log_marginal_likelihood( + self.kernel_.theta + ) + + # Precompute quantities required for predictions which are independent + # of actual query points + K = self.kernel_(self.X_train_) + + _, (self.pi_, self.W_sr_, self.L_, _, _) = self._posterior_mode( + K, return_temporaries=True + ) + + return self + + def predict(self, X): + """Perform classification on an array of test vectors X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) or list of object + Query points where the GP is evaluated for classification. 
+ + Returns + ------- + C : ndarray of shape (n_samples,) + Predicted target values for X, values are from ``classes_`` + """ + check_is_fitted(self) + + # As discussed on Section 3.4.2 of GPML, for making hard binary + # decisions, it is enough to compute the MAP of the posterior and + # pass it through the link function + K_star = self.kernel_(self.X_train_, X) # K_star =k(x_star) + f_star = K_star.T.dot(self.y_train_ - self.pi_) # Algorithm 3.2,Line 4 + + return np.where(f_star > 0, self.classes_[1], self.classes_[0]) + + def predict_proba(self, X): + """Return probability estimates for the test vector X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) or list of object + Query points where the GP is evaluated for classification. + + Returns + ------- + C : array-like of shape (n_samples, n_classes) + Returns the probability of the samples for each class in + the model. The columns correspond to the classes in sorted + order, as they appear in the attribute ``classes_``. + """ + check_is_fitted(self) + + # Compute the mean and variance of the latent function + # (Lines 4-6 of Algorithm 3.2 of GPML) + latent_mean, latent_var = self.latent_mean_and_variance(X) + + # Line 7: + # Approximate \int log(z) * N(z | f_star, var_f_star) + # Approximation is due to Williams & Barber, "Bayesian Classification + # with Gaussian Processes", Appendix A: Approximate the logistic + # sigmoid by a linear combination of 5 error functions. + # For information on how this integral can be computed see + # blitiri.blogspot.de/2012/11/gaussian-integral-of-error-function.html + alpha = 1 / (2 * latent_var) + gamma = LAMBDAS * latent_mean + integrals = ( + np.sqrt(np.pi / alpha) + * erf(gamma * np.sqrt(alpha / (alpha + LAMBDAS**2))) + / (2 * np.sqrt(latent_var * 2 * np.pi)) + ) + pi_star = (COEFS * integrals).sum(axis=0) + 0.5 * COEFS.sum() + + return np.vstack((1 - pi_star, pi_star)).T + + def log_marginal_likelihood( + self, theta=None, eval_gradient=False, clone_kernel=True + ): + """Returns log-marginal likelihood of theta for training data. + + Parameters + ---------- + theta : array-like of shape (n_kernel_params,), default=None + Kernel hyperparameters for which the log-marginal likelihood is + evaluated. If None, the precomputed log_marginal_likelihood + of ``self.kernel_.theta`` is returned. + + eval_gradient : bool, default=False + If True, the gradient of the log-marginal likelihood with respect + to the kernel hyperparameters at position theta is returned + additionally. If True, theta must not be None. + + clone_kernel : bool, default=True + If True, the kernel attribute is copied. If False, the kernel + attribute is modified, but may result in a performance improvement. + + Returns + ------- + log_likelihood : float + Log-marginal likelihood of theta for training data. + + log_likelihood_gradient : ndarray of shape (n_kernel_params,), \ + optional + Gradient of the log-marginal likelihood with respect to the kernel + hyperparameters at position theta. + Only returned when `eval_gradient` is True. 
+ """ + if theta is None: + if eval_gradient: + raise ValueError("Gradient can only be evaluated for theta!=None") + return self.log_marginal_likelihood_value_ + + if clone_kernel: + kernel = self.kernel_.clone_with_theta(theta) + else: + kernel = self.kernel_ + kernel.theta = theta + + if eval_gradient: + K, K_gradient = kernel(self.X_train_, eval_gradient=True) + else: + K = kernel(self.X_train_) + + # Compute log-marginal-likelihood Z and also store some temporaries + # which can be reused for computing Z's gradient + Z, (pi, W_sr, L, b, a) = self._posterior_mode(K, return_temporaries=True) + + if not eval_gradient: + return Z + + # Compute gradient based on Algorithm 5.1 of GPML + d_Z = np.empty(theta.shape[0]) + # XXX: Get rid of the np.diag() in the next line + R = W_sr[:, np.newaxis] * cho_solve((L, True), np.diag(W_sr)) # Line 7 + C = solve(L, W_sr[:, np.newaxis] * K) # Line 8 + # Line 9: (use einsum to compute np.diag(C.T.dot(C)))) + s_2 = ( + -0.5 + * (np.diag(K) - np.einsum("ij, ij -> j", C, C)) + * (pi * (1 - pi) * (1 - 2 * pi)) + ) # third derivative + + for j in range(d_Z.shape[0]): + C = K_gradient[:, :, j] # Line 11 + # Line 12: (R.T.ravel().dot(C.ravel()) = np.trace(R.dot(C))) + s_1 = 0.5 * a.T.dot(C).dot(a) - 0.5 * R.T.ravel().dot(C.ravel()) + + b = C.dot(self.y_train_ - pi) # Line 13 + s_3 = b - K.dot(R.dot(b)) # Line 14 + + d_Z[j] = s_1 + s_2.T.dot(s_3) # Line 15 + + return Z, d_Z + + def latent_mean_and_variance(self, X): + """Compute the mean and variance of the latent function values. + + Based on algorithm 3.2 of [RW2006]_, this function returns the latent + mean (Line 4) and variance (Line 6) of the Gaussian process + classification model. + + Note that this function is only supported for binary classification. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) or list of object + Query points where the GP is evaluated for classification. + + Returns + ------- + latent_mean : array-like of shape (n_samples,) + Mean of the latent function values at the query points. + + latent_var : array-like of shape (n_samples,) + Variance of the latent function values at the query points. + """ + check_is_fitted(self) + + # Based on Algorithm 3.2 of GPML + K_star = self.kernel_(self.X_train_, X) # K_star =k(x_star) + latent_mean = K_star.T.dot(self.y_train_ - self.pi_) # Line 4 + v = solve(self.L_, self.W_sr_[:, np.newaxis] * K_star) # Line 5 + # Line 6 (compute np.diag(v.T.dot(v)) via einsum) + latent_var = self.kernel_.diag(X) - np.einsum("ij,ij->j", v, v) + + return latent_mean, latent_var + + def _posterior_mode(self, K, return_temporaries=False): + """Mode-finding for binary Laplace GPC and fixed kernel. + + This approximates the posterior of the latent function values for given + inputs and target observations with a Gaussian approximation and uses + Newton's iteration to find the mode of this approximation. 
+ """ + # Based on Algorithm 3.1 of GPML + + # If warm_start are enabled, we reuse the last solution for the + # posterior mode as initialization; otherwise, we initialize with 0 + if ( + self.warm_start + and hasattr(self, "f_cached") + and self.f_cached.shape == self.y_train_.shape + ): + f = self.f_cached + else: + f = np.zeros_like(self.y_train_, dtype=np.float64) + + # Use Newton's iteration method to find mode of Laplace approximation + log_marginal_likelihood = -np.inf + for _ in range(self.max_iter_predict): + # Line 4 + pi = expit(f) + W = pi * (1 - pi) + # Line 5 + W_sr = np.sqrt(W) + W_sr_K = W_sr[:, np.newaxis] * K + B = np.eye(W.shape[0]) + W_sr_K * W_sr + L = cholesky(B, lower=True) + # Line 6 + b = W * f + (self.y_train_ - pi) + # Line 7 + a = b - W_sr * cho_solve((L, True), W_sr_K.dot(b)) + # Line 8 + f = K.dot(a) + + # Line 10: Compute log marginal likelihood in loop and use as + # convergence criterion + lml = ( + -0.5 * a.T.dot(f) + - np.log1p(np.exp(-(self.y_train_ * 2 - 1) * f)).sum() + - np.log(np.diag(L)).sum() + ) + # Check if we have converged (log marginal likelihood does + # not decrease) + # XXX: more complex convergence criterion + if lml - log_marginal_likelihood < 1e-10: + break + log_marginal_likelihood = lml + + self.f_cached = f # Remember solution for later warm-starts + if return_temporaries: + return log_marginal_likelihood, (pi, W_sr, L, b, a) + else: + return log_marginal_likelihood + + def _constrained_optimization(self, obj_func, initial_theta, bounds): + if self.optimizer == "fmin_l_bfgs_b": + opt_res = scipy.optimize.minimize( + obj_func, initial_theta, method="L-BFGS-B", jac=True, bounds=bounds + ) + _check_optimize_result("lbfgs", opt_res) + theta_opt, func_min = opt_res.x, opt_res.fun + elif callable(self.optimizer): + theta_opt, func_min = self.optimizer(obj_func, initial_theta, bounds=bounds) + else: + raise ValueError("Unknown optimizer %s." % self.optimizer) + + return theta_opt, func_min + + +class GaussianProcessClassifier(ClassifierMixin, BaseEstimator): + """Gaussian process classification (GPC) based on Laplace approximation. + + The implementation is based on Algorithm 3.1, 3.2, and 5.1 from [RW2006]_. + + Internally, the Laplace approximation is used for approximating the + non-Gaussian posterior by a Gaussian. + + Currently, the implementation is restricted to using the logistic link + function. For multi-class classification, several binary one-versus rest + classifiers are fitted. Note that this class thus does not implement + a true multi-class Laplace approximation. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.18 + + Parameters + ---------- + kernel : kernel instance, default=None + The kernel specifying the covariance function of the GP. If None is + passed, the kernel "1.0 * RBF(1.0)" is used as default. Note that + the kernel's hyperparameters are optimized during fitting. Also kernel + cannot be a `CompoundKernel`. + + optimizer : 'fmin_l_bfgs_b', callable or None, default='fmin_l_bfgs_b' + Can either be one of the internally supported optimizers for optimizing + the kernel's parameters, specified by a string, or an externally + defined optimizer passed as a callable. 
If a callable is passed, it + must have the signature:: + + def optimizer(obj_func, initial_theta, bounds): + # * 'obj_func' is the objective function to be maximized, which + # takes the hyperparameters theta as parameter and an + # optional flag eval_gradient, which determines if the + # gradient is returned additionally to the function value + # * 'initial_theta': the initial value for theta, which can be + # used by local optimizers + # * 'bounds': the bounds on the values of theta + .... + # Returned are the best found hyperparameters theta and + # the corresponding value of the target function. + return theta_opt, func_min + + Per default, the 'L-BFGS-B' algorithm from scipy.optimize.minimize + is used. If None is passed, the kernel's parameters are kept fixed. + Available internal optimizers are:: + + 'fmin_l_bfgs_b' + + n_restarts_optimizer : int, default=0 + The number of restarts of the optimizer for finding the kernel's + parameters which maximize the log-marginal likelihood. The first run + of the optimizer is performed from the kernel's initial parameters, + the remaining ones (if any) from thetas sampled log-uniform randomly + from the space of allowed theta-values. If greater than 0, all bounds + must be finite. Note that n_restarts_optimizer=0 implies that one + run is performed. + + max_iter_predict : int, default=100 + The maximum number of iterations in Newton's method for approximating + the posterior during predict. Smaller values will reduce computation + time at the cost of worse results. + + warm_start : bool, default=False + If warm-starts are enabled, the solution of the last Newton iteration + on the Laplace approximation of the posterior mode is used as + initialization for the next call of _posterior_mode(). This can speed + up convergence when _posterior_mode is called several times on similar + problems as in hyperparameter optimization. See :term:`the Glossary + `. + + copy_X_train : bool, default=True + If True, a persistent copy of the training data is stored in the + object. Otherwise, just a reference to the training data is stored, + which might cause predictions to change if the data is modified + externally. + + random_state : int, RandomState instance or None, default=None + Determines random number generation used to initialize the centers. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + multi_class : {'one_vs_rest', 'one_vs_one'}, default='one_vs_rest' + Specifies how multi-class classification problems are handled. + Supported are 'one_vs_rest' and 'one_vs_one'. In 'one_vs_rest', + one binary Gaussian process classifier is fitted for each class, which + is trained to separate this class from the rest. In 'one_vs_one', one + binary Gaussian process classifier is fitted for each pair of classes, + which is trained to separate these two classes. The predictions of + these binary predictors are combined into multi-class predictions. + Note that 'one_vs_one' does not support predicting probability + estimates. + + n_jobs : int, default=None + The number of jobs to use for the computation: the specified + multiclass problems are computed in parallel. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Attributes + ---------- + base_estimator_ : ``Estimator`` instance + The estimator instance that defines the likelihood function + using the observed data. 
+ + kernel_ : kernel instance + The kernel used for prediction. In case of binary classification, + the structure of the kernel is the same as the one passed as parameter + but with optimized hyperparameters. In case of multi-class + classification, a CompoundKernel is returned which consists of the + different kernels used in the one-versus-rest classifiers. + + log_marginal_likelihood_value_ : float + The log-marginal-likelihood of ``self.kernel_.theta`` + + classes_ : array-like of shape (n_classes,) + Unique class labels. + + n_classes_ : int + The number of classes in the training data + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + GaussianProcessRegressor : Gaussian process regression (GPR). + + References + ---------- + .. [RW2006] `Carl E. Rasmussen and Christopher K.I. Williams, + "Gaussian Processes for Machine Learning", + MIT Press 2006 `_ + + Examples + -------- + >>> from sklearn.datasets import load_iris + >>> from sklearn.gaussian_process import GaussianProcessClassifier + >>> from sklearn.gaussian_process.kernels import RBF + >>> X, y = load_iris(return_X_y=True) + >>> kernel = 1.0 * RBF(1.0) + >>> gpc = GaussianProcessClassifier(kernel=kernel, + ... random_state=0).fit(X, y) + >>> gpc.score(X, y) + 0.9866... + >>> gpc.predict_proba(X[:2,:]) + array([[0.83548752, 0.03228706, 0.13222543], + [0.79064206, 0.06525643, 0.14410151]]) + + For a comparison of the GaussianProcessClassifier with other classifiers see: + :ref:`sphx_glr_auto_examples_classification_plot_classification_probability.py`. + """ + + _parameter_constraints: dict = { + "kernel": [Kernel, None], + "optimizer": [StrOptions({"fmin_l_bfgs_b"}), callable, None], + "n_restarts_optimizer": [Interval(Integral, 0, None, closed="left")], + "max_iter_predict": [Interval(Integral, 1, None, closed="left")], + "warm_start": ["boolean"], + "copy_X_train": ["boolean"], + "random_state": ["random_state"], + "multi_class": [StrOptions({"one_vs_rest", "one_vs_one"})], + "n_jobs": [Integral, None], + } + + def __init__( + self, + kernel=None, + *, + optimizer="fmin_l_bfgs_b", + n_restarts_optimizer=0, + max_iter_predict=100, + warm_start=False, + copy_X_train=True, + random_state=None, + multi_class="one_vs_rest", + n_jobs=None, + ): + self.kernel = kernel + self.optimizer = optimizer + self.n_restarts_optimizer = n_restarts_optimizer + self.max_iter_predict = max_iter_predict + self.warm_start = warm_start + self.copy_X_train = copy_X_train + self.random_state = random_state + self.multi_class = multi_class + self.n_jobs = n_jobs + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y): + """Fit Gaussian process classification model. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) or list of object + Feature vectors or other representations of training data. + + y : array-like of shape (n_samples,) + Target values, must be binary. + + Returns + ------- + self : object + Returns an instance of self. 
+ """ + if isinstance(self.kernel, CompoundKernel): + raise ValueError("kernel cannot be a CompoundKernel") + + if self.kernel is None or self.kernel.requires_vector_input: + X, y = validate_data( + self, X, y, multi_output=False, ensure_2d=True, dtype="numeric" + ) + else: + X, y = validate_data( + self, X, y, multi_output=False, ensure_2d=False, dtype=None + ) + + self.base_estimator_ = _BinaryGaussianProcessClassifierLaplace( + kernel=self.kernel, + optimizer=self.optimizer, + n_restarts_optimizer=self.n_restarts_optimizer, + max_iter_predict=self.max_iter_predict, + warm_start=self.warm_start, + copy_X_train=self.copy_X_train, + random_state=self.random_state, + ) + + self.classes_ = np.unique(y) + self.n_classes_ = self.classes_.size + if self.n_classes_ == 1: + raise ValueError( + "GaussianProcessClassifier requires 2 or more " + "distinct classes; got %d class (only class %s " + "is present)" % (self.n_classes_, self.classes_[0]) + ) + if self.n_classes_ > 2: + if self.multi_class == "one_vs_rest": + self.base_estimator_ = OneVsRestClassifier( + self.base_estimator_, n_jobs=self.n_jobs + ) + elif self.multi_class == "one_vs_one": + self.base_estimator_ = OneVsOneClassifier( + self.base_estimator_, n_jobs=self.n_jobs + ) + else: + raise ValueError("Unknown multi-class mode %s" % self.multi_class) + + self.base_estimator_.fit(X, y) + + if self.n_classes_ > 2: + self.log_marginal_likelihood_value_ = np.mean( + [ + estimator.log_marginal_likelihood() + for estimator in self.base_estimator_.estimators_ + ] + ) + else: + self.log_marginal_likelihood_value_ = ( + self.base_estimator_.log_marginal_likelihood() + ) + + return self + + def predict(self, X): + """Perform classification on an array of test vectors X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) or list of object + Query points where the GP is evaluated for classification. + + Returns + ------- + C : ndarray of shape (n_samples,) + Predicted target values for X, values are from ``classes_``. + """ + check_is_fitted(self) + + if self.kernel is None or self.kernel.requires_vector_input: + X = validate_data(self, X, ensure_2d=True, dtype="numeric", reset=False) + else: + X = validate_data(self, X, ensure_2d=False, dtype=None, reset=False) + + return self.base_estimator_.predict(X) + + def predict_proba(self, X): + """Return probability estimates for the test vector X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) or list of object + Query points where the GP is evaluated for classification. + + Returns + ------- + C : array-like of shape (n_samples, n_classes) + Returns the probability of the samples for each class in + the model. The columns correspond to the classes in sorted + order, as they appear in the attribute :term:`classes_`. + """ + check_is_fitted(self) + if self.n_classes_ > 2 and self.multi_class == "one_vs_one": + raise ValueError( + "one_vs_one multi-class mode does not support " + "predicting probability estimates. Use " + "one_vs_rest mode instead." 
+ ) + + if self.kernel is None or self.kernel.requires_vector_input: + X = validate_data(self, X, ensure_2d=True, dtype="numeric", reset=False) + else: + X = validate_data(self, X, ensure_2d=False, dtype=None, reset=False) + + return self.base_estimator_.predict_proba(X) + + @property + def kernel_(self): + """Return the kernel of the base estimator.""" + if self.n_classes_ == 2: + return self.base_estimator_.kernel_ + else: + return CompoundKernel( + [estimator.kernel_ for estimator in self.base_estimator_.estimators_] + ) + + def log_marginal_likelihood( + self, theta=None, eval_gradient=False, clone_kernel=True + ): + """Return log-marginal likelihood of theta for training data. + + In the case of multi-class classification, the mean log-marginal + likelihood of the one-versus-rest classifiers are returned. + + Parameters + ---------- + theta : array-like of shape (n_kernel_params,), default=None + Kernel hyperparameters for which the log-marginal likelihood is + evaluated. In the case of multi-class classification, theta may + be the hyperparameters of the compound kernel or of an individual + kernel. In the latter case, all individual kernel get assigned the + same theta values. If None, the precomputed log_marginal_likelihood + of ``self.kernel_.theta`` is returned. + + eval_gradient : bool, default=False + If True, the gradient of the log-marginal likelihood with respect + to the kernel hyperparameters at position theta is returned + additionally. Note that gradient computation is not supported + for non-binary classification. If True, theta must not be None. + + clone_kernel : bool, default=True + If True, the kernel attribute is copied. If False, the kernel + attribute is modified, but may result in a performance improvement. + + Returns + ------- + log_likelihood : float + Log-marginal likelihood of theta for training data. + + log_likelihood_gradient : ndarray of shape (n_kernel_params,), optional + Gradient of the log-marginal likelihood with respect to the kernel + hyperparameters at position theta. + Only returned when `eval_gradient` is True. + """ + check_is_fitted(self) + + if theta is None: + if eval_gradient: + raise ValueError("Gradient can only be evaluated for theta!=None") + return self.log_marginal_likelihood_value_ + + theta = np.asarray(theta) + if self.n_classes_ == 2: + return self.base_estimator_.log_marginal_likelihood( + theta, eval_gradient, clone_kernel=clone_kernel + ) + else: + if eval_gradient: + raise NotImplementedError( + "Gradient of log-marginal-likelihood not implemented for " + "multi-class GPC." + ) + estimators = self.base_estimator_.estimators_ + n_dims = estimators[0].kernel_.n_dims + if theta.shape[0] == n_dims: # use same theta for all sub-kernels + return np.mean( + [ + estimator.log_marginal_likelihood( + theta, clone_kernel=clone_kernel + ) + for i, estimator in enumerate(estimators) + ] + ) + elif theta.shape[0] == n_dims * self.classes_.shape[0]: + # theta for compound kernel + return np.mean( + [ + estimator.log_marginal_likelihood( + theta[n_dims * i : n_dims * (i + 1)], + clone_kernel=clone_kernel, + ) + for i, estimator in enumerate(estimators) + ] + ) + else: + raise ValueError( + "Shape of theta must be either %d or %d. " + "Obtained theta with shape %d." + % (n_dims, n_dims * self.classes_.shape[0], theta.shape[0]) + ) + + def latent_mean_and_variance(self, X): + """Compute the mean and variance of the latent function. 
+ + Based on algorithm 3.2 of [RW2006]_, this function returns the latent + mean (Line 4) and variance (Line 6) of the Gaussian process + classification model. + + Note that this function is only supported for binary classification. + + .. versionadded:: 1.7 + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) or list of object + Query points where the GP is evaluated for classification. + + Returns + ------- + latent_mean : array-like of shape (n_samples,) + Mean of the latent function values at the query points. + + latent_var : array-like of shape (n_samples,) + Variance of the latent function values at the query points. + """ + if self.n_classes_ > 2: + raise ValueError( + "Returning the mean and variance of the latent function f " + "is only supported for binary classification, received " + f"{self.n_classes_} classes." + ) + check_is_fitted(self) + + if self.kernel is None or self.kernel.requires_vector_input: + X = validate_data(self, X, ensure_2d=True, dtype="numeric", reset=False) + else: + X = validate_data(self, X, ensure_2d=False, dtype=None, reset=False) + + return self.base_estimator_.latent_mean_and_variance(X) diff --git a/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/_gpr.py b/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/_gpr.py new file mode 100644 index 0000000000000000000000000000000000000000..d56e7735be787eaf2b1aaeaac0fce228651b2eb6 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/_gpr.py @@ -0,0 +1,675 @@ +"""Gaussian processes regression.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from numbers import Integral, Real +from operator import itemgetter + +import numpy as np +import scipy.optimize +from scipy.linalg import cho_solve, cholesky, solve_triangular + +from ..base import BaseEstimator, MultiOutputMixin, RegressorMixin, _fit_context, clone +from ..preprocessing._data import _handle_zeros_in_scale +from ..utils import check_random_state +from ..utils._param_validation import Interval, StrOptions +from ..utils.optimize import _check_optimize_result +from ..utils.validation import validate_data +from .kernels import RBF, Kernel +from .kernels import ConstantKernel as C + +GPR_CHOLESKY_LOWER = True + + +class GaussianProcessRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): + """Gaussian process regression (GPR). + + The implementation is based on Algorithm 2.1 of [RW2006]_. + + In addition to standard scikit-learn estimator API, + :class:`GaussianProcessRegressor`: + + * allows prediction without prior fitting (based on the GP prior) + * provides an additional method `sample_y(X)`, which evaluates samples + drawn from the GPR (prior or posterior) at given inputs + * exposes a method `log_marginal_likelihood(theta)`, which can be used + externally for other ways of selecting hyperparameters, e.g., via + Markov chain Monte Carlo. + + To learn the difference between a point-estimate approach vs. a more + Bayesian modelling approach, refer to the example entitled + :ref:`sphx_glr_auto_examples_gaussian_process_plot_compare_gpr_krr.py`. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.18 + + Parameters + ---------- + kernel : kernel instance, default=None + The kernel specifying the covariance function of the GP. If None is + passed, the kernel ``ConstantKernel(1.0, constant_value_bounds="fixed") + * RBF(1.0, length_scale_bounds="fixed")`` is used as default. 
Note that + the kernel hyperparameters are optimized during fitting unless the + bounds are marked as "fixed". + + alpha : float or ndarray of shape (n_samples,), default=1e-10 + Value added to the diagonal of the kernel matrix during fitting. + This can prevent a potential numerical issue during fitting, by + ensuring that the calculated values form a positive definite matrix. + It can also be interpreted as the variance of additional Gaussian + measurement noise on the training observations. Note that this is + different from using a `WhiteKernel`. If an array is passed, it must + have the same number of entries as the data used for fitting and is + used as datapoint-dependent noise level. Allowing to specify the + noise level directly as a parameter is mainly for convenience and + for consistency with :class:`~sklearn.linear_model.Ridge`. + For an example illustrating how the alpha parameter controls + the noise variance in Gaussian Process Regression, see + :ref:`sphx_glr_auto_examples_gaussian_process_plot_gpr_noisy_targets.py`. + + optimizer : "fmin_l_bfgs_b", callable or None, default="fmin_l_bfgs_b" + Can either be one of the internally supported optimizers for optimizing + the kernel's parameters, specified by a string, or an externally + defined optimizer passed as a callable. If a callable is passed, it + must have the signature:: + + def optimizer(obj_func, initial_theta, bounds): + # * 'obj_func': the objective function to be minimized, which + # takes the hyperparameters theta as a parameter and an + # optional flag eval_gradient, which determines if the + # gradient is returned additionally to the function value + # * 'initial_theta': the initial value for theta, which can be + # used by local optimizers + # * 'bounds': the bounds on the values of theta + .... + # Returned are the best found hyperparameters theta and + # the corresponding value of the target function. + return theta_opt, func_min + + Per default, the L-BFGS-B algorithm from `scipy.optimize.minimize` + is used. If None is passed, the kernel's parameters are kept fixed. + Available internal optimizers are: `{'fmin_l_bfgs_b'}`. + + n_restarts_optimizer : int, default=0 + The number of restarts of the optimizer for finding the kernel's + parameters which maximize the log-marginal likelihood. The first run + of the optimizer is performed from the kernel's initial parameters, + the remaining ones (if any) from thetas sampled log-uniform randomly + from the space of allowed theta-values. If greater than 0, all bounds + must be finite. Note that `n_restarts_optimizer == 0` implies that one + run is performed. + + normalize_y : bool, default=False + Whether or not to normalize the target values `y` by removing the mean + and scaling to unit-variance. This is recommended for cases where + zero-mean, unit-variance priors are used. Note that, in this + implementation, the normalisation is reversed before the GP predictions + are reported. + + .. versionchanged:: 0.23 + + copy_X_train : bool, default=True + If True, a persistent copy of the training data is stored in the + object. Otherwise, just a reference to the training data is stored, + which might cause predictions to change if the data is modified + externally. + + n_targets : int, default=None + The number of dimensions of the target values. Used to decide the number + of outputs when sampling from the prior distributions (i.e. calling + :meth:`sample_y` before :meth:`fit`). This parameter is ignored once + :meth:`fit` has been called. + + .. 
versionadded:: 1.3 + + random_state : int, RandomState instance or None, default=None + Determines random number generation used to initialize the centers. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + Attributes + ---------- + X_train_ : array-like of shape (n_samples, n_features) or list of object + Feature vectors or other representations of training data (also + required for prediction). + + y_train_ : array-like of shape (n_samples,) or (n_samples, n_targets) + Target values in training data (also required for prediction). + + kernel_ : kernel instance + The kernel used for prediction. The structure of the kernel is the + same as the one passed as parameter but with optimized hyperparameters. + + L_ : array-like of shape (n_samples, n_samples) + Lower-triangular Cholesky decomposition of the kernel in ``X_train_``. + + alpha_ : array-like of shape (n_samples,) + Dual coefficients of training data points in kernel space. + + log_marginal_likelihood_value_ : float + The log-marginal-likelihood of ``self.kernel_.theta``. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + GaussianProcessClassifier : Gaussian process classification (GPC) + based on Laplace approximation. + + References + ---------- + .. [RW2006] `Carl E. Rasmussen and Christopher K.I. Williams, + "Gaussian Processes for Machine Learning", + MIT Press 2006 `_ + + Examples + -------- + >>> from sklearn.datasets import make_friedman2 + >>> from sklearn.gaussian_process import GaussianProcessRegressor + >>> from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel + >>> X, y = make_friedman2(n_samples=500, noise=0, random_state=0) + >>> kernel = DotProduct() + WhiteKernel() + >>> gpr = GaussianProcessRegressor(kernel=kernel, + ... random_state=0).fit(X, y) + >>> gpr.score(X, y) + 0.3680... + >>> gpr.predict(X[:2,:], return_std=True) + (array([653.0, 592.1]), array([316.6, 316.6])) + """ + + _parameter_constraints: dict = { + "kernel": [None, Kernel], + "alpha": [Interval(Real, 0, None, closed="left"), np.ndarray], + "optimizer": [StrOptions({"fmin_l_bfgs_b"}), callable, None], + "n_restarts_optimizer": [Interval(Integral, 0, None, closed="left")], + "normalize_y": ["boolean"], + "copy_X_train": ["boolean"], + "n_targets": [Interval(Integral, 1, None, closed="left"), None], + "random_state": ["random_state"], + } + + def __init__( + self, + kernel=None, + *, + alpha=1e-10, + optimizer="fmin_l_bfgs_b", + n_restarts_optimizer=0, + normalize_y=False, + copy_X_train=True, + n_targets=None, + random_state=None, + ): + self.kernel = kernel + self.alpha = alpha + self.optimizer = optimizer + self.n_restarts_optimizer = n_restarts_optimizer + self.normalize_y = normalize_y + self.copy_X_train = copy_X_train + self.n_targets = n_targets + self.random_state = random_state + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y): + """Fit Gaussian process regression model. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) or list of object + Feature vectors or other representations of training data. + + y : array-like of shape (n_samples,) or (n_samples, n_targets) + Target values. + + Returns + ------- + self : object + GaussianProcessRegressor class instance. 
+ """ + if self.kernel is None: # Use an RBF kernel as default + self.kernel_ = C(1.0, constant_value_bounds="fixed") * RBF( + 1.0, length_scale_bounds="fixed" + ) + else: + self.kernel_ = clone(self.kernel) + + self._rng = check_random_state(self.random_state) + + if self.kernel_.requires_vector_input: + dtype, ensure_2d = "numeric", True + else: + dtype, ensure_2d = None, False + X, y = validate_data( + self, + X, + y, + multi_output=True, + y_numeric=True, + ensure_2d=ensure_2d, + dtype=dtype, + ) + + n_targets_seen = y.shape[1] if y.ndim > 1 else 1 + if self.n_targets is not None and n_targets_seen != self.n_targets: + raise ValueError( + "The number of targets seen in `y` is different from the parameter " + f"`n_targets`. Got {n_targets_seen} != {self.n_targets}." + ) + + # Normalize target value + if self.normalize_y: + self._y_train_mean = np.mean(y, axis=0) + self._y_train_std = _handle_zeros_in_scale(np.std(y, axis=0), copy=False) + + # Remove mean and make unit variance + y = (y - self._y_train_mean) / self._y_train_std + + else: + shape_y_stats = (y.shape[1],) if y.ndim == 2 else 1 + self._y_train_mean = np.zeros(shape=shape_y_stats) + self._y_train_std = np.ones(shape=shape_y_stats) + + if np.iterable(self.alpha) and self.alpha.shape[0] != y.shape[0]: + if self.alpha.shape[0] == 1: + self.alpha = self.alpha[0] + else: + raise ValueError( + "alpha must be a scalar or an array with same number of " + f"entries as y. ({self.alpha.shape[0]} != {y.shape[0]})" + ) + + self.X_train_ = np.copy(X) if self.copy_X_train else X + self.y_train_ = np.copy(y) if self.copy_X_train else y + + if self.optimizer is not None and self.kernel_.n_dims > 0: + # Choose hyperparameters based on maximizing the log-marginal + # likelihood (potentially starting from several initial values) + def obj_func(theta, eval_gradient=True): + if eval_gradient: + lml, grad = self.log_marginal_likelihood( + theta, eval_gradient=True, clone_kernel=False + ) + return -lml, -grad + else: + return -self.log_marginal_likelihood(theta, clone_kernel=False) + + # First optimize starting from theta specified in kernel + optima = [ + ( + self._constrained_optimization( + obj_func, self.kernel_.theta, self.kernel_.bounds + ) + ) + ] + + # Additional runs are performed from log-uniform chosen initial + # theta + if self.n_restarts_optimizer > 0: + if not np.isfinite(self.kernel_.bounds).all(): + raise ValueError( + "Multiple optimizer restarts (n_restarts_optimizer>0) " + "requires that all bounds are finite." + ) + bounds = self.kernel_.bounds + for iteration in range(self.n_restarts_optimizer): + theta_initial = self._rng.uniform(bounds[:, 0], bounds[:, 1]) + optima.append( + self._constrained_optimization(obj_func, theta_initial, bounds) + ) + # Select result from run with minimal (negative) log-marginal + # likelihood + lml_values = list(map(itemgetter(1), optima)) + self.kernel_.theta = optima[np.argmin(lml_values)][0] + self.kernel_._check_bounds_params() + + self.log_marginal_likelihood_value_ = -np.min(lml_values) + else: + self.log_marginal_likelihood_value_ = self.log_marginal_likelihood( + self.kernel_.theta, clone_kernel=False + ) + + # Precompute quantities required for predictions which are independent + # of actual query points + # Alg. 
2.1, page 19, line 2 -> L = cholesky(K + sigma^2 I) + K = self.kernel_(self.X_train_) + K[np.diag_indices_from(K)] += self.alpha + try: + self.L_ = cholesky(K, lower=GPR_CHOLESKY_LOWER, check_finite=False) + except np.linalg.LinAlgError as exc: + exc.args = ( + ( + f"The kernel, {self.kernel_}, is not returning a positive " + "definite matrix. Try gradually increasing the 'alpha' " + "parameter of your GaussianProcessRegressor estimator." + ), + ) + exc.args + raise + # Alg 2.1, page 19, line 3 -> alpha = L^T \ (L \ y) + self.alpha_ = cho_solve( + (self.L_, GPR_CHOLESKY_LOWER), + self.y_train_, + check_finite=False, + ) + return self + + def predict(self, X, return_std=False, return_cov=False): + """Predict using the Gaussian process regression model. + + We can also predict based on an unfitted model by using the GP prior. + In addition to the mean of the predictive distribution, optionally also + returns its standard deviation (`return_std=True`) or covariance + (`return_cov=True`). Note that at most one of the two can be requested. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) or list of object + Query points where the GP is evaluated. + + return_std : bool, default=False + If True, the standard-deviation of the predictive distribution at + the query points is returned along with the mean. + + return_cov : bool, default=False + If True, the covariance of the joint predictive distribution at + the query points is returned along with the mean. + + Returns + ------- + y_mean : ndarray of shape (n_samples,) or (n_samples, n_targets) + Mean of predictive distribution at query points. + + y_std : ndarray of shape (n_samples,) or (n_samples, n_targets), optional + Standard deviation of predictive distribution at query points. + Only returned when `return_std` is True. + + y_cov : ndarray of shape (n_samples, n_samples) or \ + (n_samples, n_samples, n_targets), optional + Covariance of joint predictive distribution at query points. + Only returned when `return_cov` is True. + """ + if return_std and return_cov: + raise RuntimeError( + "At most one of return_std or return_cov can be requested." + ) + + if self.kernel is None or self.kernel.requires_vector_input: + dtype, ensure_2d = "numeric", True + else: + dtype, ensure_2d = None, False + + X = validate_data(self, X, ensure_2d=ensure_2d, dtype=dtype, reset=False) + + if not hasattr(self, "X_train_"): # Unfitted;predict based on GP prior + if self.kernel is None: + kernel = C(1.0, constant_value_bounds="fixed") * RBF( + 1.0, length_scale_bounds="fixed" + ) + else: + kernel = self.kernel + + n_targets = self.n_targets if self.n_targets is not None else 1 + y_mean = np.zeros(shape=(X.shape[0], n_targets)).squeeze() + + if return_cov: + y_cov = kernel(X) + if n_targets > 1: + y_cov = np.repeat( + np.expand_dims(y_cov, -1), repeats=n_targets, axis=-1 + ) + return y_mean, y_cov + elif return_std: + y_var = kernel.diag(X) + if n_targets > 1: + y_var = np.repeat( + np.expand_dims(y_var, -1), repeats=n_targets, axis=-1 + ) + return y_mean, np.sqrt(y_var) + else: + return y_mean + else: # Predict based on GP posterior + # Alg 2.1, page 19, line 4 -> f*_bar = K(X_test, X_train) . 
alpha + K_trans = self.kernel_(X, self.X_train_) + y_mean = K_trans @ self.alpha_ + + # undo normalisation + y_mean = self._y_train_std * y_mean + self._y_train_mean + + # if y_mean has shape (n_samples, 1), reshape to (n_samples,) + if y_mean.ndim > 1 and y_mean.shape[1] == 1: + y_mean = np.squeeze(y_mean, axis=1) + + # Alg 2.1, page 19, line 5 -> v = L \ K(X_test, X_train)^T + V = solve_triangular( + self.L_, K_trans.T, lower=GPR_CHOLESKY_LOWER, check_finite=False + ) + + if return_cov: + # Alg 2.1, page 19, line 6 -> K(X_test, X_test) - v^T. v + y_cov = self.kernel_(X) - V.T @ V + + # undo normalisation + y_cov = np.outer(y_cov, self._y_train_std**2).reshape(*y_cov.shape, -1) + # if y_cov has shape (n_samples, n_samples, 1), reshape to + # (n_samples, n_samples) + if y_cov.shape[2] == 1: + y_cov = np.squeeze(y_cov, axis=2) + + return y_mean, y_cov + elif return_std: + # Compute variance of predictive distribution + # Use einsum to avoid explicitly forming the large matrix + # V^T @ V just to extract its diagonal afterward. + y_var = self.kernel_.diag(X).copy() + y_var -= np.einsum("ij,ji->i", V.T, V) + + # Check if any of the variances is negative because of + # numerical issues. If yes: set the variance to 0. + y_var_negative = y_var < 0 + if np.any(y_var_negative): + warnings.warn( + "Predicted variances smaller than 0. " + "Setting those variances to 0." + ) + y_var[y_var_negative] = 0.0 + + # undo normalisation + y_var = np.outer(y_var, self._y_train_std**2).reshape(*y_var.shape, -1) + + # if y_var has shape (n_samples, 1), reshape to (n_samples,) + if y_var.shape[1] == 1: + y_var = np.squeeze(y_var, axis=1) + + return y_mean, np.sqrt(y_var) + else: + return y_mean + + def sample_y(self, X, n_samples=1, random_state=0): + """Draw samples from Gaussian process and evaluate at X. + + Parameters + ---------- + X : array-like of shape (n_samples_X, n_features) or list of object + Query points where the GP is evaluated. + + n_samples : int, default=1 + Number of samples drawn from the Gaussian process per query point. + + random_state : int, RandomState instance or None, default=0 + Determines random number generation to randomly draw samples. + Pass an int for reproducible results across multiple function + calls. + See :term:`Glossary `. + + Returns + ------- + y_samples : ndarray of shape (n_samples_X, n_samples), or \ + (n_samples_X, n_targets, n_samples) + Values of n_samples samples drawn from Gaussian process and + evaluated at query points. + """ + rng = check_random_state(random_state) + + y_mean, y_cov = self.predict(X, return_cov=True) + if y_mean.ndim == 1: + y_samples = rng.multivariate_normal(y_mean, y_cov, n_samples).T + else: + y_samples = [ + rng.multivariate_normal( + y_mean[:, target], y_cov[..., target], n_samples + ).T[:, np.newaxis] + for target in range(y_mean.shape[1]) + ] + y_samples = np.hstack(y_samples) + return y_samples + + def log_marginal_likelihood( + self, theta=None, eval_gradient=False, clone_kernel=True + ): + """Return log-marginal likelihood of theta for training data. + + Parameters + ---------- + theta : array-like of shape (n_kernel_params,) default=None + Kernel hyperparameters for which the log-marginal likelihood is + evaluated. If None, the precomputed log_marginal_likelihood + of ``self.kernel_.theta`` is returned. + + eval_gradient : bool, default=False + If True, the gradient of the log-marginal likelihood with respect + to the kernel hyperparameters at position theta is returned + additionally. If True, theta must not be None. 
+ + clone_kernel : bool, default=True + If True, the kernel attribute is copied. If False, the kernel + attribute is modified, but may result in a performance improvement. + + Returns + ------- + log_likelihood : float + Log-marginal likelihood of theta for training data. + + log_likelihood_gradient : ndarray of shape (n_kernel_params,), optional + Gradient of the log-marginal likelihood with respect to the kernel + hyperparameters at position theta. + Only returned when eval_gradient is True. + """ + if theta is None: + if eval_gradient: + raise ValueError("Gradient can only be evaluated for theta!=None") + return self.log_marginal_likelihood_value_ + + if clone_kernel: + kernel = self.kernel_.clone_with_theta(theta) + else: + kernel = self.kernel_ + kernel.theta = theta + + if eval_gradient: + K, K_gradient = kernel(self.X_train_, eval_gradient=True) + else: + K = kernel(self.X_train_) + + # Alg. 2.1, page 19, line 2 -> L = cholesky(K + sigma^2 I) + K[np.diag_indices_from(K)] += self.alpha + try: + L = cholesky(K, lower=GPR_CHOLESKY_LOWER, check_finite=False) + except np.linalg.LinAlgError: + return (-np.inf, np.zeros_like(theta)) if eval_gradient else -np.inf + + # Support multi-dimensional output of self.y_train_ + y_train = self.y_train_ + if y_train.ndim == 1: + y_train = y_train[:, np.newaxis] + + # Alg 2.1, page 19, line 3 -> alpha = L^T \ (L \ y) + alpha = cho_solve((L, GPR_CHOLESKY_LOWER), y_train, check_finite=False) + + # Alg 2.1, page 19, line 7 + # -0.5 . y^T . alpha - sum(log(diag(L))) - n_samples / 2 log(2*pi) + # y is originally thought to be a (1, n_samples) row vector. However, + # in multioutputs, y is of shape (n_samples, 2) and we need to compute + # y^T . alpha for each output, independently using einsum. Thus, it + # is equivalent to: + # for output_idx in range(n_outputs): + # log_likelihood_dims[output_idx] = ( + # y_train[:, [output_idx]] @ alpha[:, [output_idx]] + # ) + log_likelihood_dims = -0.5 * np.einsum("ik,ik->k", y_train, alpha) + log_likelihood_dims -= np.log(np.diag(L)).sum() + log_likelihood_dims -= K.shape[0] / 2 * np.log(2 * np.pi) + # the log likelihood is sum-up across the outputs + log_likelihood = log_likelihood_dims.sum(axis=-1) + + if eval_gradient: + # Eq. 5.9, p. 114, and footnote 5 in p. 114 + # 0.5 * trace((alpha . alpha^T - K^-1) . K_gradient) + # alpha is supposed to be a vector of (n_samples,) elements. With + # multioutputs, alpha is a matrix of size (n_samples, n_outputs). + # Therefore, we want to construct a matrix of + # (n_samples, n_samples, n_outputs) equivalent to + # for output_idx in range(n_outputs): + # output_alpha = alpha[:, [output_idx]] + # inner_term[..., output_idx] = output_alpha @ output_alpha.T + inner_term = np.einsum("ik,jk->ijk", alpha, alpha) + # compute K^-1 of shape (n_samples, n_samples) + K_inv = cho_solve( + (L, GPR_CHOLESKY_LOWER), np.eye(K.shape[0]), check_finite=False + ) + # create a new axis to use broadcasting between inner_term and + # K_inv + inner_term -= K_inv[..., np.newaxis] + # Since we are interested about the trace of + # inner_term @ K_gradient, we don't explicitly compute the + # matrix-by-matrix operation and instead use an einsum. 
Therefore + # it is equivalent to: + # for param_idx in range(n_kernel_params): + # for output_idx in range(n_output): + # log_likehood_gradient_dims[param_idx, output_idx] = ( + # inner_term[..., output_idx] @ + # K_gradient[..., param_idx] + # ) + log_likelihood_gradient_dims = 0.5 * np.einsum( + "ijl,jik->kl", inner_term, K_gradient + ) + # the log likelihood gradient is the sum-up across the outputs + log_likelihood_gradient = log_likelihood_gradient_dims.sum(axis=-1) + + if eval_gradient: + return log_likelihood, log_likelihood_gradient + else: + return log_likelihood + + def _constrained_optimization(self, obj_func, initial_theta, bounds): + if self.optimizer == "fmin_l_bfgs_b": + opt_res = scipy.optimize.minimize( + obj_func, + initial_theta, + method="L-BFGS-B", + jac=True, + bounds=bounds, + ) + _check_optimize_result("lbfgs", opt_res) + theta_opt, func_min = opt_res.x, opt_res.fun + elif callable(self.optimizer): + theta_opt, func_min = self.optimizer(obj_func, initial_theta, bounds=bounds) + else: + raise ValueError(f"Unknown optimizer {self.optimizer}.") + + return theta_opt, func_min + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.requires_fit = False + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/kernels.py b/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/kernels.py new file mode 100644 index 0000000000000000000000000000000000000000..4a0a6ec667be421695e9d5e85d8282887614a2fe --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/kernels.py @@ -0,0 +1,2408 @@ +"""A set of kernels that can be combined by operators and used in Gaussian processes.""" + +# Kernels for Gaussian process regression and classification. +# +# The kernels in this module allow kernel-engineering, i.e., they can be +# combined via the "+" and "*" operators or be exponentiated with a scalar +# via "**". These sum and product expressions can also contain scalar values, +# which are automatically converted to a constant kernel. +# +# All kernels allow (analytic) gradient-based hyperparameter optimization. +# The space of hyperparameters can be specified by giving lower und upper +# boundaries for the value of each hyperparameter (the search space is thus +# rectangular). Instead of specifying bounds, hyperparameters can also be +# declared to be "fixed", which causes these hyperparameters to be excluded from +# optimization. + + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +# Note: this module is strongly inspired by the kernel module of the george +# package. 
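The two modules added above describe, respectively, the regressor (fit via Cholesky factorization of K + alpha*I, L-BFGS-B maximization of the log-marginal likelihood, predictive mean/std) and the kernel algebra ("+", "*", "**" build Sum/Product/Exponentiation kernels, with log-transformed hyperparameters and optional "fixed" bounds). The following is an illustrative sketch, not part of the diff or of the upstream files: it only uses public scikit-learn API shown in the docstrings above, and the toy data is invented for the example.

import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel, WhiteKernel

rng = np.random.RandomState(0)
X = rng.uniform(0, 5, size=(40, 1))
y = np.sin(X).ravel() + rng.normal(scale=0.1, size=40)

# Kernel engineering: "*" and "+" build Product/Sum kernels; scalars are
# promoted to ConstantKernel. Bounds constrain the hyperparameter search
# and can be set to "fixed" to exclude a parameter from optimization.
kernel = (
    ConstantKernel(1.0, constant_value_bounds=(1e-3, 1e3)) * RBF(length_scale=1.0)
    + WhiteKernel(noise_level=1e-2, noise_level_bounds=(1e-6, 1e1))
)

gpr = GaussianProcessRegressor(
    kernel=kernel,
    alpha=1e-10,              # extra jitter added to the kernel diagonal
    n_restarts_optimizer=3,   # restarts from log-uniform samples within the bounds
    normalize_y=True,
    random_state=0,
).fit(X, y)

X_test = np.linspace(0, 5, 100).reshape(-1, 1)
y_mean, y_std = gpr.predict(X_test, return_std=True)
print(gpr.kernel_)                          # kernel with optimized hyperparameters
print(gpr.log_marginal_likelihood_value_)   # LML of the selected theta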
+ +import math +import warnings +from abc import ABCMeta, abstractmethod +from collections import namedtuple +from inspect import signature + +import numpy as np +from scipy.spatial.distance import cdist, pdist, squareform +from scipy.special import gamma, kv + +from ..base import clone +from ..exceptions import ConvergenceWarning +from ..metrics.pairwise import pairwise_kernels +from ..utils.validation import _num_samples + + +def _check_length_scale(X, length_scale): + length_scale = np.squeeze(length_scale).astype(float) + if np.ndim(length_scale) > 1: + raise ValueError("length_scale cannot be of dimension greater than 1") + if np.ndim(length_scale) == 1 and X.shape[1] != length_scale.shape[0]: + raise ValueError( + "Anisotropic kernel must have the same number of " + "dimensions as data (%d!=%d)" % (length_scale.shape[0], X.shape[1]) + ) + return length_scale + + +class Hyperparameter( + namedtuple( + "Hyperparameter", ("name", "value_type", "bounds", "n_elements", "fixed") + ) +): + """A kernel hyperparameter's specification in form of a namedtuple. + + .. versionadded:: 0.18 + + Attributes + ---------- + name : str + The name of the hyperparameter. Note that a kernel using a + hyperparameter with name "x" must have the attributes self.x and + self.x_bounds + + value_type : str + The type of the hyperparameter. Currently, only "numeric" + hyperparameters are supported. + + bounds : pair of floats >= 0 or "fixed" + The lower and upper bound on the parameter. If n_elements>1, a pair + of 1d array with n_elements each may be given alternatively. If + the string "fixed" is passed as bounds, the hyperparameter's value + cannot be changed. + + n_elements : int, default=1 + The number of elements of the hyperparameter value. Defaults to 1, + which corresponds to a scalar hyperparameter. n_elements > 1 + corresponds to a hyperparameter which is vector-valued, + such as, e.g., anisotropic length-scales. + + fixed : bool, default=None + Whether the value of this hyperparameter is fixed, i.e., cannot be + changed during hyperparameter tuning. If None is passed, the "fixed" is + derived based on the given bounds. + + Examples + -------- + >>> from sklearn.gaussian_process.kernels import ConstantKernel + >>> from sklearn.datasets import make_friedman2 + >>> from sklearn.gaussian_process import GaussianProcessRegressor + >>> from sklearn.gaussian_process.kernels import Hyperparameter + >>> X, y = make_friedman2(n_samples=50, noise=0, random_state=0) + >>> kernel = ConstantKernel(constant_value=1.0, + ... constant_value_bounds=(0.0, 10.0)) + + We can access each hyperparameter: + + >>> for hyperparameter in kernel.hyperparameters: + ... print(hyperparameter) + Hyperparameter(name='constant_value', value_type='numeric', + bounds=array([[ 0., 10.]]), n_elements=1, fixed=False) + + >>> params = kernel.get_params() + >>> for key in sorted(params): print(f"{key} : {params[key]}") + constant_value : 1.0 + constant_value_bounds : (0.0, 10.0) + """ + + # A raw namedtuple is very memory efficient as it packs the attributes + # in a struct to get rid of the __dict__ of attributes in particular it + # does not copy the string for the keys on each instance. + # By deriving a namedtuple class just to introduce the __init__ method we + # would also reintroduce the __dict__ on the instance. By telling the + # Python interpreter that this subclass uses static __slots__ instead of + # dynamic attributes. Furthermore we don't need any additional slot in the + # subclass so we set __slots__ to the empty tuple. 
+ __slots__ = () + + def __new__(cls, name, value_type, bounds, n_elements=1, fixed=None): + if not isinstance(bounds, str) or bounds != "fixed": + bounds = np.atleast_2d(bounds) + if n_elements > 1: # vector-valued parameter + if bounds.shape[0] == 1: + bounds = np.repeat(bounds, n_elements, 0) + elif bounds.shape[0] != n_elements: + raise ValueError( + "Bounds on %s should have either 1 or " + "%d dimensions. Given are %d" + % (name, n_elements, bounds.shape[0]) + ) + + if fixed is None: + fixed = isinstance(bounds, str) and bounds == "fixed" + return super().__new__(cls, name, value_type, bounds, n_elements, fixed) + + # This is mainly a testing utility to check that two hyperparameters + # are equal. + def __eq__(self, other): + return ( + self.name == other.name + and self.value_type == other.value_type + and np.all(self.bounds == other.bounds) + and self.n_elements == other.n_elements + and self.fixed == other.fixed + ) + + +class Kernel(metaclass=ABCMeta): + """Base class for all kernels. + + .. versionadded:: 0.18 + + Examples + -------- + >>> from sklearn.gaussian_process.kernels import Kernel, RBF + >>> import numpy as np + >>> class CustomKernel(Kernel): + ... def __init__(self, length_scale=1.0): + ... self.length_scale = length_scale + ... def __call__(self, X, Y=None): + ... if Y is None: + ... Y = X + ... return np.inner(X, X if Y is None else Y) ** 2 + ... def diag(self, X): + ... return np.ones(X.shape[0]) + ... def is_stationary(self): + ... return True + >>> kernel = CustomKernel(length_scale=2.0) + >>> X = np.array([[1, 2], [3, 4]]) + >>> print(kernel(X)) + [[ 25 121] + [121 625]] + """ + + def get_params(self, deep=True): + """Get parameters of this kernel. + + Parameters + ---------- + deep : bool, default=True + If True, will return the parameters for this estimator and + contained subobjects that are estimators. + + Returns + ------- + params : dict + Parameter names mapped to their values. + """ + params = dict() + + # introspect the constructor arguments to find the model parameters + # to represent + cls = self.__class__ + init = getattr(cls.__init__, "deprecated_original", cls.__init__) + init_sign = signature(init) + args, varargs = [], [] + for parameter in init_sign.parameters.values(): + if parameter.kind != parameter.VAR_KEYWORD and parameter.name != "self": + args.append(parameter.name) + if parameter.kind == parameter.VAR_POSITIONAL: + varargs.append(parameter.name) + + if len(varargs) != 0: + raise RuntimeError( + "scikit-learn kernels should always " + "specify their parameters in the signature" + " of their __init__ (no varargs)." + " %s doesn't follow this convention." % (cls,) + ) + for arg in args: + params[arg] = getattr(self, arg) + + return params + + def set_params(self, **params): + """Set the parameters of this kernel. + + The method works on simple kernels as well as on nested kernels. + The latter have parameters of the form ``__`` + so that it's possible to update each component of a nested object. + + Returns + ------- + self + """ + if not params: + # Simple optimisation to gain speed (inspect is slow) + return self + valid_params = self.get_params(deep=True) + for key, value in params.items(): + split = key.split("__", 1) + if len(split) > 1: + # nested objects case + name, sub_name = split + if name not in valid_params: + raise ValueError( + "Invalid parameter %s for kernel %s. " + "Check the list of available parameters " + "with `kernel.get_params().keys()`." 
% (name, self) + ) + sub_object = valid_params[name] + sub_object.set_params(**{sub_name: value}) + else: + # simple objects case + if key not in valid_params: + raise ValueError( + "Invalid parameter %s for kernel %s. " + "Check the list of available parameters " + "with `kernel.get_params().keys()`." + % (key, self.__class__.__name__) + ) + setattr(self, key, value) + return self + + def clone_with_theta(self, theta): + """Returns a clone of self with given hyperparameters theta. + + Parameters + ---------- + theta : ndarray of shape (n_dims,) + The hyperparameters + """ + cloned = clone(self) + cloned.theta = theta + return cloned + + @property + def n_dims(self): + """Returns the number of non-fixed hyperparameters of the kernel.""" + return self.theta.shape[0] + + @property + def hyperparameters(self): + """Returns a list of all hyperparameter specifications.""" + r = [ + getattr(self, attr) + for attr in dir(self) + if attr.startswith("hyperparameter_") + ] + return r + + @property + def theta(self): + """Returns the (flattened, log-transformed) non-fixed hyperparameters. + + Note that theta are typically the log-transformed values of the + kernel's hyperparameters as this representation of the search space + is more amenable for hyperparameter search, as hyperparameters like + length-scales naturally live on a log-scale. + + Returns + ------- + theta : ndarray of shape (n_dims,) + The non-fixed, log-transformed hyperparameters of the kernel + """ + theta = [] + params = self.get_params() + for hyperparameter in self.hyperparameters: + if not hyperparameter.fixed: + theta.append(params[hyperparameter.name]) + if len(theta) > 0: + return np.log(np.hstack(theta)) + else: + return np.array([]) + + @theta.setter + def theta(self, theta): + """Sets the (flattened, log-transformed) non-fixed hyperparameters. + + Parameters + ---------- + theta : ndarray of shape (n_dims,) + The non-fixed, log-transformed hyperparameters of the kernel + """ + params = self.get_params() + i = 0 + for hyperparameter in self.hyperparameters: + if hyperparameter.fixed: + continue + if hyperparameter.n_elements > 1: + # vector-valued parameter + params[hyperparameter.name] = np.exp( + theta[i : i + hyperparameter.n_elements] + ) + i += hyperparameter.n_elements + else: + params[hyperparameter.name] = np.exp(theta[i]) + i += 1 + + if i != len(theta): + raise ValueError( + "theta has not the correct number of entries." + " Should be %d; given are %d" % (i, len(theta)) + ) + self.set_params(**params) + + @property + def bounds(self): + """Returns the log-transformed bounds on the theta. 
+ + Returns + ------- + bounds : ndarray of shape (n_dims, 2) + The log-transformed bounds on the kernel's hyperparameters theta + """ + bounds = [ + hyperparameter.bounds + for hyperparameter in self.hyperparameters + if not hyperparameter.fixed + ] + if len(bounds) > 0: + return np.log(np.vstack(bounds)) + else: + return np.array([]) + + def __add__(self, b): + if not isinstance(b, Kernel): + return Sum(self, ConstantKernel(b)) + return Sum(self, b) + + def __radd__(self, b): + if not isinstance(b, Kernel): + return Sum(ConstantKernel(b), self) + return Sum(b, self) + + def __mul__(self, b): + if not isinstance(b, Kernel): + return Product(self, ConstantKernel(b)) + return Product(self, b) + + def __rmul__(self, b): + if not isinstance(b, Kernel): + return Product(ConstantKernel(b), self) + return Product(b, self) + + def __pow__(self, b): + return Exponentiation(self, b) + + def __eq__(self, b): + if type(self) != type(b): + return False + params_a = self.get_params() + params_b = b.get_params() + for key in set(list(params_a.keys()) + list(params_b.keys())): + if np.any(params_a.get(key, None) != params_b.get(key, None)): + return False + return True + + def __repr__(self): + return "{0}({1})".format( + self.__class__.__name__, ", ".join(map("{0:.3g}".format, self.theta)) + ) + + @abstractmethod + def __call__(self, X, Y=None, eval_gradient=False): + """Evaluate the kernel.""" + + @abstractmethod + def diag(self, X): + """Returns the diagonal of the kernel k(X, X). + + The result of this method is identical to np.diag(self(X)); however, + it can be evaluated more efficiently since only the diagonal is + evaluated. + + Parameters + ---------- + X : array-like of shape (n_samples,) + Left argument of the returned kernel k(X, Y) + + Returns + ------- + K_diag : ndarray of shape (n_samples_X,) + Diagonal of kernel k(X, X) + """ + + @abstractmethod + def is_stationary(self): + """Returns whether the kernel is stationary.""" + + @property + def requires_vector_input(self): + """Returns whether the kernel is defined on fixed-length feature + vectors or generic objects. Defaults to True for backward + compatibility.""" + return True + + def _check_bounds_params(self): + """Called after fitting to warn if bounds may have been too tight.""" + list_close = np.isclose(self.bounds, np.atleast_2d(self.theta).T) + idx = 0 + for hyp in self.hyperparameters: + if hyp.fixed: + continue + for dim in range(hyp.n_elements): + if list_close[idx, 0]: + warnings.warn( + "The optimal value found for " + "dimension %s of parameter %s is " + "close to the specified lower " + "bound %s. Decreasing the bound and" + " calling fit again may find a " + "better value." % (dim, hyp.name, hyp.bounds[dim][0]), + ConvergenceWarning, + ) + elif list_close[idx, 1]: + warnings.warn( + "The optimal value found for " + "dimension %s of parameter %s is " + "close to the specified upper " + "bound %s. Increasing the bound and" + " calling fit again may find a " + "better value." % (dim, hyp.name, hyp.bounds[dim][1]), + ConvergenceWarning, + ) + idx += 1 + + +class NormalizedKernelMixin: + """Mixin for kernels which are normalized: k(X, X)=1. + + .. versionadded:: 0.18 + """ + + def diag(self, X): + """Returns the diagonal of the kernel k(X, X). + + The result of this method is identical to np.diag(self(X)); however, + it can be evaluated more efficiently since only the diagonal is + evaluated. 
+ + Parameters + ---------- + X : ndarray of shape (n_samples_X, n_features) + Left argument of the returned kernel k(X, Y) + + Returns + ------- + K_diag : ndarray of shape (n_samples_X,) + Diagonal of kernel k(X, X) + """ + return np.ones(X.shape[0]) + + +class StationaryKernelMixin: + """Mixin for kernels which are stationary: k(X, Y)= f(X-Y). + + .. versionadded:: 0.18 + """ + + def is_stationary(self): + """Returns whether the kernel is stationary.""" + return True + + +class GenericKernelMixin: + """Mixin for kernels which operate on generic objects such as variable- + length sequences, trees, and graphs. + + .. versionadded:: 0.22 + """ + + @property + def requires_vector_input(self): + """Whether the kernel works only on fixed-length feature vectors.""" + return False + + +class CompoundKernel(Kernel): + """Kernel which is composed of a set of other kernels. + + .. versionadded:: 0.18 + + Parameters + ---------- + kernels : list of Kernels + The other kernels + + Examples + -------- + >>> from sklearn.gaussian_process.kernels import WhiteKernel + >>> from sklearn.gaussian_process.kernels import RBF + >>> from sklearn.gaussian_process.kernels import CompoundKernel + >>> kernel = CompoundKernel( + ... [WhiteKernel(noise_level=3.0), RBF(length_scale=2.0)]) + >>> print(kernel.bounds) + [[-11.51292546 11.51292546] + [-11.51292546 11.51292546]] + >>> print(kernel.n_dims) + 2 + >>> print(kernel.theta) + [1.09861229 0.69314718] + """ + + def __init__(self, kernels): + self.kernels = kernels + + def get_params(self, deep=True): + """Get parameters of this kernel. + + Parameters + ---------- + deep : bool, default=True + If True, will return the parameters for this estimator and + contained subobjects that are estimators. + + Returns + ------- + params : dict + Parameter names mapped to their values. + """ + return dict(kernels=self.kernels) + + @property + def theta(self): + """Returns the (flattened, log-transformed) non-fixed hyperparameters. + + Note that theta are typically the log-transformed values of the + kernel's hyperparameters as this representation of the search space + is more amenable for hyperparameter search, as hyperparameters like + length-scales naturally live on a log-scale. + + Returns + ------- + theta : ndarray of shape (n_dims,) + The non-fixed, log-transformed hyperparameters of the kernel + """ + return np.hstack([kernel.theta for kernel in self.kernels]) + + @theta.setter + def theta(self, theta): + """Sets the (flattened, log-transformed) non-fixed hyperparameters. + + Parameters + ---------- + theta : array of shape (n_dims,) + The non-fixed, log-transformed hyperparameters of the kernel + """ + k_dims = self.k1.n_dims + for i, kernel in enumerate(self.kernels): + kernel.theta = theta[i * k_dims : (i + 1) * k_dims] + + @property + def bounds(self): + """Returns the log-transformed bounds on the theta. + + Returns + ------- + bounds : array of shape (n_dims, 2) + The log-transformed bounds on the kernel's hyperparameters theta + """ + return np.vstack([kernel.bounds for kernel in self.kernels]) + + def __call__(self, X, Y=None, eval_gradient=False): + """Return the kernel k(X, Y) and optionally its gradient. + + Note that this compound kernel returns the results of all simple kernel + stacked along an additional axis. 
+ + Parameters + ---------- + X : array-like of shape (n_samples_X, n_features) or list of object, \ + default=None + Left argument of the returned kernel k(X, Y) + + Y : array-like of shape (n_samples_X, n_features) or list of object, \ + default=None + Right argument of the returned kernel k(X, Y). If None, k(X, X) + is evaluated instead. + + eval_gradient : bool, default=False + Determines whether the gradient with respect to the log of the + kernel hyperparameter is computed. + + Returns + ------- + K : ndarray of shape (n_samples_X, n_samples_Y, n_kernels) + Kernel k(X, Y) + + K_gradient : ndarray of shape \ + (n_samples_X, n_samples_X, n_dims, n_kernels), optional + The gradient of the kernel k(X, X) with respect to the log of the + hyperparameter of the kernel. Only returned when `eval_gradient` + is True. + """ + if eval_gradient: + K = [] + K_grad = [] + for kernel in self.kernels: + K_single, K_grad_single = kernel(X, Y, eval_gradient) + K.append(K_single) + K_grad.append(K_grad_single[..., np.newaxis]) + return np.dstack(K), np.concatenate(K_grad, 3) + else: + return np.dstack([kernel(X, Y, eval_gradient) for kernel in self.kernels]) + + def __eq__(self, b): + if type(self) != type(b) or len(self.kernels) != len(b.kernels): + return False + return np.all( + [self.kernels[i] == b.kernels[i] for i in range(len(self.kernels))] + ) + + def is_stationary(self): + """Returns whether the kernel is stationary.""" + return np.all([kernel.is_stationary() for kernel in self.kernels]) + + @property + def requires_vector_input(self): + """Returns whether the kernel is defined on discrete structures.""" + return np.any([kernel.requires_vector_input for kernel in self.kernels]) + + def diag(self, X): + """Returns the diagonal of the kernel k(X, X). + + The result of this method is identical to `np.diag(self(X))`; however, + it can be evaluated more efficiently since only the diagonal is + evaluated. + + Parameters + ---------- + X : array-like of shape (n_samples_X, n_features) or list of object + Argument to the kernel. + + Returns + ------- + K_diag : ndarray of shape (n_samples_X, n_kernels) + Diagonal of kernel k(X, X) + """ + return np.vstack([kernel.diag(X) for kernel in self.kernels]).T + + +class KernelOperator(Kernel): + """Base class for all kernel operators. + + .. versionadded:: 0.18 + """ + + def __init__(self, k1, k2): + self.k1 = k1 + self.k2 = k2 + + def get_params(self, deep=True): + """Get parameters of this kernel. + + Parameters + ---------- + deep : bool, default=True + If True, will return the parameters for this estimator and + contained subobjects that are estimators. + + Returns + ------- + params : dict + Parameter names mapped to their values. 
+ """ + params = dict(k1=self.k1, k2=self.k2) + if deep: + deep_items = self.k1.get_params().items() + params.update(("k1__" + k, val) for k, val in deep_items) + deep_items = self.k2.get_params().items() + params.update(("k2__" + k, val) for k, val in deep_items) + + return params + + @property + def hyperparameters(self): + """Returns a list of all hyperparameter.""" + r = [ + Hyperparameter( + "k1__" + hyperparameter.name, + hyperparameter.value_type, + hyperparameter.bounds, + hyperparameter.n_elements, + ) + for hyperparameter in self.k1.hyperparameters + ] + + for hyperparameter in self.k2.hyperparameters: + r.append( + Hyperparameter( + "k2__" + hyperparameter.name, + hyperparameter.value_type, + hyperparameter.bounds, + hyperparameter.n_elements, + ) + ) + return r + + @property + def theta(self): + """Returns the (flattened, log-transformed) non-fixed hyperparameters. + + Note that theta are typically the log-transformed values of the + kernel's hyperparameters as this representation of the search space + is more amenable for hyperparameter search, as hyperparameters like + length-scales naturally live on a log-scale. + + Returns + ------- + theta : ndarray of shape (n_dims,) + The non-fixed, log-transformed hyperparameters of the kernel + """ + return np.append(self.k1.theta, self.k2.theta) + + @theta.setter + def theta(self, theta): + """Sets the (flattened, log-transformed) non-fixed hyperparameters. + + Parameters + ---------- + theta : ndarray of shape (n_dims,) + The non-fixed, log-transformed hyperparameters of the kernel + """ + k1_dims = self.k1.n_dims + self.k1.theta = theta[:k1_dims] + self.k2.theta = theta[k1_dims:] + + @property + def bounds(self): + """Returns the log-transformed bounds on the theta. + + Returns + ------- + bounds : ndarray of shape (n_dims, 2) + The log-transformed bounds on the kernel's hyperparameters theta + """ + if self.k1.bounds.size == 0: + return self.k2.bounds + if self.k2.bounds.size == 0: + return self.k1.bounds + return np.vstack((self.k1.bounds, self.k2.bounds)) + + def __eq__(self, b): + if type(self) != type(b): + return False + return (self.k1 == b.k1 and self.k2 == b.k2) or ( + self.k1 == b.k2 and self.k2 == b.k1 + ) + + def is_stationary(self): + """Returns whether the kernel is stationary.""" + return self.k1.is_stationary() and self.k2.is_stationary() + + @property + def requires_vector_input(self): + """Returns whether the kernel is stationary.""" + return self.k1.requires_vector_input or self.k2.requires_vector_input + + +class Sum(KernelOperator): + """The `Sum` kernel takes two kernels :math:`k_1` and :math:`k_2` + and combines them via + + .. math:: + k_{sum}(X, Y) = k_1(X, Y) + k_2(X, Y) + + Note that the `__add__` magic method is overridden, so + `Sum(RBF(), RBF())` is equivalent to using the + operator + with `RBF() + RBF()`. + + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.18 + + Parameters + ---------- + k1 : Kernel + The first base-kernel of the sum-kernel + + k2 : Kernel + The second base-kernel of the sum-kernel + + Examples + -------- + >>> from sklearn.datasets import make_friedman2 + >>> from sklearn.gaussian_process import GaussianProcessRegressor + >>> from sklearn.gaussian_process.kernels import RBF, Sum, ConstantKernel + >>> X, y = make_friedman2(n_samples=500, noise=0, random_state=0) + >>> kernel = Sum(ConstantKernel(2), RBF()) + >>> gpr = GaussianProcessRegressor(kernel=kernel, + ... 
random_state=0).fit(X, y) + >>> gpr.score(X, y) + 1.0 + >>> kernel + 1.41**2 + RBF(length_scale=1) + """ + + def __call__(self, X, Y=None, eval_gradient=False): + """Return the kernel k(X, Y) and optionally its gradient. + + Parameters + ---------- + X : array-like of shape (n_samples_X, n_features) or list of object + Left argument of the returned kernel k(X, Y) + + Y : array-like of shape (n_samples_X, n_features) or list of object,\ + default=None + Right argument of the returned kernel k(X, Y). If None, k(X, X) + is evaluated instead. + + eval_gradient : bool, default=False + Determines whether the gradient with respect to the log of + the kernel hyperparameter is computed. + + Returns + ------- + K : ndarray of shape (n_samples_X, n_samples_Y) + Kernel k(X, Y) + + K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims),\ + optional + The gradient of the kernel k(X, X) with respect to the log of the + hyperparameter of the kernel. Only returned when `eval_gradient` + is True. + """ + if eval_gradient: + K1, K1_gradient = self.k1(X, Y, eval_gradient=True) + K2, K2_gradient = self.k2(X, Y, eval_gradient=True) + return K1 + K2, np.dstack((K1_gradient, K2_gradient)) + else: + return self.k1(X, Y) + self.k2(X, Y) + + def diag(self, X): + """Returns the diagonal of the kernel k(X, X). + + The result of this method is identical to `np.diag(self(X))`; however, + it can be evaluated more efficiently since only the diagonal is + evaluated. + + Parameters + ---------- + X : array-like of shape (n_samples_X, n_features) or list of object + Argument to the kernel. + + Returns + ------- + K_diag : ndarray of shape (n_samples_X,) + Diagonal of kernel k(X, X) + """ + return self.k1.diag(X) + self.k2.diag(X) + + def __repr__(self): + return "{0} + {1}".format(self.k1, self.k2) + + +class Product(KernelOperator): + """The `Product` kernel takes two kernels :math:`k_1` and :math:`k_2` + and combines them via + + .. math:: + k_{prod}(X, Y) = k_1(X, Y) * k_2(X, Y) + + Note that the `__mul__` magic method is overridden, so + `Product(RBF(), RBF())` is equivalent to using the * operator + with `RBF() * RBF()`. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.18 + + Parameters + ---------- + k1 : Kernel + The first base-kernel of the product-kernel + + k2 : Kernel + The second base-kernel of the product-kernel + + + Examples + -------- + >>> from sklearn.datasets import make_friedman2 + >>> from sklearn.gaussian_process import GaussianProcessRegressor + >>> from sklearn.gaussian_process.kernels import (RBF, Product, + ... ConstantKernel) + >>> X, y = make_friedman2(n_samples=500, noise=0, random_state=0) + >>> kernel = Product(ConstantKernel(2), RBF()) + >>> gpr = GaussianProcessRegressor(kernel=kernel, + ... random_state=0).fit(X, y) + >>> gpr.score(X, y) + 1.0 + >>> kernel + 1.41**2 * RBF(length_scale=1) + """ + + def __call__(self, X, Y=None, eval_gradient=False): + """Return the kernel k(X, Y) and optionally its gradient. + + Parameters + ---------- + X : array-like of shape (n_samples_X, n_features) or list of object + Left argument of the returned kernel k(X, Y) + + Y : array-like of shape (n_samples_Y, n_features) or list of object,\ + default=None + Right argument of the returned kernel k(X, Y). If None, k(X, X) + is evaluated instead. + + eval_gradient : bool, default=False + Determines whether the gradient with respect to the log of + the kernel hyperparameter is computed. 
+ + Returns + ------- + K : ndarray of shape (n_samples_X, n_samples_Y) + Kernel k(X, Y) + + K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), \ + optional + The gradient of the kernel k(X, X) with respect to the log of the + hyperparameter of the kernel. Only returned when `eval_gradient` + is True. + """ + if eval_gradient: + K1, K1_gradient = self.k1(X, Y, eval_gradient=True) + K2, K2_gradient = self.k2(X, Y, eval_gradient=True) + return K1 * K2, np.dstack( + (K1_gradient * K2[:, :, np.newaxis], K2_gradient * K1[:, :, np.newaxis]) + ) + else: + return self.k1(X, Y) * self.k2(X, Y) + + def diag(self, X): + """Returns the diagonal of the kernel k(X, X). + + The result of this method is identical to np.diag(self(X)); however, + it can be evaluated more efficiently since only the diagonal is + evaluated. + + Parameters + ---------- + X : array-like of shape (n_samples_X, n_features) or list of object + Argument to the kernel. + + Returns + ------- + K_diag : ndarray of shape (n_samples_X,) + Diagonal of kernel k(X, X) + """ + return self.k1.diag(X) * self.k2.diag(X) + + def __repr__(self): + return "{0} * {1}".format(self.k1, self.k2) + + +class Exponentiation(Kernel): + """The Exponentiation kernel takes one base kernel and a scalar parameter + :math:`p` and combines them via + + .. math:: + k_{exp}(X, Y) = k(X, Y) ^p + + Note that the `__pow__` magic method is overridden, so + `Exponentiation(RBF(), 2)` is equivalent to using the ** operator + with `RBF() ** 2`. + + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.18 + + Parameters + ---------- + kernel : Kernel + The base kernel + + exponent : float + The exponent for the base kernel + + + Examples + -------- + >>> from sklearn.datasets import make_friedman2 + >>> from sklearn.gaussian_process import GaussianProcessRegressor + >>> from sklearn.gaussian_process.kernels import (RationalQuadratic, + ... Exponentiation) + >>> X, y = make_friedman2(n_samples=500, noise=0, random_state=0) + >>> kernel = Exponentiation(RationalQuadratic(), exponent=2) + >>> gpr = GaussianProcessRegressor(kernel=kernel, alpha=5, + ... random_state=0).fit(X, y) + >>> gpr.score(X, y) + 0.419 + >>> gpr.predict(X[:1,:], return_std=True) + (array([635.5]), array([0.559])) + """ + + def __init__(self, kernel, exponent): + self.kernel = kernel + self.exponent = exponent + + def get_params(self, deep=True): + """Get parameters of this kernel. + + Parameters + ---------- + deep : bool, default=True + If True, will return the parameters for this estimator and + contained subobjects that are estimators. + + Returns + ------- + params : dict + Parameter names mapped to their values. + """ + params = dict(kernel=self.kernel, exponent=self.exponent) + if deep: + deep_items = self.kernel.get_params().items() + params.update(("kernel__" + k, val) for k, val in deep_items) + return params + + @property + def hyperparameters(self): + """Returns a list of all hyperparameter.""" + r = [] + for hyperparameter in self.kernel.hyperparameters: + r.append( + Hyperparameter( + "kernel__" + hyperparameter.name, + hyperparameter.value_type, + hyperparameter.bounds, + hyperparameter.n_elements, + ) + ) + return r + + @property + def theta(self): + """Returns the (flattened, log-transformed) non-fixed hyperparameters. 
+ + Note that theta are typically the log-transformed values of the + kernel's hyperparameters as this representation of the search space + is more amenable for hyperparameter search, as hyperparameters like + length-scales naturally live on a log-scale. + + Returns + ------- + theta : ndarray of shape (n_dims,) + The non-fixed, log-transformed hyperparameters of the kernel + """ + return self.kernel.theta + + @theta.setter + def theta(self, theta): + """Sets the (flattened, log-transformed) non-fixed hyperparameters. + + Parameters + ---------- + theta : ndarray of shape (n_dims,) + The non-fixed, log-transformed hyperparameters of the kernel + """ + self.kernel.theta = theta + + @property + def bounds(self): + """Returns the log-transformed bounds on the theta. + + Returns + ------- + bounds : ndarray of shape (n_dims, 2) + The log-transformed bounds on the kernel's hyperparameters theta + """ + return self.kernel.bounds + + def __eq__(self, b): + if type(self) != type(b): + return False + return self.kernel == b.kernel and self.exponent == b.exponent + + def __call__(self, X, Y=None, eval_gradient=False): + """Return the kernel k(X, Y) and optionally its gradient. + + Parameters + ---------- + X : array-like of shape (n_samples_X, n_features) or list of object + Left argument of the returned kernel k(X, Y) + + Y : array-like of shape (n_samples_Y, n_features) or list of object,\ + default=None + Right argument of the returned kernel k(X, Y). If None, k(X, X) + is evaluated instead. + + eval_gradient : bool, default=False + Determines whether the gradient with respect to the log of + the kernel hyperparameter is computed. + + Returns + ------- + K : ndarray of shape (n_samples_X, n_samples_Y) + Kernel k(X, Y) + + K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims),\ + optional + The gradient of the kernel k(X, X) with respect to the log of the + hyperparameter of the kernel. Only returned when `eval_gradient` + is True. + """ + if eval_gradient: + K, K_gradient = self.kernel(X, Y, eval_gradient=True) + K_gradient *= self.exponent * K[:, :, np.newaxis] ** (self.exponent - 1) + return K**self.exponent, K_gradient + else: + K = self.kernel(X, Y, eval_gradient=False) + return K**self.exponent + + def diag(self, X): + """Returns the diagonal of the kernel k(X, X). + + The result of this method is identical to np.diag(self(X)); however, + it can be evaluated more efficiently since only the diagonal is + evaluated. + + Parameters + ---------- + X : array-like of shape (n_samples_X, n_features) or list of object + Argument to the kernel. + + Returns + ------- + K_diag : ndarray of shape (n_samples_X,) + Diagonal of kernel k(X, X) + """ + return self.kernel.diag(X) ** self.exponent + + def __repr__(self): + return "{0} ** {1}".format(self.kernel, self.exponent) + + def is_stationary(self): + """Returns whether the kernel is stationary.""" + return self.kernel.is_stationary() + + @property + def requires_vector_input(self): + """Returns whether the kernel is defined on discrete structures.""" + return self.kernel.requires_vector_input + + +class ConstantKernel(StationaryKernelMixin, GenericKernelMixin, Kernel): + """Constant kernel. + + Can be used as part of a product-kernel where it scales the magnitude of + the other factor (kernel) or as part of a sum-kernel, where it modifies + the mean of the Gaussian process. + + .. 
math:: + k(x_1, x_2) = constant\\_value \\;\\forall\\; x_1, x_2 + + Adding a constant kernel is equivalent to adding a constant:: + + kernel = RBF() + ConstantKernel(constant_value=2) + + is the same as:: + + kernel = RBF() + 2 + + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.18 + + Parameters + ---------- + constant_value : float, default=1.0 + The constant value which defines the covariance: + k(x_1, x_2) = constant_value + + constant_value_bounds : pair of floats >= 0 or "fixed", default=(1e-5, 1e5) + The lower and upper bound on `constant_value`. + If set to "fixed", `constant_value` cannot be changed during + hyperparameter tuning. + + Examples + -------- + >>> from sklearn.datasets import make_friedman2 + >>> from sklearn.gaussian_process import GaussianProcessRegressor + >>> from sklearn.gaussian_process.kernels import RBF, ConstantKernel + >>> X, y = make_friedman2(n_samples=500, noise=0, random_state=0) + >>> kernel = RBF() + ConstantKernel(constant_value=2) + >>> gpr = GaussianProcessRegressor(kernel=kernel, alpha=5, + ... random_state=0).fit(X, y) + >>> gpr.score(X, y) + 0.3696 + >>> gpr.predict(X[:1,:], return_std=True) + (array([606.1]), array([0.248])) + """ + + def __init__(self, constant_value=1.0, constant_value_bounds=(1e-5, 1e5)): + self.constant_value = constant_value + self.constant_value_bounds = constant_value_bounds + + @property + def hyperparameter_constant_value(self): + return Hyperparameter("constant_value", "numeric", self.constant_value_bounds) + + def __call__(self, X, Y=None, eval_gradient=False): + """Return the kernel k(X, Y) and optionally its gradient. + + Parameters + ---------- + X : array-like of shape (n_samples_X, n_features) or list of object + Left argument of the returned kernel k(X, Y) + + Y : array-like of shape (n_samples_X, n_features) or list of object, \ + default=None + Right argument of the returned kernel k(X, Y). If None, k(X, X) + is evaluated instead. + + eval_gradient : bool, default=False + Determines whether the gradient with respect to the log of + the kernel hyperparameter is computed. + Only supported when Y is None. + + Returns + ------- + K : ndarray of shape (n_samples_X, n_samples_Y) + Kernel k(X, Y) + + K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), \ + optional + The gradient of the kernel k(X, X) with respect to the log of the + hyperparameter of the kernel. Only returned when eval_gradient + is True. + """ + if Y is None: + Y = X + elif eval_gradient: + raise ValueError("Gradient can only be evaluated when Y is None.") + + K = np.full( + (_num_samples(X), _num_samples(Y)), + self.constant_value, + dtype=np.array(self.constant_value).dtype, + ) + if eval_gradient: + if not self.hyperparameter_constant_value.fixed: + return ( + K, + np.full( + (_num_samples(X), _num_samples(X), 1), + self.constant_value, + dtype=np.array(self.constant_value).dtype, + ), + ) + else: + return K, np.empty((_num_samples(X), _num_samples(X), 0)) + else: + return K + + def diag(self, X): + """Returns the diagonal of the kernel k(X, X). + + The result of this method is identical to np.diag(self(X)); however, + it can be evaluated more efficiently since only the diagonal is + evaluated. + + Parameters + ---------- + X : array-like of shape (n_samples_X, n_features) or list of object + Argument to the kernel. 
+ + Returns + ------- + K_diag : ndarray of shape (n_samples_X,) + Diagonal of kernel k(X, X) + """ + return np.full( + _num_samples(X), + self.constant_value, + dtype=np.array(self.constant_value).dtype, + ) + + def __repr__(self): + return "{0:.3g}**2".format(np.sqrt(self.constant_value)) + + +class WhiteKernel(StationaryKernelMixin, GenericKernelMixin, Kernel): + """White kernel. + + The main use-case of this kernel is as part of a sum-kernel where it + explains the noise of the signal as independently and identically + normally-distributed. The parameter noise_level equals the variance of this + noise. + + .. math:: + k(x_1, x_2) = noise\\_level \\text{ if } x_i == x_j \\text{ else } 0 + + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.18 + + Parameters + ---------- + noise_level : float, default=1.0 + Parameter controlling the noise level (variance) + + noise_level_bounds : pair of floats >= 0 or "fixed", default=(1e-5, 1e5) + The lower and upper bound on 'noise_level'. + If set to "fixed", 'noise_level' cannot be changed during + hyperparameter tuning. + + Examples + -------- + >>> from sklearn.datasets import make_friedman2 + >>> from sklearn.gaussian_process import GaussianProcessRegressor + >>> from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel + >>> X, y = make_friedman2(n_samples=500, noise=0, random_state=0) + >>> kernel = DotProduct() + WhiteKernel(noise_level=0.5) + >>> gpr = GaussianProcessRegressor(kernel=kernel, + ... random_state=0).fit(X, y) + >>> gpr.score(X, y) + 0.3680 + >>> gpr.predict(X[:2,:], return_std=True) + (array([653.0, 592.1 ]), array([316.6, 316.6])) + """ + + def __init__(self, noise_level=1.0, noise_level_bounds=(1e-5, 1e5)): + self.noise_level = noise_level + self.noise_level_bounds = noise_level_bounds + + @property + def hyperparameter_noise_level(self): + return Hyperparameter("noise_level", "numeric", self.noise_level_bounds) + + def __call__(self, X, Y=None, eval_gradient=False): + """Return the kernel k(X, Y) and optionally its gradient. + + Parameters + ---------- + X : array-like of shape (n_samples_X, n_features) or list of object + Left argument of the returned kernel k(X, Y) + + Y : array-like of shape (n_samples_X, n_features) or list of object,\ + default=None + Right argument of the returned kernel k(X, Y). If None, k(X, X) + is evaluated instead. + + eval_gradient : bool, default=False + Determines whether the gradient with respect to the log of + the kernel hyperparameter is computed. + Only supported when Y is None. + + Returns + ------- + K : ndarray of shape (n_samples_X, n_samples_Y) + Kernel k(X, Y) + + K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims),\ + optional + The gradient of the kernel k(X, X) with respect to the log of the + hyperparameter of the kernel. Only returned when eval_gradient + is True. + """ + if Y is not None and eval_gradient: + raise ValueError("Gradient can only be evaluated when Y is None.") + + if Y is None: + K = self.noise_level * np.eye(_num_samples(X)) + if eval_gradient: + if not self.hyperparameter_noise_level.fixed: + return ( + K, + self.noise_level * np.eye(_num_samples(X))[:, :, np.newaxis], + ) + else: + return K, np.empty((_num_samples(X), _num_samples(X), 0)) + else: + return K + else: + return np.zeros((_num_samples(X), _num_samples(Y))) + + def diag(self, X): + """Returns the diagonal of the kernel k(X, X). 
+ + The result of this method is identical to np.diag(self(X)); however, + it can be evaluated more efficiently since only the diagonal is + evaluated. + + Parameters + ---------- + X : array-like of shape (n_samples_X, n_features) or list of object + Argument to the kernel. + + Returns + ------- + K_diag : ndarray of shape (n_samples_X,) + Diagonal of kernel k(X, X) + """ + return np.full( + _num_samples(X), self.noise_level, dtype=np.array(self.noise_level).dtype + ) + + def __repr__(self): + return "{0}(noise_level={1:.3g})".format( + self.__class__.__name__, self.noise_level + ) + + +class RBF(StationaryKernelMixin, NormalizedKernelMixin, Kernel): + """Radial basis function kernel (aka squared-exponential kernel). + + The RBF kernel is a stationary kernel. It is also known as the + "squared exponential" kernel. It is parameterized by a length scale + parameter :math:`l>0`, which can either be a scalar (isotropic variant + of the kernel) or a vector with the same number of dimensions as the inputs + X (anisotropic variant of the kernel). The kernel is given by: + + .. math:: + k(x_i, x_j) = \\exp\\left(- \\frac{d(x_i, x_j)^2}{2l^2} \\right) + + where :math:`l` is the length scale of the kernel and + :math:`d(\\cdot,\\cdot)` is the Euclidean distance. + For advice on how to set the length scale parameter, see e.g. [1]_. + + This kernel is infinitely differentiable, which implies that GPs with this + kernel as covariance function have mean square derivatives of all orders, + and are thus very smooth. + See [2]_, Chapter 4, Section 4.2, for further details of the RBF kernel. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.18 + + Parameters + ---------- + length_scale : float or ndarray of shape (n_features,), default=1.0 + The length scale of the kernel. If a float, an isotropic kernel is + used. If an array, an anisotropic kernel is used where each dimension + of l defines the length-scale of the respective feature dimension. + + length_scale_bounds : pair of floats >= 0 or "fixed", default=(1e-5, 1e5) + The lower and upper bound on 'length_scale'. + If set to "fixed", 'length_scale' cannot be changed during + hyperparameter tuning. + + References + ---------- + .. [1] `David Duvenaud (2014). "The Kernel Cookbook: + Advice on Covariance functions". + `_ + + .. [2] `Carl Edward Rasmussen, Christopher K. I. Williams (2006). + "Gaussian Processes for Machine Learning". The MIT Press. + `_ + + Examples + -------- + >>> from sklearn.datasets import load_iris + >>> from sklearn.gaussian_process import GaussianProcessClassifier + >>> from sklearn.gaussian_process.kernels import RBF + >>> X, y = load_iris(return_X_y=True) + >>> kernel = 1.0 * RBF(1.0) + >>> gpc = GaussianProcessClassifier(kernel=kernel, + ... 
random_state=0).fit(X, y) + >>> gpc.score(X, y) + 0.9866 + >>> gpc.predict_proba(X[:2,:]) + array([[0.8354, 0.03228, 0.1322], + [0.7906, 0.0652, 0.1441]]) + """ + + def __init__(self, length_scale=1.0, length_scale_bounds=(1e-5, 1e5)): + self.length_scale = length_scale + self.length_scale_bounds = length_scale_bounds + + @property + def anisotropic(self): + return np.iterable(self.length_scale) and len(self.length_scale) > 1 + + @property + def hyperparameter_length_scale(self): + if self.anisotropic: + return Hyperparameter( + "length_scale", + "numeric", + self.length_scale_bounds, + len(self.length_scale), + ) + return Hyperparameter("length_scale", "numeric", self.length_scale_bounds) + + def __call__(self, X, Y=None, eval_gradient=False): + """Return the kernel k(X, Y) and optionally its gradient. + + Parameters + ---------- + X : ndarray of shape (n_samples_X, n_features) + Left argument of the returned kernel k(X, Y) + + Y : ndarray of shape (n_samples_Y, n_features), default=None + Right argument of the returned kernel k(X, Y). If None, k(X, X) + if evaluated instead. + + eval_gradient : bool, default=False + Determines whether the gradient with respect to the log of + the kernel hyperparameter is computed. + Only supported when Y is None. + + Returns + ------- + K : ndarray of shape (n_samples_X, n_samples_Y) + Kernel k(X, Y) + + K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), \ + optional + The gradient of the kernel k(X, X) with respect to the log of the + hyperparameter of the kernel. Only returned when `eval_gradient` + is True. + """ + X = np.atleast_2d(X) + length_scale = _check_length_scale(X, self.length_scale) + if Y is None: + dists = pdist(X / length_scale, metric="sqeuclidean") + K = np.exp(-0.5 * dists) + # convert from upper-triangular matrix to square matrix + K = squareform(K) + np.fill_diagonal(K, 1) + else: + if eval_gradient: + raise ValueError("Gradient can only be evaluated when Y is None.") + dists = cdist(X / length_scale, Y / length_scale, metric="sqeuclidean") + K = np.exp(-0.5 * dists) + + if eval_gradient: + if self.hyperparameter_length_scale.fixed: + # Hyperparameter l kept fixed + return K, np.empty((X.shape[0], X.shape[0], 0)) + elif not self.anisotropic or length_scale.shape[0] == 1: + K_gradient = (K * squareform(dists))[:, :, np.newaxis] + return K, K_gradient + elif self.anisotropic: + # We need to recompute the pairwise dimension-wise distances + K_gradient = (X[:, np.newaxis, :] - X[np.newaxis, :, :]) ** 2 / ( + length_scale**2 + ) + K_gradient *= K[..., np.newaxis] + return K, K_gradient + else: + return K + + def __repr__(self): + if self.anisotropic: + return "{0}(length_scale=[{1}])".format( + self.__class__.__name__, + ", ".join(map("{0:.3g}".format, self.length_scale)), + ) + else: # isotropic + return "{0}(length_scale={1:.3g})".format( + self.__class__.__name__, np.ravel(self.length_scale)[0] + ) + + +class Matern(RBF): + """Matern kernel. + + The class of Matern kernels is a generalization of the :class:`RBF`. + It has an additional parameter :math:`\\nu` which controls the + smoothness of the resulting function. The smaller :math:`\\nu`, + the less smooth the approximated function is. + As :math:`\\nu\\rightarrow\\infty`, the kernel becomes equivalent to + the :class:`RBF` kernel. When :math:`\\nu = 1/2`, the Matérn kernel + becomes identical to the absolute exponential kernel. 
+ Important intermediate values are + :math:`\\nu=1.5` (once differentiable functions) + and :math:`\\nu=2.5` (twice differentiable functions). + + The kernel is given by: + + .. math:: + k(x_i, x_j) = \\frac{1}{\\Gamma(\\nu)2^{\\nu-1}}\\Bigg( + \\frac{\\sqrt{2\\nu}}{l} d(x_i , x_j ) + \\Bigg)^\\nu K_\\nu\\Bigg( + \\frac{\\sqrt{2\\nu}}{l} d(x_i , x_j )\\Bigg) + + + + where :math:`d(\\cdot,\\cdot)` is the Euclidean distance, + :math:`K_{\\nu}(\\cdot)` is a modified Bessel function and + :math:`\\Gamma(\\cdot)` is the gamma function. + See [1]_, Chapter 4, Section 4.2, for details regarding the different + variants of the Matern kernel. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.18 + + Parameters + ---------- + length_scale : float or ndarray of shape (n_features,), default=1.0 + The length scale of the kernel. If a float, an isotropic kernel is + used. If an array, an anisotropic kernel is used where each dimension + of l defines the length-scale of the respective feature dimension. + + length_scale_bounds : pair of floats >= 0 or "fixed", default=(1e-5, 1e5) + The lower and upper bound on 'length_scale'. + If set to "fixed", 'length_scale' cannot be changed during + hyperparameter tuning. + + nu : float, default=1.5 + The parameter nu controlling the smoothness of the learned function. + The smaller nu, the less smooth the approximated function is. + For nu=inf, the kernel becomes equivalent to the RBF kernel and for + nu=0.5 to the absolute exponential kernel. Important intermediate + values are nu=1.5 (once differentiable functions) and nu=2.5 + (twice differentiable functions). Note that values of nu not in + [0.5, 1.5, 2.5, inf] incur a considerably higher computational cost + (appr. 10 times higher) since they require to evaluate the modified + Bessel function. Furthermore, in contrast to l, nu is kept fixed to + its initial value and not optimized. + + References + ---------- + .. [1] `Carl Edward Rasmussen, Christopher K. I. Williams (2006). + "Gaussian Processes for Machine Learning". The MIT Press. + `_ + + Examples + -------- + >>> from sklearn.datasets import load_iris + >>> from sklearn.gaussian_process import GaussianProcessClassifier + >>> from sklearn.gaussian_process.kernels import Matern + >>> X, y = load_iris(return_X_y=True) + >>> kernel = 1.0 * Matern(length_scale=1.0, nu=1.5) + >>> gpc = GaussianProcessClassifier(kernel=kernel, + ... random_state=0).fit(X, y) + >>> gpc.score(X, y) + 0.9866 + >>> gpc.predict_proba(X[:2,:]) + array([[0.8513, 0.0368, 0.1117], + [0.8086, 0.0693, 0.1220]]) + """ + + def __init__(self, length_scale=1.0, length_scale_bounds=(1e-5, 1e5), nu=1.5): + super().__init__(length_scale, length_scale_bounds) + self.nu = nu + + def __call__(self, X, Y=None, eval_gradient=False): + """Return the kernel k(X, Y) and optionally its gradient. + + Parameters + ---------- + X : ndarray of shape (n_samples_X, n_features) + Left argument of the returned kernel k(X, Y) + + Y : ndarray of shape (n_samples_Y, n_features), default=None + Right argument of the returned kernel k(X, Y). If None, k(X, X) + if evaluated instead. + + eval_gradient : bool, default=False + Determines whether the gradient with respect to the log of + the kernel hyperparameter is computed. + Only supported when Y is None. 
+ + Returns + ------- + K : ndarray of shape (n_samples_X, n_samples_Y) + Kernel k(X, Y) + + K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), \ + optional + The gradient of the kernel k(X, X) with respect to the log of the + hyperparameter of the kernel. Only returned when `eval_gradient` + is True. + """ + X = np.atleast_2d(X) + length_scale = _check_length_scale(X, self.length_scale) + if Y is None: + dists = pdist(X / length_scale, metric="euclidean") + else: + if eval_gradient: + raise ValueError("Gradient can only be evaluated when Y is None.") + dists = cdist(X / length_scale, Y / length_scale, metric="euclidean") + + if self.nu == 0.5: + K = np.exp(-dists) + elif self.nu == 1.5: + K = dists * math.sqrt(3) + K = (1.0 + K) * np.exp(-K) + elif self.nu == 2.5: + K = dists * math.sqrt(5) + K = (1.0 + K + K**2 / 3.0) * np.exp(-K) + elif self.nu == np.inf: + K = np.exp(-(dists**2) / 2.0) + else: # general case; expensive to evaluate + K = dists + K[K == 0.0] += np.finfo(float).eps # strict zeros result in nan + tmp = math.sqrt(2 * self.nu) * K + K.fill((2 ** (1.0 - self.nu)) / gamma(self.nu)) + K *= tmp**self.nu + K *= kv(self.nu, tmp) + + if Y is None: + # convert from upper-triangular matrix to square matrix + K = squareform(K) + np.fill_diagonal(K, 1) + + if eval_gradient: + if self.hyperparameter_length_scale.fixed: + # Hyperparameter l kept fixed + K_gradient = np.empty((X.shape[0], X.shape[0], 0)) + return K, K_gradient + + # We need to recompute the pairwise dimension-wise distances + if self.anisotropic: + D = (X[:, np.newaxis, :] - X[np.newaxis, :, :]) ** 2 / (length_scale**2) + else: + D = squareform(dists**2)[:, :, np.newaxis] + + if self.nu == 0.5: + denominator = np.sqrt(D.sum(axis=2))[:, :, np.newaxis] + divide_result = np.zeros_like(D) + np.divide( + D, + denominator, + out=divide_result, + where=denominator != 0, + ) + K_gradient = K[..., np.newaxis] * divide_result + elif self.nu == 1.5: + K_gradient = 3 * D * np.exp(-np.sqrt(3 * D.sum(-1)))[..., np.newaxis] + elif self.nu == 2.5: + tmp = np.sqrt(5 * D.sum(-1))[..., np.newaxis] + K_gradient = 5.0 / 3.0 * D * (tmp + 1) * np.exp(-tmp) + elif self.nu == np.inf: + K_gradient = D * K[..., np.newaxis] + else: + # approximate gradient numerically + def f(theta): # helper function + return self.clone_with_theta(theta)(X, Y) + + return K, _approx_fprime(self.theta, f, 1e-10) + + if not self.anisotropic: + return K, K_gradient[:, :].sum(-1)[:, :, np.newaxis] + else: + return K, K_gradient + else: + return K + + def __repr__(self): + if self.anisotropic: + return "{0}(length_scale=[{1}], nu={2:.3g})".format( + self.__class__.__name__, + ", ".join(map("{0:.3g}".format, self.length_scale)), + self.nu, + ) + else: + return "{0}(length_scale={1:.3g}, nu={2:.3g})".format( + self.__class__.__name__, np.ravel(self.length_scale)[0], self.nu + ) + + +class RationalQuadratic(StationaryKernelMixin, NormalizedKernelMixin, Kernel): + """Rational Quadratic kernel. + + The RationalQuadratic kernel can be seen as a scale mixture (an infinite + sum) of RBF kernels with different characteristic length scales. It is + parameterized by a length scale parameter :math:`l>0` and a scale + mixture parameter :math:`\\alpha>0`. Only the isotropic variant + where length_scale :math:`l` is a scalar is supported at the moment. + The kernel is given by: + + .. 
math:: + k(x_i, x_j) = \\left( + 1 + \\frac{d(x_i, x_j)^2 }{ 2\\alpha l^2}\\right)^{-\\alpha} + + where :math:`\\alpha` is the scale mixture parameter, :math:`l` is + the length scale of the kernel and :math:`d(\\cdot,\\cdot)` is the + Euclidean distance. + For advice on how to set the parameters, see e.g. [1]_. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.18 + + Parameters + ---------- + length_scale : float > 0, default=1.0 + The length scale of the kernel. + + alpha : float > 0, default=1.0 + Scale mixture parameter + + length_scale_bounds : pair of floats >= 0 or "fixed", default=(1e-5, 1e5) + The lower and upper bound on 'length_scale'. + If set to "fixed", 'length_scale' cannot be changed during + hyperparameter tuning. + + alpha_bounds : pair of floats >= 0 or "fixed", default=(1e-5, 1e5) + The lower and upper bound on 'alpha'. + If set to "fixed", 'alpha' cannot be changed during + hyperparameter tuning. + + References + ---------- + .. [1] `David Duvenaud (2014). "The Kernel Cookbook: + Advice on Covariance functions". + `_ + + Examples + -------- + >>> from sklearn.datasets import load_iris + >>> from sklearn.gaussian_process import GaussianProcessClassifier + >>> from sklearn.gaussian_process.kernels import RationalQuadratic + >>> X, y = load_iris(return_X_y=True) + >>> kernel = RationalQuadratic(length_scale=1.0, alpha=1.5) + >>> gpc = GaussianProcessClassifier(kernel=kernel, + ... random_state=0).fit(X, y) + >>> gpc.score(X, y) + 0.9733 + >>> gpc.predict_proba(X[:2,:]) + array([[0.8881, 0.0566, 0.05518], + [0.8678, 0.0707 , 0.0614]]) + """ + + def __init__( + self, + length_scale=1.0, + alpha=1.0, + length_scale_bounds=(1e-5, 1e5), + alpha_bounds=(1e-5, 1e5), + ): + self.length_scale = length_scale + self.alpha = alpha + self.length_scale_bounds = length_scale_bounds + self.alpha_bounds = alpha_bounds + + @property + def hyperparameter_length_scale(self): + return Hyperparameter("length_scale", "numeric", self.length_scale_bounds) + + @property + def hyperparameter_alpha(self): + return Hyperparameter("alpha", "numeric", self.alpha_bounds) + + def __call__(self, X, Y=None, eval_gradient=False): + """Return the kernel k(X, Y) and optionally its gradient. + + Parameters + ---------- + X : ndarray of shape (n_samples_X, n_features) + Left argument of the returned kernel k(X, Y) + + Y : ndarray of shape (n_samples_Y, n_features), default=None + Right argument of the returned kernel k(X, Y). If None, k(X, X) + if evaluated instead. + + eval_gradient : bool, default=False + Determines whether the gradient with respect to the log of + the kernel hyperparameter is computed. + Only supported when Y is None. + + Returns + ------- + K : ndarray of shape (n_samples_X, n_samples_Y) + Kernel k(X, Y) + + K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims) + The gradient of the kernel k(X, X) with respect to the log of the + hyperparameter of the kernel. Only returned when eval_gradient + is True. 
+ """ + if len(np.atleast_1d(self.length_scale)) > 1: + raise AttributeError( + "RationalQuadratic kernel only supports isotropic version, " + "please use a single scalar for length_scale" + ) + X = np.atleast_2d(X) + if Y is None: + dists = squareform(pdist(X, metric="sqeuclidean")) + tmp = dists / (2 * self.alpha * self.length_scale**2) + base = 1 + tmp + K = base**-self.alpha + np.fill_diagonal(K, 1) + else: + if eval_gradient: + raise ValueError("Gradient can only be evaluated when Y is None.") + dists = cdist(X, Y, metric="sqeuclidean") + K = (1 + dists / (2 * self.alpha * self.length_scale**2)) ** -self.alpha + + if eval_gradient: + # gradient with respect to length_scale + if not self.hyperparameter_length_scale.fixed: + length_scale_gradient = dists * K / (self.length_scale**2 * base) + length_scale_gradient = length_scale_gradient[:, :, np.newaxis] + else: # l is kept fixed + length_scale_gradient = np.empty((K.shape[0], K.shape[1], 0)) + + # gradient with respect to alpha + if not self.hyperparameter_alpha.fixed: + alpha_gradient = K * ( + -self.alpha * np.log(base) + + dists / (2 * self.length_scale**2 * base) + ) + alpha_gradient = alpha_gradient[:, :, np.newaxis] + else: # alpha is kept fixed + alpha_gradient = np.empty((K.shape[0], K.shape[1], 0)) + + return K, np.dstack((alpha_gradient, length_scale_gradient)) + else: + return K + + def __repr__(self): + return "{0}(alpha={1:.3g}, length_scale={2:.3g})".format( + self.__class__.__name__, self.alpha, self.length_scale + ) + + +class ExpSineSquared(StationaryKernelMixin, NormalizedKernelMixin, Kernel): + r"""Exp-Sine-Squared kernel (aka periodic kernel). + + The ExpSineSquared kernel allows one to model functions which repeat + themselves exactly. It is parameterized by a length scale + parameter :math:`l>0` and a periodicity parameter :math:`p>0`. + Only the isotropic variant where :math:`l` is a scalar is + supported at the moment. The kernel is given by: + + .. math:: + k(x_i, x_j) = \text{exp}\left(- + \frac{ 2\sin^2(\pi d(x_i, x_j)/p) }{ l^ 2} \right) + + where :math:`l` is the length scale of the kernel, :math:`p` the + periodicity of the kernel and :math:`d(\cdot,\cdot)` is the + Euclidean distance. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.18 + + Parameters + ---------- + + length_scale : float > 0, default=1.0 + The length scale of the kernel. + + periodicity : float > 0, default=1.0 + The periodicity of the kernel. + + length_scale_bounds : pair of floats >= 0 or "fixed", default=(1e-5, 1e5) + The lower and upper bound on 'length_scale'. + If set to "fixed", 'length_scale' cannot be changed during + hyperparameter tuning. + + periodicity_bounds : pair of floats >= 0 or "fixed", default=(1e-5, 1e5) + The lower and upper bound on 'periodicity'. + If set to "fixed", 'periodicity' cannot be changed during + hyperparameter tuning. + + Examples + -------- + >>> from sklearn.datasets import make_friedman2 + >>> from sklearn.gaussian_process import GaussianProcessRegressor + >>> from sklearn.gaussian_process.kernels import ExpSineSquared + >>> X, y = make_friedman2(n_samples=50, noise=0, random_state=0) + >>> kernel = ExpSineSquared(length_scale=1, periodicity=1) + >>> gpr = GaussianProcessRegressor(kernel=kernel, alpha=5, + ... 
random_state=0).fit(X, y) + >>> gpr.score(X, y) + 0.0144 + >>> gpr.predict(X[:2,:], return_std=True) + (array([425.6, 457.5]), array([0.3894, 0.3467])) + """ + + def __init__( + self, + length_scale=1.0, + periodicity=1.0, + length_scale_bounds=(1e-5, 1e5), + periodicity_bounds=(1e-5, 1e5), + ): + self.length_scale = length_scale + self.periodicity = periodicity + self.length_scale_bounds = length_scale_bounds + self.periodicity_bounds = periodicity_bounds + + @property + def hyperparameter_length_scale(self): + """Returns the length scale""" + return Hyperparameter("length_scale", "numeric", self.length_scale_bounds) + + @property + def hyperparameter_periodicity(self): + return Hyperparameter("periodicity", "numeric", self.periodicity_bounds) + + def __call__(self, X, Y=None, eval_gradient=False): + """Return the kernel k(X, Y) and optionally its gradient. + + Parameters + ---------- + X : ndarray of shape (n_samples_X, n_features) + Left argument of the returned kernel k(X, Y) + + Y : ndarray of shape (n_samples_Y, n_features), default=None + Right argument of the returned kernel k(X, Y). If None, k(X, X) + if evaluated instead. + + eval_gradient : bool, default=False + Determines whether the gradient with respect to the log of + the kernel hyperparameter is computed. + Only supported when Y is None. + + Returns + ------- + K : ndarray of shape (n_samples_X, n_samples_Y) + Kernel k(X, Y) + + K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), \ + optional + The gradient of the kernel k(X, X) with respect to the log of the + hyperparameter of the kernel. Only returned when `eval_gradient` + is True. + """ + X = np.atleast_2d(X) + if Y is None: + dists = squareform(pdist(X, metric="euclidean")) + arg = np.pi * dists / self.periodicity + sin_of_arg = np.sin(arg) + K = np.exp(-2 * (sin_of_arg / self.length_scale) ** 2) + else: + if eval_gradient: + raise ValueError("Gradient can only be evaluated when Y is None.") + dists = cdist(X, Y, metric="euclidean") + K = np.exp( + -2 * (np.sin(np.pi / self.periodicity * dists) / self.length_scale) ** 2 + ) + + if eval_gradient: + cos_of_arg = np.cos(arg) + # gradient with respect to length_scale + if not self.hyperparameter_length_scale.fixed: + length_scale_gradient = 4 / self.length_scale**2 * sin_of_arg**2 * K + length_scale_gradient = length_scale_gradient[:, :, np.newaxis] + else: # length_scale is kept fixed + length_scale_gradient = np.empty((K.shape[0], K.shape[1], 0)) + # gradient with respect to p + if not self.hyperparameter_periodicity.fixed: + periodicity_gradient = ( + 4 * arg / self.length_scale**2 * cos_of_arg * sin_of_arg * K + ) + periodicity_gradient = periodicity_gradient[:, :, np.newaxis] + else: # p is kept fixed + periodicity_gradient = np.empty((K.shape[0], K.shape[1], 0)) + + return K, np.dstack((length_scale_gradient, periodicity_gradient)) + else: + return K + + def __repr__(self): + return "{0}(length_scale={1:.3g}, periodicity={2:.3g})".format( + self.__class__.__name__, self.length_scale, self.periodicity + ) + + +class DotProduct(Kernel): + r"""Dot-Product kernel. + + The DotProduct kernel is non-stationary and can be obtained from linear + regression by putting :math:`N(0, 1)` priors on the coefficients + of :math:`x_d (d = 1, . . . , D)` and a prior of :math:`N(0, \sigma_0^2)` + on the bias. The DotProduct kernel is invariant to a rotation of + the coordinates about the origin, but not translations. 
+ It is parameterized by a parameter sigma_0 :math:`\sigma` + which controls the inhomogenity of the kernel. For :math:`\sigma_0^2 =0`, + the kernel is called the homogeneous linear kernel, otherwise + it is inhomogeneous. The kernel is given by + + .. math:: + k(x_i, x_j) = \sigma_0 ^ 2 + x_i \cdot x_j + + The DotProduct kernel is commonly combined with exponentiation. + + See [1]_, Chapter 4, Section 4.2, for further details regarding the + DotProduct kernel. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.18 + + Parameters + ---------- + sigma_0 : float >= 0, default=1.0 + Parameter controlling the inhomogenity of the kernel. If sigma_0=0, + the kernel is homogeneous. + + sigma_0_bounds : pair of floats >= 0 or "fixed", default=(1e-5, 1e5) + The lower and upper bound on 'sigma_0'. + If set to "fixed", 'sigma_0' cannot be changed during + hyperparameter tuning. + + References + ---------- + .. [1] `Carl Edward Rasmussen, Christopher K. I. Williams (2006). + "Gaussian Processes for Machine Learning". The MIT Press. + `_ + + Examples + -------- + >>> from sklearn.datasets import make_friedman2 + >>> from sklearn.gaussian_process import GaussianProcessRegressor + >>> from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel + >>> X, y = make_friedman2(n_samples=500, noise=0, random_state=0) + >>> kernel = DotProduct() + WhiteKernel() + >>> gpr = GaussianProcessRegressor(kernel=kernel, + ... random_state=0).fit(X, y) + >>> gpr.score(X, y) + 0.3680 + >>> gpr.predict(X[:2,:], return_std=True) + (array([653.0, 592.1]), array([316.6, 316.6])) + """ + + def __init__(self, sigma_0=1.0, sigma_0_bounds=(1e-5, 1e5)): + self.sigma_0 = sigma_0 + self.sigma_0_bounds = sigma_0_bounds + + @property + def hyperparameter_sigma_0(self): + return Hyperparameter("sigma_0", "numeric", self.sigma_0_bounds) + + def __call__(self, X, Y=None, eval_gradient=False): + """Return the kernel k(X, Y) and optionally its gradient. + + Parameters + ---------- + X : ndarray of shape (n_samples_X, n_features) + Left argument of the returned kernel k(X, Y) + + Y : ndarray of shape (n_samples_Y, n_features), default=None + Right argument of the returned kernel k(X, Y). If None, k(X, X) + if evaluated instead. + + eval_gradient : bool, default=False + Determines whether the gradient with respect to the log of + the kernel hyperparameter is computed. + Only supported when Y is None. + + Returns + ------- + K : ndarray of shape (n_samples_X, n_samples_Y) + Kernel k(X, Y) + + K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims),\ + optional + The gradient of the kernel k(X, X) with respect to the log of the + hyperparameter of the kernel. Only returned when `eval_gradient` + is True. + """ + X = np.atleast_2d(X) + if Y is None: + K = np.inner(X, X) + self.sigma_0**2 + else: + if eval_gradient: + raise ValueError("Gradient can only be evaluated when Y is None.") + K = np.inner(X, Y) + self.sigma_0**2 + + if eval_gradient: + if not self.hyperparameter_sigma_0.fixed: + K_gradient = np.empty((K.shape[0], K.shape[1], 1)) + K_gradient[..., 0] = 2 * self.sigma_0**2 + return K, K_gradient + else: + return K, np.empty((X.shape[0], X.shape[0], 0)) + else: + return K + + def diag(self, X): + """Returns the diagonal of the kernel k(X, X). + + The result of this method is identical to np.diag(self(X)); however, + it can be evaluated more efficiently since only the diagonal is + evaluated. 
+ + Parameters + ---------- + X : ndarray of shape (n_samples_X, n_features) + Left argument of the returned kernel k(X, Y). + + Returns + ------- + K_diag : ndarray of shape (n_samples_X,) + Diagonal of kernel k(X, X). + """ + return np.einsum("ij,ij->i", X, X) + self.sigma_0**2 + + def is_stationary(self): + """Returns whether the kernel is stationary.""" + return False + + def __repr__(self): + return "{0}(sigma_0={1:.3g})".format(self.__class__.__name__, self.sigma_0) + + +# adapted from scipy/optimize/optimize.py for functions with 2d output +def _approx_fprime(xk, f, epsilon, args=()): + f0 = f(*((xk,) + args)) + grad = np.zeros((f0.shape[0], f0.shape[1], len(xk)), float) + ei = np.zeros((len(xk),), float) + for k in range(len(xk)): + ei[k] = 1.0 + d = epsilon * ei + grad[:, :, k] = (f(*((xk + d,) + args)) - f0) / d[k] + ei[k] = 0.0 + return grad + + +class PairwiseKernel(Kernel): + """Wrapper for kernels in sklearn.metrics.pairwise. + + A thin wrapper around the functionality of the kernels in + sklearn.metrics.pairwise. + + Note: Evaluation of eval_gradient is not analytic but numeric and all + kernels support only isotropic distances. The parameter gamma is + considered to be a hyperparameter and may be optimized. The other + kernel parameters are set directly at initialization and are kept + fixed. + + .. versionadded:: 0.18 + + Parameters + ---------- + gamma : float, default=1.0 + Parameter gamma of the pairwise kernel specified by metric. It should + be positive. + + gamma_bounds : pair of floats >= 0 or "fixed", default=(1e-5, 1e5) + The lower and upper bound on 'gamma'. + If set to "fixed", 'gamma' cannot be changed during + hyperparameter tuning. + + metric : {"linear", "additive_chi2", "chi2", "poly", "polynomial", \ + "rbf", "laplacian", "sigmoid", "cosine"} or callable, \ + default="linear" + The metric to use when calculating kernel between instances in a + feature array. If metric is a string, it must be one of the metrics + in pairwise.PAIRWISE_KERNEL_FUNCTIONS. + If metric is "precomputed", X is assumed to be a kernel matrix. + Alternatively, if metric is a callable function, it is called on each + pair of instances (rows) and the resulting value recorded. The callable + should take two arrays from X as input and return a value indicating + the distance between them. + + pairwise_kernels_kwargs : dict, default=None + All entries of this dict (if any) are passed as keyword arguments to + the pairwise kernel function. + + Examples + -------- + >>> from sklearn.datasets import load_iris + >>> from sklearn.gaussian_process import GaussianProcessClassifier + >>> from sklearn.gaussian_process.kernels import PairwiseKernel + >>> X, y = load_iris(return_X_y=True) + >>> kernel = PairwiseKernel(metric='rbf') + >>> gpc = GaussianProcessClassifier(kernel=kernel, + ... random_state=0).fit(X, y) + >>> gpc.score(X, y) + 0.9733 + >>> gpc.predict_proba(X[:2,:]) + array([[0.8880, 0.05663, 0.05532], + [0.8676, 0.07073, 0.06165]]) + """ + + def __init__( + self, + gamma=1.0, + gamma_bounds=(1e-5, 1e5), + metric="linear", + pairwise_kernels_kwargs=None, + ): + self.gamma = gamma + self.gamma_bounds = gamma_bounds + self.metric = metric + self.pairwise_kernels_kwargs = pairwise_kernels_kwargs + + @property + def hyperparameter_gamma(self): + return Hyperparameter("gamma", "numeric", self.gamma_bounds) + + def __call__(self, X, Y=None, eval_gradient=False): + """Return the kernel k(X, Y) and optionally its gradient. 
+ + Parameters + ---------- + X : ndarray of shape (n_samples_X, n_features) + Left argument of the returned kernel k(X, Y) + + Y : ndarray of shape (n_samples_Y, n_features), default=None + Right argument of the returned kernel k(X, Y). If None, k(X, X) + if evaluated instead. + + eval_gradient : bool, default=False + Determines whether the gradient with respect to the log of + the kernel hyperparameter is computed. + Only supported when Y is None. + + Returns + ------- + K : ndarray of shape (n_samples_X, n_samples_Y) + Kernel k(X, Y) + + K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims),\ + optional + The gradient of the kernel k(X, X) with respect to the log of the + hyperparameter of the kernel. Only returned when `eval_gradient` + is True. + """ + pairwise_kernels_kwargs = self.pairwise_kernels_kwargs + if self.pairwise_kernels_kwargs is None: + pairwise_kernels_kwargs = {} + + X = np.atleast_2d(X) + K = pairwise_kernels( + X, + Y, + metric=self.metric, + gamma=self.gamma, + filter_params=True, + **pairwise_kernels_kwargs, + ) + if eval_gradient: + if self.hyperparameter_gamma.fixed: + return K, np.empty((X.shape[0], X.shape[0], 0)) + else: + # approximate gradient numerically + def f(gamma): # helper function + return pairwise_kernels( + X, + Y, + metric=self.metric, + gamma=np.exp(gamma), + filter_params=True, + **pairwise_kernels_kwargs, + ) + + return K, _approx_fprime(self.theta, f, 1e-10) + else: + return K + + def diag(self, X): + """Returns the diagonal of the kernel k(X, X). + + The result of this method is identical to np.diag(self(X)); however, + it can be evaluated more efficiently since only the diagonal is + evaluated. + + Parameters + ---------- + X : ndarray of shape (n_samples_X, n_features) + Left argument of the returned kernel k(X, Y) + + Returns + ------- + K_diag : ndarray of shape (n_samples_X,) + Diagonal of kernel k(X, X) + """ + # We have to fall back to slow way of computing diagonal + return np.apply_along_axis(self, 1, X).ravel() + + def is_stationary(self): + """Returns whether the kernel is stationary.""" + return self.metric in ["rbf"] + + def __repr__(self): + return "{0}(gamma={1}, metric={2})".format( + self.__class__.__name__, self.gamma, self.metric + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/tests/_mini_sequence_kernel.py b/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/tests/_mini_sequence_kernel.py new file mode 100644 index 0000000000000000000000000000000000000000..4667329aff9b8dbeffa90bb0c40c98a708fcc205 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/tests/_mini_sequence_kernel.py @@ -0,0 +1,54 @@ +import numpy as np + +from sklearn.base import clone +from sklearn.gaussian_process.kernels import ( + GenericKernelMixin, + Hyperparameter, + Kernel, + StationaryKernelMixin, +) + + +class MiniSeqKernel(GenericKernelMixin, StationaryKernelMixin, Kernel): + """ + A minimal (but valid) convolutional kernel for sequences of variable + length. 
+ """ + + def __init__(self, baseline_similarity=0.5, baseline_similarity_bounds=(1e-5, 1)): + self.baseline_similarity = baseline_similarity + self.baseline_similarity_bounds = baseline_similarity_bounds + + @property + def hyperparameter_baseline_similarity(self): + return Hyperparameter( + "baseline_similarity", "numeric", self.baseline_similarity_bounds + ) + + def _f(self, s1, s2): + return sum( + [1.0 if c1 == c2 else self.baseline_similarity for c1 in s1 for c2 in s2] + ) + + def _g(self, s1, s2): + return sum([0.0 if c1 == c2 else 1.0 for c1 in s1 for c2 in s2]) + + def __call__(self, X, Y=None, eval_gradient=False): + if Y is None: + Y = X + + if eval_gradient: + return ( + np.array([[self._f(x, y) for y in Y] for x in X]), + np.array([[[self._g(x, y)] for y in Y] for x in X]), + ) + else: + return np.array([[self._f(x, y) for y in Y] for x in X]) + + def diag(self, X): + return np.array([self._f(x, x) for x in X]) + + def clone_with_theta(self, theta): + cloned = clone(self) + cloned.theta = theta + return cloned diff --git a/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/tests/test_gpc.py b/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/tests/test_gpc.py new file mode 100644 index 0000000000000000000000000000000000000000..365b8f5a114417fdd2ab9979341ba95489c2b1d2 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/tests/test_gpc.py @@ -0,0 +1,320 @@ +"""Testing for Gaussian process classification""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings + +import numpy as np +import pytest +from scipy.optimize import approx_fprime + +from sklearn.exceptions import ConvergenceWarning +from sklearn.gaussian_process import GaussianProcessClassifier +from sklearn.gaussian_process.kernels import ( + RBF, + CompoundKernel, + WhiteKernel, +) +from sklearn.gaussian_process.kernels import ( + ConstantKernel as C, +) +from sklearn.gaussian_process.tests._mini_sequence_kernel import MiniSeqKernel +from sklearn.utils._testing import assert_almost_equal, assert_array_equal + + +def f(x): + return np.sin(x) + + +X = np.atleast_2d(np.linspace(0, 10, 30)).T +X2 = np.atleast_2d([2.0, 4.0, 5.5, 6.5, 7.5]).T +y = np.array(f(X).ravel() > 0, dtype=int) +fX = f(X).ravel() +y_mc = np.empty(y.shape, dtype=int) # multi-class +y_mc[fX < -0.35] = 0 +y_mc[(fX >= -0.35) & (fX < 0.35)] = 1 +y_mc[fX > 0.35] = 2 + + +fixed_kernel = RBF(length_scale=1.0, length_scale_bounds="fixed") +kernels = [ + RBF(length_scale=0.1), + fixed_kernel, + RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3)), + C(1.0, (1e-2, 1e2)) * RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3)), +] +non_fixed_kernels = [kernel for kernel in kernels if kernel != fixed_kernel] + + +@pytest.mark.parametrize("kernel", kernels) +def test_predict_consistent(kernel): + # Check binary predict decision has also predicted probability above 0.5. + gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y) + assert_array_equal(gpc.predict(X), gpc.predict_proba(X)[:, 1] >= 0.5) + + +def test_predict_consistent_structured(): + # Check binary predict decision has also predicted probability above 0.5. 
+ X = ["A", "AB", "B"] + y = np.array([True, False, True]) + kernel = MiniSeqKernel(baseline_similarity_bounds="fixed") + gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y) + assert_array_equal(gpc.predict(X), gpc.predict_proba(X)[:, 1] >= 0.5) + + +@pytest.mark.parametrize("kernel", non_fixed_kernels) +def test_lml_improving(kernel): + # Test that hyperparameter-tuning improves log-marginal likelihood. + gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y) + assert gpc.log_marginal_likelihood(gpc.kernel_.theta) > gpc.log_marginal_likelihood( + kernel.theta + ) + + +@pytest.mark.parametrize("kernel", kernels) +def test_lml_precomputed(kernel): + # Test that lml of optimized kernel is stored correctly. + gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y) + assert_almost_equal( + gpc.log_marginal_likelihood(gpc.kernel_.theta), gpc.log_marginal_likelihood(), 7 + ) + + +@pytest.mark.parametrize("kernel", kernels) +def test_lml_without_cloning_kernel(kernel): + # Test that clone_kernel=False has side-effects of kernel.theta. + gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y) + input_theta = np.ones(gpc.kernel_.theta.shape, dtype=np.float64) + + gpc.log_marginal_likelihood(input_theta, clone_kernel=False) + assert_almost_equal(gpc.kernel_.theta, input_theta, 7) + + +@pytest.mark.parametrize("kernel", non_fixed_kernels) +def test_converged_to_local_maximum(kernel): + # Test that we are in local maximum after hyperparameter-optimization. + gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y) + + lml, lml_gradient = gpc.log_marginal_likelihood(gpc.kernel_.theta, True) + + assert np.all( + (np.abs(lml_gradient) < 1e-4) + | (gpc.kernel_.theta == gpc.kernel_.bounds[:, 0]) + | (gpc.kernel_.theta == gpc.kernel_.bounds[:, 1]) + ) + + +@pytest.mark.parametrize("kernel", kernels) +def test_lml_gradient(kernel): + # Compare analytic and numeric gradient of log marginal likelihood. + gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y) + + lml, lml_gradient = gpc.log_marginal_likelihood(kernel.theta, True) + lml_gradient_approx = approx_fprime( + kernel.theta, lambda theta: gpc.log_marginal_likelihood(theta, False), 1e-10 + ) + + assert_almost_equal(lml_gradient, lml_gradient_approx, 3) + + +def test_random_starts(global_random_seed): + # Test that an increasing number of random-starts of GP fitting only + # increases the log marginal likelihood of the chosen theta. + n_samples, n_features = 25, 2 + rng = np.random.RandomState(global_random_seed) + X = rng.randn(n_samples, n_features) * 2 - 1 + y = (np.sin(X).sum(axis=1) + np.sin(3 * X).sum(axis=1)) > 0 + + kernel = C(1.0, (1e-2, 1e2)) * RBF( + length_scale=[1e-3] * n_features, length_scale_bounds=[(1e-4, 1e2)] * n_features + ) + last_lml = -np.inf + for n_restarts_optimizer in range(5): + gp = GaussianProcessClassifier( + kernel=kernel, + n_restarts_optimizer=n_restarts_optimizer, + random_state=global_random_seed, + ).fit(X, y) + lml = gp.log_marginal_likelihood(gp.kernel_.theta) + assert lml > last_lml - np.finfo(np.float32).eps + last_lml = lml + + +@pytest.mark.parametrize("kernel", non_fixed_kernels) +def test_custom_optimizer(kernel, global_random_seed): + # Test that GPC can use externally defined optimizers. 
+ # Define a dummy optimizer that simply tests 10 random hyperparameters + def optimizer(obj_func, initial_theta, bounds): + rng = np.random.RandomState(global_random_seed) + theta_opt, func_min = ( + initial_theta, + obj_func(initial_theta, eval_gradient=False), + ) + for _ in range(10): + theta = np.atleast_1d( + rng.uniform(np.maximum(-2, bounds[:, 0]), np.minimum(1, bounds[:, 1])) + ) + f = obj_func(theta, eval_gradient=False) + if f < func_min: + theta_opt, func_min = theta, f + return theta_opt, func_min + + gpc = GaussianProcessClassifier(kernel=kernel, optimizer=optimizer) + gpc.fit(X, y_mc) + # Checks that optimizer improved marginal likelihood + assert gpc.log_marginal_likelihood( + gpc.kernel_.theta + ) >= gpc.log_marginal_likelihood(kernel.theta) + + +@pytest.mark.parametrize("kernel", kernels) +def test_multi_class(kernel): + # Test GPC for multi-class classification problems. + gpc = GaussianProcessClassifier(kernel=kernel) + gpc.fit(X, y_mc) + + y_prob = gpc.predict_proba(X2) + assert_almost_equal(y_prob.sum(1), 1) + + y_pred = gpc.predict(X2) + assert_array_equal(np.argmax(y_prob, 1), y_pred) + + +@pytest.mark.parametrize("kernel", kernels) +def test_multi_class_n_jobs(kernel): + # Test that multi-class GPC produces identical results with n_jobs>1. + gpc = GaussianProcessClassifier(kernel=kernel) + gpc.fit(X, y_mc) + + gpc_2 = GaussianProcessClassifier(kernel=kernel, n_jobs=2) + gpc_2.fit(X, y_mc) + + y_prob = gpc.predict_proba(X2) + y_prob_2 = gpc_2.predict_proba(X2) + assert_almost_equal(y_prob, y_prob_2) + + +def test_warning_bounds(): + kernel = RBF(length_scale_bounds=[1e-5, 1e-3]) + gpc = GaussianProcessClassifier(kernel=kernel) + warning_message = ( + "The optimal value found for dimension 0 of parameter " + "length_scale is close to the specified upper bound " + "0.001. Increasing the bound and calling fit again may " + "find a better value." + ) + with pytest.warns(ConvergenceWarning, match=warning_message): + gpc.fit(X, y) + + kernel_sum = WhiteKernel(noise_level_bounds=[1e-5, 1e-3]) + RBF( + length_scale_bounds=[1e3, 1e5] + ) + gpc_sum = GaussianProcessClassifier(kernel=kernel_sum) + with warnings.catch_warnings(record=True) as record: + warnings.simplefilter("always") + gpc_sum.fit(X, y) + + assert len(record) == 2 + + assert issubclass(record[0].category, ConvergenceWarning) + assert ( + record[0].message.args[0] == "The optimal value found for " + "dimension 0 of parameter " + "k1__noise_level is close to the " + "specified upper bound 0.001. " + "Increasing the bound and calling " + "fit again may find a better value." + ) + + assert issubclass(record[1].category, ConvergenceWarning) + assert ( + record[1].message.args[0] == "The optimal value found for " + "dimension 0 of parameter " + "k2__length_scale is close to the " + "specified lower bound 1000.0. " + "Decreasing the bound and calling " + "fit again may find a better value." + ) + + X_tile = np.tile(X, 2) + kernel_dims = RBF(length_scale=[1.0, 2.0], length_scale_bounds=[1e1, 1e2]) + gpc_dims = GaussianProcessClassifier(kernel=kernel_dims) + + with warnings.catch_warnings(record=True) as record: + warnings.simplefilter("always") + gpc_dims.fit(X_tile, y) + + assert len(record) == 2 + + assert issubclass(record[0].category, ConvergenceWarning) + assert ( + record[0].message.args[0] == "The optimal value found for " + "dimension 0 of parameter " + "length_scale is close to the " + "specified upper bound 100.0. " + "Increasing the bound and calling " + "fit again may find a better value." 
+ ) + + assert issubclass(record[1].category, ConvergenceWarning) + assert ( + record[1].message.args[0] == "The optimal value found for " + "dimension 1 of parameter " + "length_scale is close to the " + "specified upper bound 100.0. " + "Increasing the bound and calling " + "fit again may find a better value." + ) + + +@pytest.mark.parametrize( + "params, error_type, err_msg", + [ + ( + {"kernel": CompoundKernel(0)}, + ValueError, + "kernel cannot be a CompoundKernel", + ) + ], +) +def test_gpc_fit_error(params, error_type, err_msg): + """Check that expected error are raised during fit.""" + gpc = GaussianProcessClassifier(**params) + with pytest.raises(error_type, match=err_msg): + gpc.fit(X, y) + + +@pytest.mark.parametrize("kernel", kernels) +def test_gpc_latent_mean_and_variance_shape(kernel): + """Checks that the latent mean and variance have the right shape.""" + gpc = GaussianProcessClassifier(kernel=kernel) + gpc.fit(X, y) + + # Check that the latent mean and variance have the right shape + latent_mean, latent_variance = gpc.latent_mean_and_variance(X) + assert latent_mean.shape == (X.shape[0],) + assert latent_variance.shape == (X.shape[0],) + + +def test_gpc_latent_mean_and_variance_complain_on_more_than_2_classes(): + """Checks that the latent mean and variance have the right shape.""" + gpc = GaussianProcessClassifier(kernel=RBF()) + gpc.fit(X, y_mc) + + # Check that the latent mean and variance have the right shape + with pytest.raises( + ValueError, + match="Returning the mean and variance of the latent function f " + "is only supported for binary classification", + ): + gpc.latent_mean_and_variance(X) + + +def test_latent_mean_and_variance_works_on_structured_kernels(): + X = ["A", "AB", "B"] + y = np.array([True, False, True]) + kernel = MiniSeqKernel(baseline_similarity_bounds="fixed") + gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y) + + gpc.latent_mean_and_variance(X) diff --git a/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/tests/test_gpr.py b/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/tests/test_gpr.py new file mode 100644 index 0000000000000000000000000000000000000000..f43cc3613b3ff7669aba9b73526fd774bfd8452e --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/tests/test_gpr.py @@ -0,0 +1,849 @@ +"""Testing for Gaussian process regression""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import re +import sys +import warnings + +import numpy as np +import pytest +from scipy.optimize import approx_fprime + +from sklearn.exceptions import ConvergenceWarning +from sklearn.gaussian_process import GaussianProcessRegressor +from sklearn.gaussian_process.kernels import ( + RBF, + DotProduct, + ExpSineSquared, + WhiteKernel, +) +from sklearn.gaussian_process.kernels import ( + ConstantKernel as C, +) +from sklearn.gaussian_process.tests._mini_sequence_kernel import MiniSeqKernel +from sklearn.utils._testing import ( + assert_allclose, + assert_almost_equal, + assert_array_almost_equal, + assert_array_less, +) + + +def f(x): + return x * np.sin(x) + + +X = np.atleast_2d([1.0, 3.0, 5.0, 6.0, 7.0, 8.0]).T +X2 = np.atleast_2d([2.0, 4.0, 5.5, 6.5, 7.5]).T +y = f(X).ravel() + +fixed_kernel = RBF(length_scale=1.0, length_scale_bounds="fixed") +kernels = [ + RBF(length_scale=1.0), + fixed_kernel, + RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3)), + C(1.0, (1e-2, 1e2)) * RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3)), + C(1.0, (1e-2, 1e2)) * 
RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3)) + + C(1e-5, (1e-5, 1e2)), + C(0.1, (1e-2, 1e2)) * RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3)) + + C(1e-5, (1e-5, 1e2)), +] +non_fixed_kernels = [kernel for kernel in kernels if kernel != fixed_kernel] + + +@pytest.mark.parametrize("kernel", kernels) +def test_gpr_interpolation(kernel): + if sys.maxsize <= 2**32: + pytest.xfail("This test may fail on 32 bit Python") + + # Test the interpolating property for different kernels. + gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) + y_pred, y_cov = gpr.predict(X, return_cov=True) + + assert_almost_equal(y_pred, y) + assert_almost_equal(np.diag(y_cov), 0.0) + + +def test_gpr_interpolation_structured(): + # Test the interpolating property for different kernels. + kernel = MiniSeqKernel(baseline_similarity_bounds="fixed") + X = ["A", "B", "C"] + y = np.array([1, 2, 3]) + gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) + y_pred, y_cov = gpr.predict(X, return_cov=True) + + assert_almost_equal( + kernel(X, eval_gradient=True)[1].ravel(), (1 - np.eye(len(X))).ravel() + ) + assert_almost_equal(y_pred, y) + assert_almost_equal(np.diag(y_cov), 0.0) + + +@pytest.mark.parametrize("kernel", non_fixed_kernels) +def test_lml_improving(kernel): + if sys.maxsize <= 2**32: + pytest.xfail("This test may fail on 32 bit Python") + + # Test that hyperparameter-tuning improves log-marginal likelihood. + gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) + assert gpr.log_marginal_likelihood(gpr.kernel_.theta) > gpr.log_marginal_likelihood( + kernel.theta + ) + + +@pytest.mark.parametrize("kernel", kernels) +def test_lml_precomputed(kernel): + # Test that lml of optimized kernel is stored correctly. + gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) + assert gpr.log_marginal_likelihood(gpr.kernel_.theta) == pytest.approx( + gpr.log_marginal_likelihood() + ) + + +@pytest.mark.parametrize("kernel", kernels) +def test_lml_without_cloning_kernel(kernel): + # Test that lml of optimized kernel is stored correctly. + gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) + input_theta = np.ones(gpr.kernel_.theta.shape, dtype=np.float64) + + gpr.log_marginal_likelihood(input_theta, clone_kernel=False) + assert_almost_equal(gpr.kernel_.theta, input_theta, 7) + + +@pytest.mark.parametrize("kernel", non_fixed_kernels) +def test_converged_to_local_maximum(kernel): + # Test that we are in local maximum after hyperparameter-optimization. + gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) + + lml, lml_gradient = gpr.log_marginal_likelihood(gpr.kernel_.theta, True) + + assert np.all( + (np.abs(lml_gradient) < 1e-4) + | (gpr.kernel_.theta == gpr.kernel_.bounds[:, 0]) + | (gpr.kernel_.theta == gpr.kernel_.bounds[:, 1]) + ) + + +@pytest.mark.parametrize("kernel", non_fixed_kernels) +def test_solution_inside_bounds(kernel): + # Test that hyperparameter-optimization remains in bounds# + gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) + + bounds = gpr.kernel_.bounds + max_ = np.finfo(gpr.kernel_.theta.dtype).max + tiny = 1e-10 + bounds[~np.isfinite(bounds[:, 1]), 1] = max_ + + assert_array_less(bounds[:, 0], gpr.kernel_.theta + tiny) + assert_array_less(gpr.kernel_.theta, bounds[:, 1] + tiny) + + +@pytest.mark.parametrize("kernel", kernels) +def test_lml_gradient(kernel): + # Compare analytic and numeric gradient of log marginal likelihood. 
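+    # approx_fprime perturbs each entry of theta (the log-transformed
+    # hyperparameters) by epsilon=1e-10 and takes finite-difference slopes of the
+    # log-marginal likelihood; these should agree with the analytic gradient
+    # returned by log_marginal_likelihood(theta, eval_gradient=True).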
+ gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) + + lml, lml_gradient = gpr.log_marginal_likelihood(kernel.theta, True) + lml_gradient_approx = approx_fprime( + kernel.theta, lambda theta: gpr.log_marginal_likelihood(theta, False), 1e-10 + ) + + assert_almost_equal(lml_gradient, lml_gradient_approx, 3) + + +@pytest.mark.parametrize("kernel", kernels) +def test_prior(kernel): + # Test that GP prior has mean 0 and identical variances. + gpr = GaussianProcessRegressor(kernel=kernel) + + y_mean, y_cov = gpr.predict(X, return_cov=True) + + assert_almost_equal(y_mean, 0, 5) + if len(gpr.kernel.theta) > 1: + # XXX: quite hacky, works only for current kernels + assert_almost_equal(np.diag(y_cov), np.exp(kernel.theta[0]), 5) + else: + assert_almost_equal(np.diag(y_cov), 1, 5) + + +@pytest.mark.parametrize("kernel", kernels) +def test_sample_statistics(kernel): + # Test that statistics of samples drawn from GP are correct. + gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) + + y_mean, y_cov = gpr.predict(X2, return_cov=True) + + samples = gpr.sample_y(X2, 300000) + + # More digits accuracy would require many more samples + assert_almost_equal(y_mean, np.mean(samples, 1), 1) + assert_almost_equal( + np.diag(y_cov) / np.diag(y_cov).max(), + np.var(samples, 1) / np.diag(y_cov).max(), + 1, + ) + + +def test_no_optimizer(): + # Test that kernel parameters are unmodified when optimizer is None. + kernel = RBF(1.0) + gpr = GaussianProcessRegressor(kernel=kernel, optimizer=None).fit(X, y) + assert np.exp(gpr.kernel_.theta) == 1.0 + + +@pytest.mark.parametrize("kernel", kernels) +@pytest.mark.parametrize("target", [y, np.ones(X.shape[0], dtype=np.float64)]) +def test_predict_cov_vs_std(kernel, target): + if sys.maxsize <= 2**32: + pytest.xfail("This test may fail on 32 bit Python") + + # Test that predicted std.-dev. is consistent with cov's diagonal. + gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) + y_mean, y_cov = gpr.predict(X2, return_cov=True) + y_mean, y_std = gpr.predict(X2, return_std=True) + assert_almost_equal(np.sqrt(np.diag(y_cov)), y_std) + + +def test_anisotropic_kernel(): + # Test that GPR can identify meaningful anisotropic length-scales. + # We learn a function which varies in one dimension ten-times slower + # than in the other. The corresponding length-scales should differ by at + # least a factor 5 + rng = np.random.RandomState(0) + X = rng.uniform(-1, 1, (50, 2)) + y = X[:, 0] + 0.1 * X[:, 1] + + kernel = RBF([1.0, 1.0]) + gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) + assert np.exp(gpr.kernel_.theta[1]) > np.exp(gpr.kernel_.theta[0]) * 5 + + +def test_random_starts(): + # Test that an increasing number of random-starts of GP fitting only + # increases the log marginal likelihood of the chosen theta. 
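+    # The first optimizer run starts from the kernel's initial theta; each
+    # additional restart draws its starting point log-uniformly from the
+    # hyperparameter bounds, and the theta with the highest log-marginal
+    # likelihood is kept.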
+ n_samples, n_features = 25, 2 + rng = np.random.RandomState(0) + X = rng.randn(n_samples, n_features) * 2 - 1 + y = ( + np.sin(X).sum(axis=1) + + np.sin(3 * X).sum(axis=1) + + rng.normal(scale=0.1, size=n_samples) + ) + + kernel = C(1.0, (1e-2, 1e2)) * RBF( + length_scale=[1.0] * n_features, length_scale_bounds=[(1e-4, 1e2)] * n_features + ) + WhiteKernel(noise_level=1e-5, noise_level_bounds=(1e-5, 1e1)) + last_lml = -np.inf + for n_restarts_optimizer in range(5): + gp = GaussianProcessRegressor( + kernel=kernel, + n_restarts_optimizer=n_restarts_optimizer, + random_state=0, + ).fit(X, y) + lml = gp.log_marginal_likelihood(gp.kernel_.theta) + assert lml > last_lml - np.finfo(np.float32).eps + last_lml = lml + + +@pytest.mark.parametrize("kernel", kernels) +def test_y_normalization(kernel): + """ + Test normalization of the target values in GP + + Fitting non-normalizing GP on normalized y and fitting normalizing GP + on unnormalized y should yield identical results. Note that, here, + 'normalized y' refers to y that has been made zero mean and unit + variance. + + """ + + y_mean = np.mean(y) + y_std = np.std(y) + y_norm = (y - y_mean) / y_std + + # Fit non-normalizing GP on normalized y + gpr = GaussianProcessRegressor(kernel=kernel) + gpr.fit(X, y_norm) + + # Fit normalizing GP on unnormalized y + gpr_norm = GaussianProcessRegressor(kernel=kernel, normalize_y=True) + gpr_norm.fit(X, y) + + # Compare predicted mean, std-devs and covariances + y_pred, y_pred_std = gpr.predict(X2, return_std=True) + y_pred = y_pred * y_std + y_mean + y_pred_std = y_pred_std * y_std + y_pred_norm, y_pred_std_norm = gpr_norm.predict(X2, return_std=True) + + assert_almost_equal(y_pred, y_pred_norm) + assert_almost_equal(y_pred_std, y_pred_std_norm) + + _, y_cov = gpr.predict(X2, return_cov=True) + y_cov = y_cov * y_std**2 + _, y_cov_norm = gpr_norm.predict(X2, return_cov=True) + + assert_almost_equal(y_cov, y_cov_norm) + + +def test_large_variance_y(): + """ + Here we test that, when noramlize_y=True, our GP can produce a + sensible fit to training data whose variance is significantly + larger than unity. This test was made in response to issue #15612. + + GP predictions are verified against predictions that were made + using GPy which, here, is treated as the 'gold standard'. Note that we + only investigate the RBF kernel here, as that is what was used in the + GPy implementation. + + The following code can be used to recreate the GPy data: + + -------------------------------------------------------------------------- + import GPy + + kernel_gpy = GPy.kern.RBF(input_dim=1, lengthscale=1.) 
+ gpy = GPy.models.GPRegression(X, np.vstack(y_large), kernel_gpy) + gpy.optimize() + y_pred_gpy, y_var_gpy = gpy.predict(X2) + y_pred_std_gpy = np.sqrt(y_var_gpy) + -------------------------------------------------------------------------- + """ + + # Here we utilise a larger variance version of the training data + y_large = 10 * y + + # Standard GP with normalize_y=True + RBF_params = {"length_scale": 1.0} + kernel = RBF(**RBF_params) + gpr = GaussianProcessRegressor(kernel=kernel, normalize_y=True) + gpr.fit(X, y_large) + y_pred, y_pred_std = gpr.predict(X2, return_std=True) + + # 'Gold standard' mean predictions from GPy + y_pred_gpy = np.array( + [15.16918303, -27.98707845, -39.31636019, 14.52605515, 69.18503589] + ) + + # 'Gold standard' std predictions from GPy + y_pred_std_gpy = np.array( + [7.78860962, 3.83179178, 0.63149951, 0.52745188, 0.86170042] + ) + + # Based on numerical experiments, it's reasonable to expect our + # GP's mean predictions to get within 7% of predictions of those + # made by GPy. + assert_allclose(y_pred, y_pred_gpy, rtol=0.07, atol=0) + + # Based on numerical experiments, it's reasonable to expect our + # GP's std predictions to get within 15% of predictions of those + # made by GPy. + assert_allclose(y_pred_std, y_pred_std_gpy, rtol=0.15, atol=0) + + +def test_y_multioutput(): + # Test that GPR can deal with multi-dimensional target values + y_2d = np.vstack((y, y * 2)).T + + # Test for fixed kernel that first dimension of 2d GP equals the output + # of 1d GP and that second dimension is twice as large + kernel = RBF(length_scale=1.0) + + gpr = GaussianProcessRegressor(kernel=kernel, optimizer=None, normalize_y=False) + gpr.fit(X, y) + + gpr_2d = GaussianProcessRegressor(kernel=kernel, optimizer=None, normalize_y=False) + gpr_2d.fit(X, y_2d) + + y_pred_1d, y_std_1d = gpr.predict(X2, return_std=True) + y_pred_2d, y_std_2d = gpr_2d.predict(X2, return_std=True) + _, y_cov_1d = gpr.predict(X2, return_cov=True) + _, y_cov_2d = gpr_2d.predict(X2, return_cov=True) + + assert_almost_equal(y_pred_1d, y_pred_2d[:, 0]) + assert_almost_equal(y_pred_1d, y_pred_2d[:, 1] / 2) + + # Standard deviation and covariance do not depend on output + for target in range(y_2d.shape[1]): + assert_almost_equal(y_std_1d, y_std_2d[..., target]) + assert_almost_equal(y_cov_1d, y_cov_2d[..., target]) + + y_sample_1d = gpr.sample_y(X2, n_samples=10) + y_sample_2d = gpr_2d.sample_y(X2, n_samples=10) + + assert y_sample_1d.shape == (5, 10) + assert y_sample_2d.shape == (5, 2, 10) + # Only the first target will be equal + assert_almost_equal(y_sample_1d, y_sample_2d[:, 0, :]) + + # Test hyperparameter optimization + for kernel in kernels: + gpr = GaussianProcessRegressor(kernel=kernel, normalize_y=True) + gpr.fit(X, y) + + gpr_2d = GaussianProcessRegressor(kernel=kernel, normalize_y=True) + gpr_2d.fit(X, np.vstack((y, y)).T) + + assert_almost_equal(gpr.kernel_.theta, gpr_2d.kernel_.theta, 4) + + +@pytest.mark.parametrize("kernel", non_fixed_kernels) +def test_custom_optimizer(kernel): + # Test that GPR can use externally defined optimizers. 
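+    # A minimal sketch of a gradient-based alternative (illustrative only, not
+    # used by this test): obj_func returns the objective and its gradient by
+    # default (eval_gradient=True), which is what scipy.optimize.minimize
+    # expects with jac=True.
+    def lbfgs_optimizer(obj_func, initial_theta, bounds):
+        from scipy.optimize import minimize
+
+        res = minimize(
+            obj_func, initial_theta, method="L-BFGS-B", jac=True, bounds=bounds
+        )
+        return res.x, res.fun
+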
+ # Define a dummy optimizer that simply tests 50 random hyperparameters + def optimizer(obj_func, initial_theta, bounds): + rng = np.random.RandomState(0) + theta_opt, func_min = ( + initial_theta, + obj_func(initial_theta, eval_gradient=False), + ) + for _ in range(50): + theta = np.atleast_1d( + rng.uniform(np.maximum(-2, bounds[:, 0]), np.minimum(1, bounds[:, 1])) + ) + f = obj_func(theta, eval_gradient=False) + if f < func_min: + theta_opt, func_min = theta, f + return theta_opt, func_min + + gpr = GaussianProcessRegressor(kernel=kernel, optimizer=optimizer) + gpr.fit(X, y) + # Checks that optimizer improved marginal likelihood + assert gpr.log_marginal_likelihood(gpr.kernel_.theta) > gpr.log_marginal_likelihood( + gpr.kernel.theta + ) + + +def test_gpr_correct_error_message(): + X = np.arange(12).reshape(6, -1) + y = np.ones(6) + kernel = DotProduct() + gpr = GaussianProcessRegressor(kernel=kernel, alpha=0.0) + message = ( + "The kernel, %s, is not returning a " + "positive definite matrix. Try gradually increasing " + "the 'alpha' parameter of your " + "GaussianProcessRegressor estimator." % kernel + ) + with pytest.raises(np.linalg.LinAlgError, match=re.escape(message)): + gpr.fit(X, y) + + +@pytest.mark.parametrize("kernel", kernels) +def test_duplicate_input(kernel): + # Test GPR can handle two different output-values for the same input. + gpr_equal_inputs = GaussianProcessRegressor(kernel=kernel, alpha=1e-2) + gpr_similar_inputs = GaussianProcessRegressor(kernel=kernel, alpha=1e-2) + + X_ = np.vstack((X, X[0])) + y_ = np.hstack((y, y[0] + 1)) + gpr_equal_inputs.fit(X_, y_) + + X_ = np.vstack((X, X[0] + 1e-15)) + y_ = np.hstack((y, y[0] + 1)) + gpr_similar_inputs.fit(X_, y_) + + X_test = np.linspace(0, 10, 100)[:, None] + y_pred_equal, y_std_equal = gpr_equal_inputs.predict(X_test, return_std=True) + y_pred_similar, y_std_similar = gpr_similar_inputs.predict(X_test, return_std=True) + + assert_almost_equal(y_pred_equal, y_pred_similar) + assert_almost_equal(y_std_equal, y_std_similar) + + +def test_no_fit_default_predict(): + # Test that GPR predictions without fit does not break by default. + default_kernel = C(1.0, constant_value_bounds="fixed") * RBF( + 1.0, length_scale_bounds="fixed" + ) + gpr1 = GaussianProcessRegressor() + _, y_std1 = gpr1.predict(X, return_std=True) + _, y_cov1 = gpr1.predict(X, return_cov=True) + + gpr2 = GaussianProcessRegressor(kernel=default_kernel) + _, y_std2 = gpr2.predict(X, return_std=True) + _, y_cov2 = gpr2.predict(X, return_cov=True) + + assert_array_almost_equal(y_std1, y_std2) + assert_array_almost_equal(y_cov1, y_cov2) + + +def test_warning_bounds(): + kernel = RBF(length_scale_bounds=[1e-5, 1e-3]) + gpr = GaussianProcessRegressor(kernel=kernel) + warning_message = ( + "The optimal value found for dimension 0 of parameter " + "length_scale is close to the specified upper bound " + "0.001. Increasing the bound and calling fit again may " + "find a better value." 
+ ) + with pytest.warns(ConvergenceWarning, match=warning_message): + gpr.fit(X, y) + + kernel_sum = WhiteKernel(noise_level_bounds=[1e-5, 1e-3]) + RBF( + length_scale_bounds=[1e3, 1e5] + ) + gpr_sum = GaussianProcessRegressor(kernel=kernel_sum) + with warnings.catch_warnings(record=True) as record: + warnings.simplefilter("always") + gpr_sum.fit(X, y) + + assert len(record) == 2 + + assert issubclass(record[0].category, ConvergenceWarning) + assert ( + record[0].message.args[0] == "The optimal value found for " + "dimension 0 of parameter " + "k1__noise_level is close to the " + "specified upper bound 0.001. " + "Increasing the bound and calling " + "fit again may find a better value." + ) + + assert issubclass(record[1].category, ConvergenceWarning) + assert ( + record[1].message.args[0] == "The optimal value found for " + "dimension 0 of parameter " + "k2__length_scale is close to the " + "specified lower bound 1000.0. " + "Decreasing the bound and calling " + "fit again may find a better value." + ) + + X_tile = np.tile(X, 2) + kernel_dims = RBF(length_scale=[1.0, 2.0], length_scale_bounds=[1e1, 1e2]) + gpr_dims = GaussianProcessRegressor(kernel=kernel_dims) + + with warnings.catch_warnings(record=True) as record: + warnings.simplefilter("always") + gpr_dims.fit(X_tile, y) + + assert len(record) == 2 + + assert issubclass(record[0].category, ConvergenceWarning) + assert ( + record[0].message.args[0] == "The optimal value found for " + "dimension 0 of parameter " + "length_scale is close to the " + "specified lower bound 10.0. " + "Decreasing the bound and calling " + "fit again may find a better value." + ) + + assert issubclass(record[1].category, ConvergenceWarning) + assert ( + record[1].message.args[0] == "The optimal value found for " + "dimension 1 of parameter " + "length_scale is close to the " + "specified lower bound 10.0. " + "Decreasing the bound and calling " + "fit again may find a better value." + ) + + +def test_bound_check_fixed_hyperparameter(): + # Regression test for issue #17943 + # Check that having a hyperparameter with fixed bounds doesn't cause an + # error + k1 = 50.0**2 * RBF(length_scale=50.0) # long term smooth rising trend + k2 = ExpSineSquared( + length_scale=1.0, periodicity=1.0, periodicity_bounds="fixed" + ) # seasonal component + kernel = k1 + k2 + GaussianProcessRegressor(kernel=kernel).fit(X, y) + + +@pytest.mark.parametrize("kernel", kernels) +def test_constant_target(kernel): + """Check that the std. dev. is set to 1 when normalizing a constant + feature. + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/18318 + NaN values were previously assigned to the target when scaling, due to the + null std. dev. of a constant target.
+ """ + y_constant = np.ones(X.shape[0], dtype=np.float64) + + gpr = GaussianProcessRegressor(kernel=kernel, normalize_y=True) + gpr.fit(X, y_constant) + assert gpr._y_train_std == pytest.approx(1.0) + + y_pred, y_cov = gpr.predict(X, return_cov=True) + assert_allclose(y_pred, y_constant) + # set atol because we compare to zero + assert_allclose(np.diag(y_cov), 0.0, atol=1e-9) + + # Test multi-target data + n_samples, n_targets = X.shape[0], 2 + rng = np.random.RandomState(0) + y = np.concatenate( + [ + rng.normal(size=(n_samples, 1)), # non-constant target + np.full(shape=(n_samples, 1), fill_value=2), # constant target + ], + axis=1, + ) + + gpr.fit(X, y) + Y_pred, Y_cov = gpr.predict(X, return_cov=True) + + assert_allclose(Y_pred[:, 1], 2) + assert_allclose(np.diag(Y_cov[..., 1]), 0.0, atol=1e-9) + + assert Y_pred.shape == (n_samples, n_targets) + assert Y_cov.shape == (n_samples, n_samples, n_targets) + + +def test_gpr_consistency_std_cov_non_invertible_kernel(): + """Check the consistency between the returned std. dev. and the covariance. + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/19936 + Inconsistencies were observed when the kernel cannot be inverted (or + numerically stable). + """ + kernel = C(8.98576054e05, (1e-12, 1e12)) * RBF( + [5.91326520e02, 1.32584051e03], (1e-12, 1e12) + ) + WhiteKernel(noise_level=1e-5) + gpr = GaussianProcessRegressor(kernel=kernel, alpha=0, optimizer=None) + X_train = np.array( + [ + [0.0, 0.0], + [1.54919334, -0.77459667], + [-1.54919334, 0.0], + [0.0, -1.54919334], + [0.77459667, 0.77459667], + [-0.77459667, 1.54919334], + ] + ) + y_train = np.array( + [ + [-2.14882017e-10], + [-4.66975823e00], + [4.01823986e00], + [-1.30303674e00], + [-1.35760156e00], + [3.31215668e00], + ] + ) + gpr.fit(X_train, y_train) + X_test = np.array( + [ + [-1.93649167, -1.93649167], + [1.93649167, -1.93649167], + [-1.93649167, 1.93649167], + [1.93649167, 1.93649167], + ] + ) + pred1, std = gpr.predict(X_test, return_std=True) + pred2, cov = gpr.predict(X_test, return_cov=True) + assert_allclose(std, np.sqrt(np.diagonal(cov)), rtol=1e-5) + + +@pytest.mark.parametrize( + "params, TypeError, err_msg", + [ + ( + {"alpha": np.zeros(100)}, + ValueError, + "alpha must be a scalar or an array with same number of entries as y", + ), + ( + { + "kernel": WhiteKernel(noise_level_bounds=(-np.inf, np.inf)), + "n_restarts_optimizer": 2, + }, + ValueError, + "requires that all bounds are finite", + ), + ], +) +def test_gpr_fit_error(params, TypeError, err_msg): + """Check that expected error are raised during fit.""" + gpr = GaussianProcessRegressor(**params) + with pytest.raises(TypeError, match=err_msg): + gpr.fit(X, y) + + +def test_gpr_lml_error(): + """Check that we raise the proper error in the LML method.""" + gpr = GaussianProcessRegressor(kernel=RBF()).fit(X, y) + + err_msg = "Gradient can only be evaluated for theta!=None" + with pytest.raises(ValueError, match=err_msg): + gpr.log_marginal_likelihood(eval_gradient=True) + + +def test_gpr_predict_error(): + """Check that we raise the proper error during predict.""" + gpr = GaussianProcessRegressor(kernel=RBF()).fit(X, y) + + err_msg = "At most one of return_std or return_cov can be requested." 
+ with pytest.raises(RuntimeError, match=err_msg): + gpr.predict(X, return_cov=True, return_std=True) + + +@pytest.mark.parametrize("normalize_y", [True, False]) +@pytest.mark.parametrize("n_targets", [None, 1, 10]) +def test_predict_shapes(normalize_y, n_targets): + """Check the shapes of y_mean, y_std, and y_cov in single-output + (n_targets=None) and multi-output settings, including the edge case when + n_targets=1, where the sklearn convention is to squeeze the predictions. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/17394 + https://github.com/scikit-learn/scikit-learn/issues/18065 + https://github.com/scikit-learn/scikit-learn/issues/22174 + """ + rng = np.random.RandomState(1234) + + n_features, n_samples_train, n_samples_test = 6, 9, 7 + + y_train_shape = (n_samples_train,) + if n_targets is not None: + y_train_shape = y_train_shape + (n_targets,) + + # By convention single-output data is squeezed upon prediction + y_test_shape = (n_samples_test,) + if n_targets is not None and n_targets > 1: + y_test_shape = y_test_shape + (n_targets,) + + X_train = rng.randn(n_samples_train, n_features) + X_test = rng.randn(n_samples_test, n_features) + y_train = rng.randn(*y_train_shape) + + model = GaussianProcessRegressor(normalize_y=normalize_y) + model.fit(X_train, y_train) + + y_pred, y_std = model.predict(X_test, return_std=True) + _, y_cov = model.predict(X_test, return_cov=True) + + assert y_pred.shape == y_test_shape + assert y_std.shape == y_test_shape + assert y_cov.shape == (n_samples_test,) + y_test_shape + + +@pytest.mark.parametrize("normalize_y", [True, False]) +@pytest.mark.parametrize("n_targets", [None, 1, 10]) +def test_sample_y_shapes(normalize_y, n_targets): + """Check the shapes of y_samples in single-output (n_targets=None) and + multi-output settings, including the edge case when n_targets=1, where the + sklearn convention is to squeeze the predictions. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/22175 + """ + rng = np.random.RandomState(1234) + + n_features, n_samples_train = 6, 9 + # Number of spatial locations to predict at + n_samples_X_test = 7 + # Number of sample predictions per test point + n_samples_y_test = 5 + + y_train_shape = (n_samples_train,) + if n_targets is not None: + y_train_shape = y_train_shape + (n_targets,) + + # By convention single-output data is squeezed upon prediction + if n_targets is not None and n_targets > 1: + y_test_shape = (n_samples_X_test, n_targets, n_samples_y_test) + else: + y_test_shape = (n_samples_X_test, n_samples_y_test) + + X_train = rng.randn(n_samples_train, n_features) + X_test = rng.randn(n_samples_X_test, n_features) + y_train = rng.randn(*y_train_shape) + + model = GaussianProcessRegressor(normalize_y=normalize_y) + + # FIXME: before fitting, the estimator does not have information regarding + # the number of targets and defaults to 1. This is inconsistent with the shape + # provided after `fit`.
This assert should be made once the following issue + # is fixed: + # https://github.com/scikit-learn/scikit-learn/issues/22430 + # y_samples = model.sample_y(X_test, n_samples=n_samples_y_test) + # assert y_samples.shape == y_test_shape + + model.fit(X_train, y_train) + + y_samples = model.sample_y(X_test, n_samples=n_samples_y_test) + assert y_samples.shape == y_test_shape + + +@pytest.mark.parametrize("n_targets", [None, 1, 2, 3]) +@pytest.mark.parametrize("n_samples", [1, 5]) +def test_sample_y_shape_with_prior(n_targets, n_samples): + """Check the output shape of `sample_y` is consistent before and after `fit`.""" + rng = np.random.RandomState(1024) + + X = rng.randn(10, 3) + y = rng.randn(10, n_targets if n_targets is not None else 1) + + model = GaussianProcessRegressor(n_targets=n_targets) + shape_before_fit = model.sample_y(X, n_samples=n_samples).shape + model.fit(X, y) + shape_after_fit = model.sample_y(X, n_samples=n_samples).shape + assert shape_before_fit == shape_after_fit + + +@pytest.mark.parametrize("n_targets", [None, 1, 2, 3]) +def test_predict_shape_with_prior(n_targets): + """Check the output shape of `predict` with prior distribution.""" + rng = np.random.RandomState(1024) + + n_sample = 10 + X = rng.randn(n_sample, 3) + y = rng.randn(n_sample, n_targets if n_targets is not None else 1) + + model = GaussianProcessRegressor(n_targets=n_targets) + mean_prior, cov_prior = model.predict(X, return_cov=True) + _, std_prior = model.predict(X, return_std=True) + + model.fit(X, y) + mean_post, cov_post = model.predict(X, return_cov=True) + _, std_post = model.predict(X, return_std=True) + + assert mean_prior.shape == mean_post.shape + assert cov_prior.shape == cov_post.shape + assert std_prior.shape == std_post.shape + + +def test_n_targets_error(): + """Check that an error is raised when the number of targets seen at fit is + inconsistent with n_targets. + """ + rng = np.random.RandomState(0) + X = rng.randn(10, 3) + y = rng.randn(10, 2) + + model = GaussianProcessRegressor(n_targets=1) + with pytest.raises(ValueError, match="The number of targets seen in `y`"): + model.fit(X, y) + + +class CustomKernel(C): + """ + A custom kernel that has a diag method that returns the first column of the + input matrix X. This is a helper for the test to check that the input + matrix X is not mutated. + """ + + def diag(self, X): + return X[:, 0] + + +def test_gpr_predict_input_not_modified(): + """ + Check that the input X is not modified by the predict method of the + GaussianProcessRegressor when setting return_std=True. 
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/24340 + """ + gpr = GaussianProcessRegressor(kernel=CustomKernel()).fit(X, y) + + X2_copy = np.copy(X2) + _, _ = gpr.predict(X2, return_std=True) + + assert_allclose(X2, X2_copy) diff --git a/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/tests/test_kernels.py b/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/tests/test_kernels.py new file mode 100644 index 0000000000000000000000000000000000000000..5174d50b7df9210fbf67677ed5f18eaedf209ecc --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/tests/test_kernels.py @@ -0,0 +1,403 @@ +"""Testing for kernels for Gaussian processes.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from inspect import signature + +import numpy as np +import pytest + +from sklearn.base import clone +from sklearn.gaussian_process.kernels import ( + RBF, + CompoundKernel, + ConstantKernel, + DotProduct, + Exponentiation, + ExpSineSquared, + KernelOperator, + Matern, + PairwiseKernel, + RationalQuadratic, + WhiteKernel, + _approx_fprime, +) +from sklearn.metrics.pairwise import ( + PAIRWISE_KERNEL_FUNCTIONS, + euclidean_distances, + pairwise_kernels, +) +from sklearn.utils._testing import ( + assert_allclose, + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) + +X = np.random.RandomState(0).normal(0, 1, (5, 2)) +Y = np.random.RandomState(0).normal(0, 1, (6, 2)) +# Set shared test data as read-only to avoid unintentional in-place +# modifications that would introduce side-effects between tests. +X.flags.writeable = False +Y.flags.writeable = False + +kernel_rbf_plus_white = RBF(length_scale=2.0) + WhiteKernel(noise_level=3.0) +kernels = [ + RBF(length_scale=2.0), + RBF(length_scale_bounds=(0.5, 2.0)), + ConstantKernel(constant_value=10.0), + 2.0 * RBF(length_scale=0.33, length_scale_bounds="fixed"), + 2.0 * RBF(length_scale=0.5), + kernel_rbf_plus_white, + 2.0 * RBF(length_scale=[0.5, 2.0]), + 2.0 * Matern(length_scale=0.33, length_scale_bounds="fixed"), + 2.0 * Matern(length_scale=0.5, nu=0.5), + 2.0 * Matern(length_scale=1.5, nu=1.5), + 2.0 * Matern(length_scale=2.5, nu=2.5), + 2.0 * Matern(length_scale=[0.5, 2.0], nu=0.5), + 3.0 * Matern(length_scale=[2.0, 0.5], nu=1.5), + 4.0 * Matern(length_scale=[0.5, 0.5], nu=2.5), + RationalQuadratic(length_scale=0.5, alpha=1.5), + ExpSineSquared(length_scale=0.5, periodicity=1.5), + DotProduct(sigma_0=2.0), + DotProduct(sigma_0=2.0) ** 2, + RBF(length_scale=[2.0]), + Matern(length_scale=[2.0]), +] +for metric in PAIRWISE_KERNEL_FUNCTIONS: + if metric in ["additive_chi2", "chi2"]: + continue + kernels.append(PairwiseKernel(gamma=1.0, metric=metric)) + + +@pytest.mark.parametrize("kernel", kernels) +def test_kernel_gradient(kernel): + # Compare analytic and numeric gradient of kernels. 
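+    # The analytic gradient returned by kernel(X, eval_gradient=True) is checked
+    # entry-wise against a finite-difference approximation from _approx_fprime.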
+ kernel = clone(kernel) # make tests independent of one-another + K, K_gradient = kernel(X, eval_gradient=True) + + assert K_gradient.shape[0] == X.shape[0] + assert K_gradient.shape[1] == X.shape[0] + assert K_gradient.shape[2] == kernel.theta.shape[0] + + def eval_kernel_for_theta(theta): + kernel_clone = kernel.clone_with_theta(theta) + K = kernel_clone(X, eval_gradient=False) + return K + + K_gradient_approx = _approx_fprime(kernel.theta, eval_kernel_for_theta, 1e-10) + + assert_almost_equal(K_gradient, K_gradient_approx, 4) + + +@pytest.mark.parametrize( + "kernel", + [ + kernel + for kernel in kernels + # skip non-basic kernels + if not (isinstance(kernel, (KernelOperator, Exponentiation))) + ], +) +def test_kernel_theta(kernel): + # Check that parameter vector theta of kernel is set correctly. + kernel = clone(kernel) # make tests independent of one-another + theta = kernel.theta + _, K_gradient = kernel(X, eval_gradient=True) + + # Determine kernel parameters that contribute to theta + init_sign = signature(kernel.__class__.__init__).parameters.values() + args = [p.name for p in init_sign if p.name != "self"] + theta_vars = map( + lambda s: s[0 : -len("_bounds")], filter(lambda s: s.endswith("_bounds"), args) + ) + assert set(hyperparameter.name for hyperparameter in kernel.hyperparameters) == set( + theta_vars + ) + + # Check that values returned in theta are consistent with + # hyperparameter values (being their logarithms) + for i, hyperparameter in enumerate(kernel.hyperparameters): + assert theta[i] == np.log(getattr(kernel, hyperparameter.name)) + + # Fixed kernel parameters must be excluded from theta and gradient. + for i, hyperparameter in enumerate(kernel.hyperparameters): + # create copy with certain hyperparameter fixed + params = kernel.get_params() + params[hyperparameter.name + "_bounds"] = "fixed" + kernel_class = kernel.__class__ + new_kernel = kernel_class(**params) + # Check that theta and K_gradient are identical with the fixed + # dimension left out + _, K_gradient_new = new_kernel(X, eval_gradient=True) + assert theta.shape[0] == new_kernel.theta.shape[0] + 1 + assert K_gradient.shape[2] == K_gradient_new.shape[2] + 1 + if i > 0: + assert theta[:i] == new_kernel.theta[:i] + assert_array_equal(K_gradient[..., :i], K_gradient_new[..., :i]) + if i + 1 < len(kernel.hyperparameters): + assert theta[i + 1 :] == new_kernel.theta[i:] + assert_array_equal(K_gradient[..., i + 1 :], K_gradient_new[..., i:]) + + # Check that values of theta are modified correctly + for i, hyperparameter in enumerate(kernel.hyperparameters): + theta[i] = np.log(42) + kernel.theta = theta + assert_almost_equal(getattr(kernel, hyperparameter.name), 42) + + setattr(kernel, hyperparameter.name, 43) + assert_almost_equal(kernel.theta[i], np.log(43)) + + +@pytest.mark.parametrize( + "kernel", + [ + kernel + for kernel in kernels + # Identity is not satisfied on diagonal + if kernel != kernel_rbf_plus_white + ], +) +def test_auto_vs_cross(kernel): + kernel = clone(kernel) # make tests independent of one-another + # Auto-correlation and cross-correlation should be consistent. + K_auto = kernel(X) + K_cross = kernel(X, X) + assert_almost_equal(K_auto, K_cross, 5) + + +@pytest.mark.parametrize("kernel", kernels) +def test_kernel_diag(kernel): + kernel = clone(kernel) # make tests independent of one-another + # Test that diag method of kernel returns consistent results. 
+ K_call_diag = np.diag(kernel(X)) + K_diag = kernel.diag(X) + assert_almost_equal(K_call_diag, K_diag, 5) + + +def test_kernel_operator_commutative(): + # Adding kernels and multiplying kernels should be commutative. + # Check addition + assert_almost_equal((RBF(2.0) + 1.0)(X), (1.0 + RBF(2.0))(X)) + + # Check multiplication + assert_almost_equal((3.0 * RBF(2.0))(X), (RBF(2.0) * 3.0)(X)) + + +def test_kernel_anisotropic(): + # Anisotropic kernel should be consistent with isotropic kernels. + kernel = 3.0 * RBF([0.5, 2.0]) + + K = kernel(X) + X1 = X.copy() + X1[:, 0] *= 4 + K1 = 3.0 * RBF(2.0)(X1) + assert_almost_equal(K, K1) + + X2 = X.copy() + X2[:, 1] /= 4 + K2 = 3.0 * RBF(0.5)(X2) + assert_almost_equal(K, K2) + + # Check getting and setting via theta + kernel.theta = kernel.theta + np.log(2) + assert_array_equal(kernel.theta, np.log([6.0, 1.0, 4.0])) + assert_array_equal(kernel.k2.length_scale, [1.0, 4.0]) + + +@pytest.mark.parametrize( + "kernel", [kernel for kernel in kernels if kernel.is_stationary()] +) +def test_kernel_stationary(kernel): + kernel = clone(kernel) # make tests independent of one-another + # Test stationarity of kernels. + K = kernel(X, X + 1) + assert_almost_equal(K[0, 0], np.diag(K)) + + +@pytest.mark.parametrize("kernel", kernels) +def test_kernel_input_type(kernel): + kernel = clone(kernel) # make tests independent of one-another + # Test whether kernels is for vectors or structured data + if isinstance(kernel, Exponentiation): + assert kernel.requires_vector_input == kernel.kernel.requires_vector_input + if isinstance(kernel, KernelOperator): + assert kernel.requires_vector_input == ( + kernel.k1.requires_vector_input or kernel.k2.requires_vector_input + ) + + +def test_compound_kernel_input_type(): + kernel = CompoundKernel([WhiteKernel(noise_level=3.0)]) + assert not kernel.requires_vector_input + + kernel = CompoundKernel([WhiteKernel(noise_level=3.0), RBF(length_scale=2.0)]) + assert kernel.requires_vector_input + + +def check_hyperparameters_equal(kernel1, kernel2): + # Check that hyperparameters of two kernels are equal + for attr in set(dir(kernel1) + dir(kernel2)): + if attr.startswith("hyperparameter_"): + attr_value1 = getattr(kernel1, attr) + attr_value2 = getattr(kernel2, attr) + assert attr_value1 == attr_value2 + + +@pytest.mark.parametrize("kernel", kernels) +def test_kernel_clone(kernel): + kernel = clone(kernel) # make tests independent of one-another + # Test that sklearn's clone works correctly on kernels. + kernel_cloned = clone(kernel) + + # XXX: Should this be fixed? + # This differs from the sklearn's estimators equality check. + assert kernel == kernel_cloned + assert id(kernel) != id(kernel_cloned) + + # Check that all constructor parameters are equal. + assert kernel.get_params() == kernel_cloned.get_params() + + # Check that all hyperparameters are equal. + check_hyperparameters_equal(kernel, kernel_cloned) + + +@pytest.mark.parametrize("kernel", kernels) +def test_kernel_clone_after_set_params(kernel): + kernel = clone(kernel) # make tests independent of one-another + # This test is to verify that using set_params does not + # break clone on kernels. + # This used to break because in kernels such as the RBF, non-trivial + # logic that modified the length scale used to be in the constructor + # See https://github.com/scikit-learn/scikit-learn/issues/6961 + # for more details. + bounds = (1e-5, 1e5) + kernel_cloned = clone(kernel) + params = kernel.get_params() + # RationalQuadratic kernel is isotropic. 
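+    # ExpSineSquared and RationalQuadratic accept only a scalar length_scale,
+    # so they are excluded from the anisotropic length_scale handling below.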
+ isotropic_kernels = (ExpSineSquared, RationalQuadratic) + if "length_scale" in params and not isinstance(kernel, isotropic_kernels): + length_scale = params["length_scale"] + if np.iterable(length_scale): + # XXX unreached code as of v0.22 + params["length_scale"] = length_scale[0] + params["length_scale_bounds"] = bounds + else: + params["length_scale"] = [length_scale] * 2 + params["length_scale_bounds"] = bounds * 2 + kernel_cloned.set_params(**params) + kernel_cloned_clone = clone(kernel_cloned) + assert kernel_cloned_clone.get_params() == kernel_cloned.get_params() + assert id(kernel_cloned_clone) != id(kernel_cloned) + check_hyperparameters_equal(kernel_cloned, kernel_cloned_clone) + + +def test_matern_kernel(): + # Test consistency of Matern kernel for special values of nu. + K = Matern(nu=1.5, length_scale=1.0)(X) + # the diagonal elements of a matern kernel are 1 + assert_array_almost_equal(np.diag(K), np.ones(X.shape[0])) + # matern kernel for coef0==0.5 is equal to absolute exponential kernel + K_absexp = np.exp(-euclidean_distances(X, X, squared=False)) + K = Matern(nu=0.5, length_scale=1.0)(X) + assert_array_almost_equal(K, K_absexp) + # matern kernel with coef0==inf is equal to RBF kernel + K_rbf = RBF(length_scale=1.0)(X) + K = Matern(nu=np.inf, length_scale=1.0)(X) + assert_array_almost_equal(K, K_rbf) + assert_allclose(K, K_rbf) + # test that special cases of matern kernel (coef0 in [0.5, 1.5, 2.5]) + # result in nearly identical results as the general case for coef0 in + # [0.5 + tiny, 1.5 + tiny, 2.5 + tiny] + tiny = 1e-10 + for nu in [0.5, 1.5, 2.5]: + K1 = Matern(nu=nu, length_scale=1.0)(X) + K2 = Matern(nu=nu + tiny, length_scale=1.0)(X) + assert_array_almost_equal(K1, K2) + # test that coef0==large is close to RBF + large = 100 + K1 = Matern(nu=large, length_scale=1.0)(X) + K2 = RBF(length_scale=1.0)(X) + assert_array_almost_equal(K1, K2, decimal=2) + + +@pytest.mark.parametrize("kernel", kernels) +def test_kernel_versus_pairwise(kernel): + kernel = clone(kernel) # make tests independent of one-another + # Check that GP kernels can also be used as pairwise kernels. + + # Test auto-kernel + if kernel != kernel_rbf_plus_white: + # For WhiteKernel: k(X) != k(X,X). This is assumed by + # pairwise_kernels + K1 = kernel(X) + K2 = pairwise_kernels(X, metric=kernel) + assert_array_almost_equal(K1, K2) + + # Test cross-kernel + K1 = kernel(X, Y) + K2 = pairwise_kernels(X, Y, metric=kernel) + assert_array_almost_equal(K1, K2) + + +@pytest.mark.parametrize("kernel", kernels) +def test_set_get_params(kernel): + kernel = clone(kernel) # make tests independent of one-another + # Check that set_params()/get_params() is consistent with kernel.theta. 
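+    # Reminder: kernel.theta holds the log-transformed hyperparameter values,
+    # hence the np.exp calls below when comparing against get_params().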
+ + # Test get_params() + index = 0 + params = kernel.get_params() + for hyperparameter in kernel.hyperparameters: + if isinstance("string", type(hyperparameter.bounds)): + if hyperparameter.bounds == "fixed": + continue + size = hyperparameter.n_elements + if size > 1: # anisotropic kernels + assert_almost_equal( + np.exp(kernel.theta[index : index + size]), params[hyperparameter.name] + ) + index += size + else: + assert_almost_equal( + np.exp(kernel.theta[index]), params[hyperparameter.name] + ) + index += 1 + # Test set_params() + index = 0 + value = 10 # arbitrary value + for hyperparameter in kernel.hyperparameters: + if isinstance("string", type(hyperparameter.bounds)): + if hyperparameter.bounds == "fixed": + continue + size = hyperparameter.n_elements + if size > 1: # anisotropic kernels + kernel.set_params(**{hyperparameter.name: [value] * size}) + assert_almost_equal( + np.exp(kernel.theta[index : index + size]), [value] * size + ) + index += size + else: + kernel.set_params(**{hyperparameter.name: value}) + assert_almost_equal(np.exp(kernel.theta[index]), value) + index += 1 + + +@pytest.mark.parametrize("kernel", kernels) +def test_repr_kernels(kernel): + kernel = clone(kernel) # make tests independent of one-another + # Smoke-test for repr in kernels. + + repr(kernel) + + +def test_rational_quadratic_kernel(): + kernel = RationalQuadratic(length_scale=[1.0, 1.0]) + message = ( + "RationalQuadratic kernel only supports isotropic " + "version, please use a single " + "scalar for length_scale" + ) + with pytest.raises(AttributeError, match=message): + kernel(X) diff --git a/.venv/lib/python3.12/site-packages/sklearn/impute/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/impute/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..aaa81d73c34a19004645733c05ea362aae8dcb01 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/impute/__init__.py @@ -0,0 +1,28 @@ +"""Transformers for missing value imputation.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import typing + +from ._base import MissingIndicator, SimpleImputer +from ._knn import KNNImputer + +if typing.TYPE_CHECKING: + # Avoid errors in type checkers (e.g. mypy) for experimental estimators. + # TODO: remove this check once the estimator is no longer experimental. + from ._iterative import IterativeImputer # noqa: F401 + +__all__ = ["KNNImputer", "MissingIndicator", "SimpleImputer"] + + +# TODO: remove this check once the estimator is no longer experimental. +def __getattr__(name): + if name == "IterativeImputer": + raise ImportError( + f"{name} is experimental and the API might change without any " + "deprecation cycle. 
To use it, you need to explicitly import " + "enable_iterative_imputer:\n" + "from sklearn.experimental import enable_iterative_imputer" + ) + raise AttributeError(f"module {__name__} has no attribute {name}") diff --git a/.venv/lib/python3.12/site-packages/sklearn/impute/_base.py b/.venv/lib/python3.12/site-packages/sklearn/impute/_base.py new file mode 100644 index 0000000000000000000000000000000000000000..ae74068145678bd362296a367007371a5a353a95 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/impute/_base.py @@ -0,0 +1,1155 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numbers +import warnings +from collections import Counter +from functools import partial +from typing import Callable + +import numpy as np +import numpy.ma as ma +from scipy import sparse as sp + +from ..base import BaseEstimator, TransformerMixin, _fit_context +from ..utils._mask import _get_mask +from ..utils._missing import is_pandas_na, is_scalar_nan +from ..utils._param_validation import MissingValues, StrOptions +from ..utils.fixes import _mode +from ..utils.sparsefuncs import _get_median +from ..utils.validation import ( + FLOAT_DTYPES, + _check_feature_names_in, + _check_n_features, + check_is_fitted, + validate_data, +) + + +def _check_inputs_dtype(X, missing_values): + if is_pandas_na(missing_values): + # Allow using `pd.NA` as missing values to impute numerical arrays. + return + if X.dtype.kind in ("f", "i", "u") and not isinstance(missing_values, numbers.Real): + raise ValueError( + "'X' and 'missing_values' types are expected to be" + " both numerical. Got X.dtype={} and " + " type(missing_values)={}.".format(X.dtype, type(missing_values)) + ) + + +def _safe_min(items): + """Compute the minimum of a list of potentially non-comparable values. + + If values cannot be directly compared due to type incompatibility, the object with + the lowest string representation is returned. + """ + try: + return min(items) + except TypeError as e: + if "'<' not supported between" in str(e): + return min(items, key=lambda x: (str(type(x)), str(x))) + raise # pragma: no cover + + +def _most_frequent(array, extra_value, n_repeat): + """Compute the most frequent value in a 1d array extended with + [extra_value] * n_repeat, where extra_value is assumed to be not part + of the array.""" + # Compute the most frequent value in array only + if array.size > 0: + if array.dtype == object: + # scipy.stats.mode is slow with object dtype array. + # Python Counter is more efficient + counter = Counter(array) + most_frequent_count = counter.most_common(1)[0][1] + # tie breaking similarly to scipy.stats.mode + most_frequent_value = _safe_min( + [ + value + for value, count in counter.items() + if count == most_frequent_count + ] + ) + else: + mode = _mode(array) + most_frequent_value = mode[0][0] + most_frequent_count = mode[1][0] + else: + most_frequent_value = 0 + most_frequent_count = 0 + + # Compare to array + [extra_value] * n_repeat + if most_frequent_count == 0 and n_repeat == 0: + return np.nan + elif most_frequent_count < n_repeat: + return extra_value + elif most_frequent_count > n_repeat: + return most_frequent_value + elif most_frequent_count == n_repeat: + # tie breaking similarly to scipy.stats.mode + return _safe_min([most_frequent_value, extra_value]) + + +class _BaseImputer(TransformerMixin, BaseEstimator): + """Base class for all imputers. + + It adds automatically support for `add_indicator`. 
+ """ + + _parameter_constraints: dict = { + "missing_values": [MissingValues()], + "add_indicator": ["boolean"], + "keep_empty_features": ["boolean"], + } + + def __init__( + self, *, missing_values=np.nan, add_indicator=False, keep_empty_features=False + ): + self.missing_values = missing_values + self.add_indicator = add_indicator + self.keep_empty_features = keep_empty_features + + def _fit_indicator(self, X): + """Fit a MissingIndicator.""" + if self.add_indicator: + self.indicator_ = MissingIndicator( + missing_values=self.missing_values, error_on_new=False + ) + self.indicator_._fit(X, precomputed=True) + else: + self.indicator_ = None + + def _transform_indicator(self, X): + """Compute the indicator mask.' + + Note that X must be the original data as passed to the imputer before + any imputation, since imputation may be done inplace in some cases. + """ + if self.add_indicator: + if not hasattr(self, "indicator_"): + raise ValueError( + "Make sure to call _fit_indicator before _transform_indicator" + ) + return self.indicator_.transform(X) + + def _concatenate_indicator(self, X_imputed, X_indicator): + """Concatenate indicator mask with the imputed data.""" + if not self.add_indicator: + return X_imputed + + if sp.issparse(X_imputed): + # sp.hstack may result in different formats between sparse arrays and + # matrices; specify the format to keep consistent behavior + hstack = partial(sp.hstack, format=X_imputed.format) + else: + hstack = np.hstack + + if X_indicator is None: + raise ValueError( + "Data from the missing indicator are not provided. Call " + "_fit_indicator and _transform_indicator in the imputer " + "implementation." + ) + + return hstack((X_imputed, X_indicator)) + + def _concatenate_indicator_feature_names_out(self, names, input_features): + if not self.add_indicator: + return names + + indicator_names = self.indicator_.get_feature_names_out(input_features) + return np.concatenate([names, indicator_names]) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = is_scalar_nan(self.missing_values) + return tags + + +class SimpleImputer(_BaseImputer): + """Univariate imputer for completing missing values with simple strategies. + + Replace missing values using a descriptive statistic (e.g. mean, median, or + most frequent) along each column, or using a constant value. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.20 + `SimpleImputer` replaces the previous `sklearn.preprocessing.Imputer` + estimator which is now removed. + + Parameters + ---------- + missing_values : int, float, str, np.nan, None or pandas.NA, default=np.nan + The placeholder for the missing values. All occurrences of + `missing_values` will be imputed. For pandas' dataframes with + nullable integer dtypes with missing values, `missing_values` + can be set to either `np.nan` or `pd.NA`. + + strategy : str or Callable, default='mean' + The imputation strategy. + + - If "mean", then replace missing values using the mean along + each column. Can only be used with numeric data. + - If "median", then replace missing values using the median along + each column. Can only be used with numeric data. + - If "most_frequent", then replace missing using the most frequent + value along each column. Can be used with strings or numeric data. + If there is more than one such value, only the smallest is returned. + - If "constant", then replace missing values with fill_value. Can be + used with strings or numeric data. 
+ - If an instance of Callable, then replace missing values using the + scalar statistic returned by running the callable over a dense 1d + array containing non-missing values of each column. + + .. versionadded:: 0.20 + strategy="constant" for fixed value imputation. + + .. versionadded:: 1.5 + strategy=callable for custom value imputation. + + fill_value : str or numerical value, default=None + When strategy == "constant", `fill_value` is used to replace all + occurrences of missing_values. For string or object data types, + `fill_value` must be a string. + If `None`, `fill_value` will be 0 when imputing numerical + data and "missing_value" for strings or object data types. + + copy : bool, default=True + If True, a copy of X will be created. If False, imputation will + be done in-place whenever possible. Note that, in the following cases, + a new copy will always be made, even if `copy=False`: + + - If `X` is not an array of floating values; + - If `X` is encoded as a CSR matrix; + - If `add_indicator=True`. + + add_indicator : bool, default=False + If True, a :class:`MissingIndicator` transform will stack onto output + of the imputer's transform. This allows a predictive estimator + to account for missingness despite imputation. If a feature has no + missing values at fit/train time, the feature won't appear on + the missing indicator even if there are missing values at + transform/test time. + + keep_empty_features : bool, default=False + If True, features that consist exclusively of missing values when + `fit` is called are returned in results when `transform` is called. + The imputed value is always `0` except when `strategy="constant"` + in which case `fill_value` will be used instead. + + .. versionadded:: 1.2 + + .. versionchanged:: 1.6 + Currently, when `keep_empty_feature=False` and `strategy="constant"`, + empty features are not dropped. This behaviour will change in version + 1.8. Set `keep_empty_feature=True` to preserve this behaviour. + + Attributes + ---------- + statistics_ : array of shape (n_features,) + The imputation fill value for each feature. + Computing statistics can result in `np.nan` values. + During :meth:`transform`, features corresponding to `np.nan` + statistics will be discarded. + + indicator_ : :class:`~sklearn.impute.MissingIndicator` + Indicator used to add binary indicators for missing values. + `None` if `add_indicator=False`. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + IterativeImputer : Multivariate imputer that estimates values to impute for + each feature with missing values from all the others. + KNNImputer : Multivariate imputer that estimates missing features using + nearest samples. + + Notes + ----- + Columns which only contained missing values at :meth:`fit` are discarded + upon :meth:`transform` if strategy is not `"constant"`. + + In a prediction context, simple imputation usually performs poorly when + associated with a weak learner. However, with a powerful learner, it can + lead to as good or better performance than complex imputation such as + :class:`~sklearn.impute.IterativeImputer` or :class:`~sklearn.impute.KNNImputer`. 
+ + Examples + -------- + >>> import numpy as np + >>> from sklearn.impute import SimpleImputer + >>> imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean') + >>> imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]]) + SimpleImputer() + >>> X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]] + >>> print(imp_mean.transform(X)) + [[ 7. 2. 3. ] + [ 4. 3.5 6. ] + [10. 3.5 9. ]] + + For a more detailed example see + :ref:`sphx_glr_auto_examples_impute_plot_missing_values.py`. + """ + + _parameter_constraints: dict = { + **_BaseImputer._parameter_constraints, + "strategy": [ + StrOptions({"mean", "median", "most_frequent", "constant"}), + callable, + ], + "fill_value": "no_validation", # any object is valid + "copy": ["boolean"], + } + + def __init__( + self, + *, + missing_values=np.nan, + strategy="mean", + fill_value=None, + copy=True, + add_indicator=False, + keep_empty_features=False, + ): + super().__init__( + missing_values=missing_values, + add_indicator=add_indicator, + keep_empty_features=keep_empty_features, + ) + self.strategy = strategy + self.fill_value = fill_value + self.copy = copy + + def _validate_input(self, X, in_fit): + if self.strategy in ("most_frequent", "constant"): + # If input is a list of strings, dtype = object. + # Otherwise ValueError is raised in SimpleImputer + # with strategy='most_frequent' or 'constant' + # because the list is converted to Unicode numpy array + if isinstance(X, list) and any( + isinstance(elem, str) for row in X for elem in row + ): + dtype = object + else: + dtype = None + else: + dtype = FLOAT_DTYPES + + if not in_fit and self._fit_dtype.kind == "O": + # Use object dtype if fitted on object dtypes + dtype = self._fit_dtype + + if is_pandas_na(self.missing_values) or is_scalar_nan(self.missing_values): + ensure_all_finite = "allow-nan" + else: + ensure_all_finite = True + + try: + X = validate_data( + self, + X, + reset=in_fit, + accept_sparse="csc", + dtype=dtype, + force_writeable=True if not in_fit else None, + ensure_all_finite=ensure_all_finite, + copy=self.copy, + ) + except ValueError as ve: + if "could not convert" in str(ve): + new_ve = ValueError( + "Cannot use {} strategy with non-numeric data:\n{}".format( + self.strategy, ve + ) + ) + raise new_ve from None + else: + raise ve + + if in_fit: + # Use the dtype seen in `fit` for non-`fit` conversion + self._fit_dtype = X.dtype + + _check_inputs_dtype(X, self.missing_values) + if X.dtype.kind not in ("i", "u", "f", "O"): + raise ValueError( + "SimpleImputer does not support data with dtype " + "{0}. Please provide either a numeric array (with" + " a floating point or integer dtype) or " + "categorical data represented either as an array " + "with integer dtype or an array of string values " + "with an object dtype.".format(X.dtype) + ) + + if sp.issparse(X) and self.missing_values == 0: + # missing_values = 0 not allowed with sparse data as it would + # force densification + raise ValueError( + "Imputation not possible when missing_values " + "== 0 and input is sparse. Provide a dense " + "array instead." + ) + + if self.strategy == "constant": + if in_fit and self.fill_value is not None: + fill_value_dtype = type(self.fill_value) + err_msg = ( + f"fill_value={self.fill_value!r} (of type {fill_value_dtype!r}) " + f"cannot be cast to the input data that is {X.dtype!r}. " + "If fill_value is a Python scalar, instead pass a numpy scalar " + "(e.g. fill_value=np.uint8(0) if your data is of type np.uint8). " + "Make sure that both dtypes are of the same kind." 
+ ) + elif not in_fit: + fill_value_dtype = self.statistics_.dtype + err_msg = ( + f"The dtype of the filling value (i.e. {fill_value_dtype!r}) " + f"cannot be cast to the input data that is {X.dtype!r}. " + "Make sure that the dtypes of the input data are of the same kind " + "between fit and transform." + ) + else: + # By default, fill_value=None, and the replacement is always + # compatible with the input data + fill_value_dtype = X.dtype + + # Make sure we can safely cast fill_value dtype to the input data dtype + if not np.can_cast(fill_value_dtype, X.dtype, casting="same_kind"): + raise ValueError(err_msg) + + return X + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit the imputer on `X`. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Input data, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + self : object + Fitted estimator. + """ + X = self._validate_input(X, in_fit=True) + + # default fill_value is 0 for numerical input and "missing_value" + # otherwise + if self.fill_value is None: + if X.dtype.kind in ("i", "u", "f"): + fill_value = 0 + else: + fill_value = "missing_value" + else: + fill_value = self.fill_value + + if sp.issparse(X): + self.statistics_ = self._sparse_fit( + X, self.strategy, self.missing_values, fill_value + ) + else: + self.statistics_ = self._dense_fit( + X, self.strategy, self.missing_values, fill_value + ) + + return self + + def _sparse_fit(self, X, strategy, missing_values, fill_value): + """Fit the transformer on sparse data.""" + missing_mask = _get_mask(X, missing_values) + mask_data = missing_mask.data + n_implicit_zeros = X.shape[0] - np.diff(X.indptr) + + statistics = np.empty(X.shape[1]) + + if strategy == "constant": + # TODO(1.8): Remove FutureWarning and add `np.nan` as a statistic + # for empty features to drop them later. + if not self.keep_empty_features and any( + [all(missing_mask[:, i].data) for i in range(missing_mask.shape[1])] + ): + warnings.warn( + "Currently, when `keep_empty_feature=False` and " + '`strategy="constant"`, empty features are not dropped. ' + "This behaviour will change in version 1.8. Set " + "`keep_empty_feature=True` to preserve this behaviour.", + FutureWarning, + ) + + # for constant strategy, self.statistics_ is used to store + # fill_value in each column + statistics.fill(fill_value) + else: + for i in range(X.shape[1]): + column = X.data[X.indptr[i] : X.indptr[i + 1]] + mask_column = mask_data[X.indptr[i] : X.indptr[i + 1]] + column = column[~mask_column] + + # combine explicit and implicit zeros + mask_zeros = _get_mask(column, 0) + column = column[~mask_zeros] + n_explicit_zeros = mask_zeros.sum() + n_zeros = n_implicit_zeros[i] + n_explicit_zeros + + if len(column) == 0 and self.keep_empty_features: + # in case we want to keep columns with only missing values. 
+ statistics[i] = 0 + else: + if strategy == "mean": + s = column.size + n_zeros + statistics[i] = np.nan if s == 0 else column.sum() / s + + elif strategy == "median": + statistics[i] = _get_median(column, n_zeros) + + elif strategy == "most_frequent": + statistics[i] = _most_frequent(column, 0, n_zeros) + + elif isinstance(strategy, Callable): + statistics[i] = self.strategy(column) + + super()._fit_indicator(missing_mask) + + return statistics + + def _dense_fit(self, X, strategy, missing_values, fill_value): + """Fit the transformer on dense data.""" + missing_mask = _get_mask(X, missing_values) + masked_X = ma.masked_array(X, mask=missing_mask) + + super()._fit_indicator(missing_mask) + + # Mean + if strategy == "mean": + mean_masked = np.ma.mean(masked_X, axis=0) + # Avoid the warning "Warning: converting a masked element to nan." + mean = np.ma.getdata(mean_masked) + mean[np.ma.getmask(mean_masked)] = 0 if self.keep_empty_features else np.nan + + return mean + + # Median + elif strategy == "median": + median_masked = np.ma.median(masked_X, axis=0) + # Avoid the warning "Warning: converting a masked element to nan." + median = np.ma.getdata(median_masked) + median[np.ma.getmaskarray(median_masked)] = ( + 0 if self.keep_empty_features else np.nan + ) + + return median + + # Most frequent + elif strategy == "most_frequent": + # Avoid use of scipy.stats.mstats.mode due to the required + # additional overhead and slow benchmarking performance. + # See Issue 14325 and PR 14399 for full discussion. + + # To be able access the elements by columns + X = X.transpose() + mask = missing_mask.transpose() + + if X.dtype.kind == "O": + most_frequent = np.empty(X.shape[0], dtype=object) + else: + most_frequent = np.empty(X.shape[0]) + + for i, (row, row_mask) in enumerate(zip(X[:], mask[:])): + row_mask = np.logical_not(row_mask).astype(bool) + row = row[row_mask] + if len(row) == 0 and self.keep_empty_features: + most_frequent[i] = 0 + else: + most_frequent[i] = _most_frequent(row, np.nan, 0) + + return most_frequent + + # Constant + elif strategy == "constant": + # TODO(1.8): Remove FutureWarning and add `np.nan` as a statistic + # for empty features to drop them later. + if not self.keep_empty_features and ma.getmask(masked_X).all(axis=0).any(): + warnings.warn( + "Currently, when `keep_empty_feature=False` and " + '`strategy="constant"`, empty features are not dropped. ' + "This behaviour will change in version 1.8. Set " + "`keep_empty_feature=True` to preserve this behaviour.", + FutureWarning, + ) + + # for constant strategy, self.statistcs_ is used to store + # fill_value in each column + return np.full(X.shape[1], fill_value, dtype=X.dtype) + + # Custom + elif isinstance(strategy, Callable): + statistics = np.empty(masked_X.shape[1]) + for i in range(masked_X.shape[1]): + statistics[i] = self.strategy(masked_X[:, i].compressed()) + return statistics + + def transform(self, X): + """Impute all missing values in `X`. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The input data to complete. + + Returns + ------- + X_imputed : {ndarray, sparse matrix} of shape \ + (n_samples, n_features_out) + `X` with imputed values. 
+ """ + check_is_fitted(self) + + X = self._validate_input(X, in_fit=False) + statistics = self.statistics_ + + if X.shape[1] != statistics.shape[0]: + raise ValueError( + "X has %d features per sample, expected %d" + % (X.shape[1], self.statistics_.shape[0]) + ) + + # compute mask before eliminating invalid features + missing_mask = _get_mask(X, self.missing_values) + + # Decide whether to keep missing features + if self.strategy == "constant" or self.keep_empty_features: + valid_statistics = statistics + valid_statistics_indexes = None + else: + # same as np.isnan but also works for object dtypes + invalid_mask = _get_mask(statistics, np.nan) + valid_mask = np.logical_not(invalid_mask) + valid_statistics = statistics[valid_mask] + valid_statistics_indexes = np.flatnonzero(valid_mask) + + if invalid_mask.any(): + invalid_features = np.arange(X.shape[1])[invalid_mask] + # use feature names warning if features are provided + if hasattr(self, "feature_names_in_"): + invalid_features = self.feature_names_in_[invalid_features] + warnings.warn( + "Skipping features without any observed values:" + f" {invalid_features}. At least one non-missing value is needed" + f" for imputation with strategy='{self.strategy}'." + ) + X = X[:, valid_statistics_indexes] + + # Do actual imputation + if sp.issparse(X): + if self.missing_values == 0: + raise ValueError( + "Imputation not possible when missing_values " + "== 0 and input is sparse. Provide a dense " + "array instead." + ) + else: + # if no invalid statistics are found, use the mask computed + # before, else recompute mask + if valid_statistics_indexes is None: + mask = missing_mask.data + else: + mask = _get_mask(X.data, self.missing_values) + indexes = np.repeat( + np.arange(len(X.indptr) - 1, dtype=int), np.diff(X.indptr) + )[mask] + + X.data[mask] = valid_statistics[indexes].astype(X.dtype, copy=False) + else: + # use mask computed before eliminating invalid mask + if valid_statistics_indexes is None: + mask_valid_features = missing_mask + else: + mask_valid_features = missing_mask[:, valid_statistics_indexes] + n_missing = np.sum(mask_valid_features, axis=0) + values = np.repeat(valid_statistics, n_missing) + coordinates = np.where(mask_valid_features.transpose())[::-1] + + X[coordinates] = values + + X_indicator = super()._transform_indicator(missing_mask) + + return super()._concatenate_indicator(X, X_indicator) + + def inverse_transform(self, X): + """Convert the data back to the original representation. + + Inverts the `transform` operation performed on an array. + This operation can only be performed after :class:`SimpleImputer` is + instantiated with `add_indicator=True`. + + Note that `inverse_transform` can only invert the transform in + features that have binary indicators for missing values. If a feature + has no missing values at `fit` time, the feature won't have a binary + indicator, and the imputation done at `transform` time won't be + inverted. + + .. versionadded:: 0.24 + + Parameters + ---------- + X : array-like of shape \ + (n_samples, n_features + n_features_missing_indicator) + The imputed data to be reverted to original data. It has to be + an augmented array of imputed data and the missing indicator mask. + + Returns + ------- + X_original : ndarray of shape (n_samples, n_features) + The original `X` with missing values as it was prior + to imputation. 
+ """ + check_is_fitted(self) + + if not self.add_indicator: + raise ValueError( + "'inverse_transform' works only when " + "'SimpleImputer' is instantiated with " + "'add_indicator=True'. " + f"Got 'add_indicator={self.add_indicator}' " + "instead." + ) + + n_features_missing = len(self.indicator_.features_) + non_empty_feature_count = X.shape[1] - n_features_missing + array_imputed = X[:, :non_empty_feature_count].copy() + missing_mask = X[:, non_empty_feature_count:].astype(bool) + + n_features_original = len(self.statistics_) + shape_original = (X.shape[0], n_features_original) + X_original = np.zeros(shape_original) + X_original[:, self.indicator_.features_] = missing_mask + full_mask = X_original.astype(bool) + + imputed_idx, original_idx = 0, 0 + while imputed_idx < len(array_imputed.T): + if not np.all(X_original[:, original_idx]): + X_original[:, original_idx] = array_imputed.T[imputed_idx] + imputed_idx += 1 + original_idx += 1 + else: + original_idx += 1 + + X_original[full_mask] = self.missing_values + return X_original + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + tags.input_tags.allow_nan = is_pandas_na(self.missing_values) or is_scalar_nan( + self.missing_values + ) + return tags + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. + + - If `input_features` is `None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then the following input feature names are generated: + `["x0", "x1", ..., "x(n_features_in_ - 1)"]`. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. + """ + check_is_fitted(self, "n_features_in_") + input_features = _check_feature_names_in(self, input_features) + non_missing_mask = np.logical_not(_get_mask(self.statistics_, np.nan)) + names = input_features[non_missing_mask] + return self._concatenate_indicator_feature_names_out(names, input_features) + + +class MissingIndicator(TransformerMixin, BaseEstimator): + """Binary indicators for missing values. + + Note that this component typically should not be used in a vanilla + :class:`~sklearn.pipeline.Pipeline` consisting of transformers and a + classifier, but rather could be added using a + :class:`~sklearn.pipeline.FeatureUnion` or + :class:`~sklearn.compose.ColumnTransformer`. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.20 + + Parameters + ---------- + missing_values : int, float, str, np.nan or None, default=np.nan + The placeholder for the missing values. All occurrences of + `missing_values` will be imputed. For pandas' dataframes with + nullable integer dtypes with missing values, `missing_values` + should be set to `np.nan`, since `pd.NA` will be converted to `np.nan`. + + features : {'missing-only', 'all'}, default='missing-only' + Whether the imputer mask should represent all or a subset of + features. + + - If `'missing-only'` (default), the imputer mask will only represent + features containing missing values during fit time. + - If `'all'`, the imputer mask will represent all features. + + sparse : bool or 'auto', default='auto' + Whether the imputer mask format should be sparse or dense. 
+ + - If `'auto'` (default), the imputer mask will be of same type as + input. + - If `True`, the imputer mask will be a sparse matrix. + - If `False`, the imputer mask will be a numpy array. + + error_on_new : bool, default=True + If `True`, :meth:`transform` will raise an error when there are + features with missing values that have no missing values in + :meth:`fit`. This is applicable only when `features='missing-only'`. + + Attributes + ---------- + features_ : ndarray of shape (n_missing_features,) or (n_features,) + The features indices which will be returned when calling + :meth:`transform`. They are computed during :meth:`fit`. If + `features='all'`, `features_` is equal to `range(n_features)`. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + SimpleImputer : Univariate imputation of missing values. + IterativeImputer : Multivariate imputation of missing values. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.impute import MissingIndicator + >>> X1 = np.array([[np.nan, 1, 3], + ... [4, 0, np.nan], + ... [8, 1, 0]]) + >>> X2 = np.array([[5, 1, np.nan], + ... [np.nan, 2, 3], + ... [2, 4, 0]]) + >>> indicator = MissingIndicator() + >>> indicator.fit(X1) + MissingIndicator() + >>> X2_tr = indicator.transform(X2) + >>> X2_tr + array([[False, True], + [ True, False], + [False, False]]) + """ + + _parameter_constraints: dict = { + "missing_values": [MissingValues()], + "features": [StrOptions({"missing-only", "all"})], + "sparse": ["boolean", StrOptions({"auto"})], + "error_on_new": ["boolean"], + } + + def __init__( + self, + *, + missing_values=np.nan, + features="missing-only", + sparse="auto", + error_on_new=True, + ): + self.missing_values = missing_values + self.features = features + self.sparse = sparse + self.error_on_new = error_on_new + + def _get_missing_features_info(self, X): + """Compute the imputer mask and the indices of the features + containing missing values. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + The input data with missing values. Note that `X` has been + checked in :meth:`fit` and :meth:`transform` before to call this + function. + + Returns + ------- + imputer_mask : {ndarray, sparse matrix} of shape \ + (n_samples, n_features) + The imputer mask of the original data. + + features_with_missing : ndarray of shape (n_features_with_missing) + The features containing missing values. + """ + if not self._precomputed: + imputer_mask = _get_mask(X, self.missing_values) + else: + imputer_mask = X + + if sp.issparse(X): + imputer_mask.eliminate_zeros() + + if self.features == "missing-only": + # count number of True values in each row. 
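+            # i.e. the number of missing entries per feature (axis=0 sums over samples)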
+ n_missing = imputer_mask.sum(axis=0) + + if self.sparse is False: + imputer_mask = imputer_mask.toarray() + elif imputer_mask.format == "csr": + imputer_mask = imputer_mask.tocsc() + else: + if not self._precomputed: + imputer_mask = _get_mask(X, self.missing_values) + else: + imputer_mask = X + + if self.features == "missing-only": + n_missing = imputer_mask.sum(axis=0) + + if self.sparse is True: + imputer_mask = sp.csc_matrix(imputer_mask) + + if self.features == "all": + features_indices = np.arange(X.shape[1]) + else: + features_indices = np.flatnonzero(n_missing) + + return imputer_mask, features_indices + + def _validate_input(self, X, in_fit): + if not is_scalar_nan(self.missing_values): + ensure_all_finite = True + else: + ensure_all_finite = "allow-nan" + X = validate_data( + self, + X, + reset=in_fit, + accept_sparse=("csc", "csr"), + dtype=None, + ensure_all_finite=ensure_all_finite, + ) + _check_inputs_dtype(X, self.missing_values) + if X.dtype.kind not in ("i", "u", "f", "O"): + raise ValueError( + "MissingIndicator does not support data with " + "dtype {0}. Please provide either a numeric array" + " (with a floating point or integer dtype) or " + "categorical data represented either as an array " + "with integer dtype or an array of string values " + "with an object dtype.".format(X.dtype) + ) + + if sp.issparse(X) and self.missing_values == 0: + # missing_values = 0 not allowed with sparse data as it would + # force densification + raise ValueError( + "Sparse input with missing_values=0 is " + "not supported. Provide a dense " + "array instead." + ) + + return X + + def _fit(self, X, y=None, precomputed=False): + """Fit the transformer on `X`. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data, where `n_samples` is the number of samples and + `n_features` is the number of features. + If `precomputed=True`, then `X` is a mask of the input data. + + precomputed : bool + Whether the input data is a mask. + + Returns + ------- + imputer_mask : {ndarray, sparse matrix} of shape (n_samples, \ + n_features) + The imputer mask of the original data. + """ + if precomputed: + if not (hasattr(X, "dtype") and X.dtype.kind == "b"): + raise ValueError("precomputed is True but the input data is not a mask") + self._precomputed = True + else: + self._precomputed = False + + # Need not validate X again as it would have already been validated + # in the Imputer calling MissingIndicator + if not self._precomputed: + X = self._validate_input(X, in_fit=True) + else: + # only create `n_features_in_` in the precomputed case + _check_n_features(self, X, reset=True) + + self._n_features = X.shape[1] + + missing_features_info = self._get_missing_features_info(X) + self.features_ = missing_features_info[1] + + return missing_features_info[0] + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit the transformer on `X`. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Fitted estimator. + """ + self._fit(X, y) + + return self + + def transform(self, X): + """Generate missing values indicator for `X`. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input data to complete. 
+ + Returns + ------- + Xt : {ndarray, sparse matrix} of shape (n_samples, n_features) \ + or (n_samples, n_features_with_missing) + The missing indicator for input data. The data type of `Xt` + will be boolean. + """ + check_is_fitted(self) + + # Need not validate X again as it would have already been validated + # in the Imputer calling MissingIndicator + if not self._precomputed: + X = self._validate_input(X, in_fit=False) + else: + if not (hasattr(X, "dtype") and X.dtype.kind == "b"): + raise ValueError("precomputed is True but the input data is not a mask") + + imputer_mask, features = self._get_missing_features_info(X) + + if self.features == "missing-only": + features_diff_fit_trans = np.setdiff1d(features, self.features_) + if self.error_on_new and features_diff_fit_trans.size > 0: + raise ValueError( + "The features {} have missing values " + "in transform but have no missing values " + "in fit.".format(features_diff_fit_trans) + ) + + if self.features_.size < self._n_features: + imputer_mask = imputer_mask[:, self.features_] + + return imputer_mask + + @_fit_context(prefer_skip_nested_validation=True) + def fit_transform(self, X, y=None): + """Generate missing values indicator for `X`. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input data to complete. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + Xt : {ndarray, sparse matrix} of shape (n_samples, n_features) \ + or (n_samples, n_features_with_missing) + The missing indicator for input data. The data type of `Xt` + will be boolean. + """ + imputer_mask = self._fit(X, y) + + if self.features_.size < self._n_features: + imputer_mask = imputer_mask[:, self.features_] + + return imputer_mask + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. + + - If `input_features` is `None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then the following input feature names are generated: + `["x0", "x1", ..., "x(n_features_in_ - 1)"]`. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. 
+ """ + check_is_fitted(self, "n_features_in_") + input_features = _check_feature_names_in(self, input_features) + prefix = self.__class__.__name__.lower() + return np.asarray( + [ + f"{prefix}_{feature_name}" + for feature_name in input_features[self.features_] + ], + dtype=object, + ) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + tags.input_tags.string = True + tags.input_tags.sparse = True + tags.transformer_tags.preserves_dtype = [] + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/impute/_iterative.py b/.venv/lib/python3.12/site-packages/sklearn/impute/_iterative.py new file mode 100644 index 0000000000000000000000000000000000000000..ddae5373c5460891467d50fd7105473031d957b4 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/impute/_iterative.py @@ -0,0 +1,1030 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from collections import namedtuple +from numbers import Integral, Real +from time import time + +import numpy as np +from scipy import stats + +from ..base import _fit_context, clone +from ..exceptions import ConvergenceWarning +from ..preprocessing import normalize +from ..utils import _safe_indexing, check_array, check_random_state +from ..utils._indexing import _safe_assign +from ..utils._mask import _get_mask +from ..utils._missing import is_scalar_nan +from ..utils._param_validation import HasMethods, Interval, StrOptions +from ..utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + _raise_for_params, + process_routing, +) +from ..utils.validation import ( + FLOAT_DTYPES, + _check_feature_names_in, + _num_samples, + check_is_fitted, + validate_data, +) +from ._base import SimpleImputer, _BaseImputer, _check_inputs_dtype + +_ImputerTriplet = namedtuple( + "_ImputerTriplet", ["feat_idx", "neighbor_feat_idx", "estimator"] +) + + +def _assign_where(X1, X2, cond): + """Assign X2 to X1 where cond is True. + + Parameters + ---------- + X1 : ndarray or dataframe of shape (n_samples, n_features) + Data. + + X2 : ndarray of shape (n_samples, n_features) + Data to be assigned. + + cond : ndarray of shape (n_samples, n_features) + Boolean mask to assign data. + """ + if hasattr(X1, "mask"): # pandas dataframes + X1.mask(cond=cond, other=X2, inplace=True) + else: # ndarrays + X1[cond] = X2[cond] + + +class IterativeImputer(_BaseImputer): + """Multivariate imputer that estimates each feature from all the others. + + A strategy for imputing missing values by modeling each feature with + missing values as a function of other features in a round-robin fashion. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.21 + + .. note:: + + This estimator is still **experimental** for now: the predictions + and the API might change without any deprecation cycle. To use it, + you need to explicitly import `enable_iterative_imputer`:: + + >>> # explicitly require this experimental feature + >>> from sklearn.experimental import enable_iterative_imputer # noqa + >>> # now you can import normally from sklearn.impute + >>> from sklearn.impute import IterativeImputer + + Parameters + ---------- + estimator : estimator object, default=BayesianRidge() + The estimator to use at each step of the round-robin imputation. + If `sample_posterior=True`, the estimator must support + `return_std` in its `predict` method. + + missing_values : int or np.nan, default=np.nan + The placeholder for the missing values. 
All occurrences of + `missing_values` will be imputed. For pandas' dataframes with + nullable integer dtypes with missing values, `missing_values` + should be set to `np.nan`, since `pd.NA` will be converted to `np.nan`. + + sample_posterior : bool, default=False + Whether to sample from the (Gaussian) predictive posterior of the + fitted estimator for each imputation. Estimator must support + `return_std` in its `predict` method if set to `True`. Set to + `True` if using `IterativeImputer` for multiple imputations. + + max_iter : int, default=10 + Maximum number of imputation rounds to perform before returning the + imputations computed during the final round. A round is a single + imputation of each feature with missing values. The stopping criterion + is met once `max(abs(X_t - X_{t-1}))/max(abs(X[known_vals])) < tol`, + where `X_t` is `X` at iteration `t`. Note that early stopping is only + applied if `sample_posterior=False`. + + tol : float, default=1e-3 + Tolerance of the stopping condition. + + n_nearest_features : int, default=None + Number of other features to use to estimate the missing values of + each feature column. Nearness between features is measured using + the absolute correlation coefficient between each feature pair (after + initial imputation). To ensure coverage of features throughout the + imputation process, the neighbor features are not necessarily nearest, + but are drawn with probability proportional to correlation for each + imputed target feature. Can provide significant speed-up when the + number of features is huge. If `None`, all features will be used. + + initial_strategy : {'mean', 'median', 'most_frequent', 'constant'}, \ + default='mean' + Which strategy to use to initialize the missing values. Same as the + `strategy` parameter in :class:`~sklearn.impute.SimpleImputer`. + + fill_value : str or numerical value, default=None + When `strategy="constant"`, `fill_value` is used to replace all + occurrences of missing_values. For string or object data types, + `fill_value` must be a string. + If `None`, `fill_value` will be 0 when imputing numerical + data and "missing_value" for strings or object data types. + + .. versionadded:: 1.3 + + imputation_order : {'ascending', 'descending', 'roman', 'arabic', \ + 'random'}, default='ascending' + The order in which the features will be imputed. Possible values: + + - `'ascending'`: From features with fewest missing values to most. + - `'descending'`: From features with most missing values to fewest. + - `'roman'`: Left to right. + - `'arabic'`: Right to left. + - `'random'`: A random order for each round. + + skip_complete : bool, default=False + If `True` then features with missing values during :meth:`transform` + which did not have any missing values during :meth:`fit` will be + imputed with the initial imputation method only. Set to `True` if you + have many features with no missing values at both :meth:`fit` and + :meth:`transform` time to save compute. + + min_value : float or array-like of shape (n_features,), default=-np.inf + Minimum possible imputed value. Broadcast to shape `(n_features,)` if + scalar. If array-like, expects shape `(n_features,)`, one min value for + each feature. The default is `-np.inf`. + + .. versionchanged:: 0.23 + Added support for array-like. + + max_value : float or array-like of shape (n_features,), default=np.inf + Maximum possible imputed value. Broadcast to shape `(n_features,)` if + scalar. If array-like, expects shape `(n_features,)`, one max value for + each feature. 
The default is `np.inf`. + + .. versionchanged:: 0.23 + Added support for array-like. + + verbose : int, default=0 + Verbosity flag, controls the debug messages that are issued + as functions are evaluated. The higher, the more verbose. Can be 0, 1, + or 2. + + random_state : int, RandomState instance or None, default=None + The seed of the pseudo random number generator to use. Randomizes + selection of estimator features if `n_nearest_features` is not `None`, + the `imputation_order` if `random`, and the sampling from posterior if + `sample_posterior=True`. Use an integer for determinism. + See :term:`the Glossary `. + + add_indicator : bool, default=False + If `True`, a :class:`MissingIndicator` transform will stack onto output + of the imputer's transform. This allows a predictive estimator + to account for missingness despite imputation. If a feature has no + missing values at fit/train time, the feature won't appear on + the missing indicator even if there are missing values at + transform/test time. + + keep_empty_features : bool, default=False + If True, features that consist exclusively of missing values when + `fit` is called are returned in results when `transform` is called. + The imputed value is always `0` except when + `initial_strategy="constant"` in which case `fill_value` will be + used instead. + + .. versionadded:: 1.2 + + Attributes + ---------- + initial_imputer_ : object of type :class:`~sklearn.impute.SimpleImputer` + Imputer used to initialize the missing values. + + imputation_sequence_ : list of tuples + Each tuple has `(feat_idx, neighbor_feat_idx, estimator)`, where + `feat_idx` is the current feature to be imputed, + `neighbor_feat_idx` is the array of other features used to impute the + current feature, and `estimator` is the trained estimator used for + the imputation. Length is `self.n_features_with_missing_ * + self.n_iter_`. + + n_iter_ : int + Number of iteration rounds that occurred. Will be less than + `self.max_iter` if early stopping criterion was reached. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_features_with_missing_ : int + Number of features with missing values. + + indicator_ : :class:`~sklearn.impute.MissingIndicator` + Indicator used to add binary indicators for missing values. + `None` if `add_indicator=False`. + + random_state_ : RandomState instance + RandomState instance that is generated either from a seed, the random + number generator or by `np.random`. + + See Also + -------- + SimpleImputer : Univariate imputer for completing missing values + with simple strategies. + KNNImputer : Multivariate imputer that estimates missing features using + nearest samples. + + Notes + ----- + To support imputation in inductive mode we store each feature's estimator + during the :meth:`fit` phase, and predict without refitting (in order) + during the :meth:`transform` phase. + + Features which contain all missing values at :meth:`fit` are discarded upon + :meth:`transform`. + + Using defaults, the imputer scales in :math:`\\mathcal{O}(knp^3\\min(n,p))` + where :math:`k` = `max_iter`, :math:`n` the number of samples and + :math:`p` the number of features. It thus becomes prohibitively costly when + the number of features increases. 
Setting + `n_nearest_features << n_features`, `skip_complete=True` or increasing `tol` + can help to reduce its computational cost. + + Depending on the nature of missing values, simple imputers can be + preferable in a prediction context. + + References + ---------- + .. [1] `Stef van Buuren, Karin Groothuis-Oudshoorn (2011). "mice: + Multivariate Imputation by Chained Equations in R". Journal of + Statistical Software 45: 1-67. + `_ + + .. [2] `S. F. Buck, (1960). "A Method of Estimation of Missing Values in + Multivariate Data Suitable for use with an Electronic Computer". + Journal of the Royal Statistical Society 22(2): 302-306. + `_ + + Examples + -------- + >>> import numpy as np + >>> from sklearn.experimental import enable_iterative_imputer + >>> from sklearn.impute import IterativeImputer + >>> imp_mean = IterativeImputer(random_state=0) + >>> imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]]) + IterativeImputer(random_state=0) + >>> X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]] + >>> imp_mean.transform(X) + array([[ 6.9584, 2. , 3. ], + [ 4. , 2.6000, 6. ], + [10. , 4.9999, 9. ]]) + + For a more detailed example see + :ref:`sphx_glr_auto_examples_impute_plot_missing_values.py` or + :ref:`sphx_glr_auto_examples_impute_plot_iterative_imputer_variants_comparison.py`. + """ + + _parameter_constraints: dict = { + **_BaseImputer._parameter_constraints, + "estimator": [None, HasMethods(["fit", "predict"])], + "sample_posterior": ["boolean"], + "max_iter": [Interval(Integral, 0, None, closed="left")], + "tol": [Interval(Real, 0, None, closed="left")], + "n_nearest_features": [None, Interval(Integral, 1, None, closed="left")], + "initial_strategy": [ + StrOptions({"mean", "median", "most_frequent", "constant"}) + ], + "fill_value": "no_validation", # any object is valid + "imputation_order": [ + StrOptions({"ascending", "descending", "roman", "arabic", "random"}) + ], + "skip_complete": ["boolean"], + "min_value": [None, Interval(Real, None, None, closed="both"), "array-like"], + "max_value": [None, Interval(Real, None, None, closed="both"), "array-like"], + "verbose": ["verbose"], + "random_state": ["random_state"], + } + + def __init__( + self, + estimator=None, + *, + missing_values=np.nan, + sample_posterior=False, + max_iter=10, + tol=1e-3, + n_nearest_features=None, + initial_strategy="mean", + fill_value=None, + imputation_order="ascending", + skip_complete=False, + min_value=-np.inf, + max_value=np.inf, + verbose=0, + random_state=None, + add_indicator=False, + keep_empty_features=False, + ): + super().__init__( + missing_values=missing_values, + add_indicator=add_indicator, + keep_empty_features=keep_empty_features, + ) + + self.estimator = estimator + self.sample_posterior = sample_posterior + self.max_iter = max_iter + self.tol = tol + self.n_nearest_features = n_nearest_features + self.initial_strategy = initial_strategy + self.fill_value = fill_value + self.imputation_order = imputation_order + self.skip_complete = skip_complete + self.min_value = min_value + self.max_value = max_value + self.verbose = verbose + self.random_state = random_state + + def _impute_one_feature( + self, + X_filled, + mask_missing_values, + feat_idx, + neighbor_feat_idx, + estimator=None, + fit_mode=True, + params=None, + ): + """Impute a single feature from the others provided. + + This function predicts the missing values of one of the features using + the current estimates of all the other features. 
The `estimator` must + support `return_std=True` in its `predict` method for this function + to work. + + Parameters + ---------- + X_filled : ndarray + Input data with the most recent imputations. + + mask_missing_values : ndarray + Input data's missing indicator matrix. + + feat_idx : int + Index of the feature currently being imputed. + + neighbor_feat_idx : ndarray + Indices of the features to be used in imputing `feat_idx`. + + estimator : object + The estimator to use at this step of the round-robin imputation. + If `sample_posterior=True`, the estimator must support + `return_std` in its `predict` method. + If None, it will be cloned from self._estimator. + + fit_mode : boolean, default=True + Whether to fit and predict with the estimator or just predict. + + params : dict + Additional params routed to the individual estimator. + + Returns + ------- + X_filled : ndarray + Input data with `X_filled[missing_row_mask, feat_idx]` updated. + + estimator : estimator with sklearn API + The fitted estimator used to impute + `X_filled[missing_row_mask, feat_idx]`. + """ + if estimator is None and fit_mode is False: + raise ValueError( + "If fit_mode is False, then an already-fitted " + "estimator should be passed in." + ) + + if estimator is None: + estimator = clone(self._estimator) + + missing_row_mask = mask_missing_values[:, feat_idx] + if fit_mode: + X_train = _safe_indexing( + _safe_indexing(X_filled, neighbor_feat_idx, axis=1), + ~missing_row_mask, + axis=0, + ) + y_train = _safe_indexing( + _safe_indexing(X_filled, feat_idx, axis=1), + ~missing_row_mask, + axis=0, + ) + estimator.fit(X_train, y_train, **params) + + # if no missing values, don't predict + if np.sum(missing_row_mask) == 0: + return X_filled, estimator + + # get posterior samples if there is at least one missing value + X_test = _safe_indexing( + _safe_indexing(X_filled, neighbor_feat_idx, axis=1), + missing_row_mask, + axis=0, + ) + if self.sample_posterior: + mus, sigmas = estimator.predict(X_test, return_std=True) + imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype) + # two types of problems: (1) non-positive sigmas + # (2) mus outside legal range of min_value and max_value + # (results in inf sample) + positive_sigmas = sigmas > 0 + imputed_values[~positive_sigmas] = mus[~positive_sigmas] + mus_too_low = mus < self._min_value[feat_idx] + imputed_values[mus_too_low] = self._min_value[feat_idx] + mus_too_high = mus > self._max_value[feat_idx] + imputed_values[mus_too_high] = self._max_value[feat_idx] + # the rest can be sampled without statistical issues + inrange_mask = positive_sigmas & ~mus_too_low & ~mus_too_high + mus = mus[inrange_mask] + sigmas = sigmas[inrange_mask] + a = (self._min_value[feat_idx] - mus) / sigmas + b = (self._max_value[feat_idx] - mus) / sigmas + + truncated_normal = stats.truncnorm(a=a, b=b, loc=mus, scale=sigmas) + imputed_values[inrange_mask] = truncated_normal.rvs( + random_state=self.random_state_ + ) + else: + imputed_values = estimator.predict(X_test) + imputed_values = np.clip( + imputed_values, self._min_value[feat_idx], self._max_value[feat_idx] + ) + + # update the feature + _safe_assign( + X_filled, + imputed_values, + row_indexer=missing_row_mask, + column_indexer=feat_idx, + ) + return X_filled, estimator + + def _get_neighbor_feat_idx(self, n_features, feat_idx, abs_corr_mat): + """Get a list of other features to predict `feat_idx`. 
+ + If `self.n_nearest_features` is less than or equal to the total + number of features, then use a probability proportional to the absolute + correlation between `feat_idx` and each other feature to randomly + choose a subsample of the other features (without replacement). + + Parameters + ---------- + n_features : int + Number of features in `X`. + + feat_idx : int + Index of the feature currently being imputed. + + abs_corr_mat : ndarray, shape (n_features, n_features) + Absolute correlation matrix of `X`. The diagonal has been zeroed + out and each feature has been normalized to sum to 1. Can be None. + + Returns + ------- + neighbor_feat_idx : array-like + The features to use to impute `feat_idx`. + """ + if self.n_nearest_features is not None and self.n_nearest_features < n_features: + p = abs_corr_mat[:, feat_idx] + neighbor_feat_idx = self.random_state_.choice( + np.arange(n_features), self.n_nearest_features, replace=False, p=p + ) + else: + inds_left = np.arange(feat_idx) + inds_right = np.arange(feat_idx + 1, n_features) + neighbor_feat_idx = np.concatenate((inds_left, inds_right)) + return neighbor_feat_idx + + def _get_ordered_idx(self, mask_missing_values): + """Decide in what order we will update the features. + + As a homage to the MICE R package, we will have 4 main options of + how to order the updates, and use a random order if anything else + is specified. + + Also, this function skips features which have no missing values. + + Parameters + ---------- + mask_missing_values : array-like, shape (n_samples, n_features) + Input data's missing indicator matrix, where `n_samples` is the + number of samples and `n_features` is the number of features. + + Returns + ------- + ordered_idx : ndarray, shape (n_features,) + The order in which to impute the features. + """ + frac_of_missing_values = mask_missing_values.mean(axis=0) + if self.skip_complete: + missing_values_idx = np.flatnonzero(frac_of_missing_values) + else: + missing_values_idx = np.arange(np.shape(frac_of_missing_values)[0]) + if self.imputation_order == "roman": + ordered_idx = missing_values_idx + elif self.imputation_order == "arabic": + ordered_idx = missing_values_idx[::-1] + elif self.imputation_order == "ascending": + n = len(frac_of_missing_values) - len(missing_values_idx) + ordered_idx = np.argsort(frac_of_missing_values, kind="mergesort")[n:] + elif self.imputation_order == "descending": + n = len(frac_of_missing_values) - len(missing_values_idx) + ordered_idx = np.argsort(frac_of_missing_values, kind="mergesort")[n:][::-1] + elif self.imputation_order == "random": + ordered_idx = missing_values_idx + self.random_state_.shuffle(ordered_idx) + return ordered_idx + + def _get_abs_corr_mat(self, X_filled, tolerance=1e-6): + """Get absolute correlation matrix between features. + + Parameters + ---------- + X_filled : ndarray, shape (n_samples, n_features) + Input data with the most recent imputations. + + tolerance : float, default=1e-6 + `abs_corr_mat` can have nans, which will be replaced + with `tolerance`. + + Returns + ------- + abs_corr_mat : ndarray, shape (n_features, n_features) + Absolute correlation matrix of `X` at the beginning of the + current round. The diagonal has been zeroed out and each feature's + absolute correlations with all others have been normalized to sum + to 1. 
+ """ + n_features = X_filled.shape[1] + if self.n_nearest_features is None or self.n_nearest_features >= n_features: + return None + with np.errstate(invalid="ignore"): + # if a feature in the neighborhood has only a single value + # (e.g., categorical feature), the std. dev. will be null and + # np.corrcoef will raise a warning due to a division by zero + abs_corr_mat = np.abs(np.corrcoef(X_filled.T)) + # np.corrcoef is not defined for features with zero std + abs_corr_mat[np.isnan(abs_corr_mat)] = tolerance + # ensures exploration, i.e. at least some probability of sampling + np.clip(abs_corr_mat, tolerance, None, out=abs_corr_mat) + # features are not their own neighbors + np.fill_diagonal(abs_corr_mat, 0) + # needs to sum to 1 for np.random.choice sampling + abs_corr_mat = normalize(abs_corr_mat, norm="l1", axis=0, copy=False) + return abs_corr_mat + + def _initial_imputation(self, X, in_fit=False): + """Perform initial imputation for input `X`. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Input data, where `n_samples` is the number of samples and + `n_features` is the number of features. + + in_fit : bool, default=False + Whether function is called in :meth:`fit`. + + Returns + ------- + Xt : ndarray of shape (n_samples, n_features) + Input data, where `n_samples` is the number of samples and + `n_features` is the number of features. + + X_filled : ndarray of shape (n_samples, n_features) + Input data with the most recent imputations. + + mask_missing_values : ndarray of shape (n_samples, n_features) + Input data's missing indicator matrix, where `n_samples` is the + number of samples and `n_features` is the number of features, + masked by non-missing features. + + X_missing_mask : ndarray, shape (n_samples, n_features) + Input data's mask matrix indicating missing datapoints, where + `n_samples` is the number of samples and `n_features` is the + number of features. + """ + if is_scalar_nan(self.missing_values): + ensure_all_finite = "allow-nan" + else: + ensure_all_finite = True + + X = validate_data( + self, + X, + dtype=FLOAT_DTYPES, + order="F", + reset=in_fit, + ensure_all_finite=ensure_all_finite, + ) + _check_inputs_dtype(X, self.missing_values) + + X_missing_mask = _get_mask(X, self.missing_values) + mask_missing_values = X_missing_mask.copy() + + # TODO (1.8): remove this once the deprecation is removed. In the meantime, + # we need to catch the warning to avoid false positives. + catch_warning = ( + self.initial_strategy == "constant" and not self.keep_empty_features + ) + + if self.initial_imputer_ is None: + self.initial_imputer_ = SimpleImputer( + missing_values=self.missing_values, + strategy=self.initial_strategy, + fill_value=self.fill_value, + keep_empty_features=self.keep_empty_features, + ).set_output(transform="default") + + # TODO (1.8): remove this once the deprecation is removed to keep only + # the code in the else case. + if catch_warning: + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + X_filled = self.initial_imputer_.fit_transform(X) + else: + X_filled = self.initial_imputer_.fit_transform(X) + else: + # TODO (1.8): remove this once the deprecation is removed to keep only + # the code in the else case. 
+ if catch_warning: + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + X_filled = self.initial_imputer_.transform(X) + else: + X_filled = self.initial_imputer_.transform(X) + + if in_fit: + self._is_empty_feature = np.all(mask_missing_values, axis=0) + + if not self.keep_empty_features: + # drop empty features + Xt = X[:, ~self._is_empty_feature] + mask_missing_values = mask_missing_values[:, ~self._is_empty_feature] + + if self.initial_imputer_.get_params()["strategy"] == "constant": + # The constant strategy has a specific behavior and preserve empty + # features even with ``keep_empty_features=False``. We need to drop + # the column for consistency. + # TODO (1.8): remove this `if` branch once the following issue is + # addressed: + # https://github.com/scikit-learn/scikit-learn/issues/29827 + X_filled = X_filled[:, ~self._is_empty_feature] + + else: + # mark empty features as not missing and keep the original + # imputation + mask_missing_values[:, self._is_empty_feature] = False + Xt = X + Xt[:, self._is_empty_feature] = X_filled[:, self._is_empty_feature] + + return Xt, X_filled, mask_missing_values, X_missing_mask + + @staticmethod + def _validate_limit( + limit, limit_type, n_features, is_empty_feature, keep_empty_feature + ): + """Validate the limits (min/max) of the feature values. + + Converts scalar min/max limits to vectors of shape `(n_features,)`. + + Parameters + ---------- + limit: scalar or array-like + The user-specified limit (i.e, min_value or max_value). + limit_type: {'max', 'min'} + Type of limit to validate. + n_features: int + Number of features in the dataset. + is_empty_feature: ndarray, shape (n_features, ) + Mask array indicating empty feature imputer has seen during fit. + keep_empty_feature: bool + If False, remove empty-feature indices from the limit. + + Returns + ------- + limit: ndarray, shape(n_features,) + Array of limits, one for each feature. + """ + n_features_in = _num_samples(is_empty_feature) + if ( + limit is not None + and not np.isscalar(limit) + and _num_samples(limit) != n_features_in + ): + raise ValueError( + f"'{limit_type}_value' should be of shape ({n_features_in},) when an" + f" array-like is provided. Got {len(limit)}, instead." + ) + + limit_bound = np.inf if limit_type == "max" else -np.inf + limit = limit_bound if limit is None else limit + if np.isscalar(limit): + limit = np.full(n_features, limit) + limit = check_array(limit, ensure_all_finite=False, copy=False, ensure_2d=False) + + # Make sure to remove the empty feature elements from the bounds + if not keep_empty_feature and len(limit) == len(is_empty_feature): + limit = limit[~is_empty_feature] + + return limit + + @_fit_context( + # IterativeImputer.estimator is not validated yet + prefer_skip_nested_validation=False + ) + def fit_transform(self, X, y=None, **params): + """Fit the imputer on `X` and return the transformed `X`. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Input data, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : Ignored + Not used, present for API consistency by convention. + + **params : dict + Parameters routed to the `fit` method of the sub-estimator via the + metadata routing API. + + .. versionadded:: 1.5 + Only available if + `sklearn.set_config(enable_metadata_routing=True)` is set. See + :ref:`Metadata Routing User Guide ` for more + details. 
+ + Returns + ------- + Xt : array-like, shape (n_samples, n_features) + The imputed input data. + """ + _raise_for_params(params, self, "fit") + + routed_params = process_routing( + self, + "fit", + **params, + ) + + self.random_state_ = getattr( + self, "random_state_", check_random_state(self.random_state) + ) + + if self.estimator is None: + from ..linear_model import BayesianRidge + + self._estimator = BayesianRidge() + else: + self._estimator = clone(self.estimator) + + self.imputation_sequence_ = [] + + self.initial_imputer_ = None + + X, Xt, mask_missing_values, complete_mask = self._initial_imputation( + X, in_fit=True + ) + + super()._fit_indicator(complete_mask) + X_indicator = super()._transform_indicator(complete_mask) + + if self.max_iter == 0 or np.all(mask_missing_values): + self.n_iter_ = 0 + return super()._concatenate_indicator(Xt, X_indicator) + + # Edge case: a single feature, we return the initial imputation. + if Xt.shape[1] == 1: + self.n_iter_ = 0 + return super()._concatenate_indicator(Xt, X_indicator) + + self._min_value = self._validate_limit( + self.min_value, + "min", + X.shape[1], + self._is_empty_feature, + self.keep_empty_features, + ) + self._max_value = self._validate_limit( + self.max_value, + "max", + X.shape[1], + self._is_empty_feature, + self.keep_empty_features, + ) + + if not np.all(np.greater(self._max_value, self._min_value)): + raise ValueError("One (or more) features have min_value >= max_value.") + + # order in which to impute + # note this is probably too slow for large feature data (d > 100000) + # and a better way would be good. + # see: https://goo.gl/KyCNwj and subsequent comments + ordered_idx = self._get_ordered_idx(mask_missing_values) + self.n_features_with_missing_ = len(ordered_idx) + + abs_corr_mat = self._get_abs_corr_mat(Xt) + + n_samples, n_features = Xt.shape + if self.verbose > 0: + print("[IterativeImputer] Completing matrix with shape %s" % (X.shape,)) + start_t = time() + if not self.sample_posterior: + Xt_previous = Xt.copy() + normalized_tol = self.tol * np.max(np.abs(X[~mask_missing_values])) + for self.n_iter_ in range(1, self.max_iter + 1): + if self.imputation_order == "random": + ordered_idx = self._get_ordered_idx(mask_missing_values) + + for feat_idx in ordered_idx: + neighbor_feat_idx = self._get_neighbor_feat_idx( + n_features, feat_idx, abs_corr_mat + ) + Xt, estimator = self._impute_one_feature( + Xt, + mask_missing_values, + feat_idx, + neighbor_feat_idx, + estimator=None, + fit_mode=True, + params=routed_params.estimator.fit, + ) + estimator_triplet = _ImputerTriplet( + feat_idx, neighbor_feat_idx, estimator + ) + self.imputation_sequence_.append(estimator_triplet) + + if self.verbose > 1: + print( + "[IterativeImputer] Ending imputation round " + "%d/%d, elapsed time %0.2f" + % (self.n_iter_, self.max_iter, time() - start_t) + ) + + if not self.sample_posterior: + inf_norm = np.linalg.norm(Xt - Xt_previous, ord=np.inf, axis=None) + if self.verbose > 0: + print( + "[IterativeImputer] Change: {}, scaled tolerance: {} ".format( + inf_norm, normalized_tol + ) + ) + if inf_norm < normalized_tol: + if self.verbose > 0: + print("[IterativeImputer] Early stopping criterion reached.") + break + Xt_previous = Xt.copy() + else: + if not self.sample_posterior: + warnings.warn( + "[IterativeImputer] Early stopping criterion not reached.", + ConvergenceWarning, + ) + _assign_where(Xt, X, cond=~mask_missing_values) + + return super()._concatenate_indicator(Xt, X_indicator) + + def transform(self, X): + """Impute all 
missing values in `X`. + + Note that this is stochastic, and that if `random_state` is not fixed, + repeated calls, or permuted input, results will differ. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The input data to complete. + + Returns + ------- + Xt : array-like, shape (n_samples, n_features) + The imputed input data. + """ + check_is_fitted(self) + + X, Xt, mask_missing_values, complete_mask = self._initial_imputation( + X, in_fit=False + ) + + X_indicator = super()._transform_indicator(complete_mask) + + if self.n_iter_ == 0 or np.all(mask_missing_values): + return super()._concatenate_indicator(Xt, X_indicator) + + imputations_per_round = len(self.imputation_sequence_) // self.n_iter_ + i_rnd = 0 + if self.verbose > 0: + print("[IterativeImputer] Completing matrix with shape %s" % (X.shape,)) + start_t = time() + for it, estimator_triplet in enumerate(self.imputation_sequence_): + Xt, _ = self._impute_one_feature( + Xt, + mask_missing_values, + estimator_triplet.feat_idx, + estimator_triplet.neighbor_feat_idx, + estimator=estimator_triplet.estimator, + fit_mode=False, + ) + if not (it + 1) % imputations_per_round: + if self.verbose > 1: + print( + "[IterativeImputer] Ending imputation round " + "%d/%d, elapsed time %0.2f" + % (i_rnd + 1, self.n_iter_, time() - start_t) + ) + i_rnd += 1 + + _assign_where(Xt, X, cond=~mask_missing_values) + + return super()._concatenate_indicator(Xt, X_indicator) + + def fit(self, X, y=None, **fit_params): + """Fit the imputer on `X` and return self. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Input data, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : Ignored + Not used, present for API consistency by convention. + + **fit_params : dict + Parameters routed to the `fit` method of the sub-estimator via the + metadata routing API. + + .. versionadded:: 1.5 + Only available if + `sklearn.set_config(enable_metadata_routing=True)` is set. See + :ref:`Metadata Routing User Guide ` for more + details. + + Returns + ------- + self : object + Fitted estimator. + """ + self.fit_transform(X, **fit_params) + return self + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. + + - If `input_features` is `None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then the following input feature names are generated: + `["x0", "x1", ..., "x(n_features_in_ - 1)"]`. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. + """ + check_is_fitted(self, "n_features_in_") + input_features = _check_feature_names_in(self, input_features) + names = self.initial_imputer_.get_feature_names_out(input_features) + return self._concatenate_indicator_feature_names_out(names, input_features) + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.5 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. 
+ """ + router = MetadataRouter(owner=self.__class__.__name__).add( + estimator=self.estimator, + method_mapping=MethodMapping().add(callee="fit", caller="fit"), + ) + return router diff --git a/.venv/lib/python3.12/site-packages/sklearn/impute/_knn.py b/.venv/lib/python3.12/site-packages/sklearn/impute/_knn.py new file mode 100644 index 0000000000000000000000000000000000000000..1b7ef06edc256372890cbd9cb85123239b37e3e9 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/impute/_knn.py @@ -0,0 +1,411 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from numbers import Integral + +import numpy as np + +from ..base import _fit_context +from ..metrics import pairwise_distances_chunked +from ..metrics.pairwise import _NAN_METRICS +from ..neighbors._base import _get_weights +from ..utils._mask import _get_mask +from ..utils._missing import is_scalar_nan +from ..utils._param_validation import Hidden, Interval, StrOptions +from ..utils.validation import ( + FLOAT_DTYPES, + _check_feature_names_in, + check_is_fitted, + validate_data, +) +from ._base import _BaseImputer + + +class KNNImputer(_BaseImputer): + """Imputation for completing missing values using k-Nearest Neighbors. + + Each sample's missing values are imputed using the mean value from + `n_neighbors` nearest neighbors found in the training set. Two samples are + close if the features that neither is missing are close. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.22 + + Parameters + ---------- + missing_values : int, float, str, np.nan or None, default=np.nan + The placeholder for the missing values. All occurrences of + `missing_values` will be imputed. For pandas' dataframes with + nullable integer dtypes with missing values, `missing_values` + should be set to np.nan, since `pd.NA` will be converted to np.nan. + + n_neighbors : int, default=5 + Number of neighboring samples to use for imputation. + + weights : {'uniform', 'distance'} or callable, default='uniform' + Weight function used in prediction. Possible values: + + - 'uniform' : uniform weights. All points in each neighborhood are + weighted equally. + - 'distance' : weight points by the inverse of their distance. + in this case, closer neighbors of a query point will have a + greater influence than neighbors which are further away. + - callable : a user-defined function which accepts an + array of distances, and returns an array of the same shape + containing the weights. + + metric : {'nan_euclidean'} or callable, default='nan_euclidean' + Distance metric for searching neighbors. Possible values: + + - 'nan_euclidean' + - callable : a user-defined function which conforms to the definition + of ``func_metric(x, y, *, missing_values=np.nan)``. `x` and `y` + corresponds to a row (i.e. 1-D arrays) of `X` and `Y`, respectively. + The callable should returns a scalar distance value. + + copy : bool, default=True + If True, a copy of X will be created. If False, imputation will + be done in-place whenever possible. + + add_indicator : bool, default=False + If True, a :class:`MissingIndicator` transform will stack onto the + output of the imputer's transform. This allows a predictive estimator + to account for missingness despite imputation. If a feature has no + missing values at fit/train time, the feature won't appear on the + missing indicator even if there are missing values at transform/test + time. 
+ + keep_empty_features : bool, default=False + If True, features that consist exclusively of missing values when + `fit` is called are returned in results when `transform` is called. + The imputed value is always `0`. + + .. versionadded:: 1.2 + + Attributes + ---------- + indicator_ : :class:`~sklearn.impute.MissingIndicator` + Indicator used to add binary indicators for missing values. + ``None`` if add_indicator is False. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + SimpleImputer : Univariate imputer for completing missing values + with simple strategies. + IterativeImputer : Multivariate imputer that estimates values to impute for + each feature with missing values from all the others. + + References + ---------- + * `Olga Troyanskaya, Michael Cantor, Gavin Sherlock, Pat Brown, Trevor + Hastie, Robert Tibshirani, David Botstein and Russ B. Altman, Missing + value estimation methods for DNA microarrays, BIOINFORMATICS Vol. 17 + no. 6, 2001 Pages 520-525. + `_ + + Examples + -------- + >>> import numpy as np + >>> from sklearn.impute import KNNImputer + >>> X = [[1, 2, np.nan], [3, 4, 3], [np.nan, 6, 5], [8, 8, 7]] + >>> imputer = KNNImputer(n_neighbors=2) + >>> imputer.fit_transform(X) + array([[1. , 2. , 4. ], + [3. , 4. , 3. ], + [5.5, 6. , 5. ], + [8. , 8. , 7. ]]) + + For a more detailed example see + :ref:`sphx_glr_auto_examples_impute_plot_missing_values.py`. + """ + + _parameter_constraints: dict = { + **_BaseImputer._parameter_constraints, + "n_neighbors": [Interval(Integral, 1, None, closed="left")], + "weights": [StrOptions({"uniform", "distance"}), callable, Hidden(None)], + "metric": [StrOptions(set(_NAN_METRICS)), callable], + "copy": ["boolean"], + } + + def __init__( + self, + *, + missing_values=np.nan, + n_neighbors=5, + weights="uniform", + metric="nan_euclidean", + copy=True, + add_indicator=False, + keep_empty_features=False, + ): + super().__init__( + missing_values=missing_values, + add_indicator=add_indicator, + keep_empty_features=keep_empty_features, + ) + self.n_neighbors = n_neighbors + self.weights = weights + self.metric = metric + self.copy = copy + + def _calc_impute(self, dist_pot_donors, n_neighbors, fit_X_col, mask_fit_X_col): + """Helper function to impute a single column. + + Parameters + ---------- + dist_pot_donors : ndarray of shape (n_receivers, n_potential_donors) + Distance matrix between the receivers and potential donors from + training set. There must be at least one non-nan distance between + a receiver and a potential donor. + + n_neighbors : int + Number of neighbors to consider. + + fit_X_col : ndarray of shape (n_potential_donors,) + Column of potential donors from training set. + + mask_fit_X_col : ndarray of shape (n_potential_donors,) + Missing mask for fit_X_col. + + Returns + ------- + imputed_values: ndarray of shape (n_receivers,) + Imputed values for receiver. 
+ """ + # Get donors + donors_idx = np.argpartition(dist_pot_donors, n_neighbors - 1, axis=1)[ + :, :n_neighbors + ] + + # Get weight matrix from distance matrix + donors_dist = dist_pot_donors[ + np.arange(donors_idx.shape[0])[:, None], donors_idx + ] + + weight_matrix = _get_weights(donors_dist, self.weights) + + # fill nans with zeros + if weight_matrix is not None: + weight_matrix[np.isnan(weight_matrix)] = 0.0 + else: + weight_matrix = np.ones_like(donors_dist) + weight_matrix[np.isnan(donors_dist)] = 0.0 + + # Retrieve donor values and calculate kNN average + donors = fit_X_col.take(donors_idx) + donors_mask = mask_fit_X_col.take(donors_idx) + donors = np.ma.array(donors, mask=donors_mask) + + return np.ma.average(donors, axis=1, weights=weight_matrix).data + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit the imputer on X. + + Parameters + ---------- + X : array-like shape of (n_samples, n_features) + Input data, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + self : object + The fitted `KNNImputer` class instance. + """ + # Check data integrity and calling arguments + if not is_scalar_nan(self.missing_values): + ensure_all_finite = True + else: + ensure_all_finite = "allow-nan" + + X = validate_data( + self, + X, + accept_sparse=False, + dtype=FLOAT_DTYPES, + ensure_all_finite=ensure_all_finite, + copy=self.copy, + ) + + self._fit_X = X + self._mask_fit_X = _get_mask(self._fit_X, self.missing_values) + self._valid_mask = ~np.all(self._mask_fit_X, axis=0) + + super()._fit_indicator(self._mask_fit_X) + + return self + + def transform(self, X): + """Impute all missing values in X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The input data to complete. + + Returns + ------- + X : array-like of shape (n_samples, n_output_features) + The imputed dataset. `n_output_features` is the number of features + that is not always missing during `fit`. + """ + + check_is_fitted(self) + if not is_scalar_nan(self.missing_values): + ensure_all_finite = True + else: + ensure_all_finite = "allow-nan" + X = validate_data( + self, + X, + accept_sparse=False, + dtype=FLOAT_DTYPES, + force_writeable=True, + ensure_all_finite=ensure_all_finite, + copy=self.copy, + reset=False, + ) + + mask = _get_mask(X, self.missing_values) + mask_fit_X = self._mask_fit_X + valid_mask = self._valid_mask + + X_indicator = super()._transform_indicator(mask) + + # Removes columns where the training data is all nan + if not np.any(mask[:, valid_mask]): + # No missing values in X + if self.keep_empty_features: + Xc = X + Xc[:, ~valid_mask] = 0 + else: + Xc = X[:, valid_mask] + + # Even if there are no missing values in X, we still concatenate Xc + # with the missing value indicator matrix, X_indicator. + # This is to ensure that the output maintains consistency in terms + # of columns, regardless of whether missing values exist in X or not. 
+ return super()._concatenate_indicator(Xc, X_indicator) + + row_missing_idx = np.flatnonzero(mask[:, valid_mask].any(axis=1)) + + non_missing_fix_X = np.logical_not(mask_fit_X) + + # Maps from indices from X to indices in dist matrix + dist_idx_map = np.zeros(X.shape[0], dtype=int) + dist_idx_map[row_missing_idx] = np.arange(row_missing_idx.shape[0]) + + def process_chunk(dist_chunk, start): + row_missing_chunk = row_missing_idx[start : start + len(dist_chunk)] + + # Find and impute missing by column + for col in range(X.shape[1]): + if not valid_mask[col]: + # column was all missing during training + continue + + col_mask = mask[row_missing_chunk, col] + if not np.any(col_mask): + # column has no missing values + continue + + (potential_donors_idx,) = np.nonzero(non_missing_fix_X[:, col]) + + # receivers_idx are indices in X + receivers_idx = row_missing_chunk[np.flatnonzero(col_mask)] + + # distances for samples that needed imputation for column + dist_subset = dist_chunk[dist_idx_map[receivers_idx] - start][ + :, potential_donors_idx + ] + + # receivers with all nan distances impute with mean + all_nan_dist_mask = np.isnan(dist_subset).all(axis=1) + all_nan_receivers_idx = receivers_idx[all_nan_dist_mask] + + if all_nan_receivers_idx.size: + col_mean = np.ma.array( + self._fit_X[:, col], mask=mask_fit_X[:, col] + ).mean() + X[all_nan_receivers_idx, col] = col_mean + + if len(all_nan_receivers_idx) == len(receivers_idx): + # all receivers imputed with mean + continue + + # receivers with at least one defined distance + receivers_idx = receivers_idx[~all_nan_dist_mask] + dist_subset = dist_chunk[dist_idx_map[receivers_idx] - start][ + :, potential_donors_idx + ] + + n_neighbors = min(self.n_neighbors, len(potential_donors_idx)) + value = self._calc_impute( + dist_subset, + n_neighbors, + self._fit_X[potential_donors_idx, col], + mask_fit_X[potential_donors_idx, col], + ) + X[receivers_idx, col] = value + + # process in fixed-memory chunks + gen = pairwise_distances_chunked( + X[row_missing_idx, :], + self._fit_X, + metric=self.metric, + missing_values=self.missing_values, + ensure_all_finite=ensure_all_finite, + reduce_func=process_chunk, + ) + for chunk in gen: + # process_chunk modifies X in place. No return value. + pass + + if self.keep_empty_features: + Xc = X + Xc[:, ~valid_mask] = 0 + else: + Xc = X[:, valid_mask] + + return super()._concatenate_indicator(Xc, X_indicator) + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. + + - If `input_features` is `None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then the following input feature names are generated: + `["x0", "x1", ..., "x(n_features_in_ - 1)"]`. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. 
+ """ + check_is_fitted(self, "n_features_in_") + input_features = _check_feature_names_in(self, input_features) + names = input_features[self._valid_mask] + return self._concatenate_indicator_feature_names_out(names, input_features) diff --git a/.venv/lib/python3.12/site-packages/sklearn/impute/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/impute/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/impute/tests/test_base.py b/.venv/lib/python3.12/site-packages/sklearn/impute/tests/test_base.py new file mode 100644 index 0000000000000000000000000000000000000000..0c1bd83f7ca9ea8adde76940e2f7fdd86d89ea5c --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/impute/tests/test_base.py @@ -0,0 +1,107 @@ +import numpy as np +import pytest + +from sklearn.impute._base import _BaseImputer +from sklearn.impute._iterative import _assign_where +from sklearn.utils._mask import _get_mask +from sklearn.utils._testing import _convert_container, assert_allclose + + +@pytest.fixture +def data(): + X = np.random.randn(10, 2) + X[::2] = np.nan + return X + + +class NoFitIndicatorImputer(_BaseImputer): + def fit(self, X, y=None): + return self + + def transform(self, X, y=None): + return self._concatenate_indicator(X, self._transform_indicator(X)) + + +class NoTransformIndicatorImputer(_BaseImputer): + def fit(self, X, y=None): + mask = _get_mask(X, value_to_mask=np.nan) + super()._fit_indicator(mask) + return self + + def transform(self, X, y=None): + return self._concatenate_indicator(X, None) + + +class NoPrecomputedMaskFit(_BaseImputer): + def fit(self, X, y=None): + self._fit_indicator(X) + return self + + def transform(self, X): + return self._concatenate_indicator(X, self._transform_indicator(X)) + + +class NoPrecomputedMaskTransform(_BaseImputer): + def fit(self, X, y=None): + mask = _get_mask(X, value_to_mask=np.nan) + self._fit_indicator(mask) + return self + + def transform(self, X): + return self._concatenate_indicator(X, self._transform_indicator(X)) + + +def test_base_imputer_not_fit(data): + imputer = NoFitIndicatorImputer(add_indicator=True) + err_msg = "Make sure to call _fit_indicator before _transform_indicator" + with pytest.raises(ValueError, match=err_msg): + imputer.fit(data).transform(data) + with pytest.raises(ValueError, match=err_msg): + imputer.fit_transform(data) + + +def test_base_imputer_not_transform(data): + imputer = NoTransformIndicatorImputer(add_indicator=True) + err_msg = ( + "Call _fit_indicator and _transform_indicator in the imputer implementation" + ) + with pytest.raises(ValueError, match=err_msg): + imputer.fit(data).transform(data) + with pytest.raises(ValueError, match=err_msg): + imputer.fit_transform(data) + + +def test_base_no_precomputed_mask_fit(data): + imputer = NoPrecomputedMaskFit(add_indicator=True) + err_msg = "precomputed is True but the input data is not a mask" + with pytest.raises(ValueError, match=err_msg): + imputer.fit(data) + with pytest.raises(ValueError, match=err_msg): + imputer.fit_transform(data) + + +def test_base_no_precomputed_mask_transform(data): + imputer = NoPrecomputedMaskTransform(add_indicator=True) + err_msg = "precomputed is True but the input data is not a mask" + imputer.fit(data) + with pytest.raises(ValueError, match=err_msg): + imputer.transform(data) + with pytest.raises(ValueError, match=err_msg): + imputer.fit_transform(data) + + +@pytest.mark.parametrize("X1_type", 
["array", "dataframe"]) +def test_assign_where(X1_type): + """Check the behaviour of the private helpers `_assign_where`.""" + rng = np.random.RandomState(0) + + n_samples, n_features = 10, 5 + X1 = _convert_container(rng.randn(n_samples, n_features), constructor_name=X1_type) + X2 = rng.randn(n_samples, n_features) + mask = rng.randint(0, 2, size=(n_samples, n_features)).astype(bool) + + _assign_where(X1, X2, mask) + + if X1_type == "dataframe": + X1 = X1.to_numpy() + assert_allclose(X1[mask], X2[mask]) diff --git a/.venv/lib/python3.12/site-packages/sklearn/impute/tests/test_common.py b/.venv/lib/python3.12/site-packages/sklearn/impute/tests/test_common.py new file mode 100644 index 0000000000000000000000000000000000000000..afebc96ac035c4945c4084dc968323a602072066 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/impute/tests/test_common.py @@ -0,0 +1,220 @@ +import numpy as np +import pytest + +from sklearn.experimental import enable_iterative_imputer # noqa: F401 +from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer +from sklearn.utils._testing import ( + assert_allclose, + assert_allclose_dense_sparse, + assert_array_equal, +) +from sklearn.utils.fixes import CSR_CONTAINERS + + +def imputers(): + return [IterativeImputer(tol=0.1), KNNImputer(), SimpleImputer()] + + +def sparse_imputers(): + return [SimpleImputer()] + + +# ConvergenceWarning will be raised by the IterativeImputer +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +@pytest.mark.parametrize("imputer", imputers(), ids=lambda x: x.__class__.__name__) +def test_imputation_missing_value_in_test_array(imputer): + # [Non Regression Test for issue #13968] Missing value in test set should + # not throw an error and return a finite dataset + train = [[1], [2]] + test = [[3], [np.nan]] + imputer.set_params(add_indicator=True) + imputer.fit(train).transform(test) + + +# ConvergenceWarning will be raised by the IterativeImputer +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +@pytest.mark.parametrize("marker", [np.nan, -1, 0]) +@pytest.mark.parametrize("imputer", imputers(), ids=lambda x: x.__class__.__name__) +def test_imputers_add_indicator(marker, imputer): + X = np.array( + [ + [marker, 1, 5, marker, 1], + [2, marker, 1, marker, 2], + [6, 3, marker, marker, 3], + [1, 2, 9, marker, 4], + ] + ) + X_true_indicator = np.array( + [ + [1.0, 0.0, 0.0, 1.0], + [0.0, 1.0, 0.0, 1.0], + [0.0, 0.0, 1.0, 1.0], + [0.0, 0.0, 0.0, 1.0], + ] + ) + imputer.set_params(missing_values=marker, add_indicator=True) + + X_trans = imputer.fit_transform(X) + assert_allclose(X_trans[:, -4:], X_true_indicator) + assert_array_equal(imputer.indicator_.features_, np.array([0, 1, 2, 3])) + + imputer.set_params(add_indicator=False) + X_trans_no_indicator = imputer.fit_transform(X) + assert_allclose(X_trans[:, :-4], X_trans_no_indicator) + + +# ConvergenceWarning will be raised by the IterativeImputer +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +@pytest.mark.parametrize("marker", [np.nan, -1]) +@pytest.mark.parametrize( + "imputer", sparse_imputers(), ids=lambda x: x.__class__.__name__ +) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_imputers_add_indicator_sparse(imputer, marker, csr_container): + X = csr_container( + [ + [marker, 1, 5, marker, 1], + [2, marker, 1, marker, 2], + [6, 3, marker, marker, 3], + [1, 2, 9, marker, 4], + ] + ) + X_true_indicator = csr_container( + [ + [1.0, 0.0, 0.0, 1.0], + [0.0, 1.0, 0.0, 
1.0], + [0.0, 0.0, 1.0, 1.0], + [0.0, 0.0, 0.0, 1.0], + ] + ) + imputer.set_params(missing_values=marker, add_indicator=True) + + X_trans = imputer.fit_transform(X) + assert_allclose_dense_sparse(X_trans[:, -4:], X_true_indicator) + assert_array_equal(imputer.indicator_.features_, np.array([0, 1, 2, 3])) + + imputer.set_params(add_indicator=False) + X_trans_no_indicator = imputer.fit_transform(X) + assert_allclose_dense_sparse(X_trans[:, :-4], X_trans_no_indicator) + + +# ConvergenceWarning will be raised by the IterativeImputer +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +@pytest.mark.parametrize("imputer", imputers(), ids=lambda x: x.__class__.__name__) +@pytest.mark.parametrize("add_indicator", [True, False]) +def test_imputers_pandas_na_integer_array_support(imputer, add_indicator): + # Test pandas IntegerArray with pd.NA + pd = pytest.importorskip("pandas") + marker = np.nan + imputer = imputer.set_params(add_indicator=add_indicator, missing_values=marker) + + X = np.array( + [ + [marker, 1, 5, marker, 1], + [2, marker, 1, marker, 2], + [6, 3, marker, marker, 3], + [1, 2, 9, marker, 4], + ] + ) + # fit on numpy array + X_trans_expected = imputer.fit_transform(X) + + # Creates dataframe with IntegerArrays with pd.NA + X_df = pd.DataFrame(X, dtype="Int16", columns=["a", "b", "c", "d", "e"]) + + # fit on pandas dataframe with IntegerArrays + X_trans = imputer.fit_transform(X_df) + + assert_allclose(X_trans_expected, X_trans) + + +@pytest.mark.parametrize("imputer", imputers(), ids=lambda x: x.__class__.__name__) +@pytest.mark.parametrize("add_indicator", [True, False]) +def test_imputers_feature_names_out_pandas(imputer, add_indicator): + """Check feature names out for imputers.""" + pd = pytest.importorskip("pandas") + marker = np.nan + imputer = imputer.set_params(add_indicator=add_indicator, missing_values=marker) + + X = np.array( + [ + [marker, 1, 5, 3, marker, 1], + [2, marker, 1, 4, marker, 2], + [6, 3, 7, marker, marker, 3], + [1, 2, 9, 8, marker, 4], + ] + ) + X_df = pd.DataFrame(X, columns=["a", "b", "c", "d", "e", "f"]) + imputer.fit(X_df) + + names = imputer.get_feature_names_out() + + if add_indicator: + expected_names = [ + "a", + "b", + "c", + "d", + "f", + "missingindicator_a", + "missingindicator_b", + "missingindicator_d", + "missingindicator_e", + ] + assert_array_equal(expected_names, names) + else: + expected_names = ["a", "b", "c", "d", "f"] + assert_array_equal(expected_names, names) + + +@pytest.mark.parametrize("keep_empty_features", [True, False]) +@pytest.mark.parametrize("imputer", imputers(), ids=lambda x: x.__class__.__name__) +def test_keep_empty_features(imputer, keep_empty_features): + """Check that the imputer keeps features with only missing values.""" + X = np.array([[np.nan, 1], [np.nan, 2], [np.nan, 3]]) + imputer = imputer.set_params( + add_indicator=False, keep_empty_features=keep_empty_features + ) + + for method in ["fit_transform", "transform"]: + X_imputed = getattr(imputer, method)(X) + if keep_empty_features: + assert X_imputed.shape == X.shape + else: + assert X_imputed.shape == (X.shape[0], X.shape[1] - 1) + + +@pytest.mark.parametrize("imputer", imputers(), ids=lambda x: x.__class__.__name__) +@pytest.mark.parametrize("missing_value_test", [np.nan, 1]) +def test_imputation_adds_missing_indicator_if_add_indicator_is_true( + imputer, missing_value_test +): + """Check that missing indicator always exists when add_indicator=True. + + Non-regression test for gh-26590. 
+ """ + X_train = np.array([[0, np.nan], [1, 2]]) + + # Test data where missing_value_test variable can be set to np.nan or 1. + X_test = np.array([[0, missing_value_test], [1, 2]]) + + imputer.set_params(add_indicator=True) + imputer.fit(X_train) + + X_test_imputed_with_indicator = imputer.transform(X_test) + assert X_test_imputed_with_indicator.shape == (2, 3) + + imputer.set_params(add_indicator=False) + imputer.fit(X_train) + X_test_imputed_without_indicator = imputer.transform(X_test) + assert X_test_imputed_without_indicator.shape == (2, 2) + + assert_allclose( + X_test_imputed_with_indicator[:, :-1], X_test_imputed_without_indicator + ) + if np.isnan(missing_value_test): + expected_missing_indicator = [1, 0] + else: + expected_missing_indicator = [0, 0] + + assert_allclose(X_test_imputed_with_indicator[:, -1], expected_missing_indicator) diff --git a/.venv/lib/python3.12/site-packages/sklearn/impute/tests/test_impute.py b/.venv/lib/python3.12/site-packages/sklearn/impute/tests/test_impute.py new file mode 100644 index 0000000000000000000000000000000000000000..4116964c49a7a3be21771593e60e24515a7b475c --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/impute/tests/test_impute.py @@ -0,0 +1,1955 @@ +import io +import re +import warnings +from itertools import product + +import numpy as np +import pytest +from scipy import sparse +from scipy.stats import kstest + +from sklearn import tree +from sklearn.datasets import load_diabetes +from sklearn.dummy import DummyRegressor +from sklearn.exceptions import ConvergenceWarning + +# make IterativeImputer available +from sklearn.experimental import enable_iterative_imputer # noqa: F401 +from sklearn.impute import IterativeImputer, KNNImputer, MissingIndicator, SimpleImputer +from sklearn.impute._base import _most_frequent +from sklearn.linear_model import ARDRegression, BayesianRidge, RidgeCV +from sklearn.model_selection import GridSearchCV +from sklearn.pipeline import Pipeline, make_union +from sklearn.random_projection import _sparse_random_matrix +from sklearn.utils._testing import ( + _convert_container, + assert_allclose, + assert_allclose_dense_sparse, + assert_array_almost_equal, + assert_array_equal, +) +from sklearn.utils.fixes import ( + BSR_CONTAINERS, + COO_CONTAINERS, + CSC_CONTAINERS, + CSR_CONTAINERS, + LIL_CONTAINERS, +) + + +def _assert_array_equal_and_same_dtype(x, y): + assert_array_equal(x, y) + assert x.dtype == y.dtype + + +def _assert_allclose_and_same_dtype(x, y): + assert_allclose(x, y) + assert x.dtype == y.dtype + + +def _check_statistics( + X, X_true, strategy, statistics, missing_values, sparse_container +): + """Utility function for testing imputation for a given strategy. 
+ + Test with dense and sparse arrays + + Check that: + - the statistics (mean, median, mode) are correct + - the missing values are imputed correctly""" + + err_msg = "Parameters: strategy = %s, missing_values = %s, sparse = {0}" % ( + strategy, + missing_values, + ) + + assert_ae = assert_array_equal + + if X.dtype.kind == "f" or X_true.dtype.kind == "f": + assert_ae = assert_array_almost_equal + + # Normal matrix + imputer = SimpleImputer(missing_values=missing_values, strategy=strategy) + X_trans = imputer.fit(X).transform(X.copy()) + assert_ae(imputer.statistics_, statistics, err_msg=err_msg.format(False)) + assert_ae(X_trans, X_true, err_msg=err_msg.format(False)) + + # Sparse matrix + imputer = SimpleImputer(missing_values=missing_values, strategy=strategy) + imputer.fit(sparse_container(X)) + X_trans = imputer.transform(sparse_container(X.copy())) + + if sparse.issparse(X_trans): + X_trans = X_trans.toarray() + + assert_ae(imputer.statistics_, statistics, err_msg=err_msg.format(True)) + assert_ae(X_trans, X_true, err_msg=err_msg.format(True)) + + +@pytest.mark.parametrize("strategy", ["mean", "median", "most_frequent", "constant"]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_imputation_shape(strategy, csr_container): + # Verify the shapes of the imputed matrix for different strategies. + X = np.random.randn(10, 2) + X[::2] = np.nan + + imputer = SimpleImputer(strategy=strategy) + X_imputed = imputer.fit_transform(csr_container(X)) + assert X_imputed.shape == (10, 2) + X_imputed = imputer.fit_transform(X) + assert X_imputed.shape == (10, 2) + + iterative_imputer = IterativeImputer(initial_strategy=strategy) + X_imputed = iterative_imputer.fit_transform(X) + assert X_imputed.shape == (10, 2) + + +@pytest.mark.parametrize("strategy", ["mean", "median", "most_frequent"]) +def test_imputation_deletion_warning(strategy): + X = np.ones((3, 5)) + X[:, 0] = np.nan + imputer = SimpleImputer(strategy=strategy).fit(X) + + with pytest.warns(UserWarning, match="Skipping"): + imputer.transform(X) + + +@pytest.mark.parametrize("strategy", ["mean", "median", "most_frequent"]) +def test_imputation_deletion_warning_feature_names(strategy): + pd = pytest.importorskip("pandas") + + missing_values = np.nan + feature_names = np.array(["a", "b", "c", "d"], dtype=object) + X = pd.DataFrame( + [ + [missing_values, missing_values, 1, missing_values], + [4, missing_values, 2, 10], + ], + columns=feature_names, + ) + + imputer = SimpleImputer(strategy=strategy).fit(X) + + # check SimpleImputer returning feature name attribute correctly + assert_array_equal(imputer.feature_names_in_, feature_names) + + # ensure that skipped feature warning includes feature name + with pytest.warns( + UserWarning, match=r"Skipping features without any observed values: \['b'\]" + ): + imputer.transform(X) + + +@pytest.mark.parametrize("strategy", ["mean", "median", "most_frequent", "constant"]) +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_imputation_error_sparse_0(strategy, csc_container): + # check that error are raised when missing_values = 0 and input is sparse + X = np.ones((3, 5)) + X[0] = 0 + X = csc_container(X) + + imputer = SimpleImputer(strategy=strategy, missing_values=0) + with pytest.raises(ValueError, match="Provide a dense array"): + imputer.fit(X) + + imputer.fit(X.toarray()) + with pytest.raises(ValueError, match="Provide a dense array"): + imputer.transform(X) + + +def safe_median(arr, *args, **kwargs): + # np.median([]) raises a TypeError for numpy >= 1.10.1 + 
length = arr.size if hasattr(arr, "size") else len(arr) + return np.nan if length == 0 else np.median(arr, *args, **kwargs) + + +def safe_mean(arr, *args, **kwargs): + # np.mean([]) raises a RuntimeWarning for numpy >= 1.10.1 + length = arr.size if hasattr(arr, "size") else len(arr) + return np.nan if length == 0 else np.mean(arr, *args, **kwargs) + + +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_imputation_mean_median(csc_container): + # Test imputation using the mean and median strategies, when + # missing_values != 0. + rng = np.random.RandomState(0) + + dim = 10 + dec = 10 + shape = (dim * dim, dim + dec) + + zeros = np.zeros(shape[0]) + values = np.arange(1, shape[0] + 1) + values[4::2] = -values[4::2] + + tests = [ + ("mean", np.nan, lambda z, v, p: safe_mean(np.hstack((z, v)))), + ("median", np.nan, lambda z, v, p: safe_median(np.hstack((z, v)))), + ] + + for strategy, test_missing_values, true_value_fun in tests: + X = np.empty(shape) + X_true = np.empty(shape) + true_statistics = np.empty(shape[1]) + + # Create a matrix X with columns + # - with only zeros, + # - with only missing values + # - with zeros, missing values and values + # And a matrix X_true containing all true values + for j in range(shape[1]): + nb_zeros = (j - dec + 1 > 0) * (j - dec + 1) * (j - dec + 1) + nb_missing_values = max(shape[0] + dec * dec - (j + dec) * (j + dec), 0) + nb_values = shape[0] - nb_zeros - nb_missing_values + + z = zeros[:nb_zeros] + p = np.repeat(test_missing_values, nb_missing_values) + v = values[rng.permutation(len(values))[:nb_values]] + + true_statistics[j] = true_value_fun(z, v, p) + + # Create the columns + X[:, j] = np.hstack((v, z, p)) + + if 0 == test_missing_values: + # XXX unreached code as of v0.22 + X_true[:, j] = np.hstack( + (v, np.repeat(true_statistics[j], nb_missing_values + nb_zeros)) + ) + else: + X_true[:, j] = np.hstack( + (v, z, np.repeat(true_statistics[j], nb_missing_values)) + ) + + # Shuffle them the same way + np.random.RandomState(j).shuffle(X[:, j]) + np.random.RandomState(j).shuffle(X_true[:, j]) + + # Mean doesn't support columns containing NaNs, median does + if strategy == "median": + cols_to_keep = ~np.isnan(X_true).any(axis=0) + else: + cols_to_keep = ~np.isnan(X_true).all(axis=0) + + X_true = X_true[:, cols_to_keep] + + _check_statistics( + X, X_true, strategy, true_statistics, test_missing_values, csc_container + ) + + +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_imputation_median_special_cases(csc_container): + # Test median imputation with sparse boundary cases + X = np.array( + [ + [0, np.nan, np.nan], # odd: implicit zero + [5, np.nan, np.nan], # odd: explicit nonzero + [0, 0, np.nan], # even: average two zeros + [-5, 0, np.nan], # even: avg zero and neg + [0, 5, np.nan], # even: avg zero and pos + [4, 5, np.nan], # even: avg nonzeros + [-4, -5, np.nan], # even: avg negatives + [-1, 2, np.nan], # even: crossing neg and pos + ] + ).transpose() + + X_imputed_median = np.array( + [ + [0, 0, 0], + [5, 5, 5], + [0, 0, 0], + [-5, 0, -2.5], + [0, 5, 2.5], + [4, 5, 4.5], + [-4, -5, -4.5], + [-1, 2, 0.5], + ] + ).transpose() + statistics_median = [0, 5, 0, -2.5, 2.5, 4.5, -4.5, 0.5] + + _check_statistics( + X, X_imputed_median, "median", statistics_median, np.nan, csc_container + ) + + +@pytest.mark.parametrize("strategy", ["mean", "median"]) +@pytest.mark.parametrize("dtype", [None, object, str]) +def test_imputation_mean_median_error_invalid_type(strategy, dtype): + X = np.array([["a", "b", 3], [4, "e", 6], 
["g", "h", 9]], dtype=dtype) + msg = "non-numeric data:\ncould not convert string to float:" + with pytest.raises(ValueError, match=msg): + imputer = SimpleImputer(strategy=strategy) + imputer.fit_transform(X) + + +@pytest.mark.parametrize("strategy", ["mean", "median"]) +@pytest.mark.parametrize("type", ["list", "dataframe"]) +def test_imputation_mean_median_error_invalid_type_list_pandas(strategy, type): + X = [["a", "b", 3], [4, "e", 6], ["g", "h", 9]] + if type == "dataframe": + pd = pytest.importorskip("pandas") + X = pd.DataFrame(X) + msg = "non-numeric data:\ncould not convert string to float:" + with pytest.raises(ValueError, match=msg): + imputer = SimpleImputer(strategy=strategy) + imputer.fit_transform(X) + + +@pytest.mark.parametrize("strategy", ["constant", "most_frequent"]) +@pytest.mark.parametrize("dtype", [str, np.dtype("U"), np.dtype("S")]) +def test_imputation_const_mostf_error_invalid_types(strategy, dtype): + # Test imputation on non-numeric data using "most_frequent" and "constant" + # strategy + X = np.array( + [ + [np.nan, np.nan, "a", "f"], + [np.nan, "c", np.nan, "d"], + [np.nan, "b", "d", np.nan], + [np.nan, "c", "d", "h"], + ], + dtype=dtype, + ) + + err_msg = "SimpleImputer does not support data" + with pytest.raises(ValueError, match=err_msg): + imputer = SimpleImputer(strategy=strategy) + imputer.fit(X).transform(X) + + +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_imputation_most_frequent(csc_container): + # Test imputation using the most-frequent strategy. + X = np.array( + [ + [-1, -1, 0, 5], + [-1, 2, -1, 3], + [-1, 1, 3, -1], + [-1, 2, 3, 7], + ] + ) + + X_true = np.array( + [ + [2, 0, 5], + [2, 3, 3], + [1, 3, 3], + [2, 3, 7], + ] + ) + + # scipy.stats.mode, used in SimpleImputer, doesn't return the first most + # frequent as promised in the doc but the lowest most frequent. When this + # test will fail after an update of scipy, SimpleImputer will need to be + # updated to be consistent with the new (correct) behaviour + _check_statistics(X, X_true, "most_frequent", [np.nan, 2, 3, 3], -1, csc_container) + + +@pytest.mark.parametrize("marker", [None, np.nan, "NAN", "", 0]) +def test_imputation_most_frequent_objects(marker): + # Test imputation using the most-frequent strategy. 
+ X = np.array( + [ + [marker, marker, "a", "f"], + [marker, "c", marker, "d"], + [marker, "b", "d", marker], + [marker, "c", "d", "h"], + ], + dtype=object, + ) + + X_true = np.array( + [ + ["c", "a", "f"], + ["c", "d", "d"], + ["b", "d", "d"], + ["c", "d", "h"], + ], + dtype=object, + ) + + imputer = SimpleImputer(missing_values=marker, strategy="most_frequent") + X_trans = imputer.fit(X).transform(X) + + assert_array_equal(X_trans, X_true) + + +@pytest.mark.parametrize("dtype", [object, "category"]) +def test_imputation_most_frequent_pandas(dtype): + # Test imputation using the most frequent strategy on pandas df + pd = pytest.importorskip("pandas") + + f = io.StringIO("Cat1,Cat2,Cat3,Cat4\n,i,x,\na,,y,\na,j,,\nb,j,x,") + + df = pd.read_csv(f, dtype=dtype) + + X_true = np.array( + [["a", "i", "x"], ["a", "j", "y"], ["a", "j", "x"], ["b", "j", "x"]], + dtype=object, + ) + + imputer = SimpleImputer(strategy="most_frequent") + X_trans = imputer.fit_transform(df) + + assert_array_equal(X_trans, X_true) + + +@pytest.mark.parametrize("X_data, missing_value", [(1, 0), (1.0, np.nan)]) +def test_imputation_constant_error_invalid_type(X_data, missing_value): + # Verify that exceptions are raised on invalid fill_value type + X = np.full((3, 5), X_data, dtype=float) + X[0, 0] = missing_value + + fill_value = "x" + err_msg = f"fill_value={fill_value!r} (of type {type(fill_value)!r}) cannot be cast" + with pytest.raises(ValueError, match=re.escape(err_msg)): + imputer = SimpleImputer( + missing_values=missing_value, strategy="constant", fill_value=fill_value + ) + imputer.fit_transform(X) + + +# TODO (1.8): check that `keep_empty_features=False` drop the +# empty features due to the behaviour change. +def test_imputation_constant_integer(): + # Test imputation using the constant strategy on integers + X = np.array([[-1, 2, 3, -1], [4, -1, 5, -1], [6, 7, -1, -1], [8, 9, 0, -1]]) + + X_true = np.array([[0, 2, 3, 0], [4, 0, 5, 0], [6, 7, 0, 0], [8, 9, 0, 0]]) + + imputer = SimpleImputer( + missing_values=-1, strategy="constant", fill_value=0, keep_empty_features=True + ) + X_trans = imputer.fit_transform(X) + + assert_array_equal(X_trans, X_true) + + +# TODO (1.8): check that `keep_empty_features=False` drop the +# empty features due to the behaviour change. +@pytest.mark.parametrize("array_constructor", CSR_CONTAINERS + [np.asarray]) +def test_imputation_constant_float(array_constructor): + # Test imputation using the constant strategy on floats + X = np.array( + [ + [np.nan, 1.1, 0, np.nan], + [1.2, np.nan, 1.3, np.nan], + [0, 0, np.nan, np.nan], + [1.4, 1.5, 0, np.nan], + ] + ) + + X_true = np.array( + [[-1, 1.1, 0, -1], [1.2, -1, 1.3, -1], [0, 0, -1, -1], [1.4, 1.5, 0, -1]] + ) + + X = array_constructor(X) + + X_true = array_constructor(X_true) + + imputer = SimpleImputer( + strategy="constant", fill_value=-1, keep_empty_features=True + ) + X_trans = imputer.fit_transform(X) + + assert_allclose_dense_sparse(X_trans, X_true) + + +# TODO (1.8): check that `keep_empty_features=False` drop the +# empty features due to the behaviour change. 
+@pytest.mark.parametrize("marker", [None, np.nan, "NAN", "", 0]) +def test_imputation_constant_object(marker): + # Test imputation using the constant strategy on objects + X = np.array( + [ + [marker, "a", "b", marker], + ["c", marker, "d", marker], + ["e", "f", marker, marker], + ["g", "h", "i", marker], + ], + dtype=object, + ) + + X_true = np.array( + [ + ["missing", "a", "b", "missing"], + ["c", "missing", "d", "missing"], + ["e", "f", "missing", "missing"], + ["g", "h", "i", "missing"], + ], + dtype=object, + ) + + imputer = SimpleImputer( + missing_values=marker, + strategy="constant", + fill_value="missing", + keep_empty_features=True, + ) + X_trans = imputer.fit_transform(X) + + assert_array_equal(X_trans, X_true) + + +# TODO (1.8): check that `keep_empty_features=False` drop the +# empty features due to the behaviour change. +@pytest.mark.parametrize("dtype", [object, "category"]) +def test_imputation_constant_pandas(dtype): + # Test imputation using the constant strategy on pandas df + pd = pytest.importorskip("pandas") + + f = io.StringIO("Cat1,Cat2,Cat3,Cat4\n,i,x,\na,,y,\na,j,,\nb,j,x,") + + df = pd.read_csv(f, dtype=dtype) + + X_true = np.array( + [ + ["missing_value", "i", "x", "missing_value"], + ["a", "missing_value", "y", "missing_value"], + ["a", "j", "missing_value", "missing_value"], + ["b", "j", "x", "missing_value"], + ], + dtype=object, + ) + + imputer = SimpleImputer(strategy="constant", keep_empty_features=True) + X_trans = imputer.fit_transform(df) + + assert_array_equal(X_trans, X_true) + + +@pytest.mark.parametrize("X", [[[1], [2]], [[1], [np.nan]]]) +def test_iterative_imputer_one_feature(X): + # check we exit early when there is a single feature + imputer = IterativeImputer().fit(X) + assert imputer.n_iter_ == 0 + imputer = IterativeImputer() + imputer.fit([[1], [2]]) + assert imputer.n_iter_ == 0 + imputer.fit([[1], [np.nan]]) + assert imputer.n_iter_ == 0 + + +def test_imputation_pipeline_grid_search(): + # Test imputation within a pipeline + gridsearch. 
+ X = _sparse_random_matrix(100, 100, density=0.10) + missing_values = X.data[0] + + pipeline = Pipeline( + [ + ("imputer", SimpleImputer(missing_values=missing_values)), + ("tree", tree.DecisionTreeRegressor(random_state=0)), + ] + ) + + parameters = {"imputer__strategy": ["mean", "median", "most_frequent"]} + + Y = _sparse_random_matrix(100, 1, density=0.10).toarray() + gs = GridSearchCV(pipeline, parameters) + gs.fit(X, Y) + + +def test_imputation_copy(): + # Test imputation with copy + X_orig = _sparse_random_matrix(5, 5, density=0.75, random_state=0) + + # copy=True, dense => copy + X = X_orig.copy().toarray() + imputer = SimpleImputer(missing_values=0, strategy="mean", copy=True) + Xt = imputer.fit(X).transform(X) + Xt[0, 0] = -1 + assert not np.all(X == Xt) + + # copy=True, sparse csr => copy + X = X_orig.copy() + imputer = SimpleImputer(missing_values=X.data[0], strategy="mean", copy=True) + Xt = imputer.fit(X).transform(X) + Xt.data[0] = -1 + assert not np.all(X.data == Xt.data) + + # copy=False, dense => no copy + X = X_orig.copy().toarray() + imputer = SimpleImputer(missing_values=0, strategy="mean", copy=False) + Xt = imputer.fit(X).transform(X) + Xt[0, 0] = -1 + assert_array_almost_equal(X, Xt) + + # copy=False, sparse csc => no copy + X = X_orig.copy().tocsc() + imputer = SimpleImputer(missing_values=X.data[0], strategy="mean", copy=False) + Xt = imputer.fit(X).transform(X) + Xt.data[0] = -1 + assert_array_almost_equal(X.data, Xt.data) + + # copy=False, sparse csr => copy + X = X_orig.copy() + imputer = SimpleImputer(missing_values=X.data[0], strategy="mean", copy=False) + Xt = imputer.fit(X).transform(X) + Xt.data[0] = -1 + assert not np.all(X.data == Xt.data) + + # Note: If X is sparse and if missing_values=0, then a (dense) copy of X is + # made, even if copy=False. 
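A minimal standalone sketch of the copy semantics exercised just above (an illustrative addition, not part of the vendored test file; it assumes a dense float64 input so that no dtype conversion forces an extra copy — SimpleImputer and its copy parameter are the real scikit-learn API, the toy data is made up):

import numpy as np
from sklearn.impute import SimpleImputer

# Dense float input where 0 marks a missing entry.
X = np.array([[0.0, 1.0], [2.0, 0.0], [4.0, 5.0]])

# With copy=False and no dtype conversion, transform can fill the missing
# entries directly into the buffer of the array that was passed in.
imputer = SimpleImputer(missing_values=0, strategy="mean", copy=False)
Xt = imputer.fit(X).transform(X)

print(np.shares_memory(X, Xt))  # expected: True, Xt reuses X's memory
print(X)                        # the zeros are now the per-column means (3.0)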
+ + +def test_iterative_imputer_zero_iters(): + rng = np.random.RandomState(0) + + n = 100 + d = 10 + X = _sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() + missing_flag = X == 0 + X[missing_flag] = np.nan + + imputer = IterativeImputer(max_iter=0) + X_imputed = imputer.fit_transform(X) + # with max_iter=0, only initial imputation is performed + assert_allclose(X_imputed, imputer.initial_imputer_.transform(X)) + + # repeat but force n_iter_ to 0 + imputer = IterativeImputer(max_iter=5).fit(X) + # transformed should not be equal to initial imputation + assert not np.all(imputer.transform(X) == imputer.initial_imputer_.transform(X)) + + imputer.n_iter_ = 0 + # now they should be equal as only initial imputation is done + assert_allclose(imputer.transform(X), imputer.initial_imputer_.transform(X)) + + +def test_iterative_imputer_verbose(): + rng = np.random.RandomState(0) + + n = 100 + d = 3 + X = _sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() + imputer = IterativeImputer(missing_values=0, max_iter=1, verbose=1) + imputer.fit(X) + imputer.transform(X) + imputer = IterativeImputer(missing_values=0, max_iter=1, verbose=2) + imputer.fit(X) + imputer.transform(X) + + +def test_iterative_imputer_all_missing(): + n = 100 + d = 3 + X = np.zeros((n, d)) + imputer = IterativeImputer(missing_values=0, max_iter=1) + X_imputed = imputer.fit_transform(X) + assert_allclose(X_imputed, imputer.initial_imputer_.transform(X)) + + +@pytest.mark.parametrize( + "imputation_order", ["random", "roman", "ascending", "descending", "arabic"] +) +def test_iterative_imputer_imputation_order(imputation_order): + rng = np.random.RandomState(0) + n = 100 + d = 10 + max_iter = 2 + X = _sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() + X[:, 0] = 1 # this column should not be discarded by IterativeImputer + + imputer = IterativeImputer( + missing_values=0, + max_iter=max_iter, + n_nearest_features=5, + sample_posterior=False, + skip_complete=True, + min_value=0, + max_value=1, + verbose=1, + imputation_order=imputation_order, + random_state=rng, + ) + imputer.fit_transform(X) + ordered_idx = [i.feat_idx for i in imputer.imputation_sequence_] + + assert len(ordered_idx) // imputer.n_iter_ == imputer.n_features_with_missing_ + + if imputation_order == "roman": + assert np.all(ordered_idx[: d - 1] == np.arange(1, d)) + elif imputation_order == "arabic": + assert np.all(ordered_idx[: d - 1] == np.arange(d - 1, 0, -1)) + elif imputation_order == "random": + ordered_idx_round_1 = ordered_idx[: d - 1] + ordered_idx_round_2 = ordered_idx[d - 1 :] + assert ordered_idx_round_1 != ordered_idx_round_2 + elif "ending" in imputation_order: + assert len(ordered_idx) == max_iter * (d - 1) + + +@pytest.mark.parametrize( + "estimator", [None, DummyRegressor(), BayesianRidge(), ARDRegression(), RidgeCV()] +) +def test_iterative_imputer_estimators(estimator): + rng = np.random.RandomState(0) + + n = 100 + d = 10 + X = _sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() + + imputer = IterativeImputer( + missing_values=0, max_iter=1, estimator=estimator, random_state=rng + ) + imputer.fit_transform(X) + + # check that types are correct for estimators + hashes = [] + for triplet in imputer.imputation_sequence_: + expected_type = ( + type(estimator) if estimator is not None else type(BayesianRidge()) + ) + assert isinstance(triplet.estimator, expected_type) + hashes.append(id(triplet.estimator)) + + # check that each estimator is unique + assert len(set(hashes)) == 
len(hashes) + + +def test_iterative_imputer_clip(): + rng = np.random.RandomState(0) + n = 100 + d = 10 + X = _sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() + + imputer = IterativeImputer( + missing_values=0, max_iter=1, min_value=0.1, max_value=0.2, random_state=rng + ) + + Xt = imputer.fit_transform(X) + assert_allclose(np.min(Xt[X == 0]), 0.1) + assert_allclose(np.max(Xt[X == 0]), 0.2) + assert_allclose(Xt[X != 0], X[X != 0]) + + +def test_iterative_imputer_clip_truncnorm(): + rng = np.random.RandomState(0) + n = 100 + d = 10 + X = _sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() + X[:, 0] = 1 + + imputer = IterativeImputer( + missing_values=0, + max_iter=2, + n_nearest_features=5, + sample_posterior=True, + min_value=0.1, + max_value=0.2, + verbose=1, + imputation_order="random", + random_state=rng, + ) + Xt = imputer.fit_transform(X) + assert_allclose(np.min(Xt[X == 0]), 0.1) + assert_allclose(np.max(Xt[X == 0]), 0.2) + assert_allclose(Xt[X != 0], X[X != 0]) + + +def test_iterative_imputer_truncated_normal_posterior(): + # test that the values that are imputed using `sample_posterior=True` + # with boundaries (`min_value` and `max_value` are not None) are drawn + # from a distribution that looks gaussian via the Kolmogorov Smirnov test. + # note that starting from the wrong random seed will make this test fail + # because random sampling doesn't occur at all when the imputation + # is outside of the (min_value, max_value) range + rng = np.random.RandomState(42) + + X = rng.normal(size=(5, 5)) + X[0][0] = np.nan + + imputer = IterativeImputer( + min_value=0, max_value=0.5, sample_posterior=True, random_state=rng + ) + + imputer.fit_transform(X) + # generate multiple imputations for the single missing value + imputations = np.array([imputer.transform(X)[0][0] for _ in range(100)]) + + assert all(imputations >= 0) + assert all(imputations <= 0.5) + + mu, sigma = imputations.mean(), imputations.std() + ks_statistic, p_value = kstest((imputations - mu) / sigma, "norm") + if sigma == 0: + sigma += 1e-12 + ks_statistic, p_value = kstest((imputations - mu) / sigma, "norm") + # we want to fail to reject null hypothesis + # null hypothesis: distributions are the same + assert ks_statistic < 0.2 or p_value > 0.1, "The posterior does appear to be normal" + + +@pytest.mark.parametrize("strategy", ["mean", "median", "most_frequent"]) +def test_iterative_imputer_missing_at_transform(strategy): + rng = np.random.RandomState(0) + n = 100 + d = 10 + X_train = rng.randint(low=0, high=3, size=(n, d)) + X_test = rng.randint(low=0, high=3, size=(n, d)) + + X_train[:, 0] = 1 # definitely no missing values in 0th column + X_test[0, 0] = 0 # definitely missing value in 0th column + + imputer = IterativeImputer( + missing_values=0, max_iter=1, initial_strategy=strategy, random_state=rng + ).fit(X_train) + initial_imputer = SimpleImputer(missing_values=0, strategy=strategy).fit(X_train) + + # if there were no missing values at time of fit, then imputer will + # only use the initial imputer for that feature at transform + assert_allclose( + imputer.transform(X_test)[:, 0], initial_imputer.transform(X_test)[:, 0] + ) + + +def test_iterative_imputer_transform_stochasticity(): + rng1 = np.random.RandomState(0) + rng2 = np.random.RandomState(1) + n = 100 + d = 10 + X = _sparse_random_matrix(n, d, density=0.10, random_state=rng1).toarray() + + # when sample_posterior=True, two transforms shouldn't be equal + imputer = IterativeImputer( + missing_values=0, max_iter=1, 
sample_posterior=True, random_state=rng1 + ) + imputer.fit(X) + + X_fitted_1 = imputer.transform(X) + X_fitted_2 = imputer.transform(X) + + # sufficient to assert that the means are not the same + assert np.mean(X_fitted_1) != pytest.approx(np.mean(X_fitted_2)) + + # when sample_posterior=False, and n_nearest_features=None + # and imputation_order is not random + # the two transforms should be identical even if rng are different + imputer1 = IterativeImputer( + missing_values=0, + max_iter=1, + sample_posterior=False, + n_nearest_features=None, + imputation_order="ascending", + random_state=rng1, + ) + + imputer2 = IterativeImputer( + missing_values=0, + max_iter=1, + sample_posterior=False, + n_nearest_features=None, + imputation_order="ascending", + random_state=rng2, + ) + imputer1.fit(X) + imputer2.fit(X) + + X_fitted_1a = imputer1.transform(X) + X_fitted_1b = imputer1.transform(X) + X_fitted_2 = imputer2.transform(X) + + assert_allclose(X_fitted_1a, X_fitted_1b) + assert_allclose(X_fitted_1a, X_fitted_2) + + +def test_iterative_imputer_no_missing(): + rng = np.random.RandomState(0) + X = rng.rand(100, 100) + X[:, 0] = np.nan + m1 = IterativeImputer(max_iter=10, random_state=rng) + m2 = IterativeImputer(max_iter=10, random_state=rng) + pred1 = m1.fit(X).transform(X) + pred2 = m2.fit_transform(X) + # should exclude the first column entirely + assert_allclose(X[:, 1:], pred1) + # fit and fit_transform should both be identical + assert_allclose(pred1, pred2) + + +def test_iterative_imputer_rank_one(): + rng = np.random.RandomState(0) + d = 50 + A = rng.rand(d, 1) + B = rng.rand(1, d) + X = np.dot(A, B) + nan_mask = rng.rand(d, d) < 0.5 + X_missing = X.copy() + X_missing[nan_mask] = np.nan + + imputer = IterativeImputer(max_iter=5, verbose=1, random_state=rng) + X_filled = imputer.fit_transform(X_missing) + assert_allclose(X_filled, X, atol=0.02) + + +@pytest.mark.parametrize("rank", [3, 5]) +def test_iterative_imputer_transform_recovery(rank): + rng = np.random.RandomState(0) + n = 70 + d = 70 + A = rng.rand(n, rank) + B = rng.rand(rank, d) + X_filled = np.dot(A, B) + nan_mask = rng.rand(n, d) < 0.5 + X_missing = X_filled.copy() + X_missing[nan_mask] = np.nan + + # split up data in half + n = n // 2 + X_train = X_missing[:n] + X_test_filled = X_filled[n:] + X_test = X_missing[n:] + + imputer = IterativeImputer( + max_iter=5, imputation_order="descending", verbose=1, random_state=rng + ).fit(X_train) + X_test_est = imputer.transform(X_test) + assert_allclose(X_test_filled, X_test_est, atol=0.1) + + +def test_iterative_imputer_additive_matrix(): + rng = np.random.RandomState(0) + n = 100 + d = 10 + A = rng.randn(n, d) + B = rng.randn(n, d) + X_filled = np.zeros(A.shape) + for i in range(d): + for j in range(d): + X_filled[:, (i + j) % d] += (A[:, i] + B[:, j]) / 2 + # a quarter is randomly missing + nan_mask = rng.rand(n, d) < 0.25 + X_missing = X_filled.copy() + X_missing[nan_mask] = np.nan + + # split up data + n = n // 2 + X_train = X_missing[:n] + X_test_filled = X_filled[n:] + X_test = X_missing[n:] + + imputer = IterativeImputer(max_iter=10, verbose=1, random_state=rng).fit(X_train) + X_test_est = imputer.transform(X_test) + assert_allclose(X_test_filled, X_test_est, rtol=1e-3, atol=0.01) + + +def test_iterative_imputer_early_stopping(): + rng = np.random.RandomState(0) + n = 50 + d = 5 + A = rng.rand(n, 1) + B = rng.rand(1, d) + X = np.dot(A, B) + nan_mask = rng.rand(n, d) < 0.5 + X_missing = X.copy() + X_missing[nan_mask] = np.nan + + imputer = IterativeImputer( + max_iter=100, 
tol=1e-2, sample_posterior=False, verbose=1, random_state=rng + ) + X_filled_100 = imputer.fit_transform(X_missing) + assert len(imputer.imputation_sequence_) == d * imputer.n_iter_ + + imputer = IterativeImputer( + max_iter=imputer.n_iter_, sample_posterior=False, verbose=1, random_state=rng + ) + X_filled_early = imputer.fit_transform(X_missing) + assert_allclose(X_filled_100, X_filled_early, atol=1e-7) + + imputer = IterativeImputer( + max_iter=100, tol=0, sample_posterior=False, verbose=1, random_state=rng + ) + imputer.fit(X_missing) + assert imputer.n_iter_ == imputer.max_iter + + +def test_iterative_imputer_catch_warning(): + # check that we catch a RuntimeWarning due to a division by zero when a + # feature is constant in the dataset + X, y = load_diabetes(return_X_y=True) + n_samples, n_features = X.shape + + # simulate that a feature only contain one category during fit + X[:, 3] = 1 + + # add some missing values + rng = np.random.RandomState(0) + missing_rate = 0.15 + for feat in range(n_features): + sample_idx = rng.choice( + np.arange(n_samples), size=int(n_samples * missing_rate), replace=False + ) + X[sample_idx, feat] = np.nan + + imputer = IterativeImputer(n_nearest_features=5, sample_posterior=True) + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + X_fill = imputer.fit_transform(X, y) + assert not np.any(np.isnan(X_fill)) + + +@pytest.mark.parametrize( + "min_value, max_value, correct_output", + [ + (0, 100, np.array([[0] * 3, [100] * 3])), + (None, None, np.array([[-np.inf] * 3, [np.inf] * 3])), + (-np.inf, np.inf, np.array([[-np.inf] * 3, [np.inf] * 3])), + ([-5, 5, 10], [100, 200, 300], np.array([[-5, 5, 10], [100, 200, 300]])), + ( + [-5, -np.inf, 10], + [100, 200, np.inf], + np.array([[-5, -np.inf, 10], [100, 200, np.inf]]), + ), + ], + ids=["scalars", "None-default", "inf", "lists", "lists-with-inf"], +) +def test_iterative_imputer_min_max_array_like(min_value, max_value, correct_output): + # check that passing scalar or array-like + # for min_value and max_value in IterativeImputer works + X = np.random.RandomState(0).randn(10, 3) + imputer = IterativeImputer(min_value=min_value, max_value=max_value) + imputer.fit(X) + + assert isinstance(imputer._min_value, np.ndarray) and isinstance( + imputer._max_value, np.ndarray + ) + assert (imputer._min_value.shape[0] == X.shape[1]) and ( + imputer._max_value.shape[0] == X.shape[1] + ) + + assert_allclose(correct_output[0, :], imputer._min_value) + assert_allclose(correct_output[1, :], imputer._max_value) + + +@pytest.mark.parametrize( + "min_value, max_value, err_msg", + [ + (100, 0, "min_value >= max_value."), + (np.inf, -np.inf, "min_value >= max_value."), + ([-5, 5], [100, 200, 0], "_value' should be of shape"), + ([-5, 5, 5], [100, 200], "_value' should be of shape"), + ], +) +def test_iterative_imputer_catch_min_max_error(min_value, max_value, err_msg): + # check that passing scalar or array-like + # for min_value and max_value in IterativeImputer works + X = np.random.random((10, 3)) + imputer = IterativeImputer(min_value=min_value, max_value=max_value) + with pytest.raises(ValueError, match=err_msg): + imputer.fit(X) + + +@pytest.mark.parametrize( + "min_max_1, min_max_2", + [([None, None], [-np.inf, np.inf]), ([-10, 10], [[-10] * 4, [10] * 4])], + ids=["None-vs-inf", "Scalar-vs-vector"], +) +def test_iterative_imputer_min_max_array_like_imputation(min_max_1, min_max_2): + # Test that None/inf and scalar/vector give the same imputation + X_train = np.array( + [ + [np.nan, 2, 2, 1], 
+ [10, np.nan, np.nan, 7], + [3, 1, np.nan, 1], + [np.nan, 4, 2, np.nan], + ] + ) + X_test = np.array( + [[np.nan, 2, np.nan, 5], [2, 4, np.nan, np.nan], [np.nan, 1, 10, 1]] + ) + imputer1 = IterativeImputer( + min_value=min_max_1[0], max_value=min_max_1[1], random_state=0 + ) + imputer2 = IterativeImputer( + min_value=min_max_2[0], max_value=min_max_2[1], random_state=0 + ) + X_test_imputed1 = imputer1.fit(X_train).transform(X_test) + X_test_imputed2 = imputer2.fit(X_train).transform(X_test) + assert_allclose(X_test_imputed1[:, 0], X_test_imputed2[:, 0]) + + +@pytest.mark.parametrize("skip_complete", [True, False]) +def test_iterative_imputer_skip_non_missing(skip_complete): + # check the imputing strategy when missing data are present in the + # testing set only. + # taken from: https://github.com/scikit-learn/scikit-learn/issues/14383 + rng = np.random.RandomState(0) + X_train = np.array([[5, 2, 2, 1], [10, 1, 2, 7], [3, 1, 1, 1], [8, 4, 2, 2]]) + X_test = np.array([[np.nan, 2, 4, 5], [np.nan, 4, 1, 2], [np.nan, 1, 10, 1]]) + imputer = IterativeImputer( + initial_strategy="mean", skip_complete=skip_complete, random_state=rng + ) + X_test_est = imputer.fit(X_train).transform(X_test) + if skip_complete: + # impute with the initial strategy: 'mean' + assert_allclose(X_test_est[:, 0], np.mean(X_train[:, 0])) + else: + assert_allclose(X_test_est[:, 0], [11, 7, 12], rtol=1e-4) + + +@pytest.mark.parametrize("rs_imputer", [None, 1, np.random.RandomState(seed=1)]) +@pytest.mark.parametrize("rs_estimator", [None, 1, np.random.RandomState(seed=1)]) +def test_iterative_imputer_dont_set_random_state(rs_imputer, rs_estimator): + class ZeroEstimator: + def __init__(self, random_state): + self.random_state = random_state + + def fit(self, *args, **kgards): + return self + + def predict(self, X): + return np.zeros(X.shape[0]) + + estimator = ZeroEstimator(random_state=rs_estimator) + imputer = IterativeImputer(random_state=rs_imputer) + X_train = np.zeros((10, 3)) + imputer.fit(X_train) + assert estimator.random_state == rs_estimator + + +@pytest.mark.parametrize( + "X_fit, X_trans, params, msg_err", + [ + ( + np.array([[-1, 1], [1, 2]]), + np.array([[-1, 1], [1, -1]]), + {"features": "missing-only", "sparse": "auto"}, + "have missing values in transform but have no missing values in fit", + ), + ( + np.array([["a", "b"], ["c", "a"]], dtype=str), + np.array([["a", "b"], ["c", "a"]], dtype=str), + {}, + "MissingIndicator does not support data with dtype", + ), + ], +) +def test_missing_indicator_error(X_fit, X_trans, params, msg_err): + indicator = MissingIndicator(missing_values=-1) + indicator.set_params(**params) + with pytest.raises(ValueError, match=msg_err): + indicator.fit(X_fit).transform(X_trans) + + +def _generate_missing_indicator_cases(): + missing_values_dtypes = [(0, np.int32), (np.nan, np.float64), (-1, np.int32)] + arr_types = ( + [np.array] + + CSC_CONTAINERS + + CSR_CONTAINERS + + COO_CONTAINERS + + LIL_CONTAINERS + + BSR_CONTAINERS + ) + return [ + (arr_type, missing_values, dtype) + for arr_type, (missing_values, dtype) in product( + arr_types, missing_values_dtypes + ) + if not (missing_values == 0 and arr_type is not np.array) + ] + + +@pytest.mark.parametrize( + "arr_type, missing_values, dtype", _generate_missing_indicator_cases() +) +@pytest.mark.parametrize( + "param_features, n_features, features_indices", + [("missing-only", 3, np.array([0, 1, 2])), ("all", 3, np.array([0, 1, 2]))], +) +def test_missing_indicator_new( + missing_values, arr_type, dtype, param_features, 
n_features, features_indices +): + X_fit = np.array([[missing_values, missing_values, 1], [4, 2, missing_values]]) + X_trans = np.array([[missing_values, missing_values, 1], [4, 12, 10]]) + X_fit_expected = np.array([[1, 1, 0], [0, 0, 1]]) + X_trans_expected = np.array([[1, 1, 0], [0, 0, 0]]) + + # convert the input to the right array format and right dtype + X_fit = arr_type(X_fit).astype(dtype) + X_trans = arr_type(X_trans).astype(dtype) + X_fit_expected = X_fit_expected.astype(dtype) + X_trans_expected = X_trans_expected.astype(dtype) + + indicator = MissingIndicator( + missing_values=missing_values, features=param_features, sparse=False + ) + X_fit_mask = indicator.fit_transform(X_fit) + X_trans_mask = indicator.transform(X_trans) + + assert X_fit_mask.shape[1] == n_features + assert X_trans_mask.shape[1] == n_features + + assert_array_equal(indicator.features_, features_indices) + assert_allclose(X_fit_mask, X_fit_expected[:, features_indices]) + assert_allclose(X_trans_mask, X_trans_expected[:, features_indices]) + + assert X_fit_mask.dtype == bool + assert X_trans_mask.dtype == bool + assert isinstance(X_fit_mask, np.ndarray) + assert isinstance(X_trans_mask, np.ndarray) + + indicator.set_params(sparse=True) + X_fit_mask_sparse = indicator.fit_transform(X_fit) + X_trans_mask_sparse = indicator.transform(X_trans) + + assert X_fit_mask_sparse.dtype == bool + assert X_trans_mask_sparse.dtype == bool + assert X_fit_mask_sparse.format == "csc" + assert X_trans_mask_sparse.format == "csc" + assert_allclose(X_fit_mask_sparse.toarray(), X_fit_mask) + assert_allclose(X_trans_mask_sparse.toarray(), X_trans_mask) + + +@pytest.mark.parametrize( + "arr_type", + CSC_CONTAINERS + CSR_CONTAINERS + COO_CONTAINERS + LIL_CONTAINERS + BSR_CONTAINERS, +) +def test_missing_indicator_raise_on_sparse_with_missing_0(arr_type): + # test for sparse input and missing_value == 0 + + missing_values = 0 + X_fit = np.array([[missing_values, missing_values, 1], [4, missing_values, 2]]) + X_trans = np.array([[missing_values, missing_values, 1], [4, 12, 10]]) + + # convert the input to the right array format + X_fit_sparse = arr_type(X_fit) + X_trans_sparse = arr_type(X_trans) + + indicator = MissingIndicator(missing_values=missing_values) + + with pytest.raises(ValueError, match="Sparse input with missing_values=0"): + indicator.fit_transform(X_fit_sparse) + + indicator.fit_transform(X_fit) + with pytest.raises(ValueError, match="Sparse input with missing_values=0"): + indicator.transform(X_trans_sparse) + + +@pytest.mark.parametrize("param_sparse", [True, False, "auto"]) +@pytest.mark.parametrize( + "arr_type, missing_values", + [(np.array, 0)] + + list( + product( + CSC_CONTAINERS + + CSR_CONTAINERS + + COO_CONTAINERS + + LIL_CONTAINERS + + BSR_CONTAINERS, + [np.nan], + ) + ), +) +def test_missing_indicator_sparse_param(arr_type, missing_values, param_sparse): + # check the format of the output with different sparse parameter + X_fit = np.array([[missing_values, missing_values, 1], [4, missing_values, 2]]) + X_trans = np.array([[missing_values, missing_values, 1], [4, 12, 10]]) + X_fit = arr_type(X_fit).astype(np.float64) + X_trans = arr_type(X_trans).astype(np.float64) + + indicator = MissingIndicator(missing_values=missing_values, sparse=param_sparse) + X_fit_mask = indicator.fit_transform(X_fit) + X_trans_mask = indicator.transform(X_trans) + + if param_sparse is True: + assert X_fit_mask.format == "csc" + assert X_trans_mask.format == "csc" + elif param_sparse == "auto" and missing_values == 0: + assert 
isinstance(X_fit_mask, np.ndarray) + assert isinstance(X_trans_mask, np.ndarray) + elif param_sparse is False: + assert isinstance(X_fit_mask, np.ndarray) + assert isinstance(X_trans_mask, np.ndarray) + else: + if sparse.issparse(X_fit): + assert X_fit_mask.format == "csc" + assert X_trans_mask.format == "csc" + else: + assert isinstance(X_fit_mask, np.ndarray) + assert isinstance(X_trans_mask, np.ndarray) + + +def test_missing_indicator_string(): + X = np.array([["a", "b", "c"], ["b", "c", "a"]], dtype=object) + indicator = MissingIndicator(missing_values="a", features="all") + X_trans = indicator.fit_transform(X) + assert_array_equal(X_trans, np.array([[True, False, False], [False, False, True]])) + + +@pytest.mark.parametrize( + "X, missing_values, X_trans_exp", + [ + ( + np.array([["a", "b"], ["b", "a"]], dtype=object), + "a", + np.array([["b", "b", True, False], ["b", "b", False, True]], dtype=object), + ), + ( + np.array([[np.nan, 1.0], [1.0, np.nan]]), + np.nan, + np.array([[1.0, 1.0, True, False], [1.0, 1.0, False, True]]), + ), + ( + np.array([[np.nan, "b"], ["b", np.nan]], dtype=object), + np.nan, + np.array([["b", "b", True, False], ["b", "b", False, True]], dtype=object), + ), + ( + np.array([[None, "b"], ["b", None]], dtype=object), + None, + np.array([["b", "b", True, False], ["b", "b", False, True]], dtype=object), + ), + ], +) +def test_missing_indicator_with_imputer(X, missing_values, X_trans_exp): + trans = make_union( + SimpleImputer(missing_values=missing_values, strategy="most_frequent"), + MissingIndicator(missing_values=missing_values), + ) + X_trans = trans.fit_transform(X) + assert_array_equal(X_trans, X_trans_exp) + + +@pytest.mark.parametrize("imputer_constructor", [SimpleImputer, IterativeImputer]) +@pytest.mark.parametrize( + "imputer_missing_values, missing_value, err_msg", + [ + ("NaN", np.nan, "Input X contains NaN"), + ("-1", -1, "types are expected to be both numerical."), + ], +) +def test_inconsistent_dtype_X_missing_values( + imputer_constructor, imputer_missing_values, missing_value, err_msg +): + # regression test for issue #11390. Comparison between incoherent dtype + # for X and missing_values was not raising a proper error. + rng = np.random.RandomState(42) + X = rng.randn(10, 10) + X[0, 0] = missing_value + + imputer = imputer_constructor(missing_values=imputer_missing_values) + + with pytest.raises(ValueError, match=err_msg): + imputer.fit_transform(X) + + +def test_missing_indicator_no_missing(): + # check that all features are dropped if there are no missing values when + # features='missing-only' (#13491) + X = np.array([[1, 1], [1, 1]]) + + mi = MissingIndicator(features="missing-only", missing_values=-1) + Xt = mi.fit_transform(X) + + assert Xt.shape[1] == 0 + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_missing_indicator_sparse_no_explicit_zeros(csr_container): + # Check that non missing values don't become explicit zeros in the mask + # generated by missing indicator when X is sparse. 
(#13491) + X = csr_container([[0, 1, 2], [1, 2, 0], [2, 0, 1]]) + + mi = MissingIndicator(features="all", missing_values=1) + Xt = mi.fit_transform(X) + + assert Xt.nnz == Xt.sum() + + +@pytest.mark.parametrize("imputer_constructor", [SimpleImputer, IterativeImputer]) +def test_imputer_without_indicator(imputer_constructor): + X = np.array([[1, 1], [1, 1]]) + imputer = imputer_constructor() + imputer.fit(X) + + assert imputer.indicator_ is None + + +@pytest.mark.parametrize( + "arr_type", + CSC_CONTAINERS + CSR_CONTAINERS + COO_CONTAINERS + LIL_CONTAINERS + BSR_CONTAINERS, +) +def test_simple_imputation_add_indicator_sparse_matrix(arr_type): + X_sparse = arr_type([[np.nan, 1, 5], [2, np.nan, 1], [6, 3, np.nan], [1, 2, 9]]) + X_true = np.array( + [ + [3.0, 1.0, 5.0, 1.0, 0.0, 0.0], + [2.0, 2.0, 1.0, 0.0, 1.0, 0.0], + [6.0, 3.0, 5.0, 0.0, 0.0, 1.0], + [1.0, 2.0, 9.0, 0.0, 0.0, 0.0], + ] + ) + + imputer = SimpleImputer(missing_values=np.nan, add_indicator=True) + X_trans = imputer.fit_transform(X_sparse) + + assert sparse.issparse(X_trans) + assert X_trans.shape == X_true.shape + assert_allclose(X_trans.toarray(), X_true) + + +@pytest.mark.parametrize( + "strategy, expected", [("most_frequent", "b"), ("constant", "missing_value")] +) +def test_simple_imputation_string_list(strategy, expected): + X = [["a", "b"], ["c", np.nan]] + + X_true = np.array([["a", "b"], ["c", expected]], dtype=object) + + imputer = SimpleImputer(strategy=strategy) + X_trans = imputer.fit_transform(X) + + assert_array_equal(X_trans, X_true) + + +@pytest.mark.parametrize( + "order, idx_order", + [("ascending", [3, 4, 2, 0, 1]), ("descending", [1, 0, 2, 4, 3])], +) +def test_imputation_order(order, idx_order): + # regression test for #15393 + rng = np.random.RandomState(42) + X = rng.rand(100, 5) + X[:50, 1] = np.nan + X[:30, 0] = np.nan + X[:20, 2] = np.nan + X[:10, 4] = np.nan + + with pytest.warns(ConvergenceWarning): + trs = IterativeImputer(max_iter=1, imputation_order=order, random_state=0).fit( + X + ) + idx = [x.feat_idx for x in trs.imputation_sequence_] + assert idx == idx_order + + +@pytest.mark.parametrize("missing_value", [-1, np.nan]) +def test_simple_imputation_inverse_transform(missing_value): + # Test inverse_transform feature for np.nan + X_1 = np.array( + [ + [9, missing_value, 3, -1], + [4, -1, 5, 4], + [6, 7, missing_value, -1], + [8, 9, 0, missing_value], + ] + ) + + X_2 = np.array( + [ + [5, 4, 2, 1], + [2, 1, missing_value, 3], + [9, missing_value, 7, 1], + [6, 4, 2, missing_value], + ] + ) + + X_3 = np.array( + [ + [1, missing_value, 5, 9], + [missing_value, 4, missing_value, missing_value], + [2, missing_value, 7, missing_value], + [missing_value, 3, missing_value, 8], + ] + ) + + X_4 = np.array( + [ + [1, 1, 1, 3], + [missing_value, 2, missing_value, 1], + [2, 3, 3, 4], + [missing_value, 4, missing_value, 2], + ] + ) + + imputer = SimpleImputer( + missing_values=missing_value, strategy="mean", add_indicator=True + ) + + X_1_trans = imputer.fit_transform(X_1) + X_1_inv_trans = imputer.inverse_transform(X_1_trans) + + X_2_trans = imputer.transform(X_2) # test on new data + X_2_inv_trans = imputer.inverse_transform(X_2_trans) + + assert_array_equal(X_1_inv_trans, X_1) + assert_array_equal(X_2_inv_trans, X_2) + + for X in [X_3, X_4]: + X_trans = imputer.fit_transform(X) + X_inv_trans = imputer.inverse_transform(X_trans) + assert_array_equal(X_inv_trans, X) + + +@pytest.mark.parametrize("missing_value", [-1, np.nan]) +def test_simple_imputation_inverse_transform_exceptions(missing_value): + X_1 = 
np.array( + [ + [9, missing_value, 3, -1], + [4, -1, 5, 4], + [6, 7, missing_value, -1], + [8, 9, 0, missing_value], + ] + ) + + imputer = SimpleImputer(missing_values=missing_value, strategy="mean") + X_1_trans = imputer.fit_transform(X_1) + with pytest.raises( + ValueError, match=f"Got 'add_indicator={imputer.add_indicator}'" + ): + imputer.inverse_transform(X_1_trans) + + +@pytest.mark.parametrize( + "expected,array,dtype,extra_value,n_repeat", + [ + # array of object dtype + ("extra_value", ["a", "b", "c"], object, "extra_value", 2), + ( + "most_frequent_value", + ["most_frequent_value", "most_frequent_value", "value"], + object, + "extra_value", + 1, + ), + ("a", ["min_value", "min_valuevalue"], object, "a", 2), + ("min_value", ["min_value", "min_value", "value"], object, "z", 2), + # array of numeric dtype + (10, [1, 2, 3], int, 10, 2), + (1, [1, 1, 2], int, 10, 1), + (10, [20, 20, 1], int, 10, 2), + (1, [1, 1, 20], int, 10, 2), + ], +) +def test_most_frequent(expected, array, dtype, extra_value, n_repeat): + assert expected == _most_frequent( + np.array(array, dtype=dtype), extra_value, n_repeat + ) + + +@pytest.mark.parametrize( + "expected,array", + [ + ("a", ["a", "b"]), + (1, [1, 2]), + (None, [None, "a"]), + (None, [None, 1]), + (None, [None, "a", 1]), + (1, [1, "1"]), + (1, ["1", 1]), + ], +) +def test_most_frequent_tie_object(expected, array): + """Check the tie breaking behavior of the most frequent strategy. + + Non-regression test for issue #31717. + """ + assert expected == _most_frequent(np.array(array, dtype=object), None, 0) + + +@pytest.mark.parametrize( + "initial_strategy", ["mean", "median", "most_frequent", "constant"] +) +def test_iterative_imputer_keep_empty_features(initial_strategy): + """Check the behaviour of the iterative imputer with different initial strategy + and keeping empty features (i.e. features containing only missing values). + """ + X = np.array([[1, np.nan, 2], [3, np.nan, np.nan]]) + + imputer = IterativeImputer( + initial_strategy=initial_strategy, keep_empty_features=True + ) + X_imputed = imputer.fit_transform(X) + assert_allclose(X_imputed[:, 1], 0) + X_imputed = imputer.transform(X) + assert_allclose(X_imputed[:, 1], 0) + + +# TODO (1.8): check that `keep_empty_features=False` drop the +# empty features due to the behaviour change. +def test_iterative_imputer_constant_fill_value(): + """Check that we propagate properly the parameter `fill_value`.""" + X = np.array([[-1, 2, 3, -1], [4, -1, 5, -1], [6, 7, -1, -1], [8, 9, 0, -1]]) + + fill_value = 100 + imputer = IterativeImputer( + missing_values=-1, + initial_strategy="constant", + fill_value=fill_value, + max_iter=0, + keep_empty_features=True, + ) + imputer.fit_transform(X) + assert_array_equal(imputer.initial_imputer_.statistics_, fill_value) + + +def test_iterative_imputer_min_max_value_remove_empty(): + """Check that we properly apply the empty feature mask to `min_value` and + `max_value`. 
+ + Non-regression test for https://github.com/scikit-learn/scikit-learn/issues/29355 + """ + # Intentionally make column 2 as a missing column, then the bound of the imputed + # value of column 3 should be (4, 5) + X = np.array( + [ + [1, 2, np.nan, np.nan], + [4, 5, np.nan, 6], + [7, 8, np.nan, np.nan], + [10, 11, np.nan, 12], + ] + ) + min_value = [-np.inf, -np.inf, -np.inf, 4] + max_value = [np.inf, np.inf, np.inf, 5] + + X_imputed = IterativeImputer( + min_value=min_value, + max_value=max_value, + keep_empty_features=False, + ).fit_transform(X) + + X_without_missing_column = np.delete(X, 2, axis=1) + assert X_imputed.shape == X_without_missing_column.shape + assert np.min(X_imputed[np.isnan(X_without_missing_column)]) == pytest.approx(4) + assert np.max(X_imputed[np.isnan(X_without_missing_column)]) == pytest.approx(5) + + # Intentionally make column 3 as a missing column, then the bound of the imputed + # value of column 2 should be (3.5, 6) + X = np.array( + [ + [1, 2, np.nan, np.nan], + [4, 5, 6, np.nan], + [7, 8, np.nan, np.nan], + [10, 11, 12, np.nan], + ] + ) + min_value = [-np.inf, -np.inf, 3.5, -np.inf] + max_value = [np.inf, np.inf, 6, np.inf] + + X_imputed = IterativeImputer( + min_value=min_value, + max_value=max_value, + keep_empty_features=False, + ).fit_transform(X) + + X_without_missing_column = X[:, :3] + assert X_imputed.shape == X_without_missing_column.shape + assert np.min(X_imputed[np.isnan(X_without_missing_column)]) == pytest.approx(3.5) + assert np.max(X_imputed[np.isnan(X_without_missing_column)]) == pytest.approx(6) + + +@pytest.mark.parametrize("keep_empty_features", [True, False]) +def test_knn_imputer_keep_empty_features(keep_empty_features): + """Check the behaviour of `keep_empty_features` for `KNNImputer`.""" + X = np.array([[1, np.nan, 2], [3, np.nan, np.nan]]) + + imputer = KNNImputer(keep_empty_features=keep_empty_features) + + for method in ["fit_transform", "transform"]: + X_imputed = getattr(imputer, method)(X) + if keep_empty_features: + assert X_imputed.shape == X.shape + assert_array_equal(X_imputed[:, 1], 0) + else: + assert X_imputed.shape == (X.shape[0], X.shape[1] - 1) + + +def test_simple_impute_pd_na(): + pd = pytest.importorskip("pandas") + + # Impute pandas array of string types. + df = pd.DataFrame({"feature": pd.Series(["abc", None, "de"], dtype="string")}) + imputer = SimpleImputer(missing_values=pd.NA, strategy="constant", fill_value="na") + _assert_array_equal_and_same_dtype( + imputer.fit_transform(df), np.array([["abc"], ["na"], ["de"]], dtype=object) + ) + + # Impute pandas array of string types without any missing values. + df = pd.DataFrame({"feature": pd.Series(["abc", "de", "fgh"], dtype="string")}) + imputer = SimpleImputer(fill_value="ok", strategy="constant") + _assert_array_equal_and_same_dtype( + imputer.fit_transform(df), np.array([["abc"], ["de"], ["fgh"]], dtype=object) + ) + + # Impute pandas array of integer types. + df = pd.DataFrame({"feature": pd.Series([1, None, 3], dtype="Int64")}) + imputer = SimpleImputer(missing_values=pd.NA, strategy="constant", fill_value=-1) + _assert_allclose_and_same_dtype( + imputer.fit_transform(df), np.array([[1], [-1], [3]], dtype="float64") + ) + + # Use `np.nan` also works. + imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=-1) + _assert_allclose_and_same_dtype( + imputer.fit_transform(df), np.array([[1], [-1], [3]], dtype="float64") + ) + + # Impute pandas array of integer types with 'median' strategy. 
+ df = pd.DataFrame({"feature": pd.Series([1, None, 2, 3], dtype="Int64")}) + imputer = SimpleImputer(missing_values=pd.NA, strategy="median") + _assert_allclose_and_same_dtype( + imputer.fit_transform(df), np.array([[1], [2], [2], [3]], dtype="float64") + ) + + # Impute pandas array of integer types with 'mean' strategy. + df = pd.DataFrame({"feature": pd.Series([1, None, 2], dtype="Int64")}) + imputer = SimpleImputer(missing_values=pd.NA, strategy="mean") + _assert_allclose_and_same_dtype( + imputer.fit_transform(df), np.array([[1], [1.5], [2]], dtype="float64") + ) + + # Impute pandas array of float types. + df = pd.DataFrame({"feature": pd.Series([1.0, None, 3.0], dtype="float64")}) + imputer = SimpleImputer(missing_values=pd.NA, strategy="constant", fill_value=-2.0) + _assert_allclose_and_same_dtype( + imputer.fit_transform(df), np.array([[1.0], [-2.0], [3.0]], dtype="float64") + ) + + # Impute pandas array of float types with 'median' strategy. + df = pd.DataFrame({"feature": pd.Series([1.0, None, 2.0, 3.0], dtype="float64")}) + imputer = SimpleImputer(missing_values=pd.NA, strategy="median") + _assert_allclose_and_same_dtype( + imputer.fit_transform(df), + np.array([[1.0], [2.0], [2.0], [3.0]], dtype="float64"), + ) + + +def test_missing_indicator_feature_names_out(): + """Check that missing indicator return the feature names with a prefix.""" + pd = pytest.importorskip("pandas") + + missing_values = np.nan + X = pd.DataFrame( + [ + [missing_values, missing_values, 1, missing_values], + [4, missing_values, 2, 10], + ], + columns=["a", "b", "c", "d"], + ) + + indicator = MissingIndicator(missing_values=missing_values).fit(X) + feature_names = indicator.get_feature_names_out() + expected_names = ["missingindicator_a", "missingindicator_b", "missingindicator_d"] + assert_array_equal(expected_names, feature_names) + + +def test_imputer_lists_fit_transform(): + """Check transform uses object dtype when fitted on an object dtype. + + Non-regression test for #19572. + """ + + X = [["a", "b"], ["c", "b"], ["a", "a"]] + imp_frequent = SimpleImputer(strategy="most_frequent").fit(X) + X_trans = imp_frequent.transform([[np.nan, np.nan]]) + assert X_trans.dtype == object + assert_array_equal(X_trans, [["a", "b"]]) + + +@pytest.mark.parametrize("dtype_test", [np.float32, np.float64]) +def test_imputer_transform_preserves_numeric_dtype(dtype_test): + """Check transform preserves numeric dtype independent of fit dtype.""" + X = np.asarray( + [[1.2, 3.4, np.nan], [np.nan, 1.2, 1.3], [4.2, 2, 1]], dtype=np.float64 + ) + imp = SimpleImputer().fit(X) + + X_test = np.asarray([[np.nan, np.nan, np.nan]], dtype=dtype_test) + X_trans = imp.transform(X_test) + assert X_trans.dtype == dtype_test + + +@pytest.mark.parametrize("array_type", ["array", "sparse"]) +@pytest.mark.parametrize("keep_empty_features", [True, False]) +def test_simple_imputer_constant_keep_empty_features(array_type, keep_empty_features): + """Check the behaviour of `keep_empty_features` with `strategy='constant'. + For backward compatibility, a column full of missing values will always be + fill and never dropped. 
+ """ + X = np.array([[np.nan, 2], [np.nan, 3], [np.nan, 6]]) + X = _convert_container(X, array_type) + fill_value = 10 + imputer = SimpleImputer( + strategy="constant", + fill_value=fill_value, + keep_empty_features=keep_empty_features, + ) + + for method in ["fit_transform", "transform"]: + # TODO(1.8): Remove the condition and still call getattr(imputer, method)(X) + if method.startswith("fit") and not keep_empty_features: + warn_msg = '`strategy="constant"`, empty features are not dropped. ' + with pytest.warns(FutureWarning, match=warn_msg): + X_imputed = getattr(imputer, method)(X) + else: + X_imputed = getattr(imputer, method)(X) + assert X_imputed.shape == X.shape + constant_feature = ( + X_imputed[:, 0].toarray() if array_type == "sparse" else X_imputed[:, 0] + ) + assert_array_equal(constant_feature, fill_value) + + +@pytest.mark.parametrize("array_type", ["array", "sparse"]) +@pytest.mark.parametrize("strategy", ["mean", "median", "most_frequent"]) +@pytest.mark.parametrize("keep_empty_features", [True, False]) +def test_simple_imputer_keep_empty_features(strategy, array_type, keep_empty_features): + """Check the behaviour of `keep_empty_features` with all strategies but + 'constant'. + """ + X = np.array([[np.nan, 2], [np.nan, 3], [np.nan, 6]]) + X = _convert_container(X, array_type) + imputer = SimpleImputer(strategy=strategy, keep_empty_features=keep_empty_features) + + for method in ["fit_transform", "transform"]: + X_imputed = getattr(imputer, method)(X) + if keep_empty_features: + assert X_imputed.shape == X.shape + constant_feature = ( + X_imputed[:, 0].toarray() if array_type == "sparse" else X_imputed[:, 0] + ) + assert_array_equal(constant_feature, 0) + else: + assert X_imputed.shape == (X.shape[0], X.shape[1] - 1) + + +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_imputation_custom(csc_container): + X = np.array( + [ + [1.1, 1.1, 1.1], + [3.9, 1.2, np.nan], + [np.nan, 1.3, np.nan], + [0.1, 1.4, 1.4], + [4.9, 1.5, 1.5], + [np.nan, 1.6, 1.6], + ] + ) + + X_true = np.array( + [ + [1.1, 1.1, 1.1], + [3.9, 1.2, 1.1], + [0.1, 1.3, 1.1], + [0.1, 1.4, 1.4], + [4.9, 1.5, 1.5], + [0.1, 1.6, 1.6], + ] + ) + + imputer = SimpleImputer(missing_values=np.nan, strategy=np.min) + X_trans = imputer.fit_transform(X) + assert_array_equal(X_trans, X_true) + + # Sparse matrix + imputer = SimpleImputer(missing_values=np.nan, strategy=np.min) + X_trans = imputer.fit_transform(csc_container(X)) + assert_array_equal(X_trans.toarray(), X_true) + + +def test_simple_imputer_constant_fill_value_casting(): + """Check that we raise a proper error message when we cannot cast the fill value + to the input data type. Otherwise, check that the casting is done properly. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/28309 + """ + # cannot cast fill_value at fit + fill_value = 1.5 + X_int64 = np.array([[1, 2, 3], [2, 3, 4]], dtype=np.int64) + imputer = SimpleImputer( + strategy="constant", fill_value=fill_value, missing_values=2 + ) + err_msg = f"fill_value={fill_value!r} (of type {type(fill_value)!r}) cannot be cast" + with pytest.raises(ValueError, match=re.escape(err_msg)): + imputer.fit(X_int64) + + # cannot cast fill_value at transform + X_float64 = np.array([[1, 2, 3], [2, 3, 4]], dtype=np.float64) + imputer.fit(X_float64) + err_msg = ( + f"The dtype of the filling value (i.e. 
{imputer.statistics_.dtype!r}) " + "cannot be cast" + ) + with pytest.raises(ValueError, match=re.escape(err_msg)): + imputer.transform(X_int64) + + # check that no error is raised when having the same kind of dtype + fill_value_list = [np.float64(1.5), 1.5, 1] + X_float32 = X_float64.astype(np.float32) + + for fill_value in fill_value_list: + imputer = SimpleImputer( + strategy="constant", fill_value=fill_value, missing_values=2 + ) + X_trans = imputer.fit_transform(X_float32) + assert X_trans.dtype == X_float32.dtype + + +@pytest.mark.parametrize("strategy", ["mean", "median", "most_frequent", "constant"]) +def test_iterative_imputer_no_empty_features(strategy): + """Check the behaviour of `keep_empty_features` with no empty features. + + With no-empty features, we should get the same imputation whatever the + parameter `keep_empty_features`. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/29375 + """ + X = np.array([[np.nan, 0, 1], [2, np.nan, 3], [4, 5, np.nan]]) + + imputer_drop_empty_features = IterativeImputer( + initial_strategy=strategy, fill_value=1, keep_empty_features=False + ) + + imputer_keep_empty_features = IterativeImputer( + initial_strategy=strategy, fill_value=1, keep_empty_features=True + ) + + assert_allclose( + imputer_drop_empty_features.fit_transform(X), + imputer_keep_empty_features.fit_transform(X), + ) + + +@pytest.mark.parametrize("strategy", ["mean", "median", "most_frequent", "constant"]) +@pytest.mark.parametrize( + "X_test", + [ + np.array([[1, 2, 3, 4], [5, 6, 7, 8]]), # without empty feature + np.array([[np.nan, 2, 3, 4], [np.nan, 6, 7, 8]]), # empty feature at column 0 + np.array([[1, 2, 3, np.nan], [5, 6, 7, np.nan]]), # empty feature at column 3 + ], +) +def test_iterative_imputer_with_empty_features(strategy, X_test): + """Check the behaviour of `keep_empty_features` in the presence of empty features. + + With `keep_empty_features=True`, the empty feature will be imputed with the value + defined by the initial imputation. 
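+
+    A rough sketch of the expected outcome (illustrative data; the assertions
+    below are the authoritative expectations)::
+
+        import numpy as np
+        from sklearn.experimental import enable_iterative_imputer  # noqa: F401
+        from sklearn.impute import IterativeImputer
+
+        X_demo = np.array([[np.nan, 1.0], [np.nan, 2.0], [np.nan, 3.0]])
+        imp = IterativeImputer(initial_strategy="constant", fill_value=0,
+                               keep_empty_features=True)
+        imp.fit_transform(X_demo)[:, 0]  # array([0., 0., 0.]): initial fill kept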
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/29375 + """ + X_train = np.array( + [[np.nan, np.nan, 0, 1], [np.nan, 2, np.nan, 3], [np.nan, 4, 5, np.nan]] + ) + + imputer_drop_empty_features = IterativeImputer( + initial_strategy=strategy, fill_value=0, keep_empty_features=False + ) + X_train_drop_empty_features = imputer_drop_empty_features.fit_transform(X_train) + X_test_drop_empty_features = imputer_drop_empty_features.transform(X_test) + + imputer_keep_empty_features = IterativeImputer( + initial_strategy=strategy, fill_value=0, keep_empty_features=True + ) + X_train_keep_empty_features = imputer_keep_empty_features.fit_transform(X_train) + X_test_keep_empty_features = imputer_keep_empty_features.transform(X_test) + + assert_allclose(X_train_drop_empty_features, X_train_keep_empty_features[:, 1:]) + assert_allclose(X_train_keep_empty_features[:, 0], 0) + + assert X_train_drop_empty_features.shape[1] == X_test_drop_empty_features.shape[1] + assert X_train_keep_empty_features.shape[1] == X_test_keep_empty_features.shape[1] diff --git a/.venv/lib/python3.12/site-packages/sklearn/impute/tests/test_knn.py b/.venv/lib/python3.12/site-packages/sklearn/impute/tests/test_knn.py new file mode 100644 index 0000000000000000000000000000000000000000..34244d628600fc29ae2af1e620f34c83eafc6d81 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/impute/tests/test_knn.py @@ -0,0 +1,570 @@ +import numpy as np +import pytest + +from sklearn import config_context +from sklearn.impute import KNNImputer +from sklearn.metrics.pairwise import nan_euclidean_distances, pairwise_distances +from sklearn.neighbors import KNeighborsRegressor +from sklearn.utils._testing import assert_allclose + + +@pytest.mark.parametrize("weights", ["uniform", "distance"]) +@pytest.mark.parametrize("n_neighbors", range(1, 6)) +def test_knn_imputer_shape(weights, n_neighbors): + # Verify the shapes of the imputed matrix for different weights and + # number of neighbors. 
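+    # Illustrative sketch (not part of this test): with the default
+    # nan_euclidean metric and uniform weights, a missing entry is replaced by
+    # the column mean over the n_neighbors nearest donor rows, e.g.
+    #
+    #     X_demo = np.array([[1.0, 2.0], [3.0, 4.0], [np.nan, 6.0]])
+    #     KNNImputer(n_neighbors=2).fit_transform(X_demo)
+    #     # -> [[1., 2.], [3., 4.], [2., 6.]]  (mean of donors 1.0 and 3.0)
+    #
+    # The output shape always matches the input shape, which is what is
+    # asserted here.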
+ n_rows = 10 + n_cols = 2 + X = np.random.rand(n_rows, n_cols) + X[0, 0] = np.nan + + imputer = KNNImputer(n_neighbors=n_neighbors, weights=weights) + X_imputed = imputer.fit_transform(X) + assert X_imputed.shape == (n_rows, n_cols) + + +@pytest.mark.parametrize("na", [np.nan, -1]) +def test_knn_imputer_default_with_invalid_input(na): + # Test imputation with default values and invalid input + + # Test with inf present + X = np.array( + [ + [np.inf, 1, 1, 2, na], + [2, 1, 2, 2, 3], + [3, 2, 3, 3, 8], + [na, 6, 0, 5, 13], + [na, 7, 0, 7, 8], + [6, 6, 2, 5, 7], + ] + ) + with pytest.raises(ValueError, match="Input X contains (infinity|NaN)"): + KNNImputer(missing_values=na).fit(X) + + # Test with inf present in matrix passed in transform() + X = np.array( + [ + [np.inf, 1, 1, 2, na], + [2, 1, 2, 2, 3], + [3, 2, 3, 3, 8], + [na, 6, 0, 5, 13], + [na, 7, 0, 7, 8], + [6, 6, 2, 5, 7], + ] + ) + + X_fit = np.array( + [ + [0, 1, 1, 2, na], + [2, 1, 2, 2, 3], + [3, 2, 3, 3, 8], + [na, 6, 0, 5, 13], + [na, 7, 0, 7, 8], + [6, 6, 2, 5, 7], + ] + ) + imputer = KNNImputer(missing_values=na).fit(X_fit) + with pytest.raises(ValueError, match="Input X contains (infinity|NaN)"): + imputer.transform(X) + + # Test with missing_values=0 when NaN present + imputer = KNNImputer(missing_values=0, n_neighbors=2, weights="uniform") + X = np.array( + [ + [np.nan, 0, 0, 0, 5], + [np.nan, 1, 0, np.nan, 3], + [np.nan, 2, 0, 0, 0], + [np.nan, 6, 0, 5, 13], + ] + ) + msg = "Input X contains NaN" + with pytest.raises(ValueError, match=msg): + imputer.fit(X) + + X = np.array( + [ + [0, 0], + [np.nan, 2], + ] + ) + + +@pytest.mark.parametrize("na", [np.nan, -1]) +def test_knn_imputer_removes_all_na_features(na): + X = np.array( + [ + [1, 1, na, 1, 1, 1.0], + [2, 3, na, 2, 2, 2], + [3, 4, na, 3, 3, na], + [6, 4, na, na, 6, 6], + ] + ) + knn = KNNImputer(missing_values=na, n_neighbors=2).fit(X) + + X_transform = knn.transform(X) + assert not np.isnan(X_transform).any() + assert X_transform.shape == (4, 5) + + X_test = np.arange(0, 12).reshape(2, 6) + X_transform = knn.transform(X_test) + assert_allclose(X_test[:, [0, 1, 3, 4, 5]], X_transform) + + +@pytest.mark.parametrize("na", [np.nan, -1]) +def test_knn_imputer_zero_nan_imputes_the_same(na): + # Test with an imputable matrix and compare with different missing_values + X_zero = np.array( + [ + [1, 0, 1, 1, 1.0], + [2, 2, 2, 2, 2], + [3, 3, 3, 3, 0], + [6, 6, 0, 6, 6], + ] + ) + + X_nan = np.array( + [ + [1, na, 1, 1, 1.0], + [2, 2, 2, 2, 2], + [3, 3, 3, 3, na], + [6, 6, na, 6, 6], + ] + ) + + X_imputed = np.array( + [ + [1, 2.5, 1, 1, 1.0], + [2, 2, 2, 2, 2], + [3, 3, 3, 3, 1.5], + [6, 6, 2.5, 6, 6], + ] + ) + + imputer_zero = KNNImputer(missing_values=0, n_neighbors=2, weights="uniform") + + imputer_nan = KNNImputer(missing_values=na, n_neighbors=2, weights="uniform") + + assert_allclose(imputer_zero.fit_transform(X_zero), X_imputed) + assert_allclose( + imputer_zero.fit_transform(X_zero), imputer_nan.fit_transform(X_nan) + ) + + +@pytest.mark.parametrize("na", [np.nan, -1]) +def test_knn_imputer_verify(na): + # Test with an imputable matrix + X = np.array( + [ + [1, 0, 0, 1], + [2, 1, 2, na], + [3, 2, 3, na], + [na, 4, 5, 5], + [6, na, 6, 7], + [8, 8, 8, 8], + [16, 15, 18, 19], + ] + ) + + X_imputed = np.array( + [ + [1, 0, 0, 1], + [2, 1, 2, 8], + [3, 2, 3, 8], + [4, 4, 5, 5], + [6, 3, 6, 7], + [8, 8, 8, 8], + [16, 15, 18, 19], + ] + ) + + imputer = KNNImputer(missing_values=na) + assert_allclose(imputer.fit_transform(X), X_imputed) + + # Test when there is not enough 
neighbors + X = np.array( + [ + [1, 0, 0, na], + [2, 1, 2, na], + [3, 2, 3, na], + [4, 4, 5, na], + [6, 7, 6, na], + [8, 8, 8, na], + [20, 20, 20, 20], + [22, 22, 22, 22], + ] + ) + + # Not enough neighbors, use column mean from training + X_impute_value = (20 + 22) / 2 + X_imputed = np.array( + [ + [1, 0, 0, X_impute_value], + [2, 1, 2, X_impute_value], + [3, 2, 3, X_impute_value], + [4, 4, 5, X_impute_value], + [6, 7, 6, X_impute_value], + [8, 8, 8, X_impute_value], + [20, 20, 20, 20], + [22, 22, 22, 22], + ] + ) + + imputer = KNNImputer(missing_values=na) + assert_allclose(imputer.fit_transform(X), X_imputed) + + # Test when data in fit() and transform() are different + X = np.array([[0, 0], [na, 2], [4, 3], [5, 6], [7, 7], [9, 8], [11, 16]]) + + X1 = np.array([[1, 0], [3, 2], [4, na]]) + + X_2_1 = (0 + 3 + 6 + 7 + 8) / 5 + X1_imputed = np.array([[1, 0], [3, 2], [4, X_2_1]]) + + imputer = KNNImputer(missing_values=na) + assert_allclose(imputer.fit(X).transform(X1), X1_imputed) + + +@pytest.mark.parametrize("na", [np.nan, -1]) +def test_knn_imputer_one_n_neighbors(na): + X = np.array([[0, 0], [na, 2], [4, 3], [5, na], [7, 7], [na, 8], [14, 13]]) + + X_imputed = np.array([[0, 0], [4, 2], [4, 3], [5, 3], [7, 7], [7, 8], [14, 13]]) + + imputer = KNNImputer(n_neighbors=1, missing_values=na) + + assert_allclose(imputer.fit_transform(X), X_imputed) + + +@pytest.mark.parametrize("na", [np.nan, -1]) +def test_knn_imputer_all_samples_are_neighbors(na): + X = np.array([[0, 0], [na, 2], [4, 3], [5, na], [7, 7], [na, 8], [14, 13]]) + + X_imputed = np.array( + [[0, 0], [6.25, 2], [4, 3], [5, 5.75], [7, 7], [6.25, 8], [14, 13]] + ) + + n_neighbors = X.shape[0] - 1 + imputer = KNNImputer(n_neighbors=n_neighbors, missing_values=na) + + assert_allclose(imputer.fit_transform(X), X_imputed) + + n_neighbors = X.shape[0] + imputer_plus1 = KNNImputer(n_neighbors=n_neighbors, missing_values=na) + assert_allclose(imputer_plus1.fit_transform(X), X_imputed) + + +@pytest.mark.parametrize("na", [np.nan, -1]) +def test_knn_imputer_weight_uniform(na): + X = np.array([[0, 0], [na, 2], [4, 3], [5, 6], [7, 7], [9, 8], [11, 10]]) + + # Test with "uniform" weight (or unweighted) + X_imputed_uniform = np.array( + [[0, 0], [5, 2], [4, 3], [5, 6], [7, 7], [9, 8], [11, 10]] + ) + + imputer = KNNImputer(weights="uniform", missing_values=na) + assert_allclose(imputer.fit_transform(X), X_imputed_uniform) + + # Test with "callable" weight + def no_weight(dist): + return None + + imputer = KNNImputer(weights=no_weight, missing_values=na) + assert_allclose(imputer.fit_transform(X), X_imputed_uniform) + + # Test with "callable" uniform weight + def uniform_weight(dist): + return np.ones_like(dist) + + imputer = KNNImputer(weights=uniform_weight, missing_values=na) + assert_allclose(imputer.fit_transform(X), X_imputed_uniform) + + +@pytest.mark.parametrize("na", [np.nan, -1]) +def test_knn_imputer_weight_distance(na): + X = np.array([[0, 0], [na, 2], [4, 3], [5, 6], [7, 7], [9, 8], [11, 10]]) + + # Test with "distance" weight + nn = KNeighborsRegressor(metric="euclidean", weights="distance") + X_rows_idx = [0, 2, 3, 4, 5, 6] + nn.fit(X[X_rows_idx, 1:], X[X_rows_idx, 0]) + knn_imputed_value = nn.predict(X[1:2, 1:])[0] + + # Manual calculation + X_neighbors_idx = [0, 2, 3, 4, 5] + dist = nan_euclidean_distances(X[1:2, :], X, missing_values=na) + weights = 1 / dist[:, X_neighbors_idx].ravel() + manual_imputed_value = np.average(X[X_neighbors_idx, 0], weights=weights) + + X_imputed_distance1 = np.array( + [[0, 0], [manual_imputed_value, 
2], [4, 3], [5, 6], [7, 7], [9, 8], [11, 10]] + ) + + # NearestNeighbor calculation + X_imputed_distance2 = np.array( + [[0, 0], [knn_imputed_value, 2], [4, 3], [5, 6], [7, 7], [9, 8], [11, 10]] + ) + + imputer = KNNImputer(weights="distance", missing_values=na) + assert_allclose(imputer.fit_transform(X), X_imputed_distance1) + assert_allclose(imputer.fit_transform(X), X_imputed_distance2) + + # Test with weights = "distance" and n_neighbors=2 + X = np.array( + [ + [na, 0, 0], + [2, 1, 2], + [3, 2, 3], + [4, 5, 5], + ] + ) + + # neighbors are rows 1, 2, the nan_euclidean_distances are: + dist_0_1 = np.sqrt((3 / 2) * ((1 - 0) ** 2 + (2 - 0) ** 2)) + dist_0_2 = np.sqrt((3 / 2) * ((2 - 0) ** 2 + (3 - 0) ** 2)) + imputed_value = np.average([2, 3], weights=[1 / dist_0_1, 1 / dist_0_2]) + + X_imputed = np.array( + [ + [imputed_value, 0, 0], + [2, 1, 2], + [3, 2, 3], + [4, 5, 5], + ] + ) + + imputer = KNNImputer(n_neighbors=2, weights="distance", missing_values=na) + assert_allclose(imputer.fit_transform(X), X_imputed) + + # Test with varying missingness patterns + X = np.array( + [ + [1, 0, 0, 1], + [0, na, 1, na], + [1, 1, 1, na], + [0, 1, 0, 0], + [0, 0, 0, 0], + [1, 0, 1, 1], + [10, 10, 10, 10], + ] + ) + + # Get weights of donor neighbors + dist = nan_euclidean_distances(X, missing_values=na) + r1c1_nbor_dists = dist[1, [0, 2, 3, 4, 5]] + r1c3_nbor_dists = dist[1, [0, 3, 4, 5, 6]] + r1c1_nbor_wt = 1 / r1c1_nbor_dists + r1c3_nbor_wt = 1 / r1c3_nbor_dists + + r2c3_nbor_dists = dist[2, [0, 3, 4, 5, 6]] + r2c3_nbor_wt = 1 / r2c3_nbor_dists + + # Collect donor values + col1_donor_values = np.ma.masked_invalid(X[[0, 2, 3, 4, 5], 1]).copy() + col3_donor_values = np.ma.masked_invalid(X[[0, 3, 4, 5, 6], 3]).copy() + + # Final imputed values + r1c1_imp = np.ma.average(col1_donor_values, weights=r1c1_nbor_wt) + r1c3_imp = np.ma.average(col3_donor_values, weights=r1c3_nbor_wt) + r2c3_imp = np.ma.average(col3_donor_values, weights=r2c3_nbor_wt) + + X_imputed = np.array( + [ + [1, 0, 0, 1], + [0, r1c1_imp, 1, r1c3_imp], + [1, 1, 1, r2c3_imp], + [0, 1, 0, 0], + [0, 0, 0, 0], + [1, 0, 1, 1], + [10, 10, 10, 10], + ] + ) + + imputer = KNNImputer(weights="distance", missing_values=na) + assert_allclose(imputer.fit_transform(X), X_imputed) + + X = np.array( + [ + [0, 0, 0, na], + [1, 1, 1, na], + [2, 2, na, 2], + [3, 3, 3, 3], + [4, 4, 4, 4], + [5, 5, 5, 5], + [6, 6, 6, 6], + [na, 7, 7, 7], + ] + ) + + dist = pairwise_distances( + X, metric="nan_euclidean", squared=False, missing_values=na + ) + + # Calculate weights + r0c3_w = 1.0 / dist[0, 2:-1] + r1c3_w = 1.0 / dist[1, 2:-1] + r2c2_w = 1.0 / dist[2, (0, 1, 3, 4, 5)] + r7c0_w = 1.0 / dist[7, 2:7] + + # Calculate weighted averages + r0c3 = np.average(X[2:-1, -1], weights=r0c3_w) + r1c3 = np.average(X[2:-1, -1], weights=r1c3_w) + r2c2 = np.average(X[(0, 1, 3, 4, 5), 2], weights=r2c2_w) + r7c0 = np.average(X[2:7, 0], weights=r7c0_w) + + X_imputed = np.array( + [ + [0, 0, 0, r0c3], + [1, 1, 1, r1c3], + [2, 2, r2c2, 2], + [3, 3, 3, 3], + [4, 4, 4, 4], + [5, 5, 5, 5], + [6, 6, 6, 6], + [r7c0, 7, 7, 7], + ] + ) + + imputer_comp_wt = KNNImputer(missing_values=na, weights="distance") + assert_allclose(imputer_comp_wt.fit_transform(X), X_imputed) + + +def test_knn_imputer_callable_metric(): + # Define callable metric that returns the l1 norm: + def custom_callable(x, y, missing_values=np.nan, squared=False): + x = np.ma.array(x, mask=np.isnan(x)) + y = np.ma.array(y, mask=np.isnan(y)) + dist = np.nansum(np.abs(x - y)) + return dist + + X = np.array([[4, 3, 3, np.nan], 
[6, 9, 6, 9], [4, 8, 6, 9], [np.nan, 9, 11, 10.0]]) + + X_0_3 = (9 + 9) / 2 + X_3_0 = (6 + 4) / 2 + X_imputed = np.array( + [[4, 3, 3, X_0_3], [6, 9, 6, 9], [4, 8, 6, 9], [X_3_0, 9, 11, 10.0]] + ) + + imputer = KNNImputer(n_neighbors=2, metric=custom_callable) + assert_allclose(imputer.fit_transform(X), X_imputed) + + +@pytest.mark.parametrize("working_memory", [None, 0]) +@pytest.mark.parametrize("na", [-1, np.nan]) +# Note that we use working_memory=0 to ensure that chunking is tested, even +# for a small dataset. However, it should raise a UserWarning that we ignore. +@pytest.mark.filterwarnings("ignore:adhere to working_memory") +def test_knn_imputer_with_simple_example(na, working_memory): + X = np.array( + [ + [0, na, 0, na], + [1, 1, 1, na], + [2, 2, na, 2], + [3, 3, 3, 3], + [4, 4, 4, 4], + [5, 5, 5, 5], + [6, 6, 6, 6], + [na, 7, 7, 7], + ] + ) + + r0c1 = np.mean(X[1:6, 1]) + r0c3 = np.mean(X[2:-1, -1]) + r1c3 = np.mean(X[2:-1, -1]) + r2c2 = np.mean(X[[0, 1, 3, 4, 5], 2]) + r7c0 = np.mean(X[2:-1, 0]) + + X_imputed = np.array( + [ + [0, r0c1, 0, r0c3], + [1, 1, 1, r1c3], + [2, 2, r2c2, 2], + [3, 3, 3, 3], + [4, 4, 4, 4], + [5, 5, 5, 5], + [6, 6, 6, 6], + [r7c0, 7, 7, 7], + ] + ) + + with config_context(working_memory=working_memory): + imputer_comp = KNNImputer(missing_values=na) + assert_allclose(imputer_comp.fit_transform(X), X_imputed) + + +@pytest.mark.parametrize("na", [-1, np.nan]) +@pytest.mark.parametrize("weights", ["uniform", "distance"]) +def test_knn_imputer_not_enough_valid_distances(na, weights): + # Samples with needed feature has nan distance + X1 = np.array([[na, 11], [na, 1], [3, na]]) + X1_imputed = np.array([[3, 11], [3, 1], [3, 6]]) + + knn = KNNImputer(missing_values=na, n_neighbors=1, weights=weights) + assert_allclose(knn.fit_transform(X1), X1_imputed) + + X2 = np.array([[4, na]]) + X2_imputed = np.array([[4, 6]]) + assert_allclose(knn.transform(X2), X2_imputed) + + +@pytest.mark.parametrize("na", [-1, np.nan]) +@pytest.mark.parametrize("weights", ["uniform", "distance"]) +def test_knn_imputer_nan_distance(na, weights): + # Samples with nan distance should be excluded from the mean computation + X1_train = np.array([[1, 1], [na, 2]]) + X1_test = np.array([[0, na]]) + X1_test_expected = np.array([[0, 1]]) + + knn1 = KNNImputer(n_neighbors=2, missing_values=na, weights=weights) + knn1.fit(X1_train) + assert_allclose(knn1.transform(X1_test), X1_test_expected) + + X2_train = np.array([[na, 1, 1], [2, na, 2], [3, 3, na]]) + X2_test = np.array([[na, 0, na], [0, na, na], [na, na, 0]]) + X2_test_expected = np.array([[3, 0, 1], [0, 3, 2], [2, 1, 0]]) + + knn2 = KNNImputer(n_neighbors=2, missing_values=na, weights=weights) + knn2.fit(X2_train) + assert_allclose(knn2.transform(X2_test), X2_test_expected) + + +@pytest.mark.parametrize("na", [-1, np.nan]) +def test_knn_imputer_drops_all_nan_features(na): + X1 = np.array([[na, 1], [na, 2]]) + knn = KNNImputer(missing_values=na, n_neighbors=1) + X1_expected = np.array([[1], [2]]) + assert_allclose(knn.fit_transform(X1), X1_expected) + + X2 = np.array([[1, 2], [3, na]]) + X2_expected = np.array([[2], [1.5]]) + assert_allclose(knn.transform(X2), X2_expected) + + +@pytest.mark.parametrize("working_memory", [None, 0]) +@pytest.mark.parametrize("na", [-1, np.nan]) +def test_knn_imputer_distance_weighted_not_enough_neighbors(na, working_memory): + X = np.array([[3, na], [2, na], [na, 4], [5, 6], [6, 8], [na, 5]]) + + dist = pairwise_distances( + X, metric="nan_euclidean", squared=False, missing_values=na + ) + + X_01 = 
np.average(X[3:5, 1], weights=1 / dist[0, 3:5]) + X_11 = np.average(X[3:5, 1], weights=1 / dist[1, 3:5]) + X_20 = np.average(X[3:5, 0], weights=1 / dist[2, 3:5]) + X_50 = np.average(X[3:5, 0], weights=1 / dist[5, 3:5]) + + X_expected = np.array([[3, X_01], [2, X_11], [X_20, 4], [5, 6], [6, 8], [X_50, 5]]) + + with config_context(working_memory=working_memory): + knn_3 = KNNImputer(missing_values=na, n_neighbors=3, weights="distance") + assert_allclose(knn_3.fit_transform(X), X_expected) + + knn_4 = KNNImputer(missing_values=na, n_neighbors=4, weights="distance") + assert_allclose(knn_4.fit_transform(X), X_expected) + + +@pytest.mark.parametrize("na, allow_nan", [(-1, False), (np.nan, True)]) +def test_knn_tags(na, allow_nan): + knn = KNNImputer(missing_values=na) + assert knn.__sklearn_tags__().input_tags.allow_nan == allow_nan diff --git a/.venv/lib/python3.12/site-packages/sklearn/inspection/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/inspection/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8e0a1125ef04198083da041736a7ebc2ffeafe6a --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/inspection/__init__.py @@ -0,0 +1,16 @@ +"""Tools for model inspection.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ._partial_dependence import partial_dependence +from ._permutation_importance import permutation_importance +from ._plot.decision_boundary import DecisionBoundaryDisplay +from ._plot.partial_dependence import PartialDependenceDisplay + +__all__ = [ + "DecisionBoundaryDisplay", + "PartialDependenceDisplay", + "partial_dependence", + "permutation_importance", +] diff --git a/.venv/lib/python3.12/site-packages/sklearn/inspection/_partial_dependence.py b/.venv/lib/python3.12/site-packages/sklearn/inspection/_partial_dependence.py new file mode 100644 index 0000000000000000000000000000000000000000..ad352c45cc03bd6018617c4ccaa6247fd68718b5 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/inspection/_partial_dependence.py @@ -0,0 +1,775 @@ +"""Partial dependence plots for regression and classification models.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from collections.abc import Iterable + +import numpy as np +from scipy import sparse +from scipy.stats.mstats import mquantiles + +from ..base import is_classifier, is_regressor +from ..ensemble import RandomForestRegressor +from ..ensemble._gb import BaseGradientBoosting +from ..ensemble._hist_gradient_boosting.gradient_boosting import ( + BaseHistGradientBoosting, +) +from ..tree import DecisionTreeRegressor +from ..utils import Bunch, _safe_indexing, check_array +from ..utils._indexing import _determine_key_type, _get_column_indices, _safe_assign +from ..utils._optional_dependencies import check_matplotlib_support # noqa: F401 +from ..utils._param_validation import ( + HasMethods, + Integral, + Interval, + StrOptions, + validate_params, +) +from ..utils._response import _get_response_values +from ..utils.extmath import cartesian +from ..utils.validation import _check_sample_weight, check_is_fitted +from ._pd_utils import _check_feature_names, _get_feature_index + +__all__ = [ + "partial_dependence", +] + + +def _grid_from_X(X, percentiles, is_categorical, grid_resolution, custom_values): + """Generate a grid of points based on the percentiles of X. + + The grid is a cartesian product between the columns of ``values``. 
The + ith column of ``values`` consists in ``grid_resolution`` equally-spaced + points between the percentiles of the jth column of X. + + If ``grid_resolution`` is bigger than the number of unique values in the + j-th column of X or if the feature is a categorical feature (by inspecting + `is_categorical`) , then those unique values will be used instead. + + Parameters + ---------- + X : array-like of shape (n_samples, n_target_features) + The data. + + percentiles : tuple of float + The percentiles which are used to construct the extreme values of + the grid. Must be in [0, 1]. + + is_categorical : list of bool + For each feature, tells whether it is categorical or not. If a feature + is categorical, then the values used will be the unique ones + (i.e. categories) instead of the percentiles. + + grid_resolution : int + The number of equally spaced points to be placed on the grid for each + feature. + + custom_values: dict + Mapping from column index of X to an array-like of values where + the partial dependence should be calculated for that feature + + Returns + ------- + grid : ndarray of shape (n_points, n_target_features) + A value for each feature at each point in the grid. ``n_points`` is + always ``<= grid_resolution ** X.shape[1]``. + + values : list of 1d ndarrays + The values with which the grid has been created. The size of each + array ``values[j]`` is either ``grid_resolution``, the number of + unique values in ``X[:, j]``, if j is not in ``custom_range``. + If j is in ``custom_range``, then it is the length of ``custom_range[j]``. + """ + if not isinstance(percentiles, Iterable) or len(percentiles) != 2: + raise ValueError("'percentiles' must be a sequence of 2 elements.") + if not all(0 <= x <= 1 for x in percentiles): + raise ValueError("'percentiles' values must be in [0, 1].") + if percentiles[0] >= percentiles[1]: + raise ValueError("percentiles[0] must be strictly less than percentiles[1].") + + if grid_resolution <= 1: + raise ValueError("'grid_resolution' must be strictly greater than 1.") + + def _convert_custom_values(values): + # Convert custom types such that object types are always used for string arrays + dtype = object if any(isinstance(v, str) for v in values) else None + return np.asarray(values, dtype=dtype) + + custom_values = {k: _convert_custom_values(v) for k, v in custom_values.items()} + if any(v.ndim != 1 for v in custom_values.values()): + error_string = ", ".join( + f"Feature {k}: {v.ndim} dimensions" + for k, v in custom_values.items() + if v.ndim != 1 + ) + + raise ValueError( + "The custom grid for some features is not a one-dimensional array. " + f"{error_string}" + ) + + values = [] + # TODO: we should handle missing values (i.e. `np.nan`) specifically and store them + # in a different Bunch attribute. + for feature, is_cat in enumerate(is_categorical): + if feature in custom_values: + # Use values in the custom range + axis = custom_values[feature] + else: + try: + uniques = np.unique(_safe_indexing(X, feature, axis=1)) + except TypeError as exc: + # `np.unique` will fail in the presence of `np.nan` and `str` categories + # due to sorting. Temporary, we reraise an error explaining the problem. + raise ValueError( + f"The column #{feature} contains mixed data types. Finding unique " + "categories fail due to sorting. It usually means that the column " + "contains `np.nan` values together with `str` categories. Such use " + "case is not yet supported in scikit-learn." 
+ ) from exc + + if is_cat or uniques.shape[0] < grid_resolution: + # Use the unique values either because: + # - feature has low resolution use unique values + # - feature is categorical + axis = uniques + else: + # create axis based on percentiles and grid resolution + emp_percentiles = mquantiles( + _safe_indexing(X, feature, axis=1), prob=percentiles, axis=0 + ) + if np.allclose(emp_percentiles[0], emp_percentiles[1]): + raise ValueError( + "percentiles are too close to each other, " + "unable to build the grid. Please choose percentiles " + "that are further apart." + ) + axis = np.linspace( + emp_percentiles[0], + emp_percentiles[1], + num=grid_resolution, + endpoint=True, + ) + values.append(axis) + + return cartesian(values), values + + +def _partial_dependence_recursion(est, grid, features): + """Calculate partial dependence via the recursion method. + + The recursion method is in particular enabled for tree-based estimators. + + For each `grid` value, a weighted tree traversal is performed: if a split node + involves an input feature of interest, the corresponding left or right branch + is followed; otherwise both branches are followed, each branch being weighted + by the fraction of training samples that entered that branch. Finally, the + partial dependence is given by a weighted average of all the visited leaves + values. + + This method is more efficient in terms of speed than the `'brute'` method + (:func:`~sklearn.inspection._partial_dependence._partial_dependence_brute`). + However, here, the partial dependence computation is done explicitly with the + `X` used during training of `est`. + + Parameters + ---------- + est : BaseEstimator + A fitted estimator object implementing :term:`predict` or + :term:`decision_function`. Multioutput-multiclass classifiers are not + supported. Note that `'recursion'` is only supported for some tree-based + estimators (namely + :class:`~sklearn.ensemble.GradientBoostingClassifier`, + :class:`~sklearn.ensemble.GradientBoostingRegressor`, + :class:`~sklearn.ensemble.HistGradientBoostingClassifier`, + :class:`~sklearn.ensemble.HistGradientBoostingRegressor`, + :class:`~sklearn.tree.DecisionTreeRegressor`, + :class:`~sklearn.ensemble.RandomForestRegressor`, + ). + + grid : array-like of shape (n_points, n_target_features) + The grid of feature values for which the partial dependence is calculated. + Note that `n_points` is the number of points in the grid and `n_target_features` + is the number of features you are doing partial dependence at. + + features : array-like of {int, str} + The feature (e.g. `[0]`) or pair of interacting features + (e.g. `[(0, 1)]`) for which the partial dependency should be computed. + + Returns + ------- + averaged_predictions : array-like of shape (n_targets, n_points) + The averaged predictions for the given `grid` of features values. + Note that `n_targets` is the number of targets (e.g. 1 for binary + classification, `n_tasks` for multi-output regression, and `n_classes` for + multiclass classification) and `n_points` is the number of points in the `grid`. + """ + averaged_predictions = est._compute_partial_dependence_recursion(grid, features) + if averaged_predictions.ndim == 1: + # reshape to (1, n_points) for consistency with + # _partial_dependence_brute + averaged_predictions = averaged_predictions.reshape(1, -1) + + return averaged_predictions + + +def _partial_dependence_brute( + est, grid, features, X, response_method, sample_weight=None +): + """Calculate partial dependence via the brute force method. 
+ + The brute method explicitly averages the predictions of an estimator over a + grid of feature values. + + For each `grid` value, all the samples from `X` have their variables of + interest replaced by that specific `grid` value. The predictions are then made + and averaged across the samples. + + This method is slower than the `'recursion'` + (:func:`~sklearn.inspection._partial_dependence._partial_dependence_recursion`) + version for estimators with this second option. However, with the `'brute'` + force method, the average will be done with the given `X` and not the `X` + used during training, as it is done in the `'recursion'` version. Therefore + the average can always accept `sample_weight` (even when the estimator was + fitted without). + + Parameters + ---------- + est : BaseEstimator + A fitted estimator object implementing :term:`predict`, + :term:`predict_proba`, or :term:`decision_function`. + Multioutput-multiclass classifiers are not supported. + + grid : array-like of shape (n_points, n_target_features) + The grid of feature values for which the partial dependence is calculated. + Note that `n_points` is the number of points in the grid and `n_target_features` + is the number of features you are doing partial dependence at. + + features : array-like of {int, str} + The feature (e.g. `[0]`) or pair of interacting features + (e.g. `[(0, 1)]`) for which the partial dependency should be computed. + + X : array-like of shape (n_samples, n_features) + `X` is used to generate values for the complement features. That is, for + each value in `grid`, the method will average the prediction of each + sample from `X` having that grid value for `features`. + + response_method : {'auto', 'predict_proba', 'decision_function'}, \ + default='auto' + Specifies whether to use :term:`predict_proba` or + :term:`decision_function` as the target response. For regressors + this parameter is ignored and the response is always the output of + :term:`predict`. By default, :term:`predict_proba` is tried first + and we revert to :term:`decision_function` if it doesn't exist. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights are used to calculate weighted means when averaging the + model output. If `None`, then samples are equally weighted. Note that + `sample_weight` does not change the individual predictions. + + Returns + ------- + averaged_predictions : array-like of shape (n_targets, n_points) + The averaged predictions for the given `grid` of features values. + Note that `n_targets` is the number of targets (e.g. 1 for binary + classification, `n_tasks` for multi-output regression, and `n_classes` for + multiclass classification) and `n_points` is the number of points in the `grid`. + + predictions : array-like + The predictions for the given `grid` of features values over the samples + from `X`. For non-multioutput regression and binary classification the + shape is `(n_instances, n_points)` and for multi-output regression and + multiclass classification the shape is `(n_targets, n_instances, n_points)`, + where `n_targets` is the number of targets (`n_tasks` for multi-output + regression, and `n_classes` for multiclass classification), `n_instances` + is the number of instances in `X`, and `n_points` is the number of points + in the `grid`. 
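+
+    A simplified, self-contained sketch of the brute-force idea for a single
+    feature of a regressor (no sample weights; names are illustrative)::
+
+        preds = []
+        for value in grid[:, 0]:
+            X_eval = X.copy()
+            X_eval[:, feature_idx] = value            # overwrite the target feature
+            preds.append(est.predict(X_eval).mean())  # average over the samples
+        # ``preds`` is then the partial dependence curve for that feature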
+ """ + predictions = [] + averaged_predictions = [] + + if response_method == "auto": + response_method = ( + "predict" if is_regressor(est) else ["predict_proba", "decision_function"] + ) + + X_eval = X.copy() + for new_values in grid: + for i, variable in enumerate(features): + _safe_assign(X_eval, new_values[i], column_indexer=variable) + + # Note: predictions is of shape + # (n_points,) for non-multioutput regressors + # (n_points, n_tasks) for multioutput regressors + # (n_points, 1) for the regressors in cross_decomposition (I think) + # (n_points, 1) for binary classification (positive class already selected) + # (n_points, n_classes) for multiclass classification + pred, _ = _get_response_values(est, X_eval, response_method=response_method) + + predictions.append(pred) + # average over samples + averaged_predictions.append(np.average(pred, axis=0, weights=sample_weight)) + + n_samples = X.shape[0] + + # reshape to (n_targets, n_instances, n_points) where n_targets is: + # - 1 for non-multioutput regression and binary classification (shape is + # already correct in those cases) + # - n_tasks for multi-output regression + # - n_classes for multiclass classification. + predictions = np.array(predictions).T + if is_regressor(est) and predictions.ndim == 2: + # non-multioutput regression, shape is (n_instances, n_points,) + predictions = predictions.reshape(n_samples, -1) + elif is_classifier(est) and predictions.shape[0] == 2: + # Binary classification, shape is (2, n_instances, n_points). + # we output the effect of **positive** class + predictions = predictions[1] + predictions = predictions.reshape(n_samples, -1) + + # reshape averaged_predictions to (n_targets, n_points) where n_targets is: + # - 1 for non-multioutput regression and binary classification (shape is + # already correct in those cases) + # - n_tasks for multi-output regression + # - n_classes for multiclass classification. + averaged_predictions = np.array(averaged_predictions).T + if averaged_predictions.ndim == 1: + # reshape to (1, n_points) for consistency with + # _partial_dependence_recursion + averaged_predictions = averaged_predictions.reshape(1, -1) + + return averaged_predictions, predictions + + +@validate_params( + { + "estimator": [ + HasMethods(["fit", "predict"]), + HasMethods(["fit", "predict_proba"]), + HasMethods(["fit", "decision_function"]), + ], + "X": ["array-like", "sparse matrix"], + "features": ["array-like", Integral, str], + "sample_weight": ["array-like", None], + "categorical_features": ["array-like", None], + "feature_names": ["array-like", None], + "response_method": [StrOptions({"auto", "predict_proba", "decision_function"})], + "percentiles": [tuple], + "grid_resolution": [Interval(Integral, 1, None, closed="left")], + "method": [StrOptions({"auto", "recursion", "brute"})], + "kind": [StrOptions({"average", "individual", "both"})], + "custom_values": [dict, None], + }, + prefer_skip_nested_validation=True, +) +def partial_dependence( + estimator, + X, + features, + *, + sample_weight=None, + categorical_features=None, + feature_names=None, + response_method="auto", + percentiles=(0.05, 0.95), + grid_resolution=100, + custom_values=None, + method="auto", + kind="average", +): + """Partial dependence of ``features``. + + Partial dependence of a feature (or a set of features) corresponds to + the average response of an estimator for each possible value of the + feature. + + Read more in + :ref:`sphx_glr_auto_examples_inspection_plot_partial_dependence.py` + and the :ref:`User Guide `. 
+ + .. warning:: + + For :class:`~sklearn.ensemble.GradientBoostingClassifier` and + :class:`~sklearn.ensemble.GradientBoostingRegressor`, the + `'recursion'` method (used by default) will not account for the `init` + predictor of the boosting process. In practice, this will produce + the same values as `'brute'` up to a constant offset in the target + response, provided that `init` is a constant estimator (which is the + default). However, if `init` is not a constant estimator, the + partial dependence values are incorrect for `'recursion'` because the + offset will be sample-dependent. It is preferable to use the `'brute'` + method. Note that this only applies to + :class:`~sklearn.ensemble.GradientBoostingClassifier` and + :class:`~sklearn.ensemble.GradientBoostingRegressor`, not to + :class:`~sklearn.ensemble.HistGradientBoostingClassifier` and + :class:`~sklearn.ensemble.HistGradientBoostingRegressor`. + + Parameters + ---------- + estimator : BaseEstimator + A fitted estimator object implementing :term:`predict`, + :term:`predict_proba`, or :term:`decision_function`. + Multioutput-multiclass classifiers are not supported. + + X : {array-like, sparse matrix or dataframe} of shape (n_samples, n_features) + ``X`` is used to generate a grid of values for the target + ``features`` (where the partial dependence will be evaluated), and + also to generate values for the complement features when the + `method` is 'brute'. + + features : array-like of {int, str, bool} or int or str + The feature (e.g. `[0]`) or pair of interacting features + (e.g. `[(0, 1)]`) for which the partial dependency should be computed. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights are used to calculate weighted means when averaging the + model output. If `None`, then samples are equally weighted. If + `sample_weight` is not `None`, then `method` will be set to `'brute'`. + Note that `sample_weight` is ignored for `kind='individual'`. + + .. versionadded:: 1.3 + + categorical_features : array-like of shape (n_features,) or shape \ + (n_categorical_features,), dtype={bool, int, str}, default=None + Indicates the categorical features. + + - `None`: no feature will be considered categorical; + - boolean array-like: boolean mask of shape `(n_features,)` + indicating which features are categorical. Thus, this array has + the same shape has `X.shape[1]`; + - integer or string array-like: integer indices or strings + indicating categorical features. + + .. versionadded:: 1.2 + + feature_names : array-like of shape (n_features,), dtype=str, default=None + Name of each feature; `feature_names[i]` holds the name of the feature + with index `i`. + By default, the name of the feature corresponds to their numerical + index for NumPy array and their column name for pandas dataframe. + + .. versionadded:: 1.2 + + response_method : {'auto', 'predict_proba', 'decision_function'}, \ + default='auto' + Specifies whether to use :term:`predict_proba` or + :term:`decision_function` as the target response. For regressors + this parameter is ignored and the response is always the output of + :term:`predict`. By default, :term:`predict_proba` is tried first + and we revert to :term:`decision_function` if it doesn't exist. If + ``method`` is 'recursion', the response is always the output of + :term:`decision_function`. + + percentiles : tuple of float, default=(0.05, 0.95) + The lower and upper percentile used to create the extreme values + for the grid. Must be in [0, 1]. 
+ This parameter is overridden by `custom_values` if that parameter is set. + + grid_resolution : int, default=100 + The number of equally spaced points on the grid, for each target + feature. + This parameter is overridden by `custom_values` if that parameter is set. + + custom_values : dict + A dictionary mapping the index of an element of `features` to an array + of values where the partial dependence should be calculated + for that feature. Setting a range of values for a feature overrides + `grid_resolution` and `percentiles`. + + See :ref:`how to use partial_dependence + ` for an example of how this parameter can + be used. + + .. versionadded:: 1.7 + + method : {'auto', 'recursion', 'brute'}, default='auto' + The method used to calculate the averaged predictions: + + - `'recursion'` is only supported for some tree-based estimators + (namely + :class:`~sklearn.ensemble.GradientBoostingClassifier`, + :class:`~sklearn.ensemble.GradientBoostingRegressor`, + :class:`~sklearn.ensemble.HistGradientBoostingClassifier`, + :class:`~sklearn.ensemble.HistGradientBoostingRegressor`, + :class:`~sklearn.tree.DecisionTreeRegressor`, + :class:`~sklearn.ensemble.RandomForestRegressor`, + ) when `kind='average'`. + This is more efficient in terms of speed. + With this method, the target response of a + classifier is always the decision function, not the predicted + probabilities. Since the `'recursion'` method implicitly computes + the average of the Individual Conditional Expectation (ICE) by + design, it is not compatible with ICE and thus `kind` must be + `'average'`. + + - `'brute'` is supported for any estimator, but is more + computationally intensive. + + - `'auto'`: the `'recursion'` is used for estimators that support it, + and `'brute'` is used otherwise. If `sample_weight` is not `None`, + then `'brute'` is used regardless of the estimator. + + Please see :ref:`this note ` for + differences between the `'brute'` and `'recursion'` method. + + kind : {'average', 'individual', 'both'}, default='average' + Whether to return the partial dependence averaged across all the + samples in the dataset or one value per sample or both. + See Returns below. + + Note that the fast `method='recursion'` option is only available for + `kind='average'` and `sample_weights=None`. Computing individual + dependencies and doing weighted averages requires using the slower + `method='brute'`. + + .. versionadded:: 0.24 + + Returns + ------- + predictions : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + individual : ndarray of shape (n_outputs, n_instances, \ + len(values[0]), len(values[1]), ...) + The predictions for all the points in the grid for all + samples in X. This is also known as Individual + Conditional Expectation (ICE). + Only available when `kind='individual'` or `kind='both'`. + + average : ndarray of shape (n_outputs, len(values[0]), \ + len(values[1]), ...) + The predictions for all the points in the grid, averaged + over all samples in X (or over the training data if + `method` is 'recursion'). + Only available when `kind='average'` or `kind='both'`. + + grid_values : seq of 1d ndarrays + The values with which the grid has been created. The generated + grid is a cartesian product of the arrays in `grid_values` where + `len(grid_values) == len(features)`. The size of each array + `grid_values[j]` is either `grid_resolution`, or the number of + unique values in `X[:, j]`, whichever is smaller. + + .. 
versionadded:: 1.3 + + `n_outputs` corresponds to the number of classes in a multi-class + setting, or to the number of tasks for multi-output regression. + For classical regression and binary classification `n_outputs==1`. + `n_values_feature_j` corresponds to the size `grid_values[j]`. + + See Also + -------- + PartialDependenceDisplay.from_estimator : Plot Partial Dependence. + PartialDependenceDisplay : Partial Dependence visualization. + + Examples + -------- + >>> X = [[0, 0, 2], [1, 0, 0]] + >>> y = [0, 1] + >>> from sklearn.ensemble import GradientBoostingClassifier + >>> gb = GradientBoostingClassifier(random_state=0).fit(X, y) + >>> partial_dependence(gb, features=[0], X=X, percentiles=(0, 1), + ... grid_resolution=2) # doctest: +SKIP + (array([[-4.52, 4.52]]), [array([ 0., 1.])]) + """ + check_is_fitted(estimator) + + if not (is_classifier(estimator) or is_regressor(estimator)): + raise ValueError("'estimator' must be a fitted regressor or classifier.") + + if is_classifier(estimator) and isinstance(estimator.classes_[0], np.ndarray): + raise ValueError("Multiclass-multioutput estimators are not supported") + + # Use check_array only on lists and other non-array-likes / sparse. Do not + # convert DataFrame into a NumPy array. + if not (hasattr(X, "__array__") or sparse.issparse(X)): + X = check_array(X, ensure_all_finite="allow-nan", dtype=object) + + if is_regressor(estimator) and response_method != "auto": + raise ValueError( + "The response_method parameter is ignored for regressors and " + "must be 'auto'." + ) + + if kind != "average": + if method == "recursion": + raise ValueError( + "The 'recursion' method only applies when 'kind' is set to 'average'" + ) + method = "brute" + + if method == "recursion" and sample_weight is not None: + raise ValueError( + "The 'recursion' method can only be applied when sample_weight is None." + ) + + if method == "auto": + if sample_weight is not None: + method = "brute" + elif isinstance(estimator, BaseGradientBoosting) and estimator.init is None: + method = "recursion" + elif isinstance( + estimator, + (BaseHistGradientBoosting, DecisionTreeRegressor, RandomForestRegressor), + ): + method = "recursion" + else: + method = "brute" + + if method == "recursion": + if not isinstance( + estimator, + ( + BaseGradientBoosting, + BaseHistGradientBoosting, + DecisionTreeRegressor, + RandomForestRegressor, + ), + ): + supported_classes_recursion = ( + "GradientBoostingClassifier", + "GradientBoostingRegressor", + "HistGradientBoostingClassifier", + "HistGradientBoostingRegressor", + "HistGradientBoostingRegressor", + "DecisionTreeRegressor", + "RandomForestRegressor", + ) + raise ValueError( + "Only the following estimators support the 'recursion' " + "method: {}. Try using method='brute'.".format( + ", ".join(supported_classes_recursion) + ) + ) + if response_method == "auto": + response_method = "decision_function" + + if response_method != "decision_function": + raise ValueError( + "With the 'recursion' method, the response_method must be " + "'decision_function'. Got {}.".format(response_method) + ) + + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X) + + if _determine_key_type(features, accept_slice=False) == "int": + # _get_column_indices() supports negative indexing. Here, we limit + # the indexing to be positive. 
The upper bound will be checked + # by _get_column_indices() + if np.any(np.less(features, 0)): + raise ValueError("all features must be in [0, {}]".format(X.shape[1] - 1)) + + features_indices = np.asarray( + _get_column_indices(X, features), dtype=np.intp, order="C" + ).ravel() + + feature_names = _check_feature_names(X, feature_names) + + n_features = X.shape[1] + if categorical_features is None: + is_categorical = [False] * len(features_indices) + else: + categorical_features = np.asarray(categorical_features) + if categorical_features.size == 0: + raise ValueError( + "Passing an empty list (`[]`) to `categorical_features` is not " + "supported. Use `None` instead to indicate that there are no " + "categorical features." + ) + if categorical_features.dtype.kind == "b": + # categorical features provided as a list of boolean + if categorical_features.size != n_features: + raise ValueError( + "When `categorical_features` is a boolean array-like, " + "the array should be of shape (n_features,). Got " + f"{categorical_features.size} elements while `X` contains " + f"{n_features} features." + ) + is_categorical = [categorical_features[idx] for idx in features_indices] + elif categorical_features.dtype.kind in ("i", "O", "U"): + # categorical features provided as a list of indices or feature names + categorical_features_idx = [ + _get_feature_index(cat, feature_names=feature_names) + for cat in categorical_features + ] + is_categorical = [ + idx in categorical_features_idx for idx in features_indices + ] + else: + raise ValueError( + "Expected `categorical_features` to be an array-like of boolean," + f" integer, or string. Got {categorical_features.dtype} instead." + ) + + custom_values = custom_values or {} + if isinstance(features, (str, int)): + features = [features] + + for feature_idx, feature, is_cat in zip(features_indices, features, is_categorical): + if is_cat: + continue + + if _safe_indexing(X, feature_idx, axis=1).dtype.kind in "iu": + # TODO(1.9): raise a ValueError instead. + warnings.warn( + f"The column {feature!r} contains integer data. Partial " + "dependence plots are not supported for integer data: this " + "can lead to implicit rounding with NumPy arrays or even errors " + "with newer pandas versions. Please convert numerical features" + "to floating point dtypes ahead of time to avoid problems. " + "This will raise ValueError in scikit-learn 1.9.", + FutureWarning, + ) + # Do not warn again for other features to avoid spamming the caller. + break + + X_subset = _safe_indexing(X, features_indices, axis=1) + + custom_values_for_X_subset = { + index: custom_values.get(feature) + for index, feature in enumerate(features) + if feature in custom_values + } + + grid, values = _grid_from_X( + X_subset, + percentiles, + is_categorical, + grid_resolution, + custom_values_for_X_subset, + ) + + if method == "brute": + averaged_predictions, predictions = _partial_dependence_brute( + estimator, grid, features_indices, X, response_method, sample_weight + ) + + # reshape predictions to + # (n_outputs, n_instances, n_values_feature_0, n_values_feature_1, ...) + predictions = predictions.reshape( + -1, X.shape[0], *[val.shape[0] for val in values] + ) + else: + averaged_predictions = _partial_dependence_recursion( + estimator, grid, features_indices + ) + + # reshape averaged_predictions to + # (n_outputs, n_values_feature_0, n_values_feature_1, ...) 
+ averaged_predictions = averaged_predictions.reshape( + -1, *[val.shape[0] for val in values] + ) + pdp_results = Bunch(grid_values=values) + + if kind == "average": + pdp_results["average"] = averaged_predictions + elif kind == "individual": + pdp_results["individual"] = predictions + else: # kind='both' + pdp_results["average"] = averaged_predictions + pdp_results["individual"] = predictions + + return pdp_results diff --git a/.venv/lib/python3.12/site-packages/sklearn/inspection/_pd_utils.py b/.venv/lib/python3.12/site-packages/sklearn/inspection/_pd_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..a48ba4d9a4490df59b8503f0b8768c7a986537a9 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/inspection/_pd_utils.py @@ -0,0 +1,68 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + + +def _check_feature_names(X, feature_names=None): + """Check feature names. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Input data. + + feature_names : None or array-like of shape (n_names,), dtype=str + Feature names to check or `None`. + + Returns + ------- + feature_names : list of str + Feature names validated. If `feature_names` is `None`, then a list of + feature names is provided, i.e. the column names of a pandas dataframe + or a generic list of feature names (e.g. `["x0", "x1", ...]`) for a + NumPy array. + """ + if feature_names is None: + if hasattr(X, "columns") and hasattr(X.columns, "tolist"): + # get the column names for a pandas dataframe + feature_names = X.columns.tolist() + else: + # define a list of numbered indices for a numpy array + feature_names = [f"x{i}" for i in range(X.shape[1])] + elif hasattr(feature_names, "tolist"): + # convert numpy array or pandas index to a list + feature_names = feature_names.tolist() + if len(set(feature_names)) != len(feature_names): + raise ValueError("feature_names should not contain duplicates.") + + return feature_names + + +def _get_feature_index(fx, feature_names=None): + """Get feature index. + + Parameters + ---------- + fx : int or str + Feature index or name. + + feature_names : list of str, default=None + All feature names from which to search the indices. + + Returns + ------- + idx : int + Feature index. + """ + if isinstance(fx, str): + if feature_names is None: + raise ValueError( + f"Cannot plot partial dependence for feature {fx!r} since " + "the list of feature names was not provided, neither as " + "column names of a pandas data-frame nor via the feature_names " + "parameter." 
+ ) + try: + return feature_names.index(fx) + except ValueError as e: + raise ValueError(f"Feature {fx!r} not in feature_names") from e + return fx diff --git a/.venv/lib/python3.12/site-packages/sklearn/inspection/_permutation_importance.py b/.venv/lib/python3.12/site-packages/sklearn/inspection/_permutation_importance.py new file mode 100644 index 0000000000000000000000000000000000000000..451062fbe272e066350b8b5307d23f9180ed6760 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/inspection/_permutation_importance.py @@ -0,0 +1,313 @@ +"""Permutation importance for estimators.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numbers + +import numpy as np + +from ..ensemble._bagging import _generate_indices +from ..metrics import check_scoring, get_scorer_names +from ..model_selection._validation import _aggregate_score_dicts +from ..utils import Bunch, _safe_indexing, check_array, check_random_state +from ..utils._param_validation import ( + HasMethods, + Integral, + Interval, + RealNotInt, + StrOptions, + validate_params, +) +from ..utils.parallel import Parallel, delayed + + +def _weights_scorer(scorer, estimator, X, y, sample_weight): + if sample_weight is not None: + return scorer(estimator, X, y, sample_weight=sample_weight) + return scorer(estimator, X, y) + + +def _calculate_permutation_scores( + estimator, + X, + y, + sample_weight, + col_idx, + random_state, + n_repeats, + scorer, + max_samples, +): + """Calculate score when `col_idx` is permuted.""" + random_state = check_random_state(random_state) + + # Work on a copy of X to ensure thread-safety in case of threading based + # parallelism. Furthermore, making a copy is also useful when the joblib + # backend is 'loky' (default) or the old 'multiprocessing': in those cases, + # if X is large it will be automatically be backed by a readonly memory map + # (memmap). X.copy() on the other hand is always guaranteed to return a + # writable data-structure whose columns can be shuffled inplace. + if max_samples < X.shape[0]: + row_indices = _generate_indices( + random_state=random_state, + bootstrap=False, + n_population=X.shape[0], + n_samples=max_samples, + ) + X_permuted = _safe_indexing(X, row_indices, axis=0) + y = _safe_indexing(y, row_indices, axis=0) + if sample_weight is not None: + sample_weight = _safe_indexing(sample_weight, row_indices, axis=0) + else: + X_permuted = X.copy() + + scores = [] + shuffling_idx = np.arange(X_permuted.shape[0]) + for _ in range(n_repeats): + random_state.shuffle(shuffling_idx) + if hasattr(X_permuted, "iloc"): + col = X_permuted.iloc[shuffling_idx, col_idx] + col.index = X_permuted.index + X_permuted[X_permuted.columns[col_idx]] = col + else: + X_permuted[:, col_idx] = X_permuted[shuffling_idx, col_idx] + scores.append(_weights_scorer(scorer, estimator, X_permuted, y, sample_weight)) + + if isinstance(scores[0], dict): + scores = _aggregate_score_dicts(scores) + else: + scores = np.array(scores) + + return scores + + +def _create_importances_bunch(baseline_score, permuted_score): + """Compute the importances as the decrease in score. + + Parameters + ---------- + baseline_score : ndarray of shape (n_features,) + The baseline score without permutation. + permuted_score : ndarray of shape (n_features, n_repeats) + The permuted scores for the `n` repetitions. + + Returns + ------- + importances : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. 
+ importances_mean : ndarray, shape (n_features, ) + Mean of feature importance over `n_repeats`. + importances_std : ndarray, shape (n_features, ) + Standard deviation over `n_repeats`. + importances : ndarray, shape (n_features, n_repeats) + Raw permutation importance scores. + """ + importances = baseline_score - permuted_score + return Bunch( + importances_mean=np.mean(importances, axis=1), + importances_std=np.std(importances, axis=1), + importances=importances, + ) + + +@validate_params( + { + "estimator": [HasMethods(["fit"])], + "X": ["array-like"], + "y": ["array-like", None], + "scoring": [ + StrOptions(set(get_scorer_names())), + callable, + list, + tuple, + dict, + None, + ], + "n_repeats": [Interval(Integral, 1, None, closed="left")], + "n_jobs": [Integral, None], + "random_state": ["random_state"], + "sample_weight": ["array-like", None], + "max_samples": [ + Interval(Integral, 1, None, closed="left"), + Interval(RealNotInt, 0, 1, closed="right"), + ], + }, + prefer_skip_nested_validation=True, +) +def permutation_importance( + estimator, + X, + y, + *, + scoring=None, + n_repeats=5, + n_jobs=None, + random_state=None, + sample_weight=None, + max_samples=1.0, +): + """Permutation importance for feature evaluation [BRE]_. + + The :term:`estimator` is required to be a fitted estimator. `X` can be the + data set used to train the estimator or a hold-out set. The permutation + importance of a feature is calculated as follows. First, a baseline metric, + defined by :term:`scoring`, is evaluated on a (potentially different) + dataset defined by the `X`. Next, a feature column from the validation set + is permuted and the metric is evaluated again. The permutation importance + is defined to be the difference between the baseline metric and metric from + permutating the feature column. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimator : object + An estimator that has already been :term:`fitted` and is compatible + with :term:`scorer`. + + X : ndarray or DataFrame, shape (n_samples, n_features) + Data on which permutation importance will be computed. + + y : array-like or None, shape (n_samples, ) or (n_samples, n_classes) + Targets for supervised or `None` for unsupervised. + + scoring : str, callable, list, tuple, or dict, default=None + Scorer to use. + If `scoring` represents a single score, one can use: + + - str: see :ref:`scoring_string_names` for options. + - callable: a scorer callable object (e.g., function) with signature + ``scorer(estimator, X, y)``. See :ref:`scoring_callable` for details. + - `None`: the `estimator`'s + :ref:`default evaluation criterion ` is used. + + If `scoring` represents multiple scores, one can use: + + - a list or tuple of unique strings; + - a callable returning a dictionary where the keys are the metric + names and the values are the metric scores; + - a dictionary with metric names as keys and callables a values. + + Passing multiple scores to `scoring` is more efficient than calling + `permutation_importance` for each of the scores as it reuses + predictions to avoid redundant computation. + + n_repeats : int, default=5 + Number of times to permute a feature. + + n_jobs : int or None, default=None + Number of jobs to run in parallel. The computation is done by computing + permutation score for each columns and parallelized over the columns. + `None` means 1 unless in a :obj:`joblib.parallel_backend` context. + `-1` means using all processors. See :term:`Glossary ` + for more details. 
+ + random_state : int, RandomState instance, default=None + Pseudo-random number generator to control the permutations of each + feature. + Pass an int to get reproducible results across function calls. + See :term:`Glossary `. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights used in scoring. + + .. versionadded:: 0.24 + + max_samples : int or float, default=1.0 + The number of samples to draw from X to compute feature importance + in each repeat (without replacement). + + - If int, then draw `max_samples` samples. + - If float, then draw `max_samples * X.shape[0]` samples. + - If `max_samples` is equal to `1.0` or `X.shape[0]`, all samples + will be used. + + While using this option may provide less accurate importance estimates, + it keeps the method tractable when evaluating feature importance on + large datasets. In combination with `n_repeats`, this allows to control + the computational speed vs statistical accuracy trade-off of this method. + + .. versionadded:: 1.0 + + Returns + ------- + result : :class:`~sklearn.utils.Bunch` or dict of such instances + Dictionary-like object, with the following attributes. + + importances_mean : ndarray of shape (n_features, ) + Mean of feature importance over `n_repeats`. + importances_std : ndarray of shape (n_features, ) + Standard deviation over `n_repeats`. + importances : ndarray of shape (n_features, n_repeats) + Raw permutation importance scores. + + If there are multiple scoring metrics in the scoring parameter + `result` is a dict with scorer names as keys (e.g. 'roc_auc') and + `Bunch` objects like above as values. + + References + ---------- + .. [BRE] :doi:`L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32, + 2001. <10.1023/A:1010933404324>` + + Examples + -------- + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.inspection import permutation_importance + >>> X = [[1, 9, 9],[1, 9, 9],[1, 9, 9], + ... [0, 9, 9],[0, 9, 9],[0, 9, 9]] + >>> y = [1, 1, 1, 0, 0, 0] + >>> clf = LogisticRegression().fit(X, y) + >>> result = permutation_importance(clf, X, y, n_repeats=10, + ... random_state=0) + >>> result.importances_mean + array([0.4666, 0. , 0. ]) + >>> result.importances_std + array([0.2211, 0. , 0. ]) + """ + if not hasattr(X, "iloc"): + X = check_array(X, ensure_all_finite="allow-nan", dtype=None) + + # Precompute random seed from the random state to be used + # to get a fresh independent RandomState instance for each + # parallel call to _calculate_permutation_scores, irrespective of + # the fact that variables are shared or not depending on the active + # joblib backend (sequential, thread-based or process-based). 
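As an editorial aside to the seeding comment above (each parallel per-column task receives an independent seed derived from `random_state`), here is a minimal usage sketch of the public `permutation_importance` function defined in this vendored module. It is not part of the file; the synthetic dataset, estimator, and parameter values are illustrative assumptions. It exercises the multi-metric `scoring` form documented above, where the return value becomes a dict of `Bunch` objects keyed by scorer name.

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

# Assumed toy problem: a small binary classification task.
X, y = make_classification(n_samples=300, n_features=5, random_state=0)
clf = RandomForestClassifier(n_estimators=50, random_state=0).fit(X, y)

# Several scorers reuse the same permutations, so this is cheaper than calling
# permutation_importance once per metric; columns are permuted in parallel (n_jobs).
result = permutation_importance(
    clf, X, y,
    scoring=["accuracy", "roc_auc"],
    n_repeats=10,
    n_jobs=2,
    random_state=0,
)
for name, bunch in result.items():
    print(name, bunch.importances_mean.round(3))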
+ random_state = check_random_state(random_state) + random_seed = random_state.randint(np.iinfo(np.int32).max + 1) + + if not isinstance(max_samples, numbers.Integral): + max_samples = int(max_samples * X.shape[0]) + elif max_samples > X.shape[0]: + raise ValueError("max_samples must be <= n_samples") + + scorer = check_scoring(estimator, scoring=scoring) + baseline_score = _weights_scorer(scorer, estimator, X, y, sample_weight) + + scores = Parallel(n_jobs=n_jobs)( + delayed(_calculate_permutation_scores)( + estimator, + X, + y, + sample_weight, + col_idx, + random_seed, + n_repeats, + scorer, + max_samples, + ) + for col_idx in range(X.shape[1]) + ) + + if isinstance(baseline_score, dict): + return { + name: _create_importances_bunch( + baseline_score[name], + # unpack the permuted scores + np.array([scores[col_idx][name] for col_idx in range(X.shape[1])]), + ) + for name in baseline_score + } + else: + return _create_importances_bunch(baseline_score, np.array(scores)) diff --git a/.venv/lib/python3.12/site-packages/sklearn/inspection/_plot/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/inspection/_plot/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..67dd18fb94b593f0a3125c1f5833f3b9597614ba --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/inspection/_plot/__init__.py @@ -0,0 +1,2 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause diff --git a/.venv/lib/python3.12/site-packages/sklearn/inspection/_plot/decision_boundary.py b/.venv/lib/python3.12/site-packages/sklearn/inspection/_plot/decision_boundary.py new file mode 100644 index 0000000000000000000000000000000000000000..2ef85380583937f564891e8705b7ac91eff0f321 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/inspection/_plot/decision_boundary.py @@ -0,0 +1,564 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings + +import numpy as np + +from ...base import is_regressor +from ...preprocessing import LabelEncoder +from ...utils import _safe_indexing +from ...utils._optional_dependencies import check_matplotlib_support +from ...utils._response import _get_response_values +from ...utils._set_output import _get_adapter_from_container +from ...utils.validation import ( + _is_arraylike_not_scalar, + _is_pandas_df, + _is_polars_df, + _num_features, + check_is_fitted, +) + + +def _check_boundary_response_method(estimator, response_method, class_of_interest): + """Validate the response methods to be used with the fitted estimator. + + Parameters + ---------- + estimator : object + Fitted estimator to check. + + response_method : {'auto', 'decision_function', 'predict_proba', 'predict'} + Specifies whether to use :term:`decision_function`, :term:`predict_proba`, + :term:`predict` as the target response. If set to 'auto', the response method is + tried in the before mentioned order. + + class_of_interest : int, float, bool, str or None + The class considered when plotting the decision. Cannot be None if + multiclass and `response_method` is 'predict_proba' or 'decision_function'. + + .. versionadded:: 1.4 + + Returns + ------- + prediction_method : list of str or str + The name or list of names of the response methods to use. 
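For orientation, a tiny editorial illustration of what the private helper documented above returns; it is not part of the vendored file, the two estimators are arbitrary assumptions, and since `_check_boundary_response_method` is a private API it may change without notice.

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.inspection._plot.decision_boundary import _check_boundary_response_method

# Regressors resolve 'auto' directly to 'predict'.
reg = LinearRegression().fit([[0.0], [1.0]], [0.0, 1.0])
print(_check_boundary_response_method(reg, "auto", None))   # 'predict'

# Classifiers keep the full preference order; the caller later picks the first
# method the estimator actually implements.
clf = LogisticRegression().fit([[0.0], [1.0]], [0, 1])
print(_check_boundary_response_method(clf, "auto", None))
# ['decision_function', 'predict_proba', 'predict']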
+ """ + has_classes = hasattr(estimator, "classes_") + if has_classes and _is_arraylike_not_scalar(estimator.classes_[0]): + msg = "Multi-label and multi-output multi-class classifiers are not supported" + raise ValueError(msg) + + if response_method == "auto": + if is_regressor(estimator): + prediction_method = "predict" + else: + prediction_method = ["decision_function", "predict_proba", "predict"] + else: + prediction_method = response_method + + return prediction_method + + +class DecisionBoundaryDisplay: + """Decisions boundary visualization. + + It is recommended to use + :func:`~sklearn.inspection.DecisionBoundaryDisplay.from_estimator` + to create a :class:`DecisionBoundaryDisplay`. All parameters are stored as + attributes. + + Read more in the :ref:`User Guide `. + + For a detailed example comparing the decision boundaries of multinomial and + one-vs-rest logistic regression, please see + :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_multinomial.py`. + + .. versionadded:: 1.1 + + Parameters + ---------- + xx0 : ndarray of shape (grid_resolution, grid_resolution) + First output of :func:`meshgrid `. + + xx1 : ndarray of shape (grid_resolution, grid_resolution) + Second output of :func:`meshgrid `. + + response : ndarray of shape (grid_resolution, grid_resolution) or \ + (grid_resolution, grid_resolution, n_classes) + Values of the response function. + + multiclass_colors : list of str or str, default=None + Specifies how to color each class when plotting all classes of multiclass + problem. Ignored for binary problems and multiclass problems when plotting a + single prediction value per point. + Possible inputs are: + + * list: list of Matplotlib + `color `_ + strings, of length `n_classes` + * str: name of :class:`matplotlib.colors.Colormap` + * None: 'viridis' colormap is used to sample colors + + Single color colormaps will be generated from the colors in the list or + colors taken from the colormap and passed to the `cmap` parameter of + the `plot_method`. + + .. versionadded:: 1.7 + + xlabel : str, default=None + Default label to place on x axis. + + ylabel : str, default=None + Default label to place on y axis. + + Attributes + ---------- + surface_ : matplotlib `QuadContourSet` or `QuadMesh` or list of such objects + If `plot_method` is 'contour' or 'contourf', `surface_` is + :class:`QuadContourSet `. If + `plot_method` is 'pcolormesh', `surface_` is + :class:`QuadMesh `. + + multiclass_colors_ : array of shape (n_classes, 4) + Colors used to plot each class in multiclass problems. + Only defined when `color_of_interest` is None. + + .. versionadded:: 1.7 + + ax_ : matplotlib Axes + Axes with decision boundary. + + figure_ : matplotlib Figure + Figure containing the decision boundary. + + See Also + -------- + DecisionBoundaryDisplay.from_estimator : Plot decision boundary given an estimator. + + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> import numpy as np + >>> from sklearn.datasets import load_iris + >>> from sklearn.inspection import DecisionBoundaryDisplay + >>> from sklearn.tree import DecisionTreeClassifier + >>> iris = load_iris() + >>> feature_1, feature_2 = np.meshgrid( + ... np.linspace(iris.data[:, 0].min(), iris.data[:, 0].max()), + ... np.linspace(iris.data[:, 1].min(), iris.data[:, 1].max()) + ... 
) + >>> grid = np.vstack([feature_1.ravel(), feature_2.ravel()]).T + >>> tree = DecisionTreeClassifier().fit(iris.data[:, :2], iris.target) + >>> y_pred = np.reshape(tree.predict(grid), feature_1.shape) + >>> display = DecisionBoundaryDisplay( + ... xx0=feature_1, xx1=feature_2, response=y_pred + ... ) + >>> display.plot() + <...> + >>> display.ax_.scatter( + ... iris.data[:, 0], iris.data[:, 1], c=iris.target, edgecolor="black" + ... ) + <...> + >>> plt.show() + """ + + def __init__( + self, *, xx0, xx1, response, multiclass_colors=None, xlabel=None, ylabel=None + ): + self.xx0 = xx0 + self.xx1 = xx1 + self.response = response + self.multiclass_colors = multiclass_colors + self.xlabel = xlabel + self.ylabel = ylabel + + def plot(self, plot_method="contourf", ax=None, xlabel=None, ylabel=None, **kwargs): + """Plot visualization. + + Parameters + ---------- + plot_method : {'contourf', 'contour', 'pcolormesh'}, default='contourf' + Plotting method to call when plotting the response. Please refer + to the following matplotlib documentation for details: + :func:`contourf `, + :func:`contour `, + :func:`pcolormesh `. + + ax : Matplotlib axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + xlabel : str, default=None + Overwrite the x-axis label. + + ylabel : str, default=None + Overwrite the y-axis label. + + **kwargs : dict + Additional keyword arguments to be passed to the `plot_method`. + + Returns + ------- + display: :class:`~sklearn.inspection.DecisionBoundaryDisplay` + Object that stores computed values. + """ + check_matplotlib_support("DecisionBoundaryDisplay.plot") + import matplotlib as mpl + import matplotlib.pyplot as plt + + if plot_method not in ("contourf", "contour", "pcolormesh"): + raise ValueError( + "plot_method must be 'contourf', 'contour', or 'pcolormesh'. " + f"Got {plot_method} instead." + ) + + if ax is None: + _, ax = plt.subplots() + + plot_func = getattr(ax, plot_method) + if self.response.ndim == 2: + self.surface_ = plot_func(self.xx0, self.xx1, self.response, **kwargs) + else: # self.response.ndim == 3 + n_responses = self.response.shape[-1] + for kwarg in ("cmap", "colors"): + if kwarg in kwargs: + warnings.warn( + f"'{kwarg}' is ignored in favor of 'multiclass_colors' " + "in the multiclass case when the response method is " + "'decision_function' or 'predict_proba'." + ) + del kwargs[kwarg] + + if self.multiclass_colors is None or isinstance( + self.multiclass_colors, str + ): + if self.multiclass_colors is None: + cmap = "tab10" if n_responses <= 10 else "gist_rainbow" + else: + cmap = self.multiclass_colors + + # Special case for the tab10 and tab20 colormaps that encode a + # discrete set of colors that are easily distinguishable + # contrary to other colormaps that are continuous. 
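As an editorial aside to the multiclass color handling above, a minimal usage sketch of the path where every class gets its own single-color colormap. It is not part of the vendored file; the iris data, estimator, and the three colors passed to `multiclass_colors` are illustrative assumptions.

import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.inspection import DecisionBoundaryDisplay

iris = load_iris()
X = iris.data[:, :2]                      # from_estimator expects exactly 2 features
clf = LogisticRegression(max_iter=1000).fit(X, iris.target)

disp = DecisionBoundaryDisplay.from_estimator(
    clf,
    X,
    response_method="predict_proba",      # 3 classes -> one shaded surface per class
    multiclass_colors=["tab:red", "tab:green", "tab:blue"],
    alpha=0.5,
    xlabel=iris.feature_names[0],
    ylabel=iris.feature_names[1],
)
disp.ax_.scatter(X[:, 0], X[:, 1], c=iris.target, edgecolor="k")
plt.show()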
+ if cmap == "tab10" and n_responses <= 10: + colors = plt.get_cmap("tab10", 10).colors[:n_responses] + elif cmap == "tab20" and n_responses <= 20: + colors = plt.get_cmap("tab20", 20).colors[:n_responses] + else: + cmap = plt.get_cmap(cmap, n_responses) + if not hasattr(cmap, "colors"): + # For LinearSegmentedColormap + colors = cmap(np.linspace(0, 1, n_responses)) + else: + colors = cmap.colors + elif isinstance(self.multiclass_colors, list): + colors = [mpl.colors.to_rgba(color) for color in self.multiclass_colors] + else: + raise ValueError("'multiclass_colors' must be a list or a str.") + + self.multiclass_colors_ = colors + if plot_method == "contour": + # Plot only argmax map for contour + class_map = self.response.argmax(axis=2) + self.surface_ = plot_func( + self.xx0, self.xx1, class_map, colors=colors, **kwargs + ) + else: + multiclass_cmaps = [ + mpl.colors.LinearSegmentedColormap.from_list( + f"colormap_{class_idx}", [(1.0, 1.0, 1.0, 1.0), (r, g, b, 1.0)] + ) + for class_idx, (r, g, b, _) in enumerate(colors) + ] + + self.surface_ = [] + for class_idx, cmap in enumerate(multiclass_cmaps): + response = np.ma.array( + self.response[:, :, class_idx], + mask=~(self.response.argmax(axis=2) == class_idx), + ) + self.surface_.append( + plot_func(self.xx0, self.xx1, response, cmap=cmap, **kwargs) + ) + + if xlabel is not None or not ax.get_xlabel(): + xlabel = self.xlabel if xlabel is None else xlabel + ax.set_xlabel(xlabel) + if ylabel is not None or not ax.get_ylabel(): + ylabel = self.ylabel if ylabel is None else ylabel + ax.set_ylabel(ylabel) + + self.ax_ = ax + self.figure_ = ax.figure + return self + + @classmethod + def from_estimator( + cls, + estimator, + X, + *, + grid_resolution=100, + eps=1.0, + plot_method="contourf", + response_method="auto", + class_of_interest=None, + multiclass_colors=None, + xlabel=None, + ylabel=None, + ax=None, + **kwargs, + ): + """Plot decision boundary given an estimator. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimator : object + Trained estimator used to plot the decision boundary. + + X : {array-like, sparse matrix, dataframe} of shape (n_samples, 2) + Input data that should be only 2-dimensional. + + grid_resolution : int, default=100 + Number of grid points to use for plotting decision boundary. + Higher values will make the plot look nicer but be slower to + render. + + eps : float, default=1.0 + Extends the minimum and maximum values of X for evaluating the + response function. + + plot_method : {'contourf', 'contour', 'pcolormesh'}, default='contourf' + Plotting method to call when plotting the response. Please refer + to the following matplotlib documentation for details: + :func:`contourf `, + :func:`contour `, + :func:`pcolormesh `. + + response_method : {'auto', 'decision_function', 'predict_proba', \ + 'predict'}, default='auto' + Specifies whether to use :term:`decision_function`, + :term:`predict_proba` or :term:`predict` as the target response. + If set to 'auto', the response method is tried in the order as + listed above. + + .. versionchanged:: 1.6 + For multiclass problems, 'auto' no longer defaults to 'predict'. + + class_of_interest : int, float, bool or str, default=None + The class to be plotted when `response_method` is 'predict_proba' + or 'decision_function'. If None, `estimator.classes_[1]` is considered + the positive class for binary classifiers. 
For multiclass + classifiers, if None, all classes will be represented in the + decision boundary plot; the class with the highest response value + at each point is plotted. The color of each class can be set via + `multiclass_colors`. + + .. versionadded:: 1.4 + + multiclass_colors : list of str, or str, default=None + Specifies how to color each class when plotting multiclass + 'predict_proba' or 'decision_function' and `class_of_interest` is + None. Ignored in all other cases. + + Possible inputs are: + + * list: list of Matplotlib + `color `_ + strings, of length `n_classes` + * str: name of :class:`matplotlib.colors.Colormap` + * None: 'tab10' colormap is used to sample colors if the number of + classes is less than or equal to 10, otherwise 'gist_rainbow' + colormap. + + Single color colormaps will be generated from the colors in the list or + colors taken from the colormap, and passed to the `cmap` parameter of + the `plot_method`. + + .. versionadded:: 1.7 + + xlabel : str, default=None + The label used for the x-axis. If `None`, an attempt is made to + extract a label from `X` if it is a dataframe, otherwise an empty + string is used. + + ylabel : str, default=None + The label used for the y-axis. If `None`, an attempt is made to + extract a label from `X` if it is a dataframe, otherwise an empty + string is used. + + ax : Matplotlib axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + **kwargs : dict + Additional keyword arguments to be passed to the + `plot_method`. + + Returns + ------- + display : :class:`~sklearn.inspection.DecisionBoundaryDisplay` + Object that stores the result. + + See Also + -------- + DecisionBoundaryDisplay : Decision boundary visualization. + sklearn.metrics.ConfusionMatrixDisplay.from_estimator : Plot the + confusion matrix given an estimator, the data, and the label. + sklearn.metrics.ConfusionMatrixDisplay.from_predictions : Plot the + confusion matrix given the true and predicted labels. + + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import load_iris + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.inspection import DecisionBoundaryDisplay + >>> iris = load_iris() + >>> X = iris.data[:, :2] + >>> classifier = LogisticRegression().fit(X, iris.target) + >>> disp = DecisionBoundaryDisplay.from_estimator( + ... classifier, X, response_method="predict", + ... xlabel=iris.feature_names[0], ylabel=iris.feature_names[1], + ... alpha=0.5, + ... ) + >>> disp.ax_.scatter(X[:, 0], X[:, 1], c=iris.target, edgecolor="k") + <...> + >>> plt.show() + """ + check_matplotlib_support(f"{cls.__name__}.from_estimator") + check_is_fitted(estimator) + import matplotlib as mpl + + if not grid_resolution > 1: + raise ValueError( + "grid_resolution must be greater than 1. Got" + f" {grid_resolution} instead." + ) + + if not eps >= 0: + raise ValueError( + f"eps must be greater than or equal to 0. Got {eps} instead." + ) + + possible_plot_methods = ("contourf", "contour", "pcolormesh") + if plot_method not in possible_plot_methods: + available_methods = ", ".join(possible_plot_methods) + raise ValueError( + f"plot_method must be one of {available_methods}. " + f"Got {plot_method} instead." + ) + + num_features = _num_features(X) + if num_features != 2: + raise ValueError( + f"n_features must be equal to 2. Got {num_features} instead." 
+ ) + + if ( + response_method in ("predict_proba", "decision_function", "auto") + and multiclass_colors is not None + and hasattr(estimator, "classes_") + and (n_classes := len(estimator.classes_)) > 2 + ): + if isinstance(multiclass_colors, list): + if len(multiclass_colors) != n_classes: + raise ValueError( + "When 'multiclass_colors' is a list, it must be of the same " + f"length as 'estimator.classes_' ({n_classes}), got: " + f"{len(multiclass_colors)}." + ) + elif any( + not mpl.colors.is_color_like(col) for col in multiclass_colors + ): + raise ValueError( + "When 'multiclass_colors' is a list, it can only contain valid" + f" Matplotlib color names. Got: {multiclass_colors}" + ) + if isinstance(multiclass_colors, str): + if multiclass_colors not in mpl.pyplot.colormaps(): + raise ValueError( + "When 'multiclass_colors' is a string, it must be a valid " + f"Matplotlib colormap. Got: {multiclass_colors}" + ) + + x0, x1 = _safe_indexing(X, 0, axis=1), _safe_indexing(X, 1, axis=1) + + x0_min, x0_max = x0.min() - eps, x0.max() + eps + x1_min, x1_max = x1.min() - eps, x1.max() + eps + + xx0, xx1 = np.meshgrid( + np.linspace(x0_min, x0_max, grid_resolution), + np.linspace(x1_min, x1_max, grid_resolution), + ) + + X_grid = np.c_[xx0.ravel(), xx1.ravel()] + if _is_pandas_df(X) or _is_polars_df(X): + adapter = _get_adapter_from_container(X) + X_grid = adapter.create_container( + X_grid, + X_grid, + columns=X.columns, + ) + + prediction_method = _check_boundary_response_method( + estimator, response_method, class_of_interest + ) + try: + response, _, response_method_used = _get_response_values( + estimator, + X_grid, + response_method=prediction_method, + pos_label=class_of_interest, + return_response_method_used=True, + ) + except ValueError as exc: + if "is not a valid label" in str(exc): + # re-raise a more informative error message since `pos_label` is unknown + # to our user when interacting with + # `DecisionBoundaryDisplay.from_estimator` + raise ValueError( + f"class_of_interest={class_of_interest} is not a valid label: It " + f"should be one of {estimator.classes_}" + ) from exc + raise + + # convert classes predictions into integers + if response_method_used == "predict" and hasattr(estimator, "classes_"): + encoder = LabelEncoder() + encoder.classes_ = estimator.classes_ + response = encoder.transform(response) + + if response.ndim == 1: + response = response.reshape(*xx0.shape) + else: + if is_regressor(estimator): + raise ValueError("Multi-output regressors are not supported") + + if class_of_interest is not None: + # For the multiclass case, `_get_response_values` returns the response + # as-is. Thus, we have a column per class and we need to select the + # column corresponding to the positive class. 
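The block below selects the response column matching `class_of_interest`; for reference, a short editorial sketch (not part of the vendored file; iris-style data and parameter values are assumptions) of plotting a single class's probability surface. Passing a label that is not in `estimator.classes_` would trigger the more informative ValueError re-raised in the except-block above.

import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.inspection import DecisionBoundaryDisplay

X, y = load_iris(return_X_y=True)
X = X[:, :2]
clf = LogisticRegression(max_iter=1000).fit(X, y)

# Only the response for class 2 is drawn on the grid.
disp = DecisionBoundaryDisplay.from_estimator(
    clf,
    X,
    response_method="predict_proba",
    class_of_interest=2,
    plot_method="pcolormesh",
    cmap="Blues",
)
plt.colorbar(disp.surface_, ax=disp.ax_)
plt.show()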
+ col_idx = np.flatnonzero(estimator.classes_ == class_of_interest)[0] + response = response[:, col_idx].reshape(*xx0.shape) + else: + response = response.reshape(*xx0.shape, response.shape[-1]) + + if xlabel is None: + xlabel = X.columns[0] if hasattr(X, "columns") else "" + + if ylabel is None: + ylabel = X.columns[1] if hasattr(X, "columns") else "" + + display = cls( + xx0=xx0, + xx1=xx1, + response=response, + multiclass_colors=multiclass_colors, + xlabel=xlabel, + ylabel=ylabel, + ) + return display.plot(ax=ax, plot_method=plot_method, **kwargs) diff --git a/.venv/lib/python3.12/site-packages/sklearn/inspection/_plot/partial_dependence.py b/.venv/lib/python3.12/site-packages/sklearn/inspection/_plot/partial_dependence.py new file mode 100644 index 0000000000000000000000000000000000000000..b31a5070b236b811195f97b6643be7b4c191343e --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/inspection/_plot/partial_dependence.py @@ -0,0 +1,1495 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numbers +from itertools import chain +from math import ceil + +import numpy as np +from scipy import sparse +from scipy.stats.mstats import mquantiles + +from ...base import is_regressor +from ...utils import ( + Bunch, + _safe_indexing, + check_array, + check_random_state, +) +from ...utils._encode import _unique +from ...utils._optional_dependencies import check_matplotlib_support +from ...utils._plotting import _validate_style_kwargs +from ...utils.parallel import Parallel, delayed +from .. import partial_dependence +from .._pd_utils import _check_feature_names, _get_feature_index + + +class PartialDependenceDisplay: + """Partial Dependence Plot (PDP) and Individual Conditional Expectation (ICE). + + It is recommended to use + :func:`~sklearn.inspection.PartialDependenceDisplay.from_estimator` to create a + :class:`~sklearn.inspection.PartialDependenceDisplay`. All parameters are stored + as attributes. + + For general information regarding `scikit-learn` visualization tools, see + the :ref:`Visualization Guide `. + For guidance on interpreting these plots, refer to the + :ref:`Inspection Guide `. + + For an example on how to use this class, see the following example: + :ref:`sphx_glr_auto_examples_miscellaneous_plot_partial_dependence_visualization_api.py`. + + .. versionadded:: 0.22 + + Parameters + ---------- + pd_results : list of Bunch + Results of :func:`~sklearn.inspection.partial_dependence` for + ``features``. + + features : list of (int,) or list of (int, int) + Indices of features for a given plot. A tuple of one integer will plot + a partial dependence curve of one feature. A tuple of two integers will + plot a two-way partial dependence curve as a contour plot. + + feature_names : list of str + Feature names corresponding to the indices in ``features``. + + target_idx : int + + - In a multiclass setting, specifies the class for which the PDPs + should be computed. Note that for binary classification, the + positive class (index 1) is always used. + - In a multioutput setting, specifies the task for which the PDPs + should be computed. + + Ignored in binary classification or classical regression settings. + + deciles : dict + Deciles for feature indices in ``features``. + + kind : {'average', 'individual', 'both'} or list of such str, \ + default='average' + Whether to plot the partial dependence averaged across all the samples + in the dataset or one line per sample or both. 
+ + - ``kind='average'`` results in the traditional PD plot; + - ``kind='individual'`` results in the ICE plot; + - ``kind='both'`` results in plotting both the ICE and PD on the same + plot. + + A list of such strings can be provided to specify `kind` on a per-plot + basis. The length of the list should be the same as the number of + interaction requested in `features`. + + .. note:: + ICE ('individual' or 'both') is not a valid option for 2-ways + interactions plot. As a result, an error will be raised. + 2-ways interaction plots should always be configured to + use the 'average' kind instead. + + .. note:: + The fast ``method='recursion'`` option is only available for + `kind='average'` and `sample_weights=None`. Computing individual + dependencies and doing weighted averages requires using the slower + `method='brute'`. + + .. versionadded:: 0.24 + Add `kind` parameter with `'average'`, `'individual'`, and `'both'` + options. + + .. versionadded:: 1.1 + Add the possibility to pass a list of string specifying `kind` + for each plot. + + subsample : float, int or None, default=1000 + Sampling for ICE curves when `kind` is 'individual' or 'both'. + If float, should be between 0.0 and 1.0 and represent the proportion + of the dataset to be used to plot ICE curves. If int, represents the + maximum absolute number of samples to use. + + Note that the full dataset is still used to calculate partial + dependence when `kind='both'`. + + .. versionadded:: 0.24 + + random_state : int, RandomState instance or None, default=None + Controls the randomness of the selected samples when subsamples is not + `None`. See :term:`Glossary ` for details. + + .. versionadded:: 0.24 + + is_categorical : list of (bool,) or list of (bool, bool), default=None + Whether each target feature in `features` is categorical or not. + The list should be same size as `features`. If `None`, all features + are assumed to be continuous. + + .. versionadded:: 1.2 + + Attributes + ---------- + bounding_ax_ : matplotlib Axes or None + If `ax` is an axes or None, the `bounding_ax_` is the axes where the + grid of partial dependence plots are drawn. If `ax` is a list of axes + or a numpy array of axes, `bounding_ax_` is None. + + axes_ : ndarray of matplotlib Axes + If `ax` is an axes or None, `axes_[i, j]` is the axes on the i-th row + and j-th column. If `ax` is a list of axes, `axes_[i]` is the i-th item + in `ax`. Elements that are None correspond to a nonexisting axes in + that position. + + lines_ : ndarray of matplotlib Artists + If `ax` is an axes or None, `lines_[i, j]` is the partial dependence + curve on the i-th row and j-th column. If `ax` is a list of axes, + `lines_[i]` is the partial dependence curve corresponding to the i-th + item in `ax`. Elements that are None correspond to a nonexisting axes + or an axes that does not include a line plot. + + deciles_vlines_ : ndarray of matplotlib LineCollection + If `ax` is an axes or None, `vlines_[i, j]` is the line collection + representing the x axis deciles of the i-th row and j-th column. If + `ax` is a list of axes, `vlines_[i]` corresponds to the i-th item in + `ax`. Elements that are None correspond to a nonexisting axes or an + axes that does not include a PDP plot. + + .. versionadded:: 0.23 + + deciles_hlines_ : ndarray of matplotlib LineCollection + If `ax` is an axes or None, `vlines_[i, j]` is the line collection + representing the y axis deciles of the i-th row and j-th column. If + `ax` is a list of axes, `vlines_[i]` corresponds to the i-th item in + `ax`. 
Elements that are None correspond to a nonexisting axes or an + axes that does not include a 2-way plot. + + .. versionadded:: 0.23 + + contours_ : ndarray of matplotlib Artists + If `ax` is an axes or None, `contours_[i, j]` is the partial dependence + plot on the i-th row and j-th column. If `ax` is a list of axes, + `contours_[i]` is the partial dependence plot corresponding to the i-th + item in `ax`. Elements that are None correspond to a nonexisting axes + or an axes that does not include a contour plot. + + bars_ : ndarray of matplotlib Artists + If `ax` is an axes or None, `bars_[i, j]` is the partial dependence bar + plot on the i-th row and j-th column (for a categorical feature). + If `ax` is a list of axes, `bars_[i]` is the partial dependence bar + plot corresponding to the i-th item in `ax`. Elements that are None + correspond to a nonexisting axes or an axes that does not include a + bar plot. + + .. versionadded:: 1.2 + + heatmaps_ : ndarray of matplotlib Artists + If `ax` is an axes or None, `heatmaps_[i, j]` is the partial dependence + heatmap on the i-th row and j-th column (for a pair of categorical + features) . If `ax` is a list of axes, `heatmaps_[i]` is the partial + dependence heatmap corresponding to the i-th item in `ax`. Elements + that are None correspond to a nonexisting axes or an axes that does not + include a heatmap. + + .. versionadded:: 1.2 + + figure_ : matplotlib Figure + Figure containing partial dependence plots. + + See Also + -------- + partial_dependence : Compute Partial Dependence values. + PartialDependenceDisplay.from_estimator : Plot Partial Dependence. + + Examples + -------- + >>> import numpy as np + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_friedman1 + >>> from sklearn.ensemble import GradientBoostingRegressor + >>> from sklearn.inspection import PartialDependenceDisplay + >>> from sklearn.inspection import partial_dependence + >>> X, y = make_friedman1() + >>> clf = GradientBoostingRegressor(n_estimators=10).fit(X, y) + >>> features, feature_names = [(0,)], [f"Features #{i}" for i in range(X.shape[1])] + >>> deciles = {0: np.linspace(0, 1, num=5)} + >>> pd_results = partial_dependence( + ... clf, X, features=0, kind="average", grid_resolution=5) + >>> display = PartialDependenceDisplay( + ... [pd_results], features=features, feature_names=feature_names, + ... target_idx=0, deciles=deciles + ... ) + >>> display.plot(pdp_lim={1: (-1.38, 0.66)}) + <...> + >>> plt.show() + """ + + def __init__( + self, + pd_results, + *, + features, + feature_names, + target_idx, + deciles, + kind="average", + subsample=1000, + random_state=None, + is_categorical=None, + ): + self.pd_results = pd_results + self.features = features + self.feature_names = feature_names + self.target_idx = target_idx + self.deciles = deciles + self.kind = kind + self.subsample = subsample + self.random_state = random_state + self.is_categorical = is_categorical + + @classmethod + def from_estimator( + cls, + estimator, + X, + features, + *, + sample_weight=None, + categorical_features=None, + feature_names=None, + target=None, + response_method="auto", + n_cols=3, + grid_resolution=100, + percentiles=(0.05, 0.95), + custom_values=None, + method="auto", + n_jobs=None, + verbose=0, + line_kw=None, + ice_lines_kw=None, + pd_line_kw=None, + contour_kw=None, + ax=None, + kind="average", + centered=False, + subsample=1000, + random_state=None, + ): + """Partial dependence (PD) and individual conditional expectation (ICE) plots. 
+ + Partial dependence plots, individual conditional expectation plots, or an + overlay of both can be plotted by setting the `kind` parameter. + This method generates one plot for each entry in `features`. The plots + are arranged in a grid with `n_cols` columns. For one-way partial + dependence plots, the deciles of the feature values are shown on the + x-axis. For two-way plots, the deciles are shown on both axes and PDPs + are contour plots. + + For general information regarding `scikit-learn` visualization tools, see + the :ref:`Visualization Guide `. + For guidance on interpreting these plots, refer to the + :ref:`Inspection Guide `. + + For an example on how to use this class method, see + :ref:`sphx_glr_auto_examples_inspection_plot_partial_dependence.py`. + + .. note:: + + :func:`PartialDependenceDisplay.from_estimator` does not support using the + same axes with multiple calls. To plot the partial dependence for + multiple estimators, please pass the axes created by the first call to the + second call:: + + >>> from sklearn.inspection import PartialDependenceDisplay + >>> from sklearn.datasets import make_friedman1 + >>> from sklearn.linear_model import LinearRegression + >>> from sklearn.ensemble import RandomForestRegressor + >>> X, y = make_friedman1() + >>> est1 = LinearRegression().fit(X, y) + >>> est2 = RandomForestRegressor().fit(X, y) + >>> disp1 = PartialDependenceDisplay.from_estimator(est1, X, + ... [1, 2]) + >>> disp2 = PartialDependenceDisplay.from_estimator(est2, X, [1, 2], + ... ax=disp1.axes_) + + .. warning:: + + For :class:`~sklearn.ensemble.GradientBoostingClassifier` and + :class:`~sklearn.ensemble.GradientBoostingRegressor`, the + `'recursion'` method (used by default) will not account for the `init` + predictor of the boosting process. In practice, this will produce + the same values as `'brute'` up to a constant offset in the target + response, provided that `init` is a constant estimator (which is the + default). However, if `init` is not a constant estimator, the + partial dependence values are incorrect for `'recursion'` because the + offset will be sample-dependent. It is preferable to use the `'brute'` + method. Note that this only applies to + :class:`~sklearn.ensemble.GradientBoostingClassifier` and + :class:`~sklearn.ensemble.GradientBoostingRegressor`, not to + :class:`~sklearn.ensemble.HistGradientBoostingClassifier` and + :class:`~sklearn.ensemble.HistGradientBoostingRegressor`. + + .. versionadded:: 1.0 + + Parameters + ---------- + estimator : BaseEstimator + A fitted estimator object implementing :term:`predict`, + :term:`predict_proba`, or :term:`decision_function`. + Multioutput-multiclass classifiers are not supported. + + X : {array-like, dataframe} of shape (n_samples, n_features) + ``X`` is used to generate a grid of values for the target + ``features`` (where the partial dependence will be evaluated), and + also to generate values for the complement features when the + `method` is `'brute'`. + + features : list of {int, str, pair of int, pair of str} + The target features for which to create the PDPs. + If `features[i]` is an integer or a string, a one-way PDP is created; + if `features[i]` is a tuple, a two-way PDP is created (only supported + with `kind='average'`). Each tuple must be of size 2. + If any entry is a string, then it must be in ``feature_names``. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights are used to calculate weighted means when averaging the + model output. 
If `None`, then samples are equally weighted. If + `sample_weight` is not `None`, then `method` will be set to `'brute'`. + Note that `sample_weight` is ignored for `kind='individual'`. + + .. versionadded:: 1.3 + + categorical_features : array-like of shape (n_features,) or shape \ + (n_categorical_features,), dtype={bool, int, str}, default=None + Indicates the categorical features. + + - `None`: no feature will be considered categorical; + - boolean array-like: boolean mask of shape `(n_features,)` + indicating which features are categorical. Thus, this array has + the same shape has `X.shape[1]`; + - integer or string array-like: integer indices or strings + indicating categorical features. + + .. versionadded:: 1.2 + + feature_names : array-like of shape (n_features,), dtype=str, default=None + Name of each feature; `feature_names[i]` holds the name of the feature + with index `i`. + By default, the name of the feature corresponds to their numerical + index for NumPy array and their column name for pandas dataframe. + + target : int, default=None + - In a multiclass setting, specifies the class for which the PDPs + should be computed. Note that for binary classification, the + positive class (index 1) is always used. + - In a multioutput setting, specifies the task for which the PDPs + should be computed. + + Ignored in binary classification or classical regression settings. + + response_method : {'auto', 'predict_proba', 'decision_function'}, \ + default='auto' + Specifies whether to use :term:`predict_proba` or + :term:`decision_function` as the target response. For regressors + this parameter is ignored and the response is always the output of + :term:`predict`. By default, :term:`predict_proba` is tried first + and we revert to :term:`decision_function` if it doesn't exist. If + ``method`` is `'recursion'`, the response is always the output of + :term:`decision_function`. + + n_cols : int, default=3 + The maximum number of columns in the grid plot. Only active when `ax` + is a single axis or `None`. + + grid_resolution : int, default=100 + The number of equally spaced points on the axes of the plots, for each + target feature. + This parameter is overridden by `custom_values` if that parameter is set. + + percentiles : tuple of float, default=(0.05, 0.95) + The lower and upper percentile used to create the extreme values + for the PDP axes. Must be in [0, 1]. + This parameter is overridden by `custom_values` if that parameter is set. + + custom_values : dict + A dictionary mapping the index of an element of `features` to an + array of values where the partial dependence should be calculated + for that feature. Setting a range of values for a feature overrides + `grid_resolution` and `percentiles`. + + .. versionadded:: 1.7 + + method : str, default='auto' + The method used to calculate the averaged predictions: + + - `'recursion'` is only supported for some tree-based estimators + (namely + :class:`~sklearn.ensemble.GradientBoostingClassifier`, + :class:`~sklearn.ensemble.GradientBoostingRegressor`, + :class:`~sklearn.ensemble.HistGradientBoostingClassifier`, + :class:`~sklearn.ensemble.HistGradientBoostingRegressor`, + :class:`~sklearn.tree.DecisionTreeRegressor`, + :class:`~sklearn.ensemble.RandomForestRegressor` + but is more efficient in terms of speed. + With this method, the target response of a + classifier is always the decision function, not the predicted + probabilities. 
Since the `'recursion'` method implicitly computes + the average of the ICEs by design, it is not compatible with ICE and + thus `kind` must be `'average'`. + + - `'brute'` is supported for any estimator, but is more + computationally intensive. + + - `'auto'`: the `'recursion'` is used for estimators that support it, + and `'brute'` is used otherwise. If `sample_weight` is not `None`, + then `'brute'` is used regardless of the estimator. + + Please see :ref:`this note ` for + differences between the `'brute'` and `'recursion'` method. + + n_jobs : int, default=None + The number of CPUs to use to compute the partial dependences. + Computation is parallelized over features specified by the `features` + parameter. + + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + verbose : int, default=0 + Verbose output during PD computations. + + line_kw : dict, default=None + Dict with keywords passed to the ``matplotlib.pyplot.plot`` call. + For one-way partial dependence plots. It can be used to define common + properties for both `ice_lines_kw` and `pdp_line_kw`. + + ice_lines_kw : dict, default=None + Dictionary with keywords passed to the `matplotlib.pyplot.plot` call. + For ICE lines in the one-way partial dependence plots. + The key value pairs defined in `ice_lines_kw` takes priority over + `line_kw`. + + pd_line_kw : dict, default=None + Dictionary with keywords passed to the `matplotlib.pyplot.plot` call. + For partial dependence in one-way partial dependence plots. + The key value pairs defined in `pd_line_kw` takes priority over + `line_kw`. + + contour_kw : dict, default=None + Dict with keywords passed to the ``matplotlib.pyplot.contourf`` call. + For two-way partial dependence plots. + + ax : Matplotlib axes or array-like of Matplotlib axes, default=None + - If a single axis is passed in, it is treated as a bounding axes + and a grid of partial dependence plots will be drawn within + these bounds. The `n_cols` parameter controls the number of + columns in the grid. + - If an array-like of axes are passed in, the partial dependence + plots will be drawn directly into these axes. + - If `None`, a figure and a bounding axes is created and treated + as the single axes case. + + kind : {'average', 'individual', 'both'}, default='average' + Whether to plot the partial dependence averaged across all the samples + in the dataset or one line per sample or both. + + - ``kind='average'`` results in the traditional PD plot; + - ``kind='individual'`` results in the ICE plot. + + Note that the fast `method='recursion'` option is only available for + `kind='average'` and `sample_weights=None`. Computing individual + dependencies and doing weighted averages requires using the slower + `method='brute'`. + + centered : bool, default=False + If `True`, the ICE and PD lines will start at the origin of the + y-axis. By default, no centering is done. + + .. versionadded:: 1.1 + + subsample : float, int or None, default=1000 + Sampling for ICE curves when `kind` is 'individual' or 'both'. + If `float`, should be between 0.0 and 1.0 and represent the proportion + of the dataset to be used to plot ICE curves. If `int`, represents the + absolute number samples to use. + + Note that the full dataset is still used to calculate averaged partial + dependence when `kind='both'`. 
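Tying together the `subsample` and `kind` parameters described above, a brief editorial sketch (not part of the vendored file; the synthetic regression data and hyperparameters are assumptions) that overlays subsampled ICE curves with the full-data average:

import matplotlib.pyplot as plt
from sklearn.datasets import make_friedman1
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.inspection import PartialDependenceDisplay

X, y = make_friedman1(n_samples=500, random_state=0)
est = GradientBoostingRegressor(n_estimators=50, random_state=0).fit(X, y)

# ICE curves for 50 subsampled rows plus the averaged PD line (kind="both");
# the average itself is still computed on the full dataset.
PartialDependenceDisplay.from_estimator(
    est,
    X,
    features=[0, 1],
    kind="both",
    subsample=50,
    centered=True,        # start every curve at the origin of the y-axis
    random_state=0,
)
plt.show()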
+ + random_state : int, RandomState instance or None, default=None + Controls the randomness of the selected samples when subsamples is not + `None` and `kind` is either `'both'` or `'individual'`. + See :term:`Glossary ` for details. + + Returns + ------- + display : :class:`~sklearn.inspection.PartialDependenceDisplay` + + See Also + -------- + partial_dependence : Compute Partial Dependence values. + + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_friedman1 + >>> from sklearn.ensemble import GradientBoostingRegressor + >>> from sklearn.inspection import PartialDependenceDisplay + >>> X, y = make_friedman1() + >>> clf = GradientBoostingRegressor(n_estimators=10).fit(X, y) + >>> PartialDependenceDisplay.from_estimator(clf, X, [0, (0, 1)]) + <...> + >>> plt.show() + """ + check_matplotlib_support(f"{cls.__name__}.from_estimator") + import matplotlib.pyplot as plt + + # set target_idx for multi-class estimators + if hasattr(estimator, "classes_") and np.size(estimator.classes_) > 2: + if target is None: + raise ValueError("target must be specified for multi-class") + target_idx = np.searchsorted(estimator.classes_, target) + if ( + not (0 <= target_idx < len(estimator.classes_)) + or estimator.classes_[target_idx] != target + ): + raise ValueError("target not in est.classes_, got {}".format(target)) + else: + # regression and binary classification + target_idx = 0 + + # Use check_array only on lists and other non-array-likes / sparse. Do not + # convert DataFrame into a NumPy array. + if not (hasattr(X, "__array__") or sparse.issparse(X)): + X = check_array(X, ensure_all_finite="allow-nan", dtype=object) + n_features = X.shape[1] + + feature_names = _check_feature_names(X, feature_names) + # expand kind to always be a list of str + kind_ = [kind] * len(features) if isinstance(kind, str) else kind + if len(kind_) != len(features): + raise ValueError( + "When `kind` is provided as a list of strings, it should contain " + f"as many elements as `features`. `kind` contains {len(kind_)} " + f"element(s) and `features` contains {len(features)} element(s)." + ) + + # convert features into a seq of int tuples + tmp_features, ice_for_two_way_pd = [], [] + for kind_plot, fxs in zip(kind_, features): + if isinstance(fxs, (numbers.Integral, str)): + fxs = (fxs,) + try: + fxs = tuple( + _get_feature_index(fx, feature_names=feature_names) for fx in fxs + ) + except TypeError as e: + raise ValueError( + "Each entry in features must be either an int, " + "a string, or an iterable of size at most 2." + ) from e + if not 1 <= np.size(fxs) <= 2: + raise ValueError( + "Each entry in features must be either an int, " + "a string, or an iterable of size at most 2." + ) + # store the information if 2-way PD was requested with ICE to later + # raise a ValueError with an exhaustive list of problematic + # settings. + ice_for_two_way_pd.append(kind_plot != "average" and np.size(fxs) > 1) + + tmp_features.append(fxs) + + if any(ice_for_two_way_pd): + # raise an error and be specific regarding the parameter values + # when 1- and 2-way PD were requested + kind_ = [ + "average" if forcing_average else kind_plot + for forcing_average, kind_plot in zip(ice_for_two_way_pd, kind_) + ] + raise ValueError( + "ICE plot cannot be rendered for 2-way feature interactions. " + "2-way feature interactions mandates PD plots using the " + "'average' kind: " + f"features={features!r} should be configured to use " + f"kind={kind_!r} explicitly." 
+ ) + features = tmp_features + + if categorical_features is None: + is_categorical = [ + (False,) if len(fxs) == 1 else (False, False) for fxs in features + ] + else: + # we need to create a boolean indicator of which features are + # categorical from the categorical_features list. + categorical_features = np.asarray(categorical_features) + if categorical_features.dtype.kind == "b": + # categorical features provided as a list of boolean + if categorical_features.size != n_features: + raise ValueError( + "When `categorical_features` is a boolean array-like, " + "the array should be of shape (n_features,). Got " + f"{categorical_features.size} elements while `X` contains " + f"{n_features} features." + ) + is_categorical = [ + tuple(categorical_features[fx] for fx in fxs) for fxs in features + ] + elif categorical_features.dtype.kind in ("i", "O", "U"): + # categorical features provided as a list of indices or feature names + categorical_features_idx = [ + _get_feature_index(cat, feature_names=feature_names) + for cat in categorical_features + ] + is_categorical = [ + tuple([idx in categorical_features_idx for idx in fxs]) + for fxs in features + ] + else: + raise ValueError( + "Expected `categorical_features` to be an array-like of boolean," + f" integer, or string. Got {categorical_features.dtype} instead." + ) + + for cats in is_categorical: + if np.size(cats) == 2 and (cats[0] != cats[1]): + raise ValueError( + "Two-way partial dependence plots are not supported for pairs" + " of continuous and categorical features." + ) + + # collect the indices of the categorical features targeted by the partial + # dependence computation + categorical_features_targeted = set( + [ + fx + for fxs, cats in zip(features, is_categorical) + for fx in fxs + if any(cats) + ] + ) + if categorical_features_targeted: + min_n_cats = min( + [ + len(_unique(_safe_indexing(X, idx, axis=1))) + for idx in categorical_features_targeted + ] + ) + if grid_resolution < min_n_cats: + raise ValueError( + "The resolution of the computed grid is less than the " + "minimum number of categories in the targeted categorical " + "features. Expect the `grid_resolution` to be greater than " + f"{min_n_cats}. Got {grid_resolution} instead." + ) + + for is_cat, kind_plot in zip(is_categorical, kind_): + if any(is_cat) and kind_plot != "average": + raise ValueError( + "It is not possible to display individual effects for" + " categorical features." + ) + + # Early exit if the axes does not have the correct number of axes + if ax is not None and not isinstance(ax, plt.Axes): + axes = np.asarray(ax, dtype=object) + if axes.size != len(features): + raise ValueError( + "Expected ax to have {} axes, got {}".format( + len(features), axes.size + ) + ) + + for i in chain.from_iterable(features): + if i >= len(feature_names): + raise ValueError( + "All entries of features must be less than " + "len(feature_names) = {0}, got {1}.".format(len(feature_names), i) + ) + + if isinstance(subsample, numbers.Integral): + if subsample <= 0: + raise ValueError( + f"When an integer, subsample={subsample} should be positive." + ) + elif isinstance(subsample, numbers.Real): + if subsample <= 0 or subsample >= 1: + raise ValueError( + f"When a floating-point, subsample={subsample} should be in " + "the (0, 1) range." 
+ ) + + # compute predictions and/or averaged predictions + pd_results = Parallel(n_jobs=n_jobs, verbose=verbose)( + delayed(partial_dependence)( + estimator, + X, + fxs, + sample_weight=sample_weight, + feature_names=feature_names, + categorical_features=categorical_features, + response_method=response_method, + method=method, + grid_resolution=grid_resolution, + percentiles=percentiles, + kind=kind_plot, + custom_values=custom_values, + ) + for kind_plot, fxs in zip(kind_, features) + ) + + # For multioutput regression, we can only check the validity of target + # now that we have the predictions. + # Also note: as multiclass-multioutput classifiers are not supported, + # multiclass and multioutput scenario are mutually exclusive. So there is + # no risk of overwriting target_idx here. + pd_result = pd_results[0] # checking the first result is enough + n_tasks = ( + pd_result.average.shape[0] + if kind_[0] == "average" + else pd_result.individual.shape[0] + ) + if is_regressor(estimator) and n_tasks > 1: + if target is None: + raise ValueError("target must be specified for multi-output regressors") + if not 0 <= target <= n_tasks: + raise ValueError( + "target must be in [0, n_tasks], got {}.".format(target) + ) + target_idx = target + + deciles = {} + for fxs, cats in zip(features, is_categorical): + for fx, cat in zip(fxs, cats): + if not cat and fx not in deciles: + X_col = _safe_indexing(X, fx, axis=1) + deciles[fx] = mquantiles(X_col, prob=np.arange(0.1, 1.0, 0.1)) + + display = cls( + pd_results=pd_results, + features=features, + feature_names=feature_names, + target_idx=target_idx, + deciles=deciles, + kind=kind, + subsample=subsample, + random_state=random_state, + is_categorical=is_categorical, + ) + return display.plot( + ax=ax, + n_cols=n_cols, + line_kw=line_kw, + ice_lines_kw=ice_lines_kw, + pd_line_kw=pd_line_kw, + contour_kw=contour_kw, + centered=centered, + ) + + def _get_sample_count(self, n_samples): + """Compute the number of samples as an integer.""" + if isinstance(self.subsample, numbers.Integral): + if self.subsample < n_samples: + return self.subsample + return n_samples + elif isinstance(self.subsample, numbers.Real): + return ceil(n_samples * self.subsample) + return n_samples + + def _plot_ice_lines( + self, + preds, + feature_values, + n_ice_to_plot, + ax, + pd_plot_idx, + n_total_lines_by_plot, + individual_line_kw, + ): + """Plot the ICE lines. + + Parameters + ---------- + preds : ndarray of shape \ + (n_instances, n_grid_points) + The predictions computed for all points of `feature_values` for a + given feature for all samples in `X`. + feature_values : ndarray of shape (n_grid_points,) + The feature values for which the predictions have been computed. + n_ice_to_plot : int + The number of ICE lines to plot. + ax : Matplotlib axes + The axis on which to plot the ICE lines. + pd_plot_idx : int + The sequential index of the plot. It will be unraveled to find the + matching 2D position in the grid layout. + n_total_lines_by_plot : int + The total number of lines expected to be plot on the axis. + individual_line_kw : dict + Dict with keywords passed when plotting the ICE lines. 
+ """ + rng = check_random_state(self.random_state) + # subsample ice + ice_lines_idx = rng.choice( + preds.shape[0], + n_ice_to_plot, + replace=False, + ) + ice_lines_subsampled = preds[ice_lines_idx, :] + # plot the subsampled ice + for ice_idx, ice in enumerate(ice_lines_subsampled): + line_idx = np.unravel_index( + pd_plot_idx * n_total_lines_by_plot + ice_idx, self.lines_.shape + ) + self.lines_[line_idx] = ax.plot( + feature_values, ice.ravel(), **individual_line_kw + )[0] + + def _plot_average_dependence( + self, + avg_preds, + feature_values, + ax, + pd_line_idx, + line_kw, + categorical, + bar_kw, + ): + """Plot the average partial dependence. + + Parameters + ---------- + avg_preds : ndarray of shape (n_grid_points,) + The average predictions for all points of `feature_values` for a + given feature for all samples in `X`. + feature_values : ndarray of shape (n_grid_points,) + The feature values for which the predictions have been computed. + ax : Matplotlib axes + The axis on which to plot the average PD. + pd_line_idx : int + The sequential index of the plot. It will be unraveled to find the + matching 2D position in the grid layout. + line_kw : dict + Dict with keywords passed when plotting the PD plot. + categorical : bool + Whether feature is categorical. + bar_kw: dict + Dict with keywords passed when plotting the PD bars (categorical). + """ + if categorical: + bar_idx = np.unravel_index(pd_line_idx, self.bars_.shape) + self.bars_[bar_idx] = ax.bar(feature_values, avg_preds, **bar_kw)[0] + ax.tick_params(axis="x", rotation=90) + else: + line_idx = np.unravel_index(pd_line_idx, self.lines_.shape) + self.lines_[line_idx] = ax.plot( + feature_values, + avg_preds, + **line_kw, + )[0] + + def _plot_one_way_partial_dependence( + self, + kind, + preds, + avg_preds, + feature_values, + feature_idx, + n_ice_lines, + ax, + n_cols, + pd_plot_idx, + n_lines, + ice_lines_kw, + pd_line_kw, + categorical, + bar_kw, + pdp_lim, + ): + """Plot 1-way partial dependence: ICE and PDP. + + Parameters + ---------- + kind : str + The kind of partial plot to draw. + preds : ndarray of shape \ + (n_instances, n_grid_points) or None + The predictions computed for all points of `feature_values` for a + given feature for all samples in `X`. + avg_preds : ndarray of shape (n_grid_points,) + The average predictions for all points of `feature_values` for a + given feature for all samples in `X`. + feature_values : ndarray of shape (n_grid_points,) + The feature values for which the predictions have been computed. + feature_idx : int + The index corresponding to the target feature. + n_ice_lines : int + The number of ICE lines to plot. + ax : Matplotlib axes + The axis on which to plot the ICE and PDP lines. + n_cols : int or None + The number of column in the axis. + pd_plot_idx : int + The sequential index of the plot. It will be unraveled to find the + matching 2D position in the grid layout. + n_lines : int + The total number of lines expected to be plot on the axis. + ice_lines_kw : dict + Dict with keywords passed when plotting the ICE lines. + pd_line_kw : dict + Dict with keywords passed when plotting the PD plot. + categorical : bool + Whether feature is categorical. + bar_kw: dict + Dict with keywords passed when plotting the PD bars (categorical). + pdp_lim : dict + Global min and max average predictions, such that all plots will + have the same scale and y limits. `pdp_lim[1]` is the global min + and max for single partial dependence curves. 
+ """ + from matplotlib import transforms + + if kind in ("individual", "both"): + self._plot_ice_lines( + preds[self.target_idx], + feature_values, + n_ice_lines, + ax, + pd_plot_idx, + n_lines, + ice_lines_kw, + ) + + if kind in ("average", "both"): + # the average is stored as the last line + if kind == "average": + pd_line_idx = pd_plot_idx + else: + pd_line_idx = pd_plot_idx * n_lines + n_ice_lines + self._plot_average_dependence( + avg_preds[self.target_idx].ravel(), + feature_values, + ax, + pd_line_idx, + pd_line_kw, + categorical, + bar_kw, + ) + + trans = transforms.blended_transform_factory(ax.transData, ax.transAxes) + # create the decile line for the vertical axis + vlines_idx = np.unravel_index(pd_plot_idx, self.deciles_vlines_.shape) + if self.deciles.get(feature_idx[0], None) is not None: + self.deciles_vlines_[vlines_idx] = ax.vlines( + self.deciles[feature_idx[0]], + 0, + 0.05, + transform=trans, + color="k", + ) + # reset ylim which was overwritten by vlines + min_val = min(val[0] for val in pdp_lim.values()) + max_val = max(val[1] for val in pdp_lim.values()) + ax.set_ylim([min_val, max_val]) + + # Set xlabel if it is not already set + if not ax.get_xlabel(): + ax.set_xlabel(self.feature_names[feature_idx[0]]) + + if n_cols is None or pd_plot_idx % n_cols == 0: + if not ax.get_ylabel(): + ax.set_ylabel("Partial dependence") + else: + ax.set_yticklabels([]) + + if pd_line_kw.get("label", None) and kind != "individual" and not categorical: + ax.legend() + + def _plot_two_way_partial_dependence( + self, + avg_preds, + feature_values, + feature_idx, + ax, + pd_plot_idx, + Z_level, + contour_kw, + categorical, + heatmap_kw, + ): + """Plot 2-way partial dependence. + + Parameters + ---------- + avg_preds : ndarray of shape \ + (n_instances, n_grid_points, n_grid_points) + The average predictions for all points of `feature_values[0]` and + `feature_values[1]` for some given features for all samples in `X`. + feature_values : seq of 1d array + A sequence of array of the feature values for which the predictions + have been computed. + feature_idx : tuple of int + The indices of the target features + ax : Matplotlib axes + The axis on which to plot the ICE and PDP lines. + pd_plot_idx : int + The sequential index of the plot. It will be unraveled to find the + matching 2D position in the grid layout. + Z_level : ndarray of shape (8, 8) + The Z-level used to encode the average predictions. + contour_kw : dict + Dict with keywords passed when plotting the contours. + categorical : bool + Whether features are categorical. + heatmap_kw: dict + Dict with keywords passed when plotting the PD heatmap + (categorical). 
+ """ + if categorical: + import matplotlib.pyplot as plt + + default_im_kw = dict(interpolation="nearest", cmap="viridis") + im_kw = {**default_im_kw, **heatmap_kw} + + data = avg_preds[self.target_idx] + im = ax.imshow(data, **im_kw) + text = None + cmap_min, cmap_max = im.cmap(0), im.cmap(1.0) + + text = np.empty_like(data, dtype=object) + # print text with appropriate color depending on background + thresh = (data.max() + data.min()) / 2.0 + + for flat_index in range(data.size): + row, col = np.unravel_index(flat_index, data.shape) + color = cmap_max if data[row, col] < thresh else cmap_min + + values_format = ".2f" + text_data = format(data[row, col], values_format) + + text_kwargs = dict(ha="center", va="center", color=color) + text[row, col] = ax.text(col, row, text_data, **text_kwargs) + + fig = ax.figure + fig.colorbar(im, ax=ax) + ax.set( + xticks=np.arange(len(feature_values[1])), + yticks=np.arange(len(feature_values[0])), + xticklabels=feature_values[1], + yticklabels=feature_values[0], + xlabel=self.feature_names[feature_idx[1]], + ylabel=self.feature_names[feature_idx[0]], + ) + + plt.setp(ax.get_xticklabels(), rotation="vertical") + + heatmap_idx = np.unravel_index(pd_plot_idx, self.heatmaps_.shape) + self.heatmaps_[heatmap_idx] = im + else: + from matplotlib import transforms + + XX, YY = np.meshgrid(feature_values[0], feature_values[1]) + Z = avg_preds[self.target_idx].T + CS = ax.contour(XX, YY, Z, levels=Z_level, linewidths=0.5, colors="k") + contour_idx = np.unravel_index(pd_plot_idx, self.contours_.shape) + self.contours_[contour_idx] = ax.contourf( + XX, + YY, + Z, + levels=Z_level, + vmax=Z_level[-1], + vmin=Z_level[0], + **contour_kw, + ) + ax.clabel(CS, fmt="%2.2f", colors="k", fontsize=10, inline=True) + + trans = transforms.blended_transform_factory(ax.transData, ax.transAxes) + # create the decile line for the vertical axis + xlim, ylim = ax.get_xlim(), ax.get_ylim() + vlines_idx = np.unravel_index(pd_plot_idx, self.deciles_vlines_.shape) + self.deciles_vlines_[vlines_idx] = ax.vlines( + self.deciles[feature_idx[0]], + 0, + 0.05, + transform=trans, + color="k", + ) + # create the decile line for the horizontal axis + hlines_idx = np.unravel_index(pd_plot_idx, self.deciles_hlines_.shape) + self.deciles_hlines_[hlines_idx] = ax.hlines( + self.deciles[feature_idx[1]], + 0, + 0.05, + transform=trans, + color="k", + ) + # reset xlim and ylim since they are overwritten by hlines and + # vlines + ax.set_xlim(xlim) + ax.set_ylim(ylim) + + # set xlabel if it is not already set + if not ax.get_xlabel(): + ax.set_xlabel(self.feature_names[feature_idx[0]]) + ax.set_ylabel(self.feature_names[feature_idx[1]]) + + def plot( + self, + *, + ax=None, + n_cols=3, + line_kw=None, + ice_lines_kw=None, + pd_line_kw=None, + contour_kw=None, + bar_kw=None, + heatmap_kw=None, + pdp_lim=None, + centered=False, + ): + """Plot partial dependence plots. + + Parameters + ---------- + ax : Matplotlib axes or array-like of Matplotlib axes, default=None + - If a single axis is passed in, it is treated as a bounding axes + and a grid of partial dependence plots will be drawn within + these bounds. The `n_cols` parameter controls the number of + columns in the grid. + - If an array-like of axes are passed in, the partial dependence + plots will be drawn directly into these axes. + - If `None`, a figure and a bounding axes is created and treated + as the single axes case. + + n_cols : int, default=3 + The maximum number of columns in the grid plot. 
Only active when + `ax` is a single axes or `None`. + + line_kw : dict, default=None + Dict with keywords passed to the `matplotlib.pyplot.plot` call. + For one-way partial dependence plots. + + ice_lines_kw : dict, default=None + Dictionary with keywords passed to the `matplotlib.pyplot.plot` call. + For ICE lines in the one-way partial dependence plots. + The key value pairs defined in `ice_lines_kw` takes priority over + `line_kw`. + + .. versionadded:: 1.0 + + pd_line_kw : dict, default=None + Dictionary with keywords passed to the `matplotlib.pyplot.plot` call. + For partial dependence in one-way partial dependence plots. + The key value pairs defined in `pd_line_kw` takes priority over + `line_kw`. + + .. versionadded:: 1.0 + + contour_kw : dict, default=None + Dict with keywords passed to the `matplotlib.pyplot.contourf` + call for two-way partial dependence plots. + + bar_kw : dict, default=None + Dict with keywords passed to the `matplotlib.pyplot.bar` + call for one-way categorical partial dependence plots. + + .. versionadded:: 1.2 + + heatmap_kw : dict, default=None + Dict with keywords passed to the `matplotlib.pyplot.imshow` + call for two-way categorical partial dependence plots. + + .. versionadded:: 1.2 + + pdp_lim : dict, default=None + Global min and max average predictions, such that all plots will have the + same scale and y limits. `pdp_lim[1]` is the global min and max for single + partial dependence curves. `pdp_lim[2]` is the global min and max for + two-way partial dependence curves. If `None` (default), the limit will be + inferred from the global minimum and maximum of all predictions. + + .. versionadded:: 1.1 + + centered : bool, default=False + If `True`, the ICE and PD lines will start at the origin of the + y-axis. By default, no centering is done. + + .. versionadded:: 1.1 + + Returns + ------- + display : :class:`~sklearn.inspection.PartialDependenceDisplay` + Returns a :class:`~sklearn.inspection.PartialDependenceDisplay` + object that contains the partial dependence plots. + """ + + check_matplotlib_support("plot_partial_dependence") + import matplotlib.pyplot as plt + from matplotlib.gridspec import GridSpecFromSubplotSpec + + if isinstance(self.kind, str): + kind = [self.kind] * len(self.features) + else: + kind = self.kind + + if self.is_categorical is None: + is_categorical = [ + (False,) if len(fx) == 1 else (False, False) for fx in self.features + ] + else: + is_categorical = self.is_categorical + + if len(kind) != len(self.features): + raise ValueError( + "When `kind` is provided as a list of strings, it should " + "contain as many elements as `features`. `kind` contains " + f"{len(kind)} element(s) and `features` contains " + f"{len(self.features)} element(s)." + ) + + valid_kinds = {"average", "individual", "both"} + if any([k not in valid_kinds for k in kind]): + raise ValueError( + f"Values provided to `kind` must be one of: {valid_kinds!r} or a list" + f" of such values. 
Currently, kind={self.kind!r}" + ) + + # Center results before plotting + if not centered: + pd_results_ = self.pd_results + else: + pd_results_ = [] + for kind_plot, pd_result in zip(kind, self.pd_results): + current_results = {"grid_values": pd_result["grid_values"]} + + if kind_plot in ("individual", "both"): + preds = pd_result.individual + preds = preds - preds[self.target_idx, :, 0, None] + current_results["individual"] = preds + + if kind_plot in ("average", "both"): + avg_preds = pd_result.average + avg_preds = avg_preds - avg_preds[self.target_idx, 0, None] + current_results["average"] = avg_preds + + pd_results_.append(Bunch(**current_results)) + + if pdp_lim is None: + # get global min and max average predictions of PD grouped by plot type + pdp_lim = {} + for kind_plot, pdp in zip(kind, pd_results_): + values = pdp["grid_values"] + preds = pdp.average if kind_plot == "average" else pdp.individual + min_pd = preds[self.target_idx].min() + max_pd = preds[self.target_idx].max() + + # expand the limits to account so that the plotted lines do not touch + # the edges of the plot + span = max_pd - min_pd + min_pd -= 0.05 * span + max_pd += 0.05 * span + + n_fx = len(values) + old_min_pd, old_max_pd = pdp_lim.get(n_fx, (min_pd, max_pd)) + min_pd = min(min_pd, old_min_pd) + max_pd = max(max_pd, old_max_pd) + pdp_lim[n_fx] = (min_pd, max_pd) + + if line_kw is None: + line_kw = {} + if ice_lines_kw is None: + ice_lines_kw = {} + if pd_line_kw is None: + pd_line_kw = {} + if bar_kw is None: + bar_kw = {} + if heatmap_kw is None: + heatmap_kw = {} + + if ax is None: + _, ax = plt.subplots() + + if contour_kw is None: + contour_kw = {} + default_contour_kws = {"alpha": 0.75} + contour_kw = _validate_style_kwargs(default_contour_kws, contour_kw) + + n_features = len(self.features) + is_average_plot = [kind_plot == "average" for kind_plot in kind] + if all(is_average_plot): + # only average plots are requested + n_ice_lines = 0 + n_lines = 1 + else: + # we need to determine the number of ICE samples computed + ice_plot_idx = is_average_plot.index(False) + n_ice_lines = self._get_sample_count( + len(pd_results_[ice_plot_idx].individual[0]) + ) + if any([kind_plot == "both" for kind_plot in kind]): + n_lines = n_ice_lines + 1 # account for the average line + else: + n_lines = n_ice_lines + + if isinstance(ax, plt.Axes): + # If ax was set off, it has most likely been set to off + # by a previous call to plot. 
+ if not ax.axison: + raise ValueError( + "The ax was already used in another plot " + "function, please set ax=display.axes_ " + "instead" + ) + + ax.set_axis_off() + self.bounding_ax_ = ax + self.figure_ = ax.figure + + n_cols = min(n_cols, n_features) + n_rows = int(np.ceil(n_features / float(n_cols))) + + self.axes_ = np.empty((n_rows, n_cols), dtype=object) + if all(is_average_plot): + self.lines_ = np.empty((n_rows, n_cols), dtype=object) + else: + self.lines_ = np.empty((n_rows, n_cols, n_lines), dtype=object) + self.contours_ = np.empty((n_rows, n_cols), dtype=object) + self.bars_ = np.empty((n_rows, n_cols), dtype=object) + self.heatmaps_ = np.empty((n_rows, n_cols), dtype=object) + + axes_ravel = self.axes_.ravel() + + gs = GridSpecFromSubplotSpec( + n_rows, n_cols, subplot_spec=ax.get_subplotspec() + ) + for i, spec in zip(range(n_features), gs): + axes_ravel[i] = self.figure_.add_subplot(spec) + + else: # array-like + ax = np.asarray(ax, dtype=object) + if ax.size != n_features: + raise ValueError( + "Expected ax to have {} axes, got {}".format(n_features, ax.size) + ) + + if ax.ndim == 2: + n_cols = ax.shape[1] + else: + n_cols = None + + self.bounding_ax_ = None + self.figure_ = ax.ravel()[0].figure + self.axes_ = ax + if all(is_average_plot): + self.lines_ = np.empty_like(ax, dtype=object) + else: + self.lines_ = np.empty(ax.shape + (n_lines,), dtype=object) + self.contours_ = np.empty_like(ax, dtype=object) + self.bars_ = np.empty_like(ax, dtype=object) + self.heatmaps_ = np.empty_like(ax, dtype=object) + + # create contour levels for two-way plots + if 2 in pdp_lim: + Z_level = np.linspace(*pdp_lim[2], num=8) + + self.deciles_vlines_ = np.empty_like(self.axes_, dtype=object) + self.deciles_hlines_ = np.empty_like(self.axes_, dtype=object) + + for pd_plot_idx, (axi, feature_idx, cat, pd_result, kind_plot) in enumerate( + zip( + self.axes_.ravel(), + self.features, + is_categorical, + pd_results_, + kind, + ) + ): + avg_preds = None + preds = None + feature_values = pd_result["grid_values"] + if kind_plot == "individual": + preds = pd_result.individual + elif kind_plot == "average": + avg_preds = pd_result.average + else: # kind_plot == 'both' + avg_preds = pd_result.average + preds = pd_result.individual + + if len(feature_values) == 1: + # define the line-style for the current plot + default_line_kws = { + "color": "C0", + "label": "average" if kind_plot == "both" else None, + } + if kind_plot == "individual": + default_ice_lines_kws = {"alpha": 0.3, "linewidth": 0.5} + default_pd_lines_kws = {} + elif kind_plot == "both": + # by default, we need to distinguish the average line from + # the individual lines via color and line style + default_ice_lines_kws = { + "alpha": 0.3, + "linewidth": 0.5, + "color": "tab:blue", + } + default_pd_lines_kws = { + "color": "tab:orange", + "linestyle": "--", + } + else: + default_ice_lines_kws = {} + default_pd_lines_kws = {} + + default_ice_lines_kws = {**default_line_kws, **default_ice_lines_kws} + default_pd_lines_kws = {**default_line_kws, **default_pd_lines_kws} + + line_kw = _validate_style_kwargs(default_line_kws, line_kw) + + ice_lines_kw = _validate_style_kwargs( + _validate_style_kwargs(default_ice_lines_kws, line_kw), ice_lines_kw + ) + del ice_lines_kw["label"] + + pd_line_kw = _validate_style_kwargs( + _validate_style_kwargs(default_pd_lines_kws, line_kw), pd_line_kw + ) + + default_bar_kws = {"color": "C0"} + bar_kw = _validate_style_kwargs(default_bar_kws, bar_kw) + + default_heatmap_kw = {} + heatmap_kw = 
_validate_style_kwargs(default_heatmap_kw, heatmap_kw) + + self._plot_one_way_partial_dependence( + kind_plot, + preds, + avg_preds, + feature_values[0], + feature_idx, + n_ice_lines, + axi, + n_cols, + pd_plot_idx, + n_lines, + ice_lines_kw, + pd_line_kw, + cat[0], + bar_kw, + pdp_lim, + ) + else: + self._plot_two_way_partial_dependence( + avg_preds, + feature_values, + feature_idx, + axi, + pd_plot_idx, + Z_level, + contour_kw, + cat[0] and cat[1], + heatmap_kw, + ) + + return self diff --git a/.venv/lib/python3.12/site-packages/sklearn/inspection/_plot/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/inspection/_plot/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/inspection/_plot/tests/test_boundary_decision_display.py b/.venv/lib/python3.12/site-packages/sklearn/inspection/_plot/tests/test_boundary_decision_display.py new file mode 100644 index 0000000000000000000000000000000000000000..f409a50ab58c0865c17082f95122247bb0d5344d --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/inspection/_plot/tests/test_boundary_decision_display.py @@ -0,0 +1,710 @@ +import warnings + +import numpy as np +import pytest + +from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.datasets import ( + load_diabetes, + load_iris, + make_classification, + make_multilabel_classification, +) +from sklearn.ensemble import IsolationForest +from sklearn.inspection import DecisionBoundaryDisplay +from sklearn.inspection._plot.decision_boundary import _check_boundary_response_method +from sklearn.linear_model import LogisticRegression +from sklearn.preprocessing import scale +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor +from sklearn.utils._testing import ( + _convert_container, + assert_allclose, + assert_array_equal, +) +from sklearn.utils.fixes import parse_version + +X, y = make_classification( + n_informative=1, + n_redundant=1, + n_clusters_per_class=1, + n_features=2, + random_state=42, +) + + +def load_iris_2d_scaled(): + X, y = load_iris(return_X_y=True) + X = scale(X)[:, :2] + return X, y + + +@pytest.fixture(scope="module") +def fitted_clf(): + return LogisticRegression().fit(X, y) + + +def test_input_data_dimension(pyplot): + """Check that we raise an error when `X` does not have exactly 2 features.""" + X, y = make_classification(n_samples=10, n_features=4, random_state=0) + + clf = LogisticRegression().fit(X, y) + msg = "n_features must be equal to 2. Got 4 instead." + with pytest.raises(ValueError, match=msg): + DecisionBoundaryDisplay.from_estimator(estimator=clf, X=X) + + +def test_check_boundary_response_method_error(): + """Check error raised for multi-output multi-class classifiers by + `_check_boundary_response_method`. 
+ """ + + class MultiLabelClassifier: + classes_ = [np.array([0, 1]), np.array([0, 1])] + + err_msg = "Multi-label and multi-output multi-class classifiers are not supported" + with pytest.raises(ValueError, match=err_msg): + _check_boundary_response_method(MultiLabelClassifier(), "predict", None) + + +@pytest.mark.parametrize( + "estimator, response_method, class_of_interest, expected_prediction_method", + [ + (DecisionTreeRegressor(), "predict", None, "predict"), + (DecisionTreeRegressor(), "auto", None, "predict"), + (LogisticRegression().fit(*load_iris_2d_scaled()), "predict", None, "predict"), + ( + LogisticRegression().fit(*load_iris_2d_scaled()), + "auto", + None, + ["decision_function", "predict_proba", "predict"], + ), + ( + LogisticRegression().fit(*load_iris_2d_scaled()), + "predict_proba", + 0, + "predict_proba", + ), + ( + LogisticRegression().fit(*load_iris_2d_scaled()), + "decision_function", + 0, + "decision_function", + ), + ( + LogisticRegression().fit(X, y), + "auto", + None, + ["decision_function", "predict_proba", "predict"], + ), + (LogisticRegression().fit(X, y), "predict", None, "predict"), + ( + LogisticRegression().fit(X, y), + ["predict_proba", "decision_function"], + None, + ["predict_proba", "decision_function"], + ), + ], +) +def test_check_boundary_response_method( + estimator, response_method, class_of_interest, expected_prediction_method +): + """Check the behaviour of `_check_boundary_response_method` for the supported + cases. + """ + prediction_method = _check_boundary_response_method( + estimator, response_method, class_of_interest + ) + assert prediction_method == expected_prediction_method + + +def test_multiclass_predict(pyplot): + """Check multiclass `response=predict` gives expected results.""" + grid_resolution = 10 + eps = 1.0 + X, y = make_classification(n_classes=3, n_informative=3, random_state=0) + X = X[:, [0, 1]] + lr = LogisticRegression(random_state=0).fit(X, y) + + disp = DecisionBoundaryDisplay.from_estimator( + lr, X, response_method="predict", grid_resolution=grid_resolution, eps=1.0 + ) + + x0_min, x0_max = X[:, 0].min() - eps, X[:, 0].max() + eps + x1_min, x1_max = X[:, 1].min() - eps, X[:, 1].max() + eps + xx0, xx1 = np.meshgrid( + np.linspace(x0_min, x0_max, grid_resolution), + np.linspace(x1_min, x1_max, grid_resolution), + ) + response = lr.predict(np.c_[xx0.ravel(), xx1.ravel()]) + assert_allclose(disp.response, response.reshape(xx0.shape)) + assert_allclose(disp.xx0, xx0) + assert_allclose(disp.xx1, xx1) + + +@pytest.mark.parametrize( + "kwargs, error_msg", + [ + ( + {"plot_method": "hello_world"}, + r"plot_method must be one of contourf, contour, pcolormesh. Got hello_world" + r" instead.", + ), + ( + {"grid_resolution": 1}, + r"grid_resolution must be greater than 1. Got 1 instead", + ), + ( + {"grid_resolution": -1}, + r"grid_resolution must be greater than 1. Got -1 instead", + ), + ({"eps": -1.1}, r"eps must be greater than or equal to 0. 
Got -1.1 instead"), + ], +) +def test_input_validation_errors(pyplot, kwargs, error_msg, fitted_clf): + """Check input validation from_estimator.""" + with pytest.raises(ValueError, match=error_msg): + DecisionBoundaryDisplay.from_estimator(fitted_clf, X, **kwargs) + + +@pytest.mark.parametrize( + "kwargs, error_msg", + [ + ( + {"multiclass_colors": {"dict": "not_list"}}, + "'multiclass_colors' must be a list or a str.", + ), + ({"multiclass_colors": "not_cmap"}, "it must be a valid Matplotlib colormap"), + ({"multiclass_colors": ["red", "green"]}, "it must be of the same length"), + ( + {"multiclass_colors": ["red", "green", "not color"]}, + "it can only contain valid Matplotlib color names", + ), + ], +) +def test_input_validation_errors_multiclass_colors(pyplot, kwargs, error_msg): + """Check input validation for `multiclass_colors` in `from_estimator`.""" + X, y = load_iris_2d_scaled() + clf = LogisticRegression().fit(X, y) + with pytest.raises(ValueError, match=error_msg): + DecisionBoundaryDisplay.from_estimator(clf, X, **kwargs) + + +def test_display_plot_input_error(pyplot, fitted_clf): + """Check input validation for `plot`.""" + disp = DecisionBoundaryDisplay.from_estimator(fitted_clf, X, grid_resolution=5) + + with pytest.raises(ValueError, match="plot_method must be 'contourf'"): + disp.plot(plot_method="hello_world") + + +@pytest.mark.parametrize( + "response_method", ["auto", "predict", "predict_proba", "decision_function"] +) +@pytest.mark.parametrize("plot_method", ["contourf", "contour"]) +def test_decision_boundary_display_classifier( + pyplot, fitted_clf, response_method, plot_method +): + """Check that decision boundary is correct.""" + fig, ax = pyplot.subplots() + eps = 2.0 + disp = DecisionBoundaryDisplay.from_estimator( + fitted_clf, + X, + grid_resolution=5, + response_method=response_method, + plot_method=plot_method, + eps=eps, + ax=ax, + ) + assert isinstance(disp.surface_, pyplot.matplotlib.contour.QuadContourSet) + assert disp.ax_ == ax + assert disp.figure_ == fig + + x0, x1 = X[:, 0], X[:, 1] + + x0_min, x0_max = x0.min() - eps, x0.max() + eps + x1_min, x1_max = x1.min() - eps, x1.max() + eps + + assert disp.xx0.min() == pytest.approx(x0_min) + assert disp.xx0.max() == pytest.approx(x0_max) + assert disp.xx1.min() == pytest.approx(x1_min) + assert disp.xx1.max() == pytest.approx(x1_max) + + fig2, ax2 = pyplot.subplots() + # change plotting method for second plot + disp.plot(plot_method="pcolormesh", ax=ax2, shading="auto") + assert isinstance(disp.surface_, pyplot.matplotlib.collections.QuadMesh) + assert disp.ax_ == ax2 + assert disp.figure_ == fig2 + + +@pytest.mark.parametrize("response_method", ["auto", "predict", "decision_function"]) +@pytest.mark.parametrize("plot_method", ["contourf", "contour"]) +def test_decision_boundary_display_outlier_detector( + pyplot, response_method, plot_method +): + """Check that decision boundary is correct for outlier detector.""" + fig, ax = pyplot.subplots() + eps = 2.0 + outlier_detector = IsolationForest(random_state=0).fit(X, y) + disp = DecisionBoundaryDisplay.from_estimator( + outlier_detector, + X, + grid_resolution=5, + response_method=response_method, + plot_method=plot_method, + eps=eps, + ax=ax, + ) + assert isinstance(disp.surface_, pyplot.matplotlib.contour.QuadContourSet) + assert disp.ax_ == ax + assert disp.figure_ == fig + + x0, x1 = X[:, 0], X[:, 1] + + x0_min, x0_max = x0.min() - eps, x0.max() + eps + x1_min, x1_max = x1.min() - eps, x1.max() + eps + + assert disp.xx0.min() == pytest.approx(x0_min) + 
assert disp.xx0.max() == pytest.approx(x0_max) + assert disp.xx1.min() == pytest.approx(x1_min) + assert disp.xx1.max() == pytest.approx(x1_max) + + +@pytest.mark.parametrize("response_method", ["auto", "predict"]) +@pytest.mark.parametrize("plot_method", ["contourf", "contour"]) +def test_decision_boundary_display_regressor(pyplot, response_method, plot_method): + """Check that we can display the decision boundary for a regressor.""" + X, y = load_diabetes(return_X_y=True) + X = X[:, :2] + tree = DecisionTreeRegressor().fit(X, y) + fig, ax = pyplot.subplots() + eps = 2.0 + disp = DecisionBoundaryDisplay.from_estimator( + tree, + X, + response_method=response_method, + ax=ax, + eps=eps, + plot_method=plot_method, + ) + assert isinstance(disp.surface_, pyplot.matplotlib.contour.QuadContourSet) + assert disp.ax_ == ax + assert disp.figure_ == fig + + x0, x1 = X[:, 0], X[:, 1] + + x0_min, x0_max = x0.min() - eps, x0.max() + eps + x1_min, x1_max = x1.min() - eps, x1.max() + eps + + assert disp.xx0.min() == pytest.approx(x0_min) + assert disp.xx0.max() == pytest.approx(x0_max) + assert disp.xx1.min() == pytest.approx(x1_min) + assert disp.xx1.max() == pytest.approx(x1_max) + + fig2, ax2 = pyplot.subplots() + # change plotting method for second plot + disp.plot(plot_method="pcolormesh", ax=ax2, shading="auto") + assert isinstance(disp.surface_, pyplot.matplotlib.collections.QuadMesh) + assert disp.ax_ == ax2 + assert disp.figure_ == fig2 + + +@pytest.mark.parametrize( + "response_method, msg", + [ + ( + "predict_proba", + "MyClassifier has none of the following attributes: predict_proba", + ), + ( + "decision_function", + "MyClassifier has none of the following attributes: decision_function", + ), + ( + "auto", + ( + "MyClassifier has none of the following attributes: decision_function, " + "predict_proba, predict" + ), + ), + ( + "bad_method", + "MyClassifier has none of the following attributes: bad_method", + ), + ], +) +def test_error_bad_response(pyplot, response_method, msg): + """Check errors for bad response.""" + + class MyClassifier(ClassifierMixin, BaseEstimator): + def fit(self, X, y): + self.fitted_ = True + self.classes_ = [0, 1] + return self + + clf = MyClassifier().fit(X, y) + + with pytest.raises(AttributeError, match=msg): + DecisionBoundaryDisplay.from_estimator(clf, X, response_method=response_method) + + +@pytest.mark.parametrize("response_method", ["auto", "predict", "predict_proba"]) +def test_multilabel_classifier_error(pyplot, response_method): + """Check that multilabel classifier raises correct error.""" + X, y = make_multilabel_classification(random_state=0) + X = X[:, :2] + tree = DecisionTreeClassifier().fit(X, y) + + msg = "Multi-label and multi-output multi-class classifiers are not supported" + with pytest.raises(ValueError, match=msg): + DecisionBoundaryDisplay.from_estimator( + tree, + X, + response_method=response_method, + ) + + +@pytest.mark.parametrize("response_method", ["auto", "predict", "predict_proba"]) +def test_multi_output_multi_class_classifier_error(pyplot, response_method): + """Check that multi-output multi-class classifier raises correct error.""" + X = np.asarray([[0, 1], [1, 2]]) + y = np.asarray([["tree", "cat"], ["cat", "tree"]]) + tree = DecisionTreeClassifier().fit(X, y) + + msg = "Multi-label and multi-output multi-class classifiers are not supported" + with pytest.raises(ValueError, match=msg): + DecisionBoundaryDisplay.from_estimator( + tree, + X, + response_method=response_method, + ) + + +def 
test_multioutput_regressor_error(pyplot): + """Check that multioutput regressor raises correct error.""" + X = np.asarray([[0, 1], [1, 2]]) + y = np.asarray([[0, 1], [4, 1]]) + tree = DecisionTreeRegressor().fit(X, y) + with pytest.raises(ValueError, match="Multi-output regressors are not supported"): + DecisionBoundaryDisplay.from_estimator(tree, X, response_method="predict") + + +@pytest.mark.parametrize( + "response_method", + ["predict_proba", "decision_function", ["predict_proba", "predict"]], +) +def test_regressor_unsupported_response(pyplot, response_method): + """Check that we can display the decision boundary for a regressor.""" + X, y = load_diabetes(return_X_y=True) + X = X[:, :2] + tree = DecisionTreeRegressor().fit(X, y) + err_msg = "should either be a classifier to be used with response_method" + with pytest.raises(ValueError, match=err_msg): + DecisionBoundaryDisplay.from_estimator(tree, X, response_method=response_method) + + +@pytest.mark.filterwarnings( + # We expect to raise the following warning because the classifier is fit on a + # NumPy array + "ignore:X has feature names, but LogisticRegression was fitted without" +) +def test_dataframe_labels_used(pyplot, fitted_clf): + """Check that column names are used for pandas.""" + pd = pytest.importorskip("pandas") + df = pd.DataFrame(X, columns=["col_x", "col_y"]) + + # pandas column names are used by default + _, ax = pyplot.subplots() + disp = DecisionBoundaryDisplay.from_estimator(fitted_clf, df, ax=ax) + assert ax.get_xlabel() == "col_x" + assert ax.get_ylabel() == "col_y" + + # second call to plot will have the names + fig, ax = pyplot.subplots() + disp.plot(ax=ax) + assert ax.get_xlabel() == "col_x" + assert ax.get_ylabel() == "col_y" + + # axes with a label will not get overridden + fig, ax = pyplot.subplots() + ax.set(xlabel="hello", ylabel="world") + disp.plot(ax=ax) + assert ax.get_xlabel() == "hello" + assert ax.get_ylabel() == "world" + + # labels get overridden only if provided to the `plot` method + disp.plot(ax=ax, xlabel="overwritten_x", ylabel="overwritten_y") + assert ax.get_xlabel() == "overwritten_x" + assert ax.get_ylabel() == "overwritten_y" + + # labels do not get inferred if provided to `from_estimator` + _, ax = pyplot.subplots() + disp = DecisionBoundaryDisplay.from_estimator( + fitted_clf, df, ax=ax, xlabel="overwritten_x", ylabel="overwritten_y" + ) + assert ax.get_xlabel() == "overwritten_x" + assert ax.get_ylabel() == "overwritten_y" + + +def test_string_target(pyplot): + """Check that decision boundary works with classifiers trained on string labels.""" + iris = load_iris() + X = iris.data[:, [0, 1]] + + # Use strings as target + y = iris.target_names[iris.target] + log_reg = LogisticRegression().fit(X, y) + + # Does not raise + DecisionBoundaryDisplay.from_estimator( + log_reg, + X, + grid_resolution=5, + response_method="predict", + ) + + +@pytest.mark.parametrize("constructor_name", ["pandas", "polars"]) +def test_dataframe_support(pyplot, constructor_name): + """Check that passing a dataframe at fit and to the Display does not + raise warnings. 
+ + Non-regression test for: + * https://github.com/scikit-learn/scikit-learn/issues/23311 + * https://github.com/scikit-learn/scikit-learn/issues/28717 + """ + df = _convert_container( + X, constructor_name=constructor_name, columns_name=["col_x", "col_y"] + ) + estimator = LogisticRegression().fit(df, y) + + with warnings.catch_warnings(): + # no warnings linked to feature names validation should be raised + warnings.simplefilter("error", UserWarning) + DecisionBoundaryDisplay.from_estimator(estimator, df, response_method="predict") + + +@pytest.mark.parametrize("response_method", ["predict_proba", "decision_function"]) +def test_class_of_interest_binary(pyplot, response_method): + """Check the behaviour of passing `class_of_interest` for plotting the output of + `predict_proba` and `decision_function` in the binary case. + """ + iris = load_iris() + X = iris.data[:100, :2] + y = iris.target[:100] + assert_array_equal(np.unique(y), [0, 1]) + + estimator = LogisticRegression().fit(X, y) + # We will check that `class_of_interest=None` is equivalent to + # `class_of_interest=estimator.classes_[1]` + disp_default = DecisionBoundaryDisplay.from_estimator( + estimator, + X, + response_method=response_method, + class_of_interest=None, + ) + disp_class_1 = DecisionBoundaryDisplay.from_estimator( + estimator, + X, + response_method=response_method, + class_of_interest=estimator.classes_[1], + ) + + assert_allclose(disp_default.response, disp_class_1.response) + + # we can check that `_get_response_values` modifies the response when targeting + # the other class, i.e. 1 - p(y=1|x) for `predict_proba` and -decision_function + # for `decision_function`. + disp_class_0 = DecisionBoundaryDisplay.from_estimator( + estimator, + X, + response_method=response_method, + class_of_interest=estimator.classes_[0], + ) + + if response_method == "predict_proba": + assert_allclose(disp_default.response, 1 - disp_class_0.response) + else: + assert response_method == "decision_function" + assert_allclose(disp_default.response, -disp_class_0.response) + + +@pytest.mark.parametrize("response_method", ["predict_proba", "decision_function"]) +def test_class_of_interest_multiclass(pyplot, response_method): + """Check the behaviour of passing `class_of_interest` for plotting the output of + `predict_proba` and `decision_function` in the multiclass case. 
+ """ + iris = load_iris() + X = iris.data[:, :2] + y = iris.target # the target are numerical labels + class_of_interest_idx = 2 + + estimator = LogisticRegression().fit(X, y) + disp = DecisionBoundaryDisplay.from_estimator( + estimator, + X, + response_method=response_method, + class_of_interest=class_of_interest_idx, + ) + + # we will check that we plot the expected values as response + grid = np.concatenate([disp.xx0.reshape(-1, 1), disp.xx1.reshape(-1, 1)], axis=1) + response = getattr(estimator, response_method)(grid)[:, class_of_interest_idx] + assert_allclose(response.reshape(*disp.response.shape), disp.response) + + # make the same test but this time using target as strings + y = iris.target_names[iris.target] + estimator = LogisticRegression().fit(X, y) + + disp = DecisionBoundaryDisplay.from_estimator( + estimator, + X, + response_method=response_method, + class_of_interest=iris.target_names[class_of_interest_idx], + ) + + grid = np.concatenate([disp.xx0.reshape(-1, 1), disp.xx1.reshape(-1, 1)], axis=1) + response = getattr(estimator, response_method)(grid)[:, class_of_interest_idx] + assert_allclose(response.reshape(*disp.response.shape), disp.response) + + # check that we raise an error for unknown labels + # this test should already be handled in `_get_response_values` but we can have this + # test here as well + err_msg = "class_of_interest=2 is not a valid label: It should be one of" + with pytest.raises(ValueError, match=err_msg): + DecisionBoundaryDisplay.from_estimator( + estimator, + X, + response_method=response_method, + class_of_interest=class_of_interest_idx, + ) + + +@pytest.mark.parametrize("response_method", ["predict_proba", "decision_function"]) +def test_multiclass_plot_max_class(pyplot, response_method): + """Check plot correct when plotting max multiclass class.""" + import matplotlib as mpl + + # In matplotlib < v3.5, default value of `pcolormesh(shading)` is 'flat', which + # results in the last row and column being dropped. Thus older versions produce + # a 99x99 grid, while newer versions produce a 100x100 grid. + if parse_version(mpl.__version__) < parse_version("3.5"): + pytest.skip("`pcolormesh` in Matplotlib >= 3.5 gives smaller grid size.") + + X, y = load_iris_2d_scaled() + clf = LogisticRegression().fit(X, y) + + disp = DecisionBoundaryDisplay.from_estimator( + clf, + X, + plot_method="pcolormesh", + response_method=response_method, + ) + + grid = np.concatenate([disp.xx0.reshape(-1, 1), disp.xx1.reshape(-1, 1)], axis=1) + response = getattr(clf, response_method)(grid).reshape(*disp.response.shape) + assert_allclose(response, disp.response) + + assert len(disp.surface_) == len(clf.classes_) + # Get which class has highest response and check it is plotted + highest_class = np.argmax(response, axis=2) + for idx, quadmesh in enumerate(disp.surface_): + # Note quadmesh mask is True (i.e. 
masked) when `idx` is NOT the highest class + assert_array_equal( + highest_class != idx, + quadmesh.get_array().mask.reshape(*highest_class.shape), + ) + + +@pytest.mark.parametrize( + "multiclass_colors", + [ + "plasma", + "Blues", + ["red", "green", "blue"], + ], +) +@pytest.mark.parametrize("plot_method", ["contourf", "contour", "pcolormesh"]) +def test_multiclass_colors_cmap(pyplot, plot_method, multiclass_colors): + """Check correct cmap used for all `multiclass_colors` inputs.""" + import matplotlib as mpl + + if parse_version(mpl.__version__) < parse_version("3.5"): + pytest.skip( + "Matplotlib >= 3.5 is needed for `==` to check equivalence of colormaps" + ) + + X, y = load_iris_2d_scaled() + clf = LogisticRegression().fit(X, y) + + disp = DecisionBoundaryDisplay.from_estimator( + clf, + X, + plot_method=plot_method, + multiclass_colors=multiclass_colors, + ) + + if multiclass_colors == "plasma": + colors = mpl.pyplot.get_cmap(multiclass_colors, len(clf.classes_)).colors + elif multiclass_colors == "Blues": + cmap = mpl.pyplot.get_cmap(multiclass_colors, len(clf.classes_)) + colors = cmap(np.linspace(0, 1, len(clf.classes_))) + else: + colors = [mpl.colors.to_rgba(color) for color in multiclass_colors] + + if plot_method != "contour": + cmaps = [ + mpl.colors.LinearSegmentedColormap.from_list( + f"colormap_{class_idx}", [(1.0, 1.0, 1.0, 1.0), (r, g, b, 1.0)] + ) + for class_idx, (r, g, b, _) in enumerate(colors) + ] + for idx, quad in enumerate(disp.surface_): + assert quad.cmap == cmaps[idx] + else: + assert_allclose(disp.surface_.colors, colors) + + +def test_cmap_and_colors_logic(pyplot): + """Check the handling logic for `cmap` and `colors`.""" + X, y = load_iris_2d_scaled() + clf = LogisticRegression().fit(X, y) + + with pytest.warns( + UserWarning, + match="'cmap' is ignored in favor of 'multiclass_colors'", + ): + DecisionBoundaryDisplay.from_estimator( + clf, + X, + multiclass_colors="plasma", + cmap="Blues", + ) + + with pytest.warns( + UserWarning, + match="'colors' is ignored in favor of 'multiclass_colors'", + ): + DecisionBoundaryDisplay.from_estimator( + clf, + X, + multiclass_colors="plasma", + colors="blue", + ) + + +def test_subclass_named_constructors_return_type_is_subclass(pyplot): + """Check that named constructors return the correct type when subclassed. 
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/pull/27675 + """ + clf = LogisticRegression().fit(X, y) + + class SubclassOfDisplay(DecisionBoundaryDisplay): + pass + + curve = SubclassOfDisplay.from_estimator(estimator=clf, X=X) + + assert isinstance(curve, SubclassOfDisplay) diff --git a/.venv/lib/python3.12/site-packages/sklearn/inspection/_plot/tests/test_plot_partial_dependence.py b/.venv/lib/python3.12/site-packages/sklearn/inspection/_plot/tests/test_plot_partial_dependence.py new file mode 100644 index 0000000000000000000000000000000000000000..75869079be9cc4fd2113a5186960e7acbc3722d4 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/inspection/_plot/tests/test_plot_partial_dependence.py @@ -0,0 +1,1315 @@ +import numpy as np +import pytest +from numpy.testing import assert_allclose +from scipy.stats.mstats import mquantiles + +from sklearn.compose import make_column_transformer +from sklearn.datasets import ( + load_diabetes, + load_iris, + make_classification, + make_regression, +) +from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor +from sklearn.inspection import PartialDependenceDisplay +from sklearn.linear_model import LinearRegression +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import OneHotEncoder +from sklearn.utils._testing import _convert_container + + +@pytest.fixture(scope="module") +def diabetes(): + # diabetes dataset, subsampled for speed + data = load_diabetes() + data.data = data.data[:50] + data.target = data.target[:50] + return data + + +@pytest.fixture(scope="module") +def clf_diabetes(diabetes): + clf = GradientBoostingRegressor(n_estimators=10, random_state=1) + clf.fit(diabetes.data, diabetes.target) + return clf + + +def custom_values_helper(feature, grid_resolution): + return np.linspace( + *mquantiles(feature, (0.05, 0.95), axis=0), num=grid_resolution, endpoint=True + ) + + +@pytest.mark.filterwarnings("ignore:A Bunch will be returned") +@pytest.mark.parametrize("grid_resolution", [10, 20]) +@pytest.mark.parametrize("use_custom_values", [True, False]) +def test_plot_partial_dependence( + use_custom_values, + grid_resolution, + pyplot, + clf_diabetes, + diabetes, +): + # Test partial dependence plot function. 
+ # Use columns 0 & 2 as 1 is not quantitative (sex) + feature_names = diabetes.feature_names + custom_values = None + if use_custom_values: + custom_values = { + 0: custom_values_helper(diabetes.data[:, 0], grid_resolution), + 2: custom_values_helper(diabetes.data[:, 2], grid_resolution), + } + disp = PartialDependenceDisplay.from_estimator( + clf_diabetes, + diabetes.data, + [0, 2, (0, 2)], + grid_resolution=grid_resolution, + feature_names=feature_names, + contour_kw={"cmap": "jet"}, + custom_values=custom_values, + ) + fig = pyplot.gcf() + axs = fig.get_axes() + assert disp.figure_ is fig + assert len(axs) == 4 + + assert disp.bounding_ax_ is not None + assert disp.axes_.shape == (1, 3) + assert disp.lines_.shape == (1, 3) + assert disp.contours_.shape == (1, 3) + assert disp.deciles_vlines_.shape == (1, 3) + assert disp.deciles_hlines_.shape == (1, 3) + + assert disp.lines_[0, 2] is None + assert disp.contours_[0, 0] is None + assert disp.contours_[0, 1] is None + + # deciles lines: always show on xaxis, only show on yaxis if 2-way PDP + for i in range(3): + assert disp.deciles_vlines_[0, i] is not None + assert disp.deciles_hlines_[0, 0] is None + assert disp.deciles_hlines_[0, 1] is None + assert disp.deciles_hlines_[0, 2] is not None + + assert disp.features == [(0,), (2,), (0, 2)] + assert np.all(disp.feature_names == feature_names) + assert len(disp.deciles) == 2 + for i in [0, 2]: + assert_allclose( + disp.deciles[i], + mquantiles(diabetes.data[:, i], prob=np.arange(0.1, 1.0, 0.1)), + ) + + single_feature_positions = [(0, (0, 0)), (2, (0, 1))] + expected_ylabels = ["Partial dependence", ""] + + for i, (feat_col, pos) in enumerate(single_feature_positions): + ax = disp.axes_[pos] + assert ax.get_ylabel() == expected_ylabels[i] + assert ax.get_xlabel() == diabetes.feature_names[feat_col] + + line = disp.lines_[pos] + + avg_preds = disp.pd_results[i] + assert avg_preds.average.shape == (1, grid_resolution) + target_idx = disp.target_idx + + line_data = line.get_data() + assert_allclose(line_data[0], avg_preds["grid_values"][0]) + assert_allclose(line_data[1], avg_preds.average[target_idx].ravel()) + + # two feature position + ax = disp.axes_[0, 2] + coutour = disp.contours_[0, 2] + assert coutour.get_cmap().name == "jet" + assert ax.get_xlabel() == diabetes.feature_names[0] + assert ax.get_ylabel() == diabetes.feature_names[2] + + +@pytest.mark.parametrize( + "kind, centered, subsample, shape", + [ + ("average", False, None, (1, 3)), + ("individual", False, None, (1, 3, 50)), + ("both", False, None, (1, 3, 51)), + ("individual", False, 20, (1, 3, 20)), + ("both", False, 20, (1, 3, 21)), + ("individual", False, 0.5, (1, 3, 25)), + ("both", False, 0.5, (1, 3, 26)), + ("average", True, None, (1, 3)), + ("individual", True, None, (1, 3, 50)), + ("both", True, None, (1, 3, 51)), + ("individual", True, 20, (1, 3, 20)), + ("both", True, 20, (1, 3, 21)), + ], +) +def test_plot_partial_dependence_kind( + pyplot, + kind, + centered, + subsample, + shape, + clf_diabetes, + diabetes, +): + disp = PartialDependenceDisplay.from_estimator( + clf_diabetes, + diabetes.data, + [0, 1, 2], + kind=kind, + centered=centered, + subsample=subsample, + ) + + assert disp.axes_.shape == (1, 3) + assert disp.lines_.shape == shape + assert disp.contours_.shape == (1, 3) + + assert disp.contours_[0, 0] is None + assert disp.contours_[0, 1] is None + assert disp.contours_[0, 2] is None + + if centered: + assert all([ln._y[0] == 0.0 for ln in disp.lines_.ravel() if ln is not None]) + else: + assert all([ln._y[0] 
!= 0.0 for ln in disp.lines_.ravel() if ln is not None]) + + +@pytest.mark.parametrize( + "input_type, feature_names_type", + [ + ("dataframe", None), + ("dataframe", "list"), + ("list", "list"), + ("array", "list"), + ("dataframe", "array"), + ("list", "array"), + ("array", "array"), + ("dataframe", "series"), + ("list", "series"), + ("array", "series"), + ("dataframe", "index"), + ("list", "index"), + ("array", "index"), + ], +) +@pytest.mark.parametrize("use_custom_values", [True, False]) +def test_plot_partial_dependence_str_features( + pyplot, + use_custom_values, + clf_diabetes, + diabetes, + input_type, + feature_names_type, +): + age = diabetes.data[:, diabetes.feature_names.index("age")] + bmi = diabetes.data[:, diabetes.feature_names.index("bmi")] + + if input_type == "dataframe": + pd = pytest.importorskip("pandas") + X = pd.DataFrame(diabetes.data, columns=diabetes.feature_names) + elif input_type == "list": + X = diabetes.data.tolist() + else: + X = diabetes.data + + if feature_names_type is None: + feature_names = None + else: + feature_names = _convert_container(diabetes.feature_names, feature_names_type) + + grid_resolution = 25 + custom_values = None + if use_custom_values: + custom_values = { + "age": custom_values_helper(age, grid_resolution), + "bmi": custom_values_helper(bmi, grid_resolution), + } + # check with str features and array feature names and single column + disp = PartialDependenceDisplay.from_estimator( + clf_diabetes, + X, + [("age", "bmi"), "bmi"], + grid_resolution=grid_resolution, + feature_names=feature_names, + n_cols=1, + line_kw={"alpha": 0.8}, + custom_values=custom_values, + ) + fig = pyplot.gcf() + axs = fig.get_axes() + assert len(axs) == 3 + + assert disp.figure_ is fig + assert disp.axes_.shape == (2, 1) + assert disp.lines_.shape == (2, 1) + assert disp.contours_.shape == (2, 1) + assert disp.deciles_vlines_.shape == (2, 1) + assert disp.deciles_hlines_.shape == (2, 1) + + assert disp.lines_[0, 0] is None + assert disp.deciles_vlines_[0, 0] is not None + assert disp.deciles_hlines_[0, 0] is not None + assert disp.contours_[1, 0] is None + assert disp.deciles_hlines_[1, 0] is None + assert disp.deciles_vlines_[1, 0] is not None + + # line + ax = disp.axes_[1, 0] + assert ax.get_xlabel() == "bmi" + assert ax.get_ylabel() == "Partial dependence" + + line = disp.lines_[1, 0] + avg_preds = disp.pd_results[1] + target_idx = disp.target_idx + assert line.get_alpha() == 0.8 + + line_data = line.get_data() + assert_allclose(line_data[0], avg_preds["grid_values"][0]) + assert_allclose(line_data[1], avg_preds.average[target_idx].ravel()) + + # contour + ax = disp.axes_[0, 0] + assert ax.get_xlabel() == "age" + assert ax.get_ylabel() == "bmi" + + +@pytest.mark.filterwarnings("ignore:A Bunch will be returned") +@pytest.mark.parametrize("use_custom_values", [True, False]) +def test_plot_partial_dependence_custom_axes( + use_custom_values, pyplot, clf_diabetes, diabetes +): + grid_resolution = 25 + fig, (ax1, ax2) = pyplot.subplots(1, 2) + + age = diabetes.data[:, diabetes.feature_names.index("age")] + bmi = diabetes.data[:, diabetes.feature_names.index("bmi")] + custom_values = None + if use_custom_values: + custom_values = { + "age": custom_values_helper(age, grid_resolution), + "bmi": custom_values_helper(bmi, grid_resolution), + } + + disp = PartialDependenceDisplay.from_estimator( + clf_diabetes, + diabetes.data, + ["age", ("age", "bmi")], + grid_resolution=grid_resolution, + feature_names=diabetes.feature_names, + ax=[ax1, ax2], + 
custom_values=custom_values, + ) + assert fig is disp.figure_ + assert disp.bounding_ax_ is None + assert disp.axes_.shape == (2,) + assert disp.axes_[0] is ax1 + assert disp.axes_[1] is ax2 + + ax = disp.axes_[0] + assert ax.get_xlabel() == "age" + assert ax.get_ylabel() == "Partial dependence" + + line = disp.lines_[0] + avg_preds = disp.pd_results[0] + target_idx = disp.target_idx + + line_data = line.get_data() + assert_allclose(line_data[0], avg_preds["grid_values"][0]) + assert_allclose(line_data[1], avg_preds.average[target_idx].ravel()) + + # contour + ax = disp.axes_[1] + assert ax.get_xlabel() == "age" + assert ax.get_ylabel() == "bmi" + + +@pytest.mark.parametrize( + "kind, lines", [("average", 1), ("individual", 50), ("both", 51)] +) +@pytest.mark.parametrize("use_custom_values", [True, False]) +def test_plot_partial_dependence_passing_numpy_axes( + pyplot, + clf_diabetes, + diabetes, + use_custom_values, + kind, + lines, +): + grid_resolution = 25 + feature_names = diabetes.feature_names + + age = diabetes.data[:, diabetes.feature_names.index("age")] + bmi = diabetes.data[:, diabetes.feature_names.index("bmi")] + custom_values = None + if use_custom_values: + custom_values = { + "age": custom_values_helper(age, grid_resolution), + "bmi": custom_values_helper(bmi, grid_resolution), + } + + disp1 = PartialDependenceDisplay.from_estimator( + clf_diabetes, + diabetes.data, + ["age", "bmi"], + kind=kind, + grid_resolution=grid_resolution, + feature_names=feature_names, + custom_values=custom_values, + ) + assert disp1.axes_.shape == (1, 2) + assert disp1.axes_[0, 0].get_ylabel() == "Partial dependence" + assert disp1.axes_[0, 1].get_ylabel() == "" + assert len(disp1.axes_[0, 0].get_lines()) == lines + assert len(disp1.axes_[0, 1].get_lines()) == lines + + lr = LinearRegression() + lr.fit(diabetes.data, diabetes.target) + + disp2 = PartialDependenceDisplay.from_estimator( + lr, + diabetes.data, + ["age", "bmi"], + kind=kind, + grid_resolution=grid_resolution, + feature_names=feature_names, + ax=disp1.axes_, + ) + + assert np.all(disp1.axes_ == disp2.axes_) + assert len(disp2.axes_[0, 0].get_lines()) == 2 * lines + assert len(disp2.axes_[0, 1].get_lines()) == 2 * lines + + +@pytest.mark.parametrize("nrows, ncols", [(2, 2), (3, 1)]) +@pytest.mark.parametrize("use_custom_values", [True, False]) +def test_plot_partial_dependence_incorrent_num_axes( + pyplot, + clf_diabetes, + diabetes, + use_custom_values, + nrows, + ncols, +): + grid_resolution = 5 + fig, axes = pyplot.subplots(nrows, ncols) + axes_formats = [list(axes.ravel()), tuple(axes.ravel()), axes] + + msg = "Expected ax to have 2 axes, got {}".format(nrows * ncols) + + age = diabetes.data[:, diabetes.feature_names.index("age")] + bmi = diabetes.data[:, diabetes.feature_names.index("bmi")] + custom_values = None + if use_custom_values: + custom_values = { + "age": custom_values_helper(age, grid_resolution), + "bmi": custom_values_helper(bmi, grid_resolution), + } + + age = diabetes.data[:, diabetes.feature_names.index("age")] + bmi = diabetes.data[:, diabetes.feature_names.index("bmi")] + custom_values = None + if use_custom_values: + custom_values = { + "age": custom_values_helper(age, grid_resolution), + "bmi": custom_values_helper(bmi, grid_resolution), + } + + disp = PartialDependenceDisplay.from_estimator( + clf_diabetes, + diabetes.data, + ["age", "bmi"], + grid_resolution=grid_resolution, + feature_names=diabetes.feature_names, + custom_values=custom_values, + ) + + for ax_format in axes_formats: + with 
pytest.raises(ValueError, match=msg): + PartialDependenceDisplay.from_estimator( + clf_diabetes, + diabetes.data, + ["age", "bmi"], + grid_resolution=grid_resolution, + feature_names=diabetes.feature_names, + ax=ax_format, + custom_values=custom_values, + ) + + # with axes object + with pytest.raises(ValueError, match=msg): + disp.plot(ax=ax_format) + + +@pytest.mark.filterwarnings("ignore:A Bunch will be returned") +@pytest.mark.parametrize("use_custom_values", [True, False]) +def test_plot_partial_dependence_with_same_axes( + use_custom_values, pyplot, clf_diabetes, diabetes +): + # The first call to plot_partial_dependence will create two new axes to + # place in the space of the passed in axes, which results in a total of + # three axes in the figure. + # Currently the API does not allow for the second call to + # plot_partial_dependence to use the same axes again, because it will + # create two new axes in the space resulting in five axes. To get the + # expected behavior one needs to pass the generated axes into the second + # call: + # disp1 = plot_partial_dependence(...) + # disp2 = plot_partial_dependence(..., ax=disp1.axes_) + + grid_resolution = 25 + + age = diabetes.data[:, diabetes.feature_names.index("age")] + bmi = diabetes.data[:, diabetes.feature_names.index("bmi")] + custom_values = None + if use_custom_values: + custom_values = { + "age": custom_values_helper(age, grid_resolution), + "bmi": custom_values_helper(bmi, grid_resolution), + } + + fig, ax = pyplot.subplots() + PartialDependenceDisplay.from_estimator( + clf_diabetes, + diabetes.data, + ["age", "bmi"], + grid_resolution=grid_resolution, + feature_names=diabetes.feature_names, + ax=ax, + custom_values=custom_values, + ) + + msg = ( + "The ax was already used in another plot function, please set " + "ax=display.axes_ instead" + ) + + with pytest.raises(ValueError, match=msg): + PartialDependenceDisplay.from_estimator( + clf_diabetes, + diabetes.data, + ["age", "bmi"], + grid_resolution=grid_resolution, + feature_names=diabetes.feature_names, + custom_values=custom_values, + ax=ax, + ) + + +@pytest.mark.filterwarnings("ignore:A Bunch will be returned") +@pytest.mark.parametrize("use_custom_values", [True, False]) +def test_plot_partial_dependence_feature_name_reuse( + use_custom_values, pyplot, clf_diabetes, diabetes +): + # second call to plot does not change the feature names from the first + # call + grid_resolution = 10 + + custom_values = None + if use_custom_values: + custom_values = { + 0: custom_values_helper(diabetes.data[:, 0], grid_resolution), + 1: custom_values_helper(diabetes.data[:, 1], grid_resolution), + } + + feature_names = diabetes.feature_names + disp = PartialDependenceDisplay.from_estimator( + clf_diabetes, + diabetes.data, + [0, 1], + grid_resolution=grid_resolution, + feature_names=feature_names, + custom_values=custom_values, + ) + + PartialDependenceDisplay.from_estimator( + clf_diabetes, + diabetes.data, + [0, 1], + grid_resolution=grid_resolution, + ax=disp.axes_, + custom_values=custom_values, + ) + + for i, ax in enumerate(disp.axes_.ravel()): + assert ax.get_xlabel() == feature_names[i] + + +@pytest.mark.filterwarnings("ignore:A Bunch will be returned") +@pytest.mark.parametrize("use_custom_values", [True, False]) +def test_plot_partial_dependence_multiclass(use_custom_values, pyplot): + grid_resolution = 25 + clf_int = GradientBoostingClassifier(n_estimators=10, random_state=1) + iris = load_iris() + + custom_values = None + if use_custom_values: + custom_values = { + 0: 
custom_values_helper(iris.data[:, 0], grid_resolution), + 1: custom_values_helper(iris.data[:, 1], grid_resolution), + } + + # Test partial dependence plot function on multi-class input. + clf_int.fit(iris.data, iris.target) + + disp_target_0 = PartialDependenceDisplay.from_estimator( + clf_int, + iris.data, + [0, 1], + target=0, + grid_resolution=grid_resolution, + custom_values=custom_values, + ) + assert disp_target_0.figure_ is pyplot.gcf() + assert disp_target_0.axes_.shape == (1, 2) + assert disp_target_0.lines_.shape == (1, 2) + assert disp_target_0.contours_.shape == (1, 2) + assert disp_target_0.deciles_vlines_.shape == (1, 2) + assert disp_target_0.deciles_hlines_.shape == (1, 2) + assert all(c is None for c in disp_target_0.contours_.flat) + assert disp_target_0.target_idx == 0 + + # now with symbol labels + target = iris.target_names[iris.target] + clf_symbol = GradientBoostingClassifier(n_estimators=10, random_state=1) + clf_symbol.fit(iris.data, target) + + disp_symbol = PartialDependenceDisplay.from_estimator( + clf_symbol, + iris.data, + [0, 1], + target="setosa", + grid_resolution=grid_resolution, + custom_values=custom_values, + ) + assert disp_symbol.figure_ is pyplot.gcf() + assert disp_symbol.axes_.shape == (1, 2) + assert disp_symbol.lines_.shape == (1, 2) + assert disp_symbol.contours_.shape == (1, 2) + assert disp_symbol.deciles_vlines_.shape == (1, 2) + assert disp_symbol.deciles_hlines_.shape == (1, 2) + assert all(c is None for c in disp_symbol.contours_.flat) + assert disp_symbol.target_idx == 0 + + for int_result, symbol_result in zip( + disp_target_0.pd_results, disp_symbol.pd_results + ): + assert_allclose(int_result.average, symbol_result.average) + assert_allclose(int_result["grid_values"], symbol_result["grid_values"]) + + # check that the pd plots are different for another target + + disp_target_1 = PartialDependenceDisplay.from_estimator( + clf_int, + iris.data, + [0, 3], + target=1, + grid_resolution=grid_resolution, + custom_values=custom_values, + ) + target_0_data_y = disp_target_0.lines_[0, 0].get_data()[1] + target_1_data_y = disp_target_1.lines_[0, 0].get_data()[1] + assert any(target_0_data_y != target_1_data_y) + + +multioutput_regression_data = make_regression(n_samples=50, n_targets=2, random_state=0) + + +@pytest.mark.parametrize("target", [0, 1]) +@pytest.mark.parametrize("use_custom_values", [True, False]) +def test_plot_partial_dependence_multioutput(use_custom_values, pyplot, target): + # Test partial dependence plot function on multi-output input. 
+ X, y = multioutput_regression_data + clf = LinearRegression().fit(X, y) + + grid_resolution = 25 + + custom_values = None + if use_custom_values: + custom_values = { + 0: custom_values_helper(X[:, 0], grid_resolution), + 1: custom_values_helper(X[:, 1], grid_resolution), + } + + disp = PartialDependenceDisplay.from_estimator( + clf, + X, + [0, 1], + target=target, + grid_resolution=grid_resolution, + custom_values=custom_values, + ) + fig = pyplot.gcf() + axs = fig.get_axes() + assert len(axs) == 3 + assert disp.target_idx == target + assert disp.bounding_ax_ is not None + + positions = [(0, 0), (0, 1)] + expected_label = ["Partial dependence", ""] + + for i, pos in enumerate(positions): + ax = disp.axes_[pos] + assert ax.get_ylabel() == expected_label[i] + assert ax.get_xlabel() == f"x{i}" + + +def test_plot_partial_dependence_dataframe(pyplot, clf_diabetes, diabetes): + pd = pytest.importorskip("pandas") + df = pd.DataFrame(diabetes.data, columns=diabetes.feature_names) + + grid_resolution = 25 + + PartialDependenceDisplay.from_estimator( + clf_diabetes, + df, + ["bp", "s1"], + grid_resolution=grid_resolution, + feature_names=df.columns.tolist(), + ) + + +dummy_classification_data = make_classification(random_state=0) + + +@pytest.mark.parametrize( + "data, params, err_msg", + [ + ( + multioutput_regression_data, + {"target": None, "features": [0]}, + "target must be specified for multi-output", + ), + ( + multioutput_regression_data, + {"target": -1, "features": [0]}, + r"target must be in \[0, n_tasks\]", + ), + ( + multioutput_regression_data, + {"target": 100, "features": [0]}, + r"target must be in \[0, n_tasks\]", + ), + ( + dummy_classification_data, + {"features": ["foobar"], "feature_names": None}, + "Feature 'foobar' not in feature_names", + ), + ( + dummy_classification_data, + {"features": ["foobar"], "feature_names": ["abcd", "def"]}, + "Feature 'foobar' not in feature_names", + ), + ( + dummy_classification_data, + {"features": [(1, 2, 3)]}, + "Each entry in features must be either an int, ", + ), + ( + dummy_classification_data, + {"features": [1, {}]}, + "Each entry in features must be either an int, ", + ), + ( + dummy_classification_data, + {"features": [tuple()]}, + "Each entry in features must be either an int, ", + ), + ( + dummy_classification_data, + {"features": [123], "feature_names": ["blahblah"]}, + "All entries of features must be less than ", + ), + ( + dummy_classification_data, + {"features": [0, 1, 2], "feature_names": ["a", "b", "a"]}, + "feature_names should not contain duplicates", + ), + ( + dummy_classification_data, + {"features": [1, 2], "kind": ["both"]}, + "When `kind` is provided as a list of strings, it should contain", + ), + ( + dummy_classification_data, + {"features": [1], "subsample": -1}, + "When an integer, subsample=-1 should be positive.", + ), + ( + dummy_classification_data, + {"features": [1], "subsample": 1.2}, + r"When a floating-point, subsample=1.2 should be in the \(0, 1\) range", + ), + ( + dummy_classification_data, + {"features": [1, 2], "categorical_features": [1.0, 2.0]}, + "Expected `categorical_features` to be an array-like of boolean,", + ), + ( + dummy_classification_data, + {"features": [(1, 2)], "categorical_features": [2]}, + "Two-way partial dependence plots are not supported for pairs", + ), + ( + dummy_classification_data, + {"features": [1], "categorical_features": [1], "kind": "individual"}, + "It is not possible to display individual effects", + ), + ], +) +def test_plot_partial_dependence_error(pyplot, data, 
params, err_msg): + X, y = data + estimator = LinearRegression().fit(X, y) + + with pytest.raises(ValueError, match=err_msg): + PartialDependenceDisplay.from_estimator(estimator, X, **params) + + +@pytest.mark.parametrize( + "params, err_msg", + [ + ({"target": 4, "features": [0]}, "target not in est.classes_, got 4"), + ({"target": None, "features": [0]}, "target must be specified for multi-class"), + ( + {"target": 1, "features": [4.5]}, + "Each entry in features must be either an int,", + ), + ], +) +def test_plot_partial_dependence_multiclass_error(pyplot, params, err_msg): + iris = load_iris() + clf = GradientBoostingClassifier(n_estimators=10, random_state=1) + clf.fit(iris.data, iris.target) + + with pytest.raises(ValueError, match=err_msg): + PartialDependenceDisplay.from_estimator(clf, iris.data, **params) + + +def test_plot_partial_dependence_does_not_override_ylabel( + pyplot, clf_diabetes, diabetes +): + # Non-regression test to be sure to not override the ylabel if it has been + # See https://github.com/scikit-learn/scikit-learn/issues/15772 + _, axes = pyplot.subplots(1, 2) + axes[0].set_ylabel("Hello world") + PartialDependenceDisplay.from_estimator( + clf_diabetes, diabetes.data, [0, 1], ax=axes + ) + + assert axes[0].get_ylabel() == "Hello world" + assert axes[1].get_ylabel() == "Partial dependence" + + +@pytest.mark.parametrize( + "categorical_features, array_type", + [ + (["col_A", "col_C"], "dataframe"), + ([0, 2], "array"), + ([True, False, True], "array"), + ], +) +def test_plot_partial_dependence_with_categorical( + pyplot, categorical_features, array_type +): + X = [[1, 1, "A"], [2, 0, "C"], [3, 2, "B"]] + column_name = ["col_A", "col_B", "col_C"] + X = _convert_container(X, array_type, columns_name=column_name) + y = np.array([1.2, 0.5, 0.45]).T + + preprocessor = make_column_transformer((OneHotEncoder(), categorical_features)) + model = make_pipeline(preprocessor, LinearRegression()) + model.fit(X, y) + + # single feature + disp = PartialDependenceDisplay.from_estimator( + model, + X, + features=["col_C"], + feature_names=column_name, + categorical_features=categorical_features, + ) + + assert disp.figure_ is pyplot.gcf() + assert disp.bars_.shape == (1, 1) + assert disp.bars_[0][0] is not None + assert disp.lines_.shape == (1, 1) + assert disp.lines_[0][0] is None + assert disp.contours_.shape == (1, 1) + assert disp.contours_[0][0] is None + assert disp.deciles_vlines_.shape == (1, 1) + assert disp.deciles_vlines_[0][0] is None + assert disp.deciles_hlines_.shape == (1, 1) + assert disp.deciles_hlines_[0][0] is None + assert disp.axes_[0, 0].get_legend() is None + + # interaction between two features + disp = PartialDependenceDisplay.from_estimator( + model, + X, + features=[("col_A", "col_C")], + feature_names=column_name, + categorical_features=categorical_features, + ) + + assert disp.figure_ is pyplot.gcf() + assert disp.bars_.shape == (1, 1) + assert disp.bars_[0][0] is None + assert disp.lines_.shape == (1, 1) + assert disp.lines_[0][0] is None + assert disp.contours_.shape == (1, 1) + assert disp.contours_[0][0] is None + assert disp.deciles_vlines_.shape == (1, 1) + assert disp.deciles_vlines_[0][0] is None + assert disp.deciles_hlines_.shape == (1, 1) + assert disp.deciles_hlines_[0][0] is None + assert disp.axes_[0, 0].get_legend() is None + + +def test_plot_partial_dependence_legend(pyplot): + pd = pytest.importorskip("pandas") + X = pd.DataFrame( + { + "col_A": ["A", "B", "C"], + "col_B": [1.0, 0.0, 2.0], + "col_C": ["C", "B", "A"], + } + ) + y = 
np.array([1.2, 0.5, 0.45]).T + + categorical_features = ["col_A", "col_C"] + preprocessor = make_column_transformer((OneHotEncoder(), categorical_features)) + model = make_pipeline(preprocessor, LinearRegression()) + model.fit(X, y) + + disp = PartialDependenceDisplay.from_estimator( + model, + X, + features=["col_B", "col_C"], + categorical_features=categorical_features, + kind=["both", "average"], + ) + + legend_text = disp.axes_[0, 0].get_legend().get_texts() + assert len(legend_text) == 1 + assert legend_text[0].get_text() == "average" + assert disp.axes_[0, 1].get_legend() is None + + +@pytest.mark.parametrize( + "kind, expected_shape", + [("average", (1, 2)), ("individual", (1, 2, 20)), ("both", (1, 2, 21))], +) +@pytest.mark.parametrize("use_custom_values", [True, False]) +def test_plot_partial_dependence_subsampling( + pyplot, + clf_diabetes, + diabetes, + use_custom_values, + kind, + expected_shape, +): + # check that the subsampling is properly working + # non-regression test for: + # https://github.com/scikit-learn/scikit-learn/pull/18359 + matplotlib = pytest.importorskip("matplotlib") + grid_resolution = 25 + feature_names = diabetes.feature_names + + age = diabetes.data[:, diabetes.feature_names.index("age")] + bmi = diabetes.data[:, diabetes.feature_names.index("bmi")] + + custom_values = None + if use_custom_values: + custom_values = { + "age": custom_values_helper(age, grid_resolution), + "bmi": custom_values_helper(bmi, grid_resolution), + } + + disp1 = PartialDependenceDisplay.from_estimator( + clf_diabetes, + diabetes.data, + ["age", "bmi"], + kind=kind, + grid_resolution=grid_resolution, + feature_names=feature_names, + subsample=20, + random_state=0, + custom_values=custom_values, + ) + + assert disp1.lines_.shape == expected_shape + assert all( + [isinstance(line, matplotlib.lines.Line2D) for line in disp1.lines_.ravel()] + ) + + +@pytest.mark.parametrize( + "kind, line_kw, label", + [ + ("individual", {}, None), + ("individual", {"label": "xxx"}, None), + ("average", {}, None), + ("average", {"label": "xxx"}, "xxx"), + ("both", {}, "average"), + ("both", {"label": "xxx"}, "xxx"), + ], +) +def test_partial_dependence_overwrite_labels( + pyplot, + clf_diabetes, + diabetes, + kind, + line_kw, + label, +): + """Check that we can overwrite the label of the PDP plot.""" + disp = PartialDependenceDisplay.from_estimator( + clf_diabetes, + diabetes.data, + [0, 2], + grid_resolution=25, + feature_names=diabetes.feature_names, + kind=kind, + line_kw=line_kw, + ) + + for ax in disp.axes_.ravel(): + if label is None: + assert ax.get_legend() is None + else: + legend_text = ax.get_legend().get_texts() + assert len(legend_text) == 1 + assert legend_text[0].get_text() == label + + +@pytest.mark.parametrize( + "categorical_features, array_type", + [ + (["col_A", "col_C"], "dataframe"), + ([0, 2], "array"), + ([True, False, True], "array"), + ], +) +def test_grid_resolution_with_categorical(pyplot, categorical_features, array_type): + """Check that we raise a ValueError when the grid_resolution is too small + with respect to the number of categories in the categorical features targeted. 
+ """ + X = [["A", 1, "A"], ["B", 0, "C"], ["C", 2, "B"]] + column_name = ["col_A", "col_B", "col_C"] + X = _convert_container(X, array_type, columns_name=column_name) + y = np.array([1.2, 0.5, 0.45]).T + + preprocessor = make_column_transformer((OneHotEncoder(), categorical_features)) + model = make_pipeline(preprocessor, LinearRegression()) + model.fit(X, y) + + err_msg = ( + "resolution of the computed grid is less than the minimum number of categories" + ) + with pytest.raises(ValueError, match=err_msg): + PartialDependenceDisplay.from_estimator( + model, + X, + features=["col_C"], + feature_names=column_name, + categorical_features=categorical_features, + grid_resolution=2, + ) + + +@pytest.mark.parametrize("kind", ["individual", "average", "both"]) +@pytest.mark.parametrize("centered", [True, False]) +def test_partial_dependence_plot_limits_one_way( + pyplot, clf_diabetes, diabetes, kind, centered +): + """Check that the PD limit on the plots are properly set on one-way plots.""" + disp = PartialDependenceDisplay.from_estimator( + clf_diabetes, + diabetes.data, + features=(0, 1), + kind=kind, + grid_resolution=25, + feature_names=diabetes.feature_names, + ) + + range_pd = np.array([-1, 1], dtype=np.float64) + for pd in disp.pd_results: + if "average" in pd: + pd["average"][...] = range_pd[1] + pd["average"][0, 0] = range_pd[0] + if "individual" in pd: + pd["individual"][...] = range_pd[1] + pd["individual"][0, 0, 0] = range_pd[0] + + disp.plot(centered=centered) + # check that we anchor to zero x-axis when centering + y_lim = range_pd - range_pd[0] if centered else range_pd + padding = 0.05 * (y_lim[1] - y_lim[0]) + y_lim[0] -= padding + y_lim[1] += padding + for ax in disp.axes_.ravel(): + assert_allclose(ax.get_ylim(), y_lim) + + +@pytest.mark.parametrize("centered", [True, False]) +def test_partial_dependence_plot_limits_two_way( + pyplot, clf_diabetes, diabetes, centered +): + """Check that the PD limit on the plots are properly set on two-way plots.""" + disp = PartialDependenceDisplay.from_estimator( + clf_diabetes, + diabetes.data, + features=[(0, 1)], + kind="average", + grid_resolution=25, + feature_names=diabetes.feature_names, + ) + + range_pd = np.array([-1, 1], dtype=np.float64) + for pd in disp.pd_results: + pd["average"][...] 
= range_pd[1] + pd["average"][0, 0] = range_pd[0] + + disp.plot(centered=centered) + contours = disp.contours_[0, 0] + levels = range_pd - range_pd[0] if centered else range_pd + + padding = 0.05 * (levels[1] - levels[0]) + levels[0] -= padding + levels[1] += padding + expect_levels = np.linspace(*levels, num=8) + assert_allclose(contours.levels, expect_levels) + + +def test_partial_dependence_kind_list( + pyplot, + clf_diabetes, + diabetes, +): + """Check that we can provide a list of strings to kind parameter.""" + matplotlib = pytest.importorskip("matplotlib") + + disp = PartialDependenceDisplay.from_estimator( + clf_diabetes, + diabetes.data, + features=[0, 2, (1, 2)], + grid_resolution=20, + kind=["both", "both", "average"], + ) + + for idx in [0, 1]: + assert all( + [ + isinstance(line, matplotlib.lines.Line2D) + for line in disp.lines_[0, idx].ravel() + ] + ) + assert disp.contours_[0, idx] is None + + assert disp.contours_[0, 2] is not None + assert all([line is None for line in disp.lines_[0, 2].ravel()]) + + +@pytest.mark.parametrize( + "features, kind", + [ + ([0, 2, (1, 2)], "individual"), + ([0, 2, (1, 2)], "both"), + ([(0, 1), (0, 2), (1, 2)], "individual"), + ([(0, 1), (0, 2), (1, 2)], "both"), + ([0, 2, (1, 2)], ["individual", "individual", "individual"]), + ([0, 2, (1, 2)], ["both", "both", "both"]), + ], +) +def test_partial_dependence_kind_error( + pyplot, + clf_diabetes, + diabetes, + features, + kind, +): + """Check that we raise an informative error when 2-way PD is requested + together with 1-way PD/ICE""" + warn_msg = ( + "ICE plot cannot be rendered for 2-way feature interactions. 2-way " + "feature interactions mandates PD plots using the 'average' kind" + ) + with pytest.raises(ValueError, match=warn_msg): + PartialDependenceDisplay.from_estimator( + clf_diabetes, + diabetes.data, + features=features, + grid_resolution=20, + kind=kind, + ) + + +@pytest.mark.parametrize( + "line_kw, pd_line_kw, ice_lines_kw, expected_colors", + [ + ({"color": "r"}, {"color": "g"}, {"color": "b"}, ("g", "b")), + (None, {"color": "g"}, {"color": "b"}, ("g", "b")), + ({"color": "r"}, None, {"color": "b"}, ("r", "b")), + ({"color": "r"}, {"color": "g"}, None, ("g", "r")), + ({"color": "r"}, None, None, ("r", "r")), + ({"color": "r"}, {"linestyle": "--"}, {"linestyle": "-."}, ("r", "r")), + ({"c": "r"}, None, None, ("r", "r")), + ({"c": "r", "ls": "-."}, {"color": "g"}, {"color": "b"}, ("g", "b")), + ({"c": "r"}, {"c": "g"}, {"c": "b"}, ("g", "b")), + ({"c": "r"}, {"ls": "--"}, {"ls": "-."}, ("r", "r")), + ], +) +def test_plot_partial_dependence_lines_kw( + pyplot, + clf_diabetes, + diabetes, + line_kw, + pd_line_kw, + ice_lines_kw, + expected_colors, +): + """Check that passing `pd_line_kw` and `ice_lines_kw` will act on the + specific lines in the plot. 
+ """ + + disp = PartialDependenceDisplay.from_estimator( + clf_diabetes, + diabetes.data, + [0, 2], + grid_resolution=20, + feature_names=diabetes.feature_names, + n_cols=2, + kind="both", + line_kw=line_kw, + pd_line_kw=pd_line_kw, + ice_lines_kw=ice_lines_kw, + ) + + line = disp.lines_[0, 0, -1] + assert line.get_color() == expected_colors[0], ( + f"{line.get_color()}!={expected_colors[0]}\n{line_kw} and {pd_line_kw}" + ) + if pd_line_kw is not None: + if "linestyle" in pd_line_kw: + assert line.get_linestyle() == pd_line_kw["linestyle"] + elif "ls" in pd_line_kw: + assert line.get_linestyle() == pd_line_kw["ls"] + else: + assert line.get_linestyle() == "--" + + line = disp.lines_[0, 0, 0] + assert line.get_color() == expected_colors[1], ( + f"{line.get_color()}!={expected_colors[1]}" + ) + if ice_lines_kw is not None: + if "linestyle" in ice_lines_kw: + assert line.get_linestyle() == ice_lines_kw["linestyle"] + elif "ls" in ice_lines_kw: + assert line.get_linestyle() == ice_lines_kw["ls"] + else: + assert line.get_linestyle() == "-" + + +def test_partial_dependence_display_wrong_len_kind( + pyplot, + clf_diabetes, + diabetes, +): + """Check that we raise an error when `kind` is a list with a wrong length. + + This case can only be triggered using the `PartialDependenceDisplay.from_estimator` + method. + """ + disp = PartialDependenceDisplay.from_estimator( + clf_diabetes, + diabetes.data, + features=[0, 2], + grid_resolution=20, + kind="average", # len(kind) != len(features) + ) + + # alter `kind` to be a list with a length different from length of `features` + disp.kind = ["average"] + err_msg = ( + r"When `kind` is provided as a list of strings, it should contain as many" + r" elements as `features`. `kind` contains 1 element\(s\) and `features`" + r" contains 2 element\(s\)." + ) + with pytest.raises(ValueError, match=err_msg): + disp.plot() + + +@pytest.mark.parametrize( + "kind", + ["individual", "both", "average", ["average", "both"], ["individual", "both"]], +) +def test_partial_dependence_display_kind_centered_interaction( + pyplot, + kind, + clf_diabetes, + diabetes, +): + """Check that we properly center ICE and PD when passing kind as a string and as a + list.""" + disp = PartialDependenceDisplay.from_estimator( + clf_diabetes, + diabetes.data, + [0, 1], + kind=kind, + centered=True, + subsample=5, + ) + + assert all([ln._y[0] == 0.0 for ln in disp.lines_.ravel() if ln is not None]) + + +def test_partial_dependence_display_with_constant_sample_weight( + pyplot, + clf_diabetes, + diabetes, +): + """Check that the utilization of a constant sample weight maintains the + standard behavior. + """ + disp = PartialDependenceDisplay.from_estimator( + clf_diabetes, + diabetes.data, + [0, 1], + kind="average", + method="brute", + ) + + sample_weight = np.ones_like(diabetes.target) + disp_sw = PartialDependenceDisplay.from_estimator( + clf_diabetes, + diabetes.data, + [0, 1], + sample_weight=sample_weight, + kind="average", + method="brute", + ) + + assert np.array_equal( + disp.pd_results[0]["average"], disp_sw.pd_results[0]["average"] + ) + + +def test_subclass_named_constructors_return_type_is_subclass( + pyplot, diabetes, clf_diabetes +): + """Check that named constructors return the correct type when subclassed. 
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/pull/27675 + """ + + class SubclassOfDisplay(PartialDependenceDisplay): + pass + + curve = SubclassOfDisplay.from_estimator( + clf_diabetes, + diabetes.data, + [0, 2, (0, 2)], + ) + + assert isinstance(curve, SubclassOfDisplay) diff --git a/.venv/lib/python3.12/site-packages/sklearn/inspection/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/inspection/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/inspection/tests/test_partial_dependence.py b/.venv/lib/python3.12/site-packages/sklearn/inspection/tests/test_partial_dependence.py new file mode 100644 index 0000000000000000000000000000000000000000..816fe5512edc4a142380c3d84bc59a030e1168ff --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/inspection/tests/test_partial_dependence.py @@ -0,0 +1,1217 @@ +""" +Testing for the partial dependence module. +""" + +import re +import warnings + +import numpy as np +import pytest + +import sklearn +from sklearn.base import BaseEstimator, ClassifierMixin, clone, is_regressor +from sklearn.cluster import KMeans +from sklearn.compose import make_column_transformer +from sklearn.datasets import load_iris, make_classification, make_regression +from sklearn.dummy import DummyClassifier +from sklearn.ensemble import ( + GradientBoostingClassifier, + GradientBoostingRegressor, + HistGradientBoostingClassifier, + HistGradientBoostingRegressor, + RandomForestRegressor, +) +from sklearn.exceptions import NotFittedError +from sklearn.impute import SimpleImputer +from sklearn.inspection import partial_dependence +from sklearn.inspection._partial_dependence import ( + _grid_from_X, + _partial_dependence_brute, + _partial_dependence_recursion, +) +from sklearn.linear_model import LinearRegression, LogisticRegression, MultiTaskLasso +from sklearn.metrics import r2_score +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import ( + OneHotEncoder, + PolynomialFeatures, + RobustScaler, + StandardScaler, + scale, +) +from sklearn.tree import DecisionTreeRegressor +from sklearn.tree.tests.test_tree import assert_is_subtree +from sklearn.utils._testing import assert_allclose, assert_array_equal +from sklearn.utils.fixes import _IS_32BIT +from sklearn.utils.validation import check_random_state + +# toy sample +X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] +y = [-1, -1, -1, 1, 1, 1] + + +# (X, y), n_targets <-- as expected in the output of partial_dep() +binary_classification_data = (make_classification(n_samples=50, random_state=0), 1) +multiclass_classification_data = ( + make_classification( + n_samples=50, n_classes=3, n_clusters_per_class=1, random_state=0 + ), + 3, +) +regression_data = (make_regression(n_samples=50, random_state=0), 1) +multioutput_regression_data = ( + make_regression(n_samples=50, n_targets=2, random_state=0), + 2, +) + +# iris +iris = load_iris() + + +@pytest.mark.parametrize( + "Estimator, method, data", + [ + (GradientBoostingClassifier, "auto", binary_classification_data), + (GradientBoostingClassifier, "auto", multiclass_classification_data), + (GradientBoostingClassifier, "brute", binary_classification_data), + (GradientBoostingClassifier, "brute", multiclass_classification_data), + (GradientBoostingRegressor, "auto", regression_data), + (GradientBoostingRegressor, "brute", regression_data), + (DecisionTreeRegressor, "brute", 
regression_data), + (LinearRegression, "brute", regression_data), + (LinearRegression, "brute", multioutput_regression_data), + (LogisticRegression, "brute", binary_classification_data), + (LogisticRegression, "brute", multiclass_classification_data), + (MultiTaskLasso, "brute", multioutput_regression_data), + ], +) +@pytest.mark.parametrize("grid_resolution", (5, 10)) +@pytest.mark.parametrize("features", ([1], [1, 2])) +@pytest.mark.parametrize("kind", ("average", "individual", "both")) +@pytest.mark.parametrize("use_custom_values", [True, False]) +def test_output_shape( + Estimator, method, data, grid_resolution, features, kind, use_custom_values +): + # Check that partial_dependence has consistent output shape for different + # kinds of estimators: + # - classifiers with binary and multiclass settings + # - regressors + # - multi-task regressors + + est = Estimator() + if hasattr(est, "n_estimators"): + est.set_params(n_estimators=2) # speed-up computations + + # n_target corresponds to the number of classes (1 for binary classif) or + # the number of tasks / outputs in multi task settings. It's equal to 1 for + # classical regression_data. + (X, y), n_targets = data + n_instances = X.shape[0] + + custom_values = None + if use_custom_values: + grid_resolution = 5 + custom_values = {f: X[:grid_resolution, f] for f in features} + + est.fit(X, y) + result = partial_dependence( + est, + X=X, + features=features, + method=method, + kind=kind, + grid_resolution=grid_resolution, + custom_values=custom_values, + ) + pdp, axes = result, result["grid_values"] + + expected_pdp_shape = (n_targets, *[grid_resolution for _ in range(len(features))]) + expected_ice_shape = ( + n_targets, + n_instances, + *[grid_resolution for _ in range(len(features))], + ) + if kind == "average": + assert pdp.average.shape == expected_pdp_shape + elif kind == "individual": + assert pdp.individual.shape == expected_ice_shape + else: # 'both' + assert pdp.average.shape == expected_pdp_shape + assert pdp.individual.shape == expected_ice_shape + + expected_axes_shape = (len(features), grid_resolution) + assert axes is not None + assert np.asarray(axes).shape == expected_axes_shape + + +def test_grid_from_X(): + # tests for _grid_from_X: sanity check for output, and for shapes. + + # Make sure that the grid is a cartesian product of the input (it will use + # the unique values instead of the percentiles) + percentiles = (0.05, 0.95) + grid_resolution = 100 + is_categorical = [False, False] + X = np.asarray([[1, 2], [3, 4]]) + grid, axes = _grid_from_X(X, percentiles, is_categorical, grid_resolution, {}) + assert_array_equal(grid, [[1, 2], [1, 4], [3, 2], [3, 4]]) + assert_array_equal(axes, X.T) + + # test shapes of returned objects depending on the number of unique values + # for a feature. 
+ rng = np.random.RandomState(0) + grid_resolution = 15 + + # n_unique_values > grid_resolution + X = rng.normal(size=(20, 2)) + grid, axes = _grid_from_X( + X, + percentiles, + is_categorical, + grid_resolution=grid_resolution, + custom_values={}, + ) + assert grid.shape == (grid_resolution * grid_resolution, X.shape[1]) + assert np.asarray(axes).shape == (2, grid_resolution) + assert grid.dtype == X.dtype + + # n_unique_values < grid_resolution, will use actual values + n_unique_values = 12 + X[n_unique_values - 1 :, 0] = 12345 + rng.shuffle(X) # just to make sure the order is irrelevant + grid, axes = _grid_from_X( + X, + percentiles, + is_categorical, + grid_resolution=grid_resolution, + custom_values={}, + ) + assert grid.shape == (n_unique_values * grid_resolution, X.shape[1]) + # axes is a list of arrays of different shapes + assert axes[0].shape == (n_unique_values,) + assert axes[1].shape == (grid_resolution,) + assert grid.dtype == X.dtype + + # Check that uses custom_range + X = rng.normal(size=(20, 2)) + X[n_unique_values - 1 :, 0] = 12345 + col_1_range = [0, 2, 3] + grid, axes = _grid_from_X( + X, + percentiles, + is_categorical=is_categorical, + grid_resolution=grid_resolution, + custom_values={1: col_1_range}, + ) + assert grid.shape == (n_unique_values * len(col_1_range), X.shape[1]) + # axes is a list of arrays of different shapes + assert axes[0].shape == (n_unique_values,) + assert axes[1].shape == (len(col_1_range),) + assert grid.dtype == X.dtype + + # Check that grid_resolution does not impact custom_range + X = rng.normal(size=(20, 2)) + col_0_range = [0, 2, 3, 4, 5, 6] + grid_resolution = 5 + grid, axes = _grid_from_X( + X, + percentiles, + is_categorical=is_categorical, + grid_resolution=grid_resolution, + custom_values={0: col_0_range}, + ) + assert grid.shape == (grid_resolution * len(col_0_range), X.shape[1]) + # axes is a list of arrays of different shapes + assert axes[0].shape == (len(col_0_range),) + assert axes[1].shape == (grid_resolution,) + assert grid.dtype == np.result_type(X, np.asarray(col_0_range).dtype) + + X = np.array([[0, "a"], [1, "b"], [2, "c"]]) + + grid, axes = _grid_from_X( + X, + percentiles, + is_categorical=is_categorical, + grid_resolution=grid_resolution, + custom_values={1: ["a", "b", "c"]}, + ) + assert grid.dtype == object + + +@pytest.mark.parametrize( + "grid_resolution", + [ + 2, # since n_categories > 2, we should not use quantiles resampling + 100, + ], +) +def test_grid_from_X_with_categorical(grid_resolution): + """Check that `_grid_from_X` always sample from categories and does not + depend from the percentiles. + """ + pd = pytest.importorskip("pandas") + percentiles = (0.05, 0.95) + is_categorical = [True] + X = pd.DataFrame({"cat_feature": ["A", "B", "C", "A", "B", "D", "E"]}) + grid, axes = _grid_from_X( + X, + percentiles, + is_categorical, + grid_resolution=grid_resolution, + custom_values={}, + ) + assert grid.shape == (5, X.shape[1]) + assert axes[0].shape == (5,) + + +@pytest.mark.parametrize("grid_resolution", [3, 100]) +def test_grid_from_X_heterogeneous_type(grid_resolution): + """Check that `_grid_from_X` always sample from categories and does not + depend from the percentiles. 
+ """ + pd = pytest.importorskip("pandas") + percentiles = (0.05, 0.95) + is_categorical = [True, False] + X = pd.DataFrame( + { + "cat": ["A", "B", "C", "A", "B", "D", "E", "A", "B", "D"], + "num": [1, 1, 1, 2, 5, 6, 6, 6, 6, 8], + } + ) + nunique = X.nunique() + + grid, axes = _grid_from_X( + X, + percentiles, + is_categorical, + grid_resolution=grid_resolution, + custom_values={}, + ) + if grid_resolution == 3: + assert grid.shape == (15, 2) + assert axes[0].shape[0] == nunique["num"] + assert axes[1].shape[0] == grid_resolution + else: + assert grid.shape == (25, 2) + assert axes[0].shape[0] == nunique["cat"] + assert axes[1].shape[0] == nunique["cat"] + + +@pytest.mark.parametrize( + "grid_resolution, percentiles, err_msg", + [ + (2, (0, 0.0001), "percentiles are too close"), + (100, (1, 2, 3, 4), "'percentiles' must be a sequence of 2 elements"), + (100, 12345, "'percentiles' must be a sequence of 2 elements"), + (100, (-1, 0.95), r"'percentiles' values must be in \[0, 1\]"), + (100, (0.05, 2), r"'percentiles' values must be in \[0, 1\]"), + (100, (0.9, 0.1), r"percentiles\[0\] must be strictly less than"), + (1, (0.05, 0.95), "'grid_resolution' must be strictly greater than 1"), + ], +) +def test_grid_from_X_error(grid_resolution, percentiles, err_msg): + X = np.asarray([[1, 2], [3, 4]]) + is_categorical = [False] + with pytest.raises(ValueError, match=err_msg): + _grid_from_X(X, percentiles, is_categorical, grid_resolution, custom_values={}) + + +@pytest.mark.parametrize("target_feature", range(5)) +@pytest.mark.parametrize( + "est, method", + [ + (LinearRegression(), "brute"), + (GradientBoostingRegressor(random_state=0), "brute"), + (GradientBoostingRegressor(random_state=0), "recursion"), + (HistGradientBoostingRegressor(random_state=0), "brute"), + (HistGradientBoostingRegressor(random_state=0), "recursion"), + ], +) +def test_partial_dependence_helpers(est, method, target_feature): + # Check that what is returned by _partial_dependence_brute or + # _partial_dependence_recursion is equivalent to manually setting a target + # feature to a given value, and computing the average prediction over all + # samples. + # This also checks that the brute and recursion methods give the same + # output. + # Note that even on the trainset, the brute and the recursion methods + # aren't always strictly equivalent, in particular when the slow method + # generates unrealistic samples that have low mass in the joint + # distribution of the input features, and when some of the features are + # dependent. Hence the high tolerance on the checks. + + X, y = make_regression(random_state=0, n_features=5, n_informative=5) + # The 'init' estimator for GBDT (here the average prediction) isn't taken + # into account with the recursion method, for technical reasons. We set + # the mean to 0 to that this 'bug' doesn't have any effect. + y = y - y.mean() + + # Clone is necessary to make the test thread-safe. 
+ est = clone(est).fit(X, y) + + # target feature will be set to .5 and then to 123 + features = np.array([target_feature], dtype=np.intp) + grid = np.array([[0.5], [123]]) + + if method == "brute": + pdp, predictions = _partial_dependence_brute( + est, grid, features, X, response_method="auto" + ) + else: + pdp = _partial_dependence_recursion(est, grid, features) + + mean_predictions = [] + for val in (0.5, 123): + X_ = X.copy() + X_[:, target_feature] = val + mean_predictions.append(est.predict(X_).mean()) + + pdp = pdp[0] # (shape is (1, 2) so make it (2,)) + + # allow for greater margin for error with recursion method + rtol = 1e-1 if method == "recursion" else 1e-3 + assert np.allclose(pdp, mean_predictions, rtol=rtol) + + +@pytest.mark.parametrize("seed", range(1)) +def test_recursion_decision_tree_vs_forest_and_gbdt(seed): + # Make sure that the recursion method gives the same results on a + # DecisionTreeRegressor and a GradientBoostingRegressor or a + # RandomForestRegressor with 1 tree and equivalent parameters. + + rng = np.random.RandomState(seed) + + # Purely random dataset to avoid correlated features + n_samples = 1000 + n_features = 5 + X = rng.randn(n_samples, n_features) + y = rng.randn(n_samples) * 10 + + # The 'init' estimator for GBDT (here the average prediction) isn't taken + # into account with the recursion method, for technical reasons. We set + # the mean to 0 to that this 'bug' doesn't have any effect. + y = y - y.mean() + + # set max_depth not too high to avoid splits with same gain but different + # features + max_depth = 5 + + tree_seed = 0 + forest = RandomForestRegressor( + n_estimators=1, + max_features=None, + bootstrap=False, + max_depth=max_depth, + random_state=tree_seed, + ) + # The forest will use ensemble.base._set_random_states to set the + # random_state of the tree sub-estimator. We simulate this here to have + # equivalent estimators. + equiv_random_state = check_random_state(tree_seed).randint(np.iinfo(np.int32).max) + gbdt = GradientBoostingRegressor( + n_estimators=1, + learning_rate=1, + criterion="squared_error", + max_depth=max_depth, + random_state=equiv_random_state, + ) + tree = DecisionTreeRegressor(max_depth=max_depth, random_state=equiv_random_state) + + forest.fit(X, y) + gbdt.fit(X, y) + tree.fit(X, y) + + # sanity check: if the trees aren't the same, the PD values won't be equal + try: + assert_is_subtree(tree.tree_, gbdt[0, 0].tree_) + assert_is_subtree(tree.tree_, forest[0].tree_) + except AssertionError: + # For some reason the trees aren't exactly equal on 32bits, so the PDs + # cannot be equal either. 
See + # https://github.com/scikit-learn/scikit-learn/issues/8853 + assert _IS_32BIT, "this should only fail on 32 bit platforms" + return + + grid = rng.randn(50).reshape(-1, 1) + for f in range(n_features): + features = np.array([f], dtype=np.intp) + + pdp_forest = _partial_dependence_recursion(forest, grid, features) + pdp_gbdt = _partial_dependence_recursion(gbdt, grid, features) + pdp_tree = _partial_dependence_recursion(tree, grid, features) + + np.testing.assert_allclose(pdp_gbdt, pdp_tree) + np.testing.assert_allclose(pdp_forest, pdp_tree) + + +@pytest.mark.parametrize( + "est", + ( + GradientBoostingClassifier(random_state=0), + HistGradientBoostingClassifier(random_state=0), + ), +) +@pytest.mark.parametrize("target_feature", (0, 1, 2, 3, 4, 5)) +def test_recursion_decision_function(est, target_feature): + # Make sure the recursion method (implicitly uses decision_function) has + # the same result as using brute method with + # response_method=decision_function + + X, y = make_classification(n_classes=2, n_clusters_per_class=1, random_state=1) + assert np.mean(y) == 0.5 # make sure the init estimator predicts 0 anyway + + est = clone(est).fit(X, y) + + preds_1 = partial_dependence( + est, + X, + [target_feature], + response_method="decision_function", + method="recursion", + kind="average", + ) + preds_2 = partial_dependence( + est, + X, + [target_feature], + response_method="decision_function", + method="brute", + kind="average", + ) + + assert_allclose(preds_1["average"], preds_2["average"], atol=1e-7) + + +@pytest.mark.parametrize( + "est", + ( + LinearRegression(), + GradientBoostingRegressor(random_state=0), + HistGradientBoostingRegressor( + random_state=0, min_samples_leaf=1, max_leaf_nodes=None, max_iter=1 + ), + DecisionTreeRegressor(random_state=0), + ), +) +@pytest.mark.parametrize("power", (1, 2)) +def test_partial_dependence_easy_target(est, power): + # If the target y only depends on one feature in an obvious way (linear or + # quadratic) then the partial dependence for that feature should reflect + # it. + # We here fit a linear regression_data model (with polynomial features if + # needed) and compute r_squared to check that the partial dependence + # correctly reflects the target. 
+ + rng = np.random.RandomState(0) + n_samples = 200 + target_variable = 2 + X = rng.normal(size=(n_samples, 5)) + y = X[:, target_variable] ** power + + est = clone(est).fit(X, y) + + pdp = partial_dependence( + est, features=[target_variable], X=X, grid_resolution=1000, kind="average" + ) + + new_X = pdp["grid_values"][0].reshape(-1, 1) + new_y = pdp["average"][0] + # add polynomial features if needed + new_X = PolynomialFeatures(degree=power).fit_transform(new_X) + + lr = LinearRegression().fit(new_X, new_y) + r2 = r2_score(new_y, lr.predict(new_X)) + + assert r2 > 0.99 + + +@pytest.mark.parametrize( + "Estimator", + ( + sklearn.tree.DecisionTreeClassifier, + sklearn.tree.ExtraTreeClassifier, + sklearn.ensemble.ExtraTreesClassifier, + sklearn.neighbors.KNeighborsClassifier, + sklearn.neighbors.RadiusNeighborsClassifier, + sklearn.ensemble.RandomForestClassifier, + ), +) +def test_multiclass_multioutput(Estimator): + # Make sure error is raised for multiclass-multioutput classifiers + + # make multiclass-multioutput dataset + X, y = make_classification(n_classes=3, n_clusters_per_class=1, random_state=0) + y = np.array([y, y]).T + + est = Estimator() + est.fit(X, y) + + with pytest.raises( + ValueError, match="Multiclass-multioutput estimators are not supported" + ): + partial_dependence(est, X, [0]) + + +class NoPredictProbaNoDecisionFunction(ClassifierMixin, BaseEstimator): + def fit(self, X, y): + # simulate that we have some classes + self.classes_ = [0, 1] + return self + + +@pytest.mark.parametrize( + "estimator, params, err_msg", + [ + ( + KMeans(random_state=0, n_init="auto"), + {"features": [0]}, + "'estimator' must be a fitted regressor or classifier", + ), + ( + LinearRegression(), + {"features": [0], "response_method": "predict_proba"}, + "The response_method parameter is ignored for regressors", + ), + ( + GradientBoostingClassifier(random_state=0), + { + "features": [0], + "response_method": "predict_proba", + "method": "recursion", + }, + "'recursion' method, the response_method must be 'decision_function'", + ), + ( + GradientBoostingClassifier(random_state=0), + {"features": [0], "response_method": "predict_proba", "method": "auto"}, + "'recursion' method, the response_method must be 'decision_function'", + ), + ( + LinearRegression(), + {"features": [0], "method": "recursion", "kind": "individual"}, + "The 'recursion' method only applies when 'kind' is set to 'average'", + ), + ( + LinearRegression(), + {"features": [0], "method": "recursion", "kind": "both"}, + "The 'recursion' method only applies when 'kind' is set to 'average'", + ), + ( + LinearRegression(), + {"features": [0], "method": "recursion"}, + "Only the following estimators support the 'recursion' method:", + ), + ( + LinearRegression(), + {"features": [0, 1], "custom_values": {0: [1, 2, 3], 1: np.ones((3, 3))}}, + ( + "The custom grid for some features is not a one-dimensional array. 
" + "Feature 1: 2 dimensions" + ), + ), + ], +) +def test_partial_dependence_error(estimator, params, err_msg): + X, y = make_classification(random_state=0) + estimator = clone(estimator).fit(X, y) + + with pytest.raises(ValueError, match=err_msg): + partial_dependence(estimator, X, **params) + + +@pytest.mark.parametrize( + "estimator", [LinearRegression(), GradientBoostingClassifier(random_state=0)] +) +@pytest.mark.parametrize("features", [-1, 10000]) +def test_partial_dependence_unknown_feature_indices(estimator, features): + X, y = make_classification(random_state=0) + estimator = clone(estimator).fit(X, y) + + err_msg = "all features must be in" + with pytest.raises(ValueError, match=err_msg): + partial_dependence(estimator, X, [features]) + + +@pytest.mark.parametrize( + "estimator", [LinearRegression(), GradientBoostingClassifier(random_state=0)] +) +def test_partial_dependence_unknown_feature_string(estimator): + pd = pytest.importorskip("pandas") + X, y = make_classification(random_state=0) + df = pd.DataFrame(X) + estimator = clone(estimator).fit(df, y) + + features = ["random"] + err_msg = "A given column is not a column of the dataframe" + with pytest.raises(ValueError, match=err_msg): + partial_dependence(estimator, df, features) + + +@pytest.mark.parametrize( + "estimator", [LinearRegression(), GradientBoostingClassifier(random_state=0)] +) +def test_partial_dependence_X_list(estimator): + # check that array-like objects are accepted + X, y = make_classification(random_state=0) + estimator = clone(estimator).fit(X, y) + partial_dependence(estimator, list(X), [0], kind="average") + + +def test_warning_recursion_non_constant_init(): + # make sure that passing a non-constant init parameter to a GBDT and using + # recursion method yields a warning. 
+ + gbc = GradientBoostingClassifier(init=DummyClassifier(), random_state=0) + gbc.fit(X, y) + + with pytest.warns( + UserWarning, match="Using recursion method with a non-constant init predictor" + ): + partial_dependence(gbc, X, [0], method="recursion", kind="average") + + with pytest.warns( + UserWarning, match="Using recursion method with a non-constant init predictor" + ): + partial_dependence(gbc, X, [0], method="recursion", kind="average") + + +def test_partial_dependence_sample_weight_of_fitted_estimator(): + # Test near perfect correlation between partial dependence and diagonal + # when sample weights emphasize y = x predictions + # non-regression test for #13193 + # TODO: extend to HistGradientBoosting once sample_weight is supported + N = 1000 + rng = np.random.RandomState(123456) + mask = rng.randint(2, size=N, dtype=bool) + + x = rng.rand(N) + # set y = x on mask and y = -x outside + y = x.copy() + y[~mask] = -y[~mask] + X = np.c_[mask, x] + # sample weights to emphasize data points where y = x + sample_weight = np.ones(N) + sample_weight[mask] = 1000.0 + + clf = GradientBoostingRegressor(n_estimators=10, random_state=1) + clf.fit(X, y, sample_weight=sample_weight) + + pdp = partial_dependence(clf, X, features=[1], kind="average") + + assert np.corrcoef(pdp["average"], pdp["grid_values"])[0, 1] > 0.99 + + +def test_hist_gbdt_sw_not_supported(): + # TODO: remove/fix when PDP supports HGBT with sample weights + clf = HistGradientBoostingRegressor(random_state=1) + clf.fit(X, y, sample_weight=np.ones(len(X))) + + with pytest.raises( + NotImplementedError, match="does not support partial dependence" + ): + partial_dependence(clf, X, features=[1]) + + +def test_partial_dependence_pipeline(): + # check that the partial dependence support pipeline + iris = load_iris() + + scaler = StandardScaler() + clf = DummyClassifier(random_state=42) + pipe = make_pipeline(scaler, clf) + + clf.fit(scaler.fit_transform(iris.data), iris.target) + pipe.fit(iris.data, iris.target) + + features = 0 + pdp_pipe = partial_dependence( + pipe, iris.data, features=[features], grid_resolution=10, kind="average" + ) + pdp_clf = partial_dependence( + clf, + scaler.transform(iris.data), + features=[features], + grid_resolution=10, + kind="average", + ) + assert_allclose(pdp_pipe["average"], pdp_clf["average"]) + assert_allclose( + pdp_pipe["grid_values"][0], + pdp_clf["grid_values"][0] * scaler.scale_[features] + scaler.mean_[features], + ) + + +@pytest.mark.parametrize( + "features, grid_resolution, n_vals_expected", + [ + (["a"], 10, 10), + (["a"], 2, 2), + ], +) +def test_partial_dependence_binary_model_grid_resolution( + features, grid_resolution, n_vals_expected +): + pd = pytest.importorskip("pandas") + model = DummyClassifier() + + rng = np.random.RandomState(0) + X = pd.DataFrame( + { + "a": rng.randint(0, 10, size=100).astype(np.float64), + "b": rng.randint(0, 10, size=100).astype(np.float64), + } + ) + y = pd.Series(rng.randint(0, 2, size=100)) + model.fit(X, y) + + part_dep = partial_dependence( + model, + X, + features=features, + grid_resolution=grid_resolution, + kind="average", + ) + assert part_dep["average"].size == n_vals_expected + + +@pytest.mark.parametrize( + "features, custom_values, n_vals_expected", + [ + (["a"], {"a": [1.0, 2.0, 3.0, 4.0]}, 4), + (["a"], {"a": [1.0, 2.0]}, 2), + (["a"], {"a": [1.0]}, 1), + ], +) +def test_partial_dependence_binary_model_custom_values( + features, custom_values, n_vals_expected +): + pd = pytest.importorskip("pandas") + model = DummyClassifier() + + X 
= pd.DataFrame({"a": [1.0, 2.0, 3.0, 4.0], "b": [6.0, 7.0, 8.0, 9.0]}) + y = pd.Series([0, 1, 0, 1]) + model.fit(X, y) + + part_dep = partial_dependence( + model, + X, + features=features, + grid_resolution=3, + custom_values=custom_values, + kind="average", + ) + assert part_dep["average"].size == n_vals_expected + + +@pytest.mark.parametrize( + "features, custom_values, n_vals_expected", + [ + (["b"], {"b": ["a", "b"]}, 2), + (["b"], {"b": ["a"]}, 1), + (["a", "b"], {"a": [1.0, 2.0], "b": ["a", "b"]}, 4), + ], +) +def test_partial_dependence_pipeline_custom_values( + features, custom_values, n_vals_expected +): + pd = pytest.importorskip("pandas") + pl = make_pipeline( + SimpleImputer(strategy="most_frequent"), OneHotEncoder(), DummyClassifier() + ) + + X = pd.DataFrame({"a": [1.0, 2.0, 3.0, 4.0], "b": ["a", "b", "a", "b"]}) + y = pd.Series([0, 1, 0, 1]) + pl.fit(X, y) + + X_holdout = pd.DataFrame({"a": [1.0, 2.0, 3.0, 4.0], "b": ["a", "b", "a", None]}) + part_dep = partial_dependence( + pl, + X_holdout, + features=features, + grid_resolution=3, + custom_values=custom_values, + kind="average", + ) + assert part_dep["average"].size == n_vals_expected + + +@pytest.mark.parametrize( + "estimator", + [ + LogisticRegression(max_iter=1000, random_state=0), + GradientBoostingClassifier(random_state=0, n_estimators=5), + ], + ids=["estimator-brute", "estimator-recursion"], +) +@pytest.mark.parametrize( + "preprocessor", + [ + None, + make_column_transformer( + (StandardScaler(), [iris.feature_names[i] for i in (0, 2)]), + (RobustScaler(), [iris.feature_names[i] for i in (1, 3)]), + ), + make_column_transformer( + (StandardScaler(), [iris.feature_names[i] for i in (0, 2)]), + remainder="passthrough", + ), + ], + ids=["None", "column-transformer", "column-transformer-passthrough"], +) +@pytest.mark.parametrize( + "features", + [[0, 2], [iris.feature_names[i] for i in (0, 2)]], + ids=["features-integer", "features-string"], +) +def test_partial_dependence_dataframe(estimator, preprocessor, features): + # check that the partial dependence support dataframe and pipeline + # including a column transformer + pd = pytest.importorskip("pandas") + df = pd.DataFrame(scale(iris.data), columns=iris.feature_names) + + pipe = make_pipeline(preprocessor, clone(estimator)) + pipe.fit(df, iris.target) + pdp_pipe = partial_dependence( + pipe, df, features=features, grid_resolution=10, kind="average" + ) + + # the column transformer will reorder the column when transforming + # we mixed the index to be sure that we are computing the partial + # dependence of the right columns + if preprocessor is not None: + X_proc = clone(preprocessor).fit_transform(df) + features_clf = [0, 1] + else: + X_proc = df + features_clf = [0, 2] + + clf = clone(estimator).fit(X_proc, iris.target) + pdp_clf = partial_dependence( + clf, + X_proc, + features=features_clf, + method="brute", + grid_resolution=10, + kind="average", + ) + + assert_allclose(pdp_pipe["average"], pdp_clf["average"]) + if preprocessor is not None: + scaler = preprocessor.named_transformers_["standardscaler"] + assert_allclose( + pdp_pipe["grid_values"][1], + pdp_clf["grid_values"][1] * scaler.scale_[1] + scaler.mean_[1], + ) + else: + assert_allclose(pdp_pipe["grid_values"][1], pdp_clf["grid_values"][1]) + + +@pytest.mark.parametrize( + "features, custom_values, expected_pd_shape", + [ + (0, None, (3, 10)), + (0, {0: [1.0, 2.0, 3.0]}, (3, 3)), + (iris.feature_names[0], None, (3, 10)), + (iris.feature_names[0], {iris.feature_names[0]: np.array([1.0, 2.0])}, (3, 2)), 
+ ([0, 2], None, (3, 10, 10)), + ([0, 2], {2: [7, 8, 9, 10]}, (3, 10, 4)), + ([iris.feature_names[i] for i in (0, 2)], None, (3, 10, 10)), + ( + [iris.feature_names[i] for i in (0, 2)], + {iris.feature_names[2]: [1, 2, 3, 10]}, + (3, 10, 4), + ), + ([iris.feature_names[i] for i in (0, 2)], {2: [1, 2, 3, 10]}, (3, 10, 10)), + ( + [iris.feature_names[i] for i in (0, 2, 3)], + {iris.feature_names[2]: [1, 10]}, + (3, 10, 2, 10), + ), + ([True, False, True, False], None, (3, 10, 10)), + ], + ids=[ + "scalar-int", + "scalar-int-custom-values", + "scalar-str", + "scalar-str-custom-values", + "list-int", + "list-int-custom-values", + "list-str", + "list-str-custom-values", + "list-str-custom-values-incorrect", + "list-str-three-features", + "mask", + ], +) +def test_partial_dependence_feature_type(features, custom_values, expected_pd_shape): + # check all possible features type supported in PDP + pd = pytest.importorskip("pandas") + df = pd.DataFrame(iris.data, columns=iris.feature_names) + + preprocessor = make_column_transformer( + (StandardScaler(), [iris.feature_names[i] for i in (0, 2)]), + (RobustScaler(), [iris.feature_names[i] for i in (1, 3)]), + ) + pipe = make_pipeline( + preprocessor, LogisticRegression(max_iter=1000, random_state=0) + ) + pipe.fit(df, iris.target) + pdp_pipe = partial_dependence( + pipe, + df, + features=features, + grid_resolution=10, + kind="average", + custom_values=custom_values, + ) + assert pdp_pipe["average"].shape == expected_pd_shape + assert len(pdp_pipe["grid_values"]) == len(pdp_pipe["average"].shape) - 1 + + +@pytest.mark.parametrize( + "estimator", + [ + LinearRegression(), + LogisticRegression(), + GradientBoostingRegressor(), + GradientBoostingClassifier(), + ], +) +def test_partial_dependence_unfitted(estimator): + X = iris.data + preprocessor = make_column_transformer( + (StandardScaler(), [0, 2]), (RobustScaler(), [1, 3]) + ) + pipe = make_pipeline(preprocessor, estimator) + with pytest.raises(NotFittedError, match="is not fitted yet"): + partial_dependence(pipe, X, features=[0, 2], grid_resolution=10) + with pytest.raises(NotFittedError, match="is not fitted yet"): + partial_dependence(estimator, X, features=[0, 2], grid_resolution=10) + + +@pytest.mark.parametrize( + "Estimator, data", + [ + (LinearRegression, multioutput_regression_data), + (LogisticRegression, binary_classification_data), + ], +) +def test_kind_average_and_average_of_individual(Estimator, data): + est = Estimator() + (X, y), n_targets = data + est.fit(X, y) + + pdp_avg = partial_dependence(est, X=X, features=[1, 2], kind="average") + pdp_ind = partial_dependence(est, X=X, features=[1, 2], kind="individual") + avg_ind = np.mean(pdp_ind["individual"], axis=1) + assert_allclose(avg_ind, pdp_avg["average"]) + + +@pytest.mark.parametrize( + "Estimator, data", + [ + (LinearRegression, multioutput_regression_data), + (LogisticRegression, binary_classification_data), + ], +) +def test_partial_dependence_kind_individual_ignores_sample_weight(Estimator, data): + """Check that `sample_weight` does not have any effect on reported ICE.""" + est = Estimator() + (X, y), n_targets = data + sample_weight = np.arange(X.shape[0]) + est.fit(X, y) + + pdp_nsw = partial_dependence(est, X=X, features=[1, 2], kind="individual") + pdp_sw = partial_dependence( + est, X=X, features=[1, 2], kind="individual", sample_weight=sample_weight + ) + assert_allclose(pdp_nsw["individual"], pdp_sw["individual"]) + assert_allclose(pdp_nsw["grid_values"], pdp_sw["grid_values"]) + + +@pytest.mark.parametrize( + 
"estimator", + [ + LinearRegression(), + LogisticRegression(), + RandomForestRegressor(), + GradientBoostingClassifier(), + ], +) +@pytest.mark.parametrize("non_null_weight_idx", [0, 1, -1]) +def test_partial_dependence_non_null_weight_idx(estimator, non_null_weight_idx): + """Check that if we pass a `sample_weight` of zeros with only one index with + sample weight equals one, then the average `partial_dependence` with this + `sample_weight` is equal to the individual `partial_dependence` of the + corresponding index. + """ + X, y = iris.data, iris.target + preprocessor = make_column_transformer( + (StandardScaler(), [0, 2]), (RobustScaler(), [1, 3]) + ) + pipe = make_pipeline(preprocessor, clone(estimator)).fit(X, y) + + sample_weight = np.zeros_like(y) + sample_weight[non_null_weight_idx] = 1 + pdp_sw = partial_dependence( + pipe, + X, + [2, 3], + kind="average", + sample_weight=sample_weight, + grid_resolution=10, + ) + pdp_ind = partial_dependence(pipe, X, [2, 3], kind="individual", grid_resolution=10) + output_dim = 1 if is_regressor(pipe) else len(np.unique(y)) + for i in range(output_dim): + assert_allclose( + pdp_ind["individual"][i][non_null_weight_idx], + pdp_sw["average"][i], + ) + + +@pytest.mark.parametrize( + "Estimator, data", + [ + (LinearRegression, multioutput_regression_data), + (LogisticRegression, binary_classification_data), + ], +) +def test_partial_dependence_equivalence_equal_sample_weight(Estimator, data): + """Check that `sample_weight=None` is equivalent to having equal weights.""" + + est = Estimator() + (X, y), n_targets = data + est.fit(X, y) + + sample_weight, params = None, {"X": X, "features": [1, 2], "kind": "average"} + pdp_sw_none = partial_dependence(est, **params, sample_weight=sample_weight) + sample_weight = np.ones(len(y)) + pdp_sw_unit = partial_dependence(est, **params, sample_weight=sample_weight) + assert_allclose(pdp_sw_none["average"], pdp_sw_unit["average"]) + sample_weight = 2 * np.ones(len(y)) + pdp_sw_doubling = partial_dependence(est, **params, sample_weight=sample_weight) + assert_allclose(pdp_sw_none["average"], pdp_sw_doubling["average"]) + + +def test_partial_dependence_sample_weight_size_error(): + """Check that we raise an error when the size of `sample_weight` is not + consistent with `X` and `y`. + """ + est = LogisticRegression() + (X, y), n_targets = binary_classification_data + sample_weight = np.ones_like(y) + est.fit(X, y) + + with pytest.raises(ValueError, match="sample_weight.shape =="): + partial_dependence( + est, X, features=[0], sample_weight=sample_weight[1:], grid_resolution=10 + ) + + +def test_partial_dependence_sample_weight_with_recursion(): + """Check that we raise an error when `sample_weight` is provided with + `"recursion"` method. 
+ """ + est = RandomForestRegressor() + (X, y), n_targets = regression_data + sample_weight = np.ones_like(y) + est.fit(X, y, sample_weight=sample_weight) + + with pytest.raises(ValueError, match="'recursion' method can only be applied when"): + partial_dependence( + est, X, features=[0], method="recursion", sample_weight=sample_weight + ) + + +def test_mixed_type_categorical(): + """Check that we raise a proper error when a column has mixed types and + the sorting of `np.unique` will fail.""" + X = np.array(["A", "B", "C", np.nan], dtype=object).reshape(-1, 1) + y = np.array([0, 1, 0, 1]) + + from sklearn.preprocessing import OrdinalEncoder + + clf = make_pipeline( + OrdinalEncoder(encoded_missing_value=-1), + LogisticRegression(), + ).fit(X, y) + with pytest.raises(ValueError, match="The column #0 contains mixed data types"): + partial_dependence(clf, X, features=[0]) + + +def test_reject_array_with_integer_dtype(): + X = np.arange(8).reshape(4, 2) + y = np.array([0, 1, 0, 1]) + clf = DummyClassifier() + clf.fit(X, y) + with pytest.warns( + FutureWarning, match=re.escape("The column 0 contains integer data.") + ): + partial_dependence(clf, X, features=0) + + with pytest.warns( + FutureWarning, match=re.escape("The column 1 contains integer data.") + ): + partial_dependence(clf, X, features=[1], categorical_features=[0]) + + with pytest.warns( + FutureWarning, match=re.escape("The column 0 contains integer data.") + ): + partial_dependence(clf, X, features=[0, 1]) + + # The following should not raise as we do not compute numerical partial + # dependence on integer columns. + with warnings.catch_warnings(): + warnings.simplefilter("error") + partial_dependence(clf, X, features=1, categorical_features=[1]) + + +def test_reject_pandas_with_integer_dtype(): + pd = pytest.importorskip("pandas") + X = pd.DataFrame( + { + "a": [1.0, 2.0, 3.0], + "b": [1, 2, 3], + "c": [1, 2, 3], + } + ) + y = np.array([0, 1, 0]) + clf = DummyClassifier() + clf.fit(X, y) + + with pytest.warns( + FutureWarning, match=re.escape("The column 'c' contains integer data.") + ): + partial_dependence(clf, X, features="c") + + with pytest.warns( + FutureWarning, match=re.escape("The column 'c' contains integer data.") + ): + partial_dependence(clf, X, features=["a", "c"]) + + # The following should not raise as we do not compute numerical partial + # dependence on integer columns. + with warnings.catch_warnings(): + warnings.simplefilter("error") + partial_dependence(clf, X, features=["a"]) + partial_dependence(clf, X, features=["c"], categorical_features=["c"]) + + +def test_partial_dependence_empty_categorical_features(): + """Check that we raise the proper exception when `categorical_features` + is an empty list""" + clf = make_pipeline(StandardScaler(), LogisticRegression()) + clf.fit(iris.data, iris.target) + + with pytest.raises( + ValueError, + match=re.escape( + "Passing an empty list (`[]`) to `categorical_features` is not " + "supported. Use `None` instead to indicate that there are no " + "categorical features." 
+ ), + ): + partial_dependence( + estimator=clf, X=iris.data, features=[0], categorical_features=[] + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/inspection/tests/test_pd_utils.py b/.venv/lib/python3.12/site-packages/sklearn/inspection/tests/test_pd_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5dea3834a77a70891a4efab25a560d09a49a13e1 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/inspection/tests/test_pd_utils.py @@ -0,0 +1,47 @@ +import numpy as np +import pytest + +from sklearn.inspection._pd_utils import _check_feature_names, _get_feature_index +from sklearn.utils._testing import _convert_container + + +@pytest.mark.parametrize( + "feature_names, array_type, expected_feature_names", + [ + (None, "array", ["x0", "x1", "x2"]), + (None, "dataframe", ["a", "b", "c"]), + (np.array(["a", "b", "c"]), "array", ["a", "b", "c"]), + ], +) +def test_check_feature_names(feature_names, array_type, expected_feature_names): + X = np.random.randn(10, 3) + column_names = ["a", "b", "c"] + X = _convert_container(X, constructor_name=array_type, columns_name=column_names) + feature_names_validated = _check_feature_names(X, feature_names) + assert feature_names_validated == expected_feature_names + + +def test_check_feature_names_error(): + X = np.random.randn(10, 3) + feature_names = ["a", "b", "c", "a"] + msg = "feature_names should not contain duplicates." + with pytest.raises(ValueError, match=msg): + _check_feature_names(X, feature_names) + + +@pytest.mark.parametrize("fx, idx", [(0, 0), (1, 1), ("a", 0), ("b", 1), ("c", 2)]) +def test_get_feature_index(fx, idx): + feature_names = ["a", "b", "c"] + assert _get_feature_index(fx, feature_names) == idx + + +@pytest.mark.parametrize( + "fx, feature_names, err_msg", + [ + ("a", None, "Cannot plot partial dependence for feature 'a'"), + ("d", ["a", "b", "c"], "Feature 'd' not in feature_names"), + ], +) +def test_get_feature_names_error(fx, feature_names, err_msg): + with pytest.raises(ValueError, match=err_msg): + _get_feature_index(fx, feature_names) diff --git a/.venv/lib/python3.12/site-packages/sklearn/inspection/tests/test_permutation_importance.py b/.venv/lib/python3.12/site-packages/sklearn/inspection/tests/test_permutation_importance.py new file mode 100644 index 0000000000000000000000000000000000000000..b51ad7b71f66dc897ae2700f20e6f968da56e758 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/inspection/tests/test_permutation_importance.py @@ -0,0 +1,540 @@ +import numpy as np +import pytest +from joblib import parallel_backend +from numpy.testing import assert_allclose + +from sklearn.compose import ColumnTransformer +from sklearn.datasets import ( + load_diabetes, + load_iris, + make_classification, + make_regression, +) +from sklearn.dummy import DummyClassifier +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +from sklearn.impute import SimpleImputer +from sklearn.inspection import permutation_importance +from sklearn.linear_model import LinearRegression, LogisticRegression +from sklearn.metrics import ( + get_scorer, + mean_squared_error, + r2_score, +) +from sklearn.model_selection import train_test_split +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder, StandardScaler, scale +from sklearn.utils._testing import _convert_container + + +@pytest.mark.parametrize("n_jobs", [1, 2]) +@pytest.mark.parametrize("max_samples", [0.5, 1.0]) +@pytest.mark.parametrize("sample_weight", [None, 
"ones"]) +def test_permutation_importance_correlated_feature_regression( + n_jobs, max_samples, sample_weight +): + # Make sure that feature highly correlated to the target have a higher + # importance + rng = np.random.RandomState(42) + n_repeats = 5 + + X, y = load_diabetes(return_X_y=True) + y_with_little_noise = (y + rng.normal(scale=0.001, size=y.shape[0])).reshape(-1, 1) + + X = np.hstack([X, y_with_little_noise]) + + weights = np.ones_like(y) if sample_weight == "ones" else sample_weight + clf = RandomForestRegressor(n_estimators=10, random_state=42) + clf.fit(X, y) + + result = permutation_importance( + clf, + X, + y, + sample_weight=weights, + n_repeats=n_repeats, + random_state=rng, + n_jobs=n_jobs, + max_samples=max_samples, + ) + + assert result.importances.shape == (X.shape[1], n_repeats) + + # the correlated feature with y was added as the last column and should + # have the highest importance + assert np.all(result.importances_mean[-1] > result.importances_mean[:-1]) + + +@pytest.mark.parametrize("n_jobs", [1, 2]) +@pytest.mark.parametrize("max_samples", [0.5, 1.0]) +def test_permutation_importance_correlated_feature_regression_pandas( + n_jobs, max_samples +): + pd = pytest.importorskip("pandas") + + # Make sure that feature highly correlated to the target have a higher + # importance + rng = np.random.RandomState(42) + n_repeats = 5 + + dataset = load_iris() + X, y = dataset.data, dataset.target + y_with_little_noise = (y + rng.normal(scale=0.001, size=y.shape[0])).reshape(-1, 1) + + # Adds feature correlated with y as the last column + X = pd.DataFrame(X, columns=dataset.feature_names) + X["correlated_feature"] = y_with_little_noise + + clf = RandomForestClassifier(n_estimators=10, random_state=42) + clf.fit(X, y) + + result = permutation_importance( + clf, + X, + y, + n_repeats=n_repeats, + random_state=rng, + n_jobs=n_jobs, + max_samples=max_samples, + ) + + assert result.importances.shape == (X.shape[1], n_repeats) + + # the correlated feature with y was added as the last column and should + # have the highest importance + assert np.all(result.importances_mean[-1] > result.importances_mean[:-1]) + + +@pytest.mark.parametrize("n_jobs", [1, 2]) +@pytest.mark.parametrize("max_samples", [0.5, 1.0]) +def test_robustness_to_high_cardinality_noisy_feature(n_jobs, max_samples, seed=42): + # Permutation variable importance should not be affected by the high + # cardinality bias of traditional feature importances, especially when + # computed on a held-out test set: + rng = np.random.RandomState(seed) + n_repeats = 5 + n_samples = 1000 + n_classes = 5 + n_informative_features = 2 + n_noise_features = 1 + n_features = n_informative_features + n_noise_features + + # Generate a multiclass classification dataset and a set of informative + # binary features that can be used to predict some classes of y exactly + # while leaving some classes unexplained to make the problem harder. + classes = np.arange(n_classes) + y = rng.choice(classes, size=n_samples) + X = np.hstack([(y == c).reshape(-1, 1) for c in classes[:n_informative_features]]) + X = X.astype(np.float32) + + # Not all target classes are explained by the binary class indicator + # features: + assert n_informative_features < n_classes + + # Add 10 other noisy features with high cardinality (numerical) values + # that can be used to overfit the training data. 
+ X = np.concatenate([X, rng.randn(n_samples, n_noise_features)], axis=1) + assert X.shape == (n_samples, n_features) + + # Split the dataset to be able to evaluate on a held-out test set. The + # test size should be large enough for importance measurements to be + # stable: + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.5, random_state=rng + ) + clf = RandomForestClassifier(n_estimators=5, random_state=rng) + clf.fit(X_train, y_train) + + # Variable importances computed by impurity decrease on the tree node + # splits often use the noisy features in splits. This can give the misleading + # impression that high cardinality noisy variables are the most important: + tree_importances = clf.feature_importances_ + informative_tree_importances = tree_importances[:n_informative_features] + noisy_tree_importances = tree_importances[n_informative_features:] + assert informative_tree_importances.max() < noisy_tree_importances.min() + + # Let's check that permutation-based feature importances do not have this + # problem. + r = permutation_importance( + clf, + X_test, + y_test, + n_repeats=n_repeats, + random_state=rng, + n_jobs=n_jobs, + max_samples=max_samples, + ) + + assert r.importances.shape == (X.shape[1], n_repeats) + + # Split the importances between informative and noisy features + informative_importances = r.importances_mean[:n_informative_features] + noisy_importances = r.importances_mean[n_informative_features:] + + # Because we do not have a binary variable explaining each target class, + # the RF model will have to use the random variable to make some + # (overfitting) splits (as max_depth is not set). Therefore the noisy + # variables will be non-zero but with small values oscillating around + # zero: + assert max(np.abs(noisy_importances)) > 1e-7 + assert noisy_importances.max() < 0.05 + + # The binary features correlated with y should have a higher importance + # than the high cardinality noisy features. + # The maximum test accuracy is 2 / 5 == 0.4, each informative feature + # contributing approximately a bit more than 0.2 of accuracy.
+ assert informative_importances.min() > 0.15 + + +def test_permutation_importance_mixed_types(): + rng = np.random.RandomState(42) + n_repeats = 4 + + # Last column is correlated with y + X = np.array([[1.0, 2.0, 3.0, np.nan], [2, 1, 2, 1]]).T + y = np.array([0, 1, 0, 1]) + + clf = make_pipeline(SimpleImputer(), LogisticRegression(solver="lbfgs")) + clf.fit(X, y) + result = permutation_importance(clf, X, y, n_repeats=n_repeats, random_state=rng) + + assert result.importances.shape == (X.shape[1], n_repeats) + + # the correlated feature with y is the last column and should + # have the highest importance + assert np.all(result.importances_mean[-1] > result.importances_mean[:-1]) + + # use another random state + rng = np.random.RandomState(0) + result2 = permutation_importance(clf, X, y, n_repeats=n_repeats, random_state=rng) + assert result2.importances.shape == (X.shape[1], n_repeats) + + assert not np.allclose(result.importances, result2.importances) + + # the correlated feature with y is the last column and should + # have the highest importance + assert np.all(result2.importances_mean[-1] > result2.importances_mean[:-1]) + + +def test_permutation_importance_mixed_types_pandas(): + pd = pytest.importorskip("pandas") + rng = np.random.RandomState(42) + n_repeats = 5 + + # Last column is correlated with y + X = pd.DataFrame({"col1": [1.0, 2.0, 3.0, np.nan], "col2": ["a", "b", "a", "b"]}) + y = np.array([0, 1, 0, 1]) + + num_preprocess = make_pipeline(SimpleImputer(), StandardScaler()) + preprocess = ColumnTransformer( + [("num", num_preprocess, ["col1"]), ("cat", OneHotEncoder(), ["col2"])] + ) + clf = make_pipeline(preprocess, LogisticRegression(solver="lbfgs")) + clf.fit(X, y) + + result = permutation_importance(clf, X, y, n_repeats=n_repeats, random_state=rng) + + assert result.importances.shape == (X.shape[1], n_repeats) + # the correlated feature with y is the last column and should + # have the highest importance + assert np.all(result.importances_mean[-1] > result.importances_mean[:-1]) + + +def test_permutation_importance_linear_regresssion(): + X, y = make_regression(n_samples=500, n_features=10, random_state=0) + + X = scale(X) + y = scale(y) + + lr = LinearRegression().fit(X, y) + + # this relationship can be computed in closed form + expected_importances = 2 * lr.coef_**2 + results = permutation_importance( + lr, X, y, n_repeats=50, scoring="neg_mean_squared_error" + ) + assert_allclose( + expected_importances, results.importances_mean, rtol=1e-1, atol=1e-6 + ) + + +@pytest.mark.parametrize("max_samples", [500, 1.0]) +def test_permutation_importance_equivalence_sequential_parallel(max_samples): + # regression test to make sure that sequential and parallel calls will + # output the same results. 
+ # Also tests that max_samples equal to the number of samples is equivalent to 1.0 + X, y = make_regression(n_samples=500, n_features=10, random_state=0) + lr = LinearRegression().fit(X, y) + + importance_sequential = permutation_importance( + lr, X, y, n_repeats=5, random_state=0, n_jobs=1, max_samples=max_samples + ) + + # First check that the problem is structured enough and that the model is + # complex enough to not yield trivial, constant importances: + imp_min = importance_sequential["importances"].min() + imp_max = importance_sequential["importances"].max() + assert imp_max - imp_min > 0.3 + + # Then actually check that parallelism does not impact the results, + # either with shared memory (threading) or with isolated memory + # via process-based parallelism using the default backend + # ('loky' or 'multiprocessing') depending on the joblib version: + + # process-based parallelism (by default): + importance_processes = permutation_importance( + lr, X, y, n_repeats=5, random_state=0, n_jobs=2 + ) + assert_allclose( + importance_processes["importances"], importance_sequential["importances"] + ) + + # thread-based parallelism: + with parallel_backend("threading"): + importance_threading = permutation_importance( + lr, X, y, n_repeats=5, random_state=0, n_jobs=2 + ) + assert_allclose( + importance_threading["importances"], importance_sequential["importances"] + ) + + +@pytest.mark.parametrize("n_jobs", [None, 1, 2]) +@pytest.mark.parametrize("max_samples", [0.5, 1.0]) +def test_permutation_importance_equivalence_array_dataframe(n_jobs, max_samples): + # This test checks that the column shuffling logic has the same behavior + # on both a dataframe and a simple numpy array. + pd = pytest.importorskip("pandas") + + # regression test to make sure that sequential and parallel calls will + # output the same results. + X, y = make_regression(n_samples=100, n_features=5, random_state=0) + X_df = pd.DataFrame(X) + + # Add a categorical feature that is statistically linked to y: + binner = KBinsDiscretizer( + n_bins=3, + encode="ordinal", + quantile_method="averaged_inverted_cdf", + ) + cat_column = binner.fit_transform(y.reshape(-1, 1)) + + # Concatenate the extra column to the numpy array: integers will be + # cast to float values + X = np.hstack([X, cat_column]) + assert X.dtype.kind == "f" + + # Insert extra column as a non-numpy-native dtype: + cat_column = pd.Categorical(cat_column.ravel()) + new_col_idx = len(X_df.columns) + X_df[new_col_idx] = cat_column + assert X_df[new_col_idx].dtype == cat_column.dtype + + # Stitch an arbitrary index to the dataframe: + X_df.index = np.arange(len(X_df)).astype(str) + + rf = RandomForestRegressor(n_estimators=5, max_depth=3, random_state=0) + rf.fit(X, y) + + n_repeats = 3 + importance_array = permutation_importance( + rf, + X, + y, + n_repeats=n_repeats, + random_state=0, + n_jobs=n_jobs, + max_samples=max_samples, + ) + + # First check that the problem is structured enough and that the model is + # complex enough to not yield trivial, constant importances: + imp_min = importance_array["importances"].min() + imp_max = importance_array["importances"].max() + assert imp_max - imp_min > 0.3 + + # Now check that importances computed on the dataframe match the values + # of those computed on the array with the same data.
+ importance_dataframe = permutation_importance( + rf, + X_df, + y, + n_repeats=n_repeats, + random_state=0, + n_jobs=n_jobs, + max_samples=max_samples, + ) + assert_allclose( + importance_array["importances"], importance_dataframe["importances"] + ) + + +@pytest.mark.parametrize("input_type", ["array", "dataframe"]) +def test_permutation_importance_large_memmaped_data(input_type): + # Smoke, non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/15810 + n_samples, n_features = int(5e4), 4 + X, y = make_classification( + n_samples=n_samples, n_features=n_features, random_state=0 + ) + assert X.nbytes > 1e6 # trigger joblib memmapping + + X = _convert_container(X, input_type) + clf = DummyClassifier(strategy="prior").fit(X, y) + + # Actual smoke test: should not raise any error: + n_repeats = 5 + r = permutation_importance(clf, X, y, n_repeats=n_repeats, n_jobs=2) + + # Auxiliary check: DummyClassifier is feature independent: + # permuting a feature should not change the predictions + expected_importances = np.zeros((n_features, n_repeats)) + assert_allclose(expected_importances, r.importances) + + +def test_permutation_importance_sample_weight(): + # Creating data with 2 features and 1000 samples, where the target + # variable is a linear combination of the two features, such that + # in half of the samples the impact of feature 1 is twice the impact of + # feature 2, and vice versa on the other half of the samples. + rng = np.random.RandomState(1) + n_samples = 1000 + n_features = 2 + n_half_samples = n_samples // 2 + x = rng.normal(0.0, 0.001, (n_samples, n_features)) + y = np.zeros(n_samples) + y[:n_half_samples] = 2 * x[:n_half_samples, 0] + x[:n_half_samples, 1] + y[n_half_samples:] = x[n_half_samples:, 0] + 2 * x[n_half_samples:, 1] + + # Fitting linear regression with perfect prediction + lr = LinearRegression(fit_intercept=False) + lr.fit(x, y) + + # When all samples are weighted with the same weights, the ratio of + # the two feature importances should equal 1 in expectation (when using + # mean absolute error as the loss function). + pi = permutation_importance( + lr, x, y, random_state=1, scoring="neg_mean_absolute_error", n_repeats=200 + ) + x1_x2_imp_ratio_w_none = pi.importances_mean[0] / pi.importances_mean[1] + assert x1_x2_imp_ratio_w_none == pytest.approx(1, 0.01) + + # When passing a vector of ones as the sample_weight, results should be + # the same as in the case where sample_weight=None. + w = np.ones(n_samples) + pi = permutation_importance( + lr, + x, + y, + random_state=1, + scoring="neg_mean_absolute_error", + n_repeats=200, + sample_weight=w, + ) + x1_x2_imp_ratio_w_ones = pi.importances_mean[0] / pi.importances_mean[1] + assert x1_x2_imp_ratio_w_ones == pytest.approx(x1_x2_imp_ratio_w_none, 0.01) + + # When the ratio between the weights of the first half of the samples and + # the second half of the samples approaches infinity, the ratio of + # the two feature importances should equal 2 in expectation (when using + # mean absolute error as the loss function).
+ w = np.hstack([np.repeat(10.0**10, n_half_samples), np.repeat(1.0, n_half_samples)]) + lr.fit(x, y, w) + pi = permutation_importance( + lr, + x, + y, + random_state=1, + scoring="neg_mean_absolute_error", + n_repeats=200, + sample_weight=w, + ) + x1_x2_imp_ratio_w = pi.importances_mean[0] / pi.importances_mean[1] + assert x1_x2_imp_ratio_w / x1_x2_imp_ratio_w_none == pytest.approx(2, 0.01) + + +def test_permutation_importance_no_weights_scoring_function(): + # Creating a scorer function that does not takes sample_weight + def my_scorer(estimator, X, y): + return 1 + + # Creating some data and estimator for the permutation test + x = np.array([[1, 2], [3, 4]]) + y = np.array([1, 2]) + w = np.array([1, 1]) + lr = LinearRegression() + lr.fit(x, y) + + # test that permutation_importance does not return error when + # sample_weight is None + try: + permutation_importance(lr, x, y, random_state=1, scoring=my_scorer, n_repeats=1) + except TypeError: + pytest.fail( + "permutation_test raised an error when using a scorer " + "function that does not accept sample_weight even though " + "sample_weight was None" + ) + + # test that permutation_importance raise exception when sample_weight is + # not None + with pytest.raises(TypeError): + permutation_importance( + lr, x, y, random_state=1, scoring=my_scorer, n_repeats=1, sample_weight=w + ) + + +@pytest.mark.parametrize( + "list_single_scorer, multi_scorer", + [ + (["r2", "neg_mean_squared_error"], ["r2", "neg_mean_squared_error"]), + ( + ["r2", "neg_mean_squared_error"], + { + "r2": get_scorer("r2"), + "neg_mean_squared_error": get_scorer("neg_mean_squared_error"), + }, + ), + ( + ["r2", "neg_mean_squared_error"], + lambda estimator, X, y: { + "r2": r2_score(y, estimator.predict(X)), + "neg_mean_squared_error": -mean_squared_error(y, estimator.predict(X)), + }, + ), + ], +) +def test_permutation_importance_multi_metric(list_single_scorer, multi_scorer): + # Test permutation importance when scoring contains multiple scorers + + # Creating some data and estimator for the permutation test + x, y = make_regression(n_samples=500, n_features=10, random_state=0) + lr = LinearRegression().fit(x, y) + + multi_importance = permutation_importance( + lr, x, y, random_state=1, scoring=multi_scorer, n_repeats=2 + ) + assert set(multi_importance.keys()) == set(list_single_scorer) + + for scorer in list_single_scorer: + multi_result = multi_importance[scorer] + single_result = permutation_importance( + lr, x, y, random_state=1, scoring=scorer, n_repeats=2 + ) + + assert_allclose(multi_result.importances, single_result.importances) + + +def test_permutation_importance_max_samples_error(): + """Check that a proper error message is raised when `max_samples` is not + set to a valid input value. 
+ """ + X = np.array([(1.0, 2.0, 3.0, 4.0)]).T + y = np.array([0, 1, 0, 1]) + + clf = LogisticRegression() + clf.fit(X, y) + + err_msg = r"max_samples must be <= n_samples" + + with pytest.raises(ValueError, match=err_msg): + permutation_importance(clf, X, y, max_samples=5) diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..541f164daf46a336719b4148b7b25cea73fe212c --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/__init__.py @@ -0,0 +1,95 @@ +"""A variety of linear models.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +# See http://scikit-learn.sourceforge.net/modules/sgd.html and +# http://scikit-learn.sourceforge.net/modules/linear_model.html for +# complete documentation. + +from ._base import LinearRegression +from ._bayes import ARDRegression, BayesianRidge +from ._coordinate_descent import ( + ElasticNet, + ElasticNetCV, + Lasso, + LassoCV, + MultiTaskElasticNet, + MultiTaskElasticNetCV, + MultiTaskLasso, + MultiTaskLassoCV, + enet_path, + lasso_path, +) +from ._glm import GammaRegressor, PoissonRegressor, TweedieRegressor +from ._huber import HuberRegressor +from ._least_angle import ( + Lars, + LarsCV, + LassoLars, + LassoLarsCV, + LassoLarsIC, + lars_path, + lars_path_gram, +) +from ._logistic import LogisticRegression, LogisticRegressionCV +from ._omp import ( + OrthogonalMatchingPursuit, + OrthogonalMatchingPursuitCV, + orthogonal_mp, + orthogonal_mp_gram, +) +from ._passive_aggressive import PassiveAggressiveClassifier, PassiveAggressiveRegressor +from ._perceptron import Perceptron +from ._quantile import QuantileRegressor +from ._ransac import RANSACRegressor +from ._ridge import Ridge, RidgeClassifier, RidgeClassifierCV, RidgeCV, ridge_regression +from ._stochastic_gradient import SGDClassifier, SGDOneClassSVM, SGDRegressor +from ._theil_sen import TheilSenRegressor + +__all__ = [ + "ARDRegression", + "BayesianRidge", + "ElasticNet", + "ElasticNetCV", + "GammaRegressor", + "HuberRegressor", + "Lars", + "LarsCV", + "Lasso", + "LassoCV", + "LassoLars", + "LassoLarsCV", + "LassoLarsIC", + "LinearRegression", + "LogisticRegression", + "LogisticRegressionCV", + "MultiTaskElasticNet", + "MultiTaskElasticNetCV", + "MultiTaskLasso", + "MultiTaskLassoCV", + "OrthogonalMatchingPursuit", + "OrthogonalMatchingPursuitCV", + "PassiveAggressiveClassifier", + "PassiveAggressiveRegressor", + "Perceptron", + "PoissonRegressor", + "QuantileRegressor", + "RANSACRegressor", + "Ridge", + "RidgeCV", + "RidgeClassifier", + "RidgeClassifierCV", + "SGDClassifier", + "SGDOneClassSVM", + "SGDRegressor", + "TheilSenRegressor", + "TweedieRegressor", + "enet_path", + "lars_path", + "lars_path_gram", + "lasso_path", + "orthogonal_mp", + "orthogonal_mp_gram", + "ridge_regression", +] diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_base.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_base.py new file mode 100644 index 0000000000000000000000000000000000000000..c059e3fa84310e4bc022d43cf159eaed3aa752fc --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_base.py @@ -0,0 +1,869 @@ +""" +Generalized Linear Models. 
+""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numbers +import warnings +from abc import ABCMeta, abstractmethod +from numbers import Integral, Real + +import numpy as np +import scipy.sparse as sp +from scipy import linalg, optimize, sparse +from scipy.sparse.linalg import lsqr +from scipy.special import expit + +from ..base import ( + BaseEstimator, + ClassifierMixin, + MultiOutputMixin, + RegressorMixin, + _fit_context, +) +from ..utils import check_array, check_random_state +from ..utils._array_api import ( + _asarray_with_order, + _average, + get_namespace, + get_namespace_and_device, + indexing_dtype, + supported_float_dtypes, +) +from ..utils._param_validation import Interval +from ..utils._seq_dataset import ( + ArrayDataset32, + ArrayDataset64, + CSRDataset32, + CSRDataset64, +) +from ..utils.extmath import safe_sparse_dot +from ..utils.parallel import Parallel, delayed +from ..utils.sparsefuncs import mean_variance_axis +from ..utils.validation import _check_sample_weight, check_is_fitted, validate_data + +# TODO: bayesian_ridge_regression and bayesian_regression_ard +# should be squashed into its respective objects. + +SPARSE_INTERCEPT_DECAY = 0.01 +# For sparse data intercept updates are scaled by this decay factor to avoid +# intercept oscillation. + + +def make_dataset(X, y, sample_weight, random_state=None): + """Create ``Dataset`` abstraction for sparse and dense inputs. + + This also returns the ``intercept_decay`` which is different + for sparse datasets. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Training data + + y : array-like, shape (n_samples, ) + Target values. + + sample_weight : numpy array of shape (n_samples,) + The weight of each sample + + random_state : int, RandomState instance or None (default) + Determines random number generation for dataset random sampling. It is not + used for dataset shuffling. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + dataset + The ``Dataset`` abstraction + intercept_decay + The intercept decay + """ + + rng = check_random_state(random_state) + # seed should never be 0 in SequentialDataset64 + seed = rng.randint(1, np.iinfo(np.int32).max) + + if X.dtype == np.float32: + CSRData = CSRDataset32 + ArrayData = ArrayDataset32 + else: + CSRData = CSRDataset64 + ArrayData = ArrayDataset64 + + if sp.issparse(X): + dataset = CSRData(X.data, X.indptr, X.indices, y, sample_weight, seed=seed) + intercept_decay = SPARSE_INTERCEPT_DECAY + else: + X = np.ascontiguousarray(X) + dataset = ArrayData(X, y, sample_weight, seed=seed) + intercept_decay = 1.0 + + return dataset, intercept_decay + + +def _preprocess_data( + X, + y, + *, + fit_intercept, + copy=True, + copy_y=True, + sample_weight=None, + check_input=True, +): + """Common data preprocessing for fitting linear models. + + This helper is in charge of the following steps: + + - Ensure that `sample_weight` is an array or `None`. + - If `check_input=True`, perform standard input validation of `X`, `y`. + - Perform copies if requested to avoid side-effects in case of inplace + modifications of the input. + + Then, if `fit_intercept=True` this preprocessing centers both `X` and `y` as + follows: + - if `X` is dense, center the data and + store the mean vector in `X_offset`. + - if `X` is sparse, store the mean in `X_offset` + without centering `X`. The centering is expected to be handled by the + linear solver where appropriate. 
+ - in either case, always center `y` and store the mean in `y_offset`. + - both `X_offset` and `y_offset` are always weighted by `sample_weight` + if not set to `None`. + + If `fit_intercept=False`, no centering is performed and `X_offset`, `y_offset` + are set to zero. + + Returns + ------- + X_out : {ndarray, sparse matrix} of shape (n_samples, n_features) + If copy=True a copy of the input X is triggered, otherwise operations are + inplace. + If input X is dense, then X_out is centered. + y_out : {ndarray, sparse matrix} of shape (n_samples,) or (n_samples, n_targets) + Centered version of y. Possibly performed inplace on input y depending + on the copy_y parameter. + X_offset : ndarray of shape (n_features,) + The mean per column of input X. + y_offset : float or ndarray of shape (n_features,) + X_scale : ndarray of shape (n_features,) + Always an array of ones. TODO: refactor the code base to make it + possible to remove this unused variable. + """ + xp, _, device_ = get_namespace_and_device(X, y, sample_weight) + n_samples, n_features = X.shape + X_is_sparse = sp.issparse(X) + + if isinstance(sample_weight, numbers.Number): + sample_weight = None + if sample_weight is not None: + sample_weight = xp.asarray(sample_weight) + + if check_input: + X = check_array( + X, copy=copy, accept_sparse=["csr", "csc"], dtype=supported_float_dtypes(xp) + ) + y = check_array(y, dtype=X.dtype, copy=copy_y, ensure_2d=False) + else: + y = xp.astype(y, X.dtype, copy=copy_y) + if copy: + if X_is_sparse: + X = X.copy() + else: + X = _asarray_with_order(X, order="K", copy=True, xp=xp) + + dtype_ = X.dtype + + if fit_intercept: + if X_is_sparse: + X_offset, X_var = mean_variance_axis(X, axis=0, weights=sample_weight) + else: + X_offset = _average(X, axis=0, weights=sample_weight, xp=xp) + + X_offset = xp.astype(X_offset, X.dtype, copy=False) + X -= X_offset + + y_offset = _average(y, axis=0, weights=sample_weight, xp=xp) + y -= y_offset + else: + X_offset = xp.zeros(n_features, dtype=X.dtype, device=device_) + if y.ndim == 1: + y_offset = xp.asarray(0.0, dtype=dtype_, device=device_) + else: + y_offset = xp.zeros(y.shape[1], dtype=dtype_, device=device_) + + # XXX: X_scale is no longer needed. It is an historic artifact from the + # time where linear model exposed the normalize parameter. + X_scale = xp.ones(n_features, dtype=X.dtype, device=device_) + return X, y, X_offset, y_offset, X_scale + + +# TODO: _rescale_data should be factored into _preprocess_data. +# Currently, the fact that sag implements its own way to deal with +# sample_weight makes the refactoring tricky. + + +def _rescale_data(X, y, sample_weight, inplace=False): + """Rescale data sample-wise by square root of sample_weight. + + For many linear models, this enables easy support for sample_weight because + + (y - X w)' S (y - X w) + + with S = diag(sample_weight) becomes + + ||y_rescaled - X_rescaled w||_2^2 + + when setting + + y_rescaled = sqrt(S) y + X_rescaled = sqrt(S) X + + Returns + ------- + X_rescaled : {array-like, sparse matrix} + + y_rescaled : {array-like, sparse matrix} + """ + # Assume that _validate_data and _check_sample_weight have been called by + # the caller. 
+ xp, _ = get_namespace(X, y, sample_weight) + n_samples = X.shape[0] + sample_weight_sqrt = xp.sqrt(sample_weight) + + if sp.issparse(X) or sp.issparse(y): + sw_matrix = sparse.dia_matrix( + (sample_weight_sqrt, 0), shape=(n_samples, n_samples) + ) + + if sp.issparse(X): + X = safe_sparse_dot(sw_matrix, X) + else: + if inplace: + X *= sample_weight_sqrt[:, None] + else: + X = X * sample_weight_sqrt[:, None] + + if sp.issparse(y): + y = safe_sparse_dot(sw_matrix, y) + else: + if inplace: + if y.ndim == 1: + y *= sample_weight_sqrt + else: + y *= sample_weight_sqrt[:, None] + else: + if y.ndim == 1: + y = y * sample_weight_sqrt + else: + y = y * sample_weight_sqrt[:, None] + return X, y, sample_weight_sqrt + + +class LinearModel(BaseEstimator, metaclass=ABCMeta): + """Base class for Linear Models""" + + @abstractmethod + def fit(self, X, y): + """Fit model.""" + + def _decision_function(self, X): + check_is_fitted(self) + + X = validate_data(self, X, accept_sparse=["csr", "csc", "coo"], reset=False) + coef_ = self.coef_ + if coef_.ndim == 1: + return X @ coef_ + self.intercept_ + else: + return X @ coef_.T + self.intercept_ + + def predict(self, X): + """ + Predict using the linear model. + + Parameters + ---------- + X : array-like or sparse matrix, shape (n_samples, n_features) + Samples. + + Returns + ------- + C : array, shape (n_samples,) + Returns predicted values. + """ + return self._decision_function(X) + + def _set_intercept(self, X_offset, y_offset, X_scale): + """Set the intercept_""" + + xp, _ = get_namespace(X_offset, y_offset, X_scale) + + if self.fit_intercept: + # We always want coef_.dtype=X.dtype. For instance, X.dtype can differ from + # coef_.dtype if warm_start=True. + coef_ = xp.astype(self.coef_, X_scale.dtype, copy=False) + coef_ = self.coef_ = xp.divide(coef_, X_scale) + + if coef_.ndim == 1: + intercept_ = y_offset - X_offset @ coef_ + else: + intercept_ = y_offset - X_offset @ coef_.T + + self.intercept_ = intercept_ + + else: + self.intercept_ = 0.0 + + +# XXX Should this derive from LinearModel? It should be a mixin, not an ABC. +# Maybe the n_features checking can be moved to LinearModel. +class LinearClassifierMixin(ClassifierMixin): + """Mixin for linear classifiers. + + Handles prediction for sparse and dense X. + """ + + def decision_function(self, X): + """ + Predict confidence scores for samples. + + The confidence score for a sample is proportional to the signed + distance of that sample to the hyperplane. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data matrix for which we want to get the confidence scores. + + Returns + ------- + scores : ndarray of shape (n_samples,) or (n_samples, n_classes) + Confidence scores per `(n_samples, n_classes)` combination. In the + binary case, confidence score for `self.classes_[1]` where >0 means + this class would be predicted. + """ + check_is_fitted(self) + xp, _ = get_namespace(X) + + X = validate_data(self, X, accept_sparse="csr", reset=False) + scores = safe_sparse_dot(X, self.coef_.T, dense_output=True) + self.intercept_ + return ( + xp.reshape(scores, (-1,)) + if (scores.ndim > 1 and scores.shape[1] == 1) + else scores + ) + + def predict(self, X): + """ + Predict class labels for samples in X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data matrix for which we want to get the predictions. 
+ + Returns + ------- + y_pred : ndarray of shape (n_samples,) + Vector containing the class labels for each sample. + """ + xp, _ = get_namespace(X) + scores = self.decision_function(X) + if len(scores.shape) == 1: + indices = xp.astype(scores > 0, indexing_dtype(xp)) + else: + indices = xp.argmax(scores, axis=1) + + return xp.take(self.classes_, indices, axis=0) + + def _predict_proba_lr(self, X): + """Probability estimation for OvR logistic regression. + + Positive class probabilities are computed as + 1. / (1. + np.exp(-self.decision_function(X))); + multiclass is handled by normalizing that over all classes. + """ + prob = self.decision_function(X) + expit(prob, out=prob) + if prob.ndim == 1: + return np.vstack([1 - prob, prob]).T + else: + # OvR normalization, like LibLinear's predict_probability + prob /= prob.sum(axis=1).reshape((prob.shape[0], -1)) + return prob + + +class SparseCoefMixin: + """Mixin for converting coef_ to and from CSR format. + + L1-regularizing estimators should inherit this. + """ + + def densify(self): + """ + Convert coefficient matrix to dense array format. + + Converts the ``coef_`` member (back) to a numpy.ndarray. This is the + default format of ``coef_`` and is required for fitting, so calling + this method is only required on models that have previously been + sparsified; otherwise, it is a no-op. + + Returns + ------- + self + Fitted estimator. + """ + msg = "Estimator, %(name)s, must be fitted before densifying." + check_is_fitted(self, msg=msg) + if sp.issparse(self.coef_): + self.coef_ = self.coef_.toarray() + return self + + def sparsify(self): + """ + Convert coefficient matrix to sparse format. + + Converts the ``coef_`` member to a scipy.sparse matrix, which for + L1-regularized models can be much more memory- and storage-efficient + than the usual numpy.ndarray representation. + + The ``intercept_`` member is not converted. + + Returns + ------- + self + Fitted estimator. + + Notes + ----- + For non-sparse models, i.e. when there are not many zeros in ``coef_``, + this may actually *increase* memory usage, so use this method with + care. A rule of thumb is that the number of zero elements, which can + be computed with ``(coef_ == 0).sum()``, must be more than 50% for this + to provide significant benefits. + + After calling this method, further fitting with the partial_fit + method (if any) will not work until you call densify. + """ + msg = "Estimator, %(name)s, must be fitted before sparsifying." + check_is_fitted(self, msg=msg) + self.coef_ = sp.csr_matrix(self.coef_) + return self + + +class LinearRegression(MultiOutputMixin, RegressorMixin, LinearModel): + """ + Ordinary least squares Linear Regression. + + LinearRegression fits a linear model with coefficients w = (w1, ..., wp) + to minimize the residual sum of squares between the observed targets in + the dataset, and the targets predicted by the linear approximation. + + Parameters + ---------- + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to False, no intercept will be used in calculations + (i.e. data is expected to be centered). + + copy_X : bool, default=True + If True, X will be copied; else, it may be overwritten. + + tol : float, default=1e-6 + The precision of the solution (`coef_`) is determined by `tol` which + specifies a different convergence criterion for the `lsqr` solver. + `tol` is set as `atol` and `btol` of `scipy.sparse.linalg.lsqr` when + fitting on sparse training data. 
This parameter has no effect when fitting + on dense data. + + .. versionadded:: 1.7 + + n_jobs : int, default=None + The number of jobs to use for the computation. This will only provide + speedup in case of sufficiently large problems, that is if firstly + `n_targets > 1` and secondly `X` is sparse or if `positive` is set + to `True`. ``None`` means 1 unless in a + :obj:`joblib.parallel_backend` context. ``-1`` means using all + processors. See :term:`Glossary ` for more details. + + positive : bool, default=False + When set to ``True``, forces the coefficients to be positive. This + option is only supported for dense arrays. + + For a comparison between a linear regression model with positive constraints + on the regression coefficients and a linear regression without such constraints, + see :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`. + + .. versionadded:: 0.24 + + Attributes + ---------- + coef_ : array of shape (n_features, ) or (n_targets, n_features) + Estimated coefficients for the linear regression problem. + If multiple targets are passed during the fit (y 2D), this + is a 2D array of shape (n_targets, n_features), while if only + one target is passed, this is a 1D array of length n_features. + + rank_ : int + Rank of matrix `X`. Only available when `X` is dense. + + singular_ : array of shape (min(X, y),) + Singular values of `X`. Only available when `X` is dense. + + intercept_ : float or array of shape (n_targets,) + Independent term in the linear model. Set to 0.0 if + `fit_intercept = False`. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + Ridge : Ridge regression addresses some of the + problems of Ordinary Least Squares by imposing a penalty on the + size of the coefficients with l2 regularization. + Lasso : The Lasso is a linear model that estimates + sparse coefficients with l1 regularization. + ElasticNet : Elastic-Net is a linear regression + model trained with both l1 and l2 -norm regularization of the + coefficients. + + Notes + ----- + From the implementation point of view, this is just plain Ordinary + Least Squares (scipy.linalg.lstsq) or Non Negative Least Squares + (scipy.optimize.nnls) wrapped as a predictor object. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.linear_model import LinearRegression + >>> X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]]) + >>> # y = 1 * x_0 + 2 * x_1 + 3 + >>> y = np.dot(X, np.array([1, 2])) + 3 + >>> reg = LinearRegression().fit(X, y) + >>> reg.score(X, y) + 1.0 + >>> reg.coef_ + array([1., 2.]) + >>> reg.intercept_ + np.float64(3.0) + >>> reg.predict(np.array([[3, 5]])) + array([16.]) + """ + + _parameter_constraints: dict = { + "fit_intercept": ["boolean"], + "copy_X": ["boolean"], + "n_jobs": [None, Integral], + "positive": ["boolean"], + "tol": [Interval(Real, 0, None, closed="left")], + } + + def __init__( + self, + *, + fit_intercept=True, + copy_X=True, + tol=1e-6, + n_jobs=None, + positive=False, + ): + self.fit_intercept = fit_intercept + self.copy_X = copy_X + self.tol = tol + self.n_jobs = n_jobs + self.positive = positive + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, sample_weight=None): + """ + Fit linear model. 
+ + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) or (n_samples, n_targets) + Target values. Will be cast to X's dtype if necessary. + + sample_weight : array-like of shape (n_samples,), default=None + Individual weights for each sample. + + .. versionadded:: 0.17 + parameter *sample_weight* support to LinearRegression. + + Returns + ------- + self : object + Fitted Estimator. + """ + n_jobs_ = self.n_jobs + + accept_sparse = False if self.positive else ["csr", "csc", "coo"] + + X, y = validate_data( + self, + X, + y, + accept_sparse=accept_sparse, + y_numeric=True, + multi_output=True, + force_writeable=True, + ) + + has_sw = sample_weight is not None + if has_sw: + sample_weight = _check_sample_weight( + sample_weight, X, dtype=X.dtype, ensure_non_negative=True + ) + + # Note that neither _rescale_data nor the rest of the fit method of + # LinearRegression can benefit from in-place operations when X is a + # sparse matrix. Therefore, let's not copy X when it is sparse. + copy_X_in_preprocess_data = self.copy_X and not sp.issparse(X) + + X, y, X_offset, y_offset, X_scale = _preprocess_data( + X, + y, + fit_intercept=self.fit_intercept, + copy=copy_X_in_preprocess_data, + sample_weight=sample_weight, + ) + + if has_sw: + # Sample weight can be implemented via a simple rescaling. Note + # that we safely do inplace rescaling when _preprocess_data has + # already made a copy if requested. + X, y, sample_weight_sqrt = _rescale_data( + X, y, sample_weight, inplace=copy_X_in_preprocess_data + ) + + if self.positive: + if y.ndim < 2: + self.coef_ = optimize.nnls(X, y)[0] + else: + # scipy.optimize.nnls cannot handle y with shape (M, K) + outs = Parallel(n_jobs=n_jobs_)( + delayed(optimize.nnls)(X, y[:, j]) for j in range(y.shape[1]) + ) + self.coef_ = np.vstack([out[0] for out in outs]) + elif sp.issparse(X): + X_offset_scale = X_offset / X_scale + + if has_sw: + + def matvec(b): + return X.dot(b) - sample_weight_sqrt * b.dot(X_offset_scale) + + def rmatvec(b): + return X.T.dot(b) - X_offset_scale * b.dot(sample_weight_sqrt) + + else: + + def matvec(b): + return X.dot(b) - b.dot(X_offset_scale) + + def rmatvec(b): + return X.T.dot(b) - X_offset_scale * b.sum() + + X_centered = sparse.linalg.LinearOperator( + shape=X.shape, matvec=matvec, rmatvec=rmatvec + ) + + if y.ndim < 2: + self.coef_ = lsqr(X_centered, y, atol=self.tol, btol=self.tol)[0] + else: + # sparse_lstsq cannot handle y with shape (M, K) + outs = Parallel(n_jobs=n_jobs_)( + delayed(lsqr)( + X_centered, y[:, j].ravel(), atol=self.tol, btol=self.tol + ) + for j in range(y.shape[1]) + ) + self.coef_ = np.vstack([out[0] for out in outs]) + else: + # cut-off ratio for small singular values + cond = max(X.shape) * np.finfo(X.dtype).eps + self.coef_, _, self.rank_, self.singular_ = linalg.lstsq(X, y, cond=cond) + self.coef_ = self.coef_.T + + if y.ndim == 1: + self.coef_ = np.ravel(self.coef_) + self._set_intercept(X_offset, y_offset, X_scale) + return self + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = not self.positive + return tags + + +def _check_precomputed_gram_matrix( + X, precompute, X_offset, X_scale, rtol=None, atol=1e-5 +): + """Computes a single element of the gram matrix and compares it to + the corresponding element of the user supplied gram matrix. + + If the values do not match a ValueError will be thrown. 
+ + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Data array. + + precompute : array-like of shape (n_features, n_features) + User-supplied gram matrix. + + X_offset : ndarray of shape (n_features,) + Array of feature means used to center design matrix. + + X_scale : ndarray of shape (n_features,) + Array of feature scale factors used to normalize design matrix. + + rtol : float, default=None + Relative tolerance; see numpy.allclose + If None, it is set to 1e-4 for arrays of dtype numpy.float32 and 1e-7 + otherwise. + + atol : float, default=1e-5 + absolute tolerance; see :func`numpy.allclose`. Note that the default + here is more tolerant than the default for + :func:`numpy.testing.assert_allclose`, where `atol=0`. + + Raises + ------ + ValueError + Raised when the provided Gram matrix is not consistent. + """ + + n_features = X.shape[1] + f1 = n_features // 2 + f2 = min(f1 + 1, n_features - 1) + + v1 = (X[:, f1] - X_offset[f1]) * X_scale[f1] + v2 = (X[:, f2] - X_offset[f2]) * X_scale[f2] + + expected = np.dot(v1, v2) + actual = precompute[f1, f2] + + dtypes = [precompute.dtype, expected.dtype] + if rtol is None: + rtols = [1e-4 if dtype == np.float32 else 1e-7 for dtype in dtypes] + rtol = max(rtols) + + if not np.isclose(expected, actual, rtol=rtol, atol=atol): + raise ValueError( + "Gram matrix passed in via 'precompute' parameter " + "did not pass validation when a single element was " + "checked - please check that it was computed " + f"properly. For element ({f1},{f2}) we computed " + f"{expected} but the user-supplied value was " + f"{actual}." + ) + + +def _pre_fit( + X, + y, + Xy, + precompute, + fit_intercept, + copy, + check_input=True, + sample_weight=None, +): + """Function used at beginning of fit in linear models with L1 or L0 penalty. + + This function applies _preprocess_data and additionally computes the gram matrix + `precompute` as needed as well as `Xy`. + """ + n_samples, n_features = X.shape + + if sparse.issparse(X): + # copy is not needed here as X is not modified inplace when X is sparse + precompute = False + X, y, X_offset, y_offset, X_scale = _preprocess_data( + X, + y, + fit_intercept=fit_intercept, + copy=False, + check_input=check_input, + sample_weight=sample_weight, + ) + else: + # copy was done in fit if necessary + X, y, X_offset, y_offset, X_scale = _preprocess_data( + X, + y, + fit_intercept=fit_intercept, + copy=copy, + check_input=check_input, + sample_weight=sample_weight, + ) + # Rescale only in dense case. Sparse cd solver directly deals with + # sample_weight. + if sample_weight is not None: + # This triggers copies anyway. + X, y, _ = _rescale_data(X, y, sample_weight=sample_weight) + + if hasattr(precompute, "__array__"): + if fit_intercept and not np.allclose(X_offset, np.zeros(n_features)): + warnings.warn( + ( + "Gram matrix was provided but X was centered to fit " + "intercept: recomputing Gram matrix." + ), + UserWarning, + ) + # TODO: instead of warning and recomputing, we could just center + # the user provided Gram matrix a-posteriori (after making a copy + # when `copy=True`). + # recompute Gram + precompute = "auto" + Xy = None + elif check_input: + # If we're going to use the user's precomputed gram matrix, we + # do a quick check to make sure its not totally bogus. 
+ _check_precomputed_gram_matrix(X, precompute, X_offset, X_scale) + + # precompute if n_samples > n_features + if isinstance(precompute, str) and precompute == "auto": + precompute = n_samples > n_features + + if precompute is True: + # make sure that the 'precompute' array is contiguous. + precompute = np.empty(shape=(n_features, n_features), dtype=X.dtype, order="C") + np.dot(X.T, X, out=precompute) + + if not hasattr(precompute, "__array__"): + Xy = None # cannot use Xy if precompute is not Gram + + if hasattr(precompute, "__array__") and Xy is None: + common_dtype = np.result_type(X.dtype, y.dtype) + if y.ndim == 1: + # Xy is 1d, make sure it is contiguous. + Xy = np.empty(shape=n_features, dtype=common_dtype, order="C") + np.dot(X.T, y, out=Xy) + else: + # Make sure that Xy is always F contiguous even if X or y are not + # contiguous: the goal is to make it fast to extract the data for a + # specific target. + n_targets = y.shape[1] + Xy = np.empty(shape=(n_features, n_targets), dtype=common_dtype, order="F") + np.dot(y.T, X, out=Xy.T) + + return X, y, X_offset, y_offset, X_scale, precompute, Xy diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_bayes.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_bayes.py new file mode 100644 index 0000000000000000000000000000000000000000..e519660323d80f8d8f7f607451f59d26ecb62f19 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_bayes.py @@ -0,0 +1,826 @@ +""" +Various bayesian regression +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from math import log +from numbers import Integral, Real + +import numpy as np +from scipy import linalg +from scipy.linalg import pinvh + +from ..base import RegressorMixin, _fit_context +from ..utils import _safe_indexing +from ..utils._param_validation import Interval +from ..utils.extmath import fast_logdet +from ..utils.validation import _check_sample_weight, validate_data +from ._base import LinearModel, _preprocess_data, _rescale_data + +############################################################################### +# BayesianRidge regression + + +class BayesianRidge(RegressorMixin, LinearModel): + """Bayesian ridge regression. + + Fit a Bayesian ridge model. See the Notes section for details on this + implementation and the optimization of the regularization parameters + lambda (precision of the weights) and alpha (precision of the noise). + + Read more in the :ref:`User Guide `. + For an intuitive visualization of how the sinusoid is approximated by + a polynomial using different pairs of initial values, see + :ref:`sphx_glr_auto_examples_linear_model_plot_bayesian_ridge_curvefit.py`. + + Parameters + ---------- + max_iter : int, default=300 + Maximum number of iterations over the complete dataset before + stopping independently of any early stopping criterion. + + .. versionchanged:: 1.3 + + tol : float, default=1e-3 + Stop the algorithm if w has converged. + + alpha_1 : float, default=1e-6 + Hyper-parameter : shape parameter for the Gamma distribution prior + over the alpha parameter. + + alpha_2 : float, default=1e-6 + Hyper-parameter : inverse scale parameter (rate parameter) for the + Gamma distribution prior over the alpha parameter. + + lambda_1 : float, default=1e-6 + Hyper-parameter : shape parameter for the Gamma distribution prior + over the lambda parameter. 
+ + lambda_2 : float, default=1e-6 + Hyper-parameter : inverse scale parameter (rate parameter) for the + Gamma distribution prior over the lambda parameter. + + alpha_init : float, default=None + Initial value for alpha (precision of the noise). + If not set, alpha_init is 1/Var(y). + + .. versionadded:: 0.22 + + lambda_init : float, default=None + Initial value for lambda (precision of the weights). + If not set, lambda_init is 1. + + .. versionadded:: 0.22 + + compute_score : bool, default=False + If True, compute the log marginal likelihood at each iteration of the + optimization. + + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. + The intercept is not treated as a probabilistic parameter + and thus has no associated variance. If set + to False, no intercept will be used in calculations + (i.e. data is expected to be centered). + + copy_X : bool, default=True + If True, X will be copied; else, it may be overwritten. + + verbose : bool, default=False + Verbose mode when fitting the model. + + Attributes + ---------- + coef_ : array-like of shape (n_features,) + Coefficients of the regression model (mean of distribution) + + intercept_ : float + Independent term in decision function. Set to 0.0 if + `fit_intercept = False`. + + alpha_ : float + Estimated precision of the noise. + + lambda_ : float + Estimated precision of the weights. + + sigma_ : array-like of shape (n_features, n_features) + Estimated variance-covariance matrix of the weights + + scores_ : array-like of shape (n_iter_+1,) + If computed_score is True, value of the log marginal likelihood (to be + maximized) at each iteration of the optimization. The array starts + with the value of the log marginal likelihood obtained for the initial + values of alpha and lambda and ends with the value obtained for the + estimated alpha and lambda. + + n_iter_ : int + The actual number of iterations to reach the stopping criterion. + + X_offset_ : ndarray of shape (n_features,) + If `fit_intercept=True`, offset subtracted for centering data to a + zero mean. Set to np.zeros(n_features) otherwise. + + X_scale_ : ndarray of shape (n_features,) + Set to np.ones(n_features). + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + ARDRegression : Bayesian ARD regression. + + Notes + ----- + There exist several strategies to perform Bayesian ridge regression. This + implementation is based on the algorithm described in Appendix A of + (Tipping, 2001) where updates of the regularization parameters are done as + suggested in (MacKay, 1992). Note that according to A New + View of Automatic Relevance Determination (Wipf and Nagarajan, 2008) these + update rules do not guarantee that the marginal likelihood is increasing + between two consecutive iterations of the optimization. + + References + ---------- + D. J. C. MacKay, Bayesian Interpolation, Computation and Neural Systems, + Vol. 4, No. 3, 1992. + + M. E. Tipping, Sparse Bayesian Learning and the Relevance Vector Machine, + Journal of Machine Learning Research, Vol. 1, 2001. 
+ + Examples + -------- + >>> from sklearn import linear_model + >>> clf = linear_model.BayesianRidge() + >>> clf.fit([[0,0], [1, 1], [2, 2]], [0, 1, 2]) + BayesianRidge() + >>> clf.predict([[1, 1]]) + array([1.]) + """ + + _parameter_constraints: dict = { + "max_iter": [Interval(Integral, 1, None, closed="left")], + "tol": [Interval(Real, 0, None, closed="neither")], + "alpha_1": [Interval(Real, 0, None, closed="left")], + "alpha_2": [Interval(Real, 0, None, closed="left")], + "lambda_1": [Interval(Real, 0, None, closed="left")], + "lambda_2": [Interval(Real, 0, None, closed="left")], + "alpha_init": [None, Interval(Real, 0, None, closed="left")], + "lambda_init": [None, Interval(Real, 0, None, closed="left")], + "compute_score": ["boolean"], + "fit_intercept": ["boolean"], + "copy_X": ["boolean"], + "verbose": ["verbose"], + } + + def __init__( + self, + *, + max_iter=300, + tol=1.0e-3, + alpha_1=1.0e-6, + alpha_2=1.0e-6, + lambda_1=1.0e-6, + lambda_2=1.0e-6, + alpha_init=None, + lambda_init=None, + compute_score=False, + fit_intercept=True, + copy_X=True, + verbose=False, + ): + self.max_iter = max_iter + self.tol = tol + self.alpha_1 = alpha_1 + self.alpha_2 = alpha_2 + self.lambda_1 = lambda_1 + self.lambda_2 = lambda_2 + self.alpha_init = alpha_init + self.lambda_init = lambda_init + self.compute_score = compute_score + self.fit_intercept = fit_intercept + self.copy_X = copy_X + self.verbose = verbose + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, sample_weight=None): + """Fit the model. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Training data. + y : ndarray of shape (n_samples,) + Target values. Will be cast to X's dtype if necessary. + + sample_weight : ndarray of shape (n_samples,), default=None + Individual weights for each sample. + + .. versionadded:: 0.20 + parameter *sample_weight* support to BayesianRidge. + + Returns + ------- + self : object + Returns the instance itself. + """ + X, y = validate_data( + self, + X, + y, + dtype=[np.float64, np.float32], + force_writeable=True, + y_numeric=True, + ) + dtype = X.dtype + n_samples, n_features = X.shape + + sw_sum = n_samples + y_var = y.var() + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X, dtype=dtype) + sw_sum = sample_weight.sum() + y_mean = np.average(y, weights=sample_weight) + y_var = np.average((y - y_mean) ** 2, weights=sample_weight) + + X, y, X_offset_, y_offset_, X_scale_ = _preprocess_data( + X, + y, + fit_intercept=self.fit_intercept, + copy=self.copy_X, + sample_weight=sample_weight, + ) + + if sample_weight is not None: + # Sample weight can be implemented via a simple rescaling. + X, y, _ = _rescale_data(X, y, sample_weight) + + self.X_offset_ = X_offset_ + self.X_scale_ = X_scale_ + + # Initialization of the values of the parameters + eps = np.finfo(np.float64).eps + # Add `eps` in the denominator to omit division by zero + alpha_ = self.alpha_init + lambda_ = self.lambda_init + if alpha_ is None: + alpha_ = 1.0 / (y_var + eps) + if lambda_ is None: + lambda_ = 1.0 + + # Avoid unintended type promotion to float64 with numpy 2 + alpha_ = np.asarray(alpha_, dtype=dtype) + lambda_ = np.asarray(lambda_, dtype=dtype) + + verbose = self.verbose + lambda_1 = self.lambda_1 + lambda_2 = self.lambda_2 + alpha_1 = self.alpha_1 + alpha_2 = self.alpha_2 + + self.scores_ = list() + coef_old_ = None + + XT_y = np.dot(X.T, y) + # Let M, N = n_samples, n_features and K = min(M, N). 
+ # The posterior covariance matrix needs Vh_full: (N, N). + # The full SVD is only required when n_samples < n_features. + # When n_samples < n_features, K=M and full_matrices=True + # U: (M, M), S: M, Vh_full: (N, N), Vh: (M, N) + # When n_samples > n_features, K=N and full_matrices=False + # U: (M, N), S: N, Vh_full: (N, N), Vh: (N, N) + U, S, Vh_full = linalg.svd(X, full_matrices=(n_samples < n_features)) + K = len(S) + eigen_vals_ = S**2 + eigen_vals_full = np.zeros(n_features, dtype=dtype) + eigen_vals_full[0:K] = eigen_vals_ + Vh = Vh_full[0:K, :] + + # Convergence loop of the bayesian ridge regression + for iter_ in range(self.max_iter): + # update posterior mean coef_ based on alpha_ and lambda_ and + # compute corresponding sse (sum of squared errors) + coef_, sse_ = self._update_coef_( + X, y, n_samples, n_features, XT_y, U, Vh, eigen_vals_, alpha_, lambda_ + ) + if self.compute_score: + # compute the log marginal likelihood + s = self._log_marginal_likelihood( + n_samples, + n_features, + sw_sum, + eigen_vals_, + alpha_, + lambda_, + coef_, + sse_, + ) + self.scores_.append(s) + + # Update alpha and lambda according to (MacKay, 1992) + gamma_ = np.sum((alpha_ * eigen_vals_) / (lambda_ + alpha_ * eigen_vals_)) + lambda_ = (gamma_ + 2 * lambda_1) / (np.sum(coef_**2) + 2 * lambda_2) + alpha_ = (sw_sum - gamma_ + 2 * alpha_1) / (sse_ + 2 * alpha_2) + + # Check for convergence + if iter_ != 0 and np.sum(np.abs(coef_old_ - coef_)) < self.tol: + if verbose: + print("Convergence after ", str(iter_), " iterations") + break + coef_old_ = np.copy(coef_) + + self.n_iter_ = iter_ + 1 + + # return regularization parameters and corresponding posterior mean, + # log marginal likelihood and posterior covariance + self.alpha_ = alpha_ + self.lambda_ = lambda_ + self.coef_, sse_ = self._update_coef_( + X, y, n_samples, n_features, XT_y, U, Vh, eigen_vals_, alpha_, lambda_ + ) + if self.compute_score: + # compute the log marginal likelihood + s = self._log_marginal_likelihood( + n_samples, + n_features, + sw_sum, + eigen_vals_, + alpha_, + lambda_, + coef_, + sse_, + ) + self.scores_.append(s) + self.scores_ = np.array(self.scores_) + + # posterior covariance + self.sigma_ = np.dot( + Vh_full.T, Vh_full / (alpha_ * eigen_vals_full + lambda_)[:, np.newaxis] + ) + + self._set_intercept(X_offset_, y_offset_, X_scale_) + + return self + + def predict(self, X, return_std=False): + """Predict using the linear model. + + In addition to the mean of the predictive distribution, also its + standard deviation can be returned. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Samples. + + return_std : bool, default=False + Whether to return the standard deviation of posterior prediction. + + Returns + ------- + y_mean : array-like of shape (n_samples,) + Mean of predictive distribution of query points. + + y_std : array-like of shape (n_samples,) + Standard deviation of predictive distribution of query points. + """ + y_mean = self._decision_function(X) + if not return_std: + return y_mean + else: + sigmas_squared_data = (np.dot(X, self.sigma_) * X).sum(axis=1) + y_std = np.sqrt(sigmas_squared_data + (1.0 / self.alpha_)) + return y_mean, y_std + + def _update_coef_( + self, X, y, n_samples, n_features, XT_y, U, Vh, eigen_vals_, alpha_, lambda_ + ): + """Update posterior mean and compute corresponding sse (sum of squared errors). 
+ + Posterior mean is given by coef_ = scaled_sigma_ * X.T * y where + scaled_sigma_ = (lambda_/alpha_ * np.eye(n_features) + + np.dot(X.T, X))^-1 + """ + + if n_samples > n_features: + coef_ = np.linalg.multi_dot( + [Vh.T, Vh / (eigen_vals_ + lambda_ / alpha_)[:, np.newaxis], XT_y] + ) + else: + coef_ = np.linalg.multi_dot( + [X.T, U / (eigen_vals_ + lambda_ / alpha_)[None, :], U.T, y] + ) + + # Note: we do not need to explicitly use the weights in this sum because + # y and X were preprocessed by _rescale_data to handle the weights. + sse_ = np.sum((y - np.dot(X, coef_)) ** 2) + + return coef_, sse_ + + def _log_marginal_likelihood( + self, n_samples, n_features, sw_sum, eigen_vals, alpha_, lambda_, coef, sse + ): + """Log marginal likelihood.""" + alpha_1 = self.alpha_1 + alpha_2 = self.alpha_2 + lambda_1 = self.lambda_1 + lambda_2 = self.lambda_2 + + # compute the log of the determinant of the posterior covariance. + # posterior covariance is given by + # sigma = (lambda_ * np.eye(n_features) + alpha_ * np.dot(X.T, X))^-1 + if n_samples > n_features: + logdet_sigma = -np.sum(np.log(lambda_ + alpha_ * eigen_vals)) + else: + logdet_sigma = np.full(n_features, lambda_, dtype=np.array(lambda_).dtype) + logdet_sigma[:n_samples] += alpha_ * eigen_vals + logdet_sigma = -np.sum(np.log(logdet_sigma)) + + score = lambda_1 * log(lambda_) - lambda_2 * lambda_ + score += alpha_1 * log(alpha_) - alpha_2 * alpha_ + score += 0.5 * ( + n_features * log(lambda_) + + sw_sum * log(alpha_) + - alpha_ * sse + - lambda_ * np.sum(coef**2) + + logdet_sigma + - sw_sum * log(2 * np.pi) + ) + + return score + + +############################################################################### +# ARD (Automatic Relevance Determination) regression + + +class ARDRegression(RegressorMixin, LinearModel): + """Bayesian ARD regression. + + Fit the weights of a regression model, using an ARD prior. The weights of + the regression model are assumed to be in Gaussian distributions. + Also estimate the parameters lambda (precisions of the distributions of the + weights) and alpha (precision of the distribution of the noise). + The estimation is done by an iterative procedures (Evidence Maximization) + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + max_iter : int, default=300 + Maximum number of iterations. + + .. versionchanged:: 1.3 + + tol : float, default=1e-3 + Stop the algorithm if w has converged. + + alpha_1 : float, default=1e-6 + Hyper-parameter : shape parameter for the Gamma distribution prior + over the alpha parameter. + + alpha_2 : float, default=1e-6 + Hyper-parameter : inverse scale parameter (rate parameter) for the + Gamma distribution prior over the alpha parameter. + + lambda_1 : float, default=1e-6 + Hyper-parameter : shape parameter for the Gamma distribution prior + over the lambda parameter. + + lambda_2 : float, default=1e-6 + Hyper-parameter : inverse scale parameter (rate parameter) for the + Gamma distribution prior over the lambda parameter. + + compute_score : bool, default=False + If True, compute the objective function at each step of the model. + + threshold_lambda : float, default=10 000 + Threshold for removing (pruning) weights with high precision from + the computation. + + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to false, no intercept will be used in calculations + (i.e. data is expected to be centered). + + copy_X : bool, default=True + If True, X will be copied; else, it may be overwritten. 
+ + verbose : bool, default=False + Verbose mode when fitting the model. + + Attributes + ---------- + coef_ : array-like of shape (n_features,) + Coefficients of the regression model (mean of distribution) + + alpha_ : float + estimated precision of the noise. + + lambda_ : array-like of shape (n_features,) + estimated precisions of the weights. + + sigma_ : array-like of shape (n_features, n_features) + estimated variance-covariance matrix of the weights + + scores_ : float + if computed, value of the objective function (to be maximized) + + n_iter_ : int + The actual number of iterations to reach the stopping criterion. + + .. versionadded:: 1.3 + + intercept_ : float + Independent term in decision function. Set to 0.0 if + ``fit_intercept = False``. + + X_offset_ : float + If `fit_intercept=True`, offset subtracted for centering data to a + zero mean. Set to np.zeros(n_features) otherwise. + + X_scale_ : float + Set to np.ones(n_features). + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + BayesianRidge : Bayesian ridge regression. + + References + ---------- + D. J. C. MacKay, Bayesian nonlinear modeling for the prediction + competition, ASHRAE Transactions, 1994. + + R. Salakhutdinov, Lecture notes on Statistical Machine Learning, + http://www.utstat.toronto.edu/~rsalakhu/sta4273/notes/Lecture2.pdf#page=15 + Their beta is our ``self.alpha_`` + Their alpha is our ``self.lambda_`` + ARD is a little different than the slide: only dimensions/features for + which ``self.lambda_ < self.threshold_lambda`` are kept and the rest are + discarded. + + Examples + -------- + >>> from sklearn import linear_model + >>> clf = linear_model.ARDRegression() + >>> clf.fit([[0,0], [1, 1], [2, 2]], [0, 1, 2]) + ARDRegression() + >>> clf.predict([[1, 1]]) + array([1.]) + + - :ref:`sphx_glr_auto_examples_linear_model_plot_ard.py` demonstrates ARD + Regression. + - :ref:`sphx_glr_auto_examples_linear_model_plot_lasso_and_elasticnet.py` + showcases ARD Regression alongside Lasso and Elastic-Net for sparse, + correlated signals, in the presence of noise. 
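+
+    The predictive standard deviation is also available (editorial example,
+    not part of the upstream docstring):
+
+    >>> y_mean, y_std = clf.predict([[1, 1]], return_std=True)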
+ """ + + _parameter_constraints: dict = { + "max_iter": [Interval(Integral, 1, None, closed="left")], + "tol": [Interval(Real, 0, None, closed="left")], + "alpha_1": [Interval(Real, 0, None, closed="left")], + "alpha_2": [Interval(Real, 0, None, closed="left")], + "lambda_1": [Interval(Real, 0, None, closed="left")], + "lambda_2": [Interval(Real, 0, None, closed="left")], + "compute_score": ["boolean"], + "threshold_lambda": [Interval(Real, 0, None, closed="left")], + "fit_intercept": ["boolean"], + "copy_X": ["boolean"], + "verbose": ["verbose"], + } + + def __init__( + self, + *, + max_iter=300, + tol=1.0e-3, + alpha_1=1.0e-6, + alpha_2=1.0e-6, + lambda_1=1.0e-6, + lambda_2=1.0e-6, + compute_score=False, + threshold_lambda=1.0e4, + fit_intercept=True, + copy_X=True, + verbose=False, + ): + self.max_iter = max_iter + self.tol = tol + self.fit_intercept = fit_intercept + self.alpha_1 = alpha_1 + self.alpha_2 = alpha_2 + self.lambda_1 = lambda_1 + self.lambda_2 = lambda_2 + self.compute_score = compute_score + self.threshold_lambda = threshold_lambda + self.copy_X = copy_X + self.verbose = verbose + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y): + """Fit the model according to the given training data and parameters. + + Iterative procedure to maximize the evidence + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the number of features. + y : array-like of shape (n_samples,) + Target values (integers). Will be cast to X's dtype if necessary. + + Returns + ------- + self : object + Fitted estimator. + """ + X, y = validate_data( + self, + X, + y, + dtype=[np.float64, np.float32], + force_writeable=True, + y_numeric=True, + ensure_min_samples=2, + ) + dtype = X.dtype + + n_samples, n_features = X.shape + coef_ = np.zeros(n_features, dtype=dtype) + + X, y, X_offset_, y_offset_, X_scale_ = _preprocess_data( + X, y, fit_intercept=self.fit_intercept, copy=self.copy_X + ) + + self.X_offset_ = X_offset_ + self.X_scale_ = X_scale_ + + # Launch the convergence loop + keep_lambda = np.ones(n_features, dtype=bool) + + lambda_1 = self.lambda_1 + lambda_2 = self.lambda_2 + alpha_1 = self.alpha_1 + alpha_2 = self.alpha_2 + verbose = self.verbose + + # Initialization of the values of the parameters + eps = np.finfo(np.float64).eps + # Add `eps` in the denominator to omit division by zero if `np.var(y)` + # is zero. + # Explicitly set dtype to avoid unintended type promotion with numpy 2. 
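+        # Editorial note: alpha_ is the scalar noise precision, initialized to
+        # 1 / Var(y); lambda_ holds one precision per feature, all initialized to 1.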
+ alpha_ = np.asarray(1.0 / (np.var(y) + eps), dtype=dtype) + lambda_ = np.ones(n_features, dtype=dtype) + + self.scores_ = list() + coef_old_ = None + + def update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_): + coef_[keep_lambda] = alpha_ * np.linalg.multi_dot( + [sigma_, X[:, keep_lambda].T, y] + ) + return coef_ + + update_sigma = ( + self._update_sigma + if n_samples >= n_features + else self._update_sigma_woodbury + ) + # Iterative procedure of ARDRegression + for iter_ in range(self.max_iter): + sigma_ = update_sigma(X, alpha_, lambda_, keep_lambda) + coef_ = update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_) + + # Update alpha and lambda + sse_ = np.sum((y - np.dot(X, coef_)) ** 2) + gamma_ = 1.0 - lambda_[keep_lambda] * np.diag(sigma_) + lambda_[keep_lambda] = (gamma_ + 2.0 * lambda_1) / ( + (coef_[keep_lambda]) ** 2 + 2.0 * lambda_2 + ) + alpha_ = (n_samples - gamma_.sum() + 2.0 * alpha_1) / (sse_ + 2.0 * alpha_2) + + # Prune the weights with a precision over a threshold + keep_lambda = lambda_ < self.threshold_lambda + coef_[~keep_lambda] = 0 + + # Compute the objective function + if self.compute_score: + s = (lambda_1 * np.log(lambda_) - lambda_2 * lambda_).sum() + s += alpha_1 * log(alpha_) - alpha_2 * alpha_ + s += 0.5 * ( + fast_logdet(sigma_) + + n_samples * log(alpha_) + + np.sum(np.log(lambda_)) + ) + s -= 0.5 * (alpha_ * sse_ + (lambda_ * coef_**2).sum()) + self.scores_.append(s) + + # Check for convergence + if iter_ > 0 and np.sum(np.abs(coef_old_ - coef_)) < self.tol: + if verbose: + print("Converged after %s iterations" % iter_) + break + coef_old_ = np.copy(coef_) + + if not keep_lambda.any(): + break + + self.n_iter_ = iter_ + 1 + + if keep_lambda.any(): + # update sigma and mu using updated params from the last iteration + sigma_ = update_sigma(X, alpha_, lambda_, keep_lambda) + coef_ = update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_) + else: + sigma_ = np.array([]).reshape(0, 0) + + self.coef_ = coef_ + self.alpha_ = alpha_ + self.sigma_ = sigma_ + self.lambda_ = lambda_ + self._set_intercept(X_offset_, y_offset_, X_scale_) + return self + + def _update_sigma_woodbury(self, X, alpha_, lambda_, keep_lambda): + # See slides as referenced in the docstring note + # this function is used when n_samples < n_features and will invert + # a matrix of shape (n_samples, n_samples) making use of the + # woodbury formula: + # https://en.wikipedia.org/wiki/Woodbury_matrix_identity + n_samples = X.shape[0] + X_keep = X[:, keep_lambda] + inv_lambda = 1 / lambda_[keep_lambda].reshape(1, -1) + sigma_ = pinvh( + np.eye(n_samples, dtype=X.dtype) / alpha_ + + np.dot(X_keep * inv_lambda, X_keep.T) + ) + sigma_ = np.dot(sigma_, X_keep * inv_lambda) + sigma_ = -np.dot(inv_lambda.reshape(-1, 1) * X_keep.T, sigma_) + sigma_[np.diag_indices(sigma_.shape[1])] += 1.0 / lambda_[keep_lambda] + return sigma_ + + def _update_sigma(self, X, alpha_, lambda_, keep_lambda): + # See slides as referenced in the docstring note + # this function is used when n_samples >= n_features and will + # invert a matrix of shape (n_features, n_features) + X_keep = X[:, keep_lambda] + gram = np.dot(X_keep.T, X_keep) + eye = np.eye(gram.shape[0], dtype=X.dtype) + sigma_inv = lambda_[keep_lambda] * eye + alpha_ * gram + sigma_ = pinvh(sigma_inv) + return sigma_ + + def predict(self, X, return_std=False): + """Predict using the linear model. + + In addition to the mean of the predictive distribution, also its + standard deviation can be returned. 
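+        Features pruned during fit (those with ``lambda_ >= threshold_lambda``)
+        are excluded when computing this standard deviation.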
+ + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Samples. + + return_std : bool, default=False + Whether to return the standard deviation of posterior prediction. + + Returns + ------- + y_mean : array-like of shape (n_samples,) + Mean of predictive distribution of query points. + + y_std : array-like of shape (n_samples,) + Standard deviation of predictive distribution of query points. + """ + y_mean = self._decision_function(X) + if return_std is False: + return y_mean + else: + col_index = self.lambda_ < self.threshold_lambda + X = _safe_indexing(X, indices=col_index, axis=1) + sigmas_squared_data = (np.dot(X, self.sigma_) * X).sum(axis=1) + y_std = np.sqrt(sigmas_squared_data + (1.0 / self.alpha_)) + return y_mean, y_std diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_cd_fast.pyx b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_cd_fast.pyx new file mode 100644 index 0000000000000000000000000000000000000000..ce598ebb011d216ffdbbd70cbed507ad14bdb848 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_cd_fast.pyx @@ -0,0 +1,962 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from libc.math cimport fabs +import numpy as np + +from cython cimport floating +import warnings +from ..exceptions import ConvergenceWarning + +from ..utils._cython_blas cimport ( + _axpy, _dot, _asum, _gemv, _nrm2, _copy, _scal +) +from ..utils._cython_blas cimport ColMajor, Trans, NoTrans +from ..utils._typedefs cimport uint32_t +from ..utils._random cimport our_rand_r + + +# The following two functions are shamelessly copied from the tree code. + +cdef enum: + # Max value for our rand_r replacement (near the bottom). + # We don't use RAND_MAX because it's different across platforms and + # particularly tiny on Windows/MSVC. + # It corresponds to the maximum representable value for + # 32-bit signed integers (i.e. 2^31 - 1). + RAND_R_MAX = 2147483647 + + +cdef inline uint32_t rand_int(uint32_t end, uint32_t* random_state) noexcept nogil: + """Generate a random integer in [0; end).""" + return our_rand_r(random_state) % end + + +cdef inline floating fmax(floating x, floating y) noexcept nogil: + if x > y: + return x + return y + + +cdef inline floating fsign(floating f) noexcept nogil: + if f == 0: + return 0 + elif f > 0: + return 1.0 + else: + return -1.0 + + +cdef floating abs_max(int n, const floating* a) noexcept nogil: + """np.max(np.abs(a))""" + cdef int i + cdef floating m = fabs(a[0]) + cdef floating d + for i in range(1, n): + d = fabs(a[i]) + if d > m: + m = d + return m + + +cdef floating max(int n, floating* a) noexcept nogil: + """np.max(a)""" + cdef int i + cdef floating m = a[0] + cdef floating d + for i in range(1, n): + d = a[i] + if d > m: + m = d + return m + + +cdef floating diff_abs_max(int n, const floating* a, floating* b) noexcept nogil: + """np.max(np.abs(a - b))""" + cdef int i + cdef floating m = fabs(a[0] - b[0]) + cdef floating d + for i in range(1, n): + d = fabs(a[i] - b[i]) + if d > m: + m = d + return m + + +message_conv = ( + "Objective did not converge. You might want to increase " + "the number of iterations, check the scale of the " + "features or consider increasing regularisation." +) + + +message_ridge = ( + "Linear regression models with a zero l1 penalization " + "strength are more efficiently fitted using one of the " + "solvers implemented in " + "sklearn.linear_model.Ridge/RidgeCV instead." 
+) + + +def enet_coordinate_descent( + floating[::1] w, + floating alpha, + floating beta, + const floating[::1, :] X, + const floating[::1] y, + unsigned int max_iter, + floating tol, + object rng, + bint random=0, + bint positive=0 +): + """Cython version of the coordinate descent algorithm + for Elastic-Net regression + + We minimize + + (1/2) * norm(y - X w, 2)^2 + alpha norm(w, 1) + (beta/2) norm(w, 2)^2 + + Returns + ------- + w : ndarray of shape (n_features,) + ElasticNet coefficients. + gap : float + Achieved dual gap. + tol : float + Equals input `tol` times `np.dot(y, y)`. The tolerance used for the dual gap. + n_iter : int + Number of coordinate descent iterations. + """ + + if floating is float: + dtype = np.float32 + else: + dtype = np.float64 + + # get the data information into easy vars + cdef unsigned int n_samples = X.shape[0] + cdef unsigned int n_features = X.shape[1] + + # compute norms of the columns of X + cdef floating[::1] norm_cols_X = np.square(X).sum(axis=0) + + # initial value of the residuals + cdef floating[::1] R = np.empty(n_samples, dtype=dtype) + cdef floating[::1] XtA = np.empty(n_features, dtype=dtype) + + cdef floating tmp + cdef floating w_ii + cdef floating d_w_max + cdef floating w_max + cdef floating d_w_ii + cdef floating gap = tol + 1.0 + cdef floating d_w_tol = tol + cdef floating dual_norm_XtA + cdef floating R_norm2 + cdef floating w_norm2 + cdef floating l1_norm + cdef floating const_ + cdef floating A_norm2 + cdef unsigned int ii + cdef unsigned int n_iter = 0 + cdef unsigned int f_iter + cdef uint32_t rand_r_state_seed = rng.randint(0, RAND_R_MAX) + cdef uint32_t* rand_r_state = &rand_r_state_seed + + if alpha == 0 and beta == 0: + warnings.warn("Coordinate descent with no regularization may lead to " + "unexpected results and is discouraged.") + + with nogil: + # R = y - np.dot(X, w) + _copy(n_samples, &y[0], 1, &R[0], 1) + _gemv(ColMajor, NoTrans, n_samples, n_features, -1.0, &X[0, 0], + n_samples, &w[0], 1, 1.0, &R[0], 1) + + # tol *= np.dot(y, y) + tol *= _dot(n_samples, &y[0], 1, &y[0], 1) + + for n_iter in range(max_iter): + w_max = 0.0 + d_w_max = 0.0 + for f_iter in range(n_features): # Loop over coordinates + if random: + ii = rand_int(n_features, rand_r_state) + else: + ii = f_iter + + if norm_cols_X[ii] == 0.0: + continue + + w_ii = w[ii] # Store previous value + + if w_ii != 0.0: + # R += w_ii * X[:,ii] + _axpy(n_samples, w_ii, &X[0, ii], 1, &R[0], 1) + + # tmp = (X[:,ii]*R).sum() + tmp = _dot(n_samples, &X[0, ii], 1, &R[0], 1) + + if positive and tmp < 0: + w[ii] = 0.0 + else: + w[ii] = (fsign(tmp) * fmax(fabs(tmp) - alpha, 0) + / (norm_cols_X[ii] + beta)) + + if w[ii] != 0.0: + # R -= w[ii] * X[:,ii] # Update residual + _axpy(n_samples, -w[ii], &X[0, ii], 1, &R[0], 1) + + # update the maximum absolute coefficient update + d_w_ii = fabs(w[ii] - w_ii) + d_w_max = fmax(d_w_max, d_w_ii) + + w_max = fmax(w_max, fabs(w[ii])) + + if ( + w_max == 0.0 + or d_w_max / w_max < d_w_tol + or n_iter == max_iter - 1 + ): + # the biggest coordinate update of this iteration was smaller + # than the tolerance: check the duality gap as ultimate + # stopping criterion + + # XtA = np.dot(X.T, R) - beta * w + _copy(n_features, &w[0], 1, &XtA[0], 1) + _gemv(ColMajor, Trans, + n_samples, n_features, 1.0, &X[0, 0], n_samples, + &R[0], 1, + -beta, &XtA[0], 1) + + if positive: + dual_norm_XtA = max(n_features, &XtA[0]) + else: + dual_norm_XtA = abs_max(n_features, &XtA[0]) + + # R_norm2 = np.dot(R, R) + R_norm2 = _dot(n_samples, &R[0], 1, &R[0], 1) + + # 
w_norm2 = np.dot(w, w) + w_norm2 = _dot(n_features, &w[0], 1, &w[0], 1) + + if (dual_norm_XtA > alpha): + const_ = alpha / dual_norm_XtA + A_norm2 = R_norm2 * (const_ ** 2) + gap = 0.5 * (R_norm2 + A_norm2) + else: + const_ = 1.0 + gap = R_norm2 + + l1_norm = _asum(n_features, &w[0], 1) + + gap += (alpha * l1_norm + - const_ * _dot(n_samples, &R[0], 1, &y[0], 1) # np.dot(R.T, y) + + 0.5 * beta * (1 + const_ ** 2) * (w_norm2)) + + if gap < tol: + # return if we reached desired tolerance + break + + else: + # for/else, runs if for doesn't end with a `break` + with gil: + message = ( + message_conv + + f" Duality gap: {gap:.3e}, tolerance: {tol:.3e}" + ) + if alpha < np.finfo(np.float64).eps: + message += "\n" + message_ridge + warnings.warn(message, ConvergenceWarning) + + return np.asarray(w), gap, tol, n_iter + 1 + + +def sparse_enet_coordinate_descent( + floating[::1] w, + floating alpha, + floating beta, + const floating[::1] X_data, + const int[::1] X_indices, + const int[::1] X_indptr, + const floating[::1] y, + const floating[::1] sample_weight, + const floating[::1] X_mean, + unsigned int max_iter, + floating tol, + object rng, + bint random=0, + bint positive=0, +): + """Cython version of the coordinate descent algorithm for Elastic-Net + + We minimize: + + 1/2 * norm(y - Z w, 2)^2 + alpha * norm(w, 1) + (beta/2) * norm(w, 2)^2 + + where Z = X - X_mean. + With sample weights sw, this becomes + + 1/2 * sum(sw * (y - Z w)^2, axis=0) + alpha * norm(w, 1) + + (beta/2) * norm(w, 2)^2 + + and X_mean is the weighted average of X (per column). + + Returns + ------- + w : ndarray of shape (n_features,) + ElasticNet coefficients. + gap : float + Achieved dual gap. + tol : float + Equals input `tol` times `np.dot(y, y)`. The tolerance used for the dual gap. + n_iter : int + Number of coordinate descent iterations. + """ + # Notes for sample_weight: + # For dense X, one centers X and y and then rescales them by sqrt(sample_weight). + # Here, for sparse X, we get the sample_weight averaged center X_mean. We take care + # that every calculation results as if we had rescaled y and X (and therefore also + # X_mean) by sqrt(sample_weight) without actually calculating the square root. 
+ # We work with: + # yw = sample_weight * y + # R = sample_weight * residual + # norm_cols_X = np.sum(sample_weight * (X - X_mean)**2, axis=0) + + if floating is float: + dtype = np.float32 + else: + dtype = np.float64 + + # get the data information into easy vars + cdef unsigned int n_samples = y.shape[0] + cdef unsigned int n_features = w.shape[0] + + # compute norms of the columns of X + cdef floating[:] norm_cols_X = np.zeros(n_features, dtype=dtype) + + # initial value of the residuals + # R = y - Zw, weighted version R = sample_weight * (y - Zw) + cdef floating[::1] R + cdef floating[::1] XtA = np.empty(n_features, dtype=dtype) + cdef const floating[::1] yw + + cdef floating tmp + cdef floating w_ii + cdef floating d_w_max + cdef floating w_max + cdef floating d_w_ii + cdef floating gap = tol + 1.0 + cdef floating d_w_tol = tol + cdef floating dual_norm_XtA + cdef floating X_mean_ii + cdef floating R_sum = 0.0 + cdef floating R_norm2 + cdef floating w_norm2 + cdef floating l1_norm + cdef floating const_ + cdef floating A_norm2 + cdef floating normalize_sum + cdef unsigned int ii + cdef unsigned int jj + cdef unsigned int n_iter = 0 + cdef unsigned int f_iter + cdef unsigned int startptr = X_indptr[0] + cdef unsigned int endptr + cdef uint32_t rand_r_state_seed = rng.randint(0, RAND_R_MAX) + cdef uint32_t* rand_r_state = &rand_r_state_seed + cdef bint center = False + cdef bint no_sample_weights = sample_weight is None + cdef int kk + + if no_sample_weights: + yw = y + R = y.copy() + else: + yw = np.multiply(sample_weight, y) + R = yw.copy() + + with nogil: + # center = (X_mean != 0).any() + for ii in range(n_features): + if X_mean[ii]: + center = True + break + + # R = y - np.dot(X, w) + for ii in range(n_features): + X_mean_ii = X_mean[ii] + endptr = X_indptr[ii + 1] + normalize_sum = 0.0 + w_ii = w[ii] + + if no_sample_weights: + for jj in range(startptr, endptr): + normalize_sum += (X_data[jj] - X_mean_ii) ** 2 + R[X_indices[jj]] -= X_data[jj] * w_ii + norm_cols_X[ii] = normalize_sum + \ + (n_samples - endptr + startptr) * X_mean_ii ** 2 + if center: + for jj in range(n_samples): + R[jj] += X_mean_ii * w_ii + R_sum += R[jj] + else: + # R = sw * (y - np.dot(X, w)) + for jj in range(startptr, endptr): + tmp = sample_weight[X_indices[jj]] + # second term will be subtracted by loop over range(n_samples) + normalize_sum += (tmp * (X_data[jj] - X_mean_ii) ** 2 + - tmp * X_mean_ii ** 2) + R[X_indices[jj]] -= tmp * X_data[jj] * w_ii + if center: + for jj in range(n_samples): + normalize_sum += sample_weight[jj] * X_mean_ii ** 2 + R[jj] += sample_weight[jj] * X_mean_ii * w_ii + R_sum += R[jj] + norm_cols_X[ii] = normalize_sum + startptr = endptr + + # Note: No need to update R_sum from here on because the update terms cancel + # each other: w_ii * np.sum(X[:,ii] - X_mean[ii]) = 0. R_sum is only ever + # needed and calculated if X_mean is provided. 
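+
+        # Editorial cross-reference: when this solver is invoked from enet_path /
+        # ElasticNet.fit (see _coordinate_descent.py below), alpha and beta are
+        # passed as estimator_alpha * l1_ratio * n_samples and
+        # estimator_alpha * (1 - l1_ratio) * n_samples respectively, so the
+        # objective optimized here is n_samples times the estimator's documented
+        # objective.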
+ + # tol *= np.dot(y, y) + # with sample weights: tol *= y @ (sw * y) + tol *= _dot(n_samples, &y[0], 1, &yw[0], 1) + + for n_iter in range(max_iter): + + w_max = 0.0 + d_w_max = 0.0 + + for f_iter in range(n_features): # Loop over coordinates + if random: + ii = rand_int(n_features, rand_r_state) + else: + ii = f_iter + + if norm_cols_X[ii] == 0.0: + continue + + startptr = X_indptr[ii] + endptr = X_indptr[ii + 1] + w_ii = w[ii] # Store previous value + X_mean_ii = X_mean[ii] + + if w_ii != 0.0: + # R += w_ii * X[:,ii] + if no_sample_weights: + for jj in range(startptr, endptr): + R[X_indices[jj]] += X_data[jj] * w_ii + if center: + for jj in range(n_samples): + R[jj] -= X_mean_ii * w_ii + else: + for jj in range(startptr, endptr): + tmp = sample_weight[X_indices[jj]] + R[X_indices[jj]] += tmp * X_data[jj] * w_ii + if center: + for jj in range(n_samples): + R[jj] -= sample_weight[jj] * X_mean_ii * w_ii + + # tmp = (X[:,ii] * R).sum() + tmp = 0.0 + for jj in range(startptr, endptr): + tmp += R[X_indices[jj]] * X_data[jj] + + if center: + tmp -= R_sum * X_mean_ii + + if positive and tmp < 0.0: + w[ii] = 0.0 + else: + w[ii] = fsign(tmp) * fmax(fabs(tmp) - alpha, 0) \ + / (norm_cols_X[ii] + beta) + + if w[ii] != 0.0: + # R -= w[ii] * X[:,ii] # Update residual + if no_sample_weights: + for jj in range(startptr, endptr): + R[X_indices[jj]] -= X_data[jj] * w[ii] + if center: + for jj in range(n_samples): + R[jj] += X_mean_ii * w[ii] + else: + for jj in range(startptr, endptr): + tmp = sample_weight[X_indices[jj]] + R[X_indices[jj]] -= tmp * X_data[jj] * w[ii] + if center: + for jj in range(n_samples): + R[jj] += sample_weight[jj] * X_mean_ii * w[ii] + + # update the maximum absolute coefficient update + d_w_ii = fabs(w[ii] - w_ii) + d_w_max = fmax(d_w_max, d_w_ii) + + w_max = fmax(w_max, fabs(w[ii])) + + if w_max == 0.0 or d_w_max / w_max < d_w_tol or n_iter == max_iter - 1: + # the biggest coordinate update of this iteration was smaller than + # the tolerance: check the duality gap as ultimate stopping + # criterion + + # XtA = X.T @ R - beta * w + # sparse X.T / dense R dot product + for ii in range(n_features): + XtA[ii] = 0.0 + for kk in range(X_indptr[ii], X_indptr[ii + 1]): + XtA[ii] += X_data[kk] * R[X_indices[kk]] + + if center: + XtA[ii] -= X_mean[ii] * R_sum + XtA[ii] -= beta * w[ii] + + if positive: + dual_norm_XtA = max(n_features, &XtA[0]) + else: + dual_norm_XtA = abs_max(n_features, &XtA[0]) + + # R_norm2 = np.dot(R, R) + if no_sample_weights: + R_norm2 = _dot(n_samples, &R[0], 1, &R[0], 1) + else: + R_norm2 = 0.0 + for jj in range(n_samples): + # R is already multiplied by sample_weight + if sample_weight[jj] != 0: + R_norm2 += (R[jj] ** 2) / sample_weight[jj] + + # w_norm2 = np.dot(w, w) + w_norm2 = _dot(n_features, &w[0], 1, &w[0], 1) + if (dual_norm_XtA > alpha): + const_ = alpha / dual_norm_XtA + A_norm2 = R_norm2 * const_**2 + gap = 0.5 * (R_norm2 + A_norm2) + else: + const_ = 1.0 + gap = R_norm2 + + l1_norm = _asum(n_features, &w[0], 1) + + gap += (alpha * l1_norm + - const_ * _dot(n_samples, &R[0], 1, &y[0], 1) # np.dot(R.T, y) + + 0.5 * beta * (1 + const_ ** 2) * w_norm2) + + if gap < tol: + # return if we reached desired tolerance + break + + else: + # for/else, runs if for doesn't end with a `break` + with gil: + message = ( + message_conv + + f" Duality gap: {gap:.3e}, tolerance: {tol:.3e}" + ) + if alpha < np.finfo(np.float64).eps: + message += "\n" + message_ridge + warnings.warn(message, ConvergenceWarning) + + return np.asarray(w), gap, tol, n_iter + 1 + + +def 
enet_coordinate_descent_gram( + floating[::1] w, + floating alpha, + floating beta, + const floating[:, ::1] Q, + const floating[::1] q, + const floating[:] y, + unsigned int max_iter, + floating tol, + object rng, + bint random=0, + bint positive=0 +): + """Cython version of the coordinate descent algorithm + for Elastic-Net regression + + We minimize + + (1/2) * w^T Q w - q^T w + alpha norm(w, 1) + (beta/2) * norm(w, 2)^2 + + which amount to the Elastic-Net problem when: + Q = X^T X (Gram matrix) + q = X^T y + + Returns + ------- + w : ndarray of shape (n_features,) + ElasticNet coefficients. + gap : float + Achieved dual gap. + tol : float + Equals input `tol` times `np.dot(y, y)`. The tolerance used for the dual gap. + n_iter : int + Number of coordinate descent iterations. + """ + + if floating is float: + dtype = np.float32 + else: + dtype = np.float64 + + # get the data information into easy vars + cdef unsigned int n_features = Q.shape[0] + + # initial value "Q w" which will be kept of up to date in the iterations + cdef floating[:] H = np.dot(Q, w) + + cdef floating[:] XtA = np.zeros(n_features, dtype=dtype) + cdef floating tmp + cdef floating w_ii + cdef floating d_w_max + cdef floating w_max + cdef floating d_w_ii + cdef floating q_dot_w + cdef floating w_norm2 + cdef floating gap = tol + 1.0 + cdef floating d_w_tol = tol + cdef floating dual_norm_XtA + cdef unsigned int ii + cdef unsigned int n_iter = 0 + cdef unsigned int f_iter + cdef uint32_t rand_r_state_seed = rng.randint(0, RAND_R_MAX) + cdef uint32_t* rand_r_state = &rand_r_state_seed + + cdef floating y_norm2 = np.dot(y, y) + cdef floating* w_ptr = &w[0] + cdef const floating* Q_ptr = &Q[0, 0] + cdef const floating* q_ptr = &q[0] + cdef floating* H_ptr = &H[0] + cdef floating* XtA_ptr = &XtA[0] + tol = tol * y_norm2 + + if alpha == 0: + warnings.warn( + "Coordinate descent without L1 regularization may " + "lead to unexpected results and is discouraged. " + "Set l1_ratio > 0 to add L1 regularization." 
+ ) + + with nogil: + for n_iter in range(max_iter): + w_max = 0.0 + d_w_max = 0.0 + for f_iter in range(n_features): # Loop over coordinates + if random: + ii = rand_int(n_features, rand_r_state) + else: + ii = f_iter + + if Q[ii, ii] == 0.0: + continue + + w_ii = w[ii] # Store previous value + + if w_ii != 0.0: + # H -= w_ii * Q[ii] + _axpy(n_features, -w_ii, Q_ptr + ii * n_features, 1, + H_ptr, 1) + + tmp = q[ii] - H[ii] + + if positive and tmp < 0: + w[ii] = 0.0 + else: + w[ii] = fsign(tmp) * fmax(fabs(tmp) - alpha, 0) \ + / (Q[ii, ii] + beta) + + if w[ii] != 0.0: + # H += w[ii] * Q[ii] # Update H = X.T X w + _axpy(n_features, w[ii], Q_ptr + ii * n_features, 1, + H_ptr, 1) + + # update the maximum absolute coefficient update + d_w_ii = fabs(w[ii] - w_ii) + if d_w_ii > d_w_max: + d_w_max = d_w_ii + + if fabs(w[ii]) > w_max: + w_max = fabs(w[ii]) + + if w_max == 0.0 or d_w_max / w_max < d_w_tol or n_iter == max_iter - 1: + # the biggest coordinate update of this iteration was smaller than + # the tolerance: check the duality gap as ultimate stopping + # criterion + + # q_dot_w = np.dot(w, q) + q_dot_w = _dot(n_features, w_ptr, 1, q_ptr, 1) + + for ii in range(n_features): + XtA[ii] = q[ii] - H[ii] - beta * w[ii] + if positive: + dual_norm_XtA = max(n_features, XtA_ptr) + else: + dual_norm_XtA = abs_max(n_features, XtA_ptr) + + # temp = np.sum(w * H) + tmp = 0.0 + for ii in range(n_features): + tmp += w[ii] * H[ii] + R_norm2 = y_norm2 + tmp - 2.0 * q_dot_w + + # w_norm2 = np.dot(w, w) + w_norm2 = _dot(n_features, &w[0], 1, &w[0], 1) + + if (dual_norm_XtA > alpha): + const_ = alpha / dual_norm_XtA + A_norm2 = R_norm2 * (const_ ** 2) + gap = 0.5 * (R_norm2 + A_norm2) + else: + const_ = 1.0 + gap = R_norm2 + + # The call to asum is equivalent to the L1 norm of w + gap += ( + alpha * _asum(n_features, &w[0], 1) + - const_ * y_norm2 + + const_ * q_dot_w + + 0.5 * beta * (1 + const_ ** 2) * w_norm2 + ) + + if gap < tol: + # return if we reached desired tolerance + break + + else: + # for/else, runs if for doesn't end with a `break` + with gil: + message = ( + message_conv + + f" Duality gap: {gap:.3e}, tolerance: {tol:.3e}" + ) + warnings.warn(message, ConvergenceWarning) + + return np.asarray(w), gap, tol, n_iter + 1 + + +def enet_coordinate_descent_multi_task( + const floating[::1, :] W, + floating l1_reg, + floating l2_reg, + const floating[::1, :] X, + const floating[::1, :] Y, + unsigned int max_iter, + floating tol, + object rng, + bint random=0 +): + """Cython version of the coordinate descent algorithm + for Elastic-Net multi-task regression + + We minimize + + 0.5 * norm(Y - X W.T, 2)^2 + l1_reg ||W.T||_21 + 0.5 * l2_reg norm(W.T, 2)^2 + + Returns + ------- + W : ndarray of shape (n_tasks, n_features) + ElasticNet coefficients. + gap : float + Achieved dual gap. + tol : float + Equals input `tol` times `np.dot(y, y)`. The tolerance used for the dual gap. + n_iter : int + Number of coordinate descent iterations. 
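+
+    Notes
+    -----
+    Editorial note: ``||W.T||_21`` sums, over features, the Euclidean norm of
+    each feature's coefficients across tasks, so a feature is either selected
+    for every task or zeroed out for all of them.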
+ """ + + if floating is float: + dtype = np.float32 + else: + dtype = np.float64 + + # get the data information into easy vars + cdef unsigned int n_samples = X.shape[0] + cdef unsigned int n_features = X.shape[1] + cdef unsigned int n_tasks = Y.shape[1] + + # to store XtA + cdef floating[:, ::1] XtA = np.zeros((n_features, n_tasks), dtype=dtype) + cdef floating XtA_axis1norm + cdef floating dual_norm_XtA + + # initial value of the residuals + cdef floating[::1, :] R = np.zeros((n_samples, n_tasks), dtype=dtype, order='F') + + cdef floating[::1] norm_cols_X = np.zeros(n_features, dtype=dtype) + cdef floating[::1] tmp = np.zeros(n_tasks, dtype=dtype) + cdef floating[::1] w_ii = np.zeros(n_tasks, dtype=dtype) + cdef floating d_w_max + cdef floating w_max + cdef floating d_w_ii + cdef floating nn + cdef floating W_ii_abs_max + cdef floating gap = tol + 1.0 + cdef floating d_w_tol = tol + cdef floating R_norm + cdef floating w_norm + cdef floating ry_sum + cdef floating l21_norm + cdef unsigned int ii + cdef unsigned int jj + cdef unsigned int n_iter = 0 + cdef unsigned int f_iter + cdef uint32_t rand_r_state_seed = rng.randint(0, RAND_R_MAX) + cdef uint32_t* rand_r_state = &rand_r_state_seed + + cdef const floating* X_ptr = &X[0, 0] + cdef const floating* Y_ptr = &Y[0, 0] + + if l1_reg == 0: + warnings.warn( + "Coordinate descent with l1_reg=0 may lead to unexpected" + " results and is discouraged." + ) + + with nogil: + # norm_cols_X = (np.asarray(X) ** 2).sum(axis=0) + for ii in range(n_features): + norm_cols_X[ii] = _nrm2(n_samples, X_ptr + ii * n_samples, 1) ** 2 + + # R = Y - np.dot(X, W.T) + _copy(n_samples * n_tasks, Y_ptr, 1, &R[0, 0], 1) + for ii in range(n_features): + for jj in range(n_tasks): + if W[jj, ii] != 0: + _axpy(n_samples, -W[jj, ii], X_ptr + ii * n_samples, 1, + &R[0, jj], 1) + + # tol = tol * linalg.norm(Y, ord='fro') ** 2 + tol = tol * _nrm2(n_samples * n_tasks, Y_ptr, 1) ** 2 + + for n_iter in range(max_iter): + w_max = 0.0 + d_w_max = 0.0 + for f_iter in range(n_features): # Loop over coordinates + if random: + ii = rand_int(n_features, rand_r_state) + else: + ii = f_iter + + if norm_cols_X[ii] == 0.0: + continue + + # w_ii = W[:, ii] # Store previous value + _copy(n_tasks, &W[0, ii], 1, &w_ii[0], 1) + + # Using Numpy: + # R += np.dot(X[:, ii][:, None], w_ii[None, :]) # rank 1 update + # Using Blas Level2: + # _ger(RowMajor, n_samples, n_tasks, 1.0, + # &X[0, ii], 1, + # &w_ii[0], 1, &R[0, 0], n_tasks) + # Using Blas Level1 and for loop to avoid slower threads + # for such small vectors + for jj in range(n_tasks): + if w_ii[jj] != 0: + _axpy(n_samples, w_ii[jj], X_ptr + ii * n_samples, 1, + &R[0, jj], 1) + + # Using numpy: + # tmp = np.dot(X[:, ii][None, :], R).ravel() + # Using BLAS Level 2: + # _gemv(RowMajor, Trans, n_samples, n_tasks, 1.0, &R[0, 0], + # n_tasks, &X[0, ii], 1, 0.0, &tmp[0], 1) + # Using BLAS Level 1 (faster for small vectors like here): + for jj in range(n_tasks): + tmp[jj] = _dot(n_samples, X_ptr + ii * n_samples, 1, + &R[0, jj], 1) + + # nn = sqrt(np.sum(tmp ** 2)) + nn = _nrm2(n_tasks, &tmp[0], 1) + + # W[:, ii] = tmp * fmax(1. - l1_reg / nn, 0) / (norm_cols_X[ii] + l2_reg) + _copy(n_tasks, &tmp[0], 1, &W[0, ii], 1) + _scal(n_tasks, fmax(1. 
- l1_reg / nn, 0) / (norm_cols_X[ii] + l2_reg), + &W[0, ii], 1) + + # Using numpy: + # R -= np.dot(X[:, ii][:, None], W[:, ii][None, :]) + # Using BLAS Level 2: + # Update residual : rank 1 update + # _ger(RowMajor, n_samples, n_tasks, -1.0, + # &X[0, ii], 1, &W[0, ii], 1, + # &R[0, 0], n_tasks) + # Using BLAS Level 1 (faster for small vectors like here): + for jj in range(n_tasks): + if W[jj, ii] != 0: + _axpy(n_samples, -W[jj, ii], X_ptr + ii * n_samples, 1, + &R[0, jj], 1) + + # update the maximum absolute coefficient update + d_w_ii = diff_abs_max(n_tasks, &W[0, ii], &w_ii[0]) + + if d_w_ii > d_w_max: + d_w_max = d_w_ii + + W_ii_abs_max = abs_max(n_tasks, &W[0, ii]) + if W_ii_abs_max > w_max: + w_max = W_ii_abs_max + + if w_max == 0.0 or d_w_max / w_max < d_w_tol or n_iter == max_iter - 1: + # the biggest coordinate update of this iteration was smaller than + # the tolerance: check the duality gap as ultimate stopping + # criterion + + # XtA = np.dot(X.T, R) - l2_reg * W.T + for ii in range(n_features): + for jj in range(n_tasks): + XtA[ii, jj] = _dot( + n_samples, X_ptr + ii * n_samples, 1, &R[0, jj], 1 + ) - l2_reg * W[jj, ii] + + # dual_norm_XtA = np.max(np.sqrt(np.sum(XtA ** 2, axis=1))) + dual_norm_XtA = 0.0 + for ii in range(n_features): + # np.sqrt(np.sum(XtA ** 2, axis=1)) + XtA_axis1norm = _nrm2(n_tasks, &XtA[ii, 0], 1) + if XtA_axis1norm > dual_norm_XtA: + dual_norm_XtA = XtA_axis1norm + + # TODO: use squared L2 norm directly + # R_norm = linalg.norm(R, ord='fro') + # w_norm = linalg.norm(W, ord='fro') + R_norm = _nrm2(n_samples * n_tasks, &R[0, 0], 1) + w_norm = _nrm2(n_features * n_tasks, &W[0, 0], 1) + if (dual_norm_XtA > l1_reg): + const_ = l1_reg / dual_norm_XtA + A_norm = R_norm * const_ + gap = 0.5 * (R_norm ** 2 + A_norm ** 2) + else: + const_ = 1.0 + gap = R_norm ** 2 + + # ry_sum = np.sum(R * y) + ry_sum = _dot(n_samples * n_tasks, &R[0, 0], 1, &Y[0, 0], 1) + + # l21_norm = np.sqrt(np.sum(W ** 2, axis=0)).sum() + l21_norm = 0.0 + for ii in range(n_features): + l21_norm += _nrm2(n_tasks, &W[0, ii], 1) + + gap += ( + l1_reg * l21_norm + - const_ * ry_sum + + 0.5 * l2_reg * (1 + const_ ** 2) * (w_norm ** 2) + ) + + if gap <= tol: + # return if we reached desired tolerance + break + else: + # for/else, runs if for doesn't end with a `break` + with gil: + message = ( + message_conv + + f" Duality gap: {gap:.3e}, tolerance: {tol:.3e}" + ) + warnings.warn(message, ConvergenceWarning) + + return np.asarray(W), gap, tol, n_iter + 1 diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_coordinate_descent.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_coordinate_descent.py new file mode 100644 index 0000000000000000000000000000000000000000..940ae6f5e3a3010f7fe2f21d28d68f538d893d8c --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_coordinate_descent.py @@ -0,0 +1,3403 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numbers +import sys +import warnings +from abc import ABC, abstractmethod +from functools import partial +from numbers import Integral, Real + +import numpy as np +from joblib import effective_n_jobs +from scipy import sparse + +from sklearn.utils import metadata_routing + +from ..base import MultiOutputMixin, RegressorMixin, _fit_context +from ..model_selection import check_cv +from ..utils import Bunch, check_array, check_scalar +from ..utils._metadata_requests import ( + MetadataRouter, + MethodMapping, + _raise_for_params, + get_routing_for_object, +) +from 
..utils._param_validation import Hidden, Interval, StrOptions, validate_params +from ..utils.extmath import safe_sparse_dot +from ..utils.metadata_routing import ( + _routing_enabled, + process_routing, +) +from ..utils.parallel import Parallel, delayed +from ..utils.validation import ( + _check_sample_weight, + check_consistent_length, + check_is_fitted, + check_random_state, + column_or_1d, + has_fit_parameter, + validate_data, +) + +# mypy error: Module 'sklearn.linear_model' has no attribute '_cd_fast' +from . import _cd_fast as cd_fast # type: ignore[attr-defined] +from ._base import LinearModel, _pre_fit, _preprocess_data + + +def _set_order(X, y, order="C"): + """Change the order of X and y if necessary. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : ndarray of shape (n_samples,) + Target values. + + order : {None, 'C', 'F'} + If 'C', dense arrays are returned as C-ordered, sparse matrices in csr + format. If 'F', dense arrays are return as F-ordered, sparse matrices + in csc format. + + Returns + ------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data with guaranteed order. + + y : ndarray of shape (n_samples,) + Target values with guaranteed order. + """ + if order not in [None, "C", "F"]: + raise ValueError( + "Unknown value for order. Got {} instead of None, 'C' or 'F'.".format(order) + ) + sparse_X = sparse.issparse(X) + sparse_y = sparse.issparse(y) + if order is not None: + sparse_format = "csc" if order == "F" else "csr" + if sparse_X: + X = X.asformat(sparse_format, copy=False) + else: + X = np.asarray(X, order=order) + if sparse_y: + y = y.asformat(sparse_format) + else: + y = np.asarray(y, order=order) + return X, y + + +############################################################################### +# Paths functions + + +def _alpha_grid( + X, + y, + Xy=None, + l1_ratio=1.0, + fit_intercept=True, + eps=1e-3, + n_alphas=100, + copy_X=True, + sample_weight=None, +): + """Compute the grid of alpha values for elastic net parameter search + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. Pass directly as Fortran-contiguous data to avoid + unnecessary memory duplication + + y : ndarray of shape (n_samples,) or (n_samples, n_outputs) + Target values + + Xy : array-like of shape (n_features,) or (n_features, n_outputs),\ + default=None + Xy = np.dot(X.T, y) that can be precomputed. + + l1_ratio : float, default=1.0 + The elastic net mixing parameter, with ``0 < l1_ratio <= 1``. + For ``l1_ratio = 0`` the penalty is an L2 penalty. (currently not + supported) ``For l1_ratio = 1`` it is an L1 penalty. For + ``0 < l1_ratio <1``, the penalty is a combination of L1 and L2. + + eps : float, default=1e-3 + Length of the path. ``eps=1e-3`` means that + ``alpha_min / alpha_max = 1e-3`` + + n_alphas : int, default=100 + Number of alphas along the regularization path + + fit_intercept : bool, default=True + Whether to fit an intercept or not + + copy_X : bool, default=True + If ``True``, X will be copied; else, it may be overwritten. + + sample_weight : ndarray of shape (n_samples,), default=None + """ + if l1_ratio == 0: + raise ValueError( + "Automatic alpha grid generation is not supported for" + " l1_ratio=0. Please supply a grid by providing " + "your estimator with the appropriate `alphas=` " + "argument." 
+ ) + if Xy is not None: + Xyw = Xy + else: + X, y, X_offset, _, _ = _preprocess_data( + X, + y, + fit_intercept=fit_intercept, + copy=copy_X, + sample_weight=sample_weight, + check_input=False, + ) + if sample_weight is not None: + if y.ndim > 1: + yw = y * sample_weight.reshape(-1, 1) + else: + yw = y * sample_weight + else: + yw = y + if sparse.issparse(X): + Xyw = safe_sparse_dot(X.T, yw, dense_output=True) - np.sum(yw) * X_offset + else: + Xyw = np.dot(X.T, yw) + + if Xyw.ndim == 1: + Xyw = Xyw[:, np.newaxis] + if sample_weight is not None: + n_samples = sample_weight.sum() + else: + n_samples = X.shape[0] + alpha_max = np.sqrt(np.sum(Xyw**2, axis=1)).max() / (n_samples * l1_ratio) + + if alpha_max <= np.finfo(np.float64).resolution: + return np.full(n_alphas, np.finfo(np.float64).resolution) + + return np.geomspace(alpha_max, alpha_max * eps, num=n_alphas) + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "y": ["array-like", "sparse matrix"], + "eps": [Interval(Real, 0, None, closed="neither")], + "n_alphas": [Interval(Integral, 1, None, closed="left")], + "alphas": ["array-like", None], + "precompute": [StrOptions({"auto"}), "boolean", "array-like"], + "Xy": ["array-like", None], + "copy_X": ["boolean"], + "coef_init": ["array-like", None], + "verbose": ["verbose"], + "return_n_iter": ["boolean"], + "positive": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def lasso_path( + X, + y, + *, + eps=1e-3, + n_alphas=100, + alphas=None, + precompute="auto", + Xy=None, + copy_X=True, + coef_init=None, + verbose=False, + return_n_iter=False, + positive=False, + **params, +): + """Compute Lasso path with coordinate descent. + + The Lasso optimization function varies for mono and multi-outputs. + + For mono-output tasks it is:: + + (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1 + + For multi-output tasks it is:: + + (1 / (2 * n_samples)) * ||Y - XW||^2_Fro + alpha * ||W||_21 + + Where:: + + ||W||_21 = \\sum_i \\sqrt{\\sum_j w_{ij}^2} + + i.e. the sum of norm of each row. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. Pass directly as Fortran-contiguous data to avoid + unnecessary memory duplication. If ``y`` is mono-output then ``X`` + can be sparse. + + y : {array-like, sparse matrix} of shape (n_samples,) or \ + (n_samples, n_targets) + Target values. + + eps : float, default=1e-3 + Length of the path. ``eps=1e-3`` means that + ``alpha_min / alpha_max = 1e-3``. + + n_alphas : int, default=100 + Number of alphas along the regularization path. + + alphas : array-like, default=None + List of alphas where to compute the models. + If ``None`` alphas are set automatically. + + precompute : 'auto', bool or array-like of shape \ + (n_features, n_features), default='auto' + Whether to use a precomputed Gram matrix to speed up + calculations. If set to ``'auto'`` let us decide. The Gram + matrix can also be passed as argument. + + Xy : array-like of shape (n_features,) or (n_features, n_targets),\ + default=None + Xy = np.dot(X.T, y) that can be precomputed. It is useful + only when the Gram matrix is precomputed. + + copy_X : bool, default=True + If ``True``, X will be copied; else, it may be overwritten. + + coef_init : array-like of shape (n_features, ), default=None + The initial values of the coefficients. + + verbose : bool or int, default=False + Amount of verbosity. 
+ + return_n_iter : bool, default=False + Whether to return the number of iterations or not. + + positive : bool, default=False + If set to True, forces coefficients to be positive. + (Only allowed when ``y.ndim == 1``). + + **params : kwargs + Keyword arguments passed to the coordinate descent solver. + + Returns + ------- + alphas : ndarray of shape (n_alphas,) + The alphas along the path where models are computed. + + coefs : ndarray of shape (n_features, n_alphas) or \ + (n_targets, n_features, n_alphas) + Coefficients along the path. + + dual_gaps : ndarray of shape (n_alphas,) + The dual gaps at the end of the optimization for each alpha. + + n_iters : list of int + The number of iterations taken by the coordinate descent optimizer to + reach the specified tolerance for each alpha. + + See Also + -------- + lars_path : Compute Least Angle Regression or Lasso path using LARS + algorithm. + Lasso : The Lasso is a linear model that estimates sparse coefficients. + LassoLars : Lasso model fit with Least Angle Regression a.k.a. Lars. + LassoCV : Lasso linear model with iterative fitting along a regularization + path. + LassoLarsCV : Cross-validated Lasso using the LARS algorithm. + sklearn.decomposition.sparse_encode : Estimator that can be used to + transform signals into sparse linear combination of atoms from a fixed. + + Notes + ----- + For an example, see + :ref:`examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py + `. + + To avoid unnecessary memory duplication the X argument of the fit method + should be directly passed as a Fortran-contiguous numpy array. + + Note that in certain cases, the Lars solver may be significantly + faster to implement this functionality. In particular, linear + interpolation can be used to retrieve model coefficients between the + values output by lars_path + + Examples + -------- + + Comparing lasso_path and lars_path with interpolation: + + >>> import numpy as np + >>> from sklearn.linear_model import lasso_path + >>> X = np.array([[1, 2, 3.1], [2.3, 5.4, 4.3]]).T + >>> y = np.array([1, 2, 3.1]) + >>> # Use lasso_path to compute a coefficient path + >>> _, coef_path, _ = lasso_path(X, y, alphas=[5., 1., .5]) + >>> print(coef_path) + [[0. 0. 0.46874778] + [0.2159048 0.4425765 0.23689075]] + + >>> # Now use lars_path and 1D linear interpolation to compute the + >>> # same path + >>> from sklearn.linear_model import lars_path + >>> alphas, active, coef_path_lars = lars_path(X, y, method='lasso') + >>> from scipy import interpolate + >>> coef_path_continuous = interpolate.interp1d(alphas[::-1], + ... coef_path_lars[:, ::-1]) + >>> print(coef_path_continuous([5., 1., .5])) + [[0. 0. 
0.46915237] + [0.2159048 0.4425765 0.23668876]] + """ + return enet_path( + X, + y, + l1_ratio=1.0, + eps=eps, + n_alphas=n_alphas, + alphas=alphas, + precompute=precompute, + Xy=Xy, + copy_X=copy_X, + coef_init=coef_init, + verbose=verbose, + positive=positive, + return_n_iter=return_n_iter, + **params, + ) + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "y": ["array-like", "sparse matrix"], + "l1_ratio": [Interval(Real, 0.0, 1.0, closed="both")], + "eps": [Interval(Real, 0.0, None, closed="neither")], + "n_alphas": [Interval(Integral, 1, None, closed="left")], + "alphas": ["array-like", None], + "precompute": [StrOptions({"auto"}), "boolean", "array-like"], + "Xy": ["array-like", None], + "copy_X": ["boolean"], + "coef_init": ["array-like", None], + "verbose": ["verbose"], + "return_n_iter": ["boolean"], + "positive": ["boolean"], + "check_input": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def enet_path( + X, + y, + *, + l1_ratio=0.5, + eps=1e-3, + n_alphas=100, + alphas=None, + precompute="auto", + Xy=None, + copy_X=True, + coef_init=None, + verbose=False, + return_n_iter=False, + positive=False, + check_input=True, + **params, +): + """Compute elastic net path with coordinate descent. + + The elastic net optimization function varies for mono and multi-outputs. + + For mono-output tasks it is:: + + 1 / (2 * n_samples) * ||y - Xw||^2_2 + + alpha * l1_ratio * ||w||_1 + + 0.5 * alpha * (1 - l1_ratio) * ||w||^2_2 + + For multi-output tasks it is:: + + (1 / (2 * n_samples)) * ||Y - XW||_Fro^2 + + alpha * l1_ratio * ||W||_21 + + 0.5 * alpha * (1 - l1_ratio) * ||W||_Fro^2 + + Where:: + + ||W||_21 = \\sum_i \\sqrt{\\sum_j w_{ij}^2} + + i.e. the sum of norm of each row. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. Pass directly as Fortran-contiguous data to avoid + unnecessary memory duplication. If ``y`` is mono-output then ``X`` + can be sparse. + + y : {array-like, sparse matrix} of shape (n_samples,) or \ + (n_samples, n_targets) + Target values. + + l1_ratio : float, default=0.5 + Number between 0 and 1 passed to elastic net (scaling between + l1 and l2 penalties). ``l1_ratio=1`` corresponds to the Lasso. + + eps : float, default=1e-3 + Length of the path. ``eps=1e-3`` means that + ``alpha_min / alpha_max = 1e-3``. + + n_alphas : int, default=100 + Number of alphas along the regularization path. + + alphas : array-like, default=None + List of alphas where to compute the models. + If None alphas are set automatically. + + precompute : 'auto', bool or array-like of shape \ + (n_features, n_features), default='auto' + Whether to use a precomputed Gram matrix to speed up + calculations. If set to ``'auto'`` let us decide. The Gram + matrix can also be passed as argument. + + Xy : array-like of shape (n_features,) or (n_features, n_targets),\ + default=None + Xy = np.dot(X.T, y) that can be precomputed. It is useful + only when the Gram matrix is precomputed. + + copy_X : bool, default=True + If ``True``, X will be copied; else, it may be overwritten. + + coef_init : array-like of shape (n_features, ), default=None + The initial values of the coefficients. + + verbose : bool or int, default=False + Amount of verbosity. + + return_n_iter : bool, default=False + Whether to return the number of iterations or not. + + positive : bool, default=False + If set to True, forces coefficients to be positive. + (Only allowed when ``y.ndim == 1``). 
+ + check_input : bool, default=True + If set to False, the input validation checks are skipped (including the + Gram matrix when provided). It is assumed that they are handled + by the caller. + + **params : kwargs + Keyword arguments passed to the coordinate descent solver. + + Returns + ------- + alphas : ndarray of shape (n_alphas,) + The alphas along the path where models are computed. + + coefs : ndarray of shape (n_features, n_alphas) or \ + (n_targets, n_features, n_alphas) + Coefficients along the path. + + dual_gaps : ndarray of shape (n_alphas,) + The dual gaps at the end of the optimization for each alpha. + + n_iters : list of int + The number of iterations taken by the coordinate descent optimizer to + reach the specified tolerance for each alpha. + (Is returned when ``return_n_iter`` is set to True). + + See Also + -------- + MultiTaskElasticNet : Multi-task ElasticNet model trained with L1/L2 mixed-norm \ + as regularizer. + MultiTaskElasticNetCV : Multi-task L1/L2 ElasticNet with built-in cross-validation. + ElasticNet : Linear regression with combined L1 and L2 priors as regularizer. + ElasticNetCV : Elastic Net model with iterative fitting along a regularization path. + + Notes + ----- + For an example, see + :ref:`examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py + `. + + Examples + -------- + >>> from sklearn.linear_model import enet_path + >>> from sklearn.datasets import make_regression + >>> X, y, true_coef = make_regression( + ... n_samples=100, n_features=5, n_informative=2, coef=True, random_state=0 + ... ) + >>> true_coef + array([ 0. , 0. , 0. , 97.9, 45.7]) + >>> alphas, estimated_coef, _ = enet_path(X, y, n_alphas=3) + >>> alphas.shape + (3,) + >>> estimated_coef + array([[ 0., 0.787, 0.568], + [ 0., 1.120, 0.620], + [-0., -2.129, -1.128], + [ 0., 23.046, 88.939], + [ 0., 10.637, 41.566]]) + """ + X_offset_param = params.pop("X_offset", None) + X_scale_param = params.pop("X_scale", None) + sample_weight = params.pop("sample_weight", None) + tol = params.pop("tol", 1e-4) + max_iter = params.pop("max_iter", 1000) + random_state = params.pop("random_state", None) + selection = params.pop("selection", "cyclic") + + if len(params) > 0: + raise ValueError("Unexpected parameters in params", params.keys()) + + # We expect X and y to be already Fortran ordered when bypassing + # checks + if check_input: + X = check_array( + X, + accept_sparse="csc", + dtype=[np.float64, np.float32], + order="F", + copy=copy_X, + ) + y = check_array( + y, + accept_sparse="csc", + dtype=X.dtype.type, + order="F", + copy=False, + ensure_2d=False, + ) + if Xy is not None: + # Xy should be a 1d contiguous array or a 2D C ordered array + Xy = check_array( + Xy, dtype=X.dtype.type, order="C", copy=False, ensure_2d=False + ) + + n_samples, n_features = X.shape + + multi_output = False + if y.ndim != 1: + multi_output = True + n_targets = y.shape[1] + + if multi_output and positive: + raise ValueError("positive=True is not allowed for multi-output (y.ndim != 1)") + + # MultiTaskElasticNet does not support sparse matrices + if not multi_output and sparse.issparse(X): + if X_offset_param is not None: + # As sparse matrices are not actually centered we need this to be passed to + # the CD solver. 
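+            # (Editorial note: this ratio is forwarded to
+            # sparse_enet_coordinate_descent as `X_mean`, which works with
+            # Z = X - X_mean implicitly so X itself stays sparse.)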
+ X_sparse_scaling = X_offset_param / X_scale_param + X_sparse_scaling = np.asarray(X_sparse_scaling, dtype=X.dtype) + else: + X_sparse_scaling = np.zeros(n_features, dtype=X.dtype) + + # X should have been passed through _pre_fit already if function is called + # from ElasticNet.fit + if check_input: + X, y, _, _, _, precompute, Xy = _pre_fit( + X, + y, + Xy, + precompute, + fit_intercept=False, + copy=False, + check_input=check_input, + ) + if alphas is None: + # No need to normalize of fit_intercept: it has been done + # above + alphas = _alpha_grid( + X, + y, + Xy=Xy, + l1_ratio=l1_ratio, + fit_intercept=False, + eps=eps, + n_alphas=n_alphas, + copy_X=False, + ) + elif len(alphas) > 1: + alphas = np.sort(alphas)[::-1] # make sure alphas are properly ordered + + n_alphas = len(alphas) + dual_gaps = np.empty(n_alphas) + n_iters = [] + + rng = check_random_state(random_state) + if selection not in ["random", "cyclic"]: + raise ValueError("selection should be either random or cyclic.") + random = selection == "random" + + if not multi_output: + coefs = np.empty((n_features, n_alphas), dtype=X.dtype) + else: + coefs = np.empty((n_targets, n_features, n_alphas), dtype=X.dtype) + + if coef_init is None: + coef_ = np.zeros(coefs.shape[:-1], dtype=X.dtype, order="F") + else: + coef_ = np.asfortranarray(coef_init, dtype=X.dtype) + + for i, alpha in enumerate(alphas): + # account for n_samples scaling in objectives between here and cd_fast + l1_reg = alpha * l1_ratio * n_samples + l2_reg = alpha * (1.0 - l1_ratio) * n_samples + if not multi_output and sparse.issparse(X): + model = cd_fast.sparse_enet_coordinate_descent( + w=coef_, + alpha=l1_reg, + beta=l2_reg, + X_data=X.data, + X_indices=X.indices, + X_indptr=X.indptr, + y=y, + sample_weight=sample_weight, + X_mean=X_sparse_scaling, + max_iter=max_iter, + tol=tol, + rng=rng, + random=random, + positive=positive, + ) + elif multi_output: + model = cd_fast.enet_coordinate_descent_multi_task( + coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random + ) + elif isinstance(precompute, np.ndarray): + # We expect precompute to be already Fortran ordered when bypassing + # checks + if check_input: + precompute = check_array(precompute, dtype=X.dtype.type, order="C") + model = cd_fast.enet_coordinate_descent_gram( + coef_, + l1_reg, + l2_reg, + precompute, + Xy, + y, + max_iter, + tol, + rng, + random, + positive, + ) + elif precompute is False: + model = cd_fast.enet_coordinate_descent( + coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive + ) + else: + raise ValueError( + "Precompute should be one of True, False, 'auto' or array-like. Got %r" + % precompute + ) + coef_, dual_gap_, eps_, n_iter_ = model + coefs[..., i] = coef_ + # we correct the scale of the returned dual gap, as the objective + # in cd_fast is n_samples * the objective in this docstring. + dual_gaps[i] = dual_gap_ / n_samples + n_iters.append(n_iter_) + + if verbose: + if verbose > 2: + print(model) + elif verbose > 1: + print("Path: %03i out of %03i" % (i, n_alphas)) + else: + sys.stderr.write(".") + + if return_n_iter: + return alphas, coefs, dual_gaps, n_iters + return alphas, coefs, dual_gaps + + +############################################################################### +# ElasticNet model + + +class ElasticNet(MultiOutputMixin, RegressorMixin, LinearModel): + """Linear regression with combined L1 and L2 priors as regularizer. 
+ + Minimizes the objective function:: + + 1 / (2 * n_samples) * ||y - Xw||^2_2 + + alpha * l1_ratio * ||w||_1 + + 0.5 * alpha * (1 - l1_ratio) * ||w||^2_2 + + If you are interested in controlling the L1 and L2 penalty + separately, keep in mind that this is equivalent to:: + + a * ||w||_1 + 0.5 * b * ||w||_2^2 + + where:: + + alpha = a + b and l1_ratio = a / (a + b) + + The parameter l1_ratio corresponds to alpha in the glmnet R package while + alpha corresponds to the lambda parameter in glmnet. Specifically, l1_ratio + = 1 is the lasso penalty. Currently, l1_ratio <= 0.01 is not reliable, + unless you supply your own sequence of alpha. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + alpha : float, default=1.0 + Constant that multiplies the penalty terms. Defaults to 1.0. + See the notes for the exact mathematical meaning of this + parameter. ``alpha = 0`` is equivalent to an ordinary least square, + solved by the :class:`LinearRegression` object. For numerical + reasons, using ``alpha = 0`` with the ``Lasso`` object is not advised. + Given this, you should use the :class:`LinearRegression` object. + + l1_ratio : float, default=0.5 + The ElasticNet mixing parameter, with ``0 <= l1_ratio <= 1``. For + ``l1_ratio = 0`` the penalty is an L2 penalty. ``For l1_ratio = 1`` it + is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a + combination of L1 and L2. + + fit_intercept : bool, default=True + Whether the intercept should be estimated or not. If ``False``, the + data is assumed to be already centered. + + precompute : bool or array-like of shape (n_features, n_features),\ + default=False + Whether to use a precomputed Gram matrix to speed up + calculations. The Gram matrix can also be passed as argument. + For sparse input this option is always ``False`` to preserve sparsity. + Check :ref:`an example on how to use a precomputed Gram Matrix in ElasticNet + ` + for details. + + max_iter : int, default=1000 + The maximum number of iterations. + + copy_X : bool, default=True + If ``True``, X will be copied; else, it may be overwritten. + + tol : float, default=1e-4 + The tolerance for the optimization: if the updates are + smaller than ``tol``, the optimization code checks the + dual gap for optimality and continues until it is smaller + than ``tol``, see Notes below. + + warm_start : bool, default=False + When set to ``True``, reuse the solution of the previous call to fit as + initialization, otherwise, just erase the previous solution. + See :term:`the Glossary `. + + positive : bool, default=False + When set to ``True``, forces the coefficients to be positive. + + random_state : int, RandomState instance, default=None + The seed of the pseudo random number generator that selects a random + feature to update. Used when ``selection`` == 'random'. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + selection : {'cyclic', 'random'}, default='cyclic' + If set to 'random', a random coefficient is updated every iteration + rather than looping over features sequentially by default. This + (setting to 'random') often leads to significantly faster convergence + especially when tol is higher than 1e-4. + + Attributes + ---------- + coef_ : ndarray of shape (n_features,) or (n_targets, n_features) + Parameter vector (w in the cost function formula). + + sparse_coef_ : sparse matrix of shape (n_features,) or \ + (n_targets, n_features) + Sparse representation of the `coef_`. 
+ + intercept_ : float or ndarray of shape (n_targets,) + Independent term in decision function. + + n_iter_ : list of int + Number of iterations run by the coordinate descent solver to reach + the specified tolerance. + + dual_gap_ : float or ndarray of shape (n_targets,) + Given param alpha, the dual gaps at the end of the optimization, + same shape as each observation of y. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + ElasticNetCV : Elastic net model with best model selection by + cross-validation. + SGDRegressor : Implements elastic net regression with incremental training. + SGDClassifier : Implements logistic regression with elastic net penalty + (``SGDClassifier(loss="log_loss", penalty="elasticnet")``). + + Notes + ----- + To avoid unnecessary memory duplication the X argument of the fit method + should be directly passed as a Fortran-contiguous numpy array. + + The precise stopping criteria based on `tol` are the following: First, check that + that maximum coordinate update, i.e. :math:`\\max_j |w_j^{new} - w_j^{old}|` + is smaller than `tol` times the maximum absolute coefficient, :math:`\\max_j |w_j|`. + If so, then additionally check whether the dual gap is smaller than `tol` times + :math:`||y||_2^2 / n_{\text{samples}}`. + + Examples + -------- + >>> from sklearn.linear_model import ElasticNet + >>> from sklearn.datasets import make_regression + + >>> X, y = make_regression(n_features=2, random_state=0) + >>> regr = ElasticNet(random_state=0) + >>> regr.fit(X, y) + ElasticNet(random_state=0) + >>> print(regr.coef_) + [18.83816048 64.55968825] + >>> print(regr.intercept_) + 1.451 + >>> print(regr.predict([[0, 0]])) + [1.451] + + - :ref:`sphx_glr_auto_examples_linear_model_plot_lasso_and_elasticnet.py` + showcases ElasticNet alongside Lasso and ARD Regression for sparse + signal recovery in the presence of noise and feature correlation. + """ + + # "check_input" is used for optimisation and isn't something to be passed + # around in a pipeline. 
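+ # Declaring it UNUSED below keeps `check_input` out of metadata routing
+ # entirely; it stays an internal fast path (for instance, `fit` forwards
+ # `check_input=False` to `self.path` once the input has been validated).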
+ __metadata_request__fit = {"check_input": metadata_routing.UNUSED} + + _parameter_constraints: dict = { + "alpha": [Interval(Real, 0, None, closed="left")], + "l1_ratio": [Interval(Real, 0, 1, closed="both")], + "fit_intercept": ["boolean"], + "precompute": ["boolean", "array-like"], + "max_iter": [Interval(Integral, 1, None, closed="left"), None], + "copy_X": ["boolean"], + "tol": [Interval(Real, 0, None, closed="left")], + "warm_start": ["boolean"], + "positive": ["boolean"], + "random_state": ["random_state"], + "selection": [StrOptions({"cyclic", "random"})], + } + + path = staticmethod(enet_path) + + def __init__( + self, + alpha=1.0, + *, + l1_ratio=0.5, + fit_intercept=True, + precompute=False, + max_iter=1000, + copy_X=True, + tol=1e-4, + warm_start=False, + positive=False, + random_state=None, + selection="cyclic", + ): + self.alpha = alpha + self.l1_ratio = l1_ratio + self.fit_intercept = fit_intercept + self.precompute = precompute + self.max_iter = max_iter + self.copy_X = copy_X + self.tol = tol + self.warm_start = warm_start + self.positive = positive + self.random_state = random_state + self.selection = selection + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, sample_weight=None, check_input=True): + """Fit model with coordinate descent. + + Parameters + ---------- + X : {ndarray, sparse matrix, sparse array} of (n_samples, n_features) + Data. + + Note that large sparse matrices and arrays requiring `int64` + indices are not accepted. + + y : ndarray of shape (n_samples,) or (n_samples, n_targets) + Target. Will be cast to X's dtype if necessary. + + sample_weight : float or array-like of shape (n_samples,), default=None + Sample weights. Internally, the `sample_weight` vector will be + rescaled to sum to `n_samples`. + + .. versionadded:: 0.23 + + check_input : bool, default=True + Allow to bypass several input checking. + Don't use this parameter unless you know what you do. + + Returns + ------- + self : object + Fitted estimator. + + Notes + ----- + Coordinate descent is an algorithm that considers each column of + data at a time hence it will automatically convert the X input + as a Fortran-contiguous numpy array if necessary. + + To avoid memory re-allocation it is advised to allocate the + initial data in memory directly using that format. + """ + if self.alpha == 0: + warnings.warn( + ( + "With alpha=0, this algorithm does not converge " + "well. You are advised to use the LinearRegression " + "estimator" + ), + stacklevel=2, + ) + + # Remember if X is copied + X_copied = False + # We expect X and y to be float64 or float32 Fortran ordered arrays + # when bypassing checks + if check_input: + X_copied = self.copy_X and self.fit_intercept + X, y = validate_data( + self, + X, + y, + accept_sparse="csc", + order="F", + dtype=[np.float64, np.float32], + force_writeable=True, + accept_large_sparse=False, + copy=X_copied, + multi_output=True, + y_numeric=True, + ) + y = check_array( + y, order="F", copy=False, dtype=X.dtype.type, ensure_2d=False + ) + + n_samples, n_features = X.shape + alpha = self.alpha + + if isinstance(sample_weight, numbers.Number): + sample_weight = None + if sample_weight is not None: + if check_input: + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + # TLDR: Rescale sw to sum up to n_samples. + # Long: The objective function of Enet + # + # 1/2 * np.average(squared error, weights=sw) + # + alpha * penalty (1) + # + # is invariant under rescaling of sw. 
+ # But enet_path coordinate descent minimizes + # + # 1/2 * sum(squared error) + alpha' * penalty (2) + # + # and therefore sets + # + # alpha' = n_samples * alpha (3) + # + # inside its function body, which results in objective (2) being + # equivalent to (1) in case of no sw. + # With sw, however, enet_path should set + # + # alpha' = sum(sw) * alpha (4) + # + # Therefore, we use the freedom of Eq. (1) to rescale sw before + # calling enet_path, i.e. + # + # sw *= n_samples / sum(sw) + # + # such that sum(sw) = n_samples. This way, (3) and (4) are the same. + sample_weight = sample_weight * (n_samples / np.sum(sample_weight)) + # Note: Alternatively, we could also have rescaled alpha instead + # of sample_weight: + # + # alpha *= np.sum(sample_weight) / n_samples + + # Ensure copying happens only once, don't do it again if done above. + # X and y will be rescaled if sample_weight is not None, order='F' + # ensures that the returned X and y are still F-contiguous. + should_copy = self.copy_X and not X_copied + X, y, X_offset, y_offset, X_scale, precompute, Xy = _pre_fit( + X, + y, + None, + self.precompute, + fit_intercept=self.fit_intercept, + copy=should_copy, + check_input=check_input, + sample_weight=sample_weight, + ) + # coordinate descent needs F-ordered arrays and _pre_fit might have + # called _rescale_data + if check_input or sample_weight is not None: + X, y = _set_order(X, y, order="F") + if y.ndim == 1: + y = y[:, np.newaxis] + if Xy is not None and Xy.ndim == 1: + Xy = Xy[:, np.newaxis] + + n_targets = y.shape[1] + + if not self.warm_start or not hasattr(self, "coef_"): + coef_ = np.zeros((n_targets, n_features), dtype=X.dtype, order="F") + else: + coef_ = self.coef_ + if coef_.ndim == 1: + coef_ = coef_[np.newaxis, :] + + dual_gaps_ = np.zeros(n_targets, dtype=X.dtype) + self.n_iter_ = [] + + for k in range(n_targets): + if Xy is not None: + this_Xy = Xy[:, k] + else: + this_Xy = None + _, this_coef, this_dual_gap, this_iter = self.path( + X, + y[:, k], + l1_ratio=self.l1_ratio, + eps=None, + n_alphas=None, + alphas=[alpha], + precompute=precompute, + Xy=this_Xy, + copy_X=True, + coef_init=coef_[k], + verbose=False, + return_n_iter=True, + positive=self.positive, + check_input=False, + # from here on **params + tol=self.tol, + X_offset=X_offset, + X_scale=X_scale, + max_iter=self.max_iter, + random_state=self.random_state, + selection=self.selection, + sample_weight=sample_weight, + ) + coef_[k] = this_coef[:, 0] + dual_gaps_[k] = this_dual_gap[0] + self.n_iter_.append(this_iter[0]) + + if n_targets == 1: + self.n_iter_ = self.n_iter_[0] + self.coef_ = coef_[0] + self.dual_gap_ = dual_gaps_[0] + else: + self.coef_ = coef_ + self.dual_gap_ = dual_gaps_ + + self._set_intercept(X_offset, y_offset, X_scale) + + # check for finiteness of coefficients + if not all(np.isfinite(w).all() for w in [self.coef_, self.intercept_]): + raise ValueError( + "Coordinate descent iterations resulted in non-finite parameter" + " values. The input data may contain large values and need to" + " be preprocessed." + ) + + # return self for chaining fit and predict calls + return self + + @property + def sparse_coef_(self): + """Sparse representation of the fitted `coef_`.""" + return sparse.csr_matrix(self.coef_) + + def _decision_function(self, X): + """Decision function of the linear model. + + Parameters + ---------- + X : numpy array or scipy.sparse matrix of shape (n_samples, n_features) + + Returns + ------- + T : ndarray of shape (n_samples,) + The predicted decision function. 
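+
+ Notes
+ -----
+ For sparse ``X`` the product with ``coef_`` is computed with
+ ``safe_sparse_dot``, so the input is never densified.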
+ """ + check_is_fitted(self) + if sparse.issparse(X): + return safe_sparse_dot(X, self.coef_.T, dense_output=True) + self.intercept_ + else: + return super()._decision_function(X) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + + +############################################################################### +# Lasso model + + +class Lasso(ElasticNet): + """Linear Model trained with L1 prior as regularizer (aka the Lasso). + + The optimization objective for Lasso is:: + + (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1 + + Technically the Lasso model is optimizing the same objective function as + the Elastic Net with ``l1_ratio=1.0`` (no L2 penalty). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + alpha : float, default=1.0 + Constant that multiplies the L1 term, controlling regularization + strength. `alpha` must be a non-negative float i.e. in `[0, inf)`. + + When `alpha = 0`, the objective is equivalent to ordinary least + squares, solved by the :class:`LinearRegression` object. For numerical + reasons, using `alpha = 0` with the `Lasso` object is not advised. + Instead, you should use the :class:`LinearRegression` object. + + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to False, no intercept will be used in calculations + (i.e. data is expected to be centered). + + precompute : bool or array-like of shape (n_features, n_features),\ + default=False + Whether to use a precomputed Gram matrix to speed up + calculations. The Gram matrix can also be passed as argument. + For sparse input this option is always ``False`` to preserve sparsity. + + copy_X : bool, default=True + If ``True``, X will be copied; else, it may be overwritten. + + max_iter : int, default=1000 + The maximum number of iterations. + + tol : float, default=1e-4 + The tolerance for the optimization: if the updates are + smaller than ``tol``, the optimization code checks the + dual gap for optimality and continues until it is smaller + than ``tol``, see Notes below. + + warm_start : bool, default=False + When set to True, reuse the solution of the previous call to fit as + initialization, otherwise, just erase the previous solution. + See :term:`the Glossary `. + + positive : bool, default=False + When set to ``True``, forces the coefficients to be positive. + + random_state : int, RandomState instance, default=None + The seed of the pseudo random number generator that selects a random + feature to update. Used when ``selection`` == 'random'. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + selection : {'cyclic', 'random'}, default='cyclic' + If set to 'random', a random coefficient is updated every iteration + rather than looping over features sequentially by default. This + (setting to 'random') often leads to significantly faster convergence + especially when tol is higher than 1e-4. + + Attributes + ---------- + coef_ : ndarray of shape (n_features,) or (n_targets, n_features) + Parameter vector (w in the cost function formula). + + dual_gap_ : float or ndarray of shape (n_targets,) + Given param alpha, the dual gaps at the end of the optimization, + same shape as each observation of y. + + sparse_coef_ : sparse matrix of shape (n_features, 1) or \ + (n_targets, n_features) + Readonly property derived from ``coef_``. + + intercept_ : float or ndarray of shape (n_targets,) + Independent term in decision function. 
+ + n_iter_ : int or list of int + Number of iterations run by the coordinate descent solver to reach + the specified tolerance. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + lars_path : Regularization path using LARS. + lasso_path : Regularization path using Lasso. + LassoLars : Lasso Path along the regularization parameter using LARS algorithm. + LassoCV : Lasso alpha parameter by cross-validation. + LassoLarsCV : Lasso least angle parameter algorithm by cross-validation. + sklearn.decomposition.sparse_encode : Sparse coding array estimator. + + Notes + ----- + The algorithm used to fit the model is coordinate descent. + + To avoid unnecessary memory duplication the X argument of the fit method + should be directly passed as a Fortran-contiguous numpy array. + + Regularization improves the conditioning of the problem and + reduces the variance of the estimates. Larger values specify stronger + regularization. Alpha corresponds to `1 / (2C)` in other linear + models such as :class:`~sklearn.linear_model.LogisticRegression` or + :class:`~sklearn.svm.LinearSVC`. + + The precise stopping criteria based on `tol` are the following: First, check that + that maximum coordinate update, i.e. :math:`\\max_j |w_j^{new} - w_j^{old}|` + is smaller than `tol` times the maximum absolute coefficient, :math:`\\max_j |w_j|`. + If so, then additionally check whether the dual gap is smaller than `tol` times + :math:`||y||_2^2 / n_{\\text{samples}}`. + + The target can be a 2-dimensional array, resulting in the optimization of the + following objective:: + + (1 / (2 * n_samples)) * ||Y - XW||^2_F + alpha * ||W||_11 + + where :math:`||W||_{1,1}` is the sum of the magnitude of the matrix coefficients. + It should not be confused with :class:`~sklearn.linear_model.MultiTaskLasso` which + instead penalizes the :math:`L_{2,1}` norm of the coefficients, yielding row-wise + sparsity in the coefficients. + + Examples + -------- + >>> from sklearn import linear_model + >>> clf = linear_model.Lasso(alpha=0.1) + >>> clf.fit([[0,0], [1, 1], [2, 2]], [0, 1, 2]) + Lasso(alpha=0.1) + >>> print(clf.coef_) + [0.85 0. ] + >>> print(clf.intercept_) + 0.15 + + - :ref:`sphx_glr_auto_examples_linear_model_plot_lasso_and_elasticnet.py` + compares Lasso with other L1-based regression models (ElasticNet and ARD + Regression) for sparse signal recovery in the presence of noise and + feature correlation. 
+ """ + + _parameter_constraints: dict = { + **ElasticNet._parameter_constraints, + } + _parameter_constraints.pop("l1_ratio") + + path = staticmethod(enet_path) + + def __init__( + self, + alpha=1.0, + *, + fit_intercept=True, + precompute=False, + copy_X=True, + max_iter=1000, + tol=1e-4, + warm_start=False, + positive=False, + random_state=None, + selection="cyclic", + ): + super().__init__( + alpha=alpha, + l1_ratio=1.0, + fit_intercept=fit_intercept, + precompute=precompute, + copy_X=copy_X, + max_iter=max_iter, + tol=tol, + warm_start=warm_start, + positive=positive, + random_state=random_state, + selection=selection, + ) + + +############################################################################### +# Functions for CV with paths functions + + +def _path_residuals( + X, + y, + sample_weight, + train, + test, + fit_intercept, + path, + path_params, + alphas=None, + l1_ratio=1, + X_order=None, + dtype=None, +): + """Returns the MSE for the models computed by 'path'. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) or (n_samples, n_targets) + Target values. + + sample_weight : None or array-like of shape (n_samples,) + Sample weights. + + train : list of indices + The indices of the train set. + + test : list of indices + The indices of the test set. + + path : callable + Function returning a list of models on the path. See + enet_path for an example of signature. + + path_params : dictionary + Parameters passed to the path function. + + alphas : array-like, default=None + Array of float that is used for cross-validation. If not + provided, computed using 'path'. + + l1_ratio : float, default=1 + float between 0 and 1 passed to ElasticNet (scaling between + l1 and l2 penalties). For ``l1_ratio = 0`` the penalty is an + L2 penalty. For ``l1_ratio = 1`` it is an L1 penalty. For ``0 + < l1_ratio < 1``, the penalty is a combination of L1 and L2. + + X_order : {'F', 'C'}, default=None + The order of the arrays expected by the path function to + avoid memory copies. + + dtype : a numpy dtype, default=None + The dtype of the arrays expected by the path function to + avoid memory copies. + """ + X_train = X[train] + y_train = y[train] + X_test = X[test] + y_test = y[test] + if sample_weight is None: + sw_train, sw_test = None, None + else: + sw_train = sample_weight[train] + sw_test = sample_weight[test] + n_samples = X_train.shape[0] + # TLDR: Rescale sw_train to sum up to n_samples on the training set. + # See TLDR and long comment inside ElasticNet.fit. + sw_train *= n_samples / np.sum(sw_train) + # Note: Alternatively, we could also have rescaled alpha instead + # of sample_weight: + # + # alpha *= np.sum(sample_weight) / n_samples + + if not sparse.issparse(X): + for array, array_input in ( + (X_train, X), + (y_train, y), + (X_test, X), + (y_test, y), + ): + if array.base is not array_input and not array.flags["WRITEABLE"]: + # fancy indexing should create a writable copy but it doesn't + # for read-only memmaps (cf. numpy#14132). + array.setflags(write=True) + + if y.ndim == 1: + precompute = path_params["precompute"] + else: + # No Gram variant of multi-task exists right now. 
+ # Fall back to default enet_multitask + precompute = False + + X_train, y_train, X_offset, y_offset, X_scale, precompute, Xy = _pre_fit( + X_train, + y_train, + None, + precompute, + fit_intercept=fit_intercept, + copy=False, + sample_weight=sw_train, + ) + + path_params = path_params.copy() + path_params["Xy"] = Xy + path_params["X_offset"] = X_offset + path_params["X_scale"] = X_scale + path_params["precompute"] = precompute + path_params["copy_X"] = False + path_params["alphas"] = alphas + # needed for sparse cd solver + path_params["sample_weight"] = sw_train + + if "l1_ratio" in path_params: + path_params["l1_ratio"] = l1_ratio + + # Do the ordering and type casting here, as if it is done in the path, + # X is copied and a reference is kept here + X_train = check_array(X_train, accept_sparse="csc", dtype=dtype, order=X_order) + alphas, coefs, _ = path(X_train, y_train, **path_params) + del X_train, y_train + + if y.ndim == 1: + # Doing this so that it becomes coherent with multioutput. + coefs = coefs[np.newaxis, :, :] + y_offset = np.atleast_1d(y_offset) + y_test = y_test[:, np.newaxis] + + intercepts = y_offset[:, np.newaxis] - np.dot(X_offset, coefs) + X_test_coefs = safe_sparse_dot(X_test, coefs) + residues = X_test_coefs - y_test[:, :, np.newaxis] + residues += intercepts + if sample_weight is None: + this_mse = (residues**2).mean(axis=0) + else: + this_mse = np.average(residues**2, weights=sw_test, axis=0) + + return this_mse.mean(axis=0) + + +class LinearModelCV(MultiOutputMixin, LinearModel, ABC): + """Base class for iterative model fitting along a regularization path.""" + + _parameter_constraints: dict = { + "eps": [Interval(Real, 0, None, closed="neither")], + "n_alphas": [ + Interval(Integral, 1, None, closed="left"), + Hidden(StrOptions({"deprecated"})), + ], + # TODO(1.9): remove "warn" and None options. 
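+ # During the deprecation period `alphas` may be an int (a number of alphas
+ # to generate), an array-like of explicit values, None (deprecated) or the
+ # hidden "warn" sentinel used as the default.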
+ "alphas": [ + Interval(Integral, 1, None, closed="left"), + "array-like", + None, + Hidden(StrOptions({"warn"})), + ], + "fit_intercept": ["boolean"], + "precompute": [StrOptions({"auto"}), "array-like", "boolean"], + "max_iter": [Interval(Integral, 1, None, closed="left")], + "tol": [Interval(Real, 0, None, closed="left")], + "copy_X": ["boolean"], + "cv": ["cv_object"], + "verbose": ["verbose"], + "n_jobs": [Integral, None], + "positive": ["boolean"], + "random_state": ["random_state"], + "selection": [StrOptions({"cyclic", "random"})], + } + + @abstractmethod + def __init__( + self, + eps=1e-3, + n_alphas="deprecated", + alphas="warn", + fit_intercept=True, + precompute="auto", + max_iter=1000, + tol=1e-4, + copy_X=True, + cv=None, + verbose=False, + n_jobs=None, + positive=False, + random_state=None, + selection="cyclic", + ): + self.eps = eps + self.n_alphas = n_alphas + self.alphas = alphas + self.fit_intercept = fit_intercept + self.precompute = precompute + self.max_iter = max_iter + self.tol = tol + self.copy_X = copy_X + self.cv = cv + self.verbose = verbose + self.n_jobs = n_jobs + self.positive = positive + self.random_state = random_state + self.selection = selection + + @abstractmethod + def _get_estimator(self): + """Model to be fitted after the best alpha has been determined.""" + + @abstractmethod + def _is_multitask(self): + """Bool indicating if class is meant for multidimensional target.""" + + @staticmethod + @abstractmethod + def path(X, y, **kwargs): + """Compute path with coordinate descent.""" + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, sample_weight=None, **params): + """Fit linear model with coordinate descent. + + Fit is on grid of alphas and best alpha estimated by cross-validation. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. Pass directly as Fortran-contiguous data + to avoid unnecessary memory duplication. If y is mono-output, + X can be sparse. Note that large sparse matrices and arrays + requiring `int64` indices are not accepted. + + y : array-like of shape (n_samples,) or (n_samples, n_targets) + Target values. + + sample_weight : float or array-like of shape (n_samples,), \ + default=None + Sample weights used for fitting and evaluation of the weighted + mean squared error of each cv-fold. Note that the cross validated + MSE that is finally used to find the best model is the unweighted + mean over the (weighted) MSEs of each test fold. + + **params : dict, default=None + Parameters to be passed to the CV splitter. + + .. versionadded:: 1.4 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + Returns an instance of fitted model. + """ + _raise_for_params(params, self, "fit") + + # TODO(1.9): remove n_alphas and alphas={"warn", None}; set alphas=100 by + # default. Remove these deprecations messages and use self.alphas directly + # instead of self._alphas. + if self.n_alphas == "deprecated": + self._alphas = 100 + else: + warnings.warn( + "'n_alphas' was deprecated in 1.7 and will be removed in 1.9. " + "'alphas' now accepts an integer value which removes the need to pass " + "'n_alphas'. The default value of 'alphas' will change from None to " + "100 in 1.9. 
Pass an explicit value to 'alphas' and leave 'n_alphas' " + "to its default value to silence this warning.", + FutureWarning, + ) + self._alphas = self.n_alphas + + if isinstance(self.alphas, str) and self.alphas == "warn": + # - If self.n_alphas == "deprecated", both are left to their default values + # so we don't warn since the future default behavior will be the same as + # the current default behavior. + # - If self.n_alphas != "deprecated", then we already warned about it + # and the warning message mentions the future self.alphas default, so + # no need to warn a second time. + pass + elif self.alphas is None: + warnings.warn( + "'alphas=None' is deprecated and will be removed in 1.9, at which " + "point the default value will be set to 100. Set 'alphas=100' " + "to silence this warning.", + FutureWarning, + ) + else: + self._alphas = self.alphas + + # This makes sure that there is no duplication in memory. + # Dealing right with copy_X is important in the following: + # Multiple functions touch X and subsamples of X and can induce a + # lot of duplication of memory + copy_X = self.copy_X and self.fit_intercept + + check_y_params = dict( + copy=False, dtype=[np.float64, np.float32], ensure_2d=False + ) + if isinstance(X, np.ndarray) or sparse.issparse(X): + # Keep a reference to X + reference_to_old_X = X + # Let us not impose fortran ordering so far: it is + # not useful for the cross-validation loop and will be done + # by the model fitting itself + + # Need to validate separately here. + # We can't pass multi_output=True because that would allow y to be + # csr. We also want to allow y to be 64 or 32 but check_X_y only + # allows to convert for 64. + check_X_params = dict( + accept_sparse="csc", + dtype=[np.float64, np.float32], + force_writeable=True, + copy=False, + accept_large_sparse=False, + ) + X, y = validate_data( + self, X, y, validate_separately=(check_X_params, check_y_params) + ) + if sparse.issparse(X): + if hasattr(reference_to_old_X, "data") and not np.may_share_memory( + reference_to_old_X.data, X.data + ): + # X is a sparse matrix and has been copied + copy_X = False + elif not np.may_share_memory(reference_to_old_X, X): + # X has been copied + copy_X = False + del reference_to_old_X + else: + # Need to validate separately here. + # We can't pass multi_output=True because that would allow y to be + # csr. We also want to allow y to be 64 or 32 but check_X_y only + # allows to convert for 64. 
+ check_X_params = dict( + accept_sparse="csc", + dtype=[np.float64, np.float32], + order="F", + force_writeable=True, + copy=copy_X, + ) + X, y = validate_data( + self, X, y, validate_separately=(check_X_params, check_y_params) + ) + copy_X = False + + check_consistent_length(X, y) + + if not self._is_multitask(): + if y.ndim > 1 and y.shape[1] > 1: + raise ValueError( + "For multi-task outputs, use MultiTask%s" % self.__class__.__name__ + ) + y = column_or_1d(y, warn=True) + else: + if sparse.issparse(X): + raise TypeError("X should be dense but a sparse matrix waspassed") + elif y.ndim == 1: + raise ValueError( + "For mono-task outputs, use %sCV" % self.__class__.__name__[9:] + ) + + if isinstance(sample_weight, numbers.Number): + sample_weight = None + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + + model = self._get_estimator() + + # All LinearModelCV parameters except 'cv' are acceptable + path_params = self.get_params() + + # Pop `intercept` that is not parameter of the path function + path_params.pop("fit_intercept", None) + + if "l1_ratio" in path_params: + l1_ratios = np.atleast_1d(path_params["l1_ratio"]) + # For the first path, we need to set l1_ratio + path_params["l1_ratio"] = l1_ratios[0] + else: + l1_ratios = [ + 1, + ] + path_params.pop("cv", None) + path_params.pop("n_jobs", None) + + n_l1_ratio = len(l1_ratios) + + check_scalar_alpha = partial( + check_scalar, + target_type=Real, + min_val=0.0, + include_boundaries="left", + ) + + if isinstance(self._alphas, Integral): + alphas = [ + _alpha_grid( + X, + y, + l1_ratio=l1_ratio, + fit_intercept=self.fit_intercept, + eps=self.eps, + n_alphas=self._alphas, + copy_X=self.copy_X, + sample_weight=sample_weight, + ) + for l1_ratio in l1_ratios + ] + else: + # Making sure alphas entries are scalars. + for index, alpha in enumerate(self._alphas): + check_scalar_alpha(alpha, f"alphas[{index}]") + # Making sure alphas is properly ordered. + alphas = np.tile(np.sort(self._alphas)[::-1], (n_l1_ratio, 1)) + + # We want n_alphas to be the number of alphas used for each l1_ratio. + n_alphas = len(alphas[0]) + path_params.update({"n_alphas": n_alphas}) + + path_params["copy_X"] = copy_X + # We are not computing in parallel, we can modify X + # inplace in the folds + if effective_n_jobs(self.n_jobs) > 1: + path_params["copy_X"] = False + + # init cross-validation generator + cv = check_cv(self.cv) + + if _routing_enabled(): + splitter_supports_sample_weight = get_routing_for_object(cv).consumes( + method="split", params=["sample_weight"] + ) + if ( + sample_weight is not None + and not splitter_supports_sample_weight + and not has_fit_parameter(self, "sample_weight") + ): + raise ValueError( + "The CV splitter and underlying estimator do not support" + " sample weights." 
+ ) + + if splitter_supports_sample_weight: + params["sample_weight"] = sample_weight + + routed_params = process_routing(self, "fit", **params) + + if sample_weight is not None and not has_fit_parameter( + self, "sample_weight" + ): + # MultiTaskElasticNetCV does not (yet) support sample_weight + sample_weight = None + else: + routed_params = Bunch() + routed_params.splitter = Bunch(split=Bunch()) + + # Compute path for all folds and compute MSE to get the best alpha + folds = list(cv.split(X, y, **routed_params.splitter.split)) + best_mse = np.inf + + # We do a double for loop folded in one, in order to be able to + # iterate in parallel on l1_ratio and folds + jobs = ( + delayed(_path_residuals)( + X, + y, + sample_weight, + train, + test, + self.fit_intercept, + self.path, + path_params, + alphas=this_alphas, + l1_ratio=this_l1_ratio, + X_order="F", + dtype=X.dtype.type, + ) + for this_l1_ratio, this_alphas in zip(l1_ratios, alphas) + for train, test in folds + ) + mse_paths = Parallel( + n_jobs=self.n_jobs, + verbose=self.verbose, + prefer="threads", + )(jobs) + mse_paths = np.reshape(mse_paths, (n_l1_ratio, len(folds), -1)) + # The mean is computed over folds. + mean_mse = np.mean(mse_paths, axis=1) + self.mse_path_ = np.squeeze(np.moveaxis(mse_paths, 2, 1)) + for l1_ratio, l1_alphas, mse_alphas in zip(l1_ratios, alphas, mean_mse): + i_best_alpha = np.argmin(mse_alphas) + this_best_mse = mse_alphas[i_best_alpha] + if this_best_mse < best_mse: + best_alpha = l1_alphas[i_best_alpha] + best_l1_ratio = l1_ratio + best_mse = this_best_mse + + self.l1_ratio_ = best_l1_ratio + self.alpha_ = best_alpha + if isinstance(self._alphas, Integral): + self.alphas_ = np.asarray(alphas) + if n_l1_ratio == 1: + self.alphas_ = self.alphas_[0] + # Remove duplicate alphas in case alphas is provided. + else: + self.alphas_ = np.asarray(alphas[0]) + + # Refit the model with the parameters selected + common_params = { + name: value + for name, value in self.get_params().items() + if name in model.get_params() + } + model.set_params(**common_params) + model.alpha = best_alpha + model.l1_ratio = best_l1_ratio + model.copy_X = copy_X + precompute = getattr(self, "precompute", None) + if isinstance(precompute, str) and precompute == "auto": + model.precompute = False + + if sample_weight is None: + # MultiTaskElasticNetCV does not (yet) support sample_weight, even + # not sample_weight=None. + model.fit(X, y) + else: + model.fit(X, y, sample_weight=sample_weight) + if not hasattr(self, "l1_ratio"): + del self.l1_ratio_ + self.coef_ = model.coef_ + self.intercept_ = model.intercept_ + self.dual_gap_ = model.dual_gap_ + self.n_iter_ = model.n_iter_ + return self + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.4 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. 
+ """ + router = ( + MetadataRouter(owner=self.__class__.__name__) + .add_self_request(self) + .add( + splitter=check_cv(self.cv), + method_mapping=MethodMapping().add(caller="fit", callee="split"), + ) + ) + return router + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + multitask = self._is_multitask() + tags.input_tags.sparse = not multitask + tags.target_tags.multi_output = multitask + return tags + + +class LassoCV(RegressorMixin, LinearModelCV): + """Lasso linear model with iterative fitting along a regularization path. + + See glossary entry for :term:`cross-validation estimator`. + + The best model is selected by cross-validation. + + The optimization objective for Lasso is:: + + (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1 + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + eps : float, default=1e-3 + Length of the path. ``eps=1e-3`` means that + ``alpha_min / alpha_max = 1e-3``. + + n_alphas : int, default=100 + Number of alphas along the regularization path. + + .. deprecated:: 1.7 + `n_alphas` was deprecated in 1.7 and will be removed in 1.9. Use `alphas` + instead. + + alphas : array-like or int, default=None + Values of alphas to test along the regularization path. + If int, `alphas` values are generated automatically. + If array-like, list of alpha values to use. + + .. versionchanged:: 1.7 + `alphas` accepts an integer value which removes the need to pass + `n_alphas`. + + .. deprecated:: 1.7 + `alphas=None` was deprecated in 1.7 and will be removed in 1.9, at which + point the default value will be set to 100. + + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to false, no intercept will be used in calculations + (i.e. data is expected to be centered). + + precompute : 'auto', bool or array-like of shape \ + (n_features, n_features), default='auto' + Whether to use a precomputed Gram matrix to speed up + calculations. If set to ``'auto'`` let us decide. The Gram + matrix can also be passed as argument. + + max_iter : int, default=1000 + The maximum number of iterations. + + tol : float, default=1e-4 + The tolerance for the optimization: if the updates are + smaller than ``tol``, the optimization code checks the + dual gap for optimality and continues until it is smaller + than ``tol``. + + copy_X : bool, default=True + If ``True``, X will be copied; else, it may be overwritten. + + cv : int, cross-validation generator or iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross-validation, + - int, to specify the number of folds. + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For int/None inputs, :class:`~sklearn.model_selection.KFold` is used. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + ``cv`` default value if None changed from 3-fold to 5-fold. + + verbose : bool or int, default=False + Amount of verbosity. + + n_jobs : int, default=None + Number of CPUs to use during the cross validation. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + positive : bool, default=False + If positive, restrict regression coefficients to be positive. 
+ + random_state : int, RandomState instance, default=None + The seed of the pseudo random number generator that selects a random + feature to update. Used when ``selection`` == 'random'. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + selection : {'cyclic', 'random'}, default='cyclic' + If set to 'random', a random coefficient is updated every iteration + rather than looping over features sequentially by default. This + (setting to 'random') often leads to significantly faster convergence + especially when tol is higher than 1e-4. + + Attributes + ---------- + alpha_ : float + The amount of penalization chosen by cross validation. + + coef_ : ndarray of shape (n_features,) or (n_targets, n_features) + Parameter vector (w in the cost function formula). + + intercept_ : float or ndarray of shape (n_targets,) + Independent term in decision function. + + mse_path_ : ndarray of shape (n_alphas, n_folds) + Mean square error for the test set on each fold, varying alpha. + + alphas_ : ndarray of shape (n_alphas,) + The grid of alphas used for fitting. + + dual_gap_ : float or ndarray of shape (n_targets,) + The dual gap at the end of the optimization for the optimal alpha + (``alpha_``). + + n_iter_ : int + Number of iterations run by the coordinate descent solver to reach + the specified tolerance for the optimal alpha. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + lars_path : Compute Least Angle Regression or Lasso path using LARS + algorithm. + lasso_path : Compute Lasso path with coordinate descent. + Lasso : The Lasso is a linear model that estimates sparse coefficients. + LassoLars : Lasso model fit with Least Angle Regression a.k.a. Lars. + LassoCV : Lasso linear model with iterative fitting along a regularization + path. + LassoLarsCV : Cross-validated Lasso using the LARS algorithm. + + Notes + ----- + In `fit`, once the best parameter `alpha` is found through + cross-validation, the model is fit again using the entire training set. + + To avoid unnecessary memory duplication the `X` argument of the `fit` + method should be directly passed as a Fortran-contiguous numpy array. + + For an example, see :ref:`examples/linear_model/plot_lasso_model_selection.py + `. + + :class:`LassoCV` leads to different results than a hyperparameter + search using :class:`~sklearn.model_selection.GridSearchCV` with a + :class:`Lasso` model. In :class:`LassoCV`, a model for a given + penalty `alpha` is warm started using the coefficients of the + closest model (trained at the previous iteration) on the + regularization path. It tends to speed up the hyperparameter + search. 
+ + Examples + -------- + >>> from sklearn.linear_model import LassoCV + >>> from sklearn.datasets import make_regression + >>> X, y = make_regression(noise=4, random_state=0) + >>> reg = LassoCV(cv=5, random_state=0).fit(X, y) + >>> reg.score(X, y) + 0.9993 + >>> reg.predict(X[:1,]) + array([-78.4951]) + """ + + path = staticmethod(lasso_path) + + def __init__( + self, + *, + eps=1e-3, + n_alphas="deprecated", + alphas="warn", + fit_intercept=True, + precompute="auto", + max_iter=1000, + tol=1e-4, + copy_X=True, + cv=None, + verbose=False, + n_jobs=None, + positive=False, + random_state=None, + selection="cyclic", + ): + super().__init__( + eps=eps, + n_alphas=n_alphas, + alphas=alphas, + fit_intercept=fit_intercept, + precompute=precompute, + max_iter=max_iter, + tol=tol, + copy_X=copy_X, + cv=cv, + verbose=verbose, + n_jobs=n_jobs, + positive=positive, + random_state=random_state, + selection=selection, + ) + + def _get_estimator(self): + return Lasso() + + def _is_multitask(self): + return False + + def fit(self, X, y, sample_weight=None, **params): + """Fit Lasso model with coordinate descent. + + Fit is on grid of alphas and best alpha estimated by cross-validation. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. Pass directly as Fortran-contiguous data + to avoid unnecessary memory duplication. If y is mono-output, + X can be sparse. Note that large sparse matrices and arrays + requiring `int64` indices are not accepted. + + y : array-like of shape (n_samples,) + Target values. + + sample_weight : float or array-like of shape (n_samples,), \ + default=None + Sample weights used for fitting and evaluation of the weighted + mean squared error of each cv-fold. Note that the cross validated + MSE that is finally used to find the best model is the unweighted + mean over the (weighted) MSEs of each test fold. + + **params : dict, default=None + Parameters to be passed to the CV splitter. + + .. versionadded:: 1.4 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + Returns an instance of fitted model. + """ + return super().fit(X, y, sample_weight=sample_weight, **params) + + +class ElasticNetCV(RegressorMixin, LinearModelCV): + """Elastic Net model with iterative fitting along a regularization path. + + See glossary entry for :term:`cross-validation estimator`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + l1_ratio : float or list of float, default=0.5 + Float between 0 and 1 passed to ElasticNet (scaling between + l1 and l2 penalties). For ``l1_ratio = 0`` + the penalty is an L2 penalty. For ``l1_ratio = 1`` it is an L1 penalty. + For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2 + This parameter can be a list, in which case the different + values are tested by cross-validation and the one giving the best + prediction score is used. Note that a good choice of list of + values for l1_ratio is often to put more values close to 1 + (i.e. Lasso) and less close to 0 (i.e. Ridge), as in ``[.1, .5, .7, + .9, .95, .99, 1]``. + + eps : float, default=1e-3 + Length of the path. ``eps=1e-3`` means that + ``alpha_min / alpha_max = 1e-3``. + + n_alphas : int, default=100 + Number of alphas along the regularization path, used for each l1_ratio. + + .. 
deprecated:: 1.7 + `n_alphas` was deprecated in 1.7 and will be removed in 1.9. Use `alphas` + instead. + + alphas : array-like or int, default=None + Values of alphas to test along the regularization path, used for each l1_ratio. + If int, `alphas` values are generated automatically. + If array-like, list of alpha values to use. + + .. versionchanged:: 1.7 + `alphas` accepts an integer value which removes the need to pass + `n_alphas`. + + .. deprecated:: 1.7 + `alphas=None` was deprecated in 1.7 and will be removed in 1.9, at which + point the default value will be set to 100. + + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to false, no intercept will be used in calculations + (i.e. data is expected to be centered). + + precompute : 'auto', bool or array-like of shape \ + (n_features, n_features), default='auto' + Whether to use a precomputed Gram matrix to speed up + calculations. If set to ``'auto'`` let us decide. The Gram + matrix can also be passed as argument. + + max_iter : int, default=1000 + The maximum number of iterations. + + tol : float, default=1e-4 + The tolerance for the optimization: if the updates are + smaller than ``tol``, the optimization code checks the + dual gap for optimality and continues until it is smaller + than ``tol``. + + cv : int, cross-validation generator or iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross-validation, + - int, to specify the number of folds. + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For int/None inputs, :class:`~sklearn.model_selection.KFold` is used. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + ``cv`` default value if None changed from 3-fold to 5-fold. + + copy_X : bool, default=True + If ``True``, X will be copied; else, it may be overwritten. + + verbose : bool or int, default=0 + Amount of verbosity. + + n_jobs : int, default=None + Number of CPUs to use during the cross validation. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + positive : bool, default=False + When set to ``True``, forces the coefficients to be positive. + + random_state : int, RandomState instance, default=None + The seed of the pseudo random number generator that selects a random + feature to update. Used when ``selection`` == 'random'. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + selection : {'cyclic', 'random'}, default='cyclic' + If set to 'random', a random coefficient is updated every iteration + rather than looping over features sequentially by default. This + (setting to 'random') often leads to significantly faster convergence + especially when tol is higher than 1e-4. + + Attributes + ---------- + alpha_ : float + The amount of penalization chosen by cross validation. + + l1_ratio_ : float + The compromise between l1 and l2 penalization chosen by + cross validation. + + coef_ : ndarray of shape (n_features,) or (n_targets, n_features) + Parameter vector (w in the cost function formula). + + intercept_ : float or ndarray of shape (n_targets, n_features) + Independent term in the decision function. 
+ + mse_path_ : ndarray of shape (n_l1_ratio, n_alpha, n_folds) + Mean square error for the test set on each fold, varying l1_ratio and + alpha. + + alphas_ : ndarray of shape (n_alphas,) or (n_l1_ratio, n_alphas) + The grid of alphas used for fitting, for each l1_ratio. + + dual_gap_ : float + The dual gaps at the end of the optimization for the optimal alpha. + + n_iter_ : int + Number of iterations run by the coordinate descent solver to reach + the specified tolerance for the optimal alpha. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + enet_path : Compute elastic net path with coordinate descent. + ElasticNet : Linear regression with combined L1 and L2 priors as regularizer. + + Notes + ----- + In `fit`, once the best parameters `l1_ratio` and `alpha` are found through + cross-validation, the model is fit again using the entire training set. + + To avoid unnecessary memory duplication the `X` argument of the `fit` + method should be directly passed as a Fortran-contiguous numpy array. + + The parameter `l1_ratio` corresponds to alpha in the glmnet R package + while alpha corresponds to the lambda parameter in glmnet. + More specifically, the optimization objective is:: + + 1 / (2 * n_samples) * ||y - Xw||^2_2 + + alpha * l1_ratio * ||w||_1 + + 0.5 * alpha * (1 - l1_ratio) * ||w||^2_2 + + If you are interested in controlling the L1 and L2 penalty + separately, keep in mind that this is equivalent to:: + + a * L1 + b * L2 + + for:: + + alpha = a + b and l1_ratio = a / (a + b). + + For an example, see + :ref:`examples/linear_model/plot_lasso_model_selection.py + `. + + Examples + -------- + >>> from sklearn.linear_model import ElasticNetCV + >>> from sklearn.datasets import make_regression + + >>> X, y = make_regression(n_features=2, random_state=0) + >>> regr = ElasticNetCV(cv=5, random_state=0) + >>> regr.fit(X, y) + ElasticNetCV(cv=5, random_state=0) + >>> print(regr.alpha_) + 0.199 + >>> print(regr.intercept_) + 0.398 + >>> print(regr.predict([[0, 0]])) + [0.398] + """ + + _parameter_constraints: dict = { + **LinearModelCV._parameter_constraints, + "l1_ratio": [Interval(Real, 0, 1, closed="both"), "array-like"], + } + + path = staticmethod(enet_path) + + def __init__( + self, + *, + l1_ratio=0.5, + eps=1e-3, + n_alphas="deprecated", + alphas="warn", + fit_intercept=True, + precompute="auto", + max_iter=1000, + tol=1e-4, + cv=None, + copy_X=True, + verbose=0, + n_jobs=None, + positive=False, + random_state=None, + selection="cyclic", + ): + self.l1_ratio = l1_ratio + self.eps = eps + self.n_alphas = n_alphas + self.alphas = alphas + self.fit_intercept = fit_intercept + self.precompute = precompute + self.max_iter = max_iter + self.tol = tol + self.cv = cv + self.copy_X = copy_X + self.verbose = verbose + self.n_jobs = n_jobs + self.positive = positive + self.random_state = random_state + self.selection = selection + + def _get_estimator(self): + return ElasticNet() + + def _is_multitask(self): + return False + + def fit(self, X, y, sample_weight=None, **params): + """Fit ElasticNet model with coordinate descent. + + Fit is on grid of alphas and best alpha estimated by cross-validation. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. 
Pass directly as Fortran-contiguous data + to avoid unnecessary memory duplication. If y is mono-output, + X can be sparse. Note that large sparse matrices and arrays + requiring `int64` indices are not accepted. + + y : array-like of shape (n_samples,) + Target values. + + sample_weight : float or array-like of shape (n_samples,), \ + default=None + Sample weights used for fitting and evaluation of the weighted + mean squared error of each cv-fold. Note that the cross validated + MSE that is finally used to find the best model is the unweighted + mean over the (weighted) MSEs of each test fold. + + **params : dict, default=None + Parameters to be passed to the CV splitter. + + .. versionadded:: 1.4 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + Returns an instance of fitted model. + """ + return super().fit(X, y, sample_weight=sample_weight, **params) + + +############################################################################### +# Multi Task ElasticNet and Lasso models (with joint feature selection) + + +class MultiTaskElasticNet(Lasso): + """Multi-task ElasticNet model trained with L1/L2 mixed-norm as regularizer. + + The optimization objective for MultiTaskElasticNet is:: + + (1 / (2 * n_samples)) * ||Y - XW||_Fro^2 + + alpha * l1_ratio * ||W||_21 + + 0.5 * alpha * (1 - l1_ratio) * ||W||_Fro^2 + + Where:: + + ||W||_21 = sum_i sqrt(sum_j W_ij ^ 2) + + i.e. the sum of norms of each row. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + alpha : float, default=1.0 + Constant that multiplies the L1/L2 term. Defaults to 1.0. + + l1_ratio : float, default=0.5 + The ElasticNet mixing parameter, with 0 < l1_ratio <= 1. + For l1_ratio = 1 the penalty is an L1/L2 penalty. For l1_ratio = 0 it + is an L2 penalty. + For ``0 < l1_ratio < 1``, the penalty is a combination of L1/L2 and L2. + + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to false, no intercept will be used in calculations + (i.e. data is expected to be centered). + + copy_X : bool, default=True + If ``True``, X will be copied; else, it may be overwritten. + + max_iter : int, default=1000 + The maximum number of iterations. + + tol : float, default=1e-4 + The tolerance for the optimization: if the updates are + smaller than ``tol``, the optimization code checks the + dual gap for optimality and continues until it is smaller + than ``tol``. + + warm_start : bool, default=False + When set to ``True``, reuse the solution of the previous call to fit as + initialization, otherwise, just erase the previous solution. + See :term:`the Glossary `. + + random_state : int, RandomState instance, default=None + The seed of the pseudo random number generator that selects a random + feature to update. Used when ``selection`` == 'random'. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + selection : {'cyclic', 'random'}, default='cyclic' + If set to 'random', a random coefficient is updated every iteration + rather than looping over features sequentially by default. This + (setting to 'random') often leads to significantly faster convergence + especially when tol is higher than 1e-4. + + Attributes + ---------- + intercept_ : ndarray of shape (n_targets,) + Independent term in decision function. 
+ + coef_ : ndarray of shape (n_targets, n_features) + Parameter vector (W in the cost function formula). If a 1D y is + passed in at fit (non multi-task usage), ``coef_`` is then a 1D array. + Note that ``coef_`` stores the transpose of ``W``, ``W.T``. + + n_iter_ : int + Number of iterations run by the coordinate descent solver to reach + the specified tolerance. + + dual_gap_ : float + The dual gaps at the end of the optimization. + + eps_ : float + The tolerance scaled scaled by the variance of the target `y`. + + sparse_coef_ : sparse matrix of shape (n_features,) or \ + (n_targets, n_features) + Sparse representation of the `coef_`. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + MultiTaskElasticNetCV : Multi-task L1/L2 ElasticNet with built-in + cross-validation. + ElasticNet : Linear regression with combined L1 and L2 priors as regularizer. + MultiTaskLasso : Multi-task Lasso model trained with L1/L2 + mixed-norm as regularizer. + + Notes + ----- + The algorithm used to fit the model is coordinate descent. + + To avoid unnecessary memory duplication the X and y arguments of the fit + method should be directly passed as Fortran-contiguous numpy arrays. + + Examples + -------- + >>> from sklearn import linear_model + >>> clf = linear_model.MultiTaskElasticNet(alpha=0.1) + >>> clf.fit([[0,0], [1, 1], [2, 2]], [[0, 0], [1, 1], [2, 2]]) + MultiTaskElasticNet(alpha=0.1) + >>> print(clf.coef_) + [[0.45663524 0.45612256] + [0.45663524 0.45612256]] + >>> print(clf.intercept_) + [0.0872422 0.0872422] + """ + + _parameter_constraints: dict = { + **ElasticNet._parameter_constraints, + } + for param in ("precompute", "positive"): + _parameter_constraints.pop(param) + + def __init__( + self, + alpha=1.0, + *, + l1_ratio=0.5, + fit_intercept=True, + copy_X=True, + max_iter=1000, + tol=1e-4, + warm_start=False, + random_state=None, + selection="cyclic", + ): + self.l1_ratio = l1_ratio + self.alpha = alpha + self.fit_intercept = fit_intercept + self.max_iter = max_iter + self.copy_X = copy_X + self.tol = tol + self.warm_start = warm_start + self.random_state = random_state + self.selection = selection + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y): + """Fit MultiTaskElasticNet model with coordinate descent. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Data. + y : ndarray of shape (n_samples, n_targets) + Target. Will be cast to X's dtype if necessary. + + Returns + ------- + self : object + Fitted estimator. + + Notes + ----- + Coordinate descent is an algorithm that considers each column of + data at a time hence it will automatically convert the X input + as a Fortran-contiguous numpy array if necessary. + + To avoid memory re-allocation it is advised to allocate the + initial data in memory directly using that format. + """ + # Need to validate separately here. + # We can't pass multi_output=True because that would allow y to be csr. 
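+ # Validating separately keeps y dense (check_y_params below does not accept
+ # sparse input) without forcing it to 2D here; y is cast to X's dtype and
+ # its dimensionality is checked explicitly just after.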
+ check_X_params = dict( + dtype=[np.float64, np.float32], + order="F", + force_writeable=True, + copy=self.copy_X and self.fit_intercept, + ) + check_y_params = dict(ensure_2d=False, order="F") + X, y = validate_data( + self, X, y, validate_separately=(check_X_params, check_y_params) + ) + check_consistent_length(X, y) + y = y.astype(X.dtype) + + if hasattr(self, "l1_ratio"): + model_str = "ElasticNet" + else: + model_str = "Lasso" + if y.ndim == 1: + raise ValueError("For mono-task outputs, use %s" % model_str) + + n_samples, n_features = X.shape + n_targets = y.shape[1] + + X, y, X_offset, y_offset, X_scale = _preprocess_data( + X, y, fit_intercept=self.fit_intercept, copy=False + ) + + if not self.warm_start or not hasattr(self, "coef_"): + self.coef_ = np.zeros( + (n_targets, n_features), dtype=X.dtype.type, order="F" + ) + + l1_reg = self.alpha * self.l1_ratio * n_samples + l2_reg = self.alpha * (1.0 - self.l1_ratio) * n_samples + + self.coef_ = np.asfortranarray(self.coef_) # coef contiguous in memory + + random = self.selection == "random" + + ( + self.coef_, + self.dual_gap_, + self.eps_, + self.n_iter_, + ) = cd_fast.enet_coordinate_descent_multi_task( + self.coef_, + l1_reg, + l2_reg, + X, + y, + self.max_iter, + self.tol, + check_random_state(self.random_state), + random, + ) + + # account for different objective scaling here and in cd_fast + self.dual_gap_ /= n_samples + + self._set_intercept(X_offset, y_offset, X_scale) + + # return self for chaining fit and predict calls + return self + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = False + tags.target_tags.multi_output = True + tags.target_tags.single_output = False + return tags + + +class MultiTaskLasso(MultiTaskElasticNet): + """Multi-task Lasso model trained with L1/L2 mixed-norm as regularizer. + + The optimization objective for Lasso is:: + + (1 / (2 * n_samples)) * ||Y - XW||^2_Fro + alpha * ||W||_21 + + Where:: + + ||W||_21 = \\sum_i \\sqrt{\\sum_j w_{ij}^2} + + i.e. the sum of norm of each row. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + alpha : float, default=1.0 + Constant that multiplies the L1/L2 term. Defaults to 1.0. + + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to false, no intercept will be used in calculations + (i.e. data is expected to be centered). + + copy_X : bool, default=True + If ``True``, X will be copied; else, it may be overwritten. + + max_iter : int, default=1000 + The maximum number of iterations. + + tol : float, default=1e-4 + The tolerance for the optimization: if the updates are + smaller than ``tol``, the optimization code checks the + dual gap for optimality and continues until it is smaller + than ``tol``. + + warm_start : bool, default=False + When set to ``True``, reuse the solution of the previous call to fit as + initialization, otherwise, just erase the previous solution. + See :term:`the Glossary `. + + random_state : int, RandomState instance, default=None + The seed of the pseudo random number generator that selects a random + feature to update. Used when ``selection`` == 'random'. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + selection : {'cyclic', 'random'}, default='cyclic' + If set to 'random', a random coefficient is updated every iteration + rather than looping over features sequentially by default. 
This + (setting to 'random') often leads to significantly faster convergence + especially when tol is higher than 1e-4. + + Attributes + ---------- + coef_ : ndarray of shape (n_targets, n_features) + Parameter vector (W in the cost function formula). + Note that ``coef_`` stores the transpose of ``W``, ``W.T``. + + intercept_ : ndarray of shape (n_targets,) + Independent term in decision function. + + n_iter_ : int + Number of iterations run by the coordinate descent solver to reach + the specified tolerance. + + dual_gap_ : ndarray of shape (n_alphas,) + The dual gaps at the end of the optimization for each alpha. + + eps_ : float + The tolerance scaled scaled by the variance of the target `y`. + + sparse_coef_ : sparse matrix of shape (n_features,) or \ + (n_targets, n_features) + Sparse representation of the `coef_`. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + Lasso: Linear Model trained with L1 prior as regularizer (aka the Lasso). + MultiTaskLassoCV: Multi-task L1 regularized linear model with built-in + cross-validation. + MultiTaskElasticNetCV: Multi-task L1/L2 ElasticNet with built-in cross-validation. + + Notes + ----- + The algorithm used to fit the model is coordinate descent. + + To avoid unnecessary memory duplication the X and y arguments of the fit + method should be directly passed as Fortran-contiguous numpy arrays. + + Examples + -------- + >>> from sklearn import linear_model + >>> clf = linear_model.MultiTaskLasso(alpha=0.1) + >>> clf.fit([[0, 1], [1, 2], [2, 4]], [[0, 0], [1, 1], [2, 3]]) + MultiTaskLasso(alpha=0.1) + >>> print(clf.coef_) + [[0. 0.60809415] + [0. 0.94592424]] + >>> print(clf.intercept_) + [-0.41888636 -0.87382323] + """ + + _parameter_constraints: dict = { + **MultiTaskElasticNet._parameter_constraints, + } + _parameter_constraints.pop("l1_ratio") + + def __init__( + self, + alpha=1.0, + *, + fit_intercept=True, + copy_X=True, + max_iter=1000, + tol=1e-4, + warm_start=False, + random_state=None, + selection="cyclic", + ): + self.alpha = alpha + self.fit_intercept = fit_intercept + self.max_iter = max_iter + self.copy_X = copy_X + self.tol = tol + self.warm_start = warm_start + self.l1_ratio = 1.0 + self.random_state = random_state + self.selection = selection + + +class MultiTaskElasticNetCV(RegressorMixin, LinearModelCV): + """Multi-task L1/L2 ElasticNet with built-in cross-validation. + + See glossary entry for :term:`cross-validation estimator`. + + The optimization objective for MultiTaskElasticNet is:: + + (1 / (2 * n_samples)) * ||Y - XW||^Fro_2 + + alpha * l1_ratio * ||W||_21 + + 0.5 * alpha * (1 - l1_ratio) * ||W||_Fro^2 + + Where:: + + ||W||_21 = \\sum_i \\sqrt{\\sum_j w_{ij}^2} + + i.e. the sum of norm of each row. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.15 + + Parameters + ---------- + l1_ratio : float or list of float, default=0.5 + The ElasticNet mixing parameter, with 0 < l1_ratio <= 1. + For l1_ratio = 1 the penalty is an L1/L2 penalty. For l1_ratio = 0 it + is an L2 penalty. + For ``0 < l1_ratio < 1``, the penalty is a combination of L1/L2 and L2. + This parameter can be a list, in which case the different + values are tested by cross-validation and the one giving the best + prediction score is used. 
Note that a good choice of list of + values for l1_ratio is often to put more values close to 1 + (i.e. Lasso) and less close to 0 (i.e. Ridge), as in ``[.1, .5, .7, + .9, .95, .99, 1]``. + + eps : float, default=1e-3 + Length of the path. ``eps=1e-3`` means that + ``alpha_min / alpha_max = 1e-3``. + + n_alphas : int, default=100 + Number of alphas along the regularization path. + + .. deprecated:: 1.7 + `n_alphas` was deprecated in 1.7 and will be removed in 1.9. Use `alphas` + instead. + + alphas : array-like or int, default=None + Values of alphas to test along the regularization path, used for each l1_ratio. + If int, `alphas` values are generated automatically. + If array-like, list of alpha values to use. + + .. versionchanged:: 1.7 + `alphas` accepts an integer value which removes the need to pass + `n_alphas`. + + .. deprecated:: 1.7 + `alphas=None` was deprecated in 1.7 and will be removed in 1.9, at which + point the default value will be set to 100. + + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to false, no intercept will be used in calculations + (i.e. data is expected to be centered). + + max_iter : int, default=1000 + The maximum number of iterations. + + tol : float, default=1e-4 + The tolerance for the optimization: if the updates are + smaller than ``tol``, the optimization code checks the + dual gap for optimality and continues until it is smaller + than ``tol``. + + cv : int, cross-validation generator or iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross-validation, + - int, to specify the number of folds. + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For int/None inputs, :class:`~sklearn.model_selection.KFold` is used. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + ``cv`` default value if None changed from 3-fold to 5-fold. + + copy_X : bool, default=True + If ``True``, X will be copied; else, it may be overwritten. + + verbose : bool or int, default=0 + Amount of verbosity. + + n_jobs : int, default=None + Number of CPUs to use during the cross validation. Note that this is + used only if multiple values for l1_ratio are given. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + random_state : int, RandomState instance, default=None + The seed of the pseudo random number generator that selects a random + feature to update. Used when ``selection`` == 'random'. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + selection : {'cyclic', 'random'}, default='cyclic' + If set to 'random', a random coefficient is updated every iteration + rather than looping over features sequentially by default. This + (setting to 'random') often leads to significantly faster convergence + especially when tol is higher than 1e-4. + + Attributes + ---------- + intercept_ : ndarray of shape (n_targets,) + Independent term in decision function. + + coef_ : ndarray of shape (n_targets, n_features) + Parameter vector (W in the cost function formula). + Note that ``coef_`` stores the transpose of ``W``, ``W.T``. + + alpha_ : float + The amount of penalization chosen by cross validation. 
+ + mse_path_ : ndarray of shape (n_alphas, n_folds) or \ + (n_l1_ratio, n_alphas, n_folds) + Mean square error for the test set on each fold, varying alpha. + + alphas_ : ndarray of shape (n_alphas,) or (n_l1_ratio, n_alphas) + The grid of alphas used for fitting, for each l1_ratio. + + l1_ratio_ : float + Best l1_ratio obtained by cross-validation. + + n_iter_ : int + Number of iterations run by the coordinate descent solver to reach + the specified tolerance for the optimal alpha. + + dual_gap_ : float + The dual gap at the end of the optimization for the optimal alpha. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + MultiTaskElasticNet : Multi-task L1/L2 ElasticNet with built-in cross-validation. + ElasticNetCV : Elastic net model with best model selection by + cross-validation. + MultiTaskLassoCV : Multi-task Lasso model trained with L1 norm + as regularizer and built-in cross-validation. + + Notes + ----- + The algorithm used to fit the model is coordinate descent. + + In `fit`, once the best parameters `l1_ratio` and `alpha` are found through + cross-validation, the model is fit again using the entire training set. + + To avoid unnecessary memory duplication the `X` and `y` arguments of the + `fit` method should be directly passed as Fortran-contiguous numpy arrays. + + Examples + -------- + >>> from sklearn import linear_model + >>> clf = linear_model.MultiTaskElasticNetCV(cv=3) + >>> clf.fit([[0,0], [1, 1], [2, 2]], + ... [[0, 0], [1, 1], [2, 2]]) + MultiTaskElasticNetCV(cv=3) + >>> print(clf.coef_) + [[0.52875032 0.46958558] + [0.52875032 0.46958558]] + >>> print(clf.intercept_) + [0.00166409 0.00166409] + """ + + _parameter_constraints: dict = { + **LinearModelCV._parameter_constraints, + "l1_ratio": [Interval(Real, 0, 1, closed="both"), "array-like"], + } + _parameter_constraints.pop("precompute") + _parameter_constraints.pop("positive") + + path = staticmethod(enet_path) + + def __init__( + self, + *, + l1_ratio=0.5, + eps=1e-3, + n_alphas="deprecated", + alphas="warn", + fit_intercept=True, + max_iter=1000, + tol=1e-4, + cv=None, + copy_X=True, + verbose=0, + n_jobs=None, + random_state=None, + selection="cyclic", + ): + self.l1_ratio = l1_ratio + self.eps = eps + self.n_alphas = n_alphas + self.alphas = alphas + self.fit_intercept = fit_intercept + self.max_iter = max_iter + self.tol = tol + self.cv = cv + self.copy_X = copy_X + self.verbose = verbose + self.n_jobs = n_jobs + self.random_state = random_state + self.selection = selection + + def _get_estimator(self): + return MultiTaskElasticNet() + + def _is_multitask(self): + return True + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.target_tags.single_output = False + return tags + + # This is necessary as LinearModelCV now supports sample_weight while + # MultiTaskElasticNetCV does not (yet). + def fit(self, X, y, **params): + """Fit MultiTaskElasticNet model with coordinate descent. + + Fit is on grid of alphas and best alpha estimated by cross-validation. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Training data. + y : ndarray of shape (n_samples, n_targets) + Training target variable. Will be cast to X's dtype if necessary. 
+ + **params : dict, default=None + Parameters to be passed to the CV splitter. + + .. versionadded:: 1.4 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + Returns MultiTaskElasticNet instance. + """ + return super().fit(X, y, **params) + + +class MultiTaskLassoCV(RegressorMixin, LinearModelCV): + """Multi-task Lasso model trained with L1/L2 mixed-norm as regularizer. + + See glossary entry for :term:`cross-validation estimator`. + + The optimization objective for MultiTaskLasso is:: + + (1 / (2 * n_samples)) * ||Y - XW||^Fro_2 + alpha * ||W||_21 + + Where:: + + ||W||_21 = \\sum_i \\sqrt{\\sum_j w_{ij}^2} + + i.e. the sum of norm of each row. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.15 + + Parameters + ---------- + eps : float, default=1e-3 + Length of the path. ``eps=1e-3`` means that + ``alpha_min / alpha_max = 1e-3``. + + n_alphas : int, default=100 + Number of alphas along the regularization path. + + .. deprecated:: 1.7 + `n_alphas` was deprecated in 1.7 and will be removed in 1.9. Use `alphas` + instead. + + alphas : array-like or int, default=None + Values of alphas to test along the regularization path. + If int, `alphas` values are generated automatically. + If array-like, list of alpha values to use. + + .. versionchanged:: 1.7 + `alphas` accepts an integer value which removes the need to pass + `n_alphas`. + + .. deprecated:: 1.7 + `alphas=None` was deprecated in 1.7 and will be removed in 1.9, at which + point the default value will be set to 100. + + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to false, no intercept will be used in calculations + (i.e. data is expected to be centered). + + max_iter : int, default=1000 + The maximum number of iterations. + + tol : float, default=1e-4 + The tolerance for the optimization: if the updates are + smaller than ``tol``, the optimization code checks the + dual gap for optimality and continues until it is smaller + than ``tol``. + + copy_X : bool, default=True + If ``True``, X will be copied; else, it may be overwritten. + + cv : int, cross-validation generator or iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross-validation, + - int, to specify the number of folds. + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For int/None inputs, :class:`~sklearn.model_selection.KFold` is used. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + ``cv`` default value if None changed from 3-fold to 5-fold. + + verbose : bool or int, default=False + Amount of verbosity. + + n_jobs : int, default=None + Number of CPUs to use during the cross validation. Note that this is + used only if multiple values for l1_ratio are given. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + random_state : int, RandomState instance, default=None + The seed of the pseudo random number generator that selects a random + feature to update. Used when ``selection`` == 'random'. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. 
+ + selection : {'cyclic', 'random'}, default='cyclic' + If set to 'random', a random coefficient is updated every iteration + rather than looping over features sequentially by default. This + (setting to 'random') often leads to significantly faster convergence + especially when tol is higher than 1e-4. + + Attributes + ---------- + intercept_ : ndarray of shape (n_targets,) + Independent term in decision function. + + coef_ : ndarray of shape (n_targets, n_features) + Parameter vector (W in the cost function formula). + Note that ``coef_`` stores the transpose of ``W``, ``W.T``. + + alpha_ : float + The amount of penalization chosen by cross validation. + + mse_path_ : ndarray of shape (n_alphas, n_folds) + Mean square error for the test set on each fold, varying alpha. + + alphas_ : ndarray of shape (n_alphas,) + The grid of alphas used for fitting. + + n_iter_ : int + Number of iterations run by the coordinate descent solver to reach + the specified tolerance for the optimal alpha. + + dual_gap_ : float + The dual gap at the end of the optimization for the optimal alpha. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + MultiTaskElasticNet : Multi-task ElasticNet model trained with L1/L2 + mixed-norm as regularizer. + ElasticNetCV : Elastic net model with best model selection by + cross-validation. + MultiTaskElasticNetCV : Multi-task L1/L2 ElasticNet with built-in + cross-validation. + + Notes + ----- + The algorithm used to fit the model is coordinate descent. + + In `fit`, once the best parameter `alpha` is found through + cross-validation, the model is fit again using the entire training set. + + To avoid unnecessary memory duplication the `X` and `y` arguments of the + `fit` method should be directly passed as Fortran-contiguous numpy arrays. + + Examples + -------- + >>> from sklearn.linear_model import MultiTaskLassoCV + >>> from sklearn.datasets import make_regression + >>> from sklearn.metrics import r2_score + >>> X, y = make_regression(n_targets=2, noise=4, random_state=0) + >>> reg = MultiTaskLassoCV(cv=5, random_state=0).fit(X, y) + >>> r2_score(y, reg.predict(X)) + 0.9994 + >>> reg.alpha_ + np.float64(0.5713) + >>> reg.predict(X[:1,]) + array([[153.7971, 94.9015]]) + """ + + _parameter_constraints: dict = { + **LinearModelCV._parameter_constraints, + } + _parameter_constraints.pop("precompute") + _parameter_constraints.pop("positive") + + path = staticmethod(lasso_path) + + def __init__( + self, + *, + eps=1e-3, + n_alphas="deprecated", + alphas="warn", + fit_intercept=True, + max_iter=1000, + tol=1e-4, + copy_X=True, + cv=None, + verbose=False, + n_jobs=None, + random_state=None, + selection="cyclic", + ): + super().__init__( + eps=eps, + n_alphas=n_alphas, + alphas=alphas, + fit_intercept=fit_intercept, + max_iter=max_iter, + tol=tol, + copy_X=copy_X, + cv=cv, + verbose=verbose, + n_jobs=n_jobs, + random_state=random_state, + selection=selection, + ) + + def _get_estimator(self): + return MultiTaskLasso() + + def _is_multitask(self): + return True + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.target_tags.single_output = False + return tags + + # This is necessary as LinearModelCV now supports sample_weight while + # MultiTaskLassoCV does not (yet). 
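+    # The override below therefore only exposes the reduced signature
+    # (X, y, **params) and forwards everything unchanged to LinearModelCV.fit.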
+ def fit(self, X, y, **params): + """Fit MultiTaskLasso model with coordinate descent. + + Fit is on grid of alphas and best alpha estimated by cross-validation. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Data. + y : ndarray of shape (n_samples, n_targets) + Target. Will be cast to X's dtype if necessary. + + **params : dict, default=None + Parameters to be passed to the CV splitter. + + .. versionadded:: 1.4 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + Returns an instance of fitted model. + """ + return super().fit(X, y, **params) diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_glm/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_glm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5c471c35096f8ab59d042ea4c0758d88d8819282 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_glm/__init__.py @@ -0,0 +1,16 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from .glm import ( + GammaRegressor, + PoissonRegressor, + TweedieRegressor, + _GeneralizedLinearRegressor, +) + +__all__ = [ + "GammaRegressor", + "PoissonRegressor", + "TweedieRegressor", + "_GeneralizedLinearRegressor", +] diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_glm/_newton_solver.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_glm/_newton_solver.py new file mode 100644 index 0000000000000000000000000000000000000000..cfef023692d68102d6aee9602f31fd90854bb89d --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_glm/_newton_solver.py @@ -0,0 +1,631 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +""" +Newton solver for Generalized Linear Models +""" + +import warnings +from abc import ABC, abstractmethod + +import numpy as np +import scipy.linalg +import scipy.optimize + +from ..._loss.loss import HalfSquaredError +from ...exceptions import ConvergenceWarning +from ...utils.fixes import _get_additional_lbfgs_options_dict +from ...utils.optimize import _check_optimize_result +from .._linear_loss import LinearModelLoss + + +class NewtonSolver(ABC): + """Newton solver for GLMs. + + This class implements Newton/2nd-order optimization routines for GLMs. Each Newton + iteration aims at finding the Newton step which is done by the inner solver. With + Hessian H, gradient g and coefficients coef, one step solves: + + H @ coef_newton = -g + + For our GLM / LinearModelLoss, we have gradient g and Hessian H: + + g = X.T @ loss.gradient + l2_reg_strength * coef + H = X.T @ diag(loss.hessian) @ X + l2_reg_strength * identity + + Backtracking line search updates coef = coef_old + t * coef_newton for some t in + (0, 1]. + + This is a base class, actual implementations (child classes) may deviate from the + above pattern and use structure specific tricks. + + Usage pattern: + - initialize solver: sol = NewtonSolver(...) + - solve the problem: sol.solve(X, y, sample_weight) + + References + ---------- + - Jorge Nocedal, Stephen J. Wright. (2006) "Numerical Optimization" + 2nd edition + https://doi.org/10.1007/978-0-387-40065-5 + + - Stephen P. Boyd, Lieven Vandenberghe. (2004) "Convex Optimization." + Cambridge University Press, 2004. 
+ https://web.stanford.edu/~boyd/cvxbook/bv_cvxbook.pdf + + Parameters + ---------- + coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,) + Initial coefficients of a linear model. + If shape (n_classes * n_dof,), the classes of one feature are contiguous, + i.e. one reconstructs the 2d-array via + coef.reshape((n_classes, -1), order="F"). + + linear_loss : LinearModelLoss + The loss to be minimized. + + l2_reg_strength : float, default=0.0 + L2 regularization strength. + + tol : float, default=1e-4 + The optimization problem is solved when each of the following condition is + fulfilled: + 1. maximum |gradient| <= tol + 2. Newton decrement d: 1/2 * d^2 <= tol + + max_iter : int, default=100 + Maximum number of Newton steps allowed. + + n_threads : int, default=1 + Number of OpenMP threads to use for the computation of the Hessian and gradient + of the loss function. + + Attributes + ---------- + coef_old : ndarray of shape coef.shape + Coefficient of previous iteration. + + coef_newton : ndarray of shape coef.shape + Newton step. + + gradient : ndarray of shape coef.shape + Gradient of the loss w.r.t. the coefficients. + + gradient_old : ndarray of shape coef.shape + Gradient of previous iteration. + + loss_value : float + Value of objective function = loss + penalty. + + loss_value_old : float + Value of objective function of previous itertion. + + raw_prediction : ndarray of shape (n_samples,) or (n_samples, n_classes) + + converged : bool + Indicator for convergence of the solver. + + iteration : int + Number of Newton steps, i.e. calls to inner_solve + + use_fallback_lbfgs_solve : bool + If set to True, the solver will resort to call LBFGS to finish the optimisation + procedure in case of convergence issues. + + gradient_times_newton : float + gradient @ coef_newton, set in inner_solve and used by line_search. If the + Newton step is a descent direction, this is negative. + """ + + def __init__( + self, + *, + coef, + linear_loss=LinearModelLoss(base_loss=HalfSquaredError(), fit_intercept=True), + l2_reg_strength=0.0, + tol=1e-4, + max_iter=100, + n_threads=1, + verbose=0, + ): + self.coef = coef + self.linear_loss = linear_loss + self.l2_reg_strength = l2_reg_strength + self.tol = tol + self.max_iter = max_iter + self.n_threads = n_threads + self.verbose = verbose + + def setup(self, X, y, sample_weight): + """Precomputations + + If None, initializes: + - self.coef + Sets: + - self.raw_prediction + - self.loss_value + """ + _, _, self.raw_prediction = self.linear_loss.weight_intercept_raw(self.coef, X) + self.loss_value = self.linear_loss.loss( + coef=self.coef, + X=X, + y=y, + sample_weight=sample_weight, + l2_reg_strength=self.l2_reg_strength, + n_threads=self.n_threads, + raw_prediction=self.raw_prediction, + ) + + @abstractmethod + def update_gradient_hessian(self, X, y, sample_weight): + """Update gradient and Hessian.""" + + @abstractmethod + def inner_solve(self, X, y, sample_weight): + """Compute Newton step. + + Sets: + - self.coef_newton + - self.gradient_times_newton + """ + + def fallback_lbfgs_solve(self, X, y, sample_weight): + """Fallback solver in case of emergency. + + If a solver detects convergence problems, it may fall back to this methods in + the hope to exit with success instead of raising an error. 
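+        Only the remaining iteration budget, i.e. ``max_iter - iteration``, is
+        handed over to scipy's L-BFGS-B.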
+ + Sets: + - self.coef + - self.converged + """ + max_iter = self.max_iter - self.iteration + opt_res = scipy.optimize.minimize( + self.linear_loss.loss_gradient, + self.coef, + method="L-BFGS-B", + jac=True, + options={ + "maxiter": max_iter, + "maxls": 50, # default is 20 + "gtol": self.tol, + "ftol": 64 * np.finfo(np.float64).eps, + **_get_additional_lbfgs_options_dict("iprint", self.verbose - 1), + }, + args=(X, y, sample_weight, self.l2_reg_strength, self.n_threads), + ) + self.iteration += _check_optimize_result("lbfgs", opt_res, max_iter=max_iter) + self.coef = opt_res.x + self.converged = opt_res.status == 0 + + def line_search(self, X, y, sample_weight): + """Backtracking line search. + + Sets: + - self.coef_old + - self.coef + - self.loss_value_old + - self.loss_value + - self.gradient_old + - self.gradient + - self.raw_prediction + """ + # line search parameters + beta, sigma = 0.5, 0.00048828125 # 1/2, 1/2**11 + eps = 16 * np.finfo(self.loss_value.dtype).eps + t = 1 # step size + + # gradient_times_newton = self.gradient @ self.coef_newton + # was computed in inner_solve. + armijo_term = sigma * self.gradient_times_newton + _, _, raw_prediction_newton = self.linear_loss.weight_intercept_raw( + self.coef_newton, X + ) + + self.coef_old = self.coef + self.loss_value_old = self.loss_value + self.gradient_old = self.gradient + + # np.sum(np.abs(self.gradient_old)) + sum_abs_grad_old = -1 + + is_verbose = self.verbose >= 2 + if is_verbose: + print(" Backtracking Line Search") + print(f" eps=16 * finfo.eps={eps}") + + for i in range(21): # until and including t = beta**20 ~ 1e-6 + self.coef = self.coef_old + t * self.coef_newton + raw = self.raw_prediction + t * raw_prediction_newton + self.loss_value, self.gradient = self.linear_loss.loss_gradient( + coef=self.coef, + X=X, + y=y, + sample_weight=sample_weight, + l2_reg_strength=self.l2_reg_strength, + n_threads=self.n_threads, + raw_prediction=raw, + ) + # Note: If coef_newton is too large, loss_gradient may produce inf values, + # potentially accompanied by a RuntimeWarning. + # This case will be captured by the Armijo condition. + + # 1. Check Armijo / sufficient decrease condition. + # The smaller (more negative) the better. + loss_improvement = self.loss_value - self.loss_value_old + check = loss_improvement <= t * armijo_term + if is_verbose: + print( + f" line search iteration={i + 1}, step size={t}\n" + f" check loss improvement <= armijo term: {loss_improvement} " + f"<= {t * armijo_term} {check}" + ) + if check: + break + # 2. Deal with relative loss differences around machine precision. + tiny_loss = np.abs(self.loss_value_old * eps) + check = np.abs(loss_improvement) <= tiny_loss + if is_verbose: + print( + " check loss |improvement| <= eps * |loss_old|:" + f" {np.abs(loss_improvement)} <= {tiny_loss} {check}" + ) + if check: + if sum_abs_grad_old < 0: + sum_abs_grad_old = scipy.linalg.norm(self.gradient_old, ord=1) + # 2.1 Check sum of absolute gradients as alternative condition. + sum_abs_grad = scipy.linalg.norm(self.gradient, ord=1) + check = sum_abs_grad < sum_abs_grad_old + if is_verbose: + print( + " check sum(|gradient|) < sum(|gradient_old|): " + f"{sum_abs_grad} < {sum_abs_grad_old} {check}" + ) + if check: + break + + t *= beta + else: + warnings.warn( + ( + f"Line search of Newton solver {self.__class__.__name__} at" + f" iteration #{self.iteration} did no converge after 21 line search" + " refinement iterations. It will now resort to lbfgs instead." 
+ ), + ConvergenceWarning, + ) + if self.verbose: + print(" Line search did not converge and resorts to lbfgs instead.") + self.use_fallback_lbfgs_solve = True + return + + self.raw_prediction = raw + if is_verbose: + print( + f" line search successful after {i + 1} iterations with " + f"loss={self.loss_value}." + ) + + def check_convergence(self, X, y, sample_weight): + """Check for convergence. + + Sets self.converged. + """ + if self.verbose: + print(" Check Convergence") + # Note: Checking maximum relative change of coefficient <= tol is a bad + # convergence criterion because even a large step could have brought us close + # to the true minimum. + # coef_step = self.coef - self.coef_old + # change = np.max(np.abs(coef_step) / np.maximum(1, np.abs(self.coef_old))) + # check = change <= tol + + # 1. Criterion: maximum |gradient| <= tol + # The gradient was already updated in line_search() + g_max_abs = np.max(np.abs(self.gradient)) + check = g_max_abs <= self.tol + if self.verbose: + print(f" 1. max |gradient| {g_max_abs} <= {self.tol} {check}") + if not check: + return + + # 2. Criterion: For Newton decrement d, check 1/2 * d^2 <= tol + # d = sqrt(grad @ hessian^-1 @ grad) + # = sqrt(coef_newton @ hessian @ coef_newton) + # See Boyd, Vanderberghe (2009) "Convex Optimization" Chapter 9.5.1. + d2 = self.coef_newton @ self.hessian @ self.coef_newton + check = 0.5 * d2 <= self.tol + if self.verbose: + print(f" 2. Newton decrement {0.5 * d2} <= {self.tol} {check}") + if not check: + return + + if self.verbose: + loss_value = self.linear_loss.loss( + coef=self.coef, + X=X, + y=y, + sample_weight=sample_weight, + l2_reg_strength=self.l2_reg_strength, + n_threads=self.n_threads, + ) + print(f" Solver did converge at loss = {loss_value}.") + self.converged = True + + def finalize(self, X, y, sample_weight): + """Finalize the solvers results. + + Some solvers may need this, others not. + """ + pass + + def solve(self, X, y, sample_weight): + """Solve the optimization problem. + + This is the main routine. + + Order of calls: + self.setup() + while iteration: + self.update_gradient_hessian() + self.inner_solve() + self.line_search() + self.check_convergence() + self.finalize() + + Returns + ------- + coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,) + Solution of the optimization problem. + """ + # setup usually: + # - initializes self.coef if needed + # - initializes and calculates self.raw_predictions, self.loss_value + self.setup(X=X, y=y, sample_weight=sample_weight) + + self.iteration = 1 + self.converged = False + self.use_fallback_lbfgs_solve = False + + while self.iteration <= self.max_iter and not self.converged: + if self.verbose: + print(f"Newton iter={self.iteration}") + + self.use_fallback_lbfgs_solve = False # Fallback solver. + + # 1. Update Hessian and gradient + self.update_gradient_hessian(X=X, y=y, sample_weight=sample_weight) + + # TODO: + # if iteration == 1: + # We might stop early, e.g. we already are close to the optimum, + # usually detected by zero gradients at this stage. + + # 2. Inner solver + # Calculate Newton step/direction + # This usually sets self.coef_newton and self.gradient_times_newton. + self.inner_solve(X=X, y=y, sample_weight=sample_weight) + if self.use_fallback_lbfgs_solve: + break + + # 3. Backtracking line search + # This usually sets self.coef_old, self.coef, self.loss_value_old + # self.loss_value, self.gradient_old, self.gradient, + # self.raw_prediction. 
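+            # line_search may itself request the lbfgs fallback (e.g. when no
+            # sufficient decrease is found within 21 step size reductions),
+            # hence the check right below.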
+ self.line_search(X=X, y=y, sample_weight=sample_weight) + if self.use_fallback_lbfgs_solve: + break + + # 4. Check convergence + # Sets self.converged. + self.check_convergence(X=X, y=y, sample_weight=sample_weight) + + # 5. Next iteration + self.iteration += 1 + + if not self.converged: + if self.use_fallback_lbfgs_solve: + # Note: The fallback solver circumvents check_convergence and relies on + # the convergence checks of lbfgs instead. Enough warnings have been + # raised on the way. + self.fallback_lbfgs_solve(X=X, y=y, sample_weight=sample_weight) + else: + warnings.warn( + ( + f"Newton solver did not converge after {self.iteration - 1} " + "iterations." + ), + ConvergenceWarning, + ) + + self.iteration -= 1 + self.finalize(X=X, y=y, sample_weight=sample_weight) + return self.coef + + +class NewtonCholeskySolver(NewtonSolver): + """Cholesky based Newton solver. + + Inner solver for finding the Newton step H w_newton = -g uses Cholesky based linear + solver. + """ + + def setup(self, X, y, sample_weight): + super().setup(X=X, y=y, sample_weight=sample_weight) + if self.linear_loss.base_loss.is_multiclass: + # Easier with ravelled arrays, e.g., for scipy.linalg.solve. + # As with LinearModelLoss, we always are contiguous in n_classes. + self.coef = self.coef.ravel(order="F") + # Note that the computation of gradient in LinearModelLoss follows the shape of + # coef. + self.gradient = np.empty_like(self.coef) + # But the hessian is always 2d. + n = self.coef.size + self.hessian = np.empty_like(self.coef, shape=(n, n)) + # To help case distinctions. + self.is_multinomial_with_intercept = ( + self.linear_loss.base_loss.is_multiclass and self.linear_loss.fit_intercept + ) + self.is_multinomial_no_penalty = ( + self.linear_loss.base_loss.is_multiclass and self.l2_reg_strength == 0 + ) + if self.is_multinomial_no_penalty: + # See inner_solve. The provided coef might not adhere to the convention + # that the last class is set to zero. + # This is done by the usual freedom of a (overparametrized) multinomial to + # add a constant to all classes which doesn't change predictions. + n_classes = self.linear_loss.base_loss.n_classes + coef = self.coef.reshape(n_classes, -1, order="F") # easier as 2d + coef -= coef[-1, :] # coef -= coef of last class + elif self.is_multinomial_with_intercept: + # See inner_solve. Same as above, but only for the intercept. + n_classes = self.linear_loss.base_loss.n_classes + # intercept -= intercept of last class + self.coef[-n_classes:] -= self.coef[-1] + + def update_gradient_hessian(self, X, y, sample_weight): + _, _, self.hessian_warning = self.linear_loss.gradient_hessian( + coef=self.coef, + X=X, + y=y, + sample_weight=sample_weight, + l2_reg_strength=self.l2_reg_strength, + n_threads=self.n_threads, + gradient_out=self.gradient, + hessian_out=self.hessian, + raw_prediction=self.raw_prediction, # this was updated in line_search + ) + + def inner_solve(self, X, y, sample_weight): + if self.hessian_warning: + warnings.warn( + ( + f"The inner solver of {self.__class__.__name__} detected a " + "pointwise hessian with many negative values at iteration " + f"#{self.iteration}. It will now resort to lbfgs instead." + ), + ConvergenceWarning, + ) + if self.verbose: + print( + " The inner solver detected a pointwise Hessian with many " + "negative values and resorts to lbfgs instead." 
+ ) + self.use_fallback_lbfgs_solve = True + return + + # Note: The following case distinction could also be shifted to the + # implementation of HalfMultinomialLoss instead of here within the solver. + if self.is_multinomial_no_penalty: + # The multinomial loss is overparametrized for each unpenalized feature, so + # at least the intercepts. This can be seen by noting that predicted + # probabilities are invariant under shifting all coefficients of a single + # feature j for all classes by the same amount c: + # coef[k, :] -> coef[k, :] + c => proba stays the same + # where we have assumed coef.shape = (n_classes, n_features). + # Therefore, also the loss (-log-likelihood), gradient and hessian stay the + # same, see + # Noah Simon and Jerome Friedman and Trevor Hastie. (2013) "A Blockwise + # Descent Algorithm for Group-penalized Multiresponse and Multinomial + # Regression". https://doi.org/10.48550/arXiv.1311.6529 + # + # We choose the standard approach and set all the coefficients of the last + # class to zero, for all features including the intercept. + # Note that coef was already dealt with in setup. + n_classes = self.linear_loss.base_loss.n_classes + n_dof = self.coef.size // n_classes # degree of freedom per class + n = self.coef.size - n_dof # effective size + self.gradient[n_classes - 1 :: n_classes] = 0 + self.hessian[n_classes - 1 :: n_classes, :] = 0 + self.hessian[:, n_classes - 1 :: n_classes] = 0 + # We also need the reduced variants of gradient and hessian where the + # entries set to zero are removed. For 2 features and 3 classes with + # arbitrary values, "x" means removed: + # gradient = [0, 1, x, 3, 4, x] + # + # hessian = [0, 1, x, 3, 4, x] + # [1, 7, x, 9, 10, x] + # [x, x, x, x, x, x] + # [3, 9, x, 21, 22, x] + # [4, 10, x, 22, 28, x] + # [x, x, x, x, x, x] + # The following slicing triggers copies of gradient and hessian. + gradient = self.gradient.reshape(-1, n_classes)[:, :-1].flatten() + hessian = self.hessian.reshape(n_dof, n_classes, n_dof, n_classes)[ + :, :-1, :, :-1 + ].reshape(n, n) + elif self.is_multinomial_with_intercept: + # Here, only intercepts are unpenalized. We again choose the last class and + # set its intercept to zero. + # Note that coef was already dealt with in setup. + self.gradient[-1] = 0 + self.hessian[-1, :] = 0 + self.hessian[:, -1] = 0 + gradient, hessian = self.gradient[:-1], self.hessian[:-1, :-1] + else: + gradient, hessian = self.gradient, self.hessian + + try: + with warnings.catch_warnings(): + warnings.simplefilter("error", scipy.linalg.LinAlgWarning) + self.coef_newton = scipy.linalg.solve( + hessian, -gradient, check_finite=False, assume_a="sym" + ) + if self.is_multinomial_no_penalty: + self.coef_newton = np.c_[ + self.coef_newton.reshape(n_dof, n_classes - 1), np.zeros(n_dof) + ].reshape(-1) + assert self.coef_newton.flags.f_contiguous + elif self.is_multinomial_with_intercept: + self.coef_newton = np.r_[self.coef_newton, 0] + self.gradient_times_newton = self.gradient @ self.coef_newton + if self.gradient_times_newton > 0: + if self.verbose: + print( + " The inner solver found a Newton step that is not a " + "descent direction and resorts to LBFGS steps instead." + ) + self.use_fallback_lbfgs_solve = True + return + except (np.linalg.LinAlgError, scipy.linalg.LinAlgWarning) as e: + warnings.warn( + f"The inner solver of {self.__class__.__name__} stumbled upon a " + "singular or very ill-conditioned Hessian matrix at iteration " + f"{self.iteration}. 
It will now resort to lbfgs instead.\n" + "Further options are to use another solver or to avoid such situation " + "in the first place. Possible remedies are removing collinear features" + " of X or increasing the penalization strengths.\n" + "The original Linear Algebra message was:\n" + str(e), + scipy.linalg.LinAlgWarning, + ) + # Possible causes: + # 1. hess_pointwise is negative. But this is already taken care in + # LinearModelLoss.gradient_hessian. + # 2. X is singular or ill-conditioned + # This might be the most probable cause. + # + # There are many possible ways to deal with this situation. Most of them + # add, explicitly or implicitly, a matrix to the hessian to make it + # positive definite, confer to Chapter 3.4 of Nocedal & Wright 2nd ed. + # Instead, we resort to lbfgs. + if self.verbose: + print( + " The inner solver stumbled upon an singular or ill-conditioned " + "Hessian matrix and resorts to LBFGS instead." + ) + self.use_fallback_lbfgs_solve = True + return + + def finalize(self, X, y, sample_weight): + if self.is_multinomial_no_penalty: + # Our convention is usually the symmetric parametrization where + # sum(coef[classes, features], axis=0) = 0. + # We convert now to this convention. Note that it does not change + # the predicted probabilities. + n_classes = self.linear_loss.base_loss.n_classes + self.coef = self.coef.reshape(n_classes, -1, order="F") + self.coef -= np.mean(self.coef, axis=0) + elif self.is_multinomial_with_intercept: + # Only the intercept needs an update to the symmetric parametrization. + n_classes = self.linear_loss.base_loss.n_classes + self.coef[-n_classes:] -= np.mean(self.coef[-n_classes:]) diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_glm/glm.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_glm/glm.py new file mode 100644 index 0000000000000000000000000000000000000000..8ba24878b95b2dedbbc7bee89be5616fb5928359 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_glm/glm.py @@ -0,0 +1,911 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +""" +Generalized Linear Models with Exponential Dispersion Family +""" + +from numbers import Integral, Real + +import numpy as np +import scipy.optimize + +from ..._loss.loss import ( + HalfGammaLoss, + HalfPoissonLoss, + HalfSquaredError, + HalfTweedieLoss, + HalfTweedieLossIdentity, +) +from ...base import BaseEstimator, RegressorMixin, _fit_context +from ...utils import check_array +from ...utils._openmp_helpers import _openmp_effective_n_threads +from ...utils._param_validation import Hidden, Interval, StrOptions +from ...utils.fixes import _get_additional_lbfgs_options_dict +from ...utils.optimize import _check_optimize_result +from ...utils.validation import _check_sample_weight, check_is_fitted, validate_data +from .._linear_loss import LinearModelLoss +from ._newton_solver import NewtonCholeskySolver, NewtonSolver + + +class _GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): + """Regression via a penalized Generalized Linear Model (GLM). + + GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at fitting and + predicting the mean of the target y as y_pred=h(X*w) with coefficients w. + Therefore, the fit minimizes the following objective function with L2 priors as + regularizer:: + + 1/(2*sum(s_i)) * sum(s_i * deviance(y_i, h(x_i*w)) + 1/2 * alpha * ||w||_2^2 + + with inverse link function h, s=sample_weight and per observation (unit) deviance + deviance(y_i, h(x_i*w)). 
Note that for an EDM, 1/2 * deviance is the negative + log-likelihood up to a constant (in w) term. + The parameter ``alpha`` corresponds to the lambda parameter in glmnet. + + Instead of implementing the EDM family and a link function separately, we directly + use the loss functions `from sklearn._loss` which have the link functions included + in them for performance reasons. We pick the loss functions that implement + (1/2 times) EDM deviances. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.23 + + Parameters + ---------- + alpha : float, default=1 + Constant that multiplies the penalty term and thus determines the + regularization strength. ``alpha = 0`` is equivalent to unpenalized + GLMs. In this case, the design matrix `X` must have full column rank + (no collinearities). + Values must be in the range `[0.0, inf)`. + + fit_intercept : bool, default=True + Specifies if a constant (a.k.a. bias or intercept) should be + added to the linear predictor (X @ coef + intercept). + + solver : {'lbfgs', 'newton-cholesky'}, default='lbfgs' + Algorithm to use in the optimization problem: + + 'lbfgs' + Calls scipy's L-BFGS-B optimizer. + + 'newton-cholesky' + Uses Newton-Raphson steps (in arbitrary precision arithmetic equivalent to + iterated reweighted least squares) with an inner Cholesky based solver. + This solver is a good choice for `n_samples` >> `n_features`, especially + with one-hot encoded categorical features with rare categories. Be aware + that the memory usage of this solver has a quadratic dependency on + `n_features` because it explicitly computes the Hessian matrix. + + .. versionadded:: 1.2 + + max_iter : int, default=100 + The maximal number of iterations for the solver. + Values must be in the range `[1, inf)`. + + tol : float, default=1e-4 + Stopping criterion. For the lbfgs solver, + the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol`` + where ``g_j`` is the j-th component of the gradient (derivative) of + the objective function. + Values must be in the range `(0.0, inf)`. + + warm_start : bool, default=False + If set to ``True``, reuse the solution of the previous call to ``fit`` + as initialization for ``coef_`` and ``intercept_``. + + verbose : int, default=0 + For the lbfgs solver set verbose to any positive number for verbosity. + Values must be in the range `[0, inf)`. + + Attributes + ---------- + coef_ : array of shape (n_features,) + Estimated coefficients for the linear predictor (`X @ coef_ + + intercept_`) in the GLM. + + intercept_ : float + Intercept (a.k.a. bias) added to linear predictor. + + n_iter_ : int + Actual number of iterations used in the solver. + + _base_loss : BaseLoss, default=HalfSquaredError() + This is set during fit via `self._get_loss()`. + A `_base_loss` contains a specific loss function as well as the link + function. The loss to be minimized specifies the distributional assumption of + the GLM, i.e. the distribution from the EDM. Here are some examples: + + ======================= ======== ========================== + _base_loss Link Target Domain + ======================= ======== ========================== + HalfSquaredError identity y any real number + HalfPoissonLoss log 0 <= y + HalfGammaLoss log 0 < y + HalfTweedieLoss log dependent on tweedie power + HalfTweedieLossIdentity identity dependent on tweedie power + ======================= ======== ========================== + + The link function of the GLM, i.e. mapping from linear predictor + `X @ coeff + intercept` to prediction `y_pred`. 
For instance, with a log link, + we have `y_pred = exp(X @ coeff + intercept)`. + """ + + # We allow for NewtonSolver classes for the "solver" parameter but do not + # make them public in the docstrings. This facilitates testing and + # benchmarking. + _parameter_constraints: dict = { + "alpha": [Interval(Real, 0.0, None, closed="left")], + "fit_intercept": ["boolean"], + "solver": [ + StrOptions({"lbfgs", "newton-cholesky"}), + Hidden(type), + ], + "max_iter": [Interval(Integral, 1, None, closed="left")], + "tol": [Interval(Real, 0.0, None, closed="neither")], + "warm_start": ["boolean"], + "verbose": ["verbose"], + } + + def __init__( + self, + *, + alpha=1.0, + fit_intercept=True, + solver="lbfgs", + max_iter=100, + tol=1e-4, + warm_start=False, + verbose=0, + ): + self.alpha = alpha + self.fit_intercept = fit_intercept + self.solver = solver + self.max_iter = max_iter + self.tol = tol + self.warm_start = warm_start + self.verbose = verbose + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, sample_weight=None): + """Fit a Generalized Linear Model. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) + Target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + self : object + Fitted model. + """ + X, y = validate_data( + self, + X, + y, + accept_sparse=["csc", "csr"], + dtype=[np.float64, np.float32], + y_numeric=True, + multi_output=False, + ) + + # required by losses + if self.solver == "lbfgs": + # lbfgs will force coef and therefore raw_prediction to be float64. The + # base_loss needs y, X @ coef and sample_weight all of same dtype + # (and contiguous). + loss_dtype = np.float64 + else: + loss_dtype = min(max(y.dtype, X.dtype), np.float64) + y = check_array(y, dtype=loss_dtype, order="C", ensure_2d=False) + + if sample_weight is not None: + # Note that _check_sample_weight calls check_array(order="C") required by + # losses. + sample_weight = _check_sample_weight(sample_weight, X, dtype=loss_dtype) + + n_samples, n_features = X.shape + self._base_loss = self._get_loss() + + linear_loss = LinearModelLoss( + base_loss=self._base_loss, + fit_intercept=self.fit_intercept, + ) + + if not linear_loss.base_loss.in_y_true_range(y): + raise ValueError( + "Some value(s) of y are out of the valid range of the loss" + f" {self._base_loss.__class__.__name__!r}." + ) + + # TODO: if alpha=0 check that X is not rank deficient + + # NOTE: Rescaling of sample_weight: + # We want to minimize + # obj = 1/(2 * sum(sample_weight)) * sum(sample_weight * deviance) + # + 1/2 * alpha * L2, + # with + # deviance = 2 * loss. + # The objective is invariant to multiplying sample_weight by a constant. We + # could choose this constant such that sum(sample_weight) = 1 in order to end + # up with + # obj = sum(sample_weight * loss) + 1/2 * alpha * L2. + # But LinearModelLoss.loss() already computes + # average(loss, weights=sample_weight) + # Thus, without rescaling, we have + # obj = LinearModelLoss.loss(...) + + if self.warm_start and hasattr(self, "coef_"): + if self.fit_intercept: + # LinearModelLoss needs intercept at the end of coefficient array. 
+ coef = np.concatenate((self.coef_, np.array([self.intercept_]))) + else: + coef = self.coef_ + coef = coef.astype(loss_dtype, copy=False) + else: + coef = linear_loss.init_zero_coef(X, dtype=loss_dtype) + if self.fit_intercept: + coef[-1] = linear_loss.base_loss.link.link( + np.average(y, weights=sample_weight) + ) + + l2_reg_strength = self.alpha + n_threads = _openmp_effective_n_threads() + + # Algorithms for optimization: + # Note again that our losses implement 1/2 * deviance. + if self.solver == "lbfgs": + func = linear_loss.loss_gradient + + opt_res = scipy.optimize.minimize( + func, + coef, + method="L-BFGS-B", + jac=True, + options={ + "maxiter": self.max_iter, + "maxls": 50, # default is 20 + "gtol": self.tol, + # The constant 64 was found empirically to pass the test suite. + # The point is that ftol is very small, but a bit larger than + # machine precision for float64, which is the dtype used by lbfgs. + "ftol": 64 * np.finfo(float).eps, + **_get_additional_lbfgs_options_dict("iprint", self.verbose - 1), + }, + args=(X, y, sample_weight, l2_reg_strength, n_threads), + ) + self.n_iter_ = _check_optimize_result( + "lbfgs", opt_res, max_iter=self.max_iter + ) + coef = opt_res.x + elif self.solver == "newton-cholesky": + sol = NewtonCholeskySolver( + coef=coef, + linear_loss=linear_loss, + l2_reg_strength=l2_reg_strength, + tol=self.tol, + max_iter=self.max_iter, + n_threads=n_threads, + verbose=self.verbose, + ) + coef = sol.solve(X, y, sample_weight) + self.n_iter_ = sol.iteration + elif issubclass(self.solver, NewtonSolver): + sol = self.solver( + coef=coef, + linear_loss=linear_loss, + l2_reg_strength=l2_reg_strength, + tol=self.tol, + max_iter=self.max_iter, + n_threads=n_threads, + ) + coef = sol.solve(X, y, sample_weight) + self.n_iter_ = sol.iteration + else: + raise ValueError(f"Invalid solver={self.solver}.") + + if self.fit_intercept: + self.intercept_ = coef[-1] + self.coef_ = coef[:-1] + else: + # set intercept to zero as the other linear models do + self.intercept_ = 0.0 + self.coef_ = coef + + return self + + def _linear_predictor(self, X): + """Compute the linear_predictor = `X @ coef_ + intercept_`. + + Note that we often use the term raw_prediction instead of linear predictor. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Samples. + + Returns + ------- + y_pred : array of shape (n_samples,) + Returns predicted values of linear predictor. + """ + check_is_fitted(self) + X = validate_data( + self, + X, + accept_sparse=["csr", "csc", "coo"], + dtype=[np.float64, np.float32], + ensure_2d=True, + allow_nd=False, + reset=False, + ) + return X @ self.coef_ + self.intercept_ + + def predict(self, X): + """Predict using GLM with feature matrix X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Samples. + + Returns + ------- + y_pred : array of shape (n_samples,) + Returns predicted values. + """ + # check_array is done in _linear_predictor + raw_prediction = self._linear_predictor(X) + y_pred = self._base_loss.link.inverse(raw_prediction) + return y_pred + + def score(self, X, y, sample_weight=None): + """Compute D^2, the percentage of deviance explained. + + D^2 is a generalization of the coefficient of determination R^2. + R^2 uses squared error and D^2 uses the deviance of this GLM, see the + :ref:`User Guide `. + + D^2 is defined as + :math:`D^2 = 1-\\frac{D(y_{true},y_{pred})}{D_{null}}`, + :math:`D_{null}` is the null deviance, i.e. 
the deviance of a model + with intercept alone, which corresponds to :math:`y_{pred} = \\bar{y}`. + The mean :math:`\\bar{y}` is averaged by sample_weight. + Best possible score is 1.0 and it can be negative (because the model + can be arbitrarily worse). + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Test samples. + + y : array-like of shape (n_samples,) + True values of target. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + score : float + D^2 of self.predict(X) w.r.t. y. + """ + # TODO: Adapt link to User Guide in the docstring, once + # https://github.com/scikit-learn/scikit-learn/pull/22118 is merged. + # + # Note, default score defined in RegressorMixin is R^2 score. + # TODO: make D^2 a score function in module metrics (and thereby get + # input validation and so on) + raw_prediction = self._linear_predictor(X) # validates X + # required by losses + y = check_array(y, dtype=raw_prediction.dtype, order="C", ensure_2d=False) + + if sample_weight is not None: + # Note that _check_sample_weight calls check_array(order="C") required by + # losses. + sample_weight = _check_sample_weight(sample_weight, X, dtype=y.dtype) + + base_loss = self._base_loss + + if not base_loss.in_y_true_range(y): + raise ValueError( + "Some value(s) of y are out of the valid range of the loss" + f" {base_loss.__name__}." + ) + + constant = np.average( + base_loss.constant_to_optimal_zero(y_true=y, sample_weight=None), + weights=sample_weight, + ) + + # Missing factor of 2 in deviance cancels out. + deviance = base_loss( + y_true=y, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + n_threads=1, + ) + y_mean = base_loss.link.link(np.average(y, weights=sample_weight)) + deviance_null = base_loss( + y_true=y, + raw_prediction=np.tile(y_mean, y.shape[0]), + sample_weight=sample_weight, + n_threads=1, + ) + return 1 - (deviance + constant) / (deviance_null + constant) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + try: + # Create instance of BaseLoss if fit wasn't called yet. This is necessary as + # TweedieRegressor might set the used loss during fit different from + # self._base_loss. + base_loss = self._get_loss() + tags.target_tags.positive_only = not base_loss.in_y_true_range(-1.0) + except (ValueError, AttributeError, TypeError): + # This happens when the link or power parameter of TweedieRegressor is + # invalid. We fallback on the default tags in that case. + pass # pragma: no cover + return tags + + def _get_loss(self): + """This is only necessary because of the link and power arguments of the + TweedieRegressor. + + Note that we do not need to pass sample_weight to the loss class as this is + only needed to set loss.constant_hessian on which GLMs do not rely. + """ + return HalfSquaredError() + + +class PoissonRegressor(_GeneralizedLinearRegressor): + """Generalized Linear Model with a Poisson distribution. + + This regressor uses the 'log' link function. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.23 + + Parameters + ---------- + alpha : float, default=1 + Constant that multiplies the L2 penalty term and determines the + regularization strength. ``alpha = 0`` is equivalent to unpenalized + GLMs. In this case, the design matrix `X` must have full column rank + (no collinearities). + Values of `alpha` must be in the range `[0.0, inf)`. + + fit_intercept : bool, default=True + Specifies if a constant (a.k.a. 
bias or intercept) should be + added to the linear predictor (`X @ coef + intercept`). + + solver : {'lbfgs', 'newton-cholesky'}, default='lbfgs' + Algorithm to use in the optimization problem: + + 'lbfgs' + Calls scipy's L-BFGS-B optimizer. + + 'newton-cholesky' + Uses Newton-Raphson steps (in arbitrary precision arithmetic equivalent to + iterated reweighted least squares) with an inner Cholesky based solver. + This solver is a good choice for `n_samples` >> `n_features`, especially + with one-hot encoded categorical features with rare categories. Be aware + that the memory usage of this solver has a quadratic dependency on + `n_features` because it explicitly computes the Hessian matrix. + + .. versionadded:: 1.2 + + max_iter : int, default=100 + The maximal number of iterations for the solver. + Values must be in the range `[1, inf)`. + + tol : float, default=1e-4 + Stopping criterion. For the lbfgs solver, + the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol`` + where ``g_j`` is the j-th component of the gradient (derivative) of + the objective function. + Values must be in the range `(0.0, inf)`. + + warm_start : bool, default=False + If set to ``True``, reuse the solution of the previous call to ``fit`` + as initialization for ``coef_`` and ``intercept_`` . + + verbose : int, default=0 + For the lbfgs solver set verbose to any positive number for verbosity. + Values must be in the range `[0, inf)`. + + Attributes + ---------- + coef_ : array of shape (n_features,) + Estimated coefficients for the linear predictor (`X @ coef_ + + intercept_`) in the GLM. + + intercept_ : float + Intercept (a.k.a. bias) added to linear predictor. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + Actual number of iterations used in the solver. + + See Also + -------- + TweedieRegressor : Generalized Linear Model with a Tweedie distribution. + + Examples + -------- + >>> from sklearn import linear_model + >>> clf = linear_model.PoissonRegressor() + >>> X = [[1, 2], [2, 3], [3, 4], [4, 3]] + >>> y = [12, 17, 22, 21] + >>> clf.fit(X, y) + PoissonRegressor() + >>> clf.score(X, y) + np.float64(0.990) + >>> clf.coef_ + array([0.121, 0.158]) + >>> clf.intercept_ + np.float64(2.088) + >>> clf.predict([[1, 1], [3, 4]]) + array([10.676, 21.875]) + """ + + _parameter_constraints: dict = { + **_GeneralizedLinearRegressor._parameter_constraints + } + + def __init__( + self, + *, + alpha=1.0, + fit_intercept=True, + solver="lbfgs", + max_iter=100, + tol=1e-4, + warm_start=False, + verbose=0, + ): + super().__init__( + alpha=alpha, + fit_intercept=fit_intercept, + solver=solver, + max_iter=max_iter, + tol=tol, + warm_start=warm_start, + verbose=verbose, + ) + + def _get_loss(self): + return HalfPoissonLoss() + + +class GammaRegressor(_GeneralizedLinearRegressor): + """Generalized Linear Model with a Gamma distribution. + + This regressor uses the 'log' link function. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.23 + + Parameters + ---------- + alpha : float, default=1 + Constant that multiplies the L2 penalty term and determines the + regularization strength. ``alpha = 0`` is equivalent to unpenalized + GLMs. In this case, the design matrix `X` must have full column rank + (no collinearities). 
+ Values of `alpha` must be in the range `[0.0, inf)`. + + fit_intercept : bool, default=True + Specifies if a constant (a.k.a. bias or intercept) should be + added to the linear predictor `X @ coef_ + intercept_`. + + solver : {'lbfgs', 'newton-cholesky'}, default='lbfgs' + Algorithm to use in the optimization problem: + + 'lbfgs' + Calls scipy's L-BFGS-B optimizer. + + 'newton-cholesky' + Uses Newton-Raphson steps (in arbitrary precision arithmetic equivalent to + iterated reweighted least squares) with an inner Cholesky based solver. + This solver is a good choice for `n_samples` >> `n_features`, especially + with one-hot encoded categorical features with rare categories. Be aware + that the memory usage of this solver has a quadratic dependency on + `n_features` because it explicitly computes the Hessian matrix. + + .. versionadded:: 1.2 + + max_iter : int, default=100 + The maximal number of iterations for the solver. + Values must be in the range `[1, inf)`. + + tol : float, default=1e-4 + Stopping criterion. For the lbfgs solver, + the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol`` + where ``g_j`` is the j-th component of the gradient (derivative) of + the objective function. + Values must be in the range `(0.0, inf)`. + + warm_start : bool, default=False + If set to ``True``, reuse the solution of the previous call to ``fit`` + as initialization for `coef_` and `intercept_`. + + verbose : int, default=0 + For the lbfgs solver set verbose to any positive number for verbosity. + Values must be in the range `[0, inf)`. + + Attributes + ---------- + coef_ : array of shape (n_features,) + Estimated coefficients for the linear predictor (`X @ coef_ + + intercept_`) in the GLM. + + intercept_ : float + Intercept (a.k.a. bias) added to linear predictor. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + n_iter_ : int + Actual number of iterations used in the solver. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + PoissonRegressor : Generalized Linear Model with a Poisson distribution. + TweedieRegressor : Generalized Linear Model with a Tweedie distribution. + + Examples + -------- + >>> from sklearn import linear_model + >>> clf = linear_model.GammaRegressor() + >>> X = [[1, 2], [2, 3], [3, 4], [4, 3]] + >>> y = [19, 26, 33, 30] + >>> clf.fit(X, y) + GammaRegressor() + >>> clf.score(X, y) + np.float64(0.773) + >>> clf.coef_ + array([0.073, 0.067]) + >>> clf.intercept_ + np.float64(2.896) + >>> clf.predict([[1, 0], [2, 8]]) + array([19.483, 35.795]) + """ + + _parameter_constraints: dict = { + **_GeneralizedLinearRegressor._parameter_constraints + } + + def __init__( + self, + *, + alpha=1.0, + fit_intercept=True, + solver="lbfgs", + max_iter=100, + tol=1e-4, + warm_start=False, + verbose=0, + ): + super().__init__( + alpha=alpha, + fit_intercept=fit_intercept, + solver=solver, + max_iter=max_iter, + tol=tol, + warm_start=warm_start, + verbose=verbose, + ) + + def _get_loss(self): + return HalfGammaLoss() + + +class TweedieRegressor(_GeneralizedLinearRegressor): + """Generalized Linear Model with a Tweedie distribution. + + This estimator can be used to model different GLMs depending on the + ``power`` parameter, which determines the underlying distribution. + + Read more in the :ref:`User Guide `. + + .. 
versionadded:: 0.23 + + Parameters + ---------- + power : float, default=0 + The power determines the underlying target distribution according + to the following table: + + +-------+------------------------+ + | Power | Distribution | + +=======+========================+ + | 0 | Normal | + +-------+------------------------+ + | 1 | Poisson | + +-------+------------------------+ + | (1,2) | Compound Poisson Gamma | + +-------+------------------------+ + | 2 | Gamma | + +-------+------------------------+ + | 3 | Inverse Gaussian | + +-------+------------------------+ + + For ``0 < power < 1``, no distribution exists. + + alpha : float, default=1 + Constant that multiplies the L2 penalty term and determines the + regularization strength. ``alpha = 0`` is equivalent to unpenalized + GLMs. In this case, the design matrix `X` must have full column rank + (no collinearities). + Values of `alpha` must be in the range `[0.0, inf)`. + + fit_intercept : bool, default=True + Specifies if a constant (a.k.a. bias or intercept) should be + added to the linear predictor (`X @ coef + intercept`). + + link : {'auto', 'identity', 'log'}, default='auto' + The link function of the GLM, i.e. mapping from linear predictor + `X @ coeff + intercept` to prediction `y_pred`. Option 'auto' sets + the link depending on the chosen `power` parameter as follows: + + - 'identity' for ``power <= 0``, e.g. for the Normal distribution + - 'log' for ``power > 0``, e.g. for Poisson, Gamma and Inverse Gaussian + distributions + + solver : {'lbfgs', 'newton-cholesky'}, default='lbfgs' + Algorithm to use in the optimization problem: + + 'lbfgs' + Calls scipy's L-BFGS-B optimizer. + + 'newton-cholesky' + Uses Newton-Raphson steps (in arbitrary precision arithmetic equivalent to + iterated reweighted least squares) with an inner Cholesky based solver. + This solver is a good choice for `n_samples` >> `n_features`, especially + with one-hot encoded categorical features with rare categories. Be aware + that the memory usage of this solver has a quadratic dependency on + `n_features` because it explicitly computes the Hessian matrix. + + .. versionadded:: 1.2 + + max_iter : int, default=100 + The maximal number of iterations for the solver. + Values must be in the range `[1, inf)`. + + tol : float, default=1e-4 + Stopping criterion. For the lbfgs solver, + the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol`` + where ``g_j`` is the j-th component of the gradient (derivative) of + the objective function. + Values must be in the range `(0.0, inf)`. + + warm_start : bool, default=False + If set to ``True``, reuse the solution of the previous call to ``fit`` + as initialization for ``coef_`` and ``intercept_`` . + + verbose : int, default=0 + For the lbfgs solver set verbose to any positive number for verbosity. + Values must be in the range `[0, inf)`. + + Attributes + ---------- + coef_ : array of shape (n_features,) + Estimated coefficients for the linear predictor (`X @ coef_ + + intercept_`) in the GLM. + + intercept_ : float + Intercept (a.k.a. bias) added to linear predictor. + + n_iter_ : int + Actual number of iterations used in the solver. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. 
versionadded:: 1.0 + + See Also + -------- + PoissonRegressor : Generalized Linear Model with a Poisson distribution. + GammaRegressor : Generalized Linear Model with a Gamma distribution. + + Examples + -------- + >>> from sklearn import linear_model + >>> clf = linear_model.TweedieRegressor() + >>> X = [[1, 2], [2, 3], [3, 4], [4, 3]] + >>> y = [2, 3.5, 5, 5.5] + >>> clf.fit(X, y) + TweedieRegressor() + >>> clf.score(X, y) + np.float64(0.839) + >>> clf.coef_ + array([0.599, 0.299]) + >>> clf.intercept_ + np.float64(1.600) + >>> clf.predict([[1, 1], [3, 4]]) + array([2.500, 4.599]) + """ + + _parameter_constraints: dict = { + **_GeneralizedLinearRegressor._parameter_constraints, + "power": [Interval(Real, None, None, closed="neither")], + "link": [StrOptions({"auto", "identity", "log"})], + } + + def __init__( + self, + *, + power=0.0, + alpha=1.0, + fit_intercept=True, + link="auto", + solver="lbfgs", + max_iter=100, + tol=1e-4, + warm_start=False, + verbose=0, + ): + super().__init__( + alpha=alpha, + fit_intercept=fit_intercept, + solver=solver, + max_iter=max_iter, + tol=tol, + warm_start=warm_start, + verbose=verbose, + ) + self.link = link + self.power = power + + def _get_loss(self): + if self.link == "auto": + if self.power <= 0: + # identity link + return HalfTweedieLossIdentity(power=self.power) + else: + # log link + return HalfTweedieLoss(power=self.power) + + if self.link == "log": + return HalfTweedieLoss(power=self.power) + + if self.link == "identity": + return HalfTweedieLossIdentity(power=self.power) diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_huber.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_huber.py new file mode 100644 index 0000000000000000000000000000000000000000..87e735ec998db226235f22f33477f50dc9e4152e --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_huber.py @@ -0,0 +1,363 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from numbers import Integral, Real + +import numpy as np +from scipy import optimize + +from ..base import BaseEstimator, RegressorMixin, _fit_context +from ..utils._mask import axis0_safe_slice +from ..utils._param_validation import Interval +from ..utils.extmath import safe_sparse_dot +from ..utils.fixes import _get_additional_lbfgs_options_dict +from ..utils.optimize import _check_optimize_result +from ..utils.validation import _check_sample_weight, validate_data +from ._base import LinearModel + + +def _huber_loss_and_gradient(w, X, y, epsilon, alpha, sample_weight=None): + """Returns the Huber loss and the gradient. + + Parameters + ---------- + w : ndarray, shape (n_features + 1,) or (n_features + 2,) + Feature vector. + w[:n_features] gives the coefficients + w[-1] gives the scale factor and if the intercept is fit w[-2] + gives the intercept factor. + + X : ndarray of shape (n_samples, n_features) + Input data. + + y : ndarray of shape (n_samples,) + Target vector. + + epsilon : float + Robustness of the Huber estimator. + + alpha : float + Regularization parameter. + + sample_weight : ndarray of shape (n_samples,), default=None + Weight assigned to each sample. + + Returns + ------- + loss : float + Huber loss. + + gradient : ndarray, shape (len(w)) + Returns the derivative of the Huber loss with respect to each + coefficient, intercept and the scale as a vector. 
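+
+    Notes
+    -----
+    A minimal calling sketch (all names below are illustrative, assuming a
+    dense ``X``, explicit unit sample weights and a fitted intercept)::
+
+        w = np.zeros(X.shape[1] + 2)
+        w[-1] = 1.0  # the scale sigma must start strictly positive
+        loss, grad = _huber_loss_and_gradient(
+            w, X, y, epsilon=1.35, alpha=1e-4, sample_weight=np.ones(len(y))
+        )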
+ """ + _, n_features = X.shape + fit_intercept = n_features + 2 == w.shape[0] + if fit_intercept: + intercept = w[-2] + sigma = w[-1] + w = w[:n_features] + n_samples = np.sum(sample_weight) + + # Calculate the values where |y - X'w -c / sigma| > epsilon + # The values above this threshold are outliers. + linear_loss = y - safe_sparse_dot(X, w) + if fit_intercept: + linear_loss -= intercept + abs_linear_loss = np.abs(linear_loss) + outliers_mask = abs_linear_loss > epsilon * sigma + + # Calculate the linear loss due to the outliers. + # This is equal to (2 * M * |y - X'w -c / sigma| - M**2) * sigma + outliers = abs_linear_loss[outliers_mask] + num_outliers = np.count_nonzero(outliers_mask) + n_non_outliers = X.shape[0] - num_outliers + + # n_sq_outliers includes the weight give to the outliers while + # num_outliers is just the number of outliers. + outliers_sw = sample_weight[outliers_mask] + n_sw_outliers = np.sum(outliers_sw) + outlier_loss = ( + 2.0 * epsilon * np.sum(outliers_sw * outliers) + - sigma * n_sw_outliers * epsilon**2 + ) + + # Calculate the quadratic loss due to the non-outliers.- + # This is equal to |(y - X'w - c)**2 / sigma**2| * sigma + non_outliers = linear_loss[~outliers_mask] + weighted_non_outliers = sample_weight[~outliers_mask] * non_outliers + weighted_loss = np.dot(weighted_non_outliers.T, non_outliers) + squared_loss = weighted_loss / sigma + + if fit_intercept: + grad = np.zeros(n_features + 2) + else: + grad = np.zeros(n_features + 1) + + # Gradient due to the squared loss. + X_non_outliers = -axis0_safe_slice(X, ~outliers_mask, n_non_outliers) + grad[:n_features] = ( + 2.0 / sigma * safe_sparse_dot(weighted_non_outliers, X_non_outliers) + ) + + # Gradient due to the linear loss. + signed_outliers = np.ones_like(outliers) + signed_outliers_mask = linear_loss[outliers_mask] < 0 + signed_outliers[signed_outliers_mask] = -1.0 + X_outliers = axis0_safe_slice(X, outliers_mask, num_outliers) + sw_outliers = sample_weight[outliers_mask] * signed_outliers + grad[:n_features] -= 2.0 * epsilon * (safe_sparse_dot(sw_outliers, X_outliers)) + + # Gradient due to the penalty. + grad[:n_features] += alpha * 2.0 * w + + # Gradient due to sigma. + grad[-1] = n_samples + grad[-1] -= n_sw_outliers * epsilon**2 + grad[-1] -= squared_loss / sigma + + # Gradient due to the intercept. + if fit_intercept: + grad[-2] = -2.0 * np.sum(weighted_non_outliers) / sigma + grad[-2] -= 2.0 * epsilon * np.sum(sw_outliers) + + loss = n_samples * sigma + squared_loss + outlier_loss + loss += alpha * np.dot(w, w) + return loss, grad + + +class HuberRegressor(LinearModel, RegressorMixin, BaseEstimator): + """L2-regularized linear regression model that is robust to outliers. + + The Huber Regressor optimizes the squared loss for the samples where + ``|(y - Xw - c) / sigma| < epsilon`` and the absolute loss for the samples + where ``|(y - Xw - c) / sigma| > epsilon``, where the model coefficients + ``w``, the intercept ``c`` and the scale ``sigma`` are parameters + to be optimized. The parameter `sigma` makes sure that if `y` is scaled up + or down by a certain factor, one does not need to rescale `epsilon` to + achieve the same robustness. Note that this does not take into account + the fact that the different features of `X` may be of different scales. + + The Huber loss function has the advantage of not being heavily influenced + by the outliers while not completely ignoring their effect. + + Read more in the :ref:`User Guide ` + + .. 
versionadded:: 0.18 + + Parameters + ---------- + epsilon : float, default=1.35 + The parameter epsilon controls the number of samples that should be + classified as outliers. The smaller the epsilon, the more robust it is + to outliers. Epsilon must be in the range `[1, inf)`. + + max_iter : int, default=100 + Maximum number of iterations that + ``scipy.optimize.minimize(method="L-BFGS-B")`` should run for. + + alpha : float, default=0.0001 + Strength of the squared L2 regularization. Note that the penalty is + equal to ``alpha * ||w||^2``. + Must be in the range `[0, inf)`. + + warm_start : bool, default=False + This is useful if the stored attributes of a previously used model + has to be reused. If set to False, then the coefficients will + be rewritten for every call to fit. + See :term:`the Glossary `. + + fit_intercept : bool, default=True + Whether or not to fit the intercept. This can be set to False + if the data is already centered around the origin. + + tol : float, default=1e-05 + The iteration will stop when + ``max{|proj g_i | i = 1, ..., n}`` <= ``tol`` + where pg_i is the i-th component of the projected gradient. + + Attributes + ---------- + coef_ : array, shape (n_features,) + Features got by optimizing the L2-regularized Huber loss. + + intercept_ : float + Bias. + + scale_ : float + The value by which ``|y - Xw - c|`` is scaled down. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + Number of iterations that + ``scipy.optimize.minimize(method="L-BFGS-B")`` has run for. + + .. versionchanged:: 0.20 + + In SciPy <= 1.0.0 the number of lbfgs iterations may exceed + ``max_iter``. ``n_iter_`` will now report at most ``max_iter``. + + outliers_ : array, shape (n_samples,) + A boolean mask which is set to True where the samples are identified + as outliers. + + See Also + -------- + RANSACRegressor : RANSAC (RANdom SAmple Consensus) algorithm. + TheilSenRegressor : Theil-Sen Estimator robust multivariate regression model. + SGDRegressor : Fitted by minimizing a regularized empirical loss with SGD. + + References + ---------- + .. [1] Peter J. Huber, Elvezio M. Ronchetti, Robust Statistics + Concomitant scale estimates, p. 172 + .. [2] Art B. Owen (2006), `A robust hybrid of lasso and ridge regression. + `_ + + Examples + -------- + >>> import numpy as np + >>> from sklearn.linear_model import HuberRegressor, LinearRegression + >>> from sklearn.datasets import make_regression + >>> rng = np.random.RandomState(0) + >>> X, y, coef = make_regression( + ... n_samples=200, n_features=2, noise=4.0, coef=True, random_state=0) + >>> X[:4] = rng.uniform(10, 20, (4, 2)) + >>> y[:4] = rng.uniform(10, 20, 4) + >>> huber = HuberRegressor().fit(X, y) + >>> huber.score(X, y) + -7.284 + >>> huber.predict(X[:1,]) + array([806.7200]) + >>> linear = LinearRegression().fit(X, y) + >>> print("True coefficients:", coef) + True coefficients: [20.4923... 34.1698...] + >>> print("Huber coefficients:", huber.coef_) + Huber coefficients: [17.7906... 31.0106...] + >>> print("Linear Regression coefficients:", linear.coef_) + Linear Regression coefficients: [-1.9221... 7.0226...] 
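+
+    The robust scale ``sigma`` only rescales the residuals, not the features,
+    so a rough sketch of a common pairing with feature scaling (using the
+    standard scikit-learn pipeline utilities) is::
+
+        from sklearn.pipeline import make_pipeline
+        from sklearn.preprocessing import StandardScaler
+
+        model = make_pipeline(StandardScaler(), HuberRegressor(epsilon=1.35))
+        model.fit(X, y)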
+ """ + + _parameter_constraints: dict = { + "epsilon": [Interval(Real, 1.0, None, closed="left")], + "max_iter": [Interval(Integral, 0, None, closed="left")], + "alpha": [Interval(Real, 0, None, closed="left")], + "warm_start": ["boolean"], + "fit_intercept": ["boolean"], + "tol": [Interval(Real, 0.0, None, closed="left")], + } + + def __init__( + self, + *, + epsilon=1.35, + max_iter=100, + alpha=0.0001, + warm_start=False, + fit_intercept=True, + tol=1e-05, + ): + self.epsilon = epsilon + self.max_iter = max_iter + self.alpha = alpha + self.warm_start = warm_start + self.fit_intercept = fit_intercept + self.tol = tol + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, sample_weight=None): + """Fit the model according to the given training data. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like, shape (n_samples,) + Target vector relative to X. + + sample_weight : array-like, shape (n_samples,) + Weight given to each sample. + + Returns + ------- + self : object + Fitted `HuberRegressor` estimator. + """ + X, y = validate_data( + self, + X, + y, + copy=False, + accept_sparse=["csr"], + y_numeric=True, + dtype=[np.float64, np.float32], + ) + + sample_weight = _check_sample_weight(sample_weight, X) + + if self.warm_start and hasattr(self, "coef_"): + parameters = np.concatenate((self.coef_, [self.intercept_, self.scale_])) + else: + if self.fit_intercept: + parameters = np.zeros(X.shape[1] + 2) + else: + parameters = np.zeros(X.shape[1] + 1) + # Make sure to initialize the scale parameter to a strictly + # positive value: + parameters[-1] = 1 + + # Sigma or the scale factor should be non-negative. + # Setting it to be zero might cause undefined bounds hence we set it + # to a value close to zero. + bounds = np.tile([-np.inf, np.inf], (parameters.shape[0], 1)) + bounds[-1][0] = np.finfo(np.float64).eps * 10 + + opt_res = optimize.minimize( + _huber_loss_and_gradient, + parameters, + method="L-BFGS-B", + jac=True, + args=(X, y, self.epsilon, self.alpha, sample_weight), + options={ + "maxiter": self.max_iter, + "gtol": self.tol, + **_get_additional_lbfgs_options_dict("iprint", -1), + }, + bounds=bounds, + ) + + parameters = opt_res.x + + if opt_res.status == 2: + raise ValueError( + "HuberRegressor convergence failed: l-BFGS-b solver terminated with %s" + % opt_res.message + ) + self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter) + self.scale_ = parameters[-1] + if self.fit_intercept: + self.intercept_ = parameters[-2] + else: + self.intercept_ = 0.0 + self.coef_ = parameters[: X.shape[1]] + + residual = np.abs(y - safe_sparse_dot(X, self.coef_) - self.intercept_) + self.outliers_ = residual > self.scale_ * self.epsilon + return self + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_least_angle.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_least_angle.py new file mode 100644 index 0000000000000000000000000000000000000000..4bffe5f6e8c0d2d6fbc05821d6553e436758c86b --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_least_angle.py @@ -0,0 +1,2346 @@ +""" +Least Angle Regression algorithm. See the documentation on the +Generalized Linear Model for a complete discussion. 
+""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import sys +import warnings +from math import log +from numbers import Integral, Real + +import numpy as np +from scipy import interpolate, linalg +from scipy.linalg.lapack import get_lapack_funcs + +from ..base import MultiOutputMixin, RegressorMixin, _fit_context +from ..exceptions import ConvergenceWarning +from ..model_selection import check_cv + +# mypy error: Module 'sklearn.utils' has no attribute 'arrayfuncs' +from ..utils import ( + Bunch, + arrayfuncs, + as_float_array, + check_random_state, +) +from ..utils._metadata_requests import ( + MetadataRouter, + MethodMapping, + _raise_for_params, + _routing_enabled, + process_routing, +) +from ..utils._param_validation import Hidden, Interval, StrOptions, validate_params +from ..utils.parallel import Parallel, delayed +from ..utils.validation import validate_data +from ._base import LinearModel, LinearRegression, _preprocess_data + +SOLVE_TRIANGULAR_ARGS = {"check_finite": False} + + +@validate_params( + { + "X": [np.ndarray, None], + "y": [np.ndarray, None], + "Xy": [np.ndarray, None], + "Gram": [StrOptions({"auto"}), "boolean", np.ndarray, None], + "max_iter": [Interval(Integral, 0, None, closed="left")], + "alpha_min": [Interval(Real, 0, None, closed="left")], + "method": [StrOptions({"lar", "lasso"})], + "copy_X": ["boolean"], + "eps": [Interval(Real, 0, None, closed="neither"), None], + "copy_Gram": ["boolean"], + "verbose": ["verbose"], + "return_path": ["boolean"], + "return_n_iter": ["boolean"], + "positive": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def lars_path( + X, + y, + Xy=None, + *, + Gram=None, + max_iter=500, + alpha_min=0, + method="lar", + copy_X=True, + eps=np.finfo(float).eps, + copy_Gram=True, + verbose=0, + return_path=True, + return_n_iter=False, + positive=False, +): + """Compute Least Angle Regression or Lasso path using the LARS algorithm. + + The optimization objective for the case method='lasso' is:: + + (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1 + + in the case of method='lar', the objective function is only known in + the form of an implicit equation (see discussion in [1]_). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : None or ndarray of shape (n_samples, n_features) + Input data. If X is `None`, Gram must also be `None`. + If only the Gram matrix is available, use `lars_path_gram` instead. + + y : None or ndarray of shape (n_samples,) + Input targets. + + Xy : array-like of shape (n_features,), default=None + `Xy = X.T @ y` that can be precomputed. It is useful + only when the Gram matrix is precomputed. + + Gram : None, 'auto', bool, ndarray of shape (n_features, n_features), \ + default=None + Precomputed Gram matrix `X.T @ X`, if `'auto'`, the Gram + matrix is precomputed from the given X, if there are more samples + than features. + + max_iter : int, default=500 + Maximum number of iterations to perform, set to infinity for no limit. + + alpha_min : float, default=0 + Minimum correlation along the path. It corresponds to the + regularization parameter `alpha` in the Lasso. + + method : {'lar', 'lasso'}, default='lar' + Specifies the returned model. Select `'lar'` for Least Angle + Regression, `'lasso'` for the Lasso. + + copy_X : bool, default=True + If `False`, `X` is overwritten. + + eps : float, default=np.finfo(float).eps + The machine-precision regularization in the computation of the + Cholesky diagonal factors. 
Increase this for very ill-conditioned + systems. Unlike the `tol` parameter in some iterative + optimization-based algorithms, this parameter does not control + the tolerance of the optimization. + + copy_Gram : bool, default=True + If `False`, `Gram` is overwritten. + + verbose : int, default=0 + Controls output verbosity. + + return_path : bool, default=True + If `True`, returns the entire path, else returns only the + last point of the path. + + return_n_iter : bool, default=False + Whether to return the number of iterations. + + positive : bool, default=False + Restrict coefficients to be >= 0. + This option is only allowed with method 'lasso'. Note that the model + coefficients will not converge to the ordinary-least-squares solution + for small values of alpha. Only coefficients up to the smallest alpha + value (`alphas_[alphas_ > 0.].min()` when fit_path=True) reached by + the stepwise Lars-Lasso algorithm are typically in congruence with the + solution of the coordinate descent `lasso_path` function. + + Returns + ------- + alphas : ndarray of shape (n_alphas + 1,) + Maximum of covariances (in absolute value) at each iteration. + `n_alphas` is either `max_iter`, `n_features`, or the + number of nodes in the path with `alpha >= alpha_min`, whichever + is smaller. + + active : ndarray of shape (n_alphas,) + Indices of active variables at the end of the path. + + coefs : ndarray of shape (n_features, n_alphas + 1) + Coefficients along the path. + + n_iter : int + Number of iterations run. Returned only if `return_n_iter` is set + to True. + + See Also + -------- + lars_path_gram : Compute LARS path in the sufficient stats mode. + lasso_path : Compute Lasso path with coordinate descent. + LassoLars : Lasso model fit with Least Angle Regression a.k.a. Lars. + Lars : Least Angle Regression model a.k.a. LAR. + LassoLarsCV : Cross-validated Lasso, using the LARS algorithm. + LarsCV : Cross-validated Least Angle Regression model. + sklearn.decomposition.sparse_encode : Sparse coding. + + References + ---------- + .. [1] "Least Angle Regression", Efron et al. + http://statweb.stanford.edu/~tibs/ftp/lars.pdf + + .. [2] `Wikipedia entry on the Least-angle regression + `_ + + .. [3] `Wikipedia entry on the Lasso + `_ + + Examples + -------- + >>> from sklearn.linear_model import lars_path + >>> from sklearn.datasets import make_regression + >>> X, y, true_coef = make_regression( + ... n_samples=100, n_features=5, n_informative=2, coef=True, random_state=0 + ... ) + >>> true_coef + array([ 0. , 0. , 0. , 97.9, 45.7]) + >>> alphas, _, estimated_coef = lars_path(X, y) + >>> alphas.shape + (3,) + >>> estimated_coef + array([[ 0. , 0. , 0. ], + [ 0. , 0. , 0. ], + [ 0. , 0. , 0. ], + [ 0. , 46.96, 97.99], + [ 0. , 0. , 45.70]]) + """ + if X is None and Gram is not None: + raise ValueError( + "X cannot be None if Gram is not None" + "Use lars_path_gram to avoid passing X and y." 
+ ) + return _lars_path_solver( + X=X, + y=y, + Xy=Xy, + Gram=Gram, + n_samples=None, + max_iter=max_iter, + alpha_min=alpha_min, + method=method, + copy_X=copy_X, + eps=eps, + copy_Gram=copy_Gram, + verbose=verbose, + return_path=return_path, + return_n_iter=return_n_iter, + positive=positive, + ) + + +@validate_params( + { + "Xy": [np.ndarray], + "Gram": [np.ndarray], + "n_samples": [Interval(Integral, 0, None, closed="left")], + "max_iter": [Interval(Integral, 0, None, closed="left")], + "alpha_min": [Interval(Real, 0, None, closed="left")], + "method": [StrOptions({"lar", "lasso"})], + "copy_X": ["boolean"], + "eps": [Interval(Real, 0, None, closed="neither"), None], + "copy_Gram": ["boolean"], + "verbose": ["verbose"], + "return_path": ["boolean"], + "return_n_iter": ["boolean"], + "positive": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def lars_path_gram( + Xy, + Gram, + *, + n_samples, + max_iter=500, + alpha_min=0, + method="lar", + copy_X=True, + eps=np.finfo(float).eps, + copy_Gram=True, + verbose=0, + return_path=True, + return_n_iter=False, + positive=False, +): + """The lars_path in the sufficient stats mode. + + The optimization objective for the case method='lasso' is:: + + (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1 + + in the case of method='lar', the objective function is only known in + the form of an implicit equation (see discussion in [1]_). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + Xy : ndarray of shape (n_features,) + `Xy = X.T @ y`. + + Gram : ndarray of shape (n_features, n_features) + `Gram = X.T @ X`. + + n_samples : int + Equivalent size of sample. + + max_iter : int, default=500 + Maximum number of iterations to perform, set to infinity for no limit. + + alpha_min : float, default=0 + Minimum correlation along the path. It corresponds to the + regularization parameter alpha parameter in the Lasso. + + method : {'lar', 'lasso'}, default='lar' + Specifies the returned model. Select `'lar'` for Least Angle + Regression, ``'lasso'`` for the Lasso. + + copy_X : bool, default=True + If `False`, `X` is overwritten. + + eps : float, default=np.finfo(float).eps + The machine-precision regularization in the computation of the + Cholesky diagonal factors. Increase this for very ill-conditioned + systems. Unlike the `tol` parameter in some iterative + optimization-based algorithms, this parameter does not control + the tolerance of the optimization. + + copy_Gram : bool, default=True + If `False`, `Gram` is overwritten. + + verbose : int, default=0 + Controls output verbosity. + + return_path : bool, default=True + If `return_path==True` returns the entire path, else returns only the + last point of the path. + + return_n_iter : bool, default=False + Whether to return the number of iterations. + + positive : bool, default=False + Restrict coefficients to be >= 0. + This option is only allowed with method 'lasso'. Note that the model + coefficients will not converge to the ordinary-least-squares solution + for small values of alpha. Only coefficients up to the smallest alpha + value (`alphas_[alphas_ > 0.].min()` when `fit_path=True`) reached by + the stepwise Lars-Lasso algorithm are typically in congruence with the + solution of the coordinate descent lasso_path function. + + Returns + ------- + alphas : ndarray of shape (n_alphas + 1,) + Maximum of covariances (in absolute value) at each iteration. 
+ `n_alphas` is either `max_iter`, `n_features` or the + number of nodes in the path with `alpha >= alpha_min`, whichever + is smaller. + + active : ndarray of shape (n_alphas,) + Indices of active variables at the end of the path. + + coefs : ndarray of shape (n_features, n_alphas + 1) + Coefficients along the path. + + n_iter : int + Number of iterations run. Returned only if `return_n_iter` is set + to True. + + See Also + -------- + lars_path_gram : Compute LARS path. + lasso_path : Compute Lasso path with coordinate descent. + LassoLars : Lasso model fit with Least Angle Regression a.k.a. Lars. + Lars : Least Angle Regression model a.k.a. LAR. + LassoLarsCV : Cross-validated Lasso, using the LARS algorithm. + LarsCV : Cross-validated Least Angle Regression model. + sklearn.decomposition.sparse_encode : Sparse coding. + + References + ---------- + .. [1] "Least Angle Regression", Efron et al. + http://statweb.stanford.edu/~tibs/ftp/lars.pdf + + .. [2] `Wikipedia entry on the Least-angle regression + `_ + + .. [3] `Wikipedia entry on the Lasso + `_ + + Examples + -------- + >>> from sklearn.linear_model import lars_path_gram + >>> from sklearn.datasets import make_regression + >>> X, y, true_coef = make_regression( + ... n_samples=100, n_features=5, n_informative=2, coef=True, random_state=0 + ... ) + >>> true_coef + array([ 0. , 0. , 0. , 97.9, 45.7]) + >>> alphas, _, estimated_coef = lars_path_gram(X.T @ y, X.T @ X, n_samples=100) + >>> alphas.shape + (3,) + >>> estimated_coef + array([[ 0. , 0. , 0. ], + [ 0. , 0. , 0. ], + [ 0. , 0. , 0. ], + [ 0. , 46.96, 97.99], + [ 0. , 0. , 45.70]]) + """ + return _lars_path_solver( + X=None, + y=None, + Xy=Xy, + Gram=Gram, + n_samples=n_samples, + max_iter=max_iter, + alpha_min=alpha_min, + method=method, + copy_X=copy_X, + eps=eps, + copy_Gram=copy_Gram, + verbose=verbose, + return_path=return_path, + return_n_iter=return_n_iter, + positive=positive, + ) + + +def _lars_path_solver( + X, + y, + Xy=None, + Gram=None, + n_samples=None, + max_iter=500, + alpha_min=0, + method="lar", + copy_X=True, + eps=np.finfo(float).eps, + copy_Gram=True, + verbose=0, + return_path=True, + return_n_iter=False, + positive=False, +): + """Compute Least Angle Regression or Lasso path using LARS algorithm [1] + + The optimization objective for the case method='lasso' is:: + + (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1 + + in the case of method='lar', the objective function is only known in + the form of an implicit equation (see discussion in [1]) + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : None or ndarray of shape (n_samples, n_features) + Input data. Note that if X is None then Gram must be specified, + i.e., cannot be None or False. + + y : None or ndarray of shape (n_samples,) + Input targets. + + Xy : array-like of shape (n_features,), default=None + `Xy = np.dot(X.T, y)` that can be precomputed. It is useful + only when the Gram matrix is precomputed. + + Gram : None, 'auto' or array-like of shape (n_features, n_features), \ + default=None + Precomputed Gram matrix `(X' * X)`, if ``'auto'``, the Gram + matrix is precomputed from the given X, if there are more samples + than features. + + n_samples : int or float, default=None + Equivalent size of sample. If `None`, it will be `n_samples`. + + max_iter : int, default=500 + Maximum number of iterations to perform, set to infinity for no limit. + + alpha_min : float, default=0 + Minimum correlation along the path. 
It corresponds to the + regularization parameter alpha parameter in the Lasso. + + method : {'lar', 'lasso'}, default='lar' + Specifies the returned model. Select ``'lar'`` for Least Angle + Regression, ``'lasso'`` for the Lasso. + + copy_X : bool, default=True + If ``False``, ``X`` is overwritten. + + eps : float, default=np.finfo(float).eps + The machine-precision regularization in the computation of the + Cholesky diagonal factors. Increase this for very ill-conditioned + systems. Unlike the ``tol`` parameter in some iterative + optimization-based algorithms, this parameter does not control + the tolerance of the optimization. + + copy_Gram : bool, default=True + If ``False``, ``Gram`` is overwritten. + + verbose : int, default=0 + Controls output verbosity. + + return_path : bool, default=True + If ``return_path==True`` returns the entire path, else returns only the + last point of the path. + + return_n_iter : bool, default=False + Whether to return the number of iterations. + + positive : bool, default=False + Restrict coefficients to be >= 0. + This option is only allowed with method 'lasso'. Note that the model + coefficients will not converge to the ordinary-least-squares solution + for small values of alpha. Only coefficients up to the smallest alpha + value (``alphas_[alphas_ > 0.].min()`` when fit_path=True) reached by + the stepwise Lars-Lasso algorithm are typically in congruence with the + solution of the coordinate descent lasso_path function. + + Returns + ------- + alphas : array-like of shape (n_alphas + 1,) + Maximum of covariances (in absolute value) at each iteration. + ``n_alphas`` is either ``max_iter``, ``n_features`` or the + number of nodes in the path with ``alpha >= alpha_min``, whichever + is smaller. + + active : array-like of shape (n_alphas,) + Indices of active variables at the end of the path. + + coefs : array-like of shape (n_features, n_alphas + 1) + Coefficients along the path + + n_iter : int + Number of iterations run. Returned only if return_n_iter is set + to True. + + See Also + -------- + lasso_path + LassoLars + Lars + LassoLarsCV + LarsCV + sklearn.decomposition.sparse_encode + + References + ---------- + .. [1] "Least Angle Regression", Efron et al. + http://statweb.stanford.edu/~tibs/ftp/lars.pdf + + .. [2] `Wikipedia entry on the Least-angle regression + `_ + + .. [3] `Wikipedia entry on the Lasso + `_ + + """ + if method == "lar" and positive: + raise ValueError("Positive constraint not supported for 'lar' coding method.") + + n_samples = n_samples if n_samples is not None else y.size + + if Xy is None: + Cov = np.dot(X.T, y) + else: + Cov = Xy.copy() + + if Gram is None or Gram is False: + Gram = None + if X is None: + raise ValueError("X and Gram cannot both be unspecified.") + elif (isinstance(Gram, str) and Gram == "auto") or Gram is True: + if Gram is True or X.shape[0] > X.shape[1]: + Gram = np.dot(X.T, X) + else: + Gram = None + elif copy_Gram: + Gram = Gram.copy() + + if Gram is None: + n_features = X.shape[1] + else: + n_features = Cov.shape[0] + if Gram.shape != (n_features, n_features): + raise ValueError("The shapes of the inputs Gram and Xy do not match.") + + if copy_X and X is not None and Gram is None: + # force copy. 
setting the array to be fortran-ordered + # speeds up the calculation of the (partial) Gram matrix + # and allows to easily swap columns + X = X.copy("F") + + max_features = min(max_iter, n_features) + + dtypes = set(a.dtype for a in (X, y, Xy, Gram) if a is not None) + if len(dtypes) == 1: + # use the precision level of input data if it is consistent + return_dtype = next(iter(dtypes)) + else: + # fallback to double precision otherwise + return_dtype = np.float64 + + if return_path: + coefs = np.zeros((max_features + 1, n_features), dtype=return_dtype) + alphas = np.zeros(max_features + 1, dtype=return_dtype) + else: + coef, prev_coef = ( + np.zeros(n_features, dtype=return_dtype), + np.zeros(n_features, dtype=return_dtype), + ) + alpha, prev_alpha = ( + np.array([0.0], dtype=return_dtype), + np.array([0.0], dtype=return_dtype), + ) + # above better ideas? + + n_iter, n_active = 0, 0 + active, indices = list(), np.arange(n_features) + # holds the sign of covariance + sign_active = np.empty(max_features, dtype=np.int8) + drop = False + + # will hold the cholesky factorization. Only lower part is + # referenced. + if Gram is None: + L = np.empty((max_features, max_features), dtype=X.dtype) + swap, nrm2 = linalg.get_blas_funcs(("swap", "nrm2"), (X,)) + else: + L = np.empty((max_features, max_features), dtype=Gram.dtype) + swap, nrm2 = linalg.get_blas_funcs(("swap", "nrm2"), (Cov,)) + (solve_cholesky,) = get_lapack_funcs(("potrs",), (L,)) + + if verbose: + if verbose > 1: + print("Step\t\tAdded\t\tDropped\t\tActive set size\t\tC") + else: + sys.stdout.write(".") + sys.stdout.flush() + + tiny32 = np.finfo(np.float32).tiny # to avoid division by 0 warning + cov_precision = np.finfo(Cov.dtype).precision + equality_tolerance = np.finfo(np.float32).eps + + if Gram is not None: + Gram_copy = Gram.copy() + Cov_copy = Cov.copy() + + while True: + if Cov.size: + if positive: + C_idx = np.argmax(Cov) + else: + C_idx = np.argmax(np.abs(Cov)) + + C_ = Cov[C_idx] + + if positive: + C = C_ + else: + C = np.fabs(C_) + else: + C = 0.0 + + if return_path: + alpha = alphas[n_iter, np.newaxis] + coef = coefs[n_iter] + prev_alpha = alphas[n_iter - 1, np.newaxis] + prev_coef = coefs[n_iter - 1] + + alpha[0] = C / n_samples + if alpha[0] <= alpha_min + equality_tolerance: # early stopping + if abs(alpha[0] - alpha_min) > equality_tolerance: + # interpolation factor 0 <= ss < 1 + if n_iter > 0: + # In the first iteration, all alphas are zero, the formula + # below would make ss a NaN + ss = (prev_alpha[0] - alpha_min) / (prev_alpha[0] - alpha[0]) + coef[:] = prev_coef + ss * (coef - prev_coef) + alpha[0] = alpha_min + if return_path: + coefs[n_iter] = coef + break + + if n_iter >= max_iter or n_active >= n_features: + break + if not drop: + ########################################################## + # Append x_j to the Cholesky factorization of (Xa * Xa') # + # # + # ( L 0 ) # + # L -> ( ) , where L * w = Xa' x_j # + # ( w z ) and z = ||x_j|| # + # # + ########################################################## + + if positive: + sign_active[n_active] = np.ones_like(C_) + else: + sign_active[n_active] = np.sign(C_) + m, n = n_active, C_idx + n_active + + Cov[C_idx], Cov[0] = swap(Cov[C_idx], Cov[0]) + indices[n], indices[m] = indices[m], indices[n] + Cov_not_shortened = Cov + Cov = Cov[1:] # remove Cov[0] + + if Gram is None: + X.T[n], X.T[m] = swap(X.T[n], X.T[m]) + c = nrm2(X.T[n_active]) ** 2 + L[n_active, :n_active] = np.dot(X.T[n_active], X.T[:n_active].T) + else: + # swap does only work inplace if matrix is 
fortran + # contiguous ... + Gram[m], Gram[n] = swap(Gram[m], Gram[n]) + Gram[:, m], Gram[:, n] = swap(Gram[:, m], Gram[:, n]) + c = Gram[n_active, n_active] + L[n_active, :n_active] = Gram[n_active, :n_active] + + # Update the cholesky decomposition for the Gram matrix + if n_active: + linalg.solve_triangular( + L[:n_active, :n_active], + L[n_active, :n_active], + trans=0, + lower=1, + overwrite_b=True, + **SOLVE_TRIANGULAR_ARGS, + ) + + v = np.dot(L[n_active, :n_active], L[n_active, :n_active]) + diag = max(np.sqrt(np.abs(c - v)), eps) + L[n_active, n_active] = diag + + if diag < 1e-7: + # The system is becoming too ill-conditioned. + # We have degenerate vectors in our active set. + # We'll 'drop for good' the last regressor added. + warnings.warn( + "Regressors in active set degenerate. " + "Dropping a regressor, after %i iterations, " + "i.e. alpha=%.3e, " + "with an active set of %i regressors, and " + "the smallest cholesky pivot element being %.3e." + " Reduce max_iter or increase eps parameters." + % (n_iter, alpha.item(), n_active, diag), + ConvergenceWarning, + ) + + # XXX: need to figure a 'drop for good' way + Cov = Cov_not_shortened + Cov[0] = 0 + Cov[C_idx], Cov[0] = swap(Cov[C_idx], Cov[0]) + continue + + active.append(indices[n_active]) + n_active += 1 + + if verbose > 1: + print( + "%s\t\t%s\t\t%s\t\t%s\t\t%s" % (n_iter, active[-1], "", n_active, C) + ) + + if method == "lasso" and n_iter > 0 and prev_alpha[0] < alpha[0]: + # alpha is increasing. This is because the updates of Cov are + # bringing in too much numerical error that is greater than + # than the remaining correlation with the + # regressors. Time to bail out + warnings.warn( + "Early stopping the lars path, as the residues " + "are small and the current value of alpha is no " + "longer well controlled. %i iterations, alpha=%.3e, " + "previous alpha=%.3e, with an active set of %i " + "regressors." % (n_iter, alpha.item(), prev_alpha.item(), n_active), + ConvergenceWarning, + ) + break + + # least squares solution + least_squares, _ = solve_cholesky( + L[:n_active, :n_active], sign_active[:n_active], lower=True + ) + + if least_squares.size == 1 and least_squares == 0: + # This happens because sign_active[:n_active] = 0 + least_squares[...] = 1 + AA = 1.0 + else: + # is this really needed ? + AA = 1.0 / np.sqrt(np.sum(least_squares * sign_active[:n_active])) + + if not np.isfinite(AA): + # L is too ill-conditioned + i = 0 + L_ = L[:n_active, :n_active].copy() + while not np.isfinite(AA): + L_.flat[:: n_active + 1] += (2**i) * eps + least_squares, _ = solve_cholesky( + L_, sign_active[:n_active], lower=True + ) + tmp = max(np.sum(least_squares * sign_active[:n_active]), eps) + AA = 1.0 / np.sqrt(tmp) + i += 1 + least_squares *= AA + + if Gram is None: + # equiangular direction of variables in the active set + eq_dir = np.dot(X.T[:n_active].T, least_squares) + # correlation between each unactive variables and + # eqiangular vector + corr_eq_dir = np.dot(X.T[n_active:], eq_dir) + else: + # if huge number of features, this takes 50% of time, I + # think could be avoided if we just update it using an + # orthogonal (QR) decomposition of X + corr_eq_dir = np.dot(Gram[:n_active, n_active:].T, least_squares) + + # Explicit rounding can be necessary to avoid `np.argmax(Cov)` yielding + # unstable results because of rounding errors. 
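+        # cov_precision was computed above as np.finfo(Cov.dtype).precision
+        # (15 decimal digits for float64), so the rounding below only trims
+        # trailing digits that are already dominated by floating-point error.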
+ np.around(corr_eq_dir, decimals=cov_precision, out=corr_eq_dir) + + g1 = arrayfuncs.min_pos((C - Cov) / (AA - corr_eq_dir + tiny32)) + if positive: + gamma_ = min(g1, C / AA) + else: + g2 = arrayfuncs.min_pos((C + Cov) / (AA + corr_eq_dir + tiny32)) + gamma_ = min(g1, g2, C / AA) + + # TODO: better names for these variables: z + drop = False + z = -coef[active] / (least_squares + tiny32) + z_pos = arrayfuncs.min_pos(z) + if z_pos < gamma_: + # some coefficients have changed sign + idx = np.where(z == z_pos)[0][::-1] + + # update the sign, important for LAR + sign_active[idx] = -sign_active[idx] + + if method == "lasso": + gamma_ = z_pos + drop = True + + n_iter += 1 + + if return_path: + if n_iter >= coefs.shape[0]: + del coef, alpha, prev_alpha, prev_coef + # resize the coefs and alphas array + add_features = 2 * max(1, (max_features - n_active)) + coefs = np.resize(coefs, (n_iter + add_features, n_features)) + coefs[-add_features:] = 0 + alphas = np.resize(alphas, n_iter + add_features) + alphas[-add_features:] = 0 + coef = coefs[n_iter] + prev_coef = coefs[n_iter - 1] + else: + # mimic the effect of incrementing n_iter on the array references + prev_coef = coef + prev_alpha[0] = alpha[0] + coef = np.zeros_like(coef) + + coef[active] = prev_coef[active] + gamma_ * least_squares + + # update correlations + Cov -= gamma_ * corr_eq_dir + + # See if any coefficient has changed sign + if drop and method == "lasso": + # handle the case when idx is not length of 1 + for ii in idx: + arrayfuncs.cholesky_delete(L[:n_active, :n_active], ii) + + n_active -= 1 + # handle the case when idx is not length of 1 + drop_idx = [active.pop(ii) for ii in idx] + + if Gram is None: + # propagate dropped variable + for ii in idx: + for i in range(ii, n_active): + X.T[i], X.T[i + 1] = swap(X.T[i], X.T[i + 1]) + # yeah this is stupid + indices[i], indices[i + 1] = indices[i + 1], indices[i] + + # TODO: this could be updated + residual = y - np.dot(X[:, :n_active], coef[active]) + temp = np.dot(X.T[n_active], residual) + + Cov = np.r_[temp, Cov] + else: + for ii in idx: + for i in range(ii, n_active): + indices[i], indices[i + 1] = indices[i + 1], indices[i] + Gram[i], Gram[i + 1] = swap(Gram[i], Gram[i + 1]) + Gram[:, i], Gram[:, i + 1] = swap(Gram[:, i], Gram[:, i + 1]) + + # Cov_n = Cov_j + x_j * X + increment(betas) TODO: + # will this still work with multiple drops ? + + # recompute covariance. Probably could be done better + # wrong as Xy is not swapped with the rest of variables + + # TODO: this could be updated + temp = Cov_copy[drop_idx] - np.dot(Gram_copy[drop_idx], coef) + Cov = np.r_[temp, Cov] + + sign_active = np.delete(sign_active, idx) + sign_active = np.append(sign_active, 0.0) # just to maintain size + if verbose > 1: + print( + "%s\t\t%s\t\t%s\t\t%s\t\t%s" + % (n_iter, "", drop_idx, n_active, abs(temp)) + ) + + if return_path: + # resize coefs in case of early stop + alphas = alphas[: n_iter + 1] + coefs = coefs[: n_iter + 1] + + if return_n_iter: + return alphas, active, coefs.T, n_iter + else: + return alphas, active, coefs.T + else: + if return_n_iter: + return alpha, active, coef, n_iter + else: + return alpha, active, coef + + +############################################################################### +# Estimator classes + + +class Lars(MultiOutputMixin, RegressorMixin, LinearModel): + """Least Angle Regression model a.k.a. LAR. + + Read more in the :ref:`User Guide `. 
+ + Parameters + ---------- + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to false, no intercept will be used in calculations + (i.e. data is expected to be centered). + + verbose : bool or int, default=False + Sets the verbosity amount. + + precompute : bool, 'auto' or array-like , default='auto' + Whether to use a precomputed Gram matrix to speed up + calculations. If set to ``'auto'`` let us decide. The Gram + matrix can also be passed as argument. + + n_nonzero_coefs : int, default=500 + Target number of non-zero coefficients. Use ``np.inf`` for no limit. + + eps : float, default=np.finfo(float).eps + The machine-precision regularization in the computation of the + Cholesky diagonal factors. Increase this for very ill-conditioned + systems. Unlike the ``tol`` parameter in some iterative + optimization-based algorithms, this parameter does not control + the tolerance of the optimization. + + copy_X : bool, default=True + If ``True``, X will be copied; else, it may be overwritten. + + fit_path : bool, default=True + If True the full path is stored in the ``coef_path_`` attribute. + If you compute the solution for a large problem or many targets, + setting ``fit_path`` to ``False`` will lead to a speedup, especially + with a small alpha. + + jitter : float, default=None + Upper bound on a uniform noise parameter to be added to the + `y` values, to satisfy the model's assumption of + one-at-a-time computations. Might help with stability. + + .. versionadded:: 0.23 + + random_state : int, RandomState instance or None, default=None + Determines random number generation for jittering. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. Ignored if `jitter` is None. + + .. versionadded:: 0.23 + + Attributes + ---------- + alphas_ : array-like of shape (n_alphas + 1,) or list of such arrays + Maximum of covariances (in absolute value) at each iteration. + ``n_alphas`` is either ``max_iter``, ``n_features`` or the + number of nodes in the path with ``alpha >= alpha_min``, whichever + is smaller. If this is a list of array-like, the length of the outer + list is `n_targets`. + + active_ : list of shape (n_alphas,) or list of such lists + Indices of active variables at the end of the path. + If this is a list of list, the length of the outer list is `n_targets`. + + coef_path_ : array-like of shape (n_features, n_alphas + 1) or list \ + of such arrays + The varying values of the coefficients along the path. It is not + present if the ``fit_path`` parameter is ``False``. If this is a list + of array-like, the length of the outer list is `n_targets`. + + coef_ : array-like of shape (n_features,) or (n_targets, n_features) + Parameter vector (w in the formulation formula). + + intercept_ : float or array-like of shape (n_targets,) + Independent term in decision function. + + n_iter_ : array-like or int + The number of iterations taken by lars_path to find the + grid of alphas for each target. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + lars_path: Compute Least Angle Regression or Lasso + path using LARS algorithm. + LarsCV : Cross-validated Least Angle Regression model. + sklearn.decomposition.sparse_encode : Sparse coding. 
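+
+    Notes
+    -----
+    A rough usage sketch for some training arrays ``X`` and ``y`` (attribute
+    shapes assume a single target and the default ``fit_path=True``)::
+
+        reg = Lars(n_nonzero_coefs=2).fit(X, y)
+        reg.coef_path_.shape   # (n_features, n_alphas + 1)
+        reg.alphas_            # maxima of covariances along the path
+        reg.active_            # indices of the selected features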
+ + Examples + -------- + >>> from sklearn import linear_model + >>> reg = linear_model.Lars(n_nonzero_coefs=1) + >>> reg.fit([[-1, 1], [0, 0], [1, 1]], [-1.1111, 0, -1.1111]) + Lars(n_nonzero_coefs=1) + >>> print(reg.coef_) + [ 0. -1.11] + """ + + _parameter_constraints: dict = { + "fit_intercept": ["boolean"], + "verbose": ["verbose"], + "precompute": ["boolean", StrOptions({"auto"}), np.ndarray, Hidden(None)], + "n_nonzero_coefs": [Interval(Integral, 1, None, closed="left")], + "eps": [Interval(Real, 0, None, closed="left")], + "copy_X": ["boolean"], + "fit_path": ["boolean"], + "jitter": [Interval(Real, 0, None, closed="left"), None], + "random_state": ["random_state"], + } + + method = "lar" + positive = False + + def __init__( + self, + *, + fit_intercept=True, + verbose=False, + precompute="auto", + n_nonzero_coefs=500, + eps=np.finfo(float).eps, + copy_X=True, + fit_path=True, + jitter=None, + random_state=None, + ): + self.fit_intercept = fit_intercept + self.verbose = verbose + self.precompute = precompute + self.n_nonzero_coefs = n_nonzero_coefs + self.eps = eps + self.copy_X = copy_X + self.fit_path = fit_path + self.jitter = jitter + self.random_state = random_state + + @staticmethod + def _get_gram(precompute, X, y): + if (not hasattr(precompute, "__array__")) and ( + (precompute is True) + or (precompute == "auto" and X.shape[0] > X.shape[1]) + or (precompute == "auto" and y.shape[1] > 1) + ): + precompute = np.dot(X.T, X) + + return precompute + + def _fit(self, X, y, max_iter, alpha, fit_path, Xy=None): + """Auxiliary method to fit the model using X, y as training data""" + n_features = X.shape[1] + + X, y, X_offset, y_offset, X_scale = _preprocess_data( + X, y, fit_intercept=self.fit_intercept, copy=self.copy_X + ) + + if y.ndim == 1: + y = y[:, np.newaxis] + + n_targets = y.shape[1] + + Gram = self._get_gram(self.precompute, X, y) + + self.alphas_ = [] + self.n_iter_ = [] + self.coef_ = np.empty((n_targets, n_features), dtype=X.dtype) + + if fit_path: + self.active_ = [] + self.coef_path_ = [] + for k in range(n_targets): + this_Xy = None if Xy is None else Xy[:, k] + alphas, active, coef_path, n_iter_ = lars_path( + X, + y[:, k], + Gram=Gram, + Xy=this_Xy, + copy_X=self.copy_X, + copy_Gram=True, + alpha_min=alpha, + method=self.method, + verbose=max(0, self.verbose - 1), + max_iter=max_iter, + eps=self.eps, + return_path=True, + return_n_iter=True, + positive=self.positive, + ) + self.alphas_.append(alphas) + self.active_.append(active) + self.n_iter_.append(n_iter_) + self.coef_path_.append(coef_path) + self.coef_[k] = coef_path[:, -1] + + if n_targets == 1: + self.alphas_, self.active_, self.coef_path_, self.coef_ = [ + a[0] + for a in (self.alphas_, self.active_, self.coef_path_, self.coef_) + ] + self.n_iter_ = self.n_iter_[0] + else: + for k in range(n_targets): + this_Xy = None if Xy is None else Xy[:, k] + alphas, _, self.coef_[k], n_iter_ = lars_path( + X, + y[:, k], + Gram=Gram, + Xy=this_Xy, + copy_X=self.copy_X, + copy_Gram=True, + alpha_min=alpha, + method=self.method, + verbose=max(0, self.verbose - 1), + max_iter=max_iter, + eps=self.eps, + return_path=False, + return_n_iter=True, + positive=self.positive, + ) + self.alphas_.append(alphas) + self.n_iter_.append(n_iter_) + if n_targets == 1: + self.alphas_ = self.alphas_[0] + self.n_iter_ = self.n_iter_[0] + + self._set_intercept(X_offset, y_offset, X_scale) + return self + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, Xy=None): + """Fit the model using X, y as training data. 
+ + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) or (n_samples, n_targets) + Target values. + + Xy : array-like of shape (n_features,) or (n_features, n_targets), \ + default=None + Xy = np.dot(X.T, y) that can be precomputed. It is useful + only when the Gram matrix is precomputed. + + Returns + ------- + self : object + Returns an instance of self. + """ + X, y = validate_data( + self, X, y, force_writeable=True, y_numeric=True, multi_output=True + ) + + alpha = getattr(self, "alpha", 0.0) + if hasattr(self, "n_nonzero_coefs"): + alpha = 0.0 # n_nonzero_coefs parametrization takes priority + max_iter = self.n_nonzero_coefs + else: + max_iter = self.max_iter + + if self.jitter is not None: + rng = check_random_state(self.random_state) + + noise = rng.uniform(high=self.jitter, size=len(y)) + y = y + noise + + self._fit( + X, + y, + max_iter=max_iter, + alpha=alpha, + fit_path=self.fit_path, + Xy=Xy, + ) + + return self + + +class LassoLars(Lars): + """Lasso model fit with Least Angle Regression a.k.a. Lars. + + It is a Linear Model trained with an L1 prior as regularizer. + + The optimization objective for Lasso is:: + + (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1 + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + alpha : float, default=1.0 + Constant that multiplies the penalty term. Defaults to 1.0. + ``alpha = 0`` is equivalent to an ordinary least square, solved + by :class:`LinearRegression`. For numerical reasons, using + ``alpha = 0`` with the LassoLars object is not advised and you + should prefer the LinearRegression object. + + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to false, no intercept will be used in calculations + (i.e. data is expected to be centered). + + verbose : bool or int, default=False + Sets the verbosity amount. + + precompute : bool, 'auto' or array-like, default='auto' + Whether to use a precomputed Gram matrix to speed up + calculations. If set to ``'auto'`` let us decide. The Gram + matrix can also be passed as argument. + + max_iter : int, default=500 + Maximum number of iterations to perform. + + eps : float, default=np.finfo(float).eps + The machine-precision regularization in the computation of the + Cholesky diagonal factors. Increase this for very ill-conditioned + systems. Unlike the ``tol`` parameter in some iterative + optimization-based algorithms, this parameter does not control + the tolerance of the optimization. + + copy_X : bool, default=True + If True, X will be copied; else, it may be overwritten. + + fit_path : bool, default=True + If ``True`` the full path is stored in the ``coef_path_`` attribute. + If you compute the solution for a large problem or many targets, + setting ``fit_path`` to ``False`` will lead to a speedup, especially + with a small alpha. + + positive : bool, default=False + Restrict coefficients to be >= 0. Be aware that you might want to + remove fit_intercept which is set True by default. + Under the positive restriction the model coefficients will not converge + to the ordinary-least-squares solution for small values of alpha. + Only coefficients up to the smallest alpha value (``alphas_[alphas_ > + 0.].min()`` when fit_path=True) reached by the stepwise Lars-Lasso + algorithm are typically in congruence with the solution of the + coordinate descent Lasso estimator. 
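Since `LassoLars` and the coordinate-descent `Lasso` estimator minimize the same objective shown above, their coefficients should agree closely on a well-conditioned problem. A quick sketch with synthetic data (illustrative only):

    import numpy as np
    from sklearn.linear_model import Lasso, LassoLars

    rng = np.random.RandomState(0)
    X = rng.randn(100, 8)
    y = X[:, 1] - 2.0 * X[:, 5] + 0.1 * rng.randn(100)

    lars_lasso = LassoLars(alpha=0.05).fit(X, y)
    cd_lasso = Lasso(alpha=0.05).fit(X, y)
    # Both solvers target the same penalized least-squares objective, so the
    # largest coefficient difference should be small here.
    print(np.max(np.abs(lars_lasso.coef_ - cd_lasso.coef_)))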
+ + jitter : float, default=None + Upper bound on a uniform noise parameter to be added to the + `y` values, to satisfy the model's assumption of + one-at-a-time computations. Might help with stability. + + .. versionadded:: 0.23 + + random_state : int, RandomState instance or None, default=None + Determines random number generation for jittering. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. Ignored if `jitter` is None. + + .. versionadded:: 0.23 + + Attributes + ---------- + alphas_ : array-like of shape (n_alphas + 1,) or list of such arrays + Maximum of covariances (in absolute value) at each iteration. + ``n_alphas`` is either ``max_iter``, ``n_features`` or the + number of nodes in the path with ``alpha >= alpha_min``, whichever + is smaller. If this is a list of array-like, the length of the outer + list is `n_targets`. + + active_ : list of length n_alphas or list of such lists + Indices of active variables at the end of the path. + If this is a list of list, the length of the outer list is `n_targets`. + + coef_path_ : array-like of shape (n_features, n_alphas + 1) or list \ + of such arrays + If a list is passed it's expected to be one of n_targets such arrays. + The varying values of the coefficients along the path. It is not + present if the ``fit_path`` parameter is ``False``. If this is a list + of array-like, the length of the outer list is `n_targets`. + + coef_ : array-like of shape (n_features,) or (n_targets, n_features) + Parameter vector (w in the formulation formula). + + intercept_ : float or array-like of shape (n_targets,) + Independent term in decision function. + + n_iter_ : array-like or int + The number of iterations taken by lars_path to find the + grid of alphas for each target. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + lars_path : Compute Least Angle Regression or Lasso + path using LARS algorithm. + lasso_path : Compute Lasso path with coordinate descent. + Lasso : Linear Model trained with L1 prior as + regularizer (aka the Lasso). + LassoCV : Lasso linear model with iterative fitting + along a regularization path. + LassoLarsCV: Cross-validated Lasso, using the LARS algorithm. + LassoLarsIC : Lasso model fit with Lars using BIC + or AIC for model selection. + sklearn.decomposition.sparse_encode : Sparse coding. + + Examples + -------- + >>> from sklearn import linear_model + >>> reg = linear_model.LassoLars(alpha=0.01) + >>> reg.fit([[-1, 1], [0, 0], [1, 1]], [-1, 0, -1]) + LassoLars(alpha=0.01) + >>> print(reg.coef_) + [ 0. 
-0.955] + """ + + _parameter_constraints: dict = { + **Lars._parameter_constraints, + "alpha": [Interval(Real, 0, None, closed="left")], + "max_iter": [Interval(Integral, 0, None, closed="left")], + "positive": ["boolean"], + } + _parameter_constraints.pop("n_nonzero_coefs") + + method = "lasso" + + def __init__( + self, + alpha=1.0, + *, + fit_intercept=True, + verbose=False, + precompute="auto", + max_iter=500, + eps=np.finfo(float).eps, + copy_X=True, + fit_path=True, + positive=False, + jitter=None, + random_state=None, + ): + self.alpha = alpha + self.fit_intercept = fit_intercept + self.max_iter = max_iter + self.verbose = verbose + self.positive = positive + self.precompute = precompute + self.copy_X = copy_X + self.eps = eps + self.fit_path = fit_path + self.jitter = jitter + self.random_state = random_state + + +############################################################################### +# Cross-validated estimator classes + + +def _check_copy_and_writeable(array, copy=False): + if copy or not array.flags.writeable: + return array.copy() + return array + + +def _lars_path_residues( + X_train, + y_train, + X_test, + y_test, + Gram=None, + copy=True, + method="lar", + verbose=False, + fit_intercept=True, + max_iter=500, + eps=np.finfo(float).eps, + positive=False, +): + """Compute the residues on left-out data for a full LARS path + + Parameters + ----------- + X_train : array-like of shape (n_samples, n_features) + The data to fit the LARS on + + y_train : array-like of shape (n_samples,) + The target variable to fit LARS on + + X_test : array-like of shape (n_samples, n_features) + The data to compute the residues on + + y_test : array-like of shape (n_samples,) + The target variable to compute the residues on + + Gram : None, 'auto' or array-like of shape (n_features, n_features), \ + default=None + Precomputed Gram matrix (X' * X), if ``'auto'``, the Gram + matrix is precomputed from the given X, if there are more samples + than features + + copy : bool, default=True + Whether X_train, X_test, y_train and y_test should be copied; + if False, they may be overwritten. + + method : {'lar' , 'lasso'}, default='lar' + Specifies the returned model. Select ``'lar'`` for Least Angle + Regression, ``'lasso'`` for the Lasso. + + verbose : bool or int, default=False + Sets the amount of verbosity + + fit_intercept : bool, default=True + whether to calculate the intercept for this model. If set + to false, no intercept will be used in calculations + (i.e. data is expected to be centered). + + positive : bool, default=False + Restrict coefficients to be >= 0. Be aware that you might want to + remove fit_intercept which is set True by default. + See reservations for using this option in combination with method + 'lasso' for expected small values of alpha in the doc of LassoLarsCV + and LassoLarsIC. + + max_iter : int, default=500 + Maximum number of iterations to perform. + + eps : float, default=np.finfo(float).eps + The machine-precision regularization in the computation of the + Cholesky diagonal factors. Increase this for very ill-conditioned + systems. Unlike the ``tol`` parameter in some iterative + optimization-based algorithms, this parameter does not control + the tolerance of the optimization. + + Returns + -------- + alphas : array-like of shape (n_alphas,) + Maximum of covariances (in absolute value) at each iteration. + ``n_alphas`` is either ``max_iter`` or ``n_features``, whichever + is smaller. + + active : list + Indices of active variables at the end of the path. 
+ + coefs : array-like of shape (n_features, n_alphas) + Coefficients along the path + + residues : array-like of shape (n_alphas, n_samples) + Residues of the prediction on the test data + """ + X_train = _check_copy_and_writeable(X_train, copy) + y_train = _check_copy_and_writeable(y_train, copy) + X_test = _check_copy_and_writeable(X_test, copy) + y_test = _check_copy_and_writeable(y_test, copy) + + if fit_intercept: + X_mean = X_train.mean(axis=0) + X_train -= X_mean + X_test -= X_mean + y_mean = y_train.mean(axis=0) + y_train = as_float_array(y_train, copy=False) + y_train -= y_mean + y_test = as_float_array(y_test, copy=False) + y_test -= y_mean + + alphas, active, coefs = lars_path( + X_train, + y_train, + Gram=Gram, + copy_X=False, + copy_Gram=False, + method=method, + verbose=max(0, verbose - 1), + max_iter=max_iter, + eps=eps, + positive=positive, + ) + residues = np.dot(X_test, coefs) - y_test[:, np.newaxis] + return alphas, active, coefs, residues.T + + +class LarsCV(Lars): + """Cross-validated Least Angle Regression model. + + See glossary entry for :term:`cross-validation estimator`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to false, no intercept will be used in calculations + (i.e. data is expected to be centered). + + verbose : bool or int, default=False + Sets the verbosity amount. + + max_iter : int, default=500 + Maximum number of iterations to perform. + + precompute : bool, 'auto' or array-like , default='auto' + Whether to use a precomputed Gram matrix to speed up + calculations. If set to ``'auto'`` let us decide. The Gram matrix + cannot be passed as argument since we will use only subsets of X. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross-validation, + - integer, to specify the number of folds. + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, :class:`~sklearn.model_selection.KFold` is used. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + ``cv`` default value if None changed from 3-fold to 5-fold. + + max_n_alphas : int, default=1000 + The maximum number of points on the path used to compute the + residuals in the cross-validation. + + n_jobs : int or None, default=None + Number of CPUs to use during the cross validation. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + eps : float, default=np.finfo(float).eps + The machine-precision regularization in the computation of the + Cholesky diagonal factors. Increase this for very ill-conditioned + systems. Unlike the ``tol`` parameter in some iterative + optimization-based algorithms, this parameter does not control + the tolerance of the optimization. + + copy_X : bool, default=True + If ``True``, X will be copied; else, it may be overwritten. + + Attributes + ---------- + active_ : list of length n_alphas or list of such lists + Indices of active variables at the end of the path. + If this is a list of lists, the outer list length is `n_targets`. 
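The per-fold computation carried out by `_lars_path_residues` above can be sketched roughly as follows (synthetic data; the centering, copying and transposition details of the real helper are omitted):

    import numpy as np
    from sklearn.linear_model import lars_path

    rng = np.random.RandomState(0)
    X = rng.randn(60, 4)
    y = X[:, 0] - 0.5 * X[:, 3] + 0.1 * rng.randn(60)
    X_train, X_test = X[:40], X[40:]
    y_train, y_test = y[:40], y[40:]

    alphas, active, coefs = lars_path(X_train, y_train, method="lar")
    # One column of held-out residuals per alpha value on the training path.
    residues = X_test @ coefs - y_test[:, np.newaxis]
    print(alphas.shape, residues.shape)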
+ + coef_ : array-like of shape (n_features,) + parameter vector (w in the formulation formula) + + intercept_ : float + independent term in decision function + + coef_path_ : array-like of shape (n_features, n_alphas) + the varying values of the coefficients along the path + + alpha_ : float + the estimated regularization parameter alpha + + alphas_ : array-like of shape (n_alphas,) + the different values of alpha along the path + + cv_alphas_ : array-like of shape (n_cv_alphas,) + all the values of alpha along the path for the different folds + + mse_path_ : array-like of shape (n_folds, n_cv_alphas) + the mean square error on left-out for each fold along the path + (alpha values given by ``cv_alphas``) + + n_iter_ : array-like or int + the number of iterations run by Lars with the optimal alpha. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + lars_path : Compute Least Angle Regression or Lasso + path using LARS algorithm. + lasso_path : Compute Lasso path with coordinate descent. + Lasso : Linear Model trained with L1 prior as + regularizer (aka the Lasso). + LassoCV : Lasso linear model with iterative fitting + along a regularization path. + LassoLars : Lasso model fit with Least Angle Regression a.k.a. Lars. + LassoLarsIC : Lasso model fit with Lars using BIC + or AIC for model selection. + sklearn.decomposition.sparse_encode : Sparse coding. + + Notes + ----- + In `fit`, once the best parameter `alpha` is found through + cross-validation, the model is fit again using the entire training set. + + Examples + -------- + >>> from sklearn.linear_model import LarsCV + >>> from sklearn.datasets import make_regression + >>> X, y = make_regression(n_samples=200, noise=4.0, random_state=0) + >>> reg = LarsCV(cv=5).fit(X, y) + >>> reg.score(X, y) + 0.9996 + >>> reg.alpha_ + np.float64(0.2961) + >>> reg.predict(X[:1,]) + array([154.3996]) + """ + + _parameter_constraints: dict = { + **Lars._parameter_constraints, + "max_iter": [Interval(Integral, 0, None, closed="left")], + "cv": ["cv_object"], + "max_n_alphas": [Interval(Integral, 1, None, closed="left")], + "n_jobs": [Integral, None], + } + + for parameter in ["n_nonzero_coefs", "jitter", "fit_path", "random_state"]: + _parameter_constraints.pop(parameter) + + method = "lar" + + def __init__( + self, + *, + fit_intercept=True, + verbose=False, + max_iter=500, + precompute="auto", + cv=None, + max_n_alphas=1000, + n_jobs=None, + eps=np.finfo(float).eps, + copy_X=True, + ): + self.max_iter = max_iter + self.cv = cv + self.max_n_alphas = max_n_alphas + self.n_jobs = n_jobs + super().__init__( + fit_intercept=fit_intercept, + verbose=verbose, + precompute=precompute, + n_nonzero_coefs=500, + eps=eps, + copy_X=copy_X, + fit_path=True, + ) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.target_tags.multi_output = False + return tags + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, **params): + """Fit the model using X, y as training data. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) + Target values. + + **params : dict, default=None + Parameters to be passed to the CV splitter. + + .. 
versionadded:: 1.4 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + Returns an instance of self. + """ + _raise_for_params(params, self, "fit") + + X, y = validate_data(self, X, y, force_writeable=True, y_numeric=True) + X = as_float_array(X, copy=self.copy_X) + y = as_float_array(y, copy=self.copy_X) + + # init cross-validation generator + cv = check_cv(self.cv, classifier=False) + + if _routing_enabled(): + routed_params = process_routing(self, "fit", **params) + else: + routed_params = Bunch(splitter=Bunch(split={})) + + # As we use cross-validation, the Gram matrix is not precomputed here + Gram = self.precompute + if hasattr(Gram, "__array__"): + warnings.warn( + 'Parameter "precompute" cannot be an array in ' + '%s. Automatically switch to "auto" instead.' % self.__class__.__name__ + ) + Gram = "auto" + + cv_paths = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)( + delayed(_lars_path_residues)( + X[train], + y[train], + X[test], + y[test], + Gram=Gram, + copy=False, + method=self.method, + verbose=max(0, self.verbose - 1), + fit_intercept=self.fit_intercept, + max_iter=self.max_iter, + eps=self.eps, + positive=self.positive, + ) + for train, test in cv.split(X, y, **routed_params.splitter.split) + ) + all_alphas = np.concatenate(next(zip(*cv_paths))) + # Unique also sorts + all_alphas = np.unique(all_alphas) + # Take at most max_n_alphas values + stride = int(max(1, int(len(all_alphas) / float(self.max_n_alphas)))) + all_alphas = all_alphas[::stride] + + mse_path = np.empty((len(all_alphas), len(cv_paths))) + for index, (alphas, _, _, residues) in enumerate(cv_paths): + alphas = alphas[::-1] + residues = residues[::-1] + if alphas[0] != 0: + alphas = np.r_[0, alphas] + residues = np.r_[residues[0, np.newaxis], residues] + if alphas[-1] != all_alphas[-1]: + alphas = np.r_[alphas, all_alphas[-1]] + residues = np.r_[residues, residues[-1, np.newaxis]] + this_residues = interpolate.interp1d(alphas, residues, axis=0)(all_alphas) + this_residues **= 2 + mse_path[:, index] = np.mean(this_residues, axis=-1) + + mask = np.all(np.isfinite(mse_path), axis=-1) + all_alphas = all_alphas[mask] + mse_path = mse_path[mask] + # Select the alpha that minimizes left-out error + i_best_alpha = np.argmin(mse_path.mean(axis=-1)) + best_alpha = all_alphas[i_best_alpha] + + # Store our parameters + self.alpha_ = best_alpha + self.cv_alphas_ = all_alphas + self.mse_path_ = mse_path + + # Now compute the full model using best_alpha + # it will call a lasso internally when self if LassoLarsCV + # as self.method == 'lasso' + self._fit( + X, + y, + max_iter=self.max_iter, + alpha=best_alpha, + Xy=None, + fit_path=True, + ) + return self + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.4 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. + """ + router = MetadataRouter(owner=self.__class__.__name__).add( + splitter=check_cv(self.cv), + method_mapping=MethodMapping().add(caller="fit", callee="split"), + ) + return router + + +class LassoLarsCV(LarsCV): + """Cross-validated Lasso, using the LARS algorithm. + + See glossary entry for :term:`cross-validation estimator`. 
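Conceptually, the alpha selection in `LarsCV.fit` above interpolates each fold's held-out error onto a common alpha grid and keeps the alpha with the smallest mean error. A toy sketch of that idea with made-up numbers (not the actual implementation, which interpolates residuals and squares them afterwards):

    import numpy as np
    from scipy import interpolate

    # Two hypothetical folds, each with its own alpha grid and held-out MSE.
    fold_alphas = [np.array([0.0, 0.2, 0.8]), np.array([0.0, 0.3, 0.7, 1.0])]
    fold_mse = [np.array([1.5, 0.9, 1.2]), np.array([1.4, 1.0, 0.8, 1.3])]

    all_alphas = np.unique(np.concatenate(fold_alphas))
    mse_path = np.column_stack([
        interpolate.interp1d(a, m, bounds_error=False, fill_value=(m[0], m[-1]))(all_alphas)
        for a, m in zip(fold_alphas, fold_mse)
    ])
    best_alpha = all_alphas[np.argmin(mse_path.mean(axis=1))]
    print(best_alpha)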
+ + The optimization objective for Lasso is:: + + (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1 + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to false, no intercept will be used in calculations + (i.e. data is expected to be centered). + + verbose : bool or int, default=False + Sets the verbosity amount. + + max_iter : int, default=500 + Maximum number of iterations to perform. + + precompute : bool or 'auto' , default='auto' + Whether to use a precomputed Gram matrix to speed up + calculations. If set to ``'auto'`` let us decide. The Gram matrix + cannot be passed as argument since we will use only subsets of X. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross-validation, + - integer, to specify the number of folds. + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, :class:`~sklearn.model_selection.KFold` is used. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + ``cv`` default value if None changed from 3-fold to 5-fold. + + max_n_alphas : int, default=1000 + The maximum number of points on the path used to compute the + residuals in the cross-validation. + + n_jobs : int or None, default=None + Number of CPUs to use during the cross validation. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + eps : float, default=np.finfo(float).eps + The machine-precision regularization in the computation of the + Cholesky diagonal factors. Increase this for very ill-conditioned + systems. Unlike the ``tol`` parameter in some iterative + optimization-based algorithms, this parameter does not control + the tolerance of the optimization. + + copy_X : bool, default=True + If True, X will be copied; else, it may be overwritten. + + positive : bool, default=False + Restrict coefficients to be >= 0. Be aware that you might want to + remove fit_intercept which is set True by default. + Under the positive restriction the model coefficients do not converge + to the ordinary-least-squares solution for small values of alpha. + Only coefficients up to the smallest alpha value (``alphas_[alphas_ > + 0.].min()`` when fit_path=True) reached by the stepwise Lars-Lasso + algorithm are typically in congruence with the solution of the + coordinate descent Lasso estimator. + As a consequence using LassoLarsCV only makes sense for problems where + a sparse solution is expected and/or reached. + + Attributes + ---------- + coef_ : array-like of shape (n_features,) + parameter vector (w in the formulation formula) + + intercept_ : float + independent term in decision function. 
+ + coef_path_ : array-like of shape (n_features, n_alphas) + the varying values of the coefficients along the path + + alpha_ : float + the estimated regularization parameter alpha + + alphas_ : array-like of shape (n_alphas,) + the different values of alpha along the path + + cv_alphas_ : array-like of shape (n_cv_alphas,) + all the values of alpha along the path for the different folds + + mse_path_ : array-like of shape (n_folds, n_cv_alphas) + the mean square error on left-out for each fold along the path + (alpha values given by ``cv_alphas``) + + n_iter_ : array-like or int + the number of iterations run by Lars with the optimal alpha. + + active_ : list of int + Indices of active variables at the end of the path. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + lars_path : Compute Least Angle Regression or Lasso + path using LARS algorithm. + lasso_path : Compute Lasso path with coordinate descent. + Lasso : Linear Model trained with L1 prior as + regularizer (aka the Lasso). + LassoCV : Lasso linear model with iterative fitting + along a regularization path. + LassoLars : Lasso model fit with Least Angle Regression a.k.a. Lars. + LassoLarsIC : Lasso model fit with Lars using BIC + or AIC for model selection. + sklearn.decomposition.sparse_encode : Sparse coding. + + Notes + ----- + The object solves the same problem as the + :class:`~sklearn.linear_model.LassoCV` object. However, unlike the + :class:`~sklearn.linear_model.LassoCV`, it find the relevant alphas values + by itself. In general, because of this property, it will be more stable. + However, it is more fragile to heavily multicollinear datasets. + + It is more efficient than the :class:`~sklearn.linear_model.LassoCV` if + only a small number of features are selected compared to the total number, + for instance if there are very few samples compared to the number of + features. + + In `fit`, once the best parameter `alpha` is found through + cross-validation, the model is fit again using the entire training set. + + Examples + -------- + >>> from sklearn.linear_model import LassoLarsCV + >>> from sklearn.datasets import make_regression + >>> X, y = make_regression(noise=4.0, random_state=0) + >>> reg = LassoLarsCV(cv=5).fit(X, y) + >>> reg.score(X, y) + 0.9993 + >>> reg.alpha_ + np.float64(0.3972) + >>> reg.predict(X[:1,]) + array([-78.4831]) + """ + + _parameter_constraints = { + **LarsCV._parameter_constraints, + "positive": ["boolean"], + } + + method = "lasso" + + def __init__( + self, + *, + fit_intercept=True, + verbose=False, + max_iter=500, + precompute="auto", + cv=None, + max_n_alphas=1000, + n_jobs=None, + eps=np.finfo(float).eps, + copy_X=True, + positive=False, + ): + self.fit_intercept = fit_intercept + self.verbose = verbose + self.max_iter = max_iter + self.precompute = precompute + self.cv = cv + self.max_n_alphas = max_n_alphas + self.n_jobs = n_jobs + self.eps = eps + self.copy_X = copy_X + self.positive = positive + # XXX : we don't use super().__init__ + # to avoid setting n_nonzero_coefs + + +class LassoLarsIC(LassoLars): + """Lasso model fit with Lars using BIC or AIC for model selection. 
+ + The optimization objective for Lasso is:: + + (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1 + + AIC is the Akaike information criterion [2]_ and BIC is the Bayes + Information criterion [3]_. Such criteria are useful to select the value + of the regularization parameter by making a trade-off between the + goodness of fit and the complexity of the model. A good model should + explain well the data while being simple. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + criterion : {'aic', 'bic'}, default='aic' + The type of criterion to use. + + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to false, no intercept will be used in calculations + (i.e. data is expected to be centered). + + verbose : bool or int, default=False + Sets the verbosity amount. + + precompute : bool, 'auto' or array-like, default='auto' + Whether to use a precomputed Gram matrix to speed up + calculations. If set to ``'auto'`` let us decide. The Gram + matrix can also be passed as argument. + + max_iter : int, default=500 + Maximum number of iterations to perform. Can be used for + early stopping. + + eps : float, default=np.finfo(float).eps + The machine-precision regularization in the computation of the + Cholesky diagonal factors. Increase this for very ill-conditioned + systems. Unlike the ``tol`` parameter in some iterative + optimization-based algorithms, this parameter does not control + the tolerance of the optimization. + + copy_X : bool, default=True + If True, X will be copied; else, it may be overwritten. + + positive : bool, default=False + Restrict coefficients to be >= 0. Be aware that you might want to + remove fit_intercept which is set True by default. + Under the positive restriction the model coefficients do not converge + to the ordinary-least-squares solution for small values of alpha. + Only coefficients up to the smallest alpha value (``alphas_[alphas_ > + 0.].min()`` when fit_path=True) reached by the stepwise Lars-Lasso + algorithm are typically in congruence with the solution of the + coordinate descent Lasso estimator. + As a consequence using LassoLarsIC only makes sense for problems where + a sparse solution is expected and/or reached. + + noise_variance : float, default=None + The estimated noise variance of the data. If `None`, an unbiased + estimate is computed by an OLS model. However, it is only possible + in the case where `n_samples > n_features + fit_intercept`. + + .. versionadded:: 1.1 + + Attributes + ---------- + coef_ : array-like of shape (n_features,) + parameter vector (w in the formulation formula) + + intercept_ : float + independent term in decision function. + + alpha_ : float + the alpha parameter chosen by the information criterion + + alphas_ : array-like of shape (n_alphas + 1,) or list of such arrays + Maximum of covariances (in absolute value) at each iteration. + ``n_alphas`` is either ``max_iter``, ``n_features`` or the + number of nodes in the path with ``alpha >= alpha_min``, whichever + is smaller. If a list, it will be of length `n_targets`. + + n_iter_ : int + number of iterations run by lars_path to find the grid of + alphas. + + criterion_ : array-like of shape (n_alphas,) + The value of the information criteria ('aic', 'bic') across all + alphas. The alpha which has the smallest information criterion is + chosen, as specified in [1]_. + + noise_variance_ : float + The estimated noise variance from the data used to compute the + criterion. + + .. 
versionadded:: 1.1 + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + lars_path : Compute Least Angle Regression or Lasso + path using LARS algorithm. + lasso_path : Compute Lasso path with coordinate descent. + Lasso : Linear Model trained with L1 prior as + regularizer (aka the Lasso). + LassoCV : Lasso linear model with iterative fitting + along a regularization path. + LassoLars : Lasso model fit with Least Angle Regression a.k.a. Lars. + LassoLarsCV: Cross-validated Lasso, using the LARS algorithm. + sklearn.decomposition.sparse_encode : Sparse coding. + + Notes + ----- + The number of degrees of freedom is computed as in [1]_. + + To have more details regarding the mathematical formulation of the + AIC and BIC criteria, please refer to :ref:`User Guide `. + + References + ---------- + .. [1] :arxiv:`Zou, Hui, Trevor Hastie, and Robert Tibshirani. + "On the degrees of freedom of the lasso." + The Annals of Statistics 35.5 (2007): 2173-2192. + <0712.0881>` + + .. [2] `Wikipedia entry on the Akaike information criterion + `_ + + .. [3] `Wikipedia entry on the Bayesian information criterion + `_ + + Examples + -------- + >>> from sklearn import linear_model + >>> reg = linear_model.LassoLarsIC(criterion='bic') + >>> X = [[-2, 2], [-1, 1], [0, 0], [1, 1], [2, 2]] + >>> y = [-2.2222, -1.1111, 0, -1.1111, -2.2222] + >>> reg.fit(X, y) + LassoLarsIC(criterion='bic') + >>> print(reg.coef_) + [ 0. -1.11] + """ + + _parameter_constraints: dict = { + **LassoLars._parameter_constraints, + "criterion": [StrOptions({"aic", "bic"})], + "noise_variance": [Interval(Real, 0, None, closed="left"), None], + } + + for parameter in ["jitter", "fit_path", "alpha", "random_state"]: + _parameter_constraints.pop(parameter) + + def __init__( + self, + criterion="aic", + *, + fit_intercept=True, + verbose=False, + precompute="auto", + max_iter=500, + eps=np.finfo(float).eps, + copy_X=True, + positive=False, + noise_variance=None, + ): + self.criterion = criterion + self.fit_intercept = fit_intercept + self.positive = positive + self.max_iter = max_iter + self.verbose = verbose + self.copy_X = copy_X + self.precompute = precompute + self.eps = eps + self.fit_path = True + self.noise_variance = noise_variance + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.target_tags.multi_output = False + return tags + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, copy_X=None): + """Fit the model using X, y as training data. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) + Target values. Will be cast to X's dtype if necessary. + + copy_X : bool, default=None + If provided, this parameter will override the choice + of copy_X made at instance creation. + If ``True``, X will be copied; else, it may be overwritten. + + Returns + ------- + self : object + Returns an instance of self. 
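The criterion computed in `fit` below has the standard AIC/BIC form described in this class docstring. A hand-worked sketch with made-up numbers (the noise variance is assumed known here, whereas the estimator can also estimate it via OLS):

    import numpy as np

    n_samples = 50
    residuals_sum_squares = np.array([40.0, 12.0, 10.5])  # one entry per alpha on the path
    degrees_of_freedom = np.array([0, 1, 2])              # number of non-zero coefficients
    noise_variance = 0.25                                 # assumed known in this sketch

    criterion_factor = np.log(n_samples)  # BIC; AIC would use 2 instead
    criterion = (
        n_samples * np.log(2 * np.pi * noise_variance)
        + residuals_sum_squares / noise_variance
        + criterion_factor * degrees_of_freedom
    )
    print(np.argmin(criterion))  # index of the alpha selected on the path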
+ """ + if copy_X is None: + copy_X = self.copy_X + X, y = validate_data(self, X, y, force_writeable=True, y_numeric=True) + + X, y, Xmean, ymean, Xstd = _preprocess_data( + X, y, fit_intercept=self.fit_intercept, copy=copy_X + ) + + Gram = self.precompute + + alphas_, _, coef_path_, self.n_iter_ = lars_path( + X, + y, + Gram=Gram, + copy_X=copy_X, + copy_Gram=True, + alpha_min=0.0, + method="lasso", + verbose=self.verbose, + max_iter=self.max_iter, + eps=self.eps, + return_n_iter=True, + positive=self.positive, + ) + + n_samples = X.shape[0] + + if self.criterion == "aic": + criterion_factor = 2 + elif self.criterion == "bic": + criterion_factor = log(n_samples) + else: + raise ValueError( + f"criterion should be either bic or aic, got {self.criterion!r}" + ) + + residuals = y[:, np.newaxis] - np.dot(X, coef_path_) + residuals_sum_squares = np.sum(residuals**2, axis=0) + degrees_of_freedom = np.zeros(coef_path_.shape[1], dtype=int) + for k, coef in enumerate(coef_path_.T): + mask = np.abs(coef) > np.finfo(coef.dtype).eps + if not np.any(mask): + continue + # get the number of degrees of freedom equal to: + # Xc = X[:, mask] + # Trace(Xc * inv(Xc.T, Xc) * Xc.T) ie the number of non-zero coefs + degrees_of_freedom[k] = np.sum(mask) + + self.alphas_ = alphas_ + + if self.noise_variance is None: + self.noise_variance_ = self._estimate_noise_variance( + X, y, positive=self.positive + ) + else: + self.noise_variance_ = self.noise_variance + + self.criterion_ = ( + n_samples * np.log(2 * np.pi * self.noise_variance_) + + residuals_sum_squares / self.noise_variance_ + + criterion_factor * degrees_of_freedom + ) + n_best = np.argmin(self.criterion_) + + self.alpha_ = alphas_[n_best] + self.coef_ = coef_path_[:, n_best] + self._set_intercept(Xmean, ymean, Xstd) + return self + + def _estimate_noise_variance(self, X, y, positive): + """Compute an estimate of the variance with an OLS model. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Data to be fitted by the OLS model. We expect the data to be + centered. + + y : ndarray of shape (n_samples,) + Associated target. + + positive : bool, default=False + Restrict coefficients to be >= 0. This should be inline with + the `positive` parameter from `LassoLarsIC`. + + Returns + ------- + noise_variance : float + An estimator of the noise variance of an OLS model. + """ + if X.shape[0] <= X.shape[1] + self.fit_intercept: + raise ValueError( + f"You are using {self.__class__.__name__} in the case where the number " + "of samples is smaller than the number of features. In this setting, " + "getting a good estimate for the variance of the noise is not " + "possible. Provide an estimate of the noise variance in the " + "constructor." 
+ ) + # X and y are already centered and we don't need to fit with an intercept + ols_model = LinearRegression(positive=positive, fit_intercept=False) + y_pred = ols_model.fit(X, y).predict(X) + return np.sum((y - y_pred) ** 2) / ( + X.shape[0] - X.shape[1] - self.fit_intercept + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_linear_loss.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_linear_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..9213008a19841f1707b57e3e47b7887ea29da4da --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_linear_loss.py @@ -0,0 +1,825 @@ +""" +Loss functions for linear models with raw_prediction = X @ coef +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numpy as np +from scipy import sparse + +from ..utils.extmath import squared_norm + + +def sandwich_dot(X, W): + """Compute the sandwich product X.T @ diag(W) @ X.""" + # TODO: This "sandwich product" is the main computational bottleneck for solvers + # that use the full hessian matrix. Here, thread parallelism would pay-off the + # most. + # While a dedicated Cython routine could exploit the symmetry, it is very hard to + # beat BLAS GEMM, even thought the latter cannot exploit the symmetry, unless one + # pays the price of taking square roots and implements + # sqrtWX = sqrt(W)[: None] * X + # return sqrtWX.T @ sqrtWX + # which (might) detect the symmetry and use BLAS SYRK under the hood. + n_samples = X.shape[0] + if sparse.issparse(X): + return ( + X.T @ sparse.dia_matrix((W, 0), shape=(n_samples, n_samples)) @ X + ).toarray() + else: + # np.einsum may use less memory but the following, using BLAS matrix + # multiplication (gemm), is by far faster. + WX = W[:, None] * X + return X.T @ WX + + +class LinearModelLoss: + """General class for loss functions with raw_prediction = X @ coef + intercept. + + Note that raw_prediction is also known as linear predictor. + + The loss is the average of per sample losses and includes a term for L2 + regularization:: + + loss = 1 / s_sum * sum_i s_i loss(y_i, X_i @ coef + intercept) + + 1/2 * l2_reg_strength * ||coef||_2^2 + + with sample weights s_i=1 if sample_weight=None and s_sum=sum_i s_i. + + Gradient and hessian, for simplicity without intercept, are:: + + gradient = 1 / s_sum * X.T @ loss.gradient + l2_reg_strength * coef + hessian = 1 / s_sum * X.T @ diag(loss.hessian) @ X + + l2_reg_strength * identity + + Conventions: + if fit_intercept: + n_dof = n_features + 1 + else: + n_dof = n_features + + if base_loss.is_multiclass: + coef.shape = (n_classes, n_dof) or ravelled (n_classes * n_dof,) + else: + coef.shape = (n_dof,) + + The intercept term is at the end of the coef array: + if base_loss.is_multiclass: + if coef.shape (n_classes, n_dof): + intercept = coef[:, -1] + if coef.shape (n_classes * n_dof,) + intercept = coef[n_features::n_dof] = coef[(n_dof-1)::n_dof] + intercept.shape = (n_classes,) + else: + intercept = coef[-1] + + Shape of gradient follows shape of coef. + gradient.shape = coef.shape + + But hessian (to make our lives simpler) are always 2-d: + if base_loss.is_multiclass: + hessian.shape = (n_classes * n_dof, n_classes * n_dof) + else: + hessian.shape = (n_dof, n_dof) + + Note: If coef has shape (n_classes * n_dof,), the 2d-array can be reconstructed as + + coef.reshape((n_classes, -1), order="F") + + The option order="F" makes coef[:, i] contiguous. 
This, in turn, makes the + coefficients without intercept, coef[:, :-1], contiguous and speeds up + matrix-vector computations. + + Note: If the average loss per sample is wanted instead of the sum of the loss per + sample, one can simply use a rescaled sample_weight such that + sum(sample_weight) = 1. + + Parameters + ---------- + base_loss : instance of class BaseLoss from sklearn._loss. + fit_intercept : bool + """ + + def __init__(self, base_loss, fit_intercept): + self.base_loss = base_loss + self.fit_intercept = fit_intercept + + def init_zero_coef(self, X, dtype=None): + """Allocate coef of correct shape with zeros. + + Parameters: + ----------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + dtype : data-type, default=None + Overrides the data type of coef. With dtype=None, coef will have the same + dtype as X. + + Returns + ------- + coef : ndarray of shape (n_dof,) or (n_classes, n_dof) + Coefficients of a linear model. + """ + n_features = X.shape[1] + n_classes = self.base_loss.n_classes + if self.fit_intercept: + n_dof = n_features + 1 + else: + n_dof = n_features + if self.base_loss.is_multiclass: + coef = np.zeros_like(X, shape=(n_classes, n_dof), dtype=dtype, order="F") + else: + coef = np.zeros_like(X, shape=n_dof, dtype=dtype) + return coef + + def weight_intercept(self, coef): + """Helper function to get coefficients and intercept. + + Parameters + ---------- + coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,) + Coefficients of a linear model. + If shape (n_classes * n_dof,), the classes of one feature are contiguous, + i.e. one reconstructs the 2d-array via + coef.reshape((n_classes, -1), order="F"). + + Returns + ------- + weights : ndarray of shape (n_features,) or (n_classes, n_features) + Coefficients without intercept term. + intercept : float or ndarray of shape (n_classes,) + Intercept terms. + """ + if not self.base_loss.is_multiclass: + if self.fit_intercept: + intercept = coef[-1] + weights = coef[:-1] + else: + intercept = 0.0 + weights = coef + else: + # reshape to (n_classes, n_dof) + if coef.ndim == 1: + weights = coef.reshape((self.base_loss.n_classes, -1), order="F") + else: + weights = coef + if self.fit_intercept: + intercept = weights[:, -1] + weights = weights[:, :-1] + else: + intercept = 0.0 + + return weights, intercept + + def weight_intercept_raw(self, coef, X): + """Helper function to get coefficients, intercept and raw_prediction. + + Parameters + ---------- + coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,) + Coefficients of a linear model. + If shape (n_classes * n_dof,), the classes of one feature are contiguous, + i.e. one reconstructs the 2d-array via + coef.reshape((n_classes, -1), order="F"). + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + Returns + ------- + weights : ndarray of shape (n_features,) or (n_classes, n_features) + Coefficients without intercept term. + intercept : float or ndarray of shape (n_classes,) + Intercept terms. 
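A tiny sketch of the multiclass coefficient layout handled by `weight_intercept` above (made-up shapes; the ravelled vector uses Fortran order, as described in the class docstring):

    import numpy as np

    n_classes, n_features = 3, 2
    n_dof = n_features + 1  # with an intercept
    coef_2d = np.arange(n_classes * n_dof, dtype=float).reshape(n_classes, n_dof)

    coef_1d = coef_2d.ravel(order="F")        # classes of one feature are contiguous
    weights = coef_1d.reshape((n_classes, -1), order="F")
    print(np.array_equal(weights, coef_2d))   # the 2d layout is recovered
    print(weights[:, -1])                     # intercept terms, one per class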
+ raw_prediction : ndarray of shape (n_samples,) or \ + (n_samples, n_classes) + """ + weights, intercept = self.weight_intercept(coef) + + if not self.base_loss.is_multiclass: + raw_prediction = X @ weights + intercept + else: + # weights has shape (n_classes, n_dof) + raw_prediction = X @ weights.T + intercept # ndarray, likely C-contiguous + + return weights, intercept, raw_prediction + + def l2_penalty(self, weights, l2_reg_strength): + """Compute L2 penalty term l2_reg_strength/2 *||w||_2^2.""" + norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights) + return 0.5 * l2_reg_strength * norm2_w + + def loss( + self, + coef, + X, + y, + sample_weight=None, + l2_reg_strength=0.0, + n_threads=1, + raw_prediction=None, + ): + """Compute the loss as weighted average over point-wise losses. + + Parameters + ---------- + coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,) + Coefficients of a linear model. + If shape (n_classes * n_dof,), the classes of one feature are contiguous, + i.e. one reconstructs the 2d-array via + coef.reshape((n_classes, -1), order="F"). + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + y : contiguous array of shape (n_samples,) + Observed, true target values. + sample_weight : None or contiguous array of shape (n_samples,), default=None + Sample weights. + l2_reg_strength : float, default=0.0 + L2 regularization strength + n_threads : int, default=1 + Number of OpenMP threads to use. + raw_prediction : C-contiguous array of shape (n_samples,) or array of \ + shape (n_samples, n_classes) + Raw prediction values (in link space). If provided, these are used. If + None, then raw_prediction = X @ coef + intercept is calculated. + + Returns + ------- + loss : float + Weighted average of losses per sample, plus penalty. + """ + if raw_prediction is None: + weights, intercept, raw_prediction = self.weight_intercept_raw(coef, X) + else: + weights, intercept = self.weight_intercept(coef) + + loss = self.base_loss.loss( + y_true=y, + raw_prediction=raw_prediction, + sample_weight=None, + n_threads=n_threads, + ) + loss = np.average(loss, weights=sample_weight) + + return loss + self.l2_penalty(weights, l2_reg_strength) + + def loss_gradient( + self, + coef, + X, + y, + sample_weight=None, + l2_reg_strength=0.0, + n_threads=1, + raw_prediction=None, + ): + """Computes the sum of loss and gradient w.r.t. coef. + + Parameters + ---------- + coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,) + Coefficients of a linear model. + If shape (n_classes * n_dof,), the classes of one feature are contiguous, + i.e. one reconstructs the 2d-array via + coef.reshape((n_classes, -1), order="F"). + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + y : contiguous array of shape (n_samples,) + Observed, true target values. + sample_weight : None or contiguous array of shape (n_samples,), default=None + Sample weights. + l2_reg_strength : float, default=0.0 + L2 regularization strength + n_threads : int, default=1 + Number of OpenMP threads to use. + raw_prediction : C-contiguous array of shape (n_samples,) or array of \ + shape (n_samples, n_classes) + Raw prediction values (in link space). If provided, these are used. If + None, then raw_prediction = X @ coef + intercept is calculated. + + Returns + ------- + loss : float + Weighted average of losses per sample, plus penalty. + + gradient : ndarray of shape coef.shape + The gradient of the loss. 
+ """ + (n_samples, n_features), n_classes = X.shape, self.base_loss.n_classes + n_dof = n_features + int(self.fit_intercept) + + if raw_prediction is None: + weights, intercept, raw_prediction = self.weight_intercept_raw(coef, X) + else: + weights, intercept = self.weight_intercept(coef) + + loss, grad_pointwise = self.base_loss.loss_gradient( + y_true=y, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + n_threads=n_threads, + ) + sw_sum = n_samples if sample_weight is None else np.sum(sample_weight) + loss = loss.sum() / sw_sum + loss += self.l2_penalty(weights, l2_reg_strength) + + grad_pointwise /= sw_sum + + if not self.base_loss.is_multiclass: + grad = np.empty_like(coef, dtype=weights.dtype) + grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights + if self.fit_intercept: + grad[-1] = grad_pointwise.sum() + else: + grad = np.empty((n_classes, n_dof), dtype=weights.dtype, order="F") + # grad_pointwise.shape = (n_samples, n_classes) + grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights + if self.fit_intercept: + grad[:, -1] = grad_pointwise.sum(axis=0) + if coef.ndim == 1: + grad = grad.ravel(order="F") + + return loss, grad + + def gradient( + self, + coef, + X, + y, + sample_weight=None, + l2_reg_strength=0.0, + n_threads=1, + raw_prediction=None, + ): + """Computes the gradient w.r.t. coef. + + Parameters + ---------- + coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,) + Coefficients of a linear model. + If shape (n_classes * n_dof,), the classes of one feature are contiguous, + i.e. one reconstructs the 2d-array via + coef.reshape((n_classes, -1), order="F"). + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + y : contiguous array of shape (n_samples,) + Observed, true target values. + sample_weight : None or contiguous array of shape (n_samples,), default=None + Sample weights. + l2_reg_strength : float, default=0.0 + L2 regularization strength + n_threads : int, default=1 + Number of OpenMP threads to use. + raw_prediction : C-contiguous array of shape (n_samples,) or array of \ + shape (n_samples, n_classes) + Raw prediction values (in link space). If provided, these are used. If + None, then raw_prediction = X @ coef + intercept is calculated. + + Returns + ------- + gradient : ndarray of shape coef.shape + The gradient of the loss. 
+ """ + (n_samples, n_features), n_classes = X.shape, self.base_loss.n_classes + n_dof = n_features + int(self.fit_intercept) + + if raw_prediction is None: + weights, intercept, raw_prediction = self.weight_intercept_raw(coef, X) + else: + weights, intercept = self.weight_intercept(coef) + + grad_pointwise = self.base_loss.gradient( + y_true=y, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + n_threads=n_threads, + ) + sw_sum = n_samples if sample_weight is None else np.sum(sample_weight) + grad_pointwise /= sw_sum + + if not self.base_loss.is_multiclass: + grad = np.empty_like(coef, dtype=weights.dtype) + grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights + if self.fit_intercept: + grad[-1] = grad_pointwise.sum() + return grad + else: + grad = np.empty((n_classes, n_dof), dtype=weights.dtype, order="F") + # gradient.shape = (n_samples, n_classes) + grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights + if self.fit_intercept: + grad[:, -1] = grad_pointwise.sum(axis=0) + if coef.ndim == 1: + return grad.ravel(order="F") + else: + return grad + + def gradient_hessian( + self, + coef, + X, + y, + sample_weight=None, + l2_reg_strength=0.0, + n_threads=1, + gradient_out=None, + hessian_out=None, + raw_prediction=None, + ): + """Computes gradient and hessian w.r.t. coef. + + Parameters + ---------- + coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,) + Coefficients of a linear model. + If shape (n_classes * n_dof,), the classes of one feature are contiguous, + i.e. one reconstructs the 2d-array via + coef.reshape((n_classes, -1), order="F"). + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + y : contiguous array of shape (n_samples,) + Observed, true target values. + sample_weight : None or contiguous array of shape (n_samples,), default=None + Sample weights. + l2_reg_strength : float, default=0.0 + L2 regularization strength + n_threads : int, default=1 + Number of OpenMP threads to use. + gradient_out : None or ndarray of shape coef.shape + A location into which the gradient is stored. If None, a new array + might be created. + hessian_out : None or ndarray of shape (n_dof, n_dof) or \ + (n_classes * n_dof, n_classes * n_dof) + A location into which the hessian is stored. If None, a new array + might be created. + raw_prediction : C-contiguous array of shape (n_samples,) or array of \ + shape (n_samples, n_classes) + Raw prediction values (in link space). If provided, these are used. If + None, then raw_prediction = X @ coef + intercept is calculated. + + Returns + ------- + gradient : ndarray of shape coef.shape + The gradient of the loss. + + hessian : ndarray of shape (n_dof, n_dof) or \ + (n_classes, n_dof, n_dof, n_classes) + Hessian matrix. + + hessian_warning : bool + True if pointwise hessian has more than 25% of its elements non-positive. + """ + (n_samples, n_features), n_classes = X.shape, self.base_loss.n_classes + n_dof = n_features + int(self.fit_intercept) + if raw_prediction is None: + weights, intercept, raw_prediction = self.weight_intercept_raw(coef, X) + else: + weights, intercept = self.weight_intercept(coef) + sw_sum = n_samples if sample_weight is None else np.sum(sample_weight) + + # Allocate gradient. + if gradient_out is None: + grad = np.empty_like(coef, dtype=weights.dtype, order="F") + elif gradient_out.shape != coef.shape: + raise ValueError( + f"gradient_out is required to have shape coef.shape = {coef.shape}; " + f"got {gradient_out.shape}." 
+ ) + elif self.base_loss.is_multiclass and not gradient_out.flags.f_contiguous: + raise ValueError("gradient_out must be F-contiguous.") + else: + grad = gradient_out + # Allocate hessian. + n = coef.size # for multinomial this equals n_dof * n_classes + if hessian_out is None: + hess = np.empty((n, n), dtype=weights.dtype) + elif hessian_out.shape != (n, n): + raise ValueError( + f"hessian_out is required to have shape ({n, n}); got " + f"{hessian_out.shape=}." + ) + elif self.base_loss.is_multiclass and ( + not hessian_out.flags.c_contiguous and not hessian_out.flags.f_contiguous + ): + raise ValueError("hessian_out must be contiguous.") + else: + hess = hessian_out + + if not self.base_loss.is_multiclass: + grad_pointwise, hess_pointwise = self.base_loss.gradient_hessian( + y_true=y, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + n_threads=n_threads, + ) + grad_pointwise /= sw_sum + hess_pointwise /= sw_sum + + # For non-canonical link functions and far away from the optimum, the + # pointwise hessian can be negative. We take care that 75% of the hessian + # entries are positive. + hessian_warning = ( + np.average(hess_pointwise <= 0, weights=sample_weight) > 0.25 + ) + hess_pointwise = np.abs(hess_pointwise) + + grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights + if self.fit_intercept: + grad[-1] = grad_pointwise.sum() + + if hessian_warning: + # Exit early without computing the hessian. + return grad, hess, hessian_warning + + hess[:n_features, :n_features] = sandwich_dot(X, hess_pointwise) + + if l2_reg_strength > 0: + # The L2 penalty enters the Hessian on the diagonal only. To add those + # terms, we use a flattened view of the array. + order = "C" if hess.flags.c_contiguous else "F" + hess.reshape(-1, order=order)[: (n_features * n_dof) : (n_dof + 1)] += ( + l2_reg_strength + ) + + if self.fit_intercept: + # With intercept included as added column to X, the hessian becomes + # hess = (X, 1)' @ diag(h) @ (X, 1) + # = (X' @ diag(h) @ X, X' @ h) + # ( h @ X, sum(h)) + # The left upper part has already been filled, it remains to compute + # the last row and the last column. + Xh = X.T @ hess_pointwise + hess[:-1, -1] = Xh + hess[-1, :-1] = Xh + hess[-1, -1] = hess_pointwise.sum() + else: + # Here we may safely assume HalfMultinomialLoss aka categorical + # cross-entropy. + # HalfMultinomialLoss computes only the diagonal part of the hessian, i.e. + # diagonal in the classes. Here, we want the full hessian. Therefore, we + # call gradient_proba. + grad_pointwise, proba = self.base_loss.gradient_proba( + y_true=y, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + n_threads=n_threads, + ) + grad_pointwise /= sw_sum + grad = grad.reshape((n_classes, n_dof), order="F") + grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights + if self.fit_intercept: + grad[:, -1] = grad_pointwise.sum(axis=0) + if coef.ndim == 1: + grad = grad.ravel(order="F") + + # The full hessian matrix, i.e. not only the diagonal part, dropping most + # indices, is given by: + # + # hess = X' @ h @ X + # + # Here, h is a priori a 4-dimensional matrix of shape + # (n_samples, n_samples, n_classes, n_classes). It is diagonal its first + # two dimensions (the ones with n_samples), i.e. it is + # effectively a 3-dimensional matrix (n_samples, n_classes, n_classes). + # + # h = diag(p) - p' p + # + # or with indices k and l for classes + # + # h_kl = p_k * delta_kl - p_k * p_l + # + # with p_k the (predicted) probability for class k. 
Only the dimension in + # n_samples multiplies with X. + # For 3 classes and n_samples = 1, this looks like ("@" is a bit misused + # here): + # + # hess = X' @ (h00 h10 h20) @ X + # (h10 h11 h12) + # (h20 h12 h22) + # = (X' @ diag(h00) @ X, X' @ diag(h10), X' @ diag(h20)) + # (X' @ diag(h10) @ X, X' @ diag(h11), X' @ diag(h12)) + # (X' @ diag(h20) @ X, X' @ diag(h12), X' @ diag(h22)) + # + # Now coef of shape (n_classes * n_dof) is contiguous in n_classes. + # Therefore, we want the hessian to follow this convention, too, i.e. + # hess[:n_classes, :n_classes] = (x0' @ h00 @ x0, x0' @ h10 @ x0, ..) + # (x0' @ h10 @ x0, x0' @ h11 @ x0, ..) + # (x0' @ h20 @ x0, x0' @ h12 @ x0, ..) + # is the first feature, x0, for all classes. In our implementation, we + # still want to take advantage of BLAS "X.T @ X". Therefore, we have some + # index/slicing battle to fight. + if sample_weight is not None: + sw = sample_weight / sw_sum + else: + sw = 1.0 / sw_sum + + for k in range(n_classes): + # Diagonal terms (in classes) hess_kk. + # Note that this also writes to some of the lower triangular part. + h = proba[:, k] * (1 - proba[:, k]) * sw + hess[ + k : n_classes * n_features : n_classes, + k : n_classes * n_features : n_classes, + ] = sandwich_dot(X, h) + if self.fit_intercept: + # See above in the non multiclass case. + Xh = X.T @ h + hess[ + k : n_classes * n_features : n_classes, + n_classes * n_features + k, + ] = Xh + hess[ + n_classes * n_features + k, + k : n_classes * n_features : n_classes, + ] = Xh + hess[n_classes * n_features + k, n_classes * n_features + k] = ( + h.sum() + ) + # Off diagonal terms (in classes) hess_kl. + for l in range(k + 1, n_classes): + # Upper triangle (in classes). + h = -proba[:, k] * proba[:, l] * sw + hess[ + k : n_classes * n_features : n_classes, + l : n_classes * n_features : n_classes, + ] = sandwich_dot(X, h) + if self.fit_intercept: + Xh = X.T @ h + hess[ + k : n_classes * n_features : n_classes, + n_classes * n_features + l, + ] = Xh + hess[ + n_classes * n_features + k, + l : n_classes * n_features : n_classes, + ] = Xh + hess[n_classes * n_features + k, n_classes * n_features + l] = ( + h.sum() + ) + # Fill lower triangle (in classes). + hess[l::n_classes, k::n_classes] = hess[k::n_classes, l::n_classes] + + if l2_reg_strength > 0: + # See above in the non multiclass case. + order = "C" if hess.flags.c_contiguous else "F" + hess.reshape(-1, order=order)[ + : (n_classes**2 * n_features * n_dof) : (n_classes * n_dof + 1) + ] += l2_reg_strength + + # The pointwise hessian is always non-negative for the multinomial loss. + hessian_warning = False + + return grad, hess, hessian_warning + + def gradient_hessian_product( + self, coef, X, y, sample_weight=None, l2_reg_strength=0.0, n_threads=1 + ): + """Computes gradient and hessp (hessian product function) w.r.t. coef. + + Parameters + ---------- + coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,) + Coefficients of a linear model. + If shape (n_classes * n_dof,), the classes of one feature are contiguous, + i.e. one reconstructs the 2d-array via + coef.reshape((n_classes, -1), order="F"). + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + y : contiguous array of shape (n_samples,) + Observed, true target values. + sample_weight : None or contiguous array of shape (n_samples,), default=None + Sample weights. + l2_reg_strength : float, default=0.0 + L2 regularization strength + n_threads : int, default=1 + Number of OpenMP threads to use. 
+ + Returns + ------- + gradient : ndarray of shape coef.shape + The gradient of the loss. + + hessp : callable + Function that takes in a vector input of shape of gradient and + and returns matrix-vector product with hessian. + """ + (n_samples, n_features), n_classes = X.shape, self.base_loss.n_classes + n_dof = n_features + int(self.fit_intercept) + weights, intercept, raw_prediction = self.weight_intercept_raw(coef, X) + sw_sum = n_samples if sample_weight is None else np.sum(sample_weight) + + if not self.base_loss.is_multiclass: + grad_pointwise, hess_pointwise = self.base_loss.gradient_hessian( + y_true=y, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + n_threads=n_threads, + ) + grad_pointwise /= sw_sum + hess_pointwise /= sw_sum + grad = np.empty_like(coef, dtype=weights.dtype) + grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights + if self.fit_intercept: + grad[-1] = grad_pointwise.sum() + + # Precompute as much as possible: hX, hX_sum and hessian_sum + hessian_sum = hess_pointwise.sum() + if sparse.issparse(X): + hX = ( + sparse.dia_matrix((hess_pointwise, 0), shape=(n_samples, n_samples)) + @ X + ) + else: + hX = hess_pointwise[:, np.newaxis] * X + + if self.fit_intercept: + # Calculate the double derivative with respect to intercept. + # Note: In case hX is sparse, hX.sum is a matrix object. + hX_sum = np.squeeze(np.asarray(hX.sum(axis=0))) + # prevent squeezing to zero-dim array if n_features == 1 + hX_sum = np.atleast_1d(hX_sum) + + # With intercept included and l2_reg_strength = 0, hessp returns + # res = (X, 1)' @ diag(h) @ (X, 1) @ s + # = (X, 1)' @ (hX @ s[:n_features], sum(h) * s[-1]) + # res[:n_features] = X' @ hX @ s[:n_features] + sum(h) * s[-1] + # res[-1] = 1' @ hX @ s[:n_features] + sum(h) * s[-1] + def hessp(s): + ret = np.empty_like(s) + if sparse.issparse(X): + ret[:n_features] = X.T @ (hX @ s[:n_features]) + else: + ret[:n_features] = np.linalg.multi_dot([X.T, hX, s[:n_features]]) + ret[:n_features] += l2_reg_strength * s[:n_features] + + if self.fit_intercept: + ret[:n_features] += s[-1] * hX_sum + ret[-1] = hX_sum @ s[:n_features] + hessian_sum * s[-1] + return ret + + else: + # Here we may safely assume HalfMultinomialLoss aka categorical + # cross-entropy. + # HalfMultinomialLoss computes only the diagonal part of the hessian, i.e. + # diagonal in the classes. Here, we want the matrix-vector product of the + # full hessian. Therefore, we call gradient_proba. + grad_pointwise, proba = self.base_loss.gradient_proba( + y_true=y, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + n_threads=n_threads, + ) + grad_pointwise /= sw_sum + grad = np.empty((n_classes, n_dof), dtype=weights.dtype, order="F") + grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights + if self.fit_intercept: + grad[:, -1] = grad_pointwise.sum(axis=0) + + # Full hessian-vector product, i.e. not only the diagonal part of the + # hessian. 
Derivation with some index battle for input vector s: + # - sample index i + # - feature indices j, m + # - class indices k, l + # - 1_{k=l} is one if k=l else 0 + # - p_i_k is the (predicted) probability that sample i belongs to class k + # for all i: sum_k p_i_k = 1 + # - s_l_m is input vector for class l and feature m + # - X' = X transposed + # + # Note: Hessian with dropping most indices is just: + # X' @ p_k (1(k=l) - p_l) @ X + # + # result_{k j} = sum_{i, l, m} Hessian_{i, k j, m l} * s_l_m + # = sum_{i, l, m} (X')_{ji} * p_i_k * (1_{k=l} - p_i_l) + # * X_{im} s_l_m + # = sum_{i, m} (X')_{ji} * p_i_k + # * (X_{im} * s_k_m - sum_l p_i_l * X_{im} * s_l_m) + # + # See also https://github.com/scikit-learn/scikit-learn/pull/3646#discussion_r17461411 + def hessp(s): + s = s.reshape((n_classes, -1), order="F") # shape = (n_classes, n_dof) + if self.fit_intercept: + s_intercept = s[:, -1] + s = s[:, :-1] # shape = (n_classes, n_features) + else: + s_intercept = 0 + tmp = X @ s.T + s_intercept # X_{im} * s_k_m + tmp += (-proba * tmp).sum(axis=1)[:, np.newaxis] # - sum_l .. + tmp *= proba # * p_i_k + if sample_weight is not None: + tmp *= sample_weight[:, np.newaxis] + # hess_prod = empty_like(grad), but we ravel grad below and this + # function is run after that. + hess_prod = np.empty((n_classes, n_dof), dtype=weights.dtype, order="F") + hess_prod[:, :n_features] = (tmp.T @ X) / sw_sum + l2_reg_strength * s + if self.fit_intercept: + hess_prod[:, -1] = tmp.sum(axis=0) / sw_sum + if coef.ndim == 1: + return hess_prod.ravel(order="F") + else: + return hess_prod + + if coef.ndim == 1: + return grad.ravel(order="F"), hessp + + return grad, hessp diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py new file mode 100644 index 0000000000000000000000000000000000000000..35cfcee7ce7d16e4dcd57ada0d51db87bfa8c69f --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py @@ -0,0 +1,2327 @@ +""" +Logistic Regression +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numbers +import warnings +from numbers import Integral, Real + +import numpy as np +from joblib import effective_n_jobs +from scipy import optimize + +from sklearn.metrics import get_scorer_names + +from .._loss.loss import HalfBinomialLoss, HalfMultinomialLoss +from ..base import _fit_context +from ..metrics import get_scorer +from ..model_selection import check_cv +from ..preprocessing import LabelBinarizer, LabelEncoder +from ..svm._base import _fit_liblinear +from ..utils import ( + Bunch, + check_array, + check_consistent_length, + check_random_state, + compute_class_weight, +) +from ..utils._param_validation import Hidden, Interval, StrOptions +from ..utils.extmath import row_norms, softmax +from ..utils.fixes import _get_additional_lbfgs_options_dict +from ..utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + _raise_for_params, + _routing_enabled, + process_routing, +) +from ..utils.multiclass import check_classification_targets +from ..utils.optimize import _check_optimize_result, _newton_cg +from ..utils.parallel import Parallel, delayed +from ..utils.validation import ( + _check_method_params, + _check_sample_weight, + check_is_fitted, + validate_data, +) +from ._base import BaseEstimator, LinearClassifierMixin, SparseCoefMixin +from ._glm.glm import NewtonCholeskySolver +from ._linear_loss import LinearModelLoss +from ._sag import 
sag_solver + +_LOGISTIC_SOLVER_CONVERGENCE_MSG = ( + "Please also refer to the documentation for alternative solver options:\n" + " https://scikit-learn.org/stable/modules/linear_model.html" + "#logistic-regression" +) + + +def _check_solver(solver, penalty, dual): + if solver not in ["liblinear", "saga"] and penalty not in ("l2", None): + raise ValueError( + f"Solver {solver} supports only 'l2' or None penalties, got {penalty} " + "penalty." + ) + if solver != "liblinear" and dual: + raise ValueError(f"Solver {solver} supports only dual=False, got dual={dual}") + + if penalty == "elasticnet" and solver != "saga": + raise ValueError( + f"Only 'saga' solver supports elasticnet penalty, got solver={solver}." + ) + + if solver == "liblinear" and penalty is None: + raise ValueError("penalty=None is not supported for the liblinear solver") + + return solver + + +def _check_multi_class(multi_class, solver, n_classes): + """Computes the multi class type, either "multinomial" or "ovr". + + For `n_classes` > 2 and a solver that supports it, returns "multinomial". + For all other cases, in particular binary classification, return "ovr". + """ + if multi_class == "auto": + if solver in ("liblinear",): + multi_class = "ovr" + elif n_classes > 2: + multi_class = "multinomial" + else: + multi_class = "ovr" + if multi_class == "multinomial" and solver in ("liblinear",): + raise ValueError("Solver %s does not support a multinomial backend." % solver) + return multi_class + + +def _logistic_regression_path( + X, + y, + pos_class=None, + Cs=10, + fit_intercept=True, + max_iter=100, + tol=1e-4, + verbose=0, + solver="lbfgs", + coef=None, + class_weight=None, + dual=False, + penalty="l2", + intercept_scaling=1.0, + multi_class="auto", + random_state=None, + check_input=True, + max_squared_sum=None, + sample_weight=None, + l1_ratio=None, + n_threads=1, +): + """Compute a Logistic Regression model for a list of regularization + parameters. + + This is an implementation that uses the result of the previous model + to speed up computations along the set of solutions, making it faster + than sequentially calling LogisticRegression for the different parameters. + Note that there will be no speedup with liblinear solver, since it does + not handle warm-starting. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data. + + y : array-like of shape (n_samples,) or (n_samples, n_targets) + Input data, target values. + + pos_class : int, default=None + The class with respect to which we perform a one-vs-all fit. + If None, then it is assumed that the given problem is binary. + + Cs : int or array-like of shape (n_cs,), default=10 + List of values for the regularization parameter or integer specifying + the number of regularization parameters that should be used. In this + case, the parameters will be chosen in a logarithmic scale between + 1e-4 and 1e4. + + fit_intercept : bool, default=True + Whether to fit an intercept for the model. In this case the shape of + the returned array is (n_cs, n_features + 1). + + max_iter : int, default=100 + Maximum number of iterations for the solver. + + tol : float, default=1e-4 + Stopping criterion. For the newton-cg and lbfgs solvers, the iteration + will stop when ``max{|g_i | i = 1, ..., n} <= tol`` + where ``g_i`` is the i-th component of the gradient. + + verbose : int, default=0 + For the liblinear and lbfgs solvers set verbose to any positive + number for verbosity. 
+ + solver : {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, \ + default='lbfgs' + Numerical solver to use. + + coef : array-like of shape (n_features,), default=None + Initialization value for coefficients of logistic regression. + Useless for liblinear solver. + + class_weight : dict or 'balanced', default=None + Weights associated with classes in the form ``{class_label: weight}``. + If not given, all classes are supposed to have weight one. + + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))``. + + Note that these weights will be multiplied with sample_weight (passed + through the fit method) if sample_weight is specified. + + dual : bool, default=False + Dual or primal formulation. Dual formulation is only implemented for + l2 penalty with liblinear solver. Prefer dual=False when + n_samples > n_features. + + penalty : {'l1', 'l2', 'elasticnet'}, default='l2' + Used to specify the norm used in the penalization. The 'newton-cg', + 'sag' and 'lbfgs' solvers support only l2 penalties. 'elasticnet' is + only supported by the 'saga' solver. + + intercept_scaling : float, default=1. + Useful only when the solver `liblinear` is used + and `self.fit_intercept` is set to `True`. In this case, `x` becomes + `[x, self.intercept_scaling]`, + i.e. a "synthetic" feature with constant value equal to + `intercept_scaling` is appended to the instance vector. + The intercept becomes + ``intercept_scaling * synthetic_feature_weight``. + + .. note:: + The synthetic feature weight is subject to L1 or L2 + regularization as all other features. + To lessen the effect of regularization on synthetic feature weight + (and therefore on the intercept) `intercept_scaling` has to be increased. + + multi_class : {'ovr', 'multinomial', 'auto'}, default='auto' + If the option chosen is 'ovr', then a binary problem is fit for each + label. For 'multinomial' the loss minimised is the multinomial loss fit + across the entire probability distribution, *even when the data is + binary*. 'multinomial' is unavailable when solver='liblinear'. + 'auto' selects 'ovr' if the data is binary, or if solver='liblinear', + and otherwise selects 'multinomial'. + + .. versionadded:: 0.18 + Stochastic Average Gradient descent solver for 'multinomial' case. + .. versionchanged:: 0.22 + Default changed from 'ovr' to 'auto' in 0.22. + + random_state : int, RandomState instance, default=None + Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the + data. See :term:`Glossary ` for details. + + check_input : bool, default=True + If False, the input arrays X and y will not be checked. + + max_squared_sum : float, default=None + Maximum squared sum of X over samples. Used only in SAG solver. + If None, it will be computed, going through all the samples. + The value should be precomputed to speed up cross validation. + + sample_weight : array-like of shape(n_samples,), default=None + Array of weights that are assigned to individual samples. + If not provided, then each sample is given unit weight. + + l1_ratio : float, default=None + The Elastic-Net mixing parameter, with ``0 <= l1_ratio <= 1``. Only + used if ``penalty='elasticnet'``. Setting ``l1_ratio=0`` is equivalent + to using ``penalty='l2'``, while setting ``l1_ratio=1`` is equivalent + to using ``penalty='l1'``. For ``0 < l1_ratio <1``, the penalty is a + combination of L1 and L2. 
+ + n_threads : int, default=1 + Number of OpenMP threads to use. + + Returns + ------- + coefs : ndarray of shape (n_cs, n_features) or (n_cs, n_features + 1) + List of coefficients for the Logistic Regression model. If + fit_intercept is set to True then the second dimension will be + n_features + 1, where the last item represents the intercept. For + ``multiclass='multinomial'``, the shape is (n_classes, n_cs, + n_features) or (n_classes, n_cs, n_features + 1). + + Cs : ndarray + Grid of Cs used for cross-validation. + + n_iter : array of shape (n_cs,) + Actual number of iteration for each Cs. + + Notes + ----- + You might get slightly different results with the solver liblinear than + with the others since this uses LIBLINEAR which penalizes the intercept. + + .. versionchanged:: 0.19 + The "copy" parameter was removed. + """ + if isinstance(Cs, numbers.Integral): + Cs = np.logspace(-4, 4, Cs) + + solver = _check_solver(solver, penalty, dual) + + # Preprocessing. + if check_input: + X = check_array( + X, + accept_sparse="csr", + dtype=np.float64, + accept_large_sparse=solver not in ["liblinear", "sag", "saga"], + ) + y = check_array(y, ensure_2d=False, dtype=None) + check_consistent_length(X, y) + n_samples, n_features = X.shape + + classes = np.unique(y) + random_state = check_random_state(random_state) + + multi_class = _check_multi_class(multi_class, solver, len(classes)) + if pos_class is None and multi_class != "multinomial": + if classes.size > 2: + raise ValueError("To fit OvR, use the pos_class argument") + # np.unique(y) gives labels in sorted order. + pos_class = classes[1] + + if sample_weight is not None or class_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype, copy=True) + + # If class_weights is a dict (provided by the user), the weights + # are assigned to the original labels. If it is "balanced", then + # the class_weights are assigned after masking the labels with a OvR. + le = LabelEncoder() + if isinstance(class_weight, dict) or ( + multi_class == "multinomial" and class_weight is not None + ): + class_weight_ = compute_class_weight( + class_weight, classes=classes, y=y, sample_weight=sample_weight + ) + sample_weight *= class_weight_[le.fit_transform(y)] + + # For doing a ovr, we need to mask the labels first. For the + # multinomial case this is not necessary. + if multi_class == "ovr": + w0 = np.zeros(n_features + int(fit_intercept), dtype=X.dtype) + mask = y == pos_class + y_bin = np.ones(y.shape, dtype=X.dtype) + if solver == "liblinear": + mask_classes = np.array([-1, 1]) + y_bin[~mask] = -1.0 + else: + # HalfBinomialLoss, used for those solvers, represents y in [0, 1] instead + # of in [-1, 1]. + mask_classes = np.array([0, 1]) + y_bin[~mask] = 0.0 + + # for compute_class_weight + if class_weight == "balanced": + class_weight_ = compute_class_weight( + class_weight, + classes=mask_classes, + y=y_bin, + sample_weight=sample_weight, + ) + sample_weight *= class_weight_[le.fit_transform(y_bin)] + + else: + if solver in ["sag", "saga", "lbfgs", "newton-cg", "newton-cholesky"]: + # SAG, lbfgs, newton-cg and newton-cholesky multinomial solvers need + # LabelEncoder, not LabelBinarizer, i.e. y as a 1d-array of integers. + # LabelEncoder also saves memory compared to LabelBinarizer, especially + # when n_classes is large. + le = LabelEncoder() + Y_multi = le.fit_transform(y).astype(X.dtype, copy=False) + else: + # For liblinear solver, apply LabelBinarizer, i.e. y is one-hot encoded. 
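+            # A hedged illustration (not executed here): LabelBinarizer turns
+            # class labels into one-hot rows, e.g.
+            #
+            #   from sklearn.preprocessing import LabelBinarizer
+            #   LabelBinarizer().fit_transform([0, 1, 2, 1])
+            #   # -> [[1, 0, 0], [0, 1, 0], [0, 0, 1], [0, 1, 0]]
+            #
+            # For a binary y it returns a single 0/1 column, which is why the
+            # hstack below rebuilds the two-column encoding.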
+ lbin = LabelBinarizer() + Y_multi = lbin.fit_transform(y) + if Y_multi.shape[1] == 1: + Y_multi = np.hstack([1 - Y_multi, Y_multi]) + + w0 = np.zeros( + (classes.size, n_features + int(fit_intercept)), order="F", dtype=X.dtype + ) + + # IMPORTANT NOTE: + # All solvers relying on LinearModelLoss need to scale the penalty with n_samples + # or the sum of sample weights because the implemented logistic regression + # objective here is (unfortunately) + # C * sum(pointwise_loss) + penalty + # instead of (as LinearModelLoss does) + # mean(pointwise_loss) + 1/C * penalty + if solver in ["lbfgs", "newton-cg", "newton-cholesky"]: + # This needs to be calculated after sample_weight is multiplied by + # class_weight. It is even tested that passing class_weight is equivalent to + # passing sample_weights according to class_weight. + sw_sum = n_samples if sample_weight is None else np.sum(sample_weight) + + if coef is not None: + # it must work both giving the bias term and not + if multi_class == "ovr": + if coef.size not in (n_features, w0.size): + raise ValueError( + "Initialization coef is of shape %d, expected shape %d or %d" + % (coef.size, n_features, w0.size) + ) + w0[: coef.size] = coef + else: + # For binary problems coef.shape[0] should be 1, otherwise it + # should be classes.size. + n_classes = classes.size + if n_classes == 2: + n_classes = 1 + + if coef.shape[0] != n_classes or coef.shape[1] not in ( + n_features, + n_features + 1, + ): + raise ValueError( + "Initialization coef is of shape (%d, %d), expected " + "shape (%d, %d) or (%d, %d)" + % ( + coef.shape[0], + coef.shape[1], + classes.size, + n_features, + classes.size, + n_features + 1, + ) + ) + + if n_classes == 1: + w0[0, : coef.shape[1]] = -coef + w0[1, : coef.shape[1]] = coef + else: + w0[:, : coef.shape[1]] = coef + + if multi_class == "multinomial": + if solver in ["lbfgs", "newton-cg", "newton-cholesky"]: + # scipy.optimize.minimize and newton-cg accept only ravelled parameters, + # i.e. 1d-arrays. LinearModelLoss expects classes to be contiguous and + # reconstructs the 2d-array via w0.reshape((n_classes, -1), order="F"). + # As w0 is F-contiguous, ravel(order="F") also avoids a copy. 
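+            # A minimal sketch of this layout (illustrative values only): with
+            # n_classes=3 and n_dof=2,
+            #
+            #   w0 = np.arange(6.0).reshape(3, 2)   # [[0, 1], [2, 3], [4, 5]]
+            #   w0.ravel(order="F")                 # [0, 2, 4, 1, 3, 5]
+            #   w0.ravel(order="F").reshape((3, -1), order="F")  # recovers w0
+            #
+            # i.e. the three class coefficients of the first column come first,
+            # then those of the next column, matching the class-contiguous
+            # layout LinearModelLoss expects.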
+ w0 = w0.ravel(order="F") + loss = LinearModelLoss( + base_loss=HalfMultinomialLoss(n_classes=classes.size), + fit_intercept=fit_intercept, + ) + target = Y_multi + if solver == "lbfgs": + func = loss.loss_gradient + elif solver == "newton-cg": + func = loss.loss + grad = loss.gradient + hess = loss.gradient_hessian_product # hess = [gradient, hessp] + warm_start_sag = {"coef": w0.T} + else: + target = y_bin + if solver == "lbfgs": + loss = LinearModelLoss( + base_loss=HalfBinomialLoss(), fit_intercept=fit_intercept + ) + func = loss.loss_gradient + elif solver == "newton-cg": + loss = LinearModelLoss( + base_loss=HalfBinomialLoss(), fit_intercept=fit_intercept + ) + func = loss.loss + grad = loss.gradient + hess = loss.gradient_hessian_product # hess = [gradient, hessp] + elif solver == "newton-cholesky": + loss = LinearModelLoss( + base_loss=HalfBinomialLoss(), fit_intercept=fit_intercept + ) + warm_start_sag = {"coef": np.expand_dims(w0, axis=1)} + + coefs = list() + n_iter = np.zeros(len(Cs), dtype=np.int32) + for i, C in enumerate(Cs): + if solver == "lbfgs": + l2_reg_strength = 1.0 / (C * sw_sum) + iprint = [-1, 50, 1, 100, 101][ + np.searchsorted(np.array([0, 1, 2, 3]), verbose) + ] + opt_res = optimize.minimize( + func, + w0, + method="L-BFGS-B", + jac=True, + args=(X, target, sample_weight, l2_reg_strength, n_threads), + options={ + "maxiter": max_iter, + "maxls": 50, # default is 20 + "gtol": tol, + "ftol": 64 * np.finfo(float).eps, + **_get_additional_lbfgs_options_dict("iprint", iprint), + }, + ) + n_iter_i = _check_optimize_result( + solver, + opt_res, + max_iter, + extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG, + ) + w0, loss = opt_res.x, opt_res.fun + elif solver == "newton-cg": + l2_reg_strength = 1.0 / (C * sw_sum) + args = (X, target, sample_weight, l2_reg_strength, n_threads) + w0, n_iter_i = _newton_cg( + grad_hess=hess, + func=func, + grad=grad, + x0=w0, + args=args, + maxiter=max_iter, + tol=tol, + verbose=verbose, + ) + elif solver == "newton-cholesky": + l2_reg_strength = 1.0 / (C * sw_sum) + sol = NewtonCholeskySolver( + coef=w0, + linear_loss=loss, + l2_reg_strength=l2_reg_strength, + tol=tol, + max_iter=max_iter, + n_threads=n_threads, + verbose=verbose, + ) + w0 = sol.solve(X=X, y=target, sample_weight=sample_weight) + n_iter_i = sol.iteration + elif solver == "liblinear": + if len(classes) > 2: + warnings.warn( + "Using the 'liblinear' solver for multiclass classification is " + "deprecated. An error will be raised in 1.8. Either use another " + "solver which supports the multinomial loss or wrap the estimator " + "in a OneVsRestClassifier to keep applying a one-versus-rest " + "scheme.", + FutureWarning, + ) + ( + coef_, + intercept_, + n_iter_i, + ) = _fit_liblinear( + X, + target, + C, + fit_intercept, + intercept_scaling, + None, + penalty, + dual, + verbose, + max_iter, + tol, + random_state, + sample_weight=sample_weight, + ) + if fit_intercept: + w0 = np.concatenate([coef_.ravel(), intercept_]) + else: + w0 = coef_.ravel() + # n_iter_i is an array for each class. However, `target` is always encoded + # in {-1, 1}, so we only take the first element of n_iter_i. 
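+            # Hedged illustration: with the binary-encoded target, n_iter_i is
+            # a one-element array, e.g. np.array([37], dtype=np.int32), and
+            # .item() below unwraps it to the plain Python int 37.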
+ n_iter_i = n_iter_i.item() + + elif solver in ["sag", "saga"]: + if multi_class == "multinomial": + target = target.astype(X.dtype, copy=False) + loss = "multinomial" + else: + loss = "log" + # alpha is for L2-norm, beta is for L1-norm + if penalty == "l1": + alpha = 0.0 + beta = 1.0 / C + elif penalty == "l2": + alpha = 1.0 / C + beta = 0.0 + else: # Elastic-Net penalty + alpha = (1.0 / C) * (1 - l1_ratio) + beta = (1.0 / C) * l1_ratio + + w0, n_iter_i, warm_start_sag = sag_solver( + X, + target, + sample_weight, + loss, + alpha, + beta, + max_iter, + tol, + verbose, + random_state, + False, + max_squared_sum, + warm_start_sag, + is_saga=(solver == "saga"), + ) + + else: + raise ValueError( + "solver must be one of {'liblinear', 'lbfgs', " + "'newton-cg', 'sag'}, got '%s' instead" % solver + ) + + if multi_class == "multinomial": + n_classes = max(2, classes.size) + if solver in ["lbfgs", "newton-cg", "newton-cholesky"]: + multi_w0 = np.reshape(w0, (n_classes, -1), order="F") + else: + multi_w0 = w0 + if n_classes == 2: + multi_w0 = multi_w0[1][np.newaxis, :] + coefs.append(multi_w0.copy()) + else: + coefs.append(w0.copy()) + + n_iter[i] = n_iter_i + + return np.array(coefs), np.array(Cs), n_iter + + +# helper function for LogisticCV +def _log_reg_scoring_path( + X, + y, + train, + test, + *, + pos_class, + Cs, + scoring, + fit_intercept, + max_iter, + tol, + class_weight, + verbose, + solver, + penalty, + dual, + intercept_scaling, + multi_class, + random_state, + max_squared_sum, + sample_weight, + l1_ratio, + score_params, +): + """Computes scores across logistic_regression_path + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) or (n_samples, n_targets) + Target labels. + + train : list of indices + The indices of the train set. + + test : list of indices + The indices of the test set. + + pos_class : int + The class with respect to which we perform a one-vs-all fit. + If None, then it is assumed that the given problem is binary. + + Cs : int or list of floats + Each of the values in Cs describes the inverse of + regularization strength. If Cs is as an int, then a grid of Cs + values are chosen in a logarithmic scale between 1e-4 and 1e4. + + scoring : str, callable or None + The scoring method to use for cross-validation. Options: + + - str: see :ref:`scoring_string_names` for options. + - callable: a scorer callable object (e.g., function) with signature + ``scorer(estimator, X, y)``. See :ref:`scoring_callable` for details. + - `None`: :ref:`accuracy ` is used. + + fit_intercept : bool + If False, then the bias term is set to zero. Else the last + term of each coef_ gives us the intercept. + + max_iter : int + Maximum number of iterations for the solver. + + tol : float + Tolerance for stopping criteria. + + class_weight : dict or 'balanced' + Weights associated with classes in the form ``{class_label: weight}``. + If not given, all classes are supposed to have weight one. + + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))`` + + Note that these weights will be multiplied with sample_weight (passed + through the fit method) if sample_weight is specified. + + verbose : int + For the liblinear and lbfgs solvers set verbose to any positive + number for verbosity. 
+ + solver : {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'} + Decides which solver to use. + + penalty : {'l1', 'l2', 'elasticnet'} + Used to specify the norm used in the penalization. The 'newton-cg', + 'sag' and 'lbfgs' solvers support only l2 penalties. 'elasticnet' is + only supported by the 'saga' solver. + + dual : bool + Dual or primal formulation. Dual formulation is only implemented for + l2 penalty with liblinear solver. Prefer dual=False when + n_samples > n_features. + + intercept_scaling : float + Useful only when the solver `liblinear` is used + and `self.fit_intercept` is set to `True`. In this case, `x` becomes + `[x, self.intercept_scaling]`, + i.e. a "synthetic" feature with constant value equal to + `intercept_scaling` is appended to the instance vector. + The intercept becomes + ``intercept_scaling * synthetic_feature_weight``. + + .. note:: + The synthetic feature weight is subject to L1 or L2 + regularization as all other features. + To lessen the effect of regularization on synthetic feature weight + (and therefore on the intercept) `intercept_scaling` has to be increased. + + multi_class : {'auto', 'ovr', 'multinomial'} + If the option chosen is 'ovr', then a binary problem is fit for each + label. For 'multinomial' the loss minimised is the multinomial loss fit + across the entire probability distribution, *even when the data is + binary*. 'multinomial' is unavailable when solver='liblinear'. + + random_state : int, RandomState instance + Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the + data. See :term:`Glossary ` for details. + + max_squared_sum : float + Maximum squared sum of X over samples. Used only in SAG solver. + If None, it will be computed, going through all the samples. + The value should be precomputed to speed up cross validation. + + sample_weight : array-like of shape(n_samples,) + Array of weights that are assigned to individual samples. + If not provided, then each sample is given unit weight. + + l1_ratio : float + The Elastic-Net mixing parameter, with ``0 <= l1_ratio <= 1``. Only + used if ``penalty='elasticnet'``. Setting ``l1_ratio=0`` is equivalent + to using ``penalty='l2'``, while setting ``l1_ratio=1`` is equivalent + to using ``penalty='l1'``. For ``0 < l1_ratio <1``, the penalty is a + combination of L1 and L2. + + score_params : dict + Parameters to pass to the `score` method of the underlying scorer. + + Returns + ------- + coefs : ndarray of shape (n_cs, n_features) or (n_cs, n_features + 1) + List of coefficients for the Logistic Regression model. If + fit_intercept is set to True then the second dimension will be + n_features + 1, where the last item represents the intercept. + + Cs : ndarray + Grid of Cs used for cross-validation. + + scores : ndarray of shape (n_cs,) + Scores obtained for each Cs. + + n_iter : ndarray of shape(n_cs,) + Actual number of iteration for each Cs. 
+ """ + X_train = X[train] + X_test = X[test] + y_train = y[train] + y_test = y[test] + + sw_train, sw_test = None, None + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X) + sw_train = sample_weight[train] + sw_test = sample_weight[test] + + coefs, Cs, n_iter = _logistic_regression_path( + X_train, + y_train, + Cs=Cs, + l1_ratio=l1_ratio, + fit_intercept=fit_intercept, + solver=solver, + max_iter=max_iter, + class_weight=class_weight, + pos_class=pos_class, + multi_class=multi_class, + tol=tol, + verbose=verbose, + dual=dual, + penalty=penalty, + intercept_scaling=intercept_scaling, + random_state=random_state, + check_input=False, + max_squared_sum=max_squared_sum, + sample_weight=sw_train, + ) + + log_reg = LogisticRegression(solver=solver, multi_class=multi_class) + + # The score method of Logistic Regression has a classes_ attribute. + if multi_class == "ovr": + log_reg.classes_ = np.array([-1, 1]) + elif multi_class == "multinomial": + log_reg.classes_ = np.unique(y_train) + else: + raise ValueError( + "multi_class should be either multinomial or ovr, got %d" % multi_class + ) + + if pos_class is not None: + mask = y_test == pos_class + y_test = np.ones(y_test.shape, dtype=np.float64) + y_test[~mask] = -1.0 + + scores = list() + + scoring = get_scorer(scoring) + for w in coefs: + if multi_class == "ovr": + w = w[np.newaxis, :] + if fit_intercept: + log_reg.coef_ = w[:, :-1] + log_reg.intercept_ = w[:, -1] + else: + log_reg.coef_ = w + log_reg.intercept_ = 0.0 + + if scoring is None: + scores.append(log_reg.score(X_test, y_test, sample_weight=sw_test)) + else: + score_params = score_params or {} + score_params = _check_method_params(X=X, params=score_params, indices=test) + scores.append(scoring(log_reg, X_test, y_test, **score_params)) + return coefs, Cs, np.array(scores), n_iter + + +class LogisticRegression(LinearClassifierMixin, SparseCoefMixin, BaseEstimator): + """ + Logistic Regression (aka logit, MaxEnt) classifier. + + This class implements regularized logistic regression using the + 'liblinear' library, 'newton-cg', 'sag', 'saga' and 'lbfgs' solvers. **Note + that regularization is applied by default**. It can handle both dense + and sparse input. Use C-ordered arrays or CSR matrices containing 64-bit + floats for optimal performance; any other input format will be converted + (and copied). + + The 'newton-cg', 'sag', and 'lbfgs' solvers support only L2 regularization + with primal formulation, or no regularization. The 'liblinear' solver + supports both L1 and L2 regularization, with a dual formulation only for + the L2 penalty. The Elastic-Net regularization is only supported by the + 'saga' solver. + + For :term:`multiclass` problems, all solvers but 'liblinear' optimize the + (penalized) multinomial loss. 'liblinear' only handle binary classification but can + be extended to handle multiclass by using + :class:`~sklearn.multiclass.OneVsRestClassifier`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + penalty : {'l1', 'l2', 'elasticnet', None}, default='l2' + Specify the norm of the penalty: + + - `None`: no penalty is added; + - `'l2'`: add a L2 penalty term and it is the default choice; + - `'l1'`: add a L1 penalty term; + - `'elasticnet'`: both L1 and L2 penalty terms are added. + + .. warning:: + Some penalties may not work with some solvers. See the parameter + `solver` below, to know the compatibility between the penalty and + solver. + + .. 
versionadded:: 0.19 + l1 penalty with SAGA solver (allowing 'multinomial' + L1) + + dual : bool, default=False + Dual (constrained) or primal (regularized, see also + :ref:`this equation `) formulation. Dual formulation + is only implemented for l2 penalty with liblinear solver. Prefer dual=False when + n_samples > n_features. + + tol : float, default=1e-4 + Tolerance for stopping criteria. + + C : float, default=1.0 + Inverse of regularization strength; must be a positive float. + Like in support vector machines, smaller values specify stronger + regularization. + + fit_intercept : bool, default=True + Specifies if a constant (a.k.a. bias or intercept) should be + added to the decision function. + + intercept_scaling : float, default=1 + Useful only when the solver `liblinear` is used + and `self.fit_intercept` is set to `True`. In this case, `x` becomes + `[x, self.intercept_scaling]`, + i.e. a "synthetic" feature with constant value equal to + `intercept_scaling` is appended to the instance vector. + The intercept becomes + ``intercept_scaling * synthetic_feature_weight``. + + .. note:: + The synthetic feature weight is subject to L1 or L2 + regularization as all other features. + To lessen the effect of regularization on synthetic feature weight + (and therefore on the intercept) `intercept_scaling` has to be increased. + + class_weight : dict or 'balanced', default=None + Weights associated with classes in the form ``{class_label: weight}``. + If not given, all classes are supposed to have weight one. + + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))``. + + Note that these weights will be multiplied with sample_weight (passed + through the fit method) if sample_weight is specified. + + .. versionadded:: 0.17 + *class_weight='balanced'* + + random_state : int, RandomState instance, default=None + Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the + data. See :term:`Glossary ` for details. + + solver : {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, \ + default='lbfgs' + + Algorithm to use in the optimization problem. Default is 'lbfgs'. + To choose a solver, you might want to consider the following aspects: + + - For small datasets, 'liblinear' is a good choice, whereas 'sag' + and 'saga' are faster for large ones; + - For :term:`multiclass` problems, all solvers except 'liblinear' minimize the + full multinomial loss; + - 'liblinear' can only handle binary classification by default. To apply a + one-versus-rest scheme for the multiclass setting one can wrap it with the + :class:`~sklearn.multiclass.OneVsRestClassifier`. + - 'newton-cholesky' is a good choice for + `n_samples` >> `n_features * n_classes`, especially with one-hot encoded + categorical features with rare categories. Be aware that the memory usage + of this solver has a quadratic dependency on `n_features * n_classes` + because it explicitly computes the full Hessian matrix. + + .. 
warning:: + The choice of the algorithm depends on the penalty chosen and on + (multinomial) multiclass support: + + ================= ============================== ====================== + solver penalty multinomial multiclass + ================= ============================== ====================== + 'lbfgs' 'l2', None yes + 'liblinear' 'l1', 'l2' no + 'newton-cg' 'l2', None yes + 'newton-cholesky' 'l2', None yes + 'sag' 'l2', None yes + 'saga' 'elasticnet', 'l1', 'l2', None yes + ================= ============================== ====================== + + .. note:: + 'sag' and 'saga' fast convergence is only guaranteed on features + with approximately the same scale. You can preprocess the data with + a scaler from :mod:`sklearn.preprocessing`. + + .. seealso:: + Refer to the :ref:`User Guide ` for more + information regarding :class:`LogisticRegression` and more specifically the + :ref:`Table ` + summarizing solver/penalty supports. + + .. versionadded:: 0.17 + Stochastic Average Gradient (SAG) descent solver. Multinomial support in + version 0.18. + .. versionadded:: 0.19 + SAGA solver. + .. versionchanged:: 0.22 + The default solver changed from 'liblinear' to 'lbfgs' in 0.22. + .. versionadded:: 1.2 + newton-cholesky solver. Multinomial support in version 1.6. + + max_iter : int, default=100 + Maximum number of iterations taken for the solvers to converge. + + multi_class : {'auto', 'ovr', 'multinomial'}, default='auto' + If the option chosen is 'ovr', then a binary problem is fit for each + label. For 'multinomial' the loss minimised is the multinomial loss fit + across the entire probability distribution, *even when the data is + binary*. 'multinomial' is unavailable when solver='liblinear'. + 'auto' selects 'ovr' if the data is binary, or if solver='liblinear', + and otherwise selects 'multinomial'. + + .. versionadded:: 0.18 + Stochastic Average Gradient descent solver for 'multinomial' case. + .. versionchanged:: 0.22 + Default changed from 'ovr' to 'auto' in 0.22. + .. deprecated:: 1.5 + ``multi_class`` was deprecated in version 1.5 and will be removed in 1.8. + From then on, the recommended 'multinomial' will always be used for + `n_classes >= 3`. + Solvers that do not support 'multinomial' will raise an error. + Use `sklearn.multiclass.OneVsRestClassifier(LogisticRegression())` if you + still want to use OvR. + + verbose : int, default=0 + For the liblinear and lbfgs solvers set verbose to any positive + number for verbosity. + + warm_start : bool, default=False + When set to True, reuse the solution of the previous call to fit as + initialization, otherwise, just erase the previous solution. + Useless for liblinear solver. See :term:`the Glossary `. + + .. versionadded:: 0.17 + *warm_start* to support *lbfgs*, *newton-cg*, *sag*, *saga* solvers. + + n_jobs : int, default=None + Number of CPU cores used when parallelizing over classes if + multi_class='ovr'". This parameter is ignored when the ``solver`` is + set to 'liblinear' regardless of whether 'multi_class' is specified or + not. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` + context. ``-1`` means using all processors. + See :term:`Glossary ` for more details. + + l1_ratio : float, default=None + The Elastic-Net mixing parameter, with ``0 <= l1_ratio <= 1``. Only + used if ``penalty='elasticnet'``. Setting ``l1_ratio=0`` is equivalent + to using ``penalty='l2'``, while setting ``l1_ratio=1`` is equivalent + to using ``penalty='l1'``. For ``0 < l1_ratio <1``, the penalty is a + combination of L1 and L2. 
+ + Attributes + ---------- + + classes_ : ndarray of shape (n_classes, ) + A list of class labels known to the classifier. + + coef_ : ndarray of shape (1, n_features) or (n_classes, n_features) + Coefficient of the features in the decision function. + + `coef_` is of shape (1, n_features) when the given problem is binary. + In particular, when `multi_class='multinomial'`, `coef_` corresponds + to outcome 1 (True) and `-coef_` corresponds to outcome 0 (False). + + intercept_ : ndarray of shape (1,) or (n_classes,) + Intercept (a.k.a. bias) added to the decision function. + + If `fit_intercept` is set to False, the intercept is set to zero. + `intercept_` is of shape (1,) when the given problem is binary. + In particular, when `multi_class='multinomial'`, `intercept_` + corresponds to outcome 1 (True) and `-intercept_` corresponds to + outcome 0 (False). + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : ndarray of shape (n_classes,) or (1, ) + Actual number of iterations for all classes. If binary or multinomial, + it returns only 1 element. For liblinear solver, only the maximum + number of iteration across all classes is given. + + .. versionchanged:: 0.20 + + In SciPy <= 1.0.0 the number of lbfgs iterations may exceed + ``max_iter``. ``n_iter_`` will now report at most ``max_iter``. + + See Also + -------- + SGDClassifier : Incrementally trained logistic regression (when given + the parameter ``loss="log_loss"``). + LogisticRegressionCV : Logistic regression with built-in cross validation. + + Notes + ----- + The underlying C implementation uses a random number generator to + select features when fitting the model. It is thus not uncommon, + to have slightly different results for the same input data. If + that happens, try with a smaller tol parameter. + + Predict output may not match that of standalone liblinear in certain + cases. See :ref:`differences from liblinear ` + in the narrative documentation. + + References + ---------- + + L-BFGS-B -- Software for Large-scale Bound-constrained Optimization + Ciyou Zhu, Richard Byrd, Jorge Nocedal and Jose Luis Morales. + http://users.iems.northwestern.edu/~nocedal/lbfgsb.html + + LIBLINEAR -- A Library for Large Linear Classification + https://www.csie.ntu.edu.tw/~cjlin/liblinear/ + + SAG -- Mark Schmidt, Nicolas Le Roux, and Francis Bach + Minimizing Finite Sums with the Stochastic Average Gradient + https://hal.inria.fr/hal-00860051/document + + SAGA -- Defazio, A., Bach F. & Lacoste-Julien S. (2014). + :arxiv:`"SAGA: A Fast Incremental Gradient Method With Support + for Non-Strongly Convex Composite Objectives" <1407.0202>` + + Hsiang-Fu Yu, Fang-Lan Huang, Chih-Jen Lin (2011). Dual coordinate descent + methods for logistic regression and maximum entropy models. + Machine Learning 85(1-2):41-75. 
+ https://www.csie.ntu.edu.tw/~cjlin/papers/maxent_dual.pdf + + Examples + -------- + >>> from sklearn.datasets import load_iris + >>> from sklearn.linear_model import LogisticRegression + >>> X, y = load_iris(return_X_y=True) + >>> clf = LogisticRegression(random_state=0).fit(X, y) + >>> clf.predict(X[:2, :]) + array([0, 0]) + >>> clf.predict_proba(X[:2, :]) + array([[9.82e-01, 1.82e-02, 1.44e-08], + [9.72e-01, 2.82e-02, 3.02e-08]]) + >>> clf.score(X, y) + 0.97 + + For a comparison of the LogisticRegression with other classifiers see: + :ref:`sphx_glr_auto_examples_classification_plot_classification_probability.py`. + """ + + _parameter_constraints: dict = { + "penalty": [StrOptions({"l1", "l2", "elasticnet"}), None], + "dual": ["boolean"], + "tol": [Interval(Real, 0, None, closed="left")], + "C": [Interval(Real, 0, None, closed="right")], + "fit_intercept": ["boolean"], + "intercept_scaling": [Interval(Real, 0, None, closed="neither")], + "class_weight": [dict, StrOptions({"balanced"}), None], + "random_state": ["random_state"], + "solver": [ + StrOptions( + {"lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga"} + ) + ], + "max_iter": [Interval(Integral, 0, None, closed="left")], + "verbose": ["verbose"], + "warm_start": ["boolean"], + "n_jobs": [None, Integral], + "l1_ratio": [Interval(Real, 0, 1, closed="both"), None], + "multi_class": [ + StrOptions({"auto", "ovr", "multinomial"}), + Hidden(StrOptions({"deprecated"})), + ], + } + + def __init__( + self, + penalty="l2", + *, + dual=False, + tol=1e-4, + C=1.0, + fit_intercept=True, + intercept_scaling=1, + class_weight=None, + random_state=None, + solver="lbfgs", + max_iter=100, + multi_class="deprecated", + verbose=0, + warm_start=False, + n_jobs=None, + l1_ratio=None, + ): + self.penalty = penalty + self.dual = dual + self.tol = tol + self.C = C + self.fit_intercept = fit_intercept + self.intercept_scaling = intercept_scaling + self.class_weight = class_weight + self.random_state = random_state + self.solver = solver + self.max_iter = max_iter + self.multi_class = multi_class + self.verbose = verbose + self.warm_start = warm_start + self.n_jobs = n_jobs + self.l1_ratio = l1_ratio + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, sample_weight=None): + """ + Fit the model according to the given training data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) + Target vector relative to X. + + sample_weight : array-like of shape (n_samples,) default=None + Array of weights that are assigned to individual samples. + If not provided, then each sample is given unit weight. + + .. versionadded:: 0.17 + *sample_weight* support to LogisticRegression. + + Returns + ------- + self + Fitted estimator. + + Notes + ----- + The SAGA solver supports both float64 and float32 bit arrays. + """ + solver = _check_solver(self.solver, self.penalty, self.dual) + + if self.penalty != "elasticnet" and self.l1_ratio is not None: + warnings.warn( + "l1_ratio parameter is only used when penalty is " + "'elasticnet'. 
Got " + "(penalty={})".format(self.penalty) + ) + + if self.penalty == "elasticnet" and self.l1_ratio is None: + raise ValueError("l1_ratio must be specified when penalty is elasticnet.") + + if self.penalty is None: + if self.C != 1.0: # default values + warnings.warn( + "Setting penalty=None will ignore the C and l1_ratio parameters" + ) + # Note that check for l1_ratio is done right above + C_ = np.inf + penalty = "l2" + else: + C_ = self.C + penalty = self.penalty + + if solver == "lbfgs": + _dtype = np.float64 + else: + _dtype = [np.float64, np.float32] + + X, y = validate_data( + self, + X, + y, + accept_sparse="csr", + dtype=_dtype, + order="C", + accept_large_sparse=solver not in ["liblinear", "sag", "saga"], + ) + check_classification_targets(y) + self.classes_ = np.unique(y) + + # TODO(1.8) remove multi_class + multi_class = self.multi_class + if self.multi_class == "multinomial" and len(self.classes_) == 2: + warnings.warn( + ( + "'multi_class' was deprecated in version 1.5 and will be removed in" + " 1.8. From then on, binary problems will be fit as proper binary " + " logistic regression models (as if multi_class='ovr' were set)." + " Leave it to its default value to avoid this warning." + ), + FutureWarning, + ) + elif self.multi_class in ("multinomial", "auto"): + warnings.warn( + ( + "'multi_class' was deprecated in version 1.5 and will be removed in" + " 1.8. From then on, it will always use 'multinomial'." + " Leave it to its default value to avoid this warning." + ), + FutureWarning, + ) + elif self.multi_class == "ovr": + warnings.warn( + ( + "'multi_class' was deprecated in version 1.5 and will be removed in" + " 1.8. Use OneVsRestClassifier(LogisticRegression(..)) instead." + " Leave it to its default value to avoid this warning." + ), + FutureWarning, + ) + else: + # Set to old default value. + multi_class = "auto" + multi_class = _check_multi_class(multi_class, solver, len(self.classes_)) + + if solver == "liblinear": + if len(self.classes_) > 2: + warnings.warn( + "Using the 'liblinear' solver for multiclass classification is " + "deprecated. An error will be raised in 1.8. Either use another " + "solver which supports the multinomial loss or wrap the estimator " + "in a OneVsRestClassifier to keep applying a one-versus-rest " + "scheme.", + FutureWarning, + ) + if effective_n_jobs(self.n_jobs) != 1: + warnings.warn( + "'n_jobs' > 1 does not have any effect when" + " 'solver' is set to 'liblinear'. 
Got 'n_jobs'" + " = {}.".format(effective_n_jobs(self.n_jobs)) + ) + self.coef_, self.intercept_, self.n_iter_ = _fit_liblinear( + X, + y, + self.C, + self.fit_intercept, + self.intercept_scaling, + self.class_weight, + self.penalty, + self.dual, + self.verbose, + self.max_iter, + self.tol, + self.random_state, + sample_weight=sample_weight, + ) + return self + + if solver in ["sag", "saga"]: + max_squared_sum = row_norms(X, squared=True).max() + else: + max_squared_sum = None + + n_classes = len(self.classes_) + classes_ = self.classes_ + if n_classes < 2: + raise ValueError( + "This solver needs samples of at least 2 classes" + " in the data, but the data contains only one" + " class: %r" % classes_[0] + ) + + if len(self.classes_) == 2: + n_classes = 1 + classes_ = classes_[1:] + + if self.warm_start: + warm_start_coef = getattr(self, "coef_", None) + else: + warm_start_coef = None + if warm_start_coef is not None and self.fit_intercept: + warm_start_coef = np.append( + warm_start_coef, self.intercept_[:, np.newaxis], axis=1 + ) + + # Hack so that we iterate only once for the multinomial case. + if multi_class == "multinomial": + classes_ = [None] + warm_start_coef = [warm_start_coef] + if warm_start_coef is None: + warm_start_coef = [None] * n_classes + + path_func = delayed(_logistic_regression_path) + + # The SAG solver releases the GIL so it's more efficient to use + # threads for this solver. + if solver in ["sag", "saga"]: + prefer = "threads" + else: + prefer = "processes" + + # TODO: Refactor this to avoid joblib parallelism entirely when doing binary + # and multinomial multiclass classification and use joblib only for the + # one-vs-rest multiclass case. + if ( + solver in ["lbfgs", "newton-cg", "newton-cholesky"] + and len(classes_) == 1 + and effective_n_jobs(self.n_jobs) == 1 + ): + # In the future, we would like n_threads = _openmp_effective_n_threads() + # For the time being, we just do + n_threads = 1 + else: + n_threads = 1 + + fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, prefer=prefer)( + path_func( + X, + y, + pos_class=class_, + Cs=[C_], + l1_ratio=self.l1_ratio, + fit_intercept=self.fit_intercept, + tol=self.tol, + verbose=self.verbose, + solver=solver, + multi_class=multi_class, + max_iter=self.max_iter, + class_weight=self.class_weight, + check_input=False, + random_state=self.random_state, + coef=warm_start_coef_, + penalty=penalty, + max_squared_sum=max_squared_sum, + sample_weight=sample_weight, + n_threads=n_threads, + ) + for class_, warm_start_coef_ in zip(classes_, warm_start_coef) + ) + + fold_coefs_, _, n_iter_ = zip(*fold_coefs_) + self.n_iter_ = np.asarray(n_iter_, dtype=np.int32)[:, 0] + + n_features = X.shape[1] + if multi_class == "multinomial": + self.coef_ = fold_coefs_[0][0] + else: + self.coef_ = np.asarray(fold_coefs_) + self.coef_ = self.coef_.reshape( + n_classes, n_features + int(self.fit_intercept) + ) + + if self.fit_intercept: + self.intercept_ = self.coef_[:, -1] + self.coef_ = self.coef_[:, :-1] + else: + self.intercept_ = np.zeros(n_classes) + + return self + + def predict_proba(self, X): + """ + Probability estimates. + + The returned estimates for all classes are ordered by the + label of classes. + + For a multi_class problem, if multi_class is set to be "multinomial" + the softmax function is used to find the predicted probability of + each class. + Else use a one-vs-rest approach, i.e. 
calculate the probability + of each class assuming it to be positive using the logistic function + and normalize these values across all the classes. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Vector to be scored, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + T : array-like of shape (n_samples, n_classes) + Returns the probability of the sample for each class in the model, + where classes are ordered as they are in ``self.classes_``. + """ + check_is_fitted(self) + + ovr = self.multi_class in ["ovr", "warn"] or ( + self.multi_class in ["auto", "deprecated"] + and (self.classes_.size <= 2 or self.solver == "liblinear") + ) + if ovr: + return super()._predict_proba_lr(X) + else: + decision = self.decision_function(X) + if decision.ndim == 1: + # Workaround for multi_class="multinomial" and binary outcomes + # which requires softmax prediction with only a 1D decision. + decision_2d = np.c_[-decision, decision] + else: + decision_2d = decision + return softmax(decision_2d, copy=False) + + def predict_log_proba(self, X): + """ + Predict logarithm of probability estimates. + + The returned estimates for all classes are ordered by the + label of classes. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Vector to be scored, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + T : array-like of shape (n_samples, n_classes) + Returns the log-probability of the sample for each class in the + model, where classes are ordered as they are in ``self.classes_``. + """ + return np.log(self.predict_proba(X)) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + + +class LogisticRegressionCV(LogisticRegression, LinearClassifierMixin, BaseEstimator): + """Logistic Regression CV (aka logit, MaxEnt) classifier. + + See glossary entry for :term:`cross-validation estimator`. + + This class implements logistic regression using liblinear, newton-cg, sag + or lbfgs optimizer. The newton-cg, sag and lbfgs solvers support only L2 + regularization with primal formulation. The liblinear solver supports both + L1 and L2 regularization, with a dual formulation only for the L2 penalty. + Elastic-Net penalty is only supported by the saga solver. + + For the grid of `Cs` values and `l1_ratios` values, the best hyperparameter + is selected by the cross-validator + :class:`~sklearn.model_selection.StratifiedKFold`, but it can be changed + using the :term:`cv` parameter. The 'newton-cg', 'sag', 'saga' and 'lbfgs' + solvers can warm-start the coefficients (see :term:`Glossary`). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + Cs : int or list of floats, default=10 + Each of the values in Cs describes the inverse of regularization + strength. If Cs is as an int, then a grid of Cs values are chosen + in a logarithmic scale between 1e-4 and 1e4. + Like in support vector machines, smaller values specify stronger + regularization. + + fit_intercept : bool, default=True + Specifies if a constant (a.k.a. bias or intercept) should be + added to the decision function. + + cv : int or cross-validation generator, default=None + The default cross-validation generator used is Stratified K-Folds. + If an integer is provided, then it is the number of folds used. + See the module :mod:`sklearn.model_selection` module for the + list of possible cross-validation objects. 
+ + .. versionchanged:: 0.22 + ``cv`` default value if None changed from 3-fold to 5-fold. + + dual : bool, default=False + Dual (constrained) or primal (regularized, see also + :ref:`this equation `) formulation. Dual formulation + is only implemented for l2 penalty with liblinear solver. Prefer dual=False when + n_samples > n_features. + + penalty : {'l1', 'l2', 'elasticnet'}, default='l2' + Specify the norm of the penalty: + + - `'l2'`: add a L2 penalty term (used by default); + - `'l1'`: add a L1 penalty term; + - `'elasticnet'`: both L1 and L2 penalty terms are added. + + .. warning:: + Some penalties may not work with some solvers. See the parameter + `solver` below, to know the compatibility between the penalty and + solver. + + scoring : str or callable, default=None + The scoring method to use for cross-validation. Options: + + - str: see :ref:`scoring_string_names` for options. + - callable: a scorer callable object (e.g., function) with signature + ``scorer(estimator, X, y)``. See :ref:`scoring_callable` for details. + - `None`: :ref:`accuracy ` is used. + + solver : {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, \ + default='lbfgs' + + Algorithm to use in the optimization problem. Default is 'lbfgs'. + To choose a solver, you might want to consider the following aspects: + + - For small datasets, 'liblinear' is a good choice, whereas 'sag' + and 'saga' are faster for large ones; + - For multiclass problems, all solvers except 'liblinear' minimize the full + multinomial loss; + - 'liblinear' might be slower in :class:`LogisticRegressionCV` + because it does not handle warm-starting. + - 'liblinear' can only handle binary classification by default. To apply a + one-versus-rest scheme for the multiclass setting one can wrap it with the + :class:`~sklearn.multiclass.OneVsRestClassifier`. + - 'newton-cholesky' is a good choice for + `n_samples` >> `n_features * n_classes`, especially with one-hot encoded + categorical features with rare categories. Be aware that the memory usage + of this solver has a quadratic dependency on `n_features * n_classes` + because it explicitly computes the full Hessian matrix. + + .. warning:: + The choice of the algorithm depends on the penalty chosen and on + (multinomial) multiclass support: + + ================= ============================== ====================== + solver penalty multinomial multiclass + ================= ============================== ====================== + 'lbfgs' 'l2' yes + 'liblinear' 'l1', 'l2' no + 'newton-cg' 'l2' yes + 'newton-cholesky' 'l2', yes + 'sag' 'l2', yes + 'saga' 'elasticnet', 'l1', 'l2' yes + ================= ============================== ====================== + + .. note:: + 'sag' and 'saga' fast convergence is only guaranteed on features + with approximately the same scale. You can preprocess the data with + a scaler from :mod:`sklearn.preprocessing`. + + .. versionadded:: 0.17 + Stochastic Average Gradient (SAG) descent solver. Multinomial support in + version 0.18. + .. versionadded:: 0.19 + SAGA solver. + .. versionadded:: 1.2 + newton-cholesky solver. Multinomial support in version 1.6. + + tol : float, default=1e-4 + Tolerance for stopping criteria. + + max_iter : int, default=100 + Maximum number of iterations of the optimization algorithm. + + class_weight : dict or 'balanced', default=None + Weights associated with classes in the form ``{class_label: weight}``. + If not given, all classes are supposed to have weight one. 
+ + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))``. + + Note that these weights will be multiplied with sample_weight (passed + through the fit method) if sample_weight is specified. + + .. versionadded:: 0.17 + class_weight == 'balanced' + + n_jobs : int, default=None + Number of CPU cores used during the cross-validation loop. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + verbose : int, default=0 + For the 'liblinear', 'sag' and 'lbfgs' solvers set verbose to any + positive number for verbosity. + + refit : bool, default=True + If set to True, the scores are averaged across all folds, and the + coefs and the C that corresponds to the best score is taken, and a + final refit is done using these parameters. + Otherwise the coefs, intercepts and C that correspond to the + best scores across folds are averaged. + + intercept_scaling : float, default=1 + Useful only when the solver `liblinear` is used + and `self.fit_intercept` is set to `True`. In this case, `x` becomes + `[x, self.intercept_scaling]`, + i.e. a "synthetic" feature with constant value equal to + `intercept_scaling` is appended to the instance vector. + The intercept becomes + ``intercept_scaling * synthetic_feature_weight``. + + .. note:: + The synthetic feature weight is subject to L1 or L2 + regularization as all other features. + To lessen the effect of regularization on synthetic feature weight + (and therefore on the intercept) `intercept_scaling` has to be increased. + + multi_class : {'auto, 'ovr', 'multinomial'}, default='auto' + If the option chosen is 'ovr', then a binary problem is fit for each + label. For 'multinomial' the loss minimised is the multinomial loss fit + across the entire probability distribution, *even when the data is + binary*. 'multinomial' is unavailable when solver='liblinear'. + 'auto' selects 'ovr' if the data is binary, or if solver='liblinear', + and otherwise selects 'multinomial'. + + .. versionadded:: 0.18 + Stochastic Average Gradient descent solver for 'multinomial' case. + .. versionchanged:: 0.22 + Default changed from 'ovr' to 'auto' in 0.22. + .. deprecated:: 1.5 + ``multi_class`` was deprecated in version 1.5 and will be removed in 1.8. + From then on, the recommended 'multinomial' will always be used for + `n_classes >= 3`. + Solvers that do not support 'multinomial' will raise an error. + Use `sklearn.multiclass.OneVsRestClassifier(LogisticRegressionCV())` if you + still want to use OvR. + + random_state : int, RandomState instance, default=None + Used when `solver='sag'`, 'saga' or 'liblinear' to shuffle the data. + Note that this only applies to the solver and not the cross-validation + generator. See :term:`Glossary ` for details. + + l1_ratios : list of float, default=None + The list of Elastic-Net mixing parameter, with ``0 <= l1_ratio <= 1``. + Only used if ``penalty='elasticnet'``. A value of 0 is equivalent to + using ``penalty='l2'``, while 1 is equivalent to using + ``penalty='l1'``. For ``0 < l1_ratio <1``, the penalty is a combination + of L1 and L2. + + Attributes + ---------- + classes_ : ndarray of shape (n_classes, ) + A list of class labels known to the classifier. + + coef_ : ndarray of shape (1, n_features) or (n_classes, n_features) + Coefficient of the features in the decision function. 
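A small sketch of inspecting the fitted coefficients and the selected regularization (the exact shapes are spelled out just below):

    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegressionCV

    X, y = load_iris(return_X_y=True)
    clf = LogisticRegressionCV(cv=3).fit(X, y)
    print(clf.coef_.shape)  # (n_classes, n_features), i.e. (3, 4) for iris
    print(clf.C_)           # best C per class, see `C_` below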
+ + `coef_` is of shape (1, n_features) when the given problem + is binary. + + intercept_ : ndarray of shape (1,) or (n_classes,) + Intercept (a.k.a. bias) added to the decision function. + + If `fit_intercept` is set to False, the intercept is set to zero. + `intercept_` is of shape(1,) when the problem is binary. + + Cs_ : ndarray of shape (n_cs) + Array of C i.e. inverse of regularization parameter values used + for cross-validation. + + l1_ratios_ : ndarray of shape (n_l1_ratios) + Array of l1_ratios used for cross-validation. If no l1_ratio is used + (i.e. penalty is not 'elasticnet'), this is set to ``[None]`` + + coefs_paths_ : ndarray of shape (n_folds, n_cs, n_features) or \ + (n_folds, n_cs, n_features + 1) + dict with classes as the keys, and the path of coefficients obtained + during cross-validating across each fold and then across each Cs + after doing an OvR for the corresponding class as values. + If the 'multi_class' option is set to 'multinomial', then + the coefs_paths are the coefficients corresponding to each class. + Each dict value has shape ``(n_folds, n_cs, n_features)`` or + ``(n_folds, n_cs, n_features + 1)`` depending on whether the + intercept is fit or not. If ``penalty='elasticnet'``, the shape is + ``(n_folds, n_cs, n_l1_ratios_, n_features)`` or + ``(n_folds, n_cs, n_l1_ratios_, n_features + 1)``. + + scores_ : dict + dict with classes as the keys, and the values as the + grid of scores obtained during cross-validating each fold, after doing + an OvR for the corresponding class. If the 'multi_class' option + given is 'multinomial' then the same scores are repeated across + all classes, since this is the multinomial class. Each dict value + has shape ``(n_folds, n_cs)`` or ``(n_folds, n_cs, n_l1_ratios)`` if + ``penalty='elasticnet'``. + + C_ : ndarray of shape (n_classes,) or (n_classes - 1,) + Array of C that maps to the best scores across every class. If refit is + set to False, then for each class, the best C is the average of the + C's that correspond to the best scores for each fold. + `C_` is of shape(n_classes,) when the problem is binary. + + l1_ratio_ : ndarray of shape (n_classes,) or (n_classes - 1,) + Array of l1_ratio that maps to the best scores across every class. If + refit is set to False, then for each class, the best l1_ratio is the + average of the l1_ratio's that correspond to the best scores for each + fold. `l1_ratio_` is of shape(n_classes,) when the problem is binary. + + n_iter_ : ndarray of shape (n_classes, n_folds, n_cs) or (1, n_folds, n_cs) + Actual number of iterations for all classes, folds and Cs. + In the binary or multinomial cases, the first dimension is equal to 1. + If ``penalty='elasticnet'``, the shape is ``(n_classes, n_folds, + n_cs, n_l1_ratios)`` or ``(1, n_folds, n_cs, n_l1_ratios)``. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + LogisticRegression : Logistic regression without tuning the + hyperparameter `C`. 
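As a rough analogy (not an exact equivalence), tuning `C` here plays the same role as grid-searching a plain `LogisticRegression`; the CV estimator can additionally warm-start coefficients along the `Cs` path with the solvers noted above:

    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV

    # A comparable, but typically more expensive, explicit grid search over C.
    grid = GridSearchCV(
        LogisticRegression(max_iter=1000),
        param_grid={"C": np.logspace(-4, 4, 10)},
        cv=5,
    )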
+ + Examples + -------- + >>> from sklearn.datasets import load_iris + >>> from sklearn.linear_model import LogisticRegressionCV + >>> X, y = load_iris(return_X_y=True) + >>> clf = LogisticRegressionCV(cv=5, random_state=0).fit(X, y) + >>> clf.predict(X[:2, :]) + array([0, 0]) + >>> clf.predict_proba(X[:2, :]).shape + (2, 3) + >>> clf.score(X, y) + 0.98... + """ + + _parameter_constraints: dict = {**LogisticRegression._parameter_constraints} + + for param in ["C", "warm_start", "l1_ratio"]: + _parameter_constraints.pop(param) + + _parameter_constraints.update( + { + "Cs": [Interval(Integral, 1, None, closed="left"), "array-like"], + "cv": ["cv_object"], + "scoring": [StrOptions(set(get_scorer_names())), callable, None], + "l1_ratios": ["array-like", None], + "refit": ["boolean"], + "penalty": [StrOptions({"l1", "l2", "elasticnet"})], + } + ) + + def __init__( + self, + *, + Cs=10, + fit_intercept=True, + cv=None, + dual=False, + penalty="l2", + scoring=None, + solver="lbfgs", + tol=1e-4, + max_iter=100, + class_weight=None, + n_jobs=None, + verbose=0, + refit=True, + intercept_scaling=1.0, + multi_class="deprecated", + random_state=None, + l1_ratios=None, + ): + self.Cs = Cs + self.fit_intercept = fit_intercept + self.cv = cv + self.dual = dual + self.penalty = penalty + self.scoring = scoring + self.tol = tol + self.max_iter = max_iter + self.class_weight = class_weight + self.n_jobs = n_jobs + self.verbose = verbose + self.solver = solver + self.refit = refit + self.intercept_scaling = intercept_scaling + self.multi_class = multi_class + self.random_state = random_state + self.l1_ratios = l1_ratios + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, sample_weight=None, **params): + """Fit the model according to the given training data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) + Target vector relative to X. + + sample_weight : array-like of shape (n_samples,) default=None + Array of weights that are assigned to individual samples. + If not provided, then each sample is given unit weight. + + **params : dict + Parameters to pass to the underlying splitter and scorer. + + .. versionadded:: 1.4 + + Returns + ------- + self : object + Fitted LogisticRegressionCV estimator. + """ + _raise_for_params(params, self, "fit") + + solver = _check_solver(self.solver, self.penalty, self.dual) + + if self.penalty == "elasticnet": + if ( + self.l1_ratios is None + or len(self.l1_ratios) == 0 + or any( + ( + not isinstance(l1_ratio, numbers.Number) + or l1_ratio < 0 + or l1_ratio > 1 + ) + for l1_ratio in self.l1_ratios + ) + ): + raise ValueError( + "l1_ratios must be a list of numbers between " + "0 and 1; got (l1_ratios=%r)" % self.l1_ratios + ) + l1_ratios_ = self.l1_ratios + else: + if self.l1_ratios is not None: + warnings.warn( + "l1_ratios parameter is only used when penalty " + "is 'elasticnet'. 
Got (penalty={})".format(self.penalty) + ) + + l1_ratios_ = [None] + + X, y = validate_data( + self, + X, + y, + accept_sparse="csr", + dtype=np.float64, + order="C", + accept_large_sparse=solver not in ["liblinear", "sag", "saga"], + ) + check_classification_targets(y) + + class_weight = self.class_weight + + # Encode for string labels + label_encoder = LabelEncoder().fit(y) + y = label_encoder.transform(y) + if isinstance(class_weight, dict): + class_weight = { + label_encoder.transform([cls])[0]: v for cls, v in class_weight.items() + } + + # The original class labels + classes = self.classes_ = label_encoder.classes_ + encoded_labels = label_encoder.transform(label_encoder.classes_) + + # TODO(1.8) remove multi_class + multi_class = self.multi_class + if self.multi_class == "multinomial" and len(self.classes_) == 2: + warnings.warn( + ( + "'multi_class' was deprecated in version 1.5 and will be removed in" + " 1.8. From then on, binary problems will be fit as proper binary " + " logistic regression models (as if multi_class='ovr' were set)." + " Leave it to its default value to avoid this warning." + ), + FutureWarning, + ) + elif self.multi_class in ("multinomial", "auto"): + warnings.warn( + ( + "'multi_class' was deprecated in version 1.5 and will be removed in" + " 1.8. From then on, it will always use 'multinomial'." + " Leave it to its default value to avoid this warning." + ), + FutureWarning, + ) + elif self.multi_class == "ovr": + warnings.warn( + ( + "'multi_class' was deprecated in version 1.5 and will be removed in" + " 1.8. Use OneVsRestClassifier(LogisticRegressionCV(..)) instead." + " Leave it to its default value to avoid this warning." + ), + FutureWarning, + ) + else: + # Set to old default value. + multi_class = "auto" + multi_class = _check_multi_class(multi_class, solver, len(classes)) + + if solver in ["sag", "saga"]: + max_squared_sum = row_norms(X, squared=True).max() + else: + max_squared_sum = None + + if _routing_enabled(): + routed_params = process_routing( + self, + "fit", + sample_weight=sample_weight, + **params, + ) + else: + routed_params = Bunch() + routed_params.splitter = Bunch(split={}) + routed_params.scorer = Bunch(score=params) + if sample_weight is not None: + routed_params.scorer.score["sample_weight"] = sample_weight + + # init cross-validation generator + cv = check_cv(self.cv, y, classifier=True) + folds = list(cv.split(X, y, **routed_params.splitter.split)) + + # Use the label encoded classes + n_classes = len(encoded_labels) + + if n_classes < 2: + raise ValueError( + "This solver needs samples of at least 2 classes" + " in the data, but the data contains only one" + " class: %r" % classes[0] + ) + + if n_classes == 2: + # OvR in case of binary problems is as good as fitting + # the higher label + n_classes = 1 + encoded_labels = encoded_labels[1:] + classes = classes[1:] + + # We need this hack to iterate only once over labels, in the case of + # multi_class = multinomial, without changing the value of the labels. 
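        # (With 'multinomial', a single joint path is fit over all classes, so
        # the per-label loop below collapses to one iteration with pos_class=None.)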
+ if multi_class == "multinomial": + iter_encoded_labels = iter_classes = [None] + else: + iter_encoded_labels = encoded_labels + iter_classes = classes + + # compute the class weights for the entire dataset y + if class_weight == "balanced": + class_weight = compute_class_weight( + class_weight, + classes=np.arange(len(self.classes_)), + y=y, + sample_weight=sample_weight, + ) + class_weight = dict(enumerate(class_weight)) + + path_func = delayed(_log_reg_scoring_path) + + # The SAG solver releases the GIL so it's more efficient to use + # threads for this solver. + if self.solver in ["sag", "saga"]: + prefer = "threads" + else: + prefer = "processes" + + fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, prefer=prefer)( + path_func( + X, + y, + train, + test, + pos_class=label, + Cs=self.Cs, + fit_intercept=self.fit_intercept, + penalty=self.penalty, + dual=self.dual, + solver=solver, + tol=self.tol, + max_iter=self.max_iter, + verbose=self.verbose, + class_weight=class_weight, + scoring=self.scoring, + multi_class=multi_class, + intercept_scaling=self.intercept_scaling, + random_state=self.random_state, + max_squared_sum=max_squared_sum, + sample_weight=sample_weight, + l1_ratio=l1_ratio, + score_params=routed_params.scorer.score, + ) + for label in iter_encoded_labels + for train, test in folds + for l1_ratio in l1_ratios_ + ) + + # _log_reg_scoring_path will output different shapes depending on the + # multi_class param, so we need to reshape the outputs accordingly. + # Cs is of shape (n_classes . n_folds . n_l1_ratios, n_Cs) and all the + # rows are equal, so we just take the first one. + # After reshaping, + # - scores is of shape (n_classes, n_folds, n_Cs . n_l1_ratios) + # - coefs_paths is of shape + # (n_classes, n_folds, n_Cs . n_l1_ratios, n_features) + # - n_iter is of shape + # (n_classes, n_folds, n_Cs . n_l1_ratios) or + # (1, n_folds, n_Cs . n_l1_ratios) + coefs_paths, Cs, scores, n_iter_ = zip(*fold_coefs_) + self.Cs_ = Cs[0] + if multi_class == "multinomial": + coefs_paths = np.reshape( + coefs_paths, + (len(folds), len(l1_ratios_) * len(self.Cs_), n_classes, -1), + ) + # equiv to coefs_paths = np.moveaxis(coefs_paths, (0, 1, 2, 3), + # (1, 2, 0, 3)) + coefs_paths = np.swapaxes(coefs_paths, 0, 1) + coefs_paths = np.swapaxes(coefs_paths, 0, 2) + self.n_iter_ = np.reshape( + n_iter_, (1, len(folds), len(self.Cs_) * len(l1_ratios_)) + ) + # repeat same scores across all classes + scores = np.tile(scores, (n_classes, 1, 1)) + else: + coefs_paths = np.reshape( + coefs_paths, + (n_classes, len(folds), len(self.Cs_) * len(l1_ratios_), -1), + ) + self.n_iter_ = np.reshape( + n_iter_, (n_classes, len(folds), len(self.Cs_) * len(l1_ratios_)) + ) + scores = np.reshape(scores, (n_classes, len(folds), -1)) + self.scores_ = dict(zip(classes, scores)) + self.coefs_paths_ = dict(zip(classes, coefs_paths)) + + self.C_ = list() + self.l1_ratio_ = list() + self.coef_ = np.empty((n_classes, X.shape[1])) + self.intercept_ = np.zeros(n_classes) + for index, (cls, encoded_label) in enumerate( + zip(iter_classes, iter_encoded_labels) + ): + if multi_class == "ovr": + scores = self.scores_[cls] + coefs_paths = self.coefs_paths_[cls] + else: + # For multinomial, all scores are the same across classes + scores = scores[0] + # coefs_paths will keep its original shape because + # logistic_regression_path expects it this way + + if self.refit: + # best_index is between 0 and (n_Cs . 
n_l1_ratios - 1) + # for example, with n_cs=2 and n_l1_ratios=3 + # the layout of scores is + # [c1, c2, c1, c2, c1, c2] + # l1_1 , l1_2 , l1_3 + best_index = scores.sum(axis=0).argmax() + + best_index_C = best_index % len(self.Cs_) + C_ = self.Cs_[best_index_C] + self.C_.append(C_) + + best_index_l1 = best_index // len(self.Cs_) + l1_ratio_ = l1_ratios_[best_index_l1] + self.l1_ratio_.append(l1_ratio_) + + if multi_class == "multinomial": + coef_init = np.mean(coefs_paths[:, :, best_index, :], axis=1) + else: + coef_init = np.mean(coefs_paths[:, best_index, :], axis=0) + + # Note that y is label encoded and hence pos_class must be + # the encoded label / None (for 'multinomial') + w, _, _ = _logistic_regression_path( + X, + y, + pos_class=encoded_label, + Cs=[C_], + solver=solver, + fit_intercept=self.fit_intercept, + coef=coef_init, + max_iter=self.max_iter, + tol=self.tol, + penalty=self.penalty, + class_weight=class_weight, + multi_class=multi_class, + verbose=max(0, self.verbose - 1), + random_state=self.random_state, + check_input=False, + max_squared_sum=max_squared_sum, + sample_weight=sample_weight, + l1_ratio=l1_ratio_, + ) + w = w[0] + + else: + # Take the best scores across every fold and the average of + # all coefficients corresponding to the best scores. + best_indices = np.argmax(scores, axis=1) + if multi_class == "ovr": + w = np.mean( + [coefs_paths[i, best_indices[i], :] for i in range(len(folds))], + axis=0, + ) + else: + w = np.mean( + [ + coefs_paths[:, i, best_indices[i], :] + for i in range(len(folds)) + ], + axis=0, + ) + + best_indices_C = best_indices % len(self.Cs_) + self.C_.append(np.mean(self.Cs_[best_indices_C])) + + if self.penalty == "elasticnet": + best_indices_l1 = best_indices // len(self.Cs_) + self.l1_ratio_.append(np.mean(l1_ratios_[best_indices_l1])) + else: + self.l1_ratio_.append(None) + + if multi_class == "multinomial": + self.C_ = np.tile(self.C_, n_classes) + self.l1_ratio_ = np.tile(self.l1_ratio_, n_classes) + self.coef_ = w[:, : X.shape[1]] + if self.fit_intercept: + self.intercept_ = w[:, -1] + else: + self.coef_[index] = w[: X.shape[1]] + if self.fit_intercept: + self.intercept_[index] = w[-1] + + self.C_ = np.asarray(self.C_) + self.l1_ratio_ = np.asarray(self.l1_ratio_) + self.l1_ratios_ = np.asarray(l1_ratios_) + # if elasticnet was used, add the l1_ratios dimension to some + # attributes + if self.l1_ratios is not None: + # with n_cs=2 and n_l1_ratios=3 + # the layout of scores is + # [c1, c2, c1, c2, c1, c2] + # l1_1 , l1_2 , l1_3 + # To get a 2d array with the following layout + # l1_1, l1_2, l1_3 + # c1 [[ . , . , . ], + # c2 [ . , . , . ]] + # We need to first reshape and then transpose. + # The same goes for the other arrays + for cls, coefs_path in self.coefs_paths_.items(): + self.coefs_paths_[cls] = coefs_path.reshape( + (len(folds), self.l1_ratios_.size, self.Cs_.size, -1) + ) + self.coefs_paths_[cls] = np.transpose( + self.coefs_paths_[cls], (0, 2, 1, 3) + ) + for cls, score in self.scores_.items(): + self.scores_[cls] = score.reshape( + (len(folds), self.l1_ratios_.size, self.Cs_.size) + ) + self.scores_[cls] = np.transpose(self.scores_[cls], (0, 2, 1)) + + self.n_iter_ = self.n_iter_.reshape( + (-1, len(folds), self.l1_ratios_.size, self.Cs_.size) + ) + self.n_iter_ = np.transpose(self.n_iter_, (0, 1, 3, 2)) + + return self + + def score(self, X, y, sample_weight=None, **score_params): + """Score using the `scoring` option on the given test data and labels. 
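When `scoring` was left as `None` at construction, this falls back to accuracy (see `_get_scorer` below). A minimal sketch of the interplay:

    from sklearn.linear_model import LogisticRegressionCV

    # The metric chosen for model selection is also the one `score` reports.
    clf = LogisticRegressionCV(scoring="neg_log_loss")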
+ + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Test samples. + + y : array-like of shape (n_samples,) + True labels for X. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + **score_params : dict + Parameters to pass to the `score` method of the underlying scorer. + + .. versionadded:: 1.4 + + Returns + ------- + score : float + Score of self.predict(X) w.r.t. y. + """ + _raise_for_params(score_params, self, "score") + + scoring = self._get_scorer() + if _routing_enabled(): + routed_params = process_routing( + self, + "score", + sample_weight=sample_weight, + **score_params, + ) + else: + routed_params = Bunch() + routed_params.scorer = Bunch(score={}) + if sample_weight is not None: + routed_params.scorer.score["sample_weight"] = sample_weight + + return scoring( + self, + X, + y, + **routed_params.scorer.score, + ) + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.4 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. + """ + + router = ( + MetadataRouter(owner=self.__class__.__name__) + .add_self_request(self) + .add( + splitter=self.cv, + method_mapping=MethodMapping().add(caller="fit", callee="split"), + ) + .add( + scorer=self._get_scorer(), + method_mapping=MethodMapping() + .add(caller="score", callee="score") + .add(caller="fit", callee="score"), + ) + ) + return router + + def _get_scorer(self): + """Get the scorer based on the scoring method specified. + The default scoring method is `accuracy`. + """ + scoring = self.scoring or "accuracy" + return get_scorer(scoring) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_omp.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_omp.py new file mode 100644 index 0000000000000000000000000000000000000000..2f4dbac2d7634b0fe4e6a02771e64f80adcf490b --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_omp.py @@ -0,0 +1,1121 @@ +"""Orthogonal matching pursuit algorithms""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from math import sqrt +from numbers import Integral, Real + +import numpy as np +from scipy import linalg +from scipy.linalg.lapack import get_lapack_funcs + +from ..base import MultiOutputMixin, RegressorMixin, _fit_context +from ..model_selection import check_cv +from ..utils import Bunch, as_float_array, check_array +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + _raise_for_params, + _routing_enabled, + process_routing, +) +from ..utils.parallel import Parallel, delayed +from ..utils.validation import validate_data +from ._base import LinearModel, _pre_fit + +premature = ( + "Orthogonal matching pursuit ended prematurely due to linear" + " dependence in the dictionary. The requested precision might" + " not have been met." +) + + +def _cholesky_omp(X, y, n_nonzero_coefs, tol=None, copy_X=True, return_path=False): + """Orthogonal Matching Pursuit step using the Cholesky decomposition. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Input dictionary. Columns are assumed to have unit norm. 
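As a sketch of the unit-norm assumption (shown with the public `orthogonal_mp` wrapper rather than this private helper), dictionary columns can be normalized up front:

    import numpy as np
    from sklearn.linear_model import orthogonal_mp

    rng = np.random.RandomState(0)
    X = rng.randn(50, 20)
    X /= np.linalg.norm(X, axis=0)              # unit-norm columns
    y = X[:, :3] @ np.array([1.0, -2.0, 0.5])   # sparse ground truth
    coef = orthogonal_mp(X, y, n_nonzero_coefs=3)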
+ + y : ndarray of shape (n_samples,) + Input targets. + + n_nonzero_coefs : int + Targeted number of non-zero elements. + + tol : float, default=None + Targeted squared error, if not None overrides n_nonzero_coefs. + + copy_X : bool, default=True + Whether the design matrix X must be copied by the algorithm. A false + value is only helpful if X is already Fortran-ordered, otherwise a + copy is made anyway. + + return_path : bool, default=False + Whether to return every value of the nonzero coefficients along the + forward path. Useful for cross-validation. + + Returns + ------- + gamma : ndarray of shape (n_nonzero_coefs,) + Non-zero elements of the solution. + + idx : ndarray of shape (n_nonzero_coefs,) + Indices of the positions of the elements in gamma within the solution + vector. + + coef : ndarray of shape (n_features, n_nonzero_coefs) + The first k values of column k correspond to the coefficient value + for the active features at that step. The lower left triangle contains + garbage. Only returned if ``return_path=True``. + + n_active : int + Number of active features at convergence. + """ + if copy_X: + X = X.copy("F") + else: # even if we are allowed to overwrite, still copy it if bad order + X = np.asfortranarray(X) + + min_float = np.finfo(X.dtype).eps + nrm2, swap = linalg.get_blas_funcs(("nrm2", "swap"), (X,)) + (potrs,) = get_lapack_funcs(("potrs",), (X,)) + + alpha = np.dot(X.T, y) + residual = y + gamma = np.empty(0) + n_active = 0 + indices = np.arange(X.shape[1]) # keeping track of swapping + + max_features = X.shape[1] if tol is not None else n_nonzero_coefs + + L = np.empty((max_features, max_features), dtype=X.dtype) + + if return_path: + coefs = np.empty_like(L) + + while True: + lam = np.argmax(np.abs(np.dot(X.T, residual))) + if lam < n_active or alpha[lam] ** 2 < min_float: + # atom already selected or inner product too small + warnings.warn(premature, RuntimeWarning, stacklevel=2) + break + + if n_active > 0: + # Updates the Cholesky decomposition of X' X + L[n_active, :n_active] = np.dot(X[:, :n_active].T, X[:, lam]) + linalg.solve_triangular( + L[:n_active, :n_active], + L[n_active, :n_active], + trans=0, + lower=1, + overwrite_b=True, + check_finite=False, + ) + v = nrm2(L[n_active, :n_active]) ** 2 + Lkk = linalg.norm(X[:, lam]) ** 2 - v + if Lkk <= min_float: # selected atoms are dependent + warnings.warn(premature, RuntimeWarning, stacklevel=2) + break + L[n_active, n_active] = sqrt(Lkk) + else: + L[0, 0] = linalg.norm(X[:, lam]) + + X.T[n_active], X.T[lam] = swap(X.T[n_active], X.T[lam]) + alpha[n_active], alpha[lam] = alpha[lam], alpha[n_active] + indices[n_active], indices[lam] = indices[lam], indices[n_active] + n_active += 1 + + # solves LL'x = X'y as a composition of two triangular systems + gamma, _ = potrs( + L[:n_active, :n_active], alpha[:n_active], lower=True, overwrite_b=False + ) + + if return_path: + coefs[:n_active, n_active - 1] = gamma + residual = y - np.dot(X[:, :n_active], gamma) + if tol is not None and nrm2(residual) ** 2 <= tol: + break + elif n_active == max_features: + break + + if return_path: + return gamma, indices[:n_active], coefs[:, :n_active], n_active + else: + return gamma, indices[:n_active], n_active + + +def _gram_omp( + Gram, + Xy, + n_nonzero_coefs, + tol_0=None, + tol=None, + copy_Gram=True, + copy_Xy=True, + return_path=False, +): + """Orthogonal Matching Pursuit step on a precomputed Gram matrix. + + This function uses the Cholesky decomposition method. 
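A minimal sketch of the precomputed inputs this Gram-based variant operates on (shown with the public `orthogonal_mp_gram` wrapper):

    import numpy as np
    from sklearn.linear_model import orthogonal_mp_gram

    rng = np.random.RandomState(0)
    X = rng.randn(50, 20)
    y = rng.randn(50)
    G = X.T @ X    # Gram matrix, shape (n_features, n_features)
    Xy = X.T @ y   # X.T * y, shape (n_features,)
    coef = orthogonal_mp_gram(G, Xy, n_nonzero_coefs=3)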
+ + Parameters + ---------- + Gram : ndarray of shape (n_features, n_features) + Gram matrix of the input data matrix. + + Xy : ndarray of shape (n_features,) + Input targets. + + n_nonzero_coefs : int + Targeted number of non-zero elements. + + tol_0 : float, default=None + Squared norm of y, required if tol is not None. + + tol : float, default=None + Targeted squared error, if not None overrides n_nonzero_coefs. + + copy_Gram : bool, default=True + Whether the gram matrix must be copied by the algorithm. A false + value is only helpful if it is already Fortran-ordered, otherwise a + copy is made anyway. + + copy_Xy : bool, default=True + Whether the covariance vector Xy must be copied by the algorithm. + If False, it may be overwritten. + + return_path : bool, default=False + Whether to return every value of the nonzero coefficients along the + forward path. Useful for cross-validation. + + Returns + ------- + gamma : ndarray of shape (n_nonzero_coefs,) + Non-zero elements of the solution. + + idx : ndarray of shape (n_nonzero_coefs,) + Indices of the positions of the elements in gamma within the solution + vector. + + coefs : ndarray of shape (n_features, n_nonzero_coefs) + The first k values of column k correspond to the coefficient value + for the active features at that step. The lower left triangle contains + garbage. Only returned if ``return_path=True``. + + n_active : int + Number of active features at convergence. + """ + Gram = Gram.copy("F") if copy_Gram else np.asfortranarray(Gram) + + if copy_Xy or not Xy.flags.writeable: + Xy = Xy.copy() + + min_float = np.finfo(Gram.dtype).eps + nrm2, swap = linalg.get_blas_funcs(("nrm2", "swap"), (Gram,)) + (potrs,) = get_lapack_funcs(("potrs",), (Gram,)) + + indices = np.arange(len(Gram)) # keeping track of swapping + alpha = Xy + tol_curr = tol_0 + delta = 0 + gamma = np.empty(0) + n_active = 0 + + max_features = len(Gram) if tol is not None else n_nonzero_coefs + + L = np.empty((max_features, max_features), dtype=Gram.dtype) + + L[0, 0] = 1.0 + if return_path: + coefs = np.empty_like(L) + + while True: + lam = np.argmax(np.abs(alpha)) + if lam < n_active or alpha[lam] ** 2 < min_float: + # selected same atom twice, or inner product too small + warnings.warn(premature, RuntimeWarning, stacklevel=3) + break + if n_active > 0: + L[n_active, :n_active] = Gram[lam, :n_active] + linalg.solve_triangular( + L[:n_active, :n_active], + L[n_active, :n_active], + trans=0, + lower=1, + overwrite_b=True, + check_finite=False, + ) + v = nrm2(L[n_active, :n_active]) ** 2 + Lkk = Gram[lam, lam] - v + if Lkk <= min_float: # selected atoms are dependent + warnings.warn(premature, RuntimeWarning, stacklevel=3) + break + L[n_active, n_active] = sqrt(Lkk) + else: + L[0, 0] = sqrt(Gram[lam, lam]) + + Gram[n_active], Gram[lam] = swap(Gram[n_active], Gram[lam]) + Gram.T[n_active], Gram.T[lam] = swap(Gram.T[n_active], Gram.T[lam]) + indices[n_active], indices[lam] = indices[lam], indices[n_active] + Xy[n_active], Xy[lam] = Xy[lam], Xy[n_active] + n_active += 1 + # solves LL'x = X'y as a composition of two triangular systems + gamma, _ = potrs( + L[:n_active, :n_active], Xy[:n_active], lower=True, overwrite_b=False + ) + if return_path: + coefs[:n_active, n_active - 1] = gamma + beta = np.dot(Gram[:, :n_active], gamma) + alpha = Xy - beta + if tol is not None: + tol_curr += delta + delta = np.inner(gamma, beta[:n_active]) + tol_curr -= delta + if abs(tol_curr) <= tol: + break + elif n_active == max_features: + break + + if return_path: + return gamma, 
indices[:n_active], coefs[:, :n_active], n_active + else: + return gamma, indices[:n_active], n_active + + +@validate_params( + { + "X": ["array-like"], + "y": [np.ndarray], + "n_nonzero_coefs": [Interval(Integral, 1, None, closed="left"), None], + "tol": [Interval(Real, 0, None, closed="left"), None], + "precompute": ["boolean", StrOptions({"auto"})], + "copy_X": ["boolean"], + "return_path": ["boolean"], + "return_n_iter": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def orthogonal_mp( + X, + y, + *, + n_nonzero_coefs=None, + tol=None, + precompute=False, + copy_X=True, + return_path=False, + return_n_iter=False, +): + r"""Orthogonal Matching Pursuit (OMP). + + Solves n_targets Orthogonal Matching Pursuit problems. + An instance of the problem has the form: + + When parametrized by the number of non-zero coefficients using + `n_nonzero_coefs`: + argmin ||y - X\gamma||^2 subject to ||\gamma||_0 <= n_{nonzero coefs} + + When parametrized by error using the parameter `tol`: + argmin ||\gamma||_0 subject to ||y - X\gamma||^2 <= tol + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Input data. Columns are assumed to have unit norm. + + y : ndarray of shape (n_samples,) or (n_samples, n_targets) + Input targets. + + n_nonzero_coefs : int, default=None + Desired number of non-zero entries in the solution. If None (by + default) this value is set to 10% of n_features. + + tol : float, default=None + Maximum squared norm of the residual. If not None, overrides n_nonzero_coefs. + + precompute : 'auto' or bool, default=False + Whether to perform precomputations. Improves performance when n_targets + or n_samples is very large. + + copy_X : bool, default=True + Whether the design matrix X must be copied by the algorithm. A false + value is only helpful if X is already Fortran-ordered, otherwise a + copy is made anyway. + + return_path : bool, default=False + Whether to return every value of the nonzero coefficients along the + forward path. Useful for cross-validation. + + return_n_iter : bool, default=False + Whether or not to return the number of iterations. + + Returns + ------- + coef : ndarray of shape (n_features,) or (n_features, n_targets) + Coefficients of the OMP solution. If `return_path=True`, this contains + the whole coefficient path. In this case its shape is + (n_features, n_features) or (n_features, n_targets, n_features) and + iterating over the last axis generates coefficients in increasing order + of active features. + + n_iters : array-like or int + Number of active features across every target. Returned only if + `return_n_iter` is set to True. + + See Also + -------- + OrthogonalMatchingPursuit : Orthogonal Matching Pursuit model. + orthogonal_mp_gram : Solve OMP problems using Gram matrix and the product X.T * y. + lars_path : Compute Least Angle Regression or Lasso path using LARS algorithm. + sklearn.decomposition.sparse_encode : Sparse coding. + + Notes + ----- + Orthogonal matching pursuit was introduced in S. Mallat, Z. Zhang, + Matching pursuits with time-frequency dictionaries, IEEE Transactions on + Signal Processing, Vol. 41, No. 12. (December 1993), pp. 3397-3415. + (https://www.di.ens.fr/~mallat/papiers/MallatPursuit93.pdf) + + This implementation is based on Rubinstein, R., Zibulevsky, M. and Elad, + M., Efficient Implementation of the K-SVD Algorithm using Batch Orthogonal + Matching Pursuit Technical Report - CS Technion, April 2008. 
+ https://www.cs.technion.ac.il/~ronrubin/Publications/KSVD-OMP-v2.pdf + + Examples + -------- + >>> from sklearn.datasets import make_regression + >>> from sklearn.linear_model import orthogonal_mp + >>> X, y = make_regression(noise=4, random_state=0) + >>> coef = orthogonal_mp(X, y) + >>> coef.shape + (100,) + >>> X[:1,] @ coef + array([-78.68]) + """ + X = check_array(X, order="F", copy=copy_X) + copy_X = False + if y.ndim == 1: + y = y.reshape(-1, 1) + y = check_array(y) + if y.shape[1] > 1: # subsequent targets will be affected + copy_X = True + if n_nonzero_coefs is None and tol is None: + # default for n_nonzero_coefs is 0.1 * n_features + # but at least one. + n_nonzero_coefs = max(int(0.1 * X.shape[1]), 1) + if tol is None and n_nonzero_coefs > X.shape[1]: + raise ValueError( + "The number of atoms cannot be more than the number of features" + ) + if precompute == "auto": + precompute = X.shape[0] > X.shape[1] + if precompute: + G = np.dot(X.T, X) + G = np.asfortranarray(G) + Xy = np.dot(X.T, y) + if tol is not None: + norms_squared = np.sum((y**2), axis=0) + else: + norms_squared = None + return orthogonal_mp_gram( + G, + Xy, + n_nonzero_coefs=n_nonzero_coefs, + tol=tol, + norms_squared=norms_squared, + copy_Gram=copy_X, + copy_Xy=False, + return_path=return_path, + ) + + if return_path: + coef = np.zeros((X.shape[1], y.shape[1], X.shape[1])) + else: + coef = np.zeros((X.shape[1], y.shape[1])) + n_iters = [] + + for k in range(y.shape[1]): + out = _cholesky_omp( + X, y[:, k], n_nonzero_coefs, tol, copy_X=copy_X, return_path=return_path + ) + if return_path: + _, idx, coefs, n_iter = out + coef = coef[:, :, : len(idx)] + for n_active, x in enumerate(coefs.T): + coef[idx[: n_active + 1], k, n_active] = x[: n_active + 1] + else: + x, idx, n_iter = out + coef[idx, k] = x + n_iters.append(n_iter) + + if y.shape[1] == 1: + n_iters = n_iters[0] + + if return_n_iter: + return np.squeeze(coef), n_iters + else: + return np.squeeze(coef) + + +@validate_params( + { + "Gram": ["array-like"], + "Xy": ["array-like"], + "n_nonzero_coefs": [Interval(Integral, 0, None, closed="neither"), None], + "tol": [Interval(Real, 0, None, closed="left"), None], + "norms_squared": ["array-like", None], + "copy_Gram": ["boolean"], + "copy_Xy": ["boolean"], + "return_path": ["boolean"], + "return_n_iter": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def orthogonal_mp_gram( + Gram, + Xy, + *, + n_nonzero_coefs=None, + tol=None, + norms_squared=None, + copy_Gram=True, + copy_Xy=True, + return_path=False, + return_n_iter=False, +): + """Gram Orthogonal Matching Pursuit (OMP). + + Solves n_targets Orthogonal Matching Pursuit problems using only + the Gram matrix X.T * X and the product X.T * y. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + Gram : array-like of shape (n_features, n_features) + Gram matrix of the input data: `X.T * X`. + + Xy : array-like of shape (n_features,) or (n_features, n_targets) + Input targets multiplied by `X`: `X.T * y`. + + n_nonzero_coefs : int, default=None + Desired number of non-zero entries in the solution. If `None` (by + default) this value is set to 10% of n_features. + + tol : float, default=None + Maximum squared norm of the residual. If not `None`, + overrides `n_nonzero_coefs`. + + norms_squared : array-like of shape (n_targets,), default=None + Squared L2 norms of the lines of `y`. Required if `tol` is not None. + + copy_Gram : bool, default=True + Whether the gram matrix must be copied by the algorithm. 
A `False` + value is only helpful if it is already Fortran-ordered, otherwise a + copy is made anyway. + + copy_Xy : bool, default=True + Whether the covariance vector `Xy` must be copied by the algorithm. + If `False`, it may be overwritten. + + return_path : bool, default=False + Whether to return every value of the nonzero coefficients along the + forward path. Useful for cross-validation. + + return_n_iter : bool, default=False + Whether or not to return the number of iterations. + + Returns + ------- + coef : ndarray of shape (n_features,) or (n_features, n_targets) + Coefficients of the OMP solution. If `return_path=True`, this contains + the whole coefficient path. In this case its shape is + `(n_features, n_features)` or `(n_features, n_targets, n_features)` and + iterating over the last axis yields coefficients in increasing order + of active features. + + n_iters : list or int + Number of active features across every target. Returned only if + `return_n_iter` is set to True. + + See Also + -------- + OrthogonalMatchingPursuit : Orthogonal Matching Pursuit model (OMP). + orthogonal_mp : Solves n_targets Orthogonal Matching Pursuit problems. + lars_path : Compute Least Angle Regression or Lasso path using + LARS algorithm. + sklearn.decomposition.sparse_encode : Generic sparse coding. + Each column of the result is the solution to a Lasso problem. + + Notes + ----- + Orthogonal matching pursuit was introduced in G. Mallat, Z. Zhang, + Matching pursuits with time-frequency dictionaries, IEEE Transactions on + Signal Processing, Vol. 41, No. 12. (December 1993), pp. 3397-3415. + (https://www.di.ens.fr/~mallat/papiers/MallatPursuit93.pdf) + + This implementation is based on Rubinstein, R., Zibulevsky, M. and Elad, + M., Efficient Implementation of the K-SVD Algorithm using Batch Orthogonal + Matching Pursuit Technical Report - CS Technion, April 2008. + https://www.cs.technion.ac.il/~ronrubin/Publications/KSVD-OMP-v2.pdf + + Examples + -------- + >>> from sklearn.datasets import make_regression + >>> from sklearn.linear_model import orthogonal_mp_gram + >>> X, y = make_regression(noise=4, random_state=0) + >>> coef = orthogonal_mp_gram(X.T @ X, X.T @ y) + >>> coef.shape + (100,) + >>> X[:1,] @ coef + array([-78.68]) + """ + Gram = check_array(Gram, order="F", copy=copy_Gram) + Xy = np.asarray(Xy) + if Xy.ndim > 1 and Xy.shape[1] > 1: + # or subsequent target will be affected + copy_Gram = True + if Xy.ndim == 1: + Xy = Xy[:, np.newaxis] + if tol is not None: + norms_squared = [norms_squared] + if copy_Xy or not Xy.flags.writeable: + # Make the copy once instead of many times in _gram_omp itself. + Xy = Xy.copy() + + if n_nonzero_coefs is None and tol is None: + n_nonzero_coefs = int(0.1 * len(Gram)) + if tol is not None and norms_squared is None: + raise ValueError( + "Gram OMP needs the precomputed norms in order " + "to evaluate the error sum of squares." 
+ ) + if tol is not None and tol < 0: + raise ValueError("Epsilon cannot be negative") + if tol is None and n_nonzero_coefs <= 0: + raise ValueError("The number of atoms must be positive") + if tol is None and n_nonzero_coefs > len(Gram): + raise ValueError( + "The number of atoms cannot be more than the number of features" + ) + + if return_path: + coef = np.zeros((len(Gram), Xy.shape[1], len(Gram)), dtype=Gram.dtype) + else: + coef = np.zeros((len(Gram), Xy.shape[1]), dtype=Gram.dtype) + + n_iters = [] + for k in range(Xy.shape[1]): + out = _gram_omp( + Gram, + Xy[:, k], + n_nonzero_coefs, + norms_squared[k] if tol is not None else None, + tol, + copy_Gram=copy_Gram, + copy_Xy=False, + return_path=return_path, + ) + if return_path: + _, idx, coefs, n_iter = out + coef = coef[:, :, : len(idx)] + for n_active, x in enumerate(coefs.T): + coef[idx[: n_active + 1], k, n_active] = x[: n_active + 1] + else: + x, idx, n_iter = out + coef[idx, k] = x + n_iters.append(n_iter) + + if Xy.shape[1] == 1: + n_iters = n_iters[0] + + if return_n_iter: + return np.squeeze(coef), n_iters + else: + return np.squeeze(coef) + + +class OrthogonalMatchingPursuit(MultiOutputMixin, RegressorMixin, LinearModel): + """Orthogonal Matching Pursuit model (OMP). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_nonzero_coefs : int, default=None + Desired number of non-zero entries in the solution. Ignored if `tol` is set. + When `None` and `tol` is also `None`, this value is either set to 10% of + `n_features` or 1, whichever is greater. + + tol : float, default=None + Maximum squared norm of the residual. If not None, overrides n_nonzero_coefs. + + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to false, no intercept will be used in calculations + (i.e. data is expected to be centered). + + precompute : 'auto' or bool, default='auto' + Whether to use a precomputed Gram and Xy matrix to speed up + calculations. Improves performance when :term:`n_targets` or + :term:`n_samples` is very large. Note that if you already have such + matrices, you can pass them directly to the fit method. + + Attributes + ---------- + coef_ : ndarray of shape (n_features,) or (n_targets, n_features) + Parameter vector (w in the formula). + + intercept_ : float or ndarray of shape (n_targets,) + Independent term in decision function. + + n_iter_ : int or array-like + Number of active features across every target. + + n_nonzero_coefs_ : int or None + The number of non-zero coefficients in the solution or `None` when `tol` is + set. If `n_nonzero_coefs` is None and `tol` is None this value is either set + to 10% of `n_features` or 1, whichever is greater. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + orthogonal_mp : Solves n_targets Orthogonal Matching Pursuit problems. + orthogonal_mp_gram : Solves n_targets Orthogonal Matching Pursuit + problems using only the Gram matrix X.T * X and the product X.T * y. + lars_path : Compute Least Angle Regression or Lasso path using LARS algorithm. + Lars : Least Angle Regression model a.k.a. LAR. + LassoLars : Lasso model fit with Least Angle Regression a.k.a. Lars. + sklearn.decomposition.sparse_encode : Generic sparse coding. 
+ Each column of the result is the solution to a Lasso problem. + OrthogonalMatchingPursuitCV : Cross-validated + Orthogonal Matching Pursuit model (OMP). + + Notes + ----- + Orthogonal matching pursuit was introduced in G. Mallat, Z. Zhang, + Matching pursuits with time-frequency dictionaries, IEEE Transactions on + Signal Processing, Vol. 41, No. 12. (December 1993), pp. 3397-3415. + (https://www.di.ens.fr/~mallat/papiers/MallatPursuit93.pdf) + + This implementation is based on Rubinstein, R., Zibulevsky, M. and Elad, + M., Efficient Implementation of the K-SVD Algorithm using Batch Orthogonal + Matching Pursuit Technical Report - CS Technion, April 2008. + https://www.cs.technion.ac.il/~ronrubin/Publications/KSVD-OMP-v2.pdf + + Examples + -------- + >>> from sklearn.linear_model import OrthogonalMatchingPursuit + >>> from sklearn.datasets import make_regression + >>> X, y = make_regression(noise=4, random_state=0) + >>> reg = OrthogonalMatchingPursuit().fit(X, y) + >>> reg.score(X, y) + 0.9991 + >>> reg.predict(X[:1,]) + array([-78.3854]) + """ + + _parameter_constraints: dict = { + "n_nonzero_coefs": [Interval(Integral, 1, None, closed="left"), None], + "tol": [Interval(Real, 0, None, closed="left"), None], + "fit_intercept": ["boolean"], + "precompute": [StrOptions({"auto"}), "boolean"], + } + + def __init__( + self, + *, + n_nonzero_coefs=None, + tol=None, + fit_intercept=True, + precompute="auto", + ): + self.n_nonzero_coefs = n_nonzero_coefs + self.tol = tol + self.fit_intercept = fit_intercept + self.precompute = precompute + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y): + """Fit the model using X, y as training data. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) or (n_samples, n_targets) + Target values. Will be cast to X's dtype if necessary. + + Returns + ------- + self : object + Returns an instance of self. + """ + X, y = validate_data(self, X, y, multi_output=True, y_numeric=True) + n_features = X.shape[1] + + X, y, X_offset, y_offset, X_scale, Gram, Xy = _pre_fit( + X, y, None, self.precompute, self.fit_intercept, copy=True + ) + + if y.ndim == 1: + y = y[:, np.newaxis] + + if self.n_nonzero_coefs is None and self.tol is None: + # default for n_nonzero_coefs is 0.1 * n_features + # but at least one. + self.n_nonzero_coefs_ = max(int(0.1 * n_features), 1) + elif self.tol is not None: + self.n_nonzero_coefs_ = None + else: + self.n_nonzero_coefs_ = self.n_nonzero_coefs + + if Gram is False: + coef_, self.n_iter_ = orthogonal_mp( + X, + y, + n_nonzero_coefs=self.n_nonzero_coefs_, + tol=self.tol, + precompute=False, + copy_X=True, + return_n_iter=True, + ) + else: + norms_sq = np.sum(y**2, axis=0) if self.tol is not None else None + + coef_, self.n_iter_ = orthogonal_mp_gram( + Gram, + Xy=Xy, + n_nonzero_coefs=self.n_nonzero_coefs_, + tol=self.tol, + norms_squared=norms_sq, + copy_Gram=True, + copy_Xy=True, + return_n_iter=True, + ) + self.coef_ = coef_.T + self._set_intercept(X_offset, y_offset, X_scale) + return self + + +def _omp_path_residues( + X_train, + y_train, + X_test, + y_test, + copy=True, + fit_intercept=True, + max_iter=100, +): + """Compute the residues on left-out data for a full LARS path. + + Parameters + ---------- + X_train : ndarray of shape (n_samples, n_features) + The data to fit the LARS on. + + y_train : ndarray of shape (n_samples) + The target variable to fit LARS on. 
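A rough sketch of how these per-fold residue paths are reduced to a sparsity choice; it mirrors the selection step in `OrthogonalMatchingPursuitCV.fit` further below:

    import numpy as np

    def pick_n_nonzero_coefs(cv_paths):
        # Truncate every fold to the shortest path, then average squared
        # residues over held-out samples to get one MSE per sparsity level.
        min_len = min(fold.shape[0] for fold in cv_paths)
        mse = np.array([(fold[:min_len] ** 2).mean(axis=1) for fold in cv_paths])
        return int(np.argmin(mse.mean(axis=0)) + 1)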
+ + X_test : ndarray of shape (n_samples, n_features) + The data to compute the residues on. + + y_test : ndarray of shape (n_samples) + The target variable to compute the residues on. + + copy : bool, default=True + Whether X_train, X_test, y_train and y_test should be copied. If + False, they may be overwritten. + + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to false, no intercept will be used in calculations + (i.e. data is expected to be centered). + + max_iter : int, default=100 + Maximum numbers of iterations to perform, therefore maximum features + to include. 100 by default. + + Returns + ------- + residues : ndarray of shape (n_samples, max_features) + Residues of the prediction on the test data. + """ + + if copy: + X_train = X_train.copy() + y_train = y_train.copy() + X_test = X_test.copy() + y_test = y_test.copy() + + if fit_intercept: + X_mean = X_train.mean(axis=0) + X_train -= X_mean + X_test -= X_mean + y_mean = y_train.mean(axis=0) + y_train = as_float_array(y_train, copy=False) + y_train -= y_mean + y_test = as_float_array(y_test, copy=False) + y_test -= y_mean + + coefs = orthogonal_mp( + X_train, + y_train, + n_nonzero_coefs=max_iter, + tol=None, + precompute=False, + copy_X=False, + return_path=True, + ) + if coefs.ndim == 1: + coefs = coefs[:, np.newaxis] + + return np.dot(coefs.T, X_test.T) - y_test + + +class OrthogonalMatchingPursuitCV(RegressorMixin, LinearModel): + """Cross-validated Orthogonal Matching Pursuit model (OMP). + + See glossary entry for :term:`cross-validation estimator`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + copy : bool, default=True + Whether the design matrix X must be copied by the algorithm. A false + value is only helpful if X is already Fortran-ordered, otherwise a + copy is made anyway. + + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to false, no intercept will be used in calculations + (i.e. data is expected to be centered). + + max_iter : int, default=None + Maximum numbers of iterations to perform, therefore maximum features + to include. 10% of ``n_features`` but at least 5 if available. + + cv : int, cross-validation generator or iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross-validation, + - integer, to specify the number of folds. + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, :class:`~sklearn.model_selection.KFold` is used. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + ``cv`` default value if None changed from 3-fold to 5-fold. + + n_jobs : int, default=None + Number of CPUs to use during the cross validation. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + verbose : bool or int, default=False + Sets the verbosity amount. + + Attributes + ---------- + intercept_ : float or ndarray of shape (n_targets,) + Independent term in decision function. + + coef_ : ndarray of shape (n_features,) or (n_targets, n_features) + Parameter vector (w in the problem formulation). + + n_nonzero_coefs_ : int + Estimated number of non-zero coefficients giving the best mean squared + error over the cross-validation folds. 
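As the Notes section below spells out, once cross-validation has chosen `n_nonzero_coefs_`, the final model is a plain OMP refit on the full training set; a sketch of that equivalence:

    from sklearn.datasets import make_regression
    from sklearn.linear_model import (
        OrthogonalMatchingPursuit,
        OrthogonalMatchingPursuitCV,
    )

    X, y = make_regression(n_features=100, n_informative=10, noise=4,
                           random_state=0)
    reg = OrthogonalMatchingPursuitCV(cv=5).fit(X, y)
    # Essentially the refit the CV estimator performs internally:
    final = OrthogonalMatchingPursuit(
        n_nonzero_coefs=int(reg.n_nonzero_coefs_)
    ).fit(X, y)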
+ + n_iter_ : int or array-like + Number of active features across every target for the model refit with + the best hyperparameters got by cross-validating across all folds. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + orthogonal_mp : Solves n_targets Orthogonal Matching Pursuit problems. + orthogonal_mp_gram : Solves n_targets Orthogonal Matching Pursuit + problems using only the Gram matrix X.T * X and the product X.T * y. + lars_path : Compute Least Angle Regression or Lasso path using LARS algorithm. + Lars : Least Angle Regression model a.k.a. LAR. + LassoLars : Lasso model fit with Least Angle Regression a.k.a. Lars. + OrthogonalMatchingPursuit : Orthogonal Matching Pursuit model (OMP). + LarsCV : Cross-validated Least Angle Regression model. + LassoLarsCV : Cross-validated Lasso model fit with Least Angle Regression. + sklearn.decomposition.sparse_encode : Generic sparse coding. + Each column of the result is the solution to a Lasso problem. + + Notes + ----- + In `fit`, once the optimal number of non-zero coefficients is found through + cross-validation, the model is fit again using the entire training set. + + Examples + -------- + >>> from sklearn.linear_model import OrthogonalMatchingPursuitCV + >>> from sklearn.datasets import make_regression + >>> X, y = make_regression(n_features=100, n_informative=10, + ... noise=4, random_state=0) + >>> reg = OrthogonalMatchingPursuitCV(cv=5).fit(X, y) + >>> reg.score(X, y) + 0.9991 + >>> reg.n_nonzero_coefs_ + np.int64(10) + >>> reg.predict(X[:1,]) + array([-78.3854]) + """ + + _parameter_constraints: dict = { + "copy": ["boolean"], + "fit_intercept": ["boolean"], + "max_iter": [Interval(Integral, 0, None, closed="left"), None], + "cv": ["cv_object"], + "n_jobs": [Integral, None], + "verbose": ["verbose"], + } + + def __init__( + self, + *, + copy=True, + fit_intercept=True, + max_iter=None, + cv=None, + n_jobs=None, + verbose=False, + ): + self.copy = copy + self.fit_intercept = fit_intercept + self.max_iter = max_iter + self.cv = cv + self.n_jobs = n_jobs + self.verbose = verbose + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, **fit_params): + """Fit the model using X, y as training data. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) + Target values. Will be cast to X's dtype if necessary. + + **fit_params : dict + Parameters to pass to the underlying splitter. + + .. versionadded:: 1.4 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + Returns an instance of self. + """ + _raise_for_params(fit_params, self, "fit") + + X, y = validate_data(self, X, y, y_numeric=True, ensure_min_features=2) + X = as_float_array(X, copy=False, ensure_all_finite=False) + cv = check_cv(self.cv, classifier=False) + if _routing_enabled(): + routed_params = process_routing(self, "fit", **fit_params) + else: + # TODO(SLEP6): remove when metadata routing cannot be disabled. 
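            # Without metadata routing enabled, no extra parameters are
            # forwarded to the CV splitter.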
+ routed_params = Bunch() + routed_params.splitter = Bunch(split={}) + max_iter = ( + min(max(int(0.1 * X.shape[1]), 5), X.shape[1]) + if not self.max_iter + else self.max_iter + ) + cv_paths = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)( + delayed(_omp_path_residues)( + X[train], + y[train], + X[test], + y[test], + self.copy, + self.fit_intercept, + max_iter, + ) + for train, test in cv.split(X, **routed_params.splitter.split) + ) + + min_early_stop = min(fold.shape[0] for fold in cv_paths) + mse_folds = np.array( + [(fold[:min_early_stop] ** 2).mean(axis=1) for fold in cv_paths] + ) + best_n_nonzero_coefs = np.argmin(mse_folds.mean(axis=0)) + 1 + self.n_nonzero_coefs_ = best_n_nonzero_coefs + omp = OrthogonalMatchingPursuit( + n_nonzero_coefs=best_n_nonzero_coefs, + fit_intercept=self.fit_intercept, + ).fit(X, y) + + self.coef_ = omp.coef_ + self.intercept_ = omp.intercept_ + self.n_iter_ = omp.n_iter_ + return self + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.4 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. + """ + + router = MetadataRouter(owner=self.__class__.__name__).add( + splitter=self.cv, + method_mapping=MethodMapping().add(caller="fit", callee="split"), + ) + return router diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_passive_aggressive.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_passive_aggressive.py new file mode 100644 index 0000000000000000000000000000000000000000..61eb06edae85f9c6d04a94c070cd71c1bbbcaa3b --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_passive_aggressive.py @@ -0,0 +1,573 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from numbers import Real + +from ..base import _fit_context +from ..utils._param_validation import Interval, StrOptions +from ._stochastic_gradient import DEFAULT_EPSILON, BaseSGDClassifier, BaseSGDRegressor + + +class PassiveAggressiveClassifier(BaseSGDClassifier): + """Passive Aggressive Classifier. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + C : float, default=1.0 + Maximum step size (regularization). Defaults to 1.0. + + fit_intercept : bool, default=True + Whether the intercept should be estimated or not. If False, the + data is assumed to be already centered. + + max_iter : int, default=1000 + The maximum number of passes over the training data (aka epochs). + It only impacts the behavior in the ``fit`` method, and not the + :meth:`~sklearn.linear_model.PassiveAggressiveClassifier.partial_fit` method. + + .. versionadded:: 0.19 + + tol : float or None, default=1e-3 + The stopping criterion. If it is not None, the iterations will stop + when (loss > previous_loss - tol). + + .. versionadded:: 0.19 + + early_stopping : bool, default=False + Whether to use early stopping to terminate training when validation + score is not improving. If set to True, it will automatically set aside + a stratified fraction of training data as validation and terminate + training when validation score is not improving by at least `tol` for + `n_iter_no_change` consecutive epochs. + + .. versionadded:: 0.20 + + validation_fraction : float, default=0.1 + The proportion of training data to set aside as validation set for + early stopping. Must be between 0 and 1. 
+ Only used if early_stopping is True. + + .. versionadded:: 0.20 + + n_iter_no_change : int, default=5 + Number of iterations with no improvement to wait before early stopping. + + .. versionadded:: 0.20 + + shuffle : bool, default=True + Whether or not the training data should be shuffled after each epoch. + + verbose : int, default=0 + The verbosity level. + + loss : str, default="hinge" + The loss function to be used: + hinge: equivalent to PA-I in the reference paper. + squared_hinge: equivalent to PA-II in the reference paper. + + n_jobs : int or None, default=None + The number of CPUs to use to do the OVA (One Versus All, for + multi-class problems) computation. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + random_state : int, RandomState instance, default=None + Used to shuffle the training data, when ``shuffle`` is set to + ``True``. Pass an int for reproducible output across multiple + function calls. + See :term:`Glossary `. + + warm_start : bool, default=False + When set to True, reuse the solution of the previous call to fit as + initialization, otherwise, just erase the previous solution. + See :term:`the Glossary `. + + Repeatedly calling fit or partial_fit when warm_start is True can + result in a different solution than when calling fit a single time + because of the way the data is shuffled. + + class_weight : dict, {class_label: weight} or "balanced" or None, \ + default=None + Preset for the class_weight fit parameter. + + Weights associated with classes. If not given, all classes + are supposed to have weight one. + + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))``. + + .. versionadded:: 0.17 + parameter *class_weight* to automatically weight samples. + + average : bool or int, default=False + When set to True, computes the averaged SGD weights and stores the + result in the ``coef_`` attribute. If set to an int greater than 1, + averaging will begin once the total number of samples seen reaches + average. So average=10 will begin averaging after seeing 10 samples. + + .. versionadded:: 0.19 + parameter *average* to use weights averaging in SGD. + + Attributes + ---------- + coef_ : ndarray of shape (1, n_features) if n_classes == 2 else \ + (n_classes, n_features) + Weights assigned to the features. + + intercept_ : ndarray of shape (1,) if n_classes == 2 else (n_classes,) + Constants in decision function. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + The actual number of iterations to reach the stopping criterion. + For multiclass fits, it is the maximum over every binary fit. + + classes_ : ndarray of shape (n_classes,) + The unique classes labels. + + t_ : int + Number of weight updates performed during training. + Same as ``(n_iter_ * n_samples + 1)``. + + See Also + -------- + SGDClassifier : Incrementally trained logistic regression. + Perceptron : Linear perceptron classifier. + + References + ---------- + Online Passive-Aggressive Algorithms + + K. Crammer, O. Dekel, J. Keshat, S. Shalev-Shwartz, Y. 
Singer - JMLR (2006) + + Examples + -------- + >>> from sklearn.linear_model import PassiveAggressiveClassifier + >>> from sklearn.datasets import make_classification + >>> X, y = make_classification(n_features=4, random_state=0) + >>> clf = PassiveAggressiveClassifier(max_iter=1000, random_state=0, + ... tol=1e-3) + >>> clf.fit(X, y) + PassiveAggressiveClassifier(random_state=0) + >>> print(clf.coef_) + [[0.26642044 0.45070924 0.67251877 0.64185414]] + >>> print(clf.intercept_) + [1.84127814] + >>> print(clf.predict([[0, 0, 0, 0]])) + [1] + """ + + _parameter_constraints: dict = { + **BaseSGDClassifier._parameter_constraints, + "loss": [StrOptions({"hinge", "squared_hinge"})], + "C": [Interval(Real, 0, None, closed="right")], + } + + def __init__( + self, + *, + C=1.0, + fit_intercept=True, + max_iter=1000, + tol=1e-3, + early_stopping=False, + validation_fraction=0.1, + n_iter_no_change=5, + shuffle=True, + verbose=0, + loss="hinge", + n_jobs=None, + random_state=None, + warm_start=False, + class_weight=None, + average=False, + ): + super().__init__( + penalty=None, + fit_intercept=fit_intercept, + max_iter=max_iter, + tol=tol, + early_stopping=early_stopping, + validation_fraction=validation_fraction, + n_iter_no_change=n_iter_no_change, + shuffle=shuffle, + verbose=verbose, + random_state=random_state, + eta0=1.0, + warm_start=warm_start, + class_weight=class_weight, + average=average, + n_jobs=n_jobs, + ) + + self.C = C + self.loss = loss + + @_fit_context(prefer_skip_nested_validation=True) + def partial_fit(self, X, y, classes=None): + """Fit linear model with Passive Aggressive algorithm. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Subset of the training data. + + y : array-like of shape (n_samples,) + Subset of the target values. + + classes : ndarray of shape (n_classes,) + Classes across all calls to partial_fit. + Can be obtained by via `np.unique(y_all)`, where y_all is the + target vector of the entire dataset. + This argument is required for the first call to partial_fit + and can be omitted in the subsequent calls. + Note that y doesn't need to contain all labels in `classes`. + + Returns + ------- + self : object + Fitted estimator. + """ + if not hasattr(self, "classes_"): + self._more_validate_params(for_partial_fit=True) + + if self.class_weight == "balanced": + raise ValueError( + "class_weight 'balanced' is not supported for " + "partial_fit. For 'balanced' weights, use " + "`sklearn.utils.compute_class_weight` with " + "`class_weight='balanced'`. In place of y you " + "can use a large enough subset of the full " + "training set target to properly estimate the " + "class frequency distributions. Pass the " + "resulting weights as the class_weight " + "parameter." + ) + + lr = "pa1" if self.loss == "hinge" else "pa2" + return self._partial_fit( + X, + y, + alpha=1.0, + C=self.C, + loss="hinge", + learning_rate=lr, + max_iter=1, + classes=classes, + sample_weight=None, + coef_init=None, + intercept_init=None, + ) + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, coef_init=None, intercept_init=None): + """Fit linear model with Passive Aggressive algorithm. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) + Target values. + + coef_init : ndarray of shape (n_classes, n_features) + The initial coefficients to warm-start the optimization. 
+ + intercept_init : ndarray of shape (n_classes,) + The initial intercept to warm-start the optimization. + + Returns + ------- + self : object + Fitted estimator. + """ + self._more_validate_params() + + lr = "pa1" if self.loss == "hinge" else "pa2" + return self._fit( + X, + y, + alpha=1.0, + C=self.C, + loss="hinge", + learning_rate=lr, + coef_init=coef_init, + intercept_init=intercept_init, + ) + + +class PassiveAggressiveRegressor(BaseSGDRegressor): + """Passive Aggressive Regressor. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + + C : float, default=1.0 + Maximum step size (regularization). Defaults to 1.0. + + fit_intercept : bool, default=True + Whether the intercept should be estimated or not. If False, the + data is assumed to be already centered. Defaults to True. + + max_iter : int, default=1000 + The maximum number of passes over the training data (aka epochs). + It only impacts the behavior in the ``fit`` method, and not the + :meth:`~sklearn.linear_model.PassiveAggressiveRegressor.partial_fit` method. + + .. versionadded:: 0.19 + + tol : float or None, default=1e-3 + The stopping criterion. If it is not None, the iterations will stop + when (loss > previous_loss - tol). + + .. versionadded:: 0.19 + + early_stopping : bool, default=False + Whether to use early stopping to terminate training when validation. + score is not improving. If set to True, it will automatically set aside + a fraction of training data as validation and terminate + training when validation score is not improving by at least tol for + n_iter_no_change consecutive epochs. + + .. versionadded:: 0.20 + + validation_fraction : float, default=0.1 + The proportion of training data to set aside as validation set for + early stopping. Must be between 0 and 1. + Only used if early_stopping is True. + + .. versionadded:: 0.20 + + n_iter_no_change : int, default=5 + Number of iterations with no improvement to wait before early stopping. + + .. versionadded:: 0.20 + + shuffle : bool, default=True + Whether or not the training data should be shuffled after each epoch. + + verbose : int, default=0 + The verbosity level. + + loss : str, default="epsilon_insensitive" + The loss function to be used: + epsilon_insensitive: equivalent to PA-I in the reference paper. + squared_epsilon_insensitive: equivalent to PA-II in the reference + paper. + + epsilon : float, default=0.1 + If the difference between the current prediction and the correct label + is below this threshold, the model is not updated. + + random_state : int, RandomState instance, default=None + Used to shuffle the training data, when ``shuffle`` is set to + ``True``. Pass an int for reproducible output across multiple + function calls. + See :term:`Glossary `. + + warm_start : bool, default=False + When set to True, reuse the solution of the previous call to fit as + initialization, otherwise, just erase the previous solution. + See :term:`the Glossary `. + + Repeatedly calling fit or partial_fit when warm_start is True can + result in a different solution than when calling fit a single time + because of the way the data is shuffled. + + average : bool or int, default=False + When set to True, computes the averaged SGD weights and stores the + result in the ``coef_`` attribute. If set to an int greater than 1, + averaging will begin once the total number of samples seen reaches + average. So average=10 will begin averaging after seeing 10 samples. + + .. versionadded:: 0.19 + parameter *average* to use weights averaging in SGD. 
+ + Attributes + ---------- + coef_ : array, shape = [1, n_features] if n_classes == 2 else [n_classes,\ + n_features] + Weights assigned to the features. + + intercept_ : array, shape = [1] if n_classes == 2 else [n_classes] + Constants in decision function. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + The actual number of iterations to reach the stopping criterion. + + t_ : int + Number of weight updates performed during training. + Same as ``(n_iter_ * n_samples + 1)``. + + See Also + -------- + SGDRegressor : Linear model fitted by minimizing a regularized + empirical loss with SGD. + + References + ---------- + Online Passive-Aggressive Algorithms + + K. Crammer, O. Dekel, J. Keshat, S. Shalev-Shwartz, Y. Singer - JMLR (2006). + + Examples + -------- + >>> from sklearn.linear_model import PassiveAggressiveRegressor + >>> from sklearn.datasets import make_regression + + >>> X, y = make_regression(n_features=4, random_state=0) + >>> regr = PassiveAggressiveRegressor(max_iter=100, random_state=0, + ... tol=1e-3) + >>> regr.fit(X, y) + PassiveAggressiveRegressor(max_iter=100, random_state=0) + >>> print(regr.coef_) + [20.48736655 34.18818427 67.59122734 87.94731329] + >>> print(regr.intercept_) + [-0.02306214] + >>> print(regr.predict([[0, 0, 0, 0]])) + [-0.02306214] + """ + + _parameter_constraints: dict = { + **BaseSGDRegressor._parameter_constraints, + "loss": [StrOptions({"epsilon_insensitive", "squared_epsilon_insensitive"})], + "C": [Interval(Real, 0, None, closed="right")], + "epsilon": [Interval(Real, 0, None, closed="left")], + } + + def __init__( + self, + *, + C=1.0, + fit_intercept=True, + max_iter=1000, + tol=1e-3, + early_stopping=False, + validation_fraction=0.1, + n_iter_no_change=5, + shuffle=True, + verbose=0, + loss="epsilon_insensitive", + epsilon=DEFAULT_EPSILON, + random_state=None, + warm_start=False, + average=False, + ): + super().__init__( + penalty=None, + l1_ratio=0, + epsilon=epsilon, + eta0=1.0, + fit_intercept=fit_intercept, + max_iter=max_iter, + tol=tol, + early_stopping=early_stopping, + validation_fraction=validation_fraction, + n_iter_no_change=n_iter_no_change, + shuffle=shuffle, + verbose=verbose, + random_state=random_state, + warm_start=warm_start, + average=average, + ) + self.C = C + self.loss = loss + + @_fit_context(prefer_skip_nested_validation=True) + def partial_fit(self, X, y): + """Fit linear model with Passive Aggressive algorithm. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Subset of training data. + + y : numpy array of shape [n_samples] + Subset of target values. + + Returns + ------- + self : object + Fitted estimator. + """ + if not hasattr(self, "coef_"): + self._more_validate_params(for_partial_fit=True) + + lr = "pa1" if self.loss == "epsilon_insensitive" else "pa2" + return self._partial_fit( + X, + y, + alpha=1.0, + C=self.C, + loss="epsilon_insensitive", + learning_rate=lr, + max_iter=1, + sample_weight=None, + coef_init=None, + intercept_init=None, + ) + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, coef_init=None, intercept_init=None): + """Fit linear model with Passive Aggressive algorithm. 
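+
+        The update rule is selected by ``loss``: ``"epsilon_insensitive"``
+        corresponds to the PA-I step, ``"squared_epsilon_insensitive"`` to
+        PA-II.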
+ + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : numpy array of shape [n_samples] + Target values. + + coef_init : array, shape = [n_features] + The initial coefficients to warm-start the optimization. + + intercept_init : array, shape = [1] + The initial intercept to warm-start the optimization. + + Returns + ------- + self : object + Fitted estimator. + """ + self._more_validate_params() + + lr = "pa1" if self.loss == "epsilon_insensitive" else "pa2" + return self._fit( + X, + y, + alpha=1.0, + C=self.C, + loss="epsilon_insensitive", + learning_rate=lr, + coef_init=coef_init, + intercept_init=intercept_init, + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_perceptron.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_perceptron.py new file mode 100644 index 0000000000000000000000000000000000000000..e93200ba385faf037be75654061932ee6e886b7b --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_perceptron.py @@ -0,0 +1,226 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from numbers import Real + +from ..utils._param_validation import Interval, StrOptions +from ._stochastic_gradient import BaseSGDClassifier + + +class Perceptron(BaseSGDClassifier): + """Linear perceptron classifier. + + The implementation is a wrapper around :class:`~sklearn.linear_model.SGDClassifier` + by fixing the `loss` and `learning_rate` parameters as:: + + SGDClassifier(loss="perceptron", learning_rate="constant") + + Other available parameters are described below and are forwarded to + :class:`~sklearn.linear_model.SGDClassifier`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + + penalty : {'l2','l1','elasticnet'}, default=None + The penalty (aka regularization term) to be used. + + alpha : float, default=0.0001 + Constant that multiplies the regularization term if regularization is + used. + + l1_ratio : float, default=0.15 + The Elastic Net mixing parameter, with `0 <= l1_ratio <= 1`. + `l1_ratio=0` corresponds to L2 penalty, `l1_ratio=1` to L1. + Only used if `penalty='elasticnet'`. + + .. versionadded:: 0.24 + + fit_intercept : bool, default=True + Whether the intercept should be estimated or not. If False, the + data is assumed to be already centered. + + max_iter : int, default=1000 + The maximum number of passes over the training data (aka epochs). + It only impacts the behavior in the ``fit`` method, and not the + :meth:`partial_fit` method. + + .. versionadded:: 0.19 + + tol : float or None, default=1e-3 + The stopping criterion. If it is not None, the iterations will stop + when (loss > previous_loss - tol). + + .. versionadded:: 0.19 + + shuffle : bool, default=True + Whether or not the training data should be shuffled after each epoch. + + verbose : int, default=0 + The verbosity level. + + eta0 : float, default=1 + Constant by which the updates are multiplied. + + n_jobs : int, default=None + The number of CPUs to use to do the OVA (One Versus All, for + multi-class problems) computation. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + random_state : int, RandomState instance or None, default=0 + Used to shuffle the training data, when ``shuffle`` is set to + ``True``. Pass an int for reproducible output across multiple + function calls. + See :term:`Glossary `. 
+ + early_stopping : bool, default=False + Whether to use early stopping to terminate training when validation + score is not improving. If set to True, it will automatically set aside + a stratified fraction of training data as validation and terminate + training when validation score is not improving by at least `tol` for + `n_iter_no_change` consecutive epochs. + + .. versionadded:: 0.20 + + validation_fraction : float, default=0.1 + The proportion of training data to set aside as validation set for + early stopping. Must be between 0 and 1. + Only used if early_stopping is True. + + .. versionadded:: 0.20 + + n_iter_no_change : int, default=5 + Number of iterations with no improvement to wait before early stopping. + + .. versionadded:: 0.20 + + class_weight : dict, {class_label: weight} or "balanced", default=None + Preset for the class_weight fit parameter. + + Weights associated with classes. If not given, all classes + are supposed to have weight one. + + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))``. + + warm_start : bool, default=False + When set to True, reuse the solution of the previous call to fit as + initialization, otherwise, just erase the previous solution. See + :term:`the Glossary `. + + Attributes + ---------- + classes_ : ndarray of shape (n_classes,) + The unique classes labels. + + coef_ : ndarray of shape (1, n_features) if n_classes == 2 else \ + (n_classes, n_features) + Weights assigned to the features. + + intercept_ : ndarray of shape (1,) if n_classes == 2 else (n_classes,) + Constants in decision function. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + The actual number of iterations to reach the stopping criterion. + For multiclass fits, it is the maximum over every binary fit. + + t_ : int + Number of weight updates performed during training. + Same as ``(n_iter_ * n_samples + 1)``. + + See Also + -------- + sklearn.linear_model.SGDClassifier : Linear classifiers + (SVM, logistic regression, etc.) with SGD training. + + Notes + ----- + ``Perceptron`` is a classification algorithm which shares the same + underlying implementation with ``SGDClassifier``. In fact, + ``Perceptron()`` is equivalent to `SGDClassifier(loss="perceptron", + eta0=1, learning_rate="constant", penalty=None)`. + + References + ---------- + https://en.wikipedia.org/wiki/Perceptron and references therein. + + Examples + -------- + >>> from sklearn.datasets import load_digits + >>> from sklearn.linear_model import Perceptron + >>> X, y = load_digits(return_X_y=True) + >>> clf = Perceptron(tol=1e-3, random_state=0) + >>> clf.fit(X, y) + Perceptron() + >>> clf.score(X, y) + 0.939... 
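+
+    As a sketch of the equivalence described in the Notes above (an
+    illustration, not an additional API guarantee), fitting the corresponding
+    ``SGDClassifier`` configuration on the same data should reproduce this
+    score:
+
+    >>> from sklearn.linear_model import SGDClassifier
+    >>> sgd = SGDClassifier(loss="perceptron", eta0=1, learning_rate="constant",
+    ...                     penalty=None, tol=1e-3, random_state=0)
+    >>> sgd.fit(X, y).score(X, y)
+    0.939...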
+ """ + + _parameter_constraints: dict = {**BaseSGDClassifier._parameter_constraints} + _parameter_constraints.pop("loss") + _parameter_constraints.pop("average") + _parameter_constraints.update( + { + "penalty": [StrOptions({"l2", "l1", "elasticnet"}), None], + "alpha": [Interval(Real, 0, None, closed="left")], + "l1_ratio": [Interval(Real, 0, 1, closed="both")], + "eta0": [Interval(Real, 0, None, closed="left")], + } + ) + + def __init__( + self, + *, + penalty=None, + alpha=0.0001, + l1_ratio=0.15, + fit_intercept=True, + max_iter=1000, + tol=1e-3, + shuffle=True, + verbose=0, + eta0=1.0, + n_jobs=None, + random_state=0, + early_stopping=False, + validation_fraction=0.1, + n_iter_no_change=5, + class_weight=None, + warm_start=False, + ): + super().__init__( + loss="perceptron", + penalty=penalty, + alpha=alpha, + l1_ratio=l1_ratio, + fit_intercept=fit_intercept, + max_iter=max_iter, + tol=tol, + shuffle=shuffle, + verbose=verbose, + random_state=random_state, + learning_rate="constant", + eta0=eta0, + early_stopping=early_stopping, + validation_fraction=validation_fraction, + n_iter_no_change=n_iter_no_change, + power_t=0.5, + warm_start=warm_start, + class_weight=class_weight, + n_jobs=n_jobs, + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_quantile.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_quantile.py new file mode 100644 index 0000000000000000000000000000000000000000..446d232958e8dbe3fec247ab37c05b39469160e8 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_quantile.py @@ -0,0 +1,301 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from numbers import Real + +import numpy as np +from scipy import sparse +from scipy.optimize import linprog + +from ..base import BaseEstimator, RegressorMixin, _fit_context +from ..exceptions import ConvergenceWarning +from ..utils import _safe_indexing +from ..utils._param_validation import Interval, StrOptions +from ..utils.fixes import parse_version, sp_version +from ..utils.validation import _check_sample_weight, validate_data +from ._base import LinearModel + + +class QuantileRegressor(LinearModel, RegressorMixin, BaseEstimator): + """Linear regression model that predicts conditional quantiles. + + The linear :class:`QuantileRegressor` optimizes the pinball loss for a + desired `quantile` and is robust to outliers. + + This model uses an L1 regularization like + :class:`~sklearn.linear_model.Lasso`. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.0 + + Parameters + ---------- + quantile : float, default=0.5 + The quantile that the model tries to predict. It must be strictly + between 0 and 1. If 0.5 (default), the model predicts the 50% + quantile, i.e. the median. + + alpha : float, default=1.0 + Regularization constant that multiplies the L1 penalty term. + + fit_intercept : bool, default=True + Whether or not to fit the intercept. + + solver : {'highs-ds', 'highs-ipm', 'highs', 'interior-point', \ + 'revised simplex'}, default='highs' + Method used by :func:`scipy.optimize.linprog` to solve the linear + programming formulation. + + It is recommended to use the highs methods because + they are the fastest ones. Solvers "highs-ds", "highs-ipm" and "highs" + support sparse input data and, in fact, always convert to sparse csc. + + From `scipy>=1.11.0`, "interior-point" is not available anymore. + + .. versionchanged:: 1.4 + The default of `solver` changed to `"highs"` in version 1.4. 
+ + solver_options : dict, default=None + Additional parameters passed to :func:`scipy.optimize.linprog` as + options. If `None` and if `solver='interior-point'`, then + `{"lstsq": True}` is passed to :func:`scipy.optimize.linprog` for the + sake of stability. + + Attributes + ---------- + coef_ : array of shape (n_features,) + Estimated coefficients for the features. + + intercept_ : float + The intercept of the model, aka bias term. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + The actual number of iterations performed by the solver. + + See Also + -------- + Lasso : The Lasso is a linear model that estimates sparse coefficients + with l1 regularization. + HuberRegressor : Linear regression model that is robust to outliers. + + Examples + -------- + >>> from sklearn.linear_model import QuantileRegressor + >>> import numpy as np + >>> n_samples, n_features = 10, 2 + >>> rng = np.random.RandomState(0) + >>> y = rng.randn(n_samples) + >>> X = rng.randn(n_samples, n_features) + >>> # the two following lines are optional in practice + >>> from sklearn.utils.fixes import sp_version, parse_version + >>> reg = QuantileRegressor(quantile=0.8).fit(X, y) + >>> np.mean(y <= reg.predict(X)) + np.float64(0.8) + """ + + _parameter_constraints: dict = { + "quantile": [Interval(Real, 0, 1, closed="neither")], + "alpha": [Interval(Real, 0, None, closed="left")], + "fit_intercept": ["boolean"], + "solver": [ + StrOptions( + { + "highs-ds", + "highs-ipm", + "highs", + "interior-point", + "revised simplex", + } + ), + ], + "solver_options": [dict, None], + } + + def __init__( + self, + *, + quantile=0.5, + alpha=1.0, + fit_intercept=True, + solver="highs", + solver_options=None, + ): + self.quantile = quantile + self.alpha = alpha + self.fit_intercept = fit_intercept + self.solver = solver + self.solver_options = solver_options + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, sample_weight=None): + """Fit the model according to the given training data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) + Target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + self : object + Returns self. + """ + X, y = validate_data( + self, + X, + y, + accept_sparse=["csc", "csr", "coo"], + y_numeric=True, + multi_output=False, + ) + sample_weight = _check_sample_weight(sample_weight, X) + + n_features = X.shape[1] + n_params = n_features + + if self.fit_intercept: + n_params += 1 + # Note that centering y and X with _preprocess_data does not work + # for quantile regression. + + # The objective is defined as 1/n * sum(pinball loss) + alpha * L1. + # So we rescale the penalty term, which is equivalent. + alpha = np.sum(sample_weight) * self.alpha + + if self.solver == "interior-point" and sp_version >= parse_version("1.11.0"): + raise ValueError( + f"Solver {self.solver} is not anymore available in SciPy >= 1.11.0." + ) + + if sparse.issparse(X) and self.solver not in ["highs", "highs-ds", "highs-ipm"]: + raise ValueError( + f"Solver {self.solver} does not support sparse X. " + "Use solver 'highs' for example." 
+ ) + # make default solver more stable + if self.solver_options is None and self.solver == "interior-point": + solver_options = {"lstsq": True} + else: + solver_options = self.solver_options + + # After rescaling alpha, the minimization problem is + # min sum(pinball loss) + alpha * L1 + # Use linear programming formulation of quantile regression + # min_x c x + # A_eq x = b_eq + # 0 <= x + # x = (s0, s, t0, t, u, v) = slack variables >= 0 + # intercept = s0 - t0 + # coef = s - t + # c = (0, alpha * 1_p, 0, alpha * 1_p, quantile * 1_n, (1-quantile) * 1_n) + # residual = y - X@coef - intercept = u - v + # A_eq = (1_n, X, -1_n, -X, diag(1_n), -diag(1_n)) + # b_eq = y + # p = n_features + # n = n_samples + # 1_n = vector of length n with entries equal one + # see https://stats.stackexchange.com/questions/384909/ + # + # Filtering out zero sample weights from the beginning makes life + # easier for the linprog solver. + indices = np.nonzero(sample_weight)[0] + n_indices = len(indices) # use n_mask instead of n_samples + if n_indices < len(sample_weight): + sample_weight = sample_weight[indices] + X = _safe_indexing(X, indices) + y = _safe_indexing(y, indices) + c = np.concatenate( + [ + np.full(2 * n_params, fill_value=alpha), + sample_weight * self.quantile, + sample_weight * (1 - self.quantile), + ] + ) + if self.fit_intercept: + # do not penalize the intercept + c[0] = 0 + c[n_params] = 0 + + if self.solver in ["highs", "highs-ds", "highs-ipm"]: + # Note that highs methods always use a sparse CSC memory layout internally, + # even for optimization problems parametrized using dense numpy arrays. + # Therefore, we work with CSC matrices as early as possible to limit + # unnecessary repeated memory copies. + eye = sparse.eye(n_indices, dtype=X.dtype, format="csc") + if self.fit_intercept: + ones = sparse.csc_matrix(np.ones(shape=(n_indices, 1), dtype=X.dtype)) + A_eq = sparse.hstack([ones, X, -ones, -X, eye, -eye], format="csc") + else: + A_eq = sparse.hstack([X, -X, eye, -eye], format="csc") + else: + eye = np.eye(n_indices) + if self.fit_intercept: + ones = np.ones((n_indices, 1)) + A_eq = np.concatenate([ones, X, -ones, -X, eye, -eye], axis=1) + else: + A_eq = np.concatenate([X, -X, eye, -eye], axis=1) + + b_eq = y + + result = linprog( + c=c, + A_eq=A_eq, + b_eq=b_eq, + method=self.solver, + options=solver_options, + ) + solution = result.x + if not result.success: + failure = { + 1: "Iteration limit reached.", + 2: "Problem appears to be infeasible.", + 3: "Problem appears to be unbounded.", + 4: "Numerical difficulties encountered.", + } + warnings.warn( + "Linear programming for QuantileRegressor did not succeed.\n" + f"Status is {result.status}: " + + failure.setdefault(result.status, "unknown reason") + + "\n" + + "Result message of linprog:\n" + + result.message, + ConvergenceWarning, + ) + + # positive slack - negative slack + # solution is an array with (params_pos, params_neg, u, v) + params = solution[:n_params] - solution[n_params : 2 * n_params] + + self.n_iter_ = result.nit + + if self.fit_intercept: + self.coef_ = params[1:] + self.intercept_ = params[0] + else: + self.coef_ = params + self.intercept_ = 0.0 + return self + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_ransac.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_ransac.py new file mode 100644 index 
0000000000000000000000000000000000000000..c18065436dc3518ccb4a2359480cf7db7f36cd7e --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_ransac.py @@ -0,0 +1,726 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from numbers import Integral, Real + +import numpy as np + +from ..base import ( + BaseEstimator, + MetaEstimatorMixin, + MultiOutputMixin, + RegressorMixin, + _fit_context, + clone, +) +from ..exceptions import ConvergenceWarning +from ..utils import check_consistent_length, check_random_state, get_tags +from ..utils._bunch import Bunch +from ..utils._param_validation import ( + HasMethods, + Interval, + Options, + RealNotInt, + StrOptions, +) +from ..utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + _raise_for_params, + _routing_enabled, + process_routing, +) +from ..utils.random import sample_without_replacement +from ..utils.validation import ( + _check_method_params, + _check_sample_weight, + check_is_fitted, + has_fit_parameter, + validate_data, +) +from ._base import LinearRegression + +_EPSILON = np.spacing(1) + + +def _dynamic_max_trials(n_inliers, n_samples, min_samples, probability): + """Determine number trials such that at least one outlier-free subset is + sampled for the given inlier/outlier ratio. + + Parameters + ---------- + n_inliers : int + Number of inliers in the data. + + n_samples : int + Total number of samples in the data. + + min_samples : int + Minimum number of samples chosen randomly from original data. + + probability : float + Probability (confidence) that one outlier-free sample is generated. + + Returns + ------- + trials : int + Number of trials. + + """ + inlier_ratio = n_inliers / float(n_samples) + nom = max(_EPSILON, 1 - probability) + denom = max(_EPSILON, 1 - inlier_ratio**min_samples) + if nom == 1: + return 0 + if denom == 1: + return float("inf") + return abs(float(np.ceil(np.log(nom) / np.log(denom)))) + + +class RANSACRegressor( + MetaEstimatorMixin, + RegressorMixin, + MultiOutputMixin, + BaseEstimator, +): + """RANSAC (RANdom SAmple Consensus) algorithm. + + RANSAC is an iterative algorithm for the robust estimation of parameters + from a subset of inliers from the complete data set. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimator : object, default=None + Base estimator object which implements the following methods: + + * `fit(X, y)`: Fit model to given training data and target values. + * `score(X, y)`: Returns the mean accuracy on the given test data, + which is used for the stop criterion defined by `stop_score`. + Additionally, the score is used to decide which of two equally + large consensus sets is chosen as the better one. + * `predict(X)`: Returns predicted values using the linear model, + which is used to compute residual error using loss function. + + If `estimator` is None, then + :class:`~sklearn.linear_model.LinearRegression` is used for + target values of dtype float. + + Note that the current implementation only supports regression + estimators. + + min_samples : int (>= 1) or float ([0, 1]), default=None + Minimum number of samples chosen randomly from original data. Treated + as an absolute number of samples for `min_samples >= 1`, treated as a + relative number `ceil(min_samples * X.shape[0])` for + `min_samples < 1`. This is typically chosen as the minimal number of + samples necessary to estimate the given `estimator`. 
By default a + :class:`~sklearn.linear_model.LinearRegression` estimator is assumed and + `min_samples` is chosen as ``X.shape[1] + 1``. This parameter is highly + dependent upon the model, so if a `estimator` other than + :class:`~sklearn.linear_model.LinearRegression` is used, the user must + provide a value. + + residual_threshold : float, default=None + Maximum residual for a data sample to be classified as an inlier. + By default the threshold is chosen as the MAD (median absolute + deviation) of the target values `y`. Points whose residuals are + strictly equal to the threshold are considered as inliers. + + is_data_valid : callable, default=None + This function is called with the randomly selected data before the + model is fitted to it: `is_data_valid(X, y)`. If its return value is + False the current randomly chosen sub-sample is skipped. + + is_model_valid : callable, default=None + This function is called with the estimated model and the randomly + selected data: `is_model_valid(model, X, y)`. If its return value is + False the current randomly chosen sub-sample is skipped. + Rejecting samples with this function is computationally costlier than + with `is_data_valid`. `is_model_valid` should therefore only be used if + the estimated model is needed for making the rejection decision. + + max_trials : int, default=100 + Maximum number of iterations for random sample selection. + + max_skips : int, default=np.inf + Maximum number of iterations that can be skipped due to finding zero + inliers or invalid data defined by ``is_data_valid`` or invalid models + defined by ``is_model_valid``. + + .. versionadded:: 0.19 + + stop_n_inliers : int, default=np.inf + Stop iteration if at least this number of inliers are found. + + stop_score : float, default=np.inf + Stop iteration if score is greater equal than this threshold. + + stop_probability : float in range [0, 1], default=0.99 + RANSAC iteration stops if at least one outlier-free set of the training + data is sampled in RANSAC. This requires to generate at least N + samples (iterations):: + + N >= log(1 - probability) / log(1 - e**m) + + where the probability (confidence) is typically set to high value such + as 0.99 (the default) and e is the current fraction of inliers w.r.t. + the total number of samples. + + loss : str, callable, default='absolute_error' + String inputs, 'absolute_error' and 'squared_error' are supported which + find the absolute error and squared error per sample respectively. + + If ``loss`` is a callable, then it should be a function that takes + two arrays as inputs, the true and predicted value and returns a 1-D + array with the i-th value of the array corresponding to the loss + on ``X[i]``. + + If the loss on a sample is greater than the ``residual_threshold``, + then this sample is classified as an outlier. + + .. versionadded:: 0.18 + + random_state : int, RandomState instance, default=None + The generator used to initialize the centers. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Attributes + ---------- + estimator_ : object + Final model fitted on the inliers predicted by the "best" model found + during RANSAC sampling (copy of the `estimator` object). + + n_trials_ : int + Number of random selection trials until one of the stop criteria is + met. It is always ``<= max_trials``. + + inlier_mask_ : bool array of shape [n_samples] + Boolean mask of inliers classified as ``True``. 
+ + n_skips_no_inliers_ : int + Number of iterations skipped due to finding zero inliers. + + .. versionadded:: 0.19 + + n_skips_invalid_data_ : int + Number of iterations skipped due to invalid data defined by + ``is_data_valid``. + + .. versionadded:: 0.19 + + n_skips_invalid_model_ : int + Number of iterations skipped due to an invalid model defined by + ``is_model_valid``. + + .. versionadded:: 0.19 + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + HuberRegressor : Linear regression model that is robust to outliers. + TheilSenRegressor : Theil-Sen Estimator robust multivariate regression model. + SGDRegressor : Fitted by minimizing a regularized empirical loss with SGD. + + References + ---------- + .. [1] https://en.wikipedia.org/wiki/RANSAC + .. [2] https://www.sri.com/wp-content/uploads/2021/12/ransac-publication.pdf + .. [3] https://bmva-archive.org.uk/bmvc/2009/Papers/Paper355/Paper355.pdf + + Examples + -------- + >>> from sklearn.linear_model import RANSACRegressor + >>> from sklearn.datasets import make_regression + >>> X, y = make_regression( + ... n_samples=200, n_features=2, noise=4.0, random_state=0) + >>> reg = RANSACRegressor(random_state=0).fit(X, y) + >>> reg.score(X, y) + 0.9885 + >>> reg.predict(X[:1,]) + array([-31.9417]) + + For a more detailed example, see + :ref:`sphx_glr_auto_examples_linear_model_plot_ransac.py` + """ + + _parameter_constraints: dict = { + "estimator": [HasMethods(["fit", "score", "predict"]), None], + "min_samples": [ + Interval(Integral, 1, None, closed="left"), + Interval(RealNotInt, 0, 1, closed="both"), + None, + ], + "residual_threshold": [Interval(Real, 0, None, closed="left"), None], + "is_data_valid": [callable, None], + "is_model_valid": [callable, None], + "max_trials": [ + Interval(Integral, 0, None, closed="left"), + Options(Real, {np.inf}), + ], + "max_skips": [ + Interval(Integral, 0, None, closed="left"), + Options(Real, {np.inf}), + ], + "stop_n_inliers": [ + Interval(Integral, 0, None, closed="left"), + Options(Real, {np.inf}), + ], + "stop_score": [Interval(Real, None, None, closed="both")], + "stop_probability": [Interval(Real, 0, 1, closed="both")], + "loss": [StrOptions({"absolute_error", "squared_error"}), callable], + "random_state": ["random_state"], + } + + def __init__( + self, + estimator=None, + *, + min_samples=None, + residual_threshold=None, + is_data_valid=None, + is_model_valid=None, + max_trials=100, + max_skips=np.inf, + stop_n_inliers=np.inf, + stop_score=np.inf, + stop_probability=0.99, + loss="absolute_error", + random_state=None, + ): + self.estimator = estimator + self.min_samples = min_samples + self.residual_threshold = residual_threshold + self.is_data_valid = is_data_valid + self.is_model_valid = is_model_valid + self.max_trials = max_trials + self.max_skips = max_skips + self.stop_n_inliers = stop_n_inliers + self.stop_score = stop_score + self.stop_probability = stop_probability + self.random_state = random_state + self.loss = loss + + @_fit_context( + # RansacRegressor.estimator is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y, sample_weight=None, **fit_params): + """Fit estimator using RANSAC algorithm. 
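+
+        Each trial draws a random subset of ``min_samples`` samples, fits the
+        base estimator on it, and counts as inliers all samples whose residual
+        is at most ``residual_threshold``. The trial with the largest consensus
+        set (ties broken by ``score``) is kept, and the final model is refitted
+        on that inlier set.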
+ + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) or (n_samples, n_targets) + Target values. + + sample_weight : array-like of shape (n_samples,), default=None + Individual weights for each sample + raises error if sample_weight is passed and estimator + fit method does not support it. + + .. versionadded:: 0.18 + + **fit_params : dict + Parameters routed to the `fit` method of the sub-estimator via the + metadata routing API. + + .. versionadded:: 1.5 + + Only available if + `sklearn.set_config(enable_metadata_routing=True)` is set. See + :ref:`Metadata Routing User Guide ` for more + details. + + Returns + ------- + self : object + Fitted `RANSACRegressor` estimator. + + Raises + ------ + ValueError + If no valid consensus set could be found. This occurs if + `is_data_valid` and `is_model_valid` return False for all + `max_trials` randomly chosen sub-samples. + """ + # Need to validate separately here. We can't pass multi_output=True + # because that would allow y to be csr. Delay expensive finiteness + # check to the estimator's own input validation. + _raise_for_params(fit_params, self, "fit") + check_X_params = dict(accept_sparse="csr", ensure_all_finite=False) + check_y_params = dict(ensure_2d=False) + X, y = validate_data( + self, X, y, validate_separately=(check_X_params, check_y_params) + ) + check_consistent_length(X, y) + + if self.estimator is not None: + estimator = clone(self.estimator) + else: + estimator = LinearRegression() + + if self.min_samples is None: + if not isinstance(estimator, LinearRegression): + raise ValueError( + "`min_samples` needs to be explicitly set when estimator " + "is not a LinearRegression." + ) + min_samples = X.shape[1] + 1 + elif 0 < self.min_samples < 1: + min_samples = np.ceil(self.min_samples * X.shape[0]) + elif self.min_samples >= 1: + min_samples = self.min_samples + if min_samples > X.shape[0]: + raise ValueError( + "`min_samples` may not be larger than number " + "of samples: n_samples = %d." % (X.shape[0]) + ) + + if self.residual_threshold is None: + # MAD (median absolute deviation) + residual_threshold = np.median(np.abs(y - np.median(y))) + else: + residual_threshold = self.residual_threshold + + if self.loss == "absolute_error": + if y.ndim == 1: + loss_function = lambda y_true, y_pred: np.abs(y_true - y_pred) + else: + loss_function = lambda y_true, y_pred: np.sum( + np.abs(y_true - y_pred), axis=1 + ) + elif self.loss == "squared_error": + if y.ndim == 1: + loss_function = lambda y_true, y_pred: (y_true - y_pred) ** 2 + else: + loss_function = lambda y_true, y_pred: np.sum( + (y_true - y_pred) ** 2, axis=1 + ) + + elif callable(self.loss): + loss_function = self.loss + + random_state = check_random_state(self.random_state) + + try: # Not all estimator accept a random_state + estimator.set_params(random_state=random_state) + except ValueError: + pass + + estimator_fit_has_sample_weight = has_fit_parameter(estimator, "sample_weight") + estimator_name = type(estimator).__name__ + if sample_weight is not None and not estimator_fit_has_sample_weight: + raise ValueError( + "%s does not support sample_weight. Sample" + " weights are only used for the calibration" + " itself." 
% estimator_name + ) + + if sample_weight is not None: + fit_params["sample_weight"] = sample_weight + + if _routing_enabled(): + routed_params = process_routing(self, "fit", **fit_params) + else: + routed_params = Bunch() + routed_params.estimator = Bunch(fit={}, predict={}, score={}) + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X) + routed_params.estimator.fit = {"sample_weight": sample_weight} + + n_inliers_best = 1 + score_best = -np.inf + inlier_mask_best = None + X_inlier_best = None + y_inlier_best = None + inlier_best_idxs_subset = None + self.n_skips_no_inliers_ = 0 + self.n_skips_invalid_data_ = 0 + self.n_skips_invalid_model_ = 0 + + # number of data samples + n_samples = X.shape[0] + sample_idxs = np.arange(n_samples) + + self.n_trials_ = 0 + max_trials = self.max_trials + while self.n_trials_ < max_trials: + self.n_trials_ += 1 + + if ( + self.n_skips_no_inliers_ + + self.n_skips_invalid_data_ + + self.n_skips_invalid_model_ + ) > self.max_skips: + break + + # choose random sample set + subset_idxs = sample_without_replacement( + n_samples, min_samples, random_state=random_state + ) + X_subset = X[subset_idxs] + y_subset = y[subset_idxs] + + # check if random sample set is valid + if self.is_data_valid is not None and not self.is_data_valid( + X_subset, y_subset + ): + self.n_skips_invalid_data_ += 1 + continue + + # cut `fit_params` down to `subset_idxs` + fit_params_subset = _check_method_params( + X, params=routed_params.estimator.fit, indices=subset_idxs + ) + + # fit model for current random sample set + estimator.fit(X_subset, y_subset, **fit_params_subset) + + # check if estimated model is valid + if self.is_model_valid is not None and not self.is_model_valid( + estimator, X_subset, y_subset + ): + self.n_skips_invalid_model_ += 1 + continue + + # residuals of all data for current random sample model + y_pred = estimator.predict(X) + residuals_subset = loss_function(y, y_pred) + + # classify data into inliers and outliers + inlier_mask_subset = residuals_subset <= residual_threshold + n_inliers_subset = np.sum(inlier_mask_subset) + + # less inliers -> skip current random sample + if n_inliers_subset < n_inliers_best: + self.n_skips_no_inliers_ += 1 + continue + + # extract inlier data set + inlier_idxs_subset = sample_idxs[inlier_mask_subset] + X_inlier_subset = X[inlier_idxs_subset] + y_inlier_subset = y[inlier_idxs_subset] + + # cut `fit_params` down to `inlier_idxs_subset` + score_params_inlier_subset = _check_method_params( + X, params=routed_params.estimator.score, indices=inlier_idxs_subset + ) + + # score of inlier data set + score_subset = estimator.score( + X_inlier_subset, + y_inlier_subset, + **score_params_inlier_subset, + ) + + # same number of inliers but worse score -> skip current random + # sample + if n_inliers_subset == n_inliers_best and score_subset < score_best: + continue + + # save current random sample as best sample + n_inliers_best = n_inliers_subset + score_best = score_subset + inlier_mask_best = inlier_mask_subset + X_inlier_best = X_inlier_subset + y_inlier_best = y_inlier_subset + inlier_best_idxs_subset = inlier_idxs_subset + + max_trials = min( + max_trials, + _dynamic_max_trials( + n_inliers_best, n_samples, min_samples, self.stop_probability + ), + ) + + # break if sufficient number of inliers or score is reached + if n_inliers_best >= self.stop_n_inliers or score_best >= self.stop_score: + break + + # if none of the iterations met the required criteria + if inlier_mask_best is None: + if ( + 
self.n_skips_no_inliers_ + + self.n_skips_invalid_data_ + + self.n_skips_invalid_model_ + ) > self.max_skips: + raise ValueError( + "RANSAC skipped more iterations than `max_skips` without" + " finding a valid consensus set. Iterations were skipped" + " because each randomly chosen sub-sample failed the" + " passing criteria. See estimator attributes for" + " diagnostics (n_skips*)." + ) + else: + raise ValueError( + "RANSAC could not find a valid consensus set. All" + " `max_trials` iterations were skipped because each" + " randomly chosen sub-sample failed the passing criteria." + " See estimator attributes for diagnostics (n_skips*)." + ) + else: + if ( + self.n_skips_no_inliers_ + + self.n_skips_invalid_data_ + + self.n_skips_invalid_model_ + ) > self.max_skips: + warnings.warn( + ( + "RANSAC found a valid consensus set but exited" + " early due to skipping more iterations than" + " `max_skips`. See estimator attributes for" + " diagnostics (n_skips*)." + ), + ConvergenceWarning, + ) + + # estimate final model using all inliers + fit_params_best_idxs_subset = _check_method_params( + X, params=routed_params.estimator.fit, indices=inlier_best_idxs_subset + ) + + estimator.fit(X_inlier_best, y_inlier_best, **fit_params_best_idxs_subset) + + self.estimator_ = estimator + self.inlier_mask_ = inlier_mask_best + return self + + def predict(self, X, **params): + """Predict using the estimated model. + + This is a wrapper for `estimator_.predict(X)`. + + Parameters + ---------- + X : {array-like or sparse matrix} of shape (n_samples, n_features) + Input data. + + **params : dict + Parameters routed to the `predict` method of the sub-estimator via + the metadata routing API. + + .. versionadded:: 1.5 + + Only available if + `sklearn.set_config(enable_metadata_routing=True)` is set. See + :ref:`Metadata Routing User Guide ` for more + details. + + Returns + ------- + y : array, shape = [n_samples] or [n_samples, n_targets] + Returns predicted values. + """ + check_is_fitted(self) + X = validate_data( + self, + X, + ensure_all_finite=False, + accept_sparse=True, + reset=False, + ) + + _raise_for_params(params, self, "predict") + + if _routing_enabled(): + predict_params = process_routing(self, "predict", **params).estimator[ + "predict" + ] + else: + predict_params = {} + + return self.estimator_.predict(X, **predict_params) + + def score(self, X, y, **params): + """Return the score of the prediction. + + This is a wrapper for `estimator_.score(X, y)`. + + Parameters + ---------- + X : (array-like or sparse matrix} of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) or (n_samples, n_targets) + Target values. + + **params : dict + Parameters routed to the `score` method of the sub-estimator via + the metadata routing API. + + .. versionadded:: 1.5 + + Only available if + `sklearn.set_config(enable_metadata_routing=True)` is set. See + :ref:`Metadata Routing User Guide ` for more + details. + + Returns + ------- + z : float + Score of the prediction. + """ + check_is_fitted(self) + X = validate_data( + self, + X, + ensure_all_finite=False, + accept_sparse=True, + reset=False, + ) + + _raise_for_params(params, self, "score") + if _routing_enabled(): + score_params = process_routing(self, "score", **params).estimator["score"] + else: + score_params = {} + + return self.estimator_.score(X, y, **score_params) + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. 
versionadded:: 1.5 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. + """ + router = MetadataRouter(owner=self.__class__.__name__).add( + estimator=self.estimator, + method_mapping=MethodMapping() + .add(caller="fit", callee="fit") + .add(caller="fit", callee="score") + .add(caller="score", callee="score") + .add(caller="predict", callee="predict"), + ) + return router + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + if self.estimator is None: + tags.input_tags.sparse = True # default estimator is LinearRegression + else: + tags.input_tags.sparse = get_tags(self.estimator).input_tags.sparse + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_ridge.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_ridge.py new file mode 100644 index 0000000000000000000000000000000000000000..0a55291a70ace22716d07fecffc931c8dadb093e --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_ridge.py @@ -0,0 +1,2899 @@ +""" +Ridge regression +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numbers +import warnings +from abc import ABCMeta, abstractmethod +from functools import partial +from numbers import Integral, Real + +import numpy as np +from scipy import linalg, optimize, sparse +from scipy.sparse import linalg as sp_linalg + +from sklearn.base import BaseEstimator + +from ..base import MultiOutputMixin, RegressorMixin, _fit_context, is_classifier +from ..exceptions import ConvergenceWarning +from ..metrics import check_scoring, get_scorer_names +from ..model_selection import GridSearchCV +from ..preprocessing import LabelBinarizer +from ..utils import ( + Bunch, + check_array, + check_consistent_length, + check_scalar, + column_or_1d, + compute_sample_weight, +) +from ..utils._array_api import ( + _is_numpy_namespace, + _ravel, + device, + get_namespace, + get_namespace_and_device, +) +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.extmath import row_norms, safe_sparse_dot +from ..utils.fixes import _sparse_linalg_cg +from ..utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + _raise_for_params, + _routing_enabled, + process_routing, +) +from ..utils.sparsefuncs import mean_variance_axis +from ..utils.validation import _check_sample_weight, check_is_fitted, validate_data +from ._base import LinearClassifierMixin, LinearModel, _preprocess_data, _rescale_data +from ._sag import sag_solver + + +def _get_rescaled_operator(X, X_offset, sample_weight_sqrt): + """Create LinearOperator for matrix products with implicit centering. + + Matrix product `LinearOperator @ coef` returns `(X - X_offset) @ coef`. 
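+
+    This keeps a sparse ``X`` sparse: rather than materializing the centered
+    matrix ``X - X_offset`` (which would be dense), the centering correction
+    is applied on the fly inside ``matvec`` and ``rmatvec``.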
+ """ + + def matvec(b): + return X.dot(b) - sample_weight_sqrt * b.dot(X_offset) + + def rmatvec(b): + return X.T.dot(b) - X_offset * b.dot(sample_weight_sqrt) + + X1 = sparse.linalg.LinearOperator(shape=X.shape, matvec=matvec, rmatvec=rmatvec) + return X1 + + +def _solve_sparse_cg( + X, + y, + alpha, + max_iter=None, + tol=1e-4, + verbose=0, + X_offset=None, + X_scale=None, + sample_weight_sqrt=None, +): + if sample_weight_sqrt is None: + sample_weight_sqrt = np.ones(X.shape[0], dtype=X.dtype) + + n_samples, n_features = X.shape + + if X_offset is None or X_scale is None: + X1 = sp_linalg.aslinearoperator(X) + else: + X_offset_scale = X_offset / X_scale + X1 = _get_rescaled_operator(X, X_offset_scale, sample_weight_sqrt) + + coefs = np.empty((y.shape[1], n_features), dtype=X.dtype) + + if n_features > n_samples: + + def create_mv(curr_alpha): + def _mv(x): + return X1.matvec(X1.rmatvec(x)) + curr_alpha * x + + return _mv + + else: + + def create_mv(curr_alpha): + def _mv(x): + return X1.rmatvec(X1.matvec(x)) + curr_alpha * x + + return _mv + + for i in range(y.shape[1]): + y_column = y[:, i] + + mv = create_mv(alpha[i]) + if n_features > n_samples: + # kernel ridge + # w = X.T * inv(X X^t + alpha*Id) y + C = sp_linalg.LinearOperator( + (n_samples, n_samples), matvec=mv, dtype=X.dtype + ) + coef, info = _sparse_linalg_cg(C, y_column, rtol=tol) + coefs[i] = X1.rmatvec(coef) + else: + # linear ridge + # w = inv(X^t X + alpha*Id) * X.T y + y_column = X1.rmatvec(y_column) + C = sp_linalg.LinearOperator( + (n_features, n_features), matvec=mv, dtype=X.dtype + ) + coefs[i], info = _sparse_linalg_cg(C, y_column, maxiter=max_iter, rtol=tol) + + if info < 0: + raise ValueError("Failed with error code %d" % info) + + if max_iter is None and info > 0 and verbose: + warnings.warn( + "sparse_cg did not converge after %d iterations." % info, + ConvergenceWarning, + ) + + return coefs + + +def _solve_lsqr( + X, + y, + *, + alpha, + fit_intercept=True, + max_iter=None, + tol=1e-4, + X_offset=None, + X_scale=None, + sample_weight_sqrt=None, +): + """Solve Ridge regression via LSQR. + + We expect that y is always mean centered. + If X is dense, we expect it to be mean centered such that we can solve + ||y - Xw||_2^2 + alpha * ||w||_2^2 + + If X is sparse, we expect X_offset to be given such that we can solve + ||y - (X - X_offset)w||_2^2 + alpha * ||w||_2^2 + + With sample weights S=diag(sample_weight), this becomes + ||sqrt(S) (y - (X - X_offset) w)||_2^2 + alpha * ||w||_2^2 + and we expect y and X to already be rescaled, i.e. sqrt(S) @ y, sqrt(S) @ X. In + this case, X_offset is the sample_weight weighted mean of X before scaling by + sqrt(S). The objective then reads + ||y - (X - sqrt(S) X_offset) w)||_2^2 + alpha * ||w||_2^2 + """ + if sample_weight_sqrt is None: + sample_weight_sqrt = np.ones(X.shape[0], dtype=X.dtype) + + if sparse.issparse(X) and fit_intercept: + X_offset_scale = X_offset / X_scale + X1 = _get_rescaled_operator(X, X_offset_scale, sample_weight_sqrt) + else: + # No need to touch anything + X1 = X + + n_samples, n_features = X.shape + coefs = np.empty((y.shape[1], n_features), dtype=X.dtype) + n_iter = np.empty(y.shape[1], dtype=np.int32) + + # According to the lsqr documentation, alpha = damp^2. 
+ sqrt_alpha = np.sqrt(alpha) + + for i in range(y.shape[1]): + y_column = y[:, i] + info = sp_linalg.lsqr( + X1, y_column, damp=sqrt_alpha[i], atol=tol, btol=tol, iter_lim=max_iter + ) + coefs[i] = info[0] + n_iter[i] = info[2] + + return coefs, n_iter + + +def _solve_cholesky(X, y, alpha): + # w = inv(X^t X + alpha*Id) * X.T y + n_features = X.shape[1] + n_targets = y.shape[1] + + A = safe_sparse_dot(X.T, X, dense_output=True) + Xy = safe_sparse_dot(X.T, y, dense_output=True) + + one_alpha = np.array_equal(alpha, len(alpha) * [alpha[0]]) + + if one_alpha: + A.flat[:: n_features + 1] += alpha[0] + return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T + else: + coefs = np.empty([n_targets, n_features], dtype=X.dtype) + for coef, target, current_alpha in zip(coefs, Xy.T, alpha): + A.flat[:: n_features + 1] += current_alpha + coef[:] = linalg.solve(A, target, assume_a="pos", overwrite_a=False).ravel() + A.flat[:: n_features + 1] -= current_alpha + return coefs + + +def _solve_cholesky_kernel(K, y, alpha, sample_weight=None, copy=False): + # dual_coef = inv(X X^t + alpha*Id) y + n_samples = K.shape[0] + n_targets = y.shape[1] + + if copy: + K = K.copy() + + alpha = np.atleast_1d(alpha) + one_alpha = (alpha == alpha[0]).all() + has_sw = isinstance(sample_weight, np.ndarray) or sample_weight not in [1.0, None] + + if has_sw: + # Unlike other solvers, we need to support sample_weight directly + # because K might be a pre-computed kernel. + sw = np.sqrt(np.atleast_1d(sample_weight)) + y = y * sw[:, np.newaxis] + K *= np.outer(sw, sw) + + if one_alpha: + # Only one penalty, we can solve multi-target problems in one time. + K.flat[:: n_samples + 1] += alpha[0] + + try: + # Note: we must use overwrite_a=False in order to be able to + # use the fall-back solution below in case a LinAlgError + # is raised + dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False) + except np.linalg.LinAlgError: + warnings.warn( + "Singular matrix in solving dual problem. Using " + "least-squares solution instead." + ) + dual_coef = linalg.lstsq(K, y)[0] + + # K is expensive to compute and store in memory so change it back in + # case it was user-given. + K.flat[:: n_samples + 1] -= alpha[0] + + if has_sw: + dual_coef *= sw[:, np.newaxis] + + return dual_coef + else: + # One penalty per target. We need to solve each target separately. + dual_coefs = np.empty([n_targets, n_samples], K.dtype) + + for dual_coef, target, current_alpha in zip(dual_coefs, y.T, alpha): + K.flat[:: n_samples + 1] += current_alpha + + dual_coef[:] = linalg.solve( + K, target, assume_a="pos", overwrite_a=False + ).ravel() + + K.flat[:: n_samples + 1] -= current_alpha + + if has_sw: + dual_coefs *= sw[np.newaxis, :] + + return dual_coefs.T + + +def _solve_svd(X, y, alpha, xp=None): + xp, _ = get_namespace(X, xp=xp) + U, s, Vt = xp.linalg.svd(X, full_matrices=False) + idx = s > 1e-15 # same default value as scipy.linalg.pinv + s_nnz = s[idx][:, None] + UTy = U.T @ y + d = xp.zeros((s.shape[0], alpha.shape[0]), dtype=X.dtype, device=device(X)) + d[idx] = s_nnz / (s_nnz**2 + alpha) + d_UT_y = d * UTy + return (Vt.T @ d_UT_y).T + + +def _solve_lbfgs( + X, + y, + alpha, + positive=True, + max_iter=None, + tol=1e-4, + X_offset=None, + X_scale=None, + sample_weight_sqrt=None, +): + """Solve ridge regression with LBFGS. + + The main purpose is fitting with forcing coefficients to be positive. + For unconstrained ridge regression, there are faster dedicated solver methods. 
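Editorial sketch of how this constraint surfaces in the public API (assuming a scikit-learn release that ships this module):

    import numpy as np
    from sklearn.linear_model import Ridge

    rng = np.random.RandomState(0)
    X = rng.randn(50, 3)
    y = X @ np.array([2.0, 0.5, -1.0]) + 0.1 * rng.randn(50)

    # positive=True makes solver='auto' resolve to 'lbfgs', the only solver that
    # supports the bound; the third (truly negative) coefficient is pushed to the
    # zero boundary.
    reg = Ridge(alpha=1.0, positive=True).fit(X, y)
    assert (reg.coef_ >= 0).all()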
+ Note that with positive bounds on the coefficients, LBFGS seems faster + than scipy.optimize.lsq_linear. + """ + n_samples, n_features = X.shape + + options = {} + if max_iter is not None: + options["maxiter"] = max_iter + config = { + "method": "L-BFGS-B", + "tol": tol, + "jac": True, + "options": options, + } + if positive: + config["bounds"] = [(0, np.inf)] * n_features + + if X_offset is not None and X_scale is not None: + X_offset_scale = X_offset / X_scale + else: + X_offset_scale = None + + if sample_weight_sqrt is None: + sample_weight_sqrt = np.ones(X.shape[0], dtype=X.dtype) + + coefs = np.empty((y.shape[1], n_features), dtype=X.dtype) + + for i in range(y.shape[1]): + x0 = np.zeros((n_features,)) + y_column = y[:, i] + + def func(w): + residual = X.dot(w) - y_column + if X_offset_scale is not None: + residual -= sample_weight_sqrt * w.dot(X_offset_scale) + f = 0.5 * residual.dot(residual) + 0.5 * alpha[i] * w.dot(w) + grad = X.T @ residual + alpha[i] * w + if X_offset_scale is not None: + grad -= X_offset_scale * residual.dot(sample_weight_sqrt) + + return f, grad + + result = optimize.minimize(func, x0, **config) + if not result["success"]: + warnings.warn( + ( + "The lbfgs solver did not converge. Try increasing max_iter " + f"or tol. Currently: max_iter={max_iter} and tol={tol}" + ), + ConvergenceWarning, + ) + coefs[i] = result["x"] + + return coefs + + +def _get_valid_accept_sparse(is_X_sparse, solver): + if is_X_sparse and solver in ["auto", "sag", "saga"]: + return "csr" + else: + return ["csr", "csc", "coo"] + + +@validate_params( + { + "X": ["array-like", "sparse matrix", sp_linalg.LinearOperator], + "y": ["array-like"], + "alpha": [Interval(Real, 0, None, closed="left"), "array-like"], + "sample_weight": [ + Interval(Real, None, None, closed="neither"), + "array-like", + None, + ], + "solver": [ + StrOptions( + {"auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga", "lbfgs"} + ) + ], + "max_iter": [Interval(Integral, 0, None, closed="left"), None], + "tol": [Interval(Real, 0, None, closed="left")], + "verbose": ["verbose"], + "positive": ["boolean"], + "random_state": ["random_state"], + "return_n_iter": ["boolean"], + "return_intercept": ["boolean"], + "check_input": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def ridge_regression( + X, + y, + alpha, + *, + sample_weight=None, + solver="auto", + max_iter=None, + tol=1e-4, + verbose=0, + positive=False, + random_state=None, + return_n_iter=False, + return_intercept=False, + check_input=True, +): + """Solve the ridge equation by the method of normal equations. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix, LinearOperator} of shape \ + (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) or (n_samples, n_targets) + Target values. + + alpha : float or array-like of shape (n_targets,) + Constant that multiplies the L2 term, controlling regularization + strength. `alpha` must be a non-negative float i.e. in `[0, inf)`. + + When `alpha = 0`, the objective is equivalent to ordinary least + squares, solved by the :class:`LinearRegression` object. For numerical + reasons, using `alpha = 0` with the `Ridge` object is not advised. + Instead, you should use the :class:`LinearRegression` object. + + If an array is passed, penalties are assumed to be specific to the + targets. Hence they must correspond in number. 
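For example (an illustrative aside, not part of this docstring), an array-valued `alpha` assigns one penalty per column of a 2-D `y`:

    import numpy as np
    from sklearn.linear_model import ridge_regression

    rng = np.random.RandomState(0)
    X, Y = rng.randn(30, 4), rng.randn(30, 2)                    # two targets
    coef = ridge_regression(X, Y, alpha=np.array([0.5, 10.0]))   # one penalty per target
    print(coef.shape)                                            # (2, 4)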
+ + sample_weight : float or array-like of shape (n_samples,), default=None + Individual weights for each sample. If given a float, every sample + will have the same weight. If sample_weight is not None and + solver='auto', the solver will be set to 'cholesky'. + + .. versionadded:: 0.17 + + solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', \ + 'sag', 'saga', 'lbfgs'}, default='auto' + Solver to use in the computational routines: + + - 'auto' chooses the solver automatically based on the type of data. + + - 'svd' uses a Singular Value Decomposition of X to compute the Ridge + coefficients. It is the most stable solver, in particular more stable + for singular matrices than 'cholesky' at the cost of being slower. + + - 'cholesky' uses the standard scipy.linalg.solve function to + obtain a closed-form solution via a Cholesky decomposition of + dot(X.T, X) + + - 'sparse_cg' uses the conjugate gradient solver as found in + scipy.sparse.linalg.cg. As an iterative algorithm, this solver is + more appropriate than 'cholesky' for large-scale data + (possibility to set `tol` and `max_iter`). + + - 'lsqr' uses the dedicated regularized least-squares routine + scipy.sparse.linalg.lsqr. It is the fastest and uses an iterative + procedure. + + - 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses + its improved, unbiased version named SAGA. Both methods also use an + iterative procedure, and are often faster than other solvers when + both n_samples and n_features are large. Note that 'sag' and + 'saga' fast convergence is only guaranteed on features with + approximately the same scale. You can preprocess the data with a + scaler from sklearn.preprocessing. + + - 'lbfgs' uses L-BFGS-B algorithm implemented in + `scipy.optimize.minimize`. It can be used only when `positive` + is True. + + All solvers except 'svd' support both dense and sparse data. However, only + 'lsqr', 'sag', 'sparse_cg', and 'lbfgs' support sparse input when + `fit_intercept` is True. + + .. versionadded:: 0.17 + Stochastic Average Gradient descent solver. + .. versionadded:: 0.19 + SAGA solver. + + max_iter : int, default=None + Maximum number of iterations for conjugate gradient solver. + For the 'sparse_cg' and 'lsqr' solvers, the default value is determined + by scipy.sparse.linalg. For 'sag' and saga solver, the default value is + 1000. For 'lbfgs' solver, the default value is 15000. + + tol : float, default=1e-4 + Precision of the solution. Note that `tol` has no effect for solvers 'svd' and + 'cholesky'. + + .. versionchanged:: 1.2 + Default value changed from 1e-3 to 1e-4 for consistency with other linear + models. + + verbose : int, default=0 + Verbosity level. Setting verbose > 0 will display additional + information depending on the solver used. + + positive : bool, default=False + When set to ``True``, forces the coefficients to be positive. + Only 'lbfgs' solver is supported in this case. + + random_state : int, RandomState instance, default=None + Used when ``solver`` == 'sag' or 'saga' to shuffle the data. + See :term:`Glossary ` for details. + + return_n_iter : bool, default=False + If True, the method also returns `n_iter`, the actual number of + iteration performed by the solver. + + .. versionadded:: 0.17 + + return_intercept : bool, default=False + If True and if X is sparse, the method also returns the intercept, + and the solver is automatically changed to 'sag'. This is only a + temporary fix for fitting the intercept with sparse data. 
For dense + data, use sklearn.linear_model._preprocess_data before your regression. + + .. versionadded:: 0.17 + + check_input : bool, default=True + If False, the input arrays X and y will not be checked. + + .. versionadded:: 0.21 + + Returns + ------- + coef : ndarray of shape (n_features,) or (n_targets, n_features) + Weight vector(s). + + n_iter : int, optional + The actual number of iteration performed by the solver. + Only returned if `return_n_iter` is True. + + intercept : float or ndarray of shape (n_targets,) + The intercept of the model. Only returned if `return_intercept` + is True and if X is a scipy sparse array. + + Notes + ----- + This function won't compute the intercept. + + Regularization improves the conditioning of the problem and + reduces the variance of the estimates. Larger values specify stronger + regularization. Alpha corresponds to ``1 / (2C)`` in other linear + models such as :class:`~sklearn.linear_model.LogisticRegression` or + :class:`~sklearn.svm.LinearSVC`. If an array is passed, penalties are + assumed to be specific to the targets. Hence they must correspond in + number. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.datasets import make_regression + >>> from sklearn.linear_model import ridge_regression + >>> rng = np.random.RandomState(0) + >>> X = rng.randn(100, 4) + >>> y = 2.0 * X[:, 0] - 1.0 * X[:, 1] + 0.1 * rng.standard_normal(100) + >>> coef, intercept = ridge_regression(X, y, alpha=1.0, return_intercept=True, + ... random_state=0) + >>> coef + array([ 1.97, -1., -2.69e-3, -9.27e-4 ]) + >>> intercept + np.float64(-.0012) + """ + return _ridge_regression( + X, + y, + alpha, + sample_weight=sample_weight, + solver=solver, + max_iter=max_iter, + tol=tol, + verbose=verbose, + positive=positive, + random_state=random_state, + return_n_iter=return_n_iter, + return_intercept=return_intercept, + X_scale=None, + X_offset=None, + check_input=check_input, + ) + + +def _ridge_regression( + X, + y, + alpha, + sample_weight=None, + solver="auto", + max_iter=None, + tol=1e-4, + verbose=0, + positive=False, + random_state=None, + return_n_iter=False, + return_intercept=False, + return_solver=False, + X_scale=None, + X_offset=None, + check_input=True, + fit_intercept=False, +): + xp, is_array_api_compliant, device_ = get_namespace_and_device( + X, y, sample_weight, X_scale, X_offset + ) + is_numpy_namespace = _is_numpy_namespace(xp) + X_is_sparse = sparse.issparse(X) + + has_sw = sample_weight is not None + + solver = resolve_solver(solver, positive, return_intercept, X_is_sparse, xp) + + if is_numpy_namespace and not X_is_sparse: + X = np.asarray(X) + + if not is_numpy_namespace and solver != "svd": + raise ValueError( + f"Array API dispatch to namespace {xp.__name__} only supports " + f"solver 'svd'. Got '{solver}'." + ) + + if positive and solver != "lbfgs": + raise ValueError( + "When positive=True, only 'lbfgs' solver can be used. " + f"Please change solver {solver} to 'lbfgs' " + "or set positive=False." + ) + + if solver == "lbfgs" and not positive: + raise ValueError( + "'lbfgs' solver can be used only when positive=True. " + "Please use another solver." + ) + + if return_intercept and solver != "sag": + raise ValueError( + "In Ridge, only 'sag' solver can directly fit the " + "intercept. Please change solver to 'sag' or set " + "return_intercept=False." 
+ ) + + if check_input: + _dtype = [xp.float64, xp.float32] + _accept_sparse = _get_valid_accept_sparse(X_is_sparse, solver) + X = check_array(X, accept_sparse=_accept_sparse, dtype=_dtype, order="C") + y = check_array(y, dtype=X.dtype, ensure_2d=False, order=None) + check_consistent_length(X, y) + + n_samples, n_features = X.shape + + if y.ndim > 2: + raise ValueError("Target y has the wrong shape %s" % str(y.shape)) + + if y.ndim == 1: + y = xp.reshape(y, (-1, 1)) + + n_samples_, n_targets = y.shape + + if n_samples != n_samples_: + raise ValueError( + "Number of samples in X and y does not correspond: %d != %d" + % (n_samples, n_samples_) + ) + + if has_sw: + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + + if solver not in ["sag", "saga"]: + # SAG supports sample_weight directly. For other solvers, + # we implement sample_weight via a simple rescaling. + X, y, sample_weight_sqrt = _rescale_data(X, y, sample_weight) + + # Some callers of this method might pass alpha as single + # element array which already has been validated. + if alpha is not None and not isinstance(alpha, type(xp.asarray([0.0]))): + alpha = check_scalar( + alpha, + "alpha", + target_type=numbers.Real, + min_val=0.0, + include_boundaries="left", + ) + + # There should be either 1 or n_targets penalties + alpha = _ravel(xp.asarray(alpha, device=device_, dtype=X.dtype), xp=xp) + if alpha.shape[0] not in [1, n_targets]: + raise ValueError( + "Number of targets and number of penalties do not correspond: %d != %d" + % (alpha.shape[0], n_targets) + ) + + if alpha.shape[0] == 1 and n_targets > 1: + alpha = xp.full( + shape=(n_targets,), fill_value=alpha[0], dtype=alpha.dtype, device=device_ + ) + + n_iter = None + if solver == "sparse_cg": + coef = _solve_sparse_cg( + X, + y, + alpha, + max_iter=max_iter, + tol=tol, + verbose=verbose, + X_offset=X_offset, + X_scale=X_scale, + sample_weight_sqrt=sample_weight_sqrt if has_sw else None, + ) + + elif solver == "lsqr": + coef, n_iter = _solve_lsqr( + X, + y, + alpha=alpha, + fit_intercept=fit_intercept, + max_iter=max_iter, + tol=tol, + X_offset=X_offset, + X_scale=X_scale, + sample_weight_sqrt=sample_weight_sqrt if has_sw else None, + ) + + elif solver == "cholesky": + if n_features > n_samples: + K = safe_sparse_dot(X, X.T, dense_output=True) + try: + dual_coef = _solve_cholesky_kernel(K, y, alpha) + + coef = safe_sparse_dot(X.T, dual_coef, dense_output=True).T + except linalg.LinAlgError: + # use SVD solver if matrix is singular + solver = "svd" + else: + try: + coef = _solve_cholesky(X, y, alpha) + except linalg.LinAlgError: + # use SVD solver if matrix is singular + solver = "svd" + + elif solver in ["sag", "saga"]: + # precompute max_squared_sum for all targets + max_squared_sum = row_norms(X, squared=True).max() + + coef = np.empty((y.shape[1], n_features), dtype=X.dtype) + n_iter = np.empty(y.shape[1], dtype=np.int32) + intercept = np.zeros((y.shape[1],), dtype=X.dtype) + for i, (alpha_i, target) in enumerate(zip(alpha, y.T)): + init = { + "coef": np.zeros((n_features + int(return_intercept), 1), dtype=X.dtype) + } + coef_, n_iter_, _ = sag_solver( + X, + target.ravel(), + sample_weight, + "squared", + alpha_i, + 0, + max_iter, + tol, + verbose, + random_state, + False, + max_squared_sum, + init, + is_saga=solver == "saga", + ) + if return_intercept: + coef[i] = coef_[:-1] + intercept[i] = coef_[-1] + else: + coef[i] = coef_ + n_iter[i] = n_iter_ + + if intercept.shape[0] == 1: + intercept = intercept[0] + + elif solver == "lbfgs": + coef = 
_solve_lbfgs( + X, + y, + alpha, + positive=positive, + tol=tol, + max_iter=max_iter, + X_offset=X_offset, + X_scale=X_scale, + sample_weight_sqrt=sample_weight_sqrt if has_sw else None, + ) + + if solver == "svd": + if X_is_sparse: + raise TypeError("SVD solver does not support sparse inputs currently") + coef = _solve_svd(X, y, alpha, xp) + + if n_targets == 1: + coef = _ravel(coef) + + coef = xp.asarray(coef) + + if return_n_iter and return_intercept: + res = coef, n_iter, intercept + elif return_intercept: + res = coef, intercept + elif return_n_iter: + res = coef, n_iter + else: + res = coef + + return (*res, solver) if return_solver else res + + +def resolve_solver(solver, positive, return_intercept, is_sparse, xp): + if solver != "auto": + return solver + + is_numpy_namespace = _is_numpy_namespace(xp) + + auto_solver_np = resolve_solver_for_numpy(positive, return_intercept, is_sparse) + if is_numpy_namespace: + return auto_solver_np + + if positive: + raise ValueError( + "The solvers that support positive fitting do not support " + f"Array API dispatch to namespace {xp.__name__}. Please " + "either disable Array API dispatch, or use a numpy-like " + "namespace, or set `positive=False`." + ) + + # At the moment, Array API dispatch only supports the "svd" solver. + solver = "svd" + if solver != auto_solver_np: + warnings.warn( + f"Using Array API dispatch to namespace {xp.__name__} with " + f"`solver='auto'` will result in using the solver '{solver}'. " + "The results may differ from those when using a Numpy array, " + f"because in that case the preferred solver would be {auto_solver_np}. " + f"Set `solver='{solver}'` to suppress this warning." + ) + + return solver + + +def resolve_solver_for_numpy(positive, return_intercept, is_sparse): + if positive: + return "lbfgs" + + if return_intercept: + # sag supports fitting intercept directly + return "sag" + + if not is_sparse: + return "cholesky" + + return "sparse_cg" + + +class _BaseRidge(LinearModel, metaclass=ABCMeta): + _parameter_constraints: dict = { + "alpha": [Interval(Real, 0, None, closed="left"), np.ndarray], + "fit_intercept": ["boolean"], + "copy_X": ["boolean"], + "max_iter": [Interval(Integral, 1, None, closed="left"), None], + "tol": [Interval(Real, 0, None, closed="left")], + "solver": [ + StrOptions( + {"auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga", "lbfgs"} + ) + ], + "positive": ["boolean"], + "random_state": ["random_state"], + } + + @abstractmethod + def __init__( + self, + alpha=1.0, + *, + fit_intercept=True, + copy_X=True, + max_iter=None, + tol=1e-4, + solver="auto", + positive=False, + random_state=None, + ): + self.alpha = alpha + self.fit_intercept = fit_intercept + self.copy_X = copy_X + self.max_iter = max_iter + self.tol = tol + self.solver = solver + self.positive = positive + self.random_state = random_state + + def fit(self, X, y, sample_weight=None): + xp, is_array_api_compliant = get_namespace(X, y, sample_weight) + + if self.solver == "lbfgs" and not self.positive: + raise ValueError( + "'lbfgs' solver can be used only when positive=True. " + "Please use another solver." + ) + + if self.positive: + if self.solver not in ["auto", "lbfgs"]: + raise ValueError( + f"solver='{self.solver}' does not support positive fitting. 
Please" + " set the solver to 'auto' or 'lbfgs', or set `positive=False`" + ) + else: + solver = self.solver + elif sparse.issparse(X) and self.fit_intercept: + if self.solver not in ["auto", "lbfgs", "lsqr", "sag", "sparse_cg"]: + raise ValueError( + "solver='{}' does not support fitting the intercept " + "on sparse data. Please set the solver to 'auto' or " + "'lsqr', 'sparse_cg', 'sag', 'lbfgs' " + "or set `fit_intercept=False`".format(self.solver) + ) + if self.solver in ["lsqr", "lbfgs"]: + solver = self.solver + elif self.solver == "sag" and self.max_iter is None and self.tol > 1e-4: + warnings.warn( + '"sag" solver requires many iterations to fit ' + "an intercept with sparse inputs. Either set the " + 'solver to "auto" or "sparse_cg", or set a low ' + '"tol" and a high "max_iter" (especially if inputs are ' + "not standardized)." + ) + solver = "sag" + else: + solver = "sparse_cg" + else: + solver = self.solver + + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + + # when X is sparse we only remove offset from y + X, y, X_offset, y_offset, X_scale = _preprocess_data( + X, + y, + fit_intercept=self.fit_intercept, + copy=self.copy_X, + sample_weight=sample_weight, + ) + + if solver == "sag" and sparse.issparse(X) and self.fit_intercept: + self.coef_, self.n_iter_, self.intercept_, self.solver_ = _ridge_regression( + X, + y, + alpha=self.alpha, + sample_weight=sample_weight, + max_iter=self.max_iter, + tol=self.tol, + solver="sag", + positive=self.positive, + random_state=self.random_state, + return_n_iter=True, + return_intercept=True, + return_solver=True, + check_input=False, + ) + # add the offset which was subtracted by _preprocess_data + self.intercept_ += y_offset + + else: + if sparse.issparse(X) and self.fit_intercept: + # required to fit intercept with sparse_cg and lbfgs solver + params = {"X_offset": X_offset, "X_scale": X_scale} + else: + # for dense matrices or when intercept is set to 0 + params = {} + + self.coef_, self.n_iter_, self.solver_ = _ridge_regression( + X, + y, + alpha=self.alpha, + sample_weight=sample_weight, + max_iter=self.max_iter, + tol=self.tol, + solver=solver, + positive=self.positive, + random_state=self.random_state, + return_n_iter=True, + return_intercept=False, + return_solver=True, + check_input=False, + fit_intercept=self.fit_intercept, + **params, + ) + self._set_intercept(X_offset, y_offset, X_scale) + + return self + + +class Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge): + """Linear least squares with l2 regularization. + + Minimizes the objective function:: + + ||y - Xw||^2_2 + alpha * ||w||^2_2 + + This model solves a regression model where the loss function is + the linear least squares function and regularization is given by + the l2-norm. Also known as Ridge Regression or Tikhonov regularization. + This estimator has built-in support for multi-variate regression + (i.e., when y is a 2d-array of shape (n_samples, n_targets)). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + alpha : {float, ndarray of shape (n_targets,)}, default=1.0 + Constant that multiplies the L2 term, controlling regularization + strength. `alpha` must be a non-negative float i.e. in `[0, inf)`. + + When `alpha = 0`, the objective is equivalent to ordinary least + squares, solved by the :class:`LinearRegression` object. For numerical + reasons, using `alpha = 0` with the `Ridge` object is not advised. + Instead, you should use the :class:`LinearRegression` object. 
+ + If an array is passed, penalties are assumed to be specific to the + targets. Hence they must correspond in number. + + fit_intercept : bool, default=True + Whether to fit the intercept for this model. If set + to false, no intercept will be used in calculations + (i.e. ``X`` and ``y`` are expected to be centered). + + copy_X : bool, default=True + If True, X will be copied; else, it may be overwritten. + + max_iter : int, default=None + Maximum number of iterations for conjugate gradient solver. + For 'sparse_cg' and 'lsqr' solvers, the default value is determined + by scipy.sparse.linalg. For 'sag' solver, the default value is 1000. + For 'lbfgs' solver, the default value is 15000. + + tol : float, default=1e-4 + The precision of the solution (`coef_`) is determined by `tol` which + specifies a different convergence criterion for each solver: + + - 'svd': `tol` has no impact. + + - 'cholesky': `tol` has no impact. + + - 'sparse_cg': norm of residuals smaller than `tol`. + + - 'lsqr': `tol` is set as atol and btol of scipy.sparse.linalg.lsqr, + which control the norm of the residual vector in terms of the norms of + matrix and coefficients. + + - 'sag' and 'saga': relative change of coef smaller than `tol`. + + - 'lbfgs': maximum of the absolute (projected) gradient=max|residuals| + smaller than `tol`. + + .. versionchanged:: 1.2 + Default value changed from 1e-3 to 1e-4 for consistency with other linear + models. + + solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', \ + 'sag', 'saga', 'lbfgs'}, default='auto' + Solver to use in the computational routines: + + - 'auto' chooses the solver automatically based on the type of data. + + - 'svd' uses a Singular Value Decomposition of X to compute the Ridge + coefficients. It is the most stable solver, in particular more stable + for singular matrices than 'cholesky' at the cost of being slower. + + - 'cholesky' uses the standard scipy.linalg.solve function to + obtain a closed-form solution. + + - 'sparse_cg' uses the conjugate gradient solver as found in + scipy.sparse.linalg.cg. As an iterative algorithm, this solver is + more appropriate than 'cholesky' for large-scale data + (possibility to set `tol` and `max_iter`). + + - 'lsqr' uses the dedicated regularized least-squares routine + scipy.sparse.linalg.lsqr. It is the fastest and uses an iterative + procedure. + + - 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses + its improved, unbiased version named SAGA. Both methods also use an + iterative procedure, and are often faster than other solvers when + both n_samples and n_features are large. Note that 'sag' and + 'saga' fast convergence is only guaranteed on features with + approximately the same scale. You can preprocess the data with a + scaler from sklearn.preprocessing. + + - 'lbfgs' uses L-BFGS-B algorithm implemented in + `scipy.optimize.minimize`. It can be used only when `positive` + is True. + + All solvers except 'svd' support both dense and sparse data. However, only + 'lsqr', 'sag', 'sparse_cg', and 'lbfgs' support sparse input when + `fit_intercept` is True. + + .. versionadded:: 0.17 + Stochastic Average Gradient descent solver. + .. versionadded:: 0.19 + SAGA solver. + + positive : bool, default=False + When set to ``True``, forces the coefficients to be positive. + Only 'lbfgs' solver is supported in this case. + + random_state : int, RandomState instance, default=None + Used when ``solver`` == 'sag' or 'saga' to shuffle the data. + See :term:`Glossary ` for details. + + .. 
versionadded:: 0.17 + `random_state` to support Stochastic Average Gradient. + + Attributes + ---------- + coef_ : ndarray of shape (n_features,) or (n_targets, n_features) + Weight vector(s). + + intercept_ : float or ndarray of shape (n_targets,) + Independent term in decision function. Set to 0.0 if + ``fit_intercept = False``. + + n_iter_ : None or ndarray of shape (n_targets,) + Actual number of iterations for each target. Available only for + sag and lsqr solvers. Other solvers will return None. + + .. versionadded:: 0.17 + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + solver_ : str + The solver that was used at fit time by the computational + routines. + + .. versionadded:: 1.5 + + See Also + -------- + RidgeClassifier : Ridge classifier. + RidgeCV : Ridge regression with built-in cross validation. + :class:`~sklearn.kernel_ridge.KernelRidge` : Kernel ridge regression + combines ridge regression with the kernel trick. + + Notes + ----- + Regularization improves the conditioning of the problem and + reduces the variance of the estimates. Larger values specify stronger + regularization. Alpha corresponds to ``1 / (2C)`` in other linear + models such as :class:`~sklearn.linear_model.LogisticRegression` or + :class:`~sklearn.svm.LinearSVC`. + + Examples + -------- + >>> from sklearn.linear_model import Ridge + >>> import numpy as np + >>> n_samples, n_features = 10, 5 + >>> rng = np.random.RandomState(0) + >>> y = rng.randn(n_samples) + >>> X = rng.randn(n_samples, n_features) + >>> clf = Ridge(alpha=1.0) + >>> clf.fit(X, y) + Ridge() + """ + + def __init__( + self, + alpha=1.0, + *, + fit_intercept=True, + copy_X=True, + max_iter=None, + tol=1e-4, + solver="auto", + positive=False, + random_state=None, + ): + super().__init__( + alpha=alpha, + fit_intercept=fit_intercept, + copy_X=copy_X, + max_iter=max_iter, + tol=tol, + solver=solver, + positive=positive, + random_state=random_state, + ) + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, sample_weight=None): + """Fit Ridge regression model. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : ndarray of shape (n_samples,) or (n_samples, n_targets) + Target values. + + sample_weight : float or ndarray of shape (n_samples,), default=None + Individual weights for each sample. If given a float, every sample + will have the same weight. + + Returns + ------- + self : object + Fitted estimator. + """ + _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), self.solver) + xp, _ = get_namespace(X, y, sample_weight) + X, y = validate_data( + self, + X, + y, + accept_sparse=_accept_sparse, + dtype=[xp.float64, xp.float32], + force_writeable=True, + multi_output=True, + y_numeric=True, + ) + return super().fit(X, y, sample_weight=sample_weight) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.array_api_support = True + tags.input_tags.sparse = (self.solver != "svd") and ( + self.solver != "cholesky" or not self.fit_intercept + ) + return tags + + +class _RidgeClassifierMixin(LinearClassifierMixin): + def _prepare_data(self, X, y, sample_weight, solver): + """Validate `X` and `y` and binarize `y`. 
+ + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : ndarray of shape (n_samples,) + Target values. + + sample_weight : float or ndarray of shape (n_samples,), default=None + Individual weights for each sample. If given a float, every sample + will have the same weight. + + solver : str + The solver used in `Ridge` to know which sparse format to support. + + Returns + ------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + Validated training data. + + y : ndarray of shape (n_samples,) + Validated target values. + + sample_weight : ndarray of shape (n_samples,) + Validated sample weights. + + Y : ndarray of shape (n_samples, n_classes) + The binarized version of `y`. + """ + accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), solver) + X, y = validate_data( + self, + X, + y, + accept_sparse=accept_sparse, + multi_output=True, + y_numeric=False, + force_writeable=True, + ) + + self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1) + Y = self._label_binarizer.fit_transform(y) + if not self._label_binarizer.y_type_.startswith("multilabel"): + y = column_or_1d(y, warn=True) + + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + if self.class_weight: + sample_weight = sample_weight * compute_sample_weight(self.class_weight, y) + return X, y, sample_weight, Y + + def predict(self, X): + """Predict class labels for samples in `X`. + + Parameters + ---------- + X : {array-like, spare matrix} of shape (n_samples, n_features) + The data matrix for which we want to predict the targets. + + Returns + ------- + y_pred : ndarray of shape (n_samples,) or (n_samples, n_outputs) + Vector or matrix containing the predictions. In binary and + multiclass problems, this is a vector containing `n_samples`. In + a multilabel problem, it returns a matrix of shape + `(n_samples, n_outputs)`. + """ + check_is_fitted(self, attributes=["_label_binarizer"]) + if self._label_binarizer.y_type_.startswith("multilabel"): + # Threshold such that the negative label is -1 and positive label + # is 1 to use the inverse transform of the label binarizer fitted + # during fit. + scores = 2 * (self.decision_function(X) > 0) - 1 + return self._label_binarizer.inverse_transform(scores) + return super().predict(X) + + @property + def classes_(self): + """Classes labels.""" + return self._label_binarizer.classes_ + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.classifier_tags.multi_label = True + return tags + + +class RidgeClassifier(_RidgeClassifierMixin, _BaseRidge): + """Classifier using Ridge regression. + + This classifier first converts the target values into ``{-1, 1}`` and + then treats the problem as a regression task (multi-output regression in + the multiclass case). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + alpha : float, default=1.0 + Regularization strength; must be a positive float. Regularization + improves the conditioning of the problem and reduces the variance of + the estimates. Larger values specify stronger regularization. + Alpha corresponds to ``1 / (2C)`` in other linear models such as + :class:`~sklearn.linear_model.LogisticRegression` or + :class:`~sklearn.svm.LinearSVC`. + + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set to false, no + intercept will be used in calculations (e.g. data is expected to be + already centered). 
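Editorial sketch of the {-1, 1} reduction described in the class summary above: on a binary problem, the classifier agrees with thresholding a plain Ridge regression on targets encoded as -1/+1.

    import numpy as np
    from sklearn.linear_model import Ridge, RidgeClassifier

    rng = np.random.RandomState(0)
    X = rng.randn(100, 5)
    y = (X[:, 0] + 0.3 * rng.randn(100) > 0).astype(int)          # labels in {0, 1}

    clf = RidgeClassifier(alpha=1.0).fit(X, y)
    reg = Ridge(alpha=1.0).fit(X, np.where(y == 1, 1.0, -1.0))    # regress on {-1, +1}
    assert np.array_equal(clf.predict(X), (reg.predict(X) > 0).astype(int))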
+ + copy_X : bool, default=True + If True, X will be copied; else, it may be overwritten. + + max_iter : int, default=None + Maximum number of iterations for conjugate gradient solver. + The default value is determined by scipy.sparse.linalg. + + tol : float, default=1e-4 + The precision of the solution (`coef_`) is determined by `tol` which + specifies a different convergence criterion for each solver: + + - 'svd': `tol` has no impact. + + - 'cholesky': `tol` has no impact. + + - 'sparse_cg': norm of residuals smaller than `tol`. + + - 'lsqr': `tol` is set as atol and btol of scipy.sparse.linalg.lsqr, + which control the norm of the residual vector in terms of the norms of + matrix and coefficients. + + - 'sag' and 'saga': relative change of coef smaller than `tol`. + + - 'lbfgs': maximum of the absolute (projected) gradient=max|residuals| + smaller than `tol`. + + .. versionchanged:: 1.2 + Default value changed from 1e-3 to 1e-4 for consistency with other linear + models. + + class_weight : dict or 'balanced', default=None + Weights associated with classes in the form ``{class_label: weight}``. + If not given, all classes are supposed to have weight one. + + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))``. + + solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', \ + 'sag', 'saga', 'lbfgs'}, default='auto' + Solver to use in the computational routines: + + - 'auto' chooses the solver automatically based on the type of data. + + - 'svd' uses a Singular Value Decomposition of X to compute the Ridge + coefficients. It is the most stable solver, in particular more stable + for singular matrices than 'cholesky' at the cost of being slower. + + - 'cholesky' uses the standard scipy.linalg.solve function to + obtain a closed-form solution. + + - 'sparse_cg' uses the conjugate gradient solver as found in + scipy.sparse.linalg.cg. As an iterative algorithm, this solver is + more appropriate than 'cholesky' for large-scale data + (possibility to set `tol` and `max_iter`). + + - 'lsqr' uses the dedicated regularized least-squares routine + scipy.sparse.linalg.lsqr. It is the fastest and uses an iterative + procedure. + + - 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses + its unbiased and more flexible version named SAGA. Both methods + use an iterative procedure, and are often faster than other solvers + when both n_samples and n_features are large. Note that 'sag' and + 'saga' fast convergence is only guaranteed on features with + approximately the same scale. You can preprocess the data with a + scaler from sklearn.preprocessing. + + .. versionadded:: 0.17 + Stochastic Average Gradient descent solver. + .. versionadded:: 0.19 + SAGA solver. + + - 'lbfgs' uses L-BFGS-B algorithm implemented in + `scipy.optimize.minimize`. It can be used only when `positive` + is True. + + positive : bool, default=False + When set to ``True``, forces the coefficients to be positive. + Only 'lbfgs' solver is supported in this case. + + random_state : int, RandomState instance, default=None + Used when ``solver`` == 'sag' or 'saga' to shuffle the data. + See :term:`Glossary ` for details. + + Attributes + ---------- + coef_ : ndarray of shape (1, n_features) or (n_classes, n_features) + Coefficient of the features in the decision function. + + ``coef_`` is of shape (1, n_features) when the given problem is binary. 
+ + intercept_ : float or ndarray of shape (n_targets,) + Independent term in decision function. Set to 0.0 if + ``fit_intercept = False``. + + n_iter_ : None or ndarray of shape (n_targets,) + Actual number of iterations for each target. Available only for + sag and lsqr solvers. Other solvers will return None. + + classes_ : ndarray of shape (n_classes,) + The classes labels. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + solver_ : str + The solver that was used at fit time by the computational + routines. + + .. versionadded:: 1.5 + + See Also + -------- + Ridge : Ridge regression. + RidgeClassifierCV : Ridge classifier with built-in cross validation. + + Notes + ----- + For multi-class classification, n_class classifiers are trained in + a one-versus-all approach. Concretely, this is implemented by taking + advantage of the multi-variate response support in Ridge. + + Examples + -------- + >>> from sklearn.datasets import load_breast_cancer + >>> from sklearn.linear_model import RidgeClassifier + >>> X, y = load_breast_cancer(return_X_y=True) + >>> clf = RidgeClassifier().fit(X, y) + >>> clf.score(X, y) + 0.9595... + """ + + _parameter_constraints: dict = { + **_BaseRidge._parameter_constraints, + "class_weight": [dict, StrOptions({"balanced"}), None], + } + + def __init__( + self, + alpha=1.0, + *, + fit_intercept=True, + copy_X=True, + max_iter=None, + tol=1e-4, + class_weight=None, + solver="auto", + positive=False, + random_state=None, + ): + super().__init__( + alpha=alpha, + fit_intercept=fit_intercept, + copy_X=copy_X, + max_iter=max_iter, + tol=tol, + solver=solver, + positive=positive, + random_state=random_state, + ) + self.class_weight = class_weight + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, sample_weight=None): + """Fit Ridge classifier model. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : ndarray of shape (n_samples,) + Target values. + + sample_weight : float or ndarray of shape (n_samples,), default=None + Individual weights for each sample. If given a float, every sample + will have the same weight. + + .. versionadded:: 0.17 + *sample_weight* support to RidgeClassifier. + + Returns + ------- + self : object + Instance of the estimator. + """ + X, y, sample_weight, Y = self._prepare_data(X, y, sample_weight, self.solver) + + super().fit(X, Y, sample_weight=sample_weight) + return self + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = (self.solver != "svd") and ( + self.solver != "cholesky" or not self.fit_intercept + ) + return tags + + +def _check_gcv_mode(X, gcv_mode): + if gcv_mode in ["eigen", "svd"]: + return gcv_mode + # if X has more rows than columns, use decomposition of X^T.X, + # otherwise X.X^T + if X.shape[0] > X.shape[1]: + return "svd" + return "eigen" + + +def _find_smallest_angle(query, vectors): + """Find the column of vectors that is most aligned with the query. + + Both query and the columns of vectors must have their l2 norm equal to 1. + + Parameters + ---------- + query : ndarray of shape (n_samples,) + Normalized query vector. + + vectors : ndarray of shape (n_samples, n_features) + Vectors to which we compare query, as columns. Must be normalized. 
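A two-column toy illustration of what this helper returns (editorial aside):

    import numpy as np

    query = np.array([1.0, 0.0])
    vectors = np.array([[0.6, -1.0],
                        [0.8,  0.0]])             # both columns have unit l2 norm
    print(np.argmax(np.abs(query.dot(vectors))))  # 1: the second column is (anti-)parallel to query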
+ """ + abs_cosine = np.abs(query.dot(vectors)) + index = np.argmax(abs_cosine) + return index + + +class _X_CenterStackOp(sparse.linalg.LinearOperator): + """Behaves as centered and scaled X with an added intercept column. + + This operator behaves as + np.hstack([X - sqrt_sw[:, None] * X_mean, sqrt_sw[:, None]]) + """ + + def __init__(self, X, X_mean, sqrt_sw): + n_samples, n_features = X.shape + super().__init__(X.dtype, (n_samples, n_features + 1)) + self.X = X + self.X_mean = X_mean + self.sqrt_sw = sqrt_sw + + def _matvec(self, v): + v = v.ravel() + return ( + safe_sparse_dot(self.X, v[:-1], dense_output=True) + - self.sqrt_sw * self.X_mean.dot(v[:-1]) + + v[-1] * self.sqrt_sw + ) + + def _matmat(self, v): + return ( + safe_sparse_dot(self.X, v[:-1], dense_output=True) + - self.sqrt_sw[:, None] * self.X_mean.dot(v[:-1]) + + v[-1] * self.sqrt_sw[:, None] + ) + + def _transpose(self): + return _XT_CenterStackOp(self.X, self.X_mean, self.sqrt_sw) + + +class _XT_CenterStackOp(sparse.linalg.LinearOperator): + """Behaves as transposed centered and scaled X with an intercept column. + + This operator behaves as + np.hstack([X - sqrt_sw[:, None] * X_mean, sqrt_sw[:, None]]).T + """ + + def __init__(self, X, X_mean, sqrt_sw): + n_samples, n_features = X.shape + super().__init__(X.dtype, (n_features + 1, n_samples)) + self.X = X + self.X_mean = X_mean + self.sqrt_sw = sqrt_sw + + def _matvec(self, v): + v = v.ravel() + n_features = self.shape[0] + res = np.empty(n_features, dtype=self.X.dtype) + res[:-1] = safe_sparse_dot(self.X.T, v, dense_output=True) - ( + self.X_mean * self.sqrt_sw.dot(v) + ) + res[-1] = np.dot(v, self.sqrt_sw) + return res + + def _matmat(self, v): + n_features = self.shape[0] + res = np.empty((n_features, v.shape[1]), dtype=self.X.dtype) + res[:-1] = safe_sparse_dot(self.X.T, v, dense_output=True) - self.X_mean[ + :, None + ] * self.sqrt_sw.dot(v) + res[-1] = np.dot(self.sqrt_sw, v) + return res + + +class _IdentityRegressor(RegressorMixin, BaseEstimator): + """Fake regressor which will directly output the prediction.""" + + def decision_function(self, y_predict): + return y_predict + + def predict(self, y_predict): + return y_predict + + +class _IdentityClassifier(LinearClassifierMixin, BaseEstimator): + """Fake classifier which will directly output the prediction. + + We inherit from LinearClassifierMixin to get the proper shape for the + output `y`. + """ + + def __init__(self, classes): + self.classes_ = classes + + def decision_function(self, y_predict): + return y_predict + + +class _RidgeGCV(LinearModel): + """Ridge regression with built-in Leave-one-out Cross-Validation. + + This class is not intended to be used directly. Use RidgeCV instead. + + `_RidgeGCV` uses a Generalized Cross-Validation for model selection. It's an + efficient approximation of leave-one-out cross-validation (LOO-CV), where instead of + computing multiple models by excluding one data point at a time, it uses an + algebraic shortcut to approximate the LOO-CV error, making it faster and + computationally more efficient. + + Using a naive grid-search approach with a leave-one-out cross-validation in contrast + requires to fit `n_samples` models to compute the prediction error for each sample + and then to repeat this process for each alpha in the grid. + + Here, the prediction error for each sample is computed by solving a **single** + linear system (in other words a single model) via a matrix factorization (i.e. + eigendecomposition or SVD) solving the problem stated in the Notes section. 
Finally, + we need to repeat this process for each alpha in the grid. The detailed complexity + is further discussed in Sect. 4 in [1]. + + This algebraic approach is only applicable for regularized least squares + problems. It could potentially be extended to kernel ridge regression. + + See the Notes section and references for more details regarding the formulation + and the linear system that is solved. + + Notes + ----- + + We want to solve (K + alpha*Id)c = y, + where K = X X^T is the kernel matrix. + + Let G = (K + alpha*Id). + + Dual solution: c = G^-1y + Primal solution: w = X^T c + + Compute eigendecomposition K = Q V Q^T. + Then G^-1 = Q (V + alpha*Id)^-1 Q^T, + where (V + alpha*Id) is diagonal. + It is thus inexpensive to inverse for many alphas. + + Let loov be the vector of prediction values for each example + when the model was fitted with all examples but this example. + + loov = (KG^-1Y - diag(KG^-1)Y) / diag(I-KG^-1) + + Let looe be the vector of prediction errors for each example + when the model was fitted with all examples but this example. + + looe = y - loov = c / diag(G^-1) + + The best score (negative mean squared error or user-provided scoring) is + stored in the `best_score_` attribute, and the selected hyperparameter in + `alpha_`. + + References + ---------- + [1] http://cbcl.mit.edu/publications/ps/MIT-CSAIL-TR-2007-025.pdf + [2] https://www.mit.edu/~9.520/spring07/Classes/rlsslides.pdf + """ + + def __init__( + self, + alphas=(0.1, 1.0, 10.0), + *, + fit_intercept=True, + scoring=None, + copy_X=True, + gcv_mode=None, + store_cv_results=False, + is_clf=False, + alpha_per_target=False, + ): + self.alphas = alphas + self.fit_intercept = fit_intercept + self.scoring = scoring + self.copy_X = copy_X + self.gcv_mode = gcv_mode + self.store_cv_results = store_cv_results + self.is_clf = is_clf + self.alpha_per_target = alpha_per_target + + @staticmethod + def _decomp_diag(v_prime, Q): + # compute diagonal of the matrix: dot(Q, dot(diag(v_prime), Q^T)) + return (v_prime * Q**2).sum(axis=-1) + + @staticmethod + def _diag_dot(D, B): + # compute dot(diag(D), B) + if len(B.shape) > 1: + # handle case where B is > 1-d + D = D[(slice(None),) + (np.newaxis,) * (len(B.shape) - 1)] + return D * B + + def _compute_gram(self, X, sqrt_sw): + """Computes the Gram matrix XX^T with possible centering. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + The preprocessed design matrix. + + sqrt_sw : ndarray of shape (n_samples,) + square roots of sample weights + + Returns + ------- + gram : ndarray of shape (n_samples, n_samples) + The Gram matrix. + X_mean : ndarray of shape (n_feature,) + The weighted mean of ``X`` for each feature. + + Notes + ----- + When X is dense the centering has been done in preprocessing + so the mean is 0 and we just compute XX^T. + + When X is sparse it has not been centered in preprocessing, but it has + been scaled by sqrt(sample weights). + + When self.fit_intercept is False no centering is done. + + The centered X is never actually computed because centering would break + the sparsity of X. + """ + center = self.fit_intercept and sparse.issparse(X) + if not center: + # in this case centering has been done in preprocessing + # or we are not fitting an intercept. 
+ X_mean = np.zeros(X.shape[1], dtype=X.dtype) + return safe_sparse_dot(X, X.T, dense_output=True), X_mean + # X is sparse + n_samples = X.shape[0] + sample_weight_matrix = sparse.dia_matrix( + (sqrt_sw, 0), shape=(n_samples, n_samples) + ) + X_weighted = sample_weight_matrix.dot(X) + X_mean, _ = mean_variance_axis(X_weighted, axis=0) + X_mean *= n_samples / sqrt_sw.dot(sqrt_sw) + X_mX = sqrt_sw[:, None] * safe_sparse_dot(X_mean, X.T, dense_output=True) + X_mX_m = np.outer(sqrt_sw, sqrt_sw) * np.dot(X_mean, X_mean) + return ( + safe_sparse_dot(X, X.T, dense_output=True) + X_mX_m - X_mX - X_mX.T, + X_mean, + ) + + def _compute_covariance(self, X, sqrt_sw): + """Computes covariance matrix X^TX with possible centering. + + Parameters + ---------- + X : sparse matrix of shape (n_samples, n_features) + The preprocessed design matrix. + + sqrt_sw : ndarray of shape (n_samples,) + square roots of sample weights + + Returns + ------- + covariance : ndarray of shape (n_features, n_features) + The covariance matrix. + X_mean : ndarray of shape (n_feature,) + The weighted mean of ``X`` for each feature. + + Notes + ----- + Since X is sparse it has not been centered in preprocessing, but it has + been scaled by sqrt(sample weights). + + When self.fit_intercept is False no centering is done. + + The centered X is never actually computed because centering would break + the sparsity of X. + """ + if not self.fit_intercept: + # in this case centering has been done in preprocessing + # or we are not fitting an intercept. + X_mean = np.zeros(X.shape[1], dtype=X.dtype) + return safe_sparse_dot(X.T, X, dense_output=True), X_mean + # this function only gets called for sparse X + n_samples = X.shape[0] + sample_weight_matrix = sparse.dia_matrix( + (sqrt_sw, 0), shape=(n_samples, n_samples) + ) + X_weighted = sample_weight_matrix.dot(X) + X_mean, _ = mean_variance_axis(X_weighted, axis=0) + X_mean = X_mean * n_samples / sqrt_sw.dot(sqrt_sw) + weight_sum = sqrt_sw.dot(sqrt_sw) + return ( + safe_sparse_dot(X.T, X, dense_output=True) + - weight_sum * np.outer(X_mean, X_mean), + X_mean, + ) + + def _sparse_multidot_diag(self, X, A, X_mean, sqrt_sw): + """Compute the diagonal of (X - X_mean).dot(A).dot((X - X_mean).T) + without explicitly centering X nor computing X.dot(A) + when X is sparse. + + Parameters + ---------- + X : sparse matrix of shape (n_samples, n_features) + + A : ndarray of shape (n_features, n_features) + + X_mean : ndarray of shape (n_features,) + + sqrt_sw : ndarray of shape (n_features,) + square roots of sample weights + + Returns + ------- + diag : np.ndarray, shape (n_samples,) + The computed diagonal. 
+ """ + intercept_col = scale = sqrt_sw + batch_size = X.shape[1] + diag = np.empty(X.shape[0], dtype=X.dtype) + for start in range(0, X.shape[0], batch_size): + batch = slice(start, min(X.shape[0], start + batch_size), 1) + X_batch = np.empty( + (X[batch].shape[0], X.shape[1] + self.fit_intercept), dtype=X.dtype + ) + if self.fit_intercept: + X_batch[:, :-1] = X[batch].toarray() - X_mean * scale[batch][:, None] + X_batch[:, -1] = intercept_col[batch] + else: + X_batch = X[batch].toarray() + diag[batch] = (X_batch.dot(A) * X_batch).sum(axis=1) + return diag + + def _eigen_decompose_gram(self, X, y, sqrt_sw): + """Eigendecomposition of X.X^T, used when n_samples <= n_features.""" + # if X is dense it has already been centered in preprocessing + K, X_mean = self._compute_gram(X, sqrt_sw) + if self.fit_intercept: + # to emulate centering X with sample weights, + # ie removing the weighted average, we add a column + # containing the square roots of the sample weights. + # by centering, it is orthogonal to the other columns + K += np.outer(sqrt_sw, sqrt_sw) + eigvals, Q = linalg.eigh(K) + QT_y = np.dot(Q.T, y) + return X_mean, eigvals, Q, QT_y + + def _solve_eigen_gram(self, alpha, y, sqrt_sw, X_mean, eigvals, Q, QT_y): + """Compute dual coefficients and diagonal of G^-1. + + Used when we have a decomposition of X.X^T (n_samples <= n_features). + """ + w = 1.0 / (eigvals + alpha) + if self.fit_intercept: + # the vector containing the square roots of the sample weights (1 + # when no sample weights) is the eigenvector of XX^T which + # corresponds to the intercept; we cancel the regularization on + # this dimension. the corresponding eigenvalue is + # sum(sample_weight). + normalized_sw = sqrt_sw / np.linalg.norm(sqrt_sw) + intercept_dim = _find_smallest_angle(normalized_sw, Q) + w[intercept_dim] = 0 # cancel regularization for the intercept + + c = np.dot(Q, self._diag_dot(w, QT_y)) + G_inverse_diag = self._decomp_diag(w, Q) + # handle case where y is 2-d + if len(y.shape) != 1: + G_inverse_diag = G_inverse_diag[:, np.newaxis] + return G_inverse_diag, c + + def _eigen_decompose_covariance(self, X, y, sqrt_sw): + """Eigendecomposition of X^T.X, used when n_samples > n_features + and X is sparse. + """ + n_samples, n_features = X.shape + cov = np.empty((n_features + 1, n_features + 1), dtype=X.dtype) + cov[:-1, :-1], X_mean = self._compute_covariance(X, sqrt_sw) + if not self.fit_intercept: + cov = cov[:-1, :-1] + # to emulate centering X with sample weights, + # ie removing the weighted average, we add a column + # containing the square roots of the sample weights. + # by centering, it is orthogonal to the other columns + # when all samples have the same weight we add a column of 1 + else: + cov[-1] = 0 + cov[:, -1] = 0 + cov[-1, -1] = sqrt_sw.dot(sqrt_sw) + nullspace_dim = max(0, n_features - n_samples) + eigvals, V = linalg.eigh(cov) + # remove eigenvalues and vectors in the null space of X^T.X + eigvals = eigvals[nullspace_dim:] + V = V[:, nullspace_dim:] + return X_mean, eigvals, V, X + + def _solve_eigen_covariance_no_intercept( + self, alpha, y, sqrt_sw, X_mean, eigvals, V, X + ): + """Compute dual coefficients and diagonal of G^-1. + + Used when we have a decomposition of X^T.X + (n_samples > n_features and X is sparse), and not fitting an intercept. 
+ """ + w = 1 / (eigvals + alpha) + A = (V * w).dot(V.T) + AXy = A.dot(safe_sparse_dot(X.T, y, dense_output=True)) + y_hat = safe_sparse_dot(X, AXy, dense_output=True) + hat_diag = self._sparse_multidot_diag(X, A, X_mean, sqrt_sw) + if len(y.shape) != 1: + # handle case where y is 2-d + hat_diag = hat_diag[:, np.newaxis] + return (1 - hat_diag) / alpha, (y - y_hat) / alpha + + def _solve_eigen_covariance_intercept( + self, alpha, y, sqrt_sw, X_mean, eigvals, V, X + ): + """Compute dual coefficients and diagonal of G^-1. + + Used when we have a decomposition of X^T.X + (n_samples > n_features and X is sparse), + and we are fitting an intercept. + """ + # the vector [0, 0, ..., 0, 1] + # is the eigenvector of X^TX which + # corresponds to the intercept; we cancel the regularization on + # this dimension. the corresponding eigenvalue is + # sum(sample_weight), e.g. n when uniform sample weights. + intercept_sv = np.zeros(V.shape[0]) + intercept_sv[-1] = 1 + intercept_dim = _find_smallest_angle(intercept_sv, V) + w = 1 / (eigvals + alpha) + w[intercept_dim] = 1 / eigvals[intercept_dim] + A = (V * w).dot(V.T) + # add a column to X containing the square roots of sample weights + X_op = _X_CenterStackOp(X, X_mean, sqrt_sw) + AXy = A.dot(X_op.T.dot(y)) + y_hat = X_op.dot(AXy) + hat_diag = self._sparse_multidot_diag(X, A, X_mean, sqrt_sw) + # return (1 - hat_diag), (y - y_hat) + if len(y.shape) != 1: + # handle case where y is 2-d + hat_diag = hat_diag[:, np.newaxis] + return (1 - hat_diag) / alpha, (y - y_hat) / alpha + + def _solve_eigen_covariance(self, alpha, y, sqrt_sw, X_mean, eigvals, V, X): + """Compute dual coefficients and diagonal of G^-1. + + Used when we have a decomposition of X^T.X + (n_samples > n_features and X is sparse). + """ + if self.fit_intercept: + return self._solve_eigen_covariance_intercept( + alpha, y, sqrt_sw, X_mean, eigvals, V, X + ) + return self._solve_eigen_covariance_no_intercept( + alpha, y, sqrt_sw, X_mean, eigvals, V, X + ) + + def _svd_decompose_design_matrix(self, X, y, sqrt_sw): + # X already centered + X_mean = np.zeros(X.shape[1], dtype=X.dtype) + if self.fit_intercept: + # to emulate fit_intercept=True situation, add a column + # containing the square roots of the sample weights + # by centering, the other columns are orthogonal to that one + intercept_column = sqrt_sw[:, None] + X = np.hstack((X, intercept_column)) + U, singvals, _ = linalg.svd(X, full_matrices=0) + singvals_sq = singvals**2 + UT_y = np.dot(U.T, y) + return X_mean, singvals_sq, U, UT_y + + def _solve_svd_design_matrix(self, alpha, y, sqrt_sw, X_mean, singvals_sq, U, UT_y): + """Compute dual coefficients and diagonal of G^-1. + + Used when we have an SVD decomposition of X + (n_samples > n_features and X is dense). + """ + w = ((singvals_sq + alpha) ** -1) - (alpha**-1) + if self.fit_intercept: + # detect intercept column + normalized_sw = sqrt_sw / np.linalg.norm(sqrt_sw) + intercept_dim = _find_smallest_angle(normalized_sw, U) + # cancel the regularization for the intercept + w[intercept_dim] = -(alpha**-1) + c = np.dot(U, self._diag_dot(w, UT_y)) + (alpha**-1) * y + G_inverse_diag = self._decomp_diag(w, U) + (alpha**-1) + if len(y.shape) != 1: + # handle case where y is 2-d + G_inverse_diag = G_inverse_diag[:, np.newaxis] + return G_inverse_diag, c + + def fit(self, X, y, sample_weight=None, score_params=None): + """Fit Ridge regression model with gcv. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + Training data. 
Will be cast to float64 if necessary. + + y : ndarray of shape (n_samples,) or (n_samples, n_targets) + Target values. Will be cast to float64 if necessary. + + sample_weight : float or ndarray of shape (n_samples,), default=None + Individual weights for each sample. If given a float, every sample + will have the same weight. Note that the scale of `sample_weight` + has an impact on the loss; i.e. multiplying all weights by `k` + is equivalent to setting `alpha / k`. + + score_params : dict, default=None + Parameters to be passed to the underlying scorer. + + .. versionadded:: 1.5 + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + """ + X, y = validate_data( + self, + X, + y, + accept_sparse=["csr", "csc", "coo"], + dtype=[np.float64], + multi_output=True, + y_numeric=True, + ) + + # alpha_per_target cannot be used in classifier mode. All subclasses + # of _RidgeGCV that are classifiers keep alpha_per_target at its + # default value: False, so the condition below should never happen. + assert not (self.is_clf and self.alpha_per_target) + + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + + self.alphas = np.asarray(self.alphas) + + unscaled_y = y + X, y, X_offset, y_offset, X_scale = _preprocess_data( + X, + y, + fit_intercept=self.fit_intercept, + copy=self.copy_X, + sample_weight=sample_weight, + ) + + gcv_mode = _check_gcv_mode(X, self.gcv_mode) + + if gcv_mode == "eigen": + decompose = self._eigen_decompose_gram + solve = self._solve_eigen_gram + elif gcv_mode == "svd": + if sparse.issparse(X): + decompose = self._eigen_decompose_covariance + solve = self._solve_eigen_covariance + else: + decompose = self._svd_decompose_design_matrix + solve = self._solve_svd_design_matrix + + n_samples = X.shape[0] + + if sample_weight is not None: + X, y, sqrt_sw = _rescale_data(X, y, sample_weight) + else: + sqrt_sw = np.ones(n_samples, dtype=X.dtype) + + X_mean, *decomposition = decompose(X, y, sqrt_sw) + + n_y = 1 if len(y.shape) == 1 else y.shape[1] + n_alphas = 1 if np.ndim(self.alphas) == 0 else len(self.alphas) + + if self.store_cv_results: + self.cv_results_ = np.empty((n_samples * n_y, n_alphas), dtype=X.dtype) + + best_coef, best_score, best_alpha = None, None, None + + for i, alpha in enumerate(np.atleast_1d(self.alphas)): + G_inverse_diag, c = solve(float(alpha), y, sqrt_sw, X_mean, *decomposition) + if self.scoring is None: + squared_errors = (c / G_inverse_diag) ** 2 + alpha_score = self._score_without_scorer(squared_errors=squared_errors) + if self.store_cv_results: + self.cv_results_[:, i] = squared_errors.ravel() + else: + predictions = y - (c / G_inverse_diag) + # Rescale predictions back to original scale + if sample_weight is not None: # avoid the unnecessary division by ones + if predictions.ndim > 1: + predictions /= sqrt_sw[:, None] + else: + predictions /= sqrt_sw + predictions += y_offset + + if self.store_cv_results: + self.cv_results_[:, i] = predictions.ravel() + + score_params = score_params or {} + alpha_score = self._score( + predictions=predictions, + y=unscaled_y, + n_y=n_y, + scorer=self.scoring, + score_params=score_params, + ) + + # Keep track of the best model + if best_score is None: + # initialize + if self.alpha_per_target and n_y > 1: + best_coef = c + best_score = np.atleast_1d(alpha_score) + best_alpha = np.full(n_y, alpha) + else: + best_coef = c + best_score = alpha_score + best_alpha = alpha + else: + # update + if self.alpha_per_target and n_y > 1: + 
to_update = alpha_score > best_score + best_coef[:, to_update] = c[:, to_update] + best_score[to_update] = alpha_score[to_update] + best_alpha[to_update] = alpha + elif alpha_score > best_score: + best_coef, best_score, best_alpha = c, alpha_score, alpha + + self.alpha_ = best_alpha + self.best_score_ = best_score + self.dual_coef_ = best_coef + self.coef_ = safe_sparse_dot(self.dual_coef_.T, X) + if y.ndim == 1 or y.shape[1] == 1: + self.coef_ = self.coef_.ravel() + + if sparse.issparse(X): + X_offset = X_mean * X_scale + else: + X_offset += X_mean * X_scale + self._set_intercept(X_offset, y_offset, X_scale) + + if self.store_cv_results: + if len(y.shape) == 1: + cv_results_shape = n_samples, n_alphas + else: + cv_results_shape = n_samples, n_y, n_alphas + self.cv_results_ = self.cv_results_.reshape(cv_results_shape) + + return self + + def _score_without_scorer(self, squared_errors): + """Performs scoring using squared errors when the scorer is None.""" + if self.alpha_per_target: + _score = -squared_errors.mean(axis=0) + else: + _score = -squared_errors.mean() + + return _score + + def _score(self, *, predictions, y, n_y, scorer, score_params): + """Performs scoring with the specified scorer using the + predictions and the true y values. + """ + if self.is_clf: + identity_estimator = _IdentityClassifier(classes=np.arange(n_y)) + _score = scorer( + identity_estimator, + predictions, + y.argmax(axis=1), + **score_params, + ) + else: + identity_estimator = _IdentityRegressor() + if self.alpha_per_target: + _score = np.array( + [ + scorer( + identity_estimator, + predictions[:, j], + y[:, j], + **score_params, + ) + for j in range(n_y) + ] + ) + else: + _score = scorer(identity_estimator, predictions, y, **score_params) + + return _score + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + # Required since this is neither a RegressorMixin nor a ClassifierMixin + tags.target_tags.required = True + return tags + + +class _BaseRidgeCV(LinearModel): + _parameter_constraints: dict = { + "alphas": ["array-like", Interval(Real, 0, None, closed="neither")], + "fit_intercept": ["boolean"], + "scoring": [StrOptions(set(get_scorer_names())), callable, None], + "cv": ["cv_object"], + "gcv_mode": [StrOptions({"auto", "svd", "eigen"}), None], + "store_cv_results": ["boolean"], + "alpha_per_target": ["boolean"], + } + + def __init__( + self, + alphas=(0.1, 1.0, 10.0), + *, + fit_intercept=True, + scoring=None, + cv=None, + gcv_mode=None, + store_cv_results=False, + alpha_per_target=False, + ): + self.alphas = alphas + self.fit_intercept = fit_intercept + self.scoring = scoring + self.cv = cv + self.gcv_mode = gcv_mode + self.store_cv_results = store_cv_results + self.alpha_per_target = alpha_per_target + + def fit(self, X, y, sample_weight=None, **params): + """Fit Ridge regression model with cv. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Training data. If using GCV, will be cast to float64 + if necessary. + + y : ndarray of shape (n_samples,) or (n_samples, n_targets) + Target values. Will be cast to X's dtype if necessary. + + sample_weight : float or ndarray of shape (n_samples,), default=None + Individual weights for each sample. If given a float, every sample + will have the same weight. + + **params : dict, default=None + Extra parameters for the underlying scorer. + + .. versionadded:: 1.5 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. 
+ See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + Fitted estimator. + + Notes + ----- + When sample_weight is provided, the selected hyperparameter may depend + on whether we use leave-one-out cross-validation (cv=None) + or another form of cross-validation, because only leave-one-out + cross-validation takes the sample weights into account when computing + the validation score. + """ + _raise_for_params(params, self, "fit") + cv = self.cv + scorer = self._get_scorer() + + # `_RidgeGCV` does not work for alpha = 0 + if cv is None: + check_scalar_alpha = partial( + check_scalar, + target_type=numbers.Real, + min_val=0.0, + include_boundaries="neither", + ) + else: + check_scalar_alpha = partial( + check_scalar, + target_type=numbers.Real, + min_val=0.0, + include_boundaries="left", + ) + + if isinstance(self.alphas, (np.ndarray, list, tuple)): + n_alphas = 1 if np.ndim(self.alphas) == 0 else len(self.alphas) + if n_alphas != 1: + for index, alpha in enumerate(self.alphas): + alpha = check_scalar_alpha(alpha, f"alphas[{index}]") + else: + self.alphas[0] = check_scalar_alpha(self.alphas[0], "alphas") + alphas = np.asarray(self.alphas) + + if sample_weight is not None: + params["sample_weight"] = sample_weight + + if cv is None: + if _routing_enabled(): + routed_params = process_routing( + self, + "fit", + **params, + ) + else: + routed_params = Bunch(scorer=Bunch(score={})) + if sample_weight is not None: + routed_params.scorer.score["sample_weight"] = sample_weight + + # reset `scorer` variable to original user-intend if no scoring is passed + if self.scoring is None: + scorer = None + + estimator = _RidgeGCV( + alphas, + fit_intercept=self.fit_intercept, + scoring=scorer, + gcv_mode=self.gcv_mode, + store_cv_results=self.store_cv_results, + is_clf=is_classifier(self), + alpha_per_target=self.alpha_per_target, + ) + estimator.fit( + X, + y, + sample_weight=sample_weight, + score_params=routed_params.scorer.score, + ) + self.alpha_ = estimator.alpha_ + self.best_score_ = estimator.best_score_ + if self.store_cv_results: + self.cv_results_ = estimator.cv_results_ + else: + if self.store_cv_results: + raise ValueError("cv!=None and store_cv_results=True are incompatible") + if self.alpha_per_target: + raise ValueError("cv!=None and alpha_per_target=True are incompatible") + + parameters = {"alpha": alphas} + solver = "sparse_cg" if sparse.issparse(X) else "auto" + model = RidgeClassifier if is_classifier(self) else Ridge + estimator = model( + fit_intercept=self.fit_intercept, + solver=solver, + ) + if _routing_enabled(): + estimator.set_fit_request(sample_weight=True) + + grid_search = GridSearchCV( + estimator, + parameters, + cv=cv, + scoring=scorer, + ) + + grid_search.fit(X, y, **params) + estimator = grid_search.best_estimator_ + self.alpha_ = grid_search.best_estimator_.alpha + self.best_score_ = grid_search.best_score_ + + self.coef_ = estimator.coef_ + self.intercept_ = estimator.intercept_ + self.n_features_in_ = estimator.n_features_in_ + if hasattr(estimator, "feature_names_in_"): + self.feature_names_in_ = estimator.feature_names_in_ + + return self + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.5 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. 
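When `cv` is not None, the branch above reduces alpha selection to a grid search over `Ridge` (or `RidgeClassifier`). Ignoring the extra bookkeeping (sparse-aware solver choice, copying back `coef_` and `intercept_`), the selection is roughly equivalent to this sketch::

    from sklearn.datasets import load_diabetes
    from sklearn.linear_model import Ridge, RidgeCV
    from sklearn.model_selection import GridSearchCV

    X, y = load_diabetes(return_X_y=True)
    alphas = [0.01, 0.1, 1.0, 10.0]

    reg = RidgeCV(alphas=alphas, cv=5).fit(X, y)
    grid = GridSearchCV(Ridge(), {"alpha": alphas}, cv=5).fit(X, y)

    print(reg.alpha_, grid.best_params_["alpha"])   # expected to agree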
+ """ + router = ( + MetadataRouter(owner=self.__class__.__name__) + .add_self_request(self) + .add( + scorer=self.scoring, + method_mapping=MethodMapping().add(caller="fit", callee="score"), + ) + .add( + splitter=self.cv, + method_mapping=MethodMapping().add(caller="fit", callee="split"), + ) + ) + return router + + def _get_scorer(self): + scorer = check_scoring(estimator=self, scoring=self.scoring, allow_none=True) + if _routing_enabled() and self.scoring is None: + # This estimator passes an array of 1s as sample_weight even if + # sample_weight is not provided by the user. Therefore we need to + # always request it. But we don't set it if it's passed explicitly + # by the user. + scorer.set_score_request(sample_weight=True) + return scorer + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + + +class RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV): + """Ridge regression with built-in cross-validation. + + See glossary entry for :term:`cross-validation estimator`. + + By default, it performs efficient Leave-One-Out Cross-Validation. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + alphas : array-like of shape (n_alphas,), default=(0.1, 1.0, 10.0) + Array of alpha values to try. + Regularization strength; must be a positive float. Regularization + improves the conditioning of the problem and reduces the variance of + the estimates. Larger values specify stronger regularization. + Alpha corresponds to ``1 / (2C)`` in other linear models such as + :class:`~sklearn.linear_model.LogisticRegression` or + :class:`~sklearn.svm.LinearSVC`. + If using Leave-One-Out cross-validation, alphas must be strictly positive. + + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to false, no intercept will be used in calculations + (i.e. data is expected to be centered). + + scoring : str, callable, default=None + The scoring method to use for cross-validation. Options: + + - str: see :ref:`scoring_string_names` for options. + - callable: a scorer callable object (e.g., function) with signature + ``scorer(estimator, X, y)``. See :ref:`scoring_callable` for details. + - `None`: negative :ref:`mean squared error ` if cv is + None (i.e. when using leave-one-out cross-validation), or + :ref:`coefficient of determination ` (:math:`R^2`) otherwise. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the efficient Leave-One-Out cross-validation + - integer, to specify the number of folds. + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if ``y`` is binary or multiclass, + :class:`~sklearn.model_selection.StratifiedKFold` is used, else, + :class:`~sklearn.model_selection.KFold` is used. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + gcv_mode : {'auto', 'svd', 'eigen'}, default='auto' + Flag indicating which strategy to use when performing + Leave-One-Out Cross-Validation. Options are:: + + 'auto' : use 'svd' if n_samples > n_features, otherwise use 'eigen' + 'svd' : force use of singular value decomposition of X when X is + dense, eigenvalue decomposition of X^T.X when X is sparse. 
+ 'eigen' : force computation via eigendecomposition of X.X^T + + The 'auto' mode is the default and is intended to pick the cheaper + option of the two depending on the shape of the training data. + + store_cv_results : bool, default=False + Flag indicating if the cross-validation values corresponding to + each alpha should be stored in the ``cv_results_`` attribute (see + below). This flag is only compatible with ``cv=None`` (i.e. using + Leave-One-Out Cross-Validation). + + .. versionchanged:: 1.5 + Parameter name changed from `store_cv_values` to `store_cv_results`. + + alpha_per_target : bool, default=False + Flag indicating whether to optimize the alpha value (picked from the + `alphas` parameter list) for each target separately (for multi-output + settings: multiple prediction targets). When set to `True`, after + fitting, the `alpha_` attribute will contain a value for each target. + When set to `False`, a single alpha is used for all targets. + + .. versionadded:: 0.24 + + Attributes + ---------- + cv_results_ : ndarray of shape (n_samples, n_alphas) or \ + shape (n_samples, n_targets, n_alphas), optional + Cross-validation values for each alpha (only available if + ``store_cv_results=True`` and ``cv=None``). After ``fit()`` has been + called, this attribute will contain the mean squared errors if + `scoring is None` otherwise it will contain standardized per point + prediction values. + + .. versionchanged:: 1.5 + `cv_values_` changed to `cv_results_`. + + coef_ : ndarray of shape (n_features) or (n_targets, n_features) + Weight vector(s). + + intercept_ : float or ndarray of shape (n_targets,) + Independent term in decision function. Set to 0.0 if + ``fit_intercept = False``. + + alpha_ : float or ndarray of shape (n_targets,) + Estimated regularization parameter, or, if ``alpha_per_target=True``, + the estimated regularization parameter for each target. + + best_score_ : float or ndarray of shape (n_targets,) + Score of base estimator with best alpha, or, if + ``alpha_per_target=True``, a score for each target. + + .. versionadded:: 0.23 + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + Ridge : Ridge regression. + RidgeClassifier : Classifier based on ridge regression on {-1, 1} labels. + RidgeClassifierCV : Ridge classifier with built-in cross validation. + + Examples + -------- + >>> from sklearn.datasets import load_diabetes + >>> from sklearn.linear_model import RidgeCV + >>> X, y = load_diabetes(return_X_y=True) + >>> clf = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1]).fit(X, y) + >>> clf.score(X, y) + 0.5166... + """ + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, sample_weight=None, **params): + """Fit Ridge regression model with cv. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Training data. If using GCV, will be cast to float64 + if necessary. + + y : ndarray of shape (n_samples,) or (n_samples, n_targets) + Target values. Will be cast to X's dtype if necessary. + + sample_weight : float or ndarray of shape (n_samples,), default=None + Individual weights for each sample. If given a float, every sample + will have the same weight. + + **params : dict, default=None + Parameters to be passed to the underlying scorer. + + .. 
versionadded:: 1.5 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + Fitted estimator. + + Notes + ----- + When sample_weight is provided, the selected hyperparameter may depend + on whether we use leave-one-out cross-validation (cv=None) + or another form of cross-validation, because only leave-one-out + cross-validation takes the sample weights into account when computing + the validation score. + """ + super().fit(X, y, sample_weight=sample_weight, **params) + return self + + +class RidgeClassifierCV(_RidgeClassifierMixin, _BaseRidgeCV): + """Ridge classifier with built-in cross-validation. + + See glossary entry for :term:`cross-validation estimator`. + + By default, it performs Leave-One-Out Cross-Validation. Currently, + only the n_features > n_samples case is handled efficiently. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + alphas : array-like of shape (n_alphas,), default=(0.1, 1.0, 10.0) + Array of alpha values to try. + Regularization strength; must be a positive float. Regularization + improves the conditioning of the problem and reduces the variance of + the estimates. Larger values specify stronger regularization. + Alpha corresponds to ``1 / (2C)`` in other linear models such as + :class:`~sklearn.linear_model.LogisticRegression` or + :class:`~sklearn.svm.LinearSVC`. + If using Leave-One-Out cross-validation, alphas must be strictly positive. + + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to false, no intercept will be used in calculations + (i.e. data is expected to be centered). + + scoring : str, callable, default=None + The scoring method to use for cross-validation. Options: + + - str: see :ref:`scoring_string_names` for options. + - callable: a scorer callable object (e.g., function) with signature + ``scorer(estimator, X, y)``. See :ref:`scoring_callable` for details. + - `None`: negative :ref:`mean squared error ` if cv is + None (i.e. when using leave-one-out cross-validation), or + :ref:`accuracy ` otherwise. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the efficient Leave-One-Out cross-validation + - integer, to specify the number of folds. + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + class_weight : dict or 'balanced', default=None + Weights associated with classes in the form ``{class_label: weight}``. + If not given, all classes are supposed to have weight one. + + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))``. + + store_cv_results : bool, default=False + Flag indicating if the cross-validation results corresponding to + each alpha should be stored in the ``cv_results_`` attribute (see + below). This flag is only compatible with ``cv=None`` (i.e. using + Leave-One-Out Cross-Validation). + + .. versionchanged:: 1.5 + Parameter name changed from `store_cv_values` to `store_cv_results`. 
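As with `RidgeCV`, the stored values let the alpha selection be reproduced by hand. A small sketch using the regression variant for brevity: when `scoring is None` and `cv=None`, `cv_results_` holds the squared leave-one-out errors and `alpha_` minimizes their mean::

    import numpy as np
    from sklearn.datasets import load_diabetes
    from sklearn.linear_model import RidgeCV

    X, y = load_diabetes(return_X_y=True)
    alphas = np.array([0.01, 0.1, 1.0, 10.0])
    reg = RidgeCV(alphas=alphas, store_cv_results=True).fit(X, y)

    print(reg.cv_results_.shape)                          # (n_samples, n_alphas)
    print(alphas[reg.cv_results_.mean(axis=0).argmin()])  # equals reg.alpha_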
+ + Attributes + ---------- + cv_results_ : ndarray of shape (n_samples, n_targets, n_alphas), optional + Cross-validation results for each alpha (only if ``store_cv_results=True`` and + ``cv=None``). After ``fit()`` has been called, this attribute will + contain the mean squared errors if `scoring is None` otherwise it + will contain standardized per point prediction values. + + .. versionchanged:: 1.5 + `cv_values_` changed to `cv_results_`. + + coef_ : ndarray of shape (1, n_features) or (n_targets, n_features) + Coefficient of the features in the decision function. + + ``coef_`` is of shape (1, n_features) when the given problem is binary. + + intercept_ : float or ndarray of shape (n_targets,) + Independent term in decision function. Set to 0.0 if + ``fit_intercept = False``. + + alpha_ : float + Estimated regularization parameter. + + best_score_ : float + Score of base estimator with best alpha. + + .. versionadded:: 0.23 + + classes_ : ndarray of shape (n_classes,) + The classes labels. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + Ridge : Ridge regression. + RidgeClassifier : Ridge classifier. + RidgeCV : Ridge regression with built-in cross validation. + + Notes + ----- + For multi-class classification, n_class classifiers are trained in + a one-versus-all approach. Concretely, this is implemented by taking + advantage of the multi-variate response support in Ridge. + + Examples + -------- + >>> from sklearn.datasets import load_breast_cancer + >>> from sklearn.linear_model import RidgeClassifierCV + >>> X, y = load_breast_cancer(return_X_y=True) + >>> clf = RidgeClassifierCV(alphas=[1e-3, 1e-2, 1e-1, 1]).fit(X, y) + >>> clf.score(X, y) + 0.9630... + """ + + _parameter_constraints: dict = { + **_BaseRidgeCV._parameter_constraints, + "class_weight": [dict, StrOptions({"balanced"}), None], + } + for param in ("gcv_mode", "alpha_per_target"): + _parameter_constraints.pop(param) + + def __init__( + self, + alphas=(0.1, 1.0, 10.0), + *, + fit_intercept=True, + scoring=None, + cv=None, + class_weight=None, + store_cv_results=False, + ): + super().__init__( + alphas=alphas, + fit_intercept=fit_intercept, + scoring=scoring, + cv=cv, + store_cv_results=store_cv_results, + ) + self.class_weight = class_weight + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, sample_weight=None, **params): + """Fit Ridge classifier with cv. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples + and `n_features` is the number of features. When using GCV, + will be cast to float64 if necessary. + + y : ndarray of shape (n_samples,) + Target values. Will be cast to X's dtype if necessary. + + sample_weight : float or ndarray of shape (n_samples,), default=None + Individual weights for each sample. If given a float, every sample + will have the same weight. + + **params : dict, default=None + Parameters to be passed to the underlying scorer. + + .. versionadded:: 1.5 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + Fitted estimator. 
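The one-versus-all reduction described in the Notes above can be reproduced by hand: binarize the labels to {-1, 1}, fit a multi-output `Ridge`, and take the argmax of the per-class decision values. A small sketch (illustrative, not the class's internal code path)::

    import numpy as np
    from sklearn.datasets import load_iris
    from sklearn.linear_model import Ridge, RidgeClassifier
    from sklearn.preprocessing import LabelBinarizer

    X, y = load_iris(return_X_y=True)

    lb = LabelBinarizer(neg_label=-1, pos_label=1)
    Y = lb.fit_transform(y)                          # shape (n_samples, n_classes)
    multi_ridge = Ridge(alpha=1.0).fit(X, Y)
    clf = RidgeClassifier(alpha=1.0).fit(X, y)

    manual = lb.classes_[np.argmax(multi_ridge.predict(X), axis=1)]
    print(np.array_equal(manual, clf.predict(X)))    # expected True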
+ """ + # `RidgeClassifier` does not accept "sag" or "saga" solver and thus support + # csr, csc, and coo sparse matrices. By using solver="eigen" we force to accept + # all sparse format. + X, y, sample_weight, Y = self._prepare_data(X, y, sample_weight, solver="eigen") + + # If cv is None, gcv mode will be used and we used the binarized Y + # since y will not be binarized in _RidgeGCV estimator. + # If cv is not None, a GridSearchCV with some RidgeClassifier + # estimators are used where y will be binarized. Thus, we pass y + # instead of the binarized Y. + target = Y if self.cv is None else y + super().fit(X, target, sample_weight=sample_weight, **params) + return self diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_sag.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_sag.py new file mode 100644 index 0000000000000000000000000000000000000000..12e5d049b0b1f88b17405f5633d6d7371a3cca83 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_sag.py @@ -0,0 +1,370 @@ +"""Solvers for Ridge and LogisticRegression using SAG algorithm""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings + +import numpy as np + +from ..exceptions import ConvergenceWarning +from ..utils import check_array +from ..utils.extmath import row_norms +from ..utils.validation import _check_sample_weight +from ._base import make_dataset +from ._sag_fast import sag32, sag64 + + +def get_auto_step_size( + max_squared_sum, alpha_scaled, loss, fit_intercept, n_samples=None, is_saga=False +): + """Compute automatic step size for SAG solver. + + The step size is set to 1 / (alpha_scaled + L + fit_intercept) where L is + the max sum of squares for over all samples. + + Parameters + ---------- + max_squared_sum : float + Maximum squared sum of X over samples. + + alpha_scaled : float + Constant that multiplies the regularization term, scaled by + 1. / n_samples, the number of samples. + + loss : {'log', 'squared', 'multinomial'} + The loss function used in SAG solver. + + fit_intercept : bool + Specifies if a constant (a.k.a. bias or intercept) will be + added to the decision function. + + n_samples : int, default=None + Number of rows in X. Useful if is_saga=True. + + is_saga : bool, default=False + Whether to return step size for the SAGA algorithm or the SAG + algorithm. + + Returns + ------- + step_size : float + Step size used in SAG solver. + + References + ---------- + Schmidt, M., Roux, N. L., & Bach, F. (2013). + Minimizing finite sums with the stochastic average gradient + https://hal.inria.fr/hal-00860051/document + + :arxiv:`Defazio, A., Bach F. & Lacoste-Julien S. (2014). + "SAGA: A Fast Incremental Gradient Method With Support + for Non-Strongly Convex Composite Objectives" <1407.0202>` + """ + if loss in ("log", "multinomial"): + L = 0.25 * (max_squared_sum + int(fit_intercept)) + alpha_scaled + elif loss == "squared": + # inverse Lipschitz constant for squared loss + L = max_squared_sum + int(fit_intercept) + alpha_scaled + else: + raise ValueError( + "Unknown loss function for SAG solver, got %s instead of 'log' or 'squared'" + % loss + ) + if is_saga: + # SAGA theoretical step size is 1/3L or 1 / (2 * (L + mu n)) + # See Defazio et al. 
2014 + mun = min(2 * n_samples * alpha_scaled, L) + step = 1.0 / (2 * L + mun) + else: + # SAG theoretical step size is 1/16L but it is recommended to use 1 / L + # see http://www.birs.ca//workshops//2014/14w5003/files/schmidt.pdf, + # slide 65 + step = 1.0 / L + return step + + +def sag_solver( + X, + y, + sample_weight=None, + loss="log", + alpha=1.0, + beta=0.0, + max_iter=1000, + tol=0.001, + verbose=0, + random_state=None, + check_input=True, + max_squared_sum=None, + warm_start_mem=None, + is_saga=False, +): + """SAG solver for Ridge and LogisticRegression. + + SAG stands for Stochastic Average Gradient: the gradient of the loss is + estimated each sample at a time and the model is updated along the way with + a constant learning rate. + + IMPORTANT NOTE: 'sag' solver converges faster on columns that are on the + same scale. You can normalize the data by using + sklearn.preprocessing.StandardScaler on your data before passing it to the + fit method. + + This implementation works with data represented as dense numpy arrays or + sparse scipy arrays of floating point values for the features. It will + fit the data according to squared loss or log loss. + + The regularizer is a penalty added to the loss function that shrinks model + parameters towards the zero vector using the squared euclidean norm L2. + + .. versionadded:: 0.17 + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : ndarray of shape (n_samples,) + Target values. With loss='multinomial', y must be label encoded + (see preprocessing.LabelEncoder). For loss='log' it must be in [0, 1]. + + sample_weight : array-like of shape (n_samples,), default=None + Weights applied to individual samples (1. for unweighted). + + loss : {'log', 'squared', 'multinomial'}, default='log' + Loss function that will be optimized: + -'log' is the binary logistic loss, as used in LogisticRegression. + -'squared' is the squared loss, as used in Ridge. + -'multinomial' is the multinomial logistic loss, as used in + LogisticRegression. + + .. versionadded:: 0.18 + *loss='multinomial'* + + alpha : float, default=1. + L2 regularization term in the objective function + ``(0.5 * alpha * || W ||_F^2)``. + + beta : float, default=0. + L1 regularization term in the objective function + ``(beta * || W ||_1)``. Only applied if ``is_saga`` is set to True. + + max_iter : int, default=1000 + The max number of passes over the training data if the stopping + criteria is not reached. + + tol : float, default=0.001 + The stopping criteria for the weights. The iterations will stop when + max(change in weights) / max(weights) < tol. + + verbose : int, default=0 + The verbosity level. + + random_state : int, RandomState instance or None, default=None + Used when shuffling the data. Pass an int for reproducible output + across multiple function calls. + See :term:`Glossary `. + + check_input : bool, default=True + If False, the input arrays X and y will not be checked. + + max_squared_sum : float, default=None + Maximum squared sum of X over samples. If None, it will be computed, + going through all the samples. The value should be precomputed + to speed up cross validation. + + warm_start_mem : dict, default=None + The initialization parameters used for warm starting. Warm starting is + currently used in LogisticRegression but not in Ridge. + It contains: + - 'coef': the weight vector, with the intercept in last line + if the intercept is fitted. 
+ - 'gradient_memory': the scalar gradient for all seen samples. + - 'sum_gradient': the sum of gradient over all seen samples, + for each feature. + - 'intercept_sum_gradient': the sum of gradient over all seen + samples, for the intercept. + - 'seen': array of boolean describing the seen samples. + - 'num_seen': the number of seen samples. + + is_saga : bool, default=False + Whether to use the SAGA algorithm or the SAG algorithm. SAGA behaves + better in the first epochs, and allow for l1 regularisation. + + Returns + ------- + coef_ : ndarray of shape (n_features,) + Weight vector. + + n_iter_ : int + The number of full pass on all samples. + + warm_start_mem : dict + Contains a 'coef' key with the fitted result, and possibly the + fitted intercept at the end of the array. Contains also other keys + used for warm starting. + + Examples + -------- + >>> import numpy as np + >>> from sklearn import linear_model + >>> n_samples, n_features = 10, 5 + >>> rng = np.random.RandomState(0) + >>> X = rng.randn(n_samples, n_features) + >>> y = rng.randn(n_samples) + >>> clf = linear_model.Ridge(solver='sag') + >>> clf.fit(X, y) + Ridge(solver='sag') + + >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]]) + >>> y = np.array([1, 1, 2, 2]) + >>> clf = linear_model.LogisticRegression(solver='sag') + >>> clf.fit(X, y) + LogisticRegression(solver='sag') + + References + ---------- + Schmidt, M., Roux, N. L., & Bach, F. (2013). + Minimizing finite sums with the stochastic average gradient + https://hal.inria.fr/hal-00860051/document + + :arxiv:`Defazio, A., Bach F. & Lacoste-Julien S. (2014). + "SAGA: A Fast Incremental Gradient Method With Support + for Non-Strongly Convex Composite Objectives" <1407.0202>` + + See Also + -------- + Ridge, SGDRegressor, ElasticNet, Lasso, SVR, + LogisticRegression, SGDClassifier, LinearSVC, Perceptron + """ + if warm_start_mem is None: + warm_start_mem = {} + # Ridge default max_iter is None + if max_iter is None: + max_iter = 1000 + + if check_input: + _dtype = [np.float64, np.float32] + X = check_array(X, dtype=_dtype, accept_sparse="csr", order="C") + y = check_array(y, dtype=_dtype, ensure_2d=False, order="C") + + n_samples, n_features = X.shape[0], X.shape[1] + # As in SGD, the alpha is scaled by n_samples. + alpha_scaled = float(alpha) / n_samples + beta_scaled = float(beta) / n_samples + + # if loss == 'multinomial', y should be label encoded. + n_classes = int(y.max()) + 1 if loss == "multinomial" else 1 + + # initialization + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + + if "coef" in warm_start_mem.keys(): + coef_init = warm_start_mem["coef"] + else: + # assume fit_intercept is False + coef_init = np.zeros((n_features, n_classes), dtype=X.dtype, order="C") + + # coef_init contains possibly the intercept_init at the end. + # Note that Ridge centers the data before fitting, so fit_intercept=False. 
+ fit_intercept = coef_init.shape[0] == (n_features + 1) + if fit_intercept: + intercept_init = coef_init[-1, :] + coef_init = coef_init[:-1, :] + else: + intercept_init = np.zeros(n_classes, dtype=X.dtype) + + if "intercept_sum_gradient" in warm_start_mem.keys(): + intercept_sum_gradient = warm_start_mem["intercept_sum_gradient"] + else: + intercept_sum_gradient = np.zeros(n_classes, dtype=X.dtype) + + if "gradient_memory" in warm_start_mem.keys(): + gradient_memory_init = warm_start_mem["gradient_memory"] + else: + gradient_memory_init = np.zeros( + (n_samples, n_classes), dtype=X.dtype, order="C" + ) + if "sum_gradient" in warm_start_mem.keys(): + sum_gradient_init = warm_start_mem["sum_gradient"] + else: + sum_gradient_init = np.zeros((n_features, n_classes), dtype=X.dtype, order="C") + + if "seen" in warm_start_mem.keys(): + seen_init = warm_start_mem["seen"] + else: + seen_init = np.zeros(n_samples, dtype=np.int32, order="C") + + if "num_seen" in warm_start_mem.keys(): + num_seen_init = warm_start_mem["num_seen"] + else: + num_seen_init = 0 + + dataset, intercept_decay = make_dataset(X, y, sample_weight, random_state) + + if max_squared_sum is None: + max_squared_sum = row_norms(X, squared=True).max() + step_size = get_auto_step_size( + max_squared_sum, + alpha_scaled, + loss, + fit_intercept, + n_samples=n_samples, + is_saga=is_saga, + ) + if step_size * alpha_scaled == 1: + raise ZeroDivisionError( + "Current sag implementation does not handle " + "the case step_size * alpha_scaled == 1" + ) + + sag = sag64 if X.dtype == np.float64 else sag32 + num_seen, n_iter_ = sag( + dataset, + coef_init, + intercept_init, + n_samples, + n_features, + n_classes, + tol, + max_iter, + loss, + step_size, + alpha_scaled, + beta_scaled, + sum_gradient_init, + gradient_memory_init, + seen_init, + num_seen_init, + fit_intercept, + intercept_sum_gradient, + intercept_decay, + is_saga, + verbose, + ) + + if n_iter_ == max_iter: + warnings.warn( + "The max_iter was reached which means the coef_ did not converge", + ConvergenceWarning, + ) + + if fit_intercept: + coef_init = np.vstack((coef_init, intercept_init)) + + warm_start_mem = { + "coef": coef_init, + "sum_gradient": sum_gradient_init, + "intercept_sum_gradient": intercept_sum_gradient, + "gradient_memory": gradient_memory_init, + "seen": seen_init, + "num_seen": num_seen, + } + + if loss == "multinomial": + coef_ = coef_init.T + else: + coef_ = coef_init[:, 0] + + return coef_, n_iter_, warm_start_mem diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_sag_fast.pyx.tp b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_sag_fast.pyx.tp new file mode 100644 index 0000000000000000000000000000000000000000..906928673b0b7570cd7d5e819f9dd521539f5233 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_sag_fast.pyx.tp @@ -0,0 +1,642 @@ +{{py: + +""" + +Template file for easily generate fused types consistent code using Tempita +(https://github.com/cython/cython/blob/master/Cython/Tempita/_tempita.py). + +Generated file: sag_fast.pyx + +Each class is duplicated for all dtypes (float and double). The keywords +between double braces are substituted during the build. 
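At build time this file is run through Tempita, so everything between the double braces is expanded once per dtype before Cython ever sees it. A toy sketch of that expansion, assuming Cython's bundled Tempita module (as the scikit-learn build helpers are assumed to use)::

    from Cython import Tempita

    toy = Tempita.Template(
        "{{py: dtypes = [('64', 'double'), ('32', 'float')]}}"
        "{{for name_suffix, c_type in dtypes}}\n"
        "cdef {{c_type}} fmax{{name_suffix}}({{c_type}} x, {{c_type}} y)"
        "{{endfor}}\n"
    )
    print(toy.substitute())   # one 'double fmax64' and one 'float fmax32' signature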
+""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +# name_suffix, c_type, np_type +dtypes = [('64', 'double', 'np.float64'), + ('32', 'float', 'np.float32')] + +}} +"""SAG and SAGA implementation""" + +import numpy as np +from libc.math cimport exp, fabs, isfinite, log +from libc.time cimport time, time_t +from libc.stdio cimport printf + +from .._loss._loss cimport ( + CyLossFunction, + CyHalfBinomialLoss, + CyHalfMultinomialLoss, + CyHalfSquaredError, +) +from ..utils._seq_dataset cimport SequentialDataset32, SequentialDataset64 + + +{{for name_suffix, c_type, np_type in dtypes}} + +cdef inline {{c_type}} fmax{{name_suffix}}({{c_type}} x, {{c_type}} y) noexcept nogil: + if x > y: + return x + return y + +{{endfor}} + +{{for name_suffix, c_type, np_type in dtypes}} + +cdef inline {{c_type}} _soft_thresholding{{name_suffix}}({{c_type}} x, {{c_type}} shrinkage) noexcept nogil: + return fmax{{name_suffix}}(x - shrinkage, 0) - fmax{{name_suffix}}(- x - shrinkage, 0) + +{{endfor}} + + +{{for name_suffix, c_type, np_type in dtypes}} + +def sag{{name_suffix}}( + SequentialDataset{{name_suffix}} dataset, + {{c_type}}[:, ::1] weights_array, + {{c_type}}[::1] intercept_array, + int n_samples, + int n_features, + int n_classes, + double tol, + int max_iter, + str loss_function, + double step_size, + double alpha, + double beta, + {{c_type}}[:, ::1] sum_gradient_init, + {{c_type}}[:, ::1] gradient_memory_init, + bint[::1] seen_init, + int num_seen, + bint fit_intercept, + {{c_type}}[::1] intercept_sum_gradient_init, + double intercept_decay, + bint saga, + bint verbose +): + """Stochastic Average Gradient (SAG) and SAGA solvers. + + Used in Ridge and LogisticRegression. + + Some implementation details: + + - Just-in-time (JIT) update: In SAG(A), the average-gradient update is + collinear with the drawn sample X_i. Therefore, if the data is sparse, the + random sample X_i will change the average gradient only on features j where + X_ij != 0. In some cases, the average gradient on feature j might change + only after k random samples with no change. In these cases, instead of + applying k times the same gradient step on feature j, we apply the gradient + step only once, scaled by k. This is called the "just-in-time update", and + it is performed in `lagged_update{{name_suffix}}`. This function also + applies the proximal operator after the gradient step (if L1 regularization + is used in SAGA). + + - Weight scale: In SAG(A), the weights are scaled down at each iteration + due to the L2 regularization. To avoid updating all the weights at each + iteration, the weight scale is factored out in a separate variable `wscale` + which is only used in the JIT update. When this variable is too small, it + is reset for numerical stability using the function + `scale_weights{{name_suffix}}`. This reset requires applying all remaining + JIT updates. This reset is also performed every `n_samples` iterations + before each convergence check, so when the algorithm stops, we are sure + that there is no remaining JIT updates. + + Reference + --------- + Schmidt, M., Roux, N. L., & Bach, F. (2013). + Minimizing finite sums with the stochastic average gradient + https://hal.inria.fr/hal-00860051/document + (section 4.3) + + :arxiv:`Defazio, A., Bach F. & Lacoste-Julien S. (2014). 
+ "SAGA: A Fast Incremental Gradient Method With Support + for Non-Strongly Convex Composite Objectives" <1407.0202>` + """ + # the data pointer for x, the current sample + cdef {{c_type}} *x_data_ptr = NULL + # the index pointer for the column of the data + cdef int *x_ind_ptr = NULL + # the number of non-zero features for current sample + cdef int xnnz = -1 + # the label value for current sample + # the label value for current sample + cdef {{c_type}} y + # the sample weight + cdef {{c_type}} sample_weight + + # helper variable for indexes + cdef int f_idx, s_idx, feature_ind, class_ind, j + # the number of pass through all samples + cdef int n_iter = 0 + # helper to track iterations through samples + cdef int sample_itr + # the index (row number) of the current sample + cdef int sample_ind + + # the maximum change in weights, used to compute stopping criteria + cdef {{c_type}} max_change + # a holder variable for the max weight, used to compute stopping criteria + cdef {{c_type}} max_weight + + # the start time of the fit + cdef time_t start_time + # the end time of the fit + cdef time_t end_time + + # precomputation since the step size does not change in this implementation + cdef {{c_type}} wscale_update = 1.0 - step_size * alpha + + # helper for cumulative sum + cdef {{c_type}} cum_sum + + # the pointer to the coef_ or weights + cdef {{c_type}}* weights = &weights_array[0, 0] + + # the sum of gradients for each feature + cdef {{c_type}}* sum_gradient = &sum_gradient_init[0, 0] + + # the previously seen gradient for each sample + cdef {{c_type}}* gradient_memory = &gradient_memory_init[0, 0] + + # the cumulative sums needed for JIT params + cdef {{c_type}}[::1] cumulative_sums = np.empty(n_samples, dtype={{np_type}}, order="c") + + # the index for the last time this feature was updated + cdef int[::1] feature_hist = np.zeros(n_features, dtype=np.int32, order="c") + + # the previous weights to use to compute stopping criteria + cdef {{c_type}}[:, ::1] previous_weights_array = np.zeros((n_features, n_classes), dtype={{np_type}}, order="c") + cdef {{c_type}}* previous_weights = &previous_weights_array[0, 0] + + cdef {{c_type}}[::1] prediction = np.zeros(n_classes, dtype={{np_type}}, order="c") + + cdef {{c_type}}[::1] gradient = np.zeros(n_classes, dtype={{np_type}}, order="c") + + # Intermediate variable that need declaration since cython cannot infer when templating + cdef {{c_type}} val + + # Bias correction term in saga + cdef {{c_type}} gradient_correction + + # the scalar used for multiplying z + cdef {{c_type}} wscale = 1.0 + + # return value (-1 if an error occurred, 0 otherwise) + cdef int status = 0 + + # the cumulative sums for each iteration for the sparse implementation + cumulative_sums[0] = 0.0 + + # the multipliative scale needed for JIT params + cdef {{c_type}}[::1] cumulative_sums_prox + cdef {{c_type}}* cumulative_sums_prox_ptr + + cdef bint prox = beta > 0 and saga + + # Loss function to optimize + cdef CyLossFunction loss + # Whether the loss function is multinomial + cdef bint multinomial = False + # Multinomial loss function + cdef CyHalfMultinomialLoss multiloss + + if loss_function == "multinomial": + multinomial = True + multiloss = CyHalfMultinomialLoss() + elif loss_function == "log": + loss = CyHalfBinomialLoss() + elif loss_function == "squared": + loss = CyHalfSquaredError() + else: + raise ValueError("Invalid loss parameter: got %s instead of " + "one of ('log', 'squared', 'multinomial')" + % loss_function) + + if prox: + cumulative_sums_prox = 
np.empty(n_samples, dtype={{np_type}}, order="c") + cumulative_sums_prox_ptr = &cumulative_sums_prox[0] + else: + cumulative_sums_prox = None + cumulative_sums_prox_ptr = NULL + + with nogil: + start_time = time(NULL) + for n_iter in range(max_iter): + for sample_itr in range(n_samples): + # extract a random sample + sample_ind = dataset.random(&x_data_ptr, &x_ind_ptr, &xnnz, &y, &sample_weight) + + # cached index for gradient_memory + s_idx = sample_ind * n_classes + + # update the number of samples seen and the seen array + if seen_init[sample_ind] == 0: + num_seen += 1 + seen_init[sample_ind] = 1 + + # make the weight updates (just-in-time gradient step, and prox operator) + if sample_itr > 0: + status = lagged_update{{name_suffix}}( + weights=weights, + wscale=wscale, + xnnz=xnnz, + n_samples=n_samples, + n_classes=n_classes, + sample_itr=sample_itr, + cumulative_sums=&cumulative_sums[0], + cumulative_sums_prox=cumulative_sums_prox_ptr, + feature_hist=&feature_hist[0], + prox=prox, + sum_gradient=sum_gradient, + x_ind_ptr=x_ind_ptr, + reset=False, + n_iter=n_iter + ) + if status == -1: + break + + # find the current prediction + predict_sample{{name_suffix}}( + x_data_ptr=x_data_ptr, + x_ind_ptr=x_ind_ptr, + xnnz=xnnz, + w_data_ptr=weights, + wscale=wscale, + intercept=&intercept_array[0], + prediction=&prediction[0], + n_classes=n_classes + ) + + # compute the gradient for this sample, given the prediction + if multinomial: + multiloss.cy_gradient( + y_true=y, + raw_prediction=prediction, + sample_weight=sample_weight, + gradient_out=gradient, + ) + else: + gradient[0] = loss.cy_gradient(y, prediction[0]) * sample_weight + + # L2 regularization by simply rescaling the weights + wscale *= wscale_update + + # make the updates to the sum of gradients + for j in range(xnnz): + feature_ind = x_ind_ptr[j] + val = x_data_ptr[j] + f_idx = feature_ind * n_classes + for class_ind in range(n_classes): + gradient_correction = \ + val * (gradient[class_ind] - + gradient_memory[s_idx + class_ind]) + if saga: + # Note that this is not the main gradient step, + # which is performed just-in-time in lagged_update. + # This part is done outside the JIT update + # as it does not depend on the average gradient. + # The prox operator is applied after the JIT update + weights[f_idx + class_ind] -= \ + (gradient_correction * step_size + * (1 - 1. / num_seen) / wscale) + sum_gradient[f_idx + class_ind] += gradient_correction + + # fit the intercept + if fit_intercept: + for class_ind in range(n_classes): + gradient_correction = (gradient[class_ind] - + gradient_memory[s_idx + class_ind]) + intercept_sum_gradient_init[class_ind] += gradient_correction + gradient_correction *= step_size * (1. - 1. 
/ num_seen) + if saga: + intercept_array[class_ind] -= \ + (step_size * intercept_sum_gradient_init[class_ind] / + num_seen * intercept_decay) + gradient_correction + else: + intercept_array[class_ind] -= \ + (step_size * intercept_sum_gradient_init[class_ind] / + num_seen * intercept_decay) + + # check to see that the intercept is not inf or NaN + if not isfinite(intercept_array[class_ind]): + status = -1 + break + # Break from the n_samples outer loop if an error happened + # in the fit_intercept n_classes inner loop + if status == -1: + break + + # update the gradient memory for this sample + for class_ind in range(n_classes): + gradient_memory[s_idx + class_ind] = gradient[class_ind] + + if sample_itr == 0: + cumulative_sums[0] = step_size / (wscale * num_seen) + if prox: + cumulative_sums_prox[0] = step_size * beta / wscale + else: + cumulative_sums[sample_itr] = \ + (cumulative_sums[sample_itr - 1] + + step_size / (wscale * num_seen)) + if prox: + cumulative_sums_prox[sample_itr] = \ + (cumulative_sums_prox[sample_itr - 1] + + step_size * beta / wscale) + # If wscale gets too small, we need to reset the scale. + # This also resets the just-in-time update system. + if wscale < 1e-9: + if verbose: + with gil: + print("rescaling...") + status = scale_weights{{name_suffix}}( + weights=weights, + wscale=&wscale, + n_features=n_features, + n_samples=n_samples, + n_classes=n_classes, + sample_itr=sample_itr, + cumulative_sums=&cumulative_sums[0], + cumulative_sums_prox=cumulative_sums_prox_ptr, + feature_hist=&feature_hist[0], + prox=prox, + sum_gradient=sum_gradient, + n_iter=n_iter + ) + if status == -1: + break + + # Break from the n_iter outer loop if an error happened in the + # n_samples inner loop + if status == -1: + break + + # We scale the weights every n_samples iterations and reset the + # just-in-time update system for numerical stability. + # Because this reset is done before every convergence check, we are + # sure there is no remaining lagged update when the algorithm stops. + status = scale_weights{{name_suffix}}( + weights=weights, + wscale=&wscale, + n_features=n_features, + n_samples=n_samples, + n_classes=n_classes, + sample_itr=n_samples - 1, + cumulative_sums=&cumulative_sums[0], + cumulative_sums_prox=cumulative_sums_prox_ptr, + feature_hist=&feature_hist[0], + prox=prox, + sum_gradient=sum_gradient, + n_iter=n_iter + ) + if status == -1: + break + + # check if the stopping criteria is reached + max_change = 0.0 + max_weight = 0.0 + for idx in range(n_features * n_classes): + max_weight = fmax{{name_suffix}}(max_weight, fabs(weights[idx])) + max_change = fmax{{name_suffix}}(max_change, fabs(weights[idx] - previous_weights[idx])) + previous_weights[idx] = weights[idx] + if ((max_weight != 0 and max_change / max_weight <= tol) + or max_weight == 0 and max_change == 0): + if verbose: + end_time = time(NULL) + with gil: + print("convergence after %d epochs took %d seconds" % + (n_iter + 1, end_time - start_time)) + break + elif verbose: + printf('Epoch %d, change: %.8g\n', n_iter + 1, + max_change / max_weight) + n_iter += 1 + # We do the error treatment here based on error code in status to avoid + # re-acquiring the GIL within the cython code, which slows the computation + # when the sag/saga solver is used concurrently in multiple Python threads. + if status == -1: + raise ValueError(("Floating-point under-/overflow occurred at epoch" + " #%d. 
Scaling input data with StandardScaler or" + " MinMaxScaler might help.") % n_iter) + + if verbose and n_iter >= max_iter: + end_time = time(NULL) + print(("max_iter reached after %d seconds") % + (end_time - start_time)) + + return num_seen, n_iter + +{{endfor}} + + +{{for name_suffix, c_type, np_type in dtypes}} + +cdef int scale_weights{{name_suffix}}( + {{c_type}}* weights, + {{c_type}}* wscale, + int n_features, + int n_samples, + int n_classes, + int sample_itr, + {{c_type}}* cumulative_sums, + {{c_type}}* cumulative_sums_prox, + int* feature_hist, + bint prox, + {{c_type}}* sum_gradient, + int n_iter +) noexcept nogil: + """Scale the weights and reset wscale to 1.0 for numerical stability, and + reset the just-in-time (JIT) update system. + + See `sag{{name_suffix}}`'s docstring about the JIT update system. + + wscale = (1 - step_size * alpha) ** (n_iter * n_samples + sample_itr) + can become very small, so we reset it every n_samples iterations to 1.0 for + numerical stability. To be able to scale, we first need to update every + coefficients and reset the just-in-time update system. + This also limits the size of `cumulative_sums`. + """ + + cdef int status + status = lagged_update{{name_suffix}}( + weights, + wscale[0], + n_features, + n_samples, + n_classes, + sample_itr + 1, + cumulative_sums, + cumulative_sums_prox, + feature_hist, + prox, + sum_gradient, + NULL, + True, + n_iter + ) + # if lagged update succeeded, reset wscale to 1.0 + if status == 0: + wscale[0] = 1.0 + return status + +{{endfor}} + + +{{for name_suffix, c_type, np_type in dtypes}} + +cdef int lagged_update{{name_suffix}}( + {{c_type}}* weights, + {{c_type}} wscale, + int xnnz, + int n_samples, + int n_classes, + int sample_itr, + {{c_type}}* cumulative_sums, + {{c_type}}* cumulative_sums_prox, + int* feature_hist, + bint prox, + {{c_type}}* sum_gradient, + int* x_ind_ptr, + bint reset, + int n_iter +) noexcept nogil: + """Hard perform the JIT updates for non-zero features of present sample. + + See `sag{{name_suffix}}`'s docstring about the JIT update system. + + The updates that awaits are kept in memory using cumulative_sums, + cumulative_sums_prox, wscale and feature_hist. See original SAGA paper + (Defazio et al. 2014) for details. If reset=True, we also reset wscale to + 1 (this is done at the end of each epoch). + """ + cdef int feature_ind, class_ind, idx, f_idx, lagged_ind, last_update_ind + cdef {{c_type}} cum_sum, grad_step, prox_step, cum_sum_prox + for feature_ind in range(xnnz): + if not reset: + feature_ind = x_ind_ptr[feature_ind] + f_idx = feature_ind * n_classes + + cum_sum = cumulative_sums[sample_itr - 1] + if prox: + cum_sum_prox = cumulative_sums_prox[sample_itr - 1] + if feature_hist[feature_ind] != 0: + cum_sum -= cumulative_sums[feature_hist[feature_ind] - 1] + if prox: + cum_sum_prox -= cumulative_sums_prox[feature_hist[feature_ind] - 1] + if not prox: + for class_ind in range(n_classes): + idx = f_idx + class_ind + weights[idx] -= cum_sum * sum_gradient[idx] + if reset: + weights[idx] *= wscale + if not isfinite(weights[idx]): + # returning here does not require the gil as the return + # type is a C integer + return -1 + else: + for class_ind in range(n_classes): + idx = f_idx + class_ind + if fabs(sum_gradient[idx] * cum_sum) < cum_sum_prox: + # In this case, we can perform all the gradient steps and + # all the proximal steps in this order, which is more + # efficient than unrolling all the lagged updates. + # Idea taken from scikit-learn-contrib/lightning. 
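+                        # _soft_thresholding below is the L1 proximal map:
+                        #   soft(x, s) = sign(x) * max(|x| - s, 0)
+                        # e.g. soft(0.7, 0.2) = 0.5 and soft(-0.1, 0.2) = 0.0,
+                        # so the grouped step first applies the accumulated gradient
+                        # and then shrinks by the accumulated l1 penalty cum_sum_prox.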
+ weights[idx] -= cum_sum * sum_gradient[idx] + weights[idx] = _soft_thresholding{{name_suffix}}(weights[idx], + cum_sum_prox) + else: + last_update_ind = feature_hist[feature_ind] + if last_update_ind == -1: + last_update_ind = sample_itr - 1 + for lagged_ind in range(sample_itr - 1, + last_update_ind - 1, -1): + if lagged_ind > 0: + grad_step = (cumulative_sums[lagged_ind] + - cumulative_sums[lagged_ind - 1]) + prox_step = (cumulative_sums_prox[lagged_ind] + - cumulative_sums_prox[lagged_ind - 1]) + else: + grad_step = cumulative_sums[lagged_ind] + prox_step = cumulative_sums_prox[lagged_ind] + weights[idx] -= sum_gradient[idx] * grad_step + weights[idx] = _soft_thresholding{{name_suffix}}(weights[idx], + prox_step) + + if reset: + weights[idx] *= wscale + # check to see that the weight is not inf or NaN + if not isfinite(weights[idx]): + return -1 + if reset: + feature_hist[feature_ind] = sample_itr % n_samples + else: + feature_hist[feature_ind] = sample_itr + + if reset: + cumulative_sums[sample_itr - 1] = 0.0 + if prox: + cumulative_sums_prox[sample_itr - 1] = 0.0 + + return 0 + +{{endfor}} + + +{{for name_suffix, c_type, np_type in dtypes}} + +cdef void predict_sample{{name_suffix}}( + {{c_type}}* x_data_ptr, + int* x_ind_ptr, + int xnnz, + {{c_type}}* w_data_ptr, + {{c_type}} wscale, + {{c_type}}* intercept, + {{c_type}}* prediction, + int n_classes +) noexcept nogil: + """Compute the prediction given sparse sample x and dense weight w. + + Parameters + ---------- + x_data_ptr : pointer + Pointer to the data of the sample x + + x_ind_ptr : pointer + Pointer to the indices of the sample x + + xnnz : int + Number of non-zero element in the sample x + + w_data_ptr : pointer + Pointer to the data of the weights w + + wscale : {{c_type}} + Scale of the weights w + + intercept : pointer + Pointer to the intercept + + prediction : pointer + Pointer to store the resulting prediction + + n_classes : int + Number of classes in multinomial case. Equals 1 in binary case. + + """ + cdef int feature_ind, class_ind, j + cdef {{c_type}} innerprod + + for class_ind in range(n_classes): + innerprod = 0.0 + # Compute the dot product only on non-zero elements of x + for j in range(xnnz): + feature_ind = x_ind_ptr[j] + innerprod += (w_data_ptr[feature_ind * n_classes + class_ind] * + x_data_ptr[j]) + + prediction[class_ind] = wscale * innerprod + intercept[class_ind] + + +{{endfor}} diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_sgd_fast.pyx.tp b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_sgd_fast.pyx.tp new file mode 100644 index 0000000000000000000000000000000000000000..45cdf9172d8c455e3ba27f5755337683dd704aad --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_sgd_fast.pyx.tp @@ -0,0 +1,661 @@ +{{py: + +""" +Template file to easily generate fused types consistent code using Tempita +(https://github.com/cython/cython/blob/master/Cython/Tempita/_tempita.py). + +Generated file: _sgd_fast.pyx + +Each relevant function is duplicated for the dtypes float and double. +The keywords between double braces are substituted during the build. 
+""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +# The dtypes are defined as follows (name_suffix, c_type, np_type) +dtypes = [ + ("64", "double", "np.float64"), + ("32", "float", "np.float32"), +] + +}} +"""SGD implementation""" + +import numpy as np +from time import time + +from cython cimport floating +from libc.math cimport exp, fabs, isfinite, log, pow, INFINITY + +from .._loss._loss cimport CyLossFunction +from ..utils._typedefs cimport uint32_t, uint8_t +from ..utils._weight_vector cimport WeightVector32, WeightVector64 +from ..utils._seq_dataset cimport SequentialDataset32, SequentialDataset64 + + +cdef extern from *: + """ + /* Penalty constants */ + #define NO_PENALTY 0 + #define L1 1 + #define L2 2 + #define ELASTICNET 3 + + /* Learning rate constants */ + #define CONSTANT 1 + #define OPTIMAL 2 + #define INVSCALING 3 + #define ADAPTIVE 4 + #define PA1 5 + #define PA2 6 + """ + int NO_PENALTY = 0 + int L1 = 1 + int L2 = 2 + int ELASTICNET = 3 + + int CONSTANT = 1 + int OPTIMAL = 2 + int INVSCALING = 3 + int ADAPTIVE = 4 + int PA1 = 5 + int PA2 = 6 + + +# ---------------------------------------- +# Extension Types for Loss Functions +# ---------------------------------------- + +cdef class Regression(CyLossFunction): + """Base class for loss functions for regression""" + + def py_loss(self, double p, double y): + """Python version of `loss` for testing only. + + Pytest needs a python function and can't use cdef functions. + + Parameters + ---------- + p : double + The prediction, `p = w^T x + intercept`. + y : double + The true value (aka target). + + Returns + ------- + double + The loss evaluated at `p` and `y`. + """ + return self.cy_loss(y, p) + + def py_dloss(self, double p, double y): + """Python version of `dloss` for testing only. + + Pytest needs a python function and can't use cdef functions. + + Parameters + ---------- + p : double + The prediction, `p = w^T x`. + y : double + The true value (aka target). + + Returns + ------- + double + The derivative of the loss function with regards to `p`. + """ + return self.cy_gradient(y, p) + + +cdef class Classification(CyLossFunction): + """Base class for loss functions for classification""" + + def py_loss(self, double p, double y): + """Python version of `loss` for testing only.""" + return self.cy_loss(y, p) + + def py_dloss(self, double p, double y): + """Python version of `dloss` for testing only.""" + return self.cy_gradient(y, p) + + +cdef class ModifiedHuber(Classification): + """Modified Huber loss for binary classification with y in {-1, 1} + + This is equivalent to quadratically smoothed SVM with gamma = 2. + + See T. Zhang 'Solving Large Scale Linear Prediction Problems Using + Stochastic Gradient Descent', ICML'04. + """ + cdef double cy_loss(self, double y, double p) noexcept nogil: + cdef double z = p * y + if z >= 1.0: + return 0.0 + elif z >= -1.0: + return (1.0 - z) * (1.0 - z) + else: + return -4.0 * z + + cdef double cy_gradient(self, double y, double p) noexcept nogil: + cdef double z = p * y + if z >= 1.0: + return 0.0 + elif z >= -1.0: + return 2.0 * (1.0 - z) * -y + else: + return -4.0 * y + + def __reduce__(self): + return ModifiedHuber, () + + +cdef class Hinge(Classification): + """Hinge loss for binary classification tasks with y in {-1,1} + + Parameters + ---------- + + threshold : float > 0.0 + Margin threshold. When threshold=1.0, one gets the loss used by SVM. + When threshold=0.0, one gets the loss used by the Perceptron. 
+ """ + + cdef double threshold + + def __init__(self, double threshold=1.0): + self.threshold = threshold + + cdef double cy_loss(self, double y, double p) noexcept nogil: + cdef double z = p * y + if z <= self.threshold: + return self.threshold - z + return 0.0 + + cdef double cy_gradient(self, double y, double p) noexcept nogil: + cdef double z = p * y + if z <= self.threshold: + return -y + return 0.0 + + def __reduce__(self): + return Hinge, (self.threshold,) + + +cdef class SquaredHinge(Classification): + """Squared Hinge loss for binary classification tasks with y in {-1,1} + + Parameters + ---------- + + threshold : float > 0.0 + Margin threshold. When threshold=1.0, one gets the loss used by + (quadratically penalized) SVM. + """ + + cdef double threshold + + def __init__(self, double threshold=1.0): + self.threshold = threshold + + cdef double cy_loss(self, double y, double p) noexcept nogil: + cdef double z = self.threshold - p * y + if z > 0: + return z * z + return 0.0 + + cdef double cy_gradient(self, double y, double p) noexcept nogil: + cdef double z = self.threshold - p * y + if z > 0: + return -2 * y * z + return 0.0 + + def __reduce__(self): + return SquaredHinge, (self.threshold,) + + +cdef class EpsilonInsensitive(Regression): + """Epsilon-Insensitive loss (used by SVR). + + loss = max(0, |y - p| - epsilon) + """ + + cdef double epsilon + + def __init__(self, double epsilon): + self.epsilon = epsilon + + cdef double cy_loss(self, double y, double p) noexcept nogil: + cdef double ret = fabs(y - p) - self.epsilon + return ret if ret > 0 else 0 + + cdef double cy_gradient(self, double y, double p) noexcept nogil: + if y - p > self.epsilon: + return -1 + elif p - y > self.epsilon: + return 1 + else: + return 0 + + def __reduce__(self): + return EpsilonInsensitive, (self.epsilon,) + + +cdef class SquaredEpsilonInsensitive(Regression): + """Epsilon-Insensitive loss. + + loss = max(0, |y - p| - epsilon)^2 + """ + + cdef double epsilon + + def __init__(self, double epsilon): + self.epsilon = epsilon + + cdef double cy_loss(self, double y, double p) noexcept nogil: + cdef double ret = fabs(y - p) - self.epsilon + return ret * ret if ret > 0 else 0 + + cdef double cy_gradient(self, double y, double p) noexcept nogil: + cdef double z + z = y - p + if z > self.epsilon: + return -2 * (z - self.epsilon) + elif z < -self.epsilon: + return 2 * (-z - self.epsilon) + else: + return 0 + + def __reduce__(self): + return SquaredEpsilonInsensitive, (self.epsilon,) + +{{for name_suffix, c_type, np_type in dtypes}} + +def _plain_sgd{{name_suffix}}( + const {{c_type}}[::1] weights, + double intercept, + const {{c_type}}[::1] average_weights, + double average_intercept, + CyLossFunction loss, + int penalty_type, + double alpha, + double C, + double l1_ratio, + SequentialDataset{{name_suffix}} dataset, + const uint8_t[::1] validation_mask, + bint early_stopping, + validation_score_cb, + int n_iter_no_change, + unsigned int max_iter, + double tol, + int fit_intercept, + int verbose, + bint shuffle, + uint32_t seed, + double weight_pos, + double weight_neg, + int learning_rate, + double eta0, + double power_t, + bint one_class, + double t=1.0, + double intercept_decay=1.0, + int average=0, +): + """SGD for generic loss functions and penalties with optional averaging + + Parameters + ---------- + weights : ndarray[{{c_type}}, ndim=1] + The allocated vector of weights. + intercept : double + The initial intercept. 
+ average_weights : ndarray[{{c_type}}, ndim=1] + The average weights as computed for ASGD. Should be None if average + is 0. + average_intercept : double + The average intercept for ASGD. Should be 0 if average is 0. + loss : CyLossFunction + A concrete ``CyLossFunction`` object. + penalty_type : int + The penalty 2 for L2, 1 for L1, and 3 for Elastic-Net. + alpha : float + The regularization parameter. + C : float + Maximum step size for passive aggressive. + l1_ratio : float + The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1. + l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1. + dataset : SequentialDataset + A concrete ``SequentialDataset`` object. + validation_mask : ndarray[uint8_t, ndim=1] + Equal to True on the validation set. + early_stopping : boolean + Whether to use a stopping criterion based on the validation set. + validation_score_cb : callable + A callable to compute a validation score given the current + coefficients and intercept values. + Used only if early_stopping is True. + n_iter_no_change : int + Number of iteration with no improvement to wait before stopping. + max_iter : int + The maximum number of iterations (epochs). + tol: double + The tolerance for the stopping criterion. + fit_intercept : int + Whether or not to fit the intercept (1 or 0). + verbose : int + Print verbose output; 0 for quite. + shuffle : boolean + Whether to shuffle the training data before each epoch. + weight_pos : float + The weight of the positive class. + weight_neg : float + The weight of the negative class. + seed : uint32_t + Seed of the pseudorandom number generator used to shuffle the data. + learning_rate : int + The learning rate: + (1) constant, eta = eta0 + (2) optimal, eta = 1.0/(alpha * t). + (3) inverse scaling, eta = eta0 / pow(t, power_t) + (4) adaptive decrease + (5) Passive Aggressive-I, eta = min(alpha, loss/norm(x)) + (6) Passive Aggressive-II, eta = 1.0 / (norm(x) + 0.5*alpha) + eta0 : double + The initial learning rate. + power_t : double + The exponent for inverse scaling learning rate. + one_class : boolean + Whether to solve the One-Class SVM optimization problem. + t : double + Initial state of the learning rate. This value is equal to the + iteration count except when the learning rate is set to `optimal`. + Default: 1.0. + average : int + The number of iterations before averaging starts. average=1 is + equivalent to averaging for all iterations. + + + Returns + ------- + weights : array, shape=[n_features] + The fitted weight vector. + intercept : float + The fitted intercept term. + average_weights : array shape=[n_features] + The averaged weights across iterations. Values are valid only if + average > 0. + average_intercept : float + The averaged intercept across iterations. + Values are valid only if average > 0. + n_iter_ : int + The actual number of iter (epochs). 
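# A small, self-contained illustration of the eta schedules listed above
# (constant, optimal and inverse scaling); t0 stands in for the offset that
# the optimal schedule derives from alpha (optimal_init in the function body).
def eta_constant(eta0, t):
    return eta0

def eta_optimal(alpha, t, t0=0.0):
    return 1.0 / (alpha * (t0 + t))

def eta_invscaling(eta0, t, power_t=0.5):
    return eta0 / t ** power_t

for t in (1, 10, 100):
    print(t, eta_constant(0.1, t), eta_optimal(1e-2, t), eta_invscaling(0.1, t))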
+ """ + + # get the data information into easy vars + cdef Py_ssize_t n_samples = dataset.n_samples + cdef Py_ssize_t n_features = weights.shape[0] + + cdef WeightVector{{name_suffix}} w = WeightVector{{name_suffix}}(weights, average_weights) + cdef {{c_type}} *x_data_ptr = NULL + cdef int *x_ind_ptr = NULL + + # helper variables + cdef int no_improvement_count = 0 + cdef bint infinity = False + cdef int xnnz + cdef double eta = 0.0 + cdef double p = 0.0 + cdef double update = 0.0 + cdef double intercept_update = 0.0 + cdef double sumloss = 0.0 + cdef double score = 0.0 + cdef double best_loss = INFINITY + cdef double best_score = -INFINITY + cdef {{c_type}} y = 0.0 + cdef {{c_type}} sample_weight + cdef {{c_type}} class_weight = 1.0 + cdef unsigned int count = 0 + cdef unsigned int train_count = n_samples - np.sum(validation_mask) + cdef unsigned int epoch = 0 + cdef unsigned int i = 0 + cdef int is_hinge = isinstance(loss, Hinge) + cdef double optimal_init = 0.0 + cdef double dloss = 0.0 + cdef double MAX_DLOSS = 1e12 + + cdef long long sample_index + + # q vector is only used for L1 regularization + cdef {{c_type}}[::1] q = None + cdef {{c_type}} * q_data_ptr = NULL + if penalty_type == L1 or penalty_type == ELASTICNET: + q = np.zeros((n_features,), dtype={{np_type}}, order="c") + q_data_ptr = &q[0] + cdef double u = 0.0 + + if penalty_type == L2: + l1_ratio = 0.0 + elif penalty_type == L1: + l1_ratio = 1.0 + + eta = eta0 + + if learning_rate == OPTIMAL: + typw = np.sqrt(1.0 / np.sqrt(alpha)) + # computing eta0, the initial learning rate + initial_eta0 = typw / max(1.0, loss.cy_gradient(1.0, -typw)) + # initialize t such that eta at first sample equals eta0 + optimal_init = 1.0 / (initial_eta0 * alpha) + + t_start = time() + with nogil: + for epoch in range(max_iter): + sumloss = 0 + if verbose > 0: + with gil: + print("-- Epoch %d" % (epoch + 1)) + if shuffle: + dataset.shuffle(seed) + for i in range(n_samples): + dataset.next(&x_data_ptr, &x_ind_ptr, &xnnz, + &y, &sample_weight) + + sample_index = dataset.index_data_ptr[dataset.current_index] + if validation_mask[sample_index]: + # do not learn on the validation set + continue + + p = w.dot(x_data_ptr, x_ind_ptr, xnnz) + intercept + if learning_rate == OPTIMAL: + eta = 1.0 / (alpha * (optimal_init + t - 1)) + elif learning_rate == INVSCALING: + eta = eta0 / pow(t, power_t) + + if verbose or not early_stopping: + sumloss += loss.cy_loss(y, p) + + if y > 0.0: + class_weight = weight_pos + else: + class_weight = weight_neg + + if learning_rate == PA1: + update = sqnorm(x_data_ptr, x_ind_ptr, xnnz) + if update == 0: + continue + update = min(C, loss.cy_loss(y, p) / update) + elif learning_rate == PA2: + update = sqnorm(x_data_ptr, x_ind_ptr, xnnz) + update = loss.cy_loss(y, p) / (update + 0.5 / C) + else: + dloss = loss.cy_gradient(y, p) + # clip dloss with large values to avoid numerical + # instabilities + if dloss < -MAX_DLOSS: + dloss = -MAX_DLOSS + elif dloss > MAX_DLOSS: + dloss = MAX_DLOSS + update = -eta * dloss + + if learning_rate >= PA1: + if is_hinge: + # classification + update *= y + elif y - p < 0: + # regression + update *= -1 + + update *= class_weight * sample_weight + + if penalty_type >= L2: + # do not scale to negative values when eta or alpha are too + # big: instead set the weights to zero + w.scale(max(0, 1.0 - ((1.0 - l1_ratio) * eta * alpha))) + + if update != 0.0: + w.add(x_data_ptr, x_ind_ptr, xnnz, update) + if fit_intercept == 1: + intercept_update = update + if one_class: # specific for One-Class SVM + 
intercept_update -= 2. * eta * alpha + if intercept_update != 0: + intercept += intercept_update * intercept_decay + + if 0 < average <= t: + # compute the average for the intercept and update the + # average weights, this is done regardless as to whether + # the update is 0 + + w.add_average(x_data_ptr, x_ind_ptr, xnnz, + update, (t - average + 1)) + average_intercept += ((intercept - average_intercept) / + (t - average + 1)) + + if penalty_type == L1 or penalty_type == ELASTICNET: + u += (l1_ratio * eta * alpha) + l1penalty{{name_suffix}}(w, q_data_ptr, x_ind_ptr, xnnz, u) + + t += 1 + count += 1 + + # report epoch information + if verbose > 0: + with gil: + print("Norm: %.2f, NNZs: %d, Bias: %.6f, T: %d, " + "Avg. loss: %f" + % (w.norm(), np.nonzero(weights)[0].shape[0], + intercept, count, sumloss / train_count)) + print("Total training time: %.2f seconds." + % (time() - t_start)) + + # floating-point under-/overflow check. + if (not isfinite(intercept) or any_nonfinite(weights)): + infinity = True + break + + # evaluate the score on the validation set + if early_stopping: + with gil: + score = validation_score_cb(weights.base, intercept) + if tol > -INFINITY and score < best_score + tol: + no_improvement_count += 1 + else: + no_improvement_count = 0 + if score > best_score: + best_score = score + # or evaluate the loss on the training set + else: + if tol > -INFINITY and sumloss > best_loss - tol * train_count: + no_improvement_count += 1 + else: + no_improvement_count = 0 + if sumloss < best_loss: + best_loss = sumloss + + # if there is no improvement several times in a row + if no_improvement_count >= n_iter_no_change: + if learning_rate == ADAPTIVE and eta > 1e-6: + eta = eta / 5 + no_improvement_count = 0 + else: + if verbose: + with gil: + print("Convergence after %d epochs took %.2f " + "seconds" % (epoch + 1, time() - t_start)) + break + + if infinity: + raise ValueError(("Floating-point under-/overflow occurred at epoch" + " #%d. Scaling input data with StandardScaler or" + " MinMaxScaler might help.") % (epoch + 1)) + + w.reset_wscale() + + return ( + weights.base, + intercept, + None if average_weights is None else average_weights.base, + average_intercept, + epoch + 1 + ) + +{{endfor}} + + +cdef inline bint any_nonfinite(const floating[::1] w) noexcept nogil: + for i in range(w.shape[0]): + if not isfinite(w[i]): + return True + return 0 + + +cdef inline double sqnorm( + floating * x_data_ptr, + int * x_ind_ptr, + int xnnz, +) noexcept nogil: + cdef double x_norm = 0.0 + cdef int j + cdef double z + for j in range(xnnz): + z = x_data_ptr[j] + x_norm += z * z + return x_norm + + +{{for name_suffix, c_type, np_type in dtypes}} + +cdef void l1penalty{{name_suffix}}( + WeightVector{{name_suffix}} w, + {{c_type}} * q_data_ptr, + int *x_ind_ptr, + int xnnz, + double u, +) noexcept nogil: + """Apply the L1 penalty to each updated feature + + This implements the truncated gradient approach by + [Tsuruoka, Y., Tsujii, J., and Ananiadou, S., 2009]. 
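# A standalone NumPy sketch of the cumulative (truncated-gradient) L1 penalty
# cited above: u is the total L1 penalty available so far, q[j] records how
# much was actually applied to feature j, and weights are clipped at zero
# instead of being allowed to cross it. wscale handling is omitted here.
import numpy as np

def apply_l1_penalty(w, q, u):
    for j in range(w.shape[0]):
        z = w[j]
        if z > 0.0:
            w[j] = max(0.0, z - (u + q[j]))
        elif z < 0.0:
            w[j] = min(0.0, z + (u - q[j]))
        q[j] += w[j] - z              # penalty actually applied to feature j
    return w, q

w, q = np.array([0.8, -0.3, 0.05]), np.zeros(3)
print(apply_l1_penalty(w, q, u=0.1))
# -> (array([ 0.7, -0.2,  0. ]), array([-0.1 ,  0.1 , -0.05]))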
+ """ + cdef double z = 0.0 + cdef int j = 0 + cdef int idx = 0 + cdef double wscale = w.wscale + cdef {{c_type}} *w_data_ptr = w.w_data_ptr + for j in range(xnnz): + idx = x_ind_ptr[j] + z = w_data_ptr[idx] + if wscale * z > 0.0: + w_data_ptr[idx] = max( + 0.0, w_data_ptr[idx] - ((u + q_data_ptr[idx]) / wscale)) + + elif wscale * z < 0.0: + w_data_ptr[idx] = min( + 0.0, w_data_ptr[idx] + ((u - q_data_ptr[idx]) / wscale)) + + q_data_ptr[idx] += wscale * (w_data_ptr[idx] - z) + +{{endfor}} diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_stochastic_gradient.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_stochastic_gradient.py new file mode 100644 index 0000000000000000000000000000000000000000..8f7c814000614e91e2daf605835c0ebc69fc76c3 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_stochastic_gradient.py @@ -0,0 +1,2604 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +"""Classification, regression and One-Class SVM using Stochastic Gradient +Descent (SGD). +""" + +import warnings +from abc import ABCMeta, abstractmethod +from numbers import Integral, Real + +import numpy as np + +from .._loss._loss import CyHalfBinomialLoss, CyHalfSquaredError, CyHuberLoss +from ..base import ( + BaseEstimator, + OutlierMixin, + RegressorMixin, + _fit_context, + clone, + is_classifier, +) +from ..exceptions import ConvergenceWarning +from ..model_selection import ShuffleSplit, StratifiedShuffleSplit +from ..utils import check_random_state, compute_class_weight +from ..utils._param_validation import Hidden, Interval, StrOptions +from ..utils.extmath import safe_sparse_dot +from ..utils.metaestimators import available_if +from ..utils.multiclass import _check_partial_fit_first_call +from ..utils.parallel import Parallel, delayed +from ..utils.validation import _check_sample_weight, check_is_fitted, validate_data +from ._base import LinearClassifierMixin, SparseCoefMixin, make_dataset +from ._sgd_fast import ( + EpsilonInsensitive, + Hinge, + ModifiedHuber, + SquaredEpsilonInsensitive, + SquaredHinge, + _plain_sgd32, + _plain_sgd64, +) + +LEARNING_RATE_TYPES = { + "constant": 1, + "optimal": 2, + "invscaling": 3, + "adaptive": 4, + "pa1": 5, + "pa2": 6, +} + +PENALTY_TYPES = {"none": 0, "l2": 2, "l1": 1, "elasticnet": 3} + +DEFAULT_EPSILON = 0.1 +# Default value of ``epsilon`` parameter. 
+ +MAX_INT = np.iinfo(np.int32).max + + +class _ValidationScoreCallback: + """Callback for early stopping based on validation score""" + + def __init__(self, estimator, X_val, y_val, sample_weight_val, classes=None): + self.estimator = clone(estimator) + self.estimator.t_ = 1 # to pass check_is_fitted + if classes is not None: + self.estimator.classes_ = classes + self.X_val = X_val + self.y_val = y_val + self.sample_weight_val = sample_weight_val + + def __call__(self, coef, intercept): + est = self.estimator + est.coef_ = coef.reshape(1, -1) + est.intercept_ = np.atleast_1d(intercept) + return est.score(self.X_val, self.y_val, self.sample_weight_val) + + +class BaseSGD(SparseCoefMixin, BaseEstimator, metaclass=ABCMeta): + """Base class for SGD classification and regression.""" + + _parameter_constraints: dict = { + "fit_intercept": ["boolean"], + "max_iter": [Interval(Integral, 1, None, closed="left")], + "tol": [Interval(Real, 0, None, closed="left"), None], + "shuffle": ["boolean"], + "verbose": ["verbose"], + "random_state": ["random_state"], + "warm_start": ["boolean"], + "average": [Interval(Integral, 0, None, closed="neither"), "boolean"], + } + + def __init__( + self, + loss, + *, + penalty="l2", + alpha=0.0001, + C=1.0, + l1_ratio=0.15, + fit_intercept=True, + max_iter=1000, + tol=1e-3, + shuffle=True, + verbose=0, + epsilon=0.1, + random_state=None, + learning_rate="optimal", + eta0=0.0, + power_t=0.5, + early_stopping=False, + validation_fraction=0.1, + n_iter_no_change=5, + warm_start=False, + average=False, + ): + self.loss = loss + self.penalty = penalty + self.learning_rate = learning_rate + self.epsilon = epsilon + self.alpha = alpha + self.C = C + self.l1_ratio = l1_ratio + self.fit_intercept = fit_intercept + self.shuffle = shuffle + self.random_state = random_state + self.verbose = verbose + self.eta0 = eta0 + self.power_t = power_t + self.early_stopping = early_stopping + self.validation_fraction = validation_fraction + self.n_iter_no_change = n_iter_no_change + self.warm_start = warm_start + self.average = average + self.max_iter = max_iter + self.tol = tol + + @abstractmethod + def fit(self, X, y): + """Fit model.""" + + def _more_validate_params(self, for_partial_fit=False): + """Validate input params.""" + if self.early_stopping and for_partial_fit: + raise ValueError("early_stopping should be False with partial_fit") + if ( + self.learning_rate in ("constant", "invscaling", "adaptive") + and self.eta0 <= 0.0 + ): + raise ValueError("eta0 must be > 0") + if self.learning_rate == "optimal" and self.alpha == 0: + raise ValueError( + "alpha must be > 0 since " + "learning_rate is 'optimal'. alpha is used " + "to compute the optimal learning rate." + ) + if self.penalty == "elasticnet" and self.l1_ratio is None: + raise ValueError("l1_ratio must be set when penalty is 'elasticnet'") + + # raises ValueError if not registered + self._get_penalty_type(self.penalty) + self._get_learning_rate_type(self.learning_rate) + + def _get_l1_ratio(self): + if self.l1_ratio is None: + # plain_sgd expects a float. Any value is fine since at this point + # penalty can't be "elsaticnet" so l1_ratio is not used. 
+ return 0.0 + return self.l1_ratio + + def _get_loss_function(self, loss): + """Get concrete ``LossFunction`` object for str ``loss``.""" + loss_ = self.loss_functions[loss] + loss_class, args = loss_[0], loss_[1:] + if loss in ("huber", "epsilon_insensitive", "squared_epsilon_insensitive"): + args = (self.epsilon,) + return loss_class(*args) + + def _get_learning_rate_type(self, learning_rate): + return LEARNING_RATE_TYPES[learning_rate] + + def _get_penalty_type(self, penalty): + penalty = str(penalty).lower() + return PENALTY_TYPES[penalty] + + def _allocate_parameter_mem( + self, + n_classes, + n_features, + input_dtype, + coef_init=None, + intercept_init=None, + one_class=0, + ): + """Allocate mem for parameters; initialize if provided.""" + if n_classes > 2: + # allocate coef_ for multi-class + if coef_init is not None: + coef_init = np.asarray(coef_init, dtype=input_dtype, order="C") + if coef_init.shape != (n_classes, n_features): + raise ValueError("Provided ``coef_`` does not match dataset. ") + self.coef_ = coef_init + else: + self.coef_ = np.zeros( + (n_classes, n_features), dtype=input_dtype, order="C" + ) + + # allocate intercept_ for multi-class + if intercept_init is not None: + intercept_init = np.asarray( + intercept_init, order="C", dtype=input_dtype + ) + if intercept_init.shape != (n_classes,): + raise ValueError("Provided intercept_init does not match dataset.") + self.intercept_ = intercept_init + else: + self.intercept_ = np.zeros(n_classes, dtype=input_dtype, order="C") + else: + # allocate coef_ + if coef_init is not None: + coef_init = np.asarray(coef_init, dtype=input_dtype, order="C") + coef_init = coef_init.ravel() + if coef_init.shape != (n_features,): + raise ValueError("Provided coef_init does not match dataset.") + self.coef_ = coef_init + else: + self.coef_ = np.zeros(n_features, dtype=input_dtype, order="C") + + # allocate intercept_ + if intercept_init is not None: + intercept_init = np.asarray(intercept_init, dtype=input_dtype) + if intercept_init.shape != (1,) and intercept_init.shape != (): + raise ValueError("Provided intercept_init does not match dataset.") + if one_class: + self.offset_ = intercept_init.reshape( + 1, + ) + else: + self.intercept_ = intercept_init.reshape( + 1, + ) + else: + if one_class: + self.offset_ = np.zeros(1, dtype=input_dtype, order="C") + else: + self.intercept_ = np.zeros(1, dtype=input_dtype, order="C") + + # initialize average parameters + if self.average > 0: + self._standard_coef = self.coef_ + self._average_coef = np.zeros( + self.coef_.shape, dtype=input_dtype, order="C" + ) + if one_class: + self._standard_intercept = 1 - self.offset_ + else: + self._standard_intercept = self.intercept_ + + self._average_intercept = np.zeros( + self._standard_intercept.shape, dtype=input_dtype, order="C" + ) + + def _make_validation_split(self, y, sample_mask): + """Split the dataset between training set and validation set. + + Parameters + ---------- + y : ndarray of shape (n_samples, ) + Target values. + + sample_mask : ndarray of shape (n_samples, ) + A boolean array indicating whether each sample should be included + for validation set. + + Returns + ------- + validation_mask : ndarray of shape (n_samples, ) + Equal to True on the validation set, False on the training set. 
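# Self-contained sketch of the boolean validation mask described above, using
# a stratified split as the classifier branch does (regressors use a plain
# ShuffleSplit). Sizes and seed are arbitrary toy choices.
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

y = np.array([0, 0, 0, 0, 1, 1, 1, 1])
cv = StratifiedShuffleSplit(test_size=0.25, random_state=0)
idx_train, idx_val = next(cv.split(np.zeros((len(y), 1)), y))
validation_mask = np.zeros(len(y), dtype=bool)
validation_mask[idx_val] = True
print(validation_mask.sum(), "validation samples")  # 2 validation samples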
+ """ + n_samples = y.shape[0] + validation_mask = np.zeros(n_samples, dtype=np.bool_) + if not self.early_stopping: + # use the full set for training, with an empty validation set + return validation_mask + + if is_classifier(self): + splitter_type = StratifiedShuffleSplit + else: + splitter_type = ShuffleSplit + cv = splitter_type( + test_size=self.validation_fraction, random_state=self.random_state + ) + idx_train, idx_val = next(cv.split(np.zeros(shape=(y.shape[0], 1)), y)) + + if not np.any(sample_mask[idx_val]): + raise ValueError( + "The sample weights for validation set are all zero, consider using a" + " different random state." + ) + + if idx_train.shape[0] == 0 or idx_val.shape[0] == 0: + raise ValueError( + "Splitting %d samples into a train set and a validation set " + "with validation_fraction=%r led to an empty set (%d and %d " + "samples). Please either change validation_fraction, increase " + "number of samples, or disable early_stopping." + % ( + n_samples, + self.validation_fraction, + idx_train.shape[0], + idx_val.shape[0], + ) + ) + + validation_mask[idx_val] = True + return validation_mask + + def _make_validation_score_cb( + self, validation_mask, X, y, sample_weight, classes=None + ): + if not self.early_stopping: + return None + + return _ValidationScoreCallback( + self, + X[validation_mask], + y[validation_mask], + sample_weight[validation_mask], + classes=classes, + ) + + +def _prepare_fit_binary(est, y, i, input_dtype, label_encode=True): + """Initialization for fit_binary. + + Returns y, coef, intercept, average_coef, average_intercept. + """ + y_i = np.ones(y.shape, dtype=input_dtype, order="C") + if label_encode: + # y in {0, 1} + y_i[y != est.classes_[i]] = 0.0 + else: + # y in {-1, +1} + y_i[y != est.classes_[i]] = -1.0 + average_intercept = 0 + average_coef = None + + if len(est.classes_) == 2: + if not est.average: + coef = est.coef_.ravel() + intercept = est.intercept_[0] + else: + coef = est._standard_coef.ravel() + intercept = est._standard_intercept[0] + average_coef = est._average_coef.ravel() + average_intercept = est._average_intercept[0] + else: + if not est.average: + coef = est.coef_[i] + intercept = est.intercept_[i] + else: + coef = est._standard_coef[i] + intercept = est._standard_intercept[i] + average_coef = est._average_coef[i] + average_intercept = est._average_intercept[i] + + return y_i, coef, intercept, average_coef, average_intercept + + +def fit_binary( + est, + i, + X, + y, + alpha, + C, + learning_rate, + max_iter, + pos_weight, + neg_weight, + sample_weight, + validation_mask=None, + random_state=None, +): + """Fit a single binary classifier. + + The i'th class is considered the "positive" class. + + Parameters + ---------- + est : Estimator object + The estimator to fit + + i : int + Index of the positive class + + X : numpy array or sparse matrix of shape [n_samples,n_features] + Training data + + y : numpy array of shape [n_samples, ] + Target values + + alpha : float + The regularization parameter + + C : float + Maximum step size for passive aggressive + + learning_rate : str + The learning rate. Accepted values are 'constant', 'optimal', + 'invscaling', 'pa1' and 'pa2'. 
+ + max_iter : int + The maximum number of iterations (epochs) + + pos_weight : float + The weight of the positive class + + neg_weight : float + The weight of the negative class + + sample_weight : numpy array of shape [n_samples, ] + The weight of each sample + + validation_mask : numpy array of shape [n_samples, ], default=None + Precomputed validation mask in case _fit_binary is called in the + context of a one-vs-rest reduction. + + random_state : int, RandomState instance, default=None + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. + """ + # if average is not true, average_coef, and average_intercept will be + # unused + label_encode = isinstance(est._loss_function_, CyHalfBinomialLoss) + y_i, coef, intercept, average_coef, average_intercept = _prepare_fit_binary( + est, y, i, input_dtype=X.dtype, label_encode=label_encode + ) + assert y_i.shape[0] == y.shape[0] == sample_weight.shape[0] + + random_state = check_random_state(random_state) + dataset, intercept_decay = make_dataset( + X, y_i, sample_weight, random_state=random_state + ) + + penalty_type = est._get_penalty_type(est.penalty) + learning_rate_type = est._get_learning_rate_type(learning_rate) + + if validation_mask is None: + validation_mask = est._make_validation_split(y_i, sample_mask=sample_weight > 0) + classes = np.array([-1, 1], dtype=y_i.dtype) + validation_score_cb = est._make_validation_score_cb( + validation_mask, X, y_i, sample_weight, classes=classes + ) + + # numpy mtrand expects a C long which is a signed 32 bit integer under + # Windows + seed = random_state.randint(MAX_INT) + + tol = est.tol if est.tol is not None else -np.inf + + _plain_sgd = _get_plain_sgd_function(input_dtype=coef.dtype) + coef, intercept, average_coef, average_intercept, n_iter_ = _plain_sgd( + coef, + intercept, + average_coef, + average_intercept, + est._loss_function_, + penalty_type, + alpha, + C, + est._get_l1_ratio(), + dataset, + validation_mask, + est.early_stopping, + validation_score_cb, + int(est.n_iter_no_change), + max_iter, + tol, + int(est.fit_intercept), + int(est.verbose), + int(est.shuffle), + seed, + pos_weight, + neg_weight, + learning_rate_type, + est.eta0, + est.power_t, + 0, + est.t_, + intercept_decay, + est.average, + ) + + if est.average: + if len(est.classes_) == 2: + est._average_intercept[0] = average_intercept + else: + est._average_intercept[i] = average_intercept + + return coef, intercept, n_iter_ + + +def _get_plain_sgd_function(input_dtype): + return _plain_sgd32 if input_dtype == np.float32 else _plain_sgd64 + + +class BaseSGDClassifier(LinearClassifierMixin, BaseSGD, metaclass=ABCMeta): + loss_functions = { + "hinge": (Hinge, 1.0), + "squared_hinge": (SquaredHinge, 1.0), + "perceptron": (Hinge, 0.0), + "log_loss": (CyHalfBinomialLoss,), + "modified_huber": (ModifiedHuber,), + "squared_error": (CyHalfSquaredError,), + "huber": (CyHuberLoss, DEFAULT_EPSILON), + "epsilon_insensitive": (EpsilonInsensitive, DEFAULT_EPSILON), + "squared_epsilon_insensitive": (SquaredEpsilonInsensitive, DEFAULT_EPSILON), + } + + _parameter_constraints: dict = { + **BaseSGD._parameter_constraints, + "loss": [StrOptions(set(loss_functions))], + "early_stopping": ["boolean"], + "validation_fraction": [Interval(Real, 0, 1, closed="neither")], + "n_iter_no_change": [Interval(Integral, 1, None, closed="left")], + "n_jobs": [Integral, None], + 
"class_weight": [StrOptions({"balanced"}), dict, None], + } + + @abstractmethod + def __init__( + self, + loss="hinge", + *, + penalty="l2", + alpha=0.0001, + l1_ratio=0.15, + fit_intercept=True, + max_iter=1000, + tol=1e-3, + shuffle=True, + verbose=0, + epsilon=DEFAULT_EPSILON, + n_jobs=None, + random_state=None, + learning_rate="optimal", + eta0=0.0, + power_t=0.5, + early_stopping=False, + validation_fraction=0.1, + n_iter_no_change=5, + class_weight=None, + warm_start=False, + average=False, + ): + super().__init__( + loss=loss, + penalty=penalty, + alpha=alpha, + l1_ratio=l1_ratio, + fit_intercept=fit_intercept, + max_iter=max_iter, + tol=tol, + shuffle=shuffle, + verbose=verbose, + epsilon=epsilon, + random_state=random_state, + learning_rate=learning_rate, + eta0=eta0, + power_t=power_t, + early_stopping=early_stopping, + validation_fraction=validation_fraction, + n_iter_no_change=n_iter_no_change, + warm_start=warm_start, + average=average, + ) + self.class_weight = class_weight + self.n_jobs = n_jobs + + def _partial_fit( + self, + X, + y, + alpha, + C, + loss, + learning_rate, + max_iter, + classes, + sample_weight, + coef_init, + intercept_init, + ): + first_call = not hasattr(self, "classes_") + X, y = validate_data( + self, + X, + y, + accept_sparse="csr", + dtype=[np.float64, np.float32], + order="C", + accept_large_sparse=False, + reset=first_call, + ) + + n_samples, n_features = X.shape + + _check_partial_fit_first_call(self, classes) + + n_classes = self.classes_.shape[0] + + # Allocate datastructures from input arguments + self._expanded_class_weight = compute_class_weight( + self.class_weight, classes=self.classes_, y=y + ) + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + + if getattr(self, "coef_", None) is None or coef_init is not None: + self._allocate_parameter_mem( + n_classes=n_classes, + n_features=n_features, + input_dtype=X.dtype, + coef_init=coef_init, + intercept_init=intercept_init, + ) + elif n_features != self.coef_.shape[-1]: + raise ValueError( + "Number of features %d does not match previous data %d." 
+ % (n_features, self.coef_.shape[-1]) + ) + + self._loss_function_ = self._get_loss_function(loss) + if not hasattr(self, "t_"): + self.t_ = 1.0 + + # delegate to concrete training procedure + if n_classes > 2: + self._fit_multiclass( + X, + y, + alpha=alpha, + C=C, + learning_rate=learning_rate, + sample_weight=sample_weight, + max_iter=max_iter, + ) + elif n_classes == 2: + self._fit_binary( + X, + y, + alpha=alpha, + C=C, + learning_rate=learning_rate, + sample_weight=sample_weight, + max_iter=max_iter, + ) + else: + raise ValueError( + "The number of classes has to be greater than one; got %d class" + % n_classes + ) + + return self + + def _fit( + self, + X, + y, + alpha, + C, + loss, + learning_rate, + coef_init=None, + intercept_init=None, + sample_weight=None, + ): + if hasattr(self, "classes_"): + # delete the attribute otherwise _partial_fit thinks it's not the first call + delattr(self, "classes_") + + # labels can be encoded as float, int, or string literals + # np.unique sorts in asc order; largest class id is positive class + y = validate_data(self, y=y) + classes = np.unique(y) + + if self.warm_start and hasattr(self, "coef_"): + if coef_init is None: + coef_init = self.coef_ + if intercept_init is None: + intercept_init = self.intercept_ + else: + self.coef_ = None + self.intercept_ = None + + if self.average > 0: + self._standard_coef = self.coef_ + self._standard_intercept = self.intercept_ + self._average_coef = None + self._average_intercept = None + + # Clear iteration count for multiple call to fit. + self.t_ = 1.0 + + self._partial_fit( + X, + y, + alpha, + C, + loss, + learning_rate, + self.max_iter, + classes, + sample_weight, + coef_init, + intercept_init, + ) + + if ( + self.tol is not None + and self.tol > -np.inf + and self.n_iter_ == self.max_iter + ): + warnings.warn( + ( + "Maximum number of iteration reached before " + "convergence. Consider increasing max_iter to " + "improve the fit." + ), + ConvergenceWarning, + ) + return self + + def _fit_binary(self, X, y, alpha, C, sample_weight, learning_rate, max_iter): + """Fit a binary classifier on X and y.""" + coef, intercept, n_iter_ = fit_binary( + self, + 1, + X, + y, + alpha, + C, + learning_rate, + max_iter, + self._expanded_class_weight[1], + self._expanded_class_weight[0], + sample_weight, + random_state=self.random_state, + ) + + self.t_ += n_iter_ * X.shape[0] + self.n_iter_ = n_iter_ + + # need to be 2d + if self.average > 0: + if self.average <= self.t_ - 1: + self.coef_ = self._average_coef.reshape(1, -1) + self.intercept_ = self._average_intercept + else: + self.coef_ = self._standard_coef.reshape(1, -1) + self._standard_intercept = np.atleast_1d(intercept) + self.intercept_ = self._standard_intercept + else: + self.coef_ = coef.reshape(1, -1) + # intercept is a float, need to convert it to an array of length 1 + self.intercept_ = np.atleast_1d(intercept) + + def _fit_multiclass(self, X, y, alpha, C, learning_rate, sample_weight, max_iter): + """Fit a multi-class classifier by combining binary classifiers + + Each binary classifier predicts one class versus all others. This + strategy is called OvA (One versus All) or OvR (One versus Rest). + """ + # Precompute the validation split using the multiclass labels + # to ensure proper balancing of the classes. + validation_mask = self._make_validation_split(y, sample_mask=sample_weight > 0) + + # Use joblib to fit OvA in parallel. 
+ # Pick the random seed for each job outside of fit_binary to avoid + # sharing the estimator random state between threads which could lead + # to non-deterministic behavior + random_state = check_random_state(self.random_state) + seeds = random_state.randint(MAX_INT, size=len(self.classes_)) + result = Parallel( + n_jobs=self.n_jobs, verbose=self.verbose, require="sharedmem" + )( + delayed(fit_binary)( + self, + i, + X, + y, + alpha, + C, + learning_rate, + max_iter, + self._expanded_class_weight[i], + 1.0, + sample_weight, + validation_mask=validation_mask, + random_state=seed, + ) + for i, seed in enumerate(seeds) + ) + + # take the maximum of n_iter_ over every binary fit + n_iter_ = 0.0 + for i, (_, intercept, n_iter_i) in enumerate(result): + self.intercept_[i] = intercept + n_iter_ = max(n_iter_, n_iter_i) + + self.t_ += n_iter_ * X.shape[0] + self.n_iter_ = n_iter_ + + if self.average > 0: + if self.average <= self.t_ - 1.0: + self.coef_ = self._average_coef + self.intercept_ = self._average_intercept + else: + self.coef_ = self._standard_coef + self._standard_intercept = np.atleast_1d(self.intercept_) + self.intercept_ = self._standard_intercept + + @_fit_context(prefer_skip_nested_validation=True) + def partial_fit(self, X, y, classes=None, sample_weight=None): + """Perform one epoch of stochastic gradient descent on given samples. + + Internally, this method uses ``max_iter = 1``. Therefore, it is not + guaranteed that a minimum of the cost function is reached after calling + it once. Matters such as objective convergence, early stopping, and + learning rate adjustments should be handled by the user. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Subset of the training data. + + y : ndarray of shape (n_samples,) + Subset of the target values. + + classes : ndarray of shape (n_classes,), default=None + Classes across all calls to partial_fit. + Can be obtained by via `np.unique(y_all)`, where y_all is the + target vector of the entire dataset. + This argument is required for the first call to partial_fit + and can be omitted in the subsequent calls. + Note that y doesn't need to contain all labels in `classes`. + + sample_weight : array-like, shape (n_samples,), default=None + Weights applied to individual samples. + If not provided, uniform weights are assumed. + + Returns + ------- + self : object + Returns an instance of self. + """ + if not hasattr(self, "classes_"): + self._more_validate_params(for_partial_fit=True) + + if self.class_weight == "balanced": + raise ValueError( + "class_weight '{0}' is not supported for " + "partial_fit. In order to use 'balanced' weights," + " use compute_class_weight('{0}', " + "classes=classes, y=y). " + "In place of y you can use a large enough sample " + "of the full training set target to properly " + "estimate the class frequency distributions. " + "Pass the resulting weights as the class_weight " + "parameter.".format(self.class_weight) + ) + + return self._partial_fit( + X, + y, + alpha=self.alpha, + C=1.0, + loss=self.loss, + learning_rate=self.learning_rate, + max_iter=1, + classes=classes, + sample_weight=sample_weight, + coef_init=None, + intercept_init=None, + ) + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, coef_init=None, intercept_init=None, sample_weight=None): + """Fit linear model with Stochastic Gradient Descent. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Training data. 
+ + y : ndarray of shape (n_samples,) + Target values. + + coef_init : ndarray of shape (n_classes, n_features), default=None + The initial coefficients to warm-start the optimization. + + intercept_init : ndarray of shape (n_classes,), default=None + The initial intercept to warm-start the optimization. + + sample_weight : array-like, shape (n_samples,), default=None + Weights applied to individual samples. + If not provided, uniform weights are assumed. These weights will + be multiplied with class_weight (passed through the + constructor) if class_weight is specified. + + Returns + ------- + self : object + Returns an instance of self. + """ + self._more_validate_params() + + return self._fit( + X, + y, + alpha=self.alpha, + C=1.0, + loss=self.loss, + learning_rate=self.learning_rate, + coef_init=coef_init, + intercept_init=intercept_init, + sample_weight=sample_weight, + ) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + + +class SGDClassifier(BaseSGDClassifier): + """Linear classifiers (SVM, logistic regression, etc.) with SGD training. + + This estimator implements regularized linear models with stochastic + gradient descent (SGD) learning: the gradient of the loss is estimated + each sample at a time and the model is updated along the way with a + decreasing strength schedule (aka learning rate). SGD allows minibatch + (online/out-of-core) learning via the `partial_fit` method. + For best results using the default learning rate schedule, the data should + have zero mean and unit variance. + + This implementation works with data represented as dense or sparse arrays + of floating point values for the features. The model it fits can be + controlled with the loss parameter; by default, it fits a linear support + vector machine (SVM). + + The regularizer is a penalty added to the loss function that shrinks model + parameters towards the zero vector using either the squared euclidean norm + L2 or the absolute norm L1 or a combination of both (Elastic Net). If the + parameter update crosses the 0.0 value because of the regularizer, the + update is truncated to 0.0 to allow for learning sparse models and achieve + online feature selection. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + loss : {'hinge', 'log_loss', 'modified_huber', 'squared_hinge',\ + 'perceptron', 'squared_error', 'huber', 'epsilon_insensitive',\ + 'squared_epsilon_insensitive'}, default='hinge' + The loss function to be used. + + - 'hinge' gives a linear SVM. + - 'log_loss' gives logistic regression, a probabilistic classifier. + - 'modified_huber' is another smooth loss that brings tolerance to + outliers as well as probability estimates. + - 'squared_hinge' is like hinge but is quadratically penalized. + - 'perceptron' is the linear loss used by the perceptron algorithm. + - The other losses, 'squared_error', 'huber', 'epsilon_insensitive' and + 'squared_epsilon_insensitive' are designed for regression but can be useful + in classification as well; see + :class:`~sklearn.linear_model.SGDRegressor` for a description. + + More details about the losses formulas can be found in the :ref:`User Guide + ` and you can find a visualisation of the loss + functions in + :ref:`sphx_glr_auto_examples_linear_model_plot_sgd_loss_functions.py`. + + penalty : {'l2', 'l1', 'elasticnet', None}, default='l2' + The penalty (aka regularization term) to be used. Defaults to 'l2' + which is the standard regularizer for linear SVM models. 
'l1' and + 'elasticnet' might bring sparsity to the model (feature selection) + not achievable with 'l2'. No penalty is added when set to `None`. + + You can see a visualisation of the penalties in + :ref:`sphx_glr_auto_examples_linear_model_plot_sgd_penalties.py`. + + alpha : float, default=0.0001 + Constant that multiplies the regularization term. The higher the + value, the stronger the regularization. Also used to compute the + learning rate when `learning_rate` is set to 'optimal'. + Values must be in the range `[0.0, inf)`. + + l1_ratio : float, default=0.15 + The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1. + l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1. + Only used if `penalty` is 'elasticnet'. + Values must be in the range `[0.0, 1.0]` or can be `None` if + `penalty` is not `elasticnet`. + + .. versionchanged:: 1.7 + `l1_ratio` can be `None` when `penalty` is not "elasticnet". + + fit_intercept : bool, default=True + Whether the intercept should be estimated or not. If False, the + data is assumed to be already centered. + + max_iter : int, default=1000 + The maximum number of passes over the training data (aka epochs). + It only impacts the behavior in the ``fit`` method, and not the + :meth:`partial_fit` method. + Values must be in the range `[1, inf)`. + + .. versionadded:: 0.19 + + tol : float or None, default=1e-3 + The stopping criterion. If it is not None, training will stop + when (loss > best_loss - tol) for ``n_iter_no_change`` consecutive + epochs. + Convergence is checked against the training loss or the + validation loss depending on the `early_stopping` parameter. + Values must be in the range `[0.0, inf)`. + + .. versionadded:: 0.19 + + shuffle : bool, default=True + Whether or not the training data should be shuffled after each epoch. + + verbose : int, default=0 + The verbosity level. + Values must be in the range `[0, inf)`. + + epsilon : float, default=0.1 + Epsilon in the epsilon-insensitive loss functions; only if `loss` is + 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'. + For 'huber', determines the threshold at which it becomes less + important to get the prediction exactly right. + For epsilon-insensitive, any differences between the current prediction + and the correct label are ignored if they are less than this threshold. + Values must be in the range `[0.0, inf)`. + + n_jobs : int, default=None + The number of CPUs to use to do the OVA (One Versus All, for + multi-class problems) computation. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + random_state : int, RandomState instance, default=None + Used for shuffling the data, when ``shuffle`` is set to ``True``. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + Integer values must be in the range `[0, 2**32 - 1]`. + + learning_rate : str, default='optimal' + The learning rate schedule: + + - 'constant': `eta = eta0` + - 'optimal': `eta = 1.0 / (alpha * (t + t0))` + where `t0` is chosen by a heuristic proposed by Leon Bottou. + - 'invscaling': `eta = eta0 / pow(t, power_t)` + - 'adaptive': `eta = eta0`, as long as the training keeps decreasing. + Each time n_iter_no_change consecutive epochs fail to decrease the + training loss by tol or fail to increase validation score by tol if + `early_stopping` is `True`, the current learning rate is divided by 5. + + .. versionadded:: 0.20 + Added 'adaptive' option. 
+ + eta0 : float, default=0.0 + The initial learning rate for the 'constant', 'invscaling' or + 'adaptive' schedules. The default value is 0.0 as eta0 is not used by + the default schedule 'optimal'. + Values must be in the range `[0.0, inf)`. + + power_t : float, default=0.5 + The exponent for inverse scaling learning rate. + Values must be in the range `(-inf, inf)`. + + early_stopping : bool, default=False + Whether to use early stopping to terminate training when validation + score is not improving. If set to `True`, it will automatically set aside + a stratified fraction of training data as validation and terminate + training when validation score returned by the `score` method is not + improving by at least tol for n_iter_no_change consecutive epochs. + + See :ref:`sphx_glr_auto_examples_linear_model_plot_sgd_early_stopping.py` for an + example of the effects of early stopping. + + .. versionadded:: 0.20 + Added 'early_stopping' option + + validation_fraction : float, default=0.1 + The proportion of training data to set aside as validation set for + early stopping. Must be between 0 and 1. + Only used if `early_stopping` is True. + Values must be in the range `(0.0, 1.0)`. + + .. versionadded:: 0.20 + Added 'validation_fraction' option + + n_iter_no_change : int, default=5 + Number of iterations with no improvement to wait before stopping + fitting. + Convergence is checked against the training loss or the + validation loss depending on the `early_stopping` parameter. + Integer values must be in the range `[1, max_iter)`. + + .. versionadded:: 0.20 + Added 'n_iter_no_change' option + + class_weight : dict, {class_label: weight} or "balanced", default=None + Preset for the class_weight fit parameter. + + Weights associated with classes. If not given, all classes + are supposed to have weight one. + + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))``. + + warm_start : bool, default=False + When set to True, reuse the solution of the previous call to fit as + initialization, otherwise, just erase the previous solution. + See :term:`the Glossary `. + + Repeatedly calling fit or partial_fit when warm_start is True can + result in a different solution than when calling fit a single time + because of the way the data is shuffled. + If a dynamic learning rate is used, the learning rate is adapted + depending on the number of samples already seen. Calling ``fit`` resets + this counter, while ``partial_fit`` will result in increasing the + existing counter. + + average : bool or int, default=False + When set to `True`, computes the averaged SGD weights across all + updates and stores the result in the ``coef_`` attribute. If set to + an int greater than 1, averaging will begin once the total number of + samples seen reaches `average`. So ``average=10`` will begin + averaging after seeing 10 samples. + Integer values must be in the range `[1, n_samples]`. + + Attributes + ---------- + coef_ : ndarray of shape (1, n_features) if n_classes == 2 else \ + (n_classes, n_features) + Weights assigned to the features. + + intercept_ : ndarray of shape (1,) if n_classes == 2 else (n_classes,) + Constants in decision function. + + n_iter_ : int + The actual number of iterations before reaching the stopping criterion. + For multiclass fits, it is the maximum over every binary fit. 
+ + classes_ : array of shape (n_classes,) + + t_ : int + Number of weight updates performed during training. + Same as ``(n_iter_ * n_samples + 1)``. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + sklearn.svm.LinearSVC : Linear support vector classification. + LogisticRegression : Logistic regression. + Perceptron : Inherits from SGDClassifier. ``Perceptron()`` is equivalent to + ``SGDClassifier(loss="perceptron", eta0=1, learning_rate="constant", + penalty=None)``. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.linear_model import SGDClassifier + >>> from sklearn.preprocessing import StandardScaler + >>> from sklearn.pipeline import make_pipeline + >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]]) + >>> Y = np.array([1, 1, 2, 2]) + >>> # Always scale the input. The most convenient way is to use a pipeline. + >>> clf = make_pipeline(StandardScaler(), + ... SGDClassifier(max_iter=1000, tol=1e-3)) + >>> clf.fit(X, Y) + Pipeline(steps=[('standardscaler', StandardScaler()), + ('sgdclassifier', SGDClassifier())]) + >>> print(clf.predict([[-0.8, -1]])) + [1] + """ + + _parameter_constraints: dict = { + **BaseSGDClassifier._parameter_constraints, + "penalty": [StrOptions({"l2", "l1", "elasticnet"}), None], + "alpha": [Interval(Real, 0, None, closed="left")], + "l1_ratio": [Interval(Real, 0, 1, closed="both"), None], + "power_t": [Interval(Real, None, None, closed="neither")], + "epsilon": [Interval(Real, 0, None, closed="left")], + "learning_rate": [ + StrOptions({"constant", "optimal", "invscaling", "adaptive"}), + Hidden(StrOptions({"pa1", "pa2"})), + ], + "eta0": [Interval(Real, 0, None, closed="left")], + } + + def __init__( + self, + loss="hinge", + *, + penalty="l2", + alpha=0.0001, + l1_ratio=0.15, + fit_intercept=True, + max_iter=1000, + tol=1e-3, + shuffle=True, + verbose=0, + epsilon=DEFAULT_EPSILON, + n_jobs=None, + random_state=None, + learning_rate="optimal", + eta0=0.0, + power_t=0.5, + early_stopping=False, + validation_fraction=0.1, + n_iter_no_change=5, + class_weight=None, + warm_start=False, + average=False, + ): + super().__init__( + loss=loss, + penalty=penalty, + alpha=alpha, + l1_ratio=l1_ratio, + fit_intercept=fit_intercept, + max_iter=max_iter, + tol=tol, + shuffle=shuffle, + verbose=verbose, + epsilon=epsilon, + n_jobs=n_jobs, + random_state=random_state, + learning_rate=learning_rate, + eta0=eta0, + power_t=power_t, + early_stopping=early_stopping, + validation_fraction=validation_fraction, + n_iter_no_change=n_iter_no_change, + class_weight=class_weight, + warm_start=warm_start, + average=average, + ) + + def _check_proba(self): + if self.loss not in ("log_loss", "modified_huber"): + raise AttributeError( + "probability estimates are not available for loss=%r" % self.loss + ) + return True + + @available_if(_check_proba) + def predict_proba(self, X): + """Probability estimates. + + This method is only available for log loss and modified Huber loss. + + Multiclass probability estimates are derived from binary (one-vs.-rest) + estimates by simple normalization, as recommended by Zadrozny and + Elkan. + + Binary probability estimates for loss="modified_huber" are given by + (clip(decision_function(X), -1, 1) + 1) / 2. 
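# The modified_huber probability mapping quoted above, checked on a few raw
# decision values (illustrative only):
import numpy as np

scores = np.array([-3.0, -0.5, 0.0, 0.5, 3.0])
proba_positive = (np.clip(scores, -1, 1) + 1) / 2
print(proba_positive)                   # [0.   0.25 0.5  0.75 1.  ]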
For other loss functions + it is necessary to perform proper probability calibration by wrapping + the classifier with + :class:`~sklearn.calibration.CalibratedClassifierCV` instead. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Input data for prediction. + + Returns + ------- + ndarray of shape (n_samples, n_classes) + Returns the probability of the sample for each class in the model, + where classes are ordered as they are in `self.classes_`. + + References + ---------- + Zadrozny and Elkan, "Transforming classifier scores into multiclass + probability estimates", SIGKDD'02, + https://dl.acm.org/doi/pdf/10.1145/775047.775151 + + The justification for the formula in the loss="modified_huber" + case is in the appendix B in: + http://jmlr.csail.mit.edu/papers/volume2/zhang02c/zhang02c.pdf + """ + check_is_fitted(self) + + if self.loss == "log_loss": + return self._predict_proba_lr(X) + + elif self.loss == "modified_huber": + binary = len(self.classes_) == 2 + scores = self.decision_function(X) + + if binary: + prob2 = np.ones((scores.shape[0], 2)) + prob = prob2[:, 1] + else: + prob = scores + + np.clip(scores, -1, 1, prob) + prob += 1.0 + prob /= 2.0 + + if binary: + prob2[:, 0] -= prob + prob = prob2 + else: + # the above might assign zero to all classes, which doesn't + # normalize neatly; work around this to produce uniform + # probabilities + prob_sum = prob.sum(axis=1) + all_zero = prob_sum == 0 + if np.any(all_zero): + prob[all_zero, :] = 1 + prob_sum[all_zero] = len(self.classes_) + + # normalize + prob /= prob_sum.reshape((prob.shape[0], -1)) + + return prob + + else: + raise NotImplementedError( + "predict_(log_)proba only supported when" + " loss='log_loss' or loss='modified_huber' " + "(%r given)" % self.loss + ) + + @available_if(_check_proba) + def predict_log_proba(self, X): + """Log of probability estimates. + + This method is only available for log loss and modified Huber loss. + + When loss="modified_huber", probability estimates may be hard zeros + and ones, so taking the logarithm is not possible. + + See ``predict_proba`` for details. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data for prediction. + + Returns + ------- + T : array-like, shape (n_samples, n_classes) + Returns the log-probability of the sample for each class in the + model, where classes are ordered as they are in + `self.classes_`. 
+ """ + return np.log(self.predict_proba(X)) + + +class BaseSGDRegressor(RegressorMixin, BaseSGD): + loss_functions = { + "squared_error": (CyHalfSquaredError,), + "huber": (CyHuberLoss, DEFAULT_EPSILON), + "epsilon_insensitive": (EpsilonInsensitive, DEFAULT_EPSILON), + "squared_epsilon_insensitive": (SquaredEpsilonInsensitive, DEFAULT_EPSILON), + } + + _parameter_constraints: dict = { + **BaseSGD._parameter_constraints, + "loss": [StrOptions(set(loss_functions))], + "early_stopping": ["boolean"], + "validation_fraction": [Interval(Real, 0, 1, closed="neither")], + "n_iter_no_change": [Interval(Integral, 1, None, closed="left")], + } + + @abstractmethod + def __init__( + self, + loss="squared_error", + *, + penalty="l2", + alpha=0.0001, + l1_ratio=0.15, + fit_intercept=True, + max_iter=1000, + tol=1e-3, + shuffle=True, + verbose=0, + epsilon=DEFAULT_EPSILON, + random_state=None, + learning_rate="invscaling", + eta0=0.01, + power_t=0.25, + early_stopping=False, + validation_fraction=0.1, + n_iter_no_change=5, + warm_start=False, + average=False, + ): + super().__init__( + loss=loss, + penalty=penalty, + alpha=alpha, + l1_ratio=l1_ratio, + fit_intercept=fit_intercept, + max_iter=max_iter, + tol=tol, + shuffle=shuffle, + verbose=verbose, + epsilon=epsilon, + random_state=random_state, + learning_rate=learning_rate, + eta0=eta0, + power_t=power_t, + early_stopping=early_stopping, + validation_fraction=validation_fraction, + n_iter_no_change=n_iter_no_change, + warm_start=warm_start, + average=average, + ) + + def _partial_fit( + self, + X, + y, + alpha, + C, + loss, + learning_rate, + max_iter, + sample_weight, + coef_init, + intercept_init, + ): + first_call = getattr(self, "coef_", None) is None + X, y = validate_data( + self, + X, + y, + accept_sparse="csr", + copy=False, + order="C", + dtype=[np.float64, np.float32], + accept_large_sparse=False, + reset=first_call, + ) + y = y.astype(X.dtype, copy=False) + + n_samples, n_features = X.shape + + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + + # Allocate datastructures from input arguments + if first_call: + self._allocate_parameter_mem( + n_classes=1, + n_features=n_features, + input_dtype=X.dtype, + coef_init=coef_init, + intercept_init=intercept_init, + ) + if self.average > 0 and getattr(self, "_average_coef", None) is None: + self._average_coef = np.zeros(n_features, dtype=X.dtype, order="C") + self._average_intercept = np.zeros(1, dtype=X.dtype, order="C") + + self._fit_regressor( + X, y, alpha, C, loss, learning_rate, sample_weight, max_iter + ) + + return self + + @_fit_context(prefer_skip_nested_validation=True) + def partial_fit(self, X, y, sample_weight=None): + """Perform one epoch of stochastic gradient descent on given samples. + + Internally, this method uses ``max_iter = 1``. Therefore, it is not + guaranteed that a minimum of the cost function is reached after calling + it once. Matters such as objective convergence and early stopping + should be handled by the user. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Subset of training data. + + y : numpy array of shape (n_samples,) + Subset of target values. + + sample_weight : array-like, shape (n_samples,), default=None + Weights applied to individual samples. + If not provided, uniform weights are assumed. + + Returns + ------- + self : object + Returns an instance of self. 
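A sketch of the usage pattern this implies (toy data; the batch size and number of passes are arbitrary choices): the caller drives the epochs and decides when to stop, since each partial_fit call performs a single pass over the given samples.

import numpy as np
from sklearn.linear_model import SGDRegressor

rng = np.random.RandomState(0)
X = rng.randn(1000, 5)
y = X @ np.array([1.0, -2.0, 0.5, 0.0, 3.0]) + 0.1 * rng.randn(1000)

reg = SGDRegressor(random_state=0)
for epoch in range(20):                          # caller-controlled epochs
    for batch in np.array_split(np.arange(len(X)), 10):
        reg.partial_fit(X[batch], y[batch])      # one SGD pass per call
print(reg.coef_)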
+ """ + if not hasattr(self, "coef_"): + self._more_validate_params(for_partial_fit=True) + + return self._partial_fit( + X, + y, + self.alpha, + C=1.0, + loss=self.loss, + learning_rate=self.learning_rate, + max_iter=1, + sample_weight=sample_weight, + coef_init=None, + intercept_init=None, + ) + + def _fit( + self, + X, + y, + alpha, + C, + loss, + learning_rate, + coef_init=None, + intercept_init=None, + sample_weight=None, + ): + if self.warm_start and getattr(self, "coef_", None) is not None: + if coef_init is None: + coef_init = self.coef_ + if intercept_init is None: + intercept_init = self.intercept_ + else: + self.coef_ = None + self.intercept_ = None + + # Clear iteration count for multiple call to fit. + self.t_ = 1.0 + + self._partial_fit( + X, + y, + alpha, + C, + loss, + learning_rate, + self.max_iter, + sample_weight, + coef_init, + intercept_init, + ) + + if ( + self.tol is not None + and self.tol > -np.inf + and self.n_iter_ == self.max_iter + ): + warnings.warn( + ( + "Maximum number of iteration reached before " + "convergence. Consider increasing max_iter to " + "improve the fit." + ), + ConvergenceWarning, + ) + + return self + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, coef_init=None, intercept_init=None, sample_weight=None): + """Fit linear model with Stochastic Gradient Descent. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Training data. + + y : ndarray of shape (n_samples,) + Target values. + + coef_init : ndarray of shape (n_features,), default=None + The initial coefficients to warm-start the optimization. + + intercept_init : ndarray of shape (1,), default=None + The initial intercept to warm-start the optimization. + + sample_weight : array-like, shape (n_samples,), default=None + Weights applied to individual samples (1. for unweighted). + + Returns + ------- + self : object + Fitted `SGDRegressor` estimator. + """ + self._more_validate_params() + + return self._fit( + X, + y, + alpha=self.alpha, + C=1.0, + loss=self.loss, + learning_rate=self.learning_rate, + coef_init=coef_init, + intercept_init=intercept_init, + sample_weight=sample_weight, + ) + + def _decision_function(self, X): + """Predict using the linear model + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + + Returns + ------- + ndarray of shape (n_samples,) + Predicted target values per element in X. + """ + check_is_fitted(self) + + X = validate_data(self, X, accept_sparse="csr", reset=False) + + scores = safe_sparse_dot(X, self.coef_.T, dense_output=True) + self.intercept_ + return scores.ravel() + + def predict(self, X): + """Predict using the linear model. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Input data. + + Returns + ------- + ndarray of shape (n_samples,) + Predicted target values per element in X. 
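An illustrative check (toy data) that the prediction is the linear decision function described above, i.e. X @ coef_ + intercept_:

import numpy as np
from sklearn.linear_model import SGDRegressor

rng = np.random.RandomState(0)
X = rng.randn(50, 3)
y = X @ np.array([2.0, -1.0, 0.5])

reg = SGDRegressor(max_iter=1000, tol=1e-3, random_state=0).fit(X, y)
assert np.allclose(reg.predict(X), X @ reg.coef_ + reg.intercept_)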
+ """ + return self._decision_function(X) + + def _fit_regressor( + self, X, y, alpha, C, loss, learning_rate, sample_weight, max_iter + ): + loss_function = self._get_loss_function(loss) + penalty_type = self._get_penalty_type(self.penalty) + learning_rate_type = self._get_learning_rate_type(learning_rate) + + if not hasattr(self, "t_"): + self.t_ = 1.0 + + validation_mask = self._make_validation_split(y, sample_mask=sample_weight > 0) + validation_score_cb = self._make_validation_score_cb( + validation_mask, X, y, sample_weight + ) + + random_state = check_random_state(self.random_state) + # numpy mtrand expects a C long which is a signed 32 bit integer under + # Windows + seed = random_state.randint(0, MAX_INT) + + dataset, intercept_decay = make_dataset( + X, y, sample_weight, random_state=random_state + ) + + tol = self.tol if self.tol is not None else -np.inf + + if self.average: + coef = self._standard_coef + intercept = self._standard_intercept + average_coef = self._average_coef + average_intercept = self._average_intercept + else: + coef = self.coef_ + intercept = self.intercept_ + average_coef = None # Not used + average_intercept = [0] # Not used + + _plain_sgd = _get_plain_sgd_function(input_dtype=coef.dtype) + coef, intercept, average_coef, average_intercept, self.n_iter_ = _plain_sgd( + coef, + intercept[0], + average_coef, + average_intercept[0], + loss_function, + penalty_type, + alpha, + C, + self._get_l1_ratio(), + dataset, + validation_mask, + self.early_stopping, + validation_score_cb, + int(self.n_iter_no_change), + max_iter, + tol, + int(self.fit_intercept), + int(self.verbose), + int(self.shuffle), + seed, + 1.0, + 1.0, + learning_rate_type, + self.eta0, + self.power_t, + 0, + self.t_, + intercept_decay, + self.average, + ) + + self.t_ += self.n_iter_ * X.shape[0] + + if self.average > 0: + self._average_intercept = np.atleast_1d(average_intercept) + self._standard_intercept = np.atleast_1d(intercept) + + if self.average <= self.t_ - 1.0: + # made enough updates for averaging to be taken into account + self.coef_ = average_coef + self.intercept_ = np.atleast_1d(average_intercept) + else: + self.coef_ = coef + self.intercept_ = np.atleast_1d(intercept) + + else: + self.intercept_ = np.atleast_1d(intercept) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + + +class SGDRegressor(BaseSGDRegressor): + """Linear model fitted by minimizing a regularized empirical loss with SGD. + + SGD stands for Stochastic Gradient Descent: the gradient of the loss is + estimated each sample at a time and the model is updated along the way with + a decreasing strength schedule (aka learning rate). + + The regularizer is a penalty added to the loss function that shrinks model + parameters towards the zero vector using either the squared euclidean norm + L2 or the absolute norm L1 or a combination of both (Elastic Net). If the + parameter update crosses the 0.0 value because of the regularizer, the + update is truncated to 0.0 to allow for learning sparse models and achieve + online feature selection. + + This implementation works with data represented as dense numpy arrays of + floating point values for the features. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + loss : str, default='squared_error' + The loss function to be used. The possible values are 'squared_error', + 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive' + + The 'squared_error' refers to the ordinary least squares fit. 
+ 'huber' modifies 'squared_error' to focus less on getting outliers + correct by switching from squared to linear loss past a distance of + epsilon. 'epsilon_insensitive' ignores errors less than epsilon and is + linear past that; this is the loss function used in SVR. + 'squared_epsilon_insensitive' is the same but becomes squared loss past + a tolerance of epsilon. + + More details about the losses formulas can be found in the + :ref:`User Guide `. + + penalty : {'l2', 'l1', 'elasticnet', None}, default='l2' + The penalty (aka regularization term) to be used. Defaults to 'l2' + which is the standard regularizer for linear SVM models. 'l1' and + 'elasticnet' might bring sparsity to the model (feature selection) + not achievable with 'l2'. No penalty is added when set to `None`. + + You can see a visualisation of the penalties in + :ref:`sphx_glr_auto_examples_linear_model_plot_sgd_penalties.py`. + + alpha : float, default=0.0001 + Constant that multiplies the regularization term. The higher the + value, the stronger the regularization. Also used to compute the + learning rate when `learning_rate` is set to 'optimal'. + Values must be in the range `[0.0, inf)`. + + l1_ratio : float, default=0.15 + The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1. + l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1. + Only used if `penalty` is 'elasticnet'. + Values must be in the range `[0.0, 1.0]` or can be `None` if + `penalty` is not `elasticnet`. + + .. versionchanged:: 1.7 + `l1_ratio` can be `None` when `penalty` is not "elasticnet". + + fit_intercept : bool, default=True + Whether the intercept should be estimated or not. If False, the + data is assumed to be already centered. + + max_iter : int, default=1000 + The maximum number of passes over the training data (aka epochs). + It only impacts the behavior in the ``fit`` method, and not the + :meth:`partial_fit` method. + Values must be in the range `[1, inf)`. + + .. versionadded:: 0.19 + + tol : float or None, default=1e-3 + The stopping criterion. If it is not None, training will stop + when (loss > best_loss - tol) for ``n_iter_no_change`` consecutive + epochs. + Convergence is checked against the training loss or the + validation loss depending on the `early_stopping` parameter. + Values must be in the range `[0.0, inf)`. + + .. versionadded:: 0.19 + + shuffle : bool, default=True + Whether or not the training data should be shuffled after each epoch. + + verbose : int, default=0 + The verbosity level. + Values must be in the range `[0, inf)`. + + epsilon : float, default=0.1 + Epsilon in the epsilon-insensitive loss functions; only if `loss` is + 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'. + For 'huber', determines the threshold at which it becomes less + important to get the prediction exactly right. + For epsilon-insensitive, any differences between the current prediction + and the correct label are ignored if they are less than this threshold. + Values must be in the range `[0.0, inf)`. + + random_state : int, RandomState instance, default=None + Used for shuffling the data, when ``shuffle`` is set to ``True``. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + learning_rate : str, default='invscaling' + The learning rate schedule: + + - 'constant': `eta = eta0` + - 'optimal': `eta = 1.0 / (alpha * (t + t0))` + where t0 is chosen by a heuristic proposed by Leon Bottou. 
+ - 'invscaling': `eta = eta0 / pow(t, power_t)` + - 'adaptive': eta = eta0, as long as the training keeps decreasing. + Each time n_iter_no_change consecutive epochs fail to decrease the + training loss by tol or fail to increase validation score by tol if + early_stopping is True, the current learning rate is divided by 5. + + .. versionadded:: 0.20 + Added 'adaptive' option. + + eta0 : float, default=0.01 + The initial learning rate for the 'constant', 'invscaling' or + 'adaptive' schedules. The default value is 0.01. + Values must be in the range `[0.0, inf)`. + + power_t : float, default=0.25 + The exponent for inverse scaling learning rate. + Values must be in the range `(-inf, inf)`. + + early_stopping : bool, default=False + Whether to use early stopping to terminate training when validation + score is not improving. If set to True, it will automatically set aside + a fraction of training data as validation and terminate + training when validation score returned by the `score` method is not + improving by at least `tol` for `n_iter_no_change` consecutive + epochs. + + See :ref:`sphx_glr_auto_examples_linear_model_plot_sgd_early_stopping.py` for an + example of the effects of early stopping. + + .. versionadded:: 0.20 + Added 'early_stopping' option + + validation_fraction : float, default=0.1 + The proportion of training data to set aside as validation set for + early stopping. Must be between 0 and 1. + Only used if `early_stopping` is True. + Values must be in the range `(0.0, 1.0)`. + + .. versionadded:: 0.20 + Added 'validation_fraction' option + + n_iter_no_change : int, default=5 + Number of iterations with no improvement to wait before stopping + fitting. + Convergence is checked against the training loss or the + validation loss depending on the `early_stopping` parameter. + Integer values must be in the range `[1, max_iter)`. + + .. versionadded:: 0.20 + Added 'n_iter_no_change' option + + warm_start : bool, default=False + When set to True, reuse the solution of the previous call to fit as + initialization, otherwise, just erase the previous solution. + See :term:`the Glossary `. + + Repeatedly calling fit or partial_fit when warm_start is True can + result in a different solution than when calling fit a single time + because of the way the data is shuffled. + If a dynamic learning rate is used, the learning rate is adapted + depending on the number of samples already seen. Calling ``fit`` resets + this counter, while ``partial_fit`` will result in increasing the + existing counter. + + average : bool or int, default=False + When set to True, computes the averaged SGD weights across all + updates and stores the result in the ``coef_`` attribute. If set to + an int greater than 1, averaging will begin once the total number of + samples seen reaches `average`. So ``average=10`` will begin + averaging after seeing 10 samples. + + Attributes + ---------- + coef_ : ndarray of shape (n_features,) + Weights assigned to the features. + + intercept_ : ndarray of shape (1,) + The intercept term. + + n_iter_ : int + The actual number of iterations before reaching the stopping criterion. + + t_ : int + Number of weight updates performed during training. + Same as ``(n_iter_ * n_samples + 1)``. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. 
versionadded:: 1.0 + + See Also + -------- + HuberRegressor : Linear regression model that is robust to outliers. + Lars : Least Angle Regression model. + Lasso : Linear Model trained with L1 prior as regularizer. + RANSACRegressor : RANSAC (RANdom SAmple Consensus) algorithm. + Ridge : Linear least squares with l2 regularization. + sklearn.svm.SVR : Epsilon-Support Vector Regression. + TheilSenRegressor : Theil-Sen Estimator robust multivariate regression model. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.linear_model import SGDRegressor + >>> from sklearn.pipeline import make_pipeline + >>> from sklearn.preprocessing import StandardScaler + >>> n_samples, n_features = 10, 5 + >>> rng = np.random.RandomState(0) + >>> y = rng.randn(n_samples) + >>> X = rng.randn(n_samples, n_features) + >>> # Always scale the input. The most convenient way is to use a pipeline. + >>> reg = make_pipeline(StandardScaler(), + ... SGDRegressor(max_iter=1000, tol=1e-3)) + >>> reg.fit(X, y) + Pipeline(steps=[('standardscaler', StandardScaler()), + ('sgdregressor', SGDRegressor())]) + """ + + _parameter_constraints: dict = { + **BaseSGDRegressor._parameter_constraints, + "penalty": [StrOptions({"l2", "l1", "elasticnet"}), None], + "alpha": [Interval(Real, 0, None, closed="left")], + "l1_ratio": [Interval(Real, 0, 1, closed="both"), None], + "power_t": [Interval(Real, None, None, closed="neither")], + "learning_rate": [ + StrOptions({"constant", "optimal", "invscaling", "adaptive"}), + Hidden(StrOptions({"pa1", "pa2"})), + ], + "epsilon": [Interval(Real, 0, None, closed="left")], + "eta0": [Interval(Real, 0, None, closed="left")], + } + + def __init__( + self, + loss="squared_error", + *, + penalty="l2", + alpha=0.0001, + l1_ratio=0.15, + fit_intercept=True, + max_iter=1000, + tol=1e-3, + shuffle=True, + verbose=0, + epsilon=DEFAULT_EPSILON, + random_state=None, + learning_rate="invscaling", + eta0=0.01, + power_t=0.25, + early_stopping=False, + validation_fraction=0.1, + n_iter_no_change=5, + warm_start=False, + average=False, + ): + super().__init__( + loss=loss, + penalty=penalty, + alpha=alpha, + l1_ratio=l1_ratio, + fit_intercept=fit_intercept, + max_iter=max_iter, + tol=tol, + shuffle=shuffle, + verbose=verbose, + epsilon=epsilon, + random_state=random_state, + learning_rate=learning_rate, + eta0=eta0, + power_t=power_t, + early_stopping=early_stopping, + validation_fraction=validation_fraction, + n_iter_no_change=n_iter_no_change, + warm_start=warm_start, + average=average, + ) + + +class SGDOneClassSVM(OutlierMixin, BaseSGD): + """Solves linear One-Class SVM using Stochastic Gradient Descent. + + This implementation is meant to be used with a kernel approximation + technique (e.g. `sklearn.kernel_approximation.Nystroem`) to obtain results + similar to `sklearn.svm.OneClassSVM` which uses a Gaussian kernel by + default. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.0 + + Parameters + ---------- + nu : float, default=0.5 + The nu parameter of the One Class SVM: an upper bound on the + fraction of training errors and a lower bound of the fraction of + support vectors. Should be in the interval (0, 1]. By default 0.5 + will be taken. + + fit_intercept : bool, default=True + Whether the intercept should be estimated or not. Defaults to True. + + max_iter : int, default=1000 + The maximum number of passes over the training data (aka epochs). + It only impacts the behavior in the ``fit`` method, and not the + `partial_fit`. Defaults to 1000. 
+ Values must be in the range `[1, inf)`. + + tol : float or None, default=1e-3 + The stopping criterion. If it is not None, the iterations will stop + when (loss > previous_loss - tol). Defaults to 1e-3. + Values must be in the range `[0.0, inf)`. + + shuffle : bool, default=True + Whether or not the training data should be shuffled after each epoch. + Defaults to True. + + verbose : int, default=0 + The verbosity level. + + random_state : int, RandomState instance or None, default=None + The seed of the pseudo random number generator to use when shuffling + the data. If int, random_state is the seed used by the random number + generator; If RandomState instance, random_state is the random number + generator; If None, the random number generator is the RandomState + instance used by `np.random`. + + learning_rate : {'constant', 'optimal', 'invscaling', 'adaptive'}, default='optimal' + The learning rate schedule to use with `fit`. (If using `partial_fit`, + learning rate must be controlled directly). + + - 'constant': `eta = eta0` + - 'optimal': `eta = 1.0 / (alpha * (t + t0))` + where t0 is chosen by a heuristic proposed by Leon Bottou. + - 'invscaling': `eta = eta0 / pow(t, power_t)` + - 'adaptive': eta = eta0, as long as the training keeps decreasing. + Each time n_iter_no_change consecutive epochs fail to decrease the + training loss by tol or fail to increase validation score by tol if + early_stopping is True, the current learning rate is divided by 5. + + eta0 : float, default=0.0 + The initial learning rate for the 'constant', 'invscaling' or + 'adaptive' schedules. The default value is 0.0 as eta0 is not used by + the default schedule 'optimal'. + Values must be in the range `[0.0, inf)`. + + power_t : float, default=0.5 + The exponent for inverse scaling learning rate. + Values must be in the range `(-inf, inf)`. + + warm_start : bool, default=False + When set to True, reuse the solution of the previous call to fit as + initialization, otherwise, just erase the previous solution. + See :term:`the Glossary `. + + Repeatedly calling fit or partial_fit when warm_start is True can + result in a different solution than when calling fit a single time + because of the way the data is shuffled. + If a dynamic learning rate is used, the learning rate is adapted + depending on the number of samples already seen. Calling ``fit`` resets + this counter, while ``partial_fit`` will result in increasing the + existing counter. + + average : bool or int, default=False + When set to True, computes the averaged SGD weights and stores the + result in the ``coef_`` attribute. If set to an int greater than 1, + averaging will begin once the total number of samples seen reaches + average. So ``average=10`` will begin averaging after seeing 10 + samples. + + Attributes + ---------- + coef_ : ndarray of shape (1, n_features) + Weights assigned to the features. + + offset_ : ndarray of shape (1,) + Offset used to define the decision function from the raw scores. + We have the relation: decision_function = score_samples - offset. + + n_iter_ : int + The actual number of iterations to reach the stopping criterion. + + t_ : int + Number of weight updates performed during training. + Same as ``(n_iter_ * n_samples + 1)``. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. 
versionadded:: 1.0 + + See Also + -------- + sklearn.svm.OneClassSVM : Unsupervised Outlier Detection. + + Notes + ----- + This estimator has a linear complexity in the number of training samples + and is thus better suited than the `sklearn.svm.OneClassSVM` + implementation for datasets with a large number of training samples (say + > 10,000). + + Examples + -------- + >>> import numpy as np + >>> from sklearn import linear_model + >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]]) + >>> clf = linear_model.SGDOneClassSVM(random_state=42) + >>> clf.fit(X) + SGDOneClassSVM(random_state=42) + + >>> print(clf.predict([[4, 4]])) + [1] + """ + + loss_functions = {"hinge": (Hinge, 1.0)} + + _parameter_constraints: dict = { + **BaseSGD._parameter_constraints, + "nu": [Interval(Real, 0.0, 1.0, closed="right")], + "learning_rate": [ + StrOptions({"constant", "optimal", "invscaling", "adaptive"}), + Hidden(StrOptions({"pa1", "pa2"})), + ], + "eta0": [Interval(Real, 0, None, closed="left")], + "power_t": [Interval(Real, None, None, closed="neither")], + } + + def __init__( + self, + nu=0.5, + fit_intercept=True, + max_iter=1000, + tol=1e-3, + shuffle=True, + verbose=0, + random_state=None, + learning_rate="optimal", + eta0=0.0, + power_t=0.5, + warm_start=False, + average=False, + ): + self.nu = nu + super().__init__( + loss="hinge", + penalty="l2", + C=1.0, + l1_ratio=0, + fit_intercept=fit_intercept, + max_iter=max_iter, + tol=tol, + shuffle=shuffle, + verbose=verbose, + epsilon=DEFAULT_EPSILON, + random_state=random_state, + learning_rate=learning_rate, + eta0=eta0, + power_t=power_t, + early_stopping=False, + validation_fraction=0.1, + n_iter_no_change=5, + warm_start=warm_start, + average=average, + ) + + def _fit_one_class(self, X, alpha, C, sample_weight, learning_rate, max_iter): + """Uses SGD implementation with X and y=np.ones(n_samples).""" + + # The One-Class SVM uses the SGD implementation with + # y=np.ones(n_samples). + n_samples = X.shape[0] + y = np.ones(n_samples, dtype=X.dtype, order="C") + + dataset, offset_decay = make_dataset(X, y, sample_weight) + + penalty_type = self._get_penalty_type(self.penalty) + learning_rate_type = self._get_learning_rate_type(learning_rate) + + # early stopping is set to False for the One-Class SVM. thus + # validation_mask and validation_score_cb will be set to values + # associated to early_stopping=False in _make_validation_split and + # _make_validation_score_cb respectively. + validation_mask = self._make_validation_split(y, sample_mask=sample_weight > 0) + validation_score_cb = self._make_validation_score_cb( + validation_mask, X, y, sample_weight + ) + + random_state = check_random_state(self.random_state) + # numpy mtrand expects a C long which is a signed 32 bit integer under + # Windows + seed = random_state.randint(0, np.iinfo(np.int32).max) + + tol = self.tol if self.tol is not None else -np.inf + + one_class = 1 + # There are no class weights for the One-Class SVM and they are + # therefore set to 1. 
+ pos_weight = 1 + neg_weight = 1 + + if self.average: + coef = self._standard_coef + intercept = self._standard_intercept + average_coef = self._average_coef + average_intercept = self._average_intercept + else: + coef = self.coef_ + intercept = 1 - self.offset_ + average_coef = None # Not used + average_intercept = [0] # Not used + + _plain_sgd = _get_plain_sgd_function(input_dtype=coef.dtype) + coef, intercept, average_coef, average_intercept, self.n_iter_ = _plain_sgd( + coef, + intercept[0], + average_coef, + average_intercept[0], + self._loss_function_, + penalty_type, + alpha, + C, + self.l1_ratio, + dataset, + validation_mask, + self.early_stopping, + validation_score_cb, + int(self.n_iter_no_change), + max_iter, + tol, + int(self.fit_intercept), + int(self.verbose), + int(self.shuffle), + seed, + neg_weight, + pos_weight, + learning_rate_type, + self.eta0, + self.power_t, + one_class, + self.t_, + offset_decay, + self.average, + ) + + self.t_ += self.n_iter_ * n_samples + + if self.average > 0: + self._average_intercept = np.atleast_1d(average_intercept) + self._standard_intercept = np.atleast_1d(intercept) + + if self.average <= self.t_ - 1.0: + # made enough updates for averaging to be taken into account + self.coef_ = average_coef + self.offset_ = 1 - np.atleast_1d(average_intercept) + else: + self.coef_ = coef + self.offset_ = 1 - np.atleast_1d(intercept) + + else: + self.offset_ = 1 - np.atleast_1d(intercept) + + def _partial_fit( + self, + X, + alpha, + C, + loss, + learning_rate, + max_iter, + sample_weight, + coef_init, + offset_init, + ): + first_call = getattr(self, "coef_", None) is None + X = validate_data( + self, + X, + None, + accept_sparse="csr", + dtype=[np.float64, np.float32], + order="C", + accept_large_sparse=False, + reset=first_call, + ) + + n_features = X.shape[1] + + # Allocate datastructures from input arguments + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + + # We use intercept = 1 - offset where intercept is the intercept of + # the SGD implementation and offset is the offset of the One-Class SVM + # optimization problem. + if getattr(self, "coef_", None) is None or coef_init is not None: + self._allocate_parameter_mem( + n_classes=1, + n_features=n_features, + input_dtype=X.dtype, + coef_init=coef_init, + intercept_init=offset_init, + one_class=1, + ) + elif n_features != self.coef_.shape[-1]: + raise ValueError( + "Number of features %d does not match previous data %d." + % (n_features, self.coef_.shape[-1]) + ) + + if self.average and getattr(self, "_average_coef", None) is None: + self._average_coef = np.zeros(n_features, dtype=X.dtype, order="C") + self._average_intercept = np.zeros(1, dtype=X.dtype, order="C") + + self._loss_function_ = self._get_loss_function(loss) + if not hasattr(self, "t_"): + self.t_ = 1.0 + + # delegate to concrete training procedure + self._fit_one_class( + X, + alpha=alpha, + C=C, + learning_rate=learning_rate, + sample_weight=sample_weight, + max_iter=max_iter, + ) + + return self + + @_fit_context(prefer_skip_nested_validation=True) + def partial_fit(self, X, y=None, sample_weight=None): + """Fit linear One-Class SVM with Stochastic Gradient Descent. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Subset of the training data. + y : Ignored + Not used, present for API consistency by convention. + + sample_weight : array-like, shape (n_samples,), optional + Weights applied to individual samples. + If not provided, uniform weights are assumed. 
+
+        Returns
+        -------
+        self : object
+            Returns a fitted instance of self.
+        """
+        if not hasattr(self, "coef_"):
+            self._more_validate_params(for_partial_fit=True)
+
+        alpha = self.nu / 2
+        return self._partial_fit(
+            X,
+            alpha,
+            C=1.0,
+            loss=self.loss,
+            learning_rate=self.learning_rate,
+            max_iter=1,
+            sample_weight=sample_weight,
+            coef_init=None,
+            offset_init=None,
+        )
+
+    def _fit(
+        self,
+        X,
+        alpha,
+        C,
+        loss,
+        learning_rate,
+        coef_init=None,
+        offset_init=None,
+        sample_weight=None,
+    ):
+        if self.warm_start and hasattr(self, "coef_"):
+            if coef_init is None:
+                coef_init = self.coef_
+            if offset_init is None:
+                offset_init = self.offset_
+        else:
+            self.coef_ = None
+            self.offset_ = None
+
+        # Clear iteration count for multiple call to fit.
+        self.t_ = 1.0
+
+        self._partial_fit(
+            X,
+            alpha,
+            C,
+            loss,
+            learning_rate,
+            self.max_iter,
+            sample_weight,
+            coef_init,
+            offset_init,
+        )
+
+        if (
+            self.tol is not None
+            and self.tol > -np.inf
+            and self.n_iter_ == self.max_iter
+        ):
+            warnings.warn(
+                (
+                    "Maximum number of iteration reached before "
+                    "convergence. Consider increasing max_iter to "
+                    "improve the fit."
+                ),
+                ConvergenceWarning,
+            )
+
+        return self
+
+    @_fit_context(prefer_skip_nested_validation=True)
+    def fit(self, X, y=None, coef_init=None, offset_init=None, sample_weight=None):
+        """Fit linear One-Class SVM with Stochastic Gradient Descent.
+
+        This solves an equivalent optimization problem of the
+        One-Class SVM primal optimization problem and returns a weight vector
+        w and an offset rho such that the decision function is given by
+        <w, x> - rho.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape (n_samples, n_features)
+            Training data.
+        y : Ignored
+            Not used, present for API consistency by convention.
+
+        coef_init : array, shape (n_classes, n_features)
+            The initial coefficients to warm-start the optimization.
+
+        offset_init : array, shape (n_classes,)
+            The initial offset to warm-start the optimization.
+
+        sample_weight : array-like, shape (n_samples,), optional
+            Weights applied to individual samples.
+            If not provided, uniform weights are assumed. These weights will
+            be multiplied with class_weight (passed through the
+            constructor) if class_weight is specified.
+
+        Returns
+        -------
+        self : object
+            Returns a fitted instance of self.
+        """
+        self._more_validate_params()
+
+        alpha = self.nu / 2
+        self._fit(
+            X,
+            alpha=alpha,
+            C=1.0,
+            loss=self.loss,
+            learning_rate=self.learning_rate,
+            coef_init=coef_init,
+            offset_init=offset_init,
+            sample_weight=sample_weight,
+        )
+
+        return self
+
+    def decision_function(self, X):
+        """Signed distance to the separating hyperplane.
+
+        Signed distance is positive for an inlier and negative for an
+        outlier.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape (n_samples, n_features)
+            Testing data.
+
+        Returns
+        -------
+        dec : array-like, shape (n_samples,)
+            Decision function values of the samples.
+        """
+
+        check_is_fitted(self, "coef_")
+
+        X = validate_data(self, X, accept_sparse="csr", reset=False)
+        decisions = safe_sparse_dot(X, self.coef_.T, dense_output=True) - self.offset_
+
+        return decisions.ravel()
+
+    def score_samples(self, X):
+        """Raw scoring function of the samples.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape (n_samples, n_features)
+            Testing data.
+
+        Returns
+        -------
+        score_samples : array-like, shape (n_samples,)
+            Unshifted scoring function values of the samples.
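A small illustrative check (toy data) of the relations stated in these docstrings: score_samples is the decision function shifted back by offset_, and predict is the sign of the decision function.

import numpy as np
from sklearn.linear_model import SGDOneClassSVM

X = np.array([[-1.0, -1.0], [-2.0, -1.0], [1.0, 1.0], [2.0, 1.0]])
clf = SGDOneClassSVM(random_state=42).fit(X)

# decision_function = score_samples - offset_, so score_samples = decision_function + offset_
assert np.allclose(clf.score_samples(X), clf.decision_function(X) + clf.offset_)
# predict() returns +1 for inliers (non-negative decision value) and -1 for outliers.
assert np.array_equal(clf.predict(X), np.where(clf.decision_function(X) >= 0, 1, -1))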
+ """ + score_samples = self.decision_function(X) + self.offset_ + return score_samples + + def predict(self, X): + """Return labels (1 inlier, -1 outlier) of the samples. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Testing data. + + Returns + ------- + y : array, shape (n_samples,) + Labels of the samples. + """ + y = (self.decision_function(X) >= 0).astype(np.int32) + y[y == 0] = -1 # for consistency with outlier detectors + return y + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_theil_sen.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_theil_sen.py new file mode 100644 index 0000000000000000000000000000000000000000..4b25145a8ca55efe3f99e80f24a8da6e4b1a9f50 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_theil_sen.py @@ -0,0 +1,467 @@ +""" +A Theil-Sen Estimator for Multiple Linear Regression Model +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from itertools import combinations +from numbers import Integral, Real + +import numpy as np +from joblib import effective_n_jobs +from scipy import linalg +from scipy.linalg.lapack import get_lapack_funcs +from scipy.special import binom + +from ..base import RegressorMixin, _fit_context +from ..exceptions import ConvergenceWarning +from ..utils import check_random_state +from ..utils._param_validation import Hidden, Interval, StrOptions +from ..utils.parallel import Parallel, delayed +from ..utils.validation import validate_data +from ._base import LinearModel + +_EPSILON = np.finfo(np.double).eps + + +def _modified_weiszfeld_step(X, x_old): + """Modified Weiszfeld step. + + This function defines one iteration step in order to approximate the + spatial median (L1 median). It is a form of an iteratively re-weighted + least squares method. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the number of features. + + x_old : ndarray of shape = (n_features,) + Current start vector. + + Returns + ------- + x_new : ndarray of shape (n_features,) + New iteration step. + + References + ---------- + - On Computation of Spatial Median for Robust Data Mining, 2005 + T. Kärkkäinen and S. Äyrämö + http://users.jyu.fi/~samiayr/pdf/ayramo_eurogen05.pdf + """ + diff = X - x_old + diff_norm = np.sqrt(np.sum(diff**2, axis=1)) + mask = diff_norm >= _EPSILON + # x_old equals one of our samples + is_x_old_in_X = int(mask.sum() < X.shape[0]) + + diff = diff[mask] + diff_norm = diff_norm[mask][:, np.newaxis] + quotient_norm = linalg.norm(np.sum(diff / diff_norm, axis=0)) + + if quotient_norm > _EPSILON: # to avoid division by zero + new_direction = np.sum(X[mask, :] / diff_norm, axis=0) / np.sum( + 1 / diff_norm, axis=0 + ) + else: + new_direction = 1.0 + quotient_norm = 1.0 + + return ( + max(0.0, 1.0 - is_x_old_in_X / quotient_norm) * new_direction + + min(1.0, is_x_old_in_X / quotient_norm) * x_old + ) + + +def _spatial_median(X, max_iter=300, tol=1.0e-3): + """Spatial median (L1 median). + + The spatial median is member of a class of so-called M-estimators which + are defined by an optimization problem. Given a number of p points in an + n-dimensional space, the point x minimizing the sum of all distances to the + p other points is called spatial median. 
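As a conceptual sketch of the spatial median (a plain Weiszfeld iteration on toy data, not the modified step implemented in this module):

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(200, 2) + [3.0, -1.0]

x = X.mean(axis=0)                       # start from the coordinate-wise mean
for _ in range(100):
    d = np.maximum(np.linalg.norm(X - x, axis=1), 1e-12)   # guard against division by zero
    x_new = (X / d[:, None]).sum(axis=0) / (1.0 / d).sum()
    if np.linalg.norm(x_new - x) < 1e-8:
        break
    x = x_new
print(x)  # approximately [3, -1]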
+ + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the number of features. + + max_iter : int, default=300 + Maximum number of iterations. + + tol : float, default=1.e-3 + Stop the algorithm if spatial_median has converged. + + Returns + ------- + spatial_median : ndarray of shape = (n_features,) + Spatial median. + + n_iter : int + Number of iterations needed. + + References + ---------- + - On Computation of Spatial Median for Robust Data Mining, 2005 + T. Kärkkäinen and S. Äyrämö + http://users.jyu.fi/~samiayr/pdf/ayramo_eurogen05.pdf + """ + if X.shape[1] == 1: + return 1, np.median(X.ravel(), keepdims=True) + + tol **= 2 # We are computing the tol on the squared norm + spatial_median_old = np.mean(X, axis=0) + + for n_iter in range(max_iter): + spatial_median = _modified_weiszfeld_step(X, spatial_median_old) + if np.sum((spatial_median_old - spatial_median) ** 2) < tol: + break + else: + spatial_median_old = spatial_median + else: + warnings.warn( + "Maximum number of iterations {max_iter} reached in " + "spatial median for TheilSen regressor." + "".format(max_iter=max_iter), + ConvergenceWarning, + ) + return n_iter, spatial_median + + +def _breakdown_point(n_samples, n_subsamples): + """Approximation of the breakdown point. + + Parameters + ---------- + n_samples : int + Number of samples. + + n_subsamples : int + Number of subsamples to consider. + + Returns + ------- + breakdown_point : float + Approximation of breakdown point. + """ + return ( + 1 + - ( + 0.5 ** (1 / n_subsamples) * (n_samples - n_subsamples + 1) + + n_subsamples + - 1 + ) + / n_samples + ) + + +def _lstsq(X, y, indices, fit_intercept): + """Least Squares Estimator for TheilSenRegressor class. + + This function calculates the least squares method on a subset of rows of X + and y defined by the indices array. Optionally, an intercept column is + added if intercept is set to true. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Design matrix, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : ndarray of shape (n_samples,) + Target vector, where `n_samples` is the number of samples. + + indices : ndarray of shape (n_subpopulation, n_subsamples) + Indices of all subsamples with respect to the chosen subpopulation. + + fit_intercept : bool + Fit intercept or not. + + Returns + ------- + weights : ndarray of shape (n_subpopulation, n_features + intercept) + Solution matrix of n_subpopulation solved least square problems. + """ + fit_intercept = int(fit_intercept) + n_features = X.shape[1] + fit_intercept + n_subsamples = indices.shape[1] + weights = np.empty((indices.shape[0], n_features)) + X_subpopulation = np.ones((n_subsamples, n_features)) + # gelss need to pad y_subpopulation to be of the max dim of X_subpopulation + y_subpopulation = np.zeros((max(n_subsamples, n_features))) + (lstsq,) = get_lapack_funcs(("gelss",), (X_subpopulation, y_subpopulation)) + + for index, subset in enumerate(indices): + X_subpopulation[:, fit_intercept:] = X[subset, :] + y_subpopulation[:n_subsamples] = y[subset] + weights[index] = lstsq(X_subpopulation, y_subpopulation)[1][:n_features] + + return weights + + +class TheilSenRegressor(RegressorMixin, LinearModel): + """Theil-Sen Estimator: robust multivariate regression model. + + The algorithm calculates least square solutions on subsets with size + n_subsamples of the samples in X. 
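A hedged illustration of the robustness this buys (toy data; exact numbers depend on the noise): ordinary least squares is pulled towards corrupted targets, while Theil-Sen typically stays close to the true slope as long as the fraction of outliers is below its breakdown point.

import numpy as np
from sklearn.linear_model import LinearRegression, TheilSenRegressor

rng = np.random.RandomState(0)
X = rng.randn(100, 1)
y = 3.0 * X.ravel() + 0.1 * rng.randn(100)
outliers = np.argsort(X.ravel())[-10:]   # corrupt the 10 samples with largest x
y[outliers] += 50.0

print(LinearRegression().fit(X, y).coef_)                 # slope pulled well above 3
print(TheilSenRegressor(random_state=0).fit(X, y).coef_)  # typically close to 3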
Any value of n_subsamples between the + number of features and samples leads to an estimator with a compromise + between robustness and efficiency. Since the number of least square + solutions is "n_samples choose n_subsamples", it can be extremely large + and can therefore be limited with max_subpopulation. If this limit is + reached, the subsets are chosen randomly. In a final step, the spatial + median (or L1 median) is calculated of all least square solutions. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to false, no intercept will be used in calculations. + + copy_X : bool, default=True + If True, X will be copied; else, it may be overwritten. + + .. deprecated:: 1.6 + `copy_X` was deprecated in 1.6 and will be removed in 1.8. + It has no effect as a copy is always made. + + max_subpopulation : int, default=1e4 + Instead of computing with a set of cardinality 'n choose k', where n is + the number of samples and k is the number of subsamples (at least + number of features), consider only a stochastic subpopulation of a + given maximal size if 'n choose k' is larger than max_subpopulation. + For other than small problem sizes this parameter will determine + memory usage and runtime if n_subsamples is not changed. Note that the + data type should be int but floats such as 1e4 can be accepted too. + + n_subsamples : int, default=None + Number of samples to calculate the parameters. This is at least the + number of features (plus 1 if fit_intercept=True) and the number of + samples as a maximum. A lower number leads to a higher breakdown + point and a low efficiency while a high number leads to a low + breakdown point and a high efficiency. If None, take the + minimum number of subsamples leading to maximal robustness. + If n_subsamples is set to n_samples, Theil-Sen is identical to least + squares. + + max_iter : int, default=300 + Maximum number of iterations for the calculation of spatial median. + + tol : float, default=1e-3 + Tolerance when calculating spatial median. + + random_state : int, RandomState instance or None, default=None + A random number generator instance to define the state of the random + permutations generator. Pass an int for reproducible output across + multiple function calls. + See :term:`Glossary `. + + n_jobs : int, default=None + Number of CPUs to use during the cross validation. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + verbose : bool, default=False + Verbose mode when fitting the model. + + Attributes + ---------- + coef_ : ndarray of shape (n_features,) + Coefficients of the regression model (median of distribution). + + intercept_ : float + Estimated intercept of regression model. + + breakdown_ : float + Approximated breakdown point. + + n_iter_ : int + Number of iterations needed for the spatial median. + + n_subpopulation_ : int + Number of combinations taken into account from 'n choose k', where n is + the number of samples and k is the number of subsamples. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. 
versionadded:: 1.0 + + See Also + -------- + HuberRegressor : Linear regression model that is robust to outliers. + RANSACRegressor : RANSAC (RANdom SAmple Consensus) algorithm. + SGDRegressor : Fitted by minimizing a regularized empirical loss with SGD. + + References + ---------- + - Theil-Sen Estimators in a Multiple Linear Regression Model, 2009 + Xin Dang, Hanxiang Peng, Xueqin Wang and Heping Zhang + http://home.olemiss.edu/~xdang/papers/MTSE.pdf + + Examples + -------- + >>> from sklearn.linear_model import TheilSenRegressor + >>> from sklearn.datasets import make_regression + >>> X, y = make_regression( + ... n_samples=200, n_features=2, noise=4.0, random_state=0) + >>> reg = TheilSenRegressor(random_state=0).fit(X, y) + >>> reg.score(X, y) + 0.9884 + >>> reg.predict(X[:1,]) + array([-31.5871]) + """ + + _parameter_constraints: dict = { + "fit_intercept": ["boolean"], + "copy_X": ["boolean", Hidden(StrOptions({"deprecated"}))], + # target_type should be Integral but can accept Real for backward compatibility + "max_subpopulation": [Interval(Real, 1, None, closed="left")], + "n_subsamples": [None, Integral], + "max_iter": [Interval(Integral, 0, None, closed="left")], + "tol": [Interval(Real, 0.0, None, closed="left")], + "random_state": ["random_state"], + "n_jobs": [None, Integral], + "verbose": ["verbose"], + } + + def __init__( + self, + *, + fit_intercept=True, + copy_X="deprecated", + max_subpopulation=1e4, + n_subsamples=None, + max_iter=300, + tol=1.0e-3, + random_state=None, + n_jobs=None, + verbose=False, + ): + self.fit_intercept = fit_intercept + self.copy_X = copy_X + self.max_subpopulation = max_subpopulation + self.n_subsamples = n_subsamples + self.max_iter = max_iter + self.tol = tol + self.random_state = random_state + self.n_jobs = n_jobs + self.verbose = verbose + + def _check_subparams(self, n_samples, n_features): + n_subsamples = self.n_subsamples + + if self.fit_intercept: + n_dim = n_features + 1 + else: + n_dim = n_features + + if n_subsamples is not None: + if n_subsamples > n_samples: + raise ValueError( + "Invalid parameter since n_subsamples > " + "n_samples ({0} > {1}).".format(n_subsamples, n_samples) + ) + if n_samples >= n_features: + if n_dim > n_subsamples: + plus_1 = "+1" if self.fit_intercept else "" + raise ValueError( + "Invalid parameter since n_features{0} " + "> n_subsamples ({1} > {2})." + "".format(plus_1, n_dim, n_subsamples) + ) + else: # if n_samples < n_features + if n_subsamples != n_samples: + raise ValueError( + "Invalid parameter since n_subsamples != " + "n_samples ({0} != {1}) while n_samples " + "< n_features.".format(n_subsamples, n_samples) + ) + else: + n_subsamples = min(n_dim, n_samples) + + all_combinations = max(1, np.rint(binom(n_samples, n_subsamples))) + n_subpopulation = int(min(self.max_subpopulation, all_combinations)) + + return n_subsamples, n_subpopulation + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y): + """Fit linear model. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Training data. + y : ndarray of shape (n_samples,) + Target values. + + Returns + ------- + self : returns an instance of self. + Fitted `TheilSenRegressor` estimator. + """ + if self.copy_X != "deprecated": + warnings.warn( + "`copy_X` was deprecated in 1.6 and will be removed in 1.8 since it " + "has no effect internally. 
Simply leave this parameter to its default " + "value to avoid this warning.", + FutureWarning, + ) + + random_state = check_random_state(self.random_state) + X, y = validate_data(self, X, y, y_numeric=True) + n_samples, n_features = X.shape + n_subsamples, self.n_subpopulation_ = self._check_subparams( + n_samples, n_features + ) + self.breakdown_ = _breakdown_point(n_samples, n_subsamples) + + if self.verbose: + print("Breakdown point: {0}".format(self.breakdown_)) + print("Number of samples: {0}".format(n_samples)) + tol_outliers = int(self.breakdown_ * n_samples) + print("Tolerable outliers: {0}".format(tol_outliers)) + print("Number of subpopulations: {0}".format(self.n_subpopulation_)) + + # Determine indices of subpopulation + if np.rint(binom(n_samples, n_subsamples)) <= self.max_subpopulation: + indices = list(combinations(range(n_samples), n_subsamples)) + else: + indices = [ + random_state.choice(n_samples, size=n_subsamples, replace=False) + for _ in range(self.n_subpopulation_) + ] + + n_jobs = effective_n_jobs(self.n_jobs) + index_list = np.array_split(indices, n_jobs) + weights = Parallel(n_jobs=n_jobs, verbose=self.verbose)( + delayed(_lstsq)(X, y, index_list[job], self.fit_intercept) + for job in range(n_jobs) + ) + weights = np.vstack(weights) + self.n_iter_, coefs = _spatial_median( + weights, max_iter=self.max_iter, tol=self.tol + ) + + if self.fit_intercept: + self.intercept_ = coefs[0] + self.coef_ = coefs[1:] + else: + self.intercept_ = 0.0 + self.coef_ = coefs + + return self diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/meson.build b/.venv/lib/python3.12/site-packages/sklearn/linear_model/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..6d8405c7933891dcdbbc340d47108cde68089d1c --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/meson.build @@ -0,0 +1,32 @@ +# .pyx is generated, so this is needed to make Cython compilation work +linear_model_cython_tree = [ + fs.copyfile('__init__.py'), +] + +py.extension_module( + '_cd_fast', + [cython_gen.process('_cd_fast.pyx'), utils_cython_tree], + subdir: 'sklearn/linear_model', + install: true +) + +name_list = ['_sgd_fast', '_sag_fast'] + +foreach name: name_list + pyx = custom_target( + name + '_pyx', + output: name + '.pyx', + input: name + '.pyx.tp', + command: [tempita, '@INPUT@', '-o', '@OUTDIR@'], + # TODO in principle this should go in py.exension_module below. This is + # temporary work-around for dependency issue with .pyx.tp files. 
For more + # details, see https://github.com/mesonbuild/meson/issues/13212 + depends: [linear_model_cython_tree, utils_cython_tree, _loss_cython_tree], + ) + py.extension_module( + name, + cython_gen.process(pyx), + subdir: 'sklearn/linear_model', + install: true +) +endforeach diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_common.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_common.py new file mode 100644 index 0000000000000000000000000000000000000000..2483a26644cbbe30388703efc4f687bb01ba5f62 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_common.py @@ -0,0 +1,234 @@ +# SPDX-License-Identifier: BSD-3-Clause + +import inspect + +import numpy as np +import pytest + +from sklearn.base import is_classifier +from sklearn.datasets import make_classification, make_low_rank_matrix, make_regression +from sklearn.linear_model import ( + ARDRegression, + BayesianRidge, + ElasticNet, + ElasticNetCV, + GammaRegressor, + HuberRegressor, + Lars, + LarsCV, + Lasso, + LassoCV, + LassoLars, + LassoLarsCV, + LassoLarsIC, + LinearRegression, + LogisticRegression, + LogisticRegressionCV, + MultiTaskElasticNet, + MultiTaskElasticNetCV, + MultiTaskLasso, + MultiTaskLassoCV, + OrthogonalMatchingPursuit, + OrthogonalMatchingPursuitCV, + PassiveAggressiveClassifier, + PassiveAggressiveRegressor, + Perceptron, + PoissonRegressor, + Ridge, + RidgeClassifier, + RidgeClassifierCV, + RidgeCV, + SGDClassifier, + SGDRegressor, + TheilSenRegressor, + TweedieRegressor, +) +from sklearn.preprocessing import MinMaxScaler +from sklearn.svm import LinearSVC, LinearSVR +from sklearn.utils._testing import set_random_state + + +# Note: GammaRegressor() and TweedieRegressor(power != 1) have a non-canonical link. +@pytest.mark.parametrize( + "model", + [ + ARDRegression(), + BayesianRidge(), + ElasticNet(), + ElasticNetCV(), + Lars(), + LarsCV(), + Lasso(), + LassoCV(), + LassoLarsCV(), + LassoLarsIC(), + LinearRegression(), + # TODO: FIx SAGA which fails badly with sample_weights. + # This is a known limitation, see: + # https://github.com/scikit-learn/scikit-learn/issues/21305 + pytest.param( + LogisticRegression( + penalty="elasticnet", solver="saga", l1_ratio=0.5, tol=1e-15 + ), + marks=pytest.mark.xfail(reason="Missing importance sampling scheme"), + ), + LogisticRegressionCV(tol=1e-6), + MultiTaskElasticNet(), + MultiTaskElasticNetCV(), + MultiTaskLasso(), + MultiTaskLassoCV(), + OrthogonalMatchingPursuit(), + OrthogonalMatchingPursuitCV(), + PoissonRegressor(), + Ridge(), + RidgeCV(), + pytest.param( + SGDRegressor(tol=1e-15), + marks=pytest.mark.xfail(reason="Insufficient precision."), + ), + SGDRegressor(penalty="elasticnet", max_iter=10_000), + TweedieRegressor(power=0), # same as Ridge + ], + ids=lambda x: x.__class__.__name__, +) +@pytest.mark.parametrize("with_sample_weight", [False, True]) +def test_balance_property(model, with_sample_weight, global_random_seed): + # Test that sum(y_predicted) == sum(y_observed) on the training set. + # This must hold for all linear models with deviance of an exponential disperson + # family as loss and the corresponding canonical link if fit_intercept=True. + # Examples: + # - squared error and identity link (most linear models) + # - Poisson deviance with log link + # - log loss with logit link + # This is known as balance property or unconditional calibration/unbiasedness. + # For reference, see Corollary 3.18, 3.20 and Chapter 5.1.5 of + # M.V. Wuthrich and M. 
Merz, "Statistical Foundations of Actuarial Learning and its + # Applications" (June 3, 2022). http://doi.org/10.2139/ssrn.3822407 + + if ( + with_sample_weight + and "sample_weight" not in inspect.signature(model.fit).parameters.keys() + ): + pytest.skip("Estimator does not support sample_weight.") + + rel = 2e-4 # test precision + if isinstance(model, SGDRegressor): + rel = 1e-1 + elif hasattr(model, "solver") and model.solver == "saga": + rel = 1e-2 + + rng = np.random.RandomState(global_random_seed) + n_train, n_features, n_targets = 100, 10, None + if isinstance( + model, + (MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLasso, MultiTaskLassoCV), + ): + n_targets = 3 + X = make_low_rank_matrix(n_samples=n_train, n_features=n_features, random_state=rng) + if n_targets: + coef = ( + rng.uniform(low=-2, high=2, size=(n_features, n_targets)) + / np.max(X, axis=0)[:, None] + ) + else: + coef = rng.uniform(low=-2, high=2, size=n_features) / np.max(X, axis=0) + + expectation = np.exp(X @ coef + 0.5) + y = rng.poisson(lam=expectation) + 1 # strict positive, i.e. y > 0 + if is_classifier(model): + y = (y > expectation + 1).astype(np.float64) + + if with_sample_weight: + sw = rng.uniform(low=1, high=10, size=y.shape[0]) + else: + sw = None + + model.set_params(fit_intercept=True) # to be sure + if with_sample_weight: + model.fit(X, y, sample_weight=sw) + else: + model.fit(X, y) + # Assert balance property. + if is_classifier(model): + assert np.average(model.predict_proba(X)[:, 1], weights=sw) == pytest.approx( + np.average(y, weights=sw), rel=rel + ) + else: + assert np.average(model.predict(X), weights=sw, axis=0) == pytest.approx( + np.average(y, weights=sw, axis=0), rel=rel + ) + + +@pytest.mark.filterwarnings("ignore:The default of 'normalize'") +@pytest.mark.filterwarnings("ignore:lbfgs failed to converge") +@pytest.mark.parametrize( + "Regressor", + [ + ARDRegression, + BayesianRidge, + ElasticNet, + ElasticNetCV, + GammaRegressor, + HuberRegressor, + Lars, + LarsCV, + Lasso, + LassoCV, + LassoLars, + LassoLarsCV, + LassoLarsIC, + LinearSVR, + LinearRegression, + OrthogonalMatchingPursuit, + OrthogonalMatchingPursuitCV, + PassiveAggressiveRegressor, + PoissonRegressor, + Ridge, + RidgeCV, + SGDRegressor, + TheilSenRegressor, + TweedieRegressor, + ], +) +@pytest.mark.parametrize("ndim", [1, 2]) +def test_linear_model_regressor_coef_shape(Regressor, ndim): + """Check the consistency of linear models `coef` shape.""" + if Regressor is LinearRegression: + pytest.xfail("LinearRegression does not follow `coef_` shape contract!") + + X, y = make_regression(random_state=0, n_samples=200, n_features=20) + y = MinMaxScaler().fit_transform(y.reshape(-1, 1))[:, 0] + 1 + y = y[:, np.newaxis] if ndim == 2 else y + + regressor = Regressor() + set_random_state(regressor) + regressor.fit(X, y) + assert regressor.coef_.shape == (X.shape[1],) + + +@pytest.mark.parametrize( + "Classifier", + [ + LinearSVC, + LogisticRegression, + LogisticRegressionCV, + PassiveAggressiveClassifier, + Perceptron, + RidgeClassifier, + RidgeClassifierCV, + SGDClassifier, + ], +) +@pytest.mark.parametrize("n_classes", [2, 3]) +def test_linear_model_classifier_coef_shape(Classifier, n_classes): + if Classifier in (RidgeClassifier, RidgeClassifierCV): + pytest.xfail(f"{Classifier} does not follow `coef_` shape contract!") + + X, y = make_classification(n_informative=10, n_classes=n_classes, random_state=0) + n_features = X.shape[1] + + classifier = Classifier() + set_random_state(classifier) + classifier.fit(X, y) + 
expected_shape = (1, n_features) if n_classes == 2 else (n_classes, n_features) + assert classifier.coef_.shape == expected_shape diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_least_angle.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_least_angle.py new file mode 100644 index 0000000000000000000000000000000000000000..9b4a39750e03a495afd512a219b036433e31070c --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_least_angle.py @@ -0,0 +1,869 @@ +import warnings + +import numpy as np +import pytest +from scipy import linalg + +from sklearn import datasets, linear_model +from sklearn.base import clone +from sklearn.exceptions import ConvergenceWarning +from sklearn.linear_model import ( + Lars, + LarsCV, + LassoLars, + LassoLarsCV, + LassoLarsIC, + lars_path, +) +from sklearn.linear_model._least_angle import _lars_path_residues +from sklearn.model_selection import train_test_split +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.utils._testing import ( + TempMemmap, + assert_allclose, + assert_array_almost_equal, + ignore_warnings, +) + +# TODO: use another dataset that has multiple drops +diabetes = datasets.load_diabetes() +X, y = diabetes.data, diabetes.target +G = np.dot(X.T, X) +Xy = np.dot(X.T, y) +n_samples = y.size + + +def test_simple(): + # Principle of Lars is to keep covariances tied and decreasing + + # also test verbose output + import sys + from io import StringIO + + old_stdout = sys.stdout + try: + sys.stdout = StringIO() + + _, _, coef_path_ = linear_model.lars_path(X, y, method="lar", verbose=10) + + sys.stdout = old_stdout + + for i, coef_ in enumerate(coef_path_.T): + res = y - np.dot(X, coef_) + cov = np.dot(X.T, res) + C = np.max(abs(cov)) + eps = 1e-3 + ocur = len(cov[C - eps < abs(cov)]) + if i < X.shape[1]: + assert ocur == i + 1 + else: + # no more than max_pred variables can go into the active set + assert ocur == X.shape[1] + finally: + sys.stdout = old_stdout + + +def test_simple_precomputed(): + # The same, with precomputed Gram matrix + + _, _, coef_path_ = linear_model.lars_path(X, y, Gram=G, method="lar") + + for i, coef_ in enumerate(coef_path_.T): + res = y - np.dot(X, coef_) + cov = np.dot(X.T, res) + C = np.max(abs(cov)) + eps = 1e-3 + ocur = len(cov[C - eps < abs(cov)]) + if i < X.shape[1]: + assert ocur == i + 1 + else: + # no more than max_pred variables can go into the active set + assert ocur == X.shape[1] + + +def _assert_same_lars_path_result(output1, output2): + assert len(output1) == len(output2) + for o1, o2 in zip(output1, output2): + assert_allclose(o1, o2) + + +@pytest.mark.parametrize("method", ["lar", "lasso"]) +@pytest.mark.parametrize("return_path", [True, False]) +def test_lars_path_gram_equivalent(method, return_path): + _assert_same_lars_path_result( + linear_model.lars_path_gram( + Xy=Xy, Gram=G, n_samples=n_samples, method=method, return_path=return_path + ), + linear_model.lars_path(X, y, Gram=G, method=method, return_path=return_path), + ) + + +def test_x_none_gram_none_raises_value_error(): + # Test that lars_path with no X and Gram raises exception + Xy = np.dot(X.T, y) + with pytest.raises(ValueError, match="X and Gram cannot both be unspecified"): + linear_model.lars_path(None, y, Gram=None, Xy=Xy) + + +def test_all_precomputed(): + # Test that lars_path with precomputed Gram and Xy gives the right answer + G = np.dot(X.T, X) + Xy = np.dot(X.T, y) + for method in "lar", "lasso": + 
output = linear_model.lars_path(X, y, method=method) + output_pre = linear_model.lars_path(X, y, Gram=G, Xy=Xy, method=method) + for expected, got in zip(output, output_pre): + assert_array_almost_equal(expected, got) + + +# TODO: remove warning filter when numpy min version >= 2.0.0 +@pytest.mark.filterwarnings("ignore: `rcond` parameter will change") +def test_lars_lstsq(): + # Test that Lars gives least square solution at the end + # of the path + X1 = 3 * X # use un-normalized dataset + clf = linear_model.LassoLars(alpha=0.0) + clf.fit(X1, y) + coef_lstsq = np.linalg.lstsq(X1, y)[0] + assert_array_almost_equal(clf.coef_, coef_lstsq) + + +# TODO: remove warning filter when numpy min version >= 2.0.0 +@pytest.mark.filterwarnings("ignore: `rcond` parameter will change") +def test_lasso_gives_lstsq_solution(): + # Test that Lars Lasso gives least square solution at the end + # of the path + _, _, coef_path_ = linear_model.lars_path(X, y, method="lasso") + coef_lstsq = np.linalg.lstsq(X, y)[0] + assert_array_almost_equal(coef_lstsq, coef_path_[:, -1]) + + +def test_collinearity(): + # Check that lars_path is robust to collinearity in input + X = np.array([[3.0, 3.0, 1.0], [2.0, 2.0, 0.0], [1.0, 1.0, 0]]) + y = np.array([1.0, 0.0, 0]) + rng = np.random.RandomState(0) + + f = ignore_warnings + _, _, coef_path_ = f(linear_model.lars_path)(X, y, alpha_min=0.01) + assert not np.isnan(coef_path_).any() + residual = np.dot(X, coef_path_[:, -1]) - y + assert (residual**2).sum() < 1.0 # just make sure it's bounded + + n_samples = 10 + X = rng.rand(n_samples, 5) + y = np.zeros(n_samples) + _, _, coef_path_ = linear_model.lars_path( + X, + y, + Gram="auto", + copy_X=False, + copy_Gram=False, + alpha_min=0.0, + method="lasso", + verbose=0, + max_iter=500, + ) + assert_array_almost_equal(coef_path_, np.zeros_like(coef_path_)) + + +def test_no_path(): + # Test that the ``return_path=False`` option returns the correct output + alphas_, _, coef_path_ = linear_model.lars_path(X, y, method="lar") + alpha_, _, coef = linear_model.lars_path(X, y, method="lar", return_path=False) + + assert_array_almost_equal(coef, coef_path_[:, -1]) + assert alpha_ == alphas_[-1] + + +def test_no_path_precomputed(): + # Test that the ``return_path=False`` option with Gram remains correct + alphas_, _, coef_path_ = linear_model.lars_path(X, y, method="lar", Gram=G) + alpha_, _, coef = linear_model.lars_path( + X, y, method="lar", Gram=G, return_path=False + ) + + assert_array_almost_equal(coef, coef_path_[:, -1]) + assert alpha_ == alphas_[-1] + + +def test_no_path_all_precomputed(): + # Test that the ``return_path=False`` option with Gram and Xy remains + # correct + X, y = 3 * diabetes.data, diabetes.target + G = np.dot(X.T, X) + Xy = np.dot(X.T, y) + alphas_, _, coef_path_ = linear_model.lars_path( + X, y, method="lasso", Xy=Xy, Gram=G, alpha_min=0.9 + ) + alpha_, _, coef = linear_model.lars_path( + X, y, method="lasso", Gram=G, Xy=Xy, alpha_min=0.9, return_path=False + ) + + assert_array_almost_equal(coef, coef_path_[:, -1]) + assert alpha_ == alphas_[-1] + + +@pytest.mark.parametrize( + "classifier", [linear_model.Lars, linear_model.LarsCV, linear_model.LassoLarsIC] +) +def test_lars_precompute(classifier): + # Check for different values of precompute + G = np.dot(X.T, X) + + clf = classifier(precompute=G) + output_1 = ignore_warnings(clf.fit)(X, y).coef_ + for precompute in [True, False, "auto", None]: + clf = classifier(precompute=precompute) + output_2 = clf.fit(X, y).coef_ + assert_array_almost_equal(output_1, 
output_2, decimal=8) + + +def test_singular_matrix(): + # Test when input is a singular matrix + X1 = np.array([[1, 1.0], [1.0, 1.0]]) + y1 = np.array([1, 1]) + _, _, coef_path = linear_model.lars_path(X1, y1) + assert_array_almost_equal(coef_path.T, [[0, 0], [1, 0]]) + + +def test_rank_deficient_design(): + # consistency test that checks that LARS Lasso is handling rank + # deficient input data (with n_features < rank) in the same way + # as coordinate descent Lasso + y = [5, 0, 5] + for X in ([[5, 0], [0, 5], [10, 10]], [[10, 10, 0], [1e-32, 0, 0], [0, 0, 1]]): + # To be able to use the coefs to compute the objective function, + # we need to turn off normalization + lars = linear_model.LassoLars(0.1) + coef_lars_ = lars.fit(X, y).coef_ + obj_lars = 1.0 / (2.0 * 3.0) * linalg.norm( + y - np.dot(X, coef_lars_) + ) ** 2 + 0.1 * linalg.norm(coef_lars_, 1) + coord_descent = linear_model.Lasso(0.1, tol=1e-6) + coef_cd_ = coord_descent.fit(X, y).coef_ + obj_cd = (1.0 / (2.0 * 3.0)) * linalg.norm( + y - np.dot(X, coef_cd_) + ) ** 2 + 0.1 * linalg.norm(coef_cd_, 1) + assert obj_lars < obj_cd * (1.0 + 1e-8) + + +def test_lasso_lars_vs_lasso_cd(): + # Test that LassoLars and Lasso using coordinate descent give the + # same results. + X = 3 * diabetes.data + + alphas, _, lasso_path = linear_model.lars_path(X, y, method="lasso") + lasso_cd = linear_model.Lasso(fit_intercept=False, tol=1e-8) + for c, a in zip(lasso_path.T, alphas): + if a == 0: + continue + lasso_cd.alpha = a + lasso_cd.fit(X, y) + error = linalg.norm(c - lasso_cd.coef_) + assert error < 0.01 + + # similar test, with the classifiers + for alpha in np.linspace(1e-2, 1 - 1e-2, 20): + clf1 = linear_model.LassoLars(alpha=alpha).fit(X, y) + clf2 = linear_model.Lasso(alpha=alpha, tol=1e-8).fit(X, y) + err = linalg.norm(clf1.coef_ - clf2.coef_) + assert err < 1e-3 + + # same test, with normalized data + X = diabetes.data + X = X - X.sum(axis=0) + X /= np.linalg.norm(X, axis=0) + alphas, _, lasso_path = linear_model.lars_path(X, y, method="lasso") + lasso_cd = linear_model.Lasso(fit_intercept=False, tol=1e-8) + for c, a in zip(lasso_path.T, alphas): + if a == 0: + continue + lasso_cd.alpha = a + lasso_cd.fit(X, y) + error = linalg.norm(c - lasso_cd.coef_) + assert error < 0.01 + + +def test_lasso_lars_vs_lasso_cd_early_stopping(): + # Test that LassoLars and Lasso using coordinate descent give the + # same results when early stopping is used. 
+ # (test : before, in the middle, and in the last part of the path) + alphas_min = [10, 0.9, 1e-4] + + X = diabetes.data + + for alpha_min in alphas_min: + alphas, _, lasso_path = linear_model.lars_path( + X, y, method="lasso", alpha_min=alpha_min + ) + lasso_cd = linear_model.Lasso(fit_intercept=False, tol=1e-8) + lasso_cd.alpha = alphas[-1] + lasso_cd.fit(X, y) + error = linalg.norm(lasso_path[:, -1] - lasso_cd.coef_) + assert error < 0.01 + + # same test, with normalization + X = diabetes.data - diabetes.data.sum(axis=0) + X /= np.linalg.norm(X, axis=0) + + for alpha_min in alphas_min: + alphas, _, lasso_path = linear_model.lars_path( + X, y, method="lasso", alpha_min=alpha_min + ) + lasso_cd = linear_model.Lasso(tol=1e-8) + lasso_cd.alpha = alphas[-1] + lasso_cd.fit(X, y) + error = linalg.norm(lasso_path[:, -1] - lasso_cd.coef_) + assert error < 0.01 + + +def test_lasso_lars_path_length(): + # Test that the path length of the LassoLars is right + lasso = linear_model.LassoLars() + lasso.fit(X, y) + lasso2 = linear_model.LassoLars(alpha=lasso.alphas_[2]) + lasso2.fit(X, y) + assert_array_almost_equal(lasso.alphas_[:3], lasso2.alphas_) + # Also check that the sequence of alphas is always decreasing + assert np.all(np.diff(lasso.alphas_) < 0) + + +def test_lasso_lars_vs_lasso_cd_ill_conditioned(): + # Test lasso lars on a very ill-conditioned design, and check that + # it does not blow up, and stays somewhat close to a solution given + # by the coordinate descent solver + # Also test that lasso_path (using lars_path output style) gives + # the same result as lars_path and previous lasso output style + # under these conditions. + rng = np.random.RandomState(42) + + # Generate data + n, m = 70, 100 + k = 5 + X = rng.randn(n, m) + w = np.zeros((m, 1)) + i = np.arange(0, m) + rng.shuffle(i) + supp = i[:k] + w[supp] = np.sign(rng.randn(k, 1)) * (rng.rand(k, 1) + 1) + y = np.dot(X, w) + sigma = 0.2 + y += sigma * rng.rand(*y.shape) + y = y.squeeze() + lars_alphas, _, lars_coef = linear_model.lars_path(X, y, method="lasso") + + _, lasso_coef2, _ = linear_model.lasso_path(X, y, alphas=lars_alphas, tol=1e-6) + + assert_array_almost_equal(lars_coef, lasso_coef2, decimal=1) + + +def test_lasso_lars_vs_lasso_cd_ill_conditioned2(): + # Create an ill-conditioned situation in which the LARS has to go + # far in the path to converge, and check that LARS and coordinate + # descent give the same answers + # Note it used to be the case that Lars had to use the drop for good + # strategy for this but this is no longer the case with the + # equality_tolerance checks + X = [[1e20, 1e20, 0], [-1e-32, 0, 0], [1, 1, 1]] + y = [10, 10, 1] + alpha = 0.0001 + + def objective_function(coef): + return 1.0 / (2.0 * len(X)) * linalg.norm( + y - np.dot(X, coef) + ) ** 2 + alpha * linalg.norm(coef, 1) + + lars = linear_model.LassoLars(alpha=alpha) + warning_message = "Regressors in active set degenerate." 
+ with pytest.warns(ConvergenceWarning, match=warning_message): + lars.fit(X, y) + lars_coef_ = lars.coef_ + lars_obj = objective_function(lars_coef_) + + coord_descent = linear_model.Lasso(alpha=alpha, tol=1e-4) + cd_coef_ = coord_descent.fit(X, y).coef_ + cd_obj = objective_function(cd_coef_) + + assert lars_obj < cd_obj * (1.0 + 1e-8) + + +def test_lars_add_features(): + # assure that at least some features get added if necessary + # test for 6d2b4c + # Hilbert matrix + n = 5 + H = 1.0 / (np.arange(1, n + 1) + np.arange(n)[:, np.newaxis]) + clf = linear_model.Lars(fit_intercept=False).fit(H, np.arange(n)) + assert np.all(np.isfinite(clf.coef_)) + + +def test_lars_n_nonzero_coefs(verbose=False): + lars = linear_model.Lars(n_nonzero_coefs=6, verbose=verbose) + lars.fit(X, y) + assert len(lars.coef_.nonzero()[0]) == 6 + # The path should be of length 6 + 1 in a Lars going down to 6 + # non-zero coefs + assert len(lars.alphas_) == 7 + + +def test_multitarget(): + # Assure that estimators receiving multidimensional y do the right thing + Y = np.vstack([y, y**2]).T + n_targets = Y.shape[1] + estimators = [ + linear_model.LassoLars(), + linear_model.Lars(), + # regression test for gh-1615 + linear_model.LassoLars(fit_intercept=False), + linear_model.Lars(fit_intercept=False), + ] + + for estimator in estimators: + estimator.fit(X, Y) + Y_pred = estimator.predict(X) + alphas, active, coef, path = ( + estimator.alphas_, + estimator.active_, + estimator.coef_, + estimator.coef_path_, + ) + for k in range(n_targets): + estimator.fit(X, Y[:, k]) + y_pred = estimator.predict(X) + assert_array_almost_equal(alphas[k], estimator.alphas_) + assert_array_almost_equal(active[k], estimator.active_) + assert_array_almost_equal(coef[k], estimator.coef_) + assert_array_almost_equal(path[k], estimator.coef_path_) + assert_array_almost_equal(Y_pred[:, k], y_pred) + + +def test_lars_cv(): + # Test the LassoLarsCV object by checking that the optimal alpha + # increases as the number of samples increases. + # This property is not actually guaranteed in general and is just a + # property of the given dataset, with the given steps chosen. + old_alpha = 0 + lars_cv = linear_model.LassoLarsCV() + for length in (400, 200, 100): + X = diabetes.data[:length] + y = diabetes.target[:length] + lars_cv.fit(X, y) + np.testing.assert_array_less(old_alpha, lars_cv.alpha_) + old_alpha = lars_cv.alpha_ + assert not hasattr(lars_cv, "n_nonzero_coefs") + + +def test_lars_cv_max_iter(recwarn): + warnings.simplefilter("always") + with np.errstate(divide="raise", invalid="raise"): + X = diabetes.data + y = diabetes.target + rng = np.random.RandomState(42) + x = rng.randn(len(y)) + X = diabetes.data + X = np.c_[X, x, x] # add correlated features + X = StandardScaler().fit_transform(X) + lars_cv = linear_model.LassoLarsCV(max_iter=5, cv=5) + lars_cv.fit(X, y) + + # Check that there is no warning in general and no ConvergenceWarning + # in particular. + # Materialize the string representation of the warning to get a more + # informative error message in case of AssertionError. + recorded_warnings = [str(w) for w in recwarn] + assert len(recorded_warnings) == 0 + + +def test_lasso_lars_ic(): + # Test the LassoLarsIC object by checking that + # - some good features are selected. 
+ # - alpha_bic > alpha_aic + # - n_nonzero_bic < n_nonzero_aic + lars_bic = linear_model.LassoLarsIC("bic") + lars_aic = linear_model.LassoLarsIC("aic") + rng = np.random.RandomState(42) + X = diabetes.data + X = np.c_[X, rng.randn(X.shape[0], 5)] # add 5 bad features + X = StandardScaler().fit_transform(X) + lars_bic.fit(X, y) + lars_aic.fit(X, y) + nonzero_bic = np.where(lars_bic.coef_)[0] + nonzero_aic = np.where(lars_aic.coef_)[0] + assert lars_bic.alpha_ > lars_aic.alpha_ + assert len(nonzero_bic) < len(nonzero_aic) + assert np.max(nonzero_bic) < diabetes.data.shape[1] + + +def test_lars_path_readonly_data(): + # When using automated memory mapping on large input, the + # fold data is in read-only mode + # This is a non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/4597 + splitted_data = train_test_split(X, y, random_state=42) + with TempMemmap(splitted_data) as (X_train, X_test, y_train, y_test): + # The following should not fail despite copy=False + _lars_path_residues(X_train, y_train, X_test, y_test, copy=False) + + +def test_lars_path_positive_constraint(): + # this is the main test for the positive parameter on the lars_path method + # the estimator classes just make use of this function + + # we do the test on the diabetes dataset + + # ensure that we get negative coefficients when positive=False + # and all positive when positive=True + # for method 'lar' (default) and lasso + + err_msg = "Positive constraint not supported for 'lar' coding method." + with pytest.raises(ValueError, match=err_msg): + linear_model.lars_path( + diabetes["data"], diabetes["target"], method="lar", positive=True + ) + + method = "lasso" + _, _, coefs = linear_model.lars_path( + X, y, return_path=True, method=method, positive=False + ) + assert coefs.min() < 0 + + _, _, coefs = linear_model.lars_path( + X, y, return_path=True, method=method, positive=True + ) + assert coefs.min() >= 0 + + +# now we gonna test the positive option for all estimator classes + +default_parameter = {"fit_intercept": False} + +estimator_parameter_map = { + "LassoLars": {"alpha": 0.1}, + "LassoLarsCV": {}, + "LassoLarsIC": {}, +} + + +def test_estimatorclasses_positive_constraint(): + # testing the transmissibility for the positive option of all estimator + # classes in this same function here + default_parameter = {"fit_intercept": False} + + estimator_parameter_map = { + "LassoLars": {"alpha": 0.1}, + "LassoLarsCV": {}, + "LassoLarsIC": {}, + } + for estname in estimator_parameter_map: + params = default_parameter.copy() + params.update(estimator_parameter_map[estname]) + estimator = getattr(linear_model, estname)(positive=False, **params) + estimator.fit(X, y) + assert estimator.coef_.min() < 0 + estimator = getattr(linear_model, estname)(positive=True, **params) + estimator.fit(X, y) + assert min(estimator.coef_) >= 0 + + +def test_lasso_lars_vs_lasso_cd_positive(): + # Test that LassoLars and Lasso using coordinate descent give the + # same results when using the positive option + + # This test is basically a copy of the above with additional positive + # option. However for the middle part, the comparison of coefficient values + # for a range of alphas, we had to make an adaptations. See below. 
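# --- Editor's aside (illustrative sketch, not part of the scikit-learn file above) ---
# The comment above refers to "the smallest alpha reached by the Lars-Lasso
# algorithm" under the positive constraint. One way to see where that cut-off
# lies for a given dataset is to look at the last knot of the constrained path,
# e.g. (assuming the diabetes data, as loaded at the top of this test module):
#
#     from sklearn import datasets, linear_model
#     X, y = datasets.load_diabetes(return_X_y=True)
#     alphas, _, _ = linear_model.lars_path(X, y, method="lasso", positive=True)
#     print(alphas[-1])  # smallest alpha on the positive Lasso-LARS path
#
# Below that value the positive LassoLars and the positive coordinate-descent
# Lasso are not expected to agree, which is why the alpha grid compared in this
# test starts at 6e-1 rather than at 1e-2.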
+ + # not normalized data + X = 3 * diabetes.data + + alphas, _, lasso_path = linear_model.lars_path(X, y, method="lasso", positive=True) + lasso_cd = linear_model.Lasso(fit_intercept=False, tol=1e-8, positive=True) + for c, a in zip(lasso_path.T, alphas): + if a == 0: + continue + lasso_cd.alpha = a + lasso_cd.fit(X, y) + error = linalg.norm(c - lasso_cd.coef_) + assert error < 0.01 + + # The range of alphas chosen for coefficient comparison here is restricted + # as compared with the above test without the positive option. This is due + # to the circumstance that the Lars-Lasso algorithm does not converge to + # the least-squares-solution for small alphas, see 'Least Angle Regression' + # by Efron et al 2004. The coefficients are typically in congruence up to + # the smallest alpha reached by the Lars-Lasso algorithm and start to + # diverge thereafter. See + # https://gist.github.com/michigraber/7e7d7c75eca694c7a6ff + + for alpha in np.linspace(6e-1, 1 - 1e-2, 20): + clf1 = linear_model.LassoLars( + fit_intercept=False, alpha=alpha, positive=True + ).fit(X, y) + clf2 = linear_model.Lasso( + fit_intercept=False, alpha=alpha, tol=1e-8, positive=True + ).fit(X, y) + err = linalg.norm(clf1.coef_ - clf2.coef_) + assert err < 1e-3 + + # normalized data + X = diabetes.data - diabetes.data.sum(axis=0) + X /= np.linalg.norm(X, axis=0) + alphas, _, lasso_path = linear_model.lars_path(X, y, method="lasso", positive=True) + lasso_cd = linear_model.Lasso(fit_intercept=False, tol=1e-8, positive=True) + for c, a in zip(lasso_path.T[:-1], alphas[:-1]): # don't include alpha=0 + lasso_cd.alpha = a + lasso_cd.fit(X, y) + error = linalg.norm(c - lasso_cd.coef_) + assert error < 0.01 + + +def test_lasso_lars_vs_R_implementation(): + # Test that sklearn LassoLars implementation agrees with the LassoLars + # implementation available in R (lars library) when fit_intercept=False. 
+ + # Let's generate the data used in the bug report 7778 + y = np.array([-6.45006793, -3.51251449, -8.52445396, 6.12277822, -19.42109366]) + x = np.array( + [ + [0.47299829, 0, 0, 0, 0], + [0.08239882, 0.85784863, 0, 0, 0], + [0.30114139, -0.07501577, 0.80895216, 0, 0], + [-0.01460346, -0.1015233, 0.0407278, 0.80338378, 0], + [-0.69363927, 0.06754067, 0.18064514, -0.0803561, 0.40427291], + ] + ) + + X = x.T + + # The R result was obtained using the following code: + # + # library(lars) + # model_lasso_lars = lars(X, t(y), type="lasso", intercept=FALSE, + # trace=TRUE, normalize=FALSE) + # r = t(model_lasso_lars$beta) + # + + r = np.array( + [ + [ + 0, + 0, + 0, + 0, + 0, + -79.810362809499026, + -83.528788732782829, + -83.777653739190711, + -83.784156932888934, + -84.033390591756657, + ], + [0, 0, 0, 0, -0.476624256777266, 0, 0, 0, 0, 0.025219751009936], + [ + 0, + -3.577397088285891, + -4.702795355871871, + -7.016748621359461, + -7.614898471899412, + -0.336938391359179, + 0, + 0, + 0.001213370600853, + 0.048162321585148, + ], + [ + 0, + 0, + 0, + 2.231558436628169, + 2.723267514525966, + 2.811549786389614, + 2.813766976061531, + 2.817462468949557, + 2.817368178703816, + 2.816221090636795, + ], + [ + 0, + 0, + -1.218422599914637, + -3.457726183014808, + -4.021304522060710, + -45.827461592423745, + -47.776608869312305, + -47.911561610746404, + -47.914845922736234, + -48.039562334265717, + ], + ] + ) + + model_lasso_lars = linear_model.LassoLars(alpha=0, fit_intercept=False) + model_lasso_lars.fit(X, y) + skl_betas = model_lasso_lars.coef_path_ + + assert_array_almost_equal(r, skl_betas, decimal=12) + + +@pytest.mark.parametrize("copy_X", [True, False]) +def test_lasso_lars_copyX_behaviour(copy_X): + """ + Test that user input regarding copy_X is not being overridden (it was until + at least version 0.21) + + """ + lasso_lars = LassoLarsIC(copy_X=copy_X, precompute=False) + rng = np.random.RandomState(0) + X = rng.normal(0, 1, (100, 5)) + X_copy = X.copy() + y = X[:, 2] + lasso_lars.fit(X, y) + assert copy_X == np.array_equal(X, X_copy) + + +@pytest.mark.parametrize("copy_X", [True, False]) +def test_lasso_lars_fit_copyX_behaviour(copy_X): + """ + Test that user input to .fit for copy_X overrides default __init__ value + + """ + lasso_lars = LassoLarsIC(precompute=False) + rng = np.random.RandomState(0) + X = rng.normal(0, 1, (100, 5)) + X_copy = X.copy() + y = X[:, 2] + lasso_lars.fit(X, y, copy_X=copy_X) + assert copy_X == np.array_equal(X, X_copy) + + +@pytest.mark.parametrize("est", (LassoLars(alpha=1e-3), Lars())) +def test_lars_with_jitter(est): + # Test that a small amount of jitter helps stability, + # using example provided in issue #2746 + + X = np.array([[0.0, 0.0, 0.0, -1.0, 0.0], [0.0, -1.0, 0.0, 0.0, 0.0]]) + y = [-2.5, -2.5] + expected_coef = [0, 2.5, 0, 2.5, 0] + + # set to fit_intercept to False since target is constant and we want check + # the value of coef. coef would be all zeros otherwise. 
+ est.set_params(fit_intercept=False) + est_jitter = clone(est).set_params(jitter=10e-8, random_state=0) + + est.fit(X, y) + est_jitter.fit(X, y) + + assert np.mean((est.coef_ - est_jitter.coef_) ** 2) > 0.1 + np.testing.assert_allclose(est_jitter.coef_, expected_coef, rtol=1e-3) + + +def test_X_none_gram_not_none(): + with pytest.raises(ValueError, match="X cannot be None if Gram is not None"): + lars_path(X=None, y=np.array([1]), Gram=True) + + +def test_copy_X_with_auto_gram(): + # Non-regression test for #17789, `copy_X=True` and Gram='auto' does not + # overwrite X + rng = np.random.RandomState(42) + X = rng.rand(6, 6) + y = rng.rand(6) + + X_before = X.copy() + linear_model.lars_path(X, y, Gram="auto", copy_X=True, method="lasso") + # X did not change + assert_allclose(X, X_before) + + +@pytest.mark.parametrize( + "LARS, has_coef_path, args", + ( + (Lars, True, {}), + (LassoLars, True, {}), + (LassoLarsIC, False, {}), + (LarsCV, True, {}), + # max_iter=5 is for avoiding ConvergenceWarning + (LassoLarsCV, True, {"max_iter": 5}), + ), +) +@pytest.mark.parametrize("dtype", (np.float32, np.float64)) +def test_lars_dtype_match(LARS, has_coef_path, args, dtype): + # The test ensures that the fit method preserves input dtype + rng = np.random.RandomState(0) + X = rng.rand(20, 6).astype(dtype) + y = rng.rand(20).astype(dtype) + + model = LARS(**args) + model.fit(X, y) + assert model.coef_.dtype == dtype + if has_coef_path: + assert model.coef_path_.dtype == dtype + assert model.intercept_.dtype == dtype + + +@pytest.mark.parametrize( + "LARS, has_coef_path, args", + ( + (Lars, True, {}), + (LassoLars, True, {}), + (LassoLarsIC, False, {}), + (LarsCV, True, {}), + # max_iter=5 is for avoiding ConvergenceWarning + (LassoLarsCV, True, {"max_iter": 5}), + ), +) +def test_lars_numeric_consistency(LARS, has_coef_path, args): + # The test ensures numerical consistency between trained coefficients + # of float32 and float64. + rtol = 1e-5 + atol = 1e-5 + + rng = np.random.RandomState(0) + X_64 = rng.rand(10, 6) + y_64 = rng.rand(10) + + model_64 = LARS(**args).fit(X_64, y_64) + model_32 = LARS(**args).fit(X_64.astype(np.float32), y_64.astype(np.float32)) + + assert_allclose(model_64.coef_, model_32.coef_, rtol=rtol, atol=atol) + if has_coef_path: + assert_allclose(model_64.coef_path_, model_32.coef_path_, rtol=rtol, atol=atol) + assert_allclose(model_64.intercept_, model_32.intercept_, rtol=rtol, atol=atol) + + +@pytest.mark.parametrize("criterion", ["aic", "bic"]) +def test_lassolarsic_alpha_selection(criterion): + """Check that we properly compute the AIC and BIC score. + + In this test, we reproduce the example of the Fig. 2 of Zou et al. + (reference [1] in LassoLarsIC) In this example, only 7 features should be + selected. 
+ """ + model = make_pipeline(StandardScaler(), LassoLarsIC(criterion=criterion)) + model.fit(X, y) + + best_alpha_selected = np.argmin(model[-1].criterion_) + assert best_alpha_selected == 7 + + +@pytest.mark.parametrize("fit_intercept", [True, False]) +def test_lassolarsic_noise_variance(fit_intercept): + """Check the behaviour when `n_samples` < `n_features` and that one needs + to provide the noise variance.""" + rng = np.random.RandomState(0) + X, y = datasets.make_regression( + n_samples=10, n_features=11 - fit_intercept, random_state=rng + ) + + model = make_pipeline(StandardScaler(), LassoLarsIC(fit_intercept=fit_intercept)) + + err_msg = ( + "You are using LassoLarsIC in the case where the number of samples is smaller" + " than the number of features" + ) + with pytest.raises(ValueError, match=err_msg): + model.fit(X, y) + + model.set_params(lassolarsic__noise_variance=1.0) + model.fit(X, y).predict(X) diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_linear_loss.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_linear_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..a273656b3dbb8508bb468d6f5ac906b16dbc03f5 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_linear_loss.py @@ -0,0 +1,510 @@ +""" +Tests for LinearModelLoss + +Note that correctness of losses (which compose LinearModelLoss) is already well +covered in the _loss module. +""" + +import numpy as np +import pytest +from numpy.testing import assert_allclose +from scipy import linalg, optimize + +from sklearn._loss.loss import ( + HalfBinomialLoss, + HalfMultinomialLoss, + HalfPoissonLoss, +) +from sklearn.datasets import make_low_rank_matrix +from sklearn.linear_model._linear_loss import LinearModelLoss +from sklearn.utils.extmath import squared_norm +from sklearn.utils.fixes import CSR_CONTAINERS + +# We do not need to test all losses, just what LinearModelLoss does on top of the +# base losses. +LOSSES = [HalfBinomialLoss, HalfMultinomialLoss, HalfPoissonLoss] + + +def random_X_y_coef( + linear_model_loss, n_samples, n_features, coef_bound=(-2, 2), seed=42 +): + """Random generate y, X and coef in valid range.""" + rng = np.random.RandomState(seed) + n_dof = n_features + linear_model_loss.fit_intercept + X = make_low_rank_matrix( + n_samples=n_samples, + n_features=n_features, + random_state=rng, + ) + coef = linear_model_loss.init_zero_coef(X) + + if linear_model_loss.base_loss.is_multiclass: + n_classes = linear_model_loss.base_loss.n_classes + coef.flat[:] = rng.uniform( + low=coef_bound[0], + high=coef_bound[1], + size=n_classes * n_dof, + ) + if linear_model_loss.fit_intercept: + raw_prediction = X @ coef[:, :-1].T + coef[:, -1] + else: + raw_prediction = X @ coef.T + proba = linear_model_loss.base_loss.link.inverse(raw_prediction) + + # y = rng.choice(np.arange(n_classes), p=proba) does not work. 
+ # See https://stackoverflow.com/a/34190035/16761084 + def choice_vectorized(items, p): + s = p.cumsum(axis=1) + r = rng.rand(p.shape[0])[:, None] + k = (s < r).sum(axis=1) + return items[k] + + y = choice_vectorized(np.arange(n_classes), p=proba).astype(np.float64) + else: + coef.flat[:] = rng.uniform( + low=coef_bound[0], + high=coef_bound[1], + size=n_dof, + ) + if linear_model_loss.fit_intercept: + raw_prediction = X @ coef[:-1] + coef[-1] + else: + raw_prediction = X @ coef + y = linear_model_loss.base_loss.link.inverse( + raw_prediction + rng.uniform(low=-1, high=1, size=n_samples) + ) + + return X, y, coef + + +@pytest.mark.parametrize("base_loss", LOSSES) +@pytest.mark.parametrize("fit_intercept", [False, True]) +@pytest.mark.parametrize("n_features", [0, 1, 10]) +@pytest.mark.parametrize("dtype", [None, np.float32, np.float64, np.int64]) +def test_init_zero_coef( + base_loss, fit_intercept, n_features, dtype, global_random_seed +): + """Test that init_zero_coef initializes coef correctly.""" + loss = LinearModelLoss(base_loss=base_loss(), fit_intercept=fit_intercept) + rng = np.random.RandomState(global_random_seed) + X = rng.normal(size=(5, n_features)) + coef = loss.init_zero_coef(X, dtype=dtype) + if loss.base_loss.is_multiclass: + n_classes = loss.base_loss.n_classes + assert coef.shape == (n_classes, n_features + fit_intercept) + assert coef.flags["F_CONTIGUOUS"] + else: + assert coef.shape == (n_features + fit_intercept,) + + if dtype is None: + assert coef.dtype == X.dtype + else: + assert coef.dtype == dtype + + assert np.count_nonzero(coef) == 0 + + +@pytest.mark.parametrize("base_loss", LOSSES) +@pytest.mark.parametrize("fit_intercept", [False, True]) +@pytest.mark.parametrize("sample_weight", [None, "range"]) +@pytest.mark.parametrize("l2_reg_strength", [0, 1]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_loss_grad_hess_are_the_same( + base_loss, + fit_intercept, + sample_weight, + l2_reg_strength, + csr_container, + global_random_seed, +): + """Test that loss and gradient are the same across different functions.""" + loss = LinearModelLoss(base_loss=base_loss(), fit_intercept=fit_intercept) + X, y, coef = random_X_y_coef( + linear_model_loss=loss, n_samples=10, n_features=5, seed=global_random_seed + ) + X_old, y_old, coef_old = X.copy(), y.copy(), coef.copy() + + if sample_weight == "range": + sample_weight = np.linspace(1, y.shape[0], num=y.shape[0]) + + l1 = loss.loss( + coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength + ) + g1 = loss.gradient( + coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength + ) + l2, g2 = loss.loss_gradient( + coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength + ) + g3, h3 = loss.gradient_hessian_product( + coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength + ) + g4, h4, _ = loss.gradient_hessian( + coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength + ) + assert_allclose(l1, l2) + assert_allclose(g1, g2) + assert_allclose(g1, g3) + assert_allclose(g1, g4) + # The ravelling only takes effect for multiclass. + assert_allclose(h4 @ g4.ravel(order="F"), h3(g3).ravel(order="F")) + # Test that gradient_out and hessian_out are considered properly. 
+ g_out = np.empty_like(coef) + h_out = np.empty_like(coef, shape=(coef.size, coef.size)) + g5, h5, _ = loss.gradient_hessian( + coef, + X, + y, + sample_weight=sample_weight, + l2_reg_strength=l2_reg_strength, + gradient_out=g_out, + hessian_out=h_out, + ) + assert np.shares_memory(g5, g_out) + assert np.shares_memory(h5, h_out) + assert_allclose(g5, g_out) + assert_allclose(h5, h_out) + assert_allclose(g1, g5) + assert_allclose(h5, h4) + + # same for sparse X + Xs = csr_container(X) + l1_sp = loss.loss( + coef, Xs, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength + ) + g1_sp = loss.gradient( + coef, Xs, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength + ) + l2_sp, g2_sp = loss.loss_gradient( + coef, Xs, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength + ) + g3_sp, h3_sp = loss.gradient_hessian_product( + coef, Xs, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength + ) + g4_sp, h4_sp, _ = loss.gradient_hessian( + coef, Xs, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength + ) + assert_allclose(l1, l1_sp) + assert_allclose(l1, l2_sp) + assert_allclose(g1, g1_sp) + assert_allclose(g1, g2_sp) + assert_allclose(g1, g3_sp) + assert_allclose(h3(g1), h3_sp(g1_sp)) + assert_allclose(g1, g4_sp) + assert_allclose(h4, h4_sp) + + # X, y and coef should not have changed + assert_allclose(X, X_old) + assert_allclose(Xs.toarray(), X_old) + assert_allclose(y, y_old) + assert_allclose(coef, coef_old) + + +@pytest.mark.parametrize("base_loss", LOSSES) +@pytest.mark.parametrize("sample_weight", [None, "range"]) +@pytest.mark.parametrize("l2_reg_strength", [0, 1]) +@pytest.mark.parametrize("X_container", CSR_CONTAINERS + [None]) +def test_loss_gradients_hessp_intercept( + base_loss, sample_weight, l2_reg_strength, X_container, global_random_seed +): + """Test that loss and gradient handle intercept correctly.""" + loss = LinearModelLoss(base_loss=base_loss(), fit_intercept=False) + loss_inter = LinearModelLoss(base_loss=base_loss(), fit_intercept=True) + n_samples, n_features = 10, 5 + X, y, coef = random_X_y_coef( + linear_model_loss=loss, + n_samples=n_samples, + n_features=n_features, + seed=global_random_seed, + ) + + X[:, -1] = 1 # make last column of 1 to mimic intercept term + X_inter = X[ + :, :-1 + ] # exclude intercept column as it is added automatically by loss_inter + + if X_container is not None: + X = X_container(X) + + if sample_weight == "range": + sample_weight = np.linspace(1, y.shape[0], num=y.shape[0]) + + l, g = loss.loss_gradient( + coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength + ) + _, hessp = loss.gradient_hessian_product( + coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength + ) + l_inter, g_inter = loss_inter.loss_gradient( + coef, X_inter, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength + ) + _, hessp_inter = loss_inter.gradient_hessian_product( + coef, X_inter, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength + ) + + # Note, that intercept gets no L2 penalty. 
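# --- Editor's aside (illustrative, not part of the scikit-learn file above) ---
# Sketch of the bookkeeping checked below, under this test's convention (the
# intercept is mimicked by a penalized all-ones column in X for `loss`, while
# `loss_inter` fits a true, unpenalized intercept): with identical coefficients
# the raw predictions and hence the data-fit terms agree, so the two losses can
# only differ through the L2 term, from which the intercept is excluded:
#
#     loss       = data_fit + 0.5 * l2_reg_strength * ||coef||^2
#     loss_inter = data_fit + 0.5 * l2_reg_strength * ||coef[..., :-1]||^2
#     =>  loss   = loss_inter + 0.5 * l2_reg_strength * ||coef[..., -1]||^2
#
# Differentiating that extra term once and twice in the last coordinate gives the
# `l2_reg_strength * coef.T[-1]` and `l2_reg_strength * s.T[-1]` corrections
# applied to the gradient and to the Hessian-vector product in the asserts below.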
+ assert l == pytest.approx( + l_inter + 0.5 * l2_reg_strength * squared_norm(coef.T[-1]) + ) + + g_inter_corrected = g_inter + g_inter_corrected.T[-1] += l2_reg_strength * coef.T[-1] + assert_allclose(g, g_inter_corrected) + + s = np.random.RandomState(global_random_seed).randn(*coef.shape) + h = hessp(s) + h_inter = hessp_inter(s) + h_inter_corrected = h_inter + h_inter_corrected.T[-1] += l2_reg_strength * s.T[-1] + assert_allclose(h, h_inter_corrected) + + +@pytest.mark.parametrize("base_loss", LOSSES) +@pytest.mark.parametrize("fit_intercept", [False, True]) +@pytest.mark.parametrize("sample_weight", [None, "range"]) +@pytest.mark.parametrize("l2_reg_strength", [0, 1]) +def test_gradients_hessians_numerically( + base_loss, fit_intercept, sample_weight, l2_reg_strength, global_random_seed +): + """Test gradients and hessians with numerical derivatives. + + Gradient should equal the numerical derivatives of the loss function. + Hessians should equal the numerical derivatives of gradients. + """ + loss = LinearModelLoss(base_loss=base_loss(), fit_intercept=fit_intercept) + n_samples, n_features = 10, 5 + X, y, coef = random_X_y_coef( + linear_model_loss=loss, + n_samples=n_samples, + n_features=n_features, + seed=global_random_seed, + ) + coef = coef.ravel(order="F") # this is important only for multinomial loss + + if sample_weight == "range": + sample_weight = np.linspace(1, y.shape[0], num=y.shape[0]) + + # 1. Check gradients numerically + eps = 1e-6 + g, hessp = loss.gradient_hessian_product( + coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength + ) + # Use a trick to get central finite difference of accuracy 4 (five-point stencil) + # https://en.wikipedia.org/wiki/Numerical_differentiation + # https://en.wikipedia.org/wiki/Finite_difference_coefficient + # approx_g1 = (f(x + eps) - f(x - eps)) / (2*eps) + approx_g1 = optimize.approx_fprime( + coef, + lambda coef: loss.loss( + coef - eps, + X, + y, + sample_weight=sample_weight, + l2_reg_strength=l2_reg_strength, + ), + 2 * eps, + ) + # approx_g2 = (f(x + 2*eps) - f(x - 2*eps)) / (4*eps) + approx_g2 = optimize.approx_fprime( + coef, + lambda coef: loss.loss( + coef - 2 * eps, + X, + y, + sample_weight=sample_weight, + l2_reg_strength=l2_reg_strength, + ), + 4 * eps, + ) + # Five-point stencil approximation + # See: https://en.wikipedia.org/wiki/Five-point_stencil#1D_first_derivative + approx_g = (4 * approx_g1 - approx_g2) / 3 + assert_allclose(g, approx_g, rtol=1e-2, atol=1e-8) + + # 2. Check hessp numerically along the second direction of the gradient + vector = np.zeros_like(g) + vector[1] = 1 + hess_col = hessp(vector) + # Computation of the Hessian is particularly fragile to numerical errors when doing + # simple finite differences. 
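# --- Editor's aside (illustrative sketch, not part of the scikit-learn file above) ---
# The combination used above, approx_g = (4 * approx_g1 - approx_g2) / 3, is the
# standard five-point stencil: combining a central difference with step h and one
# with step 2*h cancels the O(h^2) error term and leaves an O(h^4) approximation.
# A quick standalone sanity check on f(x) = exp(x) at x = 0 (true derivative 1.0):
#
#     import numpy as np
#     f, x0, h = np.exp, 0.0, 1e-3
#     g1 = (f(x0 + h) - f(x0 - h)) / (2 * h)          # central difference, O(h^2)
#     g2 = (f(x0 + 2 * h) - f(x0 - 2 * h)) / (4 * h)  # same, with doubled step
#     assert abs((4 * g1 - g2) / 3 - 1.0) < 1e-10     # five-point stencil, O(h^4)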
Here we compute the grad along a path in the direction + # of the vector and then use a least-square regression to estimate the slope + eps = 1e-3 + d_x = np.linspace(-eps, eps, 30) + d_grad = np.array( + [ + loss.gradient( + coef + t * vector, + X, + y, + sample_weight=sample_weight, + l2_reg_strength=l2_reg_strength, + ) + for t in d_x + ] + ) + d_grad -= d_grad.mean(axis=0) + approx_hess_col = linalg.lstsq(d_x[:, np.newaxis], d_grad)[0].ravel() + assert_allclose(approx_hess_col, hess_col, rtol=1e-3) + + +@pytest.mark.parametrize("fit_intercept", [False, True]) +def test_multinomial_coef_shape(fit_intercept, global_random_seed): + """Test that multinomial LinearModelLoss respects shape of coef.""" + loss = LinearModelLoss(base_loss=HalfMultinomialLoss(), fit_intercept=fit_intercept) + n_samples, n_features = 10, 5 + X, y, coef = random_X_y_coef( + linear_model_loss=loss, + n_samples=n_samples, + n_features=n_features, + seed=global_random_seed, + ) + s = np.random.RandomState(global_random_seed).randn(*coef.shape) + + l, g = loss.loss_gradient(coef, X, y) + g1 = loss.gradient(coef, X, y) + g2, hessp = loss.gradient_hessian_product(coef, X, y) + h = hessp(s) + assert g.shape == coef.shape + assert h.shape == coef.shape + assert_allclose(g, g1) + assert_allclose(g, g2) + g3, hess, _ = loss.gradient_hessian(coef, X, y) + assert g3.shape == coef.shape + # But full hessian is always 2d. + assert hess.shape == (coef.size, coef.size) + + coef_r = coef.ravel(order="F") + s_r = s.ravel(order="F") + l_r, g_r = loss.loss_gradient(coef_r, X, y) + g1_r = loss.gradient(coef_r, X, y) + g2_r, hessp_r = loss.gradient_hessian_product(coef_r, X, y) + h_r = hessp_r(s_r) + assert g_r.shape == coef_r.shape + assert h_r.shape == coef_r.shape + assert_allclose(g_r, g1_r) + assert_allclose(g_r, g2_r) + + assert_allclose(g, g_r.reshape(loss.base_loss.n_classes, -1, order="F")) + assert_allclose(h, h_r.reshape(loss.base_loss.n_classes, -1, order="F")) + + +@pytest.mark.parametrize("sample_weight", [None, "range"]) +def test_multinomial_hessian_3_classes(sample_weight, global_random_seed): + """Test multinomial hessian for 3 classes and 2 points. + + For n_classes = 3 and n_samples = 2, we have + p0 = [p0_0, p0_1] + p1 = [p1_0, p1_1] + p2 = [p2_0, p2_1] + and with 2 x 2 diagonal subblocks + H = [p0 * (1-p0), -p0 * p1, -p0 * p2] + [ -p0 * p1, p1 * (1-p1), -p1 * p2] + [ -p0 * p2, -p1 * p2, p2 * (1-p2)] + hess = X' H X + """ + n_samples, n_features, n_classes = 2, 5, 3 + loss = LinearModelLoss( + base_loss=HalfMultinomialLoss(n_classes=n_classes), fit_intercept=False + ) + X, y, coef = random_X_y_coef( + linear_model_loss=loss, + n_samples=n_samples, + n_features=n_features, + seed=global_random_seed, + ) + coef = coef.ravel(order="F") # this is important only for multinomial loss + + if sample_weight == "range": + sample_weight = np.linspace(1, y.shape[0], num=y.shape[0]) + + grad, hess, _ = loss.gradient_hessian( + coef, + X, + y, + sample_weight=sample_weight, + l2_reg_strength=0, + ) + # Hessian must be a symmetrix matrix. 
+ assert_allclose(hess, hess.T) + + weights, intercept, raw_prediction = loss.weight_intercept_raw(coef, X) + grad_pointwise, proba = loss.base_loss.gradient_proba( + y_true=y, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + ) + p0d, p1d, p2d, oned = ( + np.diag(proba[:, 0]), + np.diag(proba[:, 1]), + np.diag(proba[:, 2]), + np.diag(np.ones(2)), + ) + h = np.block( + [ + [p0d * (oned - p0d), -p0d * p1d, -p0d * p2d], + [-p0d * p1d, p1d * (oned - p1d), -p1d * p2d], + [-p0d * p2d, -p1d * p2d, p2d * (oned - p2d)], + ] + ) + h = h.reshape((n_classes, n_samples, n_classes, n_samples)) + if sample_weight is None: + h /= n_samples + else: + h *= sample_weight / np.sum(sample_weight) + # hess_expected.shape = (n_features, n_classes, n_classes, n_features) + hess_expected = np.einsum("ij, mini, ik->jmnk", X, h, X) + hess_expected = np.moveaxis(hess_expected, 2, 3) + hess_expected = hess_expected.reshape( + n_classes * n_features, n_classes * n_features, order="C" + ) + assert_allclose(hess_expected, hess_expected.T) + assert_allclose(hess, hess_expected) + + +def test_linear_loss_gradient_hessian_raises_wrong_out_parameters(): + """Test that wrong gradient_out and hessian_out raises errors.""" + n_samples, n_features, n_classes = 5, 2, 3 + loss = LinearModelLoss(base_loss=HalfBinomialLoss(), fit_intercept=False) + X = np.ones((n_samples, n_features)) + y = np.ones(n_samples) + coef = loss.init_zero_coef(X) + gradient_out = np.zeros(1) + with pytest.raises( + ValueError, match="gradient_out is required to have shape coef.shape" + ): + loss.gradient_hessian( + coef=coef, + X=X, + y=y, + gradient_out=gradient_out, + hessian_out=None, + ) + hessian_out = np.zeros(1) + with pytest.raises(ValueError, match="hessian_out is required to have shape"): + loss.gradient_hessian( + coef=coef, + X=X, + y=y, + gradient_out=None, + hessian_out=hessian_out, + ) + + loss = LinearModelLoss(base_loss=HalfMultinomialLoss(), fit_intercept=False) + coef = loss.init_zero_coef(X) + gradient_out = np.zeros((2 * n_classes, n_features))[::2] + with pytest.raises(ValueError, match="gradient_out must be F-contiguous"): + loss.gradient_hessian( + coef=coef, + X=X, + y=y, + gradient_out=gradient_out, + ) + hessian_out = np.zeros((2 * n_classes * n_features, n_classes * n_features))[::2] + with pytest.raises(ValueError, match="hessian_out must be contiguous"): + loss.gradient_hessian( + coef=coef, + X=X, + y=y, + gradient_out=None, + hessian_out=hessian_out, + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_logistic.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_logistic.py new file mode 100644 index 0000000000000000000000000000000000000000..007c900dd36776ba4bd3d5731f5f40cf882028b3 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_logistic.py @@ -0,0 +1,2471 @@ +import itertools +import os +import warnings +from functools import partial + +import numpy as np +import pytest +from numpy.testing import ( + assert_allclose, + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) +from scipy import sparse +from scipy.linalg import LinAlgWarning, svd + +from sklearn import config_context +from sklearn._loss import HalfMultinomialLoss +from sklearn.base import clone +from sklearn.datasets import load_iris, make_classification, make_low_rank_matrix +from sklearn.exceptions import ConvergenceWarning +from sklearn.linear_model import SGDClassifier +from sklearn.linear_model._logistic import ( + 
LogisticRegression as LogisticRegressionDefault, +) +from sklearn.linear_model._logistic import ( + LogisticRegressionCV as LogisticRegressionCVDefault, +) +from sklearn.linear_model._logistic import ( + _log_reg_scoring_path, + _logistic_regression_path, +) +from sklearn.metrics import get_scorer, log_loss +from sklearn.model_selection import ( + GridSearchCV, + LeaveOneGroupOut, + StratifiedKFold, + cross_val_score, + train_test_split, +) +from sklearn.multiclass import OneVsRestClassifier +from sklearn.preprocessing import LabelEncoder, StandardScaler, scale +from sklearn.svm import l1_min_c +from sklearn.utils import compute_class_weight, shuffle +from sklearn.utils._testing import ignore_warnings, skip_if_no_parallel +from sklearn.utils.fixes import _IS_32BIT, COO_CONTAINERS, CSR_CONTAINERS + +pytestmark = pytest.mark.filterwarnings( + "error::sklearn.exceptions.ConvergenceWarning:sklearn.*" +) +# Fixing random_state helps prevent ConvergenceWarnings +LogisticRegression = partial(LogisticRegressionDefault, random_state=0) +LogisticRegressionCV = partial(LogisticRegressionCVDefault, random_state=0) + + +SOLVERS = ("lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga") +X = [[-1, 0], [0, 1], [1, 1]] +Y1 = [0, 1, 1] +Y2 = [2, 1, 0] +iris = load_iris() + + +def check_predictions(clf, X, y): + """Check that the model is able to fit the classification data""" + n_samples = len(y) + classes = np.unique(y) + n_classes = classes.shape[0] + + predicted = clf.fit(X, y).predict(X) + assert_array_equal(clf.classes_, classes) + + assert predicted.shape == (n_samples,) + assert_array_equal(predicted, y) + + probabilities = clf.predict_proba(X) + assert probabilities.shape == (n_samples, n_classes) + assert_array_almost_equal(probabilities.sum(axis=1), np.ones(n_samples)) + assert_array_equal(probabilities.argmax(axis=1), y) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_predict_2_classes(csr_container): + # Simple sanity check on a 2 classes dataset + # Make sure it predicts the correct result on simple datasets. 
+ check_predictions(LogisticRegression(random_state=0), X, Y1) + check_predictions(LogisticRegression(random_state=0), csr_container(X), Y1) + + check_predictions(LogisticRegression(C=100, random_state=0), X, Y1) + check_predictions(LogisticRegression(C=100, random_state=0), csr_container(X), Y1) + + check_predictions(LogisticRegression(fit_intercept=False, random_state=0), X, Y1) + check_predictions( + LogisticRegression(fit_intercept=False, random_state=0), csr_container(X), Y1 + ) + + +def test_logistic_cv_mock_scorer(): + class MockScorer: + def __init__(self): + self.calls = 0 + self.scores = [0.1, 0.4, 0.8, 0.5] + + def __call__(self, model, X, y, sample_weight=None): + score = self.scores[self.calls % len(self.scores)] + self.calls += 1 + return score + + mock_scorer = MockScorer() + Cs = [1, 2, 3, 4] + cv = 2 + + lr = LogisticRegressionCV(Cs=Cs, scoring=mock_scorer, cv=cv) + X, y = make_classification(random_state=0) + lr.fit(X, y) + + # Cs[2] has the highest score (0.8) from MockScorer + assert lr.C_[0] == Cs[2] + + # scorer called 8 times (cv*len(Cs)) + assert mock_scorer.calls == cv * len(Cs) + + # reset mock_scorer + mock_scorer.calls = 0 + custom_score = lr.score(X, lr.predict(X)) + + assert custom_score == mock_scorer.scores[0] + assert mock_scorer.calls == 1 + + +@skip_if_no_parallel +def test_lr_liblinear_warning(): + X, y = make_classification(random_state=0) + + lr = LogisticRegression(solver="liblinear", n_jobs=2) + warning_message = ( + "'n_jobs' > 1 does not have any effect when" + " 'solver' is set to 'liblinear'. Got 'n_jobs'" + " = 2." + ) + with pytest.warns(UserWarning, match=warning_message): + lr.fit(X, y) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_predict_3_classes(csr_container): + check_predictions(LogisticRegression(C=10), X, Y2) + check_predictions(LogisticRegression(C=10), csr_container(X), Y2) + + +# TODO(1.8): remove filterwarnings after the deprecation of multi_class +@pytest.mark.filterwarnings("ignore:.*'multi_class' was deprecated.*:FutureWarning") +@pytest.mark.filterwarnings( + "ignore:.*'liblinear' solver for multiclass classification is deprecated.*" +) +@pytest.mark.parametrize( + "clf", + [ + LogisticRegression(C=len(iris.data), solver="liblinear", multi_class="ovr"), + LogisticRegression(C=len(iris.data), solver="lbfgs"), + LogisticRegression(C=len(iris.data), solver="newton-cg"), + LogisticRegression( + C=len(iris.data), solver="sag", tol=1e-2, multi_class="ovr", random_state=42 + ), + LogisticRegression( + C=len(iris.data), + solver="saga", + tol=1e-2, + multi_class="ovr", + random_state=42, + ), + LogisticRegression(C=len(iris.data), solver="newton-cholesky"), + ], +) +def test_predict_iris(clf): + """Test logistic regression with the iris dataset. + + Test that both multinomial and OvR solvers handle multiclass data correctly and + give good accuracy score (>0.95) for the training data. 
+ """ + n_samples, n_features = iris.data.shape + target = iris.target_names[iris.target] + + if clf.solver == "lbfgs": + # lbfgs has convergence issues on the iris data with its default max_iter=100 + with warnings.catch_warnings(): + warnings.simplefilter("ignore", ConvergenceWarning) + clf.fit(iris.data, target) + else: + clf.fit(iris.data, target) + assert_array_equal(np.unique(target), clf.classes_) + + pred = clf.predict(iris.data) + assert np.mean(pred == target) > 0.95 + + probabilities = clf.predict_proba(iris.data) + assert_allclose(probabilities.sum(axis=1), np.ones(n_samples)) + + pred = iris.target_names[probabilities.argmax(axis=1)] + assert np.mean(pred == target) > 0.95 + + +# TODO(1.8): remove filterwarnings after the deprecation of multi_class +@pytest.mark.filterwarnings("ignore:.*'multi_class' was deprecated.*:FutureWarning") +@pytest.mark.parametrize("LR", [LogisticRegression, LogisticRegressionCV]) +def test_check_solver_option(LR): + X, y = iris.data, iris.target + + # only 'liblinear' solver + for solver in ["liblinear"]: + msg = f"Solver {solver} does not support a multinomial backend." + lr = LR(solver=solver, multi_class="multinomial") + with pytest.raises(ValueError, match=msg): + lr.fit(X, y) + + # all solvers except 'liblinear' and 'saga' + for solver in ["lbfgs", "newton-cg", "newton-cholesky", "sag"]: + msg = "Solver %s supports only 'l2' or None penalties," % solver + lr = LR(solver=solver, penalty="l1", multi_class="ovr") + with pytest.raises(ValueError, match=msg): + lr.fit(X, y) + for solver in ["lbfgs", "newton-cg", "newton-cholesky", "sag", "saga"]: + msg = "Solver %s supports only dual=False, got dual=True" % solver + lr = LR(solver=solver, dual=True, multi_class="ovr") + with pytest.raises(ValueError, match=msg): + lr.fit(X, y) + + # only saga supports elasticnet. We only test for liblinear because the + # error is raised before for the other solvers (solver %s supports only l2 + # penalties) + for solver in ["liblinear"]: + msg = f"Only 'saga' solver supports elasticnet penalty, got solver={solver}." + lr = LR(solver=solver, penalty="elasticnet") + with pytest.raises(ValueError, match=msg): + lr.fit(X, y) + + # liblinear does not support penalty='none' + # (LogisticRegressionCV does not supports penalty='none' at all) + if LR is LogisticRegression: + msg = "penalty=None is not supported for the liblinear solver" + lr = LR(penalty=None, solver="liblinear") + with pytest.raises(ValueError, match=msg): + lr.fit(X, y) + + +@pytest.mark.parametrize("LR", [LogisticRegression, LogisticRegressionCV]) +def test_elasticnet_l1_ratio_err_helpful(LR): + # Check that an informative error message is raised when penalty="elasticnet" + # but l1_ratio is not specified. + model = LR(penalty="elasticnet", solver="saga") + with pytest.raises(ValueError, match=r".*l1_ratio.*"): + model.fit(np.array([[1, 2], [3, 4]]), np.array([0, 1])) + + +# TODO(1.8): remove whole test with deprecation of multi_class +@pytest.mark.filterwarnings("ignore:.*'multi_class' was deprecated.*:FutureWarning") +@pytest.mark.parametrize("solver", ["lbfgs", "newton-cg", "sag", "saga"]) +def test_multinomial_binary(solver): + # Test multinomial LR on a binary problem. 
+ target = (iris.target > 0).astype(np.intp) + target = np.array(["setosa", "not-setosa"])[target] + + clf = LogisticRegression( + solver=solver, multi_class="multinomial", random_state=42, max_iter=2000 + ) + clf.fit(iris.data, target) + + assert clf.coef_.shape == (1, iris.data.shape[1]) + assert clf.intercept_.shape == (1,) + assert_array_equal(clf.predict(iris.data), target) + + mlr = LogisticRegression( + solver=solver, multi_class="multinomial", random_state=42, fit_intercept=False + ) + mlr.fit(iris.data, target) + pred = clf.classes_[np.argmax(clf.predict_log_proba(iris.data), axis=1)] + assert np.mean(pred == target) > 0.9 + + +# TODO(1.8): remove filterwarnings after the deprecation of multi_class +# Maybe even remove this whole test as correctness of multinomial loss is tested +# elsewhere. +@pytest.mark.filterwarnings("ignore:.*'multi_class' was deprecated.*:FutureWarning") +def test_multinomial_binary_probabilities(global_random_seed): + # Test multinomial LR gives expected probabilities based on the + # decision function, for a binary problem. + X, y = make_classification(random_state=global_random_seed) + clf = LogisticRegression( + multi_class="multinomial", + solver="saga", + tol=1e-3, + random_state=global_random_seed, + ) + clf.fit(X, y) + + decision = clf.decision_function(X) + proba = clf.predict_proba(X) + + expected_proba_class_1 = np.exp(decision) / (np.exp(decision) + np.exp(-decision)) + expected_proba = np.c_[1 - expected_proba_class_1, expected_proba_class_1] + + assert_almost_equal(proba, expected_proba) + + +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +def test_sparsify(coo_container): + # Test sparsify and densify members. + n_samples, n_features = iris.data.shape + target = iris.target_names[iris.target] + X = scale(iris.data) + clf = LogisticRegression(random_state=0).fit(X, target) + + pred_d_d = clf.decision_function(X) + + clf.sparsify() + assert sparse.issparse(clf.coef_) + pred_s_d = clf.decision_function(X) + + sp_data = coo_container(X) + pred_s_s = clf.decision_function(sp_data) + + clf.densify() + pred_d_s = clf.decision_function(sp_data) + + assert_array_almost_equal(pred_d_d, pred_s_d) + assert_array_almost_equal(pred_d_d, pred_s_s) + assert_array_almost_equal(pred_d_d, pred_d_s) + + +def test_inconsistent_input(): + # Test that an exception is raised on inconsistent input + rng = np.random.RandomState(0) + X_ = rng.random_sample((5, 10)) + y_ = np.ones(X_.shape[0]) + y_[0] = 0 + + clf = LogisticRegression(random_state=0) + + # Wrong dimensions for training data + y_wrong = y_[:-1] + + with pytest.raises(ValueError): + clf.fit(X, y_wrong) + + # Wrong dimensions for test data + with pytest.raises(ValueError): + clf.fit(X_, y_).predict(rng.random_sample((3, 12))) + + +def test_write_parameters(): + # Test that we can write to coef_ and intercept_ + clf = LogisticRegression(random_state=0) + clf.fit(X, Y1) + clf.coef_[:] = 0 + clf.intercept_[:] = 0 + assert_array_almost_equal(clf.decision_function(X), 0) + + +def test_nan(): + # Test proper NaN handling. + # Regression test for Issue #252: fit used to go into an infinite loop. 
+ Xnan = np.array(X, dtype=np.float64) + Xnan[0, 1] = np.nan + logistic = LogisticRegression(random_state=0) + + with pytest.raises(ValueError): + logistic.fit(Xnan, Y1) + + +def test_consistency_path(): + # Test that the path algorithm is consistent + rng = np.random.RandomState(0) + X = np.concatenate((rng.randn(100, 2) + [1, 1], rng.randn(100, 2))) + y = [1] * 100 + [-1] * 100 + Cs = np.logspace(0, 4, 10) + + f = ignore_warnings + # can't test with fit_intercept=True since LIBLINEAR + # penalizes the intercept + for solver in ["sag", "saga"]: + coefs, Cs, _ = f(_logistic_regression_path)( + X, + y, + Cs=Cs, + fit_intercept=False, + tol=1e-5, + solver=solver, + max_iter=1000, + random_state=0, + ) + for i, C in enumerate(Cs): + lr = LogisticRegression( + C=C, + fit_intercept=False, + tol=1e-5, + solver=solver, + random_state=0, + max_iter=1000, + ) + lr.fit(X, y) + lr_coef = lr.coef_.ravel() + assert_array_almost_equal( + lr_coef, coefs[i], decimal=4, err_msg="with solver = %s" % solver + ) + + # test for fit_intercept=True + for solver in ("lbfgs", "newton-cg", "newton-cholesky", "liblinear", "sag", "saga"): + Cs = [1e3] + coefs, Cs, _ = f(_logistic_regression_path)( + X, + y, + Cs=Cs, + tol=1e-6, + solver=solver, + intercept_scaling=10000.0, + random_state=0, + ) + lr = LogisticRegression( + C=Cs[0], + tol=1e-6, + intercept_scaling=10000.0, + random_state=0, + solver=solver, + ) + lr.fit(X, y) + lr_coef = np.concatenate([lr.coef_.ravel(), lr.intercept_]) + assert_array_almost_equal( + lr_coef, coefs[0], decimal=4, err_msg="with solver = %s" % solver + ) + + +def test_logistic_regression_path_convergence_fail(): + rng = np.random.RandomState(0) + X = np.concatenate((rng.randn(100, 2) + [1, 1], rng.randn(100, 2))) + y = [1] * 100 + [-1] * 100 + Cs = [1e3] + + # Check that the convergence message points to both a model agnostic + # advice (scaling the data) and to the logistic regression specific + # documentation that includes hints on the solver configuration. 
+ with pytest.warns(ConvergenceWarning) as record: + _logistic_regression_path( + X, y, Cs=Cs, tol=0.0, max_iter=1, random_state=0, verbose=0 + ) + + assert len(record) == 1 + warn_msg = record[0].message.args[0] + assert "lbfgs failed to converge after 1 iteration(s)" in warn_msg + assert "Increase the number of iterations" in warn_msg + assert "scale the data" in warn_msg + assert "linear_model.html#logistic-regression" in warn_msg + + +def test_liblinear_dual_random_state(): + # random_state is relevant for liblinear solver only if dual=True + X, y = make_classification(n_samples=20, random_state=0) + lr1 = LogisticRegression( + random_state=0, + dual=True, + tol=1e-3, + solver="liblinear", + ) + lr1.fit(X, y) + lr2 = LogisticRegression( + random_state=0, + dual=True, + tol=1e-3, + solver="liblinear", + ) + lr2.fit(X, y) + lr3 = LogisticRegression( + random_state=8, + dual=True, + tol=1e-3, + solver="liblinear", + ) + lr3.fit(X, y) + + # same result for same random state + assert_array_almost_equal(lr1.coef_, lr2.coef_) + # different results for different random states + msg = "Arrays are not almost equal to 6 decimals" + with pytest.raises(AssertionError, match=msg): + assert_array_almost_equal(lr1.coef_, lr3.coef_) + + +def test_logistic_cv(): + # test for LogisticRegressionCV object + n_samples, n_features = 50, 5 + rng = np.random.RandomState(0) + X_ref = rng.randn(n_samples, n_features) + y = np.sign(X_ref.dot(5 * rng.randn(n_features))) + X_ref -= X_ref.mean() + X_ref /= X_ref.std() + lr_cv = LogisticRegressionCV( + Cs=[1.0], fit_intercept=False, solver="liblinear", cv=3 + ) + lr_cv.fit(X_ref, y) + lr = LogisticRegression(C=1.0, fit_intercept=False, solver="liblinear") + lr.fit(X_ref, y) + assert_array_almost_equal(lr.coef_, lr_cv.coef_) + + assert_array_equal(lr_cv.coef_.shape, (1, n_features)) + assert_array_equal(lr_cv.classes_, [-1, 1]) + assert len(lr_cv.classes_) == 2 + + coefs_paths = np.asarray(list(lr_cv.coefs_paths_.values())) + assert_array_equal(coefs_paths.shape, (1, 3, 1, n_features)) + assert_array_equal(lr_cv.Cs_.shape, (1,)) + scores = np.asarray(list(lr_cv.scores_.values())) + assert_array_equal(scores.shape, (1, 3, 1)) + + +@pytest.mark.parametrize( + "scoring, multiclass_agg_list", + [ + ("accuracy", [""]), + ("precision", ["_macro", "_weighted"]), + # no need to test for micro averaging because it + # is the same as accuracy for f1, precision, + # and recall (see https://github.com/ + # scikit-learn/scikit-learn/pull/ + # 11578#discussion_r203250062) + ("f1", ["_macro", "_weighted"]), + ("neg_log_loss", [""]), + ("recall", ["_macro", "_weighted"]), + ], +) +def test_logistic_cv_multinomial_score(scoring, multiclass_agg_list): + # test that LogisticRegressionCV uses the right score to compute its + # cross-validation scores when using a multinomial scoring + # see https://github.com/scikit-learn/scikit-learn/issues/8720 + X, y = make_classification( + n_samples=100, random_state=0, n_classes=3, n_informative=6 + ) + train, test = np.arange(80), np.arange(80, 100) + lr = LogisticRegression(C=1.0) + # we use lbfgs to support multinomial + params = lr.get_params() + # we store the params to set them further in _log_reg_scoring_path + for key in ["C", "n_jobs", "warm_start"]: + del params[key] + lr.fit(X[train], y[train]) + for averaging in multiclass_agg_list: + scorer = get_scorer(scoring + averaging) + assert_array_almost_equal( + _log_reg_scoring_path( + X, + y, + train, + test, + Cs=[1.0], + scoring=scorer, + pos_class=None, + max_squared_sum=None, + 
sample_weight=None, + score_params=None, + **(params | {"multi_class": "multinomial"}), + )[2][0], + scorer(lr, X[test], y[test]), + ) + + +def test_multinomial_logistic_regression_string_inputs(): + # Test with string labels for LogisticRegression(CV) + n_samples, n_features, n_classes = 50, 5, 3 + X_ref, y = make_classification( + n_samples=n_samples, + n_features=n_features, + n_classes=n_classes, + n_informative=3, + random_state=0, + ) + y_str = LabelEncoder().fit(["bar", "baz", "foo"]).inverse_transform(y) + # For numerical labels, let y values be taken from set (-1, 0, 1) + y = np.array(y) - 1 + # Test for string labels + lr = LogisticRegression() + lr_cv = LogisticRegressionCV(Cs=3) + lr_str = LogisticRegression() + lr_cv_str = LogisticRegressionCV(Cs=3) + + lr.fit(X_ref, y) + lr_cv.fit(X_ref, y) + lr_str.fit(X_ref, y_str) + lr_cv_str.fit(X_ref, y_str) + + assert_array_almost_equal(lr.coef_, lr_str.coef_) + assert sorted(lr_str.classes_) == ["bar", "baz", "foo"] + assert_array_almost_equal(lr_cv.coef_, lr_cv_str.coef_) + assert sorted(lr_str.classes_) == ["bar", "baz", "foo"] + assert sorted(lr_cv_str.classes_) == ["bar", "baz", "foo"] + + # The predictions should be in original labels + assert sorted(np.unique(lr_str.predict(X_ref))) == ["bar", "baz", "foo"] + assert sorted(np.unique(lr_cv_str.predict(X_ref))) == ["bar", "baz", "foo"] + + # Make sure class weights can be given with string labels + lr_cv_str = LogisticRegression(class_weight={"bar": 1, "baz": 2, "foo": 0}).fit( + X_ref, y_str + ) + assert sorted(np.unique(lr_cv_str.predict(X_ref))) == ["bar", "baz"] + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_logistic_cv_sparse(csr_container): + X, y = make_classification(n_samples=50, n_features=5, random_state=0) + X[X < 1.0] = 0.0 + csr = csr_container(X) + + clf = LogisticRegressionCV() + clf.fit(X, y) + clfs = LogisticRegressionCV() + clfs.fit(csr, y) + assert_array_almost_equal(clfs.coef_, clf.coef_) + assert_array_almost_equal(clfs.intercept_, clf.intercept_) + assert clfs.C_ == clf.C_ + + +# TODO(1.8): remove filterwarnings after the deprecation of multi_class +# Best remove this whole test. +@pytest.mark.filterwarnings("ignore:.*'multi_class' was deprecated.*:FutureWarning") +def test_ovr_multinomial_iris(): + # Test that OvR and multinomial are correct using the iris dataset. + train, target = iris.data, iris.target + n_samples, n_features = train.shape + + # The cv indices from stratified kfold (where stratification is done based + # on the fine-grained iris classes, i.e, before the classes 0 and 1 are + # conflated) is used for both clf and clf1 + n_cv = 2 + cv = StratifiedKFold(n_cv) + precomputed_folds = list(cv.split(train, target)) + + # Train clf on the original dataset where classes 0 and 1 are separated + clf = LogisticRegressionCV(cv=precomputed_folds, multi_class="ovr") + clf.fit(train, target) + + # Conflate classes 0 and 1 and train clf1 on this modified dataset + clf1 = LogisticRegressionCV(cv=precomputed_folds, multi_class="ovr") + target_copy = target.copy() + target_copy[target_copy == 0] = 1 + clf1.fit(train, target_copy) + + # Ensure that what OvR learns for class2 is same regardless of whether + # classes 0 and 1 are separated or not + assert_allclose(clf.scores_[2], clf1.scores_[2]) + assert_allclose(clf.intercept_[2:], clf1.intercept_) + assert_allclose(clf.coef_[2][np.newaxis, :], clf1.coef_) + + # Test the shape of various attributes. 
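+    # (Hedged restatement of the layout checked below: with the default grid of
+    # 10 Cs values, each per-class entry of `coefs_paths_` is laid out as
+    # (n_folds, n_Cs, n_features + 1), the trailing +1 being the intercept
+    # column, while each per-class entry of `scores_` is (n_folds, n_Cs).)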
+ assert clf.coef_.shape == (3, n_features) + assert_array_equal(clf.classes_, [0, 1, 2]) + coefs_paths = np.asarray(list(clf.coefs_paths_.values())) + assert coefs_paths.shape == (3, n_cv, 10, n_features + 1) + assert clf.Cs_.shape == (10,) + scores = np.asarray(list(clf.scores_.values())) + assert scores.shape == (3, n_cv, 10) + + # Test that for the iris data multinomial gives a better accuracy than OvR + for solver in ["lbfgs", "newton-cg", "sag", "saga"]: + max_iter = 500 if solver in ["sag", "saga"] else 30 + clf_multi = LogisticRegressionCV( + solver=solver, + max_iter=max_iter, + random_state=42, + tol=1e-3 if solver in ["sag", "saga"] else 1e-2, + cv=2, + ) + if solver == "lbfgs": + # lbfgs requires scaling to avoid convergence warnings + train = scale(train) + + clf_multi.fit(train, target) + multi_score = clf_multi.score(train, target) + ovr_score = clf.score(train, target) + assert multi_score > ovr_score + + # Test attributes of LogisticRegressionCV + assert clf.coef_.shape == clf_multi.coef_.shape + assert_array_equal(clf_multi.classes_, [0, 1, 2]) + coefs_paths = np.asarray(list(clf_multi.coefs_paths_.values())) + assert coefs_paths.shape == (3, n_cv, 10, n_features + 1) + assert clf_multi.Cs_.shape == (10,) + scores = np.asarray(list(clf_multi.scores_.values())) + assert scores.shape == (3, n_cv, 10) + + +def test_logistic_regression_solvers(): + """Test solvers converge to the same result.""" + X, y = make_classification(n_features=10, n_informative=5, random_state=0) + + params = dict(fit_intercept=False, random_state=42) + + regressors = { + solver: LogisticRegression(solver=solver, **params).fit(X, y) + for solver in SOLVERS + } + + for solver_1, solver_2 in itertools.combinations(regressors, r=2): + assert_array_almost_equal( + regressors[solver_1].coef_, regressors[solver_2].coef_, decimal=3 + ) + + +# TODO(1.8): remove filterwarnings after the deprecation of multi_class +@pytest.mark.filterwarnings("ignore:.*'multi_class' was deprecated.*:FutureWarning") +@pytest.mark.parametrize("fit_intercept", [False, True]) +def test_logistic_regression_solvers_multiclass(fit_intercept): + """Test solvers converge to the same result for multiclass problems.""" + X, y = make_classification( + n_samples=20, n_features=20, n_informative=10, n_classes=3, random_state=0 + ) + tol = 1e-8 + params = dict(fit_intercept=fit_intercept, tol=tol, random_state=42) + + # Override max iteration count for specific solvers to allow for + # proper convergence. + solver_max_iter = {"lbfgs": 200, "sag": 10_000, "saga": 10_000} + + regressors = { + solver: LogisticRegression( + solver=solver, max_iter=solver_max_iter.get(solver, 100), **params + ).fit(X, y) + for solver in set(SOLVERS) - set(["liblinear"]) + } + + for solver_1, solver_2 in itertools.combinations(regressors, r=2): + assert_allclose( + regressors[solver_1].coef_, + regressors[solver_2].coef_, + rtol=5e-3 if (solver_1 == "saga" or solver_2 == "saga") else 1e-3, + err_msg=f"{solver_1} vs {solver_2}", + ) + if fit_intercept: + assert_allclose( + regressors[solver_1].intercept_, + regressors[solver_2].intercept_, + rtol=5e-3 if (solver_1 == "saga" or solver_2 == "saga") else 1e-3, + err_msg=f"{solver_1} vs {solver_2}", + ) + + +@pytest.mark.parametrize("fit_intercept", [False, True]) +def test_logistic_regression_solvers_multiclass_unpenalized( + fit_intercept, global_random_seed +): + """Test and compare solver results for unpenalized multinomial multiclass.""" + # We want to avoid perfect separation. 
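+    # (Hedged rationale: with C=np.inf there is no penalty, so a perfectly
+    # separable sample would let the likelihood be improved indefinitely by
+    # rescaling the coefficients and no finite optimum would exist for the
+    # solvers to agree on. The data below is therefore drawn from an explicit
+    # multinomial model with overlapping classes.)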
+ n_samples, n_features, n_classes = 100, 4, 3 + rng = np.random.RandomState(global_random_seed) + X = make_low_rank_matrix( + n_samples=n_samples, + n_features=n_features + fit_intercept, + effective_rank=n_features + fit_intercept, + tail_strength=0.1, + random_state=rng, + ) + if fit_intercept: + X[:, -1] = 1 + U, s, Vt = svd(X) + assert np.all(s > 1e-3) # to be sure that X is not singular + assert np.max(s) / np.min(s) < 100 # condition number of X + if fit_intercept: + X = X[:, :-1] + coef = rng.uniform(low=1, high=3, size=n_features * n_classes) + coef = coef.reshape(n_classes, n_features) + intercept = rng.uniform(low=-1, high=1, size=n_classes) * fit_intercept + raw_prediction = X @ coef.T + intercept + + loss = HalfMultinomialLoss(n_classes=n_classes) + proba = loss.link.inverse(raw_prediction) + # Only newer numpy version (1.22) support more dimensions on pvals. + y = np.zeros(n_samples) + for i in range(n_samples): + y[i] = np.argwhere(rng.multinomial(n=1, pvals=proba[i, :]))[0, 0] + + tol = 1e-9 + params = dict(fit_intercept=fit_intercept, random_state=42) + solver_max_iter = {"lbfgs": 200, "sag": 10_000, "saga": 10_000} + solver_tol = {"sag": 1e-8, "saga": 1e-8} + regressors = { + solver: LogisticRegression( + C=np.inf, + solver=solver, + tol=solver_tol.get(solver, tol), + max_iter=solver_max_iter.get(solver, 100), + **params, + ).fit(X, y) + for solver in set(SOLVERS) - set(["liblinear"]) + } + for solver in regressors.keys(): + # See the docstring of test_multinomial_identifiability_on_iris for reference. + assert_allclose( + regressors[solver].coef_.sum(axis=0), 0, atol=1e-10, err_msg=solver + ) + + for solver_1, solver_2 in itertools.combinations(regressors, r=2): + assert_allclose( + regressors[solver_1].coef_, + regressors[solver_2].coef_, + rtol=5e-3 if (solver_1 == "saga" or solver_2 == "saga") else 2e-3, + err_msg=f"{solver_1} vs {solver_2}", + ) + if fit_intercept: + assert_allclose( + regressors[solver_1].intercept_, + regressors[solver_2].intercept_, + rtol=5e-3 if (solver_1 == "saga" or solver_2 == "saga") else 1e-3, + err_msg=f"{solver_1} vs {solver_2}", + ) + + +@pytest.mark.parametrize("weight", [{0: 0.1, 1: 0.2}, {0: 0.1, 1: 0.2, 2: 0.5}]) +@pytest.mark.parametrize("class_weight", ["weight", "balanced"]) +def test_logistic_regressioncv_class_weights(weight, class_weight, global_random_seed): + """Test class_weight for LogisticRegressionCV.""" + n_classes = len(weight) + if class_weight == "weight": + class_weight = weight + + X, y = make_classification( + n_samples=30, + n_features=3, + n_repeated=0, + n_informative=3, + n_redundant=0, + n_classes=n_classes, + random_state=global_random_seed, + ) + params = dict( + Cs=1, + fit_intercept=False, + class_weight=class_weight, + tol=1e-8, + ) + clf_lbfgs = LogisticRegressionCV(solver="lbfgs", **params) + + # XXX: lbfgs' line search can fail and cause a ConvergenceWarning for some + # 10% of the random seeds, but only on specific platforms (in particular + # when using Atlas BLAS/LAPACK implementation). Doubling the maxls internal + # parameter of the solver does not help. However this lack of proper + # convergence does not seem to prevent the assertion to pass, so we ignore + # the warning for now. 
+ # See: https://github.com/scikit-learn/scikit-learn/pull/27649 + with ignore_warnings(category=ConvergenceWarning): + clf_lbfgs.fit(X, y) + + for solver in set(SOLVERS) - set(["lbfgs", "liblinear", "newton-cholesky"]): + clf = LogisticRegressionCV(solver=solver, **params) + if solver in ("sag", "saga"): + clf.set_params( + tol=1e-18, max_iter=10000, random_state=global_random_seed + 1 + ) + clf.fit(X, y) + + assert_allclose( + clf.coef_, clf_lbfgs.coef_, rtol=1e-3, err_msg=f"{solver} vs lbfgs" + ) + + +@pytest.mark.parametrize("problem", ("single", "cv")) +@pytest.mark.parametrize( + "solver", ("lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga") +) +def test_logistic_regression_sample_weights(problem, solver, global_random_seed): + n_samples_per_cv_group = 200 + n_cv_groups = 3 + + X, y = make_classification( + n_samples=n_samples_per_cv_group * n_cv_groups, + n_features=5, + n_informative=3, + n_classes=2, + n_redundant=0, + random_state=global_random_seed, + ) + rng = np.random.RandomState(global_random_seed) + sw = np.ones(y.shape[0]) + + kw_weighted = { + "random_state": global_random_seed, + "fit_intercept": False, + "max_iter": 100_000 if solver.startswith("sag") else 1_000, + "tol": 1e-8, + } + kw_repeated = kw_weighted.copy() + sw[:n_samples_per_cv_group] = rng.randint(0, 5, size=n_samples_per_cv_group) + X_repeated = np.repeat(X, sw.astype(int), axis=0) + y_repeated = np.repeat(y, sw.astype(int), axis=0) + + if problem == "single": + LR = LogisticRegression + elif problem == "cv": + LR = LogisticRegressionCV + # We weight the first fold 2 times more. + groups_weighted = np.concatenate( + [ + np.full(n_samples_per_cv_group, 0), + np.full(n_samples_per_cv_group, 1), + np.full(n_samples_per_cv_group, 2), + ] + ) + splits_weighted = list(LeaveOneGroupOut().split(X, groups=groups_weighted)) + kw_weighted.update({"Cs": 100, "cv": splits_weighted}) + + groups_repeated = np.repeat(groups_weighted, sw.astype(int), axis=0) + splits_repeated = list( + LeaveOneGroupOut().split(X_repeated, groups=groups_repeated) + ) + kw_repeated.update({"Cs": 100, "cv": splits_repeated}) + + clf_sw_weighted = LR(solver=solver, **kw_weighted) + clf_sw_repeated = LR(solver=solver, **kw_repeated) + + if solver == "lbfgs": + # lbfgs has convergence issues on the data but this should not impact + # the quality of the results. + with warnings.catch_warnings(): + warnings.simplefilter("ignore", ConvergenceWarning) + clf_sw_weighted.fit(X, y, sample_weight=sw) + clf_sw_repeated.fit(X_repeated, y_repeated) + + else: + clf_sw_weighted.fit(X, y, sample_weight=sw) + clf_sw_repeated.fit(X_repeated, y_repeated) + + if problem == "cv": + assert_allclose(clf_sw_weighted.scores_[1], clf_sw_repeated.scores_[1]) + assert_allclose(clf_sw_weighted.coef_, clf_sw_repeated.coef_, atol=1e-5) + + +@pytest.mark.parametrize( + "solver", ("lbfgs", "newton-cg", "newton-cholesky", "sag", "saga") +) +def test_logistic_regression_solver_class_weights(solver, global_random_seed): + # Test that passing class_weight as [1, 2] is the same as + # passing class weight = [1,1] but adjusting sample weights + # to be 2 for all instances of class 1. 
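+    # Hedged sketch of the equivalence exercised here: for a binary target,
+    # class_weight={0: 1, 1: 2} weights a sample by 1 if y == 0 and by 2 if
+    # y == 1, i.e. exactly the `sample_weight = y + 1` vector built below, e.g.
+    #     np.array([1, 2])[np.array([0, 1, 1])]  # -> array([1, 2, 2])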
+ + X, y = make_classification( + n_samples=300, + n_features=5, + n_informative=3, + n_classes=2, + random_state=global_random_seed, + ) + + sample_weight = y + 1 + + kw_weighted = { + "random_state": global_random_seed, + "fit_intercept": False, + "max_iter": 100_000, + "tol": 1e-8, + } + clf_cw_12 = LogisticRegression( + solver=solver, class_weight={0: 1, 1: 2}, **kw_weighted + ) + clf_cw_12.fit(X, y) + clf_sw_12 = LogisticRegression(solver=solver, **kw_weighted) + clf_sw_12.fit(X, y, sample_weight=sample_weight) + assert_allclose(clf_cw_12.coef_, clf_sw_12.coef_, atol=1e-6) + + +def test_sample_and_class_weight_equivalence_liblinear(global_random_seed): + # Test the above for l1 penalty and l2 penalty with dual=True. + # since the patched liblinear code is different. + + X, y = make_classification( + n_samples=300, + n_features=5, + n_informative=3, + n_classes=2, + random_state=global_random_seed, + ) + + sample_weight = y + 1 + + clf_cw = LogisticRegression( + solver="liblinear", + fit_intercept=False, + class_weight={0: 1, 1: 2}, + penalty="l1", + max_iter=10_000, + tol=1e-12, + random_state=global_random_seed, + ) + clf_cw.fit(X, y) + clf_sw = LogisticRegression( + solver="liblinear", + fit_intercept=False, + penalty="l1", + max_iter=10_000, + tol=1e-12, + random_state=global_random_seed, + ) + clf_sw.fit(X, y, sample_weight) + assert_allclose(clf_cw.coef_, clf_sw.coef_, atol=1e-10) + + clf_cw = LogisticRegression( + solver="liblinear", + fit_intercept=False, + class_weight={0: 1, 1: 2}, + penalty="l2", + max_iter=10_000, + tol=1e-12, + dual=True, + random_state=global_random_seed, + ) + clf_cw.fit(X, y) + clf_sw = LogisticRegression( + solver="liblinear", + fit_intercept=False, + penalty="l2", + max_iter=10_000, + tol=1e-12, + dual=True, + random_state=global_random_seed, + ) + clf_sw.fit(X, y, sample_weight) + assert_allclose(clf_cw.coef_, clf_sw.coef_, atol=1e-10) + + +def _compute_class_weight_dictionary(y): + # helper for returning a dictionary instead of an array + classes = np.unique(y) + class_weight = compute_class_weight("balanced", classes=classes, y=y) + class_weight_dict = dict(zip(classes, class_weight)) + return class_weight_dict + + +@pytest.mark.parametrize("csr_container", [lambda x: x] + CSR_CONTAINERS) +def test_logistic_regression_class_weights(csr_container): + # Scale data to avoid convergence warnings with the lbfgs solver + X_iris = scale(iris.data) + # Multinomial case: remove 90% of class 0 + X = X_iris[45:, :] + X = csr_container(X) + y = iris.target[45:] + class_weight_dict = _compute_class_weight_dictionary(y) + + for solver in set(SOLVERS) - set(["liblinear", "newton-cholesky"]): + params = dict(solver=solver, max_iter=1000) + clf1 = LogisticRegression(class_weight="balanced", **params) + clf2 = LogisticRegression(class_weight=class_weight_dict, **params) + clf1.fit(X, y) + clf2.fit(X, y) + assert len(clf1.classes_) == 3 + assert_allclose(clf1.coef_, clf2.coef_, rtol=1e-4) + # Same as appropriate sample_weight. 
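+        # (Hedged reminder: "balanced" resolves each class weight to
+        # n_samples / (n_classes * n_c), which is what
+        # _compute_class_weight_dictionary returned above, so folding it into
+        # per-sample weights below should reach the same optimum.)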
+        sw = np.ones(X.shape[0])
+        for c in clf1.classes_:
+            sw[y == c] *= class_weight_dict[c]
+        clf3 = LogisticRegression(**params).fit(X, y, sample_weight=sw)
+        assert_allclose(clf3.coef_, clf2.coef_, rtol=1e-4)
+
+    # Binary case: remove 90% of class 0 and 100% of class 2
+    X = X_iris[45:100, :]
+    y = iris.target[45:100]
+    class_weight_dict = _compute_class_weight_dictionary(y)
+
+    for solver in SOLVERS:
+        params = dict(solver=solver, max_iter=1000)
+        clf1 = LogisticRegression(class_weight="balanced", **params)
+        clf2 = LogisticRegression(class_weight=class_weight_dict, **params)
+        clf1.fit(X, y)
+        clf2.fit(X, y)
+        assert_array_almost_equal(clf1.coef_, clf2.coef_, decimal=6)
+
+
+def test_logistic_regression_multinomial():
+    # Tests for the multinomial option in logistic regression
+
+    # Some basic attributes of Logistic Regression
+    n_samples, n_features, n_classes = 50, 20, 3
+    X, y = make_classification(
+        n_samples=n_samples,
+        n_features=n_features,
+        n_informative=10,
+        n_classes=n_classes,
+        random_state=0,
+    )
+
+    X = StandardScaler(with_mean=False).fit_transform(X)
+
+    # 'lbfgs' is used as the reference solver
+    solver = "lbfgs"
+    ref_i = LogisticRegression(solver=solver, tol=1e-6)
+    ref_w = LogisticRegression(solver=solver, fit_intercept=False, tol=1e-6)
+    ref_i.fit(X, y)
+    ref_w.fit(X, y)
+    assert ref_i.coef_.shape == (n_classes, n_features)
+    assert ref_w.coef_.shape == (n_classes, n_features)
+    for solver in ["sag", "saga", "newton-cg"]:
+        clf_i = LogisticRegression(
+            solver=solver,
+            random_state=42,
+            max_iter=2000,
+            tol=1e-7,
+        )
+        clf_w = LogisticRegression(
+            solver=solver,
+            random_state=42,
+            max_iter=2000,
+            tol=1e-7,
+            fit_intercept=False,
+        )
+        clf_i.fit(X, y)
+        clf_w.fit(X, y)
+        assert clf_i.coef_.shape == (n_classes, n_features)
+        assert clf_w.coef_.shape == (n_classes, n_features)
+
+        # Compare solutions between lbfgs and the other solvers
+        assert_allclose(ref_i.coef_, clf_i.coef_, rtol=1e-3)
+        assert_allclose(ref_w.coef_, clf_w.coef_, rtol=1e-2)
+        assert_allclose(ref_i.intercept_, clf_i.intercept_, rtol=1e-3)
+
+    # Test that the path gives almost the same results. However, since in this
+    # case we take the average of the coefs after fitting across all the
+    # folds, it need not be exactly the same.
+    for solver in ["lbfgs", "newton-cg", "sag", "saga"]:
+        clf_path = LogisticRegressionCV(
+            solver=solver, max_iter=2000, tol=1e-6, Cs=[1.0]
+        )
+        clf_path.fit(X, y)
+        assert_allclose(clf_path.coef_, ref_i.coef_, rtol=1e-2)
+        assert_allclose(clf_path.intercept_, ref_i.intercept_, rtol=1e-2)
+
+
+def test_liblinear_decision_function_zero():
+    # Test negative prediction when decision_function values are zero.
+    # Liblinear predicts the positive class when decision_function values
+    # are zero. This is a test to verify that we do not do the same.
+    # See Issue: https://github.com/scikit-learn/scikit-learn/issues/3600
+    # and the PR https://github.com/scikit-learn/scikit-learn/pull/3623
+    X, y = make_classification(n_samples=5, n_features=5, random_state=0)
+    clf = LogisticRegression(fit_intercept=False, solver="liblinear")
+    clf.fit(X, y)
+
+    # Dummy data such that the decision function becomes zero.
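+    # Hedged illustration (added example, not part of the original test): with
+    # fit_intercept=False the decision function is just X @ coef_.T, so an
+    # all-zero row scores exactly 0 and the tie must go to classes_[0].
+    assert_allclose(clf.decision_function(np.zeros((1, 5))), 0)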
+    X = np.zeros((5, 5))
+    assert_array_equal(clf.predict(X), np.zeros(5))
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_liblinear_logregcv_sparse(csr_container):
+    # Test LogRegCV with solver='liblinear' works for sparse matrices
+
+    X, y = make_classification(n_samples=10, n_features=5, random_state=0)
+    clf = LogisticRegressionCV(solver="liblinear")
+    clf.fit(csr_container(X), y)
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_saga_sparse(csr_container):
+    # Test LogRegCV with solver='saga' works for sparse matrices
+
+    X, y = make_classification(n_samples=10, n_features=5, random_state=0)
+    clf = LogisticRegressionCV(solver="saga", tol=1e-2)
+    clf.fit(csr_container(X), y)
+
+
+def test_logreg_intercept_scaling_zero():
+    # Test that intercept_scaling is ignored when fit_intercept is False
+
+    clf = LogisticRegression(fit_intercept=False)
+    clf.fit(X, Y1)
+    assert clf.intercept_ == 0.0
+
+
+def test_logreg_l1():
+    # Because liblinear penalizes the intercept and saga does not, we do not
+    # fit the intercept to make it possible to compare the coefficients of
+    # the two models at convergence.
+    rng = np.random.RandomState(42)
+    n_samples = 50
+    X, y = make_classification(n_samples=n_samples, n_features=20, random_state=0)
+    X_noise = rng.normal(size=(n_samples, 3))
+    X_constant = np.ones(shape=(n_samples, 2))
+    X = np.concatenate((X, X_noise, X_constant), axis=1)
+    lr_liblinear = LogisticRegression(
+        penalty="l1",
+        C=1.0,
+        solver="liblinear",
+        fit_intercept=False,
+        tol=1e-10,
+    )
+    lr_liblinear.fit(X, y)
+
+    lr_saga = LogisticRegression(
+        penalty="l1",
+        C=1.0,
+        solver="saga",
+        fit_intercept=False,
+        max_iter=1000,
+        tol=1e-10,
+    )
+    lr_saga.fit(X, y)
+    assert_array_almost_equal(lr_saga.coef_, lr_liblinear.coef_)
+
+    # Noise and constant features should be regularized to zero by the l1
+    # penalty
+    assert_array_almost_equal(lr_liblinear.coef_[0, -5:], np.zeros(5))
+    assert_array_almost_equal(lr_saga.coef_[0, -5:], np.zeros(5))
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_logreg_l1_sparse_data(csr_container):
+    # Because liblinear penalizes the intercept and saga does not, we do not
+    # fit the intercept to make it possible to compare the coefficients of
+    # the two models at convergence.
+ rng = np.random.RandomState(42) + n_samples = 50 + X, y = make_classification(n_samples=n_samples, n_features=20, random_state=0) + X_noise = rng.normal(scale=0.1, size=(n_samples, 3)) + X_constant = np.zeros(shape=(n_samples, 2)) + X = np.concatenate((X, X_noise, X_constant), axis=1) + X[X < 1] = 0 + X = csr_container(X) + + lr_liblinear = LogisticRegression( + penalty="l1", + C=1.0, + solver="liblinear", + fit_intercept=False, + tol=1e-10, + ) + lr_liblinear.fit(X, y) + + lr_saga = LogisticRegression( + penalty="l1", + C=1.0, + solver="saga", + fit_intercept=False, + max_iter=1000, + tol=1e-10, + ) + lr_saga.fit(X, y) + assert_array_almost_equal(lr_saga.coef_, lr_liblinear.coef_) + # Noise and constant features should be regularized to zero by the l1 + # penalty + assert_array_almost_equal(lr_liblinear.coef_[0, -5:], np.zeros(5)) + assert_array_almost_equal(lr_saga.coef_[0, -5:], np.zeros(5)) + + # Check that solving on the sparse and dense data yield the same results + lr_saga_dense = LogisticRegression( + penalty="l1", + C=1.0, + solver="saga", + fit_intercept=False, + max_iter=1000, + tol=1e-10, + ) + lr_saga_dense.fit(X.toarray(), y) + assert_array_almost_equal(lr_saga.coef_, lr_saga_dense.coef_) + + +@pytest.mark.parametrize("random_seed", [42]) +@pytest.mark.parametrize("penalty", ["l1", "l2"]) +def test_logistic_regression_cv_refit(random_seed, penalty): + # Test that when refit=True, logistic regression cv with the saga solver + # converges to the same solution as logistic regression with a fixed + # regularization parameter. + # Internally the LogisticRegressionCV model uses a warm start to refit on + # the full data model with the optimal C found by CV. As the penalized + # logistic regression loss is convex, we should still recover exactly + # the same solution as long as the stopping criterion is strict enough (and + # that there are no exactly duplicated features when penalty='l1'). + X, y = make_classification(n_samples=100, n_features=20, random_state=random_seed) + common_params = dict( + solver="saga", + penalty=penalty, + random_state=random_seed, + max_iter=1000, + tol=1e-12, + ) + lr_cv = LogisticRegressionCV(Cs=[1.0], refit=True, **common_params) + lr_cv.fit(X, y) + lr = LogisticRegression(C=1.0, **common_params) + lr.fit(X, y) + assert_array_almost_equal(lr_cv.coef_, lr.coef_) + + +def test_logreg_predict_proba_multinomial(): + X, y = make_classification( + n_samples=10, n_features=20, random_state=0, n_classes=3, n_informative=10 + ) + + # Predicted probabilities using the true-entropy loss should give a + # smaller loss than those using the ovr method. + clf_multi = LogisticRegression(solver="lbfgs") + clf_multi.fit(X, y) + clf_multi_loss = log_loss(y, clf_multi.predict_proba(X)) + clf_ovr = OneVsRestClassifier(LogisticRegression(solver="lbfgs")) + clf_ovr.fit(X, y) + clf_ovr_loss = log_loss(y, clf_ovr.predict_proba(X)) + assert clf_ovr_loss > clf_multi_loss + + # Predicted probabilities using the soft-max function should give a + # smaller loss than those using the logistic function. 
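+    # Hedged illustration (added example): the multinomial probabilities are a
+    # softmax over the decision function, so every predicted row sums to one.
+    assert_allclose(clf_multi.predict_proba(X).sum(axis=1), 1.0)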
+ clf_multi_loss = log_loss(y, clf_multi.predict_proba(X)) + clf_wrong_loss = log_loss(y, clf_multi._predict_proba_lr(X)) + assert clf_wrong_loss > clf_multi_loss + + +# TODO(1.8): remove filterwarnings after the deprecation of multi_class +@pytest.mark.filterwarnings("ignore:.*'multi_class' was deprecated.*:FutureWarning") +@pytest.mark.parametrize("max_iter", np.arange(1, 5)) +@pytest.mark.parametrize("multi_class", ["ovr", "multinomial"]) +@pytest.mark.parametrize( + "solver, message", + [ + ( + "newton-cg", + "newton-cg failed to converge.* Increase the number of iterations.", + ), + ( + "liblinear", + "Liblinear failed to converge, increase the number of iterations.", + ), + ("sag", "The max_iter was reached which means the coef_ did not converge"), + ("saga", "The max_iter was reached which means the coef_ did not converge"), + ("lbfgs", "lbfgs failed to converge"), + ("newton-cholesky", "Newton solver did not converge after [0-9]* iterations"), + ], +) +def test_max_iter(max_iter, multi_class, solver, message): + # Test that the maximum number of iteration is reached + X, y_bin = iris.data, iris.target.copy() + y_bin[y_bin == 2] = 0 + + if solver in ("liblinear",) and multi_class == "multinomial": + pytest.skip("'multinomial' is not supported by liblinear") + if solver == "newton-cholesky" and max_iter > 1: + pytest.skip("solver newton-cholesky might converge very fast") + + lr = LogisticRegression( + max_iter=max_iter, + tol=1e-15, + multi_class=multi_class, + random_state=0, + solver=solver, + ) + with pytest.warns(ConvergenceWarning, match=message): + lr.fit(X, y_bin) + + assert lr.n_iter_[0] == max_iter + + +# TODO(1.8): remove filterwarnings after the deprecation of multi_class +@pytest.mark.filterwarnings("ignore:.*'multi_class' was deprecated.*:FutureWarning") +@pytest.mark.filterwarnings( + "ignore:.*'liblinear' solver for multiclass classification is deprecated.*" +) +@pytest.mark.parametrize("solver", SOLVERS) +def test_n_iter(solver): + # Test that self.n_iter_ has the correct format. + X, y = iris.data, iris.target + if solver == "lbfgs": + # lbfgs requires scaling to avoid convergence warnings + X = scale(X) + + n_classes = np.unique(y).shape[0] + assert n_classes == 3 + + # Also generate a binary classification sub-problem. + y_bin = y.copy() + y_bin[y_bin == 2] = 0 + + n_Cs = 4 + n_cv_fold = 2 + + # Binary classification case + clf = LogisticRegression(tol=1e-2, C=1.0, solver=solver, random_state=42) + clf.fit(X, y_bin) + assert clf.n_iter_.shape == (1,) + + clf_cv = LogisticRegressionCV( + tol=1e-2, solver=solver, Cs=n_Cs, cv=n_cv_fold, random_state=42 + ) + clf_cv.fit(X, y_bin) + assert clf_cv.n_iter_.shape == (1, n_cv_fold, n_Cs) + + # OvR case + clf.set_params(multi_class="ovr").fit(X, y) + assert clf.n_iter_.shape == (n_classes,) + + clf_cv.set_params(multi_class="ovr").fit(X, y) + assert clf_cv.n_iter_.shape == (n_classes, n_cv_fold, n_Cs) + + # multinomial case + if solver in ("liblinear",): + # This solver only supports one-vs-rest multiclass classification. 
+ return + + # When using the multinomial objective function, there is a single + # optimization problem to solve for all classes at once: + clf.set_params(multi_class="multinomial").fit(X, y) + assert clf.n_iter_.shape == (1,) + + clf_cv.set_params(multi_class="multinomial").fit(X, y) + assert clf_cv.n_iter_.shape == (1, n_cv_fold, n_Cs) + + +@pytest.mark.parametrize("solver", sorted(set(SOLVERS) - set(["liblinear"]))) +@pytest.mark.parametrize("warm_start", (True, False)) +@pytest.mark.parametrize("fit_intercept", (True, False)) +def test_warm_start(solver, warm_start, fit_intercept): + # A 1-iteration second fit on same data should give almost same result + # with warm starting, and quite different result without warm starting. + # Warm starting does not work with liblinear solver. + X, y = iris.data, iris.target + + clf = LogisticRegression( + tol=1e-4, + warm_start=warm_start, + solver=solver, + random_state=42, + fit_intercept=fit_intercept, + ) + with ignore_warnings(category=ConvergenceWarning): + clf.fit(X, y) + coef_1 = clf.coef_ + + clf.max_iter = 1 + clf.fit(X, y) + cum_diff = np.sum(np.abs(coef_1 - clf.coef_)) + msg = ( + f"Warm starting issue with solver {solver}" + f"with {fit_intercept=} and {warm_start=}" + ) + if warm_start: + assert 2.0 > cum_diff, msg + else: + assert cum_diff > 2.0, msg + + +@pytest.mark.parametrize("solver", ["newton-cholesky", "newton-cg"]) +@pytest.mark.parametrize("fit_intercept", (True, False)) +@pytest.mark.parametrize("penalty", ("l2", None)) +def test_warm_start_newton_solver(global_random_seed, solver, fit_intercept, penalty): + """Test that 2 steps at once are the same as 2 single steps with warm start.""" + X, y = iris.data, iris.target + + clf1 = LogisticRegression( + solver=solver, + max_iter=2, + fit_intercept=fit_intercept, + penalty=penalty, + random_state=global_random_seed, + ) + with ignore_warnings(category=ConvergenceWarning): + clf1.fit(X, y) + + clf2 = LogisticRegression( + solver=solver, + max_iter=1, + warm_start=True, + fit_intercept=fit_intercept, + penalty=penalty, + random_state=global_random_seed, + ) + with ignore_warnings(category=ConvergenceWarning): + clf2.fit(X, y) + clf2.fit(X, y) + + assert_allclose(clf2.coef_, clf1.coef_) + if fit_intercept: + assert_allclose(clf2.intercept_, clf1.intercept_) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_saga_vs_liblinear(csr_container): + iris = load_iris() + X, y = iris.data, iris.target + X = np.concatenate([X] * 3) + y = np.concatenate([y] * 3) + + X_bin = X[y <= 1] + y_bin = y[y <= 1] * 2 - 1 + + X_sparse, y_sparse = make_classification( + n_samples=50, n_features=20, random_state=0 + ) + X_sparse = csr_container(X_sparse) + + for X, y in ((X_bin, y_bin), (X_sparse, y_sparse)): + for penalty in ["l1", "l2"]: + n_samples = X.shape[0] + # alpha=1e-3 is time consuming + for alpha in np.logspace(-1, 1, 3): + saga = LogisticRegression( + C=1.0 / (n_samples * alpha), + solver="saga", + max_iter=200, + fit_intercept=False, + penalty=penalty, + random_state=0, + tol=1e-6, + ) + + liblinear = LogisticRegression( + C=1.0 / (n_samples * alpha), + solver="liblinear", + max_iter=200, + fit_intercept=False, + penalty=penalty, + random_state=0, + tol=1e-6, + ) + + saga.fit(X, y) + liblinear.fit(X, y) + # Convergence for alpha=1e-3 is very slow + assert_array_almost_equal(saga.coef_, liblinear.coef_, 3) + + +# TODO(1.8): remove filterwarnings after the deprecation of multi_class +@pytest.mark.filterwarnings("ignore:.*'multi_class' was deprecated.*:FutureWarning") 
+@pytest.mark.parametrize("multi_class", ["ovr", "multinomial"]) +@pytest.mark.parametrize( + "solver", ["liblinear", "newton-cg", "newton-cholesky", "saga"] +) +@pytest.mark.parametrize("fit_intercept", [False, True]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_dtype_match(solver, multi_class, fit_intercept, csr_container): + # Test that np.float32 input data is not cast to np.float64 when possible + # and that the output is approximately the same no matter the input format. + + if solver == "liblinear" and multi_class == "multinomial": + pytest.skip(f"Solver={solver} does not support multinomial logistic.") + + out32_type = np.float64 if solver == "liblinear" else np.float32 + + X_32 = np.array(X).astype(np.float32) + y_32 = np.array(Y1).astype(np.float32) + X_64 = np.array(X).astype(np.float64) + y_64 = np.array(Y1).astype(np.float64) + X_sparse_32 = csr_container(X, dtype=np.float32) + X_sparse_64 = csr_container(X, dtype=np.float64) + solver_tol = 5e-4 + + lr_templ = LogisticRegression( + solver=solver, + multi_class=multi_class, + random_state=42, + tol=solver_tol, + fit_intercept=fit_intercept, + ) + + # Check 32-bit type consistency + lr_32 = clone(lr_templ) + lr_32.fit(X_32, y_32) + assert lr_32.coef_.dtype == out32_type + + # Check 32-bit type consistency with sparsity + lr_32_sparse = clone(lr_templ) + lr_32_sparse.fit(X_sparse_32, y_32) + assert lr_32_sparse.coef_.dtype == out32_type + + # Check 64-bit type consistency + lr_64 = clone(lr_templ) + lr_64.fit(X_64, y_64) + assert lr_64.coef_.dtype == np.float64 + + # Check 64-bit type consistency with sparsity + lr_64_sparse = clone(lr_templ) + lr_64_sparse.fit(X_sparse_64, y_64) + assert lr_64_sparse.coef_.dtype == np.float64 + + # solver_tol bounds the norm of the loss gradient + # dw ~= inv(H)*grad ==> |dw| ~= |inv(H)| * solver_tol, where H - hessian + # + # See https://github.com/scikit-learn/scikit-learn/pull/13645 + # + # with Z = np.hstack((np.ones((3,1)), np.array(X))) + # In [8]: np.linalg.norm(np.diag([0,2,2]) + np.linalg.inv((Z.T @ Z)/4)) + # Out[8]: 1.7193336918135917 + + # factor of 2 to get the ball diameter + atol = 2 * 1.72 * solver_tol + if os.name == "nt" and _IS_32BIT: + # FIXME + atol = 1e-2 + + # Check accuracy consistency + assert_allclose(lr_32.coef_, lr_64.coef_.astype(np.float32), atol=atol) + + if solver == "saga" and fit_intercept: + # FIXME: SAGA on sparse data fits the intercept inaccurately with the + # default tol and max_iter parameters. + atol = 1e-1 + + assert_allclose(lr_32.coef_, lr_32_sparse.coef_, atol=atol) + assert_allclose(lr_64.coef_, lr_64_sparse.coef_, atol=atol) + + +def test_warm_start_converge_LR(): + # Test to see that the logistic regression converges on warm start, + # with multi_class='multinomial'. 
Non-regressive test for #10836 + + rng = np.random.RandomState(0) + X = np.concatenate((rng.randn(100, 2) + [1, 1], rng.randn(100, 2))) + y = np.array([1] * 100 + [-1] * 100) + lr_no_ws = LogisticRegression(solver="sag", warm_start=False, random_state=0) + lr_ws = LogisticRegression(solver="sag", warm_start=True, random_state=0) + + lr_no_ws_loss = log_loss(y, lr_no_ws.fit(X, y).predict_proba(X)) + for i in range(5): + lr_ws.fit(X, y) + lr_ws_loss = log_loss(y, lr_ws.predict_proba(X)) + assert_allclose(lr_no_ws_loss, lr_ws_loss, rtol=1e-5) + + +def test_elastic_net_coeffs(): + # make sure elasticnet penalty gives different coefficients from l1 and l2 + # with saga solver (l1_ratio different from 0 or 1) + X, y = make_classification(random_state=0) + + C = 2.0 + l1_ratio = 0.5 + coeffs = list() + for penalty, ratio in (("elasticnet", l1_ratio), ("l1", None), ("l2", None)): + lr = LogisticRegression( + penalty=penalty, + C=C, + solver="saga", + random_state=0, + l1_ratio=ratio, + tol=1e-3, + max_iter=200, + ) + lr.fit(X, y) + coeffs.append(lr.coef_) + + elastic_net_coeffs, l1_coeffs, l2_coeffs = coeffs + # make sure coeffs differ by at least .1 + assert not np.allclose(elastic_net_coeffs, l1_coeffs, rtol=0, atol=0.1) + assert not np.allclose(elastic_net_coeffs, l2_coeffs, rtol=0, atol=0.1) + assert not np.allclose(l2_coeffs, l1_coeffs, rtol=0, atol=0.1) + + +@pytest.mark.parametrize("C", [0.001, 0.1, 1, 10, 100, 1000, 1e6]) +@pytest.mark.parametrize("penalty, l1_ratio", [("l1", 1), ("l2", 0)]) +def test_elastic_net_l1_l2_equivalence(C, penalty, l1_ratio): + # Make sure elasticnet is equivalent to l1 when l1_ratio=1 and to l2 when + # l1_ratio=0. + X, y = make_classification(random_state=0) + + lr_enet = LogisticRegression( + penalty="elasticnet", + C=C, + l1_ratio=l1_ratio, + solver="saga", + random_state=0, + tol=1e-2, + ) + lr_expected = LogisticRegression( + penalty=penalty, C=C, solver="saga", random_state=0, tol=1e-2 + ) + lr_enet.fit(X, y) + lr_expected.fit(X, y) + + assert_array_almost_equal(lr_enet.coef_, lr_expected.coef_) + + +@pytest.mark.parametrize("C", [0.001, 1, 100, 1e6]) +def test_elastic_net_vs_l1_l2(C): + # Make sure that elasticnet with grid search on l1_ratio gives same or + # better results than just l1 or just l2. + + X, y = make_classification(500, random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + + param_grid = {"l1_ratio": np.linspace(0, 1, 5)} + + enet_clf = LogisticRegression( + penalty="elasticnet", C=C, solver="saga", random_state=0, tol=1e-2 + ) + gs = GridSearchCV(enet_clf, param_grid, refit=True) + + l1_clf = LogisticRegression( + penalty="l1", C=C, solver="saga", random_state=0, tol=1e-2 + ) + l2_clf = LogisticRegression( + penalty="l2", C=C, solver="saga", random_state=0, tol=1e-2 + ) + + for clf in (gs, l1_clf, l2_clf): + clf.fit(X_train, y_train) + + assert gs.score(X_test, y_test) >= l1_clf.score(X_test, y_test) + assert gs.score(X_test, y_test) >= l2_clf.score(X_test, y_test) + + +@pytest.mark.parametrize("C", np.logspace(-3, 2, 4)) +@pytest.mark.parametrize("l1_ratio", [0.1, 0.5, 0.9]) +def test_LogisticRegression_elastic_net_objective(C, l1_ratio): + # Check that training with a penalty matching the objective leads + # to a lower objective. + # Here we train a logistic regression with l2 (a) and elasticnet (b) + # penalties, and compute the elasticnet objective. That of a should be + # greater than that of b (both objectives are convex). 
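+    # (Hedged note: enet_objective below computes
+    #     C * log_loss + l1_ratio * ||w||_1 + 0.5 * (1 - l1_ratio) * ||w||_2^2,
+    # i.e. an elasticnet criterion matching the penalty of the first model, so
+    # the l2-penalized fit is expected to attain a larger value of it.)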
+ X, y = make_classification( + n_samples=1000, + n_classes=2, + n_features=20, + n_informative=10, + n_redundant=0, + n_repeated=0, + random_state=0, + ) + X = scale(X) + + lr_enet = LogisticRegression( + penalty="elasticnet", + solver="saga", + random_state=0, + C=C, + l1_ratio=l1_ratio, + fit_intercept=False, + ) + lr_l2 = LogisticRegression( + penalty="l2", solver="saga", random_state=0, C=C, fit_intercept=False + ) + lr_enet.fit(X, y) + lr_l2.fit(X, y) + + def enet_objective(lr): + coef = lr.coef_.ravel() + obj = C * log_loss(y, lr.predict_proba(X)) + obj += l1_ratio * np.sum(np.abs(coef)) + obj += (1.0 - l1_ratio) * 0.5 * np.dot(coef, coef) + return obj + + assert enet_objective(lr_enet) < enet_objective(lr_l2) + + +@pytest.mark.parametrize("n_classes", (2, 3)) +def test_LogisticRegressionCV_GridSearchCV_elastic_net(n_classes): + # make sure LogisticRegressionCV gives same best params (l1 and C) as + # GridSearchCV when penalty is elasticnet + + X, y = make_classification( + n_samples=100, n_classes=n_classes, n_informative=3, random_state=0 + ) + + cv = StratifiedKFold(5) + + l1_ratios = np.linspace(0, 1, 3) + Cs = np.logspace(-4, 4, 3) + + lrcv = LogisticRegressionCV( + penalty="elasticnet", + Cs=Cs, + solver="saga", + cv=cv, + l1_ratios=l1_ratios, + random_state=0, + tol=1e-2, + ) + lrcv.fit(X, y) + + param_grid = {"C": Cs, "l1_ratio": l1_ratios} + lr = LogisticRegression( + penalty="elasticnet", + solver="saga", + random_state=0, + tol=1e-2, + ) + gs = GridSearchCV(lr, param_grid, cv=cv) + gs.fit(X, y) + + assert gs.best_params_["l1_ratio"] == lrcv.l1_ratio_[0] + assert gs.best_params_["C"] == lrcv.C_[0] + + +# TODO(1.8): remove filterwarnings after the deprecation of multi_class +# Maybe remove whole test after removal of the deprecated multi_class. +@pytest.mark.filterwarnings("ignore:.*'multi_class' was deprecated.*:FutureWarning") +def test_LogisticRegressionCV_GridSearchCV_elastic_net_ovr(): + # make sure LogisticRegressionCV gives same best params (l1 and C) as + # GridSearchCV when penalty is elasticnet and multiclass is ovr. We can't + # compare best_params like in the previous test because + # LogisticRegressionCV with multi_class='ovr' will have one C and one + # l1_param for each class, while LogisticRegression will share the + # parameters over the *n_classes* classifiers. 
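+    # (Hedged note on the check used instead: because the per-class
+    # (C, l1_ratio) pairs are not directly comparable, the assertions below
+    # only require the two fitted models to agree on at least 80% of the train
+    # and test predictions.)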
+ + X, y = make_classification( + n_samples=100, n_classes=3, n_informative=3, random_state=0 + ) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + cv = StratifiedKFold(5) + + l1_ratios = np.linspace(0, 1, 3) + Cs = np.logspace(-4, 4, 3) + + lrcv = LogisticRegressionCV( + penalty="elasticnet", + Cs=Cs, + solver="saga", + cv=cv, + l1_ratios=l1_ratios, + random_state=0, + multi_class="ovr", + tol=1e-2, + ) + lrcv.fit(X_train, y_train) + + param_grid = {"C": Cs, "l1_ratio": l1_ratios} + lr = LogisticRegression( + penalty="elasticnet", + solver="saga", + random_state=0, + multi_class="ovr", + tol=1e-2, + ) + gs = GridSearchCV(lr, param_grid, cv=cv) + gs.fit(X_train, y_train) + + # Check that predictions are 80% the same + assert (lrcv.predict(X_train) == gs.predict(X_train)).mean() >= 0.8 + assert (lrcv.predict(X_test) == gs.predict(X_test)).mean() >= 0.8 + + +# TODO(1.8): remove filterwarnings after the deprecation of multi_class +@pytest.mark.filterwarnings("ignore:.*'multi_class' was deprecated.*:FutureWarning") +@pytest.mark.parametrize("penalty", ("l2", "elasticnet")) +@pytest.mark.parametrize("multi_class", ("ovr", "multinomial", "auto")) +def test_LogisticRegressionCV_no_refit(penalty, multi_class): + # Test LogisticRegressionCV attribute shapes when refit is False + + n_classes = 3 + n_features = 20 + X, y = make_classification( + n_samples=200, + n_classes=n_classes, + n_informative=n_classes, + n_features=n_features, + random_state=0, + ) + + Cs = np.logspace(-4, 4, 3) + if penalty == "elasticnet": + l1_ratios = np.linspace(0, 1, 2) + else: + l1_ratios = None + + lrcv = LogisticRegressionCV( + penalty=penalty, + Cs=Cs, + solver="saga", + l1_ratios=l1_ratios, + random_state=0, + multi_class=multi_class, + tol=1e-2, + refit=False, + ) + lrcv.fit(X, y) + assert lrcv.C_.shape == (n_classes,) + assert lrcv.l1_ratio_.shape == (n_classes,) + assert lrcv.coef_.shape == (n_classes, n_features) + + +# TODO(1.8): remove filterwarnings after the deprecation of multi_class +# Remove multi_class an change first element of the expected n_iter_.shape from +# n_classes to 1 (according to the docstring). +@pytest.mark.filterwarnings("ignore:.*'multi_class' was deprecated.*:FutureWarning") +def test_LogisticRegressionCV_elasticnet_attribute_shapes(): + # Make sure the shapes of scores_ and coefs_paths_ attributes are correct + # when using elasticnet (added one dimension for l1_ratios) + + n_classes = 3 + n_features = 20 + X, y = make_classification( + n_samples=200, + n_classes=n_classes, + n_informative=n_classes, + n_features=n_features, + random_state=0, + ) + + Cs = np.logspace(-4, 4, 3) + l1_ratios = np.linspace(0, 1, 2) + + n_folds = 2 + lrcv = LogisticRegressionCV( + penalty="elasticnet", + Cs=Cs, + solver="saga", + cv=n_folds, + l1_ratios=l1_ratios, + multi_class="ovr", + random_state=0, + tol=1e-2, + ) + lrcv.fit(X, y) + coefs_paths = np.asarray(list(lrcv.coefs_paths_.values())) + assert coefs_paths.shape == ( + n_classes, + n_folds, + Cs.size, + l1_ratios.size, + n_features + 1, + ) + scores = np.asarray(list(lrcv.scores_.values())) + assert scores.shape == (n_classes, n_folds, Cs.size, l1_ratios.size) + + assert lrcv.n_iter_.shape == (n_classes, n_folds, Cs.size, l1_ratios.size) + + +def test_l1_ratio_non_elasticnet(): + msg = ( + r"l1_ratio parameter is only used when penalty is" + r" 'elasticnet'\. 
Got \(penalty=l1\)" + ) + with pytest.warns(UserWarning, match=msg): + LogisticRegression(penalty="l1", solver="saga", l1_ratio=0.5).fit(X, Y1) + + +@pytest.mark.parametrize("C", np.logspace(-3, 2, 4)) +@pytest.mark.parametrize("l1_ratio", [0.1, 0.5, 0.9]) +def test_elastic_net_versus_sgd(C, l1_ratio): + # Compare elasticnet penalty in LogisticRegression() and SGD(loss='log') + n_samples = 500 + X, y = make_classification( + n_samples=n_samples, + n_classes=2, + n_features=5, + n_informative=5, + n_redundant=0, + n_repeated=0, + random_state=1, + ) + X = scale(X) + + sgd = SGDClassifier( + penalty="elasticnet", + random_state=1, + fit_intercept=False, + tol=None, + max_iter=2000, + l1_ratio=l1_ratio, + alpha=1.0 / C / n_samples, + loss="log_loss", + ) + log = LogisticRegression( + penalty="elasticnet", + random_state=1, + fit_intercept=False, + tol=1e-5, + max_iter=1000, + l1_ratio=l1_ratio, + C=C, + solver="saga", + ) + + sgd.fit(X, y) + log.fit(X, y) + assert_array_almost_equal(sgd.coef_, log.coef_, decimal=1) + + +def test_logistic_regression_path_coefs_multinomial(): + # Make sure that the returned coefs by logistic_regression_path when + # multi_class='multinomial' don't override each other (used to be a + # bug). + X, y = make_classification( + n_samples=200, + n_classes=3, + n_informative=2, + n_redundant=0, + n_clusters_per_class=1, + random_state=0, + n_features=2, + ) + Cs = [0.00001, 1, 10000] + coefs, _, _ = _logistic_regression_path( + X, + y, + penalty="l1", + Cs=Cs, + solver="saga", + random_state=0, + multi_class="multinomial", + ) + + with pytest.raises(AssertionError): + assert_array_almost_equal(coefs[0], coefs[1], decimal=1) + with pytest.raises(AssertionError): + assert_array_almost_equal(coefs[0], coefs[2], decimal=1) + with pytest.raises(AssertionError): + assert_array_almost_equal(coefs[1], coefs[2], decimal=1) + + +# TODO(1.8): remove filterwarnings after the deprecation of multi_class +@pytest.mark.filterwarnings("ignore:.*'multi_class' was deprecated.*:FutureWarning") +@pytest.mark.filterwarnings( + "ignore:.*'liblinear' solver for multiclass classification is deprecated.*" +) +@pytest.mark.parametrize( + "est", + [ + LogisticRegression(random_state=0, max_iter=500), + LogisticRegressionCV(random_state=0, cv=3, Cs=3, tol=1e-3, max_iter=500), + ], + ids=lambda x: x.__class__.__name__, +) +@pytest.mark.parametrize("solver", SOLVERS) +def test_logistic_regression_multi_class_auto(est, solver): + # check multi_class='auto' => multi_class='ovr' + # iff binary y or liblinear + + def fit(X, y, **kw): + return clone(est).set_params(**kw).fit(X, y) + + scaled_data = scale(iris.data) + X = scaled_data[::10] + X2 = scaled_data[1::10] + y_multi = iris.target[::10] + y_bin = y_multi == 0 + est_auto_bin = fit(X, y_bin, multi_class="auto", solver=solver) + est_ovr_bin = fit(X, y_bin, multi_class="ovr", solver=solver) + assert_allclose(est_auto_bin.coef_, est_ovr_bin.coef_) + assert_allclose(est_auto_bin.predict_proba(X2), est_ovr_bin.predict_proba(X2)) + + est_auto_multi = fit(X, y_multi, multi_class="auto", solver=solver) + if solver == "liblinear": + est_ovr_multi = fit(X, y_multi, multi_class="ovr", solver=solver) + assert_allclose(est_auto_multi.coef_, est_ovr_multi.coef_) + assert_allclose( + est_auto_multi.predict_proba(X2), est_ovr_multi.predict_proba(X2) + ) + else: + est_multi_multi = fit(X, y_multi, multi_class="multinomial", solver=solver) + assert_allclose(est_auto_multi.coef_, est_multi_multi.coef_) + assert_allclose( + est_auto_multi.predict_proba(X2), 
est_multi_multi.predict_proba(X2) + ) + + # Make sure multi_class='ovr' is distinct from ='multinomial' + assert not np.allclose( + est_auto_bin.coef_, + fit(X, y_bin, multi_class="multinomial", solver=solver).coef_, + ) + assert not np.allclose( + est_auto_bin.coef_, + fit(X, y_multi, multi_class="multinomial", solver=solver).coef_, + ) + + +@pytest.mark.parametrize("solver", sorted(set(SOLVERS) - set(["liblinear"]))) +def test_penalty_none(solver): + # - Make sure warning is raised if penalty=None and C is set to a + # non-default value. + # - Make sure setting penalty=None is equivalent to setting C=np.inf with + # l2 penalty. + X, y = make_classification(n_samples=1000, n_redundant=0, random_state=0) + + msg = "Setting penalty=None will ignore the C" + lr = LogisticRegression(penalty=None, solver=solver, C=4) + with pytest.warns(UserWarning, match=msg): + lr.fit(X, y) + + lr_none = LogisticRegression(penalty=None, solver=solver, random_state=0) + lr_l2_C_inf = LogisticRegression( + penalty="l2", C=np.inf, solver=solver, random_state=0 + ) + pred_none = lr_none.fit(X, y).predict(X) + pred_l2_C_inf = lr_l2_C_inf.fit(X, y).predict(X) + assert_array_equal(pred_none, pred_l2_C_inf) + + +@pytest.mark.parametrize( + "params", + [ + {"penalty": "l1", "dual": False, "tol": 1e-6, "max_iter": 1000}, + {"penalty": "l2", "dual": True, "tol": 1e-12, "max_iter": 1000}, + {"penalty": "l2", "dual": False, "tol": 1e-12, "max_iter": 1000}, + ], +) +def test_logisticregression_liblinear_sample_weight(params): + # check that we support sample_weight with liblinear in all possible cases: + # l1-primal, l2-primal, l2-dual + X = np.array( + [ + [1, 3], + [1, 3], + [1, 3], + [1, 3], + [2, 1], + [2, 1], + [2, 1], + [2, 1], + [3, 3], + [3, 3], + [3, 3], + [3, 3], + [4, 1], + [4, 1], + [4, 1], + [4, 1], + ], + dtype=np.dtype("float"), + ) + y = np.array( + [1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2], dtype=np.dtype("int") + ) + + X2 = np.vstack([X, X]) + y2 = np.hstack([y, 3 - y]) + sample_weight = np.ones(shape=len(y) * 2) + sample_weight[len(y) :] = 0 + X2, y2, sample_weight = shuffle(X2, y2, sample_weight, random_state=0) + + base_clf = LogisticRegression(solver="liblinear", random_state=42) + base_clf.set_params(**params) + clf_no_weight = clone(base_clf).fit(X, y) + clf_with_weight = clone(base_clf).fit(X2, y2, sample_weight=sample_weight) + + for method in ("predict", "predict_proba", "decision_function"): + X_clf_no_weight = getattr(clf_no_weight, method)(X) + X_clf_with_weight = getattr(clf_with_weight, method)(X) + assert_allclose(X_clf_no_weight, X_clf_with_weight) + + +def test_scores_attribute_layout_elasticnet(): + # Non regression test for issue #14955. + # when penalty is elastic net the scores_ attribute has shape + # (n_classes, n_Cs, n_l1_ratios) + # We here make sure that the second dimension indeed corresponds to Cs and + # the third dimension corresponds to l1_ratios. 
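+    # (Hedged clarification: `scores_` is a dict keyed by class label; each
+    # value is laid out as (n_folds, n_Cs, n_l1_ratios). Averaging over axis 0
+    # below leaves an (n_Cs, n_l1_ratios) grid indexed as [i, j] for Cs[i] and
+    # l1_ratios[j].)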
+ + X, y = make_classification(n_samples=1000, random_state=0) + cv = StratifiedKFold(n_splits=5) + + l1_ratios = [0.1, 0.9] + Cs = [0.1, 1, 10] + + lrcv = LogisticRegressionCV( + penalty="elasticnet", + solver="saga", + l1_ratios=l1_ratios, + Cs=Cs, + cv=cv, + random_state=0, + max_iter=250, + tol=1e-3, + ) + lrcv.fit(X, y) + + avg_scores_lrcv = lrcv.scores_[1].mean(axis=0) # average over folds + + for i, C in enumerate(Cs): + for j, l1_ratio in enumerate(l1_ratios): + lr = LogisticRegression( + penalty="elasticnet", + solver="saga", + C=C, + l1_ratio=l1_ratio, + random_state=0, + max_iter=250, + tol=1e-3, + ) + + avg_score_lr = cross_val_score(lr, X, y, cv=cv).mean() + assert avg_scores_lrcv[i, j] == pytest.approx(avg_score_lr) + + +# TODO(1.8): remove filterwarnings after the deprecation of multi_class +@pytest.mark.filterwarnings("ignore:.*'multi_class' was deprecated.*:FutureWarning") +@pytest.mark.parametrize("solver", ["lbfgs", "newton-cg", "newton-cholesky"]) +@pytest.mark.parametrize("fit_intercept", [False, True]) +def test_multinomial_identifiability_on_iris(solver, fit_intercept): + """Test that the multinomial classification is identifiable. + + A multinomial with c classes can be modeled with + probability_k = exp(X@coef_k) / sum(exp(X@coef_l), l=1..c) for k=1..c. + This is not identifiable, unless one chooses a further constraint. + According to [1], the maximum of the L2 penalized likelihood automatically + satisfies the symmetric constraint: + sum(coef_k, k=1..c) = 0 + + Further details can be found in [2]. + + Reference + --------- + .. [1] :doi:`Zhu, Ji and Trevor J. Hastie. "Classification of gene microarrays by + penalized logistic regression". Biostatistics 5 3 (2004): 427-43. + <10.1093/biostatistics/kxg046>` + + .. [2] :arxiv:`Noah Simon and Jerome Friedman and Trevor Hastie. (2013) + "A Blockwise Descent Algorithm for Group-penalized Multiresponse and + Multinomial Regression". <1311.6529>` + """ + # Test logistic regression with the iris dataset + n_samples, n_features = iris.data.shape + target = iris.target_names[iris.target] + + clf = LogisticRegression( + C=len(iris.data), + solver="lbfgs", + fit_intercept=fit_intercept, + ) + # Scaling X to ease convergence. + X_scaled = scale(iris.data) + clf.fit(X_scaled, target) + + # axis=0 is sum over classes + assert_allclose(clf.coef_.sum(axis=0), 0, atol=1e-10) + if fit_intercept: + assert clf.intercept_.sum(axis=0) == pytest.approx(0, abs=1e-11) + + +# TODO(1.8): remove filterwarnings after the deprecation of multi_class +@pytest.mark.filterwarnings("ignore:.*'multi_class' was deprecated.*:FutureWarning") +@pytest.mark.parametrize("multi_class", ["ovr", "multinomial", "auto"]) +@pytest.mark.parametrize("class_weight", [{0: 1.0, 1: 10.0, 2: 1.0}, "balanced"]) +def test_sample_weight_not_modified(multi_class, class_weight): + X, y = load_iris(return_X_y=True) + n_features = len(X) + W = np.ones(n_features) + W[: n_features // 2] = 2 + + expected = W.copy() + + clf = LogisticRegression( + random_state=0, class_weight=class_weight, max_iter=200, multi_class=multi_class + ) + clf.fit(X, y, sample_weight=W) + assert_allclose(expected, W) + + +@pytest.mark.parametrize("solver", SOLVERS) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_large_sparse_matrix(solver, global_random_seed, csr_container): + # Solvers either accept large sparse matrices, or raise helpful error. + # Non-regression test for pull-request #21093. 
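+    # (Hedged note: casting the CSR index arrays to int64 below mimics what
+    # scipy does for matrices too large for 32-bit indexing; only the solvers
+    # listed in the branch further down require 32-bit indices and are then
+    # expected to raise the helpful ValueError.)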
+ + # generate sparse matrix with int64 indices + X = csr_container(sparse.rand(20, 10, random_state=global_random_seed)) + for attr in ["indices", "indptr"]: + setattr(X, attr, getattr(X, attr).astype("int64")) + rng = np.random.RandomState(global_random_seed) + y = rng.randint(2, size=X.shape[0]) + + if solver in ["liblinear", "sag", "saga"]: + msg = "Only sparse matrices with 32-bit integer indices" + with pytest.raises(ValueError, match=msg): + LogisticRegression(solver=solver).fit(X, y) + else: + LogisticRegression(solver=solver).fit(X, y) + + +def test_single_feature_newton_cg(): + # Test that Newton-CG works with a single feature and intercept. + # Non-regression test for issue #23605. + + X = np.array([[0.5, 0.65, 1.1, 1.25, 0.8, 0.54, 0.95, 0.7]]).T + y = np.array([1, 1, 0, 0, 1, 1, 0, 1]) + assert X.shape[1] == 1 + LogisticRegression(solver="newton-cg", fit_intercept=True).fit(X, y) + + +def test_liblinear_not_stuck(): + # Non-regression https://github.com/scikit-learn/scikit-learn/issues/18264 + X = iris.data.copy() + y = iris.target.copy() + X = X[y != 2] + y = y[y != 2] + X_prep = StandardScaler().fit_transform(X) + + C = l1_min_c(X, y, loss="log") * 10 ** (10 / 29) + clf = LogisticRegression( + penalty="l1", + solver="liblinear", + tol=1e-6, + max_iter=100, + intercept_scaling=10000.0, + random_state=0, + C=C, + ) + + # test that the fit does not raise a ConvergenceWarning + with warnings.catch_warnings(): + warnings.simplefilter("error", ConvergenceWarning) + clf.fit(X_prep, y) + + +@config_context(enable_metadata_routing=True) +def test_lr_cv_scores_differ_when_sample_weight_is_requested(): + """Test that `sample_weight` is correctly passed to the scorer in + `LogisticRegressionCV.fit` and `LogisticRegressionCV.score` by + checking the difference in scores with the case when `sample_weight` + is not requested. 
+ """ + rng = np.random.RandomState(10) + X, y = make_classification(n_samples=10, random_state=rng) + X_t, y_t = make_classification(n_samples=10, random_state=rng) + sample_weight = np.ones(len(y)) + sample_weight[: len(y) // 2] = 2 + kwargs = {"sample_weight": sample_weight} + + scorer1 = get_scorer("accuracy") + lr_cv1 = LogisticRegressionCV(scoring=scorer1) + lr_cv1.fit(X, y, **kwargs) + + scorer2 = get_scorer("accuracy") + scorer2.set_score_request(sample_weight=True) + lr_cv2 = LogisticRegressionCV(scoring=scorer2) + lr_cv2.fit(X, y, **kwargs) + + assert not np.allclose(lr_cv1.scores_[1], lr_cv2.scores_[1]) + + score_1 = lr_cv1.score(X_t, y_t, **kwargs) + score_2 = lr_cv2.score(X_t, y_t, **kwargs) + + assert not np.allclose(score_1, score_2) + + +def test_lr_cv_scores_without_enabling_metadata_routing(): + """Test that `sample_weight` is passed correctly to the scorer in + `LogisticRegressionCV.fit` and `LogisticRegressionCV.score` even + when `enable_metadata_routing=False` + """ + rng = np.random.RandomState(10) + X, y = make_classification(n_samples=10, random_state=rng) + X_t, y_t = make_classification(n_samples=10, random_state=rng) + sample_weight = np.ones(len(y)) + sample_weight[: len(y) // 2] = 2 + kwargs = {"sample_weight": sample_weight} + + with config_context(enable_metadata_routing=False): + scorer1 = get_scorer("accuracy") + lr_cv1 = LogisticRegressionCV(scoring=scorer1) + lr_cv1.fit(X, y, **kwargs) + score_1 = lr_cv1.score(X_t, y_t, **kwargs) + + with config_context(enable_metadata_routing=True): + scorer2 = get_scorer("accuracy") + scorer2.set_score_request(sample_weight=True) + lr_cv2 = LogisticRegressionCV(scoring=scorer2) + lr_cv2.fit(X, y, **kwargs) + score_2 = lr_cv2.score(X_t, y_t, **kwargs) + + assert_allclose(lr_cv1.scores_[1], lr_cv2.scores_[1]) + assert_allclose(score_1, score_2) + + +@pytest.mark.parametrize("solver", SOLVERS) +def test_zero_max_iter(solver): + # Make sure we can inspect the state of LogisticRegression right after + # initialization (before the first weight update). + X, y = load_iris(return_X_y=True) + y = y == 2 + with ignore_warnings(category=ConvergenceWarning): + clf = LogisticRegression(solver=solver, max_iter=0).fit(X, y) + if solver not in ["saga", "sag"]: + # XXX: sag and saga have n_iter_ = [1]... + assert clf.n_iter_ == 0 + + if solver != "lbfgs": + # XXX: lbfgs has already started to update the coefficients... 
+ assert_allclose(clf.coef_, np.zeros_like(clf.coef_)) + assert_allclose( + clf.decision_function(X), + np.full(shape=X.shape[0], fill_value=clf.intercept_), + ) + assert_allclose( + clf.predict_proba(X), + np.full(shape=(X.shape[0], 2), fill_value=0.5), + ) + assert clf.score(X, y) < 0.7 + + +def test_passing_params_without_enabling_metadata_routing(): + """Test that the right error message is raised when metadata params + are passed while not supported when `enable_metadata_routing=False`.""" + X, y = make_classification(n_samples=10, random_state=0) + lr_cv = LogisticRegressionCV() + msg = "is only supported if enable_metadata_routing=True" + + with config_context(enable_metadata_routing=False): + params = {"extra_param": 1.0} + + with pytest.raises(ValueError, match=msg): + lr_cv.fit(X, y, **params) + + with pytest.raises(ValueError, match=msg): + lr_cv.score(X, y, **params) + + +# TODO(1.8): remove +def test_multi_class_deprecated(): + """Check `multi_class` parameter deprecated.""" + X, y = make_classification(n_classes=3, n_samples=50, n_informative=6) + lr = LogisticRegression(multi_class="ovr") + msg = "'multi_class' was deprecated" + with pytest.warns(FutureWarning, match=msg): + lr.fit(X, y) + + lrCV = LogisticRegressionCV(multi_class="ovr") + with pytest.warns(FutureWarning, match=msg): + lrCV.fit(X, y) + + # Special warning for "binary multinomial" + X, y = make_classification(n_classes=2, n_samples=50, n_informative=6) + lr = LogisticRegression(multi_class="multinomial") + msg = "'multi_class' was deprecated.*binary problems" + with pytest.warns(FutureWarning, match=msg): + lr.fit(X, y) + + lrCV = LogisticRegressionCV(multi_class="multinomial") + with pytest.warns(FutureWarning, match=msg): + lrCV.fit(X, y) + + +def test_newton_cholesky_fallback_to_lbfgs(global_random_seed): + # Wide data matrix should lead to a rank-deficient Hessian matrix + # hence make the Newton-Cholesky solver raise a warning and fallback to + # lbfgs. + X, y = make_classification( + n_samples=10, n_features=20, random_state=global_random_seed + ) + C = 1e30 # very high C to nearly disable regularization + + # Check that LBFGS can converge without any warning on this problem. + lr_lbfgs = LogisticRegression(solver="lbfgs", C=C) + with warnings.catch_warnings(): + warnings.simplefilter("error") + lr_lbfgs.fit(X, y) + n_iter_lbfgs = lr_lbfgs.n_iter_[0] + + assert n_iter_lbfgs >= 1 + + # Check that the Newton-Cholesky solver raises a warning and falls back to + # LBFGS. This should converge with the same number of iterations as the + # above call of lbfgs since the Newton-Cholesky triggers the fallback + # before completing the first iteration, for the problem setting at hand. 
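+    # Why the fallback happens (sketch, illustration only): with
+    # n_features > n_samples the essentially unpenalized Hessian X.T @ D @ X
+    # (D a diagonal weight matrix) has rank at most n_samples, so it is
+    # singular, the Cholesky factorization fails with a LinAlgWarning, and the
+    # solver restarts with lbfgs.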
+ lr_nc = LogisticRegression(solver="newton-cholesky", C=C) + with ignore_warnings(category=LinAlgWarning): + lr_nc.fit(X, y) + n_iter_nc = lr_nc.n_iter_[0] + + assert n_iter_nc == n_iter_lbfgs + + # Trying to fit the same model again with a small iteration budget should + # therefore raise a ConvergenceWarning: + lr_nc_limited = LogisticRegression( + solver="newton-cholesky", C=C, max_iter=n_iter_lbfgs - 1 + ) + with ignore_warnings(category=LinAlgWarning): + with pytest.warns(ConvergenceWarning, match="lbfgs failed to converge"): + lr_nc_limited.fit(X, y) + n_iter_nc_limited = lr_nc_limited.n_iter_[0] + + assert n_iter_nc_limited == lr_nc_limited.max_iter - 1 + + +# TODO(1.8): check for an error instead +@pytest.mark.parametrize("Estimator", [LogisticRegression, LogisticRegressionCV]) +def test_liblinear_multiclass_warning(Estimator): + """Check that liblinear warns on multiclass problems.""" + msg = ( + "Using the 'liblinear' solver for multiclass classification is " + "deprecated. An error will be raised in 1.8. Either use another " + "solver which supports the multinomial loss or wrap the estimator " + "in a OneVsRestClassifier to keep applying a one-versus-rest " + "scheme." + ) + with pytest.warns(FutureWarning, match=msg): + Estimator(solver="liblinear").fit(iris.data, iris.target) diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_passive_aggressive.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_passive_aggressive.py new file mode 100644 index 0000000000000000000000000000000000000000..bcfd58b1eab2b51ecd8cc1097bd48577e2babe0d --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_passive_aggressive.py @@ -0,0 +1,268 @@ +import numpy as np +import pytest + +from sklearn.base import ClassifierMixin +from sklearn.datasets import load_iris +from sklearn.linear_model import PassiveAggressiveClassifier, PassiveAggressiveRegressor +from sklearn.utils import check_random_state +from sklearn.utils._testing import ( + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) +from sklearn.utils.fixes import CSR_CONTAINERS + +iris = load_iris() +random_state = check_random_state(12) +indices = np.arange(iris.data.shape[0]) +random_state.shuffle(indices) +X = iris.data[indices] +y = iris.target[indices] + + +class MyPassiveAggressive(ClassifierMixin): + def __init__( + self, + C=1.0, + epsilon=0.01, + loss="hinge", + fit_intercept=True, + n_iter=1, + random_state=None, + ): + self.C = C + self.epsilon = epsilon + self.loss = loss + self.fit_intercept = fit_intercept + self.n_iter = n_iter + + def fit(self, X, y): + n_samples, n_features = X.shape + self.w = np.zeros(n_features, dtype=np.float64) + self.b = 0.0 + + for t in range(self.n_iter): + for i in range(n_samples): + p = self.project(X[i]) + if self.loss in ("hinge", "squared_hinge"): + loss = max(1 - y[i] * p, 0) + else: + loss = max(np.abs(p - y[i]) - self.epsilon, 0) + + sqnorm = np.dot(X[i], X[i]) + + if self.loss in ("hinge", "epsilon_insensitive"): + step = min(self.C, loss / sqnorm) + elif self.loss in ("squared_hinge", "squared_epsilon_insensitive"): + step = loss / (sqnorm + 1.0 / (2 * self.C)) + + if self.loss in ("hinge", "squared_hinge"): + step *= y[i] + else: + step *= np.sign(y[i] - p) + + self.w += step * X[i] + if self.fit_intercept: + self.b += step + + def project(self, X): + return np.dot(X, self.w) + self.b + + +@pytest.mark.parametrize("average", [False, True]) +@pytest.mark.parametrize("fit_intercept", [True, 
False]) +@pytest.mark.parametrize("csr_container", [None, *CSR_CONTAINERS]) +def test_classifier_accuracy(csr_container, fit_intercept, average): + data = csr_container(X) if csr_container is not None else X + clf = PassiveAggressiveClassifier( + C=1.0, + max_iter=30, + fit_intercept=fit_intercept, + random_state=1, + average=average, + tol=None, + ) + clf.fit(data, y) + score = clf.score(data, y) + assert score > 0.79 + if average: + assert hasattr(clf, "_average_coef") + assert hasattr(clf, "_average_intercept") + assert hasattr(clf, "_standard_intercept") + assert hasattr(clf, "_standard_coef") + + +@pytest.mark.parametrize("average", [False, True]) +@pytest.mark.parametrize("csr_container", [None, *CSR_CONTAINERS]) +def test_classifier_partial_fit(csr_container, average): + classes = np.unique(y) + data = csr_container(X) if csr_container is not None else X + clf = PassiveAggressiveClassifier(random_state=0, average=average, max_iter=5) + for t in range(30): + clf.partial_fit(data, y, classes) + score = clf.score(data, y) + assert score > 0.79 + if average: + assert hasattr(clf, "_average_coef") + assert hasattr(clf, "_average_intercept") + assert hasattr(clf, "_standard_intercept") + assert hasattr(clf, "_standard_coef") + + +def test_classifier_refit(): + # Classifier can be retrained on different labels and features. + clf = PassiveAggressiveClassifier(max_iter=5).fit(X, y) + assert_array_equal(clf.classes_, np.unique(y)) + + clf.fit(X[:, :-1], iris.target_names[y]) + assert_array_equal(clf.classes_, iris.target_names) + + +@pytest.mark.parametrize("csr_container", [None, *CSR_CONTAINERS]) +@pytest.mark.parametrize("loss", ("hinge", "squared_hinge")) +def test_classifier_correctness(loss, csr_container): + y_bin = y.copy() + y_bin[y != 1] = -1 + + clf1 = MyPassiveAggressive(loss=loss, n_iter=2) + clf1.fit(X, y_bin) + + data = csr_container(X) if csr_container is not None else X + clf2 = PassiveAggressiveClassifier(loss=loss, max_iter=2, shuffle=False, tol=None) + clf2.fit(data, y_bin) + + assert_array_almost_equal(clf1.w, clf2.coef_.ravel(), decimal=2) + + +@pytest.mark.parametrize( + "response_method", ["predict_proba", "predict_log_proba", "transform"] +) +def test_classifier_undefined_methods(response_method): + clf = PassiveAggressiveClassifier(max_iter=100) + with pytest.raises(AttributeError): + getattr(clf, response_method) + + +def test_class_weights(): + # Test class weights. 
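+    # How this plays out (assumption, for illustration only): class_weight
+    # effectively rescales the update applied to samples of each class, so
+    # class_weight={1: 0.001} makes updates from class 1 nearly negligible and
+    # the decision boundary shifts towards the other class, as asserted below.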
+ X2 = np.array([[-1.0, -1.0], [-1.0, 0], [-0.8, -1.0], [1.0, 1.0], [1.0, 0.0]]) + y2 = [1, 1, 1, -1, -1] + + clf = PassiveAggressiveClassifier( + C=0.1, max_iter=100, class_weight=None, random_state=100 + ) + clf.fit(X2, y2) + assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([1])) + + # we give a small weights to class 1 + clf = PassiveAggressiveClassifier( + C=0.1, max_iter=100, class_weight={1: 0.001}, random_state=100 + ) + clf.fit(X2, y2) + + # now the hyperplane should rotate clock-wise and + # the prediction on this point should shift + assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([-1])) + + +def test_partial_fit_weight_class_balanced(): + # partial_fit with class_weight='balanced' not supported + clf = PassiveAggressiveClassifier(class_weight="balanced", max_iter=100) + with pytest.raises(ValueError): + clf.partial_fit(X, y, classes=np.unique(y)) + + +def test_equal_class_weight(): + X2 = [[1, 0], [1, 0], [0, 1], [0, 1]] + y2 = [0, 0, 1, 1] + clf = PassiveAggressiveClassifier(C=0.1, tol=None, class_weight=None) + clf.fit(X2, y2) + + # Already balanced, so "balanced" weights should have no effect + clf_balanced = PassiveAggressiveClassifier(C=0.1, tol=None, class_weight="balanced") + clf_balanced.fit(X2, y2) + + clf_weighted = PassiveAggressiveClassifier( + C=0.1, tol=None, class_weight={0: 0.5, 1: 0.5} + ) + clf_weighted.fit(X2, y2) + + # should be similar up to some epsilon due to learning rate schedule + assert_almost_equal(clf.coef_, clf_weighted.coef_, decimal=2) + assert_almost_equal(clf.coef_, clf_balanced.coef_, decimal=2) + + +def test_wrong_class_weight_label(): + # ValueError due to wrong class_weight label. + X2 = np.array([[-1.0, -1.0], [-1.0, 0], [-0.8, -1.0], [1.0, 1.0], [1.0, 0.0]]) + y2 = [1, 1, 1, -1, -1] + + clf = PassiveAggressiveClassifier(class_weight={0: 0.5}, max_iter=100) + with pytest.raises(ValueError): + clf.fit(X2, y2) + + +@pytest.mark.parametrize("average", [False, True]) +@pytest.mark.parametrize("fit_intercept", [True, False]) +@pytest.mark.parametrize("csr_container", [None, *CSR_CONTAINERS]) +def test_regressor_mse(csr_container, fit_intercept, average): + y_bin = y.copy() + y_bin[y != 1] = -1 + + data = csr_container(X) if csr_container is not None else X + reg = PassiveAggressiveRegressor( + C=1.0, + fit_intercept=fit_intercept, + random_state=0, + average=average, + max_iter=5, + ) + reg.fit(data, y_bin) + pred = reg.predict(data) + assert np.mean((pred - y_bin) ** 2) < 1.7 + if average: + assert hasattr(reg, "_average_coef") + assert hasattr(reg, "_average_intercept") + assert hasattr(reg, "_standard_intercept") + assert hasattr(reg, "_standard_coef") + + +@pytest.mark.parametrize("average", [False, True]) +@pytest.mark.parametrize("csr_container", [None, *CSR_CONTAINERS]) +def test_regressor_partial_fit(csr_container, average): + y_bin = y.copy() + y_bin[y != 1] = -1 + + data = csr_container(X) if csr_container is not None else X + reg = PassiveAggressiveRegressor(random_state=0, average=average, max_iter=100) + for t in range(50): + reg.partial_fit(data, y_bin) + pred = reg.predict(data) + assert np.mean((pred - y_bin) ** 2) < 1.7 + if average: + assert hasattr(reg, "_average_coef") + assert hasattr(reg, "_average_intercept") + assert hasattr(reg, "_standard_intercept") + assert hasattr(reg, "_standard_coef") + + +@pytest.mark.parametrize("csr_container", [None, *CSR_CONTAINERS]) +@pytest.mark.parametrize("loss", ("epsilon_insensitive", "squared_epsilon_insensitive")) +def test_regressor_correctness(loss, csr_container): + 
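+    # Reminder (sketch, illustration only): MyPassiveAggressive above implements
+    # the classic passive-aggressive update; for the epsilon-insensitive losses a
+    # single PA-I style step looks like this hypothetical helper.
+    def _pa1_regression_step(w, x, y_target, C=1.0, epsilon=0.01):
+        p = np.dot(w, x)
+        loss = max(abs(p - y_target) - epsilon, 0)
+        tau = min(C, loss / np.dot(x, x))
+        return w + tau * np.sign(y_target - p) * x
+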
y_bin = y.copy() + y_bin[y != 1] = -1 + + reg1 = MyPassiveAggressive(loss=loss, n_iter=2) + reg1.fit(X, y_bin) + + data = csr_container(X) if csr_container is not None else X + reg2 = PassiveAggressiveRegressor(tol=None, loss=loss, max_iter=2, shuffle=False) + reg2.fit(data, y_bin) + + assert_array_almost_equal(reg1.w, reg2.coef_.ravel(), decimal=2) + + +def test_regressor_undefined_methods(): + reg = PassiveAggressiveRegressor(max_iter=100) + with pytest.raises(AttributeError): + reg.transform(X) diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_perceptron.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_perceptron.py new file mode 100644 index 0000000000000000000000000000000000000000..71456ae72132ccebc76da96aea9213fd55f47c9d --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_perceptron.py @@ -0,0 +1,88 @@ +import numpy as np +import pytest + +from sklearn.datasets import load_iris +from sklearn.linear_model import Perceptron +from sklearn.utils import check_random_state +from sklearn.utils._testing import assert_allclose, assert_array_almost_equal +from sklearn.utils.fixes import CSR_CONTAINERS + +iris = load_iris() +random_state = check_random_state(12) +indices = np.arange(iris.data.shape[0]) +random_state.shuffle(indices) +X = iris.data[indices] +y = iris.target[indices] + + +class MyPerceptron: + def __init__(self, n_iter=1): + self.n_iter = n_iter + + def fit(self, X, y): + n_samples, n_features = X.shape + self.w = np.zeros(n_features, dtype=np.float64) + self.b = 0.0 + + for t in range(self.n_iter): + for i in range(n_samples): + if self.predict(X[i])[0] != y[i]: + self.w += y[i] * X[i] + self.b += y[i] + + def project(self, X): + return np.dot(X, self.w) + self.b + + def predict(self, X): + X = np.atleast_2d(X) + return np.sign(self.project(X)) + + +@pytest.mark.parametrize("container", CSR_CONTAINERS + [np.array]) +def test_perceptron_accuracy(container): + data = container(X) + clf = Perceptron(max_iter=100, tol=None, shuffle=False) + clf.fit(data, y) + score = clf.score(data, y) + assert score > 0.7 + + +def test_perceptron_correctness(): + y_bin = y.copy() + y_bin[y != 1] = -1 + + clf1 = MyPerceptron(n_iter=2) + clf1.fit(X, y_bin) + + clf2 = Perceptron(max_iter=2, shuffle=False, tol=None) + clf2.fit(X, y_bin) + + assert_array_almost_equal(clf1.w, clf2.coef_.ravel()) + + +def test_undefined_methods(): + clf = Perceptron(max_iter=100) + for meth in ("predict_proba", "predict_log_proba"): + with pytest.raises(AttributeError): + getattr(clf, meth) + + +def test_perceptron_l1_ratio(): + """Check that `l1_ratio` has an impact when `penalty='elasticnet'`""" + clf1 = Perceptron(l1_ratio=0, penalty="elasticnet") + clf1.fit(X, y) + + clf2 = Perceptron(l1_ratio=0.15, penalty="elasticnet") + clf2.fit(X, y) + + assert clf1.score(X, y) != clf2.score(X, y) + + # check that the bounds of elastic net which should correspond to an l1 or + # l2 penalty depending of `l1_ratio` value. 
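+    # Sketch (assumption about the exact scaling, for illustration only): the
+    # elastic-net penalty interpolates between the two extremes, roughly
+    #   penalty(w) ~ l1_ratio * ||w||_1 + (1 - l1_ratio) * ||w||_2^2
+    # so l1_ratio=1 should reproduce penalty="l1" and l1_ratio=0 penalty="l2",
+    # which is what the assertions below check.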
+ clf_l1 = Perceptron(penalty="l1").fit(X, y) + clf_elasticnet = Perceptron(l1_ratio=1, penalty="elasticnet").fit(X, y) + assert_allclose(clf_l1.coef_, clf_elasticnet.coef_) + + clf_l2 = Perceptron(penalty="l2").fit(X, y) + clf_elasticnet = Perceptron(l1_ratio=0, penalty="elasticnet").fit(X, y) + assert_allclose(clf_l2.coef_, clf_elasticnet.coef_) diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_quantile.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_quantile.py new file mode 100644 index 0000000000000000000000000000000000000000..1d166b14091ccc11e148184056a6d4a58a48a664 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_quantile.py @@ -0,0 +1,283 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numpy as np +import pytest +from pytest import approx +from scipy.optimize import minimize + +from sklearn.datasets import make_regression +from sklearn.exceptions import ConvergenceWarning +from sklearn.linear_model import HuberRegressor, QuantileRegressor +from sklearn.metrics import mean_pinball_loss +from sklearn.utils._testing import assert_allclose +from sklearn.utils.fixes import ( + COO_CONTAINERS, + CSC_CONTAINERS, + CSR_CONTAINERS, + parse_version, + sp_version, +) + + +@pytest.fixture +def X_y_data(): + X, y = make_regression(n_samples=10, n_features=1, random_state=0, noise=1) + return X, y + + +@pytest.mark.skipif( + parse_version(sp_version.base_version) >= parse_version("1.11"), + reason="interior-point solver is not available in SciPy 1.11", +) +@pytest.mark.parametrize("solver", ["interior-point", "revised simplex"]) +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_incompatible_solver_for_sparse_input(X_y_data, solver, csc_container): + X, y = X_y_data + X_sparse = csc_container(X) + err_msg = ( + f"Solver {solver} does not support sparse X. Use solver 'highs' for example." 
+ ) + with pytest.raises(ValueError, match=err_msg): + QuantileRegressor(solver=solver).fit(X_sparse, y) + + +@pytest.mark.parametrize( + "quantile, alpha, intercept, coef", + [ + # for 50% quantile w/o regularization, any slope in [1, 10] is okay + [0.5, 0, 1, None], + # if positive error costs more, the slope is maximal + [0.51, 0, 1, 10], + # if negative error costs more, the slope is minimal + [0.49, 0, 1, 1], + # for a small lasso penalty, the slope is also minimal + [0.5, 0.01, 1, 1], + # for a large lasso penalty, the model predicts the constant median + [0.5, 100, 2, 0], + ], +) +def test_quantile_toy_example(quantile, alpha, intercept, coef): + # test how different parameters affect a small intuitive example + X = [[0], [1], [1]] + y = [1, 2, 11] + model = QuantileRegressor(quantile=quantile, alpha=alpha).fit(X, y) + assert_allclose(model.intercept_, intercept, atol=1e-2) + if coef is not None: + assert_allclose(model.coef_[0], coef, atol=1e-2) + if alpha < 100: + assert model.coef_[0] >= 1 + assert model.coef_[0] <= 10 + + +@pytest.mark.parametrize("fit_intercept", [True, False]) +def test_quantile_equals_huber_for_low_epsilon(fit_intercept): + X, y = make_regression(n_samples=100, n_features=20, random_state=0, noise=1.0) + alpha = 1e-4 + huber = HuberRegressor( + epsilon=1 + 1e-4, alpha=alpha, fit_intercept=fit_intercept + ).fit(X, y) + quant = QuantileRegressor(alpha=alpha, fit_intercept=fit_intercept).fit(X, y) + assert_allclose(huber.coef_, quant.coef_, atol=1e-1) + if fit_intercept: + assert huber.intercept_ == approx(quant.intercept_, abs=1e-1) + # check that we still predict fraction + assert np.mean(y < quant.predict(X)) == approx(0.5, abs=1e-1) + + +@pytest.mark.parametrize("q", [0.5, 0.9, 0.05]) +def test_quantile_estimates_calibration(q): + # Test that model estimates percentage of points below the prediction + X, y = make_regression(n_samples=1000, n_features=20, random_state=0, noise=1.0) + quant = QuantileRegressor(quantile=q, alpha=0).fit(X, y) + assert np.mean(y < quant.predict(X)) == approx(q, abs=1e-2) + + +def test_quantile_sample_weight(): + # test that with unequal sample weights we still estimate weighted fraction + n = 1000 + X, y = make_regression(n_samples=n, n_features=5, random_state=0, noise=10.0) + weight = np.ones(n) + # when we increase weight of upper observations, + # estimate of quantile should go up + weight[y > y.mean()] = 100 + quant = QuantileRegressor(quantile=0.5, alpha=1e-8) + quant.fit(X, y, sample_weight=weight) + fraction_below = np.mean(y < quant.predict(X)) + assert fraction_below > 0.5 + weighted_fraction_below = np.average(y < quant.predict(X), weights=weight) + assert weighted_fraction_below == approx(0.5, abs=3e-2) + + +@pytest.mark.parametrize("quantile", [0.2, 0.5, 0.8]) +def test_asymmetric_error(quantile): + """Test quantile regression for asymmetric distributed targets.""" + n_samples = 1000 + rng = np.random.RandomState(42) + X = np.concatenate( + ( + np.abs(rng.randn(n_samples)[:, None]), + -rng.randint(2, size=(n_samples, 1)), + ), + axis=1, + ) + intercept = 1.23 + coef = np.array([0.5, -2]) + # Take care that X @ coef + intercept > 0 + assert np.min(X @ coef + intercept) > 0 + # For an exponential distribution with rate lambda, e.g. 
exp(-lambda * x), + # the quantile at level q is: + # quantile(q) = - log(1 - q) / lambda + # scale = 1/lambda = -quantile(q) / log(1 - q) + y = rng.exponential( + scale=-(X @ coef + intercept) / np.log(1 - quantile), size=n_samples + ) + model = QuantileRegressor( + quantile=quantile, + alpha=0, + ).fit(X, y) + # This test can be made to pass with any solver but in the interest + # of sparing continuous integration resources, the test is performed + # with the fastest solver only. + + assert model.intercept_ == approx(intercept, rel=0.2) + assert_allclose(model.coef_, coef, rtol=0.6) + assert_allclose(np.mean(model.predict(X) > y), quantile, atol=1e-2) + + # Now compare to Nelder-Mead optimization with L1 penalty + alpha = 0.01 + model.set_params(alpha=alpha).fit(X, y) + model_coef = np.r_[model.intercept_, model.coef_] + + def func(coef): + loss = mean_pinball_loss(y, X @ coef[1:] + coef[0], alpha=quantile) + L1 = np.sum(np.abs(coef[1:])) + return loss + alpha * L1 + + res = minimize( + fun=func, + x0=[1, 0, -1], + method="Nelder-Mead", + tol=1e-12, + options={"maxiter": 2000}, + ) + + assert func(model_coef) == approx(func(res.x)) + assert_allclose(model.intercept_, res.x[0]) + assert_allclose(model.coef_, res.x[1:]) + assert_allclose(np.mean(model.predict(X) > y), quantile, atol=1e-2) + + +@pytest.mark.parametrize("quantile", [0.2, 0.5, 0.8]) +def test_equivariance(quantile): + """Test equivariace of quantile regression. + + See Koenker (2005) Quantile Regression, Chapter 2.2.3. + """ + rng = np.random.RandomState(42) + n_samples, n_features = 100, 5 + X, y = make_regression( + n_samples=n_samples, + n_features=n_features, + n_informative=n_features, + noise=0, + random_state=rng, + shuffle=False, + ) + # make y asymmetric + y += rng.exponential(scale=100, size=y.shape) + params = dict(alpha=0) + model1 = QuantileRegressor(quantile=quantile, **params).fit(X, y) + + # coef(q; a*y, X) = a * coef(q; y, X) + a = 2.5 + model2 = QuantileRegressor(quantile=quantile, **params).fit(X, a * y) + assert model2.intercept_ == approx(a * model1.intercept_, rel=1e-5) + assert_allclose(model2.coef_, a * model1.coef_, rtol=1e-5) + + # coef(1-q; -a*y, X) = -a * coef(q; y, X) + model2 = QuantileRegressor(quantile=1 - quantile, **params).fit(X, -a * y) + assert model2.intercept_ == approx(-a * model1.intercept_, rel=1e-5) + assert_allclose(model2.coef_, -a * model1.coef_, rtol=1e-5) + + # coef(q; y + X @ g, X) = coef(q; y, X) + g + g_intercept, g_coef = rng.randn(), rng.randn(n_features) + model2 = QuantileRegressor(quantile=quantile, **params) + model2.fit(X, y + X @ g_coef + g_intercept) + assert model2.intercept_ == approx(model1.intercept_ + g_intercept) + assert_allclose(model2.coef_, model1.coef_ + g_coef, rtol=1e-6) + + # coef(q; y, X @ A) = A^-1 @ coef(q; y, X) + A = rng.randn(n_features, n_features) + model2 = QuantileRegressor(quantile=quantile, **params) + model2.fit(X @ A, y) + assert model2.intercept_ == approx(model1.intercept_, rel=1e-5) + assert_allclose(model2.coef_, np.linalg.solve(A, model1.coef_), rtol=1e-5) + + +@pytest.mark.skipif( + parse_version(sp_version.base_version) >= parse_version("1.11"), + reason="interior-point solver is not available in SciPy 1.11", +) +@pytest.mark.filterwarnings("ignore:`method='interior-point'` is deprecated") +def test_linprog_failure(): + """Test that linprog fails.""" + X = np.linspace(0, 10, num=10).reshape(-1, 1) + y = np.linspace(0, 10, num=10) + reg = QuantileRegressor( + alpha=0, solver="interior-point", solver_options={"maxiter": 1} + ) + + msg 
= "Linear programming for QuantileRegressor did not succeed." + with pytest.warns(ConvergenceWarning, match=msg): + reg.fit(X, y) + + +@pytest.mark.parametrize( + "sparse_container", CSC_CONTAINERS + CSR_CONTAINERS + COO_CONTAINERS +) +@pytest.mark.parametrize("solver", ["highs", "highs-ds", "highs-ipm"]) +@pytest.mark.parametrize("fit_intercept", [True, False]) +def test_sparse_input(sparse_container, solver, fit_intercept, global_random_seed): + """Test that sparse and dense X give same results.""" + n_informative = 10 + quantile_level = 0.6 + X, y = make_regression( + n_samples=300, + n_features=20, + n_informative=10, + random_state=global_random_seed, + noise=1.0, + ) + X_sparse = sparse_container(X) + alpha = 0.1 + quant_dense = QuantileRegressor( + quantile=quantile_level, alpha=alpha, fit_intercept=fit_intercept + ).fit(X, y) + quant_sparse = QuantileRegressor( + quantile=quantile_level, alpha=alpha, fit_intercept=fit_intercept, solver=solver + ).fit(X_sparse, y) + assert_allclose(quant_sparse.coef_, quant_dense.coef_, rtol=1e-2) + sparse_support = quant_sparse.coef_ != 0 + dense_support = quant_dense.coef_ != 0 + assert dense_support.sum() == pytest.approx(n_informative, abs=1) + assert sparse_support.sum() == pytest.approx(n_informative, abs=1) + if fit_intercept: + assert quant_sparse.intercept_ == approx(quant_dense.intercept_) + # check that we still predict fraction + empirical_coverage = np.mean(y < quant_sparse.predict(X_sparse)) + assert empirical_coverage == approx(quantile_level, abs=3e-2) + + +def test_error_interior_point_future(X_y_data, monkeypatch): + """Check that we will raise a proper error when requesting + `solver='interior-point'` in SciPy >= 1.11. + """ + X, y = X_y_data + import sklearn.linear_model._quantile + + with monkeypatch.context() as m: + m.setattr(sklearn.linear_model._quantile, "sp_version", parse_version("1.11.0")) + err_msg = "Solver interior-point is not anymore available in SciPy >= 1.11.0." 
+ with pytest.raises(ValueError, match=err_msg): + QuantileRegressor(solver="interior-point").fit(X, y) diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_ransac.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_ransac.py new file mode 100644 index 0000000000000000000000000000000000000000..7b2bc66160ef3f5e686da7c546cf01314035ae57 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_ransac.py @@ -0,0 +1,545 @@ +import numpy as np +import pytest +from numpy.testing import assert_array_almost_equal, assert_array_equal + +from sklearn.datasets import make_regression +from sklearn.exceptions import ConvergenceWarning +from sklearn.linear_model import ( + LinearRegression, + OrthogonalMatchingPursuit, + RANSACRegressor, + Ridge, +) +from sklearn.linear_model._ransac import _dynamic_max_trials +from sklearn.utils import check_random_state +from sklearn.utils._testing import assert_allclose +from sklearn.utils.fixes import COO_CONTAINERS, CSC_CONTAINERS, CSR_CONTAINERS + +# Generate coordinates of line +X = np.arange(-200, 200) +y = 0.2 * X + 20 +data = np.column_stack([X, y]) + +# Add some faulty data +rng = np.random.RandomState(1000) +outliers = np.unique(rng.randint(len(X), size=200)) +data[outliers, :] += 50 + rng.rand(len(outliers), 2) * 10 + +X = data[:, 0][:, np.newaxis] +y = data[:, 1] + + +def test_ransac_inliers_outliers(): + estimator = LinearRegression() + ransac_estimator = RANSACRegressor( + estimator, min_samples=2, residual_threshold=5, random_state=0 + ) + + # Estimate parameters of corrupted data + ransac_estimator.fit(X, y) + + # Ground truth / reference inlier mask + ref_inlier_mask = np.ones_like(ransac_estimator.inlier_mask_).astype(np.bool_) + ref_inlier_mask[outliers] = False + + assert_array_equal(ransac_estimator.inlier_mask_, ref_inlier_mask) + + +def test_ransac_is_data_valid(): + def is_data_valid(X, y): + assert X.shape[0] == 2 + assert y.shape[0] == 2 + return False + + rng = np.random.RandomState(0) + X = rng.rand(10, 2) + y = rng.rand(10, 1) + + estimator = LinearRegression() + ransac_estimator = RANSACRegressor( + estimator, + min_samples=2, + residual_threshold=5, + is_data_valid=is_data_valid, + random_state=0, + ) + with pytest.raises(ValueError): + ransac_estimator.fit(X, y) + + +def test_ransac_is_model_valid(): + def is_model_valid(estimator, X, y): + assert X.shape[0] == 2 + assert y.shape[0] == 2 + return False + + estimator = LinearRegression() + ransac_estimator = RANSACRegressor( + estimator, + min_samples=2, + residual_threshold=5, + is_model_valid=is_model_valid, + random_state=0, + ) + with pytest.raises(ValueError): + ransac_estimator.fit(X, y) + + +def test_ransac_max_trials(): + estimator = LinearRegression() + + ransac_estimator = RANSACRegressor( + estimator, + min_samples=2, + residual_threshold=5, + max_trials=0, + random_state=0, + ) + with pytest.raises(ValueError): + ransac_estimator.fit(X, y) + + # there is a 1e-9 chance it will take these many trials. 
No good reason + # 1e-2 isn't enough, can still happen + # 2 is the what ransac defines as min_samples = X.shape[1] + 1 + max_trials = _dynamic_max_trials(len(X) - len(outliers), X.shape[0], 2, 1 - 1e-9) + ransac_estimator = RANSACRegressor(estimator, min_samples=2) + for i in range(50): + ransac_estimator.set_params(min_samples=2, random_state=i) + ransac_estimator.fit(X, y) + assert ransac_estimator.n_trials_ < max_trials + 1 + + +def test_ransac_stop_n_inliers(): + estimator = LinearRegression() + ransac_estimator = RANSACRegressor( + estimator, + min_samples=2, + residual_threshold=5, + stop_n_inliers=2, + random_state=0, + ) + ransac_estimator.fit(X, y) + + assert ransac_estimator.n_trials_ == 1 + + +def test_ransac_stop_score(): + estimator = LinearRegression() + ransac_estimator = RANSACRegressor( + estimator, + min_samples=2, + residual_threshold=5, + stop_score=0, + random_state=0, + ) + ransac_estimator.fit(X, y) + + assert ransac_estimator.n_trials_ == 1 + + +def test_ransac_score(): + X = np.arange(100)[:, None] + y = np.zeros((100,)) + y[0] = 1 + y[1] = 100 + + estimator = LinearRegression() + ransac_estimator = RANSACRegressor( + estimator, min_samples=2, residual_threshold=0.5, random_state=0 + ) + ransac_estimator.fit(X, y) + + assert ransac_estimator.score(X[2:], y[2:]) == 1 + assert ransac_estimator.score(X[:2], y[:2]) < 1 + + +def test_ransac_predict(): + X = np.arange(100)[:, None] + y = np.zeros((100,)) + y[0] = 1 + y[1] = 100 + + estimator = LinearRegression() + ransac_estimator = RANSACRegressor( + estimator, min_samples=2, residual_threshold=0.5, random_state=0 + ) + ransac_estimator.fit(X, y) + + assert_array_equal(ransac_estimator.predict(X), np.zeros(100)) + + +def test_ransac_no_valid_data(): + def is_data_valid(X, y): + return False + + estimator = LinearRegression() + ransac_estimator = RANSACRegressor( + estimator, is_data_valid=is_data_valid, max_trials=5 + ) + + msg = "RANSAC could not find a valid consensus set" + with pytest.raises(ValueError, match=msg): + ransac_estimator.fit(X, y) + assert ransac_estimator.n_skips_no_inliers_ == 0 + assert ransac_estimator.n_skips_invalid_data_ == 5 + assert ransac_estimator.n_skips_invalid_model_ == 0 + + +def test_ransac_no_valid_model(): + def is_model_valid(estimator, X, y): + return False + + estimator = LinearRegression() + ransac_estimator = RANSACRegressor( + estimator, is_model_valid=is_model_valid, max_trials=5 + ) + + msg = "RANSAC could not find a valid consensus set" + with pytest.raises(ValueError, match=msg): + ransac_estimator.fit(X, y) + assert ransac_estimator.n_skips_no_inliers_ == 0 + assert ransac_estimator.n_skips_invalid_data_ == 0 + assert ransac_estimator.n_skips_invalid_model_ == 5 + + +def test_ransac_exceed_max_skips(): + def is_data_valid(X, y): + return False + + estimator = LinearRegression() + ransac_estimator = RANSACRegressor( + estimator, is_data_valid=is_data_valid, max_trials=5, max_skips=3 + ) + + msg = "RANSAC skipped more iterations than `max_skips`" + with pytest.raises(ValueError, match=msg): + ransac_estimator.fit(X, y) + assert ransac_estimator.n_skips_no_inliers_ == 0 + assert ransac_estimator.n_skips_invalid_data_ == 4 + assert ransac_estimator.n_skips_invalid_model_ == 0 + + +def test_ransac_warn_exceed_max_skips(): + global cause_skip + cause_skip = False + + def is_data_valid(X, y): + global cause_skip + if not cause_skip: + cause_skip = True + return True + else: + return False + + estimator = LinearRegression() + ransac_estimator = RANSACRegressor( + estimator, 
is_data_valid=is_data_valid, max_skips=3, max_trials=5 + ) + warning_message = ( + "RANSAC found a valid consensus set but exited " + "early due to skipping more iterations than " + "`max_skips`. See estimator attributes for " + "diagnostics." + ) + with pytest.warns(ConvergenceWarning, match=warning_message): + ransac_estimator.fit(X, y) + assert ransac_estimator.n_skips_no_inliers_ == 0 + assert ransac_estimator.n_skips_invalid_data_ == 4 + assert ransac_estimator.n_skips_invalid_model_ == 0 + + +@pytest.mark.parametrize( + "sparse_container", COO_CONTAINERS + CSR_CONTAINERS + CSC_CONTAINERS +) +def test_ransac_sparse(sparse_container): + X_sparse = sparse_container(X) + + estimator = LinearRegression() + ransac_estimator = RANSACRegressor( + estimator, min_samples=2, residual_threshold=5, random_state=0 + ) + ransac_estimator.fit(X_sparse, y) + + ref_inlier_mask = np.ones_like(ransac_estimator.inlier_mask_).astype(np.bool_) + ref_inlier_mask[outliers] = False + + assert_array_equal(ransac_estimator.inlier_mask_, ref_inlier_mask) + + +def test_ransac_none_estimator(): + estimator = LinearRegression() + + ransac_estimator = RANSACRegressor( + estimator, min_samples=2, residual_threshold=5, random_state=0 + ) + ransac_none_estimator = RANSACRegressor( + None, min_samples=2, residual_threshold=5, random_state=0 + ) + + ransac_estimator.fit(X, y) + ransac_none_estimator.fit(X, y) + + assert_array_almost_equal( + ransac_estimator.predict(X), ransac_none_estimator.predict(X) + ) + + +def test_ransac_min_n_samples(): + estimator = LinearRegression() + ransac_estimator1 = RANSACRegressor( + estimator, min_samples=2, residual_threshold=5, random_state=0 + ) + ransac_estimator2 = RANSACRegressor( + estimator, + min_samples=2.0 / X.shape[0], + residual_threshold=5, + random_state=0, + ) + ransac_estimator5 = RANSACRegressor( + estimator, min_samples=2, residual_threshold=5, random_state=0 + ) + ransac_estimator6 = RANSACRegressor(estimator, residual_threshold=5, random_state=0) + ransac_estimator7 = RANSACRegressor( + estimator, min_samples=X.shape[0] + 1, residual_threshold=5, random_state=0 + ) + # GH #19390 + ransac_estimator8 = RANSACRegressor( + Ridge(), min_samples=None, residual_threshold=5, random_state=0 + ) + + ransac_estimator1.fit(X, y) + ransac_estimator2.fit(X, y) + ransac_estimator5.fit(X, y) + ransac_estimator6.fit(X, y) + + assert_array_almost_equal( + ransac_estimator1.predict(X), ransac_estimator2.predict(X) + ) + assert_array_almost_equal( + ransac_estimator1.predict(X), ransac_estimator5.predict(X) + ) + assert_array_almost_equal( + ransac_estimator1.predict(X), ransac_estimator6.predict(X) + ) + + with pytest.raises(ValueError): + ransac_estimator7.fit(X, y) + + err_msg = "`min_samples` needs to be explicitly set" + with pytest.raises(ValueError, match=err_msg): + ransac_estimator8.fit(X, y) + + +def test_ransac_multi_dimensional_targets(): + estimator = LinearRegression() + ransac_estimator = RANSACRegressor( + estimator, min_samples=2, residual_threshold=5, random_state=0 + ) + + # 3-D target values + yyy = np.column_stack([y, y, y]) + + # Estimate parameters of corrupted data + ransac_estimator.fit(X, yyy) + + # Ground truth / reference inlier mask + ref_inlier_mask = np.ones_like(ransac_estimator.inlier_mask_).astype(np.bool_) + ref_inlier_mask[outliers] = False + + assert_array_equal(ransac_estimator.inlier_mask_, ref_inlier_mask) + + +def test_ransac_residual_loss(): + def loss_multi1(y_true, y_pred): + return np.sum(np.abs(y_true - y_pred), axis=1) + + def 
loss_multi2(y_true, y_pred): + return np.sum((y_true - y_pred) ** 2, axis=1) + + def loss_mono(y_true, y_pred): + return np.abs(y_true - y_pred) + + yyy = np.column_stack([y, y, y]) + + estimator = LinearRegression() + ransac_estimator0 = RANSACRegressor( + estimator, min_samples=2, residual_threshold=5, random_state=0 + ) + ransac_estimator1 = RANSACRegressor( + estimator, + min_samples=2, + residual_threshold=5, + random_state=0, + loss=loss_multi1, + ) + ransac_estimator2 = RANSACRegressor( + estimator, + min_samples=2, + residual_threshold=5, + random_state=0, + loss=loss_multi2, + ) + + # multi-dimensional + ransac_estimator0.fit(X, yyy) + ransac_estimator1.fit(X, yyy) + ransac_estimator2.fit(X, yyy) + assert_array_almost_equal( + ransac_estimator0.predict(X), ransac_estimator1.predict(X) + ) + assert_array_almost_equal( + ransac_estimator0.predict(X), ransac_estimator2.predict(X) + ) + + # one-dimensional + ransac_estimator0.fit(X, y) + ransac_estimator2.loss = loss_mono + ransac_estimator2.fit(X, y) + assert_array_almost_equal( + ransac_estimator0.predict(X), ransac_estimator2.predict(X) + ) + ransac_estimator3 = RANSACRegressor( + estimator, + min_samples=2, + residual_threshold=5, + random_state=0, + loss="squared_error", + ) + ransac_estimator3.fit(X, y) + assert_array_almost_equal( + ransac_estimator0.predict(X), ransac_estimator2.predict(X) + ) + + +def test_ransac_default_residual_threshold(): + estimator = LinearRegression() + ransac_estimator = RANSACRegressor(estimator, min_samples=2, random_state=0) + + # Estimate parameters of corrupted data + ransac_estimator.fit(X, y) + + # Ground truth / reference inlier mask + ref_inlier_mask = np.ones_like(ransac_estimator.inlier_mask_).astype(np.bool_) + ref_inlier_mask[outliers] = False + + assert_array_equal(ransac_estimator.inlier_mask_, ref_inlier_mask) + + +def test_ransac_dynamic_max_trials(): + # Numbers hand-calculated and confirmed on page 119 (Table 4.3) in + # Hartley, R.~I. 
and Zisserman, A., 2004, + # Multiple View Geometry in Computer Vision, Second Edition, + # Cambridge University Press, ISBN: 0521540518 + + # e = 0%, min_samples = X + assert _dynamic_max_trials(100, 100, 2, 0.99) == 1 + + # e = 5%, min_samples = 2 + assert _dynamic_max_trials(95, 100, 2, 0.99) == 2 + # e = 10%, min_samples = 2 + assert _dynamic_max_trials(90, 100, 2, 0.99) == 3 + # e = 30%, min_samples = 2 + assert _dynamic_max_trials(70, 100, 2, 0.99) == 7 + # e = 50%, min_samples = 2 + assert _dynamic_max_trials(50, 100, 2, 0.99) == 17 + + # e = 5%, min_samples = 8 + assert _dynamic_max_trials(95, 100, 8, 0.99) == 5 + # e = 10%, min_samples = 8 + assert _dynamic_max_trials(90, 100, 8, 0.99) == 9 + # e = 30%, min_samples = 8 + assert _dynamic_max_trials(70, 100, 8, 0.99) == 78 + # e = 50%, min_samples = 8 + assert _dynamic_max_trials(50, 100, 8, 0.99) == 1177 + + # e = 0%, min_samples = 10 + assert _dynamic_max_trials(1, 100, 10, 0) == 0 + assert _dynamic_max_trials(1, 100, 10, 1) == float("inf") + + +def test_ransac_fit_sample_weight(): + ransac_estimator = RANSACRegressor(random_state=0) + n_samples = y.shape[0] + weights = np.ones(n_samples) + ransac_estimator.fit(X, y, sample_weight=weights) + # sanity check + assert ransac_estimator.inlier_mask_.shape[0] == n_samples + + ref_inlier_mask = np.ones_like(ransac_estimator.inlier_mask_).astype(np.bool_) + ref_inlier_mask[outliers] = False + # check that mask is correct + assert_array_equal(ransac_estimator.inlier_mask_, ref_inlier_mask) + + # check that fit(X) = fit([X1, X2, X3],sample_weight = [n1, n2, n3]) where + # X = X1 repeated n1 times, X2 repeated n2 times and so forth + random_state = check_random_state(0) + X_ = random_state.randint(0, 200, [10, 1]) + y_ = np.ndarray.flatten(0.2 * X_ + 2) + sample_weight = random_state.randint(0, 10, 10) + outlier_X = random_state.randint(0, 1000, [1, 1]) + outlier_weight = random_state.randint(0, 10, 1) + outlier_y = random_state.randint(-1000, 0, 1) + + X_flat = np.append( + np.repeat(X_, sample_weight, axis=0), + np.repeat(outlier_X, outlier_weight, axis=0), + axis=0, + ) + y_flat = np.ndarray.flatten( + np.append( + np.repeat(y_, sample_weight, axis=0), + np.repeat(outlier_y, outlier_weight, axis=0), + axis=0, + ) + ) + ransac_estimator.fit(X_flat, y_flat) + ref_coef_ = ransac_estimator.estimator_.coef_ + + sample_weight = np.append(sample_weight, outlier_weight) + X_ = np.append(X_, outlier_X, axis=0) + y_ = np.append(y_, outlier_y) + ransac_estimator.fit(X_, y_, sample_weight=sample_weight) + + assert_allclose(ransac_estimator.estimator_.coef_, ref_coef_) + + # check that if estimator.fit doesn't support + # sample_weight, raises error + estimator = OrthogonalMatchingPursuit() + ransac_estimator = RANSACRegressor(estimator, min_samples=10) + + err_msg = f"{estimator.__class__.__name__} does not support sample_weight." 
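+    # Side note on test_ransac_dynamic_max_trials above (sketch, illustration
+    # only): the hand-checked values follow the usual RANSAC bound
+    #   max_trials = ceil(log(1 - probability) / log(1 - inlier_ratio ** min_samples))
+    # e.g. ceil(log(0.01) / log(1 - 0.5 ** 8)) == 1177 for an inlier ratio of
+    # 50% and min_samples=8.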
+ with pytest.raises(ValueError, match=err_msg): + ransac_estimator.fit(X, y, sample_weight=weights) + + +def test_ransac_final_model_fit_sample_weight(): + X, y = make_regression(n_samples=1000, random_state=10) + rng = check_random_state(42) + sample_weight = rng.randint(1, 4, size=y.shape[0]) + sample_weight = sample_weight / sample_weight.sum() + ransac = RANSACRegressor(random_state=0) + ransac.fit(X, y, sample_weight=sample_weight) + + final_model = LinearRegression() + mask_samples = ransac.inlier_mask_ + final_model.fit( + X[mask_samples], y[mask_samples], sample_weight=sample_weight[mask_samples] + ) + + assert_allclose(ransac.estimator_.coef_, final_model.coef_, atol=1e-12) + + +def test_perfect_horizontal_line(): + """Check that we can fit a line where all samples are inliers. + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/19497 + """ + X = np.arange(100)[:, None] + y = np.zeros((100,)) + + estimator = LinearRegression() + ransac_estimator = RANSACRegressor(estimator, random_state=0) + ransac_estimator.fit(X, y) + + assert_allclose(ransac_estimator.estimator_.coef_, 0.0) + assert_allclose(ransac_estimator.estimator_.intercept_, 0.0) diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_ridge.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_ridge.py new file mode 100644 index 0000000000000000000000000000000000000000..24515195fb7ccd674091ab6b90a91b43a59a14aa --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_ridge.py @@ -0,0 +1,2380 @@ +import warnings +from itertools import product + +import numpy as np +import pytest +from scipy import linalg + +from sklearn import config_context, datasets +from sklearn.base import clone +from sklearn.datasets import ( + make_classification, + make_low_rank_matrix, + make_multilabel_classification, + make_regression, +) +from sklearn.exceptions import ConvergenceWarning +from sklearn.linear_model import ( + LinearRegression, + Ridge, + RidgeClassifier, + RidgeClassifierCV, + RidgeCV, + ridge_regression, +) +from sklearn.linear_model._ridge import ( + _check_gcv_mode, + _RidgeGCV, + _solve_cholesky, + _solve_cholesky_kernel, + _solve_lbfgs, + _solve_svd, + _X_CenterStackOp, +) +from sklearn.metrics import get_scorer, make_scorer, mean_squared_error +from sklearn.model_selection import ( + GridSearchCV, + GroupKFold, + KFold, + LeaveOneOut, + cross_val_predict, +) +from sklearn.preprocessing import minmax_scale +from sklearn.utils import check_random_state +from sklearn.utils._array_api import ( + _NUMPY_NAMESPACE_NAMES, + _atol_for_type, + _convert_to_numpy, + _get_namespace_device_dtype_ids, + yield_namespace_device_dtype_combinations, + yield_namespaces, +) +from sklearn.utils._test_common.instance_generator import _get_check_estimator_ids +from sklearn.utils._testing import ( + assert_allclose, + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, + ignore_warnings, +) +from sklearn.utils.estimator_checks import ( + _array_api_for_tests, + check_array_api_input_and_values, +) +from sklearn.utils.fixes import ( + _IS_32BIT, + COO_CONTAINERS, + CSC_CONTAINERS, + CSR_CONTAINERS, + DOK_CONTAINERS, + LIL_CONTAINERS, +) + +SOLVERS = ["svd", "sparse_cg", "cholesky", "lsqr", "sag", "saga"] +SPARSE_SOLVERS_WITH_INTERCEPT = ("sparse_cg", "sag") +SPARSE_SOLVERS_WITHOUT_INTERCEPT = ("sparse_cg", "cholesky", "lsqr", "sag", "saga") + +diabetes = datasets.load_diabetes() +X_diabetes, y_diabetes = diabetes.data, 
diabetes.target +ind = np.arange(X_diabetes.shape[0]) +rng = np.random.RandomState(0) +rng.shuffle(ind) +ind = ind[:200] +X_diabetes, y_diabetes = X_diabetes[ind], y_diabetes[ind] + +iris = datasets.load_iris() +X_iris, y_iris = iris.data, iris.target + + +def _accuracy_callable(y_test, y_pred, **kwargs): + return np.mean(y_test == y_pred) + + +def _mean_squared_error_callable(y_test, y_pred): + return ((y_test - y_pred) ** 2).mean() + + +@pytest.fixture(params=["long", "wide"]) +def ols_ridge_dataset(global_random_seed, request): + """Dataset with OLS and Ridge solutions, well conditioned X. + + The construction is based on the SVD decomposition of X = U S V'. + + Parameters + ---------- + type : {"long", "wide"} + If "long", then n_samples > n_features. + If "wide", then n_features > n_samples. + + For "wide", we return the minimum norm solution w = X' (XX')^-1 y: + + min ||w||_2 subject to X w = y + + Returns + ------- + X : ndarray + Last column of 1, i.e. intercept. + y : ndarray + coef_ols : ndarray of shape + Minimum norm OLS solutions, i.e. min ||X w - y||_2_2 (with minimum ||w||_2 in + case of ambiguity) + Last coefficient is intercept. + coef_ridge : ndarray of shape (5,) + Ridge solution with alpha=1, i.e. min ||X w - y||_2_2 + ||w||_2^2. + Last coefficient is intercept. + """ + # Make larger dim more than double as big as the smaller one. + # This helps when constructing singular matrices like (X, X). + if request.param == "long": + n_samples, n_features = 12, 4 + else: + n_samples, n_features = 4, 12 + k = min(n_samples, n_features) + rng = np.random.RandomState(global_random_seed) + X = make_low_rank_matrix( + n_samples=n_samples, n_features=n_features, effective_rank=k, random_state=rng + ) + X[:, -1] = 1 # last columns acts as intercept + U, s, Vt = linalg.svd(X) + assert np.all(s > 1e-3) # to be sure + U1, U2 = U[:, :k], U[:, k:] + Vt1, _ = Vt[:k, :], Vt[k:, :] + + if request.param == "long": + # Add a term that vanishes in the product X'y + coef_ols = rng.uniform(low=-10, high=10, size=n_features) + y = X @ coef_ols + y += U2 @ rng.normal(size=n_samples - n_features) ** 2 + else: + y = rng.uniform(low=-10, high=10, size=n_samples) + # w = X'(XX')^-1 y = V s^-1 U' y + coef_ols = Vt1.T @ np.diag(1 / s) @ U1.T @ y + + # Add penalty alpha * ||coef||_2^2 for alpha=1 and solve via normal equations. + # Note that the problem is well conditioned such that we get accurate results. + alpha = 1 + d = alpha * np.identity(n_features) + d[-1, -1] = 0 # intercept gets no penalty + coef_ridge = linalg.solve(X.T @ X + d, X.T @ y) + + # To be sure + R_OLS = y - X @ coef_ols + R_Ridge = y - X @ coef_ridge + assert np.linalg.norm(R_OLS) < np.linalg.norm(R_Ridge) + + return X, y, coef_ols, coef_ridge + + +@pytest.mark.parametrize("solver", SOLVERS) +@pytest.mark.parametrize("fit_intercept", [True, False]) +def test_ridge_regression(solver, fit_intercept, ols_ridge_dataset, global_random_seed): + """Test that Ridge converges for all solvers to correct solution. + + We work with a simple constructed data set with known solution. + """ + X, y, _, coef = ols_ridge_dataset + alpha = 1.0 # because ols_ridge_dataset uses this. + params = dict( + alpha=alpha, + fit_intercept=True, + solver=solver, + tol=1e-15 if solver in ("sag", "saga") else 1e-10, + random_state=global_random_seed, + ) + + # Calculate residuals and R2. 
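+    # Sketch of what follows (illustration only): R2 is computed against the
+    # null model that always predicts the mean of y,
+    #   R2 = 1 - sum((y - X @ coef) ** 2) / sum((y - mean(y)) ** 2)
+    # and the fitted estimator must reproduce it through model.score(X, y).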
+ res_null = y - np.mean(y) + res_Ridge = y - X @ coef + R2_Ridge = 1 - np.sum(res_Ridge**2) / np.sum(res_null**2) + + model = Ridge(**params) + X = X[:, :-1] # remove intercept + if fit_intercept: + intercept = coef[-1] + else: + X = X - X.mean(axis=0) + y = y - y.mean() + intercept = 0 + model.fit(X, y) + coef = coef[:-1] + + assert model.intercept_ == pytest.approx(intercept) + assert_allclose(model.coef_, coef) + assert model.score(X, y) == pytest.approx(R2_Ridge) + + # Same with sample_weight. + model = Ridge(**params).fit(X, y, sample_weight=np.ones(X.shape[0])) + assert model.intercept_ == pytest.approx(intercept) + assert_allclose(model.coef_, coef) + assert model.score(X, y) == pytest.approx(R2_Ridge) + + assert model.solver_ == solver + + +@pytest.mark.parametrize("solver", SOLVERS) +@pytest.mark.parametrize("fit_intercept", [True, False]) +def test_ridge_regression_hstacked_X( + solver, fit_intercept, ols_ridge_dataset, global_random_seed +): + """Test that Ridge converges for all solvers to correct solution on hstacked data. + + We work with a simple constructed data set with known solution. + Fit on [X] with alpha is the same as fit on [X, X]/2 with alpha/2. + For long X, [X, X] is a singular matrix. + """ + X, y, _, coef = ols_ridge_dataset + n_samples, n_features = X.shape + alpha = 1.0 # because ols_ridge_dataset uses this. + + model = Ridge( + alpha=alpha / 2, + fit_intercept=fit_intercept, + solver=solver, + tol=1e-15 if solver in ("sag", "saga") else 1e-10, + random_state=global_random_seed, + ) + X = X[:, :-1] # remove intercept + X = 0.5 * np.concatenate((X, X), axis=1) + assert np.linalg.matrix_rank(X) <= min(n_samples, n_features - 1) + if fit_intercept: + intercept = coef[-1] + else: + X = X - X.mean(axis=0) + y = y - y.mean() + intercept = 0 + model.fit(X, y) + coef = coef[:-1] + + assert model.intercept_ == pytest.approx(intercept) + # coefficients are not all on the same magnitude, adding a small atol to + # make this test less brittle + assert_allclose(model.coef_, np.r_[coef, coef], atol=1e-8) + + +@pytest.mark.parametrize("solver", SOLVERS) +@pytest.mark.parametrize("fit_intercept", [True, False]) +def test_ridge_regression_vstacked_X( + solver, fit_intercept, ols_ridge_dataset, global_random_seed +): + """Test that Ridge converges for all solvers to correct solution on vstacked data. + + We work with a simple constructed data set with known solution. + Fit on [X] with alpha is the same as fit on [X], [y] + [X], [y] with 2 * alpha. + For wide X, [X', X'] is a singular matrix. + """ + X, y, _, coef = ols_ridge_dataset + n_samples, n_features = X.shape + alpha = 1.0 # because ols_ridge_dataset uses this. 
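+    # Why alpha/2 with X := 0.5 * [X, X] reproduces the original fit (sketch,
+    # illustration only): for v = [w, w] we have (0.5 * [X, X]) @ v = X @ w and
+    # (alpha / 2) * ||v||_2^2 = alpha * ||w||_2^2, so the two penalized
+    # objectives agree and the duplicated coefficients fitted below must match
+    # the reference coef.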
+ + model = Ridge( + alpha=2 * alpha, + fit_intercept=fit_intercept, + solver=solver, + tol=1e-15 if solver in ("sag", "saga") else 1e-10, + random_state=global_random_seed, + ) + X = X[:, :-1] # remove intercept + X = np.concatenate((X, X), axis=0) + assert np.linalg.matrix_rank(X) <= min(n_samples, n_features) + y = np.r_[y, y] + if fit_intercept: + intercept = coef[-1] + else: + X = X - X.mean(axis=0) + y = y - y.mean() + intercept = 0 + model.fit(X, y) + coef = coef[:-1] + + assert model.intercept_ == pytest.approx(intercept) + # coefficients are not all on the same magnitude, adding a small atol to + # make this test less brittle + assert_allclose(model.coef_, coef, atol=1e-8) + + +@pytest.mark.parametrize("solver", SOLVERS) +@pytest.mark.parametrize("fit_intercept", [True, False]) +def test_ridge_regression_unpenalized( + solver, fit_intercept, ols_ridge_dataset, global_random_seed +): + """Test that unpenalized Ridge = OLS converges for all solvers to correct solution. + + We work with a simple constructed data set with known solution. + Note: This checks the minimum norm solution for wide X, i.e. + n_samples < n_features: + min ||w||_2 subject to X w = y + """ + X, y, coef, _ = ols_ridge_dataset + n_samples, n_features = X.shape + alpha = 0 # OLS + params = dict( + alpha=alpha, + fit_intercept=fit_intercept, + solver=solver, + tol=1e-15 if solver in ("sag", "saga") else 1e-10, + random_state=global_random_seed, + ) + + model = Ridge(**params) + # Note that cholesky might give a warning: "Singular matrix in solving dual + # problem. Using least-squares solution instead." + if fit_intercept: + X = X[:, :-1] # remove intercept + intercept = coef[-1] + coef = coef[:-1] + else: + intercept = 0 + model.fit(X, y) + + # FIXME: `assert_allclose(model.coef_, coef)` should work for all cases but fails + # for the wide/fat case with n_features > n_samples. The current Ridge solvers do + # NOT return the minimum norm solution with fit_intercept=True. + if n_samples > n_features or not fit_intercept: + assert model.intercept_ == pytest.approx(intercept) + assert_allclose(model.coef_, coef) + else: + # As it is an underdetermined problem, residuals = 0. This shows that we get + # a solution to X w = y .... + assert_allclose(model.predict(X), y) + assert_allclose(X @ coef + intercept, y) + # But it is not the minimum norm solution. (This should be equal.) + assert np.linalg.norm(np.r_[model.intercept_, model.coef_]) > np.linalg.norm( + np.r_[intercept, coef] + ) + + pytest.xfail(reason="Ridge does not provide the minimum norm solution.") + assert model.intercept_ == pytest.approx(intercept) + assert_allclose(model.coef_, coef) + + +@pytest.mark.parametrize("solver", SOLVERS) +@pytest.mark.parametrize("fit_intercept", [True, False]) +def test_ridge_regression_unpenalized_hstacked_X( + solver, fit_intercept, ols_ridge_dataset, global_random_seed +): + """Test that unpenalized Ridge = OLS converges for all solvers to correct solution. + + We work with a simple constructed data set with known solution. + OLS fit on [X] is the same as fit on [X, X]/2. 
+ For long X, [X, X] is a singular matrix and we check against the minimum norm + solution: + min ||w||_2 subject to min ||X w - y||_2 + """ + X, y, coef, _ = ols_ridge_dataset + n_samples, n_features = X.shape + alpha = 0 # OLS + + model = Ridge( + alpha=alpha, + fit_intercept=fit_intercept, + solver=solver, + tol=1e-15 if solver in ("sag", "saga") else 1e-10, + random_state=global_random_seed, + ) + if fit_intercept: + X = X[:, :-1] # remove intercept + intercept = coef[-1] + coef = coef[:-1] + else: + intercept = 0 + X = 0.5 * np.concatenate((X, X), axis=1) + assert np.linalg.matrix_rank(X) <= min(n_samples, n_features) + model.fit(X, y) + + if n_samples > n_features or not fit_intercept: + assert model.intercept_ == pytest.approx(intercept) + if solver == "cholesky": + # Cholesky is a bad choice for singular X. + pytest.skip() + assert_allclose(model.coef_, np.r_[coef, coef]) + else: + # FIXME: Same as in test_ridge_regression_unpenalized. + # As it is an underdetermined problem, residuals = 0. This shows that we get + # a solution to X w = y .... + assert_allclose(model.predict(X), y) + # But it is not the minimum norm solution. (This should be equal.) + assert np.linalg.norm(np.r_[model.intercept_, model.coef_]) > np.linalg.norm( + np.r_[intercept, coef, coef] + ) + + pytest.xfail(reason="Ridge does not provide the minimum norm solution.") + assert model.intercept_ == pytest.approx(intercept) + assert_allclose(model.coef_, np.r_[coef, coef]) + + +@pytest.mark.parametrize("solver", SOLVERS) +@pytest.mark.parametrize("fit_intercept", [True, False]) +def test_ridge_regression_unpenalized_vstacked_X( + solver, fit_intercept, ols_ridge_dataset, global_random_seed +): + """Test that unpenalized Ridge = OLS converges for all solvers to correct solution. + + We work with a simple constructed data set with known solution. + OLS fit on [X] is the same as fit on [X], [y] + [X], [y]. + For wide X, [X', X'] is a singular matrix and we check against the minimum norm + solution: + min ||w||_2 subject to X w = y + """ + X, y, coef, _ = ols_ridge_dataset + n_samples, n_features = X.shape + alpha = 0 # OLS + + model = Ridge( + alpha=alpha, + fit_intercept=fit_intercept, + solver=solver, + tol=1e-15 if solver in ("sag", "saga") else 1e-10, + random_state=global_random_seed, + ) + + if fit_intercept: + X = X[:, :-1] # remove intercept + intercept = coef[-1] + coef = coef[:-1] + else: + intercept = 0 + X = np.concatenate((X, X), axis=0) + assert np.linalg.matrix_rank(X) <= min(n_samples, n_features) + y = np.r_[y, y] + model.fit(X, y) + + if n_samples > n_features or not fit_intercept: + assert model.intercept_ == pytest.approx(intercept) + assert_allclose(model.coef_, coef) + else: + # FIXME: Same as in test_ridge_regression_unpenalized. + # As it is an underdetermined problem, residuals = 0. This shows that we get + # a solution to X w = y .... + assert_allclose(model.predict(X), y) + # But it is not the minimum norm solution. (This should be equal.) 
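+        # (For a consistent system X w = y, the minimum L2-norm solution is the
+        # pseudoinverse solution np.linalg.pinv(X) @ y; the norm comparison below
+        # shows the fitted coefficients are a different, larger-norm interpolant.)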
+ assert np.linalg.norm(np.r_[model.intercept_, model.coef_]) > np.linalg.norm( + np.r_[intercept, coef] + ) + + pytest.xfail(reason="Ridge does not provide the minimum norm solution.") + assert model.intercept_ == pytest.approx(intercept) + assert_allclose(model.coef_, coef) + + +@pytest.mark.parametrize("solver", SOLVERS) +@pytest.mark.parametrize("fit_intercept", [True, False]) +@pytest.mark.parametrize("sparse_container", [None] + CSR_CONTAINERS) +@pytest.mark.parametrize("alpha", [1.0, 1e-2]) +def test_ridge_regression_sample_weights( + solver, + fit_intercept, + sparse_container, + alpha, + ols_ridge_dataset, + global_random_seed, +): + """Test that Ridge with sample weights gives correct results. + + We use the following trick: + ||y - Xw||_2 = (z - Aw)' W (z - Aw) + for z=[y, y], A' = [X', X'] (vstacked), and W[:n/2] + W[n/2:] = 1, W=diag(W) + """ + if sparse_container is not None: + if fit_intercept and solver not in SPARSE_SOLVERS_WITH_INTERCEPT: + pytest.skip() + elif not fit_intercept and solver not in SPARSE_SOLVERS_WITHOUT_INTERCEPT: + pytest.skip() + X, y, _, coef = ols_ridge_dataset + n_samples, n_features = X.shape + sw = rng.uniform(low=0, high=1, size=n_samples) + + model = Ridge( + alpha=alpha, + fit_intercept=fit_intercept, + solver=solver, + tol=1e-15 if solver in ["sag", "saga"] else 1e-10, + max_iter=100_000, + random_state=global_random_seed, + ) + X = X[:, :-1] # remove intercept + X = np.concatenate((X, X), axis=0) + y = np.r_[y, y] + sw = np.r_[sw, 1 - sw] * alpha + if fit_intercept: + intercept = coef[-1] + else: + X = X - X.mean(axis=0) + y = y - y.mean() + intercept = 0 + if sparse_container is not None: + X = sparse_container(X) + model.fit(X, y, sample_weight=sw) + coef = coef[:-1] + + assert model.intercept_ == pytest.approx(intercept) + assert_allclose(model.coef_, coef) + + +def test_primal_dual_relationship(): + y = y_diabetes.reshape(-1, 1) + coef = _solve_cholesky(X_diabetes, y, alpha=[1e-2]) + K = np.dot(X_diabetes, X_diabetes.T) + dual_coef = _solve_cholesky_kernel(K, y, alpha=[1e-2]) + coef2 = np.dot(X_diabetes.T, dual_coef).T + assert_array_almost_equal(coef, coef2) + + +def test_ridge_regression_convergence_fail(): + rng = np.random.RandomState(0) + y = rng.randn(5) + X = rng.randn(5, 10) + warning_message = r"sparse_cg did not converge after [0-9]+ iterations." 
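+    # With tol=0.0 the stopping criterion is effectively unreachable, so the
+    # conjugate gradient iterations hit their limit and ridge_regression is
+    # expected to emit the ConvergenceWarning matched above.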
+ with pytest.warns(ConvergenceWarning, match=warning_message): + ridge_regression( + X, y, alpha=1.0, solver="sparse_cg", tol=0.0, max_iter=None, verbose=1 + ) + + +def test_ridge_shapes_type(): + # Test shape of coef_ and intercept_ + rng = np.random.RandomState(0) + n_samples, n_features = 5, 10 + X = rng.randn(n_samples, n_features) + y = rng.randn(n_samples) + Y1 = y[:, np.newaxis] + Y = np.c_[y, 1 + y] + + ridge = Ridge() + + ridge.fit(X, y) + assert ridge.coef_.shape == (n_features,) + assert ridge.intercept_.shape == () + assert isinstance(ridge.coef_, np.ndarray) + assert isinstance(ridge.intercept_, float) + + ridge.fit(X, Y1) + assert ridge.coef_.shape == (n_features,) + assert ridge.intercept_.shape == (1,) + assert isinstance(ridge.coef_, np.ndarray) + assert isinstance(ridge.intercept_, np.ndarray) + + ridge.fit(X, Y) + assert ridge.coef_.shape == (2, n_features) + assert ridge.intercept_.shape == (2,) + assert isinstance(ridge.coef_, np.ndarray) + assert isinstance(ridge.intercept_, np.ndarray) + + +def test_ridge_intercept(): + # Test intercept with multiple targets GH issue #708 + rng = np.random.RandomState(0) + n_samples, n_features = 5, 10 + X = rng.randn(n_samples, n_features) + y = rng.randn(n_samples) + Y = np.c_[y, 1.0 + y] + + ridge = Ridge() + + ridge.fit(X, y) + intercept = ridge.intercept_ + + ridge.fit(X, Y) + assert_almost_equal(ridge.intercept_[0], intercept) + assert_almost_equal(ridge.intercept_[1], intercept + 1.0) + + +def test_ridge_vs_lstsq(): + # On alpha=0., Ridge and OLS yield the same solution. + + rng = np.random.RandomState(0) + # we need more samples than features + n_samples, n_features = 5, 4 + y = rng.randn(n_samples) + X = rng.randn(n_samples, n_features) + + ridge = Ridge(alpha=0.0, fit_intercept=False) + ols = LinearRegression(fit_intercept=False) + + ridge.fit(X, y) + ols.fit(X, y) + assert_almost_equal(ridge.coef_, ols.coef_) + + ridge.fit(X, y) + ols.fit(X, y) + assert_almost_equal(ridge.coef_, ols.coef_) + + +def test_ridge_individual_penalties(): + # Tests the ridge object using individual penalties + + rng = np.random.RandomState(42) + + n_samples, n_features, n_targets = 20, 10, 5 + X = rng.randn(n_samples, n_features) + y = rng.randn(n_samples, n_targets) + + penalties = np.arange(n_targets) + + coef_cholesky = np.array( + [ + Ridge(alpha=alpha, solver="cholesky").fit(X, target).coef_ + for alpha, target in zip(penalties, y.T) + ] + ) + + coefs_indiv_pen = [ + Ridge(alpha=penalties, solver=solver, tol=1e-12).fit(X, y).coef_ + for solver in ["svd", "sparse_cg", "lsqr", "cholesky", "sag", "saga"] + ] + for coef_indiv_pen in coefs_indiv_pen: + assert_array_almost_equal(coef_cholesky, coef_indiv_pen) + + # Test error is raised when number of targets and penalties do not match. 
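+    # (penalties[:-1] leaves 4 alphas for the 5 targets, hence the "4 != 5"
+    # message checked below.)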
+ ridge = Ridge(alpha=penalties[:-1]) + err_msg = "Number of targets and number of penalties do not correspond: 4 != 5" + with pytest.raises(ValueError, match=err_msg): + ridge.fit(X, y) + + +@pytest.mark.parametrize("n_col", [(), (1,), (3,)]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_X_CenterStackOp(n_col, csr_container): + rng = np.random.RandomState(0) + X = rng.randn(11, 8) + X_m = rng.randn(8) + sqrt_sw = rng.randn(len(X)) + Y = rng.randn(11, *n_col) + A = rng.randn(9, *n_col) + operator = _X_CenterStackOp(csr_container(X), X_m, sqrt_sw) + reference_operator = np.hstack([X - sqrt_sw[:, None] * X_m, sqrt_sw[:, None]]) + assert_allclose(reference_operator.dot(A), operator.dot(A)) + assert_allclose(reference_operator.T.dot(Y), operator.T.dot(Y)) + + +@pytest.mark.parametrize("shape", [(10, 1), (13, 9), (3, 7), (2, 2), (20, 20)]) +@pytest.mark.parametrize("uniform_weights", [True, False]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_compute_gram(shape, uniform_weights, csr_container): + rng = np.random.RandomState(0) + X = rng.randn(*shape) + if uniform_weights: + sw = np.ones(X.shape[0]) + else: + sw = rng.chisquare(1, shape[0]) + sqrt_sw = np.sqrt(sw) + X_mean = np.average(X, axis=0, weights=sw) + X_centered = (X - X_mean) * sqrt_sw[:, None] + true_gram = X_centered.dot(X_centered.T) + X_sparse = csr_container(X * sqrt_sw[:, None]) + gcv = _RidgeGCV(fit_intercept=True) + computed_gram, computed_mean = gcv._compute_gram(X_sparse, sqrt_sw) + assert_allclose(X_mean, computed_mean) + assert_allclose(true_gram, computed_gram) + + +@pytest.mark.parametrize("shape", [(10, 1), (13, 9), (3, 7), (2, 2), (20, 20)]) +@pytest.mark.parametrize("uniform_weights", [True, False]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_compute_covariance(shape, uniform_weights, csr_container): + rng = np.random.RandomState(0) + X = rng.randn(*shape) + if uniform_weights: + sw = np.ones(X.shape[0]) + else: + sw = rng.chisquare(1, shape[0]) + sqrt_sw = np.sqrt(sw) + X_mean = np.average(X, axis=0, weights=sw) + X_centered = (X - X_mean) * sqrt_sw[:, None] + true_covariance = X_centered.T.dot(X_centered) + X_sparse = csr_container(X * sqrt_sw[:, None]) + gcv = _RidgeGCV(fit_intercept=True) + computed_cov, computed_mean = gcv._compute_covariance(X_sparse, sqrt_sw) + assert_allclose(X_mean, computed_mean) + assert_allclose(true_covariance, computed_cov) + + +def _make_sparse_offset_regression( + n_samples=100, + n_features=100, + proportion_nonzero=0.5, + n_informative=10, + n_targets=1, + bias=13.0, + X_offset=30.0, + noise=30.0, + shuffle=True, + coef=False, + positive=False, + random_state=None, +): + X, y, c = make_regression( + n_samples=n_samples, + n_features=n_features, + n_informative=n_informative, + n_targets=n_targets, + bias=bias, + noise=noise, + shuffle=shuffle, + coef=True, + random_state=random_state, + ) + if n_features == 1: + c = np.asarray([c]) + X += X_offset + mask = ( + np.random.RandomState(random_state).binomial(1, proportion_nonzero, X.shape) > 0 + ) + removed_X = X.copy() + X[~mask] = 0.0 + removed_X[mask] = 0.0 + y -= removed_X.dot(c) + if positive: + y += X.dot(np.abs(c) + 1 - c) + c = np.abs(c) + 1 + if n_features == 1: + c = c[0] + if coef: + return X, y, c + return X, y + + +@pytest.mark.parametrize( + "solver, sparse_container", + ( + (solver, sparse_container) + for (solver, sparse_container) in product( + ["cholesky", "sag", "sparse_cg", "lsqr", "saga", "ridgecv"], + [None] + CSR_CONTAINERS, + ) + if 
sparse_container is None or solver in ["sparse_cg", "ridgecv"] + ), +) +@pytest.mark.parametrize( + "n_samples,dtype,proportion_nonzero", + [(20, "float32", 0.1), (40, "float32", 1.0), (20, "float64", 0.2)], +) +def test_solver_consistency( + solver, proportion_nonzero, n_samples, dtype, sparse_container, global_random_seed +): + alpha = 1.0 + noise = 50.0 if proportion_nonzero > 0.9 else 500.0 + X, y = _make_sparse_offset_regression( + bias=10, + n_features=30, + proportion_nonzero=proportion_nonzero, + noise=noise, + random_state=global_random_seed, + n_samples=n_samples, + ) + # Manually scale the data to avoid pathological cases. We use + # minmax_scale to deal with the sparse case without breaking + # the sparsity pattern. + X = minmax_scale(X) + + svd_ridge = Ridge(solver="svd", alpha=alpha).fit(X, y) + X = X.astype(dtype, copy=False) + y = y.astype(dtype, copy=False) + if sparse_container is not None: + X = sparse_container(X) + if solver == "ridgecv": + ridge = RidgeCV(alphas=[alpha]) + else: + if solver.startswith("sag"): + # Avoid ConvergenceWarning for sag and saga solvers. + tol = 1e-7 + max_iter = 100_000 + else: + tol = 1e-10 + max_iter = None + + ridge = Ridge( + alpha=alpha, + solver=solver, + max_iter=max_iter, + tol=tol, + random_state=global_random_seed, + ) + ridge.fit(X, y) + assert_allclose(ridge.coef_, svd_ridge.coef_, atol=1e-3, rtol=1e-3) + assert_allclose(ridge.intercept_, svd_ridge.intercept_, atol=1e-3, rtol=1e-3) + + +@pytest.mark.parametrize("gcv_mode", ["svd", "eigen"]) +@pytest.mark.parametrize("X_container", [np.asarray] + CSR_CONTAINERS) +@pytest.mark.parametrize("X_shape", [(11, 8), (11, 20)]) +@pytest.mark.parametrize("fit_intercept", [True, False]) +@pytest.mark.parametrize( + "y_shape, noise", + [ + ((11,), 1.0), + ((11, 1), 30.0), + ((11, 3), 150.0), + ], +) +def test_ridge_gcv_vs_ridge_loo_cv( + gcv_mode, X_container, X_shape, y_shape, fit_intercept, noise +): + n_samples, n_features = X_shape + n_targets = y_shape[-1] if len(y_shape) == 2 else 1 + X, y = _make_sparse_offset_regression( + n_samples=n_samples, + n_features=n_features, + n_targets=n_targets, + random_state=0, + shuffle=False, + noise=noise, + n_informative=5, + ) + y = y.reshape(y_shape) + + alphas = [1e-3, 0.1, 1.0, 10.0, 1e3] + loo_ridge = RidgeCV( + cv=n_samples, + fit_intercept=fit_intercept, + alphas=alphas, + scoring="neg_mean_squared_error", + ) + gcv_ridge = RidgeCV( + gcv_mode=gcv_mode, + fit_intercept=fit_intercept, + alphas=alphas, + ) + + loo_ridge.fit(X, y) + + X_gcv = X_container(X) + gcv_ridge.fit(X_gcv, y) + + assert gcv_ridge.alpha_ == pytest.approx(loo_ridge.alpha_) + assert_allclose(gcv_ridge.coef_, loo_ridge.coef_, rtol=1e-3) + assert_allclose(gcv_ridge.intercept_, loo_ridge.intercept_, rtol=1e-3) + + +def test_ridge_loo_cv_asym_scoring(): + # checking on asymmetric scoring + scoring = "explained_variance" + n_samples, n_features = 10, 5 + n_targets = 1 + X, y = _make_sparse_offset_regression( + n_samples=n_samples, + n_features=n_features, + n_targets=n_targets, + random_state=0, + shuffle=False, + noise=1, + n_informative=5, + ) + + alphas = [1e-3, 0.1, 1.0, 10.0, 1e3] + loo_ridge = RidgeCV( + cv=n_samples, fit_intercept=True, alphas=alphas, scoring=scoring + ) + + gcv_ridge = RidgeCV(fit_intercept=True, alphas=alphas, scoring=scoring) + + loo_ridge.fit(X, y) + gcv_ridge.fit(X, y) + + assert gcv_ridge.alpha_ == pytest.approx(loo_ridge.alpha_), ( + f"{gcv_ridge.alpha_=}, {loo_ridge.alpha_=}" + ) + assert_allclose(gcv_ridge.coef_, loo_ridge.coef_, rtol=1e-3) + 
assert_allclose(gcv_ridge.intercept_, loo_ridge.intercept_, rtol=1e-3) + + +@pytest.mark.parametrize("gcv_mode", ["svd", "eigen"]) +@pytest.mark.parametrize("X_container", [np.asarray] + CSR_CONTAINERS) +@pytest.mark.parametrize("n_features", [8, 20]) +@pytest.mark.parametrize( + "y_shape, fit_intercept, noise", + [ + ((11,), True, 1.0), + ((11, 1), True, 20.0), + ((11, 3), True, 150.0), + ((11, 3), False, 30.0), + ], +) +def test_ridge_gcv_sample_weights( + gcv_mode, X_container, fit_intercept, n_features, y_shape, noise +): + alphas = [1e-3, 0.1, 1.0, 10.0, 1e3] + rng = np.random.RandomState(0) + n_targets = y_shape[-1] if len(y_shape) == 2 else 1 + X, y = _make_sparse_offset_regression( + n_samples=11, + n_features=n_features, + n_targets=n_targets, + random_state=0, + shuffle=False, + noise=noise, + ) + y = y.reshape(y_shape) + + sample_weight = 3 * rng.randn(len(X)) + sample_weight = (sample_weight - sample_weight.min() + 1).astype(int) + indices = np.repeat(np.arange(X.shape[0]), sample_weight) + sample_weight = sample_weight.astype(float) + X_tiled, y_tiled = X[indices], y[indices] + + cv = GroupKFold(n_splits=X.shape[0]) + splits = cv.split(X_tiled, y_tiled, groups=indices) + kfold = RidgeCV( + alphas=alphas, + cv=splits, + scoring="neg_mean_squared_error", + fit_intercept=fit_intercept, + ) + kfold.fit(X_tiled, y_tiled) + + ridge_reg = Ridge(alpha=kfold.alpha_, fit_intercept=fit_intercept) + splits = cv.split(X_tiled, y_tiled, groups=indices) + predictions = cross_val_predict(ridge_reg, X_tiled, y_tiled, cv=splits) + if predictions.shape != y_tiled.shape: + predictions = predictions.reshape(y_tiled.shape) + kfold_errors = (y_tiled - predictions) ** 2 + kfold_errors = [ + np.sum(kfold_errors[indices == i], axis=0) for i in np.arange(X.shape[0]) + ] + kfold_errors = np.asarray(kfold_errors) + + X_gcv = X_container(X) + gcv_ridge = RidgeCV( + alphas=alphas, + store_cv_results=True, + gcv_mode=gcv_mode, + fit_intercept=fit_intercept, + ) + gcv_ridge.fit(X_gcv, y, sample_weight=sample_weight) + if len(y_shape) == 2: + gcv_errors = gcv_ridge.cv_results_[:, :, alphas.index(kfold.alpha_)] + else: + gcv_errors = gcv_ridge.cv_results_[:, alphas.index(kfold.alpha_)] + + assert kfold.alpha_ == pytest.approx(gcv_ridge.alpha_) + assert_allclose(gcv_errors, kfold_errors, rtol=1e-3) + assert_allclose(gcv_ridge.coef_, kfold.coef_, rtol=1e-3) + assert_allclose(gcv_ridge.intercept_, kfold.intercept_, rtol=1e-3) + + +@pytest.mark.parametrize("sparse_container", [None] + CSR_CONTAINERS) +@pytest.mark.parametrize( + "mode, mode_n_greater_than_p, mode_p_greater_than_n", + [ + (None, "svd", "eigen"), + ("auto", "svd", "eigen"), + ("eigen", "eigen", "eigen"), + ("svd", "svd", "svd"), + ], +) +def test_check_gcv_mode_choice( + sparse_container, mode, mode_n_greater_than_p, mode_p_greater_than_n +): + X, _ = make_regression(n_samples=5, n_features=2) + if sparse_container is not None: + X = sparse_container(X) + assert _check_gcv_mode(X, mode) == mode_n_greater_than_p + assert _check_gcv_mode(X.T, mode) == mode_p_greater_than_n + + +def _test_ridge_loo(sparse_container): + # test that can work with both dense or sparse matrices + n_samples = X_diabetes.shape[0] + + ret = [] + + if sparse_container is None: + X, fit_intercept = X_diabetes, True + else: + X, fit_intercept = sparse_container(X_diabetes), False + ridge_gcv = _RidgeGCV(fit_intercept=fit_intercept) + + # check best alpha + ridge_gcv.fit(X, y_diabetes) + alpha_ = ridge_gcv.alpha_ + ret.append(alpha_) + + # check that we get same best alpha with 
custom loss_func + f = ignore_warnings + scoring = make_scorer(mean_squared_error, greater_is_better=False) + ridge_gcv2 = RidgeCV(fit_intercept=False, scoring=scoring) + f(ridge_gcv2.fit)(X, y_diabetes) + assert ridge_gcv2.alpha_ == pytest.approx(alpha_) + + # check that we get same best alpha with custom score_func + def func(x, y): + return -mean_squared_error(x, y) + + scoring = make_scorer(func) + ridge_gcv3 = RidgeCV(fit_intercept=False, scoring=scoring) + f(ridge_gcv3.fit)(X, y_diabetes) + assert ridge_gcv3.alpha_ == pytest.approx(alpha_) + + # check that we get same best alpha with a scorer + scorer = get_scorer("neg_mean_squared_error") + ridge_gcv4 = RidgeCV(fit_intercept=False, scoring=scorer) + ridge_gcv4.fit(X, y_diabetes) + assert ridge_gcv4.alpha_ == pytest.approx(alpha_) + + # check that we get same best alpha with sample weights + if sparse_container is None: + ridge_gcv.fit(X, y_diabetes, sample_weight=np.ones(n_samples)) + assert ridge_gcv.alpha_ == pytest.approx(alpha_) + + # simulate several responses + Y = np.vstack((y_diabetes, y_diabetes)).T + + ridge_gcv.fit(X, Y) + Y_pred = ridge_gcv.predict(X) + ridge_gcv.fit(X, y_diabetes) + y_pred = ridge_gcv.predict(X) + + assert_allclose(np.vstack((y_pred, y_pred)).T, Y_pred, rtol=1e-5) + + return ret + + +def _test_ridge_cv(sparse_container): + X = X_diabetes if sparse_container is None else sparse_container(X_diabetes) + ridge_cv = RidgeCV() + ridge_cv.fit(X, y_diabetes) + ridge_cv.predict(X) + + assert len(ridge_cv.coef_.shape) == 1 + assert type(ridge_cv.intercept_) is np.float64 + + cv = KFold(5) + ridge_cv.set_params(cv=cv) + ridge_cv.fit(X, y_diabetes) + ridge_cv.predict(X) + + assert len(ridge_cv.coef_.shape) == 1 + assert type(ridge_cv.intercept_) is np.float64 + + +@pytest.mark.parametrize( + "ridge, make_dataset", + [ + (RidgeCV(store_cv_results=False), make_regression), + (RidgeClassifierCV(store_cv_results=False), make_classification), + ], +) +def test_ridge_gcv_cv_results_not_stored(ridge, make_dataset): + # Check that `cv_results_` is not stored when store_cv_results is False + X, y = make_dataset(n_samples=6, random_state=42) + ridge.fit(X, y) + assert not hasattr(ridge, "cv_results_") + + +@pytest.mark.parametrize( + "ridge, make_dataset", + [(RidgeCV(), make_regression), (RidgeClassifierCV(), make_classification)], +) +@pytest.mark.parametrize("cv", [None, 3]) +def test_ridge_best_score(ridge, make_dataset, cv): + # check that the best_score_ is store + X, y = make_dataset(n_samples=6, random_state=42) + ridge.set_params(store_cv_results=False, cv=cv) + ridge.fit(X, y) + assert hasattr(ridge, "best_score_") + assert isinstance(ridge.best_score_, float) + + +def test_ridge_cv_individual_penalties(): + # Tests the ridge_cv object optimizing individual penalties for each target + + rng = np.random.RandomState(42) + + # Create random dataset with multiple targets. Each target should have + # a different optimal alpha. 
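+    # Each target is injected into X with a different magnitude (1.0, 0.05 and
+    # 0.001 below), so the signal-to-noise ratio, and therefore the best penalty,
+    # differs per target.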
+ n_samples, n_features, n_targets = 20, 5, 3 + y = rng.randn(n_samples, n_targets) + X = ( + np.dot(y[:, [0]], np.ones((1, n_features))) + + np.dot(y[:, [1]], 0.05 * np.ones((1, n_features))) + + np.dot(y[:, [2]], 0.001 * np.ones((1, n_features))) + + rng.randn(n_samples, n_features) + ) + + alphas = (1, 100, 1000) + + # Find optimal alpha for each target + optimal_alphas = [RidgeCV(alphas=alphas).fit(X, target).alpha_ for target in y.T] + + # Find optimal alphas for all targets simultaneously + ridge_cv = RidgeCV(alphas=alphas, alpha_per_target=True).fit(X, y) + assert_array_equal(optimal_alphas, ridge_cv.alpha_) + + # The resulting regression weights should incorporate the different + # alpha values. + assert_array_almost_equal( + Ridge(alpha=ridge_cv.alpha_).fit(X, y).coef_, ridge_cv.coef_ + ) + + # Test shape of alpha_ and cv_results_ + ridge_cv = RidgeCV(alphas=alphas, alpha_per_target=True, store_cv_results=True).fit( + X, y + ) + assert ridge_cv.alpha_.shape == (n_targets,) + assert ridge_cv.best_score_.shape == (n_targets,) + assert ridge_cv.cv_results_.shape == (n_samples, len(alphas), n_targets) + + # Test edge case of there being only one alpha value + ridge_cv = RidgeCV(alphas=1, alpha_per_target=True, store_cv_results=True).fit(X, y) + assert ridge_cv.alpha_.shape == (n_targets,) + assert ridge_cv.best_score_.shape == (n_targets,) + assert ridge_cv.cv_results_.shape == (n_samples, n_targets, 1) + + # Test edge case of there being only one target + ridge_cv = RidgeCV(alphas=alphas, alpha_per_target=True, store_cv_results=True).fit( + X, y[:, 0] + ) + assert np.isscalar(ridge_cv.alpha_) + assert np.isscalar(ridge_cv.best_score_) + assert ridge_cv.cv_results_.shape == (n_samples, len(alphas)) + + # Try with a custom scoring function + ridge_cv = RidgeCV(alphas=alphas, alpha_per_target=True, scoring="r2").fit(X, y) + assert_array_equal(optimal_alphas, ridge_cv.alpha_) + assert_array_almost_equal( + Ridge(alpha=ridge_cv.alpha_).fit(X, y).coef_, ridge_cv.coef_ + ) + + # Using a custom CV object should throw an error in combination with + # alpha_per_target=True + ridge_cv = RidgeCV(alphas=alphas, cv=LeaveOneOut(), alpha_per_target=True) + msg = "cv!=None and alpha_per_target=True are incompatible" + with pytest.raises(ValueError, match=msg): + ridge_cv.fit(X, y) + ridge_cv = RidgeCV(alphas=alphas, cv=6, alpha_per_target=True) + with pytest.raises(ValueError, match=msg): + ridge_cv.fit(X, y) + + +def _test_ridge_diabetes(sparse_container): + X = X_diabetes if sparse_container is None else sparse_container(X_diabetes) + ridge = Ridge(fit_intercept=False) + ridge.fit(X, y_diabetes) + return np.round(ridge.score(X, y_diabetes), 5) + + +def _test_multi_ridge_diabetes(sparse_container): + # simulate several responses + X = X_diabetes if sparse_container is None else sparse_container(X_diabetes) + Y = np.vstack((y_diabetes, y_diabetes)).T + n_features = X_diabetes.shape[1] + + ridge = Ridge(fit_intercept=False) + ridge.fit(X, Y) + assert ridge.coef_.shape == (2, n_features) + Y_pred = ridge.predict(X) + ridge.fit(X, y_diabetes) + y_pred = ridge.predict(X) + assert_array_almost_equal(np.vstack((y_pred, y_pred)).T, Y_pred, decimal=3) + + +def _test_ridge_classifiers(sparse_container): + n_classes = np.unique(y_iris).shape[0] + n_features = X_iris.shape[1] + X = X_iris if sparse_container is None else sparse_container(X_iris) + + for reg in (RidgeClassifier(), RidgeClassifierCV()): + reg.fit(X, y_iris) + assert reg.coef_.shape == (n_classes, n_features) + y_pred = reg.predict(X) + assert 
np.mean(y_iris == y_pred) > 0.79 + + cv = KFold(5) + reg = RidgeClassifierCV(cv=cv) + reg.fit(X, y_iris) + y_pred = reg.predict(X) + assert np.mean(y_iris == y_pred) >= 0.8 + + +@pytest.mark.parametrize("scoring", [None, "accuracy", _accuracy_callable]) +@pytest.mark.parametrize("cv", [None, KFold(5)]) +@pytest.mark.parametrize("sparse_container", [None] + CSR_CONTAINERS) +def test_ridge_classifier_with_scoring(sparse_container, scoring, cv): + # non-regression test for #14672 + # check that RidgeClassifierCV works with all sort of scoring and + # cross-validation + X = X_iris if sparse_container is None else sparse_container(X_iris) + scoring_ = make_scorer(scoring) if callable(scoring) else scoring + clf = RidgeClassifierCV(scoring=scoring_, cv=cv) + # Smoke test to check that fit/predict does not raise error + clf.fit(X, y_iris).predict(X) + + +@pytest.mark.parametrize("cv", [None, KFold(5)]) +@pytest.mark.parametrize("sparse_container", [None] + CSR_CONTAINERS) +def test_ridge_regression_custom_scoring(sparse_container, cv): + # check that custom scoring is working as expected + # check the tie breaking strategy (keep the first alpha tried) + + def _dummy_score(y_test, y_pred, **kwargs): + return 0.42 + + X = X_iris if sparse_container is None else sparse_container(X_iris) + alphas = np.logspace(-2, 2, num=5) + clf = RidgeClassifierCV(alphas=alphas, scoring=make_scorer(_dummy_score), cv=cv) + clf.fit(X, y_iris) + assert clf.best_score_ == pytest.approx(0.42) + # In case of tie score, the first alphas will be kept + assert clf.alpha_ == pytest.approx(alphas[0]) + + +def _test_tolerance(sparse_container): + X = X_diabetes if sparse_container is None else sparse_container(X_diabetes) + + ridge = Ridge(tol=1e-5, fit_intercept=False) + ridge.fit(X, y_diabetes) + score = ridge.score(X, y_diabetes) + + ridge2 = Ridge(tol=1e-3, fit_intercept=False) + ridge2.fit(X, y_diabetes) + score2 = ridge2.score(X, y_diabetes) + + assert score >= score2 + + +def check_array_api_attributes(name, estimator, array_namespace, device, dtype_name): + xp = _array_api_for_tests(array_namespace, device) + + X_iris_np = X_iris.astype(dtype_name) + y_iris_np = y_iris.astype(dtype_name) + + X_iris_xp = xp.asarray(X_iris_np, device=device) + y_iris_xp = xp.asarray(y_iris_np, device=device) + + estimator.fit(X_iris_np, y_iris_np) + coef_np = estimator.coef_ + intercept_np = estimator.intercept_ + + with config_context(array_api_dispatch=True): + estimator_xp = clone(estimator).fit(X_iris_xp, y_iris_xp) + coef_xp = estimator_xp.coef_ + assert coef_xp.shape == (4,) + assert coef_xp.dtype == X_iris_xp.dtype + + assert_allclose( + _convert_to_numpy(coef_xp, xp=xp), + coef_np, + atol=_atol_for_type(dtype_name), + ) + intercept_xp = estimator_xp.intercept_ + assert intercept_xp.shape == () + assert intercept_xp.dtype == X_iris_xp.dtype + + assert_allclose( + _convert_to_numpy(intercept_xp, xp=xp), + intercept_np, + atol=_atol_for_type(dtype_name), + ) + + +@pytest.mark.parametrize( + "array_namespace, device, dtype_name", + yield_namespace_device_dtype_combinations(), + ids=_get_namespace_device_dtype_ids, +) +@pytest.mark.parametrize( + "check", + [check_array_api_input_and_values, check_array_api_attributes], + ids=_get_check_estimator_ids, +) +@pytest.mark.parametrize( + "estimator", + [Ridge(solver="svd")], + ids=_get_check_estimator_ids, +) +def test_ridge_array_api_compliance( + estimator, check, array_namespace, device, dtype_name +): + name = estimator.__class__.__name__ + check(name, estimator, array_namespace, 
device=device, dtype_name=dtype_name) + + +@pytest.mark.parametrize( + "array_namespace", yield_namespaces(include_numpy_namespaces=False) +) +def test_array_api_error_and_warnings_for_solver_parameter(array_namespace): + xp = _array_api_for_tests(array_namespace, device=None) + + X_iris_xp = xp.asarray(X_iris[:5]) + y_iris_xp = xp.asarray(y_iris[:5]) + + available_solvers = Ridge._parameter_constraints["solver"][0].options + for solver in available_solvers - {"auto", "svd"}: + ridge = Ridge(solver=solver, positive=solver == "lbfgs") + expected_msg = ( + f"Array API dispatch to namespace {xp.__name__} only supports " + f"solver 'svd'. Got '{solver}'." + ) + + with pytest.raises(ValueError, match=expected_msg): + with config_context(array_api_dispatch=True): + ridge.fit(X_iris_xp, y_iris_xp) + + ridge = Ridge(solver="auto", positive=True) + expected_msg = ( + "The solvers that support positive fitting do not support " + f"Array API dispatch to namespace {xp.__name__}. Please " + "either disable Array API dispatch, or use a numpy-like " + "namespace, or set `positive=False`." + ) + + with pytest.raises(ValueError, match=expected_msg): + with config_context(array_api_dispatch=True): + ridge.fit(X_iris_xp, y_iris_xp) + + ridge = Ridge() + expected_msg = ( + f"Using Array API dispatch to namespace {xp.__name__} with `solver='auto'` " + "will result in using the solver 'svd'. The results may differ from those " + "when using a Numpy array, because in that case the preferred solver would " + "be cholesky. Set `solver='svd'` to suppress this warning." + ) + with pytest.warns(UserWarning, match=expected_msg): + with config_context(array_api_dispatch=True): + ridge.fit(X_iris_xp, y_iris_xp) + + +@pytest.mark.parametrize("array_namespace", sorted(_NUMPY_NAMESPACE_NAMES)) +def test_array_api_numpy_namespace_no_warning(array_namespace): + xp = _array_api_for_tests(array_namespace, device=None) + + X_iris_xp = xp.asarray(X_iris[:5]) + y_iris_xp = xp.asarray(y_iris[:5]) + + ridge = Ridge() + expected_msg = ( + "Results might be different than when Array API dispatch is " + "disabled, or when a numpy-like namespace is used" + ) + + with warnings.catch_warnings(): + warnings.filterwarnings("error", message=expected_msg, category=UserWarning) + with config_context(array_api_dispatch=True): + ridge.fit(X_iris_xp, y_iris_xp) + + # All numpy namespaces are compatible with all solver, in particular + # solvers that support `positive=True` (like 'lbfgs') should work. + with config_context(array_api_dispatch=True): + Ridge(solver="auto", positive=True).fit(X_iris_xp, y_iris_xp) + + +@pytest.mark.parametrize( + "test_func", + ( + _test_ridge_loo, + _test_ridge_cv, + _test_ridge_diabetes, + _test_multi_ridge_diabetes, + _test_ridge_classifiers, + _test_tolerance, + ), +) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_dense_sparse(test_func, csr_container): + # test dense matrix + ret_dense = test_func(None) + # test sparse matrix + ret_sparse = test_func(csr_container) + # test that the outputs are the same + if ret_dense is not None and ret_sparse is not None: + assert_array_almost_equal(ret_dense, ret_sparse, decimal=3) + + +def test_class_weights(): + # Test class weights. 
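+    # RidgeClassifier encodes the labels as {-1, +1} regression targets and, in
+    # effect, folds class_weight into per-sample weights, so down-weighting a
+    # class should shift the decision boundary, as checked below.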
+ X = np.array([[-1.0, -1.0], [-1.0, 0], [-0.8, -1.0], [1.0, 1.0], [1.0, 0.0]]) + y = [1, 1, 1, -1, -1] + + reg = RidgeClassifier(class_weight=None) + reg.fit(X, y) + assert_array_equal(reg.predict([[0.2, -1.0]]), np.array([1])) + + # we give a small weights to class 1 + reg = RidgeClassifier(class_weight={1: 0.001}) + reg.fit(X, y) + + # now the hyperplane should rotate clock-wise and + # the prediction on this point should shift + assert_array_equal(reg.predict([[0.2, -1.0]]), np.array([-1])) + + # check if class_weight = 'balanced' can handle negative labels. + reg = RidgeClassifier(class_weight="balanced") + reg.fit(X, y) + assert_array_equal(reg.predict([[0.2, -1.0]]), np.array([1])) + + # class_weight = 'balanced', and class_weight = None should return + # same values when y has equal number of all labels + X = np.array([[-1.0, -1.0], [-1.0, 0], [-0.8, -1.0], [1.0, 1.0]]) + y = [1, 1, -1, -1] + reg = RidgeClassifier(class_weight=None) + reg.fit(X, y) + rega = RidgeClassifier(class_weight="balanced") + rega.fit(X, y) + assert len(rega.classes_) == 2 + assert_array_almost_equal(reg.coef_, rega.coef_) + assert_array_almost_equal(reg.intercept_, rega.intercept_) + + +@pytest.mark.parametrize("reg", (RidgeClassifier, RidgeClassifierCV)) +def test_class_weight_vs_sample_weight(reg): + """Check class_weights resemble sample_weights behavior.""" + + # Iris is balanced, so no effect expected for using 'balanced' weights + reg1 = reg() + reg1.fit(iris.data, iris.target) + reg2 = reg(class_weight="balanced") + reg2.fit(iris.data, iris.target) + assert_almost_equal(reg1.coef_, reg2.coef_) + + # Inflate importance of class 1, check against user-defined weights + sample_weight = np.ones(iris.target.shape) + sample_weight[iris.target == 1] *= 100 + class_weight = {0: 1.0, 1: 100.0, 2: 1.0} + reg1 = reg() + reg1.fit(iris.data, iris.target, sample_weight) + reg2 = reg(class_weight=class_weight) + reg2.fit(iris.data, iris.target) + assert_almost_equal(reg1.coef_, reg2.coef_) + + # Check that sample_weight and class_weight are multiplicative + reg1 = reg() + reg1.fit(iris.data, iris.target, sample_weight**2) + reg2 = reg(class_weight=class_weight) + reg2.fit(iris.data, iris.target, sample_weight) + assert_almost_equal(reg1.coef_, reg2.coef_) + + +def test_class_weights_cv(): + # Test class weights for cross validated ridge classifier. 
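+    # Same toy data as in test_class_weights above; here the class weighting is
+    # exercised together with the internal alpha search of RidgeClassifierCV.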
+ X = np.array([[-1.0, -1.0], [-1.0, 0], [-0.8, -1.0], [1.0, 1.0], [1.0, 0.0]]) + y = [1, 1, 1, -1, -1] + + reg = RidgeClassifierCV(class_weight=None, alphas=[0.01, 0.1, 1]) + reg.fit(X, y) + + # we give a small weights to class 1 + reg = RidgeClassifierCV(class_weight={1: 0.001}, alphas=[0.01, 0.1, 1, 10]) + reg.fit(X, y) + + assert_array_equal(reg.predict([[-0.2, 2]]), np.array([-1])) + + +@pytest.mark.parametrize( + "scoring", [None, "neg_mean_squared_error", _mean_squared_error_callable] +) +def test_ridgecv_store_cv_results(scoring): + rng = np.random.RandomState(42) + + n_samples = 8 + n_features = 5 + x = rng.randn(n_samples, n_features) + alphas = [1e-1, 1e0, 1e1] + n_alphas = len(alphas) + + scoring_ = make_scorer(scoring) if callable(scoring) else scoring + + r = RidgeCV(alphas=alphas, cv=None, store_cv_results=True, scoring=scoring_) + + # with len(y.shape) == 1 + y = rng.randn(n_samples) + r.fit(x, y) + assert r.cv_results_.shape == (n_samples, n_alphas) + + # with len(y.shape) == 2 + n_targets = 3 + y = rng.randn(n_samples, n_targets) + r.fit(x, y) + assert r.cv_results_.shape == (n_samples, n_targets, n_alphas) + + r = RidgeCV(cv=3, store_cv_results=True, scoring=scoring) + with pytest.raises(ValueError, match="cv!=None and store_cv_results"): + r.fit(x, y) + + +@pytest.mark.parametrize("scoring", [None, "accuracy", _accuracy_callable]) +def test_ridge_classifier_cv_store_cv_results(scoring): + x = np.array([[-1.0, -1.0], [-1.0, 0], [-0.8, -1.0], [1.0, 1.0], [1.0, 0.0]]) + y = np.array([1, 1, 1, -1, -1]) + + n_samples = x.shape[0] + alphas = [1e-1, 1e0, 1e1] + n_alphas = len(alphas) + + scoring_ = make_scorer(scoring) if callable(scoring) else scoring + + r = RidgeClassifierCV( + alphas=alphas, cv=None, store_cv_results=True, scoring=scoring_ + ) + + # with len(y.shape) == 1 + n_targets = 1 + r.fit(x, y) + assert r.cv_results_.shape == (n_samples, n_targets, n_alphas) + + # with len(y.shape) == 2 + y = np.array( + [[1, 1, 1, -1, -1], [1, -1, 1, -1, 1], [-1, -1, 1, -1, -1]] + ).transpose() + n_targets = y.shape[1] + r.fit(x, y) + assert r.cv_results_.shape == (n_samples, n_targets, n_alphas) + + +@pytest.mark.parametrize("Estimator", [RidgeCV, RidgeClassifierCV]) +def test_ridgecv_alphas_conversion(Estimator): + rng = np.random.RandomState(0) + alphas = (0.1, 1.0, 10.0) + + n_samples, n_features = 5, 5 + if Estimator is RidgeCV: + y = rng.randn(n_samples) + else: + y = rng.randint(0, 2, n_samples) + X = rng.randn(n_samples, n_features) + + ridge_est = Estimator(alphas=alphas) + assert ridge_est.alphas is alphas, ( + f"`alphas` was mutated in `{Estimator.__name__}.__init__`" + ) + + ridge_est.fit(X, y) + assert_array_equal(ridge_est.alphas, np.asarray(alphas)) + + +@pytest.mark.parametrize("cv", [None, 3]) +@pytest.mark.parametrize("Estimator", [RidgeCV, RidgeClassifierCV]) +def test_ridgecv_alphas_zero(cv, Estimator): + """Check alpha=0.0 raises error only when `cv=None`.""" + rng = np.random.RandomState(0) + alphas = (0.0, 1.0, 10.0) + + n_samples, n_features = 5, 5 + if Estimator is RidgeCV: + y = rng.randn(n_samples) + else: + y = rng.randint(0, 2, n_samples) + X = rng.randn(n_samples, n_features) + + ridge_est = Estimator(alphas=alphas, cv=cv) + if cv is None: + with pytest.raises(ValueError, match=r"alphas\[0\] == 0.0, must be > 0.0."): + ridge_est.fit(X, y) + else: + ridge_est.fit(X, y) + + +def test_ridgecv_sample_weight(): + rng = np.random.RandomState(0) + alphas = (0.1, 1.0, 10.0) + + # There are different algorithms for n_samples > n_features + # and the opposite, 
so test them both. + for n_samples, n_features in ((6, 5), (5, 10)): + y = rng.randn(n_samples) + X = rng.randn(n_samples, n_features) + sample_weight = 1.0 + rng.rand(n_samples) + + cv = KFold(5) + ridgecv = RidgeCV(alphas=alphas, cv=cv) + ridgecv.fit(X, y, sample_weight=sample_weight) + + # Check using GridSearchCV directly + parameters = {"alpha": alphas} + gs = GridSearchCV(Ridge(), parameters, cv=cv) + gs.fit(X, y, sample_weight=sample_weight) + + assert ridgecv.alpha_ == gs.best_estimator_.alpha + assert_array_almost_equal(ridgecv.coef_, gs.best_estimator_.coef_) + + +def test_raises_value_error_if_sample_weights_greater_than_1d(): + # Sample weights must be either scalar or 1D + + n_sampless = [2, 3] + n_featuress = [3, 2] + + rng = np.random.RandomState(42) + + for n_samples, n_features in zip(n_sampless, n_featuress): + X = rng.randn(n_samples, n_features) + y = rng.randn(n_samples) + sample_weights_OK = rng.randn(n_samples) ** 2 + 1 + sample_weights_OK_1 = 1.0 + sample_weights_OK_2 = 2.0 + sample_weights_not_OK = sample_weights_OK[:, np.newaxis] + sample_weights_not_OK_2 = sample_weights_OK[np.newaxis, :] + + ridge = Ridge(alpha=1) + + # make sure the "OK" sample weights actually work + ridge.fit(X, y, sample_weights_OK) + ridge.fit(X, y, sample_weights_OK_1) + ridge.fit(X, y, sample_weights_OK_2) + + def fit_ridge_not_ok(): + ridge.fit(X, y, sample_weights_not_OK) + + def fit_ridge_not_ok_2(): + ridge.fit(X, y, sample_weights_not_OK_2) + + err_msg = "Sample weights must be 1D array or scalar" + with pytest.raises(ValueError, match=err_msg): + fit_ridge_not_ok() + + err_msg = "Sample weights must be 1D array or scalar" + with pytest.raises(ValueError, match=err_msg): + fit_ridge_not_ok_2() + + +@pytest.mark.parametrize("n_samples,n_features", [[2, 3], [3, 2]]) +@pytest.mark.parametrize( + "sparse_container", + COO_CONTAINERS + CSC_CONTAINERS + CSR_CONTAINERS + DOK_CONTAINERS + LIL_CONTAINERS, +) +def test_sparse_design_with_sample_weights(n_samples, n_features, sparse_container): + # Sample weights must work with sparse matrices + rng = np.random.RandomState(42) + + sparse_ridge = Ridge(alpha=1.0, fit_intercept=False) + dense_ridge = Ridge(alpha=1.0, fit_intercept=False) + + X = rng.randn(n_samples, n_features) + y = rng.randn(n_samples) + sample_weights = rng.randn(n_samples) ** 2 + 1 + X_sparse = sparse_container(X) + sparse_ridge.fit(X_sparse, y, sample_weight=sample_weights) + dense_ridge.fit(X, y, sample_weight=sample_weights) + + assert_array_almost_equal(sparse_ridge.coef_, dense_ridge.coef_, decimal=6) + + +def test_ridgecv_int_alphas(): + X = np.array([[-1.0, -1.0], [-1.0, 0], [-0.8, -1.0], [1.0, 1.0], [1.0, 0.0]]) + y = [1, 1, 1, -1, -1] + + # Integers + ridge = RidgeCV(alphas=(1, 10, 100)) + ridge.fit(X, y) + + +@pytest.mark.parametrize("Estimator", [RidgeCV, RidgeClassifierCV]) +@pytest.mark.parametrize( + "params, err_type, err_msg", + [ + ({"alphas": (1, -1, -100)}, ValueError, r"alphas\[1\] == -1, must be > 0.0"), + ( + {"alphas": (-0.1, -1.0, -10.0)}, + ValueError, + r"alphas\[0\] == -0.1, must be > 0.0", + ), + ( + {"alphas": (1, 1.0, "1")}, + TypeError, + r"alphas\[2\] must be an instance of float, not str", + ), + ], +) +def test_ridgecv_alphas_validation(Estimator, params, err_type, err_msg): + """Check the `alphas` validation in RidgeCV and RidgeClassifierCV.""" + + n_samples, n_features = 5, 5 + X = rng.randn(n_samples, n_features) + y = rng.randint(0, 2, n_samples) + + with pytest.raises(err_type, match=err_msg): + Estimator(**params).fit(X, y) + + 
+@pytest.mark.parametrize("Estimator", [RidgeCV, RidgeClassifierCV]) +def test_ridgecv_alphas_scalar(Estimator): + """Check the case when `alphas` is a scalar. + This case was supported in the past when `alphas` where converted + into array in `__init__`. + We add this test to ensure backward compatibility. + """ + + n_samples, n_features = 5, 5 + X = rng.randn(n_samples, n_features) + if Estimator is RidgeCV: + y = rng.randn(n_samples) + else: + y = rng.randint(0, 2, n_samples) + + Estimator(alphas=1).fit(X, y) + + +def test_sparse_cg_max_iter(): + reg = Ridge(solver="sparse_cg", max_iter=1) + reg.fit(X_diabetes, y_diabetes) + assert reg.coef_.shape[0] == X_diabetes.shape[1] + + +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +def test_n_iter(): + # Test that self.n_iter_ is correct. + n_targets = 2 + X, y = X_diabetes, y_diabetes + y_n = np.tile(y, (n_targets, 1)).T + + for max_iter in range(1, 4): + for solver in ("sag", "saga", "lsqr"): + reg = Ridge(solver=solver, max_iter=max_iter, tol=1e-12) + reg.fit(X, y_n) + assert_array_equal(reg.n_iter_, np.tile(max_iter, n_targets)) + + for solver in ("sparse_cg", "svd", "cholesky"): + reg = Ridge(solver=solver, max_iter=1, tol=1e-1) + reg.fit(X, y_n) + assert reg.n_iter_ is None + + +@pytest.mark.parametrize("solver", ["lsqr", "sparse_cg", "lbfgs", "auto"]) +@pytest.mark.parametrize("with_sample_weight", [True, False]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_ridge_fit_intercept_sparse( + solver, with_sample_weight, global_random_seed, csr_container +): + """Check that ridge finds the same coefs and intercept on dense and sparse input + in the presence of sample weights. + + For now only sparse_cg and lbfgs can correctly fit an intercept + with sparse X with default tol and max_iter. + 'sag' is tested separately in test_ridge_fit_intercept_sparse_sag because it + requires more iterations and should raise a warning if default max_iter is used. 
+ Other solvers raise an exception, as checked in + test_ridge_fit_intercept_sparse_error + """ + positive = solver == "lbfgs" + X, y = _make_sparse_offset_regression( + n_features=20, random_state=global_random_seed, positive=positive + ) + + sample_weight = None + if with_sample_weight: + rng = np.random.RandomState(global_random_seed) + sample_weight = 1.0 + rng.uniform(size=X.shape[0]) + + # "auto" should switch to "sparse_cg" when X is sparse + # so the reference we use for both ("auto" and "sparse_cg") is + # Ridge(solver="sparse_cg"), fitted using the dense representation (note + # that "sparse_cg" can fit sparse or dense data) + dense_solver = "sparse_cg" if solver == "auto" else solver + dense_ridge = Ridge(solver=dense_solver, tol=1e-12, positive=positive) + sparse_ridge = Ridge(solver=solver, tol=1e-12, positive=positive) + + dense_ridge.fit(X, y, sample_weight=sample_weight) + sparse_ridge.fit(csr_container(X), y, sample_weight=sample_weight) + + assert_allclose(dense_ridge.intercept_, sparse_ridge.intercept_) + assert_allclose(dense_ridge.coef_, sparse_ridge.coef_, rtol=5e-7) + + +@pytest.mark.parametrize("solver", ["saga", "svd", "cholesky"]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_ridge_fit_intercept_sparse_error(solver, csr_container): + X, y = _make_sparse_offset_regression(n_features=20, random_state=0) + X_csr = csr_container(X) + sparse_ridge = Ridge(solver=solver) + err_msg = "solver='{}' does not support".format(solver) + with pytest.raises(ValueError, match=err_msg): + sparse_ridge.fit(X_csr, y) + + +@pytest.mark.parametrize("with_sample_weight", [True, False]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_ridge_fit_intercept_sparse_sag( + with_sample_weight, global_random_seed, csr_container +): + X, y = _make_sparse_offset_regression( + n_features=5, n_samples=20, random_state=global_random_seed, X_offset=5.0 + ) + if with_sample_weight: + rng = np.random.RandomState(global_random_seed) + sample_weight = 1.0 + rng.uniform(size=X.shape[0]) + else: + sample_weight = None + X_csr = csr_container(X) + + params = dict( + alpha=1.0, solver="sag", fit_intercept=True, tol=1e-10, max_iter=100000 + ) + dense_ridge = Ridge(**params) + sparse_ridge = Ridge(**params) + dense_ridge.fit(X, y, sample_weight=sample_weight) + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + sparse_ridge.fit(X_csr, y, sample_weight=sample_weight) + assert_allclose(dense_ridge.intercept_, sparse_ridge.intercept_, rtol=1e-4) + assert_allclose(dense_ridge.coef_, sparse_ridge.coef_, rtol=1e-4) + with pytest.warns(UserWarning, match='"sag" solver requires.*'): + Ridge(solver="sag", fit_intercept=True, tol=1e-3, max_iter=None).fit(X_csr, y) + + +@pytest.mark.parametrize("return_intercept", [False, True]) +@pytest.mark.parametrize("sample_weight", [None, np.ones(1000)]) +@pytest.mark.parametrize("container", [np.array] + CSR_CONTAINERS) +@pytest.mark.parametrize( + "solver", ["auto", "sparse_cg", "cholesky", "lsqr", "sag", "saga", "lbfgs"] +) +def test_ridge_regression_check_arguments_validity( + return_intercept, sample_weight, container, solver +): + """check if all combinations of arguments give valid estimations""" + + # test excludes 'svd' solver because it raises exception for sparse inputs + + rng = check_random_state(42) + X = rng.rand(1000, 3) + true_coefs = [1, 2, 0.1] + y = np.dot(X, true_coefs) + true_intercept = 0.0 + if return_intercept: + true_intercept = 10000.0 + y += true_intercept + X_testing = container(X) 
+ + alpha, tol = 1e-3, 1e-6 + atol = 1e-3 if _IS_32BIT else 1e-4 + + positive = solver == "lbfgs" + + if solver not in ["sag", "auto"] and return_intercept: + with pytest.raises(ValueError, match="In Ridge, only 'sag' solver"): + ridge_regression( + X_testing, + y, + alpha=alpha, + solver=solver, + sample_weight=sample_weight, + return_intercept=return_intercept, + positive=positive, + tol=tol, + ) + return + + out = ridge_regression( + X_testing, + y, + alpha=alpha, + solver=solver, + sample_weight=sample_weight, + positive=positive, + return_intercept=return_intercept, + tol=tol, + ) + + if return_intercept: + coef, intercept = out + assert_allclose(coef, true_coefs, rtol=0, atol=atol) + assert_allclose(intercept, true_intercept, rtol=0, atol=atol) + else: + assert_allclose(out, true_coefs, rtol=0, atol=atol) + + +@pytest.mark.parametrize( + "solver", ["svd", "sparse_cg", "cholesky", "lsqr", "sag", "saga", "lbfgs"] +) +def test_dtype_match(solver): + rng = np.random.RandomState(0) + alpha = 1.0 + positive = solver == "lbfgs" + + n_samples, n_features = 6, 5 + X_64 = rng.randn(n_samples, n_features) + y_64 = rng.randn(n_samples) + X_32 = X_64.astype(np.float32) + y_32 = y_64.astype(np.float32) + + tol = 2 * np.finfo(np.float32).resolution + # Check type consistency 32bits + ridge_32 = Ridge( + alpha=alpha, solver=solver, max_iter=500, tol=tol, positive=positive + ) + ridge_32.fit(X_32, y_32) + coef_32 = ridge_32.coef_ + + # Check type consistency 64 bits + ridge_64 = Ridge( + alpha=alpha, solver=solver, max_iter=500, tol=tol, positive=positive + ) + ridge_64.fit(X_64, y_64) + coef_64 = ridge_64.coef_ + + # Do the actual checks at once for easier debug + assert coef_32.dtype == X_32.dtype + assert coef_64.dtype == X_64.dtype + assert ridge_32.predict(X_32).dtype == X_32.dtype + assert ridge_64.predict(X_64).dtype == X_64.dtype + assert_allclose(ridge_32.coef_, ridge_64.coef_, rtol=1e-4, atol=5e-4) + + +def test_dtype_match_cholesky(): + # Test different alphas in cholesky solver to ensure full coverage. + # This test is separated from test_dtype_match for clarity. 
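+    # Passing an array of alphas assigns a separate penalty to each target,
+    # which, roughly, goes through a dedicated per-target branch of the cholesky
+    # solver; the float32/float64 comparison below covers that branch too.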
+    rng = np.random.RandomState(0)
+    alpha = np.array([1.0, 0.5])
+
+    n_samples, n_features, n_target = 6, 7, 2
+    X_64 = rng.randn(n_samples, n_features)
+    y_64 = rng.randn(n_samples, n_target)
+    X_32 = X_64.astype(np.float32)
+    y_32 = y_64.astype(np.float32)
+
+    # Check type consistency 32bits
+    ridge_32 = Ridge(alpha=alpha, solver="cholesky")
+    ridge_32.fit(X_32, y_32)
+    coef_32 = ridge_32.coef_
+
+    # Check type consistency 64 bits
+    ridge_64 = Ridge(alpha=alpha, solver="cholesky")
+    ridge_64.fit(X_64, y_64)
+    coef_64 = ridge_64.coef_
+
+    # Do all the checks at once; this is easier to debug
+    assert coef_32.dtype == X_32.dtype
+    assert coef_64.dtype == X_64.dtype
+    assert ridge_32.predict(X_32).dtype == X_32.dtype
+    assert ridge_64.predict(X_64).dtype == X_64.dtype
+    assert_almost_equal(ridge_32.coef_, ridge_64.coef_, decimal=5)
+
+
+@pytest.mark.parametrize(
+    "solver", ["svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga", "lbfgs"]
+)
+@pytest.mark.parametrize("seed", range(1))
+def test_ridge_regression_dtype_stability(solver, seed):
+    random_state = np.random.RandomState(seed)
+    n_samples, n_features = 6, 5
+    X = random_state.randn(n_samples, n_features)
+    coef = random_state.randn(n_features)
+    y = np.dot(X, coef) + 0.01 * random_state.randn(n_samples)
+    alpha = 1.0
+    positive = solver == "lbfgs"
+    results = dict()
+    # XXX: Sparse CG seems to be far less numerically stable than the
+    # others, maybe we should not enable float32 for this one.
+    atol = 1e-3 if solver == "sparse_cg" else 1e-5
+    for current_dtype in (np.float32, np.float64):
+        results[current_dtype] = ridge_regression(
+            X.astype(current_dtype),
+            y.astype(current_dtype),
+            alpha=alpha,
+            solver=solver,
+            random_state=random_state,
+            sample_weight=None,
+            positive=positive,
+            max_iter=500,
+            tol=1e-10,
+            return_n_iter=False,
+            return_intercept=False,
+        )
+
+    assert results[np.float32].dtype == np.float32
+    assert results[np.float64].dtype == np.float64
+    assert_allclose(results[np.float32], results[np.float64], atol=atol)
+
+
+def test_ridge_sag_with_X_fortran():
+    # check that Fortran arrays are converted when using the SAG solver
+    X, y = make_regression(random_state=42)
+    # make X Fortran-ordered and non-contiguous so that it is not a C-ordered array
+    X = np.asfortranarray(X)
+    X = X[::2, :]
+    y = y[::2]
+    Ridge(solver="sag").fit(X, y)
+
+
+@pytest.mark.parametrize(
+    "Classifier, params",
+    [
+        (RidgeClassifier, {}),
+        (RidgeClassifierCV, {"cv": None}),
+        (RidgeClassifierCV, {"cv": 3}),
+    ],
+)
+def test_ridgeclassifier_multilabel(Classifier, params):
+    """Check that multilabel classification is supported and gives meaningful
+    results."""
+    X, y = make_multilabel_classification(n_classes=1, random_state=0)
+    y = y.reshape(-1, 1)
+    Y = np.concatenate([y, y], axis=1)
+    clf = Classifier(**params).fit(X, Y)
+    Y_pred = clf.predict(X)
+
+    assert Y_pred.shape == Y.shape
+    assert_array_equal(Y_pred[:, 0], Y_pred[:, 1])
+
+
+@pytest.mark.parametrize("solver", ["auto", "lbfgs"])
+@pytest.mark.parametrize("fit_intercept", [True, False])
+@pytest.mark.parametrize("alpha", [1e-3, 1e-2, 0.1, 1.0])
+def test_ridge_positive_regression_test(solver, fit_intercept, alpha):
+    """Test that positive Ridge finds true positive coefficients."""
+    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
+    coef = np.array([1, -10])
+    if fit_intercept:
+        intercept = 20
+        y = X.dot(coef) + intercept
+    else:
+        y = X.dot(coef)
+
+    model = Ridge(
+        alpha=alpha, positive=True, solver=solver, fit_intercept=fit_intercept
+    )
+    model.fit(X, y)
+    assert
np.all(model.coef_ >= 0) + + +@pytest.mark.parametrize("fit_intercept", [True, False]) +@pytest.mark.parametrize("alpha", [1e-3, 1e-2, 0.1, 1.0]) +def test_ridge_ground_truth_positive_test(fit_intercept, alpha): + """Test that Ridge w/wo positive converges to the same solution. + + Ridge with positive=True and positive=False must give the same + when the ground truth coefs are all positive. + """ + rng = np.random.RandomState(42) + X = rng.randn(300, 100) + coef = rng.uniform(0.1, 1.0, size=X.shape[1]) + if fit_intercept: + intercept = 1 + y = X @ coef + intercept + else: + y = X @ coef + y += rng.normal(size=X.shape[0]) * 0.01 + + results = [] + for positive in [True, False]: + model = Ridge( + alpha=alpha, positive=positive, fit_intercept=fit_intercept, tol=1e-10 + ) + results.append(model.fit(X, y).coef_) + assert_allclose(*results, atol=1e-6, rtol=0) + + +@pytest.mark.parametrize( + "solver", ["svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"] +) +def test_ridge_positive_error_test(solver): + """Test input validation for positive argument in Ridge.""" + alpha = 0.1 + X = np.array([[1, 2], [3, 4]]) + coef = np.array([1, -1]) + y = X @ coef + + model = Ridge(alpha=alpha, positive=True, solver=solver, fit_intercept=False) + with pytest.raises(ValueError, match="does not support positive"): + model.fit(X, y) + + with pytest.raises(ValueError, match="only 'lbfgs' solver can be used"): + _, _ = ridge_regression( + X, y, alpha, positive=True, solver=solver, return_intercept=False + ) + + +@pytest.mark.parametrize("alpha", [1e-3, 1e-2, 0.1, 1.0]) +def test_positive_ridge_loss(alpha): + """Check ridge loss consistency when positive argument is enabled.""" + X, y = make_regression(n_samples=300, n_features=300, random_state=42) + alpha = 0.10 + n_checks = 100 + + def ridge_loss(model, random_state=None, noise_scale=1e-8): + intercept = model.intercept_ + if random_state is not None: + rng = np.random.RandomState(random_state) + coef = model.coef_ + rng.uniform(0, noise_scale, size=model.coef_.shape) + else: + coef = model.coef_ + + return 0.5 * np.sum((y - X @ coef - intercept) ** 2) + 0.5 * alpha * np.sum( + coef**2 + ) + + model = Ridge(alpha=alpha).fit(X, y) + model_positive = Ridge(alpha=alpha, positive=True).fit(X, y) + + # Check 1: + # Loss for solution found by Ridge(positive=False) + # is lower than that for solution found by Ridge(positive=True) + loss = ridge_loss(model) + loss_positive = ridge_loss(model_positive) + assert loss <= loss_positive + + # Check 2: + # Loss for solution found by Ridge(positive=True) + # is lower than that for small random positive perturbation + # of the positive solution. 
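+    # The perturbations are drawn from uniform(0, noise_scale), so the perturbed
+    # coefficients stay non-negative (feasible); a constrained minimum of the
+    # convex ridge loss should therefore never be beaten by any of them.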
+ for random_state in range(n_checks): + loss_perturbed = ridge_loss(model_positive, random_state=random_state) + assert loss_positive <= loss_perturbed + + +@pytest.mark.parametrize("alpha", [1e-3, 1e-2, 0.1, 1.0]) +def test_lbfgs_solver_consistency(alpha): + """Test that LBGFS gets almost the same coef of svd when positive=False.""" + X, y = make_regression(n_samples=300, n_features=300, random_state=42) + y = np.expand_dims(y, 1) + alpha = np.asarray([alpha]) + config = { + "positive": False, + "tol": 1e-16, + "max_iter": 500000, + } + + coef_lbfgs = _solve_lbfgs(X, y, alpha, **config) + coef_cholesky = _solve_svd(X, y, alpha) + assert_allclose(coef_lbfgs, coef_cholesky, atol=1e-4, rtol=0) + + +def test_lbfgs_solver_error(): + """Test that LBFGS solver raises ConvergenceWarning.""" + X = np.array([[1, -1], [1, 1]]) + y = np.array([-1e10, 1e10]) + + model = Ridge( + alpha=0.01, + solver="lbfgs", + fit_intercept=False, + tol=1e-12, + positive=True, + max_iter=1, + ) + with pytest.warns(ConvergenceWarning, match="lbfgs solver did not converge"): + model.fit(X, y) + + +@pytest.mark.parametrize("fit_intercept", [False, True]) +@pytest.mark.parametrize("sparse_container", [None] + CSR_CONTAINERS) +@pytest.mark.parametrize("data", ["tall", "wide"]) +@pytest.mark.parametrize("solver", SOLVERS + ["lbfgs"]) +def test_ridge_sample_weight_consistency( + fit_intercept, sparse_container, data, solver, global_random_seed +): + """Test that the impact of sample_weight is consistent. + + Note that this test is stricter than the common test + check_sample_weight_equivalence alone. + """ + # filter out solver that do not support sparse input + if sparse_container is not None: + if solver == "svd" or (solver in ("cholesky", "saga") and fit_intercept): + pytest.skip("unsupported configuration") + + # XXX: this test is quite sensitive to the seed used to generate the data: + # ideally we would like the test to pass for any global_random_seed but this is not + # the case at the moment. + rng = np.random.RandomState(42) + n_samples = 12 + if data == "tall": + n_features = n_samples // 2 + else: + n_features = n_samples * 2 + + X = rng.rand(n_samples, n_features) + y = rng.rand(n_samples) + if sparse_container is not None: + X = sparse_container(X) + params = dict( + fit_intercept=fit_intercept, + alpha=1.0, + solver=solver, + positive=(solver == "lbfgs"), + random_state=global_random_seed, # for sag/saga + tol=1e-12, + ) + + # 1) sample_weight=np.ones(..) should be equivalent to sample_weight=None, + # a special case of check_sample_weight_equivalence(name, reg), but we also + # test with sparse input. 
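+    # (Unit weights leave both the data term and the penalty of the ridge
+    # objective unchanged, so the two fits should agree up to solver tolerance.)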
+ reg = Ridge(**params).fit(X, y, sample_weight=None) + coef = reg.coef_.copy() + if fit_intercept: + intercept = reg.intercept_ + sample_weight = np.ones_like(y) + reg.fit(X, y, sample_weight=sample_weight) + assert_allclose(reg.coef_, coef, rtol=1e-6) + if fit_intercept: + assert_allclose(reg.intercept_, intercept) + + # 2) setting elements of sample_weight to 0 is equivalent to removing these samples, + # another special case of check_sample_weight_equivalence(name, reg), but we + # also test with sparse input + sample_weight = rng.uniform(low=0.01, high=2, size=X.shape[0]) + sample_weight[-5:] = 0 + y[-5:] *= 1000 # to make excluding those samples important + reg.fit(X, y, sample_weight=sample_weight) + coef = reg.coef_.copy() + if fit_intercept: + intercept = reg.intercept_ + reg.fit(X[:-5, :], y[:-5], sample_weight=sample_weight[:-5]) + assert_allclose(reg.coef_, coef, rtol=1e-6) + if fit_intercept: + assert_allclose(reg.intercept_, intercept) + + # 3) scaling of sample_weight should have no effect + # Note: For models with penalty, scaling the penalty term might work. + reg2 = Ridge(**params).set_params(alpha=np.pi * params["alpha"]) + reg2.fit(X, y, sample_weight=np.pi * sample_weight) + if solver in ("sag", "saga") and not fit_intercept: + pytest.xfail(f"Solver {solver} does fail test for scaling of sample_weight.") + assert_allclose(reg2.coef_, coef, rtol=1e-6) + if fit_intercept: + assert_allclose(reg2.intercept_, intercept) + + # 4) check that multiplying sample_weight by 2 is equivalent + # to repeating corresponding samples twice + if sparse_container is not None: + X = X.toarray() + X2 = np.concatenate([X, X[: n_samples // 2]], axis=0) + y2 = np.concatenate([y, y[: n_samples // 2]]) + sample_weight_1 = sample_weight.copy() + sample_weight_1[: n_samples // 2] *= 2 + sample_weight_2 = np.concatenate( + [sample_weight, sample_weight[: n_samples // 2]], axis=0 + ) + if sparse_container is not None: + X = sparse_container(X) + X2 = sparse_container(X2) + reg1 = Ridge(**params).fit(X, y, sample_weight=sample_weight_1) + reg2 = Ridge(**params).fit(X2, y2, sample_weight=sample_weight_2) + assert_allclose(reg1.coef_, reg2.coef_) + if fit_intercept: + assert_allclose(reg1.intercept_, reg2.intercept_) + + +@pytest.mark.parametrize("with_sample_weight", [False, True]) +@pytest.mark.parametrize("fit_intercept", [False, True]) +@pytest.mark.parametrize("n_targets", [1, 2]) +def test_ridge_cv_results_predictions(with_sample_weight, fit_intercept, n_targets): + """Check that the predictions stored in `cv_results_` are on the original scale. + + The GCV approach works on scaled data: centered by an offset and scaled by the + square root of the sample weights. Thus, prior to computing scores, the + predictions need to be scaled back to the original scale. These predictions are + the ones stored in `cv_results_` in `RidgeCV`. + + In this test, we check that the internal predictions stored in `cv_results_` are + equivalent to a naive LOO-CV grid search with a `Ridge` estimator. 
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/13998 + """ + X, y = make_regression( + n_samples=100, n_features=10, n_targets=n_targets, random_state=0 + ) + sample_weight = np.ones(shape=(X.shape[0],)) + if with_sample_weight: + sample_weight[::2] = 0.5 + + alphas = (0.1, 1.0, 10.0) + + # scoring should be set to store predictions and not the squared error + ridge_cv = RidgeCV( + alphas=alphas, + scoring="neg_mean_squared_error", + fit_intercept=fit_intercept, + store_cv_results=True, + ) + ridge_cv.fit(X, y, sample_weight=sample_weight) + + # manual grid-search with a `Ridge` estimator + predictions = np.empty(shape=(*y.shape, len(alphas))) + cv = LeaveOneOut() + for alpha_idx, alpha in enumerate(alphas): + for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)): + ridge = Ridge(alpha=alpha, fit_intercept=fit_intercept) + ridge.fit(X[train_idx], y[train_idx], sample_weight[train_idx]) + predictions[idx, ..., alpha_idx] = ridge.predict(X[test_idx]) + assert_allclose(ridge_cv.cv_results_, predictions) + + +def test_ridge_cv_multioutput_sample_weight(global_random_seed): + """Check that `RidgeCV` works properly with multioutput and sample_weight + when `scoring != None`. + + We check the error reported by the RidgeCV is close to a naive LOO-CV using a + Ridge estimator. + """ + X, y = make_regression(n_targets=2, random_state=global_random_seed) + sample_weight = np.ones(shape=(X.shape[0],)) + + ridge_cv = RidgeCV(scoring="neg_mean_squared_error", store_cv_results=True) + ridge_cv.fit(X, y, sample_weight=sample_weight) + + cv = LeaveOneOut() + ridge = Ridge(alpha=ridge_cv.alpha_) + y_pred_loo = np.squeeze( + [ + ridge.fit(X[train], y[train], sample_weight=sample_weight[train]).predict( + X[test] + ) + for train, test in cv.split(X) + ] + ) + assert_allclose(ridge_cv.best_score_, -mean_squared_error(y, y_pred_loo)) + + +def test_ridge_cv_custom_multioutput_scorer(): + """Check that `RidgeCV` works properly with a custom multioutput scorer.""" + X, y = make_regression(n_targets=2, random_state=0) + + def custom_error(y_true, y_pred): + errors = (y_true - y_pred) ** 2 + mean_errors = np.mean(errors, axis=0) + if mean_errors.ndim == 1: + # case of multioutput + return -np.average(mean_errors, weights=[2, 1]) + # single output - this part of the code should not be reached in the case of + # multioutput scoring + return -mean_errors # pragma: no cover + + def custom_multioutput_scorer(estimator, X, y): + """Multioutput score that give twice more importance to the second target.""" + return -custom_error(y, estimator.predict(X)) + + ridge_cv = RidgeCV(scoring=custom_multioutput_scorer) + ridge_cv.fit(X, y) + + cv = LeaveOneOut() + ridge = Ridge(alpha=ridge_cv.alpha_) + y_pred_loo = np.squeeze( + [ridge.fit(X[train], y[train]).predict(X[test]) for train, test in cv.split(X)] + ) + + assert_allclose(ridge_cv.best_score_, -custom_error(y, y_pred_loo)) + + +# Metadata Routing Tests +# ====================== + + +@pytest.mark.parametrize("metaestimator", [RidgeCV, RidgeClassifierCV]) +@config_context(enable_metadata_routing=True) +def test_metadata_routing_with_default_scoring(metaestimator): + """Test that `RidgeCV` or `RidgeClassifierCV` with default `scoring` + argument (`None`), don't enter into `RecursionError` when metadata is routed. 
+ """ + metaestimator().get_metadata_routing() + + +@pytest.mark.parametrize( + "metaestimator, make_dataset", + [ + (RidgeCV(), make_regression), + (RidgeClassifierCV(), make_classification), + ], +) +@config_context(enable_metadata_routing=True) +def test_set_score_request_with_default_scoring(metaestimator, make_dataset): + """Test that `set_score_request` is set within `RidgeCV.fit()` and + `RidgeClassifierCV.fit()` when using the default scoring and no + UnsetMetadataPassedError is raised. Regression test for the fix in PR #29634.""" + X, y = make_dataset(n_samples=100, n_features=5, random_state=42) + metaestimator.fit(X, y, sample_weight=np.ones(X.shape[0])) + + +# End of Metadata Routing Tests +# ============================= diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_sag.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_sag.py new file mode 100644 index 0000000000000000000000000000000000000000..575838f8e8497a01c60adbb74ddad95dadc6e662 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_sag.py @@ -0,0 +1,861 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import math +import re + +import numpy as np +import pytest + +from sklearn.base import clone +from sklearn.datasets import load_iris, make_blobs, make_classification +from sklearn.linear_model import LogisticRegression, Ridge +from sklearn.linear_model._sag import get_auto_step_size +from sklearn.multiclass import OneVsRestClassifier +from sklearn.preprocessing import LabelEncoder +from sklearn.utils import check_random_state, compute_class_weight +from sklearn.utils._testing import ( + assert_allclose, + assert_almost_equal, + assert_array_almost_equal, +) +from sklearn.utils.extmath import row_norms +from sklearn.utils.fixes import CSR_CONTAINERS + +iris = load_iris() + + +# this is used for sag classification +def log_dloss(p, y): + z = p * y + # approximately equal and saves the computation of the log + if z > 18.0: + return math.exp(-z) * -y + if z < -18.0: + return -y + return -y / (math.exp(z) + 1.0) + + +def log_loss(p, y): + return np.mean(np.log(1.0 + np.exp(-y * p))) + + +# this is used for sag regression +def squared_dloss(p, y): + return p - y + + +def squared_loss(p, y): + return np.mean(0.5 * (p - y) * (p - y)) + + +# function for measuring the log loss +def get_pobj(w, alpha, myX, myy, loss): + w = w.ravel() + pred = np.dot(myX, w) + p = loss(pred, myy) + p += alpha * w.dot(w) / 2.0 + return p + + +def sag( + X, + y, + step_size, + alpha, + n_iter=1, + dloss=None, + sparse=False, + sample_weight=None, + fit_intercept=True, + saga=False, +): + n_samples, n_features = X.shape[0], X.shape[1] + + weights = np.zeros(X.shape[1]) + sum_gradient = np.zeros(X.shape[1]) + gradient_memory = np.zeros((n_samples, n_features)) + + intercept = 0.0 + intercept_sum_gradient = 0.0 + intercept_gradient_memory = np.zeros(n_samples) + + rng = np.random.RandomState(77) + decay = 1.0 + seen = set() + + # sparse data has a fixed decay of .01 + if sparse: + decay = 0.01 + + for epoch in range(n_iter): + for k in range(n_samples): + idx = int(rng.rand() * n_samples) + # idx = k + entry = X[idx] + seen.add(idx) + p = np.dot(entry, weights) + intercept + gradient = dloss(p, y[idx]) + if sample_weight is not None: + gradient *= sample_weight[idx] + update = entry * gradient + alpha * weights + gradient_correction = update - gradient_memory[idx] + sum_gradient += gradient_correction + gradient_memory[idx] = 
update + if saga: + weights -= gradient_correction * step_size * (1 - 1.0 / len(seen)) + + if fit_intercept: + gradient_correction = gradient - intercept_gradient_memory[idx] + intercept_gradient_memory[idx] = gradient + intercept_sum_gradient += gradient_correction + gradient_correction *= step_size * (1.0 - 1.0 / len(seen)) + if saga: + intercept -= ( + step_size * intercept_sum_gradient / len(seen) * decay + ) + gradient_correction + else: + intercept -= step_size * intercept_sum_gradient / len(seen) * decay + + weights -= step_size * sum_gradient / len(seen) + + return weights, intercept + + +def sag_sparse( + X, + y, + step_size, + alpha, + n_iter=1, + dloss=None, + sample_weight=None, + sparse=False, + fit_intercept=True, + saga=False, + random_state=0, +): + if step_size * alpha == 1.0: + raise ZeroDivisionError( + "Sparse sag does not handle the case step_size * alpha == 1" + ) + n_samples, n_features = X.shape[0], X.shape[1] + + weights = np.zeros(n_features) + sum_gradient = np.zeros(n_features) + last_updated = np.zeros(n_features, dtype=int) + gradient_memory = np.zeros(n_samples) + rng = check_random_state(random_state) + intercept = 0.0 + intercept_sum_gradient = 0.0 + wscale = 1.0 + decay = 1.0 + seen = set() + + c_sum = np.zeros(n_iter * n_samples) + + # sparse data has a fixed decay of .01 + if sparse: + decay = 0.01 + + counter = 0 + for epoch in range(n_iter): + for k in range(n_samples): + # idx = k + idx = int(rng.rand() * n_samples) + entry = X[idx] + seen.add(idx) + + if counter >= 1: + for j in range(n_features): + if last_updated[j] == 0: + weights[j] -= c_sum[counter - 1] * sum_gradient[j] + else: + weights[j] -= ( + c_sum[counter - 1] - c_sum[last_updated[j] - 1] + ) * sum_gradient[j] + last_updated[j] = counter + + p = (wscale * np.dot(entry, weights)) + intercept + gradient = dloss(p, y[idx]) + + if sample_weight is not None: + gradient *= sample_weight[idx] + + update = entry * gradient + gradient_correction = update - (gradient_memory[idx] * entry) + sum_gradient += gradient_correction + if saga: + for j in range(n_features): + weights[j] -= ( + gradient_correction[j] + * step_size + * (1 - 1.0 / len(seen)) + / wscale + ) + + if fit_intercept: + gradient_correction = gradient - gradient_memory[idx] + intercept_sum_gradient += gradient_correction + gradient_correction *= step_size * (1.0 - 1.0 / len(seen)) + if saga: + intercept -= ( + step_size * intercept_sum_gradient / len(seen) * decay + ) + gradient_correction + else: + intercept -= step_size * intercept_sum_gradient / len(seen) * decay + + gradient_memory[idx] = gradient + + wscale *= 1.0 - alpha * step_size + if counter == 0: + c_sum[0] = step_size / (wscale * len(seen)) + else: + c_sum[counter] = c_sum[counter - 1] + step_size / (wscale * len(seen)) + + if counter >= 1 and wscale < 1e-9: + for j in range(n_features): + if last_updated[j] == 0: + weights[j] -= c_sum[counter] * sum_gradient[j] + else: + weights[j] -= ( + c_sum[counter] - c_sum[last_updated[j] - 1] + ) * sum_gradient[j] + last_updated[j] = counter + 1 + c_sum[counter] = 0 + weights *= wscale + wscale = 1.0 + + counter += 1 + + for j in range(n_features): + if last_updated[j] == 0: + weights[j] -= c_sum[counter - 1] * sum_gradient[j] + else: + weights[j] -= ( + c_sum[counter - 1] - c_sum[last_updated[j] - 1] + ) * sum_gradient[j] + weights *= wscale + return weights, intercept + + +def get_step_size(X, alpha, fit_intercept, classification=True): + if classification: + return 4.0 / (np.max(np.sum(X * X, axis=1)) + fit_intercept + 4.0 * 
alpha) + else: + return 1.0 / (np.max(np.sum(X * X, axis=1)) + fit_intercept + alpha) + + +def test_classifier_matching(): + n_samples = 20 + X, y = make_blobs(n_samples=n_samples, centers=2, random_state=0, cluster_std=0.1) + # y must be 0 or 1 + alpha = 1.1 + fit_intercept = True + step_size = get_step_size(X, alpha, fit_intercept) + for solver in ["sag", "saga"]: + if solver == "sag": + n_iter = 80 + else: + # SAGA variance w.r.t. stream order is higher + n_iter = 300 + clf = LogisticRegression( + solver=solver, + fit_intercept=fit_intercept, + tol=1e-11, + C=1.0 / alpha / n_samples, + max_iter=n_iter, + random_state=10, + ) + clf.fit(X, y) + + weights, intercept = sag_sparse( + X, + 2 * y - 1, # y must be -1 or +1 + step_size, + alpha, + n_iter=n_iter, + dloss=log_dloss, + fit_intercept=fit_intercept, + saga=solver == "saga", + ) + weights2, intercept2 = sag( + X, + 2 * y - 1, # y must be -1 or +1 + step_size, + alpha, + n_iter=n_iter, + dloss=log_dloss, + fit_intercept=fit_intercept, + saga=solver == "saga", + ) + weights = np.atleast_2d(weights) + intercept = np.atleast_1d(intercept) + weights2 = np.atleast_2d(weights2) + intercept2 = np.atleast_1d(intercept2) + + assert_array_almost_equal(weights, clf.coef_, decimal=9) + assert_array_almost_equal(intercept, clf.intercept_, decimal=9) + assert_array_almost_equal(weights2, clf.coef_, decimal=9) + assert_array_almost_equal(intercept2, clf.intercept_, decimal=9) + + +def test_regressor_matching(): + n_samples = 10 + n_features = 5 + + rng = np.random.RandomState(10) + X = rng.normal(size=(n_samples, n_features)) + true_w = rng.normal(size=n_features) + y = X.dot(true_w) + + alpha = 1.0 + n_iter = 100 + fit_intercept = True + + step_size = get_step_size(X, alpha, fit_intercept, classification=False) + clf = Ridge( + fit_intercept=fit_intercept, + tol=0.00000000001, + solver="sag", + alpha=alpha * n_samples, + max_iter=n_iter, + ) + clf.fit(X, y) + + weights1, intercept1 = sag_sparse( + X, + y, + step_size, + alpha, + n_iter=n_iter, + dloss=squared_dloss, + fit_intercept=fit_intercept, + ) + weights2, intercept2 = sag( + X, + y, + step_size, + alpha, + n_iter=n_iter, + dloss=squared_dloss, + fit_intercept=fit_intercept, + ) + + assert_allclose(weights1, clf.coef_) + assert_allclose(intercept1, clf.intercept_) + assert_allclose(weights2, clf.coef_) + assert_allclose(intercept2, clf.intercept_) + + +@pytest.mark.filterwarnings("ignore:The max_iter was reached") +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sag_pobj_matches_logistic_regression(csr_container): + """tests if the sag pobj matches log reg""" + n_samples = 100 + alpha = 1.0 + max_iter = 20 + X, y = make_blobs(n_samples=n_samples, centers=2, random_state=0, cluster_std=0.1) + + clf1 = LogisticRegression( + solver="sag", + fit_intercept=False, + tol=0.0000001, + C=1.0 / alpha / n_samples, + max_iter=max_iter, + random_state=10, + ) + clf2 = clone(clf1) + clf3 = LogisticRegression( + fit_intercept=False, + tol=0.0000001, + C=1.0 / alpha / n_samples, + max_iter=max_iter, + random_state=10, + ) + + clf1.fit(X, y) + clf2.fit(csr_container(X), y) + clf3.fit(X, y) + + pobj1 = get_pobj(clf1.coef_, alpha, X, y, log_loss) + pobj2 = get_pobj(clf2.coef_, alpha, X, y, log_loss) + pobj3 = get_pobj(clf3.coef_, alpha, X, y, log_loss) + + assert_array_almost_equal(pobj1, pobj2, decimal=4) + assert_array_almost_equal(pobj2, pobj3, decimal=4) + assert_array_almost_equal(pobj3, pobj1, decimal=4) + + +@pytest.mark.filterwarnings("ignore:The max_iter was reached") 
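+# The SAG primal objective (pobj) reached on dense and on sparse input should both
+# match the objective reached by the exact lsqr solver, up to 4 decimals.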
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sag_pobj_matches_ridge_regression(csr_container): + """tests if the sag pobj matches ridge reg""" + n_samples = 100 + n_features = 10 + alpha = 1.0 + n_iter = 100 + fit_intercept = False + rng = np.random.RandomState(10) + X = rng.normal(size=(n_samples, n_features)) + true_w = rng.normal(size=n_features) + y = X.dot(true_w) + + clf1 = Ridge( + fit_intercept=fit_intercept, + tol=0.00000000001, + solver="sag", + alpha=alpha, + max_iter=n_iter, + random_state=42, + ) + clf2 = clone(clf1) + clf3 = Ridge( + fit_intercept=fit_intercept, + tol=0.00001, + solver="lsqr", + alpha=alpha, + max_iter=n_iter, + random_state=42, + ) + + clf1.fit(X, y) + clf2.fit(csr_container(X), y) + clf3.fit(X, y) + + pobj1 = get_pobj(clf1.coef_, alpha, X, y, squared_loss) + pobj2 = get_pobj(clf2.coef_, alpha, X, y, squared_loss) + pobj3 = get_pobj(clf3.coef_, alpha, X, y, squared_loss) + + assert_array_almost_equal(pobj1, pobj2, decimal=4) + assert_array_almost_equal(pobj1, pobj3, decimal=4) + assert_array_almost_equal(pobj3, pobj2, decimal=4) + + +@pytest.mark.filterwarnings("ignore:The max_iter was reached") +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sag_regressor_computed_correctly(csr_container): + """tests if the sag regressor is computed correctly""" + alpha = 0.1 + n_features = 10 + n_samples = 40 + max_iter = 100 + tol = 0.000001 + fit_intercept = True + rng = np.random.RandomState(0) + X = rng.normal(size=(n_samples, n_features)) + w = rng.normal(size=n_features) + y = np.dot(X, w) + 2.0 + step_size = get_step_size(X, alpha, fit_intercept, classification=False) + + clf1 = Ridge( + fit_intercept=fit_intercept, + tol=tol, + solver="sag", + alpha=alpha * n_samples, + max_iter=max_iter, + random_state=rng, + ) + clf2 = clone(clf1) + + clf1.fit(X, y) + clf2.fit(csr_container(X), y) + + spweights1, spintercept1 = sag_sparse( + X, + y, + step_size, + alpha, + n_iter=max_iter, + dloss=squared_dloss, + fit_intercept=fit_intercept, + random_state=rng, + ) + + spweights2, spintercept2 = sag_sparse( + X, + y, + step_size, + alpha, + n_iter=max_iter, + dloss=squared_dloss, + sparse=True, + fit_intercept=fit_intercept, + random_state=rng, + ) + + assert_array_almost_equal(clf1.coef_.ravel(), spweights1.ravel(), decimal=3) + assert_almost_equal(clf1.intercept_, spintercept1, decimal=1) + + # TODO: uncomment when sparse Ridge with intercept will be fixed (#4710) + # assert_array_almost_equal(clf2.coef_.ravel(), + # spweights2.ravel(), + # decimal=3) + # assert_almost_equal(clf2.intercept_, spintercept2, decimal=1)''' + + +def test_get_auto_step_size(): + X = np.array([[1, 2, 3], [2, 3, 4], [2, 3, 2]], dtype=np.float64) + alpha = 1.2 + fit_intercept = False + # sum the squares of the second sample because that's the largest + max_squared_sum = 4 + 9 + 16 + max_squared_sum_ = row_norms(X, squared=True).max() + n_samples = X.shape[0] + assert_almost_equal(max_squared_sum, max_squared_sum_, decimal=4) + + for saga in [True, False]: + for fit_intercept in (True, False): + if saga: + L_sqr = max_squared_sum + alpha + int(fit_intercept) + L_log = (max_squared_sum + 4.0 * alpha + int(fit_intercept)) / 4.0 + mun_sqr = min(2 * n_samples * alpha, L_sqr) + mun_log = min(2 * n_samples * alpha, L_log) + step_size_sqr = 1 / (2 * L_sqr + mun_sqr) + step_size_log = 1 / (2 * L_log + mun_log) + else: + step_size_sqr = 1.0 / (max_squared_sum + alpha + int(fit_intercept)) + step_size_log = 4.0 / ( + max_squared_sum + 4.0 * alpha + int(fit_intercept) + 
) + + step_size_sqr_ = get_auto_step_size( + max_squared_sum_, + alpha, + "squared", + fit_intercept, + n_samples=n_samples, + is_saga=saga, + ) + step_size_log_ = get_auto_step_size( + max_squared_sum_, + alpha, + "log", + fit_intercept, + n_samples=n_samples, + is_saga=saga, + ) + + assert_almost_equal(step_size_sqr, step_size_sqr_, decimal=4) + assert_almost_equal(step_size_log, step_size_log_, decimal=4) + + msg = "Unknown loss function for SAG solver, got wrong instead of" + with pytest.raises(ValueError, match=msg): + get_auto_step_size(max_squared_sum_, alpha, "wrong", fit_intercept) + + +@pytest.mark.parametrize("seed", range(3)) # locally tested with 1000 seeds +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sag_regressor(seed, csr_container): + """tests if the sag regressor performs well""" + xmin, xmax = -5, 5 + n_samples = 300 + tol = 0.001 + max_iter = 100 + alpha = 0.1 + rng = np.random.RandomState(seed) + X = np.linspace(xmin, xmax, n_samples).reshape(n_samples, 1) + + # simple linear function without noise + y = 0.5 * X.ravel() + + clf1 = Ridge( + tol=tol, + solver="sag", + max_iter=max_iter, + alpha=alpha * n_samples, + random_state=rng, + ) + clf2 = clone(clf1) + clf1.fit(X, y) + clf2.fit(csr_container(X), y) + score1 = clf1.score(X, y) + score2 = clf2.score(X, y) + assert score1 > 0.98 + assert score2 > 0.98 + + # simple linear function with noise + y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel() + + clf1 = Ridge(tol=tol, solver="sag", max_iter=max_iter, alpha=alpha * n_samples) + clf2 = clone(clf1) + clf1.fit(X, y) + clf2.fit(csr_container(X), y) + score1 = clf1.score(X, y) + score2 = clf2.score(X, y) + assert score1 > 0.45 + assert score2 > 0.45 + + +@pytest.mark.filterwarnings("ignore:The max_iter was reached") +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sag_classifier_computed_correctly(csr_container): + """tests if the binary classifier is computed correctly""" + alpha = 0.1 + n_samples = 50 + n_iter = 50 + tol = 0.00001 + fit_intercept = True + X, y = make_blobs(n_samples=n_samples, centers=2, random_state=0, cluster_std=0.1) + step_size = get_step_size(X, alpha, fit_intercept, classification=True) + classes = np.unique(y) + y_tmp = np.ones(n_samples) + y_tmp[y != classes[1]] = -1 + y = y_tmp + + clf1 = LogisticRegression( + solver="sag", + C=1.0 / alpha / n_samples, + max_iter=n_iter, + tol=tol, + random_state=77, + fit_intercept=fit_intercept, + ) + clf2 = clone(clf1) + + clf1.fit(X, y) + clf2.fit(csr_container(X), y) + + spweights, spintercept = sag_sparse( + X, + y, + step_size, + alpha, + n_iter=n_iter, + dloss=log_dloss, + fit_intercept=fit_intercept, + ) + spweights2, spintercept2 = sag_sparse( + X, + y, + step_size, + alpha, + n_iter=n_iter, + dloss=log_dloss, + sparse=True, + fit_intercept=fit_intercept, + ) + + assert_array_almost_equal(clf1.coef_.ravel(), spweights.ravel(), decimal=2) + assert_almost_equal(clf1.intercept_, spintercept, decimal=1) + + assert_array_almost_equal(clf2.coef_.ravel(), spweights2.ravel(), decimal=2) + assert_almost_equal(clf2.intercept_, spintercept2, decimal=1) + + +@pytest.mark.filterwarnings("ignore:The max_iter was reached") +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sag_multiclass_computed_correctly(csr_container): + """tests if the multiclass classifier is computed correctly""" + alpha = 0.1 + n_samples = 20 + tol = 1e-5 + max_iter = 70 + fit_intercept = True + X, y = make_blobs(n_samples=n_samples, centers=3, random_state=0, cluster_std=0.1) + 
step_size = get_step_size(X, alpha, fit_intercept, classification=True) + classes = np.unique(y) + + clf1 = OneVsRestClassifier( + LogisticRegression( + solver="sag", + C=1.0 / alpha / n_samples, + max_iter=max_iter, + tol=tol, + random_state=77, + fit_intercept=fit_intercept, + ) + ) + clf2 = clone(clf1) + + clf1.fit(X, y) + clf2.fit(csr_container(X), y) + + coef1 = [] + intercept1 = [] + coef2 = [] + intercept2 = [] + for cl in classes: + y_encoded = np.ones(n_samples) + y_encoded[y != cl] = -1 + + spweights1, spintercept1 = sag_sparse( + X, + y_encoded, + step_size, + alpha, + dloss=log_dloss, + n_iter=max_iter, + fit_intercept=fit_intercept, + ) + spweights2, spintercept2 = sag_sparse( + X, + y_encoded, + step_size, + alpha, + dloss=log_dloss, + n_iter=max_iter, + sparse=True, + fit_intercept=fit_intercept, + ) + coef1.append(spweights1) + intercept1.append(spintercept1) + + coef2.append(spweights2) + intercept2.append(spintercept2) + + coef1 = np.vstack(coef1) + intercept1 = np.array(intercept1) + coef2 = np.vstack(coef2) + intercept2 = np.array(intercept2) + + for i, cl in enumerate(classes): + assert_allclose(clf1.estimators_[i].coef_.ravel(), coef1[i], rtol=1e-2) + assert_allclose(clf1.estimators_[i].intercept_, intercept1[i], rtol=1e-1) + + assert_allclose(clf2.estimators_[i].coef_.ravel(), coef2[i], rtol=1e-2) + # Note the very crude accuracy, i.e. high rtol. + assert_allclose(clf2.estimators_[i].intercept_, intercept2[i], rtol=5e-1) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_classifier_results(csr_container): + """tests if classifier results match target""" + alpha = 0.1 + n_features = 20 + n_samples = 10 + tol = 0.01 + max_iter = 200 + rng = np.random.RandomState(0) + X = rng.normal(size=(n_samples, n_features)) + w = rng.normal(size=n_features) + y = np.dot(X, w) + y = np.sign(y) + clf1 = LogisticRegression( + solver="sag", + C=1.0 / alpha / n_samples, + max_iter=max_iter, + tol=tol, + random_state=77, + ) + clf2 = clone(clf1) + + clf1.fit(X, y) + clf2.fit(csr_container(X), y) + pred1 = clf1.predict(X) + pred2 = clf2.predict(X) + assert_almost_equal(pred1, y, decimal=12) + assert_almost_equal(pred2, y, decimal=12) + + +@pytest.mark.filterwarnings("ignore:The max_iter was reached") +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_binary_classifier_class_weight(csr_container): + """tests binary classifier with classweights for each class""" + alpha = 0.1 + n_samples = 50 + n_iter = 20 + tol = 0.00001 + fit_intercept = True + X, y = make_blobs(n_samples=n_samples, centers=2, random_state=10, cluster_std=0.1) + step_size = get_step_size(X, alpha, fit_intercept, classification=True) + classes = np.unique(y) + y_tmp = np.ones(n_samples) + y_tmp[y != classes[1]] = -1 + y = y_tmp + + class_weight = {1: 0.45, -1: 0.55} + clf1 = LogisticRegression( + solver="sag", + C=1.0 / alpha / n_samples, + max_iter=n_iter, + tol=tol, + random_state=77, + fit_intercept=fit_intercept, + class_weight=class_weight, + ) + clf2 = clone(clf1) + + clf1.fit(X, y) + clf2.fit(csr_container(X), y) + + le = LabelEncoder() + class_weight_ = compute_class_weight(class_weight, classes=np.unique(y), y=y) + sample_weight = class_weight_[le.fit_transform(y)] + spweights, spintercept = sag_sparse( + X, + y, + step_size, + alpha, + n_iter=n_iter, + dloss=log_dloss, + sample_weight=sample_weight, + fit_intercept=fit_intercept, + ) + spweights2, spintercept2 = sag_sparse( + X, + y, + step_size, + alpha, + n_iter=n_iter, + dloss=log_dloss, + sparse=True, + 
sample_weight=sample_weight, + fit_intercept=fit_intercept, + ) + + assert_array_almost_equal(clf1.coef_.ravel(), spweights.ravel(), decimal=2) + assert_almost_equal(clf1.intercept_, spintercept, decimal=1) + + assert_array_almost_equal(clf2.coef_.ravel(), spweights2.ravel(), decimal=2) + assert_almost_equal(clf2.intercept_, spintercept2, decimal=1) + + +def test_classifier_single_class(): + """tests if ValueError is thrown with only one class""" + X = [[1, 2], [3, 4]] + y = [1, 1] + + msg = "This solver needs samples of at least 2 classes in the data" + with pytest.raises(ValueError, match=msg): + LogisticRegression(solver="sag").fit(X, y) + + +def test_step_size_alpha_error(): + X = [[0, 0], [0, 0]] + y = [1, -1] + fit_intercept = False + alpha = 1.0 + msg = re.escape( + "Current sag implementation does not handle the case" + " step_size * alpha_scaled == 1" + ) + + clf1 = LogisticRegression(solver="sag", C=1.0 / alpha, fit_intercept=fit_intercept) + with pytest.raises(ZeroDivisionError, match=msg): + clf1.fit(X, y) + + clf2 = Ridge(fit_intercept=fit_intercept, solver="sag", alpha=alpha) + with pytest.raises(ZeroDivisionError, match=msg): + clf2.fit(X, y) + + +@pytest.mark.parametrize("solver", ["sag", "saga"]) +def test_sag_classifier_raises_error(solver): + # Following #13316, the error handling behavior changed in cython sag. This + # is simply a non-regression test to make sure numerical errors are + # properly raised. + + # Train a classifier on a simple problem + rng = np.random.RandomState(42) + X, y = make_classification(random_state=rng) + clf = LogisticRegression(solver=solver, random_state=rng, warm_start=True) + clf.fit(X, y) + + # Trigger a numerical error by: + # - corrupting the fitted coefficients of the classifier + # - fit it again starting from its current state thanks to warm_start + clf.coef_[:] = np.nan + + with pytest.raises(ValueError, match="Floating-point under-/overflow"): + clf.fit(X, y) diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_sgd.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_sgd.py new file mode 100644 index 0000000000000000000000000000000000000000..26d138ae3649b22c4848dcacae1391a399e72fcd --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_sgd.py @@ -0,0 +1,2195 @@ +import pickle +from unittest.mock import Mock + +import joblib +import numpy as np +import pytest +import scipy.sparse as sp + +from sklearn import datasets, linear_model, metrics +from sklearn.base import clone, is_classifier +from sklearn.exceptions import ConvergenceWarning +from sklearn.kernel_approximation import Nystroem +from sklearn.linear_model import _sgd_fast as sgd_fast +from sklearn.linear_model import _stochastic_gradient +from sklearn.model_selection import ( + RandomizedSearchCV, + ShuffleSplit, + StratifiedShuffleSplit, +) +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, scale +from sklearn.svm import OneClassSVM +from sklearn.utils import get_tags +from sklearn.utils._testing import ( + assert_allclose, + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) + + +def _update_kwargs(kwargs): + if "random_state" not in kwargs: + kwargs["random_state"] = 42 + + if "tol" not in kwargs: + kwargs["tol"] = None + if "max_iter" not in kwargs: + kwargs["max_iter"] = 5 + + +class _SparseSGDClassifier(linear_model.SGDClassifier): + def fit(self, X, y, *args, **kw): + X = sp.csr_matrix(X) + return 
super().fit(X, y, *args, **kw) + + def partial_fit(self, X, y, *args, **kw): + X = sp.csr_matrix(X) + return super().partial_fit(X, y, *args, **kw) + + def decision_function(self, X): + X = sp.csr_matrix(X) + return super().decision_function(X) + + def predict_proba(self, X): + X = sp.csr_matrix(X) + return super().predict_proba(X) + + +class _SparseSGDRegressor(linear_model.SGDRegressor): + def fit(self, X, y, *args, **kw): + X = sp.csr_matrix(X) + return linear_model.SGDRegressor.fit(self, X, y, *args, **kw) + + def partial_fit(self, X, y, *args, **kw): + X = sp.csr_matrix(X) + return linear_model.SGDRegressor.partial_fit(self, X, y, *args, **kw) + + def decision_function(self, X, *args, **kw): + # XXX untested as of v0.22 + X = sp.csr_matrix(X) + return linear_model.SGDRegressor.decision_function(self, X, *args, **kw) + + +class _SparseSGDOneClassSVM(linear_model.SGDOneClassSVM): + def fit(self, X, *args, **kw): + X = sp.csr_matrix(X) + return linear_model.SGDOneClassSVM.fit(self, X, *args, **kw) + + def partial_fit(self, X, *args, **kw): + X = sp.csr_matrix(X) + return linear_model.SGDOneClassSVM.partial_fit(self, X, *args, **kw) + + def decision_function(self, X, *args, **kw): + X = sp.csr_matrix(X) + return linear_model.SGDOneClassSVM.decision_function(self, X, *args, **kw) + + +def SGDClassifier(**kwargs): + _update_kwargs(kwargs) + return linear_model.SGDClassifier(**kwargs) + + +def SGDRegressor(**kwargs): + _update_kwargs(kwargs) + return linear_model.SGDRegressor(**kwargs) + + +def SGDOneClassSVM(**kwargs): + _update_kwargs(kwargs) + return linear_model.SGDOneClassSVM(**kwargs) + + +def SparseSGDClassifier(**kwargs): + _update_kwargs(kwargs) + return _SparseSGDClassifier(**kwargs) + + +def SparseSGDRegressor(**kwargs): + _update_kwargs(kwargs) + return _SparseSGDRegressor(**kwargs) + + +def SparseSGDOneClassSVM(**kwargs): + _update_kwargs(kwargs) + return _SparseSGDOneClassSVM(**kwargs) + + +# Test Data + +# test sample 1 +X = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]) +Y = [1, 1, 1, 2, 2, 2] +T = np.array([[-1, -1], [2, 2], [3, 2]]) +true_result = [1, 2, 2] + +# test sample 2; string class labels +X2 = np.array( + [ + [-1, 1], + [-0.75, 0.5], + [-1.5, 1.5], + [1, 1], + [0.75, 0.5], + [1.5, 1.5], + [-1, -1], + [0, -0.5], + [1, -1], + ] +) +Y2 = ["one"] * 3 + ["two"] * 3 + ["three"] * 3 +T2 = np.array([[-1.5, 0.5], [1, 2], [0, -2]]) +true_result2 = ["one", "two", "three"] + +# test sample 3 +X3 = np.array( + [ + [1, 1, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0], + [0, 0, 1, 0, 0, 0], + [0, 0, 1, 0, 0, 0], + [0, 0, 0, 0, 1, 1], + [0, 0, 0, 0, 1, 1], + [0, 0, 0, 1, 0, 0], + [0, 0, 0, 1, 0, 0], + ] +) +Y3 = np.array([1, 1, 1, 1, 2, 2, 2, 2]) + +# test sample 4 - two more or less redundant feature groups +X4 = np.array( + [ + [1, 0.9, 0.8, 0, 0, 0], + [1, 0.84, 0.98, 0, 0, 0], + [1, 0.96, 0.88, 0, 0, 0], + [1, 0.91, 0.99, 0, 0, 0], + [0, 0, 0, 0.89, 0.91, 1], + [0, 0, 0, 0.79, 0.84, 1], + [0, 0, 0, 0.91, 0.95, 1], + [0, 0, 0, 0.93, 1, 1], + ] +) +Y4 = np.array([1, 1, 1, 1, 2, 2, 2, 2]) + +iris = datasets.load_iris() + +# test sample 5 - test sample 1 as binary classification problem +X5 = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]) +Y5 = [1, 1, 1, 2, 2, 2] +true_result5 = [0, 1, 1] + + +############################################################################### +# Common Test Case to classification and regression + + +# a simple implementation of ASGD to use for testing +# uses squared loss to find the gradient +def asgd(klass, X, y, eta, alpha, 
weight_init=None, intercept_init=0.0): + if weight_init is None: + weights = np.zeros(X.shape[1]) + else: + weights = weight_init + + average_weights = np.zeros(X.shape[1]) + intercept = intercept_init + average_intercept = 0.0 + decay = 1.0 + + # sparse data has a fixed decay of .01 + if klass in (SparseSGDClassifier, SparseSGDRegressor): + decay = 0.01 + + for i, entry in enumerate(X): + p = np.dot(entry, weights) + p += intercept + gradient = p - y[i] + weights *= 1.0 - (eta * alpha) + weights += -(eta * gradient * entry) + intercept += -(eta * gradient) * decay + + average_weights *= i + average_weights += weights + average_weights /= i + 1.0 + + average_intercept *= i + average_intercept += intercept + average_intercept /= i + 1.0 + + return average_weights, average_intercept + + +def _test_warm_start(klass, X, Y, lr): + # Test that explicit warm restart... + clf = klass(alpha=0.01, eta0=0.01, shuffle=False, learning_rate=lr) + clf.fit(X, Y) + + clf2 = klass(alpha=0.001, eta0=0.01, shuffle=False, learning_rate=lr) + clf2.fit(X, Y, coef_init=clf.coef_.copy(), intercept_init=clf.intercept_.copy()) + + # ... and implicit warm restart are equivalent. + clf3 = klass( + alpha=0.01, eta0=0.01, shuffle=False, warm_start=True, learning_rate=lr + ) + clf3.fit(X, Y) + + assert clf3.t_ == clf.t_ + assert_array_almost_equal(clf3.coef_, clf.coef_) + + clf3.set_params(alpha=0.001) + clf3.fit(X, Y) + + assert clf3.t_ == clf2.t_ + assert_array_almost_equal(clf3.coef_, clf2.coef_) + + +@pytest.mark.parametrize( + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] +) +@pytest.mark.parametrize("lr", ["constant", "optimal", "invscaling", "adaptive"]) +def test_warm_start(klass, lr): + _test_warm_start(klass, X, Y, lr) + + +@pytest.mark.parametrize( + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] +) +def test_input_format(klass): + # Input format tests. + clf = klass(alpha=0.01, shuffle=False) + clf.fit(X, Y) + Y_ = np.array(Y)[:, np.newaxis] + + Y_ = np.c_[Y_, Y_] + with pytest.raises(ValueError): + clf.fit(X, Y_) + + +@pytest.mark.parametrize( + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] +) +def test_clone(klass): + # Test whether clone works ok. 
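+    # clone() copies constructor parameters but no fitted state, so a clone that is
+    # re-parameterised with set_params and refit should match a freshly built estimator.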
+ clf = klass(alpha=0.01, penalty="l1") + clf = clone(clf) + clf.set_params(penalty="l2") + clf.fit(X, Y) + + clf2 = klass(alpha=0.01, penalty="l2") + clf2.fit(X, Y) + + assert_array_equal(clf.coef_, clf2.coef_) + + +@pytest.mark.parametrize( + "klass", + [ + SGDClassifier, + SparseSGDClassifier, + SGDRegressor, + SparseSGDRegressor, + SGDOneClassSVM, + SparseSGDOneClassSVM, + ], +) +def test_plain_has_no_average_attr(klass): + clf = klass(average=True, eta0=0.01) + clf.fit(X, Y) + + assert hasattr(clf, "_average_coef") + assert hasattr(clf, "_average_intercept") + assert hasattr(clf, "_standard_intercept") + assert hasattr(clf, "_standard_coef") + + clf = klass() + clf.fit(X, Y) + + assert not hasattr(clf, "_average_coef") + assert not hasattr(clf, "_average_intercept") + assert not hasattr(clf, "_standard_intercept") + assert not hasattr(clf, "_standard_coef") + + +@pytest.mark.parametrize( + "klass", + [ + SGDClassifier, + SparseSGDClassifier, + SGDRegressor, + SparseSGDRegressor, + SGDOneClassSVM, + SparseSGDOneClassSVM, + ], +) +def test_late_onset_averaging_not_reached(klass): + clf1 = klass(average=600) + clf2 = klass() + for _ in range(100): + if is_classifier(clf1): + clf1.partial_fit(X, Y, classes=np.unique(Y)) + clf2.partial_fit(X, Y, classes=np.unique(Y)) + else: + clf1.partial_fit(X, Y) + clf2.partial_fit(X, Y) + + assert_array_almost_equal(clf1.coef_, clf2.coef_, decimal=16) + if klass in [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor]: + assert_almost_equal(clf1.intercept_, clf2.intercept_, decimal=16) + elif klass in [SGDOneClassSVM, SparseSGDOneClassSVM]: + assert_allclose(clf1.offset_, clf2.offset_) + + +@pytest.mark.parametrize( + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] +) +def test_late_onset_averaging_reached(klass): + eta0 = 0.001 + alpha = 0.0001 + Y_encode = np.array(Y) + Y_encode[Y_encode == 1] = -1.0 + Y_encode[Y_encode == 2] = 1.0 + + clf1 = klass( + average=7, + learning_rate="constant", + loss="squared_error", + eta0=eta0, + alpha=alpha, + max_iter=2, + shuffle=False, + ) + clf2 = klass( + average=False, + learning_rate="constant", + loss="squared_error", + eta0=eta0, + alpha=alpha, + max_iter=1, + shuffle=False, + ) + + clf1.fit(X, Y_encode) + clf2.fit(X, Y_encode) + + average_weights, average_intercept = asgd( + klass, + X, + Y_encode, + eta0, + alpha, + weight_init=clf2.coef_.ravel(), + intercept_init=clf2.intercept_, + ) + + assert_array_almost_equal(clf1.coef_.ravel(), average_weights.ravel(), decimal=16) + assert_almost_equal(clf1.intercept_, average_intercept, decimal=16) + + +@pytest.mark.parametrize( + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] +) +def test_early_stopping(klass): + X = iris.data[iris.target > 0] + Y = iris.target[iris.target > 0] + for early_stopping in [True, False]: + max_iter = 1000 + clf = klass(early_stopping=early_stopping, tol=1e-3, max_iter=max_iter).fit( + X, Y + ) + assert clf.n_iter_ < max_iter + + +@pytest.mark.parametrize( + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] +) +def test_adaptive_longer_than_constant(klass): + clf1 = klass(learning_rate="adaptive", eta0=0.01, tol=1e-3, max_iter=100) + clf1.fit(iris.data, iris.target) + clf2 = klass(learning_rate="constant", eta0=0.01, tol=1e-3, max_iter=100) + clf2.fit(iris.data, iris.target) + assert clf1.n_iter_ > clf2.n_iter_ + + +@pytest.mark.parametrize( + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] +) 
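+# With shuffle=False and a fixed validation split, early stopping must hold out the
+# validation rows: manually fitting on only the train indices should give identical coef_.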
+def test_validation_set_not_used_for_training(klass): + X, Y = iris.data, iris.target + validation_fraction = 0.4 + seed = 42 + shuffle = False + max_iter = 10 + clf1 = klass( + early_stopping=True, + random_state=np.random.RandomState(seed), + validation_fraction=validation_fraction, + learning_rate="constant", + eta0=0.01, + tol=None, + max_iter=max_iter, + shuffle=shuffle, + ) + clf1.fit(X, Y) + assert clf1.n_iter_ == max_iter + + clf2 = klass( + early_stopping=False, + random_state=np.random.RandomState(seed), + learning_rate="constant", + eta0=0.01, + tol=None, + max_iter=max_iter, + shuffle=shuffle, + ) + + if is_classifier(clf2): + cv = StratifiedShuffleSplit(test_size=validation_fraction, random_state=seed) + else: + cv = ShuffleSplit(test_size=validation_fraction, random_state=seed) + idx_train, idx_val = next(cv.split(X, Y)) + idx_train = np.sort(idx_train) # remove shuffling + clf2.fit(X[idx_train], Y[idx_train]) + assert clf2.n_iter_ == max_iter + + assert_array_equal(clf1.coef_, clf2.coef_) + + +@pytest.mark.parametrize( + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] +) +def test_n_iter_no_change(klass): + X, Y = iris.data, iris.target + # test that n_iter_ increases monotonically with n_iter_no_change + for early_stopping in [True, False]: + n_iter_list = [ + klass( + early_stopping=early_stopping, + n_iter_no_change=n_iter_no_change, + tol=1e-4, + max_iter=1000, + ) + .fit(X, Y) + .n_iter_ + for n_iter_no_change in [2, 3, 10] + ] + assert_array_equal(n_iter_list, sorted(n_iter_list)) + + +@pytest.mark.parametrize( + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] +) +def test_not_enough_sample_for_early_stopping(klass): + # test an error is raised if the training or validation set is empty + clf = klass(early_stopping=True, validation_fraction=0.99) + with pytest.raises(ValueError): + clf.fit(X3, Y3) + + +@pytest.mark.parametrize("Estimator", [SGDClassifier, SGDRegressor]) +@pytest.mark.parametrize("l1_ratio", [0, 0.7, 1]) +def test_sgd_l1_ratio_not_used(Estimator, l1_ratio): + """Check that l1_ratio is not used when penalty is not 'elasticnet'""" + clf1 = Estimator(penalty="l1", l1_ratio=None, random_state=0).fit(X, Y) + clf2 = Estimator(penalty="l1", l1_ratio=l1_ratio, random_state=0).fit(X, Y) + + assert_allclose(clf1.coef_, clf2.coef_) + + +@pytest.mark.parametrize( + "Estimator", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] +) +def test_sgd_failing_penalty_validation(Estimator): + clf = Estimator(penalty="elasticnet", l1_ratio=None) + with pytest.raises( + ValueError, match="l1_ratio must be set when penalty is 'elasticnet'" + ): + clf.fit(X, Y) + + +############################################################################### +# Classification Test Case + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_sgd_clf(klass): + # Check that SGD gives any results :-) + + for loss in ("hinge", "squared_hinge", "log_loss", "modified_huber"): + clf = klass( + penalty="l2", + alpha=0.01, + fit_intercept=True, + loss=loss, + max_iter=10, + shuffle=True, + ) + clf.fit(X, Y) + # assert_almost_equal(clf.coef_[0], clf.coef_[1], decimal=7) + assert_array_equal(clf.predict(T), true_result) + + +@pytest.mark.parametrize( + "klass", [SGDClassifier, SparseSGDClassifier, SGDOneClassSVM, SparseSGDOneClassSVM] +) +def test_provide_coef(klass): + """Check that the shape of `coef_init` is validated.""" + with pytest.raises(ValueError, match="Provided coef_init does 
not match dataset"): + klass().fit(X, Y, coef_init=np.zeros((3,))) + + +@pytest.mark.parametrize( + "klass, fit_params", + [ + (SGDClassifier, {"intercept_init": np.zeros((3,))}), + (SparseSGDClassifier, {"intercept_init": np.zeros((3,))}), + (SGDOneClassSVM, {"offset_init": np.zeros((3,))}), + (SparseSGDOneClassSVM, {"offset_init": np.zeros((3,))}), + ], +) +def test_set_intercept_offset(klass, fit_params): + """Check that `intercept_init` or `offset_init` is validated.""" + sgd_estimator = klass() + with pytest.raises(ValueError, match="does not match dataset"): + sgd_estimator.fit(X, Y, **fit_params) + + +@pytest.mark.parametrize( + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] +) +def test_sgd_early_stopping_with_partial_fit(klass): + """Check that we raise an error for `early_stopping` used with + `partial_fit`. + """ + err_msg = "early_stopping should be False with partial_fit" + with pytest.raises(ValueError, match=err_msg): + klass(early_stopping=True).partial_fit(X, Y) + + +@pytest.mark.parametrize( + "klass, fit_params", + [ + (SGDClassifier, {"intercept_init": 0}), + (SparseSGDClassifier, {"intercept_init": 0}), + (SGDOneClassSVM, {"offset_init": 0}), + (SparseSGDOneClassSVM, {"offset_init": 0}), + ], +) +def test_set_intercept_offset_binary(klass, fit_params): + """Check that we can pass a scaler with binary classification to + `intercept_init` or `offset_init`.""" + klass().fit(X5, Y5, **fit_params) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_average_binary_computed_correctly(klass): + # Checks the SGDClassifier correctly computes the average weights + eta = 0.1 + alpha = 2.0 + n_samples = 20 + n_features = 10 + rng = np.random.RandomState(0) + X = rng.normal(size=(n_samples, n_features)) + w = rng.normal(size=n_features) + + clf = klass( + loss="squared_error", + learning_rate="constant", + eta0=eta, + alpha=alpha, + fit_intercept=True, + max_iter=1, + average=True, + shuffle=False, + ) + + # simple linear function without noise + y = np.dot(X, w) + y = np.sign(y) + + clf.fit(X, y) + + average_weights, average_intercept = asgd(klass, X, y, eta, alpha) + average_weights = average_weights.reshape(1, -1) + assert_array_almost_equal(clf.coef_, average_weights, decimal=14) + assert_almost_equal(clf.intercept_, average_intercept, decimal=14) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_set_intercept_to_intercept(klass): + # Checks intercept_ shape consistency for the warm starts + # Inconsistent intercept_ shape. + clf = klass().fit(X5, Y5) + klass().fit(X5, Y5, intercept_init=clf.intercept_) + clf = klass().fit(X, Y) + klass().fit(X, Y, intercept_init=clf.intercept_) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_sgd_at_least_two_labels(klass): + # Target must have at least two labels + clf = klass(alpha=0.01, max_iter=20) + with pytest.raises(ValueError): + clf.fit(X2, np.ones(9)) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_partial_fit_weight_class_balanced(klass): + # partial_fit with class_weight='balanced' not supported""" + regex = ( + r"class_weight 'balanced' is not supported for " + r"partial_fit\. In order to use 'balanced' weights, " + r"use compute_class_weight\('balanced', classes=classes, y=y\). " + r"In place of y you can use a large enough sample " + r"of the full training set target to properly " + r"estimate the class frequency distributions\. 
" + r"Pass the resulting weights as the class_weight " + r"parameter\." + ) + with pytest.raises(ValueError, match=regex): + klass(class_weight="balanced").partial_fit(X, Y, classes=np.unique(Y)) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_sgd_multiclass(klass): + # Multi-class test case + clf = klass(alpha=0.01, max_iter=20).fit(X2, Y2) + assert clf.coef_.shape == (3, 2) + assert clf.intercept_.shape == (3,) + assert clf.decision_function([[0, 0]]).shape == (1, 3) + pred = clf.predict(T2) + assert_array_equal(pred, true_result2) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_sgd_multiclass_average(klass): + eta = 0.001 + alpha = 0.01 + # Multi-class average test case + clf = klass( + loss="squared_error", + learning_rate="constant", + eta0=eta, + alpha=alpha, + fit_intercept=True, + max_iter=1, + average=True, + shuffle=False, + ) + + np_Y2 = np.array(Y2) + clf.fit(X2, np_Y2) + classes = np.unique(np_Y2) + + for i, cl in enumerate(classes): + y_i = np.ones(np_Y2.shape[0]) + y_i[np_Y2 != cl] = -1 + average_coef, average_intercept = asgd(klass, X2, y_i, eta, alpha) + assert_array_almost_equal(average_coef, clf.coef_[i], decimal=16) + assert_almost_equal(average_intercept, clf.intercept_[i], decimal=16) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_sgd_multiclass_with_init_coef(klass): + # Multi-class test case + clf = klass(alpha=0.01, max_iter=20) + clf.fit(X2, Y2, coef_init=np.zeros((3, 2)), intercept_init=np.zeros(3)) + assert clf.coef_.shape == (3, 2) + assert clf.intercept_.shape, (3,) + pred = clf.predict(T2) + assert_array_equal(pred, true_result2) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_sgd_multiclass_njobs(klass): + # Multi-class test case with multi-core support + clf = klass(alpha=0.01, max_iter=20, n_jobs=2).fit(X2, Y2) + assert clf.coef_.shape == (3, 2) + assert clf.intercept_.shape == (3,) + assert clf.decision_function([[0, 0]]).shape == (1, 3) + pred = clf.predict(T2) + assert_array_equal(pred, true_result2) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_set_coef_multiclass(klass): + # Checks coef_init and intercept_init shape for multi-class + # problems + # Provided coef_ does not match dataset + clf = klass() + with pytest.raises(ValueError): + clf.fit(X2, Y2, coef_init=np.zeros((2, 2))) + + # Provided coef_ does match dataset + clf = klass().fit(X2, Y2, coef_init=np.zeros((3, 2))) + + # Provided intercept_ does not match dataset + clf = klass() + with pytest.raises(ValueError): + clf.fit(X2, Y2, intercept_init=np.zeros((1,))) + + # Provided intercept_ does match dataset. + clf = klass().fit(X2, Y2, intercept_init=np.zeros((3,))) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_sgd_predict_proba_method_access(klass): + # Checks that SGDClassifier predict_proba and predict_log_proba methods + # can either be accessed or raise an appropriate error message + # otherwise. See + # https://github.com/scikit-learn/scikit-learn/issues/10938 for more + # details. 
+ for loss in linear_model.SGDClassifier.loss_functions: + clf = SGDClassifier(loss=loss) + if loss in ("log_loss", "modified_huber"): + assert hasattr(clf, "predict_proba") + assert hasattr(clf, "predict_log_proba") + else: + inner_msg = "probability estimates are not available for loss={!r}".format( + loss + ) + assert not hasattr(clf, "predict_proba") + assert not hasattr(clf, "predict_log_proba") + with pytest.raises( + AttributeError, match="has no attribute 'predict_proba'" + ) as exec_info: + clf.predict_proba + + assert isinstance(exec_info.value.__cause__, AttributeError) + assert inner_msg in str(exec_info.value.__cause__) + + with pytest.raises( + AttributeError, match="has no attribute 'predict_log_proba'" + ) as exec_info: + clf.predict_log_proba + assert isinstance(exec_info.value.__cause__, AttributeError) + assert inner_msg in str(exec_info.value.__cause__) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_sgd_proba(klass): + # Check SGD.predict_proba + + # Hinge loss does not allow for conditional prob estimate. + # We cannot use the factory here, because it defines predict_proba + # anyway. + clf = SGDClassifier(loss="hinge", alpha=0.01, max_iter=10, tol=None).fit(X, Y) + assert not hasattr(clf, "predict_proba") + assert not hasattr(clf, "predict_log_proba") + + # log and modified_huber losses can output probability estimates + # binary case + for loss in ["log_loss", "modified_huber"]: + clf = klass(loss=loss, alpha=0.01, max_iter=10) + clf.fit(X, Y) + p = clf.predict_proba([[3, 2]]) + assert p[0, 1] > 0.5 + p = clf.predict_proba([[-1, -1]]) + assert p[0, 1] < 0.5 + + # If predict_proba is 0, we get "RuntimeWarning: divide by zero encountered + # in log". We avoid it here. + with np.errstate(divide="ignore"): + p = clf.predict_log_proba([[3, 2]]) + assert p[0, 1] > p[0, 0] + p = clf.predict_log_proba([[-1, -1]]) + assert p[0, 1] < p[0, 0] + + # log loss multiclass probability estimates + clf = klass(loss="log_loss", alpha=0.01, max_iter=10).fit(X2, Y2) + + d = clf.decision_function([[0.1, -0.1], [0.3, 0.2]]) + p = clf.predict_proba([[0.1, -0.1], [0.3, 0.2]]) + assert_array_equal(np.argmax(p, axis=1), np.argmax(d, axis=1)) + assert_almost_equal(p[0].sum(), 1) + assert np.all(p[0] >= 0) + + p = clf.predict_proba([[-1, -1]]) + d = clf.decision_function([[-1, -1]]) + assert_array_equal(np.argsort(p[0]), np.argsort(d[0])) + + lp = clf.predict_log_proba([[3, 2]]) + p = clf.predict_proba([[3, 2]]) + assert_array_almost_equal(np.log(p), lp) + + lp = clf.predict_log_proba([[-1, -1]]) + p = clf.predict_proba([[-1, -1]]) + assert_array_almost_equal(np.log(p), lp) + + # Modified Huber multiclass probability estimates; requires a separate + # test because the hard zero/one probabilities may destroy the + # ordering present in decision_function output. + clf = klass(loss="modified_huber", alpha=0.01, max_iter=10) + clf.fit(X2, Y2) + d = clf.decision_function([[3, 2]]) + p = clf.predict_proba([[3, 2]]) + if klass != SparseSGDClassifier: + assert np.argmax(d, axis=1) == np.argmax(p, axis=1) + else: # XXX the sparse test gets a different X2 (?) + assert np.argmin(d, axis=1) == np.argmin(p, axis=1) + + # the following sample produces decision_function values < -1, + # which would cause naive normalization to fail (see comment + # in SGDClassifier.predict_proba) + x = X.mean(axis=0) + d = clf.decision_function([x]) + if np.all(d < -1): # XXX not true in sparse test case (why?) 
+ p = clf.predict_proba([x]) + assert_array_almost_equal(p[0], [1 / 3.0] * 3) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_sgd_l1(klass): + # Test L1 regularization + n = len(X4) + rng = np.random.RandomState(13) + idx = np.arange(n) + rng.shuffle(idx) + + X = X4[idx, :] + Y = Y4[idx] + + clf = klass( + penalty="l1", + alpha=0.2, + fit_intercept=False, + max_iter=2000, + tol=None, + shuffle=False, + ) + clf.fit(X, Y) + assert_array_equal(clf.coef_[0, 1:-1], np.zeros((4,))) + pred = clf.predict(X) + assert_array_equal(pred, Y) + + # test sparsify with dense inputs + clf.sparsify() + assert sp.issparse(clf.coef_) + pred = clf.predict(X) + assert_array_equal(pred, Y) + + # pickle and unpickle with sparse coef_ + clf = pickle.loads(pickle.dumps(clf)) + assert sp.issparse(clf.coef_) + pred = clf.predict(X) + assert_array_equal(pred, Y) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_class_weights(klass): + # Test class weights. + X = np.array([[-1.0, -1.0], [-1.0, 0], [-0.8, -1.0], [1.0, 1.0], [1.0, 0.0]]) + y = [1, 1, 1, -1, -1] + + clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False, class_weight=None) + clf.fit(X, y) + assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([1])) + + # we give a small weights to class 1 + clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False, class_weight={1: 0.001}) + clf.fit(X, y) + + # now the hyperplane should rotate clock-wise and + # the prediction on this point should shift + assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([-1])) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_equal_class_weight(klass): + # Test if equal class weights approx. equals no class weights. + X = [[1, 0], [1, 0], [0, 1], [0, 1]] + y = [0, 0, 1, 1] + clf = klass(alpha=0.1, max_iter=1000, class_weight=None) + clf.fit(X, y) + + X = [[1, 0], [0, 1]] + y = [0, 1] + clf_weighted = klass(alpha=0.1, max_iter=1000, class_weight={0: 0.5, 1: 0.5}) + clf_weighted.fit(X, y) + + # should be similar up to some epsilon due to learning rate schedule + assert_almost_equal(clf.coef_, clf_weighted.coef_, decimal=2) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_wrong_class_weight_label(klass): + # ValueError due to not existing class label. 
+ clf = klass(alpha=0.1, max_iter=1000, class_weight={0: 0.5}) + with pytest.raises(ValueError): + clf.fit(X, Y) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_weights_multiplied(klass): + # Tests that class_weight and sample_weight are multiplicative + class_weights = {1: 0.6, 2: 0.3} + rng = np.random.RandomState(0) + sample_weights = rng.random_sample(Y4.shape[0]) + multiplied_together = np.copy(sample_weights) + multiplied_together[Y4 == 1] *= class_weights[1] + multiplied_together[Y4 == 2] *= class_weights[2] + + clf1 = klass(alpha=0.1, max_iter=20, class_weight=class_weights) + clf2 = klass(alpha=0.1, max_iter=20) + + clf1.fit(X4, Y4, sample_weight=sample_weights) + clf2.fit(X4, Y4, sample_weight=multiplied_together) + + assert_almost_equal(clf1.coef_, clf2.coef_) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_balanced_weight(klass): + # Test class weights for imbalanced data""" + # compute reference metrics on iris dataset that is quite balanced by + # default + X, y = iris.data, iris.target + X = scale(X) + idx = np.arange(X.shape[0]) + rng = np.random.RandomState(6) + rng.shuffle(idx) + X = X[idx] + y = y[idx] + clf = klass(alpha=0.0001, max_iter=1000, class_weight=None, shuffle=False).fit(X, y) + f1 = metrics.f1_score(y, clf.predict(X), average="weighted") + assert_almost_equal(f1, 0.96, decimal=1) + + # make the same prediction using balanced class_weight + clf_balanced = klass( + alpha=0.0001, max_iter=1000, class_weight="balanced", shuffle=False + ).fit(X, y) + f1 = metrics.f1_score(y, clf_balanced.predict(X), average="weighted") + assert_almost_equal(f1, 0.96, decimal=1) + + # Make sure that in the balanced case it does not change anything + # to use "balanced" + assert_array_almost_equal(clf.coef_, clf_balanced.coef_, 6) + + # build an very very imbalanced dataset out of iris data + X_0 = X[y == 0, :] + y_0 = y[y == 0] + + X_imbalanced = np.vstack([X] + [X_0] * 10) + y_imbalanced = np.concatenate([y] + [y_0] * 10) + + # fit a model on the imbalanced data without class weight info + clf = klass(max_iter=1000, class_weight=None, shuffle=False) + clf.fit(X_imbalanced, y_imbalanced) + y_pred = clf.predict(X) + assert metrics.f1_score(y, y_pred, average="weighted") < 0.96 + + # fit a model with balanced class_weight enabled + clf = klass(max_iter=1000, class_weight="balanced", shuffle=False) + clf.fit(X_imbalanced, y_imbalanced) + y_pred = clf.predict(X) + assert metrics.f1_score(y, y_pred, average="weighted") > 0.96 + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_sample_weights(klass): + # Test weights on individual samples + X = np.array([[-1.0, -1.0], [-1.0, 0], [-0.8, -1.0], [1.0, 1.0], [1.0, 0.0]]) + y = [1, 1, 1, -1, -1] + + clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False) + clf.fit(X, y) + assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([1])) + + # we give a small weights to class 1 + clf.fit(X, y, sample_weight=[0.001] * 3 + [1] * 2) + + # now the hyperplane should rotate clock-wise and + # the prediction on this point should shift + assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([-1])) + + +@pytest.mark.parametrize( + "klass", [SGDClassifier, SparseSGDClassifier, SGDOneClassSVM, SparseSGDOneClassSVM] +) +def test_wrong_sample_weights(klass): + # Test if ValueError is raised if sample_weight has wrong shape + if klass in [SGDClassifier, SparseSGDClassifier]: + clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False) + elif 
klass in [SGDOneClassSVM, SparseSGDOneClassSVM]: + clf = klass(nu=0.1, max_iter=1000, fit_intercept=False) + # provided sample_weight too long + with pytest.raises(ValueError): + clf.fit(X, Y, sample_weight=np.arange(7)) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_partial_fit_exception(klass): + clf = klass(alpha=0.01) + # classes was not specified + with pytest.raises(ValueError): + clf.partial_fit(X3, Y3) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_partial_fit_binary(klass): + third = X.shape[0] // 3 + clf = klass(alpha=0.01) + classes = np.unique(Y) + + clf.partial_fit(X[:third], Y[:third], classes=classes) + assert clf.coef_.shape == (1, X.shape[1]) + assert clf.intercept_.shape == (1,) + assert clf.decision_function([[0, 0]]).shape == (1,) + id1 = id(clf.coef_.data) + + clf.partial_fit(X[third:], Y[third:]) + id2 = id(clf.coef_.data) + # check that coef_ haven't been re-allocated + assert id1, id2 + + y_pred = clf.predict(T) + assert_array_equal(y_pred, true_result) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_partial_fit_multiclass(klass): + third = X2.shape[0] // 3 + clf = klass(alpha=0.01) + classes = np.unique(Y2) + + clf.partial_fit(X2[:third], Y2[:third], classes=classes) + assert clf.coef_.shape == (3, X2.shape[1]) + assert clf.intercept_.shape == (3,) + assert clf.decision_function([[0, 0]]).shape == (1, 3) + id1 = id(clf.coef_.data) + + clf.partial_fit(X2[third:], Y2[third:]) + id2 = id(clf.coef_.data) + # check that coef_ haven't been re-allocated + assert id1, id2 + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_partial_fit_multiclass_average(klass): + third = X2.shape[0] // 3 + clf = klass(alpha=0.01, average=X2.shape[0]) + classes = np.unique(Y2) + + clf.partial_fit(X2[:third], Y2[:third], classes=classes) + assert clf.coef_.shape == (3, X2.shape[1]) + assert clf.intercept_.shape == (3,) + + clf.partial_fit(X2[third:], Y2[third:]) + assert clf.coef_.shape == (3, X2.shape[1]) + assert clf.intercept_.shape == (3,) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_fit_then_partial_fit(klass): + # Partial_fit should work after initial fit in the multiclass case. + # Non-regression test for #2496; fit would previously produce a + # Fortran-ordered coef_ that subsequent partial_fit couldn't handle. 
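+ # For reference, "Fortran-ordered" means column-major memory layout; a quick
+ # way to inspect the layout of an array while debugging is its flags, e.g.
+ #     np.ones((3, 2), order="F").flags["F_CONTIGUOUS"]  # True
+ #     np.ones((3, 2)).flags["C_CONTIGUOUS"]             # True (NumPy default)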
+ clf = klass() + clf.fit(X2, Y2) + clf.partial_fit(X2, Y2) # no exception here + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("lr", ["constant", "optimal", "invscaling", "adaptive"]) +def test_partial_fit_equal_fit_classif(klass, lr): + for X_, Y_, T_ in ((X, Y, T), (X2, Y2, T2)): + clf = klass(alpha=0.01, eta0=0.01, max_iter=2, learning_rate=lr, shuffle=False) + clf.fit(X_, Y_) + y_pred = clf.decision_function(T_) + t = clf.t_ + + classes = np.unique(Y_) + clf = klass(alpha=0.01, eta0=0.01, learning_rate=lr, shuffle=False) + for i in range(2): + clf.partial_fit(X_, Y_, classes=classes) + y_pred2 = clf.decision_function(T_) + + assert clf.t_ == t + assert_array_almost_equal(y_pred, y_pred2, decimal=2) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_regression_losses(klass): + random_state = np.random.RandomState(1) + clf = klass( + alpha=0.01, + learning_rate="constant", + eta0=0.1, + loss="epsilon_insensitive", + random_state=random_state, + ) + clf.fit(X, Y) + assert 1.0 == np.mean(clf.predict(X) == Y) + + clf = klass( + alpha=0.01, + learning_rate="constant", + eta0=0.1, + loss="squared_epsilon_insensitive", + random_state=random_state, + ) + clf.fit(X, Y) + assert 1.0 == np.mean(clf.predict(X) == Y) + + clf = klass(alpha=0.01, loss="huber", random_state=random_state) + clf.fit(X, Y) + assert 1.0 == np.mean(clf.predict(X) == Y) + + clf = klass( + alpha=0.01, + learning_rate="constant", + eta0=0.01, + loss="squared_error", + random_state=random_state, + ) + clf.fit(X, Y) + assert 1.0 == np.mean(clf.predict(X) == Y) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_warm_start_multiclass(klass): + _test_warm_start(klass, X2, Y2, "optimal") + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_multiple_fit(klass): + # Test multiple calls of fit w/ different shaped inputs. + clf = klass(alpha=0.01, shuffle=False) + clf.fit(X, Y) + assert hasattr(clf, "coef_") + + # Non-regression test: try fitting with a different label set. + y = [["ham", "spam"][i] for i in LabelEncoder().fit_transform(Y)] + clf.fit(X[:, :-1], y) + + +############################################################################### +# Regression Test Case + + +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) +def test_sgd_reg(klass): + # Check that SGD gives any results. 
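+ # The toy data below is perfectly symmetric in its two features (x1 == x2 for
+ # every sample), so each gradient update changes both weights by the same
+ # amount and the learned coefficients should come out exactly equal.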
+ clf = klass(alpha=0.1, max_iter=2, fit_intercept=False) + clf.fit([[0, 0], [1, 1], [2, 2]], [0, 1, 2]) + assert clf.coef_[0] == clf.coef_[1] + + +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) +def test_sgd_averaged_computed_correctly(klass): + # Tests the average regressor matches the naive implementation + + eta = 0.001 + alpha = 0.01 + n_samples = 20 + n_features = 10 + rng = np.random.RandomState(0) + X = rng.normal(size=(n_samples, n_features)) + w = rng.normal(size=n_features) + + # simple linear function without noise + y = np.dot(X, w) + + clf = klass( + loss="squared_error", + learning_rate="constant", + eta0=eta, + alpha=alpha, + fit_intercept=True, + max_iter=1, + average=True, + shuffle=False, + ) + + clf.fit(X, y) + average_weights, average_intercept = asgd(klass, X, y, eta, alpha) + + assert_array_almost_equal(clf.coef_, average_weights, decimal=16) + assert_almost_equal(clf.intercept_, average_intercept, decimal=16) + + +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) +def test_sgd_averaged_partial_fit(klass): + # Tests whether the partial fit yields the same average as the fit + eta = 0.001 + alpha = 0.01 + n_samples = 20 + n_features = 10 + rng = np.random.RandomState(0) + X = rng.normal(size=(n_samples, n_features)) + w = rng.normal(size=n_features) + + # simple linear function without noise + y = np.dot(X, w) + + clf = klass( + loss="squared_error", + learning_rate="constant", + eta0=eta, + alpha=alpha, + fit_intercept=True, + max_iter=1, + average=True, + shuffle=False, + ) + + clf.partial_fit(X[: int(n_samples / 2)][:], y[: int(n_samples / 2)]) + clf.partial_fit(X[int(n_samples / 2) :][:], y[int(n_samples / 2) :]) + average_weights, average_intercept = asgd(klass, X, y, eta, alpha) + + assert_array_almost_equal(clf.coef_, average_weights, decimal=16) + assert_almost_equal(clf.intercept_[0], average_intercept, decimal=16) + + +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) +def test_average_sparse(klass): + # Checks the average weights on data with 0s + + eta = 0.001 + alpha = 0.01 + clf = klass( + loss="squared_error", + learning_rate="constant", + eta0=eta, + alpha=alpha, + fit_intercept=True, + max_iter=1, + average=True, + shuffle=False, + ) + + n_samples = Y3.shape[0] + + clf.partial_fit(X3[: int(n_samples / 2)][:], Y3[: int(n_samples / 2)]) + clf.partial_fit(X3[int(n_samples / 2) :][:], Y3[int(n_samples / 2) :]) + average_weights, average_intercept = asgd(klass, X3, Y3, eta, alpha) + + assert_array_almost_equal(clf.coef_, average_weights, decimal=16) + assert_almost_equal(clf.intercept_, average_intercept, decimal=16) + + +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) +def test_sgd_least_squares_fit(klass): + xmin, xmax = -5, 5 + n_samples = 100 + rng = np.random.RandomState(0) + X = np.linspace(xmin, xmax, n_samples).reshape(n_samples, 1) + + # simple linear function without noise + y = 0.5 * X.ravel() + + clf = klass(loss="squared_error", alpha=0.1, max_iter=20, fit_intercept=False) + clf.fit(X, y) + score = clf.score(X, y) + assert score > 0.99 + + # simple linear function with noise + y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel() + + clf = klass(loss="squared_error", alpha=0.1, max_iter=20, fit_intercept=False) + clf.fit(X, y) + score = clf.score(X, y) + assert score > 0.5 + + +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) +def test_sgd_epsilon_insensitive(klass): + xmin, xmax = -5, 5 + n_samples = 100 + rng = np.random.RandomState(0) + 
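+ # For reference, the epsilon-insensitive loss exercised here is
+ #     loss(p, y) = max(0, |y - p| - epsilon)
+ # i.e. residuals smaller than epsilon are ignored entirely, so with a tiny
+ # epsilon=0.01 the model can still recover the noise-free line below almost
+ # exactly.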
X = np.linspace(xmin, xmax, n_samples).reshape(n_samples, 1) + + # simple linear function without noise + y = 0.5 * X.ravel() + + clf = klass( + loss="epsilon_insensitive", + epsilon=0.01, + alpha=0.1, + max_iter=20, + fit_intercept=False, + ) + clf.fit(X, y) + score = clf.score(X, y) + assert score > 0.99 + + # simple linear function with noise + y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel() + + clf = klass( + loss="epsilon_insensitive", + epsilon=0.01, + alpha=0.1, + max_iter=20, + fit_intercept=False, + ) + clf.fit(X, y) + score = clf.score(X, y) + assert score > 0.5 + + +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) +def test_sgd_huber_fit(klass): + xmin, xmax = -5, 5 + n_samples = 100 + rng = np.random.RandomState(0) + X = np.linspace(xmin, xmax, n_samples).reshape(n_samples, 1) + + # simple linear function without noise + y = 0.5 * X.ravel() + + clf = klass(loss="huber", epsilon=0.1, alpha=0.1, max_iter=20, fit_intercept=False) + clf.fit(X, y) + score = clf.score(X, y) + assert score > 0.99 + + # simple linear function with noise + y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel() + + clf = klass(loss="huber", epsilon=0.1, alpha=0.1, max_iter=20, fit_intercept=False) + clf.fit(X, y) + score = clf.score(X, y) + assert score > 0.5 + + +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) +def test_elasticnet_convergence(klass): + # Check that the SGD output is consistent with coordinate descent + + n_samples, n_features = 1000, 5 + rng = np.random.RandomState(0) + X = rng.randn(n_samples, n_features) + # ground_truth linear model that generate y from X and to which the + # models should converge if the regularizer would be set to 0.0 + ground_truth_coef = rng.randn(n_features) + y = np.dot(X, ground_truth_coef) + + # XXX: alpha = 0.1 seems to cause convergence problems + for alpha in [0.01, 0.001]: + for l1_ratio in [0.5, 0.8, 1.0]: + cd = linear_model.ElasticNet( + alpha=alpha, l1_ratio=l1_ratio, fit_intercept=False + ) + cd.fit(X, y) + sgd = klass( + penalty="elasticnet", + max_iter=50, + alpha=alpha, + l1_ratio=l1_ratio, + fit_intercept=False, + ) + sgd.fit(X, y) + err_msg = ( + "cd and sgd did not converge to comparable " + "results for alpha=%f and l1_ratio=%f" % (alpha, l1_ratio) + ) + assert_almost_equal(cd.coef_, sgd.coef_, decimal=2, err_msg=err_msg) + + +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) +def test_partial_fit(klass): + third = X.shape[0] // 3 + clf = klass(alpha=0.01) + + clf.partial_fit(X[:third], Y[:third]) + assert clf.coef_.shape == (X.shape[1],) + assert clf.intercept_.shape == (1,) + assert clf.predict([[0, 0]]).shape == (1,) + id1 = id(clf.coef_.data) + + clf.partial_fit(X[third:], Y[third:]) + id2 = id(clf.coef_.data) + # check that coef_ haven't been re-allocated + assert id1, id2 + + +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) +@pytest.mark.parametrize("lr", ["constant", "optimal", "invscaling", "adaptive"]) +def test_partial_fit_equal_fit(klass, lr): + clf = klass(alpha=0.01, max_iter=2, eta0=0.01, learning_rate=lr, shuffle=False) + clf.fit(X, Y) + y_pred = clf.predict(T) + t = clf.t_ + + clf = klass(alpha=0.01, eta0=0.01, learning_rate=lr, shuffle=False) + for i in range(2): + clf.partial_fit(X, Y) + y_pred2 = clf.predict(T) + + assert clf.t_ == t + assert_array_almost_equal(y_pred, y_pred2, decimal=2) + + +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) +def test_loss_function_epsilon(klass): + clf = klass(epsilon=0.9) + 
clf.set_params(epsilon=0.1) + assert clf.loss_functions["huber"][1] == 0.1 + + +############################################################################### +# SGD One Class SVM Test Case + + +# a simple implementation of ASGD to use for testing SGDOneClassSVM +def asgd_oneclass(klass, X, eta, nu, coef_init=None, offset_init=0.0): + if coef_init is None: + coef = np.zeros(X.shape[1]) + else: + coef = coef_init + + average_coef = np.zeros(X.shape[1]) + offset = offset_init + intercept = 1 - offset + average_intercept = 0.0 + decay = 1.0 + + # sparse data has a fixed decay of .01 + if klass == SparseSGDOneClassSVM: + decay = 0.01 + + for i, entry in enumerate(X): + p = np.dot(entry, coef) + p += intercept + if p <= 1.0: + gradient = -1 + else: + gradient = 0 + coef *= max(0, 1.0 - (eta * nu / 2)) + coef += -(eta * gradient * entry) + intercept += -(eta * (nu + gradient)) * decay + + average_coef *= i + average_coef += coef + average_coef /= i + 1.0 + + average_intercept *= i + average_intercept += intercept + average_intercept /= i + 1.0 + + return average_coef, 1 - average_intercept + + +@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM]) +def _test_warm_start_oneclass(klass, X, lr): + # Test that explicit warm restart... + clf = klass(nu=0.5, eta0=0.01, shuffle=False, learning_rate=lr) + clf.fit(X) + + clf2 = klass(nu=0.1, eta0=0.01, shuffle=False, learning_rate=lr) + clf2.fit(X, coef_init=clf.coef_.copy(), offset_init=clf.offset_.copy()) + + # ... and implicit warm restart are equivalent. + clf3 = klass(nu=0.5, eta0=0.01, shuffle=False, warm_start=True, learning_rate=lr) + clf3.fit(X) + + assert clf3.t_ == clf.t_ + assert_allclose(clf3.coef_, clf.coef_) + + clf3.set_params(nu=0.1) + clf3.fit(X) + + assert clf3.t_ == clf2.t_ + assert_allclose(clf3.coef_, clf2.coef_) + + +@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM]) +@pytest.mark.parametrize("lr", ["constant", "optimal", "invscaling", "adaptive"]) +def test_warm_start_oneclass(klass, lr): + _test_warm_start_oneclass(klass, X, lr) + + +@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM]) +def test_clone_oneclass(klass): + # Test whether clone works ok. 
+ clf = klass(nu=0.5) + clf = clone(clf) + clf.set_params(nu=0.1) + clf.fit(X) + + clf2 = klass(nu=0.1) + clf2.fit(X) + + assert_array_equal(clf.coef_, clf2.coef_) + + +@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM]) +def test_partial_fit_oneclass(klass): + third = X.shape[0] // 3 + clf = klass(nu=0.1) + + clf.partial_fit(X[:third]) + assert clf.coef_.shape == (X.shape[1],) + assert clf.offset_.shape == (1,) + assert clf.predict([[0, 0]]).shape == (1,) + previous_coefs = clf.coef_ + + clf.partial_fit(X[third:]) + # check that coef_ haven't been re-allocated + assert clf.coef_ is previous_coefs + + # raises ValueError if number of features does not match previous data + with pytest.raises(ValueError): + clf.partial_fit(X[:, 1]) + + +@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM]) +@pytest.mark.parametrize("lr", ["constant", "optimal", "invscaling", "adaptive"]) +def test_partial_fit_equal_fit_oneclass(klass, lr): + clf = klass(nu=0.05, max_iter=2, eta0=0.01, learning_rate=lr, shuffle=False) + clf.fit(X) + y_scores = clf.decision_function(T) + t = clf.t_ + coef = clf.coef_ + offset = clf.offset_ + + clf = klass(nu=0.05, eta0=0.01, max_iter=1, learning_rate=lr, shuffle=False) + for _ in range(2): + clf.partial_fit(X) + y_scores2 = clf.decision_function(T) + + assert clf.t_ == t + assert_allclose(y_scores, y_scores2) + assert_allclose(clf.coef_, coef) + assert_allclose(clf.offset_, offset) + + +@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM]) +def test_late_onset_averaging_reached_oneclass(klass): + # Test average + eta0 = 0.001 + nu = 0.05 + + # 2 passes over the training set but average only at second pass + clf1 = klass( + average=7, learning_rate="constant", eta0=eta0, nu=nu, max_iter=2, shuffle=False + ) + # 1 pass over the training set with no averaging + clf2 = klass( + average=False, + learning_rate="constant", + eta0=eta0, + nu=nu, + max_iter=1, + shuffle=False, + ) + + clf1.fit(X) + clf2.fit(X) + + # Start from clf2 solution, compute averaging using asgd function and + # compare with clf1 solution + average_coef, average_offset = asgd_oneclass( + klass, X, eta0, nu, coef_init=clf2.coef_.ravel(), offset_init=clf2.offset_ + ) + + assert_allclose(clf1.coef_.ravel(), average_coef.ravel()) + assert_allclose(clf1.offset_, average_offset) + + +@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM]) +def test_sgd_averaged_computed_correctly_oneclass(klass): + # Tests the average SGD One-Class SVM matches the naive implementation + eta = 0.001 + nu = 0.05 + n_samples = 20 + n_features = 10 + rng = np.random.RandomState(0) + X = rng.normal(size=(n_samples, n_features)) + + clf = klass( + learning_rate="constant", + eta0=eta, + nu=nu, + fit_intercept=True, + max_iter=1, + average=True, + shuffle=False, + ) + + clf.fit(X) + average_coef, average_offset = asgd_oneclass(klass, X, eta, nu) + + assert_allclose(clf.coef_, average_coef) + assert_allclose(clf.offset_, average_offset) + + +@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM]) +def test_sgd_averaged_partial_fit_oneclass(klass): + # Tests whether the partial fit yields the same average as the fit + eta = 0.001 + nu = 0.05 + n_samples = 20 + n_features = 10 + rng = np.random.RandomState(0) + X = rng.normal(size=(n_samples, n_features)) + + clf = klass( + learning_rate="constant", + eta0=eta, + nu=nu, + fit_intercept=True, + max_iter=1, + average=True, + shuffle=False, + ) + + clf.partial_fit(X[: int(n_samples / 2)][:]) + 
clf.partial_fit(X[int(n_samples / 2) :][:]) + average_coef, average_offset = asgd_oneclass(klass, X, eta, nu) + + assert_allclose(clf.coef_, average_coef) + assert_allclose(clf.offset_, average_offset) + + +@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM]) +def test_average_sparse_oneclass(klass): + # Checks the average coef on data with 0s + eta = 0.001 + nu = 0.01 + clf = klass( + learning_rate="constant", + eta0=eta, + nu=nu, + fit_intercept=True, + max_iter=1, + average=True, + shuffle=False, + ) + + n_samples = X3.shape[0] + + clf.partial_fit(X3[: int(n_samples / 2)]) + clf.partial_fit(X3[int(n_samples / 2) :]) + average_coef, average_offset = asgd_oneclass(klass, X3, eta, nu) + + assert_allclose(clf.coef_, average_coef) + assert_allclose(clf.offset_, average_offset) + + +def test_sgd_oneclass(): + # Test fit, decision_function, predict and score_samples on a toy + # dataset + X_train = np.array([[-2, -1], [-1, -1], [1, 1]]) + X_test = np.array([[0.5, -2], [2, 2]]) + clf = SGDOneClassSVM( + nu=0.5, eta0=1, learning_rate="constant", shuffle=False, max_iter=1 + ) + clf.fit(X_train) + assert_allclose(clf.coef_, np.array([-0.125, 0.4375])) + assert clf.offset_[0] == -0.5 + + scores = clf.score_samples(X_test) + assert_allclose(scores, np.array([-0.9375, 0.625])) + + dec = clf.score_samples(X_test) - clf.offset_ + assert_allclose(clf.decision_function(X_test), dec) + + pred = clf.predict(X_test) + assert_array_equal(pred, np.array([-1, 1])) + + +def test_ocsvm_vs_sgdocsvm(): + # Checks SGDOneClass SVM gives a good approximation of kernelized + # One-Class SVM + nu = 0.05 + gamma = 2.0 + random_state = 42 + + # Generate train and test data + rng = np.random.RandomState(random_state) + X = 0.3 * rng.randn(500, 2) + X_train = np.r_[X + 2, X - 2] + X = 0.3 * rng.randn(100, 2) + X_test = np.r_[X + 2, X - 2] + + # One-Class SVM + clf = OneClassSVM(gamma=gamma, kernel="rbf", nu=nu) + clf.fit(X_train) + y_pred_ocsvm = clf.predict(X_test) + dec_ocsvm = clf.decision_function(X_test).reshape(1, -1) + + # SGDOneClassSVM using kernel approximation + max_iter = 15 + transform = Nystroem(gamma=gamma, random_state=random_state) + clf_sgd = SGDOneClassSVM( + nu=nu, + shuffle=True, + fit_intercept=True, + max_iter=max_iter, + random_state=random_state, + tol=None, + ) + pipe_sgd = make_pipeline(transform, clf_sgd) + pipe_sgd.fit(X_train) + y_pred_sgdocsvm = pipe_sgd.predict(X_test) + dec_sgdocsvm = pipe_sgd.decision_function(X_test).reshape(1, -1) + + assert np.mean(y_pred_sgdocsvm == y_pred_ocsvm) >= 0.99 + corrcoef = np.corrcoef(np.concatenate((dec_ocsvm, dec_sgdocsvm)))[0, 1] + assert corrcoef >= 0.9 + + +def test_l1_ratio(): + # Test if l1 ratio extremes match L1 and L2 penalty settings. 
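+ # For reference, the elasticnet penalty is a convex combination of the two
+ # base penalties, roughly
+ #     l1_ratio * L1(w) + (1 - l1_ratio) * L2(w)
+ # so l1_ratio ~= 1 should reproduce penalty="l1" and l1_ratio ~= 0 should
+ # reproduce penalty="l2", up to floating-point effects, which is exactly what
+ # the two comparisons below check.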
+ X, y = datasets.make_classification( + n_samples=1000, n_features=100, n_informative=20, random_state=1234 + ) + + # test if elasticnet with l1_ratio near 1 gives same result as pure l1 + est_en = SGDClassifier( + alpha=0.001, + penalty="elasticnet", + tol=None, + max_iter=6, + l1_ratio=0.9999999999, + random_state=42, + ).fit(X, y) + est_l1 = SGDClassifier( + alpha=0.001, penalty="l1", max_iter=6, random_state=42, tol=None + ).fit(X, y) + assert_array_almost_equal(est_en.coef_, est_l1.coef_) + + # test if elasticnet with l1_ratio near 0 gives same result as pure l2 + est_en = SGDClassifier( + alpha=0.001, + penalty="elasticnet", + tol=None, + max_iter=6, + l1_ratio=0.0000000001, + random_state=42, + ).fit(X, y) + est_l2 = SGDClassifier( + alpha=0.001, penalty="l2", max_iter=6, random_state=42, tol=None + ).fit(X, y) + assert_array_almost_equal(est_en.coef_, est_l2.coef_) + + +def test_underflow_or_overlow(): + with np.errstate(all="raise"): + # Generate some weird data with hugely unscaled features + rng = np.random.RandomState(0) + n_samples = 100 + n_features = 10 + + X = rng.normal(size=(n_samples, n_features)) + X[:, :2] *= 1e300 + assert np.isfinite(X).all() + + # Use MinMaxScaler to scale the data without introducing a numerical + # instability (computing the standard deviation naively is not possible + # on this data) + X_scaled = MinMaxScaler().fit_transform(X) + assert np.isfinite(X_scaled).all() + + # Define a ground truth on the scaled data + ground_truth = rng.normal(size=n_features) + y = (np.dot(X_scaled, ground_truth) > 0.0).astype(np.int32) + assert_array_equal(np.unique(y), [0, 1]) + + model = SGDClassifier(alpha=0.1, loss="squared_hinge", max_iter=500) + + # smoke test: model is stable on scaled data + model.fit(X_scaled, y) + assert np.isfinite(model.coef_).all() + + # model is numerically unstable on unscaled data + msg_regxp = ( + r"Floating-point under-/overflow occurred at epoch #.*" + " Scaling input data with StandardScaler or MinMaxScaler" + " might help." 
+ ) + with pytest.raises(ValueError, match=msg_regxp): + model.fit(X, y) + + +def test_numerical_stability_large_gradient(): + # Non regression test case for numerical stability on scaled problems + # where the gradient can still explode with some losses + model = SGDClassifier( + loss="squared_hinge", + max_iter=10, + shuffle=True, + penalty="elasticnet", + l1_ratio=0.3, + alpha=0.01, + eta0=0.001, + random_state=0, + tol=None, + ) + with np.errstate(all="raise"): + model.fit(iris.data, iris.target) + assert np.isfinite(model.coef_).all() + + +@pytest.mark.parametrize("penalty", ["l2", "l1", "elasticnet"]) +def test_large_regularization(penalty): + # Non regression tests for numerical stability issues caused by large + # regularization parameters + model = SGDClassifier( + alpha=1e5, + learning_rate="constant", + eta0=0.1, + penalty=penalty, + shuffle=False, + tol=None, + max_iter=6, + ) + with np.errstate(all="raise"): + model.fit(iris.data, iris.target) + assert_array_almost_equal(model.coef_, np.zeros_like(model.coef_)) + + +def test_tol_parameter(): + # Test that the tol parameter behaves as expected + X = StandardScaler().fit_transform(iris.data) + y = iris.target == 1 + + # With tol is None, the number of iteration should be equal to max_iter + max_iter = 42 + model_0 = SGDClassifier(tol=None, random_state=0, max_iter=max_iter) + model_0.fit(X, y) + assert max_iter == model_0.n_iter_ + + # If tol is not None, the number of iteration should be less than max_iter + max_iter = 2000 + model_1 = SGDClassifier(tol=0, random_state=0, max_iter=max_iter) + model_1.fit(X, y) + assert max_iter > model_1.n_iter_ + assert model_1.n_iter_ > 5 + + # A larger tol should yield a smaller number of iteration + model_2 = SGDClassifier(tol=0.1, random_state=0, max_iter=max_iter) + model_2.fit(X, y) + assert model_1.n_iter_ > model_2.n_iter_ + assert model_2.n_iter_ > 3 + + # Strict tolerance and small max_iter should trigger a warning + model_3 = SGDClassifier(max_iter=3, tol=1e-3, random_state=0) + warning_message = ( + "Maximum number of iteration reached before " + "convergence. Consider increasing max_iter to " + "improve the fit." 
+ ) + with pytest.warns(ConvergenceWarning, match=warning_message): + model_3.fit(X, y) + assert model_3.n_iter_ == 3 + + +def _test_loss_common(loss_function, cases): + # Test the different loss functions + # cases is a list of (p, y, expected) + for p, y, expected_loss, expected_dloss in cases: + assert_almost_equal(loss_function.py_loss(p, y), expected_loss) + assert_almost_equal(loss_function.py_dloss(p, y), expected_dloss) + + +def test_loss_hinge(): + # Test Hinge (hinge / perceptron) + # hinge + loss = sgd_fast.Hinge(1.0) + cases = [ + # (p, y, expected_loss, expected_dloss) + (1.1, 1.0, 0.0, 0.0), + (-2.0, -1.0, 0.0, 0.0), + (1.0, 1.0, 0.0, -1.0), + (-1.0, -1.0, 0.0, 1.0), + (0.5, 1.0, 0.5, -1.0), + (2.0, -1.0, 3.0, 1.0), + (-0.5, -1.0, 0.5, 1.0), + (0.0, 1.0, 1, -1.0), + ] + _test_loss_common(loss, cases) + + # perceptron + loss = sgd_fast.Hinge(0.0) + cases = [ + # (p, y, expected_loss, expected_dloss) + (1.0, 1.0, 0.0, 0.0), + (-0.1, -1.0, 0.0, 0.0), + (0.0, 1.0, 0.0, -1.0), + (0.0, -1.0, 0.0, 1.0), + (0.5, -1.0, 0.5, 1.0), + (2.0, -1.0, 2.0, 1.0), + (-0.5, 1.0, 0.5, -1.0), + (-1.0, 1.0, 1.0, -1.0), + ] + _test_loss_common(loss, cases) + + +def test_gradient_squared_hinge(): + # Test SquaredHinge + loss = sgd_fast.SquaredHinge(1.0) + cases = [ + # (p, y, expected_loss, expected_dloss) + (1.0, 1.0, 0.0, 0.0), + (-2.0, -1.0, 0.0, 0.0), + (1.0, -1.0, 4.0, 4.0), + (-1.0, 1.0, 4.0, -4.0), + (0.5, 1.0, 0.25, -1.0), + (0.5, -1.0, 2.25, 3.0), + ] + _test_loss_common(loss, cases) + + +def test_loss_modified_huber(): + # (p, y, expected_loss, expected_dloss) + loss = sgd_fast.ModifiedHuber() + cases = [ + # (p, y, expected_loss, expected_dloss) + (1.0, 1.0, 0.0, 0.0), + (-1.0, -1.0, 0.0, 0.0), + (2.0, 1.0, 0.0, 0.0), + (0.0, 1.0, 1.0, -2.0), + (-1.0, 1.0, 4.0, -4.0), + (0.5, -1.0, 2.25, 3.0), + (-2.0, 1.0, 8, -4.0), + (-3.0, 1.0, 12, -4.0), + ] + _test_loss_common(loss, cases) + + +def test_loss_epsilon_insensitive(): + # Test EpsilonInsensitive + loss = sgd_fast.EpsilonInsensitive(0.1) + cases = [ + # (p, y, expected_loss, expected_dloss) + (0.0, 0.0, 0.0, 0.0), + (0.1, 0.0, 0.0, 0.0), + (-2.05, -2.0, 0.0, 0.0), + (3.05, 3.0, 0.0, 0.0), + (2.2, 2.0, 0.1, 1.0), + (2.0, -1.0, 2.9, 1.0), + (2.0, 2.2, 0.1, -1.0), + (-2.0, 1.0, 2.9, -1.0), + ] + _test_loss_common(loss, cases) + + +def test_loss_squared_epsilon_insensitive(): + # Test SquaredEpsilonInsensitive + loss = sgd_fast.SquaredEpsilonInsensitive(0.1) + cases = [ + # (p, y, expected_loss, expected_dloss) + (0.0, 0.0, 0.0, 0.0), + (0.1, 0.0, 0.0, 0.0), + (-2.05, -2.0, 0.0, 0.0), + (3.05, 3.0, 0.0, 0.0), + (2.2, 2.0, 0.01, 0.2), + (2.0, -1.0, 8.41, 5.8), + (2.0, 2.2, 0.01, -0.2), + (-2.0, 1.0, 8.41, -5.8), + ] + _test_loss_common(loss, cases) + + +def test_multi_thread_multi_class_and_early_stopping(): + # This is a non-regression test for a bad interaction between + # early stopping internal attribute and thread-based parallelism. + clf = SGDClassifier( + alpha=1e-3, + tol=1e-3, + max_iter=1000, + early_stopping=True, + n_iter_no_change=100, + random_state=0, + n_jobs=2, + ) + clf.fit(iris.data, iris.target) + assert clf.n_iter_ > clf.n_iter_no_change + assert clf.n_iter_ < clf.n_iter_no_change + 20 + assert clf.score(iris.data, iris.target) > 0.8 + + +def test_multi_core_gridsearch_and_early_stopping(): + # This is a non-regression test for a bad interaction between + # early stopping internal attribute and process-based multi-core + # parallelism. 
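+ # For reference: with n_jobs=2, RandomizedSearchCV dispatches the candidate
+ # fits through joblib, which defaults to a process-based (loky) backend, and
+ # early_stopping=True makes each SGDClassifier carve out an internal
+ # validation split; the combination of the two is what this test exercises.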
+ param_grid = { + "alpha": np.logspace(-4, 4, 9), + "n_iter_no_change": [5, 10, 50], + } + + clf = SGDClassifier(tol=1e-2, max_iter=1000, early_stopping=True, random_state=0) + search = RandomizedSearchCV(clf, param_grid, n_iter=5, n_jobs=2, random_state=0) + search.fit(iris.data, iris.target) + assert search.best_score_ > 0.8 + + +@pytest.mark.parametrize("backend", ["loky", "multiprocessing", "threading"]) +def test_SGDClassifier_fit_for_all_backends(backend): + # This is a non-regression smoke test. In the multi-class case, + # SGDClassifier.fit fits each class in a one-versus-all fashion using + # joblib.Parallel. However, each OvA step updates the coef_ attribute of + # the estimator in-place. Internally, SGDClassifier calls Parallel using + # require='sharedmem'. This test makes sure SGDClassifier.fit works + # consistently even when the user asks for a backend that does not provide + # sharedmem semantics. + + # We further test a case where memmapping would have been used if + # SGDClassifier.fit was called from a loky or multiprocessing backend. In + # this specific case, in-place modification of clf.coef_ would have caused + # a segmentation fault when trying to write in a readonly memory mapped + # buffer. + + random_state = np.random.RandomState(42) + + # Create a classification problem with 50000 features and 20 classes. Using + # loky or multiprocessing this make the clf.coef_ exceed the threshold + # above which memmaping is used in joblib and loky (1MB as of 2018/11/1). + X = sp.random(500, 2000, density=0.02, format="csr", random_state=random_state) + y = random_state.choice(20, 500) + + # Begin by fitting a SGD classifier sequentially + clf_sequential = SGDClassifier(max_iter=1000, n_jobs=1, random_state=42) + clf_sequential.fit(X, y) + + # Fit a SGDClassifier using the specified backend, and make sure the + # coefficients are equal to those obtained using a sequential fit + clf_parallel = SGDClassifier(max_iter=1000, n_jobs=4, random_state=42) + with joblib.parallel_backend(backend=backend): + clf_parallel.fit(X, y) + assert_array_almost_equal(clf_sequential.coef_, clf_parallel.coef_) + + +@pytest.mark.parametrize( + "Estimator", [linear_model.SGDClassifier, linear_model.SGDRegressor] +) +def test_sgd_random_state(Estimator, global_random_seed): + # Train the same model on the same data without converging and check that we + # get reproducible results by fixing the random seed. + if Estimator == linear_model.SGDRegressor: + X, y = datasets.make_regression(random_state=global_random_seed) + else: + X, y = datasets.make_classification(random_state=global_random_seed) + + # Fitting twice a model with the same hyper-parameters on the same training + # set with the same seed leads to the same results deterministically. + + est = Estimator(random_state=global_random_seed, max_iter=1) + with pytest.warns(ConvergenceWarning): + coef_same_seed_a = est.fit(X, y).coef_ + assert est.n_iter_ == 1 + + est = Estimator(random_state=global_random_seed, max_iter=1) + with pytest.warns(ConvergenceWarning): + coef_same_seed_b = est.fit(X, y).coef_ + assert est.n_iter_ == 1 + + assert_allclose(coef_same_seed_a, coef_same_seed_b) + + # Fitting twice a model with the same hyper-parameters on the same training + # set but with different random seed leads to different results after one + # epoch because of the random shuffling of the dataset. 
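+ # (shuffle=True is the default for both estimators, so the seed also controls
+ # the order in which samples are visited during the single training epoch.)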
+ + est = Estimator(random_state=global_random_seed + 1, max_iter=1) + with pytest.warns(ConvergenceWarning): + coef_other_seed = est.fit(X, y).coef_ + assert est.n_iter_ == 1 + + assert np.abs(coef_same_seed_a - coef_other_seed).max() > 1.0 + + +def test_validation_mask_correctly_subsets(monkeypatch): + """Test that data passed to validation callback correctly subsets. + + Non-regression test for #23255. + """ + X, Y = iris.data, iris.target + n_samples = X.shape[0] + validation_fraction = 0.2 + clf = linear_model.SGDClassifier( + early_stopping=True, + tol=1e-3, + max_iter=1000, + validation_fraction=validation_fraction, + ) + + mock = Mock(side_effect=_stochastic_gradient._ValidationScoreCallback) + monkeypatch.setattr(_stochastic_gradient, "_ValidationScoreCallback", mock) + clf.fit(X, Y) + + X_val, y_val = mock.call_args[0][1:3] + assert X_val.shape[0] == int(n_samples * validation_fraction) + assert y_val.shape[0] == int(n_samples * validation_fraction) + + +def test_sgd_error_on_zero_validation_weight(): + # Test that SGDClassifier raises error when all the validation samples + # have zero sample_weight. Non-regression test for #17229. + X, Y = iris.data, iris.target + sample_weight = np.zeros_like(Y) + validation_fraction = 0.4 + + clf = linear_model.SGDClassifier( + early_stopping=True, validation_fraction=validation_fraction, random_state=0 + ) + + error_message = ( + "The sample weights for validation set are all zero, consider using a" + " different random state." + ) + with pytest.raises(ValueError, match=error_message): + clf.fit(X, Y, sample_weight=sample_weight) + + +@pytest.mark.parametrize("Estimator", [SGDClassifier, SGDRegressor]) +def test_sgd_verbose(Estimator): + """non-regression test for gh #25249""" + Estimator(verbose=1).fit(X, Y) + + +@pytest.mark.parametrize( + "SGDEstimator", + [ + SGDClassifier, + SparseSGDClassifier, + SGDRegressor, + SparseSGDRegressor, + SGDOneClassSVM, + SparseSGDOneClassSVM, + ], +) +@pytest.mark.parametrize("data_type", (np.float32, np.float64)) +def test_sgd_dtype_match(SGDEstimator, data_type): + _X = X.astype(data_type) + _Y = np.array(Y, dtype=data_type) + sgd_model = SGDEstimator() + sgd_model.fit(_X, _Y) + assert sgd_model.coef_.dtype == data_type + + +@pytest.mark.parametrize( + "SGDEstimator", + [ + SGDClassifier, + SparseSGDClassifier, + SGDRegressor, + SparseSGDRegressor, + SGDOneClassSVM, + SparseSGDOneClassSVM, + ], +) +def test_sgd_numerical_consistency(SGDEstimator): + X_64 = X.astype(dtype=np.float64) + Y_64 = np.array(Y, dtype=np.float64) + + X_32 = X.astype(dtype=np.float32) + Y_32 = np.array(Y, dtype=np.float32) + + sgd_64 = SGDEstimator(max_iter=20) + sgd_64.fit(X_64, Y_64) + + sgd_32 = SGDEstimator(max_iter=20) + sgd_32.fit(X_32, Y_32) + + assert_allclose(sgd_64.coef_, sgd_32.coef_) + + +def test_sgd_one_class_svm_estimator_type(): + """Check that SGDOneClassSVM has the correct estimator type. + + Non-regression test for if the mixin was not on the left. 
+ """ + sgd_ocsvm = SGDOneClassSVM() + assert get_tags(sgd_ocsvm).estimator_type == "outlier_detector" diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_theil_sen.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_theil_sen.py new file mode 100644 index 0000000000000000000000000000000000000000..216415f2ee9277e618c457afc0a7280c8a2a4b8a --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_theil_sen.py @@ -0,0 +1,303 @@ +""" +Testing for Theil-Sen module (sklearn.linear_model.theil_sen) +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import os +import re +import sys +from contextlib import contextmanager + +import numpy as np +import pytest +from numpy.testing import ( + assert_array_almost_equal, + assert_array_equal, + assert_array_less, +) +from scipy.linalg import norm +from scipy.optimize import fmin_bfgs + +from sklearn.exceptions import ConvergenceWarning +from sklearn.linear_model import LinearRegression, TheilSenRegressor +from sklearn.linear_model._theil_sen import ( + _breakdown_point, + _modified_weiszfeld_step, + _spatial_median, +) +from sklearn.utils._testing import assert_almost_equal + + +@contextmanager +def no_stdout_stderr(): + old_stdout = sys.stdout + old_stderr = sys.stderr + with open(os.devnull, "w") as devnull: + sys.stdout = devnull + sys.stderr = devnull + yield + devnull.flush() + sys.stdout = old_stdout + sys.stderr = old_stderr + + +def gen_toy_problem_1d(intercept=True): + random_state = np.random.RandomState(0) + # Linear model y = 3*x + N(2, 0.1**2) + w = 3.0 + if intercept: + c = 2.0 + n_samples = 50 + else: + c = 0.1 + n_samples = 100 + x = random_state.normal(size=n_samples) + noise = 0.1 * random_state.normal(size=n_samples) + y = w * x + c + noise + # Add some outliers + if intercept: + x[42], y[42] = (-2, 4) + x[43], y[43] = (-2.5, 8) + x[33], y[33] = (2.5, 1) + x[49], y[49] = (2.1, 2) + else: + x[42], y[42] = (-2, 4) + x[43], y[43] = (-2.5, 8) + x[53], y[53] = (2.5, 1) + x[60], y[60] = (2.1, 2) + x[72], y[72] = (1.8, -7) + return x[:, np.newaxis], y, w, c + + +def gen_toy_problem_2d(): + random_state = np.random.RandomState(0) + n_samples = 100 + # Linear model y = 5*x_1 + 10*x_2 + N(1, 0.1**2) + X = random_state.normal(size=(n_samples, 2)) + w = np.array([5.0, 10.0]) + c = 1.0 + noise = 0.1 * random_state.normal(size=n_samples) + y = np.dot(X, w) + c + noise + # Add some outliers + n_outliers = n_samples // 10 + ix = random_state.randint(0, n_samples, size=n_outliers) + y[ix] = 50 * random_state.normal(size=n_outliers) + return X, y, w, c + + +def gen_toy_problem_4d(): + random_state = np.random.RandomState(0) + n_samples = 10000 + # Linear model y = 5*x_1 + 10*x_2 + 42*x_3 + 7*x_4 + N(1, 0.1**2) + X = random_state.normal(size=(n_samples, 4)) + w = np.array([5.0, 10.0, 42.0, 7.0]) + c = 1.0 + noise = 0.1 * random_state.normal(size=n_samples) + y = np.dot(X, w) + c + noise + # Add some outliers + n_outliers = n_samples // 10 + ix = random_state.randint(0, n_samples, size=n_outliers) + y[ix] = 50 * random_state.normal(size=n_outliers) + return X, y, w, c + + +def test_modweiszfeld_step_1d(): + X = np.array([1.0, 2.0, 3.0]).reshape(3, 1) + # Check startvalue is element of X and solution + median = 2.0 + new_y = _modified_weiszfeld_step(X, median) + assert_array_almost_equal(new_y, median) + # Check startvalue is not the solution + y = 2.5 + new_y = _modified_weiszfeld_step(X, y) + assert_array_less(median, new_y) + 
assert_array_less(new_y, y) + # Check startvalue is not the solution but element of X + y = 3.0 + new_y = _modified_weiszfeld_step(X, y) + assert_array_less(median, new_y) + assert_array_less(new_y, y) + # Check that a single vector is identity + X = np.array([1.0, 2.0, 3.0]).reshape(1, 3) + y = X[0] + new_y = _modified_weiszfeld_step(X, y) + assert_array_equal(y, new_y) + + +def test_modweiszfeld_step_2d(): + X = np.array([0.0, 0.0, 1.0, 1.0, 0.0, 1.0]).reshape(3, 2) + y = np.array([0.5, 0.5]) + # Check first two iterations + new_y = _modified_weiszfeld_step(X, y) + assert_array_almost_equal(new_y, np.array([1 / 3, 2 / 3])) + new_y = _modified_weiszfeld_step(X, new_y) + assert_array_almost_equal(new_y, np.array([0.2792408, 0.7207592])) + # Check fix point + y = np.array([0.21132505, 0.78867497]) + new_y = _modified_weiszfeld_step(X, y) + assert_array_almost_equal(new_y, y) + + +def test_spatial_median_1d(): + X = np.array([1.0, 2.0, 3.0]).reshape(3, 1) + true_median = 2.0 + _, median = _spatial_median(X) + assert_array_almost_equal(median, true_median) + # Test larger problem and for exact solution in 1d case + random_state = np.random.RandomState(0) + X = random_state.randint(100, size=(1000, 1)) + true_median = np.median(X.ravel()) + _, median = _spatial_median(X) + assert_array_equal(median, true_median) + + +def test_spatial_median_2d(): + X = np.array([0.0, 0.0, 1.0, 1.0, 0.0, 1.0]).reshape(3, 2) + _, median = _spatial_median(X, max_iter=100, tol=1.0e-6) + + def cost_func(y): + dists = np.array([norm(x - y) for x in X]) + return np.sum(dists) + + # Check if median is solution of the Fermat-Weber location problem + fermat_weber = fmin_bfgs(cost_func, median, disp=False) + assert_array_almost_equal(median, fermat_weber) + # Check when maximum iteration is exceeded a warning is emitted + warning_message = "Maximum number of iterations 30 reached in spatial median." 
+ with pytest.warns(ConvergenceWarning, match=warning_message): + _spatial_median(X, max_iter=30, tol=0.0) + + +def test_theil_sen_1d(): + X, y, w, c = gen_toy_problem_1d() + # Check that Least Squares fails + lstq = LinearRegression().fit(X, y) + assert np.abs(lstq.coef_ - w) > 0.9 + # Check that Theil-Sen works + theil_sen = TheilSenRegressor(random_state=0).fit(X, y) + assert_array_almost_equal(theil_sen.coef_, w, 1) + assert_array_almost_equal(theil_sen.intercept_, c, 1) + + +def test_theil_sen_1d_no_intercept(): + X, y, w, c = gen_toy_problem_1d(intercept=False) + # Check that Least Squares fails + lstq = LinearRegression(fit_intercept=False).fit(X, y) + assert np.abs(lstq.coef_ - w - c) > 0.5 + # Check that Theil-Sen works + theil_sen = TheilSenRegressor(fit_intercept=False, random_state=0).fit(X, y) + assert_array_almost_equal(theil_sen.coef_, w + c, 1) + assert_almost_equal(theil_sen.intercept_, 0.0) + + # non-regression test for #18104 + theil_sen.score(X, y) + + +def test_theil_sen_2d(): + X, y, w, c = gen_toy_problem_2d() + # Check that Least Squares fails + lstq = LinearRegression().fit(X, y) + assert norm(lstq.coef_ - w) > 1.0 + # Check that Theil-Sen works + theil_sen = TheilSenRegressor(max_subpopulation=1e3, random_state=0).fit(X, y) + assert_array_almost_equal(theil_sen.coef_, w, 1) + assert_array_almost_equal(theil_sen.intercept_, c, 1) + + +def test_calc_breakdown_point(): + bp = _breakdown_point(1e10, 2) + assert np.abs(bp - 1 + 1 / (np.sqrt(2))) < 1.0e-6 + + +@pytest.mark.parametrize( + "param, ExceptionCls, match", + [ + ( + {"n_subsamples": 1}, + ValueError, + re.escape("Invalid parameter since n_features+1 > n_subsamples (2 > 1)"), + ), + ( + {"n_subsamples": 101}, + ValueError, + re.escape("Invalid parameter since n_subsamples > n_samples (101 > 50)"), + ), + ], +) +def test_checksubparams_invalid_input(param, ExceptionCls, match): + X, y, w, c = gen_toy_problem_1d() + theil_sen = TheilSenRegressor(**param, random_state=0) + with pytest.raises(ExceptionCls, match=match): + theil_sen.fit(X, y) + + +def test_checksubparams_n_subsamples_if_less_samples_than_features(): + random_state = np.random.RandomState(0) + n_samples, n_features = 10, 20 + X = random_state.normal(size=(n_samples, n_features)) + y = random_state.normal(size=n_samples) + theil_sen = TheilSenRegressor(n_subsamples=9, random_state=0) + with pytest.raises(ValueError): + theil_sen.fit(X, y) + + +def test_subpopulation(): + X, y, w, c = gen_toy_problem_4d() + theil_sen = TheilSenRegressor(max_subpopulation=250, random_state=0).fit(X, y) + assert_array_almost_equal(theil_sen.coef_, w, 1) + assert_array_almost_equal(theil_sen.intercept_, c, 1) + + +def test_subsamples(): + X, y, w, c = gen_toy_problem_4d() + theil_sen = TheilSenRegressor(n_subsamples=X.shape[0], random_state=0).fit(X, y) + lstq = LinearRegression().fit(X, y) + # Check for exact the same results as Least Squares + assert_array_almost_equal(theil_sen.coef_, lstq.coef_, 9) + + +def test_verbosity(): + X, y, w, c = gen_toy_problem_1d() + # Check that Theil-Sen can be verbose + with no_stdout_stderr(): + TheilSenRegressor(verbose=True, random_state=0).fit(X, y) + TheilSenRegressor(verbose=True, max_subpopulation=10, random_state=0).fit(X, y) + + +def test_theil_sen_parallel(): + X, y, w, c = gen_toy_problem_2d() + # Check that Least Squares fails + lstq = LinearRegression().fit(X, y) + assert norm(lstq.coef_ - w) > 1.0 + # Check that Theil-Sen works + theil_sen = TheilSenRegressor(n_jobs=2, random_state=0, max_subpopulation=2e3).fit( + X, y + 
) + assert_array_almost_equal(theil_sen.coef_, w, 1) + assert_array_almost_equal(theil_sen.intercept_, c, 1) + + +def test_less_samples_than_features(): + random_state = np.random.RandomState(0) + n_samples, n_features = 10, 20 + X = random_state.normal(size=(n_samples, n_features)) + y = random_state.normal(size=n_samples) + # Check that Theil-Sen falls back to Least Squares if fit_intercept=False + theil_sen = TheilSenRegressor(fit_intercept=False, random_state=0).fit(X, y) + lstq = LinearRegression(fit_intercept=False).fit(X, y) + assert_array_almost_equal(theil_sen.coef_, lstq.coef_, 12) + # Check fit_intercept=True case. This will not be equal to the Least + # Squares solution since the intercept is calculated differently. + theil_sen = TheilSenRegressor(fit_intercept=True, random_state=0).fit(X, y) + y_pred = theil_sen.predict(X) + assert_array_almost_equal(y_pred, y, 12) + + +# TODO(1.8): Remove +def test_copy_X_deprecated(): + X, y, _, _ = gen_toy_problem_1d() + theil_sen = TheilSenRegressor(copy_X=True, random_state=0) + with pytest.warns(FutureWarning, match="`copy_X` was deprecated"): + theil_sen.fit(X, y) diff --git a/.venv/lib/python3.12/site-packages/sklearn/manifold/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/manifold/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..349f7c1a4a7c41a053e3ae35228dc654dc6b63fc --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/manifold/__init__.py @@ -0,0 +1,22 @@ +"""Data embedding techniques.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ._isomap import Isomap +from ._locally_linear import LocallyLinearEmbedding, locally_linear_embedding +from ._mds import MDS, smacof +from ._spectral_embedding import SpectralEmbedding, spectral_embedding +from ._t_sne import TSNE, trustworthiness + +__all__ = [ + "MDS", + "TSNE", + "Isomap", + "LocallyLinearEmbedding", + "SpectralEmbedding", + "locally_linear_embedding", + "smacof", + "spectral_embedding", + "trustworthiness", +] diff --git a/.venv/lib/python3.12/site-packages/sklearn/manifold/_barnes_hut_tsne.pyx b/.venv/lib/python3.12/site-packages/sklearn/manifold/_barnes_hut_tsne.pyx new file mode 100644 index 0000000000000000000000000000000000000000..e84df4a9074b220d2a5dc01b203559d4a0945e6c --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/manifold/_barnes_hut_tsne.pyx @@ -0,0 +1,295 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +# See http://homepage.tudelft.nl/19j49/t-SNE.html for reference +# implementations and papers describing the technique + + +import numpy as np +cimport numpy as cnp +from libc.stdio cimport printf +from libc.math cimport log +from libc.stdlib cimport malloc, free +from libc.time cimport clock, clock_t +from cython.parallel cimport prange, parallel + +from ..neighbors._quad_tree cimport _QuadTree + +cnp.import_array() + + +cdef char* EMPTY_STRING = "" + +# Smallest strictly positive value that can be represented by floating +# point numbers for different precision levels. This is useful to avoid +# taking the log of zero when computing the KL divergence. +cdef float FLOAT32_TINY = np.finfo(np.float32).tiny + +# Useful to void division by zero or divergence to +inf. 
+cdef float FLOAT64_EPS = np.finfo(np.float64).eps + +# This is effectively an ifdef statement in Cython +# It allows us to write printf debugging lines +# and remove them at compile time +cdef enum: + DEBUGFLAG = 0 + +cdef float compute_gradient(float[:] val_P, + float[:, :] pos_reference, + cnp.int64_t[:] neighbors, + cnp.int64_t[:] indptr, + float[:, :] tot_force, + _QuadTree qt, + float theta, + int dof, + long start, + bint compute_error, + int num_threads) noexcept nogil: + # Having created the tree, calculate the gradient + # in two components, the positive and negative forces + cdef: + long i, coord + int ax + long n_samples = pos_reference.shape[0] + int n_dimensions = qt.n_dimensions + clock_t t1 = 0, t2 = 0 + double sQ + float error + int take_timing = 1 if qt.verbose > 15 else 0 + + if qt.verbose > 11: + printf("[t-SNE] Allocating %li elements in force arrays\n", + n_samples * n_dimensions * 2) + cdef float* neg_f = malloc(sizeof(float) * n_samples * n_dimensions) + cdef float* pos_f = malloc(sizeof(float) * n_samples * n_dimensions) + + if take_timing: + t1 = clock() + sQ = compute_gradient_negative(pos_reference, neg_f, qt, dof, theta, start, + num_threads) + if take_timing: + t2 = clock() + printf("[t-SNE] Computing negative gradient: %e ticks\n", ((float) (t2 - t1))) + + if take_timing: + t1 = clock() + error = compute_gradient_positive(val_P, pos_reference, neighbors, indptr, + pos_f, n_dimensions, dof, sQ, start, + qt.verbose, compute_error, num_threads) + if take_timing: + t2 = clock() + printf("[t-SNE] Computing positive gradient: %e ticks\n", + ((float) (t2 - t1))) + for i in prange(start, n_samples, nogil=True, num_threads=num_threads, + schedule='static'): + for ax in range(n_dimensions): + coord = i * n_dimensions + ax + tot_force[i, ax] = pos_f[coord] - (neg_f[coord] / sQ) + + free(neg_f) + free(pos_f) + return error + + +cdef float compute_gradient_positive(float[:] val_P, + float[:, :] pos_reference, + cnp.int64_t[:] neighbors, + cnp.int64_t[:] indptr, + float* pos_f, + int n_dimensions, + int dof, + double sum_Q, + cnp.int64_t start, + int verbose, + bint compute_error, + int num_threads) noexcept nogil: + # Sum over the following expression for i not equal to j + # grad_i = p_ij (1 + ||y_i - y_j||^2)^-1 (y_i - y_j) + # This is equivalent to compute_edge_forces in the authors' code + # It just goes over the nearest neighbors instead of all the data points + # (unlike the non-nearest neighbors version of `compute_gradient_positive') + cdef: + int ax + long i, j, k + long n_samples = indptr.shape[0] - 1 + float C = 0.0 + float dij, qij, pij + float exponent = (dof + 1.0) / 2.0 + float float_dof = (float) (dof) + float* buff + clock_t t1 = 0, t2 = 0 + float dt + + if verbose > 10: + t1 = clock() + + with nogil, parallel(num_threads=num_threads): + # Define private buffer variables + buff = malloc(sizeof(float) * n_dimensions) + + for i in prange(start, n_samples, schedule='static'): + # Init the gradient vector + for ax in range(n_dimensions): + pos_f[i * n_dimensions + ax] = 0.0 + # Compute the positive interaction for the nearest neighbors + for k in range(indptr[i], indptr[i+1]): + j = neighbors[k] + dij = 0.0 + pij = val_P[k] + for ax in range(n_dimensions): + buff[ax] = pos_reference[i, ax] - pos_reference[j, ax] + dij += buff[ax] * buff[ax] + qij = float_dof / (float_dof + dij) + if dof != 1: # i.e. 
exponent != 1 + qij = qij ** exponent + dij = pij * qij + + # only compute the error when needed + if compute_error: + qij = qij / sum_Q + C += pij * log(max(pij, FLOAT32_TINY) / max(qij, FLOAT32_TINY)) + for ax in range(n_dimensions): + pos_f[i * n_dimensions + ax] += dij * buff[ax] + + free(buff) + if verbose > 10: + t2 = clock() + dt = ((float) (t2 - t1)) + printf("[t-SNE] Computed error=%1.4f in %1.1e ticks\n", C, dt) + return C + + +cdef double compute_gradient_negative(float[:, :] pos_reference, + float* neg_f, + _QuadTree qt, + int dof, + float theta, + long start, + int num_threads) noexcept nogil: + cdef: + int ax + int n_dimensions = qt.n_dimensions + int offset = n_dimensions + 2 + long i, j, idx + long n_samples = pos_reference.shape[0] + long n = n_samples - start + long dta = 0 + long dtb = 0 + float size, dist2s, mult + float exponent = (dof + 1.0) / 2.0 + float float_dof = (float) (dof) + double qijZ, sum_Q = 0.0 + float* force + float* neg_force + float* pos + clock_t t1 = 0, t2 = 0, t3 = 0 + int take_timing = 1 if qt.verbose > 20 else 0 + + with nogil, parallel(num_threads=num_threads): + # Define thread-local buffers + summary = malloc(sizeof(float) * n * offset) + pos = malloc(sizeof(float) * n_dimensions) + force = malloc(sizeof(float) * n_dimensions) + neg_force = malloc(sizeof(float) * n_dimensions) + + for i in prange(start, n_samples, schedule='static'): + # Clear the arrays + for ax in range(n_dimensions): + force[ax] = 0.0 + neg_force[ax] = 0.0 + pos[ax] = pos_reference[i, ax] + + # Find which nodes are summarizing and collect their centers of mass + # deltas, and sizes, into vectorized arrays + if take_timing: + t1 = clock() + idx = qt.summarize(pos, summary, theta*theta) + if take_timing: + t2 = clock() + # Compute the t-SNE negative force + # for the digits dataset, walking the tree + # is about 10-15x more expensive than the + # following for loop + for j in range(idx // offset): + + dist2s = summary[j * offset + n_dimensions] + size = summary[j * offset + n_dimensions + 1] + qijZ = float_dof / (float_dof + dist2s) # 1/(1+dist) + if dof != 1: # i.e. 
exponent != 1 + qijZ = qijZ ** exponent + + sum_Q += size * qijZ # size of the node * q + mult = size * qijZ * qijZ + for ax in range(n_dimensions): + neg_force[ax] += mult * summary[j * offset + ax] + if take_timing: + t3 = clock() + for ax in range(n_dimensions): + neg_f[i * n_dimensions + ax] = neg_force[ax] + if take_timing: + dta += t2 - t1 + dtb += t3 - t2 + free(pos) + free(force) + free(neg_force) + free(summary) + if take_timing: + printf("[t-SNE] Tree: %li clock ticks | ", dta) + printf("Force computation: %li clock ticks\n", dtb) + + # Put sum_Q to machine EPSILON to avoid divisions by 0 + sum_Q = max(sum_Q, FLOAT64_EPS) + return sum_Q + + +def gradient(float[:] val_P, + float[:, :] pos_output, + cnp.int64_t[:] neighbors, + cnp.int64_t[:] indptr, + float[:, :] forces, + float theta, + int n_dimensions, + int verbose, + int dof=1, + long skip_num_points=0, + bint compute_error=1, + int num_threads=1): + # This function is designed to be called from external Python + # it passes the 'forces' array by reference and fills that's array + # up in-place + cdef float C + cdef int n + n = pos_output.shape[0] + assert val_P.itemsize == 4 + assert pos_output.itemsize == 4 + assert forces.itemsize == 4 + m = "Forces array and pos_output shapes are incompatible" + assert n == forces.shape[0], m + m = "Pij and pos_output shapes are incompatible" + assert n == indptr.shape[0] - 1, m + if verbose > 10: + printf("[t-SNE] Initializing tree of n_dimensions %i\n", n_dimensions) + cdef _QuadTree qt = _QuadTree(pos_output.shape[1], verbose) + if verbose > 10: + printf("[t-SNE] Inserting %li points\n", pos_output.shape[0]) + qt.build_tree(pos_output) + if verbose > 10: + # XXX: format hack to workaround lack of `const char *` type + # in the generated C code that triggers error with gcc 4.9 + # and -Werror=format-security + printf("[t-SNE] Computing gradient\n%s", EMPTY_STRING) + + C = compute_gradient(val_P, pos_output, neighbors, indptr, forces, + qt, theta, dof, skip_num_points, compute_error, + num_threads) + + if verbose > 10: + # XXX: format hack to workaround lack of `const char *` type + # in the generated C code + # and -Werror=format-security + printf("[t-SNE] Checking tree consistency\n%s", EMPTY_STRING) + m = "Tree consistency failed: unexpected number of points on the tree" + assert qt.cells[0].cumulative_size == qt.n_points, m + if not compute_error: + C = np.nan + return C diff --git a/.venv/lib/python3.12/site-packages/sklearn/manifold/_isomap.py b/.venv/lib/python3.12/site-packages/sklearn/manifold/_isomap.py new file mode 100644 index 0000000000000000000000000000000000000000..90154470c18a486a250ea112cb31e57167d2eb43 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/manifold/_isomap.py @@ -0,0 +1,442 @@ +"""Isomap for manifold learning""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from numbers import Integral, Real + +import numpy as np +from scipy.sparse import issparse +from scipy.sparse.csgraph import connected_components, shortest_path + +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + TransformerMixin, + _fit_context, +) +from ..decomposition import KernelPCA +from ..metrics.pairwise import _VALID_METRICS +from ..neighbors import NearestNeighbors, kneighbors_graph, radius_neighbors_graph +from ..preprocessing import KernelCenterer +from ..utils._param_validation import Interval, StrOptions +from ..utils.graph import _fix_connected_components +from ..utils.validation import 
check_is_fitted + + +class Isomap(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): + """Isomap Embedding. + + Non-linear dimensionality reduction through Isometric Mapping + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_neighbors : int or None, default=5 + Number of neighbors to consider for each point. If `n_neighbors` is an int, + then `radius` must be `None`. + + radius : float or None, default=None + Limiting distance of neighbors to return. If `radius` is a float, + then `n_neighbors` must be set to `None`. + + .. versionadded:: 1.1 + + n_components : int, default=2 + Number of coordinates for the manifold. + + eigen_solver : {'auto', 'arpack', 'dense'}, default='auto' + 'auto' : Attempt to choose the most efficient solver + for the given problem. + + 'arpack' : Use Arnoldi decomposition to find the eigenvalues + and eigenvectors. + + 'dense' : Use a direct solver (i.e. LAPACK) + for the eigenvalue decomposition. + + tol : float, default=0 + Convergence tolerance passed to arpack or lobpcg. + not used if eigen_solver == 'dense'. + + max_iter : int, default=None + Maximum number of iterations for the arpack solver. + not used if eigen_solver == 'dense'. + + path_method : {'auto', 'FW', 'D'}, default='auto' + Method to use in finding shortest path. + + 'auto' : attempt to choose the best algorithm automatically. + + 'FW' : Floyd-Warshall algorithm. + + 'D' : Dijkstra's algorithm. + + neighbors_algorithm : {'auto', 'brute', 'kd_tree', 'ball_tree'}, \ + default='auto' + Algorithm to use for nearest neighbors search, + passed to neighbors.NearestNeighbors instance. + + n_jobs : int or None, default=None + The number of parallel jobs to run. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + metric : str, or callable, default="minkowski" + The metric to use when calculating distance between instances in a + feature array. If metric is a string or callable, it must be one of + the options allowed by :func:`sklearn.metrics.pairwise_distances` for + its metric parameter. + If metric is "precomputed", X is assumed to be a distance matrix and + must be square. X may be a :term:`Glossary `. + + .. versionadded:: 0.22 + + p : float, default=2 + Parameter for the Minkowski metric from + sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is + equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + + .. versionadded:: 0.22 + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + .. versionadded:: 0.22 + + Attributes + ---------- + embedding_ : array-like, shape (n_samples, n_components) + Stores the embedding vectors. + + kernel_pca_ : object + :class:`~sklearn.decomposition.KernelPCA` object used to implement the + embedding. + + nbrs_ : sklearn.neighbors.NearestNeighbors instance + Stores nearest neighbors instance, including BallTree or KDtree + if applicable. + + dist_matrix_ : array-like, shape (n_samples, n_samples) + Stores the geodesic distance matrix of training data. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. 
versionadded:: 1.0 + + See Also + -------- + sklearn.decomposition.PCA : Principal component analysis that is a linear + dimensionality reduction method. + sklearn.decomposition.KernelPCA : Non-linear dimensionality reduction using + kernels and PCA. + MDS : Manifold learning using multidimensional scaling. + TSNE : T-distributed Stochastic Neighbor Embedding. + LocallyLinearEmbedding : Manifold learning using Locally Linear Embedding. + SpectralEmbedding : Spectral embedding for non-linear dimensionality. + + References + ---------- + + .. [1] Tenenbaum, J.B.; De Silva, V.; & Langford, J.C. A global geometric + framework for nonlinear dimensionality reduction. Science 290 (5500) + + Examples + -------- + >>> from sklearn.datasets import load_digits + >>> from sklearn.manifold import Isomap + >>> X, _ = load_digits(return_X_y=True) + >>> X.shape + (1797, 64) + >>> embedding = Isomap(n_components=2) + >>> X_transformed = embedding.fit_transform(X[:100]) + >>> X_transformed.shape + (100, 2) + """ + + _parameter_constraints: dict = { + "n_neighbors": [Interval(Integral, 1, None, closed="left"), None], + "radius": [Interval(Real, 0, None, closed="both"), None], + "n_components": [Interval(Integral, 1, None, closed="left")], + "eigen_solver": [StrOptions({"auto", "arpack", "dense"})], + "tol": [Interval(Real, 0, None, closed="left")], + "max_iter": [Interval(Integral, 1, None, closed="left"), None], + "path_method": [StrOptions({"auto", "FW", "D"})], + "neighbors_algorithm": [StrOptions({"auto", "brute", "kd_tree", "ball_tree"})], + "n_jobs": [Integral, None], + "p": [Interval(Real, 1, None, closed="left")], + "metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable], + "metric_params": [dict, None], + } + + def __init__( + self, + *, + n_neighbors=5, + radius=None, + n_components=2, + eigen_solver="auto", + tol=0, + max_iter=None, + path_method="auto", + neighbors_algorithm="auto", + n_jobs=None, + metric="minkowski", + p=2, + metric_params=None, + ): + self.n_neighbors = n_neighbors + self.radius = radius + self.n_components = n_components + self.eigen_solver = eigen_solver + self.tol = tol + self.max_iter = max_iter + self.path_method = path_method + self.neighbors_algorithm = neighbors_algorithm + self.n_jobs = n_jobs + self.metric = metric + self.p = p + self.metric_params = metric_params + + def _fit_transform(self, X): + if self.n_neighbors is not None and self.radius is not None: + raise ValueError( + "Both n_neighbors and radius are provided. 
Use" + f" Isomap(radius={self.radius}, n_neighbors=None) if intended to use" + " radius-based neighbors" + ) + + self.nbrs_ = NearestNeighbors( + n_neighbors=self.n_neighbors, + radius=self.radius, + algorithm=self.neighbors_algorithm, + metric=self.metric, + p=self.p, + metric_params=self.metric_params, + n_jobs=self.n_jobs, + ) + self.nbrs_.fit(X) + self.n_features_in_ = self.nbrs_.n_features_in_ + if hasattr(self.nbrs_, "feature_names_in_"): + self.feature_names_in_ = self.nbrs_.feature_names_in_ + + self.kernel_pca_ = KernelPCA( + n_components=self.n_components, + kernel="precomputed", + eigen_solver=self.eigen_solver, + tol=self.tol, + max_iter=self.max_iter, + n_jobs=self.n_jobs, + ).set_output(transform="default") + + if self.n_neighbors is not None: + nbg = kneighbors_graph( + self.nbrs_, + self.n_neighbors, + metric=self.metric, + p=self.p, + metric_params=self.metric_params, + mode="distance", + n_jobs=self.n_jobs, + ) + else: + nbg = radius_neighbors_graph( + self.nbrs_, + radius=self.radius, + metric=self.metric, + p=self.p, + metric_params=self.metric_params, + mode="distance", + n_jobs=self.n_jobs, + ) + + # Compute the number of connected components, and connect the different + # components to be able to compute a shortest path between all pairs + # of samples in the graph. + # Similar fix to cluster._agglomerative._fix_connectivity. + n_connected_components, labels = connected_components(nbg) + if n_connected_components > 1: + if self.metric == "precomputed" and issparse(X): + raise RuntimeError( + "The number of connected components of the neighbors graph" + f" is {n_connected_components} > 1. The graph cannot be " + "completed with metric='precomputed', and Isomap cannot be" + "fitted. Increase the number of neighbors to avoid this " + "issue, or precompute the full distance matrix instead " + "of passing a sparse neighbors graph." + ) + warnings.warn( + ( + "The number of connected components of the neighbors graph " + f"is {n_connected_components} > 1. Completing the graph to fit" + " Isomap might be slow. Increase the number of neighbors to " + "avoid this issue." + ), + stacklevel=2, + ) + + # use array validated by NearestNeighbors + nbg = _fix_connected_components( + X=self.nbrs_._fit_X, + graph=nbg, + n_connected_components=n_connected_components, + component_labels=labels, + mode="distance", + metric=self.nbrs_.effective_metric_, + **self.nbrs_.effective_metric_params_, + ) + + self.dist_matrix_ = shortest_path(nbg, method=self.path_method, directed=False) + + if self.nbrs_._fit_X.dtype == np.float32: + self.dist_matrix_ = self.dist_matrix_.astype( + self.nbrs_._fit_X.dtype, copy=False + ) + + G = self.dist_matrix_**2 + G *= -0.5 + + self.embedding_ = self.kernel_pca_.fit_transform(G) + self._n_features_out = self.embedding_.shape[1] + + def reconstruction_error(self): + """Compute the reconstruction error for the embedding. + + Returns + ------- + reconstruction_error : float + Reconstruction error. 
+ + Notes + ----- + The cost function of an isomap embedding is + + ``E = frobenius_norm[K(D) - K(D_fit)] / n_samples`` + + Where D is the matrix of distances for the input data X, + D_fit is the matrix of distances for the output embedding X_fit, + and K is the isomap kernel: + + ``K(D) = -0.5 * (I - 1/n_samples) * D^2 * (I - 1/n_samples)`` + """ + G = -0.5 * self.dist_matrix_**2 + G_center = KernelCenterer().fit_transform(G) + evals = self.kernel_pca_.eigenvalues_ + return np.sqrt(np.sum(G_center**2) - np.sum(evals**2)) / G.shape[0] + + @_fit_context( + # Isomap.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y=None): + """Compute the embedding vectors for data X. + + Parameters + ---------- + X : {array-like, sparse matrix, BallTree, KDTree, NearestNeighbors} + Sample data, shape = (n_samples, n_features), in the form of a + numpy array, sparse matrix, precomputed tree, or NearestNeighbors + object. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Returns a fitted instance of self. + """ + self._fit_transform(X) + return self + + @_fit_context( + # Isomap.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit_transform(self, X, y=None): + """Fit the model from data in X and transform X. + + Parameters + ---------- + X : {array-like, sparse matrix, BallTree, KDTree} + Training vector, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + X_new : array-like, shape (n_samples, n_components) + X transformed in the new space. + """ + self._fit_transform(X) + return self.embedding_ + + def transform(self, X): + """Transform X. + + This is implemented by linking the points X into the graph of geodesic + distances of the training data. First the `n_neighbors` nearest + neighbors of X are found in the training data, and from these the + shortest geodesic distances from each point in X to each point in + the training data are computed in order to construct the kernel. + The embedding of X is the projection of this kernel onto the + embedding vectors of the training set. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_queries, n_features) + If neighbors_algorithm='precomputed', X is assumed to be a + distance matrix or a sparse graph of shape + (n_queries, n_samples_fit). + + Returns + ------- + X_new : array-like, shape (n_queries, n_components) + X transformed in the new space. + """ + check_is_fitted(self) + if self.n_neighbors is not None: + distances, indices = self.nbrs_.kneighbors(X, return_distance=True) + else: + distances, indices = self.nbrs_.radius_neighbors(X, return_distance=True) + + # Create the graph of shortest distances from X to + # training data via the nearest neighbors of X. + # This can be done as a single array operation, but it potentially + # takes a lot of memory. 
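        # Editor's note: the "single array operation" alluded to just above would be,
        # in the k-neighbors case where `indices` is a regular 2-D integer array
        # (sketch only, not used here because of its
        # (n_queries, n_neighbors, n_samples_fit) memory cost):
        # >> G_X = np.min(self.dist_matrix_[indices] + distances[:, :, None], axis=1)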
To avoid that, use a loop: + + n_samples_fit = self.nbrs_.n_samples_fit_ + n_queries = distances.shape[0] + + if hasattr(X, "dtype") and X.dtype == np.float32: + dtype = np.float32 + else: + dtype = np.float64 + + G_X = np.zeros((n_queries, n_samples_fit), dtype) + for i in range(n_queries): + G_X[i] = np.min(self.dist_matrix_[indices[i]] + distances[i][:, None], 0) + + G_X **= 2 + G_X *= -0.5 + + return self.kernel_pca_.transform(G_X) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.transformer_tags.preserves_dtype = ["float64", "float32"] + tags.input_tags.sparse = True + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/manifold/_locally_linear.py b/.venv/lib/python3.12/site-packages/sklearn/manifold/_locally_linear.py new file mode 100644 index 0000000000000000000000000000000000000000..7e3f456f7ca57e0a5ef4ba3aaf847475aacadfab --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/manifold/_locally_linear.py @@ -0,0 +1,879 @@ +"""Locally Linear Embedding""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from numbers import Integral, Real + +import numpy as np +from scipy.linalg import eigh, qr, solve, svd +from scipy.sparse import csr_matrix, eye, lil_matrix +from scipy.sparse.linalg import eigsh + +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + TransformerMixin, + _fit_context, + _UnstableArchMixin, +) +from ..neighbors import NearestNeighbors +from ..utils import check_array, check_random_state +from ..utils._arpack import _init_arpack_v0 +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.extmath import stable_cumsum +from ..utils.validation import FLOAT_DTYPES, check_is_fitted, validate_data + + +def barycenter_weights(X, Y, indices, reg=1e-3): + """Compute barycenter weights of X from Y along the first axis + + We estimate the weights to assign to each point in Y[indices] to recover + the point X[i]. The barycenter weights sum to 1. + + Parameters + ---------- + X : array-like, shape (n_samples, n_dim) + + Y : array-like, shape (n_samples, n_dim) + + indices : array-like, shape (n_samples, n_dim) + Indices of the points in Y used to compute the barycenter + + reg : float, default=1e-3 + Amount of regularization to add for the problem to be + well-posed in the case of n_neighbors > n_dim + + Returns + ------- + B : array-like, shape (n_samples, n_neighbors) + + Notes + ----- + See developers note for more information. + """ + X = check_array(X, dtype=FLOAT_DTYPES) + Y = check_array(Y, dtype=FLOAT_DTYPES) + indices = check_array(indices, dtype=int) + + n_samples, n_neighbors = indices.shape + assert X.shape[0] == n_samples + + B = np.empty((n_samples, n_neighbors), dtype=X.dtype) + v = np.ones(n_neighbors, dtype=X.dtype) + + # this might raise a LinalgError if G is singular and has trace + # zero + for i, ind in enumerate(indices): + A = Y[ind] + C = A - X[i] # broadcasting + G = np.dot(C, C.T) + trace = np.trace(G) + if trace > 0: + R = reg * trace + else: + R = reg + G.flat[:: n_neighbors + 1] += R + w = solve(G, v, assume_a="pos") + B[i, :] = w / np.sum(w) + return B + + +def barycenter_kneighbors_graph(X, n_neighbors, reg=1e-3, n_jobs=None): + """Computes the barycenter weighted graph of k-Neighbors for points in X + + Parameters + ---------- + X : {array-like, NearestNeighbors} + Sample data, shape = (n_samples, n_features), in the form of a + numpy array or a NearestNeighbors object. 
+ + n_neighbors : int + Number of neighbors for each sample. + + reg : float, default=1e-3 + Amount of regularization when solving the least-squares + problem. Only relevant if mode='barycenter'. If None, use the + default. + + n_jobs : int or None, default=None + The number of parallel jobs to run for neighbors search. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Returns + ------- + A : sparse matrix in CSR format, shape = [n_samples, n_samples] + A[i, j] is assigned the weight of edge that connects i to j. + + See Also + -------- + sklearn.neighbors.kneighbors_graph + sklearn.neighbors.radius_neighbors_graph + """ + knn = NearestNeighbors(n_neighbors=n_neighbors + 1, n_jobs=n_jobs).fit(X) + X = knn._fit_X + n_samples = knn.n_samples_fit_ + ind = knn.kneighbors(X, return_distance=False)[:, 1:] + data = barycenter_weights(X, X, ind, reg=reg) + indptr = np.arange(0, n_samples * n_neighbors + 1, n_neighbors) + return csr_matrix((data.ravel(), ind.ravel(), indptr), shape=(n_samples, n_samples)) + + +def null_space( + M, k, k_skip=1, eigen_solver="arpack", tol=1e-6, max_iter=100, random_state=None +): + """ + Find the null space of a matrix M. + + Parameters + ---------- + M : {array, matrix, sparse matrix, LinearOperator} + Input covariance matrix: should be symmetric positive semi-definite + + k : int + Number of eigenvalues/vectors to return + + k_skip : int, default=1 + Number of low eigenvalues to skip. + + eigen_solver : {'auto', 'arpack', 'dense'}, default='arpack' + auto : algorithm will attempt to choose the best method for input data + arpack : use arnoldi iteration in shift-invert mode. + For this method, M may be a dense matrix, sparse matrix, + or general linear operator. + Warning: ARPACK can be unstable for some problems. It is + best to try several random seeds in order to check results. + dense : use standard dense matrix operations for the eigenvalue + decomposition. For this method, M must be an array + or matrix type. This method should be avoided for + large problems. + + tol : float, default=1e-6 + Tolerance for 'arpack' method. + Not used if eigen_solver=='dense'. + + max_iter : int, default=100 + Maximum number of iterations for 'arpack' method. + Not used if eigen_solver=='dense' + + random_state : int, RandomState instance, default=None + Determines the random number generator when ``solver`` == 'arpack'. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + """ + if eigen_solver == "auto": + if M.shape[0] > 200 and k + k_skip < 10: + eigen_solver = "arpack" + else: + eigen_solver = "dense" + + if eigen_solver == "arpack": + v0 = _init_arpack_v0(M.shape[0], random_state) + try: + eigen_values, eigen_vectors = eigsh( + M, k + k_skip, sigma=0.0, tol=tol, maxiter=max_iter, v0=v0 + ) + except RuntimeError as e: + raise ValueError( + "Error in determining null-space with ARPACK. Error message: " + "'%s'. Note that eigen_solver='arpack' can fail when the " + "weight matrix is singular or otherwise ill-behaved. In that " + "case, eigen_solver='dense' is recommended. See online " + "documentation for more information." 
% e + ) from e + + return eigen_vectors[:, k_skip:], np.sum(eigen_values[k_skip:]) + elif eigen_solver == "dense": + if hasattr(M, "toarray"): + M = M.toarray() + eigen_values, eigen_vectors = eigh( + M, subset_by_index=(k_skip, k + k_skip - 1), overwrite_a=True + ) + index = np.argsort(np.abs(eigen_values)) + return eigen_vectors[:, index], np.sum(eigen_values) + else: + raise ValueError("Unrecognized eigen_solver '%s'" % eigen_solver) + + +def _locally_linear_embedding( + X, + *, + n_neighbors, + n_components, + reg=1e-3, + eigen_solver="auto", + tol=1e-6, + max_iter=100, + method="standard", + hessian_tol=1e-4, + modified_tol=1e-12, + random_state=None, + n_jobs=None, +): + nbrs = NearestNeighbors(n_neighbors=n_neighbors + 1, n_jobs=n_jobs) + nbrs.fit(X) + X = nbrs._fit_X + + N, d_in = X.shape + + if n_components > d_in: + raise ValueError( + "output dimension must be less than or equal to input dimension" + ) + if n_neighbors >= N: + raise ValueError( + "Expected n_neighbors < n_samples, but n_samples = %d, n_neighbors = %d" + % (N, n_neighbors) + ) + + M_sparse = eigen_solver != "dense" + M_container_constructor = lil_matrix if M_sparse else np.zeros + + if method == "standard": + W = barycenter_kneighbors_graph( + nbrs, n_neighbors=n_neighbors, reg=reg, n_jobs=n_jobs + ) + + # we'll compute M = (I-W)'(I-W) + # depending on the solver, we'll do this differently + if M_sparse: + M = eye(*W.shape, format=W.format) - W + M = M.T @ M + else: + M = (W.T @ W - W.T - W).toarray() + M.flat[:: M.shape[0] + 1] += 1 # M = W' W - W' - W + I + + elif method == "hessian": + dp = n_components * (n_components + 1) // 2 + + if n_neighbors <= n_components + dp: + raise ValueError( + "for method='hessian', n_neighbors must be " + "greater than " + "[n_components * (n_components + 3) / 2]" + ) + + neighbors = nbrs.kneighbors( + X, n_neighbors=n_neighbors + 1, return_distance=False + ) + neighbors = neighbors[:, 1:] + + Yi = np.empty((n_neighbors, 1 + n_components + dp), dtype=np.float64) + Yi[:, 0] = 1 + + M = M_container_constructor((N, N), dtype=np.float64) + + use_svd = n_neighbors > d_in + + for i in range(N): + Gi = X[neighbors[i]] + Gi -= Gi.mean(0) + + # build Hessian estimator + if use_svd: + U = svd(Gi, full_matrices=0)[0] + else: + Ci = np.dot(Gi, Gi.T) + U = eigh(Ci)[1][:, ::-1] + + Yi[:, 1 : 1 + n_components] = U[:, :n_components] + + j = 1 + n_components + for k in range(n_components): + Yi[:, j : j + n_components - k] = U[:, k : k + 1] * U[:, k:n_components] + j += n_components - k + + Q, R = qr(Yi) + + w = Q[:, n_components + 1 :] + S = w.sum(0) + + S[np.where(abs(S) < hessian_tol)] = 1 + w /= S + + nbrs_x, nbrs_y = np.meshgrid(neighbors[i], neighbors[i]) + M[nbrs_x, nbrs_y] += np.dot(w, w.T) + + elif method == "modified": + if n_neighbors < n_components: + raise ValueError("modified LLE requires n_neighbors >= n_components") + + neighbors = nbrs.kneighbors( + X, n_neighbors=n_neighbors + 1, return_distance=False + ) + neighbors = neighbors[:, 1:] + + # find the eigenvectors and eigenvalues of each local covariance + # matrix. 
We want V[i] to be a [n_neighbors x n_neighbors] matrix, + # where the columns are eigenvectors + V = np.zeros((N, n_neighbors, n_neighbors)) + nev = min(d_in, n_neighbors) + evals = np.zeros([N, nev]) + + # choose the most efficient way to find the eigenvectors + use_svd = n_neighbors > d_in + + if use_svd: + for i in range(N): + X_nbrs = X[neighbors[i]] - X[i] + V[i], evals[i], _ = svd(X_nbrs, full_matrices=True) + evals **= 2 + else: + for i in range(N): + X_nbrs = X[neighbors[i]] - X[i] + C_nbrs = np.dot(X_nbrs, X_nbrs.T) + evi, vi = eigh(C_nbrs) + evals[i] = evi[::-1] + V[i] = vi[:, ::-1] + + # find regularized weights: this is like normal LLE. + # because we've already computed the SVD of each covariance matrix, + # it's faster to use this rather than np.linalg.solve + reg = 1e-3 * evals.sum(1) + + tmp = np.dot(V.transpose(0, 2, 1), np.ones(n_neighbors)) + tmp[:, :nev] /= evals + reg[:, None] + tmp[:, nev:] /= reg[:, None] + + w_reg = np.zeros((N, n_neighbors)) + for i in range(N): + w_reg[i] = np.dot(V[i], tmp[i]) + w_reg /= w_reg.sum(1)[:, None] + + # calculate eta: the median of the ratio of small to large eigenvalues + # across the points. This is used to determine s_i, below + rho = evals[:, n_components:].sum(1) / evals[:, :n_components].sum(1) + eta = np.median(rho) + + # find s_i, the size of the "almost null space" for each point: + # this is the size of the largest set of eigenvalues + # such that Sum[v; v in set]/Sum[v; v not in set] < eta + s_range = np.zeros(N, dtype=int) + evals_cumsum = stable_cumsum(evals, 1) + eta_range = evals_cumsum[:, -1:] / evals_cumsum[:, :-1] - 1 + for i in range(N): + s_range[i] = np.searchsorted(eta_range[i, ::-1], eta) + s_range += n_neighbors - nev # number of zero eigenvalues + + # Now calculate M. 
+ # This is the [N x N] matrix whose null space is the desired embedding + M = M_container_constructor((N, N), dtype=np.float64) + + for i in range(N): + s_i = s_range[i] + + # select bottom s_i eigenvectors and calculate alpha + Vi = V[i, :, n_neighbors - s_i :] + alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i) + + # compute Householder matrix which satisfies + # Hi*Vi.T*ones(n_neighbors) = alpha_i*ones(s) + # using prescription from paper + h = np.full(s_i, alpha_i) - np.dot(Vi.T, np.ones(n_neighbors)) + + norm_h = np.linalg.norm(h) + if norm_h < modified_tol: + h *= 0 + else: + h /= norm_h + + # Householder matrix is + # >> Hi = np.identity(s_i) - 2*np.outer(h,h) + # Then the weight matrix is + # >> Wi = np.dot(Vi,Hi) + (1-alpha_i) * w_reg[i,:,None] + # We do this much more efficiently: + Wi = Vi - 2 * np.outer(np.dot(Vi, h), h) + (1 - alpha_i) * w_reg[i, :, None] + + # Update M as follows: + # >> W_hat = np.zeros( (N,s_i) ) + # >> W_hat[neighbors[i],:] = Wi + # >> W_hat[i] -= 1 + # >> M += np.dot(W_hat,W_hat.T) + # We can do this much more efficiently: + nbrs_x, nbrs_y = np.meshgrid(neighbors[i], neighbors[i]) + M[nbrs_x, nbrs_y] += np.dot(Wi, Wi.T) + Wi_sum1 = Wi.sum(1) + M[i, neighbors[i]] -= Wi_sum1 + M[neighbors[i], [i]] -= Wi_sum1 + M[i, i] += s_i + + elif method == "ltsa": + neighbors = nbrs.kneighbors( + X, n_neighbors=n_neighbors + 1, return_distance=False + ) + neighbors = neighbors[:, 1:] + + M = M_container_constructor((N, N), dtype=np.float64) + + use_svd = n_neighbors > d_in + + for i in range(N): + Xi = X[neighbors[i]] + Xi -= Xi.mean(0) + + # compute n_components largest eigenvalues of Xi @ Xi^T + if use_svd: + v = svd(Xi, full_matrices=True)[0] + else: + Ci = np.dot(Xi, Xi.T) + v = eigh(Ci)[1][:, ::-1] + + Gi = np.zeros((n_neighbors, n_components + 1)) + Gi[:, 1:] = v[:, :n_components] + Gi[:, 0] = 1.0 / np.sqrt(n_neighbors) + + GiGiT = np.dot(Gi, Gi.T) + + nbrs_x, nbrs_y = np.meshgrid(neighbors[i], neighbors[i]) + M[nbrs_x, nbrs_y] -= GiGiT + + M[neighbors[i], neighbors[i]] += np.ones(shape=n_neighbors) + + if M_sparse: + M = M.tocsr() + + return null_space( + M, + n_components, + k_skip=1, + eigen_solver=eigen_solver, + tol=tol, + max_iter=max_iter, + random_state=random_state, + ) + + +@validate_params( + { + "X": ["array-like", NearestNeighbors], + "n_neighbors": [Interval(Integral, 1, None, closed="left")], + "n_components": [Interval(Integral, 1, None, closed="left")], + "reg": [Interval(Real, 0, None, closed="left")], + "eigen_solver": [StrOptions({"auto", "arpack", "dense"})], + "tol": [Interval(Real, 0, None, closed="left")], + "max_iter": [Interval(Integral, 1, None, closed="left")], + "method": [StrOptions({"standard", "hessian", "modified", "ltsa"})], + "hessian_tol": [Interval(Real, 0, None, closed="left")], + "modified_tol": [Interval(Real, 0, None, closed="left")], + "random_state": ["random_state"], + "n_jobs": [None, Integral], + }, + prefer_skip_nested_validation=True, +) +def locally_linear_embedding( + X, + *, + n_neighbors, + n_components, + reg=1e-3, + eigen_solver="auto", + tol=1e-6, + max_iter=100, + method="standard", + hessian_tol=1e-4, + modified_tol=1e-12, + random_state=None, + n_jobs=None, +): + """Perform a Locally Linear Embedding analysis on the data. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, NearestNeighbors} + Sample data, shape = (n_samples, n_features), in the form of a + numpy array or a NearestNeighbors object. + + n_neighbors : int + Number of neighbors to consider for each point. 
+ + n_components : int + Number of coordinates for the manifold. + + reg : float, default=1e-3 + Regularization constant, multiplies the trace of the local covariance + matrix of the distances. + + eigen_solver : {'auto', 'arpack', 'dense'}, default='auto' + auto : algorithm will attempt to choose the best method for input data + + arpack : use arnoldi iteration in shift-invert mode. + For this method, M may be a dense matrix, sparse matrix, + or general linear operator. + Warning: ARPACK can be unstable for some problems. It is + best to try several random seeds in order to check results. + + dense : use standard dense matrix operations for the eigenvalue + decomposition. For this method, M must be an array + or matrix type. This method should be avoided for + large problems. + + tol : float, default=1e-6 + Tolerance for 'arpack' method + Not used if eigen_solver=='dense'. + + max_iter : int, default=100 + Maximum number of iterations for the arpack solver. + + method : {'standard', 'hessian', 'modified', 'ltsa'}, default='standard' + standard : use the standard locally linear embedding algorithm. + see reference [1]_ + hessian : use the Hessian eigenmap method. This method requires + n_neighbors > n_components * (1 + (n_components + 1) / 2. + see reference [2]_ + modified : use the modified locally linear embedding algorithm. + see reference [3]_ + ltsa : use local tangent space alignment algorithm + see reference [4]_ + + hessian_tol : float, default=1e-4 + Tolerance for Hessian eigenmapping method. + Only used if method == 'hessian'. + + modified_tol : float, default=1e-12 + Tolerance for modified LLE method. + Only used if method == 'modified'. + + random_state : int, RandomState instance, default=None + Determines the random number generator when ``solver`` == 'arpack'. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + n_jobs : int or None, default=None + The number of parallel jobs to run for neighbors search. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Returns + ------- + Y : ndarray of shape (n_samples, n_components) + Embedding vectors. + + squared_error : float + Reconstruction error for the embedding vectors. Equivalent to + ``norm(Y - W Y, 'fro')**2``, where W are the reconstruction weights. + + References + ---------- + + .. [1] Roweis, S. & Saul, L. Nonlinear dimensionality reduction + by locally linear embedding. Science 290:2323 (2000). + .. [2] Donoho, D. & Grimes, C. Hessian eigenmaps: Locally + linear embedding techniques for high-dimensional data. + Proc Natl Acad Sci U S A. 100:5591 (2003). + .. [3] `Zhang, Z. & Wang, J. MLLE: Modified Locally Linear + Embedding Using Multiple Weights. + `_ + .. [4] Zhang, Z. & Zha, H. Principal manifolds and nonlinear + dimensionality reduction via tangent space alignment. + Journal of Shanghai Univ. 
8:406 (2004) + + Examples + -------- + >>> from sklearn.datasets import load_digits + >>> from sklearn.manifold import locally_linear_embedding + >>> X, _ = load_digits(return_X_y=True) + >>> X.shape + (1797, 64) + >>> embedding, _ = locally_linear_embedding(X[:100],n_neighbors=5, n_components=2) + >>> embedding.shape + (100, 2) + """ + return _locally_linear_embedding( + X=X, + n_neighbors=n_neighbors, + n_components=n_components, + reg=reg, + eigen_solver=eigen_solver, + tol=tol, + max_iter=max_iter, + method=method, + hessian_tol=hessian_tol, + modified_tol=modified_tol, + random_state=random_state, + n_jobs=n_jobs, + ) + + +class LocallyLinearEmbedding( + ClassNamePrefixFeaturesOutMixin, + TransformerMixin, + _UnstableArchMixin, + BaseEstimator, +): + """Locally Linear Embedding. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_neighbors : int, default=5 + Number of neighbors to consider for each point. + + n_components : int, default=2 + Number of coordinates for the manifold. + + reg : float, default=1e-3 + Regularization constant, multiplies the trace of the local covariance + matrix of the distances. + + eigen_solver : {'auto', 'arpack', 'dense'}, default='auto' + The solver used to compute the eigenvectors. The available options are: + + - `'auto'` : algorithm will attempt to choose the best method for input + data. + - `'arpack'` : use arnoldi iteration in shift-invert mode. For this + method, M may be a dense matrix, sparse matrix, or general linear + operator. + - `'dense'` : use standard dense matrix operations for the eigenvalue + decomposition. For this method, M must be an array or matrix type. + This method should be avoided for large problems. + + .. warning:: + ARPACK can be unstable for some problems. It is best to try several + random seeds in order to check results. + + tol : float, default=1e-6 + Tolerance for 'arpack' method + Not used if eigen_solver=='dense'. + + max_iter : int, default=100 + Maximum number of iterations for the arpack solver. + Not used if eigen_solver=='dense'. + + method : {'standard', 'hessian', 'modified', 'ltsa'}, default='standard' + - `standard`: use the standard locally linear embedding algorithm. see + reference [1]_ + - `hessian`: use the Hessian eigenmap method. This method requires + ``n_neighbors > n_components * (1 + (n_components + 1) / 2``. see + reference [2]_ + - `modified`: use the modified locally linear embedding algorithm. + see reference [3]_ + - `ltsa`: use local tangent space alignment algorithm. see + reference [4]_ + + hessian_tol : float, default=1e-4 + Tolerance for Hessian eigenmapping method. + Only used if ``method == 'hessian'``. + + modified_tol : float, default=1e-12 + Tolerance for modified LLE method. + Only used if ``method == 'modified'``. + + neighbors_algorithm : {'auto', 'brute', 'kd_tree', 'ball_tree'}, \ + default='auto' + Algorithm to use for nearest neighbors search, passed to + :class:`~sklearn.neighbors.NearestNeighbors` instance. + + random_state : int, RandomState instance, default=None + Determines the random number generator when + ``eigen_solver`` == 'arpack'. Pass an int for reproducible results + across multiple function calls. See :term:`Glossary `. + + n_jobs : int or None, default=None + The number of parallel jobs to run. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. 
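    As an editor's aside (not part of the upstream docstring), a compact sketch of
    selecting one of the variants listed under ``method`` above; the dataset is only
    illustrative:

    >>> from sklearn.datasets import load_digits
    >>> from sklearn.manifold import LocallyLinearEmbedding
    >>> X, _ = load_digits(return_X_y=True)
    >>> lle = LocallyLinearEmbedding(n_neighbors=10, n_components=2, method="hessian")
    >>> lle.fit_transform(X[:100]).shape
    (100, 2)

    Here ``n_neighbors=10`` satisfies the Hessian requirement
    ``n_neighbors > n_components * (n_components + 3) / 2 = 5``.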
+ + Attributes + ---------- + embedding_ : array-like, shape [n_samples, n_components] + Stores the embedding vectors + + reconstruction_error_ : float + Reconstruction error associated with `embedding_` + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + nbrs_ : NearestNeighbors object + Stores nearest neighbors instance, including BallTree or KDtree + if applicable. + + See Also + -------- + SpectralEmbedding : Spectral embedding for non-linear dimensionality + reduction. + TSNE : Distributed Stochastic Neighbor Embedding. + + References + ---------- + + .. [1] Roweis, S. & Saul, L. Nonlinear dimensionality reduction + by locally linear embedding. Science 290:2323 (2000). + .. [2] Donoho, D. & Grimes, C. Hessian eigenmaps: Locally + linear embedding techniques for high-dimensional data. + Proc Natl Acad Sci U S A. 100:5591 (2003). + .. [3] `Zhang, Z. & Wang, J. MLLE: Modified Locally Linear + Embedding Using Multiple Weights. + `_ + .. [4] Zhang, Z. & Zha, H. Principal manifolds and nonlinear + dimensionality reduction via tangent space alignment. + Journal of Shanghai Univ. 8:406 (2004) + + Examples + -------- + >>> from sklearn.datasets import load_digits + >>> from sklearn.manifold import LocallyLinearEmbedding + >>> X, _ = load_digits(return_X_y=True) + >>> X.shape + (1797, 64) + >>> embedding = LocallyLinearEmbedding(n_components=2) + >>> X_transformed = embedding.fit_transform(X[:100]) + >>> X_transformed.shape + (100, 2) + """ + + _parameter_constraints: dict = { + "n_neighbors": [Interval(Integral, 1, None, closed="left")], + "n_components": [Interval(Integral, 1, None, closed="left")], + "reg": [Interval(Real, 0, None, closed="left")], + "eigen_solver": [StrOptions({"auto", "arpack", "dense"})], + "tol": [Interval(Real, 0, None, closed="left")], + "max_iter": [Interval(Integral, 1, None, closed="left")], + "method": [StrOptions({"standard", "hessian", "modified", "ltsa"})], + "hessian_tol": [Interval(Real, 0, None, closed="left")], + "modified_tol": [Interval(Real, 0, None, closed="left")], + "neighbors_algorithm": [StrOptions({"auto", "brute", "kd_tree", "ball_tree"})], + "random_state": ["random_state"], + "n_jobs": [None, Integral], + } + + def __init__( + self, + *, + n_neighbors=5, + n_components=2, + reg=1e-3, + eigen_solver="auto", + tol=1e-6, + max_iter=100, + method="standard", + hessian_tol=1e-4, + modified_tol=1e-12, + neighbors_algorithm="auto", + random_state=None, + n_jobs=None, + ): + self.n_neighbors = n_neighbors + self.n_components = n_components + self.reg = reg + self.eigen_solver = eigen_solver + self.tol = tol + self.max_iter = max_iter + self.method = method + self.hessian_tol = hessian_tol + self.modified_tol = modified_tol + self.random_state = random_state + self.neighbors_algorithm = neighbors_algorithm + self.n_jobs = n_jobs + + def _fit_transform(self, X): + self.nbrs_ = NearestNeighbors( + n_neighbors=self.n_neighbors, + algorithm=self.neighbors_algorithm, + n_jobs=self.n_jobs, + ) + + random_state = check_random_state(self.random_state) + X = validate_data(self, X, dtype=float) + self.nbrs_.fit(X) + self.embedding_, self.reconstruction_error_ = _locally_linear_embedding( + X=self.nbrs_, + n_neighbors=self.n_neighbors, + n_components=self.n_components, + eigen_solver=self.eigen_solver, + tol=self.tol, + 
max_iter=self.max_iter, + method=self.method, + hessian_tol=self.hessian_tol, + modified_tol=self.modified_tol, + random_state=random_state, + reg=self.reg, + n_jobs=self.n_jobs, + ) + self._n_features_out = self.embedding_.shape[1] + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Compute the embedding vectors for data X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training set. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + self : object + Fitted `LocallyLinearEmbedding` class instance. + """ + self._fit_transform(X) + return self + + @_fit_context(prefer_skip_nested_validation=True) + def fit_transform(self, X, y=None): + """Compute the embedding vectors for data X and transform X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training set. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + X_new : array-like, shape (n_samples, n_components) + Returns the instance itself. + """ + self._fit_transform(X) + return self.embedding_ + + def transform(self, X): + """ + Transform new points into embedding space. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training set. + + Returns + ------- + X_new : ndarray of shape (n_samples, n_components) + Returns the instance itself. + + Notes + ----- + Because of scaling performed by this method, it is discouraged to use + it together with methods that are not scale-invariant (like SVMs). + """ + check_is_fitted(self) + + X = validate_data(self, X, reset=False) + ind = self.nbrs_.kneighbors( + X, n_neighbors=self.n_neighbors, return_distance=False + ) + weights = barycenter_weights(X, self.nbrs_._fit_X, ind, reg=self.reg) + X_new = np.empty((X.shape[0], self.n_components)) + for i in range(X.shape[0]): + X_new[i] = np.dot(self.embedding_[ind[i]].T, weights[i]) + return X_new diff --git a/.venv/lib/python3.12/site-packages/sklearn/manifold/_mds.py b/.venv/lib/python3.12/site-packages/sklearn/manifold/_mds.py new file mode 100644 index 0000000000000000000000000000000000000000..6c31c72f7ef59e782be2476971e28b7f487dd644 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/manifold/_mds.py @@ -0,0 +1,714 @@ +""" +Multi-dimensional Scaling (MDS). +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from numbers import Integral, Real + +import numpy as np +from joblib import effective_n_jobs + +from ..base import BaseEstimator, _fit_context +from ..isotonic import IsotonicRegression +from ..metrics import euclidean_distances +from ..utils import check_array, check_random_state, check_symmetric +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.parallel import Parallel, delayed +from ..utils.validation import validate_data + + +def _smacof_single( + dissimilarities, + metric=True, + n_components=2, + init=None, + max_iter=300, + verbose=0, + eps=1e-6, + random_state=None, + normalized_stress=False, +): + """Computes multidimensional scaling using SMACOF algorithm. + + Parameters + ---------- + dissimilarities : ndarray of shape (n_samples, n_samples) + Pairwise dissimilarities between the points. Must be symmetric. + + metric : bool, default=True + Compute metric or nonmetric SMACOF algorithm. + When ``False`` (i.e. non-metric MDS), dissimilarities with 0 are considered as + missing values. 
+ + n_components : int, default=2 + Number of dimensions in which to immerse the dissimilarities. If an + ``init`` array is provided, this option is overridden and the shape of + ``init`` is used to determine the dimensionality of the embedding + space. + + init : ndarray of shape (n_samples, n_components), default=None + Starting configuration of the embedding to initialize the algorithm. By + default, the algorithm is initialized with a randomly chosen array. + + max_iter : int, default=300 + Maximum number of iterations of the SMACOF algorithm for a single run. + + verbose : int, default=0 + Level of verbosity. + + eps : float, default=1e-6 + The tolerance with respect to stress (normalized by the sum of squared + embedding distances) at which to declare convergence. + + .. versionchanged:: 1.7 + The default value for `eps` has changed from 1e-3 to 1e-6, as a result + of a bugfix in the computation of the convergence criterion. + + random_state : int, RandomState instance or None, default=None + Determines the random number generator used to initialize the centers. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + normalized_stress : bool, default=False + Whether to return normalized stress value (Stress-1) instead of raw + stress. + + .. versionadded:: 1.2 + + .. versionchanged:: 1.7 + Normalized stress is now supported for metric MDS as well. + + Returns + ------- + X : ndarray of shape (n_samples, n_components) + Coordinates of the points in a ``n_components``-space. + + stress : float + The final value of the stress (sum of squared distance of the + disparities and the distances for all constrained points). + If `normalized_stress=True`, returns Stress-1. + A value of 0 indicates "perfect" fit, 0.025 excellent, 0.05 good, + 0.1 fair, and 0.2 poor [1]_. + + n_iter : int + The number of iterations corresponding to the best stress. + + References + ---------- + .. [1] "Nonmetric multidimensional scaling: a numerical method" Kruskal, J. + Psychometrika, 29 (1964) + + .. [2] "Multidimensional scaling by optimizing goodness of fit to a nonmetric + hypothesis" Kruskal, J. Psychometrika, 29, (1964) + + .. [3] "Modern Multidimensional Scaling - Theory and Applications" Borg, I.; + Groenen P. Springer Series in Statistics (1997) + """ + dissimilarities = check_symmetric(dissimilarities, raise_exception=True) + + n_samples = dissimilarities.shape[0] + random_state = check_random_state(random_state) + + dissimilarities_flat = ((1 - np.tri(n_samples)) * dissimilarities).ravel() + dissimilarities_flat_w = dissimilarities_flat[dissimilarities_flat != 0] + if init is None: + # Randomly choose initial configuration + X = random_state.uniform(size=n_samples * n_components) + X = X.reshape((n_samples, n_components)) + else: + # overrides the parameter p + n_components = init.shape[1] + if n_samples != init.shape[0]: + raise ValueError( + "init matrix should be of shape (%d, %d)" % (n_samples, n_components) + ) + X = init + distances = euclidean_distances(X) + + # Out of bounds condition cannot happen because we are transforming + # the training set here, but does sometimes get triggered in + # practice due to machine precision issues. Hence "clip". 
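    # Editor's note: a tiny, hypothetical illustration of the "clip" behaviour
    # (comment only, not executed; the numbers are made up):
    # >> IsotonicRegression(out_of_bounds="clip").fit([1., 2., 3.], [1., 2., 3.]).predict([0., 4.])
    # >> # -> array([1., 3.]): queries outside the fitted range are clipped to the boundary values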
+ ir = IsotonicRegression(out_of_bounds="clip") + + old_stress = None + for it in range(max_iter): + # Compute distance and monotonic regression + if metric: + disparities = dissimilarities + else: + distances_flat = distances.ravel() + # dissimilarities with 0 are considered as missing values + distances_flat_w = distances_flat[dissimilarities_flat != 0] + + # Compute the disparities using isotonic regression. + # For the first SMACOF iteration, use scaled original dissimilarities. + # (This choice follows the R implementation described in this paper: + # https://www.jstatsoft.org/article/view/v102i10) + if it < 1: + disparities_flat = dissimilarities_flat_w + else: + disparities_flat = ir.fit_transform( + dissimilarities_flat_w, distances_flat_w + ) + disparities = np.zeros_like(distances_flat) + disparities[dissimilarities_flat != 0] = disparities_flat + disparities = disparities.reshape((n_samples, n_samples)) + disparities *= np.sqrt( + (n_samples * (n_samples - 1) / 2) / (disparities**2).sum() + ) + disparities = disparities + disparities.T + + # Update X using the Guttman transform + distances[distances == 0] = 1e-5 + ratio = disparities / distances + B = -ratio + B[np.arange(len(B)), np.arange(len(B))] += ratio.sum(axis=1) + X = 1.0 / n_samples * np.dot(B, X) + + # Compute stress + distances = euclidean_distances(X) + stress = ((distances.ravel() - disparities.ravel()) ** 2).sum() / 2 + + if verbose >= 2: # pragma: no cover + print(f"Iteration {it}, stress {stress:.4f}") + if old_stress is not None: + sum_squared_distances = (distances.ravel() ** 2).sum() + if ((old_stress - stress) / (sum_squared_distances / 2)) < eps: + if verbose: # pragma: no cover + print("Convergence criterion reached.") + break + old_stress = stress + + if normalized_stress: + sum_squared_distances = (distances.ravel() ** 2).sum() + stress = np.sqrt(stress / (sum_squared_distances / 2)) + + return X, stress, it + 1 + + +# TODO(1.9): change default `n_init` to 1, see PR #31117 +@validate_params( + { + "dissimilarities": ["array-like"], + "metric": ["boolean"], + "n_components": [Interval(Integral, 1, None, closed="left")], + "init": ["array-like", None], + "n_init": [Interval(Integral, 1, None, closed="left"), StrOptions({"warn"})], + "n_jobs": [Integral, None], + "max_iter": [Interval(Integral, 1, None, closed="left")], + "verbose": ["verbose"], + "eps": [Interval(Real, 0, None, closed="left")], + "random_state": ["random_state"], + "return_n_iter": ["boolean"], + "normalized_stress": ["boolean", StrOptions({"auto"})], + }, + prefer_skip_nested_validation=True, +) +def smacof( + dissimilarities, + *, + metric=True, + n_components=2, + init=None, + n_init="warn", + n_jobs=None, + max_iter=300, + verbose=0, + eps=1e-6, + random_state=None, + return_n_iter=False, + normalized_stress="auto", +): + """Compute multidimensional scaling using the SMACOF algorithm. + + The SMACOF (Scaling by MAjorizing a COmplicated Function) algorithm is a + multidimensional scaling algorithm which minimizes an objective function + (the *stress*) using a majorization technique. Stress majorization, also + known as the Guttman Transform, guarantees a monotone convergence of + stress, and is more powerful than traditional techniques such as gradient + descent. + + The SMACOF algorithm for metric MDS can be summarized by the following + steps: + + 1. Set an initial start configuration, randomly or not. + 2. Compute the stress + 3. Compute the Guttman Transform + 4. Iterate 2 and 3 until convergence. 
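    As an editor's illustration of step 3 (one Guttman-transform update written in
    plain NumPy, mirroring the update inside ``_smacof_single``; the data and variable
    names are made up):

    >>> import numpy as np
    >>> from sklearn.metrics import euclidean_distances
    >>> rng = np.random.RandomState(0)
    >>> disparities = euclidean_distances(rng.rand(4, 3))  # stand-in for the disparities
    >>> X = rng.rand(4, 2)                                  # current configuration
    >>> distances = euclidean_distances(X)
    >>> distances[distances == 0] = 1e-5
    >>> ratio = disparities / distances
    >>> B = -ratio
    >>> B[np.arange(4), np.arange(4)] += ratio.sum(axis=1)
    >>> (np.dot(B, X) / 4).shape                            # updated configuration, n_samples = 4
    (4, 2)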
+ + The nonmetric algorithm adds a monotonic regression step before computing + the stress. + + Parameters + ---------- + dissimilarities : array-like of shape (n_samples, n_samples) + Pairwise dissimilarities between the points. Must be symmetric. + + metric : bool, default=True + Compute metric or nonmetric SMACOF algorithm. + When ``False`` (i.e. non-metric MDS), dissimilarities with 0 are considered as + missing values. + + n_components : int, default=2 + Number of dimensions in which to immerse the dissimilarities. If an + ``init`` array is provided, this option is overridden and the shape of + ``init`` is used to determine the dimensionality of the embedding + space. + + init : array-like of shape (n_samples, n_components), default=None + Starting configuration of the embedding to initialize the algorithm. By + default, the algorithm is initialized with a randomly chosen array. + + n_init : int, default=8 + Number of times the SMACOF algorithm will be run with different + initializations. The final results will be the best output of the runs, + determined by the run with the smallest final stress. If ``init`` is + provided, this option is overridden and a single run is performed. + + .. versionchanged:: 1.9 + The default value for `n_iter` will change from 8 to 1 in version 1.9. + + n_jobs : int, default=None + The number of jobs to use for the computation. If multiple + initializations are used (``n_init``), each run of the algorithm is + computed in parallel. + + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + max_iter : int, default=300 + Maximum number of iterations of the SMACOF algorithm for a single run. + + verbose : int, default=0 + Level of verbosity. + + eps : float, default=1e-6 + The tolerance with respect to stress (normalized by the sum of squared + embedding distances) at which to declare convergence. + + .. versionchanged:: 1.7 + The default value for `eps` has changed from 1e-3 to 1e-6, as a result + of a bugfix in the computation of the convergence criterion. + + random_state : int, RandomState instance or None, default=None + Determines the random number generator used to initialize the centers. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + return_n_iter : bool, default=False + Whether or not to return the number of iterations. + + normalized_stress : bool or "auto", default="auto" + Whether to return normalized stress value (Stress-1) instead of raw + stress. By default, metric MDS returns raw stress while non-metric MDS + returns normalized stress. + + .. versionadded:: 1.2 + + .. versionchanged:: 1.4 + The default value changed from `False` to `"auto"` in version 1.4. + + .. versionchanged:: 1.7 + Normalized stress is now supported for metric MDS as well. + + Returns + ------- + X : ndarray of shape (n_samples, n_components) + Coordinates of the points in a ``n_components``-space. + + stress : float + The final value of the stress (sum of squared distance of the + disparities and the distances for all constrained points). + If `normalized_stress=True`, returns Stress-1. + A value of 0 indicates "perfect" fit, 0.025 excellent, 0.05 good, + 0.1 fair, and 0.2 poor [1]_. + + n_iter : int + The number of iterations corresponding to the best stress. Returned + only if ``return_n_iter`` is set to ``True``. + + References + ---------- + .. [1] "Nonmetric multidimensional scaling: a numerical method" Kruskal, J. 
+ Psychometrika, 29 (1964) + + .. [2] "Multidimensional scaling by optimizing goodness of fit to a nonmetric + hypothesis" Kruskal, J. Psychometrika, 29, (1964) + + .. [3] "Modern Multidimensional Scaling - Theory and Applications" Borg, I.; + Groenen P. Springer Series in Statistics (1997) + + Examples + -------- + >>> import numpy as np + >>> from sklearn.manifold import smacof + >>> from sklearn.metrics import euclidean_distances + >>> X = np.array([[0, 1, 2], [1, 0, 3], [2, 3, 0]]) + >>> dissimilarities = euclidean_distances(X) + >>> Z, stress = smacof( + ... dissimilarities, n_components=2, n_init=1, eps=1e-6, random_state=42 + ... ) + >>> Z.shape + (3, 2) + >>> np.round(stress, 6).item() + 3.2e-05 + """ + + if n_init == "warn": + warnings.warn( + "The default value of `n_init` will change from 8 to 1 in 1.9.", + FutureWarning, + ) + n_init = 8 + + dissimilarities = check_array(dissimilarities) + random_state = check_random_state(random_state) + + if normalized_stress == "auto": + normalized_stress = not metric + + if hasattr(init, "__array__"): + init = np.asarray(init).copy() + if not n_init == 1: + warnings.warn( + "Explicit initial positions passed: " + "performing only one init of the MDS instead of %d" % n_init + ) + n_init = 1 + + best_pos, best_stress = None, None + + if effective_n_jobs(n_jobs) == 1: + for it in range(n_init): + pos, stress, n_iter_ = _smacof_single( + dissimilarities, + metric=metric, + n_components=n_components, + init=init, + max_iter=max_iter, + verbose=verbose, + eps=eps, + random_state=random_state, + normalized_stress=normalized_stress, + ) + if best_stress is None or stress < best_stress: + best_stress = stress + best_pos = pos.copy() + best_iter = n_iter_ + else: + seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init) + results = Parallel(n_jobs=n_jobs, verbose=max(verbose - 1, 0))( + delayed(_smacof_single)( + dissimilarities, + metric=metric, + n_components=n_components, + init=init, + max_iter=max_iter, + verbose=verbose, + eps=eps, + random_state=seed, + normalized_stress=normalized_stress, + ) + for seed in seeds + ) + positions, stress, n_iters = zip(*results) + best = np.argmin(stress) + best_stress = stress[best] + best_pos = positions[best] + best_iter = n_iters[best] + + if return_n_iter: + return best_pos, best_stress, best_iter + else: + return best_pos, best_stress + + +# TODO(1.9): change default `n_init` to 1, see PR #31117 +class MDS(BaseEstimator): + """Multidimensional scaling. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int, default=2 + Number of dimensions in which to immerse the dissimilarities. + + metric : bool, default=True + If ``True``, perform metric MDS; otherwise, perform nonmetric MDS. + When ``False`` (i.e. non-metric MDS), dissimilarities with 0 are considered as + missing values. + + n_init : int, default=4 + Number of times the SMACOF algorithm will be run with different + initializations. The final results will be the best output of the runs, + determined by the run with the smallest final stress. + + .. versionchanged:: 1.9 + The default value for `n_init` will change from 4 to 1 in version 1.9. + + max_iter : int, default=300 + Maximum number of iterations of the SMACOF algorithm for a single run. + + verbose : int, default=0 + Level of verbosity. + + eps : float, default=1e-6 + The tolerance with respect to stress (normalized by the sum of squared + embedding distances) at which to declare convergence. + + .. 
versionchanged:: 1.7 + The default value for `eps` has changed from 1e-3 to 1e-6, as a result + of a bugfix in the computation of the convergence criterion. + + n_jobs : int, default=None + The number of jobs to use for the computation. If multiple + initializations are used (``n_init``), each run of the algorithm is + computed in parallel. + + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + random_state : int, RandomState instance or None, default=None + Determines the random number generator used to initialize the centers. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + dissimilarity : {'euclidean', 'precomputed'}, default='euclidean' + Dissimilarity measure to use: + + - 'euclidean': + Pairwise Euclidean distances between points in the dataset. + + - 'precomputed': + Pre-computed dissimilarities are passed directly to ``fit`` and + ``fit_transform``. + + normalized_stress : bool or "auto" default="auto" + Whether to return normalized stress value (Stress-1) instead of raw + stress. By default, metric MDS returns raw stress while non-metric MDS + returns normalized stress. + + .. versionadded:: 1.2 + + .. versionchanged:: 1.4 + The default value changed from `False` to `"auto"` in version 1.4. + + .. versionchanged:: 1.7 + Normalized stress is now supported for metric MDS as well. + + Attributes + ---------- + embedding_ : ndarray of shape (n_samples, n_components) + Stores the position of the dataset in the embedding space. + + stress_ : float + The final value of the stress (sum of squared distance of the + disparities and the distances for all constrained points). + If `normalized_stress=True`, returns Stress-1. + A value of 0 indicates "perfect" fit, 0.025 excellent, 0.05 good, + 0.1 fair, and 0.2 poor [1]_. + + dissimilarity_matrix_ : ndarray of shape (n_samples, n_samples) + Pairwise dissimilarities between the points. Symmetric matrix that: + + - either uses a custom dissimilarity matrix by setting `dissimilarity` + to 'precomputed'; + - or constructs a dissimilarity matrix from data using + Euclidean distances. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + The number of iterations corresponding to the best stress. + + See Also + -------- + sklearn.decomposition.PCA : Principal component analysis that is a linear + dimensionality reduction method. + sklearn.decomposition.KernelPCA : Non-linear dimensionality reduction using + kernels and PCA. + TSNE : T-distributed Stochastic Neighbor Embedding. + Isomap : Manifold learning based on Isometric Mapping. + LocallyLinearEmbedding : Manifold learning using Locally Linear Embedding. + SpectralEmbedding : Spectral embedding for non-linear dimensionality. + + References + ---------- + .. [1] "Nonmetric multidimensional scaling: a numerical method" Kruskal, J. + Psychometrika, 29 (1964) + + .. [2] "Multidimensional scaling by optimizing goodness of fit to a nonmetric + hypothesis" Kruskal, J. Psychometrika, 29, (1964) + + .. [3] "Modern Multidimensional Scaling - Theory and Applications" Borg, I.; + Groenen P. 
Springer Series in Statistics (1997) + + Examples + -------- + >>> from sklearn.datasets import load_digits + >>> from sklearn.manifold import MDS + >>> X, _ = load_digits(return_X_y=True) + >>> X.shape + (1797, 64) + >>> embedding = MDS(n_components=2, n_init=1) + >>> X_transformed = embedding.fit_transform(X[:100]) + >>> X_transformed.shape + (100, 2) + + For a more detailed example of usage, see + :ref:`sphx_glr_auto_examples_manifold_plot_mds.py`. + + For a comparison of manifold learning techniques, see + :ref:`sphx_glr_auto_examples_manifold_plot_compare_methods.py`. + """ + + _parameter_constraints: dict = { + "n_components": [Interval(Integral, 1, None, closed="left")], + "metric": ["boolean"], + "n_init": [Interval(Integral, 1, None, closed="left"), StrOptions({"warn"})], + "max_iter": [Interval(Integral, 1, None, closed="left")], + "verbose": ["verbose"], + "eps": [Interval(Real, 0.0, None, closed="left")], + "n_jobs": [None, Integral], + "random_state": ["random_state"], + "dissimilarity": [StrOptions({"euclidean", "precomputed"})], + "normalized_stress": ["boolean", StrOptions({"auto"})], + } + + def __init__( + self, + n_components=2, + *, + metric=True, + n_init="warn", + max_iter=300, + verbose=0, + eps=1e-6, + n_jobs=None, + random_state=None, + dissimilarity="euclidean", + normalized_stress="auto", + ): + self.n_components = n_components + self.dissimilarity = dissimilarity + self.metric = metric + self.n_init = n_init + self.max_iter = max_iter + self.eps = eps + self.verbose = verbose + self.n_jobs = n_jobs + self.random_state = random_state + self.normalized_stress = normalized_stress + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.pairwise = self.dissimilarity == "precomputed" + return tags + + def fit(self, X, y=None, init=None): + """ + Compute the position of the points in the embedding space. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) or \ + (n_samples, n_samples) + Input data. If ``dissimilarity=='precomputed'``, the input should + be the dissimilarity matrix. + + y : Ignored + Not used, present for API consistency by convention. + + init : ndarray of shape (n_samples, n_components), default=None + Starting configuration of the embedding to initialize the SMACOF + algorithm. By default, the algorithm is initialized with a randomly + chosen array. + + Returns + ------- + self : object + Fitted estimator. + """ + self.fit_transform(X, init=init) + return self + + @_fit_context(prefer_skip_nested_validation=True) + def fit_transform(self, X, y=None, init=None): + """ + Fit the data from `X`, and returns the embedded coordinates. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) or \ + (n_samples, n_samples) + Input data. If ``dissimilarity=='precomputed'``, the input should + be the dissimilarity matrix. + + y : Ignored + Not used, present for API consistency by convention. + + init : ndarray of shape (n_samples, n_components), default=None + Starting configuration of the embedding to initialize the SMACOF + algorithm. By default, the algorithm is initialized with a randomly + chosen array. + + Returns + ------- + X_new : ndarray of shape (n_samples, n_components) + X transformed in the new space. 
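        A minimal sketch (editor's illustration, not part of the upstream docstring) of
        the ``dissimilarity='precomputed'`` path mentioned above; the data are made up:

        >>> import numpy as np
        >>> from sklearn.metrics import euclidean_distances
        >>> from sklearn.manifold import MDS
        >>> D = euclidean_distances(np.array([[0., 1.], [1., 0.], [2., 2.]]))
        >>> mds = MDS(n_components=2, dissimilarity="precomputed", n_init=1, random_state=0)
        >>> mds.fit_transform(D).shape
        (3, 2)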
+ """ + + if self.n_init == "warn": + warnings.warn( + "The default value of `n_init` will change from 4 to 1 in 1.9.", + FutureWarning, + ) + self._n_init = 4 + else: + self._n_init = self.n_init + + X = validate_data(self, X) + if X.shape[0] == X.shape[1] and self.dissimilarity != "precomputed": + warnings.warn( + "The MDS API has changed. ``fit`` now constructs a" + " dissimilarity matrix from data. To use a custom " + "dissimilarity matrix, set " + "``dissimilarity='precomputed'``." + ) + + if self.dissimilarity == "precomputed": + self.dissimilarity_matrix_ = X + elif self.dissimilarity == "euclidean": + self.dissimilarity_matrix_ = euclidean_distances(X) + + self.embedding_, self.stress_, self.n_iter_ = smacof( + self.dissimilarity_matrix_, + metric=self.metric, + n_components=self.n_components, + init=init, + n_init=self._n_init, + n_jobs=self.n_jobs, + max_iter=self.max_iter, + verbose=self.verbose, + eps=self.eps, + random_state=self.random_state, + return_n_iter=True, + normalized_stress=self.normalized_stress, + ) + + return self.embedding_ diff --git a/.venv/lib/python3.12/site-packages/sklearn/manifold/_spectral_embedding.py b/.venv/lib/python3.12/site-packages/sklearn/manifold/_spectral_embedding.py new file mode 100644 index 0000000000000000000000000000000000000000..1a3b95e023897567bd49cc5c0e969a240a1e2afd --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/manifold/_spectral_embedding.py @@ -0,0 +1,776 @@ +"""Spectral Embedding.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from numbers import Integral, Real + +import numpy as np +from scipy import sparse +from scipy.linalg import eigh +from scipy.sparse.csgraph import connected_components +from scipy.sparse.linalg import eigsh, lobpcg + +from ..base import BaseEstimator, _fit_context +from ..metrics.pairwise import rbf_kernel +from ..neighbors import NearestNeighbors, kneighbors_graph +from ..utils import ( + check_array, + check_random_state, + check_symmetric, +) +from ..utils._arpack import _init_arpack_v0 +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.extmath import _deterministic_vector_sign_flip +from ..utils.fixes import laplacian as csgraph_laplacian +from ..utils.fixes import parse_version, sp_version +from ..utils.validation import validate_data + + +def _graph_connected_component(graph, node_id): + """Find the largest graph connected components that contains one + given node. + + Parameters + ---------- + graph : array-like of shape (n_samples, n_samples) + Adjacency matrix of the graph, non-zero weight means an edge + between the nodes. + + node_id : int + The index of the query node of the graph. + + Returns + ------- + connected_components_matrix : array-like of shape (n_samples,) + An array of bool value indicating the indexes of the nodes + belonging to the largest connected components of the given query + node. 
+ """ + n_node = graph.shape[0] + if sparse.issparse(graph): + # speed up row-wise access to boolean connection mask + graph = graph.tocsr() + connected_nodes = np.zeros(n_node, dtype=bool) + nodes_to_explore = np.zeros(n_node, dtype=bool) + nodes_to_explore[node_id] = True + for _ in range(n_node): + last_num_component = connected_nodes.sum() + np.logical_or(connected_nodes, nodes_to_explore, out=connected_nodes) + if last_num_component >= connected_nodes.sum(): + break + indices = np.where(nodes_to_explore)[0] + nodes_to_explore.fill(False) + for i in indices: + if sparse.issparse(graph): + # scipy not yet implemented 1D sparse slices; can be changed back to + # `neighbors = graph[i].toarray().ravel()` once implemented + neighbors = graph[[i], :].toarray().ravel() + else: + neighbors = graph[i] + np.logical_or(nodes_to_explore, neighbors, out=nodes_to_explore) + return connected_nodes + + +def _graph_is_connected(graph): + """Return whether the graph is connected (True) or Not (False). + + Parameters + ---------- + graph : {array-like, sparse matrix} of shape (n_samples, n_samples) + Adjacency matrix of the graph, non-zero weight means an edge + between the nodes. + + Returns + ------- + is_connected : bool + True means the graph is fully connected and False means not. + """ + if sparse.issparse(graph): + # Before Scipy 1.11.3, `connected_components` only supports 32-bit indices. + # PR: https://github.com/scipy/scipy/pull/18913 + # First integration in 1.11.3: https://github.com/scipy/scipy/pull/19279 + # TODO(jjerphan): Once SciPy 1.11.3 is the minimum supported version, use + # `accept_large_sparse=True`. + accept_large_sparse = sp_version >= parse_version("1.11.3") + graph = check_array( + graph, accept_sparse=True, accept_large_sparse=accept_large_sparse + ) + # sparse graph, find all the connected components + n_connected_components, _ = connected_components(graph) + return n_connected_components == 1 + else: + # dense graph, find all connected components start from node 0 + return _graph_connected_component(graph, 0).sum() == graph.shape[0] + + +def _set_diag(laplacian, value, norm_laplacian): + """Set the diagonal of the laplacian matrix and convert it to a + sparse format well suited for eigenvalue decomposition. + + Parameters + ---------- + laplacian : {ndarray, sparse matrix} + The graph laplacian. + + value : float + The value of the diagonal. + + norm_laplacian : bool + Whether the value of the diagonal should be changed or not. + + Returns + ------- + laplacian : {array, sparse matrix} + An array of matrix in a form that is well suited to fast + eigenvalue decomposition, depending on the band width of the + matrix. 
+ """ + n_nodes = laplacian.shape[0] + # We need all entries in the diagonal to values + if not sparse.issparse(laplacian): + if norm_laplacian: + laplacian.flat[:: n_nodes + 1] = value + else: + laplacian = laplacian.tocoo() + if norm_laplacian: + diag_idx = laplacian.row == laplacian.col + laplacian.data[diag_idx] = value + # If the matrix has a small number of diagonals (as in the + # case of structured matrices coming from images), the + # dia format might be best suited for matvec products: + n_diags = np.unique(laplacian.row - laplacian.col).size + if n_diags <= 7: + # 3 or less outer diagonals on each side + laplacian = laplacian.todia() + else: + # csr has the fastest matvec and is thus best suited to + # arpack + laplacian = laplacian.tocsr() + return laplacian + + +@validate_params( + { + "adjacency": ["array-like", "sparse matrix"], + "n_components": [Interval(Integral, 1, None, closed="left")], + "eigen_solver": [StrOptions({"arpack", "lobpcg", "amg"}), None], + "random_state": ["random_state"], + "eigen_tol": [Interval(Real, 0, None, closed="left"), StrOptions({"auto"})], + "norm_laplacian": ["boolean"], + "drop_first": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def spectral_embedding( + adjacency, + *, + n_components=8, + eigen_solver=None, + random_state=None, + eigen_tol="auto", + norm_laplacian=True, + drop_first=True, +): + """Project the sample on the first eigenvectors of the graph Laplacian. + + The adjacency matrix is used to compute a normalized graph Laplacian + whose spectrum (especially the eigenvectors associated to the + smallest eigenvalues) has an interpretation in terms of minimal + number of cuts necessary to split the graph into comparably sized + components. + + This embedding can also 'work' even if the ``adjacency`` variable is + not strictly the adjacency matrix of a graph but more generally + an affinity or similarity matrix between samples (for instance the + heat kernel of a euclidean distance matrix or a k-NN matrix). + + However care must taken to always make the affinity matrix symmetric + so that the eigenvector decomposition works as expected. + + Note : Laplacian Eigenmaps is the actual algorithm implemented here. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + adjacency : {array-like, sparse graph} of shape (n_samples, n_samples) + The adjacency matrix of the graph to embed. + + n_components : int, default=8 + The dimension of the projection subspace. + + eigen_solver : {'arpack', 'lobpcg', 'amg'}, default=None + The eigenvalue decomposition strategy to use. AMG requires pyamg + to be installed. It can be faster on very large, sparse problems, + but may also lead to instabilities. If None, then ``'arpack'`` is + used. + + random_state : int, RandomState instance or None, default=None + A pseudo random number generator used for the initialization + of the lobpcg eigen vectors decomposition when `eigen_solver == + 'amg'`, and for the K-Means initialization. Use an int to make + the results deterministic across calls (See + :term:`Glossary `). + + .. note:: + When using `eigen_solver == 'amg'`, + it is necessary to also fix the global numpy seed with + `np.random.seed(int)` to get deterministic results. See + https://github.com/pyamg/pyamg/issues/139 for further + information. + + eigen_tol : float, default="auto" + Stopping criterion for eigendecomposition of the Laplacian matrix. 
+ If `eigen_tol="auto"` then the passed tolerance will depend on the + `eigen_solver`: + + - If `eigen_solver="arpack"`, then `eigen_tol=0.0`; + - If `eigen_solver="lobpcg"` or `eigen_solver="amg"`, then + `eigen_tol=None` which configures the underlying `lobpcg` solver to + automatically resolve the value according to their heuristics. See, + :func:`scipy.sparse.linalg.lobpcg` for details. + + Note that when using `eigen_solver="amg"` values of `tol<1e-5` may lead + to convergence issues and should be avoided. + + .. versionadded:: 1.2 + Added 'auto' option. + + norm_laplacian : bool, default=True + If True, then compute symmetric normalized Laplacian. + + drop_first : bool, default=True + Whether to drop the first eigenvector. For spectral embedding, this + should be True as the first eigenvector should be constant vector for + connected graph, but for spectral clustering, this should be kept as + False to retain the first eigenvector. + + Returns + ------- + embedding : ndarray of shape (n_samples, n_components) + The reduced samples. + + Notes + ----- + Spectral Embedding (Laplacian Eigenmaps) is most useful when the graph + has one connected component. If there graph has many components, the first + few eigenvectors will simply uncover the connected components of the graph. + + References + ---------- + * https://en.wikipedia.org/wiki/LOBPCG + + * :doi:`"Toward the Optimal Preconditioned Eigensolver: Locally Optimal + Block Preconditioned Conjugate Gradient Method", + Andrew V. Knyazev + <10.1137/S1064827500366124>` + + Examples + -------- + >>> from sklearn.datasets import load_digits + >>> from sklearn.neighbors import kneighbors_graph + >>> from sklearn.manifold import spectral_embedding + >>> X, _ = load_digits(return_X_y=True) + >>> X = X[:100] + >>> affinity_matrix = kneighbors_graph( + ... X, n_neighbors=int(X.shape[0] / 10), include_self=True + ... ) + >>> # make the matrix symmetric + >>> affinity_matrix = 0.5 * (affinity_matrix + affinity_matrix.T) + >>> embedding = spectral_embedding(affinity_matrix, n_components=2, random_state=42) + >>> embedding.shape + (100, 2) + """ + random_state = check_random_state(random_state) + + return _spectral_embedding( + adjacency, + n_components=n_components, + eigen_solver=eigen_solver, + random_state=random_state, + eigen_tol=eigen_tol, + norm_laplacian=norm_laplacian, + drop_first=drop_first, + ) + + +def _spectral_embedding( + adjacency, + *, + n_components=8, + eigen_solver=None, + random_state=None, + eigen_tol="auto", + norm_laplacian=True, + drop_first=True, +): + adjacency = check_symmetric(adjacency) + + if eigen_solver == "amg": + try: + from pyamg import smoothed_aggregation_solver + except ImportError as e: + raise ValueError( + "The eigen_solver was set to 'amg', but pyamg is not available." + ) from e + + if eigen_solver is None: + eigen_solver = "arpack" + + n_nodes = adjacency.shape[0] + # Whether to drop the first eigenvector + if drop_first: + n_components = n_components + 1 + + if not _graph_is_connected(adjacency): + warnings.warn( + "Graph is not fully connected, spectral embedding may not work as expected." 
+ ) + + laplacian, dd = csgraph_laplacian( + adjacency, normed=norm_laplacian, return_diag=True + ) + if eigen_solver == "arpack" or ( + eigen_solver != "lobpcg" + and (not sparse.issparse(laplacian) or n_nodes < 5 * n_components) + ): + # lobpcg used with eigen_solver='amg' has bugs for low number of nodes + # for details see the source code in scipy: + # https://github.com/scipy/scipy/blob/v0.11.0/scipy/sparse/linalg/eigen + # /lobpcg/lobpcg.py#L237 + # or matlab: + # https://www.mathworks.com/matlabcentral/fileexchange/48-lobpcg-m + laplacian = _set_diag(laplacian, 1, norm_laplacian) + + # Here we'll use shift-invert mode for fast eigenvalues + # (see https://docs.scipy.org/doc/scipy/reference/tutorial/arpack.html + # for a short explanation of what this means) + # Because the normalized Laplacian has eigenvalues between 0 and 2, + # I - L has eigenvalues between -1 and 1. ARPACK is most efficient + # when finding eigenvalues of largest magnitude (keyword which='LM') + # and when these eigenvalues are very large compared to the rest. + # For very large, very sparse graphs, I - L can have many, many + # eigenvalues very near 1.0. This leads to slow convergence. So + # instead, we'll use ARPACK's shift-invert mode, asking for the + # eigenvalues near 1.0. This effectively spreads-out the spectrum + # near 1.0 and leads to much faster convergence: potentially an + # orders-of-magnitude speedup over simply using keyword which='LA' + # in standard mode. + try: + # We are computing the opposite of the laplacian inplace so as + # to spare a memory allocation of a possibly very large array + tol = 0 if eigen_tol == "auto" else eigen_tol + laplacian *= -1 + v0 = _init_arpack_v0(laplacian.shape[0], random_state) + laplacian = check_array( + laplacian, accept_sparse="csr", accept_large_sparse=False + ) + _, diffusion_map = eigsh( + laplacian, k=n_components, sigma=1.0, which="LM", tol=tol, v0=v0 + ) + embedding = diffusion_map.T[n_components::-1] + if norm_laplacian: + # recover u = D^-1/2 x from the eigenvector output x + embedding = embedding / dd + except RuntimeError: + # When submatrices are exactly singular, an LU decomposition + # in arpack fails. We fallback to lobpcg + eigen_solver = "lobpcg" + # Revert the laplacian to its opposite to have lobpcg work + laplacian *= -1 + + elif eigen_solver == "amg": + # Use AMG to get a preconditioner and speed up the eigenvalue + # problem. + if not sparse.issparse(laplacian): + warnings.warn("AMG works better for sparse matrices") + laplacian = check_array( + laplacian, dtype=[np.float64, np.float32], accept_sparse=True + ) + laplacian = _set_diag(laplacian, 1, norm_laplacian) + + # The Laplacian matrix is always singular, having at least one zero + # eigenvalue, corresponding to the trivial eigenvector, which is a + # constant. Using a singular matrix for preconditioning may result in + # random failures in LOBPCG and is not supported by the existing + # theory: + # see https://doi.org/10.1007/s10208-015-9297-1 + # Shift the Laplacian so its diagononal is not all ones. The shift + # does change the eigenpairs however, so we'll feed the shifted + # matrix to the solver and afterward set it back to the original. + diag_shift = 1e-5 * sparse.eye(laplacian.shape[0]) + laplacian += diag_shift + if hasattr(sparse, "csr_array") and isinstance(laplacian, sparse.csr_array): + # `pyamg` does not work with `csr_array` and we need to convert it to a + # `csr_matrix` object. 
+ laplacian = sparse.csr_matrix(laplacian) + ml = smoothed_aggregation_solver(check_array(laplacian, accept_sparse="csr")) + laplacian -= diag_shift + + M = ml.aspreconditioner() + # Create initial approximation X to eigenvectors + X = random_state.standard_normal(size=(laplacian.shape[0], n_components + 1)) + X[:, 0] = dd.ravel() + X = X.astype(laplacian.dtype) + + tol = None if eigen_tol == "auto" else eigen_tol + _, diffusion_map = lobpcg(laplacian, X, M=M, tol=tol, largest=False) + embedding = diffusion_map.T + if norm_laplacian: + # recover u = D^-1/2 x from the eigenvector output x + embedding = embedding / dd + if embedding.shape[0] == 1: + raise ValueError + + if eigen_solver == "lobpcg": + laplacian = check_array( + laplacian, dtype=[np.float64, np.float32], accept_sparse=True + ) + if n_nodes < 5 * n_components + 1: + # see note above under arpack why lobpcg has problems with small + # number of nodes + # lobpcg will fallback to eigh, so we short circuit it + if sparse.issparse(laplacian): + laplacian = laplacian.toarray() + _, diffusion_map = eigh(laplacian, check_finite=False) + embedding = diffusion_map.T[:n_components] + if norm_laplacian: + # recover u = D^-1/2 x from the eigenvector output x + embedding = embedding / dd + else: + laplacian = _set_diag(laplacian, 1, norm_laplacian) + # We increase the number of eigenvectors requested, as lobpcg + # doesn't behave well in low dimension and create initial + # approximation X to eigenvectors + X = random_state.standard_normal( + size=(laplacian.shape[0], n_components + 1) + ) + X[:, 0] = dd.ravel() + X = X.astype(laplacian.dtype) + tol = None if eigen_tol == "auto" else eigen_tol + _, diffusion_map = lobpcg( + laplacian, X, tol=tol, largest=False, maxiter=2000 + ) + embedding = diffusion_map.T[:n_components] + if norm_laplacian: + # recover u = D^-1/2 x from the eigenvector output x + embedding = embedding / dd + if embedding.shape[0] == 1: + raise ValueError + + embedding = _deterministic_vector_sign_flip(embedding) + if drop_first: + return embedding[1:n_components].T + else: + return embedding[:n_components].T + + +class SpectralEmbedding(BaseEstimator): + """Spectral embedding for non-linear dimensionality reduction. + + Forms an affinity matrix given by the specified function and + applies spectral decomposition to the corresponding graph laplacian. + The resulting transformation is given by the value of the + eigenvectors for each data point. + + Note : Laplacian Eigenmaps is the actual algorithm implemented here. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int, default=2 + The dimension of the projected subspace. + + affinity : {'nearest_neighbors', 'rbf', 'precomputed', \ + 'precomputed_nearest_neighbors'} or callable, \ + default='nearest_neighbors' + How to construct the affinity matrix. + - 'nearest_neighbors' : construct the affinity matrix by computing a + graph of nearest neighbors. + - 'rbf' : construct the affinity matrix by computing a radial basis + function (RBF) kernel. + - 'precomputed' : interpret ``X`` as a precomputed affinity matrix. + - 'precomputed_nearest_neighbors' : interpret ``X`` as a sparse graph + of precomputed nearest neighbors, and constructs the affinity matrix + by selecting the ``n_neighbors`` nearest neighbors. + - callable : use passed in function as affinity + the function takes in data matrix (n_samples, n_features) + and return affinity matrix (n_samples, n_samples). + + gamma : float, default=None + Kernel coefficient for rbf kernel. 
If None, gamma will be set to + 1/n_features. + + random_state : int, RandomState instance or None, default=None + A pseudo random number generator used for the initialization + of the lobpcg eigen vectors decomposition when `eigen_solver == + 'amg'`, and for the K-Means initialization. Use an int to make + the results deterministic across calls (See + :term:`Glossary `). + + .. note:: + When using `eigen_solver == 'amg'`, + it is necessary to also fix the global numpy seed with + `np.random.seed(int)` to get deterministic results. See + https://github.com/pyamg/pyamg/issues/139 for further + information. + + eigen_solver : {'arpack', 'lobpcg', 'amg'}, default=None + The eigenvalue decomposition strategy to use. AMG requires pyamg + to be installed. It can be faster on very large, sparse problems. + If None, then ``'arpack'`` is used. + + eigen_tol : float, default="auto" + Stopping criterion for eigendecomposition of the Laplacian matrix. + If `eigen_tol="auto"` then the passed tolerance will depend on the + `eigen_solver`: + + - If `eigen_solver="arpack"`, then `eigen_tol=0.0`; + - If `eigen_solver="lobpcg"` or `eigen_solver="amg"`, then + `eigen_tol=None` which configures the underlying `lobpcg` solver to + automatically resolve the value according to their heuristics. See, + :func:`scipy.sparse.linalg.lobpcg` for details. + + Note that when using `eigen_solver="lobpcg"` or `eigen_solver="amg"` + values of `tol<1e-5` may lead to convergence issues and should be + avoided. + + .. versionadded:: 1.2 + + n_neighbors : int, default=None + Number of nearest neighbors for nearest_neighbors graph building. + If None, n_neighbors will be set to max(n_samples/10, 1). + + n_jobs : int, default=None + The number of parallel jobs to run. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Attributes + ---------- + embedding_ : ndarray of shape (n_samples, n_components) + Spectral embedding of the training matrix. + + affinity_matrix_ : ndarray of shape (n_samples, n_samples) + Affinity_matrix constructed from samples or precomputed. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_neighbors_ : int + Number of nearest neighbors effectively used. + + See Also + -------- + Isomap : Non-linear dimensionality reduction through Isometric Mapping. + + References + ---------- + + - :doi:`A Tutorial on Spectral Clustering, 2007 + Ulrike von Luxburg + <10.1007/s11222-007-9033-z>` + + - `On Spectral Clustering: Analysis and an algorithm, 2001 + Andrew Y. Ng, Michael I. 
Jordan, Yair Weiss + `_ + + - :doi:`Normalized cuts and image segmentation, 2000 + Jianbo Shi, Jitendra Malik + <10.1109/34.868688>` + + Examples + -------- + >>> from sklearn.datasets import load_digits + >>> from sklearn.manifold import SpectralEmbedding + >>> X, _ = load_digits(return_X_y=True) + >>> X.shape + (1797, 64) + >>> embedding = SpectralEmbedding(n_components=2) + >>> X_transformed = embedding.fit_transform(X[:100]) + >>> X_transformed.shape + (100, 2) + """ + + _parameter_constraints: dict = { + "n_components": [Interval(Integral, 1, None, closed="left")], + "affinity": [ + StrOptions( + { + "nearest_neighbors", + "rbf", + "precomputed", + "precomputed_nearest_neighbors", + }, + ), + callable, + ], + "gamma": [Interval(Real, 0, None, closed="left"), None], + "random_state": ["random_state"], + "eigen_solver": [StrOptions({"arpack", "lobpcg", "amg"}), None], + "eigen_tol": [Interval(Real, 0, None, closed="left"), StrOptions({"auto"})], + "n_neighbors": [Interval(Integral, 1, None, closed="left"), None], + "n_jobs": [None, Integral], + } + + def __init__( + self, + n_components=2, + *, + affinity="nearest_neighbors", + gamma=None, + random_state=None, + eigen_solver=None, + eigen_tol="auto", + n_neighbors=None, + n_jobs=None, + ): + self.n_components = n_components + self.affinity = affinity + self.gamma = gamma + self.random_state = random_state + self.eigen_solver = eigen_solver + self.eigen_tol = eigen_tol + self.n_neighbors = n_neighbors + self.n_jobs = n_jobs + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + tags.input_tags.pairwise = self.affinity in [ + "precomputed", + "precomputed_nearest_neighbors", + ] + return tags + + def _get_affinity_matrix(self, X, Y=None): + """Calculate the affinity matrix from data + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples + and `n_features` is the number of features. + + If affinity is "precomputed" + X : array-like of shape (n_samples, n_samples), + Interpret X as precomputed adjacency graph computed from + samples. 
+ + Y: Ignored + + Returns + ------- + affinity_matrix of shape (n_samples, n_samples) + """ + if self.affinity == "precomputed": + self.affinity_matrix_ = X + return self.affinity_matrix_ + if self.affinity == "precomputed_nearest_neighbors": + estimator = NearestNeighbors( + n_neighbors=self.n_neighbors, n_jobs=self.n_jobs, metric="precomputed" + ).fit(X) + connectivity = estimator.kneighbors_graph(X=X, mode="connectivity") + self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T) + return self.affinity_matrix_ + if self.affinity == "nearest_neighbors": + if sparse.issparse(X): + warnings.warn( + "Nearest neighbors affinity currently does " + "not support sparse input, falling back to " + "rbf affinity" + ) + self.affinity = "rbf" + else: + self.n_neighbors_ = ( + self.n_neighbors + if self.n_neighbors is not None + else max(int(X.shape[0] / 10), 1) + ) + self.affinity_matrix_ = kneighbors_graph( + X, self.n_neighbors_, include_self=True, n_jobs=self.n_jobs + ) + # currently only symmetric affinity_matrix supported + self.affinity_matrix_ = 0.5 * ( + self.affinity_matrix_ + self.affinity_matrix_.T + ) + return self.affinity_matrix_ + if self.affinity == "rbf": + self.gamma_ = self.gamma if self.gamma is not None else 1.0 / X.shape[1] + self.affinity_matrix_ = rbf_kernel(X, gamma=self.gamma_) + return self.affinity_matrix_ + self.affinity_matrix_ = self.affinity(X) + return self.affinity_matrix_ + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit the model from data in X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples + and `n_features` is the number of features. + + If affinity is "precomputed" + X : {array-like, sparse matrix}, shape (n_samples, n_samples), + Interpret X as precomputed adjacency graph computed from + samples. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Returns the instance itself. + """ + X = validate_data(self, X, accept_sparse="csr", ensure_min_samples=2) + + random_state = check_random_state(self.random_state) + + affinity_matrix = self._get_affinity_matrix(X) + self.embedding_ = _spectral_embedding( + affinity_matrix, + n_components=self.n_components, + eigen_solver=self.eigen_solver, + eigen_tol=self.eigen_tol, + random_state=random_state, + ) + return self + + def fit_transform(self, X, y=None): + """Fit the model from data in X and transform X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples + and `n_features` is the number of features. + + If affinity is "precomputed" + X : {array-like, sparse matrix} of shape (n_samples, n_samples), + Interpret X as precomputed adjacency graph computed from + samples. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + X_new : array-like of shape (n_samples, n_components) + Spectral embedding of the training matrix. 
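+
+        Examples
+        --------
+        A minimal illustrative sketch with a precomputed affinity matrix
+        (this assumes ``affinity='precomputed'`` was set on the estimator;
+        the data below are arbitrary):
+
+        >>> import numpy as np
+        >>> from sklearn.manifold import SpectralEmbedding
+        >>> from sklearn.metrics.pairwise import rbf_kernel
+        >>> rng = np.random.RandomState(0)
+        >>> X = rng.rand(20, 5)
+        >>> affinity = rbf_kernel(X)
+        >>> se = SpectralEmbedding(n_components=2, affinity='precomputed',
+        ...                        random_state=0)
+        >>> se.fit_transform(affinity).shape
+        (20, 2)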
+ """ + self.fit(X) + return self.embedding_ diff --git a/.venv/lib/python3.12/site-packages/sklearn/manifold/_t_sne.py b/.venv/lib/python3.12/site-packages/sklearn/manifold/_t_sne.py new file mode 100644 index 0000000000000000000000000000000000000000..51882a5b38abdec7b60c26c1794dafedeef4f666 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/manifold/_t_sne.py @@ -0,0 +1,1184 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +# This is the exact and Barnes-Hut t-SNE implementation. There are other +# modifications of the algorithm: +# * Fast Optimization for t-SNE: +# https://cseweb.ucsd.edu/~lvdmaaten/workshops/nips2010/papers/vandermaaten.pdf + +from numbers import Integral, Real +from time import time + +import numpy as np +from scipy import linalg +from scipy.sparse import csr_matrix, issparse +from scipy.spatial.distance import pdist, squareform + +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + TransformerMixin, + _fit_context, +) +from ..decomposition import PCA +from ..metrics.pairwise import _VALID_METRICS, pairwise_distances +from ..neighbors import NearestNeighbors +from ..utils import check_random_state +from ..utils._openmp_helpers import _openmp_effective_n_threads +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.validation import _num_samples, check_non_negative, validate_data + +# mypy error: Module 'sklearn.manifold' has no attribute '_utils' +# mypy error: Module 'sklearn.manifold' has no attribute '_barnes_hut_tsne' +from . import _barnes_hut_tsne, _utils # type: ignore[attr-defined] + +MACHINE_EPSILON = np.finfo(np.double).eps + + +def _joint_probabilities(distances, desired_perplexity, verbose): + """Compute joint probabilities p_ij from distances. + + Parameters + ---------- + distances : ndarray of shape (n_samples * (n_samples-1) / 2,) + Distances of samples are stored as condensed matrices, i.e. + we omit the diagonal and duplicate entries and store everything + in a one-dimensional array. + + desired_perplexity : float + Desired perplexity of the joint probability distributions. + + verbose : int + Verbosity level. + + Returns + ------- + P : ndarray of shape (n_samples * (n_samples-1) / 2,) + Condensed joint probability matrix. + """ + # Compute conditional probabilities such that they approximately match + # the desired perplexity + distances = distances.astype(np.float32, copy=False) + conditional_P = _utils._binary_search_perplexity( + distances, desired_perplexity, verbose + ) + P = conditional_P + conditional_P.T + sum_P = np.maximum(np.sum(P), MACHINE_EPSILON) + P = np.maximum(squareform(P) / sum_P, MACHINE_EPSILON) + return P + + +def _joint_probabilities_nn(distances, desired_perplexity, verbose): + """Compute joint probabilities p_ij from distances using just nearest + neighbors. + + This method is approximately equal to _joint_probabilities. The latter + is O(N), but limiting the joint probability to nearest neighbors improves + this substantially to O(uN). + + Parameters + ---------- + distances : sparse matrix of shape (n_samples, n_samples) + Distances of samples to its n_neighbors nearest neighbors. All other + distances are left to zero (and are not materialized in memory). + Matrix should be of CSR format. + + desired_perplexity : float + Desired perplexity of the joint probability distributions. + + verbose : int + Verbosity level. 
+ + Returns + ------- + P : sparse matrix of shape (n_samples, n_samples) + Condensed joint probability matrix with only nearest neighbors. Matrix + will be of CSR format. + """ + t0 = time() + # Compute conditional probabilities such that they approximately match + # the desired perplexity + distances.sort_indices() + n_samples = distances.shape[0] + distances_data = distances.data.reshape(n_samples, -1) + distances_data = distances_data.astype(np.float32, copy=False) + conditional_P = _utils._binary_search_perplexity( + distances_data, desired_perplexity, verbose + ) + assert np.all(np.isfinite(conditional_P)), "All probabilities should be finite" + + # Symmetrize the joint probability distribution using sparse operations + P = csr_matrix( + (conditional_P.ravel(), distances.indices, distances.indptr), + shape=(n_samples, n_samples), + ) + P = P + P.T + + # Normalize the joint probability distribution + sum_P = np.maximum(P.sum(), MACHINE_EPSILON) + P /= sum_P + + assert np.all(np.abs(P.data) <= 1.0) + if verbose >= 2: + duration = time() - t0 + print("[t-SNE] Computed conditional probabilities in {:.3f}s".format(duration)) + return P + + +def _kl_divergence( + params, + P, + degrees_of_freedom, + n_samples, + n_components, + skip_num_points=0, + compute_error=True, +): + """t-SNE objective function: gradient of the KL divergence + of p_ijs and q_ijs and the absolute error. + + Parameters + ---------- + params : ndarray of shape (n_params,) + Unraveled embedding. + + P : ndarray of shape (n_samples * (n_samples-1) / 2,) + Condensed joint probability matrix. + + degrees_of_freedom : int + Degrees of freedom of the Student's-t distribution. + + n_samples : int + Number of samples. + + n_components : int + Dimension of the embedded space. + + skip_num_points : int, default=0 + This does not compute the gradient for points with indices below + `skip_num_points`. This is useful when computing transforms of new + data where you'd like to keep the old data fixed. + + compute_error: bool, default=True + If False, the kl_divergence is not computed and returns NaN. + + Returns + ------- + kl_divergence : float + Kullback-Leibler divergence of p_ij and q_ij. + + grad : ndarray of shape (n_params,) + Unraveled gradient of the Kullback-Leibler divergence with respect to + the embedding. + """ + X_embedded = params.reshape(n_samples, n_components) + + # Q is a heavy-tailed distribution: Student's t-distribution + dist = pdist(X_embedded, "sqeuclidean") + dist /= degrees_of_freedom + dist += 1.0 + dist **= (degrees_of_freedom + 1.0) / -2.0 + Q = np.maximum(dist / (2.0 * np.sum(dist)), MACHINE_EPSILON) + + # Optimization trick below: np.dot(x, y) is faster than + # np.sum(x * y) because it calls BLAS + + # Objective: C (Kullback-Leibler divergence of P and Q) + if compute_error: + kl_divergence = 2.0 * np.dot(P, np.log(np.maximum(P, MACHINE_EPSILON) / Q)) + else: + kl_divergence = np.nan + + # Gradient: dC/dY + # pdist always returns double precision distances. 
Thus we need to take + grad = np.ndarray((n_samples, n_components), dtype=params.dtype) + PQd = squareform((P - Q) * dist) + for i in range(skip_num_points, n_samples): + grad[i] = np.dot(np.ravel(PQd[i], order="K"), X_embedded[i] - X_embedded) + grad = grad.ravel() + c = 2.0 * (degrees_of_freedom + 1.0) / degrees_of_freedom + grad *= c + + return kl_divergence, grad + + +def _kl_divergence_bh( + params, + P, + degrees_of_freedom, + n_samples, + n_components, + angle=0.5, + skip_num_points=0, + verbose=False, + compute_error=True, + num_threads=1, +): + """t-SNE objective function: KL divergence of p_ijs and q_ijs. + + Uses Barnes-Hut tree methods to calculate the gradient that + runs in O(NlogN) instead of O(N^2). + + Parameters + ---------- + params : ndarray of shape (n_params,) + Unraveled embedding. + + P : sparse matrix of shape (n_samples, n_sample) + Sparse approximate joint probability matrix, computed only for the + k nearest-neighbors and symmetrized. Matrix should be of CSR format. + + degrees_of_freedom : int + Degrees of freedom of the Student's-t distribution. + + n_samples : int + Number of samples. + + n_components : int + Dimension of the embedded space. + + angle : float, default=0.5 + This is the trade-off between speed and accuracy for Barnes-Hut T-SNE. + 'angle' is the angular size (referred to as theta in [3]) of a distant + node as measured from a point. If this size is below 'angle' then it is + used as a summary node of all points contained within it. + This method is not very sensitive to changes in this parameter + in the range of 0.2 - 0.8. Angle less than 0.2 has quickly increasing + computation time and angle greater 0.8 has quickly increasing error. + + skip_num_points : int, default=0 + This does not compute the gradient for points with indices below + `skip_num_points`. This is useful when computing transforms of new + data where you'd like to keep the old data fixed. + + verbose : int, default=False + Verbosity level. + + compute_error: bool, default=True + If False, the kl_divergence is not computed and returns NaN. + + num_threads : int, default=1 + Number of threads used to compute the gradient. This is set here to + avoid calling _openmp_effective_n_threads for each gradient step. + + Returns + ------- + kl_divergence : float + Kullback-Leibler divergence of p_ij and q_ij. + + grad : ndarray of shape (n_params,) + Unraveled gradient of the Kullback-Leibler divergence with respect to + the embedding. + """ + params = params.astype(np.float32, copy=False) + X_embedded = params.reshape(n_samples, n_components) + + val_P = P.data.astype(np.float32, copy=False) + neighbors = P.indices.astype(np.int64, copy=False) + indptr = P.indptr.astype(np.int64, copy=False) + + grad = np.zeros(X_embedded.shape, dtype=np.float32) + error = _barnes_hut_tsne.gradient( + val_P, + X_embedded, + neighbors, + indptr, + grad, + angle, + n_components, + verbose, + dof=degrees_of_freedom, + compute_error=compute_error, + num_threads=num_threads, + ) + c = 2.0 * (degrees_of_freedom + 1.0) / degrees_of_freedom + grad = grad.ravel() + grad *= c + + return error, grad + + +def _gradient_descent( + objective, + p0, + it, + max_iter, + n_iter_check=1, + n_iter_without_progress=300, + momentum=0.8, + learning_rate=200.0, + min_gain=0.01, + min_grad_norm=1e-7, + verbose=0, + args=None, + kwargs=None, +): + """Batch gradient descent with momentum and individual gains. 
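+
+    In outline, each iteration applies the following per-parameter update
+    (a descriptive sketch of the loop body below, written with the argument
+    names of this function)::
+
+        inc = update * grad < 0.0       # descent direction agrees with last update
+        gains[inc] += 0.2               # increase the gain for these coordinates
+        gains[~inc] *= 0.8              # decrease it for the others
+        np.clip(gains, min_gain, np.inf, out=gains)
+        grad *= gains
+        update = momentum * update - learning_rate * grad
+        p += update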
+ + Parameters + ---------- + objective : callable + Should return a tuple of cost and gradient for a given parameter + vector. When expensive to compute, the cost can optionally + be None and can be computed every n_iter_check steps using + the objective_error function. + + p0 : array-like of shape (n_params,) + Initial parameter vector. + + it : int + Current number of iterations (this function will be called more than + once during the optimization). + + max_iter : int + Maximum number of gradient descent iterations. + + n_iter_check : int, default=1 + Number of iterations before evaluating the global error. If the error + is sufficiently low, we abort the optimization. + + n_iter_without_progress : int, default=300 + Maximum number of iterations without progress before we abort the + optimization. + + momentum : float within (0.0, 1.0), default=0.8 + The momentum generates a weight for previous gradients that decays + exponentially. + + learning_rate : float, default=200.0 + The learning rate for t-SNE is usually in the range [10.0, 1000.0]. If + the learning rate is too high, the data may look like a 'ball' with any + point approximately equidistant from its nearest neighbours. If the + learning rate is too low, most points may look compressed in a dense + cloud with few outliers. + + min_gain : float, default=0.01 + Minimum individual gain for each parameter. + + min_grad_norm : float, default=1e-7 + If the gradient norm is below this threshold, the optimization will + be aborted. + + verbose : int, default=0 + Verbosity level. + + args : sequence, default=None + Arguments to pass to objective function. + + kwargs : dict, default=None + Keyword arguments to pass to objective function. + + Returns + ------- + p : ndarray of shape (n_params,) + Optimum parameters. + + error : float + Optimum. + + i : int + Last iteration. + """ + if args is None: + args = [] + if kwargs is None: + kwargs = {} + + p = p0.copy().ravel() + update = np.zeros_like(p) + gains = np.ones_like(p) + error = np.finfo(float).max + best_error = np.finfo(float).max + best_iter = i = it + + tic = time() + for i in range(it, max_iter): + check_convergence = (i + 1) % n_iter_check == 0 + # only compute the error when needed + kwargs["compute_error"] = check_convergence or i == max_iter - 1 + + error, grad = objective(p, *args, **kwargs) + + inc = update * grad < 0.0 + dec = np.invert(inc) + gains[inc] += 0.2 + gains[dec] *= 0.8 + np.clip(gains, min_gain, np.inf, out=gains) + grad *= gains + update = momentum * update - learning_rate * grad + p += update + + if check_convergence: + toc = time() + duration = toc - tic + tic = toc + grad_norm = linalg.norm(grad) + + if verbose >= 2: + print( + "[t-SNE] Iteration %d: error = %.7f," + " gradient norm = %.7f" + " (%s iterations in %0.3fs)" + % (i + 1, error, grad_norm, n_iter_check, duration) + ) + + if error < best_error: + best_error = error + best_iter = i + elif i - best_iter > n_iter_without_progress: + if verbose >= 2: + print( + "[t-SNE] Iteration %d: did not make any progress " + "during the last %d episodes. Finished." + % (i + 1, n_iter_without_progress) + ) + break + if grad_norm <= min_grad_norm: + if verbose >= 2: + print( + "[t-SNE] Iteration %d: gradient norm %f. Finished." 
+ % (i + 1, grad_norm) + ) + break + + return p, error, i + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "X_embedded": ["array-like", "sparse matrix"], + "n_neighbors": [Interval(Integral, 1, None, closed="left")], + "metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable], + }, + prefer_skip_nested_validation=True, +) +def trustworthiness(X, X_embedded, *, n_neighbors=5, metric="euclidean"): + r"""Indicate to what extent the local structure is retained. + + The trustworthiness is within [0, 1]. It is defined as + + .. math:: + + T(k) = 1 - \frac{2}{nk (2n - 3k - 1)} \sum^n_{i=1} + \sum_{j \in \mathcal{N}_{i}^{k}} \max(0, (r(i, j) - k)) + + where for each sample i, :math:`\mathcal{N}_{i}^{k}` are its k nearest + neighbors in the output space, and every sample j is its :math:`r(i, j)`-th + nearest neighbor in the input space. In other words, any unexpected nearest + neighbors in the output space are penalised in proportion to their rank in + the input space. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) + If the metric is 'precomputed' X must be a square distance + matrix. Otherwise it contains a sample per row. + + X_embedded : {array-like, sparse matrix} of shape (n_samples, n_components) + Embedding of the training data in low-dimensional space. + + n_neighbors : int, default=5 + The number of neighbors that will be considered. Should be fewer than + `n_samples / 2` to ensure the trustworthiness to lies within [0, 1], as + mentioned in [1]_. An error will be raised otherwise. + + metric : str or callable, default='euclidean' + Which metric to use for computing pairwise distances between samples + from the original input space. If metric is 'precomputed', X must be a + matrix of pairwise distances or squared distances. Otherwise, for a list + of available metrics, see the documentation of argument metric in + `sklearn.pairwise.pairwise_distances` and metrics listed in + `sklearn.metrics.pairwise.PAIRWISE_DISTANCE_FUNCTIONS`. Note that the + "cosine" metric uses :func:`~sklearn.metrics.pairwise.cosine_distances`. + + .. versionadded:: 0.20 + + Returns + ------- + trustworthiness : float + Trustworthiness of the low-dimensional embedding. + + References + ---------- + .. [1] Jarkko Venna and Samuel Kaski. 2001. Neighborhood + Preservation in Nonlinear Projection Methods: An Experimental Study. + In Proceedings of the International Conference on Artificial Neural Networks + (ICANN '01). Springer-Verlag, Berlin, Heidelberg, 485-491. + + .. [2] Laurens van der Maaten. Learning a Parametric Embedding by Preserving + Local Structure. Proceedings of the Twelfth International Conference on + Artificial Intelligence and Statistics, PMLR 5:384-391, 2009. 
+ + Examples + -------- + >>> from sklearn.datasets import make_blobs + >>> from sklearn.decomposition import PCA + >>> from sklearn.manifold import trustworthiness + >>> X, _ = make_blobs(n_samples=100, n_features=10, centers=3, random_state=42) + >>> X_embedded = PCA(n_components=2).fit_transform(X) + >>> print(f"{trustworthiness(X, X_embedded, n_neighbors=5):.2f}") + 0.92 + """ + n_samples = _num_samples(X) + if n_neighbors >= n_samples / 2: + raise ValueError( + f"n_neighbors ({n_neighbors}) should be less than n_samples / 2" + f" ({n_samples / 2})" + ) + dist_X = pairwise_distances(X, metric=metric) + if metric == "precomputed": + dist_X = dist_X.copy() + # we set the diagonal to np.inf to exclude the points themselves from + # their own neighborhood + np.fill_diagonal(dist_X, np.inf) + ind_X = np.argsort(dist_X, axis=1) + # `ind_X[i]` is the index of sorted distances between i and other samples + ind_X_embedded = ( + NearestNeighbors(n_neighbors=n_neighbors) + .fit(X_embedded) + .kneighbors(return_distance=False) + ) + + # We build an inverted index of neighbors in the input space: For sample i, + # we define `inverted_index[i]` as the inverted index of sorted distances: + # inverted_index[i][ind_X[i]] = np.arange(1, n_sample + 1) + inverted_index = np.zeros((n_samples, n_samples), dtype=int) + ordered_indices = np.arange(n_samples + 1) + inverted_index[ordered_indices[:-1, np.newaxis], ind_X] = ordered_indices[1:] + ranks = ( + inverted_index[ordered_indices[:-1, np.newaxis], ind_X_embedded] - n_neighbors + ) + t = np.sum(ranks[ranks > 0]) + t = 1.0 - t * ( + 2.0 / (n_samples * n_neighbors * (2.0 * n_samples - 3.0 * n_neighbors - 1.0)) + ) + return t + + +class TSNE(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): + """T-distributed Stochastic Neighbor Embedding. + + t-SNE [1] is a tool to visualize high-dimensional data. It converts + similarities between data points to joint probabilities and tries + to minimize the Kullback-Leibler divergence between the joint + probabilities of the low-dimensional embedding and the + high-dimensional data. t-SNE has a cost function that is not convex, + i.e. with different initializations we can get different results. + + It is highly recommended to use another dimensionality reduction + method (e.g. PCA for dense data or TruncatedSVD for sparse data) + to reduce the number of dimensions to a reasonable amount (e.g. 50) + if the number of features is very high. This will suppress some + noise and speed up the computation of pairwise distances between + samples. For more tips see Laurens van der Maaten's FAQ [2]. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int, default=2 + Dimension of the embedded space. + + perplexity : float, default=30.0 + The perplexity is related to the number of nearest neighbors that + is used in other manifold learning algorithms. Larger datasets + usually require a larger perplexity. Consider selecting a value + between 5 and 50. Different values can result in significantly + different results. The perplexity must be less than the number + of samples. + + early_exaggeration : float, default=12.0 + Controls how tight natural clusters in the original space are in + the embedded space and how much space will be between them. For + larger values, the space between natural clusters will be larger + in the embedded space. Again, the choice of this parameter is not + very critical. 
If the cost function increases during initial + optimization, the early exaggeration factor or the learning rate + might be too high. + + learning_rate : float or "auto", default="auto" + The learning rate for t-SNE is usually in the range [10.0, 1000.0]. If + the learning rate is too high, the data may look like a 'ball' with any + point approximately equidistant from its nearest neighbours. If the + learning rate is too low, most points may look compressed in a dense + cloud with few outliers. If the cost function gets stuck in a bad local + minimum increasing the learning rate may help. + Note that many other t-SNE implementations (bhtsne, FIt-SNE, openTSNE, + etc.) use a definition of learning_rate that is 4 times smaller than + ours. So our learning_rate=200 corresponds to learning_rate=800 in + those other implementations. The 'auto' option sets the learning_rate + to `max(N / early_exaggeration / 4, 50)` where N is the sample size, + following [4] and [5]. + + .. versionchanged:: 1.2 + The default value changed to `"auto"`. + + max_iter : int, default=1000 + Maximum number of iterations for the optimization. Should be at + least 250. + + .. versionchanged:: 1.5 + Parameter name changed from `n_iter` to `max_iter`. + + n_iter_without_progress : int, default=300 + Maximum number of iterations without progress before we abort the + optimization, used after 250 initial iterations with early + exaggeration. Note that progress is only checked every 50 iterations so + this value is rounded to the next multiple of 50. + + .. versionadded:: 0.17 + parameter *n_iter_without_progress* to control stopping criteria. + + min_grad_norm : float, default=1e-7 + If the gradient norm is below this threshold, the optimization will + be stopped. + + metric : str or callable, default='euclidean' + The metric to use when calculating distance between instances in a + feature array. If metric is a string, it must be one of the options + allowed by scipy.spatial.distance.pdist for its metric parameter, or + a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS. + If metric is "precomputed", X is assumed to be a distance matrix. + Alternatively, if metric is a callable function, it is called on each + pair of instances (rows) and the resulting value recorded. The callable + should take two arrays from X as input and return a value indicating + the distance between them. The default is "euclidean" which is + interpreted as squared euclidean distance. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + .. versionadded:: 1.1 + + init : {"random", "pca"} or ndarray of shape (n_samples, n_components), \ + default="pca" + Initialization of embedding. + PCA initialization cannot be used with precomputed distances and is + usually more globally stable than random initialization. + + .. versionchanged:: 1.2 + The default value changed to `"pca"`. + + verbose : int, default=0 + Verbosity level. + + random_state : int, RandomState instance or None, default=None + Determines the random number generator. Pass an int for reproducible + results across multiple function calls. Note that different + initializations might result in different local minima of the cost + function. See :term:`Glossary `. + + method : {'barnes_hut', 'exact'}, default='barnes_hut' + By default the gradient calculation algorithm uses Barnes-Hut + approximation running in O(NlogN) time. method='exact' + will run on the slower, but exact, algorithm in O(N^2) time. 
The + exact algorithm should be used when nearest-neighbor errors need + to be better than 3%. However, the exact method cannot scale to + millions of examples. + + .. versionadded:: 0.17 + Approximate optimization *method* via the Barnes-Hut. + + angle : float, default=0.5 + Only used if method='barnes_hut' + This is the trade-off between speed and accuracy for Barnes-Hut T-SNE. + 'angle' is the angular size (referred to as theta in [3]) of a distant + node as measured from a point. If this size is below 'angle' then it is + used as a summary node of all points contained within it. + This method is not very sensitive to changes in this parameter + in the range of 0.2 - 0.8. Angle less than 0.2 has quickly increasing + computation time and angle greater 0.8 has quickly increasing error. + + n_jobs : int, default=None + The number of parallel jobs to run for neighbors search. This parameter + has no impact when ``metric="precomputed"`` or + (``metric="euclidean"`` and ``method="exact"``). + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + .. versionadded:: 0.22 + + Attributes + ---------- + embedding_ : array-like of shape (n_samples, n_components) + Stores the embedding vectors. + + kl_divergence_ : float + Kullback-Leibler divergence after optimization. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + learning_rate_ : float + Effective learning rate. + + .. versionadded:: 1.2 + + n_iter_ : int + Number of iterations run. + + See Also + -------- + sklearn.decomposition.PCA : Principal component analysis that is a linear + dimensionality reduction method. + sklearn.decomposition.KernelPCA : Non-linear dimensionality reduction using + kernels and PCA. + MDS : Manifold learning using multidimensional scaling. + Isomap : Manifold learning based on Isometric Mapping. + LocallyLinearEmbedding : Manifold learning using Locally Linear Embedding. + SpectralEmbedding : Spectral embedding for non-linear dimensionality. + + Notes + ----- + For an example of using :class:`~sklearn.manifold.TSNE` in combination with + :class:`~sklearn.neighbors.KNeighborsTransformer` see + :ref:`sphx_glr_auto_examples_neighbors_approximate_nearest_neighbors.py`. + + References + ---------- + + [1] van der Maaten, L.J.P.; Hinton, G.E. Visualizing High-Dimensional Data + Using t-SNE. Journal of Machine Learning Research 9:2579-2605, 2008. + + [2] van der Maaten, L.J.P. t-Distributed Stochastic Neighbor Embedding + https://lvdmaaten.github.io/tsne/ + + [3] L.J.P. van der Maaten. Accelerating t-SNE using Tree-Based Algorithms. + Journal of Machine Learning Research 15(Oct):3221-3245, 2014. + https://lvdmaaten.github.io/publications/papers/JMLR_2014.pdf + + [4] Belkina, A. C., Ciccolella, C. O., Anno, R., Halpert, R., Spidlen, J., + & Snyder-Cappione, J. E. (2019). Automated optimized parameters for + T-distributed stochastic neighbor embedding improve visualization + and analysis of large datasets. Nature Communications, 10(1), 1-12. + + [5] Kobak, D., & Berens, P. (2019). The art of using t-SNE for single-cell + transcriptomics. Nature Communications, 10(1), 1-14. 
+ + Examples + -------- + >>> import numpy as np + >>> from sklearn.manifold import TSNE + >>> X = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]]) + >>> X_embedded = TSNE(n_components=2, learning_rate='auto', + ... init='random', perplexity=3).fit_transform(X) + >>> X_embedded.shape + (4, 2) + """ + + _parameter_constraints: dict = { + "n_components": [Interval(Integral, 1, None, closed="left")], + "perplexity": [Interval(Real, 0, None, closed="neither")], + "early_exaggeration": [Interval(Real, 1, None, closed="left")], + "learning_rate": [ + StrOptions({"auto"}), + Interval(Real, 0, None, closed="neither"), + ], + "max_iter": [Interval(Integral, 250, None, closed="left")], + "n_iter_without_progress": [Interval(Integral, -1, None, closed="left")], + "min_grad_norm": [Interval(Real, 0, None, closed="left")], + "metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable], + "metric_params": [dict, None], + "init": [ + StrOptions({"pca", "random"}), + np.ndarray, + ], + "verbose": ["verbose"], + "random_state": ["random_state"], + "method": [StrOptions({"barnes_hut", "exact"})], + "angle": [Interval(Real, 0, 1, closed="both")], + "n_jobs": [None, Integral], + } + + # Control the number of exploration iterations with early_exaggeration on + _EXPLORATION_MAX_ITER = 250 + + # Control the number of iterations between progress checks + _N_ITER_CHECK = 50 + + def __init__( + self, + n_components=2, + *, + perplexity=30.0, + early_exaggeration=12.0, + learning_rate="auto", + max_iter=1000, + n_iter_without_progress=300, + min_grad_norm=1e-7, + metric="euclidean", + metric_params=None, + init="pca", + verbose=0, + random_state=None, + method="barnes_hut", + angle=0.5, + n_jobs=None, + ): + self.n_components = n_components + self.perplexity = perplexity + self.early_exaggeration = early_exaggeration + self.learning_rate = learning_rate + self.max_iter = max_iter + self.n_iter_without_progress = n_iter_without_progress + self.min_grad_norm = min_grad_norm + self.metric = metric + self.metric_params = metric_params + self.init = init + self.verbose = verbose + self.random_state = random_state + self.method = method + self.angle = angle + self.n_jobs = n_jobs + + def _check_params_vs_input(self, X): + if self.perplexity >= X.shape[0]: + raise ValueError( + f"perplexity ({self.perplexity}) must be less " + f"than n_samples ({X.shape[0]})" + ) + + def _fit(self, X, skip_num_points=0): + """Private function to fit the model using X as training data.""" + + if isinstance(self.init, str) and self.init == "pca" and issparse(X): + raise TypeError( + "PCA initialization is currently not supported " + "with the sparse input matrix. Use " + 'init="random" instead.' + ) + + if self.learning_rate == "auto": + # See issue #18018 + self.learning_rate_ = X.shape[0] / self.early_exaggeration / 4 + self.learning_rate_ = np.maximum(self.learning_rate_, 50) + else: + self.learning_rate_ = self.learning_rate + + if self.method == "barnes_hut": + X = validate_data( + self, + X, + accept_sparse=["csr"], + ensure_min_samples=2, + dtype=[np.float32, np.float64], + ) + else: + X = validate_data( + self, + X, + accept_sparse=["csr", "csc", "coo"], + dtype=[np.float32, np.float64], + ) + if self.metric == "precomputed": + if isinstance(self.init, str) and self.init == "pca": + raise ValueError( + 'The parameter init="pca" cannot be used with metric="precomputed".' + ) + if X.shape[0] != X.shape[1]: + raise ValueError("X should be a square distance matrix") + + check_non_negative( + X, + ( + "TSNE.fit(). 
With metric='precomputed', X " + "should contain positive distances." + ), + ) + + if self.method == "exact" and issparse(X): + raise TypeError( + 'TSNE with method="exact" does not accept sparse ' + 'precomputed distance matrix. Use method="barnes_hut" ' + "or provide the dense distance matrix." + ) + + if self.method == "barnes_hut" and self.n_components > 3: + raise ValueError( + "'n_components' should be inferior to 4 for the " + "barnes_hut algorithm as it relies on " + "quad-tree or oct-tree." + ) + random_state = check_random_state(self.random_state) + + n_samples = X.shape[0] + + neighbors_nn = None + if self.method == "exact": + # Retrieve the distance matrix, either using the precomputed one or + # computing it. + if self.metric == "precomputed": + distances = X + else: + if self.verbose: + print("[t-SNE] Computing pairwise distances...") + + if self.metric == "euclidean": + # Euclidean is squared here, rather than using **= 2, + # because euclidean_distances already calculates + # squared distances, and returns np.sqrt(dist) for + # squared=False. + # Also, Euclidean is slower for n_jobs>1, so don't set here + distances = pairwise_distances(X, metric=self.metric, squared=True) + else: + metric_params_ = self.metric_params or {} + distances = pairwise_distances( + X, metric=self.metric, n_jobs=self.n_jobs, **metric_params_ + ) + + if np.any(distances < 0): + raise ValueError( + "All distances should be positive, the metric given is not correct" + ) + + if self.metric != "euclidean": + distances **= 2 + + # compute the joint probability distribution for the input space + P = _joint_probabilities(distances, self.perplexity, self.verbose) + assert np.all(np.isfinite(P)), "All probabilities should be finite" + assert np.all(P >= 0), "All probabilities should be non-negative" + assert np.all(P <= 1), ( + "All probabilities should be less or then equal to one" + ) + + else: + # Compute the number of nearest neighbors to find. + # LvdM uses 3 * perplexity as the number of neighbors. + # In the event that we have very small # of points + # set the neighbors to n - 1. + n_neighbors = min(n_samples - 1, int(3.0 * self.perplexity + 1)) + + if self.verbose: + print("[t-SNE] Computing {} nearest neighbors...".format(n_neighbors)) + + # Find the nearest neighbors for every point + knn = NearestNeighbors( + algorithm="auto", + n_jobs=self.n_jobs, + n_neighbors=n_neighbors, + metric=self.metric, + metric_params=self.metric_params, + ) + t0 = time() + knn.fit(X) + duration = time() - t0 + if self.verbose: + print( + "[t-SNE] Indexed {} samples in {:.3f}s...".format( + n_samples, duration + ) + ) + + t0 = time() + distances_nn = knn.kneighbors_graph(mode="distance") + duration = time() - t0 + if self.verbose: + print( + "[t-SNE] Computed neighbors for {} samples in {:.3f}s...".format( + n_samples, duration + ) + ) + + # Free the memory used by the ball_tree + del knn + + # knn return the euclidean distance but we need it squared + # to be consistent with the 'exact' method. Note that the + # the method was derived using the euclidean method as in the + # input space. Not sure of the implication of using a different + # metric. 
+ distances_nn.data **= 2 + + # compute the joint probability distribution for the input space + P = _joint_probabilities_nn(distances_nn, self.perplexity, self.verbose) + + if isinstance(self.init, np.ndarray): + X_embedded = self.init + elif self.init == "pca": + pca = PCA( + n_components=self.n_components, + svd_solver="randomized", + random_state=random_state, + ) + # Always output a numpy array, no matter what is configured globally + pca.set_output(transform="default") + X_embedded = pca.fit_transform(X).astype(np.float32, copy=False) + # PCA is rescaled so that PC1 has standard deviation 1e-4 which is + # the default value for random initialization. See issue #18018. + X_embedded = X_embedded / np.std(X_embedded[:, 0]) * 1e-4 + elif self.init == "random": + # The embedding is initialized with iid samples from Gaussians with + # standard deviation 1e-4. + X_embedded = 1e-4 * random_state.standard_normal( + size=(n_samples, self.n_components) + ).astype(np.float32) + + # Degrees of freedom of the Student's t-distribution. The suggestion + # degrees_of_freedom = n_components - 1 comes from + # "Learning a Parametric Embedding by Preserving Local Structure" + # Laurens van der Maaten, 2009. + degrees_of_freedom = max(self.n_components - 1, 1) + + return self._tsne( + P, + degrees_of_freedom, + n_samples, + X_embedded=X_embedded, + neighbors=neighbors_nn, + skip_num_points=skip_num_points, + ) + + def _tsne( + self, + P, + degrees_of_freedom, + n_samples, + X_embedded, + neighbors=None, + skip_num_points=0, + ): + """Runs t-SNE.""" + # t-SNE minimizes the Kullback-Leiber divergence of the Gaussians P + # and the Student's t-distributions Q. The optimization algorithm that + # we use is batch gradient descent with two stages: + # * initial optimization with early exaggeration and momentum at 0.5 + # * final optimization with momentum at 0.8 + params = X_embedded.ravel() + + opt_args = { + "it": 0, + "n_iter_check": self._N_ITER_CHECK, + "min_grad_norm": self.min_grad_norm, + "learning_rate": self.learning_rate_, + "verbose": self.verbose, + "kwargs": dict(skip_num_points=skip_num_points), + "args": [P, degrees_of_freedom, n_samples, self.n_components], + "n_iter_without_progress": self._EXPLORATION_MAX_ITER, + "max_iter": self._EXPLORATION_MAX_ITER, + "momentum": 0.5, + } + if self.method == "barnes_hut": + obj_func = _kl_divergence_bh + opt_args["kwargs"]["angle"] = self.angle + # Repeat verbose argument for _kl_divergence_bh + opt_args["kwargs"]["verbose"] = self.verbose + # Get the number of threads for gradient computation here to + # avoid recomputing it at each iteration. 
+ opt_args["kwargs"]["num_threads"] = _openmp_effective_n_threads() + else: + obj_func = _kl_divergence + + # Learning schedule (part 1): do 250 iteration with lower momentum but + # higher learning rate controlled via the early exaggeration parameter + P *= self.early_exaggeration + params, kl_divergence, it = _gradient_descent(obj_func, params, **opt_args) + if self.verbose: + print( + "[t-SNE] KL divergence after %d iterations with early exaggeration: %f" + % (it + 1, kl_divergence) + ) + + # Learning schedule (part 2): disable early exaggeration and finish + # optimization with a higher momentum at 0.8 + P /= self.early_exaggeration + remaining = self.max_iter - self._EXPLORATION_MAX_ITER + if it < self._EXPLORATION_MAX_ITER or remaining > 0: + opt_args["max_iter"] = self.max_iter + opt_args["it"] = it + 1 + opt_args["momentum"] = 0.8 + opt_args["n_iter_without_progress"] = self.n_iter_without_progress + params, kl_divergence, it = _gradient_descent(obj_func, params, **opt_args) + + # Save the final number of iterations + self.n_iter_ = it + + if self.verbose: + print( + "[t-SNE] KL divergence after %d iterations: %f" + % (it + 1, kl_divergence) + ) + + X_embedded = params.reshape(n_samples, self.n_components) + self.kl_divergence_ = kl_divergence + + return X_embedded + + @_fit_context( + # TSNE.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit_transform(self, X, y=None): + """Fit X into an embedded space and return that transformed output. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) + If the metric is 'precomputed' X must be a square distance + matrix. Otherwise it contains a sample per row. If the method + is 'exact', X may be a sparse matrix of type 'csr', 'csc' + or 'coo'. If the method is 'barnes_hut' and the metric is + 'precomputed', X may be a precomputed sparse graph. + + y : None + Ignored. + + Returns + ------- + X_new : ndarray of shape (n_samples, n_components) + Embedding of the training data in low-dimensional space. + """ + self._check_params_vs_input(X) + embedding = self._fit(X) + self.embedding_ = embedding + return self.embedding_ + + @_fit_context( + # TSNE.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y=None): + """Fit X into an embedded space. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) + If the metric is 'precomputed' X must be a square distance + matrix. Otherwise it contains a sample per row. If the method + is 'exact', X may be a sparse matrix of type 'csr', 'csc' + or 'coo'. If the method is 'barnes_hut' and the metric is + 'precomputed', X may be a precomputed sparse graph. + + y : None + Ignored. + + Returns + ------- + self : object + Fitted estimator. 
+ """ + self.fit_transform(X) + return self + + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.embedding_.shape[1] + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.pairwise = self.metric == "precomputed" + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/manifold/_utils.cpython-312-x86_64-linux-gnu.so b/.venv/lib/python3.12/site-packages/sklearn/manifold/_utils.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..079767b633d4d757170fe1227300694510c704bd Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/manifold/_utils.cpython-312-x86_64-linux-gnu.so differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/manifold/_utils.pyx b/.venv/lib/python3.12/site-packages/sklearn/manifold/_utils.pyx new file mode 100644 index 0000000000000000000000000000000000000000..be3a1d2f91f6670cea8eee130990becc3fc4b8bb --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/manifold/_utils.pyx @@ -0,0 +1,120 @@ +import numpy as np + +from libc cimport math +from libc.math cimport INFINITY + +from ..utils._typedefs cimport float32_t, float64_t + + +cdef float EPSILON_DBL = 1e-8 +cdef float PERPLEXITY_TOLERANCE = 1e-5 + + +# TODO: have this function support float32 and float64 and preserve inputs' dtypes. +def _binary_search_perplexity( + const float32_t[:, :] sqdistances, + float desired_perplexity, + int verbose): + """Binary search for sigmas of conditional Gaussians. + + This approximation reduces the computational complexity from O(N^2) to + O(uN). + + Parameters + ---------- + sqdistances : ndarray of shape (n_samples, n_neighbors), dtype=np.float32 + Distances between training samples and their k nearest neighbors. + When using the exact method, this is a square (n_samples, n_samples) + distance matrix. The TSNE default metric is "euclidean" which is + interpreted as squared euclidean distance. + + desired_perplexity : float + Desired perplexity (2^entropy) of the conditional Gaussians. + + verbose : int + Verbosity level. + + Returns + ------- + P : ndarray of shape (n_samples, n_samples), dtype=np.float64 + Probabilities of conditional Gaussian distributions p_i|j. + """ + # Maximum number of binary search steps + cdef long n_steps = 100 + + cdef long n_samples = sqdistances.shape[0] + cdef long n_neighbors = sqdistances.shape[1] + cdef int using_neighbors = n_neighbors < n_samples + # Precisions of conditional Gaussian distributions + cdef double beta + cdef double beta_min + cdef double beta_max + cdef double beta_sum = 0.0 + + # Use log scale + cdef double desired_entropy = math.log(desired_perplexity) + cdef double entropy_diff + + cdef double entropy + cdef double sum_Pi + cdef double sum_disti_Pi + cdef long i, j, l + + # This array is later used as a 32bit array. 
It has multiple intermediate + # floating point additions that benefit from the extra precision + cdef float64_t[:, :] P = np.zeros( + (n_samples, n_neighbors), dtype=np.float64) + + for i in range(n_samples): + beta_min = -INFINITY + beta_max = INFINITY + beta = 1.0 + + # Binary search of precision for i-th conditional distribution + for l in range(n_steps): + # Compute current entropy and corresponding probabilities + # computed just over the nearest neighbors or over all data + # if we're not using neighbors + sum_Pi = 0.0 + for j in range(n_neighbors): + if j != i or using_neighbors: + P[i, j] = math.exp(-sqdistances[i, j] * beta) + sum_Pi += P[i, j] + + if sum_Pi == 0.0: + sum_Pi = EPSILON_DBL + sum_disti_Pi = 0.0 + + for j in range(n_neighbors): + P[i, j] /= sum_Pi + sum_disti_Pi += sqdistances[i, j] * P[i, j] + + entropy = math.log(sum_Pi) + beta * sum_disti_Pi + entropy_diff = entropy - desired_entropy + + if math.fabs(entropy_diff) <= PERPLEXITY_TOLERANCE: + break + + if entropy_diff > 0.0: + beta_min = beta + if beta_max == INFINITY: + beta *= 2.0 + else: + beta = (beta + beta_max) / 2.0 + else: + beta_max = beta + if beta_min == -INFINITY: + beta /= 2.0 + else: + beta = (beta + beta_min) / 2.0 + + beta_sum += beta + + if verbose and ((i + 1) % 1000 == 0 or i + 1 == n_samples): + print("[t-SNE] Computed conditional probabilities for sample " + "%d / %d" % (i + 1, n_samples)) + + if verbose: + print("[t-SNE] Mean sigma: %f" + % np.mean(math.sqrt(n_samples / beta_sum))) + return np.asarray(P) diff --git a/.venv/lib/python3.12/site-packages/sklearn/manifold/meson.build b/.venv/lib/python3.12/site-packages/sklearn/manifold/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..c060590410d63ff06ca8b0f062b08cd1581a07de --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/manifold/meson.build @@ -0,0 +1,14 @@ +py.extension_module( + '_utils', + [cython_gen.process('_utils.pyx'), utils_cython_tree], + subdir: 'sklearn/manifold', + install: true +) + +py.extension_module( + '_barnes_hut_tsne', + cython_gen.process('_barnes_hut_tsne.pyx'), + dependencies: [np_dep, openmp_dep], + subdir: 'sklearn/manifold', + install: true +) diff --git a/.venv/lib/python3.12/site-packages/sklearn/manifold/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/manifold/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/manifold/tests/test_isomap.py b/.venv/lib/python3.12/site-packages/sklearn/manifold/tests/test_isomap.py new file mode 100644 index 0000000000000000000000000000000000000000..e38b92442e58d9881726bdee85073ad38a7c95e1 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/manifold/tests/test_isomap.py @@ -0,0 +1,348 @@ +import math +from itertools import product + +import numpy as np +import pytest +from scipy.sparse import rand as sparse_rand + +from sklearn import clone, datasets, manifold, neighbors, pipeline, preprocessing +from sklearn.datasets import make_blobs +from sklearn.metrics.pairwise import pairwise_distances +from sklearn.utils._testing import ( + assert_allclose, + assert_allclose_dense_sparse, + assert_array_equal, +) +from sklearn.utils.fixes import CSR_CONTAINERS + +eigen_solvers = ["auto", "dense", "arpack"] +path_methods = ["auto", "FW", "D"] + + +def create_sample_data(dtype, n_pts=25, add_noise=False): + # grid of equidistant points in 2D, n_components = n_dim + n_per_side = 
int(math.sqrt(n_pts)) + X = np.array(list(product(range(n_per_side), repeat=2))).astype(dtype, copy=False) + if add_noise: + # add noise in a third dimension + rng = np.random.RandomState(0) + noise = 0.1 * rng.randn(n_pts, 1).astype(dtype, copy=False) + X = np.concatenate((X, noise), 1) + return X + + +@pytest.mark.parametrize("n_neighbors, radius", [(24, None), (None, np.inf)]) +@pytest.mark.parametrize("eigen_solver", eigen_solvers) +@pytest.mark.parametrize("path_method", path_methods) +def test_isomap_simple_grid( + global_dtype, n_neighbors, radius, eigen_solver, path_method +): + # Isomap should preserve distances when all neighbors are used + n_pts = 25 + X = create_sample_data(global_dtype, n_pts=n_pts, add_noise=False) + + # distances from each point to all others + if n_neighbors is not None: + G = neighbors.kneighbors_graph(X, n_neighbors, mode="distance") + else: + G = neighbors.radius_neighbors_graph(X, radius, mode="distance") + + clf = manifold.Isomap( + n_neighbors=n_neighbors, + radius=radius, + n_components=2, + eigen_solver=eigen_solver, + path_method=path_method, + ) + clf.fit(X) + + if n_neighbors is not None: + G_iso = neighbors.kneighbors_graph(clf.embedding_, n_neighbors, mode="distance") + else: + G_iso = neighbors.radius_neighbors_graph( + clf.embedding_, radius, mode="distance" + ) + atol = 1e-5 if global_dtype == np.float32 else 0 + assert_allclose_dense_sparse(G, G_iso, atol=atol) + + +@pytest.mark.parametrize("n_neighbors, radius", [(24, None), (None, np.inf)]) +@pytest.mark.parametrize("eigen_solver", eigen_solvers) +@pytest.mark.parametrize("path_method", path_methods) +def test_isomap_reconstruction_error( + global_dtype, n_neighbors, radius, eigen_solver, path_method +): + if global_dtype is np.float32: + pytest.skip( + "Skipping test due to numerical instabilities on float32 data" + "from KernelCenterer used in the reconstruction_error method" + ) + + # Same setup as in test_isomap_simple_grid, with an added dimension + n_pts = 25 + X = create_sample_data(global_dtype, n_pts=n_pts, add_noise=True) + + # compute input kernel + if n_neighbors is not None: + G = neighbors.kneighbors_graph(X, n_neighbors, mode="distance").toarray() + else: + G = neighbors.radius_neighbors_graph(X, radius, mode="distance").toarray() + centerer = preprocessing.KernelCenterer() + K = centerer.fit_transform(-0.5 * G**2) + + clf = manifold.Isomap( + n_neighbors=n_neighbors, + radius=radius, + n_components=2, + eigen_solver=eigen_solver, + path_method=path_method, + ) + clf.fit(X) + + # compute output kernel + if n_neighbors is not None: + G_iso = neighbors.kneighbors_graph(clf.embedding_, n_neighbors, mode="distance") + else: + G_iso = neighbors.radius_neighbors_graph( + clf.embedding_, radius, mode="distance" + ) + G_iso = G_iso.toarray() + K_iso = centerer.fit_transform(-0.5 * G_iso**2) + + # make sure error agrees + reconstruction_error = np.linalg.norm(K - K_iso) / n_pts + atol = 1e-5 if global_dtype == np.float32 else 0 + assert_allclose(reconstruction_error, clf.reconstruction_error(), atol=atol) + + +@pytest.mark.parametrize("n_neighbors, radius", [(2, None), (None, 0.5)]) +def test_transform(global_dtype, n_neighbors, radius): + n_samples = 200 + n_components = 10 + noise_scale = 0.01 + + # Create S-curve dataset + X, y = datasets.make_s_curve(n_samples, random_state=0) + + X = X.astype(global_dtype, copy=False) + + # Compute isomap embedding + iso = manifold.Isomap( + n_components=n_components, n_neighbors=n_neighbors, radius=radius + ) + X_iso = iso.fit_transform(X) + + 
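    # (Context sketch: fit_transform above has learned the geodesic kernel on
    #  the clean points, and Isomap.transform embeds new points by linking them
    #  to the stored training neighborhood graph rather than refitting, which
    #  is what makes the noisy re-embedding check below meaningful.)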
# Re-embed a noisy version of the points + rng = np.random.RandomState(0) + noise = noise_scale * rng.randn(*X.shape) + X_iso2 = iso.transform(X + noise) + + # Make sure the rms error on re-embedding is comparable to noise_scale + assert np.sqrt(np.mean((X_iso - X_iso2) ** 2)) < 2 * noise_scale + + +@pytest.mark.parametrize("n_neighbors, radius", [(2, None), (None, 10.0)]) +def test_pipeline(n_neighbors, radius, global_dtype): + # check that Isomap works fine as a transformer in a Pipeline + # only checks that no error is raised. + # TODO check that it actually does something useful + X, y = datasets.make_blobs(random_state=0) + X = X.astype(global_dtype, copy=False) + clf = pipeline.Pipeline( + [ + ("isomap", manifold.Isomap(n_neighbors=n_neighbors, radius=radius)), + ("clf", neighbors.KNeighborsClassifier()), + ] + ) + clf.fit(X, y) + assert 0.9 < clf.score(X, y) + + +def test_pipeline_with_nearest_neighbors_transformer(global_dtype): + # Test chaining NearestNeighborsTransformer and Isomap with + # neighbors_algorithm='precomputed' + algorithm = "auto" + n_neighbors = 10 + + X, _ = datasets.make_blobs(random_state=0) + X2, _ = datasets.make_blobs(random_state=1) + + X = X.astype(global_dtype, copy=False) + X2 = X2.astype(global_dtype, copy=False) + + # compare the chained version and the compact version + est_chain = pipeline.make_pipeline( + neighbors.KNeighborsTransformer( + n_neighbors=n_neighbors, algorithm=algorithm, mode="distance" + ), + manifold.Isomap(n_neighbors=n_neighbors, metric="precomputed"), + ) + est_compact = manifold.Isomap( + n_neighbors=n_neighbors, neighbors_algorithm=algorithm + ) + + Xt_chain = est_chain.fit_transform(X) + Xt_compact = est_compact.fit_transform(X) + assert_allclose(Xt_chain, Xt_compact) + + Xt_chain = est_chain.transform(X2) + Xt_compact = est_compact.transform(X2) + assert_allclose(Xt_chain, Xt_compact) + + +@pytest.mark.parametrize( + "metric, p, is_euclidean", + [ + ("euclidean", 2, True), + ("manhattan", 1, False), + ("minkowski", 1, False), + ("minkowski", 2, True), + (lambda x1, x2: np.sqrt(np.sum(x1**2 + x2**2)), 2, False), + ], +) +def test_different_metric(global_dtype, metric, p, is_euclidean): + # Isomap must work on various metric parameters work correctly + # and must default to euclidean. 
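    # (For context: "minkowski" with p=2 and the default "euclidean" metric are
    #  the same distance, so those parametrizations are expected to reproduce
    #  the reference embedding, while the remaining metrics should not.)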
+ X, _ = datasets.make_blobs(random_state=0) + X = X.astype(global_dtype, copy=False) + + reference = manifold.Isomap().fit_transform(X) + embedding = manifold.Isomap(metric=metric, p=p).fit_transform(X) + + if is_euclidean: + assert_allclose(embedding, reference) + else: + with pytest.raises(AssertionError, match="Not equal to tolerance"): + assert_allclose(embedding, reference) + + +def test_isomap_clone_bug(): + # regression test for bug reported in #6062 + model = manifold.Isomap() + for n_neighbors in [10, 15, 20]: + model.set_params(n_neighbors=n_neighbors) + model.fit(np.random.rand(50, 2)) + assert model.nbrs_.n_neighbors == n_neighbors + + +@pytest.mark.parametrize("eigen_solver", eigen_solvers) +@pytest.mark.parametrize("path_method", path_methods) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sparse_input( + global_dtype, eigen_solver, path_method, global_random_seed, csr_container +): + # TODO: compare results on dense and sparse data as proposed in: + # https://github.com/scikit-learn/scikit-learn/pull/23585#discussion_r968388186 + X = csr_container( + sparse_rand( + 100, + 3, + density=0.1, + format="csr", + dtype=global_dtype, + random_state=global_random_seed, + ) + ) + + iso_dense = manifold.Isomap( + n_components=2, + eigen_solver=eigen_solver, + path_method=path_method, + n_neighbors=8, + ) + iso_sparse = clone(iso_dense) + + X_trans_dense = iso_dense.fit_transform(X.toarray()) + X_trans_sparse = iso_sparse.fit_transform(X) + + assert_allclose(X_trans_sparse, X_trans_dense, rtol=1e-4, atol=1e-4) + + +def test_isomap_fit_precomputed_radius_graph(global_dtype): + # Isomap.fit_transform must yield similar result when using + # a precomputed distance matrix. + + X, y = datasets.make_s_curve(200, random_state=0) + X = X.astype(global_dtype, copy=False) + radius = 10 + + g = neighbors.radius_neighbors_graph(X, radius=radius, mode="distance") + isomap = manifold.Isomap(n_neighbors=None, radius=radius, metric="precomputed") + isomap.fit(g) + precomputed_result = isomap.embedding_ + + isomap = manifold.Isomap(n_neighbors=None, radius=radius, metric="minkowski") + result = isomap.fit_transform(X) + atol = 1e-5 if global_dtype == np.float32 else 0 + assert_allclose(precomputed_result, result, atol=atol) + + +def test_isomap_fitted_attributes_dtype(global_dtype): + """Check that the fitted attributes are stored accordingly to the + data type of X.""" + iso = manifold.Isomap(n_neighbors=2) + + X = np.array([[1, 2], [3, 4], [5, 6]], dtype=global_dtype) + + iso.fit(X) + + assert iso.dist_matrix_.dtype == global_dtype + assert iso.embedding_.dtype == global_dtype + + +def test_isomap_dtype_equivalence(): + """Check the equivalence of the results with 32 and 64 bits input.""" + iso_32 = manifold.Isomap(n_neighbors=2) + X_32 = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float32) + iso_32.fit(X_32) + + iso_64 = manifold.Isomap(n_neighbors=2) + X_64 = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float64) + iso_64.fit(X_64) + + assert_allclose(iso_32.dist_matrix_, iso_64.dist_matrix_) + + +def test_isomap_raise_error_when_neighbor_and_radius_both_set(): + # Isomap.fit_transform must raise a ValueError if + # radius and n_neighbors are provided. 
+ + X, _ = datasets.load_digits(return_X_y=True) + isomap = manifold.Isomap(n_neighbors=3, radius=5.5) + msg = "Both n_neighbors and radius are provided" + with pytest.raises(ValueError, match=msg): + isomap.fit_transform(X) + + +def test_multiple_connected_components(): + # Test that a warning is raised when the graph has multiple components + X = np.array([0, 1, 2, 5, 6, 7])[:, None] + with pytest.warns(UserWarning, match="number of connected components"): + manifold.Isomap(n_neighbors=2).fit(X) + + +def test_multiple_connected_components_metric_precomputed(global_dtype): + # Test that an error is raised when the graph has multiple components + # and when X is a precomputed neighbors graph. + X = np.array([0, 1, 2, 5, 6, 7])[:, None].astype(global_dtype, copy=False) + + # works with a precomputed distance matrix (dense) + X_distances = pairwise_distances(X) + with pytest.warns(UserWarning, match="number of connected components"): + manifold.Isomap(n_neighbors=1, metric="precomputed").fit(X_distances) + + # does not work with a precomputed neighbors graph (sparse) + X_graph = neighbors.kneighbors_graph(X, n_neighbors=2, mode="distance") + with pytest.raises(RuntimeError, match="number of connected components"): + manifold.Isomap(n_neighbors=1, metric="precomputed").fit(X_graph) + + +def test_get_feature_names_out(): + """Check get_feature_names_out for Isomap.""" + X, y = make_blobs(random_state=0, n_features=4) + n_components = 2 + + iso = manifold.Isomap(n_components=n_components) + iso.fit_transform(X) + names = iso.get_feature_names_out() + assert_array_equal([f"isomap{i}" for i in range(n_components)], names) diff --git a/.venv/lib/python3.12/site-packages/sklearn/manifold/tests/test_locally_linear.py b/.venv/lib/python3.12/site-packages/sklearn/manifold/tests/test_locally_linear.py new file mode 100644 index 0000000000000000000000000000000000000000..835aa20fd1d32ace684eea9afd451bcdcf695f79 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/manifold/tests/test_locally_linear.py @@ -0,0 +1,171 @@ +from itertools import product + +import numpy as np +import pytest +from scipy import linalg + +from sklearn import manifold, neighbors +from sklearn.datasets import make_blobs +from sklearn.manifold._locally_linear import barycenter_kneighbors_graph +from sklearn.utils._testing import ( + assert_allclose, + assert_array_equal, + ignore_warnings, +) + +eigen_solvers = ["dense", "arpack"] + + +# ---------------------------------------------------------------------- +# Test utility routines +def test_barycenter_kneighbors_graph(global_dtype): + X = np.array([[0, 1], [1.01, 1.0], [2, 0]], dtype=global_dtype) + + graph = barycenter_kneighbors_graph(X, 1) + expected_graph = np.array( + [[0.0, 1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]], dtype=global_dtype + ) + + assert graph.dtype == global_dtype + + assert_allclose(graph.toarray(), expected_graph) + + graph = barycenter_kneighbors_graph(X, 2) + # check that columns sum to one + assert_allclose(np.sum(graph.toarray(), axis=1), np.ones(3)) + pred = np.dot(graph.toarray(), X) + assert linalg.norm(pred - X) / X.shape[0] < 1 + + +# ---------------------------------------------------------------------- +# Test LLE by computing the reconstruction error on some manifolds. + + +def test_lle_simple_grid(global_dtype): + # note: ARPACK is numerically unstable, so this test will fail for + # some random seeds. We choose 42 because the tests pass. + # for arm64 platforms 2 makes the test fail. 
+ # TODO: rewrite this test to make less sensitive to the random seed, + # irrespective of the platform. + rng = np.random.RandomState(42) + + # grid of equidistant points in 2D, n_components = n_dim + X = np.array(list(product(range(5), repeat=2))) + X = X + 1e-10 * rng.uniform(size=X.shape) + X = X.astype(global_dtype, copy=False) + + n_components = 2 + clf = manifold.LocallyLinearEmbedding( + n_neighbors=5, n_components=n_components, random_state=rng + ) + tol = 0.1 + + N = barycenter_kneighbors_graph(X, clf.n_neighbors).toarray() + reconstruction_error = linalg.norm(np.dot(N, X) - X, "fro") + assert reconstruction_error < tol + + for solver in eigen_solvers: + clf.set_params(eigen_solver=solver) + clf.fit(X) + assert clf.embedding_.shape[1] == n_components + reconstruction_error = ( + linalg.norm(np.dot(N, clf.embedding_) - clf.embedding_, "fro") ** 2 + ) + + assert reconstruction_error < tol + assert_allclose(clf.reconstruction_error_, reconstruction_error, atol=1e-1) + + # re-embed a noisy version of X using the transform method + noise = rng.randn(*X.shape).astype(global_dtype, copy=False) / 100 + X_reembedded = clf.transform(X + noise) + assert linalg.norm(X_reembedded - clf.embedding_) < tol + + +@pytest.mark.parametrize("method", ["standard", "hessian", "modified", "ltsa"]) +@pytest.mark.parametrize("solver", eigen_solvers) +def test_lle_manifold(global_dtype, method, solver): + rng = np.random.RandomState(0) + # similar test on a slightly more complex manifold + X = np.array(list(product(np.arange(18), repeat=2))) + X = np.c_[X, X[:, 0] ** 2 / 18] + X = X + 1e-10 * rng.uniform(size=X.shape) + X = X.astype(global_dtype, copy=False) + n_components = 2 + + clf = manifold.LocallyLinearEmbedding( + n_neighbors=6, n_components=n_components, method=method, random_state=0 + ) + tol = 1.5 if method == "standard" else 3 + + N = barycenter_kneighbors_graph(X, clf.n_neighbors).toarray() + reconstruction_error = linalg.norm(np.dot(N, X) - X) + assert reconstruction_error < tol + + clf.set_params(eigen_solver=solver) + clf.fit(X) + assert clf.embedding_.shape[1] == n_components + reconstruction_error = ( + linalg.norm(np.dot(N, clf.embedding_) - clf.embedding_, "fro") ** 2 + ) + details = "solver: %s, method: %s" % (solver, method) + assert reconstruction_error < tol, details + assert ( + np.abs(clf.reconstruction_error_ - reconstruction_error) + < tol * reconstruction_error + ), details + + +def test_pipeline(): + # check that LocallyLinearEmbedding works fine as a Pipeline + # only checks that no error is raised. 
+ # TODO check that it actually does something useful + from sklearn import datasets, pipeline + + X, y = datasets.make_blobs(random_state=0) + clf = pipeline.Pipeline( + [ + ("filter", manifold.LocallyLinearEmbedding(random_state=0)), + ("clf", neighbors.KNeighborsClassifier()), + ] + ) + clf.fit(X, y) + assert 0.9 < clf.score(X, y) + + +# Test the error raised when the weight matrix is singular +def test_singular_matrix(): + M = np.ones((200, 3)) + f = ignore_warnings + with pytest.raises(ValueError, match="Error in determining null-space with ARPACK"): + f( + manifold.locally_linear_embedding( + M, + n_neighbors=2, + n_components=1, + method="standard", + eigen_solver="arpack", + ) + ) + + +# regression test for #6033 +def test_integer_input(): + rand = np.random.RandomState(0) + X = rand.randint(0, 100, size=(20, 3)) + + for method in ["standard", "hessian", "modified", "ltsa"]: + clf = manifold.LocallyLinearEmbedding(method=method, n_neighbors=10) + clf.fit(X) # this previously raised a TypeError + + +def test_get_feature_names_out(): + """Check get_feature_names_out for LocallyLinearEmbedding.""" + X, y = make_blobs(random_state=0, n_features=4) + n_components = 2 + + iso = manifold.LocallyLinearEmbedding(n_components=n_components) + iso.fit(X) + names = iso.get_feature_names_out() + assert_array_equal( + [f"locallylinearembedding{i}" for i in range(n_components)], names + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/manifold/tests/test_mds.py b/.venv/lib/python3.12/site-packages/sklearn/manifold/tests/test_mds.py new file mode 100644 index 0000000000000000000000000000000000000000..88dc842a1d5fc4168a3cc9003c929f1770e839bb --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/manifold/tests/test_mds.py @@ -0,0 +1,234 @@ +from unittest.mock import Mock + +import numpy as np +import pytest +from numpy.testing import assert_allclose, assert_array_almost_equal, assert_equal + +from sklearn.datasets import load_digits +from sklearn.manifold import _mds as mds +from sklearn.metrics import euclidean_distances + + +def test_smacof(): + # test metric smacof using the data of "Modern Multidimensional Scaling", + # Borg & Groenen, p 154 + sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]]) + Z = np.array([[-0.266, -0.539], [0.451, 0.252], [0.016, -0.238], [-0.200, 0.524]]) + X, _ = mds.smacof(sim, init=Z, n_components=2, max_iter=1, n_init=1) + X_true = np.array( + [[-1.415, -2.471], [1.633, 1.107], [0.249, -0.067], [-0.468, 1.431]] + ) + assert_array_almost_equal(X, X_true, decimal=3) + + +def test_nonmetric_lower_normalized_stress(): + # Testing that nonmetric MDS results in lower normalized stress compared + # compared to metric MDS (non-regression test for issue 27028) + sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]]) + Z = np.array([[-0.266, -0.539], [0.451, 0.252], [0.016, -0.238], [-0.200, 0.524]]) + + _, stress1 = mds.smacof( + sim, init=Z, n_components=2, max_iter=1000, n_init=1, normalized_stress=True + ) + + _, stress2 = mds.smacof( + sim, + init=Z, + n_components=2, + max_iter=1000, + n_init=1, + normalized_stress=True, + metric=False, + ) + assert stress1 > stress2 + + +def test_nonmetric_mds_optimization(): + # Test that stress is decreasing during nonmetric MDS optimization + # (non-regression test for issue 27028) + X, _ = load_digits(return_X_y=True) + rng = np.random.default_rng(seed=42) + ind_subset = rng.choice(len(X), size=200, replace=False) + X = X[ind_subset] + + mds_est = mds.MDS( + n_components=2, + 
n_init=1, + max_iter=2, + metric=False, + random_state=42, + ).fit(X) + stress_after_2_iter = mds_est.stress_ + + mds_est = mds.MDS( + n_components=2, + n_init=1, + max_iter=3, + metric=False, + random_state=42, + ).fit(X) + stress_after_3_iter = mds_est.stress_ + + assert stress_after_2_iter > stress_after_3_iter + + +@pytest.mark.parametrize("metric", [True, False]) +def test_mds_recovers_true_data(metric): + X = np.array([[1, 1], [1, 4], [1, 5], [3, 3]]) + mds_est = mds.MDS( + n_components=2, + n_init=1, + eps=1e-15, + max_iter=1000, + metric=metric, + random_state=42, + ).fit(X) + stress = mds_est.stress_ + assert_allclose(stress, 0, atol=1e-6) + + +def test_smacof_error(): + # Not symmetric similarity matrix: + sim = np.array([[0, 5, 9, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]]) + + with pytest.raises(ValueError): + mds.smacof(sim, n_init=1) + + # Not squared similarity matrix: + sim = np.array([[0, 5, 9, 4], [5, 0, 2, 2], [4, 2, 1, 0]]) + + with pytest.raises(ValueError): + mds.smacof(sim, n_init=1) + + # init not None and not correct format: + sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]]) + + Z = np.array([[-0.266, -0.539], [0.016, -0.238], [-0.200, 0.524]]) + with pytest.raises(ValueError): + mds.smacof(sim, init=Z, n_init=1) + + +def test_MDS(): + sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]]) + mds_clf = mds.MDS( + metric=False, + n_jobs=3, + n_init=3, + dissimilarity="precomputed", + ) + mds_clf.fit(sim) + + +# TODO(1.9): remove warning filter +@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.parametrize("k", [0.5, 1.5, 2]) +def test_normed_stress(k): + """Test that non-metric MDS normalized stress is scale-invariant.""" + sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]]) + + X1, stress1 = mds.smacof(sim, metric=False, max_iter=5, random_state=0) + X2, stress2 = mds.smacof(k * sim, metric=False, max_iter=5, random_state=0) + + assert_allclose(stress1, stress2, rtol=1e-5) + assert_allclose(X1, X2, rtol=1e-5) + + +# TODO(1.9): remove warning filter +@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.parametrize("metric", [True, False]) +def test_normalized_stress_auto(metric, monkeypatch): + rng = np.random.RandomState(0) + X = rng.randn(4, 3) + dist = euclidean_distances(X) + + mock = Mock(side_effect=mds._smacof_single) + monkeypatch.setattr("sklearn.manifold._mds._smacof_single", mock) + + est = mds.MDS(metric=metric, normalized_stress="auto", random_state=rng) + est.fit_transform(X) + assert mock.call_args[1]["normalized_stress"] != metric + + mds.smacof(dist, metric=metric, normalized_stress="auto", random_state=rng) + assert mock.call_args[1]["normalized_stress"] != metric + + +def test_isotonic_outofbounds(): + # This particular configuration can trigger out of bounds error + # in the isotonic regression (non-regression test for issue 26999) + dis = np.array( + [ + [0.0, 1.732050807568877, 1.7320508075688772], + [1.732050807568877, 0.0, 6.661338147750939e-16], + [1.7320508075688772, 6.661338147750939e-16, 0.0], + ] + ) + init = np.array( + [ + [0.08665881585055124, 0.7939114643387546], + [0.9959834154297658, 0.7555546025640025], + [0.8766008278401566, 0.4227358815811242], + ] + ) + mds.smacof(dis, init=init, metric=False, n_init=1) + + +# TODO(1.9): remove warning filter +@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.parametrize("normalized_stress", [True, False]) +def test_returned_stress(normalized_stress): + # Test that the final stress 
corresponds to the final embedding + # (non-regression test for issue 16846) + X = np.array([[1, 1], [1, 4], [1, 5], [3, 3]]) + D = euclidean_distances(X) + + mds_est = mds.MDS( + n_components=2, + random_state=42, + normalized_stress=normalized_stress, + ).fit(X) + + Z = mds_est.embedding_ + stress = mds_est.stress_ + + D_mds = euclidean_distances(Z) + stress_Z = ((D_mds.ravel() - D.ravel()) ** 2).sum() / 2 + + if normalized_stress: + stress_Z = np.sqrt(stress_Z / ((D_mds.ravel() ** 2).sum() / 2)) + + assert_allclose(stress, stress_Z) + + +# TODO(1.9): remove warning filter +@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.parametrize("metric", [True, False]) +def test_convergence_does_not_depend_on_scale(metric): + # Test that the number of iterations until convergence does not depend on + # the scale of the input data + X = np.array([[1, 1], [1, 4], [1, 5], [3, 3]]) + + mds_est = mds.MDS( + n_components=2, + random_state=42, + metric=metric, + ) + + mds_est.fit(X * 100) + n_iter1 = mds_est.n_iter_ + + mds_est.fit(X / 100) + n_iter2 = mds_est.n_iter_ + + assert_equal(n_iter1, n_iter2) + + +# TODO(1.9): delete this test +def test_future_warning_n_init(): + X = np.array([[1, 1], [1, 4], [1, 5], [3, 3]]) + sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]]) + + with pytest.warns(FutureWarning): + mds.smacof(sim) + + with pytest.warns(FutureWarning): + mds.MDS().fit(X) diff --git a/.venv/lib/python3.12/site-packages/sklearn/manifold/tests/test_spectral_embedding.py b/.venv/lib/python3.12/site-packages/sklearn/manifold/tests/test_spectral_embedding.py new file mode 100644 index 0000000000000000000000000000000000000000..4c4115734a404360d0d4ce507d18df9e4b2b5396 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/manifold/tests/test_spectral_embedding.py @@ -0,0 +1,503 @@ +import itertools +from unittest.mock import Mock + +import numpy as np +import pytest +from scipy import sparse +from scipy.linalg import eigh +from scipy.sparse.linalg import eigsh, lobpcg + +from sklearn.cluster import KMeans +from sklearn.datasets import make_blobs +from sklearn.manifold import SpectralEmbedding, _spectral_embedding, spectral_embedding +from sklearn.manifold._spectral_embedding import ( + _graph_connected_component, + _graph_is_connected, +) +from sklearn.metrics import normalized_mutual_info_score, pairwise_distances +from sklearn.metrics.pairwise import rbf_kernel +from sklearn.neighbors import NearestNeighbors +from sklearn.utils._testing import assert_array_almost_equal, assert_array_equal +from sklearn.utils.extmath import _deterministic_vector_sign_flip +from sklearn.utils.fixes import ( + COO_CONTAINERS, + CSC_CONTAINERS, + CSR_CONTAINERS, + parse_version, + sp_version, +) +from sklearn.utils.fixes import laplacian as csgraph_laplacian + +try: + from pyamg import smoothed_aggregation_solver # noqa: F401 + + pyamg_available = True +except ImportError: + pyamg_available = False +skip_if_no_pyamg = pytest.mark.skipif( + not pyamg_available, reason="PyAMG is required for the tests in this function." 
+) + +# non centered, sparse centers to check the +centers = np.array( + [ + [0.0, 5.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 4.0, 0.0, 0.0], + [1.0, 0.0, 0.0, 5.0, 1.0], + ] +) +n_samples = 1000 +n_clusters, n_features = centers.shape +S, true_labels = make_blobs( + n_samples=n_samples, centers=centers, cluster_std=1.0, random_state=42 +) + + +def _assert_equal_with_sign_flipping(A, B, tol=0.0): + """Check array A and B are equal with possible sign flipping on + each column""" + tol_squared = tol**2 + for A_col, B_col in zip(A.T, B.T): + assert ( + np.max((A_col - B_col) ** 2) <= tol_squared + or np.max((A_col + B_col) ** 2) <= tol_squared + ) + + +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +def test_sparse_graph_connected_component(coo_container): + rng = np.random.RandomState(42) + n_samples = 300 + boundaries = [0, 42, 121, 200, n_samples] + p = rng.permutation(n_samples) + connections = [] + + for start, stop in itertools.pairwise(boundaries): + group = p[start:stop] + # Connect all elements within the group at least once via an + # arbitrary path that spans the group. + for i in range(len(group) - 1): + connections.append((group[i], group[i + 1])) + + # Add some more random connections within the group + min_idx, max_idx = 0, len(group) - 1 + n_random_connections = 1000 + source = rng.randint(min_idx, max_idx, size=n_random_connections) + target = rng.randint(min_idx, max_idx, size=n_random_connections) + connections.extend(zip(group[source], group[target])) + + # Build a symmetric affinity matrix + row_idx, column_idx = tuple(np.array(connections).T) + data = rng.uniform(0.1, 42, size=len(connections)) + affinity = coo_container((data, (row_idx, column_idx))) + affinity = 0.5 * (affinity + affinity.T) + + for start, stop in itertools.pairwise(boundaries): + component_1 = _graph_connected_component(affinity, p[start]) + component_size = stop - start + assert component_1.sum() == component_size + + # We should retrieve the same component mask by starting by both ends + # of the group + component_2 = _graph_connected_component(affinity, p[stop - 1]) + assert component_2.sum() == component_size + assert_array_equal(component_1, component_2) + + +# TODO: investigate why this test is seed-sensitive on 32-bit Python +# runtimes. Is this revealing a numerical stability problem ? Or is it +# expected from the test numerical design ? In the latter case the test +# should be made less seed-sensitive instead. 
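# (Throughout these tests, embeddings are compared with
#  _assert_equal_with_sign_flipping because spectral coordinates are
#  eigenvectors, which are only determined up to a per-column sign; A and -A
#  describe the same embedding.)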
+@pytest.mark.parametrize( + "eigen_solver", + [ + "arpack", + "lobpcg", + pytest.param("amg", marks=skip_if_no_pyamg), + ], +) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_spectral_embedding_two_components(eigen_solver, dtype, seed=0): + # Test spectral embedding with two components + random_state = np.random.RandomState(seed) + n_sample = 100 + affinity = np.zeros(shape=[n_sample * 2, n_sample * 2]) + # first component + affinity[0:n_sample, 0:n_sample] = ( + np.abs(random_state.randn(n_sample, n_sample)) + 2 + ) + # second component + affinity[n_sample::, n_sample::] = ( + np.abs(random_state.randn(n_sample, n_sample)) + 2 + ) + + # Test of internal _graph_connected_component before connection + component = _graph_connected_component(affinity, 0) + assert component[:n_sample].all() + assert not component[n_sample:].any() + component = _graph_connected_component(affinity, -1) + assert not component[:n_sample].any() + assert component[n_sample:].all() + + # connection + affinity[0, n_sample + 1] = 1 + affinity[n_sample + 1, 0] = 1 + affinity.flat[:: 2 * n_sample + 1] = 0 + affinity = 0.5 * (affinity + affinity.T) + + true_label = np.zeros(shape=2 * n_sample) + true_label[0:n_sample] = 1 + + se_precomp = SpectralEmbedding( + n_components=1, + affinity="precomputed", + random_state=np.random.RandomState(seed), + eigen_solver=eigen_solver, + ) + + embedded_coordinate = se_precomp.fit_transform(affinity.astype(dtype)) + # thresholding on the first components using 0. + label_ = np.array(embedded_coordinate.ravel() < 0, dtype=np.int64) + assert normalized_mutual_info_score(true_label, label_) == pytest.approx(1.0) + + +@pytest.mark.parametrize("sparse_container", [None, *CSR_CONTAINERS]) +@pytest.mark.parametrize( + "eigen_solver", + [ + "arpack", + "lobpcg", + pytest.param("amg", marks=skip_if_no_pyamg), + ], +) +@pytest.mark.parametrize("dtype", (np.float32, np.float64)) +def test_spectral_embedding_precomputed_affinity( + sparse_container, eigen_solver, dtype, seed=36 +): + # Test spectral embedding with precomputed kernel + gamma = 1.0 + X = S if sparse_container is None else sparse_container(S) + + se_precomp = SpectralEmbedding( + n_components=2, + affinity="precomputed", + random_state=np.random.RandomState(seed), + eigen_solver=eigen_solver, + ) + se_rbf = SpectralEmbedding( + n_components=2, + affinity="rbf", + gamma=gamma, + random_state=np.random.RandomState(seed), + eigen_solver=eigen_solver, + ) + embed_precomp = se_precomp.fit_transform(rbf_kernel(X.astype(dtype), gamma=gamma)) + embed_rbf = se_rbf.fit_transform(X.astype(dtype)) + assert_array_almost_equal(se_precomp.affinity_matrix_, se_rbf.affinity_matrix_) + _assert_equal_with_sign_flipping(embed_precomp, embed_rbf, 0.05) + + +def test_precomputed_nearest_neighbors_filtering(): + # Test precomputed graph filtering when containing too many neighbors + n_neighbors = 2 + results = [] + for additional_neighbors in [0, 10]: + nn = NearestNeighbors(n_neighbors=n_neighbors + additional_neighbors).fit(S) + graph = nn.kneighbors_graph(S, mode="connectivity") + embedding = ( + SpectralEmbedding( + random_state=0, + n_components=2, + affinity="precomputed_nearest_neighbors", + n_neighbors=n_neighbors, + ) + .fit(graph) + .embedding_ + ) + results.append(embedding) + + assert_array_equal(results[0], results[1]) + + +@pytest.mark.parametrize("sparse_container", [None, *CSR_CONTAINERS]) +def test_spectral_embedding_callable_affinity(sparse_container, seed=36): + # Test spectral embedding with callable affinity + 
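    # (The callable must return a full (n_samples, n_samples) affinity matrix;
    #  here it is an rbf_kernel closure with the same gamma as the "rbf" string
    #  affinity, so the two fits should agree up to per-column sign flips.)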
gamma = 0.9 + kern = rbf_kernel(S, gamma=gamma) + X = S if sparse_container is None else sparse_container(S) + + se_callable = SpectralEmbedding( + n_components=2, + affinity=(lambda x: rbf_kernel(x, gamma=gamma)), + gamma=gamma, + random_state=np.random.RandomState(seed), + ) + se_rbf = SpectralEmbedding( + n_components=2, + affinity="rbf", + gamma=gamma, + random_state=np.random.RandomState(seed), + ) + embed_rbf = se_rbf.fit_transform(X) + embed_callable = se_callable.fit_transform(X) + assert_array_almost_equal(se_callable.affinity_matrix_, se_rbf.affinity_matrix_) + assert_array_almost_equal(kern, se_rbf.affinity_matrix_) + _assert_equal_with_sign_flipping(embed_rbf, embed_callable, 0.05) + + +@pytest.mark.skipif( + not pyamg_available, reason="PyAMG is required for the tests in this function." +) +@pytest.mark.parametrize("dtype", (np.float32, np.float64)) +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +def test_spectral_embedding_amg_solver(dtype, coo_container, seed=36): + se_amg = SpectralEmbedding( + n_components=2, + affinity="nearest_neighbors", + eigen_solver="amg", + n_neighbors=5, + random_state=np.random.RandomState(seed), + ) + se_arpack = SpectralEmbedding( + n_components=2, + affinity="nearest_neighbors", + eigen_solver="arpack", + n_neighbors=5, + random_state=np.random.RandomState(seed), + ) + embed_amg = se_amg.fit_transform(S.astype(dtype)) + embed_arpack = se_arpack.fit_transform(S.astype(dtype)) + _assert_equal_with_sign_flipping(embed_amg, embed_arpack, 1e-5) + + # same with special case in which amg is not actually used + # regression test for #10715 + # affinity between nodes + row = np.array([0, 0, 1, 2, 3, 3, 4], dtype=np.int32) + col = np.array([1, 2, 2, 3, 4, 5, 5], dtype=np.int32) + val = np.array([100, 100, 100, 1, 100, 100, 100], dtype=np.int64) + + affinity = coo_container( + (np.hstack([val, val]), (np.hstack([row, col]), np.hstack([col, row]))), + shape=(6, 6), + ) + se_amg.affinity = "precomputed" + se_arpack.affinity = "precomputed" + embed_amg = se_amg.fit_transform(affinity.astype(dtype)) + embed_arpack = se_arpack.fit_transform(affinity.astype(dtype)) + _assert_equal_with_sign_flipping(embed_amg, embed_arpack, 1e-5) + + # Check that passing a sparse matrix with `np.int64` indices dtype raises an error + # or is successful based on the version of SciPy which is installed. + # Use a CSR matrix to avoid any conversion during the validation + affinity = affinity.tocsr() + affinity.indptr = affinity.indptr.astype(np.int64) + affinity.indices = affinity.indices.astype(np.int64) + + # PR: https://github.com/scipy/scipy/pull/18913 + # First integration in 1.11.3: https://github.com/scipy/scipy/pull/19279 + scipy_graph_traversal_supports_int64_index = sp_version >= parse_version("1.11.3") + if scipy_graph_traversal_supports_int64_index: + se_amg.fit_transform(affinity) + else: + err_msg = "Only sparse matrices with 32-bit integer indices are accepted" + with pytest.raises(ValueError, match=err_msg): + se_amg.fit_transform(affinity) + + +@pytest.mark.skipif( + not pyamg_available, reason="PyAMG is required for the tests in this function." 
+) +@pytest.mark.parametrize("dtype", (np.float32, np.float64)) +def test_spectral_embedding_amg_solver_failure(dtype, seed=36): + # Non-regression test for amg solver failure (issue #13393 on github) + num_nodes = 100 + X = sparse.rand(num_nodes, num_nodes, density=0.1, random_state=seed) + X = X.astype(dtype) + upper = sparse.triu(X) - sparse.diags(X.diagonal()) + sym_matrix = upper + upper.T + embedding = spectral_embedding( + sym_matrix, n_components=10, eigen_solver="amg", random_state=0 + ) + + # Check that the learned embedding is stable w.r.t. random solver init: + for i in range(3): + new_embedding = spectral_embedding( + sym_matrix, n_components=10, eigen_solver="amg", random_state=i + 1 + ) + _assert_equal_with_sign_flipping(embedding, new_embedding, tol=0.05) + + +def test_pipeline_spectral_clustering(seed=36): + # Test using pipeline to do spectral clustering + random_state = np.random.RandomState(seed) + se_rbf = SpectralEmbedding( + n_components=n_clusters, affinity="rbf", random_state=random_state + ) + se_knn = SpectralEmbedding( + n_components=n_clusters, + affinity="nearest_neighbors", + n_neighbors=5, + random_state=random_state, + ) + for se in [se_rbf, se_knn]: + km = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=10) + km.fit(se.fit_transform(S)) + assert_array_almost_equal( + normalized_mutual_info_score(km.labels_, true_labels), 1.0, 2 + ) + + +def test_connectivity(seed=36): + # Test that graph connectivity test works as expected + graph = np.array( + [ + [1, 0, 0, 0, 0], + [0, 1, 1, 0, 0], + [0, 1, 1, 1, 0], + [0, 0, 1, 1, 1], + [0, 0, 0, 1, 1], + ] + ) + assert not _graph_is_connected(graph) + for csr_container in CSR_CONTAINERS: + assert not _graph_is_connected(csr_container(graph)) + for csc_container in CSC_CONTAINERS: + assert not _graph_is_connected(csc_container(graph)) + + graph = np.array( + [ + [1, 1, 0, 0, 0], + [1, 1, 1, 0, 0], + [0, 1, 1, 1, 0], + [0, 0, 1, 1, 1], + [0, 0, 0, 1, 1], + ] + ) + assert _graph_is_connected(graph) + for csr_container in CSR_CONTAINERS: + assert _graph_is_connected(csr_container(graph)) + for csc_container in CSC_CONTAINERS: + assert _graph_is_connected(csc_container(graph)) + + +def test_spectral_embedding_deterministic(): + # Test that Spectral Embedding is deterministic + random_state = np.random.RandomState(36) + data = random_state.randn(10, 30) + sims = rbf_kernel(data) + embedding_1 = spectral_embedding(sims) + embedding_2 = spectral_embedding(sims) + assert_array_almost_equal(embedding_1, embedding_2) + + +def test_spectral_embedding_unnormalized(): + # Test that spectral_embedding is also processing unnormalized laplacian + # correctly + random_state = np.random.RandomState(36) + data = random_state.randn(10, 30) + sims = rbf_kernel(data) + n_components = 8 + embedding_1 = spectral_embedding( + sims, norm_laplacian=False, n_components=n_components, drop_first=False + ) + + # Verify using manual computation with dense eigh + laplacian, dd = csgraph_laplacian(sims, normed=False, return_diag=True) + _, diffusion_map = eigh(laplacian) + embedding_2 = diffusion_map.T[:n_components] + embedding_2 = _deterministic_vector_sign_flip(embedding_2).T + + assert_array_almost_equal(embedding_1, embedding_2) + + +def test_spectral_embedding_first_eigen_vector(): + # Test that the first eigenvector of spectral_embedding + # is constant and that the second is not (for a connected graph) + random_state = np.random.RandomState(36) + data = random_state.randn(10, 30) + sims = rbf_kernel(data) + n_components = 2 + + 
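    # (Reasoning sketch: for a connected graph the unnormalized Laplacian
    #  L = D - W satisfies L @ ones == 0, so the eigenvector attached to
    #  eigenvalue 0 is the constant vector; with drop_first=False it is kept as
    #  the first column, hence its standard deviation should be ~0 while the
    #  second column carries actual structure.)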
for seed in range(10): + embedding = spectral_embedding( + sims, + norm_laplacian=False, + n_components=n_components, + drop_first=False, + random_state=seed, + ) + + assert np.std(embedding[:, 0]) == pytest.approx(0) + assert np.std(embedding[:, 1]) > 1e-3 + + +@pytest.mark.parametrize( + "eigen_solver", + [ + "arpack", + "lobpcg", + pytest.param("amg", marks=skip_if_no_pyamg), + ], +) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_spectral_embedding_preserves_dtype(eigen_solver, dtype): + """Check that `SpectralEmbedding is preserving the dtype of the fitted + attribute and transformed data. + + Ideally, this test should be covered by the common test + `check_transformer_preserve_dtypes`. However, this test only run + with transformers implementing `transform` while `SpectralEmbedding` + implements only `fit_transform`. + """ + X = S.astype(dtype) + se = SpectralEmbedding( + n_components=2, affinity="rbf", eigen_solver=eigen_solver, random_state=0 + ) + X_trans = se.fit_transform(X) + + assert X_trans.dtype == dtype + assert se.embedding_.dtype == dtype + assert se.affinity_matrix_.dtype == dtype + + +@pytest.mark.skipif( + pyamg_available, + reason="PyAMG is installed and we should not test for an error.", +) +def test_error_pyamg_not_available(): + se_precomp = SpectralEmbedding( + n_components=2, + affinity="rbf", + eigen_solver="amg", + ) + err_msg = "The eigen_solver was set to 'amg', but pyamg is not available." + with pytest.raises(ValueError, match=err_msg): + se_precomp.fit_transform(S) + + +@pytest.mark.parametrize("solver", ["arpack", "amg", "lobpcg"]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_spectral_eigen_tol_auto(monkeypatch, solver, csr_container): + """Test that `eigen_tol="auto"` is resolved correctly""" + if solver == "amg" and not pyamg_available: + pytest.skip("PyAMG is not available.") + X, _ = make_blobs( + n_samples=200, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01 + ) + D = pairwise_distances(X) # Distance matrix + S = np.max(D) - D # Similarity matrix + + solver_func = eigsh if solver == "arpack" else lobpcg + default_value = 0 if solver == "arpack" else None + if solver == "amg": + S = csr_container(S) + + mocked_solver = Mock(side_effect=solver_func) + + monkeypatch.setattr(_spectral_embedding, solver_func.__qualname__, mocked_solver) + + spectral_embedding(S, random_state=42, eigen_solver=solver, eigen_tol="auto") + mocked_solver.assert_called() + + _, kwargs = mocked_solver.call_args + assert kwargs["tol"] == default_value diff --git a/.venv/lib/python3.12/site-packages/sklearn/manifold/tests/test_t_sne.py b/.venv/lib/python3.12/site-packages/sklearn/manifold/tests/test_t_sne.py new file mode 100644 index 0000000000000000000000000000000000000000..4f32b889d5b1f20c758228f075e4f5541bfb3300 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/manifold/tests/test_t_sne.py @@ -0,0 +1,1187 @@ +import re +import sys +from io import StringIO + +import numpy as np +import pytest +import scipy.sparse as sp +from numpy.testing import assert_allclose +from scipy.optimize import check_grad +from scipy.spatial.distance import pdist, squareform + +from sklearn import config_context +from sklearn.datasets import make_blobs + +# mypy error: Module 'sklearn.manifold' has no attribute '_barnes_hut_tsne' +from sklearn.manifold import ( # type: ignore[attr-defined] + TSNE, + _barnes_hut_tsne, +) +from sklearn.manifold._t_sne import ( + _gradient_descent, + _joint_probabilities, + _joint_probabilities_nn, 
+ _kl_divergence, + _kl_divergence_bh, + trustworthiness, +) +from sklearn.manifold._utils import _binary_search_perplexity +from sklearn.metrics.pairwise import ( + cosine_distances, + manhattan_distances, + pairwise_distances, +) +from sklearn.neighbors import NearestNeighbors, kneighbors_graph +from sklearn.utils import check_random_state +from sklearn.utils._testing import ( + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, + skip_if_32bit, +) +from sklearn.utils.fixes import CSR_CONTAINERS, LIL_CONTAINERS + +x = np.linspace(0, 1, 10) +xx, yy = np.meshgrid(x, x) +X_2d_grid = np.hstack( + [ + xx.ravel().reshape(-1, 1), + yy.ravel().reshape(-1, 1), + ] +) + + +def test_gradient_descent_stops(): + # Test stopping conditions of gradient descent. + class ObjectiveSmallGradient: + def __init__(self): + self.it = -1 + + def __call__(self, _, compute_error=True): + self.it += 1 + return (10 - self.it) / 10.0, np.array([1e-5]) + + def flat_function(_, compute_error=True): + return 0.0, np.ones(1) + + # Gradient norm + old_stdout = sys.stdout + sys.stdout = StringIO() + try: + _, error, it = _gradient_descent( + ObjectiveSmallGradient(), + np.zeros(1), + 0, + max_iter=100, + n_iter_without_progress=100, + momentum=0.0, + learning_rate=0.0, + min_gain=0.0, + min_grad_norm=1e-5, + verbose=2, + ) + finally: + out = sys.stdout.getvalue() + sys.stdout.close() + sys.stdout = old_stdout + assert error == 1.0 + assert it == 0 + assert "gradient norm" in out + + # Maximum number of iterations without improvement + old_stdout = sys.stdout + sys.stdout = StringIO() + try: + _, error, it = _gradient_descent( + flat_function, + np.zeros(1), + 0, + max_iter=100, + n_iter_without_progress=10, + momentum=0.0, + learning_rate=0.0, + min_gain=0.0, + min_grad_norm=0.0, + verbose=2, + ) + finally: + out = sys.stdout.getvalue() + sys.stdout.close() + sys.stdout = old_stdout + assert error == 0.0 + assert it == 11 + assert "did not make any progress" in out + + # Maximum number of iterations + old_stdout = sys.stdout + sys.stdout = StringIO() + try: + _, error, it = _gradient_descent( + ObjectiveSmallGradient(), + np.zeros(1), + 0, + max_iter=11, + n_iter_without_progress=100, + momentum=0.0, + learning_rate=0.0, + min_gain=0.0, + min_grad_norm=0.0, + verbose=2, + ) + finally: + out = sys.stdout.getvalue() + sys.stdout.close() + sys.stdout = old_stdout + assert error == 0.0 + assert it == 10 + assert "Iteration 10" in out + + +def test_binary_search(): + # Test if the binary search finds Gaussians with desired perplexity. + random_state = check_random_state(0) + data = random_state.randn(50, 5) + distances = pairwise_distances(data).astype(np.float32) + desired_perplexity = 25.0 + P = _binary_search_perplexity(distances, desired_perplexity, verbose=0) + P = np.maximum(P, np.finfo(np.double).eps) + mean_perplexity = np.mean( + [np.exp(-np.sum(P[i] * np.log(P[i]))) for i in range(P.shape[0])] + ) + assert_almost_equal(mean_perplexity, desired_perplexity, decimal=3) + + +def test_binary_search_underflow(): + # Test if the binary search finds Gaussians with desired perplexity. + # A more challenging case than the one above, producing numeric + # underflow in float precision (see issue #19471 and PR #19472). 
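    # (Perplexity is defined as 2**H(P_i), with H the Shannon entropy of the
    #  conditional distribution in bits; the assertion below recomputes it from
    #  the returned row of P to check that the binary search hit the target.)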
+ random_state = check_random_state(42) + data = random_state.randn(1, 90).astype(np.float32) + 100 + desired_perplexity = 30.0 + P = _binary_search_perplexity(data, desired_perplexity, verbose=0) + perplexity = 2 ** -np.nansum(P[0, 1:] * np.log2(P[0, 1:])) + assert_almost_equal(perplexity, desired_perplexity, decimal=3) + + +def test_binary_search_neighbors(): + # Binary perplexity search approximation. + # Should be approximately equal to the slow method when we use + # all points as neighbors. + n_samples = 200 + desired_perplexity = 25.0 + random_state = check_random_state(0) + data = random_state.randn(n_samples, 2).astype(np.float32, copy=False) + distances = pairwise_distances(data) + P1 = _binary_search_perplexity(distances, desired_perplexity, verbose=0) + + # Test that when we use all the neighbors the results are identical + n_neighbors = n_samples - 1 + nn = NearestNeighbors().fit(data) + distance_graph = nn.kneighbors_graph(n_neighbors=n_neighbors, mode="distance") + distances_nn = distance_graph.data.astype(np.float32, copy=False) + distances_nn = distances_nn.reshape(n_samples, n_neighbors) + P2 = _binary_search_perplexity(distances_nn, desired_perplexity, verbose=0) + + indptr = distance_graph.indptr + P1_nn = np.array( + [ + P1[k, distance_graph.indices[indptr[k] : indptr[k + 1]]] + for k in range(n_samples) + ] + ) + assert_array_almost_equal(P1_nn, P2, decimal=4) + + # Test that the highest P_ij are the same when fewer neighbors are used + for k in np.linspace(150, n_samples - 1, 5): + k = int(k) + topn = k * 10 # check the top 10 * k entries out of k * k entries + distance_graph = nn.kneighbors_graph(n_neighbors=k, mode="distance") + distances_nn = distance_graph.data.astype(np.float32, copy=False) + distances_nn = distances_nn.reshape(n_samples, k) + P2k = _binary_search_perplexity(distances_nn, desired_perplexity, verbose=0) + assert_array_almost_equal(P1_nn, P2, decimal=2) + idx = np.argsort(P1.ravel())[::-1] + P1top = P1.ravel()[idx][:topn] + idx = np.argsort(P2k.ravel())[::-1] + P2top = P2k.ravel()[idx][:topn] + assert_array_almost_equal(P1top, P2top, decimal=2) + + +def test_binary_perplexity_stability(): + # Binary perplexity search should be stable. + # The binary_search_perplexity had a bug wherein the P array + # was uninitialized, leading to sporadically failing tests. + n_neighbors = 10 + n_samples = 100 + random_state = check_random_state(0) + data = random_state.randn(n_samples, 5) + nn = NearestNeighbors().fit(data) + distance_graph = nn.kneighbors_graph(n_neighbors=n_neighbors, mode="distance") + distances = distance_graph.data.astype(np.float32, copy=False) + distances = distances.reshape(n_samples, n_neighbors) + last_P = None + desired_perplexity = 3 + for _ in range(100): + P = _binary_search_perplexity(distances.copy(), desired_perplexity, verbose=0) + P1 = _joint_probabilities_nn(distance_graph, desired_perplexity, verbose=0) + # Convert the sparse matrix to a dense one for testing + P1 = P1.toarray() + if last_P is None: + last_P = P + last_P1 = P1 + else: + assert_array_almost_equal(P, last_P, decimal=4) + assert_array_almost_equal(P1, last_P1, decimal=4) + + +def test_gradient(): + # Test gradient of Kullback-Leibler divergence. 
+ random_state = check_random_state(0) + + n_samples = 50 + n_features = 2 + n_components = 2 + alpha = 1.0 + + distances = random_state.randn(n_samples, n_features).astype(np.float32) + distances = np.abs(distances.dot(distances.T)) + np.fill_diagonal(distances, 0.0) + X_embedded = random_state.randn(n_samples, n_components).astype(np.float32) + + P = _joint_probabilities(distances, desired_perplexity=25.0, verbose=0) + + def fun(params): + return _kl_divergence(params, P, alpha, n_samples, n_components)[0] + + def grad(params): + return _kl_divergence(params, P, alpha, n_samples, n_components)[1] + + assert_almost_equal(check_grad(fun, grad, X_embedded.ravel()), 0.0, decimal=5) + + +def test_trustworthiness(): + # Test trustworthiness score. + random_state = check_random_state(0) + + # Affine transformation + X = random_state.randn(100, 2) + assert trustworthiness(X, 5.0 + X / 10.0) == 1.0 + + # Randomly shuffled + X = np.arange(100).reshape(-1, 1) + X_embedded = X.copy() + random_state.shuffle(X_embedded) + assert trustworthiness(X, X_embedded) < 0.6 + + # Completely different + X = np.arange(5).reshape(-1, 1) + X_embedded = np.array([[0], [2], [4], [1], [3]]) + assert_almost_equal(trustworthiness(X, X_embedded, n_neighbors=1), 0.2) + + +def test_trustworthiness_n_neighbors_error(): + """Raise an error when n_neighbors >= n_samples / 2. + + Non-regression test for #18567. + """ + regex = "n_neighbors .+ should be less than .+" + rng = np.random.RandomState(42) + X = rng.rand(7, 4) + X_embedded = rng.rand(7, 2) + with pytest.raises(ValueError, match=regex): + trustworthiness(X, X_embedded, n_neighbors=5) + + trust = trustworthiness(X, X_embedded, n_neighbors=3) + assert 0 <= trust <= 1 + + +@pytest.mark.parametrize("method", ["exact", "barnes_hut"]) +@pytest.mark.parametrize("init", ("random", "pca")) +def test_preserve_trustworthiness_approximately(method, init): + # Nearest neighbors should be preserved approximately. + random_state = check_random_state(0) + n_components = 2 + X = random_state.randn(50, n_components).astype(np.float32) + tsne = TSNE( + n_components=n_components, + init=init, + random_state=0, + method=method, + max_iter=700, + learning_rate="auto", + ) + X_embedded = tsne.fit_transform(X) + t = trustworthiness(X, X_embedded, n_neighbors=1) + assert t > 0.85 + + +def test_optimization_minimizes_kl_divergence(): + """t-SNE should give a lower KL divergence with more iterations.""" + random_state = check_random_state(0) + X, _ = make_blobs(n_features=3, random_state=random_state) + kl_divergences = [] + for max_iter in [250, 300, 350]: + tsne = TSNE( + n_components=2, + init="random", + perplexity=10, + learning_rate=100.0, + max_iter=max_iter, + random_state=0, + ) + tsne.fit_transform(X) + kl_divergences.append(tsne.kl_divergence_) + assert kl_divergences[1] <= kl_divergences[0] + assert kl_divergences[2] <= kl_divergences[1] + + +@pytest.mark.parametrize("method", ["exact", "barnes_hut"]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_fit_transform_csr_matrix(method, csr_container): + # TODO: compare results on dense and sparse data as proposed in: + # https://github.com/scikit-learn/scikit-learn/pull/23585#discussion_r968388186 + # X can be a sparse matrix. 
+ rng = check_random_state(0) + X = rng.randn(50, 2) + X[(rng.randint(0, 50, 25), rng.randint(0, 2, 25))] = 0.0 + X_csr = csr_container(X) + tsne = TSNE( + n_components=2, + init="random", + perplexity=10, + learning_rate=100.0, + random_state=0, + method=method, + max_iter=750, + ) + X_embedded = tsne.fit_transform(X_csr) + assert_allclose(trustworthiness(X_csr, X_embedded, n_neighbors=1), 1.0, rtol=1.1e-1) + + +def test_preserve_trustworthiness_approximately_with_precomputed_distances(): + # Nearest neighbors should be preserved approximately. + random_state = check_random_state(0) + for i in range(3): + X = random_state.randn(80, 2) + D = squareform(pdist(X), "sqeuclidean") + tsne = TSNE( + n_components=2, + perplexity=2, + learning_rate=100.0, + early_exaggeration=2.0, + metric="precomputed", + random_state=i, + verbose=0, + max_iter=500, + init="random", + ) + X_embedded = tsne.fit_transform(D) + t = trustworthiness(D, X_embedded, n_neighbors=1, metric="precomputed") + assert t > 0.95 + + +def test_trustworthiness_not_euclidean_metric(): + # Test trustworthiness with a metric different from 'euclidean' and + # 'precomputed' + random_state = check_random_state(0) + X = random_state.randn(100, 2) + assert trustworthiness(X, X, metric="cosine") == trustworthiness( + pairwise_distances(X, metric="cosine"), X, metric="precomputed" + ) + + +@pytest.mark.parametrize( + "method, retype", + [ + ("exact", np.asarray), + ("barnes_hut", np.asarray), + *[("barnes_hut", csr_container) for csr_container in CSR_CONTAINERS], + ], +) +@pytest.mark.parametrize( + "D, message_regex", + [ + ([[0.0], [1.0]], ".* square distance matrix"), + ([[0.0, -1.0], [1.0, 0.0]], ".* positive.*"), + ], +) +def test_bad_precomputed_distances(method, D, retype, message_regex): + tsne = TSNE( + metric="precomputed", + method=method, + init="random", + random_state=42, + perplexity=1, + ) + with pytest.raises(ValueError, match=message_regex): + tsne.fit_transform(retype(D)) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_exact_no_precomputed_sparse(csr_container): + tsne = TSNE( + metric="precomputed", + method="exact", + init="random", + random_state=42, + perplexity=1, + ) + with pytest.raises(TypeError, match="sparse"): + tsne.fit_transform(csr_container([[0, 5], [5, 0]])) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_high_perplexity_precomputed_sparse_distances(csr_container): + # Perplexity should be less than 50 + dist = np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [1.0, 0.0, 0.0]]) + bad_dist = csr_container(dist) + tsne = TSNE(metric="precomputed", init="random", random_state=42, perplexity=1) + msg = "3 neighbors per samples are required, but some samples have only 1" + with pytest.raises(ValueError, match=msg): + tsne.fit_transform(bad_dist) + + +@pytest.mark.filterwarnings( + "ignore:Precomputed sparse input was not sorted by " + "row values:sklearn.exceptions.EfficiencyWarning" +) +@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + LIL_CONTAINERS) +def test_sparse_precomputed_distance(sparse_container): + """Make sure that TSNE works identically for sparse and dense matrix""" + random_state = check_random_state(0) + X = random_state.randn(100, 2) + + D_sparse = kneighbors_graph(X, n_neighbors=100, mode="distance", include_self=True) + D = pairwise_distances(X) + assert sp.issparse(D_sparse) + assert_almost_equal(D_sparse.toarray(), D) + + tsne = TSNE( + metric="precomputed", random_state=0, init="random", learning_rate="auto" + ) + Xt_dense = 
tsne.fit_transform(D) + + Xt_sparse = tsne.fit_transform(sparse_container(D_sparse)) + assert_almost_equal(Xt_dense, Xt_sparse) + + +def test_non_positive_computed_distances(): + # Computed distance matrices must be positive. + def metric(x, y): + return -1 + + # Negative computed distances should be caught even if result is squared + tsne = TSNE(metric=metric, method="exact", perplexity=1) + X = np.array([[0.0, 0.0], [1.0, 1.0]]) + with pytest.raises(ValueError, match="All distances .*metric given.*"): + tsne.fit_transform(X) + + +def test_init_ndarray(): + # Initialize TSNE with ndarray and test fit + tsne = TSNE(init=np.zeros((100, 2)), learning_rate="auto") + X_embedded = tsne.fit_transform(np.ones((100, 5))) + assert_array_equal(np.zeros((100, 2)), X_embedded) + + +def test_init_ndarray_precomputed(): + # Initialize TSNE with ndarray and metric 'precomputed' + # Make sure no FutureWarning is thrown from _fit + tsne = TSNE( + init=np.zeros((100, 2)), + metric="precomputed", + learning_rate=50.0, + ) + tsne.fit(np.zeros((100, 100))) + + +def test_pca_initialization_not_compatible_with_precomputed_kernel(): + # Precomputed distance matrices cannot use PCA initialization. + tsne = TSNE(metric="precomputed", init="pca", perplexity=1) + with pytest.raises( + ValueError, + match='The parameter init="pca" cannot be used with metric="precomputed".', + ): + tsne.fit_transform(np.array([[0.0], [1.0]])) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_pca_initialization_not_compatible_with_sparse_input(csr_container): + # Sparse input matrices cannot use PCA initialization. + tsne = TSNE(init="pca", learning_rate=100.0, perplexity=1) + with pytest.raises(TypeError, match="PCA initialization.*"): + tsne.fit_transform(csr_container([[0, 5], [5, 0]])) + + +def test_n_components_range(): + # barnes_hut method should only be used with n_components <= 3 + tsne = TSNE(n_components=4, method="barnes_hut", perplexity=1) + with pytest.raises(ValueError, match="'n_components' should be .*"): + tsne.fit_transform(np.array([[0.0], [1.0]])) + + +def test_early_exaggeration_used(): + # check that the ``early_exaggeration`` parameter has an effect + random_state = check_random_state(0) + n_components = 2 + methods = ["exact", "barnes_hut"] + X = random_state.randn(25, n_components).astype(np.float32) + for method in methods: + tsne = TSNE( + n_components=n_components, + perplexity=1, + learning_rate=100.0, + init="pca", + random_state=0, + method=method, + early_exaggeration=1.0, + max_iter=250, + ) + X_embedded1 = tsne.fit_transform(X) + tsne = TSNE( + n_components=n_components, + perplexity=1, + learning_rate=100.0, + init="pca", + random_state=0, + method=method, + early_exaggeration=10.0, + max_iter=250, + ) + X_embedded2 = tsne.fit_transform(X) + + assert not np.allclose(X_embedded1, X_embedded2) + + +def test_max_iter_used(): + # check that the ``max_iter`` parameter has an effect + random_state = check_random_state(0) + n_components = 2 + methods = ["exact", "barnes_hut"] + X = random_state.randn(25, n_components).astype(np.float32) + for method in methods: + for max_iter in [251, 500]: + tsne = TSNE( + n_components=n_components, + perplexity=1, + learning_rate=0.5, + init="random", + random_state=0, + method=method, + early_exaggeration=1.0, + max_iter=max_iter, + ) + tsne.fit_transform(X) + + assert tsne.n_iter_ == max_iter - 1 + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_answer_gradient_two_points(csr_container): + # Test the tree with only a single 
set of children. + # + # These tests & answers have been checked against the reference + # implementation by LvdM. + pos_input = np.array([[1.0, 0.0], [0.0, 1.0]]) + pos_output = np.array( + [[-4.961291e-05, -1.072243e-04], [9.259460e-05, 2.702024e-04]] + ) + neighbors = np.array([[1], [0]]) + grad_output = np.array( + [[-2.37012478e-05, -6.29044398e-05], [2.37012478e-05, 6.29044398e-05]] + ) + _run_answer_test(pos_input, pos_output, neighbors, grad_output, csr_container) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_answer_gradient_four_points(csr_container): + # Four points tests the tree with multiple levels of children. + # + # These tests & answers have been checked against the reference + # implementation by LvdM. + pos_input = np.array([[1.0, 0.0], [0.0, 1.0], [5.0, 2.0], [7.3, 2.2]]) + pos_output = np.array( + [ + [6.080564e-05, -7.120823e-05], + [-1.718945e-04, -4.000536e-05], + [-2.271720e-04, 8.663310e-05], + [-1.032577e-04, -3.582033e-05], + ] + ) + neighbors = np.array([[1, 2, 3], [0, 2, 3], [1, 0, 3], [1, 2, 0]]) + grad_output = np.array( + [ + [5.81128448e-05, -7.78033454e-06], + [-5.81526851e-05, 7.80976444e-06], + [4.24275173e-08, -3.69569698e-08], + [-2.58720939e-09, 7.52706374e-09], + ] + ) + _run_answer_test(pos_input, pos_output, neighbors, grad_output, csr_container) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_skip_num_points_gradient(csr_container): + # Test the kwargs option skip_num_points. + # + # Skip num points should make it such that the Barnes_hut gradient + # is not calculated for indices below skip_num_point. + # Aside from skip_num_points=2 and the first two gradient rows + # being set to zero, these data points are the same as in + # test_answer_gradient_four_points() + pos_input = np.array([[1.0, 0.0], [0.0, 1.0], [5.0, 2.0], [7.3, 2.2]]) + pos_output = np.array( + [ + [6.080564e-05, -7.120823e-05], + [-1.718945e-04, -4.000536e-05], + [-2.271720e-04, 8.663310e-05], + [-1.032577e-04, -3.582033e-05], + ] + ) + neighbors = np.array([[1, 2, 3], [0, 2, 3], [1, 0, 3], [1, 2, 0]]) + grad_output = np.array( + [ + [0.0, 0.0], + [0.0, 0.0], + [4.24275173e-08, -3.69569698e-08], + [-2.58720939e-09, 7.52706374e-09], + ] + ) + _run_answer_test( + pos_input, pos_output, neighbors, grad_output, csr_container, False, 0.1, 2 + ) + + +def _run_answer_test( + pos_input, + pos_output, + neighbors, + grad_output, + csr_container, + verbose=False, + perplexity=0.1, + skip_num_points=0, +): + distances = pairwise_distances(pos_input).astype(np.float32) + args = distances, perplexity, verbose + pos_output = pos_output.astype(np.float32) + neighbors = neighbors.astype(np.int64, copy=False) + pij_input = _joint_probabilities(*args) + pij_input = squareform(pij_input).astype(np.float32) + grad_bh = np.zeros(pos_output.shape, dtype=np.float32) + + P = csr_container(pij_input) + + neighbors = P.indices.astype(np.int64) + indptr = P.indptr.astype(np.int64) + + _barnes_hut_tsne.gradient( + P.data, pos_output, neighbors, indptr, grad_bh, 0.5, 2, 1, skip_num_points=0 + ) + assert_array_almost_equal(grad_bh, grad_output, decimal=4) + + +def test_verbose(): + # Verbose options write to stdout. + random_state = check_random_state(0) + tsne = TSNE(verbose=2, perplexity=4) + X = random_state.randn(5, 2) + + old_stdout = sys.stdout + sys.stdout = StringIO() + try: + tsne.fit_transform(X) + finally: + out = sys.stdout.getvalue() + sys.stdout.close() + sys.stdout = old_stdout + + assert "[t-SNE]" in out + assert "nearest neighbors..." 
in out + assert "Computed conditional probabilities" in out + assert "Mean sigma" in out + assert "early exaggeration" in out + + +def test_chebyshev_metric(): + # t-SNE should allow metrics that cannot be squared (issue #3526). + random_state = check_random_state(0) + tsne = TSNE(metric="chebyshev", perplexity=4) + X = random_state.randn(5, 2) + tsne.fit_transform(X) + + +def test_reduction_to_one_component(): + # t-SNE should allow reduction to one component (issue #4154). + random_state = check_random_state(0) + tsne = TSNE(n_components=1, perplexity=4) + X = random_state.randn(5, 2) + X_embedded = tsne.fit(X).embedding_ + assert np.all(np.isfinite(X_embedded)) + + +@pytest.mark.parametrize("method", ["barnes_hut", "exact"]) +@pytest.mark.parametrize("dt", [np.float32, np.float64]) +def test_64bit(method, dt): + # Ensure 64bit arrays are handled correctly. + random_state = check_random_state(0) + + X = random_state.randn(10, 2).astype(dt, copy=False) + tsne = TSNE( + n_components=2, + perplexity=2, + learning_rate=100.0, + random_state=0, + method=method, + verbose=0, + max_iter=300, + init="random", + ) + X_embedded = tsne.fit_transform(X) + effective_type = X_embedded.dtype + + # tsne cython code is only single precision, so the output will + # always be single precision, irrespectively of the input dtype + assert effective_type == np.float32 + + +@pytest.mark.parametrize("method", ["barnes_hut", "exact"]) +def test_kl_divergence_not_nan(method): + # Ensure kl_divergence_ is computed at last iteration + # even though max_iter % n_iter_check != 0, i.e. 1003 % 50 != 0 + random_state = check_random_state(0) + + X = random_state.randn(50, 2) + tsne = TSNE( + n_components=2, + perplexity=2, + learning_rate=100.0, + random_state=0, + method=method, + verbose=0, + max_iter=503, + init="random", + ) + tsne.fit_transform(X) + + assert not np.isnan(tsne.kl_divergence_) + + +def test_barnes_hut_angle(): + # When Barnes-Hut's angle=0 this corresponds to the exact method. 
+ angle = 0.0 + perplexity = 10 + n_samples = 100 + for n_components in [2, 3]: + n_features = 5 + degrees_of_freedom = float(n_components - 1.0) + + random_state = check_random_state(0) + data = random_state.randn(n_samples, n_features) + distances = pairwise_distances(data) + params = random_state.randn(n_samples, n_components) + P = _joint_probabilities(distances, perplexity, verbose=0) + kl_exact, grad_exact = _kl_divergence( + params, P, degrees_of_freedom, n_samples, n_components + ) + + n_neighbors = n_samples - 1 + distances_csr = ( + NearestNeighbors() + .fit(data) + .kneighbors_graph(n_neighbors=n_neighbors, mode="distance") + ) + P_bh = _joint_probabilities_nn(distances_csr, perplexity, verbose=0) + kl_bh, grad_bh = _kl_divergence_bh( + params, + P_bh, + degrees_of_freedom, + n_samples, + n_components, + angle=angle, + skip_num_points=0, + verbose=0, + ) + + P = squareform(P) + P_bh = P_bh.toarray() + assert_array_almost_equal(P_bh, P, decimal=5) + assert_almost_equal(kl_exact, kl_bh, decimal=3) + + +@skip_if_32bit +def test_n_iter_without_progress(): + # Use a dummy negative n_iter_without_progress and check output on stdout + random_state = check_random_state(0) + X = random_state.randn(100, 10) + for method in ["barnes_hut", "exact"]: + tsne = TSNE( + n_iter_without_progress=-1, + verbose=2, + learning_rate=1e8, + random_state=0, + method=method, + max_iter=351, + init="random", + ) + tsne._N_ITER_CHECK = 1 + tsne._EXPLORATION_MAX_ITER = 0 + + old_stdout = sys.stdout + sys.stdout = StringIO() + try: + tsne.fit_transform(X) + finally: + out = sys.stdout.getvalue() + sys.stdout.close() + sys.stdout = old_stdout + + # The output needs to contain the value of n_iter_without_progress + assert "did not make any progress during the last -1 episodes. Finished." 
in out + + +def test_min_grad_norm(): + # Make sure that the parameter min_grad_norm is used correctly + random_state = check_random_state(0) + X = random_state.randn(100, 2) + min_grad_norm = 0.002 + tsne = TSNE(min_grad_norm=min_grad_norm, verbose=2, random_state=0, method="exact") + + old_stdout = sys.stdout + sys.stdout = StringIO() + try: + tsne.fit_transform(X) + finally: + out = sys.stdout.getvalue() + sys.stdout.close() + sys.stdout = old_stdout + + lines_out = out.split("\n") + + # extract the gradient norm from the verbose output + gradient_norm_values = [] + for line in lines_out: + # When the computation is Finished just an old gradient norm value + # is repeated that we do not need to store + if "Finished" in line: + break + + start_grad_norm = line.find("gradient norm") + if start_grad_norm >= 0: + line = line[start_grad_norm:] + line = line.replace("gradient norm = ", "").split(" ")[0] + gradient_norm_values.append(float(line)) + + # Compute how often the gradient norm is smaller than min_grad_norm + gradient_norm_values = np.array(gradient_norm_values) + n_smaller_gradient_norms = len( + gradient_norm_values[gradient_norm_values <= min_grad_norm] + ) + + # The gradient norm can be smaller than min_grad_norm at most once, + # because in the moment it becomes smaller the optimization stops + assert n_smaller_gradient_norms <= 1 + + +def test_accessible_kl_divergence(): + # Ensures that the accessible kl_divergence matches the computed value + random_state = check_random_state(0) + X = random_state.randn(50, 2) + tsne = TSNE( + n_iter_without_progress=2, + verbose=2, + random_state=0, + method="exact", + max_iter=500, + ) + + old_stdout = sys.stdout + sys.stdout = StringIO() + try: + tsne.fit_transform(X) + finally: + out = sys.stdout.getvalue() + sys.stdout.close() + sys.stdout = old_stdout + + # The output needs to contain the accessible kl_divergence as the error at + # the last iteration + for line in out.split("\n")[::-1]: + if "Iteration" in line: + _, _, error = line.partition("error = ") + if error: + error, _, _ = error.partition(",") + break + assert_almost_equal(tsne.kl_divergence_, float(error), decimal=5) + + +@pytest.mark.parametrize("method", ["barnes_hut", "exact"]) +def test_uniform_grid(method): + """Make sure that TSNE can approximately recover a uniform 2D grid + + Due to ties in distances between point in X_2d_grid, this test is platform + dependent for ``method='barnes_hut'`` due to numerical imprecision. + + Also, t-SNE is not assured to converge to the right solution because bad + initialization can lead to convergence to bad local minimum (the + optimization problem is non-convex). To avoid breaking the test too often, + we re-run t-SNE from the final point when the convergence is not good + enough. + """ + seeds = range(3) + max_iter = 500 + for seed in seeds: + tsne = TSNE( + n_components=2, + init="random", + random_state=seed, + perplexity=50, + max_iter=max_iter, + method=method, + learning_rate="auto", + ) + Y = tsne.fit_transform(X_2d_grid) + + try_name = "{}_{}".format(method, seed) + try: + assert_uniform_grid(Y, try_name) + except AssertionError: + # If the test fails a first time, re-run with init=Y to see if + # this was caused by a bad initialization. Note that this will + # also run an early_exaggeration step. 
+ try_name += ":rerun" + tsne.init = Y + Y = tsne.fit_transform(X_2d_grid) + assert_uniform_grid(Y, try_name) + + +def assert_uniform_grid(Y, try_name=None): + # Ensure that the resulting embedding leads to approximately + # uniformly spaced points: the distance to the closest neighbors + # should be non-zero and approximately constant. + nn = NearestNeighbors(n_neighbors=1).fit(Y) + dist_to_nn = nn.kneighbors(return_distance=True)[0].ravel() + assert dist_to_nn.min() > 0.1 + + smallest_to_mean = dist_to_nn.min() / np.mean(dist_to_nn) + largest_to_mean = dist_to_nn.max() / np.mean(dist_to_nn) + + assert smallest_to_mean > 0.5, try_name + assert largest_to_mean < 2, try_name + + +def test_bh_match_exact(): + # check that the ``barnes_hut`` method match the exact one when + # ``angle = 0`` and ``perplexity > n_samples / 3`` + random_state = check_random_state(0) + n_features = 10 + X = random_state.randn(30, n_features).astype(np.float32) + X_embeddeds = {} + max_iter = {} + for method in ["exact", "barnes_hut"]: + tsne = TSNE( + n_components=2, + method=method, + learning_rate=1.0, + init="random", + random_state=0, + max_iter=251, + perplexity=29.5, + angle=0, + ) + # Kill the early_exaggeration + tsne._EXPLORATION_MAX_ITER = 0 + X_embeddeds[method] = tsne.fit_transform(X) + max_iter[method] = tsne.n_iter_ + + assert max_iter["exact"] == max_iter["barnes_hut"] + assert_allclose(X_embeddeds["exact"], X_embeddeds["barnes_hut"], rtol=1e-4) + + +def test_gradient_bh_multithread_match_sequential(): + # check that the bh gradient with different num_threads gives the same + # results + + n_features = 10 + n_samples = 30 + n_components = 2 + degrees_of_freedom = 1 + + angle = 3 + perplexity = 5 + + random_state = check_random_state(0) + data = random_state.randn(n_samples, n_features).astype(np.float32) + params = random_state.randn(n_samples, n_components) + + n_neighbors = n_samples - 1 + distances_csr = ( + NearestNeighbors() + .fit(data) + .kneighbors_graph(n_neighbors=n_neighbors, mode="distance") + ) + P_bh = _joint_probabilities_nn(distances_csr, perplexity, verbose=0) + kl_sequential, grad_sequential = _kl_divergence_bh( + params, + P_bh, + degrees_of_freedom, + n_samples, + n_components, + angle=angle, + skip_num_points=0, + verbose=0, + num_threads=1, + ) + for num_threads in [2, 4]: + kl_multithread, grad_multithread = _kl_divergence_bh( + params, + P_bh, + degrees_of_freedom, + n_samples, + n_components, + angle=angle, + skip_num_points=0, + verbose=0, + num_threads=num_threads, + ) + + assert_allclose(kl_multithread, kl_sequential, rtol=1e-6) + assert_allclose(grad_multithread, grad_multithread) + + +@pytest.mark.parametrize( + "metric, dist_func", + [("manhattan", manhattan_distances), ("cosine", cosine_distances)], +) +@pytest.mark.parametrize("method", ["barnes_hut", "exact"]) +def test_tsne_with_different_distance_metrics(metric, dist_func, method): + """Make sure that TSNE works for different distance metrics""" + + if method == "barnes_hut" and metric == "manhattan": + # The distances computed by `manhattan_distances` differ slightly from those + # computed internally by NearestNeighbors via the PairwiseDistancesReduction + # Cython code-based. This in turns causes T-SNE to converge to a different + # solution but this should not impact the qualitative results as both + # methods. + # NOTE: it's probably not valid from a mathematical point of view to use the + # Manhattan distance for T-SNE... 
+ # TODO: re-enable this test if/when `manhattan_distances` is refactored to + # reuse the same underlying Cython code NearestNeighbors. + # For reference, see: + # https://github.com/scikit-learn/scikit-learn/pull/23865/files#r925721573 + pytest.xfail( + "Distance computations are different for method == 'barnes_hut' and metric" + " == 'manhattan', but this is expected." + ) + + random_state = check_random_state(0) + n_components_original = 3 + n_components_embedding = 2 + X = random_state.randn(50, n_components_original).astype(np.float32) + X_transformed_tsne = TSNE( + metric=metric, + method=method, + n_components=n_components_embedding, + random_state=0, + max_iter=300, + init="random", + learning_rate="auto", + ).fit_transform(X) + X_transformed_tsne_precomputed = TSNE( + metric="precomputed", + method=method, + n_components=n_components_embedding, + random_state=0, + max_iter=300, + init="random", + learning_rate="auto", + ).fit_transform(dist_func(X)) + assert_array_equal(X_transformed_tsne, X_transformed_tsne_precomputed) + + +@pytest.mark.parametrize("method", ["exact", "barnes_hut"]) +def test_tsne_n_jobs(method): + """Make sure that the n_jobs parameter doesn't impact the output""" + random_state = check_random_state(0) + n_features = 10 + X = random_state.randn(30, n_features) + X_tr_ref = TSNE( + n_components=2, + method=method, + perplexity=25.0, + angle=0, + n_jobs=1, + random_state=0, + init="random", + learning_rate="auto", + ).fit_transform(X) + X_tr = TSNE( + n_components=2, + method=method, + perplexity=25.0, + angle=0, + n_jobs=2, + random_state=0, + init="random", + learning_rate="auto", + ).fit_transform(X) + + assert_allclose(X_tr_ref, X_tr) + + +def test_tsne_with_mahalanobis_distance(): + """Make sure that method_parameters works with mahalanobis distance.""" + random_state = check_random_state(0) + n_samples, n_features = 300, 10 + X = random_state.randn(n_samples, n_features) + default_params = { + "perplexity": 40, + "max_iter": 250, + "learning_rate": "auto", + "init": "random", + "n_components": 3, + "random_state": 0, + } + + tsne = TSNE(metric="mahalanobis", **default_params) + msg = "Must provide either V or VI for Mahalanobis distance" + with pytest.raises(ValueError, match=msg): + tsne.fit_transform(X) + + precomputed_X = squareform(pdist(X, metric="mahalanobis"), checks=True) + X_trans_expected = TSNE(metric="precomputed", **default_params).fit_transform( + precomputed_X + ) + + X_trans = TSNE( + metric="mahalanobis", metric_params={"V": np.cov(X.T)}, **default_params + ).fit_transform(X) + assert_allclose(X_trans, X_trans_expected) + + +@pytest.mark.parametrize("perplexity", (20, 30)) +def test_tsne_perplexity_validation(perplexity): + """Make sure that perplexity > n_samples results in a ValueError""" + + random_state = check_random_state(0) + X = random_state.randn(20, 2) + est = TSNE( + learning_rate="auto", + init="pca", + perplexity=perplexity, + random_state=random_state, + ) + msg = re.escape(f"perplexity ({perplexity}) must be less than n_samples (20)") + with pytest.raises(ValueError, match=msg): + est.fit_transform(X) + + +def test_tsne_works_with_pandas_output(): + """Make sure that TSNE works when the output is set to "pandas". + + Non-regression test for gh-25365. 
+ """ + pytest.importorskip("pandas") + with config_context(transform_output="pandas"): + arr = np.arange(35 * 4).reshape(35, 4) + TSNE(n_components=2).fit_transform(arr) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ce86525acc368f681af3c1fd635fbe37ed2815c3 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/__init__.py @@ -0,0 +1,181 @@ +"""Score functions, performance metrics, pairwise metrics and distance computations.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from . import cluster +from ._classification import ( + accuracy_score, + balanced_accuracy_score, + brier_score_loss, + class_likelihood_ratios, + classification_report, + cohen_kappa_score, + confusion_matrix, + d2_log_loss_score, + f1_score, + fbeta_score, + hamming_loss, + hinge_loss, + jaccard_score, + log_loss, + matthews_corrcoef, + multilabel_confusion_matrix, + precision_recall_fscore_support, + precision_score, + recall_score, + zero_one_loss, +) +from ._dist_metrics import DistanceMetric +from ._plot.confusion_matrix import ConfusionMatrixDisplay +from ._plot.det_curve import DetCurveDisplay +from ._plot.precision_recall_curve import PrecisionRecallDisplay +from ._plot.regression import PredictionErrorDisplay +from ._plot.roc_curve import RocCurveDisplay +from ._ranking import ( + auc, + average_precision_score, + coverage_error, + dcg_score, + det_curve, + label_ranking_average_precision_score, + label_ranking_loss, + ndcg_score, + precision_recall_curve, + roc_auc_score, + roc_curve, + top_k_accuracy_score, +) +from ._regression import ( + d2_absolute_error_score, + d2_pinball_score, + d2_tweedie_score, + explained_variance_score, + max_error, + mean_absolute_error, + mean_absolute_percentage_error, + mean_gamma_deviance, + mean_pinball_loss, + mean_poisson_deviance, + mean_squared_error, + mean_squared_log_error, + mean_tweedie_deviance, + median_absolute_error, + r2_score, + root_mean_squared_error, + root_mean_squared_log_error, +) +from ._scorer import check_scoring, get_scorer, get_scorer_names, make_scorer +from .cluster import ( + adjusted_mutual_info_score, + adjusted_rand_score, + calinski_harabasz_score, + completeness_score, + consensus_score, + davies_bouldin_score, + fowlkes_mallows_score, + homogeneity_completeness_v_measure, + homogeneity_score, + mutual_info_score, + normalized_mutual_info_score, + pair_confusion_matrix, + rand_score, + silhouette_samples, + silhouette_score, + v_measure_score, +) +from .pairwise import ( + euclidean_distances, + nan_euclidean_distances, + pairwise_distances, + pairwise_distances_argmin, + pairwise_distances_argmin_min, + pairwise_distances_chunked, + pairwise_kernels, +) + +__all__ = [ + "ConfusionMatrixDisplay", + "DetCurveDisplay", + "DistanceMetric", + "PrecisionRecallDisplay", + "PredictionErrorDisplay", + "RocCurveDisplay", + "accuracy_score", + "adjusted_mutual_info_score", + "adjusted_rand_score", + "auc", + "average_precision_score", + "balanced_accuracy_score", + "brier_score_loss", + "calinski_harabasz_score", + "check_scoring", + "class_likelihood_ratios", + "classification_report", + "cluster", + "cohen_kappa_score", + "completeness_score", + "confusion_matrix", + "consensus_score", + "coverage_error", + "d2_absolute_error_score", + "d2_log_loss_score", + "d2_pinball_score", + "d2_tweedie_score", + "davies_bouldin_score", + 
"dcg_score", + "det_curve", + "euclidean_distances", + "explained_variance_score", + "f1_score", + "fbeta_score", + "fowlkes_mallows_score", + "get_scorer", + "get_scorer_names", + "hamming_loss", + "hinge_loss", + "homogeneity_completeness_v_measure", + "homogeneity_score", + "jaccard_score", + "label_ranking_average_precision_score", + "label_ranking_loss", + "log_loss", + "make_scorer", + "matthews_corrcoef", + "max_error", + "mean_absolute_error", + "mean_absolute_percentage_error", + "mean_gamma_deviance", + "mean_pinball_loss", + "mean_poisson_deviance", + "mean_squared_error", + "mean_squared_log_error", + "mean_tweedie_deviance", + "median_absolute_error", + "multilabel_confusion_matrix", + "mutual_info_score", + "nan_euclidean_distances", + "ndcg_score", + "normalized_mutual_info_score", + "pair_confusion_matrix", + "pairwise_distances", + "pairwise_distances_argmin", + "pairwise_distances_argmin_min", + "pairwise_distances_chunked", + "pairwise_kernels", + "precision_recall_curve", + "precision_recall_fscore_support", + "precision_score", + "r2_score", + "rand_score", + "recall_score", + "roc_auc_score", + "roc_curve", + "root_mean_squared_error", + "root_mean_squared_log_error", + "silhouette_samples", + "silhouette_score", + "top_k_accuracy_score", + "v_measure_score", + "zero_one_loss", +] diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/__pycache__/__init__.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/metrics/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..43850ec0986c9616c11dbb02410d667fe3f2d05e Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/metrics/__pycache__/__init__.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/__pycache__/_base.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/metrics/__pycache__/_base.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5b4dd7f01788628a18b39b4c23ab23221d3d26ef Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/metrics/__pycache__/_base.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/__pycache__/_ranking.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/metrics/__pycache__/_ranking.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..84384539144e1b2578f3a8d38139a2f6b4a0e267 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/metrics/__pycache__/_ranking.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/__pycache__/_regression.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/metrics/__pycache__/_regression.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d34d10666b22a07a2a5652052bea31d6445baf54 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/metrics/__pycache__/_regression.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/__pycache__/_scorer.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/metrics/__pycache__/_scorer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..99425fc0fc069e3c3b3897999463aaade7759cb6 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/metrics/__pycache__/_scorer.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/__pycache__/pairwise.cpython-312.pyc 
b/.venv/lib/python3.12/site-packages/sklearn/metrics/__pycache__/pairwise.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2e9c141c115ab136474b17ff00f9b29e2e8db9da Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/metrics/__pycache__/pairwise.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_base.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/_base.py new file mode 100644 index 0000000000000000000000000000000000000000..aa4150c88a9783aee51d6bf9e89172806728c97f --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_base.py @@ -0,0 +1,193 @@ +""" +Common code for all metrics. + +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from itertools import combinations + +import numpy as np + +from ..utils import check_array, check_consistent_length +from ..utils.multiclass import type_of_target + + +def _average_binary_score(binary_metric, y_true, y_score, average, sample_weight=None): + """Average a binary metric for multilabel classification. + + Parameters + ---------- + y_true : array, shape = [n_samples] or [n_samples, n_classes] + True binary labels in binary label indicators. + + y_score : array, shape = [n_samples] or [n_samples, n_classes] + Target scores, can either be probability estimates of the positive + class, confidence values, or binary decisions. + + average : {None, 'micro', 'macro', 'samples', 'weighted'}, default='macro' + If ``None``, the scores for each class are returned. Otherwise, + this determines the type of averaging performed on the data: + + ``'micro'``: + Calculate metrics globally by considering each element of the label + indicator matrix as a label. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average, weighted + by support (the number of true instances for each label). + ``'samples'``: + Calculate metrics for each instance, and find their average. + + Will be ignored when ``y_true`` is binary. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + binary_metric : callable, returns shape [n_classes] + The binary metric function to use. + + Returns + ------- + score : float or array of shape [n_classes] + If ``average`` is not ``None``, return the averaged score; otherwise + return the score for each class.
+ + """ + average_options = (None, "micro", "macro", "weighted", "samples") + if average not in average_options: + raise ValueError("average has to be one of {0}".format(average_options)) + + y_type = type_of_target(y_true) + if y_type not in ("binary", "multilabel-indicator"): + raise ValueError("{0} format is not supported".format(y_type)) + + if y_type == "binary": + return binary_metric(y_true, y_score, sample_weight=sample_weight) + + check_consistent_length(y_true, y_score, sample_weight) + y_true = check_array(y_true) + y_score = check_array(y_score) + + not_average_axis = 1 + score_weight = sample_weight + average_weight = None + + if average == "micro": + if score_weight is not None: + score_weight = np.repeat(score_weight, y_true.shape[1]) + y_true = y_true.ravel() + y_score = y_score.ravel() + + elif average == "weighted": + if score_weight is not None: + average_weight = np.sum( + np.multiply(y_true, np.reshape(score_weight, (-1, 1))), axis=0 + ) + else: + average_weight = np.sum(y_true, axis=0) + if np.isclose(average_weight.sum(), 0.0): + return 0 + + elif average == "samples": + # swap average_weight <-> score_weight + average_weight = score_weight + score_weight = None + not_average_axis = 0 + + if y_true.ndim == 1: + y_true = y_true.reshape((-1, 1)) + + if y_score.ndim == 1: + y_score = y_score.reshape((-1, 1)) + + n_classes = y_score.shape[not_average_axis] + score = np.zeros((n_classes,)) + for c in range(n_classes): + y_true_c = y_true.take([c], axis=not_average_axis).ravel() + y_score_c = y_score.take([c], axis=not_average_axis).ravel() + score[c] = binary_metric(y_true_c, y_score_c, sample_weight=score_weight) + + # Average the results + if average is not None: + if average_weight is not None: + # Scores with 0 weights are forced to be 0, preventing the average + # score from being affected by 0-weighted NaN elements. + average_weight = np.asarray(average_weight) + score[average_weight == 0] = 0 + return float(np.average(score, weights=average_weight)) + else: + return score + + +def _average_multiclass_ovo_score(binary_metric, y_true, y_score, average="macro"): + """Average one-versus-one scores for multiclass classification. + + Uses the binary metric for one-vs-one multiclass classification, + where the score is computed according to the Hand & Till (2001) algorithm. + + Parameters + ---------- + binary_metric : callable + The binary metric function to use that accepts the following as input: + y_true_target : array, shape = [n_samples_target] + Some sub-array of y_true for a pair of classes designated + positive and negative in the one-vs-one scheme. + y_score_target : array, shape = [n_samples_target] + Scores corresponding to the probability estimates + of a sample belonging to the designated positive class label + + y_true : array-like of shape (n_samples,) + True multiclass labels. + + y_score : array-like of shape (n_samples, n_classes) + Target scores corresponding to probability estimates of a sample + belonging to a particular class. + + average : {'macro', 'weighted'}, default='macro' + Determines the type of averaging performed on the pairwise binary + metric scores: + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. Classes + are assumed to be uniformly distributed. + ``'weighted'``: + Calculate metrics for each label, taking into account the + prevalence of the classes. + + Returns + ------- + score : float + Average of the pairwise binary metric scores. 
+ """ + check_consistent_length(y_true, y_score) + + y_true_unique = np.unique(y_true) + n_classes = y_true_unique.shape[0] + n_pairs = n_classes * (n_classes - 1) // 2 + pair_scores = np.empty(n_pairs) + + is_weighted = average == "weighted" + prevalence = np.empty(n_pairs) if is_weighted else None + + # Compute scores treating a as positive class and b as negative class, + # then b as positive class and a as negative class + for ix, (a, b) in enumerate(combinations(y_true_unique, 2)): + a_mask = y_true == a + b_mask = y_true == b + ab_mask = np.logical_or(a_mask, b_mask) + + if is_weighted: + prevalence[ix] = np.average(ab_mask) + + a_true = a_mask[ab_mask] + b_true = b_mask[ab_mask] + + a_true_score = binary_metric(a_true, y_score[ab_mask, a]) + b_true_score = binary_metric(b_true, y_score[ab_mask, b]) + pair_scores[ix] = (a_true_score + b_true_score) / 2 + + return np.average(pair_scores, weights=prevalence) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_classification.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..06503046790beacc11e0a40df39ec9aeb89d0cac --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_classification.py @@ -0,0 +1,3730 @@ +"""Metrics to assess performance on classification task given class prediction. + +Functions named as ``*_score`` return a scalar value to maximize: the higher +the better. + +Function named as ``*_error`` or ``*_loss`` return a scalar value to minimize: +the lower the better. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from numbers import Integral, Real + +import numpy as np +from scipy.sparse import coo_matrix, csr_matrix, issparse +from scipy.special import xlogy + +from ..exceptions import UndefinedMetricWarning +from ..preprocessing import LabelBinarizer, LabelEncoder +from ..utils import ( + assert_all_finite, + check_array, + check_consistent_length, + check_scalar, + column_or_1d, +) +from ..utils._array_api import ( + _average, + _bincount, + _count_nonzero, + _find_matching_floating_dtype, + _is_numpy_namespace, + _max_precision_float_dtype, + _searchsorted, + _tolist, + _union1d, + get_namespace, + get_namespace_and_device, + xpx, +) +from ..utils._param_validation import ( + Hidden, + Interval, + Options, + StrOptions, + validate_params, +) +from ..utils._unique import attach_unique +from ..utils.extmath import _nanaverage +from ..utils.multiclass import type_of_target, unique_labels +from ..utils.validation import ( + _check_pos_label_consistency, + _check_sample_weight, + _num_samples, +) + + +def _check_zero_division(zero_division): + if isinstance(zero_division, str) and zero_division == "warn": + return np.float64(0.0) + elif isinstance(zero_division, (int, float)) and zero_division in [0, 1]: + return np.float64(zero_division) + else: # np.isnan(zero_division) + return np.nan + + +def _check_targets(y_true, y_pred): + """Check that y_true and y_pred belong to the same classification task. + + This converts multiclass or binary types to a common shape, and raises a + ValueError for a mix of multilabel and multiclass targets, a mix of + multilabel formats, for the presence of continuous-valued or multioutput + targets, or for targets of different lengths. + + Column vectors are squeezed to 1d, while multilabel formats are returned + as CSR sparse label indicators. 
+ + Parameters + ---------- + y_true : array-like + + y_pred : array-like + + Returns + ------- + type_true : one of {'multilabel-indicator', 'multiclass', 'binary'} + The type of the true target data, as output by + ``utils.multiclass.type_of_target``. + + y_true : array or indicator matrix + + y_pred : array or indicator matrix + """ + xp, _ = get_namespace(y_true, y_pred) + check_consistent_length(y_true, y_pred) + type_true = type_of_target(y_true, input_name="y_true") + type_pred = type_of_target(y_pred, input_name="y_pred") + + y_type = {type_true, type_pred} + if y_type == {"binary", "multiclass"}: + y_type = {"multiclass"} + + if len(y_type) > 1: + raise ValueError( + "Classification metrics can't handle a mix of {0} and {1} targets".format( + type_true, type_pred + ) + ) + + # We can't have more than one value on y_type => The set is no more needed + y_type = y_type.pop() + + # No metrics support "multiclass-multioutput" format + if y_type not in ["binary", "multiclass", "multilabel-indicator"]: + raise ValueError("{0} is not supported".format(y_type)) + + if y_type in ["binary", "multiclass"]: + xp, _ = get_namespace(y_true, y_pred) + y_true = column_or_1d(y_true) + y_pred = column_or_1d(y_pred) + if y_type == "binary": + try: + unique_values = _union1d(y_true, y_pred, xp) + except TypeError as e: + # We expect y_true and y_pred to be of the same data type. + # If `y_true` was provided to the classifier as strings, + # `y_pred` given by the classifier will also be encoded with + # strings. So we raise a meaningful error + raise TypeError( + "Labels in y_true and y_pred should be of the same type. " + f"Got y_true={xp.unique(y_true)} and " + f"y_pred={xp.unique(y_pred)}. Make sure that the " + "predictions provided by the classifier coincides with " + "the true labels." + ) from e + if unique_values.shape[0] > 2: + y_type = "multiclass" + + if y_type.startswith("multilabel"): + if _is_numpy_namespace(xp): + # XXX: do we really want to sparse-encode multilabel indicators when + # they are passed as a dense arrays? This is not possible for array + # API inputs in general hence we only do it for NumPy inputs. But even + # for NumPy the usefulness is questionable. + y_true = csr_matrix(y_true) + y_pred = csr_matrix(y_pred) + y_type = "multilabel-indicator" + + return y_type, y_true, y_pred + + +def _validate_multiclass_probabilistic_prediction( + y_true, y_prob, sample_weight, labels +): + r"""Convert y_true and y_prob to shape (n_samples, n_classes) + + 1. Verify that y_true, y_prob, and sample_weights have the same first dim + 2. Ensure 2 or more classes in y_true i.e. valid classification task. The + classes are provided by the labels argument, or inferred using y_true. + When inferring y_true is assumed binary if it has shape (n_samples, ). + 3. Validate y_true, and y_prob have the same number of classes. Convert to + shape (n_samples, n_classes) + + Parameters + ---------- + y_true : array-like or label indicator matrix + Ground truth (correct) labels for n_samples samples. + + y_prob : array-like of float, shape=(n_samples, n_classes) or (n_samples,) + Predicted probabilities, as returned by a classifier's + predict_proba method. If `y_prob.shape = (n_samples,)` + the probabilities provided are assumed to be that of the + positive class. The labels in `y_prob` are assumed to be + ordered lexicographically, as done by + :class:`preprocessing.LabelBinarizer`. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. 
+ + labels : array-like, default=None + If not provided, labels will be inferred from y_true. If `labels` + is `None` and `y_prob` has shape `(n_samples,)` the labels are + assumed to be binary and are inferred from `y_true`. + + Returns + ------- + transformed_labels : array of shape (n_samples, n_classes) + + y_prob : array of shape (n_samples, n_classes) + """ + y_prob = check_array( + y_prob, ensure_2d=False, dtype=[np.float64, np.float32, np.float16] + ) + + if y_prob.max() > 1: + raise ValueError(f"y_prob contains values greater than 1: {y_prob.max()}") + if y_prob.min() < 0: + raise ValueError(f"y_prob contains values lower than 0: {y_prob.min()}") + + check_consistent_length(y_prob, y_true, sample_weight) + lb = LabelBinarizer() + + if labels is not None: + lb = lb.fit(labels) + # LabelBinarizer does not respect the order implied by labels, which + # can be misleading. + if not np.all(lb.classes_ == labels): + warnings.warn( + f"Labels passed were {labels}. But this function " + "assumes labels are ordered lexicographically. " + f"Pass the ordered labels={lb.classes_.tolist()} and ensure that " + "the columns of y_prob correspond to this ordering.", + UserWarning, + ) + if not np.isin(y_true, labels).all(): + undeclared_labels = set(y_true) - set(labels) + raise ValueError( + f"y_true contains values {undeclared_labels} not belonging " + f"to the passed labels {labels}." + ) + + else: + lb = lb.fit(y_true) + + if len(lb.classes_) == 1: + if labels is None: + raise ValueError( + "y_true contains only one label ({0}). Please " + "provide the list of all expected class labels explicitly through the " + "labels argument.".format(lb.classes_[0]) + ) + else: + raise ValueError( + "The labels array needs to contain at least two " + "labels, got {0}.".format(lb.classes_) + ) + + transformed_labels = lb.transform(y_true) + + if transformed_labels.shape[1] == 1: + transformed_labels = np.append( + 1 - transformed_labels, transformed_labels, axis=1 + ) + + # If y_prob is of single dimension, assume y_true to be binary + # and then check. + if y_prob.ndim == 1: + y_prob = y_prob[:, np.newaxis] + if y_prob.shape[1] == 1: + y_prob = np.append(1 - y_prob, y_prob, axis=1) + + eps = np.finfo(y_prob.dtype).eps + + # Make sure y_prob is normalized + y_prob_sum = y_prob.sum(axis=1) + if not np.allclose(y_prob_sum, 1, rtol=np.sqrt(eps)): + warnings.warn( + "The y_prob values do not sum to one. Make sure to pass probabilities.", + UserWarning, + ) + + # Check if dimensions are consistent. + transformed_labels = check_array(transformed_labels) + if len(lb.classes_) != y_prob.shape[1]: + if labels is None: + raise ValueError( + "y_true and y_prob contain different number of " + "classes: {0} vs {1}. Please provide the true " + "labels explicitly through the labels argument. " + "Classes found in " + "y_true: {2}".format( + transformed_labels.shape[1], y_prob.shape[1], lb.classes_ + ) + ) + else: + raise ValueError( + "The number of classes in labels is different " + "from that in y_prob. Classes found in " + "labels: {0}".format(lb.classes_) + ) + + return transformed_labels, y_prob + + +@validate_params( + { + "y_true": ["array-like", "sparse matrix"], + "y_pred": ["array-like", "sparse matrix"], + "normalize": ["boolean"], + "sample_weight": ["array-like", None], + }, + prefer_skip_nested_validation=True, +) +def accuracy_score(y_true, y_pred, *, normalize=True, sample_weight=None): + """Accuracy classification score. 
+ + In multilabel classification, this function computes subset accuracy: + the set of labels predicted for a sample must *exactly* match the + corresponding set of labels in y_true. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : 1d array-like, or label indicator array / sparse matrix + Ground truth (correct) labels. + + y_pred : 1d array-like, or label indicator array / sparse matrix + Predicted labels, as returned by a classifier. + + normalize : bool, default=True + If ``False``, return the number of correctly classified samples. + Otherwise, return the fraction of correctly classified samples. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + score : float or int + If ``normalize == True``, return the fraction of correctly + classified samples (float), else returns the number of correctly + classified samples (int). + + The best performance is 1 with ``normalize == True`` and the number + of samples with ``normalize == False``. + + See Also + -------- + balanced_accuracy_score : Compute the balanced accuracy to deal with + imbalanced datasets. + jaccard_score : Compute the Jaccard similarity coefficient score. + hamming_loss : Compute the average Hamming loss or Hamming distance between + two sets of samples. + zero_one_loss : Compute the Zero-one classification loss. By default, the + function will return the percentage of imperfectly predicted subsets. + + Examples + -------- + >>> from sklearn.metrics import accuracy_score + >>> y_pred = [0, 2, 1, 3] + >>> y_true = [0, 1, 2, 3] + >>> accuracy_score(y_true, y_pred) + 0.5 + >>> accuracy_score(y_true, y_pred, normalize=False) + 2.0 + + In the multilabel case with binary label indicators: + + >>> import numpy as np + >>> accuracy_score(np.array([[0, 1], [1, 1]]), np.ones((2, 2))) + 0.5 + """ + xp, _, device = get_namespace_and_device(y_true, y_pred, sample_weight) + # Compute accuracy for each possible representation + y_true, y_pred = attach_unique(y_true, y_pred) + y_type, y_true, y_pred = _check_targets(y_true, y_pred) + check_consistent_length(y_true, y_pred, sample_weight) + + if y_type.startswith("multilabel"): + differing_labels = _count_nonzero(y_true - y_pred, xp=xp, device=device, axis=1) + score = xp.asarray(differing_labels == 0, device=device) + else: + score = y_true == y_pred + + return float(_average(score, weights=sample_weight, normalize=normalize)) + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "labels": ["array-like", None], + "sample_weight": ["array-like", None], + "normalize": [StrOptions({"true", "pred", "all"}), None], + }, + prefer_skip_nested_validation=True, +) +def confusion_matrix( + y_true, y_pred, *, labels=None, sample_weight=None, normalize=None +): + """Compute confusion matrix to evaluate the accuracy of a classification. + + By definition a confusion matrix :math:`C` is such that :math:`C_{i, j}` + is equal to the number of observations known to be in group :math:`i` and + predicted to be in group :math:`j`. + + Thus in binary classification, the count of true negatives is + :math:`C_{0,0}`, false negatives is :math:`C_{1,0}`, true positives is + :math:`C_{1,1}` and false positives is :math:`C_{0,1}`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples,) + Estimated targets as returned by a classifier. 
+ + labels : array-like of shape (n_classes), default=None + List of labels to index the matrix. This may be used to reorder + or select a subset of labels. + If ``None`` is given, those that appear at least once + in ``y_true`` or ``y_pred`` are used in sorted order. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + .. versionadded:: 0.18 + + normalize : {'true', 'pred', 'all'}, default=None + Normalizes confusion matrix over the true (rows), predicted (columns) + conditions or all the population. If None, confusion matrix will not be + normalized. + + Returns + ------- + C : ndarray of shape (n_classes, n_classes) + Confusion matrix whose i-th row and j-th + column entry indicates the number of + samples with true label being i-th class + and predicted label being j-th class. + + See Also + -------- + ConfusionMatrixDisplay.from_estimator : Plot the confusion matrix + given an estimator, the data, and the label. + ConfusionMatrixDisplay.from_predictions : Plot the confusion matrix + given the true and predicted labels. + ConfusionMatrixDisplay : Confusion Matrix visualization. + + References + ---------- + .. [1] `Wikipedia entry for the Confusion matrix + `_ + (Wikipedia and other references may use a different + convention for axes). + + Examples + -------- + >>> from sklearn.metrics import confusion_matrix + >>> y_true = [2, 0, 2, 2, 0, 1] + >>> y_pred = [0, 0, 2, 2, 0, 2] + >>> confusion_matrix(y_true, y_pred) + array([[2, 0, 0], + [0, 0, 1], + [1, 0, 2]]) + + >>> y_true = ["cat", "ant", "cat", "cat", "ant", "bird"] + >>> y_pred = ["ant", "ant", "cat", "cat", "ant", "cat"] + >>> confusion_matrix(y_true, y_pred, labels=["ant", "bird", "cat"]) + array([[2, 0, 0], + [0, 0, 1], + [1, 0, 2]]) + + In the binary case, we can extract true positives, etc. 
as follows: + + >>> tn, fp, fn, tp = confusion_matrix([0, 1, 0, 1], [1, 1, 1, 0]).ravel().tolist() + >>> (tn, fp, fn, tp) + (0, 2, 1, 1) + """ + y_true, y_pred = attach_unique(y_true, y_pred) + y_type, y_true, y_pred = _check_targets(y_true, y_pred) + if y_type not in ("binary", "multiclass"): + raise ValueError("%s is not supported" % y_type) + + if labels is None: + labels = unique_labels(y_true, y_pred) + else: + labels = np.asarray(labels) + n_labels = labels.size + if n_labels == 0: + raise ValueError("'labels' should contains at least one label.") + elif y_true.size == 0: + return np.zeros((n_labels, n_labels), dtype=int) + elif len(np.intersect1d(y_true, labels)) == 0: + raise ValueError("At least one label specified must be in y_true") + + if sample_weight is None: + sample_weight = np.ones(y_true.shape[0], dtype=np.int64) + else: + sample_weight = np.asarray(sample_weight) + + check_consistent_length(y_true, y_pred, sample_weight) + + n_labels = labels.size + # If labels are not consecutive integers starting from zero, then + # y_true and y_pred must be converted into index form + need_index_conversion = not ( + labels.dtype.kind in {"i", "u", "b"} + and np.all(labels == np.arange(n_labels)) + and y_true.min() >= 0 + and y_pred.min() >= 0 + ) + if need_index_conversion: + label_to_ind = {y: x for x, y in enumerate(labels)} + y_pred = np.array([label_to_ind.get(x, n_labels + 1) for x in y_pred]) + y_true = np.array([label_to_ind.get(x, n_labels + 1) for x in y_true]) + + # intersect y_pred, y_true with labels, eliminate items not in labels + ind = np.logical_and(y_pred < n_labels, y_true < n_labels) + if not np.all(ind): + y_pred = y_pred[ind] + y_true = y_true[ind] + # also eliminate weights of eliminated items + sample_weight = sample_weight[ind] + + # Choose the accumulator dtype to always have high precision + if sample_weight.dtype.kind in {"i", "u", "b"}: + dtype = np.int64 + else: + dtype = np.float64 + + cm = coo_matrix( + (sample_weight, (y_true, y_pred)), + shape=(n_labels, n_labels), + dtype=dtype, + ).toarray() + + with np.errstate(all="ignore"): + if normalize == "true": + cm = cm / cm.sum(axis=1, keepdims=True) + elif normalize == "pred": + cm = cm / cm.sum(axis=0, keepdims=True) + elif normalize == "all": + cm = cm / cm.sum() + cm = np.nan_to_num(cm) + + if cm.shape == (1, 1): + warnings.warn( + ( + "A single label was found in 'y_true' and 'y_pred'. For the confusion " + "matrix to have the correct shape, use the 'labels' parameter to pass " + "all known labels." + ), + UserWarning, + ) + + return cm + + +@validate_params( + { + "y_true": ["array-like", "sparse matrix"], + "y_pred": ["array-like", "sparse matrix"], + "sample_weight": ["array-like", None], + "labels": ["array-like", None], + "samplewise": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def multilabel_confusion_matrix( + y_true, y_pred, *, sample_weight=None, labels=None, samplewise=False +): + """Compute a confusion matrix for each class or sample. + + .. versionadded:: 0.21 + + Compute class-wise (default) or sample-wise (samplewise=True) multilabel + confusion matrix to evaluate the accuracy of a classification, and output + confusion matrices for each class or sample. + + In multilabel confusion matrix :math:`MCM`, the count of true negatives + is :math:`MCM_{:,0,0}`, false negatives is :math:`MCM_{:,1,0}`, + true positives is :math:`MCM_{:,1,1}` and false positives is + :math:`MCM_{:,0,1}`. + + Multiclass data will be treated as if binarized under a one-vs-rest + transformation. 
Returned confusion matrices will be in the order of + sorted unique labels in the union of (y_true, y_pred). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : {array-like, sparse matrix} of shape (n_samples, n_outputs) or \ + (n_samples,) + Ground truth (correct) target values. + + y_pred : {array-like, sparse matrix} of shape (n_samples, n_outputs) or \ + (n_samples,) + Estimated targets as returned by a classifier. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + labels : array-like of shape (n_classes,), default=None + A list of classes or column indices to select some (or to force + inclusion of classes absent from the data). + + samplewise : bool, default=False + In the multilabel case, this calculates a confusion matrix per sample. + + Returns + ------- + multi_confusion : ndarray of shape (n_outputs, 2, 2) + A 2x2 confusion matrix corresponding to each output in the input. + When calculating class-wise multi_confusion (default), then + n_outputs = n_labels; when calculating sample-wise multi_confusion + (samplewise=True), n_outputs = n_samples. If ``labels`` is defined, + the results will be returned in the order specified in ``labels``, + otherwise the results will be returned in sorted order by default. + + See Also + -------- + confusion_matrix : Compute confusion matrix to evaluate the accuracy of a + classifier. + + Notes + ----- + The `multilabel_confusion_matrix` calculates class-wise or sample-wise + multilabel confusion matrices, and in multiclass tasks, labels are + binarized under a one-vs-rest way; while + :func:`~sklearn.metrics.confusion_matrix` calculates one confusion matrix + for confusion between every two classes. + + Examples + -------- + Multilabel-indicator case: + + >>> import numpy as np + >>> from sklearn.metrics import multilabel_confusion_matrix + >>> y_true = np.array([[1, 0, 1], + ... [0, 1, 0]]) + >>> y_pred = np.array([[1, 0, 0], + ... [0, 1, 1]]) + >>> multilabel_confusion_matrix(y_true, y_pred) + array([[[1, 0], + [0, 1]], + + [[1, 0], + [0, 1]], + + [[0, 1], + [1, 0]]]) + + Multiclass case: + + >>> y_true = ["cat", "ant", "cat", "cat", "ant", "bird"] + >>> y_pred = ["ant", "ant", "cat", "cat", "ant", "cat"] + >>> multilabel_confusion_matrix(y_true, y_pred, + ... labels=["ant", "bird", "cat"]) + array([[[3, 1], + [0, 2]], + + [[5, 0], + [1, 0]], + + [[2, 1], + [1, 2]]]) + """ + y_true, y_pred = attach_unique(y_true, y_pred) + xp, _, device_ = get_namespace_and_device(y_true, y_pred) + y_type, y_true, y_pred = _check_targets(y_true, y_pred) + if sample_weight is not None: + sample_weight = column_or_1d(sample_weight, device=device_) + check_consistent_length(y_true, y_pred, sample_weight) + + if y_type not in ("binary", "multiclass", "multilabel-indicator"): + raise ValueError("%s is not supported" % y_type) + + present_labels = unique_labels(y_true, y_pred) + if labels is None: + labels = present_labels + n_labels = None + else: + labels = xp.asarray(labels, device=device_) + n_labels = labels.shape[0] + labels = xp.concat( + [labels, xpx.setdiff1d(present_labels, labels, assume_unique=True, xp=xp)], + axis=-1, + ) + + if y_true.ndim == 1: + if samplewise: + raise ValueError( + "Samplewise metrics are not available outside of " + "multilabel classification." 
+ ) + + le = LabelEncoder() + le.fit(labels) + y_true = le.transform(y_true) + y_pred = le.transform(y_pred) + sorted_labels = le.classes_ + + # labels are now from 0 to len(labels) - 1 -> use bincount + tp = y_true == y_pred + tp_bins = y_true[tp] + if sample_weight is not None: + tp_bins_weights = sample_weight[tp] + else: + tp_bins_weights = None + + if tp_bins.shape[0]: + tp_sum = _bincount( + tp_bins, weights=tp_bins_weights, minlength=labels.shape[0], xp=xp + ) + else: + # Pathological case + true_sum = pred_sum = tp_sum = xp.zeros(labels.shape[0]) + if y_pred.shape[0]: + pred_sum = _bincount( + y_pred, weights=sample_weight, minlength=labels.shape[0], xp=xp + ) + if y_true.shape[0]: + true_sum = _bincount( + y_true, weights=sample_weight, minlength=labels.shape[0], xp=xp + ) + + # Retain only selected labels + indices = _searchsorted(sorted_labels, labels[:n_labels], xp=xp) + tp_sum = xp.take(tp_sum, indices, axis=0) + true_sum = xp.take(true_sum, indices, axis=0) + pred_sum = xp.take(pred_sum, indices, axis=0) + + else: + sum_axis = 1 if samplewise else 0 + + # All labels are index integers for multilabel. + # Select labels: + if labels.shape != present_labels.shape or xp.any( + xp.not_equal(labels, present_labels) + ): + if xp.max(labels) > xp.max(present_labels): + raise ValueError( + "All labels must be in [0, n labels) for " + "multilabel targets. " + "Got %d > %d" % (xp.max(labels), xp.max(present_labels)) + ) + if xp.min(labels) < 0: + raise ValueError( + "All labels must be in [0, n labels) for " + "multilabel targets. " + "Got %d < 0" % xp.min(labels) + ) + + if n_labels is not None: + y_true = y_true[:, labels[:n_labels]] + y_pred = y_pred[:, labels[:n_labels]] + + if issparse(y_true) or issparse(y_pred): + true_and_pred = y_true.multiply(y_pred) + else: + true_and_pred = xp.multiply(y_true, y_pred) + + # calculate weighted counts + tp_sum = _count_nonzero( + true_and_pred, + axis=sum_axis, + sample_weight=sample_weight, + xp=xp, + device=device_, + ) + pred_sum = _count_nonzero( + y_pred, + axis=sum_axis, + sample_weight=sample_weight, + xp=xp, + device=device_, + ) + true_sum = _count_nonzero( + y_true, + axis=sum_axis, + sample_weight=sample_weight, + xp=xp, + device=device_, + ) + + fp = pred_sum - tp_sum + fn = true_sum - tp_sum + tp = tp_sum + + if sample_weight is not None and samplewise: + tp = xp.asarray(tp) + fp = xp.asarray(fp) + fn = xp.asarray(fn) + tn = sample_weight * y_true.shape[1] - tp - fp - fn + elif sample_weight is not None: + tn = xp.sum(sample_weight) - tp - fp - fn + elif samplewise: + tn = y_true.shape[1] - tp - fp - fn + else: + tn = y_true.shape[0] - tp - fp - fn + + return xp.reshape(xp.stack([tn, fp, fn, tp]).T, (-1, 2, 2)) + + +@validate_params( + { + "y1": ["array-like"], + "y2": ["array-like"], + "labels": ["array-like", None], + "weights": [StrOptions({"linear", "quadratic"}), None], + "sample_weight": ["array-like", None], + }, + prefer_skip_nested_validation=True, +) +def cohen_kappa_score(y1, y2, *, labels=None, weights=None, sample_weight=None): + r"""Compute Cohen's kappa: a statistic that measures inter-annotator agreement. + + This function computes Cohen's kappa [1]_, a score that expresses the level + of agreement between two annotators on a classification problem. It is + defined as + + .. 
math:: + \kappa = (p_o - p_e) / (1 - p_e) + + where :math:`p_o` is the empirical probability of agreement on the label + assigned to any sample (the observed agreement ratio), and :math:`p_e` is + the expected agreement when both annotators assign labels randomly. + :math:`p_e` is estimated using a per-annotator empirical prior over the + class labels [2]_. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y1 : array-like of shape (n_samples,) + Labels assigned by the first annotator. + + y2 : array-like of shape (n_samples,) + Labels assigned by the second annotator. The kappa statistic is + symmetric, so swapping ``y1`` and ``y2`` doesn't change the value. + + labels : array-like of shape (n_classes,), default=None + List of labels to index the matrix. This may be used to select a + subset of labels. If `None`, all labels that appear at least once in + ``y1`` or ``y2`` are used. Note that at least one label in `labels` must be + present in `y1`, even though this function is otherwise agnostic to the order + of `y1` and `y2`. + + weights : {'linear', 'quadratic'}, default=None + Weighting type to calculate the score. `None` means not weighted; + "linear" means linear weighting; "quadratic" means quadratic weighting. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + kappa : float + The kappa statistic, which is a number between -1 and 1. The maximum + value means complete agreement; zero or lower means chance agreement. + + References + ---------- + .. [1] :doi:`J. Cohen (1960). "A coefficient of agreement for nominal scales". + Educational and Psychological Measurement 20(1):37-46. + <10.1177/001316446002000104>` + .. [2] `R. Artstein and M. Poesio (2008). "Inter-coder agreement for + computational linguistics". Computational Linguistics 34(4):555-596 + `_. + .. [3] `Wikipedia entry for the Cohen's kappa + `_. + + Examples + -------- + >>> from sklearn.metrics import cohen_kappa_score + >>> y1 = ["negative", "positive", "negative", "neutral", "positive"] + >>> y2 = ["negative", "positive", "negative", "neutral", "negative"] + >>> cohen_kappa_score(y1, y2) + 0.6875 + """ + try: + confusion = confusion_matrix(y1, y2, labels=labels, sample_weight=sample_weight) + except ValueError as e: + if "At least one label specified must be in y_true" in str(e): + msg = ( + "At least one label in `labels` must be present in `y1` (even though " + "`cohen_kappa_score` is otherwise agnostic to the order of `y1` and " + "`y2`)." 
+ ) + raise ValueError(msg) from e + raise + + n_classes = confusion.shape[0] + sum0 = np.sum(confusion, axis=0) + sum1 = np.sum(confusion, axis=1) + expected = np.outer(sum0, sum1) / np.sum(sum0) + + if weights is None: + w_mat = np.ones([n_classes, n_classes], dtype=int) + w_mat.flat[:: n_classes + 1] = 0 + else: # "linear" or "quadratic" + w_mat = np.zeros([n_classes, n_classes], dtype=int) + w_mat += np.arange(n_classes) + if weights == "linear": + w_mat = np.abs(w_mat - w_mat.T) + else: + w_mat = (w_mat - w_mat.T) ** 2 + + k = np.sum(w_mat * confusion) / np.sum(w_mat * expected) + return float(1 - k) + + +@validate_params( + { + "y_true": ["array-like", "sparse matrix"], + "y_pred": ["array-like", "sparse matrix"], + "labels": ["array-like", None], + "pos_label": [Real, str, "boolean", None], + "average": [ + StrOptions({"micro", "macro", "samples", "weighted", "binary"}), + None, + ], + "sample_weight": ["array-like", None], + "zero_division": [ + Options(Real, {0, 1}), + StrOptions({"warn"}), + ], + }, + prefer_skip_nested_validation=True, +) +def jaccard_score( + y_true, + y_pred, + *, + labels=None, + pos_label=1, + average="binary", + sample_weight=None, + zero_division="warn", +): + """Jaccard similarity coefficient score. + + The Jaccard index [1], or Jaccard similarity coefficient, defined as + the size of the intersection divided by the size of the union of two label + sets, is used to compare set of predicted labels for a sample to the + corresponding set of labels in ``y_true``. + + Support beyond term:`binary` targets is achieved by treating :term:`multiclass` + and :term:`multilabel` data as a collection of binary problems, one for each + label. For the :term:`binary` case, setting `average='binary'` will return the + Jaccard similarity coefficient for `pos_label`. If `average` is not `'binary'`, + `pos_label` is ignored and scores for both classes are computed, then averaged or + both returned (when `average=None`). Similarly, for :term:`multiclass` and + :term:`multilabel` targets, scores for all `labels` are either returned or + averaged depending on the `average` parameter. Use `labels` specify the set of + labels to calculate the score for. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : 1d array-like, or label indicator array / sparse matrix + Ground truth (correct) labels. + + y_pred : 1d array-like, or label indicator array / sparse matrix + Predicted labels, as returned by a classifier. + + labels : array-like of shape (n_classes,), default=None + The set of labels to include when `average != 'binary'`, and their + order if `average is None`. Labels present in the data can be + excluded, for example in multiclass classification to exclude a "negative + class". Labels not present in the data can be included and will be + "assigned" 0 samples. For multilabel targets, labels are column indices. + By default, all labels in `y_true` and `y_pred` are used in sorted order. + + pos_label : int, float, bool or str, default=1 + The class to report if `average='binary'` and the data is binary, + otherwise this parameter is ignored. + For multiclass or multilabel targets, set `labels=[pos_label]` and + `average != 'binary'` to report metrics for one label only. + + average : {'micro', 'macro', 'samples', 'weighted', \ + 'binary'} or None, default='binary' + If ``None``, the scores for each class are returned. 
Otherwise, this + determines the type of averaging performed on the data: + + ``'binary'``: + Only report results for the class specified by ``pos_label``. + This is applicable only if targets (``y_{true,pred}``) are binary. + ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average, weighted + by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance. + ``'samples'``: + Calculate metrics for each instance, and find their average (only + meaningful for multilabel classification). + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + zero_division : "warn", {0.0, 1.0}, default="warn" + Sets the value to return when there is a zero division, i.e. when there + there are no negative values in predictions and labels. If set to + "warn", this acts like 0, but a warning is also raised. + + .. versionadded:: 0.24 + + Returns + ------- + score : float or ndarray of shape (n_unique_labels,), dtype=np.float64 + The Jaccard score. When `average` is not `None`, a single scalar is + returned. + + See Also + -------- + accuracy_score : Function for calculating the accuracy score. + f1_score : Function for calculating the F1 score. + multilabel_confusion_matrix : Function for computing a confusion matrix\ + for each class or sample. + + Notes + ----- + :func:`jaccard_score` may be a poor metric if there are no + positives for some samples or classes. Jaccard is undefined if there are + no true or predicted labels, and our implementation will return a score + of 0 with a warning. + + References + ---------- + .. [1] `Wikipedia entry for the Jaccard index + `_. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import jaccard_score + >>> y_true = np.array([[0, 1, 1], + ... [1, 1, 0]]) + >>> y_pred = np.array([[1, 1, 1], + ... [1, 0, 0]]) + + In the binary case: + + >>> jaccard_score(y_true[0], y_pred[0]) + 0.6666 + + In the 2D comparison case (e.g. image similarity): + + >>> jaccard_score(y_true, y_pred, average="micro") + 0.6 + + In the multilabel case: + + >>> jaccard_score(y_true, y_pred, average='samples') + 0.5833 + >>> jaccard_score(y_true, y_pred, average='macro') + 0.6666 + >>> jaccard_score(y_true, y_pred, average=None) + array([0.5, 0.5, 1. ]) + + In the multiclass case: + + >>> y_pred = [0, 2, 1, 2] + >>> y_true = [0, 1, 2, 2] + >>> jaccard_score(y_true, y_pred, average=None) + array([1. , 0. 
, 0.33]) + """ + labels = _check_set_wise_labels(y_true, y_pred, average, labels, pos_label) + samplewise = average == "samples" + MCM = multilabel_confusion_matrix( + y_true, + y_pred, + sample_weight=sample_weight, + labels=labels, + samplewise=samplewise, + ) + numerator = MCM[:, 1, 1] + denominator = MCM[:, 1, 1] + MCM[:, 0, 1] + MCM[:, 1, 0] + + xp, _, device_ = get_namespace_and_device(y_true, y_pred) + if average == "micro": + numerator = xp.asarray(xp.sum(numerator, keepdims=True), device=device_) + denominator = xp.asarray(xp.sum(denominator, keepdims=True), device=device_) + + jaccard = _prf_divide( + numerator, + denominator, + "jaccard", + "true or predicted", + average, + ("jaccard",), + zero_division=zero_division, + ) + if average is None: + return jaccard + if average == "weighted": + weights = MCM[:, 1, 0] + MCM[:, 1, 1] + if not xp.any(weights): + # numerator is 0, and warning should have already been issued + weights = None + elif average == "samples" and sample_weight is not None: + weights = sample_weight + else: + weights = None + return float(_average(jaccard, weights=weights, xp=xp)) + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "sample_weight": ["array-like", None], + }, + prefer_skip_nested_validation=True, +) +def matthews_corrcoef(y_true, y_pred, *, sample_weight=None): + """Compute the Matthews correlation coefficient (MCC). + + The Matthews correlation coefficient is used in machine learning as a + measure of the quality of binary and multiclass classifications. It takes + into account true and false positives and negatives and is generally + regarded as a balanced measure which can be used even if the classes are of + very different sizes. The MCC is in essence a correlation coefficient value + between -1 and +1. A coefficient of +1 represents a perfect prediction, 0 + an average random prediction and -1 an inverse prediction. The statistic + is also known as the phi coefficient. [source: Wikipedia] + + Binary and multiclass labels are supported. Only in the binary case does + this relate to information about true and false positives and negatives. + See references below. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples,) + Estimated targets as returned by a classifier. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + .. versionadded:: 0.18 + + Returns + ------- + mcc : float + The Matthews correlation coefficient (+1 represents a perfect + prediction, 0 an average random prediction and -1 and inverse + prediction). + + References + ---------- + .. [1] :doi:`Baldi, Brunak, Chauvin, Andersen and Nielsen, (2000). Assessing the + accuracy of prediction algorithms for classification: an overview. + <10.1093/bioinformatics/16.5.412>` + + .. [2] `Wikipedia entry for the Matthews Correlation Coefficient (phi coefficient) + `_. + + .. [3] `Gorodkin, (2004). Comparing two K-category assignments by a + K-category correlation coefficient + `_. + + .. [4] `Jurman, Riccadonna, Furlanello, (2012). A Comparison of MCC and CEN + Error Measures in MultiClass Prediction + `_. 
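+
+    For binary targets, the value returned here agrees with the textbook
+    formula written directly in terms of confusion-matrix counts. A minimal
+    sketch of that identity (the toy labels below are illustrative only)::
+
+        import numpy as np
+        from sklearn.metrics import confusion_matrix, matthews_corrcoef
+
+        y_true, y_pred = [0, 1, 1, 0], [0, 1, 0, 0]
+        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
+        denom = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
+        mcc = (tp * tn - fp * fn) / denom if denom else 0.0
+        # same value as the general multiclass formula used by this function
+        assert np.isclose(mcc, matthews_corrcoef(y_true, y_pred))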
+ + Examples + -------- + >>> from sklearn.metrics import matthews_corrcoef + >>> y_true = [+1, +1, +1, -1] + >>> y_pred = [+1, -1, +1, +1] + >>> matthews_corrcoef(y_true, y_pred) + -0.33 + """ + y_true, y_pred = attach_unique(y_true, y_pred) + y_type, y_true, y_pred = _check_targets(y_true, y_pred) + check_consistent_length(y_true, y_pred, sample_weight) + if y_type not in {"binary", "multiclass"}: + raise ValueError("%s is not supported" % y_type) + + lb = LabelEncoder() + lb.fit(np.hstack([y_true, y_pred])) + y_true = lb.transform(y_true) + y_pred = lb.transform(y_pred) + + C = confusion_matrix(y_true, y_pred, sample_weight=sample_weight) + t_sum = C.sum(axis=1, dtype=np.float64) + p_sum = C.sum(axis=0, dtype=np.float64) + n_correct = np.trace(C, dtype=np.float64) + n_samples = p_sum.sum() + cov_ytyp = n_correct * n_samples - np.dot(t_sum, p_sum) + cov_ypyp = n_samples**2 - np.dot(p_sum, p_sum) + cov_ytyt = n_samples**2 - np.dot(t_sum, t_sum) + + cov_ypyp_ytyt = cov_ypyp * cov_ytyt + if cov_ypyp_ytyt == 0: + return 0.0 + else: + return float(cov_ytyp / np.sqrt(cov_ypyp_ytyt)) + + +@validate_params( + { + "y_true": ["array-like", "sparse matrix"], + "y_pred": ["array-like", "sparse matrix"], + "normalize": ["boolean"], + "sample_weight": ["array-like", None], + }, + prefer_skip_nested_validation=True, +) +def zero_one_loss(y_true, y_pred, *, normalize=True, sample_weight=None): + """Zero-one classification loss. + + If normalize is ``True``, return the fraction of misclassifications + (float), else it returns the number of misclassifications (int). The best + performance is 0. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : 1d array-like, or label indicator array / sparse matrix + Ground truth (correct) labels. + + y_pred : 1d array-like, or label indicator array / sparse matrix + Predicted labels, as returned by a classifier. + + normalize : bool, default=True + If ``False``, return the number of misclassifications. + Otherwise, return the fraction of misclassifications. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + loss : float or int, + If ``normalize == True``, return the fraction of misclassifications + (float), else it returns the number of misclassifications (int). + + See Also + -------- + accuracy_score : Compute the accuracy score. By default, the function will + return the fraction of correct predictions divided by the total number + of predictions. + hamming_loss : Compute the average Hamming loss or Hamming distance between + two sets of samples. + jaccard_score : Compute the Jaccard similarity coefficient score. + + Notes + ----- + In multilabel classification, the zero_one_loss function corresponds to + the subset zero-one loss: for each sample, the entire set of labels must be + correctly predicted, otherwise the loss for that sample is equal to one. 
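+
+    As a quick sanity check, the normalized loss is simply the complement of
+    :func:`accuracy_score`; a minimal sketch with arbitrary toy labels::
+
+        from sklearn.metrics import accuracy_score, zero_one_loss
+
+        y_true, y_pred = [0, 1, 1, 0], [0, 1, 0, 0]
+        # fraction misclassified == 1 - fraction correctly classified
+        assert zero_one_loss(y_true, y_pred) == 1 - accuracy_score(y_true, y_pred)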
+ + Examples + -------- + >>> from sklearn.metrics import zero_one_loss + >>> y_pred = [1, 2, 3, 4] + >>> y_true = [2, 2, 3, 4] + >>> zero_one_loss(y_true, y_pred) + 0.25 + >>> zero_one_loss(y_true, y_pred, normalize=False) + 1.0 + + In the multilabel case with binary label indicators: + + >>> import numpy as np + >>> zero_one_loss(np.array([[0, 1], [1, 1]]), np.ones((2, 2))) + 0.5 + """ + xp, _ = get_namespace(y_true, y_pred) + score = accuracy_score( + y_true, y_pred, normalize=normalize, sample_weight=sample_weight + ) + + if normalize: + return 1 - score + else: + if sample_weight is not None: + n_samples = xp.sum(sample_weight) + else: + n_samples = _num_samples(y_true) + return n_samples - score + + +@validate_params( + { + "y_true": ["array-like", "sparse matrix"], + "y_pred": ["array-like", "sparse matrix"], + "labels": ["array-like", None], + "pos_label": [Real, str, "boolean", None], + "average": [ + StrOptions({"micro", "macro", "samples", "weighted", "binary"}), + None, + ], + "sample_weight": ["array-like", None], + "zero_division": [ + Options(Real, {0.0, 1.0}), + "nan", + StrOptions({"warn"}), + ], + }, + prefer_skip_nested_validation=True, +) +def f1_score( + y_true, + y_pred, + *, + labels=None, + pos_label=1, + average="binary", + sample_weight=None, + zero_division="warn", +): + """Compute the F1 score, also known as balanced F-score or F-measure. + + The F1 score can be interpreted as a harmonic mean of the precision and + recall, where an F1 score reaches its best value at 1 and worst score at 0. + The relative contribution of precision and recall to the F1 score are + equal. The formula for the F1 score is: + + .. math:: + \\text{F1} = \\frac{2 * \\text{TP}}{2 * \\text{TP} + \\text{FP} + \\text{FN}} + + Where :math:`\\text{TP}` is the number of true positives, :math:`\\text{FN}` is the + number of false negatives, and :math:`\\text{FP}` is the number of false positives. + F1 is by default + calculated as 0.0 when there are no true positives, false negatives, or + false positives. + + Support beyond :term:`binary` targets is achieved by treating :term:`multiclass` + and :term:`multilabel` data as a collection of binary problems, one for each + label. For the :term:`binary` case, setting `average='binary'` will return + F1 score for `pos_label`. If `average` is not `'binary'`, `pos_label` is ignored + and F1 score for both classes are computed, then averaged or both returned (when + `average=None`). Similarly, for :term:`multiclass` and :term:`multilabel` targets, + F1 score for all `labels` are either returned or averaged depending on the + `average` parameter. Use `labels` specify the set of labels to calculate F1 score + for. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : 1d array-like, or label indicator array / sparse matrix + Ground truth (correct) target values. + + y_pred : 1d array-like, or label indicator array / sparse matrix + Estimated targets as returned by a classifier. + + labels : array-like, default=None + The set of labels to include when `average != 'binary'`, and their + order if `average is None`. Labels present in the data can be + excluded, for example in multiclass classification to exclude a "negative + class". Labels not present in the data can be included and will be + "assigned" 0 samples. For multilabel targets, labels are column indices. + By default, all labels in `y_true` and `y_pred` are used in sorted order. + + .. versionchanged:: 0.17 + Parameter `labels` improved for multiclass problem. 
+ + pos_label : int, float, bool or str, default=1 + The class to report if `average='binary'` and the data is binary, + otherwise this parameter is ignored. + For multiclass or multilabel targets, set `labels=[pos_label]` and + `average != 'binary'` to report metrics for one label only. + + average : {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, \ + default='binary' + This parameter is required for multiclass/multilabel targets. + If ``None``, the metrics for each class are returned. Otherwise, this + determines the type of averaging performed on the data: + + ``'binary'``: + Only report results for the class specified by ``pos_label``. + This is applicable only if targets (``y_{true,pred}``) are binary. + ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average weighted + by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance; it can result in an + F-score that is not between precision and recall. + ``'samples'``: + Calculate metrics for each instance, and find their average (only + meaningful for multilabel classification where this differs from + :func:`accuracy_score`). + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + zero_division : {"warn", 0.0, 1.0, np.nan}, default="warn" + Sets the value to return when there is a zero division, i.e. when all + predictions and labels are negative. + + Notes: + - If set to "warn", this acts like 0, but a warning is also raised. + - If set to `np.nan`, such values will be excluded from the average. + + .. versionadded:: 1.3 + `np.nan` option was added. + + Returns + ------- + f1_score : float or array of float, shape = [n_unique_labels] + F1 score of the positive class in binary classification or weighted + average of the F1 scores of each class for the multiclass task. + + See Also + -------- + fbeta_score : Compute the F-beta score. + precision_recall_fscore_support : Compute the precision, recall, F-score, + and support. + jaccard_score : Compute the Jaccard similarity coefficient score. + multilabel_confusion_matrix : Compute a confusion matrix for each class or + sample. + + Notes + ----- + When ``true positive + false positive + false negative == 0`` (i.e. a class + is completely absent from both ``y_true`` or ``y_pred``), f-score is + undefined. In such cases, by default f-score will be set to 0.0, and + ``UndefinedMetricWarning`` will be raised. This behavior can be modified by + setting the ``zero_division`` parameter. + + References + ---------- + .. [1] `Wikipedia entry for the F1-score + `_. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import f1_score + >>> y_true = [0, 1, 2, 0, 1, 2] + >>> y_pred = [0, 2, 1, 0, 0, 1] + >>> f1_score(y_true, y_pred, average='macro') + 0.267 + >>> f1_score(y_true, y_pred, average='micro') + 0.33 + >>> f1_score(y_true, y_pred, average='weighted') + 0.267 + >>> f1_score(y_true, y_pred, average=None) + array([0.8, 0. , 0. ]) + + >>> # binary classification + >>> y_true_empty = [0, 0, 0, 0, 0, 0] + >>> y_pred_empty = [0, 0, 0, 0, 0, 0] + >>> f1_score(y_true_empty, y_pred_empty) + 0.0... + >>> f1_score(y_true_empty, y_pred_empty, zero_division=1.0) + 1.0... 
+ >>> f1_score(y_true_empty, y_pred_empty, zero_division=np.nan) + nan... + + >>> # multilabel classification + >>> y_true = [[0, 0, 0], [1, 1, 1], [0, 1, 1]] + >>> y_pred = [[0, 0, 0], [1, 1, 1], [1, 1, 0]] + >>> f1_score(y_true, y_pred, average=None) + array([0.66666667, 1. , 0.66666667]) + """ + return fbeta_score( + y_true, + y_pred, + beta=1, + labels=labels, + pos_label=pos_label, + average=average, + sample_weight=sample_weight, + zero_division=zero_division, + ) + + +@validate_params( + { + "y_true": ["array-like", "sparse matrix"], + "y_pred": ["array-like", "sparse matrix"], + "beta": [Interval(Real, 0.0, None, closed="both")], + "labels": ["array-like", None], + "pos_label": [Real, str, "boolean", None], + "average": [ + StrOptions({"micro", "macro", "samples", "weighted", "binary"}), + None, + ], + "sample_weight": ["array-like", None], + "zero_division": [ + Options(Real, {0.0, 1.0}), + "nan", + StrOptions({"warn"}), + ], + }, + prefer_skip_nested_validation=True, +) +def fbeta_score( + y_true, + y_pred, + *, + beta, + labels=None, + pos_label=1, + average="binary", + sample_weight=None, + zero_division="warn", +): + """Compute the F-beta score. + + The F-beta score is the weighted harmonic mean of precision and recall, + reaching its optimal value at 1 and its worst value at 0. + + The `beta` parameter represents the ratio of recall importance to + precision importance. `beta > 1` gives more weight to recall, while + `beta < 1` favors precision. For example, `beta = 2` makes recall twice + as important as precision, while `beta = 0.5` does the opposite. + Asymptotically, `beta -> +inf` considers only recall, and `beta -> 0` + only precision. + + The formula for F-beta score is: + + .. math:: + + F_\\beta = \\frac{(1 + \\beta^2) \\text{tp}} + {(1 + \\beta^2) \\text{tp} + \\text{fp} + \\beta^2 \\text{fn}} + + Where :math:`\\text{tp}` is the number of true positives, :math:`\\text{fp}` is the + number of false positives, and :math:`\\text{fn}` is the number of false negatives. + + Support beyond term:`binary` targets is achieved by treating :term:`multiclass` + and :term:`multilabel` data as a collection of binary problems, one for each + label. For the :term:`binary` case, setting `average='binary'` will return + F-beta score for `pos_label`. If `average` is not `'binary'`, `pos_label` is + ignored and F-beta score for both classes are computed, then averaged or both + returned (when `average=None`). Similarly, for :term:`multiclass` and + :term:`multilabel` targets, F-beta score for all `labels` are either returned or + averaged depending on the `average` parameter. Use `labels` specify the set of + labels to calculate F-beta score for. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : 1d array-like, or label indicator array / sparse matrix + Ground truth (correct) target values. + + y_pred : 1d array-like, or label indicator array / sparse matrix + Estimated targets as returned by a classifier. + + beta : float + Determines the weight of recall in the combined score. + + labels : array-like, default=None + The set of labels to include when `average != 'binary'`, and their + order if `average is None`. Labels present in the data can be + excluded, for example in multiclass classification to exclude a "negative + class". Labels not present in the data can be included and will be + "assigned" 0 samples. For multilabel targets, labels are column indices. + By default, all labels in `y_true` and `y_pred` are used in sorted order. + + .. 
versionchanged:: 0.17 + Parameter `labels` improved for multiclass problem. + + pos_label : int, float, bool or str, default=1 + The class to report if `average='binary'` and the data is binary, + otherwise this parameter is ignored. + For multiclass or multilabel targets, set `labels=[pos_label]` and + `average != 'binary'` to report metrics for one label only. + + average : {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, \ + default='binary' + This parameter is required for multiclass/multilabel targets. + If ``None``, the metrics for each class are returned. Otherwise, this + determines the type of averaging performed on the data: + + ``'binary'``: + Only report results for the class specified by ``pos_label``. + This is applicable only if targets (``y_{true,pred}``) are binary. + ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average weighted + by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance; it can result in an + F-score that is not between precision and recall. + ``'samples'``: + Calculate metrics for each instance, and find their average (only + meaningful for multilabel classification where this differs from + :func:`accuracy_score`). + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + zero_division : {"warn", 0.0, 1.0, np.nan}, default="warn" + Sets the value to return when there is a zero division, i.e. when all + predictions and labels are negative. + + Notes: + + - If set to "warn", this acts like 0, but a warning is also raised. + - If set to `np.nan`, such values will be excluded from the average. + + .. versionadded:: 1.3 + `np.nan` option was added. + + Returns + ------- + fbeta_score : float (if average is not None) or array of float, shape =\ + [n_unique_labels] + F-beta score of the positive class in binary classification or weighted + average of the F-beta score of each class for the multiclass task. + + See Also + -------- + precision_recall_fscore_support : Compute the precision, recall, F-score, + and support. + multilabel_confusion_matrix : Compute a confusion matrix for each class or + sample. + + Notes + ----- + When ``true positive + false positive + false negative == 0``, f-score + returns 0.0 and raises ``UndefinedMetricWarning``. This behavior can be + modified by setting ``zero_division``. + + F-beta score is not implemented as a named scorer that can be passed to + the `scoring` parameter of cross-validation tools directly: it requires to be + wrapped with :func:`make_scorer` so as to specify the value of `beta`. See + examples for details. + + References + ---------- + .. [1] R. Baeza-Yates and B. Ribeiro-Neto (2011). + Modern Information Retrieval. Addison Wesley, pp. 327-328. + + .. [2] `Wikipedia entry for the F1-score + `_. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import fbeta_score + >>> y_true = [0, 1, 2, 0, 1, 2] + >>> y_pred = [0, 2, 1, 0, 0, 1] + >>> fbeta_score(y_true, y_pred, average='macro', beta=0.5) + 0.238 + >>> fbeta_score(y_true, y_pred, average='micro', beta=0.5) + 0.33 + >>> fbeta_score(y_true, y_pred, average='weighted', beta=0.5) + 0.238 + >>> fbeta_score(y_true, y_pred, average=None, beta=0.5) + array([0.71, 0. , 0. 
]) + >>> y_pred_empty = [0, 0, 0, 0, 0, 0] + >>> fbeta_score( + ... y_true, + ... y_pred_empty, + ... average="macro", + ... zero_division=np.nan, + ... beta=0.5, + ... ) + 0.128 + + In order to use :func:`fbeta_scorer` as a scorer, a callable + scorer objects needs to be created first with :func:`make_scorer`, + passing the value for the `beta` parameter. + + >>> from sklearn.metrics import fbeta_score, make_scorer + >>> ftwo_scorer = make_scorer(fbeta_score, beta=2) + >>> from sklearn.model_selection import GridSearchCV + >>> from sklearn.svm import LinearSVC + >>> grid = GridSearchCV( + ... LinearSVC(dual="auto"), + ... param_grid={'C': [1, 10]}, + ... scoring=ftwo_scorer, + ... cv=5 + ... ) + """ + + _, _, f, _ = precision_recall_fscore_support( + y_true, + y_pred, + beta=beta, + labels=labels, + pos_label=pos_label, + average=average, + warn_for=("f-score",), + sample_weight=sample_weight, + zero_division=zero_division, + ) + return f + + +def _prf_divide( + numerator, denominator, metric, modifier, average, warn_for, zero_division="warn" +): + """Performs division and handles divide-by-zero. + + On zero-division, sets the corresponding result elements equal to + 0, 1 or np.nan (according to ``zero_division``). Plus, if + ``zero_division != "warn"`` raises a warning. + + The metric, modifier and average arguments are used only for determining + an appropriate warning. + """ + xp, _ = get_namespace(numerator, denominator) + dtype_float = _find_matching_floating_dtype(numerator, denominator, xp=xp) + mask = denominator == 0 + denominator = xp.asarray(denominator, copy=True, dtype=dtype_float) + denominator[mask] = 1 # avoid infs/nans + result = xp.asarray(numerator, dtype=dtype_float) / denominator + + if not xp.any(mask): + return result + + # set those with 0 denominator to `zero_division`, and 0 when "warn" + zero_division_value = _check_zero_division(zero_division) + result[mask] = zero_division_value + + # we assume the user will be removing warnings if zero_division is set + # to something different than "warn". If we are computing only f-score + # the warning will be raised only if precision and recall are ill-defined + if zero_division != "warn" or metric not in warn_for: + return result + + # build appropriate warning + if metric in warn_for: + _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0]) + + return result + + +def _warn_prf(average, modifier, msg_start, result_size): + axis0, axis1 = "sample", "label" + if average == "samples": + axis0, axis1 = axis1, axis0 + msg = ( + "{0} ill-defined and being set to 0.0 {{0}} " + "no {1} {2}s. Use `zero_division` parameter to control" + " this behavior.".format(msg_start, modifier, axis0) + ) + if result_size == 1: + msg = msg.format("due to") + else: + msg = msg.format("in {0}s with".format(axis1)) + warnings.warn(msg, UndefinedMetricWarning, stacklevel=2) + + +def _check_set_wise_labels(y_true, y_pred, average, labels, pos_label): + """Validation associated with set-wise metrics. + + Returns identified labels. + """ + average_options = (None, "micro", "macro", "weighted", "samples") + if average not in average_options and average != "binary": + raise ValueError("average has to be one of " + str(average_options)) + + y_true, y_pred = attach_unique(y_true, y_pred) + y_type, y_true, y_pred = _check_targets(y_true, y_pred) + # Convert to Python primitive type to avoid NumPy type / Python str + # comparison. 
See https://github.com/numpy/numpy/issues/6784 + present_labels = _tolist(unique_labels(y_true, y_pred)) + if average == "binary": + if y_type == "binary": + if pos_label not in present_labels: + if len(present_labels) >= 2: + raise ValueError( + f"pos_label={pos_label} is not a valid label. It " + f"should be one of {present_labels}" + ) + labels = [pos_label] + else: + average_options = list(average_options) + if y_type == "multiclass": + average_options.remove("samples") + raise ValueError( + "Target is %s but average='binary'. Please " + "choose another average setting, one of %r." % (y_type, average_options) + ) + elif pos_label not in (None, 1): + warnings.warn( + "Note that pos_label (set to %r) is ignored when " + "average != 'binary' (got %r). You may use " + "labels=[pos_label] to specify a single positive class." + % (pos_label, average), + UserWarning, + ) + return labels + + +@validate_params( + { + "y_true": ["array-like", "sparse matrix"], + "y_pred": ["array-like", "sparse matrix"], + "beta": [Interval(Real, 0.0, None, closed="both")], + "labels": ["array-like", None], + "pos_label": [Real, str, "boolean", None], + "average": [ + StrOptions({"micro", "macro", "samples", "weighted", "binary"}), + None, + ], + "warn_for": [list, tuple, set], + "sample_weight": ["array-like", None], + "zero_division": [ + Options(Real, {0.0, 1.0}), + "nan", + StrOptions({"warn"}), + ], + }, + prefer_skip_nested_validation=True, +) +def precision_recall_fscore_support( + y_true, + y_pred, + *, + beta=1.0, + labels=None, + pos_label=1, + average=None, + warn_for=("precision", "recall", "f-score"), + sample_weight=None, + zero_division="warn", +): + """Compute precision, recall, F-measure and support for each class. + + The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of + true positives and ``fp`` the number of false positives. The precision is + intuitively the ability of the classifier not to label a negative sample as + positive. + + The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of + true positives and ``fn`` the number of false negatives. The recall is + intuitively the ability of the classifier to find all the positive samples. + + The F-beta score can be interpreted as a weighted harmonic mean of + the precision and recall, where an F-beta score reaches its best + value at 1 and worst score at 0. + + The F-beta score weights recall more than precision by a factor of + ``beta``. ``beta == 1.0`` means recall and precision are equally important. + + The support is the number of occurrences of each class in ``y_true``. + + Support beyond term:`binary` targets is achieved by treating :term:`multiclass` + and :term:`multilabel` data as a collection of binary problems, one for each + label. For the :term:`binary` case, setting `average='binary'` will return + metrics for `pos_label`. If `average` is not `'binary'`, `pos_label` is ignored + and metrics for both classes are computed, then averaged or both returned (when + `average=None`). Similarly, for :term:`multiclass` and :term:`multilabel` targets, + metrics for all `labels` are either returned or averaged depending on the `average` + parameter. Use `labels` specify the set of labels to calculate metrics for. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : 1d array-like, or label indicator array / sparse matrix + Ground truth (correct) target values. 
+ + y_pred : 1d array-like, or label indicator array / sparse matrix + Estimated targets as returned by a classifier. + + beta : float, default=1.0 + The strength of recall versus precision in the F-score. + + labels : array-like, default=None + The set of labels to include when `average != 'binary'`, and their + order if `average is None`. Labels present in the data can be + excluded, for example in multiclass classification to exclude a "negative + class". Labels not present in the data can be included and will be + "assigned" 0 samples. For multilabel targets, labels are column indices. + By default, all labels in `y_true` and `y_pred` are used in sorted order. + + .. versionchanged:: 0.17 + Parameter `labels` improved for multiclass problem. + + pos_label : int, float, bool or str, default=1 + The class to report if `average='binary'` and the data is binary, + otherwise this parameter is ignored. + For multiclass or multilabel targets, set `labels=[pos_label]` and + `average != 'binary'` to report metrics for one label only. + + average : {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, \ + default='binary' + This parameter is required for multiclass/multilabel targets. + If ``None``, the metrics for each class are returned. Otherwise, this + determines the type of averaging performed on the data: + + ``'binary'``: + Only report results for the class specified by ``pos_label``. + This is applicable only if targets (``y_{true,pred}``) are binary. + ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average weighted + by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance; it can result in an + F-score that is not between precision and recall. + ``'samples'``: + Calculate metrics for each instance, and find their average (only + meaningful for multilabel classification where this differs from + :func:`accuracy_score`). + + warn_for : list, tuple or set, for internal use + This determines which warnings will be made in the case that this + function is being used to return only one of its metrics. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + zero_division : {"warn", 0.0, 1.0, np.nan}, default="warn" + Sets the value to return when there is a zero division: + + - recall: when there are no positive labels + - precision: when there are no positive predictions + - f-score: both + + Notes: + + - If set to "warn", this acts like 0, but a warning is also raised. + - If set to `np.nan`, such values will be excluded from the average. + + .. versionadded:: 1.3 + `np.nan` option was added. + + Returns + ------- + precision : float (if average is not None) or array of float, shape =\ + [n_unique_labels] + Precision score. + + recall : float (if average is not None) or array of float, shape =\ + [n_unique_labels] + Recall score. + + fbeta_score : float (if average is not None) or array of float, shape =\ + [n_unique_labels] + F-beta score. + + support : None (if average is not None) or array of int, shape =\ + [n_unique_labels] + The number of occurrences of each label in ``y_true``. + + Notes + ----- + When ``true positive + false positive == 0``, precision is undefined. 
+ When ``true positive + false negative == 0``, recall is undefined. When + ``true positive + false negative + false positive == 0``, f-score is + undefined. In such cases, by default the metric will be set to 0, and + ``UndefinedMetricWarning`` will be raised. This behavior can be modified + with ``zero_division``. + + References + ---------- + .. [1] `Wikipedia entry for the Precision and recall + `_. + + .. [2] `Wikipedia entry for the F1-score + `_. + + .. [3] `Discriminative Methods for Multi-labeled Classification Advances + in Knowledge Discovery and Data Mining (2004), pp. 22-30 by Shantanu + Godbole, Sunita Sarawagi + `_. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import precision_recall_fscore_support + >>> y_true = np.array(['cat', 'dog', 'pig', 'cat', 'dog', 'pig']) + >>> y_pred = np.array(['cat', 'pig', 'dog', 'cat', 'cat', 'dog']) + >>> precision_recall_fscore_support(y_true, y_pred, average='macro') + (0.222, 0.333, 0.267, None) + >>> precision_recall_fscore_support(y_true, y_pred, average='micro') + (0.33, 0.33, 0.33, None) + >>> precision_recall_fscore_support(y_true, y_pred, average='weighted') + (0.222, 0.333, 0.267, None) + + It is possible to compute per-label precisions, recalls, F1-scores and + supports instead of averaging: + + >>> precision_recall_fscore_support(y_true, y_pred, average=None, + ... labels=['pig', 'dog', 'cat']) + (array([0. , 0. , 0.66]), + array([0., 0., 1.]), array([0. , 0. , 0.8]), + array([2, 2, 2])) + """ + _check_zero_division(zero_division) + labels = _check_set_wise_labels(y_true, y_pred, average, labels, pos_label) + + # Calculate tp_sum, pred_sum, true_sum ### + samplewise = average == "samples" + MCM = multilabel_confusion_matrix( + y_true, + y_pred, + sample_weight=sample_weight, + labels=labels, + samplewise=samplewise, + ) + tp_sum = MCM[:, 1, 1] + pred_sum = tp_sum + MCM[:, 0, 1] + true_sum = tp_sum + MCM[:, 1, 0] + + xp, _, device_ = get_namespace_and_device(y_true, y_pred) + if average == "micro": + tp_sum = xp.reshape(xp.sum(tp_sum), (1,)) + pred_sum = xp.reshape(xp.sum(pred_sum), (1,)) + true_sum = xp.reshape(xp.sum(true_sum), (1,)) + + # Finally, we have all our sufficient statistics. Divide! 
# + beta2 = beta**2 + + # Divide, and on zero-division, set scores and/or warn according to + # zero_division: + precision = _prf_divide( + tp_sum, pred_sum, "precision", "predicted", average, warn_for, zero_division + ) + recall = _prf_divide( + tp_sum, true_sum, "recall", "true", average, warn_for, zero_division + ) + + if np.isposinf(beta): + f_score = recall + elif beta == 0: + f_score = precision + else: + # The score is defined as: + # score = (1 + beta**2) * precision * recall / (beta**2 * precision + recall) + # Therefore, we can express the score in terms of confusion matrix entries as: + # score = (1 + beta**2) * tp / ((1 + beta**2) * tp + beta**2 * fn + fp) + + # Array api strict requires all arrays to be of the same type so we + # need to convert true_sum, pred_sum and tp_sum to the max supported + # float dtype because beta2 is a float + max_float_type = _max_precision_float_dtype(xp=xp, device=device_) + denom = beta2 * xp.astype(true_sum, max_float_type) + xp.astype( + pred_sum, max_float_type + ) + f_score = _prf_divide( + (1 + beta2) * xp.astype(tp_sum, max_float_type), + denom, + "f-score", + "true nor predicted", + average, + warn_for, + zero_division, + ) + + # Average the results + if average == "weighted": + weights = true_sum + elif average == "samples": + weights = sample_weight + else: + weights = None + + if average is not None: + precision = float(_nanaverage(precision, weights=weights)) + recall = float(_nanaverage(recall, weights=weights)) + f_score = float(_nanaverage(f_score, weights=weights)) + true_sum = None # return no support + + return precision, recall, f_score, true_sum + + +@validate_params( + { + "y_true": ["array-like", "sparse matrix"], + "y_pred": ["array-like", "sparse matrix"], + "labels": ["array-like", None], + "sample_weight": ["array-like", None], + "raise_warning": ["boolean", Hidden(StrOptions({"deprecated"}))], + "replace_undefined_by": [ + Options(Real, {1.0, np.nan}), + dict, + ], + }, + prefer_skip_nested_validation=True, +) +def class_likelihood_ratios( + y_true, + y_pred, + *, + labels=None, + sample_weight=None, + raise_warning="deprecated", + replace_undefined_by=np.nan, +): + """Compute binary classification positive and negative likelihood ratios. + + The positive likelihood ratio is `LR+ = sensitivity / (1 - specificity)` + where the sensitivity or recall is the ratio `tp / (tp + fn)` and the + specificity is `tn / (tn + fp)`. The negative likelihood ratio is `LR- = (1 + - sensitivity) / specificity`. Here `tp` is the number of true positives, + `fp` the number of false positives, `tn` is the number of true negatives and + `fn` the number of false negatives. Both class likelihood ratios can be used + to obtain post-test probabilities given a pre-test probability. + + `LR+` ranges from 1.0 to infinity. A `LR+` of 1.0 indicates that the probability + of predicting the positive class is the same for samples belonging to either + class; therefore, the test is useless. The greater `LR+` is, the more a + positive prediction is likely to be a true positive when compared with the + pre-test probability. A value of `LR+` lower than 1.0 is invalid as it would + indicate that the odds of a sample being a true positive decrease with + respect to the pre-test odds. + + `LR-` ranges from 0.0 to 1.0. The closer it is to 0.0, the lower the probability + of a given sample to be a false negative. A `LR-` of 1.0 means the test is + useless because the odds of having the condition did not change after the + test. 
A value of `LR-` greater than 1.0 invalidates the classifier as it + indicates an increase in the odds of a sample belonging to the positive + class after being classified as negative. This is the case when the + classifier systematically predicts the opposite of the true label. + + A typical application in medicine is to identify the positive/negative class + to the presence/absence of a disease, respectively; the classifier being a + diagnostic test; the pre-test probability of an individual having the + disease can be the prevalence of such disease (proportion of a particular + population found to be affected by a medical condition); and the post-test + probabilities would be the probability that the condition is truly present + given a positive test result. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : 1d array-like, or label indicator array / sparse matrix + Ground truth (correct) target values. + + y_pred : 1d array-like, or label indicator array / sparse matrix + Estimated targets as returned by a classifier. + + labels : array-like, default=None + List of labels to index the matrix. This may be used to select the + positive and negative classes with the ordering `labels=[negative_class, + positive_class]`. If `None` is given, those that appear at least once in + `y_true` or `y_pred` are used in sorted order. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + raise_warning : bool, default=True + Whether or not a case-specific warning message is raised when there is division + by zero. + + .. deprecated:: 1.7 + `raise_warning` was deprecated in version 1.7 and will be removed in 1.9, + when an :class:`~sklearn.exceptions.UndefinedMetricWarning` will always + raise in case of a division by zero. + + replace_undefined_by : np.nan, 1.0, or dict, default=np.nan + Sets the return values for LR+ and LR- when there is a division by zero. Can + take the following values: + + - `np.nan` to return `np.nan` for both `LR+` and `LR-` + - `1.0` to return the worst possible scores: `{"LR+": 1.0, "LR-": 1.0}` + - a dict in the format `{"LR+": value_1, "LR-": value_2}` where the values can + be non-negative floats, `np.inf` or `np.nan` in the range of the + likelihood ratios. For example, `{"LR+": 1.0, "LR-": 1.0}` can be used for + returning the worst scores, indicating a useless model, and `{"LR+": np.inf, + "LR-": 0.0}` can be used for returning the best scores, indicating a useful + model. + + If a division by zero occurs, only the affected metric is replaced with the set + value; the other metric is calculated as usual. + + .. versionadded:: 1.7 + + Returns + ------- + (positive_likelihood_ratio, negative_likelihood_ratio) : tuple + A tuple of two floats, the first containing the positive likelihood ratio (LR+) + and the second the negative likelihood ratio (LR-). + + Warns + ----- + Raises :class:`~sklearn.exceptions.UndefinedMetricWarning` when `y_true` and + `y_pred` lead to the following conditions: + + - The number of false positives is 0 and `raise_warning` is set to `True` + (default): positive likelihood ratio is undefined. + - The number of true negatives is 0 and `raise_warning` is set to `True` + (default): negative likelihood ratio is undefined. + - The sum of true positives and false negatives is 0 (no samples of the positive + class are present in `y_true`): both likelihood ratios are undefined. + + For the first two cases, an undefined metric can be defined by setting the + `replace_undefined_by` param. 
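    As an illustrative sketch of the post-test computation mentioned above (the
    pre-test probability of 0.1 is an assumed prevalence, not a value produced by
    this function), the post-test odds are the pre-test odds multiplied by `LR+`:

    >>> lr_pos = 1.5                        # e.g. the LR+ from the Examples below
    >>> pre_test_prob = 0.1                 # assumed pre-test probability
    >>> pre_test_odds = pre_test_prob / (1 - pre_test_prob)
    >>> post_test_odds = lr_pos * pre_test_odds
    >>> round(post_test_odds / (1 + post_test_odds), 3)
    0.143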
+ + References + ---------- + .. [1] `Wikipedia entry for the Likelihood ratios in diagnostic testing + `_. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import class_likelihood_ratios + >>> class_likelihood_ratios([0, 1, 0, 1, 0], [1, 1, 0, 0, 0]) + (1.5, 0.75) + >>> y_true = np.array(["non-cat", "cat", "non-cat", "cat", "non-cat"]) + >>> y_pred = np.array(["cat", "cat", "non-cat", "non-cat", "non-cat"]) + >>> class_likelihood_ratios(y_true, y_pred) + (1.33, 0.66) + >>> y_true = np.array(["non-zebra", "zebra", "non-zebra", "zebra", "non-zebra"]) + >>> y_pred = np.array(["zebra", "zebra", "non-zebra", "non-zebra", "non-zebra"]) + >>> class_likelihood_ratios(y_true, y_pred) + (1.5, 0.75) + + To avoid ambiguities, use the notation `labels=[negative_class, + positive_class]` + + >>> y_true = np.array(["non-cat", "cat", "non-cat", "cat", "non-cat"]) + >>> y_pred = np.array(["cat", "cat", "non-cat", "non-cat", "non-cat"]) + >>> class_likelihood_ratios(y_true, y_pred, labels=["non-cat", "cat"]) + (1.5, 0.75) + """ + # TODO(1.9): When `raise_warning` is removed, the following changes need to be made: + # The checks for `raise_warning==True` need to be removed and we will always warn, + # remove `FutureWarning`, and the Warns section in the docstring should not mention + # `raise_warning` anymore. + y_true, y_pred = attach_unique(y_true, y_pred) + y_type, y_true, y_pred = _check_targets(y_true, y_pred) + if y_type != "binary": + raise ValueError( + "class_likelihood_ratios only supports binary classification " + f"problems, got targets of type: {y_type}" + ) + + msg_deprecated_param = ( + "`raise_warning` was deprecated in version 1.7 and will be removed in 1.9. An " + "`UndefinedMetricWarning` will always be raised in case of a division by zero " + "and the value set with the `replace_undefined_by` param will be returned." + ) + if raise_warning != "deprecated": + warnings.warn(msg_deprecated_param, FutureWarning) + else: + raise_warning = True + + if replace_undefined_by == 1.0: + replace_undefined_by = {"LR+": 1.0, "LR-": 1.0} + + if isinstance(replace_undefined_by, dict): + msg = ( + "The dictionary passed as `replace_undefined_by` needs to be in the form " + "`{'LR+': `value_1`, 'LR-': `value_2`}` where the value for `LR+` ranges " + "from `1.0` to `np.inf` or is `np.nan` and the value for `LR-` ranges from " + f"`0.0` to `1.0` or is `np.nan`; got `{replace_undefined_by}`." + ) + if ("LR+" in replace_undefined_by) and ("LR-" in replace_undefined_by): + try: + desired_lr_pos = replace_undefined_by.get("LR+", None) + check_scalar( + desired_lr_pos, + "positive_likelihood_ratio", + target_type=(Real), + min_val=1.0, + include_boundaries="left", + ) + desired_lr_neg = replace_undefined_by.get("LR-", None) + check_scalar( + desired_lr_neg, + "negative_likelihood_ratio", + target_type=(Real), + min_val=0.0, + max_val=1.0, + include_boundaries="both", + ) + except Exception as e: + raise ValueError(msg) from e + else: + raise ValueError(msg) + + cm = confusion_matrix( + y_true, + y_pred, + sample_weight=sample_weight, + labels=labels, + ) + + tn, fp, fn, tp = cm.ravel() + support_pos = tp + fn + support_neg = tn + fp + pos_num = tp * support_neg + pos_denom = fp * support_pos + neg_num = fn * support_neg + neg_denom = tn * support_pos + + # if `support_pos == 0`a division by zero will occur + if support_pos == 0: + msg = ( + "No samples of the positive class are present in `y_true`. 
" + "`positive_likelihood_ratio` and `negative_likelihood_ratio` are both set " + "to `np.nan`. Use the `replace_undefined_by` param to control this " + "behavior. To suppress this warning or turn it into an error, see Python's " + "`warnings` module and `warnings.catch_warnings()`." + ) + warnings.warn(msg, UndefinedMetricWarning, stacklevel=2) + positive_likelihood_ratio = np.nan + negative_likelihood_ratio = np.nan + + # if `fp == 0`a division by zero will occur + if fp == 0: + if raise_warning: + if tp == 0: + msg_beginning = ( + "No samples were predicted for the positive class and " + "`positive_likelihood_ratio` is " + ) + else: + msg_beginning = "`positive_likelihood_ratio` is ill-defined and " + msg_end = "set to `np.nan`. Use the `replace_undefined_by` param to " + "control this behavior. To suppress this warning or turn it into an error, " + "see Python's `warnings` module and `warnings.catch_warnings()`." + warnings.warn(msg_beginning + msg_end, UndefinedMetricWarning, stacklevel=2) + if isinstance(replace_undefined_by, float) and np.isnan(replace_undefined_by): + positive_likelihood_ratio = replace_undefined_by + else: + # replace_undefined_by is a dict and + # isinstance(replace_undefined_by.get("LR+", None), Real); this includes + # `np.inf` and `np.nan` + positive_likelihood_ratio = desired_lr_pos + else: + positive_likelihood_ratio = pos_num / pos_denom + + # if `tn == 0`a division by zero will occur + if tn == 0: + if raise_warning: + msg = ( + "`negative_likelihood_ratio` is ill-defined and set to `np.nan`. " + "Use the `replace_undefined_by` param to control this behavior. To " + "suppress this warning or turn it into an error, see Python's " + "`warnings` module and `warnings.catch_warnings()`." + ) + warnings.warn(msg, UndefinedMetricWarning, stacklevel=2) + if isinstance(replace_undefined_by, float) and np.isnan(replace_undefined_by): + negative_likelihood_ratio = replace_undefined_by + else: + # replace_undefined_by is a dict and + # isinstance(replace_undefined_by.get("LR-", None), Real); this includes + # `np.nan` + negative_likelihood_ratio = desired_lr_neg + else: + negative_likelihood_ratio = neg_num / neg_denom + + return float(positive_likelihood_ratio), float(negative_likelihood_ratio) + + +@validate_params( + { + "y_true": ["array-like", "sparse matrix"], + "y_pred": ["array-like", "sparse matrix"], + "labels": ["array-like", None], + "pos_label": [Real, str, "boolean", None], + "average": [ + StrOptions({"micro", "macro", "samples", "weighted", "binary"}), + None, + ], + "sample_weight": ["array-like", None], + "zero_division": [ + Options(Real, {0.0, 1.0}), + "nan", + StrOptions({"warn"}), + ], + }, + prefer_skip_nested_validation=True, +) +def precision_score( + y_true, + y_pred, + *, + labels=None, + pos_label=1, + average="binary", + sample_weight=None, + zero_division="warn", +): + """Compute the precision. + + The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of + true positives and ``fp`` the number of false positives. The precision is + intuitively the ability of the classifier not to label as positive a sample + that is negative. + + The best value is 1 and the worst value is 0. + + Support beyond term:`binary` targets is achieved by treating :term:`multiclass` + and :term:`multilabel` data as a collection of binary problems, one for each + label. For the :term:`binary` case, setting `average='binary'` will return + precision for `pos_label`. 
If `average` is not `'binary'`, `pos_label` is ignored + and precision for both classes are computed, then averaged or both returned (when + `average=None`). Similarly, for :term:`multiclass` and :term:`multilabel` targets, + precision for all `labels` are either returned or averaged depending on the + `average` parameter. Use `labels` specify the set of labels to calculate precision + for. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : 1d array-like, or label indicator array / sparse matrix + Ground truth (correct) target values. + + y_pred : 1d array-like, or label indicator array / sparse matrix + Estimated targets as returned by a classifier. + + labels : array-like, default=None + The set of labels to include when `average != 'binary'`, and their + order if `average is None`. Labels present in the data can be + excluded, for example in multiclass classification to exclude a "negative + class". Labels not present in the data can be included and will be + "assigned" 0 samples. For multilabel targets, labels are column indices. + By default, all labels in `y_true` and `y_pred` are used in sorted order. + + .. versionchanged:: 0.17 + Parameter `labels` improved for multiclass problem. + + pos_label : int, float, bool or str, default=1 + The class to report if `average='binary'` and the data is binary, + otherwise this parameter is ignored. + For multiclass or multilabel targets, set `labels=[pos_label]` and + `average != 'binary'` to report metrics for one label only. + + average : {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, \ + default='binary' + This parameter is required for multiclass/multilabel targets. + If ``None``, the metrics for each class are returned. Otherwise, this + determines the type of averaging performed on the data: + + ``'binary'``: + Only report results for the class specified by ``pos_label``. + This is applicable only if targets (``y_{true,pred}``) are binary. + ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average weighted + by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance; it can result in an + F-score that is not between precision and recall. + ``'samples'``: + Calculate metrics for each instance, and find their average (only + meaningful for multilabel classification where this differs from + :func:`accuracy_score`). + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + zero_division : {"warn", 0.0, 1.0, np.nan}, default="warn" + Sets the value to return when there is a zero division. + + Notes: + + - If set to "warn", this acts like 0, but a warning is also raised. + - If set to `np.nan`, such values will be excluded from the average. + + .. versionadded:: 1.3 + `np.nan` option was added. + + Returns + ------- + precision : float (if average is not None) or array of float of shape \ + (n_unique_labels,) + Precision of the positive class in binary classification or weighted + average of the precision of each class for the multiclass task. + + See Also + -------- + precision_recall_fscore_support : Compute precision, recall, F-measure and + support for each class. 
+ recall_score : Compute the ratio ``tp / (tp + fn)`` where ``tp`` is the + number of true positives and ``fn`` the number of false negatives. + PrecisionRecallDisplay.from_estimator : Plot precision-recall curve given + an estimator and some data. + PrecisionRecallDisplay.from_predictions : Plot precision-recall curve given + binary class predictions. + multilabel_confusion_matrix : Compute a confusion matrix for each class or + sample. + + Notes + ----- + When ``true positive + false positive == 0``, precision returns 0 and + raises ``UndefinedMetricWarning``. This behavior can be + modified with ``zero_division``. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import precision_score + >>> y_true = [0, 1, 2, 0, 1, 2] + >>> y_pred = [0, 2, 1, 0, 0, 1] + >>> precision_score(y_true, y_pred, average='macro') + 0.22 + >>> precision_score(y_true, y_pred, average='micro') + 0.33 + >>> precision_score(y_true, y_pred, average='weighted') + 0.22 + >>> precision_score(y_true, y_pred, average=None) + array([0.66, 0. , 0. ]) + >>> y_pred = [0, 0, 0, 0, 0, 0] + >>> precision_score(y_true, y_pred, average=None) + array([0.33, 0. , 0. ]) + >>> precision_score(y_true, y_pred, average=None, zero_division=1) + array([0.33, 1. , 1. ]) + >>> precision_score(y_true, y_pred, average=None, zero_division=np.nan) + array([0.33, nan, nan]) + + >>> # multilabel classification + >>> y_true = [[0, 0, 0], [1, 1, 1], [0, 1, 1]] + >>> y_pred = [[0, 0, 0], [1, 1, 1], [1, 1, 0]] + >>> precision_score(y_true, y_pred, average=None) + array([0.5, 1. , 1. ]) + """ + p, _, _, _ = precision_recall_fscore_support( + y_true, + y_pred, + labels=labels, + pos_label=pos_label, + average=average, + warn_for=("precision",), + sample_weight=sample_weight, + zero_division=zero_division, + ) + return p + + +@validate_params( + { + "y_true": ["array-like", "sparse matrix"], + "y_pred": ["array-like", "sparse matrix"], + "labels": ["array-like", None], + "pos_label": [Real, str, "boolean", None], + "average": [ + StrOptions({"micro", "macro", "samples", "weighted", "binary"}), + None, + ], + "sample_weight": ["array-like", None], + "zero_division": [ + Options(Real, {0.0, 1.0}), + "nan", + StrOptions({"warn"}), + ], + }, + prefer_skip_nested_validation=True, +) +def recall_score( + y_true, + y_pred, + *, + labels=None, + pos_label=1, + average="binary", + sample_weight=None, + zero_division="warn", +): + """Compute the recall. + + The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of + true positives and ``fn`` the number of false negatives. The recall is + intuitively the ability of the classifier to find all the positive samples. + + The best value is 1 and the worst value is 0. + + Support beyond term:`binary` targets is achieved by treating :term:`multiclass` + and :term:`multilabel` data as a collection of binary problems, one for each + label. For the :term:`binary` case, setting `average='binary'` will return + recall for `pos_label`. If `average` is not `'binary'`, `pos_label` is ignored + and recall for both classes are computed then averaged or both returned (when + `average=None`). Similarly, for :term:`multiclass` and :term:`multilabel` targets, + recall for all `labels` are either returned or averaged depending on the `average` + parameter. Use `labels` specify the set of labels to calculate recall for. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : 1d array-like, or label indicator array / sparse matrix + Ground truth (correct) target values. 
+ + y_pred : 1d array-like, or label indicator array / sparse matrix + Estimated targets as returned by a classifier. + + labels : array-like, default=None + The set of labels to include when `average != 'binary'`, and their + order if `average is None`. Labels present in the data can be + excluded, for example in multiclass classification to exclude a "negative + class". Labels not present in the data can be included and will be + "assigned" 0 samples. For multilabel targets, labels are column indices. + By default, all labels in `y_true` and `y_pred` are used in sorted order. + + .. versionchanged:: 0.17 + Parameter `labels` improved for multiclass problem. + + pos_label : int, float, bool or str, default=1 + The class to report if `average='binary'` and the data is binary, + otherwise this parameter is ignored. + For multiclass or multilabel targets, set `labels=[pos_label]` and + `average != 'binary'` to report metrics for one label only. + + average : {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, \ + default='binary' + This parameter is required for multiclass/multilabel targets. + If ``None``, the metrics for each class are returned. Otherwise, this + determines the type of averaging performed on the data: + + ``'binary'``: + Only report results for the class specified by ``pos_label``. + This is applicable only if targets (``y_{true,pred}``) are binary. + ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average weighted + by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance; it can result in an + F-score that is not between precision and recall. Weighted recall + is equal to accuracy. + ``'samples'``: + Calculate metrics for each instance, and find their average (only + meaningful for multilabel classification where this differs from + :func:`accuracy_score`). + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + zero_division : {"warn", 0.0, 1.0, np.nan}, default="warn" + Sets the value to return when there is a zero division. + + Notes: + + - If set to "warn", this acts like 0, but a warning is also raised. + - If set to `np.nan`, such values will be excluded from the average. + + .. versionadded:: 1.3 + `np.nan` option was added. + + Returns + ------- + recall : float (if average is not None) or array of float of shape \ + (n_unique_labels,) + Recall of the positive class in binary classification or weighted + average of the recall of each class for the multiclass task. + + See Also + -------- + precision_recall_fscore_support : Compute precision, recall, F-measure and + support for each class. + precision_score : Compute the ratio ``tp / (tp + fp)`` where ``tp`` is the + number of true positives and ``fp`` the number of false positives. + balanced_accuracy_score : Compute balanced accuracy to deal with imbalanced + datasets. + multilabel_confusion_matrix : Compute a confusion matrix for each class or + sample. + PrecisionRecallDisplay.from_estimator : Plot precision-recall curve given + an estimator and some data. + PrecisionRecallDisplay.from_predictions : Plot precision-recall curve given + binary class predictions. 
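    A quick illustrative check of the statement above that weighted recall equals
    accuracy (reusing the multiclass data from the Examples section below):

    >>> from sklearn.metrics import accuracy_score, recall_score
    >>> y_true = [0, 1, 2, 0, 1, 2]
    >>> y_pred = [0, 2, 1, 0, 0, 1]
    >>> round(recall_score(y_true, y_pred, average='weighted'), 3)
    0.333
    >>> round(accuracy_score(y_true, y_pred), 3)
    0.333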
+ + Notes + ----- + When ``true positive + false negative == 0``, recall returns 0 and raises + ``UndefinedMetricWarning``. This behavior can be modified with + ``zero_division``. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import recall_score + >>> y_true = [0, 1, 2, 0, 1, 2] + >>> y_pred = [0, 2, 1, 0, 0, 1] + >>> recall_score(y_true, y_pred, average='macro') + 0.33 + >>> recall_score(y_true, y_pred, average='micro') + 0.33 + >>> recall_score(y_true, y_pred, average='weighted') + 0.33 + >>> recall_score(y_true, y_pred, average=None) + array([1., 0., 0.]) + >>> y_true = [0, 0, 0, 0, 0, 0] + >>> recall_score(y_true, y_pred, average=None) + array([0.5, 0. , 0. ]) + >>> recall_score(y_true, y_pred, average=None, zero_division=1) + array([0.5, 1. , 1. ]) + >>> recall_score(y_true, y_pred, average=None, zero_division=np.nan) + array([0.5, nan, nan]) + + >>> # multilabel classification + >>> y_true = [[0, 0, 0], [1, 1, 1], [0, 1, 1]] + >>> y_pred = [[0, 0, 0], [1, 1, 1], [1, 1, 0]] + >>> recall_score(y_true, y_pred, average=None) + array([1. , 1. , 0.5]) + """ + _, r, _, _ = precision_recall_fscore_support( + y_true, + y_pred, + labels=labels, + pos_label=pos_label, + average=average, + warn_for=("recall",), + sample_weight=sample_weight, + zero_division=zero_division, + ) + return r + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "sample_weight": ["array-like", None], + "adjusted": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def balanced_accuracy_score(y_true, y_pred, *, sample_weight=None, adjusted=False): + """Compute the balanced accuracy. + + The balanced accuracy in binary and multiclass classification problems to + deal with imbalanced datasets. It is defined as the average of recall + obtained on each class. + + The best value is 1 and the worst value is 0 when ``adjusted=False``. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.20 + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples,) + Estimated targets as returned by a classifier. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + adjusted : bool, default=False + When true, the result is adjusted for chance, so that random + performance would score 0, while keeping perfect performance at a score + of 1. + + Returns + ------- + balanced_accuracy : float + Balanced accuracy score. + + See Also + -------- + average_precision_score : Compute average precision (AP) from prediction + scores. + precision_score : Compute the precision score. + recall_score : Compute the recall score. + roc_auc_score : Compute Area Under the Receiver Operating Characteristic + Curve (ROC AUC) from prediction scores. + + Notes + ----- + Some literature promotes alternative definitions of balanced accuracy. Our + definition is equivalent to :func:`accuracy_score` with class-balanced + sample weights, and shares desirable properties with the binary case. + See the :ref:`User Guide `. + + References + ---------- + .. [1] Brodersen, K.H.; Ong, C.S.; Stephan, K.E.; Buhmann, J.M. (2010). + The balanced accuracy and its posterior distribution. + Proceedings of the 20th International Conference on Pattern + Recognition, 3121-24. + .. [2] John. D. Kelleher, Brian Mac Namee, Aoife D'Arcy, (2015). + `Fundamentals of Machine Learning for Predictive Data Analytics: + Algorithms, Worked Examples, and Case Studies + `_. 
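    As an illustrative check of the definition above, balanced accuracy is the
    unweighted mean of per-class recall, and ``adjusted=True`` rescales it so that
    chance-level performance scores 0 (chance is 1/2 for the two classes here):

    >>> from sklearn.metrics import balanced_accuracy_score, recall_score
    >>> y_true = [0, 1, 0, 0, 1, 0]
    >>> y_pred = [0, 1, 0, 0, 0, 1]
    >>> recall_score(y_true, y_pred, average='macro')
    0.625
    >>> balanced_accuracy_score(y_true, y_pred, adjusted=True)  # (0.625 - 0.5) / 0.5
    0.25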
+ + Examples + -------- + >>> from sklearn.metrics import balanced_accuracy_score + >>> y_true = [0, 1, 0, 0, 1, 0] + >>> y_pred = [0, 1, 0, 0, 0, 1] + >>> balanced_accuracy_score(y_true, y_pred) + 0.625 + """ + C = confusion_matrix(y_true, y_pred, sample_weight=sample_weight) + with np.errstate(divide="ignore", invalid="ignore"): + per_class = np.diag(C) / C.sum(axis=1) + if np.any(np.isnan(per_class)): + warnings.warn("y_pred contains classes not in y_true") + per_class = per_class[~np.isnan(per_class)] + score = np.mean(per_class) + if adjusted: + n_classes = len(per_class) + chance = 1 / n_classes + score -= chance + score /= 1 - chance + return float(score) + + +@validate_params( + { + "y_true": ["array-like", "sparse matrix"], + "y_pred": ["array-like", "sparse matrix"], + "labels": ["array-like", None], + "target_names": ["array-like", None], + "sample_weight": ["array-like", None], + "digits": [Interval(Integral, 0, None, closed="left")], + "output_dict": ["boolean"], + "zero_division": [ + Options(Real, {0.0, 1.0}), + "nan", + StrOptions({"warn"}), + ], + }, + prefer_skip_nested_validation=True, +) +def classification_report( + y_true, + y_pred, + *, + labels=None, + target_names=None, + sample_weight=None, + digits=2, + output_dict=False, + zero_division="warn", +): + """Build a text report showing the main classification metrics. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : 1d array-like, or label indicator array / sparse matrix + Ground truth (correct) target values. + + y_pred : 1d array-like, or label indicator array / sparse matrix + Estimated targets as returned by a classifier. + + labels : array-like of shape (n_labels,), default=None + Optional list of label indices to include in the report. + + target_names : array-like of shape (n_labels,), default=None + Optional display names matching the labels (same order). + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + digits : int, default=2 + Number of digits for formatting output floating point values. + When ``output_dict`` is ``True``, this will be ignored and the + returned values will not be rounded. + + output_dict : bool, default=False + If True, return output as dict. + + .. versionadded:: 0.20 + + zero_division : {"warn", 0.0, 1.0, np.nan}, default="warn" + Sets the value to return when there is a zero division. If set to + "warn", this acts as 0, but warnings are also raised. + + .. versionadded:: 1.3 + `np.nan` option was added. + + Returns + ------- + report : str or dict + Text summary of the precision, recall, F1 score for each class. + Dictionary returned if output_dict is True. Dictionary has the + following structure:: + + {'label 1': {'precision':0.5, + 'recall':1.0, + 'f1-score':0.67, + 'support':1}, + 'label 2': { ... }, + ... + } + + The reported averages include macro average (averaging the unweighted + mean per label), weighted average (averaging the support-weighted mean + per label), and sample average (only for multilabel classification). + Micro average (averaging the total true positives, false negatives and + false positives) is only shown for multi-label or multi-class + with a subset of classes, because it corresponds to accuracy + otherwise and would be the same for all metrics. + See also :func:`precision_recall_fscore_support` for more details + on averages. + + Note that in binary classification, recall of the positive class + is also known as "sensitivity"; recall of the negative class is + "specificity". 
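    An illustrative sketch of accessing the dictionary output described above
    (reusing the data from the first snippet in the Examples section below):

    >>> from sklearn.metrics import classification_report
    >>> y_true = [0, 1, 2, 2, 2]
    >>> y_pred = [0, 0, 2, 2, 1]
    >>> report = classification_report(y_true, y_pred, output_dict=True)
    >>> round(report['2']['recall'], 2)
    0.67
    >>> report['accuracy']
    0.6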
+ + See Also + -------- + precision_recall_fscore_support: Compute precision, recall, F-measure and + support for each class. + confusion_matrix: Compute confusion matrix to evaluate the accuracy of a + classification. + multilabel_confusion_matrix: Compute a confusion matrix for each class or sample. + + Examples + -------- + >>> from sklearn.metrics import classification_report + >>> y_true = [0, 1, 2, 2, 2] + >>> y_pred = [0, 0, 2, 2, 1] + >>> target_names = ['class 0', 'class 1', 'class 2'] + >>> print(classification_report(y_true, y_pred, target_names=target_names)) + precision recall f1-score support + + class 0 0.50 1.00 0.67 1 + class 1 0.00 0.00 0.00 1 + class 2 1.00 0.67 0.80 3 + + accuracy 0.60 5 + macro avg 0.50 0.56 0.49 5 + weighted avg 0.70 0.60 0.61 5 + + >>> y_pred = [1, 1, 0] + >>> y_true = [1, 1, 1] + >>> print(classification_report(y_true, y_pred, labels=[1, 2, 3])) + precision recall f1-score support + + 1 1.00 0.67 0.80 3 + 2 0.00 0.00 0.00 0 + 3 0.00 0.00 0.00 0 + + micro avg 1.00 0.67 0.80 3 + macro avg 0.33 0.22 0.27 3 + weighted avg 1.00 0.67 0.80 3 + + """ + + y_true, y_pred = attach_unique(y_true, y_pred) + y_type, y_true, y_pred = _check_targets(y_true, y_pred) + + if labels is None: + labels = unique_labels(y_true, y_pred) + labels_given = False + else: + labels = np.asarray(labels) + labels_given = True + + # labelled micro average + micro_is_accuracy = (y_type == "multiclass" or y_type == "binary") and ( + not labels_given or (set(labels) >= set(unique_labels(y_true, y_pred))) + ) + + if target_names is not None and len(labels) != len(target_names): + if labels_given: + warnings.warn( + "labels size, {0}, does not match size of target_names, {1}".format( + len(labels), len(target_names) + ) + ) + else: + raise ValueError( + "Number of classes, {0}, does not match size of " + "target_names, {1}. 
Try specifying the labels " + "parameter".format(len(labels), len(target_names)) + ) + if target_names is None: + target_names = ["%s" % l for l in labels] + + headers = ["precision", "recall", "f1-score", "support"] + # compute per-class results without averaging + p, r, f1, s = precision_recall_fscore_support( + y_true, + y_pred, + labels=labels, + average=None, + sample_weight=sample_weight, + zero_division=zero_division, + ) + rows = zip(target_names, p, r, f1, s) + + if y_type.startswith("multilabel"): + average_options = ("micro", "macro", "weighted", "samples") + else: + average_options = ("micro", "macro", "weighted") + + if output_dict: + report_dict = {label[0]: label[1:] for label in rows} + for label, scores in report_dict.items(): + report_dict[label] = dict(zip(headers, [float(i) for i in scores])) + else: + longest_last_line_heading = "weighted avg" + name_width = max(len(cn) for cn in target_names) + width = max(name_width, len(longest_last_line_heading), digits) + head_fmt = "{:>{width}s} " + " {:>9}" * len(headers) + report = head_fmt.format("", *headers, width=width) + report += "\n\n" + row_fmt = "{:>{width}s} " + " {:>9.{digits}f}" * 3 + " {:>9}\n" + for row in rows: + report += row_fmt.format(*row, width=width, digits=digits) + report += "\n" + + # compute all applicable averages + for average in average_options: + if average.startswith("micro") and micro_is_accuracy: + line_heading = "accuracy" + else: + line_heading = average + " avg" + + # compute averages with specified averaging method + avg_p, avg_r, avg_f1, _ = precision_recall_fscore_support( + y_true, + y_pred, + labels=labels, + average=average, + sample_weight=sample_weight, + zero_division=zero_division, + ) + avg = [avg_p, avg_r, avg_f1, np.sum(s)] + + if output_dict: + report_dict[line_heading] = dict(zip(headers, [float(i) for i in avg])) + else: + if line_heading == "accuracy": + row_fmt_accuracy = ( + "{:>{width}s} " + + " {:>9.{digits}}" * 2 + + " {:>9.{digits}f}" + + " {:>9}\n" + ) + report += row_fmt_accuracy.format( + line_heading, "", "", *avg[2:], width=width, digits=digits + ) + else: + report += row_fmt.format(line_heading, *avg, width=width, digits=digits) + + if output_dict: + if "accuracy" in report_dict.keys(): + report_dict["accuracy"] = report_dict["accuracy"]["precision"] + return report_dict + else: + return report + + +@validate_params( + { + "y_true": ["array-like", "sparse matrix"], + "y_pred": ["array-like", "sparse matrix"], + "sample_weight": ["array-like", None], + }, + prefer_skip_nested_validation=True, +) +def hamming_loss(y_true, y_pred, *, sample_weight=None): + """Compute the average Hamming loss. + + The Hamming loss is the fraction of labels that are incorrectly predicted. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : 1d array-like, or label indicator array / sparse matrix + Ground truth (correct) labels. + + y_pred : 1d array-like, or label indicator array / sparse matrix + Predicted labels, as returned by a classifier. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + .. versionadded:: 0.18 + + Returns + ------- + loss : float or int + Return the average Hamming loss between element of ``y_true`` and + ``y_pred``. + + See Also + -------- + accuracy_score : Compute the accuracy score. By default, the function will + return the fraction of correct predictions divided by the total number + of predictions. + jaccard_score : Compute the Jaccard similarity coefficient score. 
+ zero_one_loss : Compute the Zero-one classification loss. By default, the + function will return the percentage of imperfectly predicted subsets. + + Notes + ----- + In multiclass classification, the Hamming loss corresponds to the Hamming + distance between ``y_true`` and ``y_pred`` which is equivalent to the + subset ``zero_one_loss`` function, when `normalize` parameter is set to + True. + + In multilabel classification, the Hamming loss is different from the + subset zero-one loss. The zero-one loss considers the entire set of labels + for a given sample incorrect if it does not entirely match the true set of + labels. Hamming loss is more forgiving in that it penalizes only the + individual labels. + + The Hamming loss is upperbounded by the subset zero-one loss, when + `normalize` parameter is set to True. It is always between 0 and 1, + lower being better. + + References + ---------- + .. [1] Grigorios Tsoumakas, Ioannis Katakis. Multi-Label Classification: + An Overview. International Journal of Data Warehousing & Mining, + 3(3), 1-13, July-September 2007. + + .. [2] `Wikipedia entry on the Hamming distance + `_. + + Examples + -------- + >>> from sklearn.metrics import hamming_loss + >>> y_pred = [1, 2, 3, 4] + >>> y_true = [2, 2, 3, 4] + >>> hamming_loss(y_true, y_pred) + 0.25 + + In the multilabel case with binary label indicators: + + >>> import numpy as np + >>> hamming_loss(np.array([[0, 1], [1, 1]]), np.zeros((2, 2))) + 0.75 + """ + y_true, y_pred = attach_unique(y_true, y_pred) + y_type, y_true, y_pred = _check_targets(y_true, y_pred) + check_consistent_length(y_true, y_pred, sample_weight) + + xp, _, device = get_namespace_and_device(y_true, y_pred, sample_weight) + + if sample_weight is None: + weight_average = 1.0 + else: + sample_weight = xp.asarray(sample_weight, device=device) + weight_average = _average(sample_weight, xp=xp) + + if y_type.startswith("multilabel"): + n_differences = _count_nonzero( + y_true - y_pred, xp=xp, device=device, sample_weight=sample_weight + ) + return float(n_differences) / ( + y_true.shape[0] * y_true.shape[1] * weight_average + ) + + elif y_type in ["binary", "multiclass"]: + return float(_average(y_true != y_pred, weights=sample_weight, normalize=True)) + else: + raise ValueError("{0} is not supported".format(y_type)) + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "normalize": ["boolean"], + "sample_weight": ["array-like", None], + "labels": ["array-like", None], + }, + prefer_skip_nested_validation=True, +) +def log_loss(y_true, y_pred, *, normalize=True, sample_weight=None, labels=None): + r"""Log loss, aka logistic loss or cross-entropy loss. + + This is the loss function used in (multinomial) logistic regression + and extensions of it such as neural networks, defined as the negative + log-likelihood of a logistic model that returns ``y_pred`` probabilities + for its training data ``y_true``. + The log loss is only defined for two or more labels. + For a single sample with true label :math:`y \in \{0,1\}` and + a probability estimate :math:`p = \operatorname{Pr}(y = 1)`, the log + loss is: + + .. math:: + L_{\log}(y, p) = -(y \log (p) + (1 - y) \log (1 - p)) + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like or label indicator matrix + Ground truth (correct) labels for n_samples samples. + + y_pred : array-like of float, shape = (n_samples, n_classes) or (n_samples,) + Predicted probabilities, as returned by a classifier's + predict_proba method. 
If ``y_pred.shape = (n_samples,)`` + the probabilities provided are assumed to be that of the + positive class. The labels in ``y_pred`` are assumed to be + ordered alphabetically, as done by + :class:`~sklearn.preprocessing.LabelBinarizer`. + + `y_pred` values are clipped to `[eps, 1-eps]` where `eps` is the machine + precision for `y_pred`'s dtype. + + normalize : bool, default=True + If true, return the mean loss per sample. + Otherwise, return the sum of the per-sample losses. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + labels : array-like, default=None + If not provided, labels will be inferred from y_true. If ``labels`` + is ``None`` and ``y_pred`` has shape (n_samples,) the labels are + assumed to be binary and are inferred from ``y_true``. + + .. versionadded:: 0.18 + + Returns + ------- + loss : float + Log loss, aka logistic loss or cross-entropy loss. + + Notes + ----- + The logarithm used is the natural logarithm (base-e). + + References + ---------- + C.M. Bishop (2006). Pattern Recognition and Machine Learning. Springer, + p. 209. + + Examples + -------- + >>> from sklearn.metrics import log_loss + >>> log_loss(["spam", "ham", "ham", "spam"], + ... [[.1, .9], [.9, .1], [.8, .2], [.35, .65]]) + 0.21616 + """ + transformed_labels, y_pred = _validate_multiclass_probabilistic_prediction( + y_true, y_pred, sample_weight, labels + ) + + # Clipping + eps = np.finfo(y_pred.dtype).eps + y_pred = np.clip(y_pred, eps, 1 - eps) + + loss = -xlogy(transformed_labels, y_pred).sum(axis=1) + + return float(_average(loss, weights=sample_weight, normalize=normalize)) + + +@validate_params( + { + "y_true": ["array-like"], + "pred_decision": ["array-like"], + "labels": ["array-like", None], + "sample_weight": ["array-like", None], + }, + prefer_skip_nested_validation=True, +) +def hinge_loss(y_true, pred_decision, *, labels=None, sample_weight=None): + """Average hinge loss (non-regularized). + + In binary class case, assuming labels in y_true are encoded with +1 and -1, + when a prediction mistake is made, ``margin = y_true * pred_decision`` is + always negative (since the signs disagree), implying ``1 - margin`` is + always greater than 1. The cumulated hinge loss is therefore an upper + bound of the number of mistakes made by the classifier. + + In multiclass case, the function expects that either all the labels are + included in y_true or an optional labels argument is provided which + contains all the labels. The multilabel margin is calculated according + to Crammer-Singer's method. As in the binary case, the cumulated hinge loss + is an upper bound of the number of mistakes made by the classifier. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + True target, consisting of integers of two values. The positive label + must be greater than the negative label. + + pred_decision : array-like of shape (n_samples,) or (n_samples, n_classes) + Predicted decisions, as output by decision_function (floats). + + labels : array-like, default=None + Contains all the labels for the problem. Used in multiclass hinge loss. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + loss : float + Average hinge loss. + + References + ---------- + .. [1] `Wikipedia entry on the Hinge loss + `_. + + .. [2] Koby Crammer, Yoram Singer. On the Algorithmic + Implementation of Multiclass Kernel-based Vector + Machines. 
Journal of Machine Learning Research 2, + (2001), 265-292. + + .. [3] `L1 AND L2 Regularization for Multiclass Hinge Loss Models + by Robert C. Moore, John DeNero + `_. + + Examples + -------- + >>> from sklearn import svm + >>> from sklearn.metrics import hinge_loss + >>> X = [[0], [1]] + >>> y = [-1, 1] + >>> est = svm.LinearSVC(random_state=0) + >>> est.fit(X, y) + LinearSVC(random_state=0) + >>> pred_decision = est.decision_function([[-2], [3], [0.5]]) + >>> pred_decision + array([-2.18, 2.36, 0.09]) + >>> hinge_loss([-1, 1, 1], pred_decision) + 0.30 + + In the multiclass case: + + >>> import numpy as np + >>> X = np.array([[0], [1], [2], [3]]) + >>> Y = np.array([0, 1, 2, 3]) + >>> labels = np.array([0, 1, 2, 3]) + >>> est = svm.LinearSVC() + >>> est.fit(X, Y) + LinearSVC() + >>> pred_decision = est.decision_function([[-1], [2], [3]]) + >>> y_true = [0, 2, 3] + >>> hinge_loss(y_true, pred_decision, labels=labels) + 0.56 + """ + check_consistent_length(y_true, pred_decision, sample_weight) + pred_decision = check_array(pred_decision, ensure_2d=False) + y_true = column_or_1d(y_true) + y_true_unique = np.unique(labels if labels is not None else y_true) + + if y_true_unique.size > 2: + if pred_decision.ndim <= 1: + raise ValueError( + "The shape of pred_decision cannot be 1d array" + "with a multiclass target. pred_decision shape " + "must be (n_samples, n_classes), that is " + f"({y_true.shape[0]}, {y_true_unique.size})." + f" Got: {pred_decision.shape}" + ) + + # pred_decision.ndim > 1 is true + if y_true_unique.size != pred_decision.shape[1]: + if labels is None: + raise ValueError( + "Please include all labels in y_true " + "or pass labels as third argument" + ) + else: + raise ValueError( + "The shape of pred_decision is not " + "consistent with the number of classes. " + "With a multiclass target, pred_decision " + "shape must be " + "(n_samples, n_classes), that is " + f"({y_true.shape[0]}, {y_true_unique.size}). " + f"Got: {pred_decision.shape}" + ) + if labels is None: + labels = y_true_unique + le = LabelEncoder() + le.fit(labels) + y_true = le.transform(y_true) + mask = np.ones_like(pred_decision, dtype=bool) + mask[np.arange(y_true.shape[0]), y_true] = False + margin = pred_decision[~mask] + margin -= np.max(pred_decision[mask].reshape(y_true.shape[0], -1), axis=1) + + else: + # Handles binary class case + # this code assumes that positive and negative labels + # are encoded as +1 and -1 respectively + pred_decision = column_or_1d(pred_decision) + pred_decision = np.ravel(pred_decision) + + lbin = LabelBinarizer(neg_label=-1) + y_true = lbin.fit_transform(y_true)[:, 0] + + try: + margin = y_true * pred_decision + except TypeError: + raise TypeError("pred_decision should be an array of floats.") + + losses = 1 - margin + # The hinge_loss doesn't penalize good enough predictions. + np.clip(losses, 0, None, out=losses) + return float(np.average(losses, weights=sample_weight)) + + +def _validate_binary_probabilistic_prediction(y_true, y_prob, sample_weight, pos_label): + r"""Convert y_true and y_prob in binary classification to shape (n_samples, 2) + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + True labels. + + y_prob : array-like of shape (n_samples,) + Probabilities of the positive class. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + pos_label : int, float, bool or str, default=None + Label of the positive class. 
If None, `pos_label` will be inferred + in the following manner: + + * if `y_true` in {-1, 1} or {0, 1}, `pos_label` defaults to 1; + * else if `y_true` contains string, an error will be raised and + `pos_label` should be explicitly specified; + * otherwise, `pos_label` defaults to the greater label, + i.e. `np.unique(y_true)[-1]`. + + Returns + ------- + transformed_labels : array of shape (n_samples, 2) + + y_prob : array of shape (n_samples, 2) + """ + # sanity checks on y_true and y_prob + y_true = column_or_1d(y_true) + y_prob = column_or_1d(y_prob) + + assert_all_finite(y_true) + assert_all_finite(y_prob) + + check_consistent_length(y_prob, y_true, sample_weight) + + y_type = type_of_target(y_true, input_name="y_true") + if y_type != "binary": + raise ValueError( + f"The type of the target inferred from y_true is {y_type} but should be " + "binary according to the shape of y_prob." + ) + + if y_prob.max() > 1: + raise ValueError(f"y_prob contains values greater than 1: {y_prob.max()}") + if y_prob.min() < 0: + raise ValueError(f"y_prob contains values less than 0: {y_prob.min()}") + + # check that pos_label is consistent with y_true + try: + pos_label = _check_pos_label_consistency(pos_label, y_true) + except ValueError: + classes = np.unique(y_true) + if classes.dtype.kind not in ("O", "U", "S"): + # for backward compatibility, if classes are not string then + # `pos_label` will correspond to the greater label + pos_label = classes[-1] + else: + raise + + # convert (n_samples,) to (n_samples, 2) shape + y_true = np.array(y_true == pos_label, int) + transformed_labels = np.column_stack((1 - y_true, y_true)) + y_prob = np.column_stack((1 - y_prob, y_prob)) + + return transformed_labels, y_prob + + +@validate_params( + { + "y_true": ["array-like"], + "y_proba": ["array-like"], + "sample_weight": ["array-like", None], + "pos_label": [Real, str, "boolean", None], + "labels": ["array-like", None], + "scale_by_half": ["boolean", StrOptions({"auto"})], + }, + prefer_skip_nested_validation=True, +) +def brier_score_loss( + y_true, + y_proba, + *, + sample_weight=None, + pos_label=None, + labels=None, + scale_by_half="auto", +): + r"""Compute the Brier score loss. + + The smaller the Brier score loss, the better, hence the naming with "loss". + The Brier score measures the mean squared difference between the predicted + probability and the actual outcome. The Brier score is a strictly proper scoring + rule. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + True targets. + + y_proba : array-like of shape (n_samples,) or (n_samples, n_classes) + Predicted probabilities. If `y_proba.shape = (n_samples,)` + the probabilities provided are assumed to be that of the + positive class. If `y_proba.shape = (n_samples, n_classes)` + the columns in `y_proba` are assumed to correspond to the + labels in alphabetical order, as done by + :class:`~sklearn.preprocessing.LabelBinarizer`. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + pos_label : int, float, bool or str, default=None + Label of the positive class when `y_proba.shape = (n_samples,)`. + If not provided, `pos_label` will be inferred in the + following manner: + + * if `y_true` in {-1, 1} or {0, 1}, `pos_label` defaults to 1; + * else if `y_true` contains string, an error will be raised and + `pos_label` should be explicitly specified; + * otherwise, `pos_label` defaults to the greater label, + i.e. `np.unique(y_true)[-1]`. 
+ + labels : array-like of shape (n_classes,), default=None + Class labels when `y_proba.shape = (n_samples, n_classes)`. + If not provided, labels will be inferred from `y_true`. + + .. versionadded:: 1.7 + + scale_by_half : bool or "auto", default="auto" + When True, scale the Brier score by 1/2 to lie in the [0, 1] range instead + of the [0, 2] range. The default "auto" option implements the rescaling to + [0, 1] only for binary classification (as customary) but keeps the + original [0, 2] range for multiclass classification. + + .. versionadded:: 1.7 + + Returns + ------- + score : float + Brier score loss. + + Notes + ----- + + For :math:`N` observations labeled from :math:`C` possible classes, the Brier + score is defined as: + + .. math:: + \frac{1}{N}\sum_{i=1}^{N}\sum_{c=1}^{C}(y_{ic} - \hat{p}_{ic})^{2} + + where :math:`y_{ic}` is 1 if observation `i` belongs to class `c`, + otherwise 0 and :math:`\hat{p}_{ic}` is the predicted probability for + observation `i` to belong to class `c`. + The Brier score then ranges between :math:`[0, 2]`. + + In binary classification tasks the Brier score is usually divided by + two and then ranges between :math:`[0, 1]`. It can be alternatively + written as: + + .. math:: + \frac{1}{N}\sum_{i=1}^{N}(y_{i} - \hat{p}_{i})^{2} + + where :math:`y_{i}` is the binary target and :math:`\hat{p}_{i}` + is the predicted probability of the positive class. + + References + ---------- + .. [1] `Wikipedia entry for the Brier score + `_. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import brier_score_loss + >>> y_true = np.array([0, 1, 1, 0]) + >>> y_true_categorical = np.array(["spam", "ham", "ham", "spam"]) + >>> y_prob = np.array([0.1, 0.9, 0.8, 0.3]) + >>> brier_score_loss(y_true, y_prob) + 0.0375 + >>> brier_score_loss(y_true, 1-y_prob, pos_label=0) + 0.0375 + >>> brier_score_loss(y_true_categorical, y_prob, pos_label="ham") + 0.0375 + >>> brier_score_loss(y_true, np.array(y_prob) > 0.5) + 0.0 + >>> brier_score_loss(y_true, y_prob, scale_by_half=False) + 0.075 + >>> brier_score_loss( + ... ["eggs", "ham", "spam"], + ... [[0.8, 0.1, 0.1], [0.2, 0.7, 0.1], [0.2, 0.2, 0.6]], + ... labels=["eggs", "ham", "spam"] + ... ) + 0.146 + """ + y_proba = check_array( + y_proba, ensure_2d=False, dtype=[np.float64, np.float32, np.float16] + ) + + if y_proba.ndim == 1 or y_proba.shape[1] == 1: + transformed_labels, y_proba = _validate_binary_probabilistic_prediction( + y_true, y_proba, sample_weight, pos_label + ) + else: + transformed_labels, y_proba = _validate_multiclass_probabilistic_prediction( + y_true, y_proba, sample_weight, labels + ) + + brier_score = np.average( + np.sum((transformed_labels - y_proba) ** 2, axis=1), weights=sample_weight + ) + + if scale_by_half == "auto": + scale_by_half = y_proba.ndim == 1 or y_proba.shape[1] < 3 + if scale_by_half: + brier_score *= 0.5 + + return float(brier_score) + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "sample_weight": ["array-like", None], + "labels": ["array-like", None], + }, + prefer_skip_nested_validation=True, +) +def d2_log_loss_score(y_true, y_pred, *, sample_weight=None, labels=None): + """ + :math:`D^2` score function, fraction of log loss explained. + + Best possible score is 1.0 and it can be negative (because the model can be + arbitrarily worse). A model that always predicts the per-class proportions + of `y_true`, disregarding the input features, gets a D^2 score of 0.0. + + Read more in the :ref:`User Guide `. + + .. 
versionadded:: 1.5 + + Parameters + ---------- + y_true : array-like or label indicator matrix + The actuals labels for the n_samples samples. + + y_pred : array-like of shape (n_samples, n_classes) or (n_samples,) + Predicted probabilities, as returned by a classifier's + predict_proba method. If ``y_pred.shape = (n_samples,)`` + the probabilities provided are assumed to be that of the + positive class. The labels in ``y_pred`` are assumed to be + ordered alphabetically, as done by + :class:`~sklearn.preprocessing.LabelBinarizer`. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + labels : array-like, default=None + If not provided, labels will be inferred from y_true. If ``labels`` + is ``None`` and ``y_pred`` has shape (n_samples,) the labels are + assumed to be binary and are inferred from ``y_true``. + + Returns + ------- + d2 : float or ndarray of floats + The D^2 score. + + Notes + ----- + This is not a symmetric function. + + Like R^2, D^2 score may be negative (it need not actually be the square of + a quantity D). + + This metric is not well-defined for a single sample and will return a NaN + value if n_samples is less than two. + """ + y_pred = check_array(y_pred, ensure_2d=False, dtype="numeric") + check_consistent_length(y_pred, y_true, sample_weight) + if _num_samples(y_pred) < 2: + msg = "D^2 score is not well-defined with less than two samples." + warnings.warn(msg, UndefinedMetricWarning) + return float("nan") + + # log loss of the fitted model + numerator = log_loss( + y_true=y_true, + y_pred=y_pred, + normalize=False, + sample_weight=sample_weight, + labels=labels, + ) + + # Proportion of labels in the dataset + weights = _check_sample_weight(sample_weight, y_true) + + # If labels is passed, augment y_true to ensure that all labels are represented + # Use 0 weight for the new samples to not affect the counts + y_true_, weights_ = ( + ( + np.concatenate([y_true, labels]), + np.concatenate([weights, np.zeros_like(weights, shape=len(labels))]), + ) + if labels is not None + else (y_true, weights) + ) + + _, y_value_indices = np.unique(y_true_, return_inverse=True) + counts = np.bincount(y_value_indices, weights=weights_) + y_prob = counts / weights.sum() + y_pred_null = np.tile(y_prob, (len(y_true), 1)) + + # log loss of the null model + denominator = log_loss( + y_true=y_true, + y_pred=y_pred_null, + normalize=False, + sample_weight=sample_weight, + labels=labels, + ) + + return float(1 - (numerator / denominator)) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_dist_metrics.pxd b/.venv/lib/python3.12/site-packages/sklearn/metrics/_dist_metrics.pxd new file mode 100644 index 0000000000000000000000000000000000000000..0a249a8a9fb0a158c34c9f725891467de6041d40 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_dist_metrics.pxd @@ -0,0 +1,268 @@ +from libc.math cimport sqrt, exp + +from ..utils._typedefs cimport float64_t, float32_t, int32_t, intp_t + +cdef class DistanceMetric: + pass + +###################################################################### +# Inline distance functions +# +# We use these for the default (euclidean) case so that they can be +# inlined. 
This leads to faster computation for the most common case +cdef inline float64_t euclidean_dist64( + const float64_t* x1, + const float64_t* x2, + intp_t size, +) except -1 nogil: + cdef float64_t tmp, d=0 + cdef intp_t j + for j in range(size): + tmp = (x1[j] - x2[j]) + d += tmp * tmp + return sqrt(d) + + +cdef inline float64_t euclidean_rdist64( + const float64_t* x1, + const float64_t* x2, + intp_t size, +) except -1 nogil: + cdef float64_t tmp, d=0 + cdef intp_t j + for j in range(size): + tmp = (x1[j] - x2[j]) + d += tmp * tmp + return d + + +cdef inline float64_t euclidean_dist_to_rdist64(const float64_t dist) except -1 nogil: + return dist * dist + + +cdef inline float64_t euclidean_rdist_to_dist64(const float64_t dist) except -1 nogil: + return sqrt(dist) + + +###################################################################### +# DistanceMetric64 base class +cdef class DistanceMetric64(DistanceMetric): + # The following attributes are required for a few of the subclasses. + # we must define them here so that cython's limited polymorphism will work. + # Because we don't expect to instantiate a lot of these objects, the + # extra memory overhead of this setup should not be an issue. + cdef float64_t p + cdef const float64_t[::1] vec + cdef const float64_t[:, ::1] mat + cdef intp_t size + cdef object func + cdef object kwargs + + cdef float64_t dist( + self, + const float64_t* x1, + const float64_t* x2, + intp_t size, + ) except -1 nogil + + cdef float64_t rdist( + self, + const float64_t* x1, + const float64_t* x2, + intp_t size, + ) except -1 nogil + + cdef float64_t dist_csr( + self, + const float64_t* x1_data, + const int32_t* x1_indices, + const float64_t* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil + + cdef float64_t rdist_csr( + self, + const float64_t* x1_data, + const int32_t* x1_indices, + const float64_t* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil + + cdef int pdist( + self, + const float64_t[:, ::1] X, + float64_t[:, ::1] D, + ) except -1 + + cdef int cdist( + self, + const float64_t[:, ::1] X, + const float64_t[:, ::1] Y, + float64_t[:, ::1] D, + ) except -1 + + cdef int pdist_csr( + self, + const float64_t* x1_data, + const int32_t[::1] x1_indices, + const int32_t[::1] x1_indptr, + const intp_t size, + float64_t[:, ::1] D, + ) except -1 nogil + + cdef int cdist_csr( + self, + const float64_t* x1_data, + const int32_t[::1] x1_indices, + const int32_t[::1] x1_indptr, + const float64_t* x2_data, + const int32_t[::1] x2_indices, + const int32_t[::1] x2_indptr, + const intp_t size, + float64_t[:, ::1] D, + ) except -1 nogil + + cdef float64_t _rdist_to_dist(self, float64_t rdist) except -1 nogil + + cdef float64_t _dist_to_rdist(self, float64_t dist) except -1 nogil + +###################################################################### +# Inline distance functions +# +# We use these for the default (euclidean) case so that they can be +# inlined. 
This leads to faster computation for the most common case +cdef inline float64_t euclidean_dist32( + const float32_t* x1, + const float32_t* x2, + intp_t size, +) except -1 nogil: + cdef float64_t tmp, d=0 + cdef intp_t j + for j in range(size): + tmp = (x1[j] - x2[j]) + d += tmp * tmp + return sqrt(d) + + +cdef inline float64_t euclidean_rdist32( + const float32_t* x1, + const float32_t* x2, + intp_t size, +) except -1 nogil: + cdef float64_t tmp, d=0 + cdef intp_t j + for j in range(size): + tmp = (x1[j] - x2[j]) + d += tmp * tmp + return d + + +cdef inline float64_t euclidean_dist_to_rdist32(const float32_t dist) except -1 nogil: + return dist * dist + + +cdef inline float64_t euclidean_rdist_to_dist32(const float32_t dist) except -1 nogil: + return sqrt(dist) + + +###################################################################### +# DistanceMetric32 base class +cdef class DistanceMetric32(DistanceMetric): + # The following attributes are required for a few of the subclasses. + # we must define them here so that cython's limited polymorphism will work. + # Because we don't expect to instantiate a lot of these objects, the + # extra memory overhead of this setup should not be an issue. + cdef float64_t p + cdef const float64_t[::1] vec + cdef const float64_t[:, ::1] mat + cdef intp_t size + cdef object func + cdef object kwargs + + cdef float32_t dist( + self, + const float32_t* x1, + const float32_t* x2, + intp_t size, + ) except -1 nogil + + cdef float32_t rdist( + self, + const float32_t* x1, + const float32_t* x2, + intp_t size, + ) except -1 nogil + + cdef float32_t dist_csr( + self, + const float32_t* x1_data, + const int32_t* x1_indices, + const float32_t* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil + + cdef float32_t rdist_csr( + self, + const float32_t* x1_data, + const int32_t* x1_indices, + const float32_t* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil + + cdef int pdist( + self, + const float32_t[:, ::1] X, + float32_t[:, ::1] D, + ) except -1 + + cdef int cdist( + self, + const float32_t[:, ::1] X, + const float32_t[:, ::1] Y, + float32_t[:, ::1] D, + ) except -1 + + cdef int pdist_csr( + self, + const float32_t* x1_data, + const int32_t[::1] x1_indices, + const int32_t[::1] x1_indptr, + const intp_t size, + float32_t[:, ::1] D, + ) except -1 nogil + + cdef int cdist_csr( + self, + const float32_t* x1_data, + const int32_t[::1] x1_indices, + const int32_t[::1] x1_indptr, + const float32_t* x2_data, + const int32_t[::1] x2_indices, + const int32_t[::1] x2_indptr, + const intp_t size, + float32_t[:, ::1] D, + ) except -1 nogil + + cdef float32_t _rdist_to_dist(self, float32_t rdist) except -1 nogil + + cdef float32_t _dist_to_rdist(self, float32_t dist) except -1 nogil diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_dist_metrics.pxd.tp b/.venv/lib/python3.12/site-packages/sklearn/metrics/_dist_metrics.pxd.tp new file mode 100644 index 0000000000000000000000000000000000000000..313225088c776e8575bfb4cec47c1f17183fab03 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_dist_metrics.pxd.tp @@ -0,0 +1,152 @@ +{{py: + +implementation_specific_values = [ + # Values are the following ones: + # + # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE + ('64', 'float64_t', 'np.float64'), + 
('32', 'float32_t', 'np.float32') +] + +}} +from libc.math cimport sqrt, exp + +from ..utils._typedefs cimport float64_t, float32_t, int32_t, intp_t + +cdef class DistanceMetric: + pass + +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +###################################################################### +# Inline distance functions +# +# We use these for the default (euclidean) case so that they can be +# inlined. This leads to faster computation for the most common case +cdef inline float64_t euclidean_dist{{name_suffix}}( + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, +) except -1 nogil: + cdef float64_t tmp, d=0 + cdef intp_t j + for j in range(size): + tmp = (x1[j] - x2[j]) + d += tmp * tmp + return sqrt(d) + + +cdef inline float64_t euclidean_rdist{{name_suffix}}( + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, +) except -1 nogil: + cdef float64_t tmp, d=0 + cdef intp_t j + for j in range(size): + tmp = (x1[j] - x2[j]) + d += tmp * tmp + return d + + +cdef inline float64_t euclidean_dist_to_rdist{{name_suffix}}(const {{INPUT_DTYPE_t}} dist) except -1 nogil: + return dist * dist + + +cdef inline float64_t euclidean_rdist_to_dist{{name_suffix}}(const {{INPUT_DTYPE_t}} dist) except -1 nogil: + return sqrt(dist) + + +###################################################################### +# DistanceMetric{{name_suffix}} base class +cdef class DistanceMetric{{name_suffix}}(DistanceMetric): + # The following attributes are required for a few of the subclasses. + # we must define them here so that cython's limited polymorphism will work. + # Because we don't expect to instantiate a lot of these objects, the + # extra memory overhead of this setup should not be an issue. 
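# The "{{py: ...}}" / "{{for ...}}" markers above are template directives (.pxd.tp and
# .pyx.tp files): at build time the loop body is emitted once per entry of
# implementation_specific_values, producing the float64 and float32 specializations.
# A rough pure-Python sketch of that expansion (not the real template engine):
implementation_specific_values = [("64", "float64_t"), ("32", "float32_t")]
body = "cdef class DistanceMetric{suffix}(DistanceMetric):  # operates on {ctype} data"

for suffix, ctype in implementation_specific_values:
    print(body.format(suffix=suffix, ctype=ctype))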
+ cdef float64_t p + cdef const float64_t[::1] vec + cdef const float64_t[:, ::1] mat + cdef intp_t size + cdef object func + cdef object kwargs + + cdef {{INPUT_DTYPE_t}} dist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil + + cdef {{INPUT_DTYPE_t}} rdist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil + + cdef {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil + + cdef {{INPUT_DTYPE_t}} rdist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil + + cdef int pdist( + self, + const {{INPUT_DTYPE_t}}[:, ::1] X, + {{INPUT_DTYPE_t}}[:, ::1] D, + ) except -1 + + cdef int cdist( + self, + const {{INPUT_DTYPE_t}}[:, ::1] X, + const {{INPUT_DTYPE_t}}[:, ::1] Y, + {{INPUT_DTYPE_t}}[:, ::1] D, + ) except -1 + + cdef int pdist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t[::1] x1_indices, + const int32_t[::1] x1_indptr, + const intp_t size, + {{INPUT_DTYPE_t}}[:, ::1] D, + ) except -1 nogil + + cdef int cdist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t[::1] x1_indices, + const int32_t[::1] x1_indptr, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t[::1] x2_indices, + const int32_t[::1] x2_indptr, + const intp_t size, + {{INPUT_DTYPE_t}}[:, ::1] D, + ) except -1 nogil + + cdef {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil + + cdef {{INPUT_DTYPE_t}} _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil + +{{endfor}} diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_dist_metrics.pyx.tp b/.venv/lib/python3.12/site-packages/sklearn/metrics/_dist_metrics.pyx.tp new file mode 100644 index 0000000000000000000000000000000000000000..b7d3d1f4d86a6b4817af36489d1846b74afe7e6d --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_dist_metrics.pyx.tp @@ -0,0 +1,2811 @@ +{{py: + +implementation_specific_values = [ + # Values are the following ones: + # + # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE + ('64', 'float64_t', 'np.float64'), + ('32', 'float32_t', 'np.float32') +] + +}} +# By Jake Vanderplas (2013) +# written for the scikit-learn project +# SPDX-License-Identifier: BSD-3-Clause + +import numpy as np +cimport numpy as cnp + +cnp.import_array() # required in order to use C-API + +from libc.math cimport fabs, sqrt, exp, pow, cos, sin, asin + +from scipy.sparse import csr_matrix, issparse +from ..utils._typedefs cimport float64_t, float32_t, int32_t, intp_t +from ..utils import check_array +from ..utils.fixes import parse_version, sp_base_version + +cdef inline double fmax(double a, double b) noexcept nogil: + return max(a, b) + + +###################################################################### +# newObj function +# this is a helper function for pickling +def newObj(obj): + return obj.__new__(obj) + + +BOOL_METRICS = [ + "hamming", + "jaccard", + "dice", + "rogerstanimoto", + "russellrao", + "sokalsneath", +] +DEPRECATED_METRICS = [] +if sp_base_version < parse_version("1.17"): + # Deprecated in 
SciPy 1.15 and removed in SciPy 1.17 + BOOL_METRICS += ["sokalmichener"] +if sp_base_version >= parse_version("1.15"): + DEPRECATED_METRICS.append("sokalmichener") +if sp_base_version < parse_version("1.11"): + # Deprecated in SciPy 1.9 and removed in SciPy 1.11 + BOOL_METRICS += ["kulsinski"] +if sp_base_version >= parse_version("1.9"): + DEPRECATED_METRICS.append("kulsinski") +if sp_base_version < parse_version("1.9"): + # Deprecated in SciPy 1.0 and removed in SciPy 1.9 + BOOL_METRICS += ["matching"] +if sp_base_version >= parse_version("1.0"): + DEPRECATED_METRICS.append("matching") + +def get_valid_metric_ids(L): + """Given an iterable of metric class names or class identifiers, + return a list of metric IDs which map to those classes. + + Example: + >>> L = get_valid_metric_ids([EuclideanDistance, 'ManhattanDistance']) + >>> sorted(L) + ['cityblock', 'euclidean', 'l1', 'l2', 'manhattan'] + """ + return [key for (key, val) in METRIC_MAPPING64.items() + if (val.__name__ in L) or (val in L)] + +cdef class DistanceMetric: + """Uniform interface for fast distance metric functions. + + The `DistanceMetric` class provides a convenient way to compute pairwise distances + between samples. It supports various distance metrics, such as Euclidean distance, + Manhattan distance, and more. + + The `pairwise` method can be used to compute pairwise distances between samples in + the input arrays. It returns a distance matrix representing the distances between + all pairs of samples. + + The :meth:`get_metric` method allows you to retrieve a specific metric using its + string identifier. + + Examples + -------- + >>> from sklearn.metrics import DistanceMetric + >>> dist = DistanceMetric.get_metric('euclidean') + >>> X = [[1, 2], [3, 4], [5, 6]] + >>> Y = [[7, 8], [9, 10]] + >>> dist.pairwise(X,Y) + array([[7.81..., 10.63...] + [5.65..., 8.48...] + [1.41..., 4.24...]]) + + .. rubric:: Available Metrics + + The following lists the string metric identifiers and the associated + distance metric classes: + + **Metrics intended for real-valued vector spaces:** + + ============== ==================== ======== =============================== + identifier class name args distance function + -------------- -------------------- -------- ------------------------------- + "euclidean" EuclideanDistance - ``sqrt(sum((x - y)^2))`` + "manhattan" ManhattanDistance - ``sum(|x - y|)`` + "chebyshev" ChebyshevDistance - ``max(|x - y|)`` + "minkowski" MinkowskiDistance p, w ``sum(w * |x - y|^p)^(1/p)`` + "seuclidean" SEuclideanDistance V ``sqrt(sum((x - y)^2 / V))`` + "mahalanobis" MahalanobisDistance V or VI ``sqrt((x - y)' V^-1 (x - y))`` + ============== ==================== ======== =============================== + + **Metrics intended for two-dimensional vector spaces:** Note that the haversine + distance metric requires data in the form of [latitude, longitude] and both + inputs and outputs are in units of radians. + + ============ ================== =============================================================== + identifier class name distance function + ------------ ------------------ --------------------------------------------------------------- + "haversine" HaversineDistance ``2 arcsin(sqrt(sin^2(0.5*dx) + cos(x1)cos(x2)sin^2(0.5*dy)))`` + ============ ================== =============================================================== + + + **Metrics intended for integer-valued vector spaces:** Though intended + for integer-valued vectors, these are also valid metrics in the case of + real-valued vectors. 
+ + ============= ==================== ======================================== + identifier class name distance function + ------------- -------------------- ---------------------------------------- + "hamming" HammingDistance ``N_unequal(x, y) / N_tot`` + "canberra" CanberraDistance ``sum(|x - y| / (|x| + |y|))`` + "braycurtis" BrayCurtisDistance ``sum(|x - y|) / (sum(|x|) + sum(|y|))`` + ============= ==================== ======================================== + + **Metrics intended for boolean-valued vector spaces:** Any nonzero entry + is evaluated to "True". In the listings below, the following + abbreviations are used: + + - N: number of dimensions + - NTT: number of dims in which both values are True + - NTF: number of dims in which the first value is True, second is False + - NFT: number of dims in which the first value is False, second is True + - NFF: number of dims in which both values are False + - NNEQ: number of non-equal dimensions, NNEQ = NTF + NFT + - NNZ: number of nonzero dimensions, NNZ = NTF + NFT + NTT + + ================= ======================= =============================== + identifier class name distance function + ----------------- ----------------------- ------------------------------- + "jaccard" JaccardDistance NNEQ / NNZ + "matching" MatchingDistance NNEQ / N + "dice" DiceDistance NNEQ / (NTT + NNZ) + "kulsinski" KulsinskiDistance (NNEQ + N - NTT) / (NNEQ + N) + "rogerstanimoto" RogersTanimotoDistance 2 * NNEQ / (N + NNEQ) + "russellrao" RussellRaoDistance (N - NTT) / N + "sokalmichener" SokalMichenerDistance 2 * NNEQ / (N + NNEQ) + "sokalsneath" SokalSneathDistance NNEQ / (NNEQ + 0.5 * NTT) + ================= ======================= =============================== + + **User-defined distance:** + + =========== =============== ======= + identifier class name args + ----------- --------------- ------- + "pyfunc" PyFuncDistance func + =========== =============== ======= + + Here ``func`` is a function which takes two one-dimensional numpy + arrays, and returns a distance. Note that in order to be used within + the BallTree, the distance must be a true metric: + i.e. it must satisfy the following properties + + 1) Non-negativity: d(x, y) >= 0 + 2) Identity: d(x, y) = 0 if and only if x == y + 3) Symmetry: d(x, y) = d(y, x) + 4) Triangle Inequality: d(x, y) + d(y, z) >= d(x, z) + + Because of the Python object overhead involved in calling the python + function, this will be fairly slow, but it will have the same + scaling as other distances. + """ + @classmethod + def get_metric(cls, metric, dtype=np.float64, **kwargs): + """Get the given distance metric from the string identifier. + + See the docstring of DistanceMetric for a list of available metrics. + + Parameters + ---------- + metric : str or class name + The string identifier or class name of the desired distance metric. + See the documentation of the `DistanceMetric` class for a list of + available metrics. + + dtype : {np.float32, np.float64}, default=np.float64 + The data type of the input on which the metric will be applied. + This affects the precision of the computed distances. + By default, it is set to `np.float64`. + + **kwargs + Additional keyword arguments that will be passed to the requested metric. + These arguments can be used to customize the behavior of the specific + metric. + + Returns + ------- + metric_obj : instance of the requested metric + An instance of the requested distance metric class. 
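# Minimal usage sketch of the dtype switch documented above: float32 input selects the
# 32-bit specialization, the default float64 the 64-bit one. Assumes a scikit-learn
# recent enough to expose this `dtype` keyword; data is illustrative.
import numpy as np
from sklearn.metrics import DistanceMetric

X32 = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)

dist32 = DistanceMetric.get_metric("euclidean", dtype=np.float32)
dist64 = DistanceMetric.get_metric("euclidean")  # dtype=np.float64 is the default

print(dist32.pairwise(X32).dtype)                     # float32
print(dist64.pairwise(X32.astype(np.float64)).dtype)  # float64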
+ """ + if dtype == np.float32: + specialized_class = DistanceMetric32 + elif dtype == np.float64: + specialized_class = DistanceMetric64 + else: + raise ValueError( + f"Unexpected dtype {dtype} provided. Please select a dtype from" + " {np.float32, np.float64}" + ) + + return specialized_class.get_metric(metric, **kwargs) + +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +###################################################################### +# metric mappings +# These map from metric id strings to class names +METRIC_MAPPING{{name_suffix}} = { + 'euclidean': EuclideanDistance{{name_suffix}}, + 'l2': EuclideanDistance{{name_suffix}}, + 'minkowski': MinkowskiDistance{{name_suffix}}, + 'p': MinkowskiDistance{{name_suffix}}, + 'manhattan': ManhattanDistance{{name_suffix}}, + 'cityblock': ManhattanDistance{{name_suffix}}, + 'l1': ManhattanDistance{{name_suffix}}, + 'chebyshev': ChebyshevDistance{{name_suffix}}, + 'infinity': ChebyshevDistance{{name_suffix}}, + 'seuclidean': SEuclideanDistance{{name_suffix}}, + 'mahalanobis': MahalanobisDistance{{name_suffix}}, + 'hamming': HammingDistance{{name_suffix}}, + 'canberra': CanberraDistance{{name_suffix}}, + 'braycurtis': BrayCurtisDistance{{name_suffix}}, + 'matching': MatchingDistance{{name_suffix}}, + 'jaccard': JaccardDistance{{name_suffix}}, + 'dice': DiceDistance{{name_suffix}}, + 'kulsinski': KulsinskiDistance{{name_suffix}}, + 'rogerstanimoto': RogersTanimotoDistance{{name_suffix}}, + 'russellrao': RussellRaoDistance{{name_suffix}}, + 'sokalmichener': SokalMichenerDistance{{name_suffix}}, + 'sokalsneath': SokalSneathDistance{{name_suffix}}, + 'haversine': HaversineDistance{{name_suffix}}, + 'pyfunc': PyFuncDistance{{name_suffix}}, +} + +cdef inline object _buffer_to_ndarray{{name_suffix}}(const {{INPUT_DTYPE_t}}* x, intp_t n): + # Wrap a memory buffer with an ndarray. Warning: this is not robust. + # In particular, if x is deallocated before the returned array goes + # out of scope, this could cause memory errors. Since there is not + # a possibility of this for our use-case, this should be safe. + + # Note: this Segfaults unless np.import_array() is called above + # TODO: remove the explicit cast to cnp.intp_t* when cython min version >= 3.0 + return cnp.PyArray_SimpleNewFromData(1, &n, cnp.NPY_FLOAT64, x) + + +cdef {{INPUT_DTYPE_t}} INF{{name_suffix}} = np.inf + + +###################################################################### +# Distance Metric Classes +cdef class DistanceMetric{{name_suffix}}(DistanceMetric): + """DistanceMetric class + + This class provides a uniform interface to fast distance metric + functions. The various metrics can be accessed via the :meth:`get_metric` + class method and the metric string identifier (see below). + + Examples + -------- + >>> from sklearn.metrics import DistanceMetric + >>> dist = DistanceMetric.get_metric('euclidean') + >>> X = [[0, 1, 2], + [3, 4, 5]] + >>> dist.pairwise(X) + array([[ 0. , 5.19615242], + [ 5.19615242, 0. 
]]) + + Available Metrics + + The following lists the string metric identifiers and the associated + distance metric classes: + + **Metrics intended for real-valued vector spaces:** + + ============== ==================== ======== =============================== + identifier class name args distance function + -------------- -------------------- -------- ------------------------------- + "euclidean" EuclideanDistance - ``sqrt(sum((x - y)^2))`` + "manhattan" ManhattanDistance - ``sum(|x - y|)`` + "chebyshev" ChebyshevDistance - ``max(|x - y|)`` + "minkowski" MinkowskiDistance p, w ``sum(w * |x - y|^p)^(1/p)`` + "seuclidean" SEuclideanDistance V ``sqrt(sum((x - y)^2 / V))`` + "mahalanobis" MahalanobisDistance V or VI ``sqrt((x - y)' V^-1 (x - y))`` + ============== ==================== ======== =============================== + + **Metrics intended for two-dimensional vector spaces:** Note that the haversine + distance metric requires data in the form of [latitude, longitude] and both + inputs and outputs are in units of radians. + + ============ ================== =============================================================== + identifier class name distance function + ------------ ------------------ --------------------------------------------------------------- + "haversine" HaversineDistance ``2 arcsin(sqrt(sin^2(0.5*dx) + cos(x1)cos(x2)sin^2(0.5*dy)))`` + ============ ================== =============================================================== + + + **Metrics intended for integer-valued vector spaces:** Though intended + for integer-valued vectors, these are also valid metrics in the case of + real-valued vectors. + + ============= ==================== ======================================== + identifier class name distance function + ------------- -------------------- ---------------------------------------- + "hamming" HammingDistance ``N_unequal(x, y) / N_tot`` + "canberra" CanberraDistance ``sum(|x - y| / (|x| + |y|))`` + "braycurtis" BrayCurtisDistance ``sum(|x - y|) / (sum(|x|) + sum(|y|))`` + ============= ==================== ======================================== + + **Metrics intended for boolean-valued vector spaces:** Any nonzero entry + is evaluated to "True". 
In the listings below, the following + abbreviations are used: + + - N: number of dimensions + - NTT: number of dims in which both values are True + - NTF: number of dims in which the first value is True, second is False + - NFT: number of dims in which the first value is False, second is True + - NFF: number of dims in which both values are False + - NNEQ: number of non-equal dimensions, NNEQ = NTF + NFT + - NNZ: number of nonzero dimensions, NNZ = NTF + NFT + NTT + + ================= ======================= =============================== + identifier class name distance function + ----------------- ----------------------- ------------------------------- + "jaccard" JaccardDistance NNEQ / NNZ + "matching" MatchingDistance NNEQ / N + "dice" DiceDistance NNEQ / (NTT + NNZ) + "kulsinski" KulsinskiDistance (NNEQ + N - NTT) / (NNEQ + N) + "rogerstanimoto" RogersTanimotoDistance 2 * NNEQ / (N + NNEQ) + "russellrao" RussellRaoDistance (N - NTT) / N + "sokalmichener" SokalMichenerDistance 2 * NNEQ / (N + NNEQ) + "sokalsneath" SokalSneathDistance NNEQ / (NNEQ + 0.5 * NTT) + ================= ======================= =============================== + + **User-defined distance:** + + =========== =============== ======= + identifier class name args + ----------- --------------- ------- + "pyfunc" PyFuncDistance func + =========== =============== ======= + + Here ``func`` is a function which takes two one-dimensional numpy + arrays, and returns a distance. Note that in order to be used within + the BallTree, the distance must be a true metric: + i.e. it must satisfy the following properties + + 1) Non-negativity: d(x, y) >= 0 + 2) Identity: d(x, y) = 0 if and only if x == y + 3) Symmetry: d(x, y) = d(y, x) + 4) Triangle Inequality: d(x, y) + d(y, z) >= d(x, z) + + Because of the Python object overhead involved in calling the python + function, this will be fairly slow, but it will have the same + scaling as other distances. + """ + def __cinit__(self): + self.p = 2 + self.vec = np.zeros(1, dtype=np.float64, order='C') + self.mat = np.zeros((1, 1), dtype=np.float64, order='C') + self.size = 1 + + def __reduce__(self): + """ + reduce method used for pickling + """ + return (newObj, (self.__class__,), self.__getstate__()) + + def __getstate__(self): + """ + get state for pickling + """ + if self.__class__.__name__ == "PyFuncDistance{{name_suffix}}": + return (float(self.p), np.asarray(self.vec), np.asarray(self.mat), self.func, self.kwargs) + return (float(self.p), np.asarray(self.vec), np.asarray(self.mat)) + + def __setstate__(self, state): + """ + set state for pickling + """ + self.p = state[0] + self.vec = state[1] + self.mat = state[2] + if self.__class__.__name__ == "PyFuncDistance{{name_suffix}}": + self.func = state[3] + self.kwargs = state[4] + self.size = self.vec.shape[0] + + @classmethod + def get_metric(cls, metric, **kwargs): + """Get the given distance metric from the string identifier. + + See the docstring of DistanceMetric for a list of available metrics. 
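# The __reduce__/__getstate__/__setstate__ methods a few lines above exist so metric
# objects survive pickling (for example when estimators holding them are cloned or sent
# to worker processes). A small round-trip check with an illustrative metric and data:
import pickle

import numpy as np
from sklearn.metrics import DistanceMetric

metric = DistanceMetric.get_metric("minkowski", p=1.5)
X = np.array([[0.0, 1.0], [2.0, 3.0], [4.0, 0.5]])

restored = pickle.loads(pickle.dumps(metric))
assert np.allclose(metric.pairwise(X), restored.pairwise(X))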
+ + Parameters + ---------- + metric : str or class name + The distance metric to use + **kwargs + additional arguments will be passed to the requested metric + """ + if isinstance(metric, DistanceMetric{{name_suffix}}): + return metric + + if callable(metric): + return PyFuncDistance{{name_suffix}}(metric, **kwargs) + + # Map the metric string ID to the metric class + if isinstance(metric, type) and issubclass(metric, DistanceMetric{{name_suffix}}): + pass + else: + try: + metric = METRIC_MAPPING{{name_suffix}}[metric] + except: + raise ValueError("Unrecognized metric '%s'" % metric) + + # In Minkowski special cases, return more efficient methods + if metric is MinkowskiDistance{{name_suffix}}: + p = kwargs.pop('p', 2) + w = kwargs.pop('w', None) + if p == 1 and w is None: + return ManhattanDistance{{name_suffix}}(**kwargs) + elif p == 2 and w is None: + return EuclideanDistance{{name_suffix}}(**kwargs) + elif np.isinf(p) and w is None: + return ChebyshevDistance{{name_suffix}}(**kwargs) + else: + return MinkowskiDistance{{name_suffix}}(p, w, **kwargs) + else: + return metric(**kwargs) + + def __init__(self): + if self.__class__ is DistanceMetric{{name_suffix}}: + raise NotImplementedError("DistanceMetric{{name_suffix}} is an abstract class") + + def _validate_data(self, X): + """Validate the input data. + + This should be overridden in a base class if a specific input format + is required. + """ + return + + cdef {{INPUT_DTYPE_t}} dist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + """Compute the distance between vectors x1 and x2 + + This should be overridden in a base class. + """ + return -999 + + cdef {{INPUT_DTYPE_t}} rdist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + """Compute the rank-preserving surrogate distance between vectors x1 and x2. + + This can optionally be overridden in a base class. + + The rank-preserving surrogate distance is any measure that yields the same + rank as the distance, but is more efficient to compute. For example, the + rank-preserving surrogate distance of the Euclidean metric is the + squared-euclidean distance. + """ + return self.dist(x1, x2, size) + + cdef int pdist( + self, + const {{INPUT_DTYPE_t}}[:, ::1] X, + {{INPUT_DTYPE_t}}[:, ::1] D, + ) except -1: + """Compute the pairwise distances between points in X""" + cdef intp_t i1, i2 + for i1 in range(X.shape[0]): + for i2 in range(i1, X.shape[0]): + D[i1, i2] = self.dist(&X[i1, 0], &X[i2, 0], X.shape[1]) + D[i2, i1] = D[i1, i2] + return 0 + + + cdef int cdist( + self, + const {{INPUT_DTYPE_t}}[:, ::1] X, + const {{INPUT_DTYPE_t}}[:, ::1] Y, + {{INPUT_DTYPE_t}}[:, ::1] D, + ) except -1: + """Compute the cross-pairwise distances between arrays X and Y""" + cdef intp_t i1, i2 + if X.shape[1] != Y.shape[1]: + raise ValueError('X and Y must have the same second dimension') + for i1 in range(X.shape[0]): + for i2 in range(Y.shape[0]): + D[i1, i2] = self.dist(&X[i1, 0], &Y[i2, 0], X.shape[1]) + return 0 + + cdef {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + """Compute the distance between vectors x1 and x2 represented + under the CSR format. + + This must be overridden in a subclass. + + Notes + ----- + 0. 
The implementation of this method in subclasses must be robust to the + presence of explicit zeros in the CSR representation. + + 1. The `data` arrays are passed using pointers to be able to support an + alternative representation of the CSR data structure for supporting + fused sparse-dense datasets pairs with minimum overhead. + + See the explanations in `SparseDenseDatasetsPair.__init__`. + + 2. An alternative signature would be: + + cdef {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + ) except -1 nogil: + + Where callers would use slicing on the original CSR data and indices + memoryviews: + + x1_start = X1_csr.indices_ptr[i] + x1_end = X1_csr.indices_ptr[i+1] + x2_start = X2_csr.indices_ptr[j] + x2_end = X2_csr.indices_ptr[j+1] + + self.dist_csr( + &x1_data[x1_start], + x1_indices[x1_start:x1_end], + &x2_data[x2_start], + x2_indices[x2_start:x2_end], + ) + + Yet, slicing on memoryview slows down execution as it takes the GIL. + See: https://github.com/scikit-learn/scikit-learn/issues/17299 + + Hence, to avoid slicing the data and indices arrays of the sparse + matrices containing respectively x1 and x2 (namely x{1,2}_{data,indices}) + are passed as well as their indices pointers (namely x{1,2}_{start,end}). + + 3. For reference about the CSR format, see section 3.4 of + Saad, Y. (2003), Iterative Methods for Sparse Linear Systems, SIAM. + https://www-users.cse.umn.edu/~saad/IterMethBook_2ndEd.pdf + """ + return -999 + + cdef {{INPUT_DTYPE_t}} rdist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + """Distance between rows of CSR matrices x1 and x2. + + This can optionally be overridden in a subclass. + + The rank-preserving surrogate distance is any measure that yields the same + rank as the distance, but is more efficient to compute. For example, the + rank-preserving surrogate distance of the Euclidean metric is the + squared-euclidean distance. + + Notes + ----- + The implementation of this method in subclasses must be robust to the + presence of explicit zeros in the CSR representation. + + More information about the motives for this method signature is given + in the docstring of dist_csr. + """ + return self.dist_csr( + x1_data, + x1_indices, + x2_data, + x2_indices, + x1_start, + x1_end, + x2_start, + x2_end, + size, + ) + + cdef int pdist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t[::1] x1_indices, + const int32_t[::1] x1_indptr, + const intp_t size, + {{INPUT_DTYPE_t}}[:, ::1] D, + ) except -1 nogil: + """Pairwise distances between rows in CSR matrix X. + + Note that this implementation is twice faster than cdist_csr(X, X) + because it leverages the symmetry of the problem. 
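# Pure-Python sketch of the two-pointer merge used by the *_csr methods in this file:
# walk the sorted column indices of two CSR rows, pairing matching columns and treating
# columns present in only one row as zero on the other side. This mirrors the pattern of
# EuclideanDistance.rdist_csr below, without the Cython specifics.
def squared_euclidean_csr_rows(data1, indices1, data2, indices2):
    i, j, d = 0, 0, 0.0
    while i < len(indices1) and j < len(indices2):
        if indices1[i] == indices2[j]:      # column present in both rows
            diff = data1[i] - data2[j]
            d += diff * diff
            i += 1
            j += 1
        elif indices1[i] < indices2[j]:     # column only in row 1
            d += data1[i] * data1[i]
            i += 1
        else:                               # column only in row 2
            d += data2[j] * data2[j]
            j += 1
    # One of the rows may still have trailing columns left to consume.
    d += sum(v * v for v in data1[i:]) + sum(v * v for v in data2[j:])
    return d

# Rows (1, 0, 3) and (0, 2, 3) in CSR form: squared distance is 1 + 4 + 0 = 5.
print(squared_euclidean_csr_rows([1.0, 3.0], [0, 2], [2.0, 3.0], [1, 2]))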
+ """ + cdef: + intp_t i1, i2 + intp_t n_x1 = x1_indptr.shape[0] - 1 + intp_t x1_start, x1_end, x2_start, x2_end + + for i1 in range(n_x1): + x1_start = x1_indptr[i1] + x1_end = x1_indptr[i1 + 1] + for i2 in range(i1, n_x1): + x2_start = x1_indptr[i2] + x2_end = x1_indptr[i2 + 1] + D[i1, i2] = D[i2, i1] = self.dist_csr( + x1_data, + &x1_indices[0], + x1_data, + &x1_indices[0], + x1_start, + x1_end, + x2_start, + x2_end, + size, + ) + return 0 + + cdef int cdist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t[::1] x1_indices, + const int32_t[::1] x1_indptr, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t[::1] x2_indices, + const int32_t[::1] x2_indptr, + const intp_t size, + {{INPUT_DTYPE_t}}[:, ::1] D, + ) except -1 nogil: + """Compute the cross-pairwise distances between arrays X and Y + represented in the CSR format.""" + cdef: + intp_t i1, i2 + intp_t n_x1 = x1_indptr.shape[0] - 1 + intp_t n_x2 = x2_indptr.shape[0] - 1 + intp_t x1_start, x1_end, x2_start, x2_end + + for i1 in range(n_x1): + x1_start = x1_indptr[i1] + x1_end = x1_indptr[i1 + 1] + for i2 in range(n_x2): + x2_start = x2_indptr[i2] + x2_end = x2_indptr[i2 + 1] + + D[i1, i2] = self.dist_csr( + x1_data, + &x1_indices[0], + x2_data, + &x2_indices[0], + x1_start, + x1_end, + x2_start, + x2_end, + size, + ) + return 0 + + cdef {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: + """Convert the rank-preserving surrogate distance to the distance""" + return rdist + + cdef {{INPUT_DTYPE_t}} _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: + """Convert the distance to the rank-preserving surrogate distance""" + return dist + + def rdist_to_dist(self, rdist): + """Convert the rank-preserving surrogate distance to the distance. + + The surrogate distance is any measure that yields the same rank as the + distance, but is more efficient to compute. For example, the + rank-preserving surrogate distance of the Euclidean metric is the + squared-euclidean distance. + + Parameters + ---------- + rdist : double + Surrogate distance. + + Returns + ------- + double + True distance. + """ + return rdist + + def dist_to_rdist(self, dist): + """Convert the true distance to the rank-preserving surrogate distance. + + The surrogate distance is any measure that yields the same rank as the + distance, but is more efficient to compute. For example, the + rank-preserving surrogate distance of the Euclidean metric is the + squared-euclidean distance. + + Parameters + ---------- + dist : double + True distance. + + Returns + ------- + double + Surrogate distance. 
+ """ + return dist + + def _pairwise_dense_dense(self, X, Y): + cdef const {{INPUT_DTYPE_t}}[:, ::1] Xarr + cdef const {{INPUT_DTYPE_t}}[:, ::1] Yarr + cdef {{INPUT_DTYPE_t}}[:, ::1] Darr + + Xarr = np.asarray(X, dtype={{INPUT_DTYPE}}, order='C') + self._validate_data(Xarr) + if X is Y: + Darr = np.empty((Xarr.shape[0], Xarr.shape[0]), dtype={{INPUT_DTYPE}}, order='C') + self.pdist(Xarr, Darr) + else: + Yarr = np.asarray(Y, dtype={{INPUT_DTYPE}}, order='C') + self._validate_data(Yarr) + Darr = np.empty((Xarr.shape[0], Yarr.shape[0]), dtype={{INPUT_DTYPE}}, order='C') + self.cdist(Xarr, Yarr, Darr) + return np.asarray(Darr) + + def _pairwise_sparse_sparse(self, X: csr_matrix , Y: csr_matrix): + cdef: + intp_t n_X, n_features + const {{INPUT_DTYPE_t}}[::1] X_data + const int32_t[::1] X_indices + const int32_t[::1] X_indptr + + intp_t n_Y + const {{INPUT_DTYPE_t}}[::1] Y_data + const int32_t[::1] Y_indices + const int32_t[::1] Y_indptr + + {{INPUT_DTYPE_t}}[:, ::1] Darr + + X_csr = X.tocsr() + n_X, n_features = X_csr.shape + X_data = np.asarray(X_csr.data, dtype={{INPUT_DTYPE}}) + X_indices = np.asarray(X_csr.indices, dtype=np.int32) + X_indptr = np.asarray(X_csr.indptr, dtype=np.int32) + if X is Y: + Darr = np.empty((n_X, n_X), dtype={{INPUT_DTYPE}}, order='C') + self.pdist_csr( + x1_data=&X_data[0], + x1_indices=X_indices, + x1_indptr=X_indptr, + size=n_features, + D=Darr, + ) + else: + Y_csr = Y.tocsr() + n_Y, _ = Y_csr.shape + Y_data = np.asarray(Y_csr.data, dtype={{INPUT_DTYPE}}) + Y_indices = np.asarray(Y_csr.indices, dtype=np.int32) + Y_indptr = np.asarray(Y_csr.indptr, dtype=np.int32) + + Darr = np.empty((n_X, n_Y), dtype={{INPUT_DTYPE}}, order='C') + self.cdist_csr( + x1_data=&X_data[0], + x1_indices=X_indices, + x1_indptr=X_indptr, + x2_data=&Y_data[0], + x2_indices=Y_indices, + x2_indptr=Y_indptr, + size=n_features, + D=Darr, + ) + return np.asarray(Darr) + + def _pairwise_sparse_dense(self, X: csr_matrix, Y): + cdef: + intp_t n_X = X.shape[0] + intp_t n_features = X.shape[1] + const {{INPUT_DTYPE_t}}[::1] X_data = np.asarray( + X.data, dtype={{INPUT_DTYPE}}, + ) + const int32_t[::1] X_indices = np.asarray( + X.indices, dtype=np.int32, + ) + const int32_t[::1] X_indptr = np.asarray( + X.indptr, dtype=np.int32, + ) + + const {{INPUT_DTYPE_t}}[:, ::1] Y_data = np.asarray( + Y, dtype={{INPUT_DTYPE}}, order="C", + ) + intp_t n_Y = Y_data.shape[0] + const int32_t[::1] Y_indices = ( + np.arange(n_features, dtype=np.int32) + ) + + {{INPUT_DTYPE_t}}[:, ::1] Darr = np.empty((n_X, n_Y), dtype={{INPUT_DTYPE}}, order='C') + + intp_t i1, i2 + intp_t x1_start, x1_end + {{INPUT_DTYPE_t}} * x2_data + + with nogil: + # Use the exact same adaptation for CSR than in SparseDenseDatasetsPair + # for supporting the sparse-dense case with minimal overhead. + # Note: at this point this method is only a convenience method + # used in the tests via the DistanceMetric.pairwise method. + # Therefore, there is no need to attempt parallelization of those + # nested for-loops. + # Efficient parallel computation of pairwise distances can be + # achieved via the PairwiseDistances class instead. The latter + # internally calls into vector-wise distance computation from + # the DistanceMetric subclass while benefiting from the generic + # Cython/OpenMP parallelization template for the generic pairwise + # distance + reduction computational pattern. 
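# The loop below treats each dense row of Y as an implicit CSR row: its "data" pointer is
# the row itself and its "indices" are simply 0..n_features-1 (the Y_indices = arange
# above), so no conversion or copy of Y is needed. A SciPy sketch of that equivalence,
# with illustrative data:
import numpy as np
from scipy.sparse import csr_matrix

row = np.array([0.0, 2.5, 0.0, 1.0])
implicit_indices = np.arange(row.shape[0], dtype=np.int32)  # dense-as-CSR indices

explicit = csr_matrix(row.reshape(1, -1))  # a real CSR row drops the zeros: indices [1, 3]
assert np.allclose(explicit.toarray().ravel(), row)
print(implicit_indices, explicit.indices)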
+ for i1 in range(n_X): + x1_start = X_indptr[i1] + x1_end = X_indptr[i1 + 1] + for i2 in range(n_Y): + x2_data = &Y_data[0, 0] + i2 * n_features + + Darr[i1, i2] = self.dist_csr( + x1_data=&X_data[0], + x1_indices=&X_indices[0], + x2_data=x2_data, + x2_indices=&Y_indices[0], + x1_start=x1_start, + x1_end=x1_end, + x2_start=0, + x2_end=n_features, + size=n_features, + ) + + return np.asarray(Darr) + + def _pairwise_dense_sparse(self, X, Y: csr_matrix): + # We could have implemented this method using _pairwise_dense_sparse by + # swapping argument and by transposing the results, but this would + # have come with an extra copy to ensure C-contiguity of the result. + cdef: + intp_t n_X = X.shape[0] + intp_t n_features = X.shape[1] + + const {{INPUT_DTYPE_t}}[:, ::1] X_data = np.asarray( + X, dtype={{INPUT_DTYPE}}, order="C", + ) + const int32_t[::1] X_indices = np.arange( + n_features, dtype=np.int32, + ) + + intp_t n_Y = Y.shape[0] + const {{INPUT_DTYPE_t}}[::1] Y_data = np.asarray( + Y.data, dtype={{INPUT_DTYPE}}, + ) + const int32_t[::1] Y_indices = np.asarray( + Y.indices, dtype=np.int32, + ) + const int32_t[::1] Y_indptr = np.asarray( + Y.indptr, dtype=np.int32, + ) + + {{INPUT_DTYPE_t}}[:, ::1] Darr = np.empty((n_X, n_Y), dtype={{INPUT_DTYPE}}, order='C') + + intp_t i1, i2 + {{INPUT_DTYPE_t}} * x1_data + + intp_t x2_start, x2_end + + with nogil: + # Use the exact same adaptation for CSR than in SparseDenseDatasetsPair + # for supporting the dense-sparse case with minimal overhead. + # Note: at this point this method is only a convenience method + # used in the tests via the DistanceMetric.pairwise method. + # Therefore, there is no need to attempt parallelization of those + # nested for-loops. + # Efficient parallel computation of pairwise distances can be + # achieved via the PairwiseDistances class instead. The latter + # internally calls into vector-wise distance computation from + # the DistanceMetric subclass while benefiting from the generic + # Cython/OpenMP parallelization template for the generic pairwise + # distance + reduction computational pattern. + for i1 in range(n_X): + x1_data = &X_data[0, 0] + i1 * n_features + for i2 in range(n_Y): + x2_start = Y_indptr[i2] + x2_end = Y_indptr[i2 + 1] + + Darr[i1, i2] = self.dist_csr( + x1_data=x1_data, + x1_indices=&X_indices[0], + x2_data=&Y_data[0], + x2_indices=&Y_indices[0], + x1_start=0, + x1_end=n_features, + x2_start=x2_start, + x2_end=x2_end, + size=n_features, + ) + + return np.asarray(Darr) + + + def pairwise(self, X, Y=None): + """Compute the pairwise distances between X and Y + + This is a convenience routine for the sake of testing. For many + metrics, the utilities in scipy.spatial.distance.cdist and + scipy.spatial.distance.pdist will be faster. + + Parameters + ---------- + X : ndarray or CSR matrix of shape (n_samples_X, n_features) + Input data. + Y : ndarray or CSR matrix of shape (n_samples_Y, n_features) + Input data. + If not specified, then Y=X. + + Returns + ------- + dist : ndarray of shape (n_samples_X, n_samples_Y) + The distance matrix of pairwise distances between points in X and Y. 
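# The dispatch in the method body below routes dense/dense, CSR/CSR and the two mixed
# cases to the helpers above, and all combinations agree. Small illustrative check,
# assuming a scikit-learn version whose DistanceMetric.pairwise accepts CSR input as here:
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics import DistanceMetric

X = np.array([[0.0, 1.0, 0.0], [2.0, 0.0, 3.0]])
Y = np.array([[1.0, 1.0, 0.0]])
dist = DistanceMetric.get_metric("euclidean")

D_dense = dist.pairwise(X, Y)
D_mixed = dist.pairwise(csr_matrix(X), Y)
D_sparse = dist.pairwise(csr_matrix(X), csr_matrix(Y))
assert np.allclose(D_dense, D_mixed) and np.allclose(D_dense, D_sparse)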
+ """ + X = check_array(X, accept_sparse=['csr']) + + if Y is None: + Y = X + else: + Y = check_array(Y, accept_sparse=['csr']) + + X_is_sparse = issparse(X) + Y_is_sparse = issparse(Y) + + if not X_is_sparse and not Y_is_sparse: + return self._pairwise_dense_dense(X, Y) + + if X_is_sparse and Y_is_sparse: + return self._pairwise_sparse_sparse(X, Y) + + if X_is_sparse and not Y_is_sparse: + return self._pairwise_sparse_dense(X, Y) + + return self._pairwise_dense_sparse(X, Y) + +#------------------------------------------------------------ +# Euclidean Distance +# d = sqrt(sum(x_i^2 - y_i^2)) +cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): + r"""Euclidean Distance metric + + .. math:: + D(x, y) = \sqrt{ \sum_i (x_i - y_i) ^ 2 } + """ + def __init__(self): + self.p = 2 + + cdef inline {{INPUT_DTYPE_t}} dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + return euclidean_dist{{name_suffix}}(x1, x2, size) + + cdef inline {{INPUT_DTYPE_t}} rdist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + return euclidean_rdist{{name_suffix}}(x1, x2, size) + + cdef inline {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: + return sqrt(rdist) + + cdef inline {{INPUT_DTYPE_t}} _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: + return dist * dist + + def rdist_to_dist(self, rdist): + return np.sqrt(rdist) + + def dist_to_rdist(self, dist): + return dist ** 2 + + cdef inline {{INPUT_DTYPE_t}} rdist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + + cdef: + intp_t ix1, ix2 + intp_t i1 = x1_start + intp_t i2 = x2_start + + float64_t d = 0.0 + float64_t unsquared = 0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + unsquared = x1_data[i1] - x2_data[i2] + d = d + (unsquared * unsquared) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + unsquared = x1_data[i1] + d = d + (unsquared * unsquared) + i1 = i1 + 1 + else: + unsquared = x2_data[i2] + d = d + (unsquared * unsquared) + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + unsquared = x2_data[i2] + d = d + (unsquared * unsquared) + i2 = i2 + 1 + else: + while i1 < x1_end: + unsquared = x1_data[i1] + d = d + (unsquared * unsquared) + i1 = i1 + 1 + + return d + + cdef inline {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + return sqrt( + self.rdist_csr( + x1_data, + x1_indices, + x2_data, + x2_indices, + x1_start, + x1_end, + x2_start, + x2_end, + size, + )) + +#------------------------------------------------------------ +# SEuclidean Distance +# d = sqrt(sum((x_i - y_i2)^2 / v_i)) +cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): + r"""Standardized Euclidean Distance metric + + .. 
math:: + D(x, y) = \sqrt{ \sum_i \frac{ (x_i - y_i) ^ 2}{V_i} } + """ + def __init__(self, V): + self.vec = np.asarray(V, dtype=np.float64) + self.size = self.vec.shape[0] + self.p = 2 + + def _validate_data(self, X): + if X.shape[1] != self.size: + raise ValueError('SEuclidean dist: size of V does not match') + + cdef inline {{INPUT_DTYPE_t}} rdist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + cdef float64_t tmp, d=0 + cdef intp_t j + for j in range(size): + tmp = x1[j] - x2[j] + d += (tmp * tmp / self.vec[j]) + return d + + cdef inline {{INPUT_DTYPE_t}} dist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + return sqrt(self.rdist(x1, x2, size)) + + cdef inline {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: + return sqrt(rdist) + + cdef inline {{INPUT_DTYPE_t}} _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: + return dist * dist + + def rdist_to_dist(self, rdist): + return np.sqrt(rdist) + + def dist_to_rdist(self, dist): + return dist ** 2 + + cdef inline {{INPUT_DTYPE_t}} rdist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + + cdef: + intp_t ix1, ix2 + intp_t i1 = x1_start + intp_t i2 = x2_start + + float64_t d = 0.0 + float64_t unsquared = 0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + unsquared = x1_data[i1] - x2_data[i2] + d = d + (unsquared * unsquared) / self.vec[ix1] + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + unsquared = x1_data[i1] + d = d + (unsquared * unsquared) / self.vec[ix1] + i1 = i1 + 1 + else: + unsquared = x2_data[i2] + d = d + (unsquared * unsquared) / self.vec[ix2] + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + ix2 = x2_indices[i2] + unsquared = x2_data[i2] + d = d + (unsquared * unsquared) / self.vec[ix2] + i2 = i2 + 1 + else: + while i1 < x1_end: + ix1 = x1_indices[i1] + unsquared = x1_data[i1] + d = d + (unsquared * unsquared) / self.vec[ix1] + i1 = i1 + 1 + return d + + cdef inline {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + return sqrt( + self.rdist_csr( + x1_data, + x1_indices, + x2_data, + x2_indices, + x1_start, + x1_end, + x2_start, + x2_end, + size, + )) + +#------------------------------------------------------------ +# Manhattan Distance +# d = sum(abs(x_i - y_i)) +cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): + r"""Manhattan/City-block Distance metric + + .. 
math:: + D(x, y) = \sum_i |x_i - y_i| + """ + def __init__(self): + self.p = 1 + + cdef inline {{INPUT_DTYPE_t}} dist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + cdef float64_t d = 0 + cdef intp_t j + for j in range(size): + d += fabs(x1[j] - x2[j]) + return d + + cdef inline {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + + cdef: + intp_t ix1, ix2 + intp_t i1 = x1_start + intp_t i2 = x2_start + + {{INPUT_DTYPE_t}} d = 0.0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + d = d + fabs(x1_data[i1] - x2_data[i2]) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + d = d + fabs(x1_data[i1]) + i1 = i1 + 1 + else: + d = d + fabs(x2_data[i2]) + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + d = d + fabs(x2_data[i2]) + i2 = i2 + 1 + else: + while i1 < x1_end: + d = d + fabs(x1_data[i1]) + i1 = i1 + 1 + + return d + + +#------------------------------------------------------------ +# Chebyshev Distance +# d = max_i(abs(x_i - y_i)) +cdef class ChebyshevDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): + """Chebyshev/Infinity Distance + + .. math:: + D(x, y) = max_i (|x_i - y_i|) + + Examples + -------- + >>> from sklearn.metrics.dist_metrics import DistanceMetric + >>> dist = DistanceMetric.get_metric('chebyshev') + >>> X = [[0, 1, 2], + ... [3, 4, 5]] + >>> Y = [[-1, 0, 1], + ... [3, 4, 5]] + >>> dist.pairwise(X, Y) + array([[1.732..., 5.196...], + [6.928..., 0.... ]]) + """ + def __init__(self): + self.p = INF{{name_suffix}} + + cdef inline {{INPUT_DTYPE_t}} dist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + cdef float64_t d = 0 + cdef intp_t j + for j in range(size): + d = fmax(d, fabs(x1[j] - x2[j])) + return d + + + cdef inline {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + + cdef: + intp_t ix1, ix2 + intp_t i1 = x1_start + intp_t i2 = x2_start + + float64_t d = 0.0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + d = fmax(d, fabs(x1_data[i1] - x2_data[i2])) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + d = fmax(d, fabs(x1_data[i1])) + i1 = i1 + 1 + else: + d = fmax(d, fabs(x2_data[i2])) + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + d = fmax(d, fabs(x2_data[i2])) + i2 = i2 + 1 + else: + while i1 < x1_end: + d = fmax(d, fabs(x1_data[i1])) + i1 = i1 + 1 + + return d + + +#------------------------------------------------------------ +# Minkowski Distance +cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): + r"""Minkowski Distance + + .. math:: + D(x, y) = {||u-v||}_p + + when w is None. + + Here is the more general expanded expression for the weighted case: + + .. math:: + D(x, y) = [\sum_i w_i *|x_i - y_i|^p] ^ (1/p) + + Parameters + ---------- + p : float + The order of the p-norm of the difference (see above). + + .. versionchanged:: 1.4.0 + Minkowski distance allows `p` to be `0 0 and finite. 
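# Worked check of the weighted Minkowski formula above,
# D(x, y) = (sum_i w_i * |x_i - y_i|**p) ** (1/p). As the constructor below enforces,
# p must be > 0 and finite (ChebyshevDistance covers p = inf), and for 0 < p < 1 the
# result is not a true metric. Data here is illustrative.
import numpy as np
from sklearn.metrics import DistanceMetric

x = np.array([[0.0, 1.0, 2.0]])
y = np.array([[3.0, 5.0, 2.0]])
w = np.array([1.0, 0.5, 2.0])
p = 3

manual = (w * np.abs(x - y) ** p).sum() ** (1.0 / p)
metric = DistanceMetric.get_metric("minkowski", p=p, w=w)
assert np.isclose(metric.pairwise(x, y)[0, 0], manual)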
+ When :math:`p \in (0,1)`, it isn't a true metric but is permissible when + the triangular inequality isn't necessary. + For p = infinity, use ChebyshevDistance. + Note that for p=1, ManhattanDistance is more efficient, and for + p=2, EuclideanDistance is more efficient. + + """ + def __init__(self, p, w=None): + if p <= 0: + raise ValueError("p must be greater than 0") + elif np.isinf(p): + raise ValueError("MinkowskiDistance requires finite p. " + "For p=inf, use ChebyshevDistance.") + + self.p = p + if w is not None: + w_array = check_array( + w, ensure_2d=False, dtype=np.float64, input_name="w" + ) + if (w_array < 0).any(): + raise ValueError("w cannot contain negative weights") + self.vec = w_array + self.size = self.vec.shape[0] + else: + self.vec = np.asarray([], dtype=np.float64) + self.size = 0 + + def _validate_data(self, X): + if self.size > 0 and X.shape[1] != self.size: + raise ValueError("MinkowskiDistance: the size of w must match " + f"the number of features ({X.shape[1]}). " + f"Currently len(w)={self.size}.") + + cdef inline {{INPUT_DTYPE_t}} rdist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + cdef float64_t d=0 + cdef intp_t j + cdef bint has_w = self.size > 0 + if has_w: + for j in range(size): + d += (self.vec[j] * pow(fabs(x1[j] - x2[j]), self.p)) + else: + for j in range(size): + d += (pow(fabs(x1[j] - x2[j]), self.p)) + return d + + cdef inline {{INPUT_DTYPE_t}} dist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + return pow(self.rdist(x1, x2, size), 1. / self.p) + + cdef inline {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: + return pow(rdist, 1. / self.p) + + cdef inline {{INPUT_DTYPE_t}} _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: + return pow(dist, self.p) + + def rdist_to_dist(self, rdist): + return rdist ** (1. 
/ self.p) + + def dist_to_rdist(self, dist): + return dist ** self.p + + cdef inline {{INPUT_DTYPE_t}} rdist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + + cdef: + intp_t ix1, ix2 + intp_t i1 = x1_start + intp_t i2 = x2_start + + float64_t d = 0.0 + bint has_w = self.size > 0 + + if has_w: + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + d = d + (self.vec[ix1] * pow(fabs( + x1_data[i1] - x2_data[i2] + ), self.p)) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + d = d + (self.vec[ix1] * pow(fabs(x1_data[i1]), self.p)) + i1 = i1 + 1 + else: + d = d + (self.vec[ix2] * pow(fabs(x2_data[i2]), self.p)) + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + ix2 = x2_indices[i2] + d = d + (self.vec[ix2] * pow(fabs(x2_data[i2]), self.p)) + i2 = i2 + 1 + else: + while i1 < x1_end: + ix1 = x1_indices[i1] + d = d + (self.vec[ix1] * pow(fabs(x1_data[i1]), self.p)) + i1 = i1 + 1 + + return d + else: + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + d = d + (pow(fabs( + x1_data[i1] - x2_data[i2] + ), self.p)) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + d = d + (pow(fabs(x1_data[i1]), self.p)) + i1 = i1 + 1 + else: + d = d + (pow(fabs(x2_data[i2]), self.p)) + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + d = d + (pow(fabs(x2_data[i2]), self.p)) + i2 = i2 + 1 + else: + while i1 < x1_end: + d = d + (pow(fabs(x1_data[i1]), self.p)) + i1 = i1 + 1 + + return d + + cdef inline {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + return pow( + self.rdist_csr( + x1_data, + x1_indices, + x2_data, + x2_indices, + x1_start, + x1_end, + x2_start, + x2_end, + size, + ), + 1 / self.p + ) + +#------------------------------------------------------------ +# Mahalanobis Distance +# d = sqrt( (x - y)^T V^-1 (x - y) ) +cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): + """Mahalanobis Distance + + .. math:: + D(x, y) = \sqrt{ (x - y)^T V^{-1} (x - y) } + + Parameters + ---------- + V : array-like + Symmetric positive-definite covariance matrix. + The inverse of this matrix will be explicitly computed. + VI : array-like + optionally specify the inverse directly. If VI is passed, + then V is not referenced. 
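# The parameters above accept either the covariance matrix V (inverted internally with
# np.linalg.inv) or its inverse VI directly; both routes give the same distances and
# match the explicit formula sqrt((x - y)' VI (x - y)). Illustrative check:
import numpy as np
from sklearn.metrics import DistanceMetric

rng = np.random.default_rng(0)
A = rng.normal(size=(50, 3))
V = np.cov(A, rowvar=False)   # symmetric positive-definite covariance estimate
VI = np.linalg.inv(V)

x, y = A[0], A[1]
manual = np.sqrt((x - y) @ VI @ (x - y))

from_V = DistanceMetric.get_metric("mahalanobis", V=V)
from_VI = DistanceMetric.get_metric("mahalanobis", VI=VI)
assert np.isclose(from_V.pairwise([x], [y])[0, 0], manual)
assert np.isclose(from_VI.pairwise([x], [y])[0, 0], manual)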
+ """ + cdef float64_t[::1] buffer + + def __init__(self, V=None, VI=None): + if VI is None: + if V is None: + raise ValueError("Must provide either V or VI " + "for Mahalanobis distance") + VI = np.linalg.inv(V) + if VI.ndim != 2 or VI.shape[0] != VI.shape[1]: + raise ValueError("V/VI must be square") + + self.mat = np.asarray(VI, dtype=np.float64, order='C') + + self.size = self.mat.shape[0] + + # We need to create a buffer to store the vectors' coordinates' differences + self.buffer = np.zeros(self.size, dtype=np.float64) + + def __setstate__(self, state): + super().__setstate__(state) + self.size = self.mat.shape[0] + self.buffer = np.zeros(self.size, dtype=np.float64) + + def _validate_data(self, X): + if X.shape[1] != self.size: + raise ValueError('Mahalanobis dist: size of V does not match') + + cdef inline {{INPUT_DTYPE_t}} rdist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + cdef float64_t tmp, d = 0 + cdef intp_t i, j + + # compute (x1 - x2).T * VI * (x1 - x2) + for i in range(size): + self.buffer[i] = x1[i] - x2[i] + + for i in range(size): + tmp = 0 + for j in range(size): + tmp += self.mat[i, j] * self.buffer[j] + d += tmp * self.buffer[i] + return d + + cdef inline {{INPUT_DTYPE_t}} dist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + return sqrt(self.rdist(x1, x2, size)) + + cdef inline {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: + return sqrt(rdist) + + cdef inline {{INPUT_DTYPE_t}} _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: + return dist * dist + + def rdist_to_dist(self, rdist): + return np.sqrt(rdist) + + def dist_to_rdist(self, dist): + return dist ** 2 + + cdef inline {{INPUT_DTYPE_t}} rdist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + + cdef: + intp_t ix1, ix2 + intp_t i1 = x1_start + intp_t i2 = x2_start + + float64_t tmp, d = 0.0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + self.buffer[ix1] = x1_data[i1] - x2_data[i2] + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + self.buffer[ix1] = x1_data[i1] + i1 = i1 + 1 + else: + self.buffer[ix2] = - x2_data[i2] + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + ix2 = x2_indices[i2] + self.buffer[ix2] = - x2_data[i2] + i2 = i2 + 1 + else: + while i1 < x1_end: + ix1 = x1_indices[i1] + self.buffer[ix1] = x1_data[i1] + i1 = i1 + 1 + + for i in range(size): + tmp = 0 + for j in range(size): + tmp += self.mat[i, j] * self.buffer[j] + d += tmp * self.buffer[i] + + return d + + cdef inline {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + return sqrt( + self.rdist_csr( + x1_data, + x1_indices, + x2_data, + x2_indices, + x1_start, + x1_end, + x2_start, + x2_end, + size, + )) + +#------------------------------------------------------------ +# Hamming Distance +# d = N_unequal(x, y) / N_tot +cdef class HammingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): + r"""Hamming Distance + + Hamming 
distance is meant for discrete-valued vectors, though it is + a valid metric for real-valued vectors. + + .. math:: + D(x, y) = \frac{1}{N} \sum_i \delta_{x_i, y_i} + """ + cdef inline {{INPUT_DTYPE_t}} dist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + cdef int n_unequal = 0 + cdef intp_t j + for j in range(size): + if x1[j] != x2[j]: + n_unequal += 1 + return float(n_unequal) / size + + + cdef inline {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + + cdef: + intp_t ix1, ix2 + intp_t i1 = x1_start + intp_t i2 = x2_start + + float64_t d = 0.0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + d += (x1_data[i1] != x2_data[i2]) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + d += (x1_data[i1] != 0) + i1 = i1 + 1 + else: + d += (x2_data[i2] != 0) + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + d += (x2_data[i2] != 0) + i2 = i2 + 1 + else: + while i1 < x1_end: + d += (x1_data[i1] != 0) + i1 = i1 + 1 + + d /= size + + return d + + +#------------------------------------------------------------ +# Canberra Distance +# D(x, y) = sum[ abs(x_i - y_i) / (abs(x_i) + abs(y_i)) ] +cdef class CanberraDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): + r"""Canberra Distance + + Canberra distance is meant for discrete-valued vectors, though it is + a valid metric for real-valued vectors. + + .. math:: + D(x, y) = \sum_i \frac{|x_i - y_i|}{|x_i| + |y_i|} + """ + cdef inline {{INPUT_DTYPE_t}} dist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + cdef float64_t denom, d = 0 + cdef intp_t j + for j in range(size): + denom = fabs(x1[j]) + fabs(x2[j]) + if denom > 0: + d += fabs(x1[j] - x2[j]) / denom + return d + + cdef inline {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + + cdef: + intp_t ix1, ix2 + intp_t i1 = x1_start + intp_t i2 = x2_start + + float64_t d = 0.0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + d += ( + fabs(x1_data[i1] - x2_data[i2]) / + (fabs(x1_data[i1]) + fabs(x2_data[i2])) + ) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + d += 1. + i1 = i1 + 1 + else: + d += 1. + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + d += 1. + i2 = i2 + 1 + else: + while i1 < x1_end: + d += 1. + i1 = i1 + 1 + + return d + +#------------------------------------------------------------ +# Bray-Curtis Distance +# D(x, y) = sum[abs(x_i - y_i)] / sum[abs(x_i) + abs(y_i)] +cdef class BrayCurtisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): + r"""Bray-Curtis Distance + + Bray-Curtis distance is meant for discrete-valued vectors, though it is + a valid metric for real-valued vectors. + + .. 
math:: + D(x, y) = \frac{\sum_i |x_i - y_i|}{\sum_i(|x_i| + |y_i|)} + """ + cdef inline {{INPUT_DTYPE_t}} dist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + cdef float64_t num = 0, denom = 0 + cdef intp_t j + for j in range(size): + num += fabs(x1[j] - x2[j]) + denom += fabs(x1[j]) + fabs(x2[j]) + if denom > 0: + return num / denom + else: + return 0.0 + + cdef inline {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + + cdef: + intp_t ix1, ix2 + intp_t i1 = x1_start + intp_t i2 = x2_start + + float64_t num = 0.0 + float64_t denom = 0.0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + num += fabs(x1_data[i1] - x2_data[i2]) + denom += fabs(x1_data[i1]) + fabs(x2_data[i2]) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + num += fabs(x1_data[i1]) + denom += fabs(x1_data[i1]) + i1 = i1 + 1 + else: + num += fabs(x2_data[i2]) + denom += fabs(x2_data[i2]) + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + num += fabs(x1_data[i1]) + denom += fabs(x1_data[i1]) + i2 = i2 + 1 + else: + while i1 < x1_end: + num += fabs(x2_data[i2]) + denom += fabs(x2_data[i2]) + i1 = i1 + 1 + + return num / denom + +#------------------------------------------------------------ +# Jaccard Distance (boolean) +# D(x, y) = N_unequal(x, y) / N_nonzero(x, y) +cdef class JaccardDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): + r"""Jaccard Distance + + Jaccard Distance is a dissimilarity measure for boolean-valued + vectors. All nonzero entries will be treated as True, zero entries will + be treated as False. + + D(x, y) = (N_TF + N_FT) / (N_TT + N_TF + N_FT) + """ + cdef inline {{INPUT_DTYPE_t}} dist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + cdef int tf1, tf2, n_eq = 0, nnz = 0 + cdef intp_t j + for j in range(size): + tf1 = x1[j] != 0 + tf2 = x2[j] != 0 + nnz += (tf1 or tf2) + n_eq += (tf1 and tf2) + # Based on https://github.com/scipy/scipy/pull/7373 + # When comparing two all-zero vectors, scipy>=1.2.0 jaccard metric + # was changed to return 0, instead of nan. 
+ if nnz == 0: + return 0 + return (nnz - n_eq) * 1.0 / nnz + + cdef inline {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + + cdef: + intp_t ix1, ix2 + intp_t i1 = x1_start + intp_t i2 = x2_start + + intp_t tf1, tf2, n_tt = 0, nnz = 0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + tf1 = x1_data[i1] != 0 + tf2 = x2_data[i2] != 0 + + if ix1 == ix2: + nnz += (tf1 or tf2) + n_tt += (tf1 and tf2) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + nnz += tf1 + i1 = i1 + 1 + else: + nnz += tf2 + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + tf2 = x2_data[i2] != 0 + nnz += tf2 + i2 = i2 + 1 + else: + while i1 < x1_end: + tf1 = x1_data[i1] != 0 + nnz += tf1 + i1 = i1 + 1 + + # Based on https://github.com/scipy/scipy/pull/7373 + # When comparing two all-zero vectors, scipy>=1.2.0 jaccard metric + # was changed to return 0, instead of nan. + if nnz == 0: + return 0 + return (nnz - n_tt) * 1.0 / nnz + +#------------------------------------------------------------ +# Matching Distance (boolean) +# D(x, y) = n_neq / n +cdef class MatchingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): + r"""Matching Distance + + Matching Distance is a dissimilarity measure for boolean-valued + vectors. All nonzero entries will be treated as True, zero entries will + be treated as False. + + D(x, y) = (N_TF + N_FT) / N + """ + cdef inline {{INPUT_DTYPE_t}} dist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + cdef int tf1, tf2, n_neq = 0 + cdef intp_t j + for j in range(size): + tf1 = x1[j] != 0 + tf2 = x2[j] != 0 + n_neq += (tf1 != tf2) + return n_neq * 1. / size + + cdef inline {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + + cdef: + intp_t ix1, ix2 + intp_t i1 = x1_start + intp_t i2 = x2_start + + intp_t tf1, tf2, n_neq = 0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + tf1 = x1_data[i1] != 0 + tf2 = x2_data[i2] != 0 + n_neq += (tf1 != tf2) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + n_neq += (x1_data[i1] != 0) + i1 = i1 + 1 + else: + n_neq += (x2_data[i2] != 0) + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + n_neq += (x2_data[i2] != 0) + i2 = i2 + 1 + else: + while i1 < x1_end: + n_neq += (x1_data[i1] != 0) + i1 = i1 + 1 + + return n_neq * 1.0 / size + +#------------------------------------------------------------ +# Dice Distance (boolean) +# D(x, y) = n_neq / (2 * ntt + n_neq) +cdef class DiceDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): + r"""Dice Distance + + Dice Distance is a dissimilarity measure for boolean-valued + vectors. All nonzero entries will be treated as True, zero entries will + be treated as False. 
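+    For example, with x = [1, 0, 1, 1] and y = [1, 1, 0, 1] (interpreted as
+    booleans) the counts are N_TT = 2, N_TF = 1 and N_FT = 1, so the
+    dissimilarity defined below is (1 + 1) / (2 * 2 + 1 + 1) = 1 / 3.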
+ + D(x, y) = (N_TF + N_FT) / (2 * N_TT + N_TF + N_FT) + + """ + cdef inline {{INPUT_DTYPE_t}} dist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + cdef int tf1, tf2, n_neq = 0, n_tt = 0 + cdef intp_t j + for j in range(size): + tf1 = x1[j] != 0 + tf2 = x2[j] != 0 + n_tt += (tf1 and tf2) + n_neq += (tf1 != tf2) + return n_neq / (2.0 * n_tt + n_neq) + + cdef inline {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + + cdef: + intp_t ix1, ix2 + intp_t i1 = x1_start + intp_t i2 = x2_start + + intp_t tf1, tf2, n_tt = 0, n_neq = 0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + tf1 = x1_data[i1] != 0 + tf2 = x2_data[i2] != 0 + + if ix1 == ix2: + n_tt += (tf1 and tf2) + n_neq += (tf1 != tf2) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + n_neq += tf1 + i1 = i1 + 1 + else: + n_neq += tf2 + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + tf2 = x2_data[i2] != 0 + n_neq += tf2 + i2 = i2 + 1 + else: + while i1 < x1_end: + tf1 = x1_data[i1] != 0 + n_neq += tf1 + i1 = i1 + 1 + + return n_neq / (2.0 * n_tt + n_neq) + + +#------------------------------------------------------------ +# Kulsinski Distance (boolean) +# D(x, y) = (ntf + nft - ntt + n) / (n_neq + n) +cdef class KulsinskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): + r"""Kulsinski Distance + + Kulsinski Distance is a dissimilarity measure for boolean-valued + vectors. All nonzero entries will be treated as True, zero entries will + be treated as False. + + D(x, y) = 1 - N_TT / (N + N_TF + N_FT) + + """ + cdef inline {{INPUT_DTYPE_t}} dist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + cdef int tf1, tf2, n_tt = 0, n_neq = 0 + cdef intp_t j + for j in range(size): + tf1 = x1[j] != 0 + tf2 = x2[j] != 0 + n_neq += (tf1 != tf2) + n_tt += (tf1 and tf2) + return (n_neq - n_tt + size) * 1.0 / (n_neq + size) + + cdef inline {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + + cdef: + intp_t ix1, ix2 + intp_t i1 = x1_start + intp_t i2 = x2_start + + intp_t tf1, tf2, n_tt = 0, n_neq = 0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + tf1 = x1_data[i1] != 0 + tf2 = x2_data[i2] != 0 + + if ix1 == ix2: + n_tt += (tf1 and tf2) + n_neq += (tf1 != tf2) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + n_neq += tf1 + i1 = i1 + 1 + else: + n_neq += tf2 + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + tf2 = x2_data[i2] != 0 + n_neq += tf2 + i2 = i2 + 1 + else: + while i1 < x1_end: + tf1 = x1_data[i1] != 0 + n_neq += tf1 + i1 = i1 + 1 + + return (n_neq - n_tt + size) * 1.0 / (n_neq + size) + +#------------------------------------------------------------ +# Rogers-Tanimoto Distance (boolean) +# D(x, y) = 2 * n_neq / (n + n_neq) +cdef class RogersTanimotoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): + r"""Rogers-Tanimoto Distance + + Rogers-Tanimoto Distance is a dissimilarity measure for boolean-valued + vectors. 
All nonzero entries will be treated as True, zero entries will + be treated as False. + + D(x, y) = 2 (N_TF + N_FT) / (N + N_TF + N_FT) + """ + cdef inline {{INPUT_DTYPE_t}} dist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + cdef int tf1, tf2, n_neq = 0 + cdef intp_t j + for j in range(size): + tf1 = x1[j] != 0 + tf2 = x2[j] != 0 + n_neq += (tf1 != tf2) + return (2.0 * n_neq) / (size + n_neq) + + cdef inline {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + + cdef: + intp_t ix1, ix2 + intp_t i1 = x1_start + intp_t i2 = x2_start + + intp_t tf1, tf2, n_neq = 0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + tf1 = x1_data[i1] != 0 + tf2 = x2_data[i2] != 0 + + if ix1 == ix2: + n_neq += (tf1 != tf2) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + n_neq += tf1 + i1 = i1 + 1 + else: + n_neq += tf2 + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + tf2 = x2_data[i2] != 0 + n_neq += tf2 + i2 = i2 + 1 + else: + while i1 < x1_end: + tf1 = x1_data[i1] != 0 + n_neq += tf1 + i1 = i1 + 1 + + return (2.0 * n_neq) / (size + n_neq) + +#------------------------------------------------------------ +# Russell-Rao Distance (boolean) +# D(x, y) = (n - ntt) / n +cdef class RussellRaoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): + r"""Russell-Rao Distance + + Russell-Rao Distance is a dissimilarity measure for boolean-valued + vectors. All nonzero entries will be treated as True, zero entries will + be treated as False. + + D(x, y) = (N - N_TT) / N + """ + cdef inline {{INPUT_DTYPE_t}} dist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + cdef int tf1, tf2, n_tt = 0 + cdef intp_t j + for j in range(size): + tf1 = x1[j] != 0 + tf2 = x2[j] != 0 + n_tt += (tf1 and tf2) + return (size - n_tt) * 1. / size + + cdef inline {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + + cdef: + intp_t ix1, ix2 + intp_t i1 = x1_start + intp_t i2 = x2_start + + intp_t tf1, tf2, n_tt = 0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + tf1 = x1_data[i1] != 0 + tf2 = x2_data[i2] != 0 + + if ix1 == ix2: + n_tt += (tf1 and tf2) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + i1 = i1 + 1 + else: + i2 = i2 + 1 + + # We don't need to go through all the longest + # vector because tf1 or tf2 will be false + # and thus n_tt won't be increased. + + return (size - n_tt) * 1. / size + + + +#------------------------------------------------------------ +# Sokal-Michener Distance (boolean) +# D(x, y) = 2 * n_neq / (n + n_neq) +cdef class SokalMichenerDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): + r"""Sokal-Michener Distance + + Sokal-Michener Distance is a dissimilarity measure for boolean-valued + vectors. All nonzero entries will be treated as True, zero entries will + be treated as False. 
+ + D(x, y) = 2 (N_TF + N_FT) / (N + N_TF + N_FT) + """ + cdef inline {{INPUT_DTYPE_t}} dist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + cdef int tf1, tf2, n_neq = 0 + cdef intp_t j + for j in range(size): + tf1 = x1[j] != 0 + tf2 = x2[j] != 0 + n_neq += (tf1 != tf2) + return (2.0 * n_neq) / (size + n_neq) + + cdef inline {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + + cdef: + intp_t ix1, ix2 + intp_t i1 = x1_start + intp_t i2 = x2_start + + intp_t tf1, tf2, n_neq = 0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + tf1 = x1_data[i1] != 0 + tf2 = x2_data[i2] != 0 + + if ix1 == ix2: + n_neq += (tf1 != tf2) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + n_neq += tf1 + i1 = i1 + 1 + else: + n_neq += tf2 + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + tf2 = x2_data[i2] != 0 + n_neq += tf2 + i2 = i2 + 1 + else: + while i1 < x1_end: + tf1 = x1_data[i1] != 0 + n_neq += tf1 + i1 = i1 + 1 + + return (2.0 * n_neq) / (size + n_neq) + +#------------------------------------------------------------ +# Sokal-Sneath Distance (boolean) +# D(x, y) = n_neq / (0.5 * n_tt + n_neq) +cdef class SokalSneathDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): + r"""Sokal-Sneath Distance + + Sokal-Sneath Distance is a dissimilarity measure for boolean-valued + vectors. All nonzero entries will be treated as True, zero entries will + be treated as False. + + D(x, y) = (N_TF + N_FT) / (N_TT / 2 + N_FT + N_TF) + """ + cdef inline {{INPUT_DTYPE_t}} dist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + cdef int tf1, tf2, n_tt = 0, n_neq = 0 + cdef intp_t j + for j in range(size): + tf1 = x1[j] != 0 + tf2 = x2[j] != 0 + n_neq += (tf1 != tf2) + n_tt += (tf1 and tf2) + return n_neq / (0.5 * n_tt + n_neq) + + cdef inline {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + + cdef: + intp_t ix1, ix2 + intp_t i1 = x1_start + intp_t i2 = x2_start + + intp_t tf1, tf2, n_tt = 0, n_neq = 0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + tf1 = x1_data[i1] != 0 + tf2 = x2_data[i2] != 0 + + if ix1 == ix2: + n_tt += (tf1 and tf2) + n_neq += (tf1 != tf2) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + n_neq += tf1 + i1 = i1 + 1 + else: + n_neq += tf2 + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + tf2 = x2_data[i2] != 0 + n_neq += tf2 + i2 = i2 + 1 + else: + while i1 < x1_end: + tf1 = x1_data[i1] != 0 + n_neq += tf1 + i1 = i1 + 1 + + return n_neq / (0.5 * n_tt + n_neq) + + +#------------------------------------------------------------ +# Haversine Distance (2 dimensional) +# D(x, y) = 2 arcsin{sqrt[sin^2 ((x1 - y1) / 2) +# + cos(x1) cos(y1) sin^2 ((x2 - y2) / 2)]} +cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): + """Haversine (Spherical) Distance + + The Haversine distance is the angular distance between two points on + the surface of a sphere. 
The first distance of each point is assumed + to be the latitude, the second is the longitude, given in radians. + The dimension of the points must be 2: + + D(x, y) = 2 arcsin[sqrt{sin^2((x1 - y1) / 2) + cos(x1)cos(y1)sin^2((x2 - y2) / 2)}] + + """ + + def _validate_data(self, X): + if X.shape[1] != 2: + raise ValueError("Haversine distance only valid " + "in 2 dimensions") + + cdef inline {{INPUT_DTYPE_t}} rdist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + cdef float64_t sin_0 = sin(0.5 * ((x1[0]) - (x2[0]))) + cdef float64_t sin_1 = sin(0.5 * ((x1[1]) - (x2[1]))) + return (sin_0 * sin_0 + cos(x1[0]) * cos(x2[0]) * sin_1 * sin_1) + + cdef inline {{INPUT_DTYPE_t}} dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + return 2 * asin(sqrt(self.rdist(x1, x2, size))) + + cdef inline {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: + return 2 * asin(sqrt(rdist)) + + cdef inline {{INPUT_DTYPE_t}} _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: + cdef float64_t tmp = sin(0.5 * dist) + return tmp * tmp + + def rdist_to_dist(self, rdist): + return 2 * np.arcsin(np.sqrt(rdist)) + + def dist_to_rdist(self, dist): + tmp = np.sin(0.5 * dist) + return tmp * tmp + + cdef inline {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + return 2 * asin(sqrt(self.rdist_csr( + x1_data, + x1_indices, + x2_data, + x2_indices, + x1_start, + x1_end, + x2_start, + x2_end, + size, + ))) + + cdef inline {{INPUT_DTYPE_t}} rdist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + + cdef: + intp_t ix1, ix2 + intp_t i1 = x1_start + intp_t i2 = x2_start + + float64_t x1_0 = 0 + float64_t x1_1 = 0 + float64_t x2_0 = 0 + float64_t x2_1 = 0 + float64_t sin_0 + float64_t sin_1 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + # Find the components in the 2D vectors to work with + x1_component = ix1 if (x1_start == 0) else ix1 % x1_start + x2_component = ix2 if (x2_start == 0) else ix2 % x2_start + + if x1_component == 0: + x1_0 = x1_data[i1] + else: + x1_1 = x1_data[i1] + + if x2_component == 0: + x2_0 = x2_data[i2] + else: + x2_1 = x2_data[i2] + + i1 = i1 + 1 + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + ix2 = x2_indices[i2] + x2_component = ix2 if (x2_start == 0) else ix2 % x2_start + if x2_component == 0: + x2_0 = x2_data[i2] + else: + x2_1 = x2_data[i2] + i2 = i2 + 1 + else: + while i1 < x1_end: + ix1 = x1_indices[i1] + x1_component = ix1 if (x1_start == 0) else ix1 % x1_start + if x1_component == 0: + x1_0 = x1_data[i1] + else: + x1_1 = x1_data[i1] + i1 = i1 + 1 + + sin_0 = sin(0.5 * (x1_0 - x2_0)) + sin_1 = sin(0.5 * (x1_1 - x2_1)) + + return (sin_0 * sin_0 + cos(x1_0) * cos(x2_0) * sin_1 * sin_1) + +#------------------------------------------------------------ +# User-defined distance +# +cdef class PyFuncDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): + """PyFunc Distance + + A user-defined distance + + Parameters + 
---------- + func : function + func should take two numpy arrays as input, and return a distance. + """ + def __init__(self, func, **kwargs): + self.func = func + self.kwargs = kwargs + + # in cython < 0.26, GIL was required to be acquired during definition of + # the function and inside the body of the function. This behaviour is not + # allowed in cython >= 0.26 since it is a redundant GIL acquisition. The + # only way to be back compatible is to inherit `dist` from the base class + # without GIL and called an inline `_dist` which acquire GIL. + cdef inline {{INPUT_DTYPE_t}} dist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + return self._dist(x1, x2, size) + + cdef inline {{INPUT_DTYPE_t}} _dist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 with gil: + cdef: + object x1arr = _buffer_to_ndarray{{name_suffix}}(x1, size) + object x2arr = _buffer_to_ndarray{{name_suffix}}(x2, size) + d = self.func(x1arr, x2arr, **self.kwargs) + try: + # Cython generates code here that results in a TypeError + # if d is the wrong type. + return d + except TypeError: + raise TypeError("Custom distance function must accept two " + "vectors and return a float.") + +{{endfor}} diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6b532e0fa8ff07a27111f86d2ccc36b8d48879b5 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/__init__.py @@ -0,0 +1,112 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +# +# Pairwise Distances Reductions +# ============================= +# +# Overview +# -------- +# +# This module provides routines to compute pairwise distances between a set +# of row vectors of X and another set of row vectors of Y and apply a +# reduction on top. The canonical example is the brute-force computation +# of the top k nearest neighbors by leveraging the arg-k-min reduction. +# +# The reduction takes a matrix of pairwise distances between rows of X and Y +# as input and outputs an aggregate data-structure for each row of X. The +# aggregate values are typically smaller than the number of rows in Y, hence +# the term reduction. +# +# For computational reasons, the reduction are performed on the fly on chunks +# of rows of X and Y so as to keep intermediate data-structures in CPU cache +# and avoid unnecessary round trips of large distance arrays with the RAM +# that would otherwise severely degrade the speed by making the overall +# processing memory-bound. +# +# Finally, the routines follow a generic parallelization template to process +# chunks of data with OpenMP loops (via Cython prange), either on rows of X +# or rows of Y depending on their respective sizes. +# +# +# Dispatching to specialized implementations +# ------------------------------------------ +# +# Dispatchers are meant to be used in the Python code. Under the hood, a +# dispatcher must only define the logic to choose at runtime to the correct +# dtype-specialized :class:`BaseDistancesReductionDispatcher` implementation based +# on the dtype of X and of Y. 
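+#
+# For illustration, a typical call site goes through the dispatcher's
+# `compute` classmethod (shapes and dtypes below are arbitrary examples):
+#
+#   import numpy as np
+#   from sklearn.metrics._pairwise_distances_reduction import ArgKmin
+#
+#   X = np.random.rand(100, 5)          # float64 -> ArgKmin64 backend
+#   Y = np.random.rand(1000, 5)
+#   indices = ArgKmin.compute(X, Y, k=3, metric="euclidean")
+#   # indices.shape == (100, 3); pass return_distance=True to also get
+#   # the sorted distances.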
+# +# +# High-level diagram +# ------------------ +# +# Legend: +# +# A ---⊳ B: A inherits from B +# A ---x B: A dispatches to B +# +# +# (base dispatcher) +# BaseDistancesReductionDispatcher +# ∆ +# | +# | +# +------------------+---------------+---------------+------------------+ +# | | | | +# | (dispatcher) (dispatcher) | +# | ArgKmin RadiusNeighbors | +# | | | | +# | | | | +# | | (float{32,64} implem.) | | +# | | BaseDistancesReduction{32,64} | | +# | | ∆ | | +# (dispatcher) | | | (dispatcher) +# ArgKminClassMode | | | RadiusNeighborsClassMode +# | | +----------+----------+ | | +# | | | | | | +# | | | | | | +# | x | | x | +# | +-------⊳ ArgKmin{32,64} RadiusNeighbors{32,64} ⊲---+ | +# x | | ∆ ∆ | | x +# ArgKminClassMode{32,64} | | | | RadiusNeighborsClassMode{32,64} +# ===================================== Specializations ============================================ +# | | | | +# | | | | +# x | | x +# EuclideanArgKmin{32,64} EuclideanRadiusNeighbors{32,64} +# +# +# For instance :class:`ArgKmin` dispatches to: +# - :class:`ArgKmin64` if X and Y are two `float64` array-likes +# - :class:`ArgKmin32` if X and Y are two `float32` array-likes +# +# In addition, if the metric parameter is set to "euclidean" or "sqeuclidean", +# then some direct subclass of `BaseDistancesReduction{32,64}` further dispatches +# to one of their subclass for euclidean-specialized implementation. For instance, +# :class:`ArgKmin64` dispatches to :class:`EuclideanArgKmin64`. +# +# Those Euclidean-specialized implementations relies on optimal implementations of +# a decomposition of the squared euclidean distance matrix into a sum of three terms +# (see :class:`MiddleTermComputer{32,64}`). +# + +from ._dispatcher import ( + ArgKmin, + ArgKminClassMode, + BaseDistancesReductionDispatcher, + RadiusNeighbors, + RadiusNeighborsClassMode, + sqeuclidean_row_norms, +) + +__all__ = [ + "ArgKmin", + "ArgKminClassMode", + "BaseDistancesReductionDispatcher", + "RadiusNeighbors", + "RadiusNeighborsClassMode", + "sqeuclidean_row_norms", +] + +# ruff: noqa: E501 diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/__pycache__/__init__.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..515308c6957d286fa9d4c33b1b96631d7c6efe38 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/__pycache__/__init__.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/__pycache__/_dispatcher.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/__pycache__/_dispatcher.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1d25737cf601efaf838210d2f5ae020490dcc4a5 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/__pycache__/_dispatcher.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp new file mode 100644 index 0000000000000000000000000000000000000000..f3a9ce96e64c00f2818b43d147baaa363d6895ee --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp @@ -0,0 +1,31 @@ +from 
...utils._typedefs cimport intp_t, float64_t + +{{for name_suffix in ['64', '32']}} + +from ._base cimport BaseDistancesReduction{{name_suffix}} +from ._middle_term_computer cimport MiddleTermComputer{{name_suffix}} + +cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): + """float{{name_suffix}} implementation of the ArgKmin.""" + + cdef: + intp_t k + + intp_t[:, ::1] argkmin_indices + float64_t[:, ::1] argkmin_distances + + # Used as array of pointers to private datastructures used in threads. + float64_t ** heaps_r_distances_chunks + intp_t ** heaps_indices_chunks + + +cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}): + """EuclideanDistance-specialisation of ArgKmin{{name_suffix}}.""" + cdef: + MiddleTermComputer{{name_suffix}} middle_term_computer + const float64_t[::1] X_norm_squared + const float64_t[::1] Y_norm_squared + + bint use_squared_distances + +{{endfor}} diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp new file mode 100644 index 0000000000000000000000000000000000000000..c21717554e94b22d48558811534ecbc08fe6dc52 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp @@ -0,0 +1,512 @@ +from libc.stdlib cimport free, malloc +from libc.float cimport DBL_MAX +from cython cimport final +from cython.parallel cimport parallel, prange + +from ...utils._heap cimport heap_push +from ...utils._sorting cimport simultaneous_sort +from ...utils._typedefs cimport intp_t, float64_t + +import numpy as np +import warnings + +from numbers import Integral +from scipy.sparse import issparse +from ...utils import check_array, check_scalar +from ...utils.fixes import _in_unstable_openblas_configuration +from ...utils.parallel import _get_threadpool_controller + +{{for name_suffix in ['64', '32']}} + +from ._base cimport ( + BaseDistancesReduction{{name_suffix}}, + _sqeuclidean_row_norms{{name_suffix}}, +) + +from ._datasets_pair cimport DatasetsPair{{name_suffix}} + +from ._middle_term_computer cimport MiddleTermComputer{{name_suffix}} + + +cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): + """float{{name_suffix}} implementation of the ArgKmin.""" + + @classmethod + def compute( + cls, + X, + Y, + intp_t k, + metric="euclidean", + chunk_size=None, + dict metric_kwargs=None, + str strategy=None, + bint return_distance=False, + ): + """Compute the argkmin reduction. + + This classmethod is responsible for introspecting the arguments + values to dispatch to the most appropriate implementation of + :class:`ArgKmin{{name_suffix}}`. + + This allows decoupling the API entirely from the implementation details + whilst maintaining RAII: all temporarily allocated datastructures necessary + for the concrete implementation are therefore freed when this classmethod + returns. + + No instance should directly be created outside of this class method. + """ + # Limit the number of threads in second level of nested parallelism for BLAS + # to avoid threads over-subscription (in DOT or GEMM for instance). + with _get_threadpool_controller().limit(limits=1, user_api='blas'): + if metric in ("euclidean", "sqeuclidean"): + # Specialized implementation of ArgKmin for the Euclidean distance + # for the dense-dense and sparse-sparse cases. 
+ # This implementation computes the distances by chunk using + # a decomposition of the Squared Euclidean distance. + # This specialisation has an improved arithmetic intensity for both + # the dense and sparse settings, allowing in most case speed-ups of + # several orders of magnitude compared to the generic ArgKmin + # implementation. + # Note that squared norms of X and Y are precomputed in the + # constructor of this class by issuing BLAS calls that may use + # multithreading (depending on the BLAS implementation), hence calling + # the constructor needs to be protected under the threadpool_limits + # context, along with the main calls to _parallel_on_Y and + # _parallel_on_X. + # For more information see MiddleTermComputer. + use_squared_distances = metric == "sqeuclidean" + pda = EuclideanArgKmin{{name_suffix}}( + X=X, Y=Y, k=k, + use_squared_distances=use_squared_distances, + chunk_size=chunk_size, + strategy=strategy, + metric_kwargs=metric_kwargs, + ) + else: + # Fall back on a generic implementation that handles most scipy + # metrics by computing the distances between 2 vectors at a time. + pda = ArgKmin{{name_suffix}}( + datasets_pair=DatasetsPair{{name_suffix}}.get_for(X, Y, metric, metric_kwargs), + k=k, + chunk_size=chunk_size, + strategy=strategy, + ) + + if pda.execute_in_parallel_on_Y: + pda._parallel_on_Y() + else: + pda._parallel_on_X() + + return pda._finalize_results(return_distance) + + def __init__( + self, + DatasetsPair{{name_suffix}} datasets_pair, + chunk_size=None, + strategy=None, + intp_t k=1, + ): + super().__init__( + datasets_pair=datasets_pair, + chunk_size=chunk_size, + strategy=strategy, + ) + self.k = check_scalar(k, "k", Integral, min_val=1) + + # Allocating pointers to datastructures but not the datastructures themselves. + # There are as many pointers as effective threads. + # + # For the sake of explicitness: + # - when parallelizing on X, the pointers of those heaps are referencing + # (with proper offsets) addresses of the two main heaps (see below) + # - when parallelizing on Y, the pointers of those heaps are referencing + # small heaps which are thread-wise-allocated and whose content will be + # merged with the main heaps'. + self.heaps_r_distances_chunks = malloc( + sizeof(float64_t *) * self.chunks_n_threads + ) + self.heaps_indices_chunks = malloc( + sizeof(intp_t *) * self.chunks_n_threads + ) + + # Main heaps which will be returned as results by `ArgKmin{{name_suffix}}.compute`. + self.argkmin_indices = np.full((self.n_samples_X, self.k), 0, dtype=np.intp) + self.argkmin_distances = np.full((self.n_samples_X, self.k), DBL_MAX, dtype=np.float64) + + def __dealloc__(self): + if self.heaps_indices_chunks is not NULL: + free(self.heaps_indices_chunks) + + if self.heaps_r_distances_chunks is not NULL: + free(self.heaps_r_distances_chunks) + + cdef void _compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil: + cdef: + intp_t i, j + intp_t n_samples_X = X_end - X_start + intp_t n_samples_Y = Y_end - Y_start + float64_t *heaps_r_distances = self.heaps_r_distances_chunks[thread_num] + intp_t *heaps_indices = self.heaps_indices_chunks[thread_num] + + # Pushing the distances and their associated indices on a heap + # which by construction will keep track of the argkmin. 
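+        #
+        # Each row i of this X chunk owns a fixed-size heap of k candidates:
+        # heap_push only keeps a (distance, index) pair when the distance is
+        # smaller than the largest value currently stored, so after scanning
+        # all chunks of Y the heap contains the k smallest surrogate
+        # distances for that row. Each push costs at most O(log k), giving
+        # O(n_samples_X * n_samples_Y * log k) work for this pair of chunks.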
+ for i in range(n_samples_X): + for j in range(n_samples_Y): + heap_push( + values=heaps_r_distances + i * self.k, + indices=heaps_indices + i * self.k, + size=self.k, + val=self.datasets_pair.surrogate_dist(X_start + i, Y_start + j), + val_idx=Y_start + j, + ) + + cdef void _parallel_on_X_init_chunk( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil: + # As this strategy is embarrassingly parallel, we can set each + # thread's heaps pointer to the proper position on the main heaps. + self.heaps_r_distances_chunks[thread_num] = &self.argkmin_distances[X_start, 0] + self.heaps_indices_chunks[thread_num] = &self.argkmin_indices[X_start, 0] + + cdef void _parallel_on_X_prange_iter_finalize( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil: + cdef: + intp_t idx + + # Sorting the main heaps portion associated to `X[X_start:X_end]` + # in ascending order w.r.t the distances. + for idx in range(X_end - X_start): + simultaneous_sort( + self.heaps_r_distances_chunks[thread_num] + idx * self.k, + self.heaps_indices_chunks[thread_num] + idx * self.k, + self.k + ) + + cdef void _parallel_on_Y_init( + self, + ) noexcept nogil: + cdef: + # Maximum number of scalar elements (the last chunks can be smaller) + intp_t heaps_size = self.X_n_samples_chunk * self.k + intp_t thread_num + + # The allocation is done in parallel for data locality purposes: this way + # the heaps used in each threads are allocated in pages which are closer + # to the CPU core used by the thread. + # See comments about First Touch Placement Policy: + # https://www.openmp.org/wp-content/uploads/openmp-webinar-vanderPas-20210318.pdf #noqa + for thread_num in prange(self.chunks_n_threads, schedule='static', nogil=True, + num_threads=self.chunks_n_threads): + # As chunks of X are shared across threads, so must their + # heaps. To solve this, each thread has its own heaps + # which are then synchronised back in the main ones. + self.heaps_r_distances_chunks[thread_num] = malloc( + heaps_size * sizeof(float64_t) + ) + self.heaps_indices_chunks[thread_num] = malloc( + heaps_size * sizeof(intp_t) + ) + + cdef void _parallel_on_Y_parallel_init( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil: + # Initialising heaps (memset can't be used here) + for idx in range(self.X_n_samples_chunk * self.k): + self.heaps_r_distances_chunks[thread_num][idx] = DBL_MAX + self.heaps_indices_chunks[thread_num][idx] = -1 + + @final + cdef void _parallel_on_Y_synchronize( + self, + intp_t X_start, + intp_t X_end, + ) noexcept nogil: + cdef: + intp_t idx, jdx, thread_num + with nogil, parallel(num_threads=self.effective_n_threads): + # Synchronising the thread heaps with the main heaps. + # This is done in parallel sample-wise (no need for locks). + # + # This might break each thread's data locality as each heap which + # was allocated in a thread is being now being used in several threads. + # + # Still, this parallel pattern has shown to be efficient in practice. 
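+            #
+            # Correctness note: for a given row of X, its global k smallest
+            # distances are necessarily among the k candidates retained by
+            # each thread-local heap, so pushing every thread-local candidate
+            # into the main heap is sufficient to recover the exact result.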
+ for idx in prange(X_end - X_start, schedule="static"): + for thread_num in range(self.chunks_n_threads): + for jdx in range(self.k): + heap_push( + values=&self.argkmin_distances[X_start + idx, 0], + indices=&self.argkmin_indices[X_start + idx, 0], + size=self.k, + val=self.heaps_r_distances_chunks[thread_num][idx * self.k + jdx], + val_idx=self.heaps_indices_chunks[thread_num][idx * self.k + jdx], + ) + + cdef void _parallel_on_Y_finalize( + self, + ) noexcept nogil: + cdef: + intp_t idx, thread_num + + with nogil, parallel(num_threads=self.chunks_n_threads): + # Deallocating temporary datastructures + for thread_num in prange(self.chunks_n_threads, schedule='static'): + free(self.heaps_r_distances_chunks[thread_num]) + free(self.heaps_indices_chunks[thread_num]) + + # Sorting the main in ascending order w.r.t the distances. + # This is done in parallel sample-wise (no need for locks). + for idx in prange(self.n_samples_X, schedule='static'): + simultaneous_sort( + &self.argkmin_distances[idx, 0], + &self.argkmin_indices[idx, 0], + self.k, + ) + return + + cdef void compute_exact_distances(self) noexcept nogil: + cdef: + intp_t i, j + float64_t[:, ::1] distances = self.argkmin_distances + for i in prange(self.n_samples_X, schedule='static', nogil=True, + num_threads=self.effective_n_threads): + for j in range(self.k): + distances[i, j] = self.datasets_pair.distance_metric._rdist_to_dist( + # Guard against potential -0., causing nan production. + max(distances[i, j], 0.) + ) + + def _finalize_results(self, bint return_distance=False): + if return_distance: + # We need to recompute distances because we relied on + # surrogate distances for the reduction. + self.compute_exact_distances() + + # Values are returned identically to the way `KNeighborsMixin.kneighbors` + # returns values. This is counter-intuitive but this allows not using + # complex adaptations where `ArgKmin.compute` is called. 
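+            # Both arrays have shape (n_samples_X, k); row i holds the k
+            # nearest neighbors of X[i] in Y, with distances sorted in
+            # ascending order and indices aligned with them.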
+ return np.asarray(self.argkmin_distances), np.asarray(self.argkmin_indices) + + return np.asarray(self.argkmin_indices) + + +cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}): + """EuclideanDistance-specialisation of ArgKmin{{name_suffix}}.""" + + @classmethod + def is_usable_for(cls, X, Y, metric) -> bool: + return (ArgKmin{{name_suffix}}.is_usable_for(X, Y, metric) and + not _in_unstable_openblas_configuration()) + + def __init__( + self, + X, + Y, + intp_t k, + bint use_squared_distances=False, + chunk_size=None, + strategy=None, + metric_kwargs=None, + ): + if ( + isinstance(metric_kwargs, dict) and + (metric_kwargs.keys() - {"X_norm_squared", "Y_norm_squared"}) + ): + warnings.warn( + f"Some metric_kwargs have been passed ({metric_kwargs}) but aren't " + f"usable for this case (EuclideanArgKmin64) and will be ignored.", + UserWarning, + stacklevel=3, + ) + + super().__init__( + # The datasets pair here is used for exact distances computations + datasets_pair=DatasetsPair{{name_suffix}}.get_for(X, Y, metric="euclidean"), + chunk_size=chunk_size, + strategy=strategy, + k=k, + ) + cdef: + intp_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk + + self.middle_term_computer = MiddleTermComputer{{name_suffix}}.get_for( + X, + Y, + self.effective_n_threads, + self.chunks_n_threads, + dist_middle_terms_chunks_size, + n_features=X.shape[1], + chunk_size=self.chunk_size, + ) + + if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs: + self.Y_norm_squared = check_array( + metric_kwargs.pop("Y_norm_squared"), + ensure_2d=False, + input_name="Y_norm_squared", + dtype=np.float64, + ) + else: + self.Y_norm_squared = _sqeuclidean_row_norms{{name_suffix}}( + Y, + self.effective_n_threads, + ) + + if metric_kwargs is not None and "X_norm_squared" in metric_kwargs: + self.X_norm_squared = check_array( + metric_kwargs.pop("X_norm_squared"), + ensure_2d=False, + input_name="X_norm_squared", + dtype=np.float64, + ) + else: + # Do not recompute norms if datasets are identical. 
+ self.X_norm_squared = ( + self.Y_norm_squared if X is Y else + _sqeuclidean_row_norms{{name_suffix}}( + X, + self.effective_n_threads, + ) + ) + + self.use_squared_distances = use_squared_distances + + @final + cdef void compute_exact_distances(self) noexcept nogil: + if not self.use_squared_distances: + ArgKmin{{name_suffix}}.compute_exact_distances(self) + + @final + cdef void _parallel_on_X_parallel_init( + self, + intp_t thread_num, + ) noexcept nogil: + ArgKmin{{name_suffix}}._parallel_on_X_parallel_init(self, thread_num) + self.middle_term_computer._parallel_on_X_parallel_init(thread_num) + + @final + cdef void _parallel_on_X_init_chunk( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil: + ArgKmin{{name_suffix}}._parallel_on_X_init_chunk(self, thread_num, X_start, X_end) + self.middle_term_computer._parallel_on_X_init_chunk(thread_num, X_start, X_end) + + @final + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil: + ArgKmin{{name_suffix}}._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + self.middle_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + X_start, X_end, Y_start, Y_end, thread_num, + ) + + @final + cdef void _parallel_on_Y_init( + self, + ) noexcept nogil: + ArgKmin{{name_suffix}}._parallel_on_Y_init(self) + self.middle_term_computer._parallel_on_Y_init() + + @final + cdef void _parallel_on_Y_parallel_init( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil: + ArgKmin{{name_suffix}}._parallel_on_Y_parallel_init(self, thread_num, X_start, X_end) + self.middle_term_computer._parallel_on_Y_parallel_init(thread_num, X_start, X_end) + + @final + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil: + ArgKmin{{name_suffix}}._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + self.middle_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + X_start, X_end, Y_start, Y_end, thread_num + ) + + @final + cdef void _compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil: + cdef: + intp_t i, j + float64_t sqeuclidean_dist_i_j + intp_t n_X = X_end - X_start + intp_t n_Y = Y_end - Y_start + float64_t * dist_middle_terms = self.middle_term_computer._compute_dist_middle_terms( + X_start, X_end, Y_start, Y_end, thread_num + ) + float64_t * heaps_r_distances = self.heaps_r_distances_chunks[thread_num] + intp_t * heaps_indices = self.heaps_indices_chunks[thread_num] + + # Pushing the distance and their associated indices on heaps + # which keep tracks of the argkmin. + for i in range(n_X): + for j in range(n_Y): + sqeuclidean_dist_i_j = ( + self.X_norm_squared[i + X_start] + + dist_middle_terms[i * n_Y + j] + + self.Y_norm_squared[j + Y_start] + ) + + # Catastrophic cancellation might cause -0. to be present, + # e.g. when computing d(x_i, y_i) when X is Y. 
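+                    # For instance, with x == y the three terms
+                    # ||x||^2 - 2 x.y + ||y||^2 may not cancel exactly in
+                    # floating point and can leave a tiny negative value.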
+ sqeuclidean_dist_i_j = max(0., sqeuclidean_dist_i_j) + + heap_push( + values=heaps_r_distances + i * self.k, + indices=heaps_indices + i * self.k, + size=self.k, + val=sqeuclidean_dist_i_j, + val_idx=j + Y_start, + ) + +{{endfor}} diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_argkmin_classmode.pyx.tp b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_argkmin_classmode.pyx.tp new file mode 100644 index 0000000000000000000000000000000000000000..51fb745dca78408b7829a8aeb324bb7f99631c6b --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_argkmin_classmode.pyx.tp @@ -0,0 +1,182 @@ +from cython cimport floating, integral +from cython.parallel cimport parallel, prange +from libcpp.map cimport map as cpp_map, pair as cpp_pair +from libc.stdlib cimport free + +from ...utils._typedefs cimport intp_t, float64_t +from ...utils.parallel import _get_threadpool_controller + +import numpy as np +from scipy.sparse import issparse +from ._classmode cimport WeightingStrategy + +{{for name_suffix in ["32", "64"]}} +from ._argkmin cimport ArgKmin{{name_suffix}} +from ._datasets_pair cimport DatasetsPair{{name_suffix}} + +cdef class ArgKminClassMode{{name_suffix}}(ArgKmin{{name_suffix}}): + """ + {{name_suffix}}bit implementation of ArgKminClassMode. + """ + cdef: + const intp_t[:] Y_labels, + const intp_t[:] unique_Y_labels + float64_t[:, :] class_scores + cpp_map[intp_t, intp_t] labels_to_index + WeightingStrategy weight_type + + @classmethod + def compute( + cls, + X, + Y, + intp_t k, + weights, + Y_labels, + unique_Y_labels, + str metric="euclidean", + chunk_size=None, + dict metric_kwargs=None, + str strategy=None, + ): + """Compute the argkmin reduction with Y_labels. + + This classmethod is responsible for introspecting the arguments + values to dispatch to the most appropriate implementation of + :class:`ArgKminClassMode{{name_suffix}}`. + + This allows decoupling the API entirely from the implementation details + whilst maintaining RAII: all temporarily allocated datastructures necessary + for the concrete implementation are therefore freed when this classmethod + returns. + + No instance _must_ directly be created outside of this class method. + """ + # Use a generic implementation that handles most scipy + # metrics by computing the distances between 2 vectors at a time. + pda = ArgKminClassMode{{name_suffix}}( + datasets_pair=DatasetsPair{{name_suffix}}.get_for(X, Y, metric, metric_kwargs), + k=k, + chunk_size=chunk_size, + strategy=strategy, + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + ) + + # Limit the number of threads in second level of nested parallelism for BLAS + # to avoid threads over-subscription (in GEMM for instance). 
+ with _get_threadpool_controller().limit(limits=1, user_api="blas"): + if pda.execute_in_parallel_on_Y: + pda._parallel_on_Y() + else: + pda._parallel_on_X() + + return pda._finalize_results() + + def __init__( + self, + DatasetsPair{{name_suffix}} datasets_pair, + const intp_t[:] Y_labels, + const intp_t[:] unique_Y_labels, + chunk_size=None, + strategy=None, + intp_t k=1, + weights=None, + ): + super().__init__( + datasets_pair=datasets_pair, + chunk_size=chunk_size, + strategy=strategy, + k=k, + ) + + if weights == "uniform": + self.weight_type = WeightingStrategy.uniform + elif weights == "distance": + self.weight_type = WeightingStrategy.distance + else: + self.weight_type = WeightingStrategy.callable + self.Y_labels = Y_labels + + self.unique_Y_labels = unique_Y_labels + + cdef intp_t idx, neighbor_class_idx + # Map from set of unique labels to their indices in `class_scores` + # Buffer used in building a histogram for one-pass weighted mode + self.class_scores = np.zeros( + (self.n_samples_X, unique_Y_labels.shape[0]), dtype=np.float64, + ) + + def _finalize_results(self): + probabilities = np.asarray(self.class_scores) + probabilities /= probabilities.sum(axis=1, keepdims=True) + return probabilities + + cdef inline void weighted_histogram_mode( + self, + intp_t sample_index, + intp_t* indices, + float64_t* distances, + ) noexcept nogil: + cdef: + intp_t neighbor_idx, neighbor_class_idx, label_index, multi_output_index + float64_t score_incr = 1 + # TODO: Implement other WeightingStrategy values + bint use_distance_weighting = ( + self.weight_type == WeightingStrategy.distance + ) + + # Iterate through the sample k-nearest neighbours + for neighbor_rank in range(self.k): + # Absolute indice of the neighbor_rank-th Nearest Neighbors + # in range [0, n_samples_Y) + # TODO: inspect if it worth permuting this condition + # and the for-loop above for improved branching. 
+ if use_distance_weighting: + score_incr = 1 / distances[neighbor_rank] + neighbor_idx = indices[neighbor_rank] + neighbor_class_idx = self.Y_labels[neighbor_idx] + self.class_scores[sample_index][neighbor_class_idx] += score_incr + return + + cdef void _parallel_on_X_prange_iter_finalize( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil: + cdef: + intp_t idx, sample_index + for idx in range(X_end - X_start): + # One-pass top-one weighted mode + # Compute the absolute index in [0, n_samples_X) + sample_index = X_start + idx + self.weighted_histogram_mode( + sample_index, + &self.heaps_indices_chunks[thread_num][idx * self.k], + &self.heaps_r_distances_chunks[thread_num][idx * self.k], + ) + return + + cdef void _parallel_on_Y_finalize( + self, + ) noexcept nogil: + cdef: + intp_t sample_index, thread_num + + with nogil, parallel(num_threads=self.chunks_n_threads): + # Deallocating temporary datastructures + for thread_num in prange(self.chunks_n_threads, schedule='static'): + free(self.heaps_r_distances_chunks[thread_num]) + free(self.heaps_indices_chunks[thread_num]) + + for sample_index in prange(self.n_samples_X, schedule='static'): + self.weighted_histogram_mode( + sample_index, + &self.argkmin_indices[sample_index][0], + &self.argkmin_distances[sample_index][0], + ) + return + +{{endfor}} diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp new file mode 100644 index 0000000000000000000000000000000000000000..9578129993c37d392853f97ede19b5ce201a422f --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp @@ -0,0 +1,135 @@ +from cython cimport final + +from ...utils._typedefs cimport intp_t, float64_t + +{{for name_suffix in ['64', '32']}} + +from ._datasets_pair cimport DatasetsPair{{name_suffix}} + + +cpdef float64_t[::1] _sqeuclidean_row_norms{{name_suffix}}( + X, + intp_t num_threads, +) + +cdef class BaseDistancesReduction{{name_suffix}}: + """ + Base float{{name_suffix}} implementation template of the pairwise-distances + reduction backends. + + Implementations inherit from this template and may override the several + defined hooks as needed in order to easily extend functionality with + minimal redundant code. + """ + + cdef: + readonly DatasetsPair{{name_suffix}} datasets_pair + + # The number of threads that can be used is stored in effective_n_threads. + # + # The number of threads to use in the parallelization strategy + # (i.e. parallel_on_X or parallel_on_Y) can be smaller than effective_n_threads: + # for small datasets, fewer threads might be needed to loop over pair of chunks. + # + # Hence, the number of threads that _will_ be used for looping over chunks + # is stored in chunks_n_threads, allowing solely using what we need. 
+ # + # Thus, an invariant is: + # + # chunks_n_threads <= effective_n_threads + # + intp_t effective_n_threads + intp_t chunks_n_threads + + intp_t n_samples_chunk, chunk_size + + intp_t n_samples_X, X_n_samples_chunk, X_n_chunks, X_n_samples_last_chunk + intp_t n_samples_Y, Y_n_samples_chunk, Y_n_chunks, Y_n_samples_last_chunk + + bint execute_in_parallel_on_Y + + @final + cdef void _parallel_on_X(self) noexcept nogil + + @final + cdef void _parallel_on_Y(self) noexcept nogil + + # Placeholder methods which have to be implemented + + cdef void _compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil + + + # Placeholder methods which can be implemented + + cdef void compute_exact_distances(self) noexcept nogil + + cdef void _parallel_on_X_parallel_init( + self, + intp_t thread_num, + ) noexcept nogil + + cdef void _parallel_on_X_init_chunk( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil + + cdef void _parallel_on_X_prange_iter_finalize( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil + + cdef void _parallel_on_X_parallel_finalize( + self, + intp_t thread_num + ) noexcept nogil + + cdef void _parallel_on_Y_init( + self, + ) noexcept nogil + + cdef void _parallel_on_Y_parallel_init( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil + + cdef void _parallel_on_Y_synchronize( + self, + intp_t X_start, + intp_t X_end, + ) noexcept nogil + + cdef void _parallel_on_Y_finalize( + self, + ) noexcept nogil +{{endfor}} diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp new file mode 100644 index 0000000000000000000000000000000000000000..2bbfd74e2c2c399297f7836ffb6b973f8318a9e4 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp @@ -0,0 +1,504 @@ +from cython cimport final +from cython.operator cimport dereference as deref +from cython.parallel cimport parallel, prange +from libcpp.vector cimport vector + +from ...utils._cython_blas cimport _dot +from ...utils._openmp_helpers cimport omp_get_thread_num +from ...utils._typedefs cimport intp_t, float32_t, float64_t, int32_t + +import numpy as np + +from scipy.sparse import issparse +from numbers import Integral +from sklearn import get_config +from sklearn.utils import check_scalar +from ...utils._openmp_helpers import _openmp_effective_n_threads + +##################### + +cdef float64_t[::1] _sqeuclidean_row_norms64_dense( + const float64_t[:, ::1] X, + intp_t num_threads, +): + """Compute the squared euclidean norm of the rows of X in parallel. + + This is faster than using np.einsum("ij, ij->i") even when using a single thread. + """ + cdef: + # Casting for X to remove the const qualifier is needed because APIs + # exposed via scipy.linalg.cython_blas aren't reflecting the arguments' + # const qualifier. 
+ # See: https://github.com/scipy/scipy/issues/14262 + float64_t * X_ptr = &X[0, 0] + intp_t idx = 0 + intp_t n = X.shape[0] + intp_t d = X.shape[1] + float64_t[::1] squared_row_norms = np.empty(n, dtype=np.float64) + + for idx in prange(n, schedule='static', nogil=True, num_threads=num_threads): + squared_row_norms[idx] = _dot(d, X_ptr + idx * d, 1, X_ptr + idx * d, 1) + + return squared_row_norms + + +cdef float64_t[::1] _sqeuclidean_row_norms32_dense( + const float32_t[:, ::1] X, + intp_t num_threads, +): + """Compute the squared euclidean norm of the rows of X in parallel. + + This is faster than using np.einsum("ij, ij->i") even when using a single thread. + """ + cdef: + # Casting for X to remove the const qualifier is needed because APIs + # exposed via scipy.linalg.cython_blas aren't reflecting the arguments' + # const qualifier. + # See: https://github.com/scipy/scipy/issues/14262 + float32_t * X_ptr = &X[0, 0] + intp_t i = 0, j = 0 + intp_t thread_num + intp_t n = X.shape[0] + intp_t d = X.shape[1] + float64_t[::1] squared_row_norms = np.empty(n, dtype=np.float64) + + # To upcast the i-th row of X from float32 to float64 + vector[vector[float64_t]] X_i_upcast = vector[vector[float64_t]]( + num_threads, vector[float64_t](d) + ) + + with nogil, parallel(num_threads=num_threads): + thread_num = omp_get_thread_num() + + for i in prange(n, schedule='static'): + # Upcasting the i-th row of X from float32 to float64 + for j in range(d): + X_i_upcast[thread_num][j] = deref(X_ptr + i * d + j) + + squared_row_norms[i] = _dot( + d, X_i_upcast[thread_num].data(), 1, + X_i_upcast[thread_num].data(), 1, + ) + + return squared_row_norms + + +cdef float64_t[::1] _sqeuclidean_row_norms64_sparse( + const float64_t[:] X_data, + const int32_t[:] X_indptr, + intp_t num_threads, +): + cdef: + intp_t n = X_indptr.shape[0] - 1 + int32_t X_i_ptr, idx = 0 + float64_t[::1] squared_row_norms = np.zeros(n, dtype=np.float64) + + for idx in prange(n, schedule='static', nogil=True, num_threads=num_threads): + for X_i_ptr in range(X_indptr[idx], X_indptr[idx+1]): + squared_row_norms[idx] += X_data[X_i_ptr] * X_data[X_i_ptr] + + return squared_row_norms + + +{{for name_suffix in ["64", "32"]}} + +from ._datasets_pair cimport DatasetsPair{{name_suffix}} + + +cpdef float64_t[::1] _sqeuclidean_row_norms{{name_suffix}}( + X, + intp_t num_threads, +): + if issparse(X): + # TODO: remove this instruction which is a cast in the float32 case + # by moving squared row norms computations in MiddleTermComputer. + X_data = np.asarray(X.data, dtype=np.float64) + X_indptr = np.asarray(X.indptr, dtype=np.int32) + return _sqeuclidean_row_norms64_sparse(X_data, X_indptr, num_threads) + else: + return _sqeuclidean_row_norms{{name_suffix}}_dense(X, num_threads) + + +cdef class BaseDistancesReduction{{name_suffix}}: + """ + Base float{{name_suffix}} implementation template of the pairwise-distances + reduction backends. + + Implementations inherit from this template and may override the several + defined hooks as needed in order to easily extend functionality with + minimal redundant code. 
+ """ + + def __init__( + self, + DatasetsPair{{name_suffix}} datasets_pair, + chunk_size=None, + strategy=None, + ): + cdef: + intp_t X_n_full_chunks, Y_n_full_chunks + + if chunk_size is None: + chunk_size = get_config().get("pairwise_dist_chunk_size", 256) + + self.chunk_size = check_scalar(chunk_size, "chunk_size", Integral, min_val=20) + + self.effective_n_threads = _openmp_effective_n_threads() + + self.datasets_pair = datasets_pair + + self.n_samples_X = datasets_pair.n_samples_X() + self.X_n_samples_chunk = min(self.n_samples_X, self.chunk_size) + X_n_full_chunks = self.n_samples_X // self.X_n_samples_chunk + X_n_samples_remainder = self.n_samples_X % self.X_n_samples_chunk + self.X_n_chunks = X_n_full_chunks + (X_n_samples_remainder != 0) + + if X_n_samples_remainder != 0: + self.X_n_samples_last_chunk = X_n_samples_remainder + else: + self.X_n_samples_last_chunk = self.X_n_samples_chunk + + self.n_samples_Y = datasets_pair.n_samples_Y() + self.Y_n_samples_chunk = min(self.n_samples_Y, self.chunk_size) + Y_n_full_chunks = self.n_samples_Y // self.Y_n_samples_chunk + Y_n_samples_remainder = self.n_samples_Y % self.Y_n_samples_chunk + self.Y_n_chunks = Y_n_full_chunks + (Y_n_samples_remainder != 0) + + if Y_n_samples_remainder != 0: + self.Y_n_samples_last_chunk = Y_n_samples_remainder + else: + self.Y_n_samples_last_chunk = self.Y_n_samples_chunk + + if strategy is None: + strategy = get_config().get("pairwise_dist_parallel_strategy", 'auto') + + if strategy not in ('parallel_on_X', 'parallel_on_Y', 'auto'): + raise RuntimeError(f"strategy must be 'parallel_on_X, 'parallel_on_Y', " + f"or 'auto', but currently strategy='{self.strategy}'.") + + if strategy == 'auto': + # This is a simple heuristic whose constant for the + # comparison has been chosen based on experiments. + # parallel_on_X has less synchronization overhead than + # parallel_on_Y and should therefore be used whenever + # n_samples_X is large enough to not starve any of the + # available hardware threads. + if self.n_samples_Y < self.n_samples_X: + # No point to even consider parallelizing on Y in this case. This + # is in particular important to do this on machines with a large + # number of hardware threads. + strategy = 'parallel_on_X' + elif 4 * self.chunk_size * self.effective_n_threads < self.n_samples_X: + # If Y is larger than X, but X is still large enough to allow for + # parallelism, we might still want to favor parallelizing on X. + strategy = 'parallel_on_X' + else: + strategy = 'parallel_on_Y' + + self.execute_in_parallel_on_Y = strategy == "parallel_on_Y" + + # Not using less, not using more. + self.chunks_n_threads = min( + self.Y_n_chunks if self.execute_in_parallel_on_Y else self.X_n_chunks, + self.effective_n_threads, + ) + + @final + cdef void _parallel_on_X(self) noexcept nogil: + """Perform computation and reduction in parallel on chunks of X. + + This strategy dispatches tasks statically on threads. Each task + processes exactly only one chunk of X, computing and reducing + distances matrices between vectors of this chunk and vectors of all + chunks of Y, one chunk of Y at a time. + + This strategy is embarrassingly parallel with no intermediate data + structures synchronization at all. + + Private datastructures are modified internally by threads. + + Private template methods can be implemented on subclasses to + interact with those datastructures at various stages. 
+ """ + cdef: + intp_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx + intp_t thread_num + + with nogil, parallel(num_threads=self.chunks_n_threads): + thread_num = omp_get_thread_num() + + # Allocating thread datastructures + self._parallel_on_X_parallel_init(thread_num) + + for X_chunk_idx in prange(self.X_n_chunks, schedule='static'): + X_start = X_chunk_idx * self.X_n_samples_chunk + if X_chunk_idx == self.X_n_chunks - 1: + X_end = X_start + self.X_n_samples_last_chunk + else: + X_end = X_start + self.X_n_samples_chunk + + # Reinitializing thread datastructures for the new X chunk + self._parallel_on_X_init_chunk(thread_num, X_start, X_end) + + for Y_chunk_idx in range(self.Y_n_chunks): + Y_start = Y_chunk_idx * self.Y_n_samples_chunk + if Y_chunk_idx == self.Y_n_chunks - 1: + Y_end = Y_start + self.Y_n_samples_last_chunk + else: + Y_end = Y_start + self.Y_n_samples_chunk + + self._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + + self._compute_and_reduce_distances_on_chunks( + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + + # Adjusting thread datastructures on the full pass on Y + self._parallel_on_X_prange_iter_finalize(thread_num, X_start, X_end) + + # end: for X_chunk_idx + + # Deallocating thread datastructures + self._parallel_on_X_parallel_finalize(thread_num) + + # end: with nogil, parallel + return + + @final + cdef void _parallel_on_Y(self) noexcept nogil: + """Perform computation and reduction in parallel on chunks of Y. + + This strategy is a sequence of embarrassingly parallel subtasks: + chunks of X are iterated over sequentially, and for each chunk of X, + tasks are dispatched statically on threads. Each task processes one + and only one chunk of Y, computing and reducing distances matrices + between vectors of the chunk of X and vectors of the Y. + + It comes with lock-free and parallelized intermediate data structures + that synchronize at each iteration of the sequential outer loop on X + chunks. + + Private datastructures are modified internally by threads. + + Private template methods can be implemented on subclasses to + interact with those datastructures at various stages. 
+ """ + cdef: + intp_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx + intp_t thread_num + + # Allocating datastructures shared by all threads + self._parallel_on_Y_init() + + for X_chunk_idx in range(self.X_n_chunks): + X_start = X_chunk_idx * self.X_n_samples_chunk + if X_chunk_idx == self.X_n_chunks - 1: + X_end = X_start + self.X_n_samples_last_chunk + else: + X_end = X_start + self.X_n_samples_chunk + + with nogil, parallel(num_threads=self.chunks_n_threads): + thread_num = omp_get_thread_num() + + # Initializing datastructures used in this thread + self._parallel_on_Y_parallel_init(thread_num, X_start, X_end) + + for Y_chunk_idx in prange(self.Y_n_chunks, schedule='static'): + Y_start = Y_chunk_idx * self.Y_n_samples_chunk + if Y_chunk_idx == self.Y_n_chunks - 1: + Y_end = Y_start + self.Y_n_samples_last_chunk + else: + Y_end = Y_start + self.Y_n_samples_chunk + + self._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + + self._compute_and_reduce_distances_on_chunks( + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + # end: prange + + # end: with nogil, parallel + + # Synchronizing the thread datastructures with the main ones + self._parallel_on_Y_synchronize(X_start, X_end) + + # end: for X_chunk_idx + # Deallocating temporary datastructures and adjusting main datastructures + self._parallel_on_Y_finalize() + return + + # Placeholder methods which have to be implemented + + cdef void _compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil: + """Compute the pairwise distances on two chunks of X and Y and reduce them. + + This is THE core computational method of BaseDistancesReduction{{name_suffix}}. + This must be implemented in subclasses agnostically from the parallelization + strategies. + """ + return + + def _finalize_results(self, bint return_distance): + """Callback adapting datastructures before returning results. + + This must be implemented in subclasses. + """ + return None + + # Placeholder methods which can be implemented + + cdef void compute_exact_distances(self) noexcept nogil: + """Convert rank-preserving distances to exact distances or recompute them.""" + return + + cdef void _parallel_on_X_parallel_init( + self, + intp_t thread_num, + ) noexcept nogil: + """Allocate datastructures used in a thread given its number.""" + return + + cdef void _parallel_on_X_init_chunk( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil: + """Initialize datastructures used in a thread given its number. + + In this method, EuclideanDistance specialisations of subclass of + BaseDistancesReduction _must_ call: + + self.middle_term_computer._parallel_on_X_init_chunk( + thread_num, X_start, X_end, + ) + + to ensure the proper upcast of X[X_start:X_end] to float64 prior + to the reduction with float64 accumulator buffers when X.dtype is + float32. + """ + return + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil: + """Initialize datastructures just before the _compute_and_reduce_distances_on_chunks. 
+ + In this method, EuclideanDistance specialisations of subclass of + BaseDistancesReduction _must_ call: + + self.middle_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + X_start, X_end, Y_start, Y_end, thread_num, + ) + + to ensure the proper upcast of Y[Y_start:Y_end] to float64 prior + to the reduction with float64 accumulator buffers when Y.dtype is + float32. + """ + return + + cdef void _parallel_on_X_prange_iter_finalize( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil: + """Interact with datastructures after a reduction on chunks.""" + return + + cdef void _parallel_on_X_parallel_finalize( + self, + intp_t thread_num + ) noexcept nogil: + """Interact with datastructures after executing all the reductions.""" + return + + cdef void _parallel_on_Y_init( + self, + ) noexcept nogil: + """Allocate datastructures used in all threads.""" + return + + cdef void _parallel_on_Y_parallel_init( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil: + """Initialize datastructures used in a thread given its number. + + In this method, EuclideanDistance specialisations of subclass of + BaseDistancesReduction _must_ call: + + self.middle_term_computer._parallel_on_Y_parallel_init( + thread_num, X_start, X_end, + ) + + to ensure the proper upcast of X[X_start:X_end] to float64 prior + to the reduction with float64 accumulator buffers when X.dtype is + float32. + """ + return + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil: + """Initialize datastructures just before the _compute_and_reduce_distances_on_chunks. + + In this method, EuclideanDistance specialisations of subclass of + BaseDistancesReduction _must_ call: + + self.middle_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + X_start, X_end, Y_start, Y_end, thread_num, + ) + + to ensure the proper upcast of Y[Y_start:Y_end] to float64 prior + to the reduction with float64 accumulator buffers when Y.dtype is + float32. 
+ """ + return + + cdef void _parallel_on_Y_synchronize( + self, + intp_t X_start, + intp_t X_end, + ) noexcept nogil: + """Update thread datastructures before leaving a parallel region.""" + return + + cdef void _parallel_on_Y_finalize( + self, + ) noexcept nogil: + """Update datastructures after executing all the reductions.""" + return + +{{endfor}} diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_classmode.pxd b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_classmode.pxd new file mode 100644 index 0000000000000000000000000000000000000000..65db044d668e89cc0a681a871663220d065dca41 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_classmode.pxd @@ -0,0 +1,5 @@ +cpdef enum WeightingStrategy: + uniform = 0 + # TODO: Implement the following options in weighted_histogram_mode + distance = 1 + callable = 2 diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp new file mode 100644 index 0000000000000000000000000000000000000000..1e57b3291a8f4be47902a1c4c26c1a41d1f43297 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp @@ -0,0 +1,67 @@ +{{py: + +implementation_specific_values = [ + # Values are the following ones: + # + # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE + ('64', 'DistanceMetric64', 'float64_t'), + ('32', 'DistanceMetric32', 'float32_t') +] + +}} +from ...utils._typedefs cimport float64_t, float32_t, int32_t, intp_t +from ...metrics._dist_metrics cimport DistanceMetric64, DistanceMetric32, DistanceMetric + +{{for name_suffix, DistanceMetric, INPUT_DTYPE_t in implementation_specific_values}} + + +cdef class DatasetsPair{{name_suffix}}: + cdef: + {{DistanceMetric}} distance_metric + intp_t n_features + + cdef intp_t n_samples_X(self) noexcept nogil + + cdef intp_t n_samples_Y(self) noexcept nogil + + cdef float64_t dist(self, intp_t i, intp_t j) noexcept nogil + + cdef float64_t surrogate_dist(self, intp_t i, intp_t j) noexcept nogil + + +cdef class DenseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): + cdef: + const {{INPUT_DTYPE_t}}[:, ::1] X + const {{INPUT_DTYPE_t}}[:, ::1] Y + + +cdef class SparseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): + cdef: + const {{INPUT_DTYPE_t}}[:] X_data + const int32_t[::1] X_indices + const int32_t[::1] X_indptr + + const {{INPUT_DTYPE_t}}[:] Y_data + const int32_t[::1] Y_indices + const int32_t[::1] Y_indptr + + +cdef class SparseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): + cdef: + const {{INPUT_DTYPE_t}}[:] X_data + const int32_t[::1] X_indices + const int32_t[::1] X_indptr + + const {{INPUT_DTYPE_t}}[:] Y_data + const int32_t[::1] Y_indices + intp_t n_Y + + +cdef class DenseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): + cdef: + # As distance metrics are commutative, we can simply rely + # on the implementation of SparseDenseDatasetsPair and + # swap arguments. 
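As an illustrative aside (not part of the vendored file), the delegation-by-swapping pattern described in the comment above relies on metric symmetry, dist(x, y) == dist(y, x); a minimal pure-Python sketch with hypothetical names:

class SwappedDatasetsPair:
    # Answers dense-sparse queries by delegating to a pair built with the
    # operands reversed (sparse-dense), then swapping the row indices.
    def __init__(self, inner_pair):
        self.inner = inner_pair  # e.g. a sparse-dense pair constructed with (Y, X)

    def n_samples_X(self):
        return self.inner.n_samples_Y()

    def dist(self, i, j):
        # For a symmetric metric, swapping indices is equivalent to swapping datasets.
        return self.inner.dist(j, i)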
+ DatasetsPair{{name_suffix}} datasets_pair + +{{endfor}} diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp new file mode 100644 index 0000000000000000000000000000000000000000..2c3ca44047145e98ca4b446a85db87b1c2ecd2c2 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp @@ -0,0 +1,406 @@ +import copy + +{{py: + +implementation_specific_values = [ + # Values are the following ones: + # + # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE + ('64', 'DistanceMetric64', 'float64_t', 'np.float64'), + ('32', 'DistanceMetric32', 'float32_t', 'np.float32') +] + +}} +import numpy as np + +from cython cimport final + +from ...utils._typedefs cimport float64_t, float32_t, intp_t + +from scipy.sparse import issparse, csr_matrix + +{{for name_suffix, DistanceMetric, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +cdef class DatasetsPair{{name_suffix}}: + """Abstract class which wraps a pair of datasets (X, Y). + + This class allows computing distances between a single pair of rows of + of X and Y at a time given the pair of their indices (i, j). This class is + specialized for each metric thanks to the :func:`get_for` factory classmethod. + + The handling of parallelization over chunks to compute the distances + and aggregation for several rows at a time is done in dedicated + subclasses of :class:`BaseDistancesReductionDispatcher` that in-turn rely on + subclasses of :class:`DatasetsPair` for each pair of rows in the data. The + goal is to make it possible to decouple the generic parallelization and + aggregation logic from metric-specific computation as much as possible. + + X and Y can be stored as C-contiguous np.ndarrays or CSR matrices + in subclasses. + + This class avoids the overhead of dispatching distance computations + to :class:`sklearn.metrics.DistanceMetric` based on the physical + representation of the vectors (sparse vs. dense). It makes use of + cython.final to remove the overhead of dispatching method calls. + + Parameters + ---------- + distance_metric: {{DistanceMetric}} + The distance metric responsible for computing distances + between two vectors of (X, Y). + """ + + @classmethod + def get_for( + cls, + X, + Y, + metric="euclidean", + dict metric_kwargs=None, + ) -> DatasetsPair{{name_suffix}}: + """Return the DatasetsPair implementation for the given arguments. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples_X, n_features) + Input data. + If provided as a ndarray, it must be C-contiguous. + If provided as a sparse matrix, it must be in CSR format. + + Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features) + Input data. + If provided as a ndarray, it must be C-contiguous. + If provided as a sparse matrix, it must be in CSR format. + + metric : str or DistanceMetric object, default='euclidean' + The distance metric to compute between rows of X and Y. + The default metric is a fast implementation of the Euclidean + metric. For a list of available metrics, see the documentation + of :class:`~sklearn.metrics.DistanceMetric`. + + metric_kwargs : dict, default=None + Keyword arguments to pass to specified metric function. + + Returns + ------- + datasets_pair: DatasetsPair{{name_suffix}} + The suited DatasetsPair{{name_suffix}} implementation. 
+ """ + # X_norm_squared and Y_norm_squared might be propagated + # down to DatasetsPairs via metrics_kwargs when the Euclidean + # specialisations can't be used. + # To prevent X_norm_squared and Y_norm_squared to be passed + # down to DistanceMetrics (whose constructors would raise + # a RuntimeError), we pop them here. + if metric_kwargs is not None: + # Copying metric_kwargs not to pop "X_norm_squared" + # and "Y_norm_squared" where they are used + metric_kwargs = copy.copy(metric_kwargs) + metric_kwargs.pop("X_norm_squared", None) + metric_kwargs.pop("Y_norm_squared", None) + cdef: + {{DistanceMetric}} distance_metric = DistanceMetric.get_metric( + metric, + {{INPUT_DTYPE}}, + **(metric_kwargs or {}) + ) + + # Metric-specific checks that do not replace nor duplicate `check_array`. + distance_metric._validate_data(X) + distance_metric._validate_data(Y) + + X_is_sparse = issparse(X) + Y_is_sparse = issparse(Y) + + if not X_is_sparse and not Y_is_sparse: + return DenseDenseDatasetsPair{{name_suffix}}(X, Y, distance_metric) + + if X_is_sparse and Y_is_sparse: + return SparseSparseDatasetsPair{{name_suffix}}(X, Y, distance_metric) + + if X_is_sparse and not Y_is_sparse: + return SparseDenseDatasetsPair{{name_suffix}}(X, Y, distance_metric) + + return DenseSparseDatasetsPair{{name_suffix}}(X, Y, distance_metric) + + @classmethod + def unpack_csr_matrix(cls, X: csr_matrix): + """Ensure that the CSR matrix is indexed with np.int32.""" + X_data = np.asarray(X.data, dtype={{INPUT_DTYPE}}) + X_indices = np.asarray(X.indices, dtype=np.int32) + X_indptr = np.asarray(X.indptr, dtype=np.int32) + return X_data, X_indices, X_indptr + + def __init__(self, {{DistanceMetric}} distance_metric, intp_t n_features): + self.distance_metric = distance_metric + self.n_features = n_features + + cdef intp_t n_samples_X(self) noexcept nogil: + """Number of samples in X.""" + # This is a abstract method. + # This _must_ always be overwritten in subclasses. + # TODO: add "with gil: raise" here when supporting Cython 3.0 + return -999 + + cdef intp_t n_samples_Y(self) noexcept nogil: + """Number of samples in Y.""" + # This is a abstract method. + # This _must_ always be overwritten in subclasses. + # TODO: add "with gil: raise" here when supporting Cython 3.0 + return -999 + + cdef float64_t surrogate_dist(self, intp_t i, intp_t j) noexcept nogil: + return self.dist(i, j) + + cdef float64_t dist(self, intp_t i, intp_t j) noexcept nogil: + # This is a abstract method. + # This _must_ always be overwritten in subclasses. + # TODO: add "with gil: raise" here when supporting Cython 3.0 + return -1 + +@final +cdef class DenseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): + """Compute distances between row vectors of two arrays. + + Parameters + ---------- + X: ndarray of shape (n_samples_X, n_features) + Rows represent vectors. Must be C-contiguous. + + Y: ndarray of shape (n_samples_Y, n_features) + Rows represent vectors. Must be C-contiguous. + + distance_metric: DistanceMetric + The distance metric responsible for computing distances + between two row vectors of (X, Y). 
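As a side note (not part of the vendored file), `surrogate_dist` falls back to the true distance in the base class above, but concrete metrics typically return a cheaper rank-preserving proxy; for the Euclidean metric that proxy is the squared distance, as this small NumPy sketch illustrates:

import numpy as np

rng = np.random.default_rng(0)
x = rng.standard_normal(8)
Y = rng.standard_normal((20, 8))

rdist = ((Y - x) ** 2).sum(axis=1)  # rank-preserving surrogate: no sqrt needed
dist = np.sqrt(rdist)               # exact Euclidean distance
# Both orderings agree, so reductions (argkmin, radius tests) can run on rdist
# and convert to exact distances only once, at the very end.
print(np.array_equal(np.argsort(rdist), np.argsort(dist)))  # True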
+ """ + + def __init__( + self, + const {{INPUT_DTYPE_t}}[:, ::1] X, + const {{INPUT_DTYPE_t}}[:, ::1] Y, + {{DistanceMetric}} distance_metric, + ): + super().__init__(distance_metric, n_features=X.shape[1]) + # Arrays have already been checked + self.X = X + self.Y = Y + + @final + cdef intp_t n_samples_X(self) noexcept nogil: + return self.X.shape[0] + + @final + cdef intp_t n_samples_Y(self) noexcept nogil: + return self.Y.shape[0] + + @final + cdef float64_t surrogate_dist(self, intp_t i, intp_t j) noexcept nogil: + return self.distance_metric.rdist(&self.X[i, 0], &self.Y[j, 0], self.n_features) + + @final + cdef float64_t dist(self, intp_t i, intp_t j) noexcept nogil: + return self.distance_metric.dist(&self.X[i, 0], &self.Y[j, 0], self.n_features) + + +@final +cdef class SparseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): + """Compute distances between vectors of two CSR matrices. + + Parameters + ---------- + X: sparse matrix of shape (n_samples_X, n_features) + Rows represent vectors. Must be in CSR format. + + Y: sparse matrix of shape (n_samples_Y, n_features) + Rows represent vectors. Must be in CSR format. + + distance_metric: DistanceMetric + The distance metric responsible for computing distances + between two vectors of (X, Y). + """ + + def __init__(self, X, Y, {{DistanceMetric}} distance_metric): + super().__init__(distance_metric, n_features=X.shape[1]) + + self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) + self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y) + + @final + cdef intp_t n_samples_X(self) noexcept nogil: + return self.X_indptr.shape[0] - 1 + + @final + cdef intp_t n_samples_Y(self) noexcept nogil: + return self.Y_indptr.shape[0] - 1 + + @final + cdef float64_t surrogate_dist(self, intp_t i, intp_t j) noexcept nogil: + return self.distance_metric.rdist_csr( + x1_data=&self.X_data[0], + x1_indices=&self.X_indices[0], + x2_data=&self.Y_data[0], + x2_indices=&self.Y_indices[0], + x1_start=self.X_indptr[i], + x1_end=self.X_indptr[i + 1], + x2_start=self.Y_indptr[j], + x2_end=self.Y_indptr[j + 1], + size=self.n_features, + ) + + @final + cdef float64_t dist(self, intp_t i, intp_t j) noexcept nogil: + return self.distance_metric.dist_csr( + x1_data=&self.X_data[0], + x1_indices=&self.X_indices[0], + x2_data=&self.Y_data[0], + x2_indices=&self.Y_indices[0], + x1_start=self.X_indptr[i], + x1_end=self.X_indptr[i + 1], + x2_start=self.Y_indptr[j], + x2_end=self.Y_indptr[j + 1], + size=self.n_features, + ) + + +@final +cdef class SparseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): + """Compute distances between vectors of a CSR matrix and a dense array. + + Parameters + ---------- + X: sparse matrix of shape (n_samples_X, n_features) + Rows represent vectors. Must be in CSR format. + + Y: ndarray of shape (n_samples_Y, n_features) + Rows represent vectors. Must be C-contiguous. + + distance_metric: DistanceMetric + The distance metric responsible for computing distances + between two vectors of (X, Y). + """ + + def __init__(self, X, Y, {{DistanceMetric}} distance_metric): + super().__init__(distance_metric, n_features=X.shape[1]) + + self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) + + # We support the sparse-dense case by using the sparse-sparse interfaces + # of `DistanceMetric` (namely `DistanceMetric.{dist_csr,rdist_csr}`) to + # avoid introducing a new complex set of interfaces. In this case, we + # need to convert `Y` (the dense array) into a CSR matrix. 
+ # + # Here we motive using another simpler CSR representation to use for `Y`. + # + # If we were to use the usual CSR representation for `Y`, storing all + # the columns indices in `indices` would have required allocating an + # array of n_samples × n_features elements with repeated contiguous + # integers from 0 to n_features - 1. This would have been very wasteful + # from a memory point of view. This alternative representation just uses + # the necessary amount of information needed and only necessitates + # shifting the address of `data` before calling the CSR × CSR routines. + # + # In this representation: + # + # - the `data` array is the original dense array, `Y`, whose first + # element's address is shifted before calling the CSR × CSR routine + # + # - the `indices` array is a single row of `n_features` elements: + # + # [0, 1, ..., n_features-1] + # + # - the `indptr` array is not materialised as the indices pointers' + # offset is constant (the offset equals `n_features`). Moreover, as + # `data` is shifted, constant `start` and `end` indices pointers + # respectively equalling 0 and n_features are used. + + # Y array already has been checked here + self.n_Y = Y.shape[0] + self.Y_data = np.ravel(Y) + self.Y_indices = np.arange(self.n_features, dtype=np.int32) + + @final + cdef intp_t n_samples_X(self) noexcept nogil: + return self.X_indptr.shape[0] - 1 + + @final + cdef intp_t n_samples_Y(self) noexcept nogil: + return self.n_Y + + @final + cdef float64_t surrogate_dist(self, intp_t i, intp_t j) noexcept nogil: + return self.distance_metric.rdist_csr( + x1_data=&self.X_data[0], + x1_indices=&self.X_indices[0], + # Increment the data pointer such that x2_start=0 is aligned with the + # j-th row + x2_data=&self.Y_data[0] + j * self.n_features, + x2_indices=&self.Y_indices[0], + x1_start=self.X_indptr[i], + x1_end=self.X_indptr[i + 1], + x2_start=0, + x2_end=self.n_features, + size=self.n_features, + ) + + @final + cdef float64_t dist(self, intp_t i, intp_t j) noexcept nogil: + + return self.distance_metric.dist_csr( + x1_data=&self.X_data[0], + x1_indices=&self.X_indices[0], + # Increment the data pointer such that x2_start=0 is aligned with the + # j-th row + x2_data=&self.Y_data[0] + j * self.n_features, + x2_indices=&self.Y_indices[0], + x1_start=self.X_indptr[i], + x1_end=self.X_indptr[i + 1], + x2_start=0, + x2_end=self.n_features, + size=self.n_features, + ) + + +@final +cdef class DenseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): + """Compute distances between vectors of a dense array and a CSR matrix. + + Parameters + ---------- + X: ndarray of shape (n_samples_X, n_features) + Rows represent vectors. Must be C-contiguous. + + Y: sparse matrix of shape (n_samples_Y, n_features) + Rows represent vectors. Must be in CSR format. + + distance_metric: DistanceMetric + The distance metric responsible for computing distances + between two vectors of (X, Y). 
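As an illustrative aside (not part of the vendored file), the implicit CSR view of the dense operand described in the comments above can be sketched with NumPy; the helper name below is hypothetical:

import numpy as np

def implicit_csr_row(Y, j):
    # View row j of the dense C-contiguous array Y as if it were a CSR row:
    # the data is a shifted slice of Y.ravel() (a view, no copy), the column
    # indices are the single shared row [0, 1, ..., n_features - 1], and the
    # start/end pointers are the constants 0 and n_features.
    n_features = Y.shape[1]
    data = Y.ravel()[j * n_features:(j + 1) * n_features]   # pointer shift, no copy
    indices = np.arange(n_features, dtype=np.int32)         # shared by every row
    return data, indices, 0, n_features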
+ """ + + def __init__(self, X, Y, {{DistanceMetric}} distance_metric): + super().__init__(distance_metric, n_features=X.shape[1]) + # Swapping arguments on the constructor + self.datasets_pair = SparseDenseDatasetsPair{{name_suffix}}(Y, X, distance_metric) + + @final + cdef intp_t n_samples_X(self) noexcept nogil: + # Swapping interface + return self.datasets_pair.n_samples_Y() + + @final + cdef intp_t n_samples_Y(self) noexcept nogil: + # Swapping interface + return self.datasets_pair.n_samples_X() + + @final + cdef float64_t surrogate_dist(self, intp_t i, intp_t j) noexcept nogil: + # Swapping arguments on the same interface + return self.datasets_pair.surrogate_dist(j, i) + + @final + cdef float64_t dist(self, intp_t i, intp_t j) noexcept nogil: + # Swapping arguments on the same interface + return self.datasets_pair.dist(j, i) + +{{endfor}} diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py new file mode 100644 index 0000000000000000000000000000000000000000..d8307cbe84eaa904b50bdf11b59546aef397dbc3 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py @@ -0,0 +1,767 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from abc import abstractmethod +from typing import List + +import numpy as np +from scipy.sparse import issparse + +from ... import get_config +from .._dist_metrics import ( + BOOL_METRICS, + METRIC_MAPPING64, + DistanceMetric, +) +from ._argkmin import ( + ArgKmin32, + ArgKmin64, +) +from ._argkmin_classmode import ( + ArgKminClassMode32, + ArgKminClassMode64, +) +from ._base import _sqeuclidean_row_norms32, _sqeuclidean_row_norms64 +from ._radius_neighbors import ( + RadiusNeighbors32, + RadiusNeighbors64, +) +from ._radius_neighbors_classmode import ( + RadiusNeighborsClassMode32, + RadiusNeighborsClassMode64, +) + + +def sqeuclidean_row_norms(X, num_threads): + """Compute the squared euclidean norm of the rows of X in parallel. + + Parameters + ---------- + X : ndarray or CSR matrix of shape (n_samples, n_features) + Input data. Must be c-contiguous. + + num_threads : int + The number of OpenMP threads to use. + + Returns + ------- + sqeuclidean_row_norms : ndarray of shape (n_samples,) + Arrays containing the squared euclidean norm of each row of X. + """ + if X.dtype == np.float64: + return np.asarray(_sqeuclidean_row_norms64(X, num_threads)) + if X.dtype == np.float32: + return np.asarray(_sqeuclidean_row_norms32(X, num_threads)) + + raise ValueError( + "Only float64 or float32 datasets are supported at this time, " + f"got: X.dtype={X.dtype}." + ) + + +class BaseDistancesReductionDispatcher: + """Abstract base dispatcher for pairwise distance computation & reduction. + + Each dispatcher extending the base :class:`BaseDistancesReductionDispatcher` + dispatcher must implement the :meth:`compute` classmethod. + """ + + @classmethod + def valid_metrics(cls) -> List[str]: + excluded = { + # PyFunc cannot be supported because it necessitates interacting with + # the CPython interpreter to call user defined functions. + "pyfunc", + "mahalanobis", # is numerically unstable + # In order to support discrete distance metrics, we need to have a + # stable simultaneous sort which preserves the order of the indices + # because there generally is a lot of occurrences for a given values + # of distances in this case. 
+ # TODO: implement a stable simultaneous_sort. + "hamming", + *BOOL_METRICS, + } + return sorted(({"sqeuclidean"} | set(METRIC_MAPPING64.keys())) - excluded) + + @classmethod + def is_usable_for(cls, X, Y, metric) -> bool: + """Return True if the dispatcher can be used for the + given parameters. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples_X, n_features) + Input data. + + Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features) + Input data. + + metric : str, default='euclidean' + The distance metric to use. + For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. + + Returns + ------- + True if the dispatcher can be used, else False. + """ + + # FIXME: the current Cython implementation is too slow for a large number of + # features. We temporarily disable it to fallback on SciPy's implementation. + # See: https://github.com/scikit-learn/scikit-learn/issues/28191 + if ( + issparse(X) + and issparse(Y) + and isinstance(metric, str) + and "euclidean" in metric + ): + return False + + def is_numpy_c_ordered(X): + return hasattr(X, "flags") and getattr(X.flags, "c_contiguous", False) + + def is_valid_sparse_matrix(X): + return ( + issparse(X) + and X.format == "csr" + and + # TODO: support CSR matrices without non-zeros elements + X.nnz > 0 + and + # TODO: support CSR matrices with int64 indices and indptr + # See: https://github.com/scikit-learn/scikit-learn/issues/23653 + X.indices.dtype == X.indptr.dtype == np.int32 + ) + + is_usable = ( + get_config().get("enable_cython_pairwise_dist", True) + and (is_numpy_c_ordered(X) or is_valid_sparse_matrix(X)) + and (is_numpy_c_ordered(Y) or is_valid_sparse_matrix(Y)) + and X.dtype == Y.dtype + and X.dtype in (np.float32, np.float64) + and (metric in cls.valid_metrics() or isinstance(metric, DistanceMetric)) + ) + + return is_usable + + @classmethod + @abstractmethod + def compute( + cls, + X, + Y, + **kwargs, + ): + """Compute the reduction. + + Parameters + ---------- + X : ndarray or CSR matrix of shape (n_samples_X, n_features) + Input data. + + Y : ndarray or CSR matrix of shape (n_samples_Y, n_features) + Input data. + + **kwargs : additional parameters for the reduction + + Notes + ----- + This method is an abstract class method: it has to be implemented + for all subclasses. + """ + + +class ArgKmin(BaseDistancesReductionDispatcher): + """Compute the argkmin of row vectors of X on the ones of Y. + + For each row vector of X, computes the indices of k first the rows + vectors of Y with the smallest distances. + + ArgKmin is typically used to perform + bruteforce k-nearest neighbors queries. + + This class is not meant to be instantiated, one should only use + its :meth:`compute` classmethod which handles allocation and + deallocation consistently. + """ + + @classmethod + def compute( + cls, + X, + Y, + k, + metric="euclidean", + chunk_size=None, + metric_kwargs=None, + strategy=None, + return_distance=False, + ): + """Compute the argkmin reduction. + + Parameters + ---------- + X : ndarray or CSR matrix of shape (n_samples_X, n_features) + Input data. + + Y : ndarray or CSR matrix of shape (n_samples_Y, n_features) + Input data. + + k : int + The k for the argkmin reduction. + + metric : str, default='euclidean' + The distance metric to use for argkmin. + For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. + + chunk_size : int, default=None, + The number of vectors per chunk. 
If None (default) looks-up in + scikit-learn configuration for `pairwise_dist_chunk_size`, + and use 256 if it is not set. + + metric_kwargs : dict, default=None + Keyword arguments to pass to specified metric function. + + strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None + The chunking strategy defining which dataset parallelization are made on. + + For both strategies the computations happens with two nested loops, + respectively on chunks of X and chunks of Y. + Strategies differs on which loop (outer or inner) is made to run + in parallel with the Cython `prange` construct: + + - 'parallel_on_X' dispatches chunks of X uniformly on threads. + Each thread then iterates on all the chunks of Y. This strategy is + embarrassingly parallel and comes with no datastructures + synchronisation. + + - 'parallel_on_Y' dispatches chunks of Y uniformly on threads. + Each thread processes all the chunks of X in turn. This strategy is + a sequence of embarrassingly parallel subtasks (the inner loop on Y + chunks) with intermediate datastructures synchronisation at each + iteration of the sequential outer loop on X chunks. + + - 'auto' relies on a simple heuristic to choose between + 'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough, + 'parallel_on_X' is usually the most efficient strategy. + When `X.shape[0]` is small but `Y.shape[0]` is large, 'parallel_on_Y' + brings more opportunity for parallelism and is therefore more efficient + + - None (default) looks-up in scikit-learn configuration for + `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. + + return_distance : boolean, default=False + Return distances between each X vector and its + argkmin if set to True. + + Returns + ------- + If return_distance=False: + - argkmin_indices : ndarray of shape (n_samples_X, k) + Indices of the argkmin for each vector in X. + + If return_distance=True: + - argkmin_distances : ndarray of shape (n_samples_X, k) + Distances to the argkmin for each vector in X. + - argkmin_indices : ndarray of shape (n_samples_X, k) + Indices of the argkmin for each vector in X. + + Notes + ----- + This classmethod inspects the arguments values to dispatch to the + dtype-specialized implementation of :class:`ArgKmin`. + + This allows decoupling the API entirely from the implementation details + whilst maintaining RAII: all temporarily allocated datastructures necessary + for the concrete implementation are therefore freed when this classmethod + returns. + """ + if X.dtype == Y.dtype == np.float64: + return ArgKmin64.compute( + X=X, + Y=Y, + k=k, + metric=metric, + chunk_size=chunk_size, + metric_kwargs=metric_kwargs, + strategy=strategy, + return_distance=return_distance, + ) + + if X.dtype == Y.dtype == np.float32: + return ArgKmin32.compute( + X=X, + Y=Y, + k=k, + metric=metric, + chunk_size=chunk_size, + metric_kwargs=metric_kwargs, + strategy=strategy, + return_distance=return_distance, + ) + + raise ValueError( + "Only float64 or float32 datasets pairs are supported at this time, " + f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}." + ) + + +class RadiusNeighbors(BaseDistancesReductionDispatcher): + """Compute radius-based neighbors for two sets of vectors. + + For each row-vector X[i] of the queries X, find all the indices j of + row-vectors in Y such that: + + dist(X[i], Y[j]) <= radius + + The distance function `dist` depends on the values of the `metric` + and `metric_kwargs` parameters. 
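As a usage illustration (not part of the vendored file), the `ArgKmin.compute` entry point documented above is normally reached through public estimators, but it can be exercised directly along these lines, importing from the dispatcher module defined in this diff:

import numpy as np
from sklearn.metrics._pairwise_distances_reduction._dispatcher import ArgKmin

rng = np.random.default_rng(0)
X = rng.standard_normal((1000, 50))   # queries: C-contiguous float64
Y = rng.standard_normal((5000, 50))   # index set: same dtype as X

if ArgKmin.is_usable_for(X, Y, metric="euclidean"):
    dist, ind = ArgKmin.compute(X, Y, k=5, metric="euclidean", return_distance=True)
    # dist and ind both have shape (1000, 5); row i lists the 5 rows of Y
    # closest to X[i], with distances returned first as documented above.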
+ + This class is not meant to be instantiated, one should only use + its :meth:`compute` classmethod which handles allocation and + deallocation consistently. + """ + + @classmethod + def compute( + cls, + X, + Y, + radius, + metric="euclidean", + chunk_size=None, + metric_kwargs=None, + strategy=None, + return_distance=False, + sort_results=False, + ): + """Return the results of the reduction for the given arguments. + + Parameters + ---------- + X : ndarray or CSR matrix of shape (n_samples_X, n_features) + Input data. + + Y : ndarray or CSR matrix of shape (n_samples_Y, n_features) + Input data. + + radius : float + The radius defining the neighborhood. + + metric : str, default='euclidean' + The distance metric to use. + For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. + + chunk_size : int, default=None, + The number of vectors per chunk. If None (default) looks-up in + scikit-learn configuration for `pairwise_dist_chunk_size`, + and use 256 if it is not set. + + metric_kwargs : dict, default=None + Keyword arguments to pass to specified metric function. + + strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None + The chunking strategy defining which dataset parallelization are made on. + + For both strategies the computations happens with two nested loops, + respectively on chunks of X and chunks of Y. + Strategies differs on which loop (outer or inner) is made to run + in parallel with the Cython `prange` construct: + + - 'parallel_on_X' dispatches chunks of X uniformly on threads. + Each thread then iterates on all the chunks of Y. This strategy is + embarrassingly parallel and comes with no datastructures + synchronisation. + + - 'parallel_on_Y' dispatches chunks of Y uniformly on threads. + Each thread processes all the chunks of X in turn. This strategy is + a sequence of embarrassingly parallel subtasks (the inner loop on Y + chunks) with intermediate datastructures synchronisation at each + iteration of the sequential outer loop on X chunks. + + - 'auto' relies on a simple heuristic to choose between + 'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough, + 'parallel_on_X' is usually the most efficient strategy. + When `X.shape[0]` is small but `Y.shape[0]` is large, 'parallel_on_Y' + brings more opportunity for parallelism and is therefore more efficient + despite the synchronization step at each iteration of the outer loop + on chunks of `X`. + + - None (default) looks-up in scikit-learn configuration for + `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. + + return_distance : boolean, default=False + Return distances between each X vector and its neighbors if set to True. + + sort_results : boolean, default=False + Sort results with respect to distances between each X vector and its + neighbors if set to True. + + Returns + ------- + If return_distance=False: + - neighbors_indices : ndarray of n_samples_X ndarray + Indices of the neighbors for each vector in X. + + If return_distance=True: + - neighbors_indices : ndarray of n_samples_X ndarray + Indices of the neighbors for each vector in X. + - neighbors_distances : ndarray of n_samples_X ndarray + Distances to the neighbors for each vector in X. + + Notes + ----- + This classmethod inspects the arguments values to dispatch to the + dtype-specialized implementation of :class:`RadiusNeighbors`. 
+ + This allows decoupling the API entirely from the implementation details + whilst maintaining RAII: all temporarily allocated datastructures necessary + for the concrete implementation are therefore freed when this classmethod + returns. + """ + if X.dtype == Y.dtype == np.float64: + return RadiusNeighbors64.compute( + X=X, + Y=Y, + radius=radius, + metric=metric, + chunk_size=chunk_size, + metric_kwargs=metric_kwargs, + strategy=strategy, + sort_results=sort_results, + return_distance=return_distance, + ) + + if X.dtype == Y.dtype == np.float32: + return RadiusNeighbors32.compute( + X=X, + Y=Y, + radius=radius, + metric=metric, + chunk_size=chunk_size, + metric_kwargs=metric_kwargs, + strategy=strategy, + sort_results=sort_results, + return_distance=return_distance, + ) + + raise ValueError( + "Only float64 or float32 datasets pairs are supported at this time, " + f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}." + ) + + +class ArgKminClassMode(BaseDistancesReductionDispatcher): + """Compute the argkmin of row vectors of X on the ones of Y with labels. + + For each row vector of X, computes the indices of k first the rows + vectors of Y with the smallest distances. Computes weighted mode of labels. + + ArgKminClassMode is typically used to perform bruteforce k-nearest neighbors + queries when the weighted mode of the labels for the k-nearest neighbors + are required, such as in `predict` methods. + + This class is not meant to be instantiated, one should only use + its :meth:`compute` classmethod which handles allocation and + deallocation consistently. + """ + + @classmethod + def valid_metrics(cls) -> List[str]: + excluded = { + # Euclidean is technically usable for ArgKminClassMode + # but its current implementation would not be competitive. + # TODO: implement Euclidean specialization using GEMM. + "euclidean", + "sqeuclidean", + } + return list(set(BaseDistancesReductionDispatcher.valid_metrics()) - excluded) + + @classmethod + def compute( + cls, + X, + Y, + k, + weights, + Y_labels, + unique_Y_labels, + metric="euclidean", + chunk_size=None, + metric_kwargs=None, + strategy=None, + ): + """Compute the argkmin reduction. + + Parameters + ---------- + X : ndarray of shape (n_samples_X, n_features) + The input array to be labelled. + + Y : ndarray of shape (n_samples_Y, n_features) + The input array whose class membership are provided through the + `Y_labels` parameter. + + k : int + The number of nearest neighbors to consider. + + weights : ndarray + The weights applied over the `Y_labels` of `Y` when computing the + weighted mode of the labels. + + Y_labels : ndarray + An array containing the index of the class membership of the + associated samples in `Y`. This is used in labeling `X`. + + unique_Y_labels : ndarray + An array containing all unique indices contained in the + corresponding `Y_labels` array. + + metric : str, default='euclidean' + The distance metric to use. For a list of available metrics, see + the documentation of :class:`~sklearn.metrics.DistanceMetric`. + Currently does not support `'precomputed'`. + + chunk_size : int, default=None, + The number of vectors per chunk. If None (default) looks-up in + scikit-learn configuration for `pairwise_dist_chunk_size`, + and use 256 if it is not set. + + metric_kwargs : dict, default=None + Keyword arguments to pass to specified metric function. + + strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None + The chunking strategy defining which dataset parallelization are made on. 
+ + For both strategies the computations happens with two nested loops, + respectively on chunks of X and chunks of Y. + Strategies differs on which loop (outer or inner) is made to run + in parallel with the Cython `prange` construct: + + - 'parallel_on_X' dispatches chunks of X uniformly on threads. + Each thread then iterates on all the chunks of Y. This strategy is + embarrassingly parallel and comes with no datastructures + synchronisation. + + - 'parallel_on_Y' dispatches chunks of Y uniformly on threads. + Each thread processes all the chunks of X in turn. This strategy is + a sequence of embarrassingly parallel subtasks (the inner loop on Y + chunks) with intermediate datastructures synchronisation at each + iteration of the sequential outer loop on X chunks. + + - 'auto' relies on a simple heuristic to choose between + 'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough, + 'parallel_on_X' is usually the most efficient strategy. + When `X.shape[0]` is small but `Y.shape[0]` is large, 'parallel_on_Y' + brings more opportunity for parallelism and is therefore more efficient + despite the synchronization step at each iteration of the outer loop + on chunks of `X`. + + - None (default) looks-up in scikit-learn configuration for + `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. + + Returns + ------- + probabilities : ndarray of shape (n_samples_X, n_classes) + An array containing the class probabilities for each sample. + + Notes + ----- + This classmethod is responsible for introspecting the arguments + values to dispatch to the most appropriate implementation of + :class:`PairwiseDistancesArgKmin`. + + This allows decoupling the API entirely from the implementation details + whilst maintaining RAII: all temporarily allocated datastructures necessary + for the concrete implementation are therefore freed when this classmethod + returns. + """ + if weights not in {"uniform", "distance"}: + raise ValueError( + "Only the 'uniform' or 'distance' weights options are supported" + f" at this time. Got: {weights=}." + ) + if X.dtype == Y.dtype == np.float64: + return ArgKminClassMode64.compute( + X=X, + Y=Y, + k=k, + weights=weights, + Y_labels=np.array(Y_labels, dtype=np.intp), + unique_Y_labels=np.array(unique_Y_labels, dtype=np.intp), + metric=metric, + chunk_size=chunk_size, + metric_kwargs=metric_kwargs, + strategy=strategy, + ) + + if X.dtype == Y.dtype == np.float32: + return ArgKminClassMode32.compute( + X=X, + Y=Y, + k=k, + weights=weights, + Y_labels=np.array(Y_labels, dtype=np.intp), + unique_Y_labels=np.array(unique_Y_labels, dtype=np.intp), + metric=metric, + chunk_size=chunk_size, + metric_kwargs=metric_kwargs, + strategy=strategy, + ) + + raise ValueError( + "Only float64 or float32 datasets pairs are supported at this time, " + f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}." + ) + + +class RadiusNeighborsClassMode(BaseDistancesReductionDispatcher): + """Compute radius-based class modes of row vectors of X using the + those of Y. + + For each row-vector X[i] of the queries X, find all the indices j of + row-vectors in Y such that: + + dist(X[i], Y[j]) <= radius + + RadiusNeighborsClassMode is typically used to perform bruteforce + radius neighbors queries when the weighted mode of the labels for + the nearest neighbors within the specified radius are required, + such as in `predict` methods. 
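As a usage illustration (not part of the vendored file), `ArgKminClassMode.compute` documented above returns per-class scores directly; since 'euclidean' and 'sqeuclidean' are excluded from its valid metrics, this sketch uses 'manhattan' instead:

import numpy as np
from sklearn.metrics._pairwise_distances_reduction._dispatcher import ArgKminClassMode

rng = np.random.default_rng(0)
X = rng.standard_normal((100, 10))            # queries to be labelled
Y = rng.standard_normal((500, 10))            # reference samples with known classes
Y_labels = rng.integers(0, 3, size=500)       # class index of each row of Y
unique_Y_labels = np.unique(Y_labels)         # array([0, 1, 2])

proba = ArgKminClassMode.compute(
    X, Y, k=5, weights="uniform",
    Y_labels=Y_labels, unique_Y_labels=unique_Y_labels,
    metric="manhattan",                       # 'euclidean' is deliberately excluded above
)
# proba has shape (100, 3): one row of class probabilities per query, of the kind
# consumed by KNeighborsClassifier.predict_proba when this backend is selected.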
+ + This class is not meant to be instantiated, one should only use + its :meth:`compute` classmethod which handles allocation and + deallocation consistently. + """ + + @classmethod + def valid_metrics(cls) -> List[str]: + excluded = { + # Euclidean is technically usable for RadiusNeighborsClassMode + # but it would not be competitive. + # TODO: implement Euclidean specialization using GEMM. + "euclidean", + "sqeuclidean", + } + return sorted(set(BaseDistancesReductionDispatcher.valid_metrics()) - excluded) + + @classmethod + def compute( + cls, + X, + Y, + radius, + weights, + Y_labels, + unique_Y_labels, + outlier_label, + metric="euclidean", + chunk_size=None, + metric_kwargs=None, + strategy=None, + ): + """Return the results of the reduction for the given arguments. + Parameters + ---------- + X : ndarray of shape (n_samples_X, n_features) + The input array to be labelled. + Y : ndarray of shape (n_samples_Y, n_features) + The input array whose class membership is provided through + the `Y_labels` parameter. + radius : float + The radius defining the neighborhood. + weights : ndarray + The weights applied to the `Y_labels` when computing the + weighted mode of the labels. + Y_labels : ndarray + An array containing the index of the class membership of the + associated samples in `Y`. This is used in labeling `X`. + unique_Y_labels : ndarray + An array containing all unique class labels. + outlier_label : int, default=None + Label for outlier samples (samples with no neighbors in given + radius). In the default case when the value is None if any + outlier is detected, a ValueError will be raised. The outlier + label should be selected from among the unique 'Y' labels. If + it is specified with a different value a warning will be raised + and all class probabilities of outliers will be assigned to be 0. + metric : str, default='euclidean' + The distance metric to use. For a list of available metrics, see + the documentation of :class:`~sklearn.metrics.DistanceMetric`. + Currently does not support `'precomputed'`. + chunk_size : int, default=None, + The number of vectors per chunk. If None (default) looks-up in + scikit-learn configuration for `pairwise_dist_chunk_size`, + and use 256 if it is not set. + metric_kwargs : dict, default=None + Keyword arguments to pass to specified metric function. + strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None + The chunking strategy defining which dataset parallelization are made on. + For both strategies the computations happens with two nested loops, + respectively on chunks of X and chunks of Y. + Strategies differs on which loop (outer or inner) is made to run + in parallel with the Cython `prange` construct: + - 'parallel_on_X' dispatches chunks of X uniformly on threads. + Each thread then iterates on all the chunks of Y. This strategy is + embarrassingly parallel and comes with no datastructures + synchronisation. + - 'parallel_on_Y' dispatches chunks of Y uniformly on threads. + Each thread processes all the chunks of X in turn. This strategy is + a sequence of embarrassingly parallel subtasks (the inner loop on Y + chunks) with intermediate datastructures synchronisation at each + iteration of the sequential outer loop on X chunks. + - 'auto' relies on a simple heuristic to choose between + 'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough, + 'parallel_on_X' is usually the most efficient strategy. 
+ When `X.shape[0]` is small but `Y.shape[0]` is large, 'parallel_on_Y' + brings more opportunity for parallelism and is therefore more efficient + despite the synchronization step at each iteration of the outer loop + on chunks of `X`. + - None (default) looks-up in scikit-learn configuration for + `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. + Returns + ------- + probabilities : ndarray of shape (n_samples_X, n_classes) + An array containing the class probabilities for each sample. + """ + if weights not in {"uniform", "distance"}: + raise ValueError( + "Only the 'uniform' or 'distance' weights options are supported" + f" at this time. Got: {weights=}." + ) + if X.dtype == Y.dtype == np.float64: + return RadiusNeighborsClassMode64.compute( + X=X, + Y=Y, + radius=radius, + weights=weights, + Y_labels=np.array(Y_labels, dtype=np.intp), + unique_Y_labels=np.array(unique_Y_labels, dtype=np.intp), + outlier_label=outlier_label, + metric=metric, + chunk_size=chunk_size, + metric_kwargs=metric_kwargs, + strategy=strategy, + ) + + if X.dtype == Y.dtype == np.float32: + return RadiusNeighborsClassMode32.compute( + X=X, + Y=Y, + radius=radius, + weights=weights, + Y_labels=np.array(Y_labels, dtype=np.intp), + unique_Y_labels=np.array(unique_Y_labels, dtype=np.intp), + outlier_label=outlier_label, + metric=metric, + chunk_size=chunk_size, + metric_kwargs=metric_kwargs, + strategy=strategy, + ) + + raise ValueError( + "Only float64 or float32 datasets pairs are supported at this time, " + f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}." + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp new file mode 100644 index 0000000000000000000000000000000000000000..bdf007bd0514ab4b49ccdd55a3bd5dbe1b2c75ec --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp @@ -0,0 +1,228 @@ +{{py: + +implementation_specific_values = [ + # Values are the following ones: + # + # name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE + # + # We also use the float64 dtype and C-type names as defined in + # `sklearn.utils._typedefs` to maintain consistency. 
+ # + ('64', False, 'float64_t', 'np.float64'), + ('32', True, 'float32_t', 'np.float32') +] + +}} +from libcpp.vector cimport vector + +from ...utils._typedefs cimport float64_t, float32_t, int32_t, intp_t + + +cdef void _middle_term_sparse_sparse_64( + const float64_t[:] X_data, + const int32_t[:] X_indices, + const int32_t[:] X_indptr, + intp_t X_start, + intp_t X_end, + const float64_t[:] Y_data, + const int32_t[:] Y_indices, + const int32_t[:] Y_indptr, + intp_t Y_start, + intp_t Y_end, + float64_t * D, +) noexcept nogil + + +{{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + + +cdef class MiddleTermComputer{{name_suffix}}: + cdef: + intp_t effective_n_threads + intp_t chunks_n_threads + intp_t dist_middle_terms_chunks_size + intp_t n_features + intp_t chunk_size + + # Buffers for the `-2 * X_c @ Y_c.T` term computed via GEMM + vector[vector[float64_t]] dist_middle_terms_chunks + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil + + cdef void _parallel_on_X_parallel_init(self, intp_t thread_num) noexcept nogil + + cdef void _parallel_on_X_init_chunk( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil + + cdef void _parallel_on_Y_init(self) noexcept nogil + + cdef void _parallel_on_Y_parallel_init( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num + ) noexcept nogil + + cdef float64_t * _compute_dist_middle_terms( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil + + +cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}): + cdef: + const {{INPUT_DTYPE_t}}[:, ::1] X + const {{INPUT_DTYPE_t}}[:, ::1] Y + + {{if upcast_to_float64}} + # Buffers for upcasting chunks of X and Y from 32bit to 64bit + vector[vector[float64_t]] X_c_upcast + vector[vector[float64_t]] Y_c_upcast + {{endif}} + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil + + cdef void _parallel_on_X_init_chunk( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil + + cdef void _parallel_on_Y_parallel_init( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num + ) noexcept nogil + + cdef float64_t * _compute_dist_middle_terms( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil + + +cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}): + cdef: + const float64_t[:] X_data + const int32_t[:] X_indices + const int32_t[:] X_indptr + + const float64_t[:] Y_data + const int32_t[:] Y_indices + const int32_t[:] Y_indptr + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num + ) noexcept nogil + + cdef void 
_parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num + ) noexcept nogil + + cdef float64_t * _compute_dist_middle_terms( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil + + +cdef class SparseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}): + cdef: + const float64_t[:] X_data + const int32_t[:] X_indices + const int32_t[:] X_indptr + + const {{INPUT_DTYPE_t}}[:, ::1] Y + + # We treat the dense-sparse case with the sparse-dense case by simply + # treating the dist_middle_terms as F-ordered and by swapping arguments. + # This attribute is meant to encode the case and adapt the logic + # accordingly. + bint c_ordered_middle_term + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num + ) noexcept nogil + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num + ) noexcept nogil + + cdef float64_t * _compute_dist_middle_terms( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil + +{{endfor}} diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp new file mode 100644 index 0000000000000000000000000000000000000000..1fca2d674720c40fa2df8f56fea4f3a7a6980ba8 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp @@ -0,0 +1,633 @@ +{{py: + +implementation_specific_values = [ + # Values are the following ones: + # + # name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE + # + # We also use the float64 dtype and C-type names as defined in + # `sklearn.utils._typedefs` to maintain consistency. + # + ('64', False, 'float64_t', 'np.float64'), + ('32', True, 'float32_t', 'np.float32') +] + +}} +from libcpp.vector cimport vector +from libcpp.algorithm cimport fill + +from ...utils._cython_blas cimport ( + BLAS_Order, + BLAS_Trans, + NoTrans, + RowMajor, + Trans, + _gemm, +) +from ...utils._typedefs cimport float64_t, float32_t, int32_t, intp_t + +import numpy as np +from scipy.sparse import issparse, csr_matrix + + +cdef void _middle_term_sparse_sparse_64( + const float64_t[:] X_data, + const int32_t[:] X_indices, + const int32_t[:] X_indptr, + intp_t X_start, + intp_t X_end, + const float64_t[:] Y_data, + const int32_t[:] Y_indices, + const int32_t[:] Y_indptr, + intp_t Y_start, + intp_t Y_end, + float64_t * D, +) noexcept nogil: + # This routine assumes that D points to the first element of a + # zeroed buffer of length at least equal to n_X × n_Y, conceptually + # representing a 2-d C-ordered array. 
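[Editor's note] As a reference for what the `_middle_term_sparse_sparse_64` routine described in the comment above produces, the same chunked `-2 * X_c @ Y_c.T` block can be obtained directly with SciPy; this is a sketch for checking intent, not the Cython routine itself.

import numpy as np
from scipy.sparse import random as sparse_random

X = sparse_random(10, 8, density=0.3, format="csr", random_state=0)
Y = sparse_random(12, 8, density=0.3, format="csr", random_state=1)

# Middle term for the chunk pair X[2:6] / Y[3:9], materialised as the dense,
# C-ordered (n_X, n_Y) block that the zeroed buffer D ends up holding.
X_start, X_end, Y_start, Y_end = 2, 6, 3, 9
D = -2.0 * (X[X_start:X_end] @ Y[Y_start:Y_end].T).toarray()
print(D.shape)   # (4, 6)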
+ cdef: + intp_t i, j, k + intp_t n_X = X_end - X_start + intp_t n_Y = Y_end - Y_start + intp_t x_col, x_ptr, y_col, y_ptr + + for i in range(n_X): + for x_ptr in range(X_indptr[X_start+i], X_indptr[X_start+i+1]): + x_col = X_indices[x_ptr] + for j in range(n_Y): + k = i * n_Y + j + for y_ptr in range(Y_indptr[Y_start+j], Y_indptr[Y_start+j+1]): + y_col = Y_indices[y_ptr] + if x_col == y_col: + D[k] += -2 * X_data[x_ptr] * Y_data[y_ptr] + + +{{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +cdef void _middle_term_sparse_dense_{{name_suffix}}( + const float64_t[:] X_data, + const int32_t[:] X_indices, + const int32_t[:] X_indptr, + intp_t X_start, + intp_t X_end, + const {{INPUT_DTYPE_t}}[:, ::1] Y, + intp_t Y_start, + intp_t Y_end, + bint c_ordered_middle_term, + float64_t * dist_middle_terms, +) noexcept nogil: + # This routine assumes that dist_middle_terms is a pointer to the first element + # of a buffer filled with zeros of length at least equal to n_X × n_Y, conceptually + # representing a 2-d C-ordered of F-ordered array. + cdef: + intp_t i, j, k + intp_t n_X = X_end - X_start + intp_t n_Y = Y_end - Y_start + intp_t X_i_col_idx, X_i_ptr, Y_j_col_idx, Y_j_ptr + + for i in range(n_X): + for j in range(n_Y): + k = i * n_Y + j if c_ordered_middle_term else j * n_X + i + for X_i_ptr in range(X_indptr[X_start+i], X_indptr[X_start+i+1]): + X_i_col_idx = X_indices[X_i_ptr] + dist_middle_terms[k] += -2 * X_data[X_i_ptr] * Y[Y_start + j, X_i_col_idx] + + +cdef class MiddleTermComputer{{name_suffix}}: + """Helper class to compute a Euclidean distance matrix in chunks. + + This is an abstract base class that is further specialized depending + on the type of data (dense or sparse). + + `EuclideanDistance` subclasses relies on the squared Euclidean + distances between chunks of vectors X_c and Y_c using the + following decomposition for the (i,j) pair : + + + ||X_c_i - Y_c_j||² = ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² + + + This helper class is in charge of wrapping the common logic to compute + the middle term, i.e. `- 2 X_c_i.Y_c_j^T`. + """ + + @classmethod + def get_for( + cls, + X, + Y, + effective_n_threads, + chunks_n_threads, + dist_middle_terms_chunks_size, + n_features, + chunk_size, + ) -> MiddleTermComputer{{name_suffix}}: + """Return the MiddleTermComputer implementation for the given arguments. + + Parameters + ---------- + X : ndarray or CSR sparse matrix of shape (n_samples_X, n_features) + Input data. + If provided as a ndarray, it must be C-contiguous. + + Y : ndarray or CSR sparse matrix of shape (n_samples_Y, n_features) + Input data. + If provided as a ndarray, it must be C-contiguous. + + Returns + ------- + middle_term_computer: MiddleTermComputer{{name_suffix}} + The suited MiddleTermComputer{{name_suffix}} implementation. 
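[Editor's note] A quick NumPy check of the squared Euclidean decomposition quoted in the `MiddleTermComputer` docstring above, under the assumption that only the middle term is computed chunk-wise via GEMM while the row norms are precomputed once; illustrative only.

import numpy as np
from scipy.spatial.distance import cdist

rng = np.random.default_rng(0)
X = rng.standard_normal((4, 6))
Y = rng.standard_normal((7, 6))

middle = -2.0 * X @ Y.T   # the only chunk-wise (GEMM) term
sq_dists = (X ** 2).sum(axis=1)[:, None] + middle + (Y ** 2).sum(axis=1)[None, :]

assert np.allclose(sq_dists, cdist(X, Y, "sqeuclidean"))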
+ """ + X_is_sparse = issparse(X) + Y_is_sparse = issparse(Y) + + if not X_is_sparse and not Y_is_sparse: + return DenseDenseMiddleTermComputer{{name_suffix}}( + X, + Y, + effective_n_threads, + chunks_n_threads, + dist_middle_terms_chunks_size, + n_features, + chunk_size, + ) + if X_is_sparse and Y_is_sparse: + return SparseSparseMiddleTermComputer{{name_suffix}}( + X, + Y, + effective_n_threads, + chunks_n_threads, + dist_middle_terms_chunks_size, + n_features, + chunk_size, + ) + if X_is_sparse and not Y_is_sparse: + return SparseDenseMiddleTermComputer{{name_suffix}}( + X, + Y, + effective_n_threads, + chunks_n_threads, + dist_middle_terms_chunks_size, + n_features, + chunk_size, + c_ordered_middle_term=True + ) + if not X_is_sparse and Y_is_sparse: + # NOTE: The Dense-Sparse case is implement via the Sparse-Dense case. + # + # To do so: + # - X (dense) and Y (sparse) are swapped + # - the distance middle term is seen as F-ordered for consistency + # (c_ordered_middle_term = False) + return SparseDenseMiddleTermComputer{{name_suffix}}( + # Mind that X and Y are swapped here. + Y, + X, + effective_n_threads, + chunks_n_threads, + dist_middle_terms_chunks_size, + n_features, + chunk_size, + c_ordered_middle_term=False, + ) + raise NotImplementedError( + "X and Y must be CSR sparse matrices or numpy arrays." + ) + + @classmethod + def unpack_csr_matrix(cls, X: csr_matrix): + """Ensure that the CSR matrix is indexed with np.int32.""" + X_data = np.asarray(X.data, dtype=np.float64) + X_indices = np.asarray(X.indices, dtype=np.int32) + X_indptr = np.asarray(X.indptr, dtype=np.int32) + return X_data, X_indices, X_indptr + + def __init__( + self, + intp_t effective_n_threads, + intp_t chunks_n_threads, + intp_t dist_middle_terms_chunks_size, + intp_t n_features, + intp_t chunk_size, + ): + self.effective_n_threads = effective_n_threads + self.chunks_n_threads = chunks_n_threads + self.dist_middle_terms_chunks_size = dist_middle_terms_chunks_size + self.n_features = n_features + self.chunk_size = chunk_size + + self.dist_middle_terms_chunks = vector[vector[float64_t]](self.effective_n_threads) + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil: + return + + cdef void _parallel_on_X_parallel_init(self, intp_t thread_num) noexcept nogil: + self.dist_middle_terms_chunks[thread_num].resize(self.dist_middle_terms_chunks_size) + + cdef void _parallel_on_X_init_chunk( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil: + return + + cdef void _parallel_on_Y_init(self) noexcept nogil: + for thread_num in range(self.chunks_n_threads): + self.dist_middle_terms_chunks[thread_num].resize( + self.dist_middle_terms_chunks_size + ) + + cdef void _parallel_on_Y_parallel_init( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil: + return + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num + ) noexcept nogil: + return + + cdef float64_t * _compute_dist_middle_terms( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil: + return NULL + + +cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}): + """Computes the middle term of the Euclidean distance between two chunked dense matrices + X_c and Y_c. 
+ + dist_middle_terms = - 2 X_c_i.Y_c_j^T + + This class use the BLAS gemm routine to perform the dot product of each chunks + of the distance matrix with improved arithmetic intensity and vector instruction (SIMD). + """ + + def __init__( + self, + const {{INPUT_DTYPE_t}}[:, ::1] X, + const {{INPUT_DTYPE_t}}[:, ::1] Y, + intp_t effective_n_threads, + intp_t chunks_n_threads, + intp_t dist_middle_terms_chunks_size, + intp_t n_features, + intp_t chunk_size, + ): + super().__init__( + effective_n_threads, + chunks_n_threads, + dist_middle_terms_chunks_size, + n_features, + chunk_size, + ) + self.X = X + self.Y = Y + +{{if upcast_to_float64}} + # We populate the buffer for upcasting chunks of X and Y from float32 to float64. + self.X_c_upcast = vector[vector[float64_t]](self.effective_n_threads) + self.Y_c_upcast = vector[vector[float64_t]](self.effective_n_threads) + + upcast_buffer_n_elements = self.chunk_size * n_features + + for thread_num in range(self.effective_n_threads): + self.X_c_upcast[thread_num].resize(upcast_buffer_n_elements) + self.Y_c_upcast[thread_num].resize(upcast_buffer_n_elements) +{{endif}} + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil: +{{if upcast_to_float64}} + cdef: + intp_t i, j + intp_t n_chunk_samples = Y_end - Y_start + + # Upcasting Y_c=Y[Y_start:Y_end, :] from float32 to float64 + for i in range(n_chunk_samples): + for j in range(self.n_features): + self.Y_c_upcast[thread_num][i * self.n_features + j] = self.Y[Y_start + i, j] +{{else}} + return +{{endif}} + + cdef void _parallel_on_X_init_chunk( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil: +{{if upcast_to_float64}} + cdef: + intp_t i, j + intp_t n_chunk_samples = X_end - X_start + + # Upcasting X_c=X[X_start:X_end, :] from float32 to float64 + for i in range(n_chunk_samples): + for j in range(self.n_features): + self.X_c_upcast[thread_num][i * self.n_features + j] = self.X[X_start + i, j] +{{else}} + return +{{endif}} + + cdef void _parallel_on_Y_parallel_init( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil: +{{if upcast_to_float64}} + cdef: + intp_t i, j + intp_t n_chunk_samples = X_end - X_start + + # Upcasting X_c=X[X_start:X_end, :] from float32 to float64 + for i in range(n_chunk_samples): + for j in range(self.n_features): + self.X_c_upcast[thread_num][i * self.n_features + j] = self.X[X_start + i, j] +{{else}} + return +{{endif}} + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num + ) noexcept nogil: +{{if upcast_to_float64}} + cdef: + intp_t i, j + intp_t n_chunk_samples = Y_end - Y_start + + # Upcasting Y_c=Y[Y_start:Y_end, :] from float32 to float64 + for i in range(n_chunk_samples): + for j in range(self.n_features): + self.Y_c_upcast[thread_num][i * self.n_features + j] = self.Y[Y_start + i, j] +{{else}} + return +{{endif}} + + cdef float64_t * _compute_dist_middle_terms( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil: + cdef: + float64_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num].data() + + # Careful: LDA, LDB and LDC are given for F-ordered arrays + # in BLAS documentations, for instance: + # 
https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html #noqa + # + # Here, we use their counterpart values to work with C-ordered arrays. + BLAS_Order order = RowMajor + BLAS_Trans ta = NoTrans + BLAS_Trans tb = Trans + intp_t m = X_end - X_start + intp_t n = Y_end - Y_start + intp_t K = self.n_features + float64_t alpha = - 2. +{{if upcast_to_float64}} + float64_t * A = self.X_c_upcast[thread_num].data() + float64_t * B = self.Y_c_upcast[thread_num].data() +{{else}} + # Casting for A and B to remove the const is needed because APIs exposed via + # scipy.linalg.cython_blas aren't reflecting the arguments' const qualifier. + # See: https://github.com/scipy/scipy/issues/14262 + float64_t * A = &self.X[X_start, 0] + float64_t * B = &self.Y[Y_start, 0] +{{endif}} + intp_t lda = self.n_features + intp_t ldb = self.n_features + float64_t beta = 0. + intp_t ldc = Y_end - Y_start + + # dist_middle_terms = `-2 * X[X_start:X_end] @ Y[Y_start:Y_end].T` + _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, dist_middle_terms, ldc) + + return dist_middle_terms + + +cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}): + """Middle term of the Euclidean distance between two chunked CSR matrices. + + The result is return as a contiguous array. + + dist_middle_terms = - 2 X_c_i.Y_c_j^T + + The logic of the computation is wrapped in the routine _middle_term_sparse_sparse_64. + This routine iterates over the data, indices and indptr arrays of the sparse matrices without + densifying them. + """ + + def __init__( + self, + X, + Y, + intp_t effective_n_threads, + intp_t chunks_n_threads, + intp_t dist_middle_terms_chunks_size, + intp_t n_features, + intp_t chunk_size, + ): + super().__init__( + effective_n_threads, + chunks_n_threads, + dist_middle_terms_chunks_size, + n_features, + chunk_size, + ) + self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) + self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y) + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil: + # Flush the thread dist_middle_terms_chunks to 0.0 + fill( + self.dist_middle_terms_chunks[thread_num].begin(), + self.dist_middle_terms_chunks[thread_num].end(), + 0.0, + ) + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil: + # Flush the thread dist_middle_terms_chunks to 0.0 + fill( + self.dist_middle_terms_chunks[thread_num].begin(), + self.dist_middle_terms_chunks[thread_num].end(), + 0.0, + ) + + cdef float64_t * _compute_dist_middle_terms( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil: + cdef: + float64_t *dist_middle_terms = ( + self.dist_middle_terms_chunks[thread_num].data() + ) + + _middle_term_sparse_sparse_64( + self.X_data, + self.X_indices, + self.X_indptr, + X_start, + X_end, + self.Y_data, + self.Y_indices, + self.Y_indptr, + Y_start, + Y_end, + dist_middle_terms, + ) + + return dist_middle_terms + +cdef class SparseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}): + """Middle term of the Euclidean distance between chunks of a CSR matrix and a np.ndarray. 
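[Editor's note] The `_gemm` call above maps onto standard BLAS GEMM; the same `-2 * X_c @ Y_c.T` block can be sketched with SciPy's BLAS wrapper to illustrate the alpha/transpose choices. This uses `scipy.linalg.blas.dgemm`, not the project's Cython `_gemm` binding.

import numpy as np
from scipy.linalg.blas import dgemm

rng = np.random.default_rng(0)
A = rng.standard_normal((5, 3))   # X chunk (m x K), C-contiguous
B = rng.standard_normal((7, 3))   # Y chunk (n x K), C-contiguous

# alpha = -2 with B transposed: the middle term of the squared Euclidean distance.
middle = dgemm(alpha=-2.0, a=A, b=B, trans_b=True)
assert np.allclose(middle, -2.0 * A @ B.T)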
+ + The logic of the computation is wrapped in the routine _middle_term_sparse_dense_{{name_suffix}}. + This routine iterates over the data, indices and indptr arrays of the sparse matrices + without densifying them. + """ + + def __init__( + self, + X, + Y, + intp_t effective_n_threads, + intp_t chunks_n_threads, + intp_t dist_middle_terms_chunks_size, + intp_t n_features, + intp_t chunk_size, + bint c_ordered_middle_term, + ): + super().__init__( + effective_n_threads, + chunks_n_threads, + dist_middle_terms_chunks_size, + n_features, + chunk_size, + ) + self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) + self.Y = Y + self.c_ordered_middle_term = c_ordered_middle_term + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil: + # Fill the thread's dist_middle_terms_chunks with 0.0 before + # computing its elements in _compute_dist_middle_terms. + fill( + self.dist_middle_terms_chunks[thread_num].begin(), + self.dist_middle_terms_chunks[thread_num].end(), + 0.0, + ) + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil: + # Fill the thread's dist_middle_terms_chunks with 0.0 before + # computing its elements in _compute_dist_middle_terms. + fill( + self.dist_middle_terms_chunks[thread_num].begin(), + self.dist_middle_terms_chunks[thread_num].end(), + 0.0, + ) + + cdef float64_t * _compute_dist_middle_terms( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil: + cdef: + float64_t *dist_middle_terms = ( + self.dist_middle_terms_chunks[thread_num].data() + ) + + # For the dense-sparse case, we use the sparse-dense case + # with dist_middle_terms seen as F-ordered. + # Hence we swap indices pointers here. + if not self.c_ordered_middle_term: + X_start, Y_start = Y_start, X_start + X_end, Y_end = Y_end, X_end + + _middle_term_sparse_dense_{{name_suffix}}( + self.X_data, + self.X_indices, + self.X_indptr, + X_start, + X_end, + self.Y, + Y_start, + Y_end, + self.c_ordered_middle_term, + dist_middle_terms, + ) + + return dist_middle_terms + +{{endfor}} diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd.tp b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd.tp new file mode 100644 index 0000000000000000000000000000000000000000..809a80a68c5b0c6a513b8b9267fe211f109cdaee --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd.tp @@ -0,0 +1,90 @@ +cimport numpy as cnp + +from libcpp.memory cimport shared_ptr +from libcpp.vector cimport vector +from cython cimport final + +from ...utils._typedefs cimport intp_t, float64_t + +cnp.import_array() + +###################### +## std::vector to np.ndarray coercion +# As type covariance is not supported for C++ containers via Cython, +# we need to redefine fused types. 
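[Editor's note] Returning briefly to `SparseDenseMiddleTermComputer` defined earlier: a NumPy/SciPy sketch of its operand-swap trick for the dense-sparse case, under the assumption that reading the swapped C-ordered result as F-ordered is equivalent to transposing it.

import numpy as np
from scipy.sparse import random as sparse_random

rng = np.random.default_rng(0)
X_dense = rng.standard_normal((4, 8))                                    # dense X
Y_csr = sparse_random(6, 8, density=0.4, format="csr", random_state=0)   # sparse Y

# Compute the sparse-dense middle term with the operands swapped ...
swapped = -2.0 * (Y_csr @ X_dense.T)          # C-ordered (n_Y, n_X) block
# ... then read it back as the (n_X, n_Y) dense-sparse middle term.
middle = np.asarray(swapped).T
assert np.allclose(middle, -2.0 * (X_dense @ Y_csr.toarray().T))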
+ctypedef fused vector_double_intp_t: + vector[intp_t] + vector[float64_t] + + +ctypedef fused vector_vector_double_intp_t: + vector[vector[intp_t]] + vector[vector[float64_t]] + +cdef cnp.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays( + shared_ptr[vector_vector_double_intp_t] vecs +) + +##################### +{{for name_suffix in ['64', '32']}} + +from ._base cimport BaseDistancesReduction{{name_suffix}} +from ._middle_term_computer cimport MiddleTermComputer{{name_suffix}} + +cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): + """float{{name_suffix}} implementation of the RadiusNeighbors.""" + + cdef: + float64_t radius + + # DistanceMetric{{name_suffix}} compute rank-preserving surrogate distance via rdist + # which are proxies necessitating less computations. + # We get the equivalent for the radius to be able to compare it against + # vectors' rank-preserving surrogate distances. + float64_t r_radius + + # Neighbors indices and distances are returned as np.ndarrays of np.ndarrays. + # + # For this implementation, we want resizable buffers which we will wrap + # into numpy arrays at the end. std::vector comes as a handy container + # for interacting efficiently with resizable buffers. + # + # Though it is possible to access their buffer address with + # std::vector::data, they can't be stolen: buffers lifetime + # is tied to their std::vector and are deallocated when + # std::vectors are. + # + # To solve this, we dynamically allocate std::vectors and then + # encapsulate them in a StdVectorSentinel responsible for + # freeing them when the associated np.ndarray is freed. + # + # Shared pointers (defined via shared_ptr) are use for safer memory management. + # Unique pointers (defined via unique_ptr) can't be used as datastructures + # are shared across threads for parallel_on_X; see _parallel_on_X_init_chunk. + shared_ptr[vector[vector[intp_t]]] neigh_indices + shared_ptr[vector[vector[float64_t]]] neigh_distances + + # Used as array of pointers to private datastructures used in threads. 
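[Editor's note] On the `r_radius` attribute declared above: for the Euclidean metric the rank-preserving surrogate distance is the squared distance, so comparing surrogate distances against a squared radius selects the same neighbours without any square roots. A small standalone numeric illustration, not sklearn code:

import numpy as np

radius = 2.0
r_radius = radius ** 2                 # surrogate radius for the Euclidean metric

x = np.array([0.0, 0.0])
y = np.array([1.0, 1.5])
surrogate = ((x - y) ** 2).sum()       # squared distance: cheaper, rank-preserving
assert (surrogate <= r_radius) == (np.sqrt(surrogate) <= radius)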
+ vector[shared_ptr[vector[vector[intp_t]]]] neigh_indices_chunks + vector[shared_ptr[vector[vector[float64_t]]]] neigh_distances_chunks + + bint sort_results + + @final + cdef void _merge_vectors( + self, + intp_t idx, + intp_t num_threads, + ) noexcept nogil + + +cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}}): + """EuclideanDistance-specialisation of RadiusNeighbors{{name_suffix}}.""" + cdef: + MiddleTermComputer{{name_suffix}} middle_term_computer + const float64_t[::1] X_norm_squared + const float64_t[::1] Y_norm_squared + + bint use_squared_distances + +{{endfor}} diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp new file mode 100644 index 0000000000000000000000000000000000000000..d0567f2ead804d122fc24424a5d502084b6565e0 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp @@ -0,0 +1,514 @@ +cimport numpy as cnp +import numpy as np +import warnings + +from libcpp.memory cimport shared_ptr, make_shared +from libcpp.vector cimport vector +from libcpp.algorithm cimport move +from cython cimport final +from cython.operator cimport dereference as deref +from cython.parallel cimport parallel, prange + +from ...utils._sorting cimport simultaneous_sort +from ...utils._typedefs cimport intp_t, float64_t +from ...utils._vector_sentinel cimport vector_to_nd_array + +from numbers import Real +from scipy.sparse import issparse +from ...utils import check_array, check_scalar +from ...utils.fixes import _in_unstable_openblas_configuration +from ...utils.parallel import _get_threadpool_controller + +cnp.import_array() + +###################### + +cdef cnp.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays( + shared_ptr[vector_vector_double_intp_t] vecs +): + """Coerce a std::vector of std::vector to a ndarray of ndarray.""" + cdef: + intp_t n = deref(vecs).size() + cnp.ndarray[object, ndim=1] nd_arrays_of_nd_arrays = np.empty(n, dtype=np.ndarray) + + for i in range(n): + nd_arrays_of_nd_arrays[i] = vector_to_nd_array(&(deref(vecs)[i])) + + return nd_arrays_of_nd_arrays + +##################### +{{for name_suffix in ['64', '32']}} + +from ._base cimport ( + BaseDistancesReduction{{name_suffix}}, + _sqeuclidean_row_norms{{name_suffix}} +) + +from ._datasets_pair cimport DatasetsPair{{name_suffix}} + +from ._middle_term_computer cimport MiddleTermComputer{{name_suffix}} + + +cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): + """float{{name_suffix}} implementation of the RadiusNeighbors.""" + + @classmethod + def compute( + cls, + X, + Y, + float64_t radius, + str metric="euclidean", + chunk_size=None, + dict metric_kwargs=None, + str strategy=None, + bint return_distance=False, + bint sort_results=False, + ): + """Compute the radius-neighbors reduction. + + This classmethod is responsible for introspecting the arguments + values to dispatch to the most appropriate implementation of + :class:`RadiusNeighbors{{name_suffix}}`. + + This allows decoupling the API entirely from the implementation details + whilst maintaining RAII: all temporarily allocated datastructures necessary + for the concrete implementation are therefore freed when this classmethod + returns. + + No instance should directly be created outside of this class method. 
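[Editor's note] A hedged usage sketch of the dispatcher of the same name that wraps the `compute` classmethod documented above. The import path is the private `sklearn.metrics._pairwise_distances_reduction` package, so it may change between releases; data values are illustrative.

import numpy as np
from sklearn.metrics._pairwise_distances_reduction import RadiusNeighbors

rng = np.random.default_rng(0)
X = rng.standard_normal((3, 5))
Y = rng.standard_normal((50, 5))

distances, indices = RadiusNeighbors.compute(
    X, Y, radius=3.0, metric="euclidean",
    return_distance=True, sort_results=True,
)
# Ragged result: one variable-length ndarray per query row of X.
print([d.shape for d in distances])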
+ """ + if metric in ("euclidean", "sqeuclidean"): + # Specialized implementation of RadiusNeighbors for the Euclidean + # distance for the dense-dense and sparse-sparse cases. + # This implementation computes the distances by chunk using + # a decomposition of the Squared Euclidean distance. + # This specialisation has an improved arithmetic intensity for both + # the dense and sparse settings, allowing in most case speed-ups of + # several orders of magnitude compared to the generic RadiusNeighbors + # implementation. + # For more information see MiddleTermComputer. + use_squared_distances = metric == "sqeuclidean" + pda = EuclideanRadiusNeighbors{{name_suffix}}( + X=X, Y=Y, radius=radius, + use_squared_distances=use_squared_distances, + chunk_size=chunk_size, + strategy=strategy, + sort_results=sort_results, + metric_kwargs=metric_kwargs, + ) + else: + # Fall back on a generic implementation that handles most scipy + # metrics by computing the distances between 2 vectors at a time. + pda = RadiusNeighbors{{name_suffix}}( + datasets_pair=DatasetsPair{{name_suffix}}.get_for(X, Y, metric, metric_kwargs), + radius=radius, + chunk_size=chunk_size, + strategy=strategy, + sort_results=sort_results, + ) + + # Limit the number of threads in second level of nested parallelism for BLAS + # to avoid threads over-subscription (in GEMM for instance). + with _get_threadpool_controller().limit(limits=1, user_api="blas"): + if pda.execute_in_parallel_on_Y: + pda._parallel_on_Y() + else: + pda._parallel_on_X() + + return pda._finalize_results(return_distance) + + + def __init__( + self, + DatasetsPair{{name_suffix}} datasets_pair, + float64_t radius, + chunk_size=None, + strategy=None, + sort_results=False, + ): + super().__init__( + datasets_pair=datasets_pair, + chunk_size=chunk_size, + strategy=strategy, + ) + + self.radius = check_scalar(radius, "radius", Real, min_val=0) + self.r_radius = self.datasets_pair.distance_metric._dist_to_rdist(radius) + self.sort_results = sort_results + + # Allocating pointers to datastructures but not the datastructures themselves. + # There are as many pointers as effective threads. + # + # For the sake of explicitness: + # - when parallelizing on X, the pointers of those heaps are referencing + # self.neigh_distances and self.neigh_indices + # - when parallelizing on Y, the pointers of those heaps are referencing + # std::vectors of std::vectors which are thread-wise-allocated and whose + # content will be merged into self.neigh_distances and self.neigh_indices. + self.neigh_distances_chunks = vector[shared_ptr[vector[vector[float64_t]]]]( + self.chunks_n_threads + ) + self.neigh_indices_chunks = vector[shared_ptr[vector[vector[intp_t]]]]( + self.chunks_n_threads + ) + + # Temporary datastructures which will be coerced to numpy arrays on before + # RadiusNeighbors.compute "return" and will be then freed. 
+ self.neigh_distances = make_shared[vector[vector[float64_t]]](self.n_samples_X) + self.neigh_indices = make_shared[vector[vector[intp_t]]](self.n_samples_X) + + cdef void _compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil: + cdef: + intp_t i, j + float64_t r_dist_i_j + + for i in range(X_start, X_end): + for j in range(Y_start, Y_end): + r_dist_i_j = self.datasets_pair.surrogate_dist(i, j) + if r_dist_i_j <= self.r_radius: + deref(self.neigh_distances_chunks[thread_num])[i].push_back(r_dist_i_j) + deref(self.neigh_indices_chunks[thread_num])[i].push_back(j) + + def _finalize_results(self, bint return_distance=False): + if return_distance: + # We need to recompute distances because we relied on + # surrogate distances for the reduction. + self.compute_exact_distances() + return ( + coerce_vectors_to_nd_arrays(self.neigh_distances), + coerce_vectors_to_nd_arrays(self.neigh_indices), + ) + + return coerce_vectors_to_nd_arrays(self.neigh_indices) + + cdef void _parallel_on_X_init_chunk( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil: + + # As this strategy is embarrassingly parallel, we can set the + # thread vectors' pointers to the main vectors'. + self.neigh_distances_chunks[thread_num] = self.neigh_distances + self.neigh_indices_chunks[thread_num] = self.neigh_indices + + @final + cdef void _parallel_on_X_prange_iter_finalize( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil: + cdef: + intp_t idx + + # Sorting neighbors for each query vector of X + if self.sort_results: + for idx in range(X_start, X_end): + simultaneous_sort( + deref(self.neigh_distances)[idx].data(), + deref(self.neigh_indices)[idx].data(), + deref(self.neigh_indices)[idx].size() + ) + + cdef void _parallel_on_Y_init( + self, + ) noexcept nogil: + cdef: + intp_t thread_num + # As chunks of X are shared across threads, so must datastructures to avoid race + # conditions: each thread has its own vectors of n_samples_X vectors which are + # then merged back in the main n_samples_X vectors. + for thread_num in range(self.chunks_n_threads): + self.neigh_distances_chunks[thread_num] = make_shared[vector[vector[float64_t]]](self.n_samples_X) + self.neigh_indices_chunks[thread_num] = make_shared[vector[vector[intp_t]]](self.n_samples_X) + + @final + cdef void _merge_vectors( + self, + intp_t idx, + intp_t num_threads, + ) noexcept nogil: + cdef: + intp_t thread_num + intp_t idx_n_elements = 0 + intp_t last_element_idx = deref(self.neigh_indices)[idx].size() + + # Resizing buffers only once for the given number of elements. + for thread_num in range(num_threads): + idx_n_elements += deref(self.neigh_distances_chunks[thread_num])[idx].size() + + deref(self.neigh_distances)[idx].resize(last_element_idx + idx_n_elements) + deref(self.neigh_indices)[idx].resize(last_element_idx + idx_n_elements) + + # Moving the elements by range using the range first element + # as the reference for the insertion. 
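[Editor's note] A plain-Python analogue of what `_merge_vectors` does for a single query index (resize the main buffer once, then append each thread's chunk after the existing elements), offered only to make the pointer/std::move logic above easier to follow; the numbers are made up.

main = [0.2]                              # elements already in the main vector
per_thread = [[0.7, 0.1], [], [0.4]]      # thread-local vectors for this index

last = len(main)                          # insertion offset, as in last_element_idx
main.extend(x for chunk in per_thread for x in chunk)
assert main == [0.2, 0.7, 0.1, 0.4] and last == 1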
+ for thread_num in range(num_threads): + move( + deref(self.neigh_distances_chunks[thread_num])[idx].begin(), + deref(self.neigh_distances_chunks[thread_num])[idx].end(), + deref(self.neigh_distances)[idx].begin() + last_element_idx + ) + move( + deref(self.neigh_indices_chunks[thread_num])[idx].begin(), + deref(self.neigh_indices_chunks[thread_num])[idx].end(), + deref(self.neigh_indices)[idx].begin() + last_element_idx + ) + last_element_idx += deref(self.neigh_distances_chunks[thread_num])[idx].size() + + cdef void _parallel_on_Y_finalize( + self, + ) noexcept nogil: + cdef: + intp_t idx + + with nogil, parallel(num_threads=self.effective_n_threads): + # Merge vectors used in threads into the main ones. + # This is done in parallel sample-wise (no need for locks). + for idx in prange(self.n_samples_X, schedule='static'): + self._merge_vectors(idx, self.chunks_n_threads) + + # The content of the vector have been std::moved. + # Hence they can't be used anymore and can be deleted. + # Their deletion is carried out automatically as the + # implementation relies on shared pointers. + + # Sort in parallel in ascending order w.r.t the distances if requested. + if self.sort_results: + for idx in prange(self.n_samples_X, schedule='static'): + simultaneous_sort( + deref(self.neigh_distances)[idx].data(), + deref(self.neigh_indices)[idx].data(), + deref(self.neigh_indices)[idx].size() + ) + + return + + cdef void compute_exact_distances(self) noexcept nogil: + """Convert rank-preserving distances to pairwise distances in parallel.""" + cdef: + intp_t i + vector[intp_t].size_type j + + for i in prange(self.n_samples_X, nogil=True, schedule='static', + num_threads=self.effective_n_threads): + for j in range(deref(self.neigh_indices)[i].size()): + deref(self.neigh_distances)[i][j] = ( + self.datasets_pair.distance_metric._rdist_to_dist( + # Guard against potential -0., causing nan production. + max(deref(self.neigh_distances)[i][j], 0.) 
+ ) + ) + + +cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}}): + """EuclideanDistance-specialisation of RadiusNeighbors{{name_suffix}}.""" + + @classmethod + def is_usable_for(cls, X, Y, metric) -> bool: + return (RadiusNeighbors{{name_suffix}}.is_usable_for(X, Y, metric) + and not _in_unstable_openblas_configuration()) + + def __init__( + self, + X, + Y, + float64_t radius, + bint use_squared_distances=False, + chunk_size=None, + strategy=None, + sort_results=False, + metric_kwargs=None, + ): + if ( + isinstance(metric_kwargs, dict) and + (metric_kwargs.keys() - {"X_norm_squared", "Y_norm_squared"}) + ): + warnings.warn( + f"Some metric_kwargs have been passed ({metric_kwargs}) but aren't " + f"usable for this case (EuclideanRadiusNeighbors64) and will be ignored.", + UserWarning, + stacklevel=3, + ) + + super().__init__( + # The datasets pair here is used for exact distances computations + datasets_pair=DatasetsPair{{name_suffix}}.get_for(X, Y, metric="euclidean"), + radius=radius, + chunk_size=chunk_size, + strategy=strategy, + sort_results=sort_results, + ) + cdef: + intp_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk + + self.middle_term_computer = MiddleTermComputer{{name_suffix}}.get_for( + X, + Y, + self.effective_n_threads, + self.chunks_n_threads, + dist_middle_terms_chunks_size, + n_features=X.shape[1], + chunk_size=self.chunk_size, + ) + + if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs: + self.Y_norm_squared = check_array( + metric_kwargs.pop("Y_norm_squared"), + ensure_2d=False, + input_name="Y_norm_squared", + dtype=np.float64, + ) + else: + self.Y_norm_squared = _sqeuclidean_row_norms{{name_suffix}}( + Y, + self.effective_n_threads, + ) + + if metric_kwargs is not None and "X_norm_squared" in metric_kwargs: + self.X_norm_squared = check_array( + metric_kwargs.pop("X_norm_squared"), + ensure_2d=False, + input_name="X_norm_squared", + dtype=np.float64, + ) + else: + # Do not recompute norms if datasets are identical. + self.X_norm_squared = ( + self.Y_norm_squared if X is Y else + _sqeuclidean_row_norms{{name_suffix}}( + X, + self.effective_n_threads, + ) + ) + + self.use_squared_distances = use_squared_distances + + if use_squared_distances: + # In this specialisation and this setup, the value passed to the radius is + # already considered to be the adapted radius, so we overwrite it. 
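[Editor's note] Following the comment above, with `metric="sqeuclidean"` the radius is interpreted as an already-squared radius, so the two calls below should select the same neighbour sets. This uses the private dispatcher under the same import-path assumption as earlier.

import numpy as np
from sklearn.metrics._pairwise_distances_reduction import RadiusNeighbors

rng = np.random.default_rng(0)
X = rng.standard_normal((4, 3))
Y = rng.standard_normal((30, 3))

ind_euc = RadiusNeighbors.compute(X, Y, radius=2.0, metric="euclidean")
ind_sq = RadiusNeighbors.compute(X, Y, radius=4.0, metric="sqeuclidean")  # 4.0 == 2.0 ** 2
assert all(np.array_equal(np.sort(a), np.sort(b)) for a, b in zip(ind_euc, ind_sq))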
+ self.r_radius = radius + + @final + cdef void _parallel_on_X_parallel_init( + self, + intp_t thread_num, + ) noexcept nogil: + RadiusNeighbors{{name_suffix}}._parallel_on_X_parallel_init(self, thread_num) + self.middle_term_computer._parallel_on_X_parallel_init(thread_num) + + @final + cdef void _parallel_on_X_init_chunk( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil: + RadiusNeighbors{{name_suffix}}._parallel_on_X_init_chunk(self, thread_num, X_start, X_end) + self.middle_term_computer._parallel_on_X_init_chunk(thread_num, X_start, X_end) + + @final + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil: + RadiusNeighbors{{name_suffix}}._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + self.middle_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + X_start, X_end, Y_start, Y_end, thread_num, + ) + + @final + cdef void _parallel_on_Y_init( + self, + ) noexcept nogil: + RadiusNeighbors{{name_suffix}}._parallel_on_Y_init(self) + self.middle_term_computer._parallel_on_Y_init() + + @final + cdef void _parallel_on_Y_parallel_init( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil: + RadiusNeighbors{{name_suffix}}._parallel_on_Y_parallel_init(self, thread_num, X_start, X_end) + self.middle_term_computer._parallel_on_Y_parallel_init(thread_num, X_start, X_end) + + @final + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil: + RadiusNeighbors{{name_suffix}}._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + self.middle_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + X_start, X_end, Y_start, Y_end, thread_num + ) + + @final + cdef void compute_exact_distances(self) noexcept nogil: + if not self.use_squared_distances: + RadiusNeighbors{{name_suffix}}.compute_exact_distances(self) + + @final + cdef void _compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil: + cdef: + intp_t i, j + float64_t sqeuclidean_dist_i_j + intp_t n_X = X_end - X_start + intp_t n_Y = Y_end - Y_start + float64_t *dist_middle_terms = self.middle_term_computer._compute_dist_middle_terms( + X_start, X_end, Y_start, Y_end, thread_num + ) + + # Pushing the distance and their associated indices in vectors. + for i in range(n_X): + for j in range(n_Y): + sqeuclidean_dist_i_j = ( + self.X_norm_squared[i + X_start] + + dist_middle_terms[i * n_Y + j] + + self.Y_norm_squared[j + Y_start] + ) + + # Catastrophic cancellation might cause -0. to be present, + # e.g. when computing d(x_i, y_i) when X is Y. 
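[Editor's note] On the `-0.`/cancellation guard above: a slightly negative squared distance would turn into nan once exact distances are recovered with a square root, which is why the value is clamped. A tiny standalone illustration:

import numpy as np

sq = -1e-17                       # the kind of residue cancellation can leave behind
print(np.sqrt(sq))                # nan (and a RuntimeWarning)
print(np.sqrt(max(sq, 0.0)))      # 0.0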
+ sqeuclidean_dist_i_j = max(0., sqeuclidean_dist_i_j) + + if sqeuclidean_dist_i_j <= self.r_radius: + deref(self.neigh_distances_chunks[thread_num])[i + X_start].push_back(sqeuclidean_dist_i_j) + deref(self.neigh_indices_chunks[thread_num])[i + X_start].push_back(j + Y_start) + +{{endfor}} diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors_classmode.pyx.tp b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors_classmode.pyx.tp new file mode 100644 index 0000000000000000000000000000000000000000..0a9b22251843e60e52fa6d29248f3a745b37e414 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors_classmode.pyx.tp @@ -0,0 +1,217 @@ +import warnings + +from cython cimport floating, final, integral +from cython.operator cimport dereference as deref +from cython.parallel cimport parallel, prange +from ._classmode cimport WeightingStrategy +from ...utils._typedefs cimport intp_t, float64_t, uint8_t + +import numpy as np +from scipy.sparse import issparse +from ...utils.parallel import _get_threadpool_controller + + +{{for name_suffix in ["32", "64"]}} +from ._radius_neighbors cimport RadiusNeighbors{{name_suffix}} +from ._datasets_pair cimport DatasetsPair{{name_suffix}} + +cdef class RadiusNeighborsClassMode{{name_suffix}}(RadiusNeighbors{{name_suffix}}): + """ + {{name_suffix}}bit implementation of RadiusNeighborsClassMode. + """ + cdef: + const intp_t[::1] Y_labels + const intp_t[::1] unique_Y_labels + intp_t outlier_label_index + bint outlier_label_exists + bint outliers_exist + uint8_t[::1] outliers + object outlier_label + float64_t[:, ::1] class_scores + WeightingStrategy weight_type + + @classmethod + def compute( + cls, + X, + Y, + float64_t radius, + weights, + Y_labels, + unique_Y_labels, + outlier_label=None, + str metric="euclidean", + chunk_size=None, + dict metric_kwargs=None, + str strategy=None, + ): + # Use a generic implementation that handles most scipy + # metrics by computing the distances between 2 vectors at a time. + pda = RadiusNeighborsClassMode{{name_suffix}}( + datasets_pair=DatasetsPair{{name_suffix}}.get_for(X, Y, metric, metric_kwargs), + radius=radius, + chunk_size=chunk_size, + strategy=strategy, + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + outlier_label=outlier_label, + ) + + # Limit the number of threads in second level of nested parallelism for BLAS + # to avoid threads over-subscription (in GEMM for instance). 
+ with _get_threadpool_controller().limit(limits=1, user_api="blas"): + if pda.execute_in_parallel_on_Y: + pda._parallel_on_Y() + else: + pda._parallel_on_X() + + return pda._finalize_results() + + def __init__( + self, + DatasetsPair{{name_suffix}} datasets_pair, + const intp_t[::1] Y_labels, + const intp_t[::1] unique_Y_labels, + float64_t radius, + chunk_size=None, + strategy=None, + weights=None, + outlier_label=None, + ): + super().__init__( + datasets_pair=datasets_pair, + chunk_size=chunk_size, + strategy=strategy, + radius=radius, + ) + + if weights == "uniform": + self.weight_type = WeightingStrategy.uniform + elif weights == "distance": + self.weight_type = WeightingStrategy.distance + else: + self.weight_type = WeightingStrategy.callable + + self.Y_labels = Y_labels + self.unique_Y_labels = unique_Y_labels + self.outlier_label_index = -1 + self.outliers_exist = False + self.outlier_label = outlier_label + self.outliers = np.zeros(self.n_samples_X, dtype=np.bool_) + + cdef intp_t idx + if self.outlier_label is not None: + for idx in range(self.unique_Y_labels.shape[0]): + if self.unique_Y_labels[idx] == outlier_label: + self.outlier_label_index = idx + + # Map from set of unique labels to their indices in `class_scores` + # Buffer used in building a histogram for one-pass weighted mode + self.class_scores = np.zeros( + (self.n_samples_X, unique_Y_labels.shape[0]), dtype=np.float64, + ) + + + cdef inline void weighted_histogram_mode( + self, + intp_t sample_index, + intp_t sample_n_neighbors, + intp_t* indices, + float64_t* distances, + ) noexcept nogil: + cdef: + intp_t neighbor_idx, neighbor_class_idx, label_index + float64_t score_incr = 1 + bint use_distance_weighting = ( + self.weight_type == WeightingStrategy.distance + ) + + if sample_n_neighbors == 0: + self.outliers_exist = True + self.outliers[sample_index] = True + if self.outlier_label_index >= 0: + self.class_scores[sample_index][self.outlier_label_index] = score_incr + + return + + # Iterate over the neighbors. This can be different for + # each of the samples as they are based on the radius. + for neighbor_rank in range(sample_n_neighbors): + if use_distance_weighting: + score_incr = 1 / distances[neighbor_rank] + + neighbor_idx = indices[neighbor_rank] + neighbor_class_idx = self.Y_labels[neighbor_idx] + self.class_scores[sample_index][neighbor_class_idx] += score_incr + + return + + @final + cdef void _parallel_on_X_prange_iter_finalize( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil: + cdef: + intp_t idx + + for idx in range(X_start, X_end): + self.weighted_histogram_mode( + sample_index=idx, + sample_n_neighbors=deref(self.neigh_indices)[idx].size(), + indices=deref(self.neigh_indices)[idx].data(), + distances=deref(self.neigh_distances)[idx].data(), + ) + + return + + @final + cdef void _parallel_on_Y_finalize( + self, + ) noexcept nogil: + cdef: + intp_t idx + + with nogil, parallel(num_threads=self.effective_n_threads): + # Merge vectors used in threads into the main ones. + # This is done in parallel sample-wise (no need for locks). 
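[Editor's note] A NumPy analogue of `weighted_histogram_mode` for a single query point, accumulating one score per class with inverse-distance weights; the neighbour labels and distances below are toy values, purely illustrative.

import numpy as np

neighbor_labels = np.array([0, 2, 2, 1])          # class index of each neighbour
neighbor_dists = np.array([2.0, 0.5, 1.0, 4.0])   # distances within the radius
n_classes = 3

scores = np.zeros(n_classes)
np.add.at(scores, neighbor_labels, 1.0 / neighbor_dists)   # weights="distance"
probabilities = scores / scores.sum()
print(probabilities)   # class 2 dominates thanks to its two close neighbours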
+ for idx in prange(self.n_samples_X, schedule='static'): + self._merge_vectors(idx, self.chunks_n_threads) + + for idx in prange(self.n_samples_X, schedule='static'): + self.weighted_histogram_mode( + sample_index=idx, + sample_n_neighbors=deref(self.neigh_indices)[idx].size(), + indices=deref(self.neigh_indices)[idx].data(), + distances=deref(self.neigh_distances)[idx].data(), + ) + + return + + def _finalize_results(self): + if self.outliers_exist and self.outlier_label is None: + raise ValueError( + "No neighbors found for test samples %r, " + "you can try using larger radius, " + "giving a label for outliers, " + "or considering removing them from your dataset." + % np.where(self.outliers)[0] + ) + + if self.outliers_exist and self.outlier_label_index < 0: + warnings.warn( + "Outlier label %s is not in training " + "classes. All class probabilities of " + "outliers will be assigned with 0." + % self.outlier_label + ) + + probabilities = np.asarray(self.class_scores) + normalizer = probabilities.sum(axis=1, keepdims=True) + normalizer[normalizer == 0.0] = 1.0 + probabilities /= normalizer + return probabilities + +{{endfor}} diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/meson.build b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..0f7eaa286399c5319d16d2c413d7be1957a10d74 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/meson.build @@ -0,0 +1,193 @@ +# Note: the dependencies between different Cython files in +# _pairwise_distances_reduction is probably one of the most involved in +# scikit-learn. If you change this file make sure you build from scratch: +# rm -rf build; make dev-meson +# run a command like this: +# ninja -C build/cp312 -t missingdeps +# and make sure that the output is something like: +# No missing dependencies on generated files found. + +# _pairwise_distances_reduction is cimported from other subpackages so this is +# needed for the cimport to work +_pairwise_distances_reduction_cython_tree = [ + fs.copyfile('__init__.py'), + # We are in a sub-module of metrics, so we always need to have + # sklearn/metrics/__init__.py copied to the build directory to avoid the + # error: + # relative cimport beyond main package is not allowed + metrics_cython_tree +] + +_classmode_pxd = fs.copyfile('_classmode.pxd') + +_datasets_pair_pxd = custom_target( + '_datasets_pair_pxd', + output: '_datasets_pair.pxd', + input: '_datasets_pair.pxd.tp', + command: [tempita, '@INPUT@', '-o', '@OUTDIR@'] +) +_datasets_pair_pyx = custom_target( + '_datasets_pair_pyx', + output: '_datasets_pair.pyx', + input: '_datasets_pair.pyx.tp', + command: [tempita, '@INPUT@', '-o', '@OUTDIR@'], + # TODO in principle this should go in py.exension_module below. This is + # temporary work-around for dependency issue with .pyx.tp files. 
For more + # details, see https://github.com/mesonbuild/meson/issues/13212 + depends: [_datasets_pair_pxd, _pairwise_distances_reduction_cython_tree, utils_cython_tree], +) +_datasets_pair = py.extension_module( + '_datasets_pair', + cython_gen_cpp.process(_datasets_pair_pyx), + dependencies: [np_dep], + subdir: 'sklearn/metrics/_pairwise_distances_reduction', + install: true +) + +_base_pxd = custom_target( + '_base_pxd', + output: '_base.pxd', + input: '_base.pxd.tp', + command: [tempita, '@INPUT@', '-o', '@OUTDIR@'] +) +_base_pyx = custom_target( + '_base_pyx', + output: '_base.pyx', + input: '_base.pyx.tp', + command: [tempita, '@INPUT@', '-o', '@OUTDIR@'], + # TODO in principle this should go in py.exension_module below. This is + # temporary work-around for dependency issue with .pyx.tp files. For more + # details, see https://github.com/mesonbuild/meson/issues/13212 + depends: [_base_pxd, _pairwise_distances_reduction_cython_tree, + _datasets_pair_pxd, utils_cython_tree], +) +_base = py.extension_module( + '_base', + cython_gen_cpp.process(_base_pyx), + dependencies: [np_dep, openmp_dep], + subdir: 'sklearn/metrics/_pairwise_distances_reduction', + install: true +) + +_middle_term_computer_pxd = custom_target( + '_middle_term_computer_pxd', + output: '_middle_term_computer.pxd', + input: '_middle_term_computer.pxd.tp', + command: [tempita, '@INPUT@', '-o', '@OUTDIR@'] +) +_middle_term_computer_pyx = custom_target( + '_middle_term_computer_pyx', + output: '_middle_term_computer.pyx', + input: '_middle_term_computer.pyx.tp', + command: [tempita, '@INPUT@', '-o', '@OUTDIR@'], + # TODO in principle this should go in py.exension_module below. This is + # temporary work-around for dependency issue with .pyx.tp files. For more + # details, see https://github.com/mesonbuild/meson/issues/13212 + depends: [_middle_term_computer_pxd, + _pairwise_distances_reduction_cython_tree, + utils_cython_tree], +) +_middle_term_computer = py.extension_module( + '_middle_term_computer', + cython_gen_cpp.process(_middle_term_computer_pyx), + dependencies: [np_dep], + subdir: 'sklearn/metrics/_pairwise_distances_reduction', + install: true +) + +_argkmin_pxd = custom_target( + '_argkmin_pxd', + output: '_argkmin.pxd', + input: '_argkmin.pxd.tp', + command: [tempita, '@INPUT@', '-o', '@OUTDIR@'] + ) +_argkmin_pyx = custom_target( + '_argkmin_pyx', + output: '_argkmin.pyx', + input: '_argkmin.pyx.tp', + command: [tempita, '@INPUT@', '-o', '@OUTDIR@'], + # TODO in principle this should go in py.exension_module below. This is + # temporary work-around for dependency issue with .pyx.tp files. For more + # details, see https://github.com/mesonbuild/meson/issues/13212 + depends: [_argkmin_pxd, + _pairwise_distances_reduction_cython_tree, + _datasets_pair_pxd, _base_pxd, _middle_term_computer_pxd], + ) +_argkmin = py.extension_module( + '_argkmin', + cython_gen_cpp.process(_argkmin_pyx), + dependencies: [np_dep, openmp_dep], + subdir: 'sklearn/metrics/_pairwise_distances_reduction', + install: true +) + +_radius_neighbors_pxd = custom_target( + '_radius_neighbors_pxd', + output: '_radius_neighbors.pxd', + input: '_radius_neighbors.pxd.tp', + command: [tempita, '@INPUT@', '-o', '@OUTDIR@'] + ) +_radius_neighbors_pyx = custom_target( + '_radius_neighbors_pyx', + output: '_radius_neighbors.pyx', + input: '_radius_neighbors.pyx.tp', + command: [tempita, '@INPUT@', '-o', '@OUTDIR@'], + # TODO in principle this should go in py.exension_module below. 
This is + # temporary work-around for dependency issue with .pyx.tp files. For more + # details, see https://github.com/mesonbuild/meson/issues/13212 + depends: [_radius_neighbors_pxd, + _datasets_pair_pxd, _base_pxd, _middle_term_computer_pxd, + _pairwise_distances_reduction_cython_tree, utils_cython_tree], +) +_radius_neighbors = py.extension_module( + '_radius_neighbors', + cython_gen_cpp.process(_radius_neighbors_pyx), + dependencies: [np_dep, openmp_dep], + subdir: 'sklearn/metrics/_pairwise_distances_reduction', + install: true +) + +_argkmin_classmode_pyx = custom_target( + '_argkmin_classmode_pyx', + output: '_argkmin_classmode.pyx', + input: '_argkmin_classmode.pyx.tp', + command: [tempita, '@INPUT@', '-o', '@OUTDIR@'], + # TODO in principle this should go in py.exension_module below. This is + # temporary work-around for dependency issue with .pyx.tp files. For more + # details, see https://github.com/mesonbuild/meson/issues/13212 + depends: [_classmode_pxd, + _argkmin_pxd, _pairwise_distances_reduction_cython_tree, + _datasets_pair_pxd, _base_pxd, _middle_term_computer_pxd, utils_cython_tree], +) +_argkmin_classmode = py.extension_module( + '_argkmin_classmode', + cython_gen_cpp.process(_argkmin_classmode_pyx), + dependencies: [np_dep, openmp_dep], + # XXX: for some reason -fno-sized-deallocation is needed otherwise there is + # an error with undefined symbol _ZdlPv at import time in manylinux wheels. + # See https://github.com/scikit-learn/scikit-learn/issues/28596 for more details. + cpp_args: ['-fno-sized-deallocation'], + subdir: 'sklearn/metrics/_pairwise_distances_reduction', + install: true +) + +_radius_neighbors_classmode_pyx = custom_target( + '_radius_neighbors_classmode_pyx', + output: '_radius_neighbors_classmode.pyx', + input: '_radius_neighbors_classmode.pyx.tp', + command: [tempita, '@INPUT@', '-o', '@OUTDIR@'], + # TODO in principle this should go in py.exension_module below. This is + # temporary work-around for dependency issue with .pyx.tp files. 
For more + # details, see https://github.com/mesonbuild/meson/issues/13212 + depends: [_classmode_pxd, + _middle_term_computer_pxd, _radius_neighbors_pxd, + _pairwise_distances_reduction_cython_tree, + _datasets_pair_pxd, _base_pxd, utils_cython_tree], +) +_radius_neighbors_classmode = py.extension_module( + '_radius_neighbors_classmode', + cython_gen_cpp.process(_radius_neighbors_classmode_pyx), + dependencies: [np_dep, openmp_dep], + subdir: 'sklearn/metrics/_pairwise_distances_reduction', + install: true +) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_fast.pyx b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_fast.pyx new file mode 100644 index 0000000000000000000000000000000000000000..bf4ded09b2610eef7949cd56e5270b77cb2ce4db --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_fast.pyx @@ -0,0 +1,107 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from cython cimport floating +from cython.parallel cimport prange +from libc.math cimport fabs + +from ..utils._typedefs cimport intp_t + +from ..utils._openmp_helpers import _openmp_effective_n_threads + + +def _chi2_kernel_fast(floating[:, :] X, + floating[:, :] Y, + floating[:, :] result): + cdef intp_t i, j, k + cdef intp_t n_samples_X = X.shape[0] + cdef intp_t n_samples_Y = Y.shape[0] + cdef intp_t n_features = X.shape[1] + cdef double res, nom, denom + + with nogil: + for i in range(n_samples_X): + for j in range(n_samples_Y): + res = 0 + for k in range(n_features): + denom = (X[i, k] - Y[j, k]) + nom = (X[i, k] + Y[j, k]) + if nom != 0: + res += denom * denom / nom + result[i, j] = -res + + +def _sparse_manhattan( + const floating[::1] X_data, + const int[:] X_indices, + const int[:] X_indptr, + const floating[::1] Y_data, + const int[:] Y_indices, + const int[:] Y_indptr, + double[:, ::1] D, +): + """Pairwise L1 distances for CSR matrices. + + Usage: + >>> D = np.zeros(X.shape[0], Y.shape[0]) + >>> _sparse_manhattan(X.data, X.indices, X.indptr, + ... Y.data, Y.indices, Y.indptr, + ... D) + """ + cdef intp_t px, py, i, j, ix, iy + cdef double d = 0.0 + + cdef int m = D.shape[0] + cdef int n = D.shape[1] + + cdef int X_indptr_end = 0 + cdef int Y_indptr_end = 0 + + cdef int num_threads = _openmp_effective_n_threads() + + # We scan the matrices row by row. + # Given row px in X and row py in Y, we find the positions (i and j + # respectively), in .indices where the indices for the two rows start. + # If the indices (ix and iy) are the same, the corresponding data values + # are processed and the cursors i and j are advanced. + # If not, the lowest index is considered. Its associated data value is + # processed and its cursor is advanced. + # We proceed like this until one of the cursors hits the end for its row. + # Then we process all remaining data values in the other row. + + # Below the avoidance of inplace operators is intentional. + # When prange is used, the inplace operator has a special meaning, i.e. 
it + # signals a "reduction" + + for px in prange(m, nogil=True, num_threads=num_threads): + X_indptr_end = X_indptr[px + 1] + for py in range(n): + Y_indptr_end = Y_indptr[py + 1] + i = X_indptr[px] + j = Y_indptr[py] + d = 0.0 + while i < X_indptr_end and j < Y_indptr_end: + ix = X_indices[i] + iy = Y_indices[j] + + if ix == iy: + d = d + fabs(X_data[i] - Y_data[j]) + i = i + 1 + j = j + 1 + elif ix < iy: + d = d + fabs(X_data[i]) + i = i + 1 + else: + d = d + fabs(Y_data[j]) + j = j + 1 + + if i == X_indptr_end: + while j < Y_indptr_end: + d = d + fabs(Y_data[j]) + j = j + 1 + else: + while i < X_indptr_end: + d = d + fabs(X_data[i]) + i = i + 1 + + D[px, py] = d diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..67dd18fb94b593f0a3125c1f5833f3b9597614ba --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/__init__.py @@ -0,0 +1,2 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/__pycache__/__init__.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cff9ae99268b96a9d6d332ac2377c325ba7a0ddd Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/__pycache__/__init__.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/__pycache__/confusion_matrix.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/__pycache__/confusion_matrix.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..34f825229da0c7a280e0d47b5d99f64c0273e3f9 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/__pycache__/confusion_matrix.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/__pycache__/det_curve.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/__pycache__/det_curve.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ad7fc06bba7a8a7059de53ca08328114ec2e0459 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/__pycache__/det_curve.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/__pycache__/precision_recall_curve.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/__pycache__/precision_recall_curve.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1b9648fc4ff03a95433f835c6cbf564d4dcbba24 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/__pycache__/precision_recall_curve.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/__pycache__/regression.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/__pycache__/regression.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cc63164253445f0e6d8e3d9cfcbbb801f23d9a95 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/__pycache__/regression.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/__pycache__/roc_curve.cpython-312.pyc 
b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/__pycache__/roc_curve.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..119e8cc687ac38eee648c599315604ad0fa82b0c Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/__pycache__/roc_curve.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/confusion_matrix.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/confusion_matrix.py new file mode 100644 index 0000000000000000000000000000000000000000..cee515bebe08e859268c27c5441ce3450434d817 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/confusion_matrix.py @@ -0,0 +1,499 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from itertools import product + +import numpy as np + +from ...base import is_classifier +from ...utils._optional_dependencies import check_matplotlib_support +from ...utils._plotting import _validate_style_kwargs +from ...utils.multiclass import unique_labels +from .. import confusion_matrix + + +class ConfusionMatrixDisplay: + """Confusion Matrix visualization. + + It is recommended to use + :func:`~sklearn.metrics.ConfusionMatrixDisplay.from_estimator` or + :func:`~sklearn.metrics.ConfusionMatrixDisplay.from_predictions` to + create a :class:`ConfusionMatrixDisplay`. All parameters are stored as + attributes. + + For general information regarding `scikit-learn` visualization tools, see + the :ref:`Visualization Guide `. + For guidance on interpreting these plots, refer to the + :ref:`Model Evaluation Guide `. + + Parameters + ---------- + confusion_matrix : ndarray of shape (n_classes, n_classes) + Confusion matrix. + + display_labels : ndarray of shape (n_classes,), default=None + Display labels for plot. If None, display labels are set from 0 to + `n_classes - 1`. + + Attributes + ---------- + im_ : matplotlib AxesImage + Image representing the confusion matrix. + + text_ : ndarray of shape (n_classes, n_classes), dtype=matplotlib Text, \ + or None + Array of matplotlib axes. `None` if `include_values` is false. + + ax_ : matplotlib Axes + Axes with confusion matrix. + + figure_ : matplotlib Figure + Figure containing the confusion matrix. + + See Also + -------- + confusion_matrix : Compute Confusion Matrix to evaluate the accuracy of a + classification. + ConfusionMatrixDisplay.from_estimator : Plot the confusion matrix + given an estimator, the data, and the label. + ConfusionMatrixDisplay.from_predictions : Plot the confusion matrix + given the true and predicted labels. + + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.svm import SVC + >>> X, y = make_classification(random_state=0) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, + ... random_state=0) + >>> clf = SVC(random_state=0) + >>> clf.fit(X_train, y_train) + SVC(random_state=0) + >>> predictions = clf.predict(X_test) + >>> cm = confusion_matrix(y_test, predictions, labels=clf.classes_) + >>> disp = ConfusionMatrixDisplay(confusion_matrix=cm, + ... 
display_labels=clf.classes_) + >>> disp.plot() + <...> + >>> plt.show() + """ + + def __init__(self, confusion_matrix, *, display_labels=None): + self.confusion_matrix = confusion_matrix + self.display_labels = display_labels + + def plot( + self, + *, + include_values=True, + cmap="viridis", + xticks_rotation="horizontal", + values_format=None, + ax=None, + colorbar=True, + im_kw=None, + text_kw=None, + ): + """Plot visualization. + + Parameters + ---------- + include_values : bool, default=True + Includes values in confusion matrix. + + cmap : str or matplotlib Colormap, default='viridis' + Colormap recognized by matplotlib. + + xticks_rotation : {'vertical', 'horizontal'} or float, \ + default='horizontal' + Rotation of xtick labels. + + values_format : str, default=None + Format specification for values in confusion matrix. If `None`, + the format specification is 'd' or '.2g' whichever is shorter. + + ax : matplotlib axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + colorbar : bool, default=True + Whether or not to add a colorbar to the plot. + + im_kw : dict, default=None + Dict with keywords passed to `matplotlib.pyplot.imshow` call. + + text_kw : dict, default=None + Dict with keywords passed to `matplotlib.pyplot.text` call. + + .. versionadded:: 1.2 + + Returns + ------- + display : :class:`~sklearn.metrics.ConfusionMatrixDisplay` + Returns a :class:`~sklearn.metrics.ConfusionMatrixDisplay` instance + that contains all the information to plot the confusion matrix. + """ + check_matplotlib_support("ConfusionMatrixDisplay.plot") + import matplotlib.pyplot as plt + + if ax is None: + fig, ax = plt.subplots() + else: + fig = ax.figure + + cm = self.confusion_matrix + n_classes = cm.shape[0] + + default_im_kw = dict(interpolation="nearest", cmap=cmap) + im_kw = im_kw or {} + im_kw = _validate_style_kwargs(default_im_kw, im_kw) + text_kw = text_kw or {} + + self.im_ = ax.imshow(cm, **im_kw) + self.text_ = None + cmap_min, cmap_max = self.im_.cmap(0), self.im_.cmap(1.0) + + if include_values: + self.text_ = np.empty_like(cm, dtype=object) + + # print text with appropriate color depending on background + thresh = (cm.max() + cm.min()) / 2.0 + + for i, j in product(range(n_classes), range(n_classes)): + color = cmap_max if cm[i, j] < thresh else cmap_min + + if values_format is None: + text_cm = format(cm[i, j], ".2g") + if cm.dtype.kind != "f": + text_d = format(cm[i, j], "d") + if len(text_d) < len(text_cm): + text_cm = text_d + else: + text_cm = format(cm[i, j], values_format) + + default_text_kwargs = dict(ha="center", va="center", color=color) + text_kwargs = _validate_style_kwargs(default_text_kwargs, text_kw) + + self.text_[i, j] = ax.text(j, i, text_cm, **text_kwargs) + + if self.display_labels is None: + display_labels = np.arange(n_classes) + else: + display_labels = self.display_labels + if colorbar: + fig.colorbar(self.im_, ax=ax) + ax.set( + xticks=np.arange(n_classes), + yticks=np.arange(n_classes), + xticklabels=display_labels, + yticklabels=display_labels, + ylabel="True label", + xlabel="Predicted label", + ) + + ax.set_ylim((n_classes - 0.5, -0.5)) + plt.setp(ax.get_xticklabels(), rotation=xticks_rotation) + + self.figure_ = fig + self.ax_ = ax + return self + + @classmethod + def from_estimator( + cls, + estimator, + X, + y, + *, + labels=None, + sample_weight=None, + normalize=None, + display_labels=None, + include_values=True, + xticks_rotation="horizontal", + values_format=None, + cmap="viridis", + ax=None, + 
colorbar=True, + im_kw=None, + text_kw=None, + ): + """Plot Confusion Matrix given an estimator and some data. + + For general information regarding `scikit-learn` visualization tools, see + the :ref:`Visualization Guide `. + For guidance on interpreting these plots, refer to the + :ref:`Model Evaluation Guide `. + + .. versionadded:: 1.0 + + Parameters + ---------- + estimator : estimator instance + Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline` + in which the last estimator is a classifier. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input values. + + y : array-like of shape (n_samples,) + Target values. + + labels : array-like of shape (n_classes,), default=None + List of labels to index the confusion matrix. This may be used to + reorder or select a subset of labels. If `None` is given, those + that appear at least once in `y_true` or `y_pred` are used in + sorted order. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + normalize : {'true', 'pred', 'all'}, default=None + Either to normalize the counts display in the matrix: + + - if `'true'`, the confusion matrix is normalized over the true + conditions (e.g. rows); + - if `'pred'`, the confusion matrix is normalized over the + predicted conditions (e.g. columns); + - if `'all'`, the confusion matrix is normalized by the total + number of samples; + - if `None` (default), the confusion matrix will not be normalized. + + display_labels : array-like of shape (n_classes,), default=None + Target names used for plotting. By default, `labels` will be used + if it is defined, otherwise the unique labels of `y_true` and + `y_pred` will be used. + + include_values : bool, default=True + Includes values in confusion matrix. + + xticks_rotation : {'vertical', 'horizontal'} or float, \ + default='horizontal' + Rotation of xtick labels. + + values_format : str, default=None + Format specification for values in confusion matrix. If `None`, the + format specification is 'd' or '.2g' whichever is shorter. + + cmap : str or matplotlib Colormap, default='viridis' + Colormap recognized by matplotlib. + + ax : matplotlib Axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + colorbar : bool, default=True + Whether or not to add a colorbar to the plot. + + im_kw : dict, default=None + Dict with keywords passed to `matplotlib.pyplot.imshow` call. + + text_kw : dict, default=None + Dict with keywords passed to `matplotlib.pyplot.text` call. + + .. versionadded:: 1.2 + + Returns + ------- + display : :class:`~sklearn.metrics.ConfusionMatrixDisplay` + + See Also + -------- + ConfusionMatrixDisplay.from_predictions : Plot the confusion matrix + given the true and predicted labels. + + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.metrics import ConfusionMatrixDisplay + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.svm import SVC + >>> X, y = make_classification(random_state=0) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, random_state=0) + >>> clf = SVC(random_state=0) + >>> clf.fit(X_train, y_train) + SVC(random_state=0) + >>> ConfusionMatrixDisplay.from_estimator( + ... 
clf, X_test, y_test) + <...> + >>> plt.show() + + For a detailed example of using a confusion matrix to evaluate a + Support Vector Classifier, please see + :ref:`sphx_glr_auto_examples_model_selection_plot_confusion_matrix.py` + """ + method_name = f"{cls.__name__}.from_estimator" + check_matplotlib_support(method_name) + if not is_classifier(estimator): + raise ValueError(f"{method_name} only supports classifiers") + y_pred = estimator.predict(X) + + return cls.from_predictions( + y, + y_pred, + sample_weight=sample_weight, + labels=labels, + normalize=normalize, + display_labels=display_labels, + include_values=include_values, + cmap=cmap, + ax=ax, + xticks_rotation=xticks_rotation, + values_format=values_format, + colorbar=colorbar, + im_kw=im_kw, + text_kw=text_kw, + ) + + @classmethod + def from_predictions( + cls, + y_true, + y_pred, + *, + labels=None, + sample_weight=None, + normalize=None, + display_labels=None, + include_values=True, + xticks_rotation="horizontal", + values_format=None, + cmap="viridis", + ax=None, + colorbar=True, + im_kw=None, + text_kw=None, + ): + """Plot Confusion Matrix given true and predicted labels. + + For general information regarding `scikit-learn` visualization tools, see + the :ref:`Visualization Guide `. + For guidance on interpreting these plots, refer to the + :ref:`Model Evaluation Guide `. + + .. versionadded:: 1.0 + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + True labels. + + y_pred : array-like of shape (n_samples,) + The predicted labels given by the method `predict` of a + classifier. + + labels : array-like of shape (n_classes,), default=None + List of labels to index the confusion matrix. This may be used to + reorder or select a subset of labels. If `None` is given, those + that appear at least once in `y_true` or `y_pred` are used in + sorted order. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + normalize : {'true', 'pred', 'all'}, default=None + Either to normalize the counts display in the matrix: + + - if `'true'`, the confusion matrix is normalized over the true + conditions (e.g. rows); + - if `'pred'`, the confusion matrix is normalized over the + predicted conditions (e.g. columns); + - if `'all'`, the confusion matrix is normalized by the total + number of samples; + - if `None` (default), the confusion matrix will not be normalized. + + display_labels : array-like of shape (n_classes,), default=None + Target names used for plotting. By default, `labels` will be used + if it is defined, otherwise the unique labels of `y_true` and + `y_pred` will be used. + + include_values : bool, default=True + Includes values in confusion matrix. + + xticks_rotation : {'vertical', 'horizontal'} or float, \ + default='horizontal' + Rotation of xtick labels. + + values_format : str, default=None + Format specification for values in confusion matrix. If `None`, the + format specification is 'd' or '.2g' whichever is shorter. + + cmap : str or matplotlib Colormap, default='viridis' + Colormap recognized by matplotlib. + + ax : matplotlib Axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + colorbar : bool, default=True + Whether or not to add a colorbar to the plot. + + im_kw : dict, default=None + Dict with keywords passed to `matplotlib.pyplot.imshow` call. + + text_kw : dict, default=None + Dict with keywords passed to `matplotlib.pyplot.text` call. + + ..
versionadded:: 1.2 + + Returns + ------- + display : :class:`~sklearn.metrics.ConfusionMatrixDisplay` + + See Also + -------- + ConfusionMatrixDisplay.from_estimator : Plot the confusion matrix + given an estimator, the data, and the label. + + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.metrics import ConfusionMatrixDisplay + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.svm import SVC + >>> X, y = make_classification(random_state=0) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, random_state=0) + >>> clf = SVC(random_state=0) + >>> clf.fit(X_train, y_train) + SVC(random_state=0) + >>> y_pred = clf.predict(X_test) + >>> ConfusionMatrixDisplay.from_predictions( + ... y_test, y_pred) + <...> + >>> plt.show() + """ + check_matplotlib_support(f"{cls.__name__}.from_predictions") + + if display_labels is None: + if labels is None: + display_labels = unique_labels(y_true, y_pred) + else: + display_labels = labels + + cm = confusion_matrix( + y_true, + y_pred, + sample_weight=sample_weight, + labels=labels, + normalize=normalize, + ) + + disp = cls(confusion_matrix=cm, display_labels=display_labels) + + return disp.plot( + include_values=include_values, + cmap=cmap, + ax=ax, + xticks_rotation=xticks_rotation, + values_format=values_format, + colorbar=colorbar, + im_kw=im_kw, + text_kw=text_kw, + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/det_curve.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/det_curve.py new file mode 100644 index 0000000000000000000000000000000000000000..590b908d917232d9e43b0f8492710ee978ce989c --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/det_curve.py @@ -0,0 +1,371 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numpy as np +import scipy as sp + +from ...utils._plotting import _BinaryClassifierCurveDisplayMixin +from .._ranking import det_curve + + +class DetCurveDisplay(_BinaryClassifierCurveDisplayMixin): + """Detection Error Tradeoff (DET) curve visualization. + + It is recommended to use :func:`~sklearn.metrics.DetCurveDisplay.from_estimator` + or :func:`~sklearn.metrics.DetCurveDisplay.from_predictions` to create a + visualizer. All parameters are stored as attributes. + + For general information regarding `scikit-learn` visualization tools, see + the :ref:`Visualization Guide `. + For guidance on interpreting these plots, refer to the + :ref:`Model Evaluation Guide `. + + .. versionadded:: 0.24 + + Parameters + ---------- + fpr : ndarray + False positive rate. + + fnr : ndarray + False negative rate. + + estimator_name : str, default=None + Name of estimator. If None, the estimator name is not shown. + + pos_label : int, float, bool or str, default=None + The label of the positive class. + + Attributes + ---------- + line_ : matplotlib Artist + DET Curve. + + ax_ : matplotlib Axes + Axes with DET Curve. + + figure_ : matplotlib Figure + Figure containing the curve. + + See Also + -------- + det_curve : Compute error rates for different probability thresholds. + DetCurveDisplay.from_estimator : Plot DET curve given an estimator and + some data. + DetCurveDisplay.from_predictions : Plot DET curve given the true and + predicted labels. 
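+ + Notes + ----- + Several DET curves can share one set of axes by passing the same ``ax`` + to successive calls; a minimal sketch, assuming ``y_test`` and ``y_pred`` + as in the example below plus a hypothetical second score array + ``other_scores``: + + >>> fig, ax = plt.subplots() + >>> DetCurveDisplay.from_predictions(y_test, y_pred, ax=ax, name="SVC") + <...> + >>> DetCurveDisplay.from_predictions( + ... y_test, other_scores, ax=ax, name="other model") + <...>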
+ + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.metrics import det_curve, DetCurveDisplay + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.svm import SVC + >>> X, y = make_classification(n_samples=1000, random_state=0) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, test_size=0.4, random_state=0) + >>> clf = SVC(random_state=0).fit(X_train, y_train) + >>> y_pred = clf.decision_function(X_test) + >>> fpr, fnr, _ = det_curve(y_test, y_pred) + >>> display = DetCurveDisplay( + ... fpr=fpr, fnr=fnr, estimator_name="SVC" + ... ) + >>> display.plot() + <...> + >>> plt.show() + """ + + def __init__(self, *, fpr, fnr, estimator_name=None, pos_label=None): + self.fpr = fpr + self.fnr = fnr + self.estimator_name = estimator_name + self.pos_label = pos_label + + @classmethod + def from_estimator( + cls, + estimator, + X, + y, + *, + sample_weight=None, + drop_intermediate=True, + response_method="auto", + pos_label=None, + name=None, + ax=None, + **kwargs, + ): + """Plot DET curve given an estimator and data. + + For general information regarding `scikit-learn` visualization tools, see + the :ref:`Visualization Guide `. + For guidance on interpreting these plots, refer to the + :ref:`Model Evaluation Guide `. + + .. versionadded:: 1.0 + + Parameters + ---------- + estimator : estimator instance + Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline` + in which the last estimator is a classifier. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input values. + + y : array-like of shape (n_samples,) + Target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + drop_intermediate : bool, default=True + Whether to drop thresholds where true positives (tp) do not change + from the previous or subsequent threshold. All points with the same + tp value have the same `fnr` and thus same y coordinate. + + .. versionadded:: 1.7 + + response_method : {'predict_proba', 'decision_function', 'auto'} \ + default='auto' + Specifies whether to use :term:`predict_proba` or + :term:`decision_function` as the predicted target response. If set + to 'auto', :term:`predict_proba` is tried first and if it does not + exist :term:`decision_function` is tried next. + + pos_label : int, float, bool or str, default=None + The label of the positive class. When `pos_label=None`, if `y_true` + is in {-1, 1} or {0, 1}, `pos_label` is set to 1, otherwise an + error will be raised. + + name : str, default=None + Name of DET curve for labeling. If `None`, use the name of the + estimator. + + ax : matplotlib axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + **kwargs : dict + Additional keywords arguments passed to matplotlib `plot` function. + + Returns + ------- + display : :class:`~sklearn.metrics.DetCurveDisplay` + Object that stores computed values. + + See Also + -------- + det_curve : Compute error rates for different probability thresholds. + DetCurveDisplay.from_predictions : Plot DET curve given the true and + predicted labels. 
+ + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.metrics import DetCurveDisplay + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.svm import SVC + >>> X, y = make_classification(n_samples=1000, random_state=0) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, test_size=0.4, random_state=0) + >>> clf = SVC(random_state=0).fit(X_train, y_train) + >>> DetCurveDisplay.from_estimator( + ... clf, X_test, y_test) + <...> + >>> plt.show() + """ + y_pred, pos_label, name = cls._validate_and_get_response_values( + estimator, + X, + y, + response_method=response_method, + pos_label=pos_label, + name=name, + ) + + return cls.from_predictions( + y_true=y, + y_pred=y_pred, + sample_weight=sample_weight, + drop_intermediate=drop_intermediate, + name=name, + ax=ax, + pos_label=pos_label, + **kwargs, + ) + + @classmethod + def from_predictions( + cls, + y_true, + y_pred, + *, + sample_weight=None, + drop_intermediate=True, + pos_label=None, + name=None, + ax=None, + **kwargs, + ): + """Plot the DET curve given the true and predicted labels. + + For general information regarding `scikit-learn` visualization tools, see + the :ref:`Visualization Guide `. + For guidance on interpreting these plots, refer to the + :ref:`Model Evaluation Guide `. + + .. versionadded:: 1.0 + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + True labels. + + y_pred : array-like of shape (n_samples,) + Target scores, can either be probability estimates of the positive + class, confidence values, or non-thresholded measure of decisions + (as returned by `decision_function` on some classifiers). + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + drop_intermediate : bool, default=True + Whether to drop thresholds where true positives (tp) do not change + from the previous or subsequent threshold. All points with the same + tp value have the same `fnr` and thus same y coordinate. + + .. versionadded:: 1.7 + + pos_label : int, float, bool or str, default=None + The label of the positive class. When `pos_label=None`, if `y_true` + is in {-1, 1} or {0, 1}, `pos_label` is set to 1, otherwise an + error will be raised. + + name : str, default=None + Name of DET curve for labeling. If `None`, name will be set to + `"Classifier"`. + + ax : matplotlib axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + **kwargs : dict + Additional keywords arguments passed to matplotlib `plot` function. + + Returns + ------- + display : :class:`~sklearn.metrics.DetCurveDisplay` + Object that stores computed values. + + See Also + -------- + det_curve : Compute error rates for different probability thresholds. + DetCurveDisplay.from_estimator : Plot DET curve given an estimator and + some data. + + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.metrics import DetCurveDisplay + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.svm import SVC + >>> X, y = make_classification(n_samples=1000, random_state=0) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, test_size=0.4, random_state=0) + >>> clf = SVC(random_state=0).fit(X_train, y_train) + >>> y_pred = clf.decision_function(X_test) + >>> DetCurveDisplay.from_predictions( + ... 
y_test, y_pred) + <...> + >>> plt.show() + """ + pos_label_validated, name = cls._validate_from_predictions_params( + y_true, y_pred, sample_weight=sample_weight, pos_label=pos_label, name=name + ) + + fpr, fnr, _ = det_curve( + y_true, + y_pred, + pos_label=pos_label, + sample_weight=sample_weight, + drop_intermediate=drop_intermediate, + ) + + viz = cls( + fpr=fpr, + fnr=fnr, + estimator_name=name, + pos_label=pos_label_validated, + ) + + return viz.plot(ax=ax, name=name, **kwargs) + + def plot(self, ax=None, *, name=None, **kwargs): + """Plot visualization. + + Parameters + ---------- + ax : matplotlib axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + name : str, default=None + Name of DET curve for labeling. If `None`, use `estimator_name` if + it is not `None`, otherwise no labeling is shown. + + **kwargs : dict + Additional keywords arguments passed to matplotlib `plot` function. + + Returns + ------- + display : :class:`~sklearn.metrics.DetCurveDisplay` + Object that stores computed values. + """ + self.ax_, self.figure_, name = self._validate_plot_params(ax=ax, name=name) + + line_kwargs = {} if name is None else {"label": name} + line_kwargs.update(**kwargs) + + # We have the following bounds: + # sp.stats.norm.ppf(0.0) = -np.inf + # sp.stats.norm.ppf(1.0) = np.inf + # We therefore clip to eps and 1 - eps to not provide infinity to matplotlib. + eps = np.finfo(self.fpr.dtype).eps + self.fpr = self.fpr.clip(eps, 1 - eps) + self.fnr = self.fnr.clip(eps, 1 - eps) + + (self.line_,) = self.ax_.plot( + sp.stats.norm.ppf(self.fpr), + sp.stats.norm.ppf(self.fnr), + **line_kwargs, + ) + info_pos_label = ( + f" (Positive label: {self.pos_label})" if self.pos_label is not None else "" + ) + + xlabel = "False Positive Rate" + info_pos_label + ylabel = "False Negative Rate" + info_pos_label + self.ax_.set(xlabel=xlabel, ylabel=ylabel) + + if "label" in line_kwargs: + self.ax_.legend(loc="lower right") + + ticks = [0.001, 0.01, 0.05, 0.20, 0.5, 0.80, 0.95, 0.99, 0.999] + tick_locations = sp.stats.norm.ppf(ticks) + tick_labels = [ + "{:.0%}".format(s) if (100 * s).is_integer() else "{:.1%}".format(s) + for s in ticks + ] + self.ax_.set_xticks(tick_locations) + self.ax_.set_xticklabels(tick_labels) + self.ax_.set_xlim(-3, 3) + self.ax_.set_yticks(tick_locations) + self.ax_.set_yticklabels(tick_labels) + self.ax_.set_ylim(-3, 3) + + return self diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/precision_recall_curve.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/precision_recall_curve.py new file mode 100644 index 0000000000000000000000000000000000000000..30dd1fba08761f12d74d75743b4985aca3442d59 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/precision_recall_curve.py @@ -0,0 +1,555 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from collections import Counter + +from ...utils._plotting import ( + _BinaryClassifierCurveDisplayMixin, + _despine, + _validate_style_kwargs, +) +from .._ranking import average_precision_score, precision_recall_curve + + +class PrecisionRecallDisplay(_BinaryClassifierCurveDisplayMixin): + """Precision Recall visualization. + + It is recommended to use + :func:`~sklearn.metrics.PrecisionRecallDisplay.from_estimator` or + :func:`~sklearn.metrics.PrecisionRecallDisplay.from_predictions` to create + a :class:`~sklearn.metrics.PrecisionRecallDisplay`. All parameters are + stored as attributes. 
+ + For general information regarding `scikit-learn` visualization tools, see + the :ref:`Visualization Guide `. + For guidance on interpreting these plots, refer to the :ref:`Model + Evaluation Guide `. + + Parameters + ---------- + precision : ndarray + Precision values. + + recall : ndarray + Recall values. + + average_precision : float, default=None + Average precision. If None, the average precision is not shown. + + estimator_name : str, default=None + Name of estimator. If None, then the estimator name is not shown. + + pos_label : int, float, bool or str, default=None + The class considered as the positive class. If None, the class will not + be shown in the legend. + + .. versionadded:: 0.24 + + prevalence_pos_label : float, default=None + The prevalence of the positive label. It is used for plotting the + chance level line. If None, the chance level line will not be plotted + even if `plot_chance_level` is set to True when plotting. + + .. versionadded:: 1.3 + + Attributes + ---------- + line_ : matplotlib Artist + Precision recall curve. + + chance_level_ : matplotlib Artist or None + The chance level line. It is `None` if the chance level is not plotted. + + .. versionadded:: 1.3 + + ax_ : matplotlib Axes + Axes with precision recall curve. + + figure_ : matplotlib Figure + Figure containing the curve. + + See Also + -------- + precision_recall_curve : Compute precision-recall pairs for different + probability thresholds. + PrecisionRecallDisplay.from_estimator : Plot Precision Recall Curve given + a binary classifier. + PrecisionRecallDisplay.from_predictions : Plot Precision Recall Curve + using predictions from a binary classifier. + + Notes + ----- + The average precision (cf. :func:`~sklearn.metrics.average_precision_score`) in + scikit-learn is computed without any interpolation. To be consistent with + this metric, the precision-recall curve is plotted without any + interpolation as well (step-wise style). + + You can change this style by passing the keyword argument + `drawstyle="default"` in :meth:`plot`, :meth:`from_estimator`, or + :meth:`from_predictions`. However, the curve will not be strictly + consistent with the reported average precision. + + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.metrics import (precision_recall_curve, + ... PrecisionRecallDisplay) + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.svm import SVC + >>> X, y = make_classification(random_state=0) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, + ... random_state=0) + >>> clf = SVC(random_state=0) + >>> clf.fit(X_train, y_train) + SVC(random_state=0) + >>> predictions = clf.predict(X_test) + >>> precision, recall, _ = precision_recall_curve(y_test, predictions) + >>> disp = PrecisionRecallDisplay(precision=precision, recall=recall) + >>> disp.plot() + <...> + >>> plt.show() + """ + + def __init__( + self, + precision, + recall, + *, + average_precision=None, + estimator_name=None, + pos_label=None, + prevalence_pos_label=None, + ): + self.estimator_name = estimator_name + self.precision = precision + self.recall = recall + self.average_precision = average_precision + self.pos_label = pos_label + self.prevalence_pos_label = prevalence_pos_label + + def plot( + self, + ax=None, + *, + name=None, + plot_chance_level=False, + chance_level_kw=None, + despine=False, + **kwargs, + ): + """Plot visualization. 
+ + Extra keyword arguments will be passed to matplotlib's `plot`. + + Parameters + ---------- + ax : Matplotlib Axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + name : str, default=None + Name of precision recall curve for labeling. If `None`, use + `estimator_name` if not `None`, otherwise no labeling is shown. + + plot_chance_level : bool, default=False + Whether to plot the chance level. The chance level is the prevalence + of the positive label computed from the data passed during + :meth:`from_estimator` or :meth:`from_predictions` call. + + .. versionadded:: 1.3 + + chance_level_kw : dict, default=None + Keyword arguments to be passed to matplotlib's `plot` for rendering + the chance level line. + + .. versionadded:: 1.3 + + despine : bool, default=False + Whether to remove the top and right spines from the plot. + + .. versionadded:: 1.6 + + **kwargs : dict + Keyword arguments to be passed to matplotlib's `plot`. + + Returns + ------- + display : :class:`~sklearn.metrics.PrecisionRecallDisplay` + Object that stores computed values. + + Notes + ----- + The average precision (cf. :func:`~sklearn.metrics.average_precision_score`) + in scikit-learn is computed without any interpolation. To be consistent + with this metric, the precision-recall curve is plotted without any + interpolation as well (step-wise style). + + You can change this style by passing the keyword argument + `drawstyle="default"`. However, the curve will not be strictly + consistent with the reported average precision. + """ + self.ax_, self.figure_, name = self._validate_plot_params(ax=ax, name=name) + + default_line_kwargs = {"drawstyle": "steps-post"} + if self.average_precision is not None and name is not None: + default_line_kwargs["label"] = ( + f"{name} (AP = {self.average_precision:0.2f})" + ) + elif self.average_precision is not None: + default_line_kwargs["label"] = f"AP = {self.average_precision:0.2f}" + elif name is not None: + default_line_kwargs["label"] = name + + line_kwargs = _validate_style_kwargs(default_line_kwargs, kwargs) + + (self.line_,) = self.ax_.plot(self.recall, self.precision, **line_kwargs) + + info_pos_label = ( + f" (Positive label: {self.pos_label})" if self.pos_label is not None else "" + ) + + xlabel = "Recall" + info_pos_label + ylabel = "Precision" + info_pos_label + self.ax_.set( + xlabel=xlabel, + xlim=(-0.01, 1.01), + ylabel=ylabel, + ylim=(-0.01, 1.01), + aspect="equal", + ) + + if plot_chance_level: + if self.prevalence_pos_label is None: + raise ValueError( + "You must provide prevalence_pos_label when constructing the " + "PrecisionRecallDisplay object in order to plot the chance " + "level line. 
Alternatively, you may use " + "PrecisionRecallDisplay.from_estimator or " + "PrecisionRecallDisplay.from_predictions " + "to automatically set prevalence_pos_label" + ) + + default_chance_level_line_kw = { + "label": f"Chance level (AP = {self.prevalence_pos_label:0.2f})", + "color": "k", + "linestyle": "--", + } + + if chance_level_kw is None: + chance_level_kw = {} + + chance_level_line_kw = _validate_style_kwargs( + default_chance_level_line_kw, chance_level_kw + ) + + (self.chance_level_,) = self.ax_.plot( + (0, 1), + (self.prevalence_pos_label, self.prevalence_pos_label), + **chance_level_line_kw, + ) + else: + self.chance_level_ = None + + if despine: + _despine(self.ax_) + + if "label" in line_kwargs or plot_chance_level: + self.ax_.legend(loc="lower left") + + return self + + @classmethod + def from_estimator( + cls, + estimator, + X, + y, + *, + sample_weight=None, + drop_intermediate=False, + response_method="auto", + pos_label=None, + name=None, + ax=None, + plot_chance_level=False, + chance_level_kw=None, + despine=False, + **kwargs, + ): + """Plot precision-recall curve given an estimator and some data. + + For general information regarding `scikit-learn` visualization tools, see + the :ref:`Visualization Guide `. + For guidance on interpreting these plots, refer to the :ref:`Model + Evaluation Guide `. + + Parameters + ---------- + estimator : estimator instance + Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline` + in which the last estimator is a classifier. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input values. + + y : array-like of shape (n_samples,) + Target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + drop_intermediate : bool, default=False + Whether to drop some suboptimal thresholds which would not appear + on a plotted precision-recall curve. This is useful in order to + create lighter precision-recall curves. + + .. versionadded:: 1.3 + + response_method : {'predict_proba', 'decision_function', 'auto'}, \ + default='auto' + Specifies whether to use :term:`predict_proba` or + :term:`decision_function` as the target response. If set to 'auto', + :term:`predict_proba` is tried first and if it does not exist + :term:`decision_function` is tried next. + + pos_label : int, float, bool or str, default=None + The class considered as the positive class when computing the + precision and recall metrics. By default, `estimators.classes_[1]` + is considered as the positive class. + + name : str, default=None + Name for labeling curve. If `None`, no name is used. + + ax : matplotlib axes, default=None + Axes object to plot on. If `None`, a new figure and axes is created. + + plot_chance_level : bool, default=False + Whether to plot the chance level. The chance level is the prevalence + of the positive label computed from the data passed during + :meth:`from_estimator` or :meth:`from_predictions` call. + + .. versionadded:: 1.3 + + chance_level_kw : dict, default=None + Keyword arguments to be passed to matplotlib's `plot` for rendering + the chance level line. + + .. versionadded:: 1.3 + + despine : bool, default=False + Whether to remove the top and right spines from the plot. + + .. versionadded:: 1.6 + + **kwargs : dict + Keyword arguments to be passed to matplotlib's `plot`. 
+ + Returns + ------- + display : :class:`~sklearn.metrics.PrecisionRecallDisplay` + + See Also + -------- + PrecisionRecallDisplay.from_predictions : Plot precision-recall curve + using estimated probabilities or output of decision function. + + Notes + ----- + The average precision (cf. :func:`~sklearn.metrics.average_precision_score`) + in scikit-learn is computed without any interpolation. To be consistent + with this metric, the precision-recall curve is plotted without any + interpolation as well (step-wise style). + + You can change this style by passing the keyword argument + `drawstyle="default"`. However, the curve will not be strictly + consistent with the reported average precision. + + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.metrics import PrecisionRecallDisplay + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.linear_model import LogisticRegression + >>> X, y = make_classification(random_state=0) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, random_state=0) + >>> clf = LogisticRegression() + >>> clf.fit(X_train, y_train) + LogisticRegression() + >>> PrecisionRecallDisplay.from_estimator( + ... clf, X_test, y_test) + <...> + >>> plt.show() + """ + y_pred, pos_label, name = cls._validate_and_get_response_values( + estimator, + X, + y, + response_method=response_method, + pos_label=pos_label, + name=name, + ) + + return cls.from_predictions( + y, + y_pred, + sample_weight=sample_weight, + name=name, + pos_label=pos_label, + drop_intermediate=drop_intermediate, + ax=ax, + plot_chance_level=plot_chance_level, + chance_level_kw=chance_level_kw, + despine=despine, + **kwargs, + ) + + @classmethod + def from_predictions( + cls, + y_true, + y_pred, + *, + sample_weight=None, + drop_intermediate=False, + pos_label=None, + name=None, + ax=None, + plot_chance_level=False, + chance_level_kw=None, + despine=False, + **kwargs, + ): + """Plot precision-recall curve given binary class predictions. + + For general information regarding `scikit-learn` visualization tools, see + the :ref:`Visualization Guide `. + For guidance on interpreting these plots, refer to the :ref:`Model + Evaluation Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + True binary labels. + + y_pred : array-like of shape (n_samples,) + Estimated probabilities or output of decision function. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + drop_intermediate : bool, default=False + Whether to drop some suboptimal thresholds which would not appear + on a plotted precision-recall curve. This is useful in order to + create lighter precision-recall curves. + + .. versionadded:: 1.3 + + pos_label : int, float, bool or str, default=None + The class considered as the positive class when computing the + precision and recall metrics. + + name : str, default=None + Name for labeling curve. If `None`, name will be set to + `"Classifier"`. + + ax : matplotlib axes, default=None + Axes object to plot on. If `None`, a new figure and axes is created. + + plot_chance_level : bool, default=False + Whether to plot the chance level. The chance level is the prevalence + of the positive label computed from the data passed during + :meth:`from_estimator` or :meth:`from_predictions` call. + + .. 
versionadded:: 1.3 + + chance_level_kw : dict, default=None + Keyword arguments to be passed to matplotlib's `plot` for rendering + the chance level line. + + .. versionadded:: 1.3 + + despine : bool, default=False + Whether to remove the top and right spines from the plot. + + .. versionadded:: 1.6 + + **kwargs : dict + Keyword arguments to be passed to matplotlib's `plot`. + + Returns + ------- + display : :class:`~sklearn.metrics.PrecisionRecallDisplay` + + See Also + -------- + PrecisionRecallDisplay.from_estimator : Plot precision-recall curve + using an estimator. + + Notes + ----- + The average precision (cf. :func:`~sklearn.metrics.average_precision_score`) + in scikit-learn is computed without any interpolation. To be consistent + with this metric, the precision-recall curve is plotted without any + interpolation as well (step-wise style). + + You can change this style by passing the keyword argument + `drawstyle="default"`. However, the curve will not be strictly + consistent with the reported average precision. + + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.metrics import PrecisionRecallDisplay + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.linear_model import LogisticRegression + >>> X, y = make_classification(random_state=0) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, random_state=0) + >>> clf = LogisticRegression() + >>> clf.fit(X_train, y_train) + LogisticRegression() + >>> y_pred = clf.predict_proba(X_test)[:, 1] + >>> PrecisionRecallDisplay.from_predictions( + ... y_test, y_pred) + <...> + >>> plt.show() + """ + pos_label, name = cls._validate_from_predictions_params( + y_true, y_pred, sample_weight=sample_weight, pos_label=pos_label, name=name + ) + + precision, recall, _ = precision_recall_curve( + y_true, + y_pred, + pos_label=pos_label, + sample_weight=sample_weight, + drop_intermediate=drop_intermediate, + ) + average_precision = average_precision_score( + y_true, y_pred, pos_label=pos_label, sample_weight=sample_weight + ) + + class_count = Counter(y_true) + prevalence_pos_label = class_count[pos_label] / sum(class_count.values()) + + viz = cls( + precision=precision, + recall=recall, + average_precision=average_precision, + estimator_name=name, + pos_label=pos_label, + prevalence_pos_label=prevalence_pos_label, + ) + + return viz.plot( + ax=ax, + name=name, + plot_chance_level=plot_chance_level, + chance_level_kw=chance_level_kw, + despine=despine, + **kwargs, + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/regression.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/regression.py new file mode 100644 index 0000000000000000000000000000000000000000..1b56859cabefd181eafec383a1aa48c2a28807a4 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/regression.py @@ -0,0 +1,413 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numbers + +import numpy as np + +from ...utils import _safe_indexing, check_random_state +from ...utils._optional_dependencies import check_matplotlib_support +from ...utils._plotting import _validate_style_kwargs + + +class PredictionErrorDisplay: + """Visualization of the prediction error of a regression model. + + This tool can display "residuals vs predicted" or "actual vs predicted" + using scatter plots to qualitatively assess the behavior of a regressor, + preferably on held-out data points. 
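+ + For instance, assuming one-dimensional arrays ``y`` and ``y_pred`` of the + same length (as in the example below), the "actual vs. predicted" view can + be drawn with: + + >>> display = PredictionErrorDisplay(y_true=y, y_pred=y_pred) + >>> display.plot(kind="actual_vs_predicted") + <...>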
+ + See the details in the docstrings of + :func:`~sklearn.metrics.PredictionErrorDisplay.from_estimator` or + :func:`~sklearn.metrics.PredictionErrorDisplay.from_predictions` to + create a visualizer. All parameters are stored as attributes. + + For general information regarding `scikit-learn` visualization tools, read + more in the :ref:`Visualization Guide `. + For details regarding interpreting these plots, refer to the + :ref:`Model Evaluation Guide `. + + .. versionadded:: 1.2 + + Parameters + ---------- + y_true : ndarray of shape (n_samples,) + True values. + + y_pred : ndarray of shape (n_samples,) + Prediction values. + + Attributes + ---------- + line_ : matplotlib Artist + Optimal line representing `y_true == y_pred`. Therefore, it is a + diagonal line for `kind="predictions"` and a horizontal line for + `kind="residuals"`. + + errors_lines_ : matplotlib Artist or None + Residual lines. If `with_errors=False`, then it is set to `None`. + + scatter_ : matplotlib Artist + Scatter data points. + + ax_ : matplotlib Axes + Axes with the different matplotlib axis. + + figure_ : matplotlib Figure + Figure containing the scatter and lines. + + See Also + -------- + PredictionErrorDisplay.from_estimator : Prediction error visualization + given an estimator and some data. + PredictionErrorDisplay.from_predictions : Prediction error visualization + given the true and predicted targets. + + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import load_diabetes + >>> from sklearn.linear_model import Ridge + >>> from sklearn.metrics import PredictionErrorDisplay + >>> X, y = load_diabetes(return_X_y=True) + >>> ridge = Ridge().fit(X, y) + >>> y_pred = ridge.predict(X) + >>> display = PredictionErrorDisplay(y_true=y, y_pred=y_pred) + >>> display.plot() + <...> + >>> plt.show() + """ + + def __init__(self, *, y_true, y_pred): + self.y_true = y_true + self.y_pred = y_pred + + def plot( + self, + ax=None, + *, + kind="residual_vs_predicted", + scatter_kwargs=None, + line_kwargs=None, + ): + """Plot visualization. + + Extra keyword arguments will be passed to matplotlib's ``plot``. + + Parameters + ---------- + ax : matplotlib axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + kind : {"actual_vs_predicted", "residual_vs_predicted"}, \ + default="residual_vs_predicted" + The type of plot to draw: + + - "actual_vs_predicted" draws the observed values (y-axis) vs. + the predicted values (x-axis). + - "residual_vs_predicted" draws the residuals, i.e. difference + between observed and predicted values, (y-axis) vs. the predicted + values (x-axis). + + scatter_kwargs : dict, default=None + Dictionary with keywords passed to the `matplotlib.pyplot.scatter` + call. + + line_kwargs : dict, default=None + Dictionary with keyword passed to the `matplotlib.pyplot.plot` + call to draw the optimal line. + + Returns + ------- + display : :class:`~sklearn.metrics.PredictionErrorDisplay` + + Object that stores computed values. + """ + check_matplotlib_support(f"{self.__class__.__name__}.plot") + + expected_kind = ("actual_vs_predicted", "residual_vs_predicted") + if kind not in expected_kind: + raise ValueError( + f"`kind` must be one of {', '.join(expected_kind)}. " + f"Got {kind!r} instead." 
+ ) + + import matplotlib.pyplot as plt + + if scatter_kwargs is None: + scatter_kwargs = {} + if line_kwargs is None: + line_kwargs = {} + + default_scatter_kwargs = {"color": "tab:blue", "alpha": 0.8} + default_line_kwargs = {"color": "black", "alpha": 0.7, "linestyle": "--"} + + scatter_kwargs = _validate_style_kwargs(default_scatter_kwargs, scatter_kwargs) + line_kwargs = _validate_style_kwargs(default_line_kwargs, line_kwargs) + + scatter_kwargs = {**default_scatter_kwargs, **scatter_kwargs} + line_kwargs = {**default_line_kwargs, **line_kwargs} + + if ax is None: + _, ax = plt.subplots() + + if kind == "actual_vs_predicted": + max_value = max(np.max(self.y_true), np.max(self.y_pred)) + min_value = min(np.min(self.y_true), np.min(self.y_pred)) + self.line_ = ax.plot( + [min_value, max_value], [min_value, max_value], **line_kwargs + )[0] + + x_data, y_data = self.y_pred, self.y_true + xlabel, ylabel = "Predicted values", "Actual values" + + self.scatter_ = ax.scatter(x_data, y_data, **scatter_kwargs) + + # force to have a squared axis + ax.set_aspect("equal", adjustable="datalim") + ax.set_xticks(np.linspace(min_value, max_value, num=5)) + ax.set_yticks(np.linspace(min_value, max_value, num=5)) + else: # kind == "residual_vs_predicted" + self.line_ = ax.plot( + [np.min(self.y_pred), np.max(self.y_pred)], + [0, 0], + **line_kwargs, + )[0] + self.scatter_ = ax.scatter( + self.y_pred, self.y_true - self.y_pred, **scatter_kwargs + ) + xlabel, ylabel = "Predicted values", "Residuals (actual - predicted)" + + ax.set(xlabel=xlabel, ylabel=ylabel) + + self.ax_ = ax + self.figure_ = ax.figure + + return self + + @classmethod + def from_estimator( + cls, + estimator, + X, + y, + *, + kind="residual_vs_predicted", + subsample=1_000, + random_state=None, + ax=None, + scatter_kwargs=None, + line_kwargs=None, + ): + """Plot the prediction error given a regressor and some data. + + For general information regarding `scikit-learn` visualization tools, + read more in the :ref:`Visualization Guide `. + For details regarding interpreting these plots, refer to the + :ref:`Model Evaluation Guide `. + + .. versionadded:: 1.2 + + Parameters + ---------- + estimator : estimator instance + Fitted regressor or a fitted :class:`~sklearn.pipeline.Pipeline` + in which the last estimator is a regressor. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input values. + + y : array-like of shape (n_samples,) + Target values. + + kind : {"actual_vs_predicted", "residual_vs_predicted"}, \ + default="residual_vs_predicted" + The type of plot to draw: + + - "actual_vs_predicted" draws the observed values (y-axis) vs. + the predicted values (x-axis). + - "residual_vs_predicted" draws the residuals, i.e. difference + between observed and predicted values, (y-axis) vs. the predicted + values (x-axis). + + subsample : float, int or None, default=1_000 + Sampling the samples to be shown on the scatter plot. If `float`, + it should be between 0 and 1 and represents the proportion of the + original dataset. If `int`, it represents the number of samples + display on the scatter plot. If `None`, no subsampling will be + applied. by default, 1000 samples or less will be displayed. + + random_state : int or RandomState, default=None + Controls the randomness when `subsample` is not `None`. + See :term:`Glossary ` for details. + + ax : matplotlib axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. 
+ + scatter_kwargs : dict, default=None + Dictionary with keywords passed to the `matplotlib.pyplot.scatter` + call. + + line_kwargs : dict, default=None + Dictionary with keyword passed to the `matplotlib.pyplot.plot` + call to draw the optimal line. + + Returns + ------- + display : :class:`~sklearn.metrics.PredictionErrorDisplay` + Object that stores the computed values. + + See Also + -------- + PredictionErrorDisplay : Prediction error visualization for regression. + PredictionErrorDisplay.from_predictions : Prediction error visualization + given the true and predicted targets. + + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import load_diabetes + >>> from sklearn.linear_model import Ridge + >>> from sklearn.metrics import PredictionErrorDisplay + >>> X, y = load_diabetes(return_X_y=True) + >>> ridge = Ridge().fit(X, y) + >>> disp = PredictionErrorDisplay.from_estimator(ridge, X, y) + >>> plt.show() + """ + check_matplotlib_support(f"{cls.__name__}.from_estimator") + + y_pred = estimator.predict(X) + + return cls.from_predictions( + y_true=y, + y_pred=y_pred, + kind=kind, + subsample=subsample, + random_state=random_state, + ax=ax, + scatter_kwargs=scatter_kwargs, + line_kwargs=line_kwargs, + ) + + @classmethod + def from_predictions( + cls, + y_true, + y_pred, + *, + kind="residual_vs_predicted", + subsample=1_000, + random_state=None, + ax=None, + scatter_kwargs=None, + line_kwargs=None, + ): + """Plot the prediction error given the true and predicted targets. + + For general information regarding `scikit-learn` visualization tools, + read more in the :ref:`Visualization Guide `. + For details regarding interpreting these plots, refer to the + :ref:`Model Evaluation Guide `. + + .. versionadded:: 1.2 + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + True target values. + + y_pred : array-like of shape (n_samples,) + Predicted target values. + + kind : {"actual_vs_predicted", "residual_vs_predicted"}, \ + default="residual_vs_predicted" + The type of plot to draw: + + - "actual_vs_predicted" draws the observed values (y-axis) vs. + the predicted values (x-axis). + - "residual_vs_predicted" draws the residuals, i.e. difference + between observed and predicted values, (y-axis) vs. the predicted + values (x-axis). + + subsample : float, int or None, default=1_000 + Sampling the samples to be shown on the scatter plot. If `float`, + it should be between 0 and 1 and represents the proportion of the + original dataset. If `int`, it represents the number of samples + display on the scatter plot. If `None`, no subsampling will be + applied. by default, 1000 samples or less will be displayed. + + random_state : int or RandomState, default=None + Controls the randomness when `subsample` is not `None`. + See :term:`Glossary ` for details. + + ax : matplotlib axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + scatter_kwargs : dict, default=None + Dictionary with keywords passed to the `matplotlib.pyplot.scatter` + call. + + line_kwargs : dict, default=None + Dictionary with keyword passed to the `matplotlib.pyplot.plot` + call to draw the optimal line. + + Returns + ------- + display : :class:`~sklearn.metrics.PredictionErrorDisplay` + Object that stores the computed values. + + See Also + -------- + PredictionErrorDisplay : Prediction error visualization for regression. + PredictionErrorDisplay.from_estimator : Prediction error visualization + given an estimator and some data. 
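+ + Notes + ----- + For large datasets only a random subset of the points needs to be shown; + a minimal sketch, assuming ``y`` and ``y_pred`` as in the example below: + + >>> PredictionErrorDisplay.from_predictions( + ... y_true=y, y_pred=y_pred, subsample=0.25, random_state=0) + <...>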
+ + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import load_diabetes + >>> from sklearn.linear_model import Ridge + >>> from sklearn.metrics import PredictionErrorDisplay + >>> X, y = load_diabetes(return_X_y=True) + >>> ridge = Ridge().fit(X, y) + >>> y_pred = ridge.predict(X) + >>> disp = PredictionErrorDisplay.from_predictions(y_true=y, y_pred=y_pred) + >>> plt.show() + """ + check_matplotlib_support(f"{cls.__name__}.from_predictions") + + random_state = check_random_state(random_state) + + n_samples = len(y_true) + if isinstance(subsample, numbers.Integral): + if subsample <= 0: + raise ValueError( + f"When an integer, subsample={subsample} should be positive." + ) + elif isinstance(subsample, numbers.Real): + if subsample <= 0 or subsample >= 1: + raise ValueError( + f"When a floating-point, subsample={subsample} should" + " be in the (0, 1) range." + ) + subsample = int(n_samples * subsample) + + if subsample is not None and subsample < n_samples: + indices = random_state.choice(np.arange(n_samples), size=subsample) + y_true = _safe_indexing(y_true, indices, axis=0) + y_pred = _safe_indexing(y_pred, indices, axis=0) + + viz = cls( + y_true=y_true, + y_pred=y_pred, + ) + + return viz.plot( + ax=ax, + kind=kind, + scatter_kwargs=scatter_kwargs, + line_kwargs=line_kwargs, + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/roc_curve.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/roc_curve.py new file mode 100644 index 0000000000000000000000000000000000000000..383f14e688859afe537ccf89da68fe2751bcb5a4 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/roc_curve.py @@ -0,0 +1,795 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + + +import warnings + +import numpy as np + +from ...utils import _safe_indexing +from ...utils._plotting import ( + _BinaryClassifierCurveDisplayMixin, + _check_param_lengths, + _convert_to_list_leaving_none, + _deprecate_estimator_name, + _despine, + _validate_style_kwargs, +) +from ...utils._response import _get_response_values_binary +from .._ranking import auc, roc_curve + + +class RocCurveDisplay(_BinaryClassifierCurveDisplayMixin): + """ROC Curve visualization. + + It is recommended to use + :func:`~sklearn.metrics.RocCurveDisplay.from_estimator` or + :func:`~sklearn.metrics.RocCurveDisplay.from_predictions` or + :func:`~sklearn.metrics.RocCurveDisplay.from_cv_results` to create + a :class:`~sklearn.metrics.RocCurveDisplay`. All parameters are + stored as attributes. + + For general information regarding `scikit-learn` visualization tools, see + the :ref:`Visualization Guide `. + For guidance on interpreting these plots, refer to the :ref:`Model + Evaluation Guide `. + + Parameters + ---------- + fpr : ndarray or list of ndarrays + False positive rates. Each ndarray should contain values for a single curve. + If plotting multiple curves, list should be of same length as `tpr`. + + .. versionchanged:: 1.7 + Now accepts a list for plotting multiple curves. + + tpr : ndarray or list of ndarrays + True positive rates. Each ndarray should contain values for a single curve. + If plotting multiple curves, list should be of same length as `fpr`. + + .. versionchanged:: 1.7 + Now accepts a list for plotting multiple curves. + + roc_auc : float or list of floats, default=None + Area under ROC curve, used for labeling each curve in the legend. + If plotting multiple curves, should be a list of the same length as `fpr` + and `tpr`. 
If `None`, ROC AUC scores are not shown in the legend. + + .. versionchanged:: 1.7 + Now accepts a list for plotting multiple curves. + + name : str or list of str, default=None + Name for labeling legend entries. The number of legend entries is determined + by the `curve_kwargs` passed to `plot`, and is not affected by `name`. + To label each curve, provide a list of strings. To avoid labeling + individual curves that have the same appearance, this cannot be used in + conjunction with `curve_kwargs` being a dictionary or None. If a + string is provided, it will be used to either label the single legend entry + or if there are multiple legend entries, label each individual curve with + the same name. If still `None`, no name is shown in the legend. + + .. versionadded:: 1.7 + + pos_label : int, float, bool or str, default=None + The class considered as the positive class when computing the roc auc + metrics. By default, `estimators.classes_[1]` is considered + as the positive class. + + .. versionadded:: 0.24 + + estimator_name : str, default=None + Name of estimator. If None, the estimator name is not shown. + + .. deprecated:: 1.7 + `estimator_name` is deprecated and will be removed in 1.9. Use `name` + instead. + + Attributes + ---------- + line_ : matplotlib Artist or list of matplotlib Artists + ROC Curves. + + .. versionchanged:: 1.7 + This attribute can now be a list of Artists, for when multiple curves + are plotted. + + chance_level_ : matplotlib Artist or None + The chance level line. It is `None` if the chance level is not plotted. + + .. versionadded:: 1.3 + + ax_ : matplotlib Axes + Axes with ROC Curve. + + figure_ : matplotlib Figure + Figure containing the curve. + + See Also + -------- + roc_curve : Compute Receiver operating characteristic (ROC) curve. + RocCurveDisplay.from_estimator : Plot Receiver Operating Characteristic + (ROC) curve given an estimator and some data. + RocCurveDisplay.from_predictions : Plot Receiver Operating Characteristic + (ROC) curve given the true and predicted values. + roc_auc_score : Compute the area under the ROC curve. + + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> import numpy as np + >>> from sklearn import metrics + >>> y_true = np.array([0, 0, 1, 1]) + >>> y_score = np.array([0.1, 0.4, 0.35, 0.8]) + >>> fpr, tpr, thresholds = metrics.roc_curve(y_true, y_score) + >>> roc_auc = metrics.auc(fpr, tpr) + >>> display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, + ... 
name='example estimator') + >>> display.plot() + <...> + >>> plt.show() + """ + + def __init__( + self, + *, + fpr, + tpr, + roc_auc=None, + name=None, + pos_label=None, + estimator_name="deprecated", + ): + self.fpr = fpr + self.tpr = tpr + self.roc_auc = roc_auc + self.name = _deprecate_estimator_name(estimator_name, name, "1.7") + self.pos_label = pos_label + + def _validate_plot_params(self, *, ax, name): + self.ax_, self.figure_, name = super()._validate_plot_params(ax=ax, name=name) + + fpr = _convert_to_list_leaving_none(self.fpr) + tpr = _convert_to_list_leaving_none(self.tpr) + roc_auc = _convert_to_list_leaving_none(self.roc_auc) + name = _convert_to_list_leaving_none(name) + + optional = {"self.roc_auc": roc_auc} + if isinstance(name, list) and len(name) != 1: + optional.update({"'name' (or self.name)": name}) + _check_param_lengths( + required={"self.fpr": fpr, "self.tpr": tpr}, + optional=optional, + class_name="RocCurveDisplay", + ) + return fpr, tpr, roc_auc, name + + def plot( + self, + ax=None, + *, + name=None, + curve_kwargs=None, + plot_chance_level=False, + chance_level_kw=None, + despine=False, + **kwargs, + ): + """Plot visualization. + + Parameters + ---------- + ax : matplotlib axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + name : str or list of str, default=None + Name for labeling legend entries. The number of legend entries + is determined by `curve_kwargs`, and is not affected by `name`. + To label each curve, provide a list of strings. To avoid labeling + individual curves that have the same appearance, this cannot be used in + conjunction with `curve_kwargs` being a dictionary or None. If a + string is provided, it will be used to either label the single legend entry + or if there are multiple legend entries, label each individual curve with + the same name. If `None`, set to `name` provided at `RocCurveDisplay` + initialization. If still `None`, no name is shown in the legend. + + .. versionadded:: 1.7 + + curve_kwargs : dict or list of dict, default=None + Keywords arguments to be passed to matplotlib's `plot` function + to draw individual ROC curves. For single curve plotting, should be + a dictionary. For multi-curve plotting, if a list is provided the + parameters are applied to the ROC curves of each CV fold + sequentially and a legend entry is added for each curve. + If a single dictionary is provided, the same parameters are applied + to all ROC curves and a single legend entry for all curves is added, + labeled with the mean ROC AUC score. + + .. versionadded:: 1.7 + + plot_chance_level : bool, default=False + Whether to plot the chance level. + + .. versionadded:: 1.3 + + chance_level_kw : dict, default=None + Keyword arguments to be passed to matplotlib's `plot` for rendering + the chance level line. + + .. versionadded:: 1.3 + + despine : bool, default=False + Whether to remove the top and right spines from the plot. + + .. versionadded:: 1.6 + + **kwargs : dict + Keyword arguments to be passed to matplotlib's `plot`. + + .. deprecated:: 1.7 + kwargs is deprecated and will be removed in 1.9. Pass matplotlib + arguments to `curve_kwargs` as a dictionary instead. + + Returns + ------- + display : :class:`~sklearn.metrics.RocCurveDisplay` + Object that stores computed values. 
+ """ + fpr, tpr, roc_auc, name = self._validate_plot_params(ax=ax, name=name) + n_curves = len(fpr) + if not isinstance(curve_kwargs, list) and n_curves > 1: + if roc_auc: + legend_metric = {"mean": np.mean(roc_auc), "std": np.std(roc_auc)} + else: + legend_metric = {"mean": None, "std": None} + else: + roc_auc = roc_auc if roc_auc is not None else [None] * n_curves + legend_metric = {"metric": roc_auc} + + curve_kwargs = self._validate_curve_kwargs( + n_curves, + name, + legend_metric, + "AUC", + curve_kwargs=curve_kwargs, + **kwargs, + ) + + default_chance_level_line_kw = { + "label": "Chance level (AUC = 0.5)", + "color": "k", + "linestyle": "--", + } + + if chance_level_kw is None: + chance_level_kw = {} + + chance_level_kw = _validate_style_kwargs( + default_chance_level_line_kw, chance_level_kw + ) + + self.line_ = [] + for fpr, tpr, line_kw in zip(fpr, tpr, curve_kwargs): + self.line_.extend(self.ax_.plot(fpr, tpr, **line_kw)) + # Return single artist if only one curve is plotted + if len(self.line_) == 1: + self.line_ = self.line_[0] + + info_pos_label = ( + f" (Positive label: {self.pos_label})" if self.pos_label is not None else "" + ) + + xlabel = "False Positive Rate" + info_pos_label + ylabel = "True Positive Rate" + info_pos_label + self.ax_.set( + xlabel=xlabel, + xlim=(-0.01, 1.01), + ylabel=ylabel, + ylim=(-0.01, 1.01), + aspect="equal", + ) + + if plot_chance_level: + (self.chance_level_,) = self.ax_.plot((0, 1), (0, 1), **chance_level_kw) + else: + self.chance_level_ = None + + if despine: + _despine(self.ax_) + + if curve_kwargs[0].get("label") is not None or ( + plot_chance_level and chance_level_kw.get("label") is not None + ): + self.ax_.legend(loc="lower right") + + return self + + @classmethod + def from_estimator( + cls, + estimator, + X, + y, + *, + sample_weight=None, + drop_intermediate=True, + response_method="auto", + pos_label=None, + name=None, + ax=None, + curve_kwargs=None, + plot_chance_level=False, + chance_level_kw=None, + despine=False, + **kwargs, + ): + """Create a ROC Curve display from an estimator. + + For general information regarding `scikit-learn` visualization tools, + see the :ref:`Visualization Guide `. + For guidance on interpreting these plots, refer to the :ref:`Model + Evaluation Guide `. + + Parameters + ---------- + estimator : estimator instance + Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline` + in which the last estimator is a classifier. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input values. + + y : array-like of shape (n_samples,) + Target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + drop_intermediate : bool, default=True + Whether to drop thresholds where the resulting point is collinear + with its neighbors in ROC space. This has no effect on the ROC AUC + or visual shape of the curve, but reduces the number of plotted + points. + + response_method : {'predict_proba', 'decision_function', 'auto'} \ + default='auto' + Specifies whether to use :term:`predict_proba` or + :term:`decision_function` as the target response. If set to 'auto', + :term:`predict_proba` is tried first and if it does not exist + :term:`decision_function` is tried next. + + pos_label : int, float, bool or str, default=None + The class considered as the positive class when computing the ROC AUC. + By default, `estimators.classes_[1]` is considered + as the positive class. + + name : str, default=None + Name of ROC Curve for labeling. 
If `None`, use the name of the + estimator. + + ax : matplotlib axes, default=None + Axes object to plot on. If `None`, a new figure and axes is created. + + curve_kwargs : dict, default=None + Keywords arguments to be passed to matplotlib's `plot` function. + + .. versionadded:: 1.7 + + plot_chance_level : bool, default=False + Whether to plot the chance level. + + .. versionadded:: 1.3 + + chance_level_kw : dict, default=None + Keyword arguments to be passed to matplotlib's `plot` for rendering + the chance level line. + + .. versionadded:: 1.3 + + despine : bool, default=False + Whether to remove the top and right spines from the plot. + + .. versionadded:: 1.6 + + **kwargs : dict + Keyword arguments to be passed to matplotlib's `plot`. + + .. deprecated:: 1.7 + kwargs is deprecated and will be removed in 1.9. Pass matplotlib + arguments to `curve_kwargs` as a dictionary instead. + + Returns + ------- + display : :class:`~sklearn.metrics.RocCurveDisplay` + The ROC Curve display. + + See Also + -------- + roc_curve : Compute Receiver operating characteristic (ROC) curve. + RocCurveDisplay.from_predictions : ROC Curve visualization given the + probabilities of scores of a classifier. + roc_auc_score : Compute the area under the ROC curve. + + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.metrics import RocCurveDisplay + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.svm import SVC + >>> X, y = make_classification(random_state=0) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, random_state=0) + >>> clf = SVC(random_state=0).fit(X_train, y_train) + >>> RocCurveDisplay.from_estimator( + ... clf, X_test, y_test) + <...> + >>> plt.show() + """ + y_score, pos_label, name = cls._validate_and_get_response_values( + estimator, + X, + y, + response_method=response_method, + pos_label=pos_label, + name=name, + ) + + return cls.from_predictions( + y_true=y, + y_score=y_score, + sample_weight=sample_weight, + drop_intermediate=drop_intermediate, + pos_label=pos_label, + name=name, + ax=ax, + curve_kwargs=curve_kwargs, + plot_chance_level=plot_chance_level, + chance_level_kw=chance_level_kw, + despine=despine, + **kwargs, + ) + + @classmethod + def from_predictions( + cls, + y_true, + y_score=None, + *, + sample_weight=None, + drop_intermediate=True, + pos_label=None, + name=None, + ax=None, + curve_kwargs=None, + plot_chance_level=False, + chance_level_kw=None, + despine=False, + y_pred="deprecated", + **kwargs, + ): + """Plot ROC curve given the true and predicted values. + + For general information regarding `scikit-learn` visualization tools, + see the :ref:`Visualization Guide `. + For guidance on interpreting these plots, refer to the :ref:`Model + Evaluation Guide `. + + .. versionadded:: 1.0 + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + True labels. + + y_score : array-like of shape (n_samples,) + Target scores, can either be probability estimates of the positive + class, confidence values, or non-thresholded measure of decisions + (as returned by “decision_function” on some classifiers). + + .. versionadded:: 1.7 + `y_pred` has been renamed to `y_score`. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + drop_intermediate : bool, default=True + Whether to drop thresholds where the resulting point is collinear + with its neighbors in ROC space. 
This has no effect on the ROC AUC + or visual shape of the curve, but reduces the number of plotted + points. + + pos_label : int, float, bool or str, default=None + The label of the positive class when computing the ROC AUC. + When `pos_label=None`, if `y_true` is in {-1, 1} or {0, 1}, `pos_label` + is set to 1, otherwise an error will be raised. + + name : str, default=None + Name of ROC curve for legend labeling. If `None`, name will be set to + `"Classifier"`. + + ax : matplotlib axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + curve_kwargs : dict, default=None + Keywords arguments to be passed to matplotlib's `plot` function. + + .. versionadded:: 1.7 + + plot_chance_level : bool, default=False + Whether to plot the chance level. + + .. versionadded:: 1.3 + + chance_level_kw : dict, default=None + Keyword arguments to be passed to matplotlib's `plot` for rendering + the chance level line. + + .. versionadded:: 1.3 + + despine : bool, default=False + Whether to remove the top and right spines from the plot. + + .. versionadded:: 1.6 + + y_pred : array-like of shape (n_samples,) + Target scores, can either be probability estimates of the positive + class, confidence values, or non-thresholded measure of decisions + (as returned by “decision_function” on some classifiers). + + .. deprecated:: 1.7 + `y_pred` is deprecated and will be removed in 1.9. Use + `y_score` instead. + + **kwargs : dict + Additional keywords arguments passed to matplotlib `plot` function. + + .. deprecated:: 1.7 + kwargs is deprecated and will be removed in 1.9. Pass matplotlib + arguments to `curve_kwargs` as a dictionary instead. + + Returns + ------- + display : :class:`~sklearn.metrics.RocCurveDisplay` + Object that stores computed values. + + See Also + -------- + roc_curve : Compute Receiver operating characteristic (ROC) curve. + RocCurveDisplay.from_estimator : ROC Curve visualization given an + estimator and some data. + roc_auc_score : Compute the area under the ROC curve. + + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.metrics import RocCurveDisplay + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.svm import SVC + >>> X, y = make_classification(random_state=0) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, random_state=0) + >>> clf = SVC(random_state=0).fit(X_train, y_train) + >>> y_score = clf.decision_function(X_test) + >>> RocCurveDisplay.from_predictions(y_test, y_score) + <...> + >>> plt.show() + """ + # TODO(1.9): remove after the end of the deprecation period of `y_pred` + if y_score is not None and not ( + isinstance(y_pred, str) and y_pred == "deprecated" + ): + raise ValueError( + "`y_pred` and `y_score` cannot be both specified. Please use `y_score`" + " only as `y_pred` is deprecated in 1.7 and will be removed in 1.9." + ) + if not (isinstance(y_pred, str) and y_pred == "deprecated"): + warnings.warn( + ( + "y_pred is deprecated in 1.7 and will be removed in 1.9. " + "Please use `y_score` instead." 
+ ), + FutureWarning, + ) + y_score = y_pred + + pos_label_validated, name = cls._validate_from_predictions_params( + y_true, y_score, sample_weight=sample_weight, pos_label=pos_label, name=name + ) + + fpr, tpr, _ = roc_curve( + y_true, + y_score, + pos_label=pos_label, + sample_weight=sample_weight, + drop_intermediate=drop_intermediate, + ) + roc_auc = auc(fpr, tpr) + + viz = cls( + fpr=fpr, + tpr=tpr, + roc_auc=roc_auc, + name=name, + pos_label=pos_label_validated, + ) + + return viz.plot( + ax=ax, + curve_kwargs=curve_kwargs, + plot_chance_level=plot_chance_level, + chance_level_kw=chance_level_kw, + despine=despine, + **kwargs, + ) + + @classmethod + def from_cv_results( + cls, + cv_results, + X, + y, + *, + sample_weight=None, + drop_intermediate=True, + response_method="auto", + pos_label=None, + ax=None, + name=None, + curve_kwargs=None, + plot_chance_level=False, + chance_level_kwargs=None, + despine=False, + ): + """Create a multi-fold ROC curve display given cross-validation results. + + .. versionadded:: 1.7 + + Parameters + ---------- + cv_results : dict + Dictionary as returned by :func:`~sklearn.model_selection.cross_validate` + using `return_estimator=True` and `return_indices=True` (i.e., dictionary + should contain the keys "estimator" and "indices"). + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input values. + + y : array-like of shape (n_samples,) + Target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + drop_intermediate : bool, default=True + Whether to drop some suboptimal thresholds which would not appear + on a plotted ROC curve. This is useful in order to create lighter + ROC curves. + + response_method : {'predict_proba', 'decision_function', 'auto'} \ + default='auto' + Specifies whether to use :term:`predict_proba` or + :term:`decision_function` as the target response. If set to 'auto', + :term:`predict_proba` is tried first and if it does not exist + :term:`decision_function` is tried next. + + pos_label : int, float, bool or str, default=None + The class considered as the positive class when computing the ROC AUC + metrics. By default, `estimators.classes_[1]` is considered + as the positive class. + + ax : matplotlib axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + name : str or list of str, default=None + Name for labeling legend entries. The number of legend entries + is determined by `curve_kwargs`, and is not affected by `name`. + To label each curve, provide a list of strings. To avoid labeling + individual curves that have the same appearance, this cannot be used in + conjunction with `curve_kwargs` being a dictionary or None. If a + string is provided, it will be used to either label the single legend entry + or if there are multiple legend entries, label each individual curve with + the same name. If `None`, no name is shown in the legend. + + curve_kwargs : dict or list of dict, default=None + Keywords arguments to be passed to matplotlib's `plot` function + to draw individual ROC curves. If a list is provided the + parameters are applied to the ROC curves of each CV fold + sequentially and a legend entry is added for each curve. + If a single dictionary is provided, the same parameters are applied + to all ROC curves and a single legend entry for all curves is added, + labeled with the mean ROC AUC score. + + plot_chance_level : bool, default=False + Whether to plot the chance level. 
+ + chance_level_kwargs : dict, default=None + Keyword arguments to be passed to matplotlib's `plot` for rendering + the chance level line. + + despine : bool, default=False + Whether to remove the top and right spines from the plot. + + Returns + ------- + display : :class:`~sklearn.metrics.RocCurveDisplay` + The multi-fold ROC curve display. + + See Also + -------- + roc_curve : Compute Receiver operating characteristic (ROC) curve. + RocCurveDisplay.from_estimator : ROC Curve visualization given an + estimator and some data. + RocCurveDisplay.from_predictions : ROC Curve visualization given the + probabilities of scores of a classifier. + roc_auc_score : Compute the area under the ROC curve. + + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.metrics import RocCurveDisplay + >>> from sklearn.model_selection import cross_validate + >>> from sklearn.svm import SVC + >>> X, y = make_classification(random_state=0) + >>> clf = SVC(random_state=0) + >>> cv_results = cross_validate( + ... clf, X, y, cv=3, return_estimator=True, return_indices=True) + >>> RocCurveDisplay.from_cv_results(cv_results, X, y) + <...> + >>> plt.show() + """ + pos_label_ = cls._validate_from_cv_results_params( + cv_results, + X, + y, + sample_weight=sample_weight, + pos_label=pos_label, + ) + + fpr_folds, tpr_folds, auc_folds = [], [], [] + for estimator, test_indices in zip( + cv_results["estimator"], cv_results["indices"]["test"] + ): + y_true = _safe_indexing(y, test_indices) + y_pred, _ = _get_response_values_binary( + estimator, + _safe_indexing(X, test_indices), + response_method=response_method, + pos_label=pos_label_, + ) + sample_weight_fold = ( + None + if sample_weight is None + else _safe_indexing(sample_weight, test_indices) + ) + fpr, tpr, _ = roc_curve( + y_true, + y_pred, + pos_label=pos_label_, + sample_weight=sample_weight_fold, + drop_intermediate=drop_intermediate, + ) + roc_auc = auc(fpr, tpr) + + fpr_folds.append(fpr) + tpr_folds.append(tpr) + auc_folds.append(roc_auc) + + viz = cls( + fpr=fpr_folds, + tpr=tpr_folds, + roc_auc=auc_folds, + name=name, + pos_label=pos_label_, + ) + return viz.plot( + ax=ax, + curve_kwargs=curve_kwargs, + plot_chance_level=plot_chance_level, + chance_level_kw=chance_level_kwargs, + despine=despine, + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/tests/test_common_curve_display.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/tests/test_common_curve_display.py new file mode 100644 index 0000000000000000000000000000000000000000..753f2a1e7319d51b2ff7c299a25a7146801e5fd3 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/tests/test_common_curve_display.py @@ -0,0 +1,292 @@ +import numpy as np +import pytest + +from sklearn.base import BaseEstimator, ClassifierMixin, clone +from sklearn.calibration import CalibrationDisplay +from sklearn.compose import make_column_transformer +from sklearn.datasets import load_iris +from sklearn.exceptions import NotFittedError +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import ( + ConfusionMatrixDisplay, + DetCurveDisplay, + PrecisionRecallDisplay, + PredictionErrorDisplay, + 
RocCurveDisplay, +) +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor + + +@pytest.fixture(scope="module") +def data(): + return load_iris(return_X_y=True) + + +@pytest.fixture(scope="module") +def data_binary(data): + X, y = data + return X[y < 2], y[y < 2] + + +@pytest.mark.parametrize( + "Display", + [CalibrationDisplay, DetCurveDisplay, PrecisionRecallDisplay, RocCurveDisplay], +) +def test_display_curve_error_classifier(pyplot, data, data_binary, Display): + """Check that a proper error is raised when only binary classification is + supported.""" + X, y = data + X_binary, y_binary = data_binary + clf = DecisionTreeClassifier().fit(X, y) + + # Case 1: multiclass classifier with multiclass target + msg = "Expected 'estimator' to be a binary classifier. Got 3 classes instead." + with pytest.raises(ValueError, match=msg): + Display.from_estimator(clf, X, y) + + # Case 2: multiclass classifier with binary target + with pytest.raises(ValueError, match=msg): + Display.from_estimator(clf, X_binary, y_binary) + + # Case 3: binary classifier with multiclass target + clf = DecisionTreeClassifier().fit(X_binary, y_binary) + msg = "The target y is not binary. Got multiclass type of target." + with pytest.raises(ValueError, match=msg): + Display.from_estimator(clf, X, y) + + +@pytest.mark.parametrize( + "Display", + [CalibrationDisplay, DetCurveDisplay, PrecisionRecallDisplay, RocCurveDisplay], +) +def test_display_curve_error_regression(pyplot, data_binary, Display): + """Check that we raise an error with regressor.""" + + # Case 1: regressor + X, y = data_binary + regressor = DecisionTreeRegressor().fit(X, y) + + msg = "Expected 'estimator' to be a binary classifier. Got DecisionTreeRegressor" + with pytest.raises(ValueError, match=msg): + Display.from_estimator(regressor, X, y) + + # Case 2: regression target + classifier = DecisionTreeClassifier().fit(X, y) + # Force `y_true` to be seen as a regression problem + y = y + 0.5 + msg = "The target y is not binary. Got continuous type of target." + with pytest.raises(ValueError, match=msg): + Display.from_estimator(classifier, X, y) + with pytest.raises(ValueError, match=msg): + Display.from_predictions(y, regressor.fit(X, y).predict(X)) + + +@pytest.mark.parametrize( + "response_method, msg", + [ + ( + "predict_proba", + "MyClassifier has none of the following attributes: predict_proba.", + ), + ( + "decision_function", + "MyClassifier has none of the following attributes: decision_function.", + ), + ( + "auto", + ( + "MyClassifier has none of the following attributes: predict_proba," + " decision_function." 
+ ), + ), + ( + "bad_method", + "MyClassifier has none of the following attributes: bad_method.", + ), + ], +) +@pytest.mark.parametrize( + "Display", [DetCurveDisplay, PrecisionRecallDisplay, RocCurveDisplay] +) +def test_display_curve_error_no_response( + pyplot, + data_binary, + response_method, + msg, + Display, +): + """Check that a proper error is raised when the response method requested + is not defined for the given trained classifier.""" + X, y = data_binary + + class MyClassifier(ClassifierMixin, BaseEstimator): + def fit(self, X, y): + self.classes_ = [0, 1] + return self + + clf = MyClassifier().fit(X, y) + + with pytest.raises(AttributeError, match=msg): + Display.from_estimator(clf, X, y, response_method=response_method) + + +@pytest.mark.parametrize("Display", [DetCurveDisplay, PrecisionRecallDisplay]) +@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"]) +def test_display_curve_estimator_name_multiple_calls( + pyplot, + data_binary, + Display, + constructor_name, +): + """Check that passing `name` when calling `plot` will overwrite the original name + in the legend.""" + X, y = data_binary + clf_name = "my hand-crafted name" + clf = LogisticRegression().fit(X, y) + y_pred = clf.predict_proba(X)[:, 1] + + # safe guard for the binary if/else construction + assert constructor_name in ("from_estimator", "from_predictions") + + if constructor_name == "from_estimator": + disp = Display.from_estimator(clf, X, y, name=clf_name) + else: + disp = Display.from_predictions(y, y_pred, name=clf_name) + assert disp.estimator_name == clf_name + pyplot.close("all") + disp.plot() + assert clf_name in disp.line_.get_label() + pyplot.close("all") + clf_name = "another_name" + disp.plot(name=clf_name) + assert clf_name in disp.line_.get_label() + + +# TODO: remove this test once classes moved to using `name` instead of +# `estimator_name` +@pytest.mark.parametrize( + "clf", + [ + LogisticRegression(), + make_pipeline(StandardScaler(), LogisticRegression()), + make_pipeline( + make_column_transformer((StandardScaler(), [0, 1])), LogisticRegression() + ), + ], +) +@pytest.mark.parametrize("Display", [DetCurveDisplay, PrecisionRecallDisplay]) +def test_display_curve_not_fitted_errors_old_name(pyplot, data_binary, clf, Display): + """Check that a proper error is raised when the classifier is not + fitted.""" + X, y = data_binary + # clone since we parametrize the test and the classifier will be fitted + # when testing the second and subsequent plotting function + model = clone(clf) + with pytest.raises(NotFittedError): + Display.from_estimator(model, X, y) + model.fit(X, y) + disp = Display.from_estimator(model, X, y) + assert model.__class__.__name__ in disp.line_.get_label() + assert disp.estimator_name == model.__class__.__name__ + + +@pytest.mark.parametrize( + "clf", + [ + LogisticRegression(), + make_pipeline(StandardScaler(), LogisticRegression()), + make_pipeline( + make_column_transformer((StandardScaler(), [0, 1])), LogisticRegression() + ), + ], +) +@pytest.mark.parametrize("Display", [RocCurveDisplay]) +def test_display_curve_not_fitted_errors(pyplot, data_binary, clf, Display): + """Check that a proper error is raised when the classifier is not fitted.""" + X, y = data_binary + # clone since we parametrize the test and the classifier will be fitted + # when testing the second and subsequent plotting function + model = clone(clf) + with pytest.raises(NotFittedError): + Display.from_estimator(model, X, y) + model.fit(X, y) + disp = 
Display.from_estimator(model, X, y) + assert model.__class__.__name__ in disp.line_.get_label() + assert disp.name == model.__class__.__name__ + + +@pytest.mark.parametrize( + "Display", [DetCurveDisplay, PrecisionRecallDisplay, RocCurveDisplay] +) +def test_display_curve_n_samples_consistency(pyplot, data_binary, Display): + """Check the error raised when `y_pred` or `sample_weight` have inconsistent + length.""" + X, y = data_binary + classifier = DecisionTreeClassifier().fit(X, y) + + msg = "Found input variables with inconsistent numbers of samples" + with pytest.raises(ValueError, match=msg): + Display.from_estimator(classifier, X[:-2], y) + with pytest.raises(ValueError, match=msg): + Display.from_estimator(classifier, X, y[:-2]) + with pytest.raises(ValueError, match=msg): + Display.from_estimator(classifier, X, y, sample_weight=np.ones(X.shape[0] - 2)) + + +@pytest.mark.parametrize( + "Display", [DetCurveDisplay, PrecisionRecallDisplay, RocCurveDisplay] +) +def test_display_curve_error_pos_label(pyplot, data_binary, Display): + """Check consistence of error message when `pos_label` should be specified.""" + X, y = data_binary + y = y + 10 + + classifier = DecisionTreeClassifier().fit(X, y) + y_pred = classifier.predict_proba(X)[:, -1] + msg = r"y_true takes value in {10, 11} and pos_label is not specified" + with pytest.raises(ValueError, match=msg): + Display.from_predictions(y, y_pred) + + +@pytest.mark.parametrize( + "Display", + [ + CalibrationDisplay, + DetCurveDisplay, + PrecisionRecallDisplay, + RocCurveDisplay, + PredictionErrorDisplay, + ConfusionMatrixDisplay, + ], +) +@pytest.mark.parametrize( + "constructor", + ["from_predictions", "from_estimator"], +) +def test_classifier_display_curve_named_constructor_return_type( + pyplot, data_binary, Display, constructor +): + """Check that named constructors return the correct type when subclassed. 
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/pull/27675 + """ + X, y = data_binary + + # This can be anything - we just need to check the named constructor return + # type so the only requirement here is instantiating the class without error + y_pred = y + + classifier = LogisticRegression().fit(X, y) + + class SubclassOfDisplay(Display): + pass + + if constructor == "from_predictions": + curve = SubclassOfDisplay.from_predictions(y, y_pred) + else: # constructor == "from_estimator" + curve = SubclassOfDisplay.from_estimator(classifier, X, y) + + assert isinstance(curve, SubclassOfDisplay) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/tests/test_confusion_matrix_display.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/tests/test_confusion_matrix_display.py new file mode 100644 index 0000000000000000000000000000000000000000..6e93bf4993a93f0f5c12d295aa9c0c3b6136218d --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/tests/test_confusion_matrix_display.py @@ -0,0 +1,374 @@ +import numpy as np +import pytest +from numpy.testing import ( + assert_allclose, + assert_array_equal, +) + +from sklearn.compose import make_column_transformer +from sklearn.datasets import make_classification +from sklearn.exceptions import NotFittedError +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.svm import SVC, SVR + + +def test_confusion_matrix_display_validation(pyplot): + """Check that we raise the proper error when validating parameters.""" + X, y = make_classification( + n_samples=100, n_informative=5, n_classes=5, random_state=0 + ) + + with pytest.raises(NotFittedError): + ConfusionMatrixDisplay.from_estimator(SVC(), X, y) + + regressor = SVR().fit(X, y) + y_pred_regressor = regressor.predict(X) + y_pred_classifier = SVC().fit(X, y).predict(X) + + err_msg = "ConfusionMatrixDisplay.from_estimator only supports classifiers" + with pytest.raises(ValueError, match=err_msg): + ConfusionMatrixDisplay.from_estimator(regressor, X, y) + + err_msg = "Mix type of y not allowed, got types" + with pytest.raises(ValueError, match=err_msg): + # Force `y_true` to be seen as a regression problem + ConfusionMatrixDisplay.from_predictions(y + 0.5, y_pred_classifier) + with pytest.raises(ValueError, match=err_msg): + ConfusionMatrixDisplay.from_predictions(y, y_pred_regressor) + + err_msg = "Found input variables with inconsistent numbers of samples" + with pytest.raises(ValueError, match=err_msg): + ConfusionMatrixDisplay.from_predictions(y, y_pred_classifier[::2]) + + +@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"]) +@pytest.mark.parametrize("with_labels", [True, False]) +@pytest.mark.parametrize("with_display_labels", [True, False]) +def test_confusion_matrix_display_custom_labels( + pyplot, constructor_name, with_labels, with_display_labels +): + """Check the resulting plot when labels are given.""" + n_classes = 5 + X, y = make_classification( + n_samples=100, n_informative=5, n_classes=n_classes, random_state=0 + ) + classifier = SVC().fit(X, y) + y_pred = classifier.predict(X) + + # safe guard for the binary if/else construction + assert constructor_name in ("from_estimator", "from_predictions") + + ax = pyplot.gca() + labels = [2, 1, 0, 3, 4] if with_labels else None + display_labels = ["b", "d", "a", "e", 
"f"] if with_display_labels else None + + cm = confusion_matrix(y, y_pred, labels=labels) + common_kwargs = { + "ax": ax, + "display_labels": display_labels, + "labels": labels, + } + if constructor_name == "from_estimator": + disp = ConfusionMatrixDisplay.from_estimator(classifier, X, y, **common_kwargs) + else: + disp = ConfusionMatrixDisplay.from_predictions(y, y_pred, **common_kwargs) + assert_allclose(disp.confusion_matrix, cm) + + if with_display_labels: + expected_display_labels = display_labels + elif with_labels: + expected_display_labels = labels + else: + expected_display_labels = list(range(n_classes)) + + expected_display_labels_str = [str(name) for name in expected_display_labels] + + x_ticks = [tick.get_text() for tick in disp.ax_.get_xticklabels()] + y_ticks = [tick.get_text() for tick in disp.ax_.get_yticklabels()] + + assert_array_equal(disp.display_labels, expected_display_labels) + assert_array_equal(x_ticks, expected_display_labels_str) + assert_array_equal(y_ticks, expected_display_labels_str) + + +@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"]) +@pytest.mark.parametrize("normalize", ["true", "pred", "all", None]) +@pytest.mark.parametrize("include_values", [True, False]) +def test_confusion_matrix_display_plotting( + pyplot, + constructor_name, + normalize, + include_values, +): + """Check the overall plotting rendering.""" + n_classes = 5 + X, y = make_classification( + n_samples=100, n_informative=5, n_classes=n_classes, random_state=0 + ) + classifier = SVC().fit(X, y) + y_pred = classifier.predict(X) + + # safe guard for the binary if/else construction + assert constructor_name in ("from_estimator", "from_predictions") + + ax = pyplot.gca() + cmap = "plasma" + + cm = confusion_matrix(y, y_pred) + common_kwargs = { + "normalize": normalize, + "cmap": cmap, + "ax": ax, + "include_values": include_values, + } + if constructor_name == "from_estimator": + disp = ConfusionMatrixDisplay.from_estimator(classifier, X, y, **common_kwargs) + else: + disp = ConfusionMatrixDisplay.from_predictions(y, y_pred, **common_kwargs) + + assert disp.ax_ == ax + + if normalize == "true": + cm = cm / cm.sum(axis=1, keepdims=True) + elif normalize == "pred": + cm = cm / cm.sum(axis=0, keepdims=True) + elif normalize == "all": + cm = cm / cm.sum() + + assert_allclose(disp.confusion_matrix, cm) + import matplotlib as mpl + + assert isinstance(disp.im_, mpl.image.AxesImage) + assert disp.im_.get_cmap().name == cmap + assert isinstance(disp.ax_, pyplot.Axes) + assert isinstance(disp.figure_, pyplot.Figure) + + assert disp.ax_.get_ylabel() == "True label" + assert disp.ax_.get_xlabel() == "Predicted label" + + x_ticks = [tick.get_text() for tick in disp.ax_.get_xticklabels()] + y_ticks = [tick.get_text() for tick in disp.ax_.get_yticklabels()] + + expected_display_labels = list(range(n_classes)) + + expected_display_labels_str = [str(name) for name in expected_display_labels] + + assert_array_equal(disp.display_labels, expected_display_labels) + assert_array_equal(x_ticks, expected_display_labels_str) + assert_array_equal(y_ticks, expected_display_labels_str) + + image_data = disp.im_.get_array().data + assert_allclose(image_data, cm) + + if include_values: + assert disp.text_.shape == (n_classes, n_classes) + fmt = ".2g" + expected_text = np.array([format(v, fmt) for v in cm.ravel(order="C")]) + text_text = np.array([t.get_text() for t in disp.text_.ravel(order="C")]) + assert_array_equal(expected_text, text_text) + else: + assert disp.text_ is None + + 
+@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"]) +def test_confusion_matrix_display(pyplot, constructor_name): + """Check the behaviour of the default constructor without using the class + methods.""" + n_classes = 5 + X, y = make_classification( + n_samples=100, n_informative=5, n_classes=n_classes, random_state=0 + ) + classifier = SVC().fit(X, y) + y_pred = classifier.predict(X) + + # safe guard for the binary if/else construction + assert constructor_name in ("from_estimator", "from_predictions") + + cm = confusion_matrix(y, y_pred) + common_kwargs = { + "normalize": None, + "include_values": True, + "cmap": "viridis", + "xticks_rotation": 45.0, + } + if constructor_name == "from_estimator": + disp = ConfusionMatrixDisplay.from_estimator(classifier, X, y, **common_kwargs) + else: + disp = ConfusionMatrixDisplay.from_predictions(y, y_pred, **common_kwargs) + + assert_allclose(disp.confusion_matrix, cm) + assert disp.text_.shape == (n_classes, n_classes) + + rotations = [tick.get_rotation() for tick in disp.ax_.get_xticklabels()] + assert_allclose(rotations, 45.0) + + image_data = disp.im_.get_array().data + assert_allclose(image_data, cm) + + disp.plot(cmap="plasma") + assert disp.im_.get_cmap().name == "plasma" + + disp.plot(include_values=False) + assert disp.text_ is None + + disp.plot(xticks_rotation=90.0) + rotations = [tick.get_rotation() for tick in disp.ax_.get_xticklabels()] + assert_allclose(rotations, 90.0) + + disp.plot(values_format="e") + expected_text = np.array([format(v, "e") for v in cm.ravel(order="C")]) + text_text = np.array([t.get_text() for t in disp.text_.ravel(order="C")]) + assert_array_equal(expected_text, text_text) + + +def test_confusion_matrix_contrast(pyplot): + """Check that the text color is appropriate depending on background.""" + + cm = np.eye(2) / 2 + disp = ConfusionMatrixDisplay(cm, display_labels=[0, 1]) + + disp.plot(cmap=pyplot.cm.gray) + # diagonal text is black + assert_allclose(disp.text_[0, 0].get_color(), [0.0, 0.0, 0.0, 1.0]) + assert_allclose(disp.text_[1, 1].get_color(), [0.0, 0.0, 0.0, 1.0]) + + # off-diagonal text is white + assert_allclose(disp.text_[0, 1].get_color(), [1.0, 1.0, 1.0, 1.0]) + assert_allclose(disp.text_[1, 0].get_color(), [1.0, 1.0, 1.0, 1.0]) + + disp.plot(cmap=pyplot.cm.gray_r) + # diagonal text is white + assert_allclose(disp.text_[0, 1].get_color(), [0.0, 0.0, 0.0, 1.0]) + assert_allclose(disp.text_[1, 0].get_color(), [0.0, 0.0, 0.0, 1.0]) + + # off-diagonal text is black + assert_allclose(disp.text_[0, 0].get_color(), [1.0, 1.0, 1.0, 1.0]) + assert_allclose(disp.text_[1, 1].get_color(), [1.0, 1.0, 1.0, 1.0]) + + # Regression test for #15920 + cm = np.array([[19, 34], [32, 58]]) + disp = ConfusionMatrixDisplay(cm, display_labels=[0, 1]) + + disp.plot(cmap=pyplot.cm.Blues) + min_color = pyplot.cm.Blues(0) + max_color = pyplot.cm.Blues(255) + assert_allclose(disp.text_[0, 0].get_color(), max_color) + assert_allclose(disp.text_[0, 1].get_color(), max_color) + assert_allclose(disp.text_[1, 0].get_color(), max_color) + assert_allclose(disp.text_[1, 1].get_color(), min_color) + + +@pytest.mark.parametrize( + "clf", + [ + LogisticRegression(), + make_pipeline(StandardScaler(), LogisticRegression()), + make_pipeline( + make_column_transformer((StandardScaler(), [0, 1])), + LogisticRegression(), + ), + ], + ids=["clf", "pipeline-clf", "pipeline-column_transformer-clf"], +) +def test_confusion_matrix_pipeline(pyplot, clf): + """Check the behaviour of the plotting with more complex 
pipeline.""" + n_classes = 5 + X, y = make_classification( + n_samples=100, n_informative=5, n_classes=n_classes, random_state=0 + ) + with pytest.raises(NotFittedError): + ConfusionMatrixDisplay.from_estimator(clf, X, y) + clf.fit(X, y) + y_pred = clf.predict(X) + + disp = ConfusionMatrixDisplay.from_estimator(clf, X, y) + cm = confusion_matrix(y, y_pred) + + assert_allclose(disp.confusion_matrix, cm) + assert disp.text_.shape == (n_classes, n_classes) + + +@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"]) +def test_confusion_matrix_with_unknown_labels(pyplot, constructor_name): + """Check that when labels=None, the unique values in `y_pred` and `y_true` + will be used. + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/pull/18405 + """ + n_classes = 5 + X, y = make_classification( + n_samples=100, n_informative=5, n_classes=n_classes, random_state=0 + ) + classifier = SVC().fit(X, y) + y_pred = classifier.predict(X) + # create unseen labels in `y_true` not seen during fitting and not present + # in 'classifier.classes_' + y = y + 1 + + # safe guard for the binary if/else construction + assert constructor_name in ("from_estimator", "from_predictions") + + common_kwargs = {"labels": None} + if constructor_name == "from_estimator": + disp = ConfusionMatrixDisplay.from_estimator(classifier, X, y, **common_kwargs) + else: + disp = ConfusionMatrixDisplay.from_predictions(y, y_pred, **common_kwargs) + + display_labels = [tick.get_text() for tick in disp.ax_.get_xticklabels()] + expected_labels = [str(i) for i in range(n_classes + 1)] + assert_array_equal(expected_labels, display_labels) + + +def test_colormap_max(pyplot): + """Check that the max color is used for the color of the text.""" + gray = pyplot.get_cmap("gray", 1024) + confusion_matrix = np.array([[1.0, 0.0], [0.0, 1.0]]) + + disp = ConfusionMatrixDisplay(confusion_matrix) + disp.plot(cmap=gray) + + color = disp.text_[1, 0].get_color() + assert_allclose(color, [1.0, 1.0, 1.0, 1.0]) + + +def test_im_kw_adjust_vmin_vmax(pyplot): + """Check that im_kw passes kwargs to imshow""" + + confusion_matrix = np.array([[0.48, 0.04], [0.08, 0.4]]) + disp = ConfusionMatrixDisplay(confusion_matrix) + disp.plot(im_kw=dict(vmin=0.0, vmax=0.8)) + + clim = disp.im_.get_clim() + assert clim[0] == pytest.approx(0.0) + assert clim[1] == pytest.approx(0.8) + + +def test_confusion_matrix_text_kw(pyplot): + """Check that text_kw is passed to the text call.""" + font_size = 15.0 + X, y = make_classification(random_state=0) + classifier = SVC().fit(X, y) + + # from_estimator passes the font size + disp = ConfusionMatrixDisplay.from_estimator( + classifier, X, y, text_kw={"fontsize": font_size} + ) + for text in disp.text_.reshape(-1): + assert text.get_fontsize() == font_size + + # plot adjusts plot to new font size + new_font_size = 20.0 + disp.plot(text_kw={"fontsize": new_font_size}) + for text in disp.text_.reshape(-1): + assert text.get_fontsize() == new_font_size + + # from_predictions passes the font size + y_pred = classifier.predict(X) + disp = ConfusionMatrixDisplay.from_predictions( + y, y_pred, text_kw={"fontsize": font_size} + ) + for text in disp.text_.reshape(-1): + assert text.get_fontsize() == font_size diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/tests/test_det_curve_display.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/tests/test_det_curve_display.py new file mode 100644 index 
0000000000000000000000000000000000000000..105778c63103040255278dfd4410dab5a2abd792 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/tests/test_det_curve_display.py @@ -0,0 +1,114 @@ +import numpy as np +import pytest +from numpy.testing import assert_allclose + +from sklearn.datasets import load_iris +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import DetCurveDisplay, det_curve + + +@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"]) +@pytest.mark.parametrize("response_method", ["predict_proba", "decision_function"]) +@pytest.mark.parametrize("with_sample_weight", [True, False]) +@pytest.mark.parametrize("drop_intermediate", [True, False]) +@pytest.mark.parametrize("with_strings", [True, False]) +def test_det_curve_display( + pyplot, + constructor_name, + response_method, + with_sample_weight, + drop_intermediate, + with_strings, +): + X, y = load_iris(return_X_y=True) + # Binarize the data with only the two first classes + X, y = X[y < 2], y[y < 2] + + pos_label = None + if with_strings: + y = np.array(["c", "b"])[y] + pos_label = "c" + + if with_sample_weight: + rng = np.random.RandomState(42) + sample_weight = rng.randint(1, 4, size=(X.shape[0])) + else: + sample_weight = None + + lr = LogisticRegression() + lr.fit(X, y) + y_pred = getattr(lr, response_method)(X) + if y_pred.ndim == 2: + y_pred = y_pred[:, 1] + + # safe guard for the binary if/else construction + assert constructor_name in ("from_estimator", "from_predictions") + + common_kwargs = { + "name": lr.__class__.__name__, + "alpha": 0.8, + "sample_weight": sample_weight, + "drop_intermediate": drop_intermediate, + "pos_label": pos_label, + } + if constructor_name == "from_estimator": + disp = DetCurveDisplay.from_estimator(lr, X, y, **common_kwargs) + else: + disp = DetCurveDisplay.from_predictions(y, y_pred, **common_kwargs) + + fpr, fnr, _ = det_curve( + y, + y_pred, + sample_weight=sample_weight, + drop_intermediate=drop_intermediate, + pos_label=pos_label, + ) + + assert_allclose(disp.fpr, fpr, atol=1e-7) + assert_allclose(disp.fnr, fnr, atol=1e-7) + + assert disp.estimator_name == "LogisticRegression" + + # cannot fail thanks to pyplot fixture + import matplotlib as mpl + + assert isinstance(disp.line_, mpl.lines.Line2D) + assert disp.line_.get_alpha() == 0.8 + assert isinstance(disp.ax_, mpl.axes.Axes) + assert isinstance(disp.figure_, mpl.figure.Figure) + assert disp.line_.get_label() == "LogisticRegression" + + expected_pos_label = 1 if pos_label is None else pos_label + expected_ylabel = f"False Negative Rate (Positive label: {expected_pos_label})" + expected_xlabel = f"False Positive Rate (Positive label: {expected_pos_label})" + assert disp.ax_.get_ylabel() == expected_ylabel + assert disp.ax_.get_xlabel() == expected_xlabel + + +@pytest.mark.parametrize( + "constructor_name, expected_clf_name", + [ + ("from_estimator", "LogisticRegression"), + ("from_predictions", "Classifier"), + ], +) +def test_det_curve_display_default_name( + pyplot, + constructor_name, + expected_clf_name, +): + # Check the default name display in the figure when `name` is not provided + X, y = load_iris(return_X_y=True) + # Binarize the data with only the two first classes + X, y = X[y < 2], y[y < 2] + + lr = LogisticRegression().fit(X, y) + y_pred = lr.predict_proba(X)[:, 1] + + if constructor_name == "from_estimator": + disp = DetCurveDisplay.from_estimator(lr, X, y) + else: + disp = DetCurveDisplay.from_predictions(y, y_pred) + + assert 
disp.estimator_name == expected_clf_name + assert disp.line_.get_label() == expected_clf_name diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/tests/test_precision_recall_display.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/tests/test_precision_recall_display.py new file mode 100644 index 0000000000000000000000000000000000000000..022a5fbf28a914e4e27b6679b0d572d5a356ca82 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/tests/test_precision_recall_display.py @@ -0,0 +1,382 @@ +from collections import Counter + +import numpy as np +import pytest +from scipy.integrate import trapezoid + +from sklearn.compose import make_column_transformer +from sklearn.datasets import load_breast_cancer, make_classification +from sklearn.exceptions import NotFittedError +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import ( + PrecisionRecallDisplay, + average_precision_score, + precision_recall_curve, +) +from sklearn.model_selection import train_test_split +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.utils import shuffle + + +@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"]) +@pytest.mark.parametrize("response_method", ["predict_proba", "decision_function"]) +@pytest.mark.parametrize("drop_intermediate", [True, False]) +def test_precision_recall_display_plotting( + pyplot, constructor_name, response_method, drop_intermediate +): + """Check the overall plotting rendering.""" + X, y = make_classification(n_classes=2, n_samples=50, random_state=0) + pos_label = 1 + + classifier = LogisticRegression().fit(X, y) + classifier.fit(X, y) + + y_pred = getattr(classifier, response_method)(X) + y_pred = y_pred if y_pred.ndim == 1 else y_pred[:, pos_label] + + # safe guard for the binary if/else construction + assert constructor_name in ("from_estimator", "from_predictions") + + if constructor_name == "from_estimator": + display = PrecisionRecallDisplay.from_estimator( + classifier, + X, + y, + response_method=response_method, + drop_intermediate=drop_intermediate, + ) + else: + display = PrecisionRecallDisplay.from_predictions( + y, y_pred, pos_label=pos_label, drop_intermediate=drop_intermediate + ) + + precision, recall, _ = precision_recall_curve( + y, y_pred, pos_label=pos_label, drop_intermediate=drop_intermediate + ) + average_precision = average_precision_score(y, y_pred, pos_label=pos_label) + + np.testing.assert_allclose(display.precision, precision) + np.testing.assert_allclose(display.recall, recall) + assert display.average_precision == pytest.approx(average_precision) + + import matplotlib as mpl + + assert isinstance(display.line_, mpl.lines.Line2D) + assert isinstance(display.ax_, mpl.axes.Axes) + assert isinstance(display.figure_, mpl.figure.Figure) + + assert display.ax_.get_xlabel() == "Recall (Positive label: 1)" + assert display.ax_.get_ylabel() == "Precision (Positive label: 1)" + assert display.ax_.get_adjustable() == "box" + assert display.ax_.get_aspect() in ("equal", 1.0) + assert display.ax_.get_xlim() == display.ax_.get_ylim() == (-0.01, 1.01) + + # plotting passing some new parameters + display.plot(alpha=0.8, name="MySpecialEstimator") + expected_label = f"MySpecialEstimator (AP = {average_precision:0.2f})" + assert display.line_.get_label() == expected_label + assert display.line_.get_alpha() == pytest.approx(0.8) + + # Check that the chance level line is not plotted by default + assert 
display.chance_level_ is None + + +@pytest.mark.parametrize("chance_level_kw", [None, {"color": "r"}, {"c": "r"}]) +@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"]) +def test_precision_recall_chance_level_line( + pyplot, + chance_level_kw, + constructor_name, +): + """Check the chance level line plotting behavior.""" + X, y = make_classification(n_classes=2, n_samples=50, random_state=0) + pos_prevalence = Counter(y)[1] / len(y) + + lr = LogisticRegression() + y_pred = lr.fit(X, y).predict_proba(X)[:, 1] + + if constructor_name == "from_estimator": + display = PrecisionRecallDisplay.from_estimator( + lr, + X, + y, + plot_chance_level=True, + chance_level_kw=chance_level_kw, + ) + else: + display = PrecisionRecallDisplay.from_predictions( + y, + y_pred, + plot_chance_level=True, + chance_level_kw=chance_level_kw, + ) + + import matplotlib as mpl + + assert isinstance(display.chance_level_, mpl.lines.Line2D) + assert tuple(display.chance_level_.get_xdata()) == (0, 1) + assert tuple(display.chance_level_.get_ydata()) == (pos_prevalence, pos_prevalence) + + # Checking for chance level line styles + if chance_level_kw is None: + assert display.chance_level_.get_color() == "k" + else: + assert display.chance_level_.get_color() == "r" + + +@pytest.mark.parametrize( + "constructor_name, default_label", + [ + ("from_estimator", "LogisticRegression (AP = {:.2f})"), + ("from_predictions", "Classifier (AP = {:.2f})"), + ], +) +def test_precision_recall_display_name(pyplot, constructor_name, default_label): + """Check the behaviour of the name parameters""" + X, y = make_classification(n_classes=2, n_samples=100, random_state=0) + pos_label = 1 + + classifier = LogisticRegression().fit(X, y) + classifier.fit(X, y) + + y_pred = classifier.predict_proba(X)[:, pos_label] + + # safe guard for the binary if/else construction + assert constructor_name in ("from_estimator", "from_predictions") + + if constructor_name == "from_estimator": + display = PrecisionRecallDisplay.from_estimator(classifier, X, y) + else: + display = PrecisionRecallDisplay.from_predictions( + y, y_pred, pos_label=pos_label + ) + + average_precision = average_precision_score(y, y_pred, pos_label=pos_label) + + # check that the default name is used + assert display.line_.get_label() == default_label.format(average_precision) + + # check that the name can be set + display.plot(name="MySpecialEstimator") + assert ( + display.line_.get_label() + == f"MySpecialEstimator (AP = {average_precision:.2f})" + ) + + +@pytest.mark.parametrize( + "clf", + [ + make_pipeline(StandardScaler(), LogisticRegression()), + make_pipeline( + make_column_transformer((StandardScaler(), [0, 1])), LogisticRegression() + ), + ], +) +def test_precision_recall_display_pipeline(pyplot, clf): + X, y = make_classification(n_classes=2, n_samples=50, random_state=0) + with pytest.raises(NotFittedError): + PrecisionRecallDisplay.from_estimator(clf, X, y) + clf.fit(X, y) + display = PrecisionRecallDisplay.from_estimator(clf, X, y) + assert display.estimator_name == clf.__class__.__name__ + + +def test_precision_recall_display_string_labels(pyplot): + # regression test #15738 + cancer = load_breast_cancer() + X, y = cancer.data, cancer.target_names[cancer.target] + + lr = make_pipeline(StandardScaler(), LogisticRegression()) + lr.fit(X, y) + for klass in cancer.target_names: + assert klass in lr.classes_ + display = PrecisionRecallDisplay.from_estimator(lr, X, y) + + y_pred = lr.predict_proba(X)[:, 1] + avg_prec = 
average_precision_score(y, y_pred, pos_label=lr.classes_[1]) + + assert display.average_precision == pytest.approx(avg_prec) + assert display.estimator_name == lr.__class__.__name__ + + err_msg = r"y_true takes value in {'benign', 'malignant'}" + with pytest.raises(ValueError, match=err_msg): + PrecisionRecallDisplay.from_predictions(y, y_pred) + + display = PrecisionRecallDisplay.from_predictions( + y, y_pred, pos_label=lr.classes_[1] + ) + assert display.average_precision == pytest.approx(avg_prec) + + +@pytest.mark.parametrize( + "average_precision, estimator_name, expected_label", + [ + (0.9, None, "AP = 0.90"), + (None, "my_est", "my_est"), + (0.8, "my_est2", "my_est2 (AP = 0.80)"), + ], +) +def test_default_labels(pyplot, average_precision, estimator_name, expected_label): + """Check the default labels used in the display.""" + precision = np.array([1, 0.5, 0]) + recall = np.array([0, 0.5, 1]) + display = PrecisionRecallDisplay( + precision, + recall, + average_precision=average_precision, + estimator_name=estimator_name, + ) + display.plot() + assert display.line_.get_label() == expected_label + + +@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"]) +@pytest.mark.parametrize("response_method", ["predict_proba", "decision_function"]) +def test_plot_precision_recall_pos_label(pyplot, constructor_name, response_method): + # check that we can provide the positive label and display the proper + # statistics + X, y = load_breast_cancer(return_X_y=True) + # create an highly imbalanced version of the breast cancer dataset + idx_positive = np.flatnonzero(y == 1) + idx_negative = np.flatnonzero(y == 0) + idx_selected = np.hstack([idx_negative, idx_positive[:25]]) + X, y = X[idx_selected], y[idx_selected] + X, y = shuffle(X, y, random_state=42) + # only use 2 features to make the problem even harder + X = X[:, :2] + y = np.array(["cancer" if c == 1 else "not cancer" for c in y], dtype=object) + X_train, X_test, y_train, y_test = train_test_split( + X, + y, + stratify=y, + random_state=0, + ) + + classifier = LogisticRegression() + classifier.fit(X_train, y_train) + + # sanity check to be sure the positive class is classes_[0] and that we + # are betrayed by the class imbalance + assert classifier.classes_.tolist() == ["cancer", "not cancer"] + + y_pred = getattr(classifier, response_method)(X_test) + # we select the corresponding probability columns or reverse the decision + # function otherwise + y_pred_cancer = -1 * y_pred if y_pred.ndim == 1 else y_pred[:, 0] + y_pred_not_cancer = y_pred if y_pred.ndim == 1 else y_pred[:, 1] + + if constructor_name == "from_estimator": + display = PrecisionRecallDisplay.from_estimator( + classifier, + X_test, + y_test, + pos_label="cancer", + response_method=response_method, + ) + else: + display = PrecisionRecallDisplay.from_predictions( + y_test, + y_pred_cancer, + pos_label="cancer", + ) + # we should obtain the statistics of the "cancer" class + avg_prec_limit = 0.65 + assert display.average_precision < avg_prec_limit + assert -trapezoid(display.precision, display.recall) < avg_prec_limit + + # otherwise we should obtain the statistics of the "not cancer" class + if constructor_name == "from_estimator": + display = PrecisionRecallDisplay.from_estimator( + classifier, + X_test, + y_test, + response_method=response_method, + pos_label="not cancer", + ) + else: + display = PrecisionRecallDisplay.from_predictions( + y_test, + y_pred_not_cancer, + pos_label="not cancer", + ) + avg_prec_limit = 0.95 + assert 
display.average_precision > avg_prec_limit + assert -trapezoid(display.precision, display.recall) > avg_prec_limit + + +@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"]) +def test_precision_recall_prevalence_pos_label_reusable(pyplot, constructor_name): + # Check that even if one passes plot_chance_level=False the first time + # one can still call disp.plot with plot_chance_level=True and get the + # chance level line + X, y = make_classification(n_classes=2, n_samples=50, random_state=0) + + lr = LogisticRegression() + y_pred = lr.fit(X, y).predict_proba(X)[:, 1] + + if constructor_name == "from_estimator": + display = PrecisionRecallDisplay.from_estimator( + lr, X, y, plot_chance_level=False + ) + else: + display = PrecisionRecallDisplay.from_predictions( + y, y_pred, plot_chance_level=False + ) + assert display.chance_level_ is None + + import matplotlib as mpl + + # When calling from_estimator or from_predictions, + # prevalence_pos_label should have been set, so that directly + # calling plot_chance_level=True should plot the chance level line + display.plot(plot_chance_level=True) + assert isinstance(display.chance_level_, mpl.lines.Line2D) + + +def test_precision_recall_raise_no_prevalence(pyplot): + # Check that an error is raised when plotting the chance level with + # no prevalence_pos_label provided + precision = np.array([1, 0.5, 0]) + recall = np.array([0, 0.5, 1]) + display = PrecisionRecallDisplay(precision, recall) + + msg = ( + "You must provide prevalence_pos_label when constructing the " + "PrecisionRecallDisplay object in order to plot the chance " + "level line. Alternatively, you may use " + "PrecisionRecallDisplay.from_estimator or " + "PrecisionRecallDisplay.from_predictions " + "to automatically set prevalence_pos_label" + ) + + with pytest.raises(ValueError, match=msg): + display.plot(plot_chance_level=True) + + +@pytest.mark.parametrize("despine", [True, False]) +@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"]) +def test_plot_precision_recall_despine(pyplot, despine, constructor_name): + # Check that the despine keyword is working correctly + X, y = make_classification(n_classes=2, n_samples=50, random_state=0) + + clf = LogisticRegression().fit(X, y) + clf.fit(X, y) + + y_pred = clf.decision_function(X) + + # safe guard for the binary if/else construction + assert constructor_name in ("from_estimator", "from_predictions") + + if constructor_name == "from_estimator": + display = PrecisionRecallDisplay.from_estimator(clf, X, y, despine=despine) + else: + display = PrecisionRecallDisplay.from_predictions(y, y_pred, despine=despine) + + for s in ["top", "right"]: + assert display.ax_.spines[s].get_visible() is not despine + + if despine: + for s in ["bottom", "left"]: + assert display.ax_.spines[s].get_bounds() == (0, 1) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/tests/test_predict_error_display.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/tests/test_predict_error_display.py new file mode 100644 index 0000000000000000000000000000000000000000..b2cb888e8884958f55d665879429f224fc9b787d --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/tests/test_predict_error_display.py @@ -0,0 +1,169 @@ +import pytest +from numpy.testing import assert_allclose + +from sklearn.datasets import load_diabetes +from sklearn.exceptions import NotFittedError +from sklearn.linear_model import Ridge +from sklearn.metrics import PredictionErrorDisplay + +X, y
= load_diabetes(return_X_y=True) + + +@pytest.fixture +def regressor_fitted(): + return Ridge().fit(X, y) + + +@pytest.mark.parametrize( + "regressor, params, err_type, err_msg", + [ + ( + Ridge().fit(X, y), + {"subsample": -1}, + ValueError, + "When an integer, subsample=-1 should be", + ), + ( + Ridge().fit(X, y), + {"subsample": 20.0}, + ValueError, + "When a floating-point, subsample=20.0 should be", + ), + ( + Ridge().fit(X, y), + {"subsample": -20.0}, + ValueError, + "When a floating-point, subsample=-20.0 should be", + ), + ( + Ridge().fit(X, y), + {"kind": "xxx"}, + ValueError, + "`kind` must be one of", + ), + ], +) +@pytest.mark.parametrize("class_method", ["from_estimator", "from_predictions"]) +def test_prediction_error_display_raise_error( + pyplot, class_method, regressor, params, err_type, err_msg +): + """Check that we raise the proper error when validating the parameters.""" + with pytest.raises(err_type, match=err_msg): + if class_method == "from_estimator": + PredictionErrorDisplay.from_estimator(regressor, X, y, **params) + else: + y_pred = regressor.predict(X) + PredictionErrorDisplay.from_predictions(y_true=y, y_pred=y_pred, **params) + + +def test_from_estimator_not_fitted(pyplot): + """Check that we raise a `NotFittedError` when the passed regressor is not + fit.""" + regressor = Ridge() + with pytest.raises(NotFittedError, match="is not fitted yet."): + PredictionErrorDisplay.from_estimator(regressor, X, y) + + +@pytest.mark.parametrize("class_method", ["from_estimator", "from_predictions"]) +@pytest.mark.parametrize("kind", ["actual_vs_predicted", "residual_vs_predicted"]) +def test_prediction_error_display(pyplot, regressor_fitted, class_method, kind): + """Check the default behaviour of the display.""" + if class_method == "from_estimator": + display = PredictionErrorDisplay.from_estimator( + regressor_fitted, X, y, kind=kind + ) + else: + y_pred = regressor_fitted.predict(X) + display = PredictionErrorDisplay.from_predictions( + y_true=y, y_pred=y_pred, kind=kind + ) + + if kind == "actual_vs_predicted": + assert_allclose(display.line_.get_xdata(), display.line_.get_ydata()) + assert display.ax_.get_xlabel() == "Predicted values" + assert display.ax_.get_ylabel() == "Actual values" + assert display.line_ is not None + else: + assert display.ax_.get_xlabel() == "Predicted values" + assert display.ax_.get_ylabel() == "Residuals (actual - predicted)" + assert display.line_ is not None + + assert display.ax_.get_legend() is None + + +@pytest.mark.parametrize("class_method", ["from_estimator", "from_predictions"]) +@pytest.mark.parametrize( + "subsample, expected_size", + [(5, 5), (0.1, int(X.shape[0] * 0.1)), (None, X.shape[0])], +) +def test_plot_prediction_error_subsample( + pyplot, regressor_fitted, class_method, subsample, expected_size +): + """Check the behaviour of `subsample`.""" + if class_method == "from_estimator": + display = PredictionErrorDisplay.from_estimator( + regressor_fitted, X, y, subsample=subsample + ) + else: + y_pred = regressor_fitted.predict(X) + display = PredictionErrorDisplay.from_predictions( + y_true=y, y_pred=y_pred, subsample=subsample + ) + assert len(display.scatter_.get_offsets()) == expected_size + + +@pytest.mark.parametrize("class_method", ["from_estimator", "from_predictions"]) +def test_plot_prediction_error_ax(pyplot, regressor_fitted, class_method): + """Check that we can pass an axis to the display.""" + _, ax = pyplot.subplots() + if class_method == "from_estimator": + display =
PredictionErrorDisplay.from_estimator(regressor_fitted, X, y, ax=ax) + else: + y_pred = regressor_fitted.predict(X) + display = PredictionErrorDisplay.from_predictions( + y_true=y, y_pred=y_pred, ax=ax + ) + assert display.ax_ is ax + + +@pytest.mark.parametrize("class_method", ["from_estimator", "from_predictions"]) +@pytest.mark.parametrize( + "scatter_kwargs", + [None, {"color": "blue", "alpha": 0.9}, {"c": "blue", "alpha": 0.9}], +) +@pytest.mark.parametrize( + "line_kwargs", [None, {"color": "red", "linestyle": "-"}, {"c": "red", "ls": "-"}] +) +def test_prediction_error_custom_artist( + pyplot, regressor_fitted, class_method, scatter_kwargs, line_kwargs +): + """Check that we can tune the style of the line and the scatter.""" + extra_params = { + "kind": "actual_vs_predicted", + "scatter_kwargs": scatter_kwargs, + "line_kwargs": line_kwargs, + } + if class_method == "from_estimator": + display = PredictionErrorDisplay.from_estimator( + regressor_fitted, X, y, **extra_params + ) + else: + y_pred = regressor_fitted.predict(X) + display = PredictionErrorDisplay.from_predictions( + y_true=y, y_pred=y_pred, **extra_params + ) + + if line_kwargs is not None: + assert display.line_.get_linestyle() == "-" + assert display.line_.get_color() == "red" + else: + assert display.line_.get_linestyle() == "--" + assert display.line_.get_color() == "black" + assert display.line_.get_alpha() == 0.7 + + if scatter_kwargs is not None: + assert_allclose(display.scatter_.get_facecolor(), [[0.0, 0.0, 1.0, 0.9]]) + assert_allclose(display.scatter_.get_edgecolor(), [[0.0, 0.0, 1.0, 0.9]]) + else: + assert display.scatter_.get_alpha() == 0.8 diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/tests/test_roc_curve_display.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/tests/test_roc_curve_display.py new file mode 100644 index 0000000000000000000000000000000000000000..23fa2f2e3a5e6a7f0e8b918ec4b75e404887af8b --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/tests/test_roc_curve_display.py @@ -0,0 +1,987 @@ +from collections.abc import Mapping + +import numpy as np +import pytest +from numpy.testing import assert_allclose +from scipy.integrate import trapezoid + +from sklearn import clone +from sklearn.compose import make_column_transformer +from sklearn.datasets import load_breast_cancer, make_classification +from sklearn.exceptions import NotFittedError +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import RocCurveDisplay, auc, roc_curve +from sklearn.model_selection import cross_validate, train_test_split +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.utils import _safe_indexing, shuffle +from sklearn.utils._response import _get_response_values_binary + + +@pytest.fixture(scope="module") +def data_binary(): + X, y = make_classification( + n_samples=200, + n_features=20, + n_informative=5, + n_redundant=2, + flip_y=0.1, + class_sep=0.8, + random_state=42, + ) + return X, y + + +def _check_figure_axes_and_labels(display, pos_label): + """Check mpl axes and figure defaults are correct.""" + import matplotlib as mpl + + assert isinstance(display.ax_, mpl.axes.Axes) + assert isinstance(display.figure_, mpl.figure.Figure) + assert display.ax_.get_adjustable() == "box" + assert display.ax_.get_aspect() in ("equal", 1.0) + assert display.ax_.get_xlim() == display.ax_.get_ylim() == (-0.01, 1.01) + + expected_pos_label = 1 if pos_label is None else pos_label + 
expected_ylabel = f"True Positive Rate (Positive label: {expected_pos_label})" + expected_xlabel = f"False Positive Rate (Positive label: {expected_pos_label})" + + assert display.ax_.get_ylabel() == expected_ylabel + assert display.ax_.get_xlabel() == expected_xlabel + + +@pytest.mark.parametrize("response_method", ["predict_proba", "decision_function"]) +@pytest.mark.parametrize("with_sample_weight", [True, False]) +@pytest.mark.parametrize("drop_intermediate", [True, False]) +@pytest.mark.parametrize("with_strings", [True, False]) +@pytest.mark.parametrize( + "constructor_name, default_name", + [ + ("from_estimator", "LogisticRegression"), + ("from_predictions", "Classifier"), + ], +) +def test_roc_curve_display_plotting( + pyplot, + response_method, + data_binary, + with_sample_weight, + drop_intermediate, + with_strings, + constructor_name, + default_name, +): + """Check the overall plotting behaviour for single curve.""" + X, y = data_binary + + pos_label = None + if with_strings: + y = np.array(["c", "b"])[y] + pos_label = "c" + + if with_sample_weight: + rng = np.random.RandomState(42) + sample_weight = rng.randint(1, 4, size=(X.shape[0])) + else: + sample_weight = None + + lr = LogisticRegression() + lr.fit(X, y) + + y_score = getattr(lr, response_method)(X) + y_score = y_score if y_score.ndim == 1 else y_score[:, 1] + + if constructor_name == "from_estimator": + display = RocCurveDisplay.from_estimator( + lr, + X, + y, + sample_weight=sample_weight, + drop_intermediate=drop_intermediate, + pos_label=pos_label, + curve_kwargs={"alpha": 0.8}, + ) + else: + display = RocCurveDisplay.from_predictions( + y, + y_score, + sample_weight=sample_weight, + drop_intermediate=drop_intermediate, + pos_label=pos_label, + curve_kwargs={"alpha": 0.8}, + ) + + fpr, tpr, _ = roc_curve( + y, + y_score, + sample_weight=sample_weight, + drop_intermediate=drop_intermediate, + pos_label=pos_label, + ) + + assert_allclose(display.roc_auc, auc(fpr, tpr)) + assert_allclose(display.fpr, fpr) + assert_allclose(display.tpr, tpr) + + assert display.name == default_name + + import matplotlib as mpl + + _check_figure_axes_and_labels(display, pos_label) + assert isinstance(display.line_, mpl.lines.Line2D) + assert display.line_.get_alpha() == 0.8 + + expected_label = f"{default_name} (AUC = {display.roc_auc:.2f})" + assert display.line_.get_label() == expected_label + + +@pytest.mark.parametrize( + "params, err_msg", + [ + ( + { + "fpr": [np.array([0, 0.5, 1]), np.array([0, 0.5, 1])], + "tpr": [np.array([0, 0.5, 1])], + "roc_auc": None, + "name": None, + }, + "self.fpr and self.tpr from `RocCurveDisplay` initialization,", + ), + ( + { + "fpr": [np.array([0, 0.5, 1])], + "tpr": [np.array([0, 0.5, 1]), np.array([0, 0.5, 1])], + "roc_auc": [0.8, 0.9], + "name": None, + }, + "self.fpr, self.tpr and self.roc_auc from `RocCurveDisplay`", + ), + ( + { + "fpr": [np.array([0, 0.5, 1]), np.array([0, 0.5, 1])], + "tpr": [np.array([0, 0.5, 1]), np.array([0, 0.5, 1])], + "roc_auc": [0.8], + "name": None, + }, + "Got: self.fpr: 2, self.tpr: 2, self.roc_auc: 1", + ), + ( + { + "fpr": [np.array([0, 0.5, 1]), np.array([0, 0.5, 1])], + "tpr": [np.array([0, 0.5, 1]), np.array([0, 0.5, 1])], + "roc_auc": [0.8, 0.9], + "name": ["curve1", "curve2", "curve3"], + }, + r"self.fpr, self.tpr, self.roc_auc and 'name' \(or self.name\)", + ), + ( + { + "fpr": [np.array([0, 0.5, 1]), np.array([0, 0.5, 1])], + "tpr": [np.array([0, 0.5, 1]), np.array([0, 0.5, 1])], + "roc_auc": [0.8, 0.9], + # List of length 1 is always allowed + "name": 
["curve1"], + }, + None, + ), + ], +) +def test_roc_curve_plot_parameter_length_validation(pyplot, params, err_msg): + """Check `plot` parameter length validation performed correctly.""" + display = RocCurveDisplay(**params) + if err_msg: + with pytest.raises(ValueError, match=err_msg): + display.plot() + else: + # No error should be raised + display.plot() + + +def test_validate_plot_params(pyplot): + """Check `_validate_plot_params` returns the correct variables.""" + fpr = np.array([0, 0.5, 1]) + tpr = [np.array([0, 0.5, 1])] + roc_auc = None + name = "test_curve" + + # Initialize display with test inputs + display = RocCurveDisplay( + fpr=fpr, + tpr=tpr, + roc_auc=roc_auc, + name=name, + pos_label=None, + ) + fpr_out, tpr_out, roc_auc_out, name_out = display._validate_plot_params( + ax=None, name=None + ) + + assert isinstance(fpr_out, list) + assert isinstance(tpr_out, list) + assert len(fpr_out) == 1 + assert len(tpr_out) == 1 + assert roc_auc_out is None + assert name_out == ["test_curve"] + + +def test_roc_curve_from_cv_results_param_validation(pyplot, data_binary): + """Check parameter validation is correct.""" + X, y = data_binary + + # `cv_results` missing key + cv_results_no_est = cross_validate( + LogisticRegression(), X, y, cv=3, return_estimator=True, return_indices=False + ) + cv_results_no_indices = cross_validate( + LogisticRegression(), X, y, cv=3, return_estimator=True, return_indices=False + ) + for cv_results in (cv_results_no_est, cv_results_no_indices): + with pytest.raises( + ValueError, + match="`cv_results` does not contain one of the following required", + ): + RocCurveDisplay.from_cv_results(cv_results, X, y) + + cv_results = cross_validate( + LogisticRegression(), X, y, cv=3, return_estimator=True, return_indices=True + ) + + # `X` wrong length + with pytest.raises(ValueError, match="`X` does not contain the correct"): + RocCurveDisplay.from_cv_results(cv_results, X[:10, :], y) + + # `y` not binary + y_multi = y.copy() + y_multi[0] = 2 + with pytest.raises(ValueError, match="The target `y` is not binary."): + RocCurveDisplay.from_cv_results(cv_results, X, y_multi) + + # input inconsistent length + with pytest.raises(ValueError, match="Found input variables with inconsistent"): + RocCurveDisplay.from_cv_results(cv_results, X, y[:10]) + with pytest.raises(ValueError, match="Found input variables with inconsistent"): + RocCurveDisplay.from_cv_results(cv_results, X, y, sample_weight=[1, 2]) + + # `pos_label` inconsistency + y_multi[y_multi == 1] = 2 + with pytest.raises(ValueError, match=r"y takes value in \{0, 2\}"): + RocCurveDisplay.from_cv_results(cv_results, X, y_multi) + + # `name` is list while `curve_kwargs` is None or dict + for curve_kwargs in (None, {"alpha": 0.2}): + with pytest.raises(ValueError, match="To avoid labeling individual curves"): + RocCurveDisplay.from_cv_results( + cv_results, + X, + y, + name=["one", "two", "three"], + curve_kwargs=curve_kwargs, + ) + + # `curve_kwargs` incorrect length + with pytest.raises(ValueError, match="`curve_kwargs` must be None, a dictionary"): + RocCurveDisplay.from_cv_results(cv_results, X, y, curve_kwargs=[{"alpha": 1}]) + + # `curve_kwargs` both alias provided + with pytest.raises(TypeError, match="Got both c and"): + RocCurveDisplay.from_cv_results( + cv_results, X, y, curve_kwargs={"c": "blue", "color": "red"} + ) + + +@pytest.mark.parametrize( + "curve_kwargs", + [None, {"alpha": 0.2}, [{"alpha": 0.2}, {"alpha": 0.3}, {"alpha": 0.4}]], +) +def test_roc_curve_display_from_cv_results_curve_kwargs( + pyplot, 
data_binary, curve_kwargs +): + """Check `curve_kwargs` correctly passed.""" + X, y = data_binary + n_cv = 3 + cv_results = cross_validate( + LogisticRegression(), X, y, cv=n_cv, return_estimator=True, return_indices=True + ) + display = RocCurveDisplay.from_cv_results( + cv_results, + X, + y, + curve_kwargs=curve_kwargs, + ) + if curve_kwargs is None: + # Default `alpha` used + assert all(line.get_alpha() == 0.5 for line in display.line_) + elif isinstance(curve_kwargs, Mapping): + # `alpha` from dict used for all curves + assert all(line.get_alpha() == 0.2 for line in display.line_) + else: + # Different `alpha` used for each curve + assert all( + line.get_alpha() == curve_kwargs[i]["alpha"] + for i, line in enumerate(display.line_) + ) + + +# TODO(1.9): Remove in 1.9 +def test_roc_curve_display_estimator_name_deprecation(pyplot): + """Check deprecation of `estimator_name`.""" + fpr = np.array([0, 0.5, 1]) + tpr = np.array([0, 0.5, 1]) + with pytest.warns(FutureWarning, match="`estimator_name` is deprecated in"): + RocCurveDisplay(fpr=fpr, tpr=tpr, estimator_name="test") + + +# TODO(1.9): Remove in 1.9 +@pytest.mark.parametrize( + "constructor_name", ["from_estimator", "from_predictions", "plot"] +) +def test_roc_curve_display_kwargs_deprecation(pyplot, data_binary, constructor_name): + """Check **kwargs deprecated correctly in favour of `curve_kwargs`.""" + X, y = data_binary + lr = LogisticRegression() + lr.fit(X, y) + fpr = np.array([0, 0.5, 1]) + tpr = np.array([0, 0.5, 1]) + + # Error when both `curve_kwargs` and `**kwargs` provided + with pytest.raises(ValueError, match="Cannot provide both `curve_kwargs`"): + if constructor_name == "from_estimator": + RocCurveDisplay.from_estimator( + lr, X, y, curve_kwargs={"alpha": 1}, label="test" + ) + elif constructor_name == "from_predictions": + RocCurveDisplay.from_predictions( + y, y, curve_kwargs={"alpha": 1}, label="test" + ) + else: + RocCurveDisplay(fpr=fpr, tpr=tpr).plot( + curve_kwargs={"alpha": 1}, label="test" + ) + + # Warning when `**kwargs`` provided + with pytest.warns(FutureWarning, match=r"`\*\*kwargs` is deprecated and will be"): + if constructor_name == "from_estimator": + RocCurveDisplay.from_estimator(lr, X, y, label="test") + elif constructor_name == "from_predictions": + RocCurveDisplay.from_predictions(y, y, label="test") + else: + RocCurveDisplay(fpr=fpr, tpr=tpr).plot(label="test") + + +@pytest.mark.parametrize( + "curve_kwargs", + [ + None, + {"color": "blue"}, + [{"color": "blue"}, {"color": "green"}, {"color": "red"}], + ], +) +@pytest.mark.parametrize("drop_intermediate", [True, False]) +@pytest.mark.parametrize("response_method", ["predict_proba", "decision_function"]) +@pytest.mark.parametrize("with_sample_weight", [True, False]) +@pytest.mark.parametrize("with_strings", [True, False]) +def test_roc_curve_display_plotting_from_cv_results( + pyplot, + data_binary, + with_strings, + with_sample_weight, + response_method, + drop_intermediate, + curve_kwargs, +): + """Check overall plotting of `from_cv_results`.""" + X, y = data_binary + + pos_label = None + if with_strings: + y = np.array(["c", "b"])[y] + pos_label = "c" + + if with_sample_weight: + rng = np.random.RandomState(42) + sample_weight = rng.randint(1, 4, size=(X.shape[0])) + else: + sample_weight = None + + cv_results = cross_validate( + LogisticRegression(), X, y, cv=3, return_estimator=True, return_indices=True + ) + display = RocCurveDisplay.from_cv_results( + cv_results, + X, + y, + sample_weight=sample_weight, + 
drop_intermediate=drop_intermediate, + response_method=response_method, + pos_label=pos_label, + curve_kwargs=curve_kwargs, + ) + + for idx, (estimator, test_indices) in enumerate( + zip(cv_results["estimator"], cv_results["indices"]["test"]) + ): + y_true = _safe_indexing(y, test_indices) + y_pred = _get_response_values_binary( + estimator, + _safe_indexing(X, test_indices), + response_method=response_method, + pos_label=pos_label, + )[0] + sample_weight_fold = ( + None + if sample_weight is None + else _safe_indexing(sample_weight, test_indices) + ) + fpr, tpr, _ = roc_curve( + y_true, + y_pred, + sample_weight=sample_weight_fold, + drop_intermediate=drop_intermediate, + pos_label=pos_label, + ) + assert_allclose(display.roc_auc[idx], auc(fpr, tpr)) + assert_allclose(display.fpr[idx], fpr) + assert_allclose(display.tpr[idx], tpr) + + assert display.name is None + + import matplotlib as mpl + + _check_figure_axes_and_labels(display, pos_label) + if with_sample_weight: + aggregate_expected_labels = ["AUC = 0.64 +/- 0.04", "_child1", "_child2"] + else: + aggregate_expected_labels = ["AUC = 0.61 +/- 0.05", "_child1", "_child2"] + for idx, line in enumerate(display.line_): + assert isinstance(line, mpl.lines.Line2D) + # Default alpha for `from_cv_results` + assert line.get_alpha() == 0.5 + if isinstance(curve_kwargs, list): + # Each individual curve labelled + assert line.get_label() == f"AUC = {display.roc_auc[idx]:.2f}" + else: + # Single aggregate label + assert line.get_label() == aggregate_expected_labels[idx] + + +@pytest.mark.parametrize("roc_auc", [[1.0, 1.0, 1.0], None]) +@pytest.mark.parametrize( + "curve_kwargs", + [None, {"color": "red"}, [{"c": "red"}, {"c": "green"}, {"c": "yellow"}]], +) +@pytest.mark.parametrize("name", [None, "single", ["one", "two", "three"]]) +def test_roc_curve_plot_legend_label(pyplot, data_binary, name, curve_kwargs, roc_auc): + """Check legend label correct with all `curve_kwargs`, `name` combinations.""" + fpr = [np.array([0, 0.5, 1]), np.array([0, 0.5, 1]), np.array([0, 0.5, 1])] + tpr = [np.array([0, 0.5, 1]), np.array([0, 0.5, 1]), np.array([0, 0.5, 1])] + if not isinstance(curve_kwargs, list) and isinstance(name, list): + with pytest.raises(ValueError, match="To avoid labeling individual curves"): + RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc).plot( + name=name, curve_kwargs=curve_kwargs + ) + + else: + display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc).plot( + name=name, curve_kwargs=curve_kwargs + ) + legend = display.ax_.get_legend() + if legend is None: + # No legend is created, exit test early + assert name is None + assert roc_auc is None + return + else: + legend_labels = [text.get_text() for text in legend.get_texts()] + + if isinstance(curve_kwargs, list): + # Multiple labels in legend + assert len(legend_labels) == 3 + for idx, label in enumerate(legend_labels): + if name is None: + expected_label = "AUC = 1.00" if roc_auc else None + assert label == expected_label + elif isinstance(name, str): + expected_label = "single (AUC = 1.00)" if roc_auc else "single" + assert label == expected_label + else: + # `name` is a list of different strings + expected_label = ( + f"{name[idx]} (AUC = 1.00)" if roc_auc else f"{name[idx]}" + ) + assert label == expected_label + else: + # Single label in legend + assert len(legend_labels) == 1 + if name is None: + expected_label = "AUC = 1.00 +/- 0.00" if roc_auc else None + assert legend_labels[0] == expected_label + else: + # name is single string + expected_label = "single (AUC = 1.00 +/- 0.00)"
if roc_auc else "single" + assert legend_labels[0] == expected_label + + +@pytest.mark.parametrize( + "curve_kwargs", + [None, {"color": "red"}, [{"c": "red"}, {"c": "green"}, {"c": "yellow"}]], +) +@pytest.mark.parametrize("name", [None, "single", ["one", "two", "three"]]) +def test_roc_curve_from_cv_results_legend_label( + pyplot, data_binary, name, curve_kwargs +): + """Check legend label correct with all `curve_kwargs`, `name` combinations.""" + X, y = data_binary + n_cv = 3 + cv_results = cross_validate( + LogisticRegression(), X, y, cv=n_cv, return_estimator=True, return_indices=True + ) + + if not isinstance(curve_kwargs, list) and isinstance(name, list): + with pytest.raises(ValueError, match="To avoid labeling individual curves"): + RocCurveDisplay.from_cv_results( + cv_results, X, y, name=name, curve_kwargs=curve_kwargs + ) + else: + display = RocCurveDisplay.from_cv_results( + cv_results, X, y, name=name, curve_kwargs=curve_kwargs + ) + + legend = display.ax_.get_legend() + legend_labels = [text.get_text() for text in legend.get_texts()] + if isinstance(curve_kwargs, list): + # Multiple labels in legend + assert len(legend_labels) == 3 + auc = ["0.62", "0.66", "0.55"] + for idx, label in enumerate(legend_labels): + if name is None: + assert label == f"AUC = {auc[idx]}" + elif isinstance(name, str): + assert label == f"single (AUC = {auc[idx]})" + else: + # `name` is a list of different strings + assert label == f"{name[idx]} (AUC = {auc[idx]})" + else: + # Single label in legend + assert len(legend_labels) == 1 + if name is None: + assert legend_labels[0] == "AUC = 0.61 +/- 0.05" + else: + # name is single string + assert legend_labels[0] == "single (AUC = 0.61 +/- 0.05)" + + +@pytest.mark.parametrize( + "curve_kwargs", + [None, {"color": "red"}, [{"c": "red"}, {"c": "green"}, {"c": "yellow"}]], +) +def test_roc_curve_from_cv_results_curve_kwargs(pyplot, data_binary, curve_kwargs): + """Check line kwargs passed correctly in `from_cv_results`.""" + + X, y = data_binary + cv_results = cross_validate( + LogisticRegression(), X, y, cv=3, return_estimator=True, return_indices=True + ) + display = RocCurveDisplay.from_cv_results( + cv_results, X, y, curve_kwargs=curve_kwargs + ) + + for idx, line in enumerate(display.line_): + color = line.get_color() + if curve_kwargs is None: + # Default color + assert color == "blue" + elif isinstance(curve_kwargs, Mapping): + # All curves "red" + assert color == "red" + else: + assert color == curve_kwargs[idx]["c"] + + +def _check_chance_level(plot_chance_level, chance_level_kw, display): + """Check chance level line and line styles correct.""" + import matplotlib as mpl + + if plot_chance_level: + assert isinstance(display.chance_level_, mpl.lines.Line2D) + assert tuple(display.chance_level_.get_xdata()) == (0, 1) + assert tuple(display.chance_level_.get_ydata()) == (0, 1) + else: + assert display.chance_level_ is None + + # Checking for chance level line styles + if plot_chance_level and chance_level_kw is None: + assert display.chance_level_.get_color() == "k" + assert display.chance_level_.get_linestyle() == "--" + assert display.chance_level_.get_label() == "Chance level (AUC = 0.5)" + elif plot_chance_level: + if "c" in chance_level_kw: + assert display.chance_level_.get_color() == chance_level_kw["c"] + else: + assert display.chance_level_.get_color() == chance_level_kw["color"] + if "lw" in chance_level_kw: + assert display.chance_level_.get_linewidth() == chance_level_kw["lw"] + else: + assert display.chance_level_.get_linewidth() == 
chance_level_kw["linewidth"] + if "ls" in chance_level_kw: + assert display.chance_level_.get_linestyle() == chance_level_kw["ls"] + else: + assert display.chance_level_.get_linestyle() == chance_level_kw["linestyle"] + + +@pytest.mark.parametrize("plot_chance_level", [True, False]) +@pytest.mark.parametrize("label", [None, "Test Label"]) +@pytest.mark.parametrize( + "chance_level_kw", + [ + None, + {"linewidth": 1, "color": "red", "linestyle": "-", "label": "DummyEstimator"}, + {"lw": 1, "c": "red", "ls": "-", "label": "DummyEstimator"}, + {"lw": 1, "color": "blue", "ls": "-", "label": None}, + ], +) +@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"]) +def test_roc_curve_chance_level_line( + pyplot, + data_binary, + plot_chance_level, + chance_level_kw, + label, + constructor_name, +): + """Check chance level plotting behavior of `from_predictions`, `from_estimator`.""" + X, y = data_binary + + lr = LogisticRegression() + lr.fit(X, y) + + y_score = getattr(lr, "predict_proba")(X) + y_score = y_score if y_score.ndim == 1 else y_score[:, 1] + + if constructor_name == "from_estimator": + display = RocCurveDisplay.from_estimator( + lr, + X, + y, + curve_kwargs={"alpha": 0.8, "label": label}, + plot_chance_level=plot_chance_level, + chance_level_kw=chance_level_kw, + ) + else: + display = RocCurveDisplay.from_predictions( + y, + y_score, + curve_kwargs={"alpha": 0.8, "label": label}, + plot_chance_level=plot_chance_level, + chance_level_kw=chance_level_kw, + ) + + import matplotlib as mpl + + assert isinstance(display.line_, mpl.lines.Line2D) + assert display.line_.get_alpha() == 0.8 + assert isinstance(display.ax_, mpl.axes.Axes) + assert isinstance(display.figure_, mpl.figure.Figure) + + _check_chance_level(plot_chance_level, chance_level_kw, display) + + # Checking for legend behaviour + if plot_chance_level and chance_level_kw is not None: + if label is not None or chance_level_kw.get("label") is not None: + legend = display.ax_.get_legend() + assert legend is not None # Legend should be present if any label is set + legend_labels = [text.get_text() for text in legend.get_texts()] + if label is not None: + assert label in legend_labels + if chance_level_kw.get("label") is not None: + assert chance_level_kw["label"] in legend_labels + else: + assert display.ax_.get_legend() is None + + +@pytest.mark.parametrize("plot_chance_level", [True, False]) +@pytest.mark.parametrize( + "chance_level_kw", + [ + None, + {"linewidth": 1, "color": "red", "linestyle": "-", "label": "DummyEstimator"}, + {"lw": 1, "c": "red", "ls": "-", "label": "DummyEstimator"}, + {"lw": 1, "color": "blue", "ls": "-", "label": None}, + ], +) +@pytest.mark.parametrize("curve_kwargs", [None, {"alpha": 0.8}]) +def test_roc_curve_chance_level_line_from_cv_results( + pyplot, + data_binary, + plot_chance_level, + chance_level_kw, + curve_kwargs, +): + """Check chance level plotting behavior with `from_cv_results`.""" + X, y = data_binary + n_cv = 3 + cv_results = cross_validate( + LogisticRegression(), X, y, cv=n_cv, return_estimator=True, return_indices=True + ) + + display = RocCurveDisplay.from_cv_results( + cv_results, + X, + y, + plot_chance_level=plot_chance_level, + chance_level_kwargs=chance_level_kw, + curve_kwargs=curve_kwargs, + ) + + import matplotlib as mpl + + assert all(isinstance(line, mpl.lines.Line2D) for line in display.line_) + # Ensure both curve line kwargs passed correctly as well + if curve_kwargs: + assert all(line.get_alpha() == 0.8 for line in display.line_) + assert 
isinstance(display.ax_, mpl.axes.Axes) + assert isinstance(display.figure_, mpl.figure.Figure) + + _check_chance_level(plot_chance_level, chance_level_kw, display) + + legend = display.ax_.get_legend() + # There is always a legend, to indicate each 'Fold' curve + assert legend is not None + legend_labels = [text.get_text() for text in legend.get_texts()] + if plot_chance_level and chance_level_kw is not None: + if chance_level_kw.get("label") is not None: + assert chance_level_kw["label"] in legend_labels + else: + assert len(legend_labels) == 1 + + +@pytest.mark.parametrize( + "clf", + [ + LogisticRegression(), + make_pipeline(StandardScaler(), LogisticRegression()), + make_pipeline( + make_column_transformer((StandardScaler(), [0, 1])), LogisticRegression() + ), + ], +) +@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"]) +def test_roc_curve_display_complex_pipeline(pyplot, data_binary, clf, constructor_name): + """Check the behaviour with complex pipeline.""" + X, y = data_binary + + clf = clone(clf) + + if constructor_name == "from_estimator": + with pytest.raises(NotFittedError): + RocCurveDisplay.from_estimator(clf, X, y) + + clf.fit(X, y) + + if constructor_name == "from_estimator": + display = RocCurveDisplay.from_estimator(clf, X, y) + name = clf.__class__.__name__ + else: + display = RocCurveDisplay.from_predictions(y, y) + name = "Classifier" + + assert name in display.line_.get_label() + assert display.name == name + + +@pytest.mark.parametrize( + "roc_auc, name, curve_kwargs, expected_labels", + [ + ([0.9, 0.8], None, None, ["AUC = 0.85 +/- 0.05", "_child1"]), + ([0.9, 0.8], "Est name", None, ["Est name (AUC = 0.85 +/- 0.05)", "_child1"]), + ( + [0.8, 0.7], + ["fold1", "fold2"], + [{"c": "blue"}, {"c": "red"}], + ["fold1 (AUC = 0.80)", "fold2 (AUC = 0.70)"], + ), + (None, ["fold1", "fold2"], [{"c": "blue"}, {"c": "red"}], ["fold1", "fold2"]), + ], +) +def test_roc_curve_display_default_labels( + pyplot, roc_auc, name, curve_kwargs, expected_labels +): + """Check the default labels used in the display.""" + fpr = [np.array([0, 0.5, 1]), np.array([0, 0.3, 1])] + tpr = [np.array([0, 0.5, 1]), np.array([0, 0.3, 1])] + disp = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, name=name).plot( + curve_kwargs=curve_kwargs + ) + for idx, expected_label in enumerate(expected_labels): + assert disp.line_[idx].get_label() == expected_label + + +def _check_auc(display, constructor_name): + roc_auc_limit = 0.95679 + roc_auc_limit_multi = [0.97007, 0.985915, 0.980952] + + if constructor_name == "from_cv_results": + for idx, roc_auc in enumerate(display.roc_auc): + assert roc_auc == pytest.approx(roc_auc_limit_multi[idx]) + else: + assert display.roc_auc == pytest.approx(roc_auc_limit) + assert trapezoid(display.tpr, display.fpr) == pytest.approx(roc_auc_limit) + + +@pytest.mark.parametrize("response_method", ["predict_proba", "decision_function"]) +@pytest.mark.parametrize( + "constructor_name", ["from_estimator", "from_predictions", "from_cv_results"] +) +def test_plot_roc_curve_pos_label(pyplot, response_method, constructor_name): + # check that we can provide the positive label and display the proper + # statistics + X, y = load_breast_cancer(return_X_y=True) + # create an highly imbalanced + idx_positive = np.flatnonzero(y == 1) + idx_negative = np.flatnonzero(y == 0) + idx_selected = np.hstack([idx_negative, idx_positive[:25]]) + X, y = X[idx_selected], y[idx_selected] + X, y = shuffle(X, y, random_state=42) + # only use 2 features to make the problem 
even harder + X = X[:, :2] + y = np.array(["cancer" if c == 1 else "not cancer" for c in y], dtype=object) + X_train, X_test, y_train, y_test = train_test_split( + X, + y, + stratify=y, + random_state=0, + ) + + classifier = LogisticRegression() + classifier.fit(X_train, y_train) + cv_results = cross_validate( + LogisticRegression(), X, y, cv=3, return_estimator=True, return_indices=True + ) + + # Sanity check to be sure the positive class is `classes_[0]` + # Class imbalance ensures a large difference in prediction values between classes, + # allowing us to catch errors when we switch `pos_label` + assert classifier.classes_.tolist() == ["cancer", "not cancer"] + + y_score = getattr(classifier, response_method)(X_test) + # we select the corresponding probability columns or reverse the decision + # function otherwise + y_score_cancer = -1 * y_score if y_score.ndim == 1 else y_score[:, 0] + y_score_not_cancer = y_score if y_score.ndim == 1 else y_score[:, 1] + + pos_label = "cancer" + y_score = y_score_cancer + if constructor_name == "from_estimator": + display = RocCurveDisplay.from_estimator( + classifier, + X_test, + y_test, + pos_label=pos_label, + response_method=response_method, + ) + elif constructor_name == "from_predictions": + display = RocCurveDisplay.from_predictions( + y_test, + y_score, + pos_label=pos_label, + ) + else: + display = RocCurveDisplay.from_cv_results( + cv_results, + X, + y, + response_method=response_method, + pos_label=pos_label, + ) + + _check_auc(display, constructor_name) + + pos_label = "not cancer" + y_score = y_score_not_cancer + if constructor_name == "from_estimator": + display = RocCurveDisplay.from_estimator( + classifier, + X_test, + y_test, + response_method=response_method, + pos_label=pos_label, + ) + elif constructor_name == "from_predictions": + display = RocCurveDisplay.from_predictions( + y_test, + y_score, + pos_label=pos_label, + ) + else: + display = RocCurveDisplay.from_cv_results( + cv_results, + X, + y, + response_method=response_method, + pos_label=pos_label, + ) + + _check_auc(display, constructor_name) + + +# TODO(1.9): remove +def test_y_score_and_y_pred_specified_error(): + """Check that an error is raised when both y_score and y_pred are specified.""" + y_true = np.array([0, 1, 1, 0]) + y_score = np.array([0.1, 0.4, 0.35, 0.8]) + y_pred = np.array([0.2, 0.3, 0.5, 0.1]) + + with pytest.raises( + ValueError, match="`y_pred` and `y_score` cannot be both specified" + ): + RocCurveDisplay.from_predictions(y_true, y_score=y_score, y_pred=y_pred) + + +# TODO(1.9): remove +def test_y_pred_deprecation_warning(pyplot): + """Check that a warning is raised when y_pred is specified.""" + y_true = np.array([0, 1, 1, 0]) + y_score = np.array([0.1, 0.4, 0.35, 0.8]) + + with pytest.warns(FutureWarning, match="y_pred is deprecated in 1.7"): + display_y_pred = RocCurveDisplay.from_predictions(y_true, y_pred=y_score) + + assert_allclose(display_y_pred.fpr, [0, 0.5, 0.5, 1]) + assert_allclose(display_y_pred.tpr, [0, 0, 1, 1]) + + display_y_score = RocCurveDisplay.from_predictions(y_true, y_score) + assert_allclose(display_y_score.fpr, [0, 0.5, 0.5, 1]) + assert_allclose(display_y_score.tpr, [0, 0, 1, 1]) + + +@pytest.mark.parametrize("despine", [True, False]) +@pytest.mark.parametrize( + "constructor_name", ["from_estimator", "from_predictions", "from_cv_results"] +) +def test_plot_roc_curve_despine(pyplot, data_binary, despine, constructor_name): + # Check that the despine keyword is working correctly + X, y = data_binary + + lr = 
LogisticRegression().fit(X, y) + lr.fit(X, y) + cv_results = cross_validate( + LogisticRegression(), X, y, cv=3, return_estimator=True, return_indices=True + ) + + y_pred = lr.decision_function(X) + + # safe guard for the if/else construction + assert constructor_name in ("from_estimator", "from_predictions", "from_cv_results") + + if constructor_name == "from_estimator": + display = RocCurveDisplay.from_estimator(lr, X, y, despine=despine) + elif constructor_name == "from_predictions": + display = RocCurveDisplay.from_predictions(y, y_pred, despine=despine) + else: + display = RocCurveDisplay.from_cv_results(cv_results, X, y, despine=despine) + + for s in ["top", "right"]: + assert display.ax_.spines[s].get_visible() is not despine + + if despine: + for s in ["bottom", "left"]: + assert display.ax_.spines[s].get_bounds() == (0, 1) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_ranking.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/_ranking.py new file mode 100644 index 0000000000000000000000000000000000000000..2d0e5211c236c703676923a65bfe5df75affef96 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_ranking.py @@ -0,0 +1,2077 @@ +"""Metrics to assess performance on classification task given scores. + +Functions named as ``*_score`` return a scalar value to maximize: the higher +the better. + +Function named as ``*_error`` or ``*_loss`` return a scalar value to minimize: +the lower the better. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from functools import partial +from numbers import Integral, Real + +import numpy as np +from scipy.integrate import trapezoid +from scipy.sparse import csr_matrix, issparse +from scipy.stats import rankdata + +from ..exceptions import UndefinedMetricWarning +from ..preprocessing import label_binarize +from ..utils import ( + assert_all_finite, + check_array, + check_consistent_length, + column_or_1d, +) +from ..utils._encode import _encode, _unique +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.extmath import stable_cumsum +from ..utils.multiclass import type_of_target +from ..utils.sparsefuncs import count_nonzero +from ..utils.validation import _check_pos_label_consistency, _check_sample_weight +from ._base import _average_binary_score, _average_multiclass_ovo_score + + +@validate_params( + {"x": ["array-like"], "y": ["array-like"]}, + prefer_skip_nested_validation=True, +) +def auc(x, y): + """Compute Area Under the Curve (AUC) using the trapezoidal rule. + + This is a general function, given points on a curve. For computing the + area under the ROC-curve, see :func:`roc_auc_score`. For an alternative + way to summarize a precision-recall curve, see + :func:`average_precision_score`. + + Parameters + ---------- + x : array-like of shape (n,) + X coordinates. These must be either monotonic increasing or monotonic + decreasing. + y : array-like of shape (n,) + Y coordinates. + + Returns + ------- + auc : float + Area Under the Curve. + + See Also + -------- + roc_auc_score : Compute the area under the ROC curve. + average_precision_score : Compute average precision from prediction scores. + precision_recall_curve : Compute precision-recall pairs for different + probability thresholds. 
+ + Examples + -------- + >>> import numpy as np + >>> from sklearn import metrics + >>> y_true = np.array([1, 1, 2, 2]) + >>> y_score = np.array([0.1, 0.4, 0.35, 0.8]) + >>> fpr, tpr, thresholds = metrics.roc_curve(y_true, y_score, pos_label=2) + >>> metrics.auc(fpr, tpr) + 0.75 + """ + check_consistent_length(x, y) + x = column_or_1d(x) + y = column_or_1d(y) + + if x.shape[0] < 2: + raise ValueError( + "At least 2 points are needed to compute area under curve, but x.shape = %s" + % x.shape + ) + + direction = 1 + dx = np.diff(x) + if np.any(dx < 0): + if np.all(dx <= 0): + direction = -1 + else: + raise ValueError("x is neither increasing nor decreasing : {}.".format(x)) + + area = direction * trapezoid(y, x) + if isinstance(area, np.memmap): + # Reductions such as .sum used internally in trapezoid do not return a + # scalar by default for numpy.memmap instances contrary to + # regular numpy.ndarray instances. + area = area.dtype.type(area) + return float(area) + + +@validate_params( + { + "y_true": ["array-like"], + "y_score": ["array-like"], + "average": [StrOptions({"micro", "samples", "weighted", "macro"}), None], + "pos_label": [Real, str, "boolean"], + "sample_weight": ["array-like", None], + }, + prefer_skip_nested_validation=True, +) +def average_precision_score( + y_true, y_score, *, average="macro", pos_label=1, sample_weight=None +): + """Compute average precision (AP) from prediction scores. + + AP summarizes a precision-recall curve as the weighted mean of precisions + achieved at each threshold, with the increase in recall from the previous + threshold used as the weight: + + .. math:: + \\text{AP} = \\sum_n (R_n - R_{n-1}) P_n + + where :math:`P_n` and :math:`R_n` are the precision and recall at the nth + threshold [1]_. This implementation is not interpolated and is different + from computing the area under the precision-recall curve with the + trapezoidal rule, which uses linear interpolation and can be too + optimistic. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) or (n_samples, n_classes) + True binary labels or binary label indicators. + + y_score : array-like of shape (n_samples,) or (n_samples, n_classes) + Target scores, can either be probability estimates of the positive + class, confidence values, or non-thresholded measure of decisions + (as returned by :term:`decision_function` on some classifiers). + For :term:`decision_function` scores, values greater than or equal to + zero should indicate the positive class. + + average : {'micro', 'samples', 'weighted', 'macro'} or None, \ + default='macro' + If ``None``, the scores for each class are returned. Otherwise, + this determines the type of averaging performed on the data: + + ``'micro'``: + Calculate metrics globally by considering each element of the label + indicator matrix as a label. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average, weighted + by support (the number of true instances for each label). + ``'samples'``: + Calculate metrics for each instance, and find their average. + + Will be ignored when ``y_true`` is binary. + + pos_label : int, float, bool or str, default=1 + The label of the positive class. Only applied to binary ``y_true``. + For multilabel-indicator ``y_true``, ``pos_label`` is fixed to 1. 
+ + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + average_precision : float + Average precision score. + + See Also + -------- + roc_auc_score : Compute the area under the ROC curve. + precision_recall_curve : Compute precision-recall pairs for different + probability thresholds. + PrecisionRecallDisplay.from_estimator : Plot the precision recall curve + using an estimator and data. + PrecisionRecallDisplay.from_predictions : Plot the precision recall curve + using true and predicted labels. + + Notes + ----- + .. versionchanged:: 0.19 + Instead of linearly interpolating between operating points, precisions + are weighted by the change in recall since the last operating point. + + References + ---------- + .. [1] `Wikipedia entry for the Average precision + `_ + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import average_precision_score + >>> y_true = np.array([0, 0, 1, 1]) + >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8]) + >>> average_precision_score(y_true, y_scores) + 0.83 + >>> y_true = np.array([0, 0, 1, 1, 2, 2]) + >>> y_scores = np.array([ + ... [0.7, 0.2, 0.1], + ... [0.4, 0.3, 0.3], + ... [0.1, 0.8, 0.1], + ... [0.2, 0.3, 0.5], + ... [0.4, 0.4, 0.2], + ... [0.1, 0.2, 0.7], + ... ]) + >>> average_precision_score(y_true, y_scores) + 0.77 + """ + + def _binary_uninterpolated_average_precision( + y_true, y_score, pos_label=1, sample_weight=None + ): + precision, recall, _ = precision_recall_curve( + y_true, y_score, pos_label=pos_label, sample_weight=sample_weight + ) + # Return the step function integral + # The following works because the last entry of precision is + # guaranteed to be 1, as returned by precision_recall_curve. + # Due to numerical error, we can get `-0.0` and we therefore clip it. + return float(max(0.0, -np.sum(np.diff(recall) * np.array(precision)[:-1]))) + + y_type = type_of_target(y_true, input_name="y_true") + + # Convert to Python primitive type to avoid NumPy type / Python str + # comparison. See https://github.com/numpy/numpy/issues/6784 + present_labels = np.unique(y_true).tolist() + + if y_type == "binary": + if len(present_labels) == 2 and pos_label not in present_labels: + raise ValueError( + f"pos_label={pos_label} is not a valid label. It should be " + f"one of {present_labels}" + ) + + elif y_type == "multilabel-indicator" and pos_label != 1: + raise ValueError( + "Parameter pos_label is fixed to 1 for multilabel-indicator y_true. " + "Do not set pos_label or set pos_label to 1." + ) + + elif y_type == "multiclass": + if pos_label != 1: + raise ValueError( + "Parameter pos_label is fixed to 1 for multiclass y_true. " + "Do not set pos_label or set pos_label to 1." + ) + y_true = label_binarize(y_true, classes=present_labels) + + average_precision = partial( + _binary_uninterpolated_average_precision, pos_label=pos_label + ) + return _average_binary_score( + average_precision, y_true, y_score, average, sample_weight=sample_weight + ) + + +@validate_params( + { + "y_true": ["array-like"], + "y_score": ["array-like"], + "pos_label": [Real, str, "boolean", None], + "sample_weight": ["array-like", None], + "drop_intermediate": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def det_curve( + y_true, y_score, pos_label=None, sample_weight=None, drop_intermediate=False +): + """Compute Detection Error Tradeoff (DET) for different probability thresholds. + + .. 
note:: + This metric is used for evaluation of ranking and error tradeoffs of + a binary classification task. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.24 + + .. versionchanged:: 1.7 + An arbitrary threshold at infinity is added to represent a classifier + that always predicts the negative class, i.e. `fpr=0` and `fnr=1`, unless + `fpr=0` is already reached at a finite threshold. + + Parameters + ---------- + y_true : ndarray of shape (n_samples,) + True binary labels. If labels are not either {-1, 1} or {0, 1}, then + pos_label should be explicitly given. + + y_score : ndarray of shape of (n_samples,) + Target scores, can either be probability estimates of the positive + class, confidence values, or non-thresholded measure of decisions + (as returned by "decision_function" on some classifiers). + For :term:`decision_function` scores, values greater than or equal to + zero should indicate the positive class. + + pos_label : int, float, bool or str, default=None + The label of the positive class. + When ``pos_label=None``, if `y_true` is in {-1, 1} or {0, 1}, + ``pos_label`` is set to 1, otherwise an error will be raised. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + drop_intermediate : bool, default=False + Whether to drop thresholds where true positives (tp) do not change from + the previous or subsequent threshold. All points with the same tp value + have the same `fnr` and thus same y coordinate. + + .. versionadded:: 1.7 + + Returns + ------- + fpr : ndarray of shape (n_thresholds,) + False positive rate (FPR) such that element i is the false positive + rate of predictions with score >= thresholds[i]. This is occasionally + referred to as false acceptance probability or fall-out. + + fnr : ndarray of shape (n_thresholds,) + False negative rate (FNR) such that element i is the false negative + rate of predictions with score >= thresholds[i]. This is occasionally + referred to as false rejection or miss rate. + + thresholds : ndarray of shape (n_thresholds,) + Decreasing thresholds on the decision function (either `predict_proba` + or `decision_function`) used to compute FPR and FNR. + + .. versionchanged:: 1.7 + An arbitrary threshold at infinity is added for the case `fpr=0` + and `fnr=1`. + + See Also + -------- + DetCurveDisplay.from_estimator : Plot DET curve given an estimator and + some data. + DetCurveDisplay.from_predictions : Plot DET curve given the true and + predicted labels. + DetCurveDisplay : DET curve visualization. + roc_curve : Compute Receiver operating characteristic (ROC) curve. + precision_recall_curve : Compute precision-recall curve. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import det_curve + >>> y_true = np.array([0, 0, 1, 1]) + >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8]) + >>> fpr, fnr, thresholds = det_curve(y_true, y_scores) + >>> fpr + array([0.5, 0.5, 0. ]) + >>> fnr + array([0. , 0.5, 0.5]) + >>> thresholds + array([0.35, 0.4 , 0.8 ]) + """ + fps, tps, thresholds = _binary_clf_curve( + y_true, y_score, pos_label=pos_label, sample_weight=sample_weight + ) + + # add a threshold at inf where the clf always predicts the negative class + # i.e. tps = fps = 0 + tps = np.concatenate(([0], tps)) + fps = np.concatenate(([0], fps)) + thresholds = np.concatenate(([np.inf], thresholds)) + + if drop_intermediate and len(fps) > 2: + # Drop thresholds where true positives (tp) do not change from the + # previous or subsequent threshold. 
As tp + fn, is fixed for a dataset, + # this means the false negative rate (fnr) remains constant while the + # false positive rate (fpr) changes, producing horizontal line segments + # in the transformed (normal deviate) scale. These intermediate points + # can be dropped to create lighter DET curve plots. + optimal_idxs = np.where( + np.concatenate( + [[True], np.logical_or(np.diff(tps[:-1]), np.diff(tps[1:])), [True]] + ) + )[0] + fps = fps[optimal_idxs] + tps = tps[optimal_idxs] + thresholds = thresholds[optimal_idxs] + + if len(np.unique(y_true)) != 2: + raise ValueError( + "Only one class is present in y_true. Detection error " + "tradeoff curve is not defined in that case." + ) + + fns = tps[-1] - tps + p_count = tps[-1] + n_count = fps[-1] + + # start with false positives zero, which may be at a finite threshold + first_ind = ( + fps.searchsorted(fps[0], side="right") - 1 + if fps.searchsorted(fps[0], side="right") > 0 + else None + ) + # stop with false negatives zero + last_ind = tps.searchsorted(tps[-1]) + 1 + sl = slice(first_ind, last_ind) + + # reverse the output such that list of false positives is decreasing + return (fps[sl][::-1] / n_count, fns[sl][::-1] / p_count, thresholds[sl][::-1]) + + +def _binary_roc_auc_score(y_true, y_score, sample_weight=None, max_fpr=None): + """Binary roc auc score.""" + if len(np.unique(y_true)) != 2: + warnings.warn( + ( + "Only one class is present in y_true. ROC AUC score " + "is not defined in that case." + ), + UndefinedMetricWarning, + ) + return np.nan + + fpr, tpr, _ = roc_curve(y_true, y_score, sample_weight=sample_weight) + if max_fpr is None or max_fpr == 1: + return auc(fpr, tpr) + if max_fpr <= 0 or max_fpr > 1: + raise ValueError("Expected max_fpr in range (0, 1], got: %r" % max_fpr) + + # Add a single point at max_fpr by linear interpolation + stop = np.searchsorted(fpr, max_fpr, "right") + x_interp = [fpr[stop - 1], fpr[stop]] + y_interp = [tpr[stop - 1], tpr[stop]] + tpr = np.append(tpr[:stop], np.interp(max_fpr, x_interp, y_interp)) + fpr = np.append(fpr[:stop], max_fpr) + partial_auc = auc(fpr, tpr) + + # McClish correction: standardize result to be 0.5 if non-discriminant + # and 1 if maximal + min_area = 0.5 * max_fpr**2 + max_area = max_fpr + return 0.5 * (1 + (partial_auc - min_area) / (max_area - min_area)) + + +@validate_params( + { + "y_true": ["array-like"], + "y_score": ["array-like"], + "average": [StrOptions({"micro", "macro", "samples", "weighted"}), None], + "sample_weight": ["array-like", None], + "max_fpr": [Interval(Real, 0.0, 1, closed="right"), None], + "multi_class": [StrOptions({"raise", "ovr", "ovo"})], + "labels": ["array-like", None], + }, + prefer_skip_nested_validation=True, +) +def roc_auc_score( + y_true, + y_score, + *, + average="macro", + sample_weight=None, + max_fpr=None, + multi_class="raise", + labels=None, +): + """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) \ + from prediction scores. + + Note: this implementation can be used with binary, multiclass and + multilabel classification, but some restrictions apply (see Parameters). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) or (n_samples, n_classes) + True labels or binary label indicators. The binary and multiclass cases + expect labels with shape (n_samples,) while the multilabel case expects + binary label indicators with shape (n_samples, n_classes). + + y_score : array-like of shape (n_samples,) or (n_samples, n_classes) + Target scores. 
+ + * In the binary case, it corresponds to an array of shape + `(n_samples,)`. Both probability estimates and non-thresholded + decision values can be provided. The probability estimates correspond + to the **probability of the class with the greater label**, + i.e. `estimator.classes_[1]` and thus + `estimator.predict_proba(X, y)[:, 1]`. The decision values + corresponds to the output of `estimator.decision_function(X, y)`. + See more information in the :ref:`User guide `; + * In the multiclass case, it corresponds to an array of shape + `(n_samples, n_classes)` of probability estimates provided by the + `predict_proba` method. The probability estimates **must** + sum to 1 across the possible classes. In addition, the order of the + class scores must correspond to the order of ``labels``, + if provided, or else to the numerical or lexicographical order of + the labels in ``y_true``. See more information in the + :ref:`User guide `; + * In the multilabel case, it corresponds to an array of shape + `(n_samples, n_classes)`. Probability estimates are provided by the + `predict_proba` method and the non-thresholded decision values by + the `decision_function` method. The probability estimates correspond + to the **probability of the class with the greater label for each + output** of the classifier. See more information in the + :ref:`User guide `. + + average : {'micro', 'macro', 'samples', 'weighted'} or None, \ + default='macro' + If ``None``, the scores for each class are returned. + Otherwise, this determines the type of averaging performed on the data. + Note: multiclass ROC AUC currently only handles the 'macro' and + 'weighted' averages. For multiclass targets, `average=None` is only + implemented for `multi_class='ovr'` and `average='micro'` is only + implemented for `multi_class='ovr'`. + + ``'micro'``: + Calculate metrics globally by considering each element of the label + indicator matrix as a label. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average, weighted + by support (the number of true instances for each label). + ``'samples'``: + Calculate metrics for each instance, and find their average. + + Will be ignored when ``y_true`` is binary. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + max_fpr : float > 0 and <= 1, default=None + If not ``None``, the standardized partial AUC [2]_ over the range + [0, max_fpr] is returned. For the multiclass case, ``max_fpr``, + should be either equal to ``None`` or ``1.0`` as AUC ROC partial + computation currently is not supported for multiclass. + + multi_class : {'raise', 'ovr', 'ovo'}, default='raise' + Only used for multiclass targets. Determines the type of configuration + to use. The default value raises an error, so either + ``'ovr'`` or ``'ovo'`` must be passed explicitly. + + ``'ovr'``: + Stands for One-vs-rest. Computes the AUC of each class + against the rest [3]_ [4]_. This + treats the multiclass case in the same way as the multilabel case. + Sensitive to class imbalance even when ``average == 'macro'``, + because class imbalance affects the composition of each of the + 'rest' groupings. + ``'ovo'``: + Stands for One-vs-one. Computes the average AUC of all + possible pairwise combinations of classes [5]_. + Insensitive to class imbalance when + ``average == 'macro'``. 
+ + labels : array-like of shape (n_classes,), default=None + Only used for multiclass targets. List of labels that index the + classes in ``y_score``. If ``None``, the numerical or lexicographical + order of the labels in ``y_true`` is used. + + Returns + ------- + auc : float + Area Under the Curve score. + + See Also + -------- + average_precision_score : Area under the precision-recall curve. + roc_curve : Compute Receiver operating characteristic (ROC) curve. + RocCurveDisplay.from_estimator : Plot Receiver Operating Characteristic + (ROC) curve given an estimator and some data. + RocCurveDisplay.from_predictions : Plot Receiver Operating Characteristic + (ROC) curve given the true and predicted values. + + Notes + ----- + The Gini Coefficient is a summary measure of the ranking ability of binary + classifiers. It is expressed using the area under of the ROC as follows: + + G = 2 * AUC - 1 + + Where G is the Gini coefficient and AUC is the ROC-AUC score. This normalisation + will ensure that random guessing will yield a score of 0 in expectation, and it is + upper bounded by 1. + + References + ---------- + .. [1] `Wikipedia entry for the Receiver operating characteristic + `_ + + .. [2] `Analyzing a portion of the ROC curve. McClish, 1989 + `_ + + .. [3] Provost, F., Domingos, P. (2000). Well-trained PETs: Improving + probability estimation trees (Section 6.2), CeDER Working Paper + #IS-00-04, Stern School of Business, New York University. + + .. [4] `Fawcett, T. (2006). An introduction to ROC analysis. Pattern + Recognition Letters, 27(8), 861-874. + `_ + + .. [5] `Hand, D.J., Till, R.J. (2001). A Simple Generalisation of the Area + Under the ROC Curve for Multiple Class Classification Problems. + Machine Learning, 45(2), 171-186. + `_ + .. [6] `Wikipedia entry for the Gini coefficient + `_ + + Examples + -------- + Binary case: + + >>> from sklearn.datasets import load_breast_cancer + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.metrics import roc_auc_score + >>> X, y = load_breast_cancer(return_X_y=True) + >>> clf = LogisticRegression(solver="newton-cholesky", random_state=0).fit(X, y) + >>> roc_auc_score(y, clf.predict_proba(X)[:, 1]) + 0.99 + >>> roc_auc_score(y, clf.decision_function(X)) + 0.99 + + Multiclass case: + + >>> from sklearn.datasets import load_iris + >>> X, y = load_iris(return_X_y=True) + >>> clf = LogisticRegression(solver="newton-cholesky").fit(X, y) + >>> roc_auc_score(y, clf.predict_proba(X), multi_class='ovr') + 0.99 + + Multilabel case: + + >>> import numpy as np + >>> from sklearn.datasets import make_multilabel_classification + >>> from sklearn.multioutput import MultiOutputClassifier + >>> X, y = make_multilabel_classification(random_state=0) + >>> clf = MultiOutputClassifier(clf).fit(X, y) + >>> # get a list of n_output containing probability arrays of shape + >>> # (n_samples, n_classes) + >>> y_score = clf.predict_proba(X) + >>> # extract the positive columns for each output + >>> y_score = np.transpose([score[:, 1] for score in y_score]) + >>> roc_auc_score(y, y_score, average=None) + array([0.828, 0.852, 0.94, 0.869, 0.95]) + >>> from sklearn.linear_model import RidgeClassifierCV + >>> clf = RidgeClassifierCV().fit(X, y) + >>> roc_auc_score(y, clf.decision_function(X), average=None) + array([0.82, 0.847, 0.93, 0.872, 0.944]) + """ + + y_type = type_of_target(y_true, input_name="y_true") + y_true = check_array(y_true, ensure_2d=False, dtype=None) + y_score = check_array(y_score, ensure_2d=False) + + if y_type == 
"multiclass" or ( + y_type == "binary" and y_score.ndim == 2 and y_score.shape[1] > 2 + ): + # do not support partial ROC computation for multiclass + if max_fpr is not None and max_fpr != 1.0: + raise ValueError( + "Partial AUC computation not available in " + "multiclass setting, 'max_fpr' must be" + " set to `None`, received `max_fpr={0}` " + "instead".format(max_fpr) + ) + if multi_class == "raise": + raise ValueError("multi_class must be in ('ovo', 'ovr')") + return _multiclass_roc_auc_score( + y_true, y_score, labels, multi_class, average, sample_weight + ) + elif y_type == "binary": + labels = np.unique(y_true) + y_true = label_binarize(y_true, classes=labels)[:, 0] + return _average_binary_score( + partial(_binary_roc_auc_score, max_fpr=max_fpr), + y_true, + y_score, + average, + sample_weight=sample_weight, + ) + else: # multilabel-indicator + return _average_binary_score( + partial(_binary_roc_auc_score, max_fpr=max_fpr), + y_true, + y_score, + average, + sample_weight=sample_weight, + ) + + +def _multiclass_roc_auc_score( + y_true, y_score, labels, multi_class, average, sample_weight +): + """Multiclass roc auc score. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + True multiclass labels. + + y_score : array-like of shape (n_samples, n_classes) + Target scores corresponding to probability estimates of a sample + belonging to a particular class + + labels : array-like of shape (n_classes,) or None + List of labels to index ``y_score`` used for multiclass. If ``None``, + the lexical order of ``y_true`` is used to index ``y_score``. + + multi_class : {'ovr', 'ovo'} + Determines the type of multiclass configuration to use. + ``'ovr'``: + Calculate metrics for the multiclass case using the one-vs-rest + approach. + ``'ovo'``: + Calculate metrics for the multiclass case using the one-vs-one + approach. + + average : {'micro', 'macro', 'weighted'} + Determines the type of averaging performed on the pairwise binary + metric scores + ``'micro'``: + Calculate metrics for the binarized-raveled classes. Only supported + for `multi_class='ovr'`. + + .. versionadded:: 1.2 + + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. Classes + are assumed to be uniformly distributed. + ``'weighted'``: + Calculate metrics for each label, taking into account the + prevalence of the classes. + + sample_weight : array-like of shape (n_samples,) or None + Sample weights. + + """ + # validation of the input y_score + if not np.allclose(1, y_score.sum(axis=1)): + raise ValueError( + "Target scores need to be probabilities for multiclass " + "roc_auc, i.e. they should sum up to 1.0 over classes" + ) + + # validation for multiclass parameter specifications + average_options = ("macro", "weighted", None) + if multi_class == "ovr": + average_options = ("micro",) + average_options + if average not in average_options: + raise ValueError( + "average must be one of {0} for multiclass problems".format(average_options) + ) + + multiclass_options = ("ovo", "ovr") + if multi_class not in multiclass_options: + raise ValueError( + "multi_class='{0}' is not supported " + "for multiclass ROC AUC, multi_class must be " + "in {1}".format(multi_class, multiclass_options) + ) + + if average is None and multi_class == "ovo": + raise NotImplementedError( + "average=None is not implemented for multi_class='ovo'." 
+ ) + + if labels is not None: + labels = column_or_1d(labels) + classes = _unique(labels) + if len(classes) != len(labels): + raise ValueError("Parameter 'labels' must be unique") + if not np.array_equal(classes, labels): + raise ValueError("Parameter 'labels' must be ordered") + if len(classes) != y_score.shape[1]: + raise ValueError( + "Number of given labels, {0}, not equal to the number " + "of columns in 'y_score', {1}".format(len(classes), y_score.shape[1]) + ) + if len(np.setdiff1d(y_true, classes)): + raise ValueError("'y_true' contains labels not in parameter 'labels'") + else: + classes = _unique(y_true) + if len(classes) != y_score.shape[1]: + raise ValueError( + "Number of classes in y_true not equal to the number of " + "columns in 'y_score'" + ) + + if multi_class == "ovo": + if sample_weight is not None: + raise ValueError( + "sample_weight is not supported " + "for multiclass one-vs-one ROC AUC, " + "'sample_weight' must be None in this case." + ) + y_true_encoded = _encode(y_true, uniques=classes) + # Hand & Till (2001) implementation (ovo) + return _average_multiclass_ovo_score( + _binary_roc_auc_score, y_true_encoded, y_score, average=average + ) + else: + # ovr is same as multi-label + y_true_multilabel = label_binarize(y_true, classes=classes) + return _average_binary_score( + _binary_roc_auc_score, + y_true_multilabel, + y_score, + average, + sample_weight=sample_weight, + ) + + +def _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None): + """Calculate true and false positives per binary classification threshold. + + Parameters + ---------- + y_true : ndarray of shape (n_samples,) + True targets of binary classification. + + y_score : ndarray of shape (n_samples,) + Estimated probabilities or output of a decision function. + + pos_label : int, float, bool or str, default=None + The label of the positive class. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + fps : ndarray of shape (n_thresholds,) + A count of false positives, at index i being the number of negative + samples assigned a score >= thresholds[i]. The total number of + negative samples is equal to fps[-1] (thus true negatives are given by + fps[-1] - fps). + + tps : ndarray of shape (n_thresholds,) + An increasing count of true positives, at index i being the number + of positive samples assigned a score >= thresholds[i]. The total + number of positive samples is equal to tps[-1] (thus false negatives + are given by tps[-1] - tps). + + thresholds : ndarray of shape (n_thresholds,) + Decreasing score values. 
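+
+    As a small hand-worked illustration: for ``y_true = [0, 0, 1, 1]`` and
+    ``y_score = [0.1, 0.4, 0.35, 0.8]`` (with unit weights), the distinct
+    scores in decreasing order give ``thresholds = [0.8, 0.4, 0.35, 0.1]``,
+    ``tps = [1, 1, 2, 2]`` and ``fps = [0, 1, 1, 2]``.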
+ """ + # Check to make sure y_true is valid + y_type = type_of_target(y_true, input_name="y_true") + if not (y_type == "binary" or (y_type == "multiclass" and pos_label is not None)): + raise ValueError("{0} format is not supported".format(y_type)) + + check_consistent_length(y_true, y_score, sample_weight) + y_true = column_or_1d(y_true) + y_score = column_or_1d(y_score) + assert_all_finite(y_true) + assert_all_finite(y_score) + + # Filter out zero-weighted samples, as they should not impact the result + if sample_weight is not None: + sample_weight = column_or_1d(sample_weight) + sample_weight = _check_sample_weight(sample_weight, y_true) + nonzero_weight_mask = sample_weight != 0 + y_true = y_true[nonzero_weight_mask] + y_score = y_score[nonzero_weight_mask] + sample_weight = sample_weight[nonzero_weight_mask] + + pos_label = _check_pos_label_consistency(pos_label, y_true) + + # make y_true a boolean vector + y_true = y_true == pos_label + + # sort scores and corresponding truth values + desc_score_indices = np.argsort(y_score, kind="mergesort")[::-1] + y_score = y_score[desc_score_indices] + y_true = y_true[desc_score_indices] + if sample_weight is not None: + weight = sample_weight[desc_score_indices] + else: + weight = 1.0 + + # y_score typically has many tied values. Here we extract + # the indices associated with the distinct values. We also + # concatenate a value for the end of the curve. + distinct_value_indices = np.where(np.diff(y_score))[0] + threshold_idxs = np.r_[distinct_value_indices, y_true.size - 1] + + # accumulate the true positives with decreasing threshold + tps = stable_cumsum(y_true * weight)[threshold_idxs] + if sample_weight is not None: + # express fps as a cumsum to ensure fps is increasing even in + # the presence of floating point errors + fps = stable_cumsum((1 - y_true) * weight)[threshold_idxs] + else: + fps = 1 + threshold_idxs - tps + return fps, tps, y_score[threshold_idxs] + + +@validate_params( + { + "y_true": ["array-like"], + "y_score": ["array-like"], + "pos_label": [Real, str, "boolean", None], + "sample_weight": ["array-like", None], + "drop_intermediate": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def precision_recall_curve( + y_true, + y_score, + *, + pos_label=None, + sample_weight=None, + drop_intermediate=False, +): + """Compute precision-recall pairs for different probability thresholds. + + Note: this implementation is restricted to the binary classification task. + + The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of + true positives and ``fp`` the number of false positives. The precision is + intuitively the ability of the classifier not to label as positive a sample + that is negative. + + The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of + true positives and ``fn`` the number of false negatives. The recall is + intuitively the ability of the classifier to find all the positive samples. + + The last precision and recall values are 1. and 0. respectively and do not + have a corresponding threshold. This ensures that the graph starts on the + y axis. + + The first precision and recall values are precision=class balance and recall=1.0 + which corresponds to a classifier that always predicts the positive class. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + True binary labels. If labels are not either {-1, 1} or {0, 1}, then + pos_label should be explicitly given. 
+ + y_score : array-like of shape (n_samples,) + Target scores, can either be probability estimates of the positive + class, or non-thresholded measure of decisions (as returned by + `decision_function` on some classifiers). + For :term:`decision_function` scores, values greater than or equal to + zero should indicate the positive class. + + pos_label : int, float, bool or str, default=None + The label of the positive class. + When ``pos_label=None``, if y_true is in {-1, 1} or {0, 1}, + ``pos_label`` is set to 1, otherwise an error will be raised. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + drop_intermediate : bool, default=False + Whether to drop some suboptimal thresholds which would not appear + on a plotted precision-recall curve. This is useful in order to create + lighter precision-recall curves. + + .. versionadded:: 1.3 + + Returns + ------- + precision : ndarray of shape (n_thresholds + 1,) + Precision values such that element i is the precision of + predictions with score >= thresholds[i] and the last element is 1. + + recall : ndarray of shape (n_thresholds + 1,) + Decreasing recall values such that element i is the recall of + predictions with score >= thresholds[i] and the last element is 0. + + thresholds : ndarray of shape (n_thresholds,) + Increasing thresholds on the decision function used to compute + precision and recall where `n_thresholds = len(np.unique(y_score))`. + + See Also + -------- + PrecisionRecallDisplay.from_estimator : Plot Precision Recall Curve given + a binary classifier. + PrecisionRecallDisplay.from_predictions : Plot Precision Recall Curve + using predictions from a binary classifier. + average_precision_score : Compute average precision from prediction scores. + det_curve: Compute error rates for different probability thresholds. + roc_curve : Compute Receiver operating characteristic (ROC) curve. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import precision_recall_curve + >>> y_true = np.array([0, 0, 1, 1]) + >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8]) + >>> precision, recall, thresholds = precision_recall_curve( + ... y_true, y_scores) + >>> precision + array([0.5 , 0.66666667, 0.5 , 1. , 1. ]) + >>> recall + array([1. , 1. , 0.5, 0.5, 0. ]) + >>> thresholds + array([0.1 , 0.35, 0.4 , 0.8 ]) + """ + fps, tps, thresholds = _binary_clf_curve( + y_true, y_score, pos_label=pos_label, sample_weight=sample_weight + ) + + if drop_intermediate and len(fps) > 2: + # Drop thresholds corresponding to points where true positives (tps) + # do not change from the previous or subsequent point. This will keep + # only the first and last point for each tps value. All points + # with the same tps value have the same recall and thus x coordinate. + # They appear as a vertical line on the plot. + optimal_idxs = np.where( + np.concatenate( + [[True], np.logical_or(np.diff(tps[:-1]), np.diff(tps[1:])), [True]] + ) + )[0] + fps = fps[optimal_idxs] + tps = tps[optimal_idxs] + thresholds = thresholds[optimal_idxs] + + ps = tps + fps + # Initialize the result array with zeros to make sure that precision[ps == 0] + # does not contain uninitialized values. + precision = np.zeros_like(tps) + np.divide(tps, ps, out=precision, where=(ps != 0)) + + # When no positive label in y_true, recall is set to 1 for all thresholds + # tps[-1] == 0 <=> y_true == all negative labels + if tps[-1] == 0: + warnings.warn( + "No positive class found in y_true, " + "recall is set to one for all thresholds." 
+ ) + recall = np.ones_like(tps) + else: + recall = tps / tps[-1] + + # reverse the outputs so recall is decreasing + sl = slice(None, None, -1) + return np.hstack((precision[sl], 1)), np.hstack((recall[sl], 0)), thresholds[sl] + + +@validate_params( + { + "y_true": ["array-like"], + "y_score": ["array-like"], + "pos_label": [Real, str, "boolean", None], + "sample_weight": ["array-like", None], + "drop_intermediate": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def roc_curve( + y_true, y_score, *, pos_label=None, sample_weight=None, drop_intermediate=True +): + """Compute Receiver operating characteristic (ROC). + + Note: this implementation is restricted to the binary classification task. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + True binary labels. If labels are not either {-1, 1} or {0, 1}, then + pos_label should be explicitly given. + + y_score : array-like of shape (n_samples,) + Target scores, can either be probability estimates of the positive + class, confidence values, or non-thresholded measure of decisions + (as returned by "decision_function" on some classifiers). + For :term:`decision_function` scores, values greater than or equal to + zero should indicate the positive class. + + pos_label : int, float, bool or str, default=None + The label of the positive class. + When ``pos_label=None``, if `y_true` is in {-1, 1} or {0, 1}, + ``pos_label`` is set to 1, otherwise an error will be raised. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + drop_intermediate : bool, default=True + Whether to drop thresholds where the resulting point is collinear with + its neighbors in ROC space. This has no effect on the ROC AUC or visual + shape of the curve, but reduces the number of plotted points. + + .. versionadded:: 0.17 + parameter *drop_intermediate*. + + Returns + ------- + fpr : ndarray of shape (>2,) + Increasing false positive rates such that element i is the false + positive rate of predictions with score >= `thresholds[i]`. + + tpr : ndarray of shape (>2,) + Increasing true positive rates such that element `i` is the true + positive rate of predictions with score >= `thresholds[i]`. + + thresholds : ndarray of shape (n_thresholds,) + Decreasing thresholds on the decision function used to compute + fpr and tpr. The first threshold is set to `np.inf`. + + .. versionchanged:: 1.3 + An arbitrary threshold at infinity (stored in `thresholds[0]`) is + added to represent a classifier that always predicts the negative + class, i.e. `fpr=0` and `tpr=0`. + + See Also + -------- + RocCurveDisplay.from_estimator : Plot Receiver Operating Characteristic + (ROC) curve given an estimator and some data. + RocCurveDisplay.from_predictions : Plot Receiver Operating Characteristic + (ROC) curve given the true and predicted values. + det_curve: Compute error rates for different probability thresholds. + roc_auc_score : Compute the area under the ROC curve. + + Notes + ----- + Since the thresholds are sorted from low to high values, they + are reversed upon returning them to ensure they correspond to both ``fpr`` + and ``tpr``, which are sorted in reversed order during their calculation. + + References + ---------- + .. [1] `Wikipedia entry for the Receiver operating characteristic + `_ + + .. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition + Letters, 2006, 27(8):861-874. 
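+
+    As a short hand-worked reading of the example below: the leading ``inf``
+    threshold corresponds to predicting no sample as positive (``fpr=0``,
+    ``tpr=0``); at threshold ``0.8`` only the top-scored sample (a true
+    positive) is predicted positive, giving ``tpr=0.5`` and ``fpr=0``;
+    lowering the threshold further adds one false positive and one true
+    positive at a time until ``fpr=1`` and ``tpr=1``.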
+ + Examples + -------- + >>> import numpy as np + >>> from sklearn import metrics + >>> y = np.array([1, 1, 2, 2]) + >>> scores = np.array([0.1, 0.4, 0.35, 0.8]) + >>> fpr, tpr, thresholds = metrics.roc_curve(y, scores, pos_label=2) + >>> fpr + array([0. , 0. , 0.5, 0.5, 1. ]) + >>> tpr + array([0. , 0.5, 0.5, 1. , 1. ]) + >>> thresholds + array([ inf, 0.8 , 0.4 , 0.35, 0.1 ]) + """ + fps, tps, thresholds = _binary_clf_curve( + y_true, y_score, pos_label=pos_label, sample_weight=sample_weight + ) + + # Attempt to drop thresholds corresponding to points in between and + # collinear with other points. These are always suboptimal and do not + # appear on a plotted ROC curve (and thus do not affect the AUC). + # Here np.diff(_, 2) is used as a "second derivative" to tell if there + # is a corner at the point. Both fps and tps must be tested to handle + # thresholds with multiple data points (which are combined in + # _binary_clf_curve). This keeps all cases where the point should be kept, + # but does not drop more complicated cases like fps = [1, 3, 7], + # tps = [1, 2, 4]; there is no harm in keeping too many thresholds. + if drop_intermediate and len(fps) > 2: + optimal_idxs = np.where( + np.r_[True, np.logical_or(np.diff(fps, 2), np.diff(tps, 2)), True] + )[0] + fps = fps[optimal_idxs] + tps = tps[optimal_idxs] + thresholds = thresholds[optimal_idxs] + + # Add an extra threshold position + # to make sure that the curve starts at (0, 0) + tps = np.r_[0, tps] + fps = np.r_[0, fps] + # get dtype of `y_score` even if it is an array-like + thresholds = np.r_[np.inf, thresholds] + + if fps[-1] <= 0: + warnings.warn( + "No negative samples in y_true, false positive value should be meaningless", + UndefinedMetricWarning, + ) + fpr = np.repeat(np.nan, fps.shape) + else: + fpr = fps / fps[-1] + + if tps[-1] <= 0: + warnings.warn( + "No positive samples in y_true, true positive value should be meaningless", + UndefinedMetricWarning, + ) + tpr = np.repeat(np.nan, tps.shape) + else: + tpr = tps / tps[-1] + + return fpr, tpr, thresholds + + +@validate_params( + { + "y_true": ["array-like", "sparse matrix"], + "y_score": ["array-like"], + "sample_weight": ["array-like", None], + }, + prefer_skip_nested_validation=True, +) +def label_ranking_average_precision_score(y_true, y_score, *, sample_weight=None): + """Compute ranking-based average precision. + + Label ranking average precision (LRAP) is the average over each ground + truth label assigned to each sample, of the ratio of true vs. total + labels with lower score. + + This metric is used in multilabel ranking problem, where the goal + is to give better rank to the labels associated to each sample. + + The obtained score is always strictly greater than 0 and + the best value is 1. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : {array-like, sparse matrix} of shape (n_samples, n_labels) + True binary labels in binary indicator format. + + y_score : array-like of shape (n_samples, n_labels) + Target scores, can either be probability estimates of the positive + class, confidence values, or non-thresholded measure of decisions + (as returned by "decision_function" on some classifiers). + For :term:`decision_function` scores, values greater than or equal to + zero should indicate the positive class. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + .. versionadded:: 0.20 + + Returns + ------- + score : float + Ranking-based average precision score. 
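+
+    As a small hand-worked check of the example below: for the first sample
+    the only relevant label (score 0.75) is out-ranked by one other label
+    (score 1), so its precision is 1/2; for the second sample the relevant
+    label (score 0.1) is ranked last of the three labels, giving 1/3. The
+    score is the average (1/2 + 1/3) / 2 = 5/12, i.e. roughly 0.416.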
+ + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import label_ranking_average_precision_score + >>> y_true = np.array([[1, 0, 0], [0, 0, 1]]) + >>> y_score = np.array([[0.75, 0.5, 1], [1, 0.2, 0.1]]) + >>> label_ranking_average_precision_score(y_true, y_score) + 0.416 + """ + check_consistent_length(y_true, y_score, sample_weight) + y_true = check_array(y_true, ensure_2d=False, accept_sparse="csr") + y_score = check_array(y_score, ensure_2d=False) + + if y_true.shape != y_score.shape: + raise ValueError("y_true and y_score have different shape") + + # Handle badly formatted array and the degenerate case with one label + y_type = type_of_target(y_true, input_name="y_true") + if y_type != "multilabel-indicator" and not ( + y_type == "binary" and y_true.ndim == 2 + ): + raise ValueError("{0} format is not supported".format(y_type)) + + if not issparse(y_true): + y_true = csr_matrix(y_true) + + y_score = -y_score + + n_samples, n_labels = y_true.shape + + out = 0.0 + for i, (start, stop) in enumerate(zip(y_true.indptr, y_true.indptr[1:])): + relevant = y_true.indices[start:stop] + + if relevant.size == 0 or relevant.size == n_labels: + # If all labels are relevant or unrelevant, the score is also + # equal to 1. The label ranking has no meaning. + aux = 1.0 + else: + scores_i = y_score[i] + rank = rankdata(scores_i, "max")[relevant] + L = rankdata(scores_i[relevant], "max") + aux = (L / rank).mean() + + if sample_weight is not None: + aux = aux * sample_weight[i] + out += aux + + if sample_weight is None: + out /= n_samples + else: + out /= np.sum(sample_weight) + + return float(out) + + +@validate_params( + { + "y_true": ["array-like"], + "y_score": ["array-like"], + "sample_weight": ["array-like", None], + }, + prefer_skip_nested_validation=True, +) +def coverage_error(y_true, y_score, *, sample_weight=None): + """Coverage error measure. + + Compute how far we need to go through the ranked scores to cover all + true labels. The best value is equal to the average number + of labels in ``y_true`` per sample. + + Ties in ``y_scores`` are broken by giving maximal rank that would have + been assigned to all tied values. + + Note: Our implementation's score is 1 greater than the one given in + Tsoumakas et al., 2010. This extends it to handle the degenerate case + in which an instance has 0 true labels. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples, n_labels) + True binary labels in binary indicator format. + + y_score : array-like of shape (n_samples, n_labels) + Target scores, can either be probability estimates of the positive + class, confidence values, or non-thresholded measure of decisions + (as returned by "decision_function" on some classifiers). + For :term:`decision_function` scores, values greater than or equal to + zero should indicate the positive class. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + coverage_error : float + The coverage error. + + References + ---------- + .. [1] Tsoumakas, G., Katakis, I., & Vlahavas, I. (2010). + Mining multi-label data. In Data mining and knowledge discovery + handbook (pp. 667-685). Springer US. 
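+
+    As a small hand-worked check of the example below: the first sample needs
+    only its top-scored label to cover its single true label (coverage 1),
+    while the second sample needs its top two labels (coverage 2), so the
+    average coverage error is (1 + 2) / 2 = 1.5.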
+ + Examples + -------- + >>> from sklearn.metrics import coverage_error + >>> y_true = [[1, 0, 0], [0, 1, 1]] + >>> y_score = [[1, 0, 0], [0, 1, 1]] + >>> coverage_error(y_true, y_score) + 1.5 + """ + y_true = check_array(y_true, ensure_2d=True) + y_score = check_array(y_score, ensure_2d=True) + check_consistent_length(y_true, y_score, sample_weight) + + y_type = type_of_target(y_true, input_name="y_true") + if y_type != "multilabel-indicator": + raise ValueError("{0} format is not supported".format(y_type)) + + if y_true.shape != y_score.shape: + raise ValueError("y_true and y_score have different shape") + + y_score_mask = np.ma.masked_array(y_score, mask=np.logical_not(y_true)) + y_min_relevant = y_score_mask.min(axis=1).reshape((-1, 1)) + coverage = (y_score >= y_min_relevant).sum(axis=1) + coverage = coverage.filled(0) + + return float(np.average(coverage, weights=sample_weight)) + + +@validate_params( + { + "y_true": ["array-like", "sparse matrix"], + "y_score": ["array-like"], + "sample_weight": ["array-like", None], + }, + prefer_skip_nested_validation=True, +) +def label_ranking_loss(y_true, y_score, *, sample_weight=None): + """Compute Ranking loss measure. + + Compute the average number of label pairs that are incorrectly ordered + given y_score weighted by the size of the label set and the number of + labels not in the label set. + + This is similar to the error set size, but weighted by the number of + relevant and irrelevant labels. The best performance is achieved with + a ranking loss of zero. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.17 + A function *label_ranking_loss* + + Parameters + ---------- + y_true : {array-like, sparse matrix} of shape (n_samples, n_labels) + True binary labels in binary indicator format. + + y_score : array-like of shape (n_samples, n_labels) + Target scores, can either be probability estimates of the positive + class, confidence values, or non-thresholded measure of decisions + (as returned by "decision_function" on some classifiers). + For :term:`decision_function` scores, values greater than or equal to + zero should indicate the positive class. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + loss : float + Average number of label pairs that are incorrectly ordered given + y_score weighted by the size of the label set and the number of labels not + in the label set. + + References + ---------- + .. [1] Tsoumakas, G., Katakis, I., & Vlahavas, I. (2010). + Mining multi-label data. In Data mining and knowledge discovery + handbook (pp. 667-685). Springer US. 
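+
+    As a small hand-worked check of the example below: in the first sample
+    one of the two irrelevant labels (score 1) is ranked above the relevant
+    label (score 0.75), so 1 of 2 pairs is mis-ordered (loss 0.5); in the
+    second sample both irrelevant labels out-rank the relevant one (loss 1.0).
+    The average is (0.5 + 1.0) / 2 = 0.75.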
+ + Examples + -------- + >>> from sklearn.metrics import label_ranking_loss + >>> y_true = [[1, 0, 0], [0, 0, 1]] + >>> y_score = [[0.75, 0.5, 1], [1, 0.2, 0.1]] + >>> label_ranking_loss(y_true, y_score) + 0.75 + """ + y_true = check_array(y_true, ensure_2d=False, accept_sparse="csr") + y_score = check_array(y_score, ensure_2d=False) + check_consistent_length(y_true, y_score, sample_weight) + + y_type = type_of_target(y_true, input_name="y_true") + if y_type not in ("multilabel-indicator",): + raise ValueError("{0} format is not supported".format(y_type)) + + if y_true.shape != y_score.shape: + raise ValueError("y_true and y_score have different shape") + + n_samples, n_labels = y_true.shape + + y_true = csr_matrix(y_true) + + loss = np.zeros(n_samples) + for i, (start, stop) in enumerate(zip(y_true.indptr, y_true.indptr[1:])): + # Sort and bin the label scores + unique_scores, unique_inverse = np.unique(y_score[i], return_inverse=True) + true_at_reversed_rank = np.bincount( + unique_inverse[y_true.indices[start:stop]], minlength=len(unique_scores) + ) + all_at_reversed_rank = np.bincount(unique_inverse, minlength=len(unique_scores)) + false_at_reversed_rank = all_at_reversed_rank - true_at_reversed_rank + + # if the scores are ordered, it's possible to count the number of + # incorrectly ordered paires in linear time by cumulatively counting + # how many false labels of a given score have a score higher than the + # accumulated true labels with lower score. + loss[i] = np.dot(true_at_reversed_rank.cumsum(), false_at_reversed_rank) + + n_positives = count_nonzero(y_true, axis=1) + with np.errstate(divide="ignore", invalid="ignore"): + loss /= (n_labels - n_positives) * n_positives + + # When there is no positive or no negative labels, those values should + # be consider as correct, i.e. the ranking doesn't matter. + loss[np.logical_or(n_positives == 0, n_positives == n_labels)] = 0.0 + + return float(np.average(loss, weights=sample_weight)) + + +def _dcg_sample_scores(y_true, y_score, k=None, log_base=2, ignore_ties=False): + """Compute Discounted Cumulative Gain. + + Sum the true scores ranked in the order induced by the predicted scores, + after applying a logarithmic discount. + + This ranking metric yields a high value if true labels are ranked high by + ``y_score``. + + Parameters + ---------- + y_true : ndarray of shape (n_samples, n_labels) + True targets of multilabel classification, or true scores of entities + to be ranked. + + y_score : ndarray of shape (n_samples, n_labels) + Target scores, can either be probability estimates, confidence values, + or non-thresholded measure of decisions (as returned by + "decision_function" on some classifiers). + + k : int, default=None + Only consider the highest k scores in the ranking. If `None`, use all + outputs. + + log_base : float, default=2 + Base of the logarithm used for the discount. A low value means a + sharper discount (top results are more important). + + ignore_ties : bool, default=False + Assume that there are no ties in y_score (which is likely to be the + case if y_score is continuous) for efficiency gains. + + Returns + ------- + discounted_cumulative_gain : ndarray of shape (n_samples,) + The DCG score for each sample. + + See Also + -------- + ndcg_score : The Discounted Cumulative Gain divided by the Ideal Discounted + Cumulative Gain (the DCG obtained for a perfect ranking), in order to + have a score between 0 and 1. 
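+
+    As a small worked illustration of the discount: with ``log_base=2`` the
+    item ranked first is weighted by ``1 / log2(2) = 1.0``, the second by
+    ``1 / log2(3)`` (roughly 0.63), the third by ``1 / log2(4) = 0.5``, and
+    so on, so gains placed lower in the ranking contribute less.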
+ """ + discount = 1 / (np.log(np.arange(y_true.shape[1]) + 2) / np.log(log_base)) + if k is not None: + discount[k:] = 0 + if ignore_ties: + ranking = np.argsort(y_score)[:, ::-1] + ranked = y_true[np.arange(ranking.shape[0])[:, np.newaxis], ranking] + cumulative_gains = discount.dot(ranked.T) + else: + discount_cumsum = np.cumsum(discount) + cumulative_gains = [ + _tie_averaged_dcg(y_t, y_s, discount_cumsum) + for y_t, y_s in zip(y_true, y_score) + ] + cumulative_gains = np.asarray(cumulative_gains) + return cumulative_gains + + +def _tie_averaged_dcg(y_true, y_score, discount_cumsum): + """ + Compute DCG by averaging over possible permutations of ties. + + The gain (`y_true`) of an index falling inside a tied group (in the order + induced by `y_score`) is replaced by the average gain within this group. + The discounted gain for a tied group is then the average `y_true` within + this group times the sum of discounts of the corresponding ranks. + + This amounts to averaging scores for all possible orderings of the tied + groups. + + (note in the case of dcg@k the discount is 0 after index k) + + Parameters + ---------- + y_true : ndarray + The true relevance scores. + + y_score : ndarray + Predicted scores. + + discount_cumsum : ndarray + Precomputed cumulative sum of the discounts. + + Returns + ------- + discounted_cumulative_gain : float + The discounted cumulative gain. + + References + ---------- + McSherry, F., & Najork, M. (2008, March). Computing information retrieval + performance measures efficiently in the presence of tied scores. In + European conference on information retrieval (pp. 414-421). Springer, + Berlin, Heidelberg. + """ + _, inv, counts = np.unique(-y_score, return_inverse=True, return_counts=True) + ranked = np.zeros(len(counts)) + np.add.at(ranked, inv, y_true) + ranked /= counts + groups = np.cumsum(counts) - 1 + discount_sums = np.empty(len(counts)) + discount_sums[0] = discount_cumsum[groups[0]] + discount_sums[1:] = np.diff(discount_cumsum[groups]) + return (ranked * discount_sums).sum() + + +def _check_dcg_target_type(y_true): + y_type = type_of_target(y_true, input_name="y_true") + supported_fmt = ( + "multilabel-indicator", + "continuous-multioutput", + "multiclass-multioutput", + ) + if y_type not in supported_fmt: + raise ValueError( + "Only {} formats are supported. Got {} instead".format( + supported_fmt, y_type + ) + ) + + +@validate_params( + { + "y_true": ["array-like"], + "y_score": ["array-like"], + "k": [Interval(Integral, 1, None, closed="left"), None], + "log_base": [Interval(Real, 0.0, None, closed="neither")], + "sample_weight": ["array-like", None], + "ignore_ties": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def dcg_score( + y_true, y_score, *, k=None, log_base=2, sample_weight=None, ignore_ties=False +): + """Compute Discounted Cumulative Gain. + + Sum the true scores ranked in the order induced by the predicted scores, + after applying a logarithmic discount. + + This ranking metric yields a high value if true labels are ranked high by + ``y_score``. + + Usually the Normalized Discounted Cumulative Gain (NDCG, computed by + ndcg_score) is preferred. + + Parameters + ---------- + y_true : array-like of shape (n_samples, n_labels) + True targets of multilabel classification, or true scores of entities + to be ranked. 
+ + y_score : array-like of shape (n_samples, n_labels) + Target scores, can either be probability estimates, confidence values, + or non-thresholded measure of decisions (as returned by + "decision_function" on some classifiers). + + k : int, default=None + Only consider the highest k scores in the ranking. If None, use all + outputs. + + log_base : float, default=2 + Base of the logarithm used for the discount. A low value means a + sharper discount (top results are more important). + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If `None`, all samples are given the same weight. + + ignore_ties : bool, default=False + Assume that there are no ties in y_score (which is likely to be the + case if y_score is continuous) for efficiency gains. + + Returns + ------- + discounted_cumulative_gain : float + The averaged sample DCG scores. + + See Also + -------- + ndcg_score : The Discounted Cumulative Gain divided by the Ideal Discounted + Cumulative Gain (the DCG obtained for a perfect ranking), in order to + have a score between 0 and 1. + + References + ---------- + `Wikipedia entry for Discounted Cumulative Gain + `_. + + Jarvelin, K., & Kekalainen, J. (2002). + Cumulated gain-based evaluation of IR techniques. ACM Transactions on + Information Systems (TOIS), 20(4), 422-446. + + Wang, Y., Wang, L., Li, Y., He, D., Chen, W., & Liu, T. Y. (2013, May). + A theoretical analysis of NDCG ranking measures. In Proceedings of the 26th + Annual Conference on Learning Theory (COLT 2013). + + McSherry, F., & Najork, M. (2008, March). Computing information retrieval + performance measures efficiently in the presence of tied scores. In + European conference on information retrieval (pp. 414-421). Springer, + Berlin, Heidelberg. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import dcg_score + >>> # we have ground-truth relevance of some answers to a query: + >>> true_relevance = np.asarray([[10, 0, 0, 1, 5]]) + >>> # we predict scores for the answers + >>> scores = np.asarray([[.1, .2, .3, 4, 70]]) + >>> dcg_score(true_relevance, scores) + 9.49 + >>> # we can set k to truncate the sum; only top k answers contribute + >>> dcg_score(true_relevance, scores, k=2) + 5.63 + >>> # now we have some ties in our prediction + >>> scores = np.asarray([[1, 0, 0, 0, 1]]) + >>> # by default ties are averaged, so here we get the average true + >>> # relevance of our top predictions: (10 + 5) / 2 = 7.5 + >>> dcg_score(true_relevance, scores, k=1) + 7.5 + >>> # we can choose to ignore ties for faster results, but only + >>> # if we know there aren't ties in our scores, otherwise we get + >>> # wrong results: + >>> dcg_score(true_relevance, + ... scores, k=1, ignore_ties=True) + 5.0 + """ + y_true = check_array(y_true, ensure_2d=False) + y_score = check_array(y_score, ensure_2d=False) + check_consistent_length(y_true, y_score, sample_weight) + _check_dcg_target_type(y_true) + return float( + np.average( + _dcg_sample_scores( + y_true, y_score, k=k, log_base=log_base, ignore_ties=ignore_ties + ), + weights=sample_weight, + ) + ) + + +def _ndcg_sample_scores(y_true, y_score, k=None, ignore_ties=False): + """Compute Normalized Discounted Cumulative Gain. + + Sum the true scores ranked in the order induced by the predicted scores, + after applying a logarithmic discount. Then divide by the best possible + score (Ideal DCG, obtained for a perfect ranking) to obtain a score between + 0 and 1. 
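+
+    For instance, a predicted ordering whose DCG is 7.5 against an ideal DCG
+    of 10 yields an NDCG of 7.5 / 10 = 0.75.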
+ + This ranking metric yields a high value if true labels are ranked high by + ``y_score``. + + Parameters + ---------- + y_true : ndarray of shape (n_samples, n_labels) + True targets of multilabel classification, or true scores of entities + to be ranked. + + y_score : ndarray of shape (n_samples, n_labels) + Target scores, can either be probability estimates, confidence values, + or non-thresholded measure of decisions (as returned by + "decision_function" on some classifiers). + + k : int, default=None + Only consider the highest k scores in the ranking. If None, use all + outputs. + + ignore_ties : bool, default=False + Assume that there are no ties in y_score (which is likely to be the + case if y_score is continuous) for efficiency gains. + + Returns + ------- + normalized_discounted_cumulative_gain : ndarray of shape (n_samples,) + The NDCG score for each sample (float in [0., 1.]). + + See Also + -------- + dcg_score : Discounted Cumulative Gain (not normalized). + + """ + gain = _dcg_sample_scores(y_true, y_score, k, ignore_ties=ignore_ties) + # Here we use the order induced by y_true so we can ignore ties since + # the gain associated to tied indices is the same (permuting ties doesn't + # change the value of the re-ordered y_true) + normalizing_gain = _dcg_sample_scores(y_true, y_true, k, ignore_ties=True) + all_irrelevant = normalizing_gain == 0 + gain[all_irrelevant] = 0 + gain[~all_irrelevant] /= normalizing_gain[~all_irrelevant] + return gain + + +@validate_params( + { + "y_true": ["array-like"], + "y_score": ["array-like"], + "k": [Interval(Integral, 1, None, closed="left"), None], + "sample_weight": ["array-like", None], + "ignore_ties": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def ndcg_score(y_true, y_score, *, k=None, sample_weight=None, ignore_ties=False): + """Compute Normalized Discounted Cumulative Gain. + + Sum the true scores ranked in the order induced by the predicted scores, + after applying a logarithmic discount. Then divide by the best possible + score (Ideal DCG, obtained for a perfect ranking) to obtain a score between + 0 and 1. + + This ranking metric returns a high value if true labels are ranked high by + ``y_score``. + + Parameters + ---------- + y_true : array-like of shape (n_samples, n_labels) + True targets of multilabel classification, or true scores of entities + to be ranked. Negative values in `y_true` may result in an output + that is not between 0 and 1. + + y_score : array-like of shape (n_samples, n_labels) + Target scores, can either be probability estimates, confidence values, + or non-thresholded measure of decisions (as returned by + "decision_function" on some classifiers). + + k : int, default=None + Only consider the highest k scores in the ranking. If `None`, use all + outputs. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If `None`, all samples are given the same weight. + + ignore_ties : bool, default=False + Assume that there are no ties in y_score (which is likely to be the + case if y_score is continuous) for efficiency gains. + + Returns + ------- + normalized_discounted_cumulative_gain : float in [0., 1.] + The averaged NDCG scores for all samples. + + See Also + -------- + dcg_score : Discounted Cumulative Gain (not normalized). + + References + ---------- + `Wikipedia entry for Discounted Cumulative Gain + `_ + + Jarvelin, K., & Kekalainen, J. (2002). + Cumulated gain-based evaluation of IR techniques. 
ACM Transactions on + Information Systems (TOIS), 20(4), 422-446. + + Wang, Y., Wang, L., Li, Y., He, D., Chen, W., & Liu, T. Y. (2013, May). + A theoretical analysis of NDCG ranking measures. In Proceedings of the 26th + Annual Conference on Learning Theory (COLT 2013) + + McSherry, F., & Najork, M. (2008, March). Computing information retrieval + performance measures efficiently in the presence of tied scores. In + European conference on information retrieval (pp. 414-421). Springer, + Berlin, Heidelberg. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import ndcg_score + >>> # we have ground-truth relevance of some answers to a query: + >>> true_relevance = np.asarray([[10, 0, 0, 1, 5]]) + >>> # we predict some scores (relevance) for the answers + >>> scores = np.asarray([[.1, .2, .3, 4, 70]]) + >>> ndcg_score(true_relevance, scores) + 0.69 + >>> scores = np.asarray([[.05, 1.1, 1., .5, .0]]) + >>> ndcg_score(true_relevance, scores) + 0.49 + >>> # we can set k to truncate the sum; only top k answers contribute. + >>> ndcg_score(true_relevance, scores, k=4) + 0.35 + >>> # the normalization takes k into account so a perfect answer + >>> # would still get 1.0 + >>> ndcg_score(true_relevance, true_relevance, k=4) + 1.0... + >>> # now we have some ties in our prediction + >>> scores = np.asarray([[1, 0, 0, 0, 1]]) + >>> # by default ties are averaged, so here we get the average (normalized) + >>> # true relevance of our top predictions: (10 / 10 + 5 / 10) / 2 = .75 + >>> ndcg_score(true_relevance, scores, k=1) + 0.75 + >>> # we can choose to ignore ties for faster results, but only + >>> # if we know there aren't ties in our scores, otherwise we get + >>> # wrong results: + >>> ndcg_score(true_relevance, + ... scores, k=1, ignore_ties=True) + 0.5... + """ + y_true = check_array(y_true, ensure_2d=False) + y_score = check_array(y_score, ensure_2d=False) + check_consistent_length(y_true, y_score, sample_weight) + + if y_true.min() < 0: + raise ValueError("ndcg_score should not be used on negative y_true values.") + if y_true.ndim > 1 and y_true.shape[1] <= 1: + raise ValueError( + "Computing NDCG is only meaningful when there is more than 1 document. " + f"Got {y_true.shape[1]} instead." + ) + _check_dcg_target_type(y_true) + gain = _ndcg_sample_scores(y_true, y_score, k=k, ignore_ties=ignore_ties) + return float(np.average(gain, weights=sample_weight)) + + +@validate_params( + { + "y_true": ["array-like"], + "y_score": ["array-like"], + "k": [Interval(Integral, 1, None, closed="left")], + "normalize": ["boolean"], + "sample_weight": ["array-like", None], + "labels": ["array-like", None], + }, + prefer_skip_nested_validation=True, +) +def top_k_accuracy_score( + y_true, y_score, *, k=2, normalize=True, sample_weight=None, labels=None +): + """Top-k Accuracy classification score. + + This metric computes the number of times where the correct label is among + the top `k` labels predicted (ranked by predicted scores). Note that the + multilabel case isn't covered here. + + Read more in the :ref:`User Guide ` + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + True labels. + + y_score : array-like of shape (n_samples,) or (n_samples, n_classes) + Target scores. These can be either probability estimates or + non-thresholded decision values (as returned by + :term:`decision_function` on some classifiers). + The binary case expects scores with shape (n_samples,) while the + multiclass case expects scores with shape (n_samples, n_classes). 
+ In the multiclass case, the order of the class scores must + correspond to the order of ``labels``, if provided, or else to + the numerical or lexicographical order of the labels in ``y_true``. + If ``y_true`` does not contain all the labels, ``labels`` must be + provided. + + k : int, default=2 + Number of most likely outcomes considered to find the correct label. + + normalize : bool, default=True + If `True`, return the fraction of correctly classified samples. + Otherwise, return the number of correctly classified samples. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If `None`, all samples are given the same weight. + + labels : array-like of shape (n_classes,), default=None + Multiclass only. List of labels that index the classes in ``y_score``. + If ``None``, the numerical or lexicographical order of the labels in + ``y_true`` is used. If ``y_true`` does not contain all the labels, + ``labels`` must be provided. + + Returns + ------- + score : float + The top-k accuracy score. The best performance is 1 with + `normalize == True` and the number of samples with + `normalize == False`. + + See Also + -------- + accuracy_score : Compute the accuracy score. By default, the function will + return the fraction of correct predictions divided by the total number + of predictions. + + Notes + ----- + In cases where two or more labels are assigned equal predicted scores, + the labels with the highest indices will be chosen first. This might + impact the result if the correct label falls after the threshold because + of that. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import top_k_accuracy_score + >>> y_true = np.array([0, 1, 2, 2]) + >>> y_score = np.array([[0.5, 0.2, 0.2], # 0 is in top 2 + ... [0.3, 0.4, 0.2], # 1 is in top 2 + ... [0.2, 0.4, 0.3], # 2 is in top 2 + ... [0.7, 0.2, 0.1]]) # 2 isn't in top 2 + >>> top_k_accuracy_score(y_true, y_score, k=2) + 0.75 + >>> # Not normalizing gives the number of "correctly" classified samples + >>> top_k_accuracy_score(y_true, y_score, k=2, normalize=False) + 3.0 + """ + y_true = check_array(y_true, ensure_2d=False, dtype=None) + y_true = column_or_1d(y_true) + y_type = type_of_target(y_true, input_name="y_true") + if y_type == "binary" and labels is not None and len(labels) > 2: + y_type = "multiclass" + if y_type not in {"binary", "multiclass"}: + raise ValueError( + f"y type must be 'binary' or 'multiclass', got '{y_type}' instead." + ) + y_score = check_array(y_score, ensure_2d=False) + if y_type == "binary": + if y_score.ndim == 2 and y_score.shape[1] != 1: + raise ValueError( + "`y_true` is binary while y_score is 2d with" + f" {y_score.shape[1]} classes. If `y_true` does not contain all the" + " labels, `labels` must be provided." + ) + y_score = column_or_1d(y_score) + + check_consistent_length(y_true, y_score, sample_weight) + y_score_n_classes = y_score.shape[1] if y_score.ndim == 2 else 2 + + if labels is None: + classes = _unique(y_true) + n_classes = len(classes) + + if n_classes != y_score_n_classes: + raise ValueError( + f"Number of classes in 'y_true' ({n_classes}) not equal " + f"to the number of classes in 'y_score' ({y_score_n_classes})." + "You can provide a list of all known classes by assigning it " + "to the `labels` parameter." 
+ ) + else: + labels = column_or_1d(labels) + classes = _unique(labels) + n_labels = len(labels) + n_classes = len(classes) + + if n_classes != n_labels: + raise ValueError("Parameter 'labels' must be unique.") + + if not np.array_equal(classes, labels): + raise ValueError("Parameter 'labels' must be ordered.") + + if n_classes != y_score_n_classes: + raise ValueError( + f"Number of given labels ({n_classes}) not equal to the " + f"number of classes in 'y_score' ({y_score_n_classes})." + ) + + if len(np.setdiff1d(y_true, classes)): + raise ValueError("'y_true' contains labels not in parameter 'labels'.") + + if k >= n_classes: + warnings.warn( + ( + f"'k' ({k}) greater than or equal to 'n_classes' ({n_classes}) " + "will result in a perfect score and is therefore meaningless." + ), + UndefinedMetricWarning, + ) + + y_true_encoded = _encode(y_true, uniques=classes) + + if y_type == "binary": + if k == 1: + threshold = 0.5 if y_score.min() >= 0 and y_score.max() <= 1 else 0 + y_pred = (y_score > threshold).astype(np.int64) + hits = y_pred == y_true_encoded + else: + hits = np.ones_like(y_score, dtype=np.bool_) + elif y_type == "multiclass": + sorted_pred = np.argsort(y_score, axis=1, kind="mergesort")[:, ::-1] + hits = (y_true_encoded == sorted_pred[:, :k].T).any(axis=0) + + if normalize: + return float(np.average(hits, weights=sample_weight)) + elif sample_weight is None: + return float(np.sum(hits)) + else: + return float(np.dot(hits, sample_weight)) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_regression.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/_regression.py new file mode 100644 index 0000000000000000000000000000000000000000..0731e00ce3a1ab24adb2e33ed17ac948455586e8 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_regression.py @@ -0,0 +1,1930 @@ +"""Metrics to assess performance on regression task. + +Functions named as ``*_score`` return a scalar value to maximize: the higher +the better. + +Function named as ``*_error`` or ``*_loss`` return a scalar value to minimize: +the lower the better. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from numbers import Real + +import numpy as np + +from ..exceptions import UndefinedMetricWarning +from ..utils._array_api import ( + _average, + _find_matching_floating_dtype, + get_namespace, + get_namespace_and_device, + size, +) +from ..utils._array_api import ( + _xlogy as xlogy, +) +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.stats import _weighted_percentile +from ..utils.validation import ( + _check_sample_weight, + _num_samples, + check_array, + check_consistent_length, + column_or_1d, +) + +__ALL__ = [ + "max_error", + "mean_absolute_error", + "mean_squared_error", + "mean_squared_log_error", + "median_absolute_error", + "mean_absolute_percentage_error", + "mean_pinball_loss", + "r2_score", + "root_mean_squared_log_error", + "root_mean_squared_error", + "explained_variance_score", + "mean_tweedie_deviance", + "mean_poisson_deviance", + "mean_gamma_deviance", + "d2_tweedie_score", + "d2_pinball_score", + "d2_absolute_error_score", +] + + +def _check_reg_targets( + y_true, y_pred, sample_weight, multioutput, dtype="numeric", xp=None +): + """Check that y_true, y_pred and sample_weight belong to the same regression task. + + To reduce redundancy when calling `_find_matching_floating_dtype`, + please use `_check_reg_targets_with_floating_dtype` instead. 
+ + Parameters + ---------- + y_true : array-like of shape (n_samples,) or (n_samples, n_outputs) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs) + Estimated target values. + + sample_weight : array-like of shape (n_samples,) or None + Sample weights. + + multioutput : array-like or string in ['raw_values', uniform_average', + 'variance_weighted'] or None + None is accepted due to backward compatibility of r2_score(). + + dtype : str or list, default="numeric" + the dtype argument passed to check_array. + + xp : module, default=None + Precomputed array namespace module. When passed, typically from a caller + that has already performed inspection of its own inputs, skips array + namespace inspection. + + Returns + ------- + type_true : one of {'continuous', continuous-multioutput'} + The type of the true target data, as output by + 'utils.multiclass.type_of_target'. + + y_true : array-like of shape (n_samples, n_outputs) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples, n_outputs) + Estimated target values. + + sample_weight : array-like of shape (n_samples,) or None + Sample weights. + + multioutput : array-like of shape (n_outputs) or string in ['raw_values', + uniform_average', 'variance_weighted'] or None + Custom output weights if ``multioutput`` is array-like or + just the corresponding argument if ``multioutput`` is a + correct keyword. + """ + xp, _ = get_namespace(y_true, y_pred, multioutput, xp=xp) + + check_consistent_length(y_true, y_pred, sample_weight) + y_true = check_array(y_true, ensure_2d=False, dtype=dtype) + y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype) + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, y_true, dtype=dtype) + + if y_true.ndim == 1: + y_true = xp.reshape(y_true, (-1, 1)) + + if y_pred.ndim == 1: + y_pred = xp.reshape(y_pred, (-1, 1)) + + if y_true.shape[1] != y_pred.shape[1]: + raise ValueError( + "y_true and y_pred have different number of output ({0}!={1})".format( + y_true.shape[1], y_pred.shape[1] + ) + ) + + n_outputs = y_true.shape[1] + allowed_multioutput_str = ("raw_values", "uniform_average", "variance_weighted") + if isinstance(multioutput, str): + if multioutput not in allowed_multioutput_str: + raise ValueError( + "Allowed 'multioutput' string values are {}. " + "You provided multioutput={!r}".format( + allowed_multioutput_str, multioutput + ) + ) + elif multioutput is not None: + multioutput = check_array(multioutput, ensure_2d=False) + if n_outputs == 1: + raise ValueError("Custom weights are useful only in multi-output cases.") + elif n_outputs != multioutput.shape[0]: + raise ValueError( + "There must be equally many custom weights " + f"({multioutput.shape[0]}) as outputs ({n_outputs})." + ) + y_type = "continuous" if n_outputs == 1 else "continuous-multioutput" + + return y_type, y_true, y_pred, sample_weight, multioutput + + +def _check_reg_targets_with_floating_dtype( + y_true, y_pred, sample_weight, multioutput, xp=None +): + """Ensures y_true, y_pred, and sample_weight correspond to same regression task. + + Extends `_check_reg_targets` by automatically selecting a suitable floating-point + data type for inputs using `_find_matching_floating_dtype`. + + Use this private method only when converting inputs to array API-compatibles. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) or (n_samples, n_outputs) + Ground truth (correct) target values. 
+ + y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs) + Estimated target values. + + sample_weight : array-like of shape (n_samples,) + + multioutput : array-like or string in ['raw_values', 'uniform_average', \ + 'variance_weighted'] or None + None is accepted due to backward compatibility of r2_score(). + + xp : module, default=None + Precomputed array namespace module. When passed, typically from a caller + that has already performed inspection of its own inputs, skips array + namespace inspection. + + Returns + ------- + type_true : one of {'continuous', 'continuous-multioutput'} + The type of the true target data, as output by + 'utils.multiclass.type_of_target'. + + y_true : array-like of shape (n_samples, n_outputs) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples, n_outputs) + Estimated target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + multioutput : array-like of shape (n_outputs) or string in ['raw_values', \ + 'uniform_average', 'variance_weighted'] or None + Custom output weights if ``multioutput`` is array-like or + just the corresponding argument if ``multioutput`` is a + correct keyword. + """ + dtype_name = _find_matching_floating_dtype(y_true, y_pred, sample_weight, xp=xp) + + y_type, y_true, y_pred, sample_weight, multioutput = _check_reg_targets( + y_true, y_pred, sample_weight, multioutput, dtype=dtype_name, xp=xp + ) + + return y_type, y_true, y_pred, sample_weight, multioutput + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "sample_weight": ["array-like", None], + "multioutput": [StrOptions({"raw_values", "uniform_average"}), "array-like"], + }, + prefer_skip_nested_validation=True, +) +def mean_absolute_error( + y_true, y_pred, *, sample_weight=None, multioutput="uniform_average" +): + """Mean absolute error regression loss. + + The mean absolute error is a non-negative floating point value, where best value + is 0.0. Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) or (n_samples, n_outputs) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs) + Estimated target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + multioutput : {'raw_values', 'uniform_average'} or array-like of shape \ + (n_outputs,), default='uniform_average' + Defines aggregating of multiple output values. + Array-like value defines weights used to average errors. + + 'raw_values' : + Returns a full set of errors in case of multioutput input. + + 'uniform_average' : + Errors of all outputs are averaged with uniform weight. + + Returns + ------- + loss : float or array of floats + If multioutput is 'raw_values', then mean absolute error is returned + for each output separately. + If multioutput is 'uniform_average' or an ndarray of weights, then the + weighted average of all output errors is returned. + + MAE output is non-negative floating point. The best value is 0.0. + + Examples + -------- + >>> from sklearn.metrics import mean_absolute_error + >>> y_true = [3, -0.5, 2, 7] + >>> y_pred = [2.5, 0.0, 2, 8] + >>> mean_absolute_error(y_true, y_pred) + 0.5 + >>> y_true = [[0.5, 1], [-1, 1], [7, -6]] + >>> y_pred = [[0, 2], [-1, 2], [8, -5]] + >>> mean_absolute_error(y_true, y_pred) + 0.75 + >>> mean_absolute_error(y_true, y_pred, multioutput='raw_values') + array([0.5, 1. 
]) + >>> mean_absolute_error(y_true, y_pred, multioutput=[0.3, 0.7]) + 0.85... + """ + xp, _ = get_namespace(y_true, y_pred, sample_weight, multioutput) + + _, y_true, y_pred, sample_weight, multioutput = ( + _check_reg_targets_with_floating_dtype( + y_true, y_pred, sample_weight, multioutput, xp=xp + ) + ) + + output_errors = _average( + xp.abs(y_pred - y_true), weights=sample_weight, axis=0, xp=xp + ) + if isinstance(multioutput, str): + if multioutput == "raw_values": + return output_errors + elif multioutput == "uniform_average": + # pass None as weights to _average: uniform mean + multioutput = None + + # Average across the outputs (if needed). + # The second call to `_average` should always return + # a scalar array that we convert to a Python float to + # consistently return the same eager evaluated value. + # Therefore, `axis=None`. + mean_absolute_error = _average(output_errors, weights=multioutput) + + return float(mean_absolute_error) + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "sample_weight": ["array-like", None], + "alpha": [Interval(Real, 0, 1, closed="both")], + "multioutput": [StrOptions({"raw_values", "uniform_average"}), "array-like"], + }, + prefer_skip_nested_validation=True, +) +def mean_pinball_loss( + y_true, y_pred, *, sample_weight=None, alpha=0.5, multioutput="uniform_average" +): + """Pinball loss for quantile regression. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) or (n_samples, n_outputs) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs) + Estimated target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + alpha : float, slope of the pinball loss, default=0.5, + This loss is equivalent to :ref:`mean_absolute_error` when `alpha=0.5`, + `alpha=0.95` is minimized by estimators of the 95th percentile. + + multioutput : {'raw_values', 'uniform_average'} or array-like of shape \ + (n_outputs,), default='uniform_average' + Defines aggregating of multiple output values. + Array-like value defines weights used to average errors. + + 'raw_values' : + Returns a full set of errors in case of multioutput input. + + 'uniform_average' : + Errors of all outputs are averaged with uniform weight. + + Returns + ------- + loss : float or ndarray of floats + If multioutput is 'raw_values', then mean absolute error is returned + for each output separately. + If multioutput is 'uniform_average' or an ndarray of weights, then the + weighted average of all output errors is returned. + + The pinball loss output is a non-negative floating point. The best + value is 0.0. + + Examples + -------- + >>> from sklearn.metrics import mean_pinball_loss + >>> y_true = [1, 2, 3] + >>> mean_pinball_loss(y_true, [0, 2, 3], alpha=0.1) + 0.03... + >>> mean_pinball_loss(y_true, [1, 2, 4], alpha=0.1) + 0.3... + >>> mean_pinball_loss(y_true, [0, 2, 3], alpha=0.9) + 0.3... + >>> mean_pinball_loss(y_true, [1, 2, 4], alpha=0.9) + 0.03... 
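As a side note on the pinball loss defined further down: the loss is `alpha * max(d, 0) + (1 - alpha) * max(-d, 0)` with `d = y_true - y_pred`, so at `alpha=0.5` it collapses to half the absolute error. A small sketch of that relationship (toy data):

import numpy as np
from sklearn.metrics import mean_absolute_error, mean_pinball_loss

y_true = [3, -0.5, 2, 7]
y_pred = [2.5, 0.0, 2, 8]
# At alpha=0.5 the pinball loss is exactly half the mean absolute error.
assert np.isclose(
    mean_pinball_loss(y_true, y_pred, alpha=0.5),
    0.5 * mean_absolute_error(y_true, y_pred),
)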
+ >>> mean_pinball_loss(y_true, y_true, alpha=0.1) + 0.0 + >>> mean_pinball_loss(y_true, y_true, alpha=0.9) + 0.0 + """ + xp, _ = get_namespace(y_true, y_pred, sample_weight, multioutput) + + _, y_true, y_pred, sample_weight, multioutput = ( + _check_reg_targets_with_floating_dtype( + y_true, y_pred, sample_weight, multioutput, xp=xp + ) + ) + + diff = y_true - y_pred + sign = xp.astype(diff >= 0, diff.dtype) + loss = alpha * sign * diff - (1 - alpha) * (1 - sign) * diff + output_errors = _average(loss, weights=sample_weight, axis=0) + + if isinstance(multioutput, str) and multioutput == "raw_values": + return output_errors + + if isinstance(multioutput, str) and multioutput == "uniform_average": + # pass None as weights to _average: uniform mean + multioutput = None + + # Average across the outputs (if needed). + # The second call to `_average` should always return + # a scalar array that we convert to a Python float to + # consistently return the same eager evaluated value. + # Therefore, `axis=None`. + return float(_average(output_errors, weights=multioutput)) + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "sample_weight": ["array-like", None], + "multioutput": [StrOptions({"raw_values", "uniform_average"}), "array-like"], + }, + prefer_skip_nested_validation=True, +) +def mean_absolute_percentage_error( + y_true, y_pred, *, sample_weight=None, multioutput="uniform_average" +): + """Mean absolute percentage error (MAPE) regression loss. + + Note that we are not using the common "percentage" definition: the percentage + in the range [0, 100] is converted to a relative value in the range [0, 1] + by dividing by 100. Thus, an error of 200% corresponds to a relative error of 2. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.24 + + Parameters + ---------- + y_true : array-like of shape (n_samples,) or (n_samples, n_outputs) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs) + Estimated target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + multioutput : {'raw_values', 'uniform_average'} or array-like + Defines aggregating of multiple output values. + Array-like value defines weights used to average errors. + If input is list then the shape must be (n_outputs,). + + 'raw_values' : + Returns a full set of errors in case of multioutput input. + + 'uniform_average' : + Errors of all outputs are averaged with uniform weight. + + Returns + ------- + loss : float or ndarray of floats + If multioutput is 'raw_values', then mean absolute percentage error + is returned for each output separately. + If multioutput is 'uniform_average' or an ndarray of weights, then the + weighted average of all output errors is returned. + + MAPE output is non-negative floating point. The best value is 0.0. + But note that bad predictions can lead to arbitrarily large + MAPE values, especially if some `y_true` values are very close to zero. + Note that we return a large value instead of `inf` when `y_true` is zero. + + Examples + -------- + >>> from sklearn.metrics import mean_absolute_percentage_error + >>> y_true = [3, -0.5, 2, 7] + >>> y_pred = [2.5, 0.0, 2, 8] + >>> mean_absolute_percentage_error(y_true, y_pred) + 0.3273... + >>> y_true = [[0.5, 1], [-1, 1], [7, -6]] + >>> y_pred = [[0, 2], [-1, 2], [8, -5]] + >>> mean_absolute_percentage_error(y_true, y_pred) + 0.5515... 
+ >>> mean_absolute_percentage_error(y_true, y_pred, multioutput=[0.3, 0.7]) + 0.6198... + >>> # the value when some element of the y_true is zero is arbitrarily high because + >>> # of the division by epsilon + >>> y_true = [1., 0., 2.4, 7.] + >>> y_pred = [1.2, 0.1, 2.4, 8.] + >>> mean_absolute_percentage_error(y_true, y_pred) + 112589990684262.48 + """ + xp, _, device_ = get_namespace_and_device( + y_true, y_pred, sample_weight, multioutput + ) + _, y_true, y_pred, sample_weight, multioutput = ( + _check_reg_targets_with_floating_dtype( + y_true, y_pred, sample_weight, multioutput, xp=xp + ) + ) + epsilon = xp.asarray(xp.finfo(xp.float64).eps, dtype=y_true.dtype, device=device_) + y_true_abs = xp.abs(y_true) + mape = xp.abs(y_pred - y_true) / xp.maximum(y_true_abs, epsilon) + output_errors = _average(mape, weights=sample_weight, axis=0) + if isinstance(multioutput, str): + if multioutput == "raw_values": + return output_errors + elif multioutput == "uniform_average": + # pass None as weights to _average: uniform mean + multioutput = None + + # Average across the outputs (if needed). + # The second call to `_average` should always return + # a scalar array that we convert to a Python float to + # consistently return the same eager evaluated value. + # Therefore, `axis=None`. + mean_absolute_percentage_error = _average(output_errors, weights=multioutput) + + return float(mean_absolute_percentage_error) + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "sample_weight": ["array-like", None], + "multioutput": [StrOptions({"raw_values", "uniform_average"}), "array-like"], + }, + prefer_skip_nested_validation=True, +) +def mean_squared_error( + y_true, + y_pred, + *, + sample_weight=None, + multioutput="uniform_average", +): + """Mean squared error regression loss. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) or (n_samples, n_outputs) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs) + Estimated target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + multioutput : {'raw_values', 'uniform_average'} or array-like of shape \ + (n_outputs,), default='uniform_average' + Defines aggregating of multiple output values. + Array-like value defines weights used to average errors. + + 'raw_values' : + Returns a full set of errors in case of multioutput input. + + 'uniform_average' : + Errors of all outputs are averaged with uniform weight. + + Returns + ------- + loss : float or array of floats + A non-negative floating point value (the best value is 0.0), or an + array of floating point values, one for each individual target. + + Examples + -------- + >>> from sklearn.metrics import mean_squared_error + >>> y_true = [3, -0.5, 2, 7] + >>> y_pred = [2.5, 0.0, 2, 8] + >>> mean_squared_error(y_true, y_pred) + 0.375 + >>> y_true = [[0.5, 1],[-1, 1],[7, -6]] + >>> y_pred = [[0, 2],[-1, 2],[8, -5]] + >>> mean_squared_error(y_true, y_pred) + 0.708... + >>> mean_squared_error(y_true, y_pred, multioutput='raw_values') + array([0.41666667, 1. ]) + >>> mean_squared_error(y_true, y_pred, multioutput=[0.3, 0.7]) + 0.825... 
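The `multioutput` aggregation used in the examples above is, per the function body that follows, just a weighted average over the per-output errors. A quick sanity sketch of that reading (toy data, public API only):

import numpy as np
from sklearn.metrics import mean_squared_error

y_true = [[0.5, 1], [-1, 1], [7, -6]]
y_pred = [[0, 2], [-1, 2], [8, -5]]
raw = mean_squared_error(y_true, y_pred, multioutput="raw_values")
weighted = mean_squared_error(y_true, y_pred, multioutput=[0.3, 0.7])
# 'uniform_average' is the same computation with equal weights.
assert np.isclose(weighted, np.average(raw, weights=[0.3, 0.7]))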
+ """ + xp, _ = get_namespace(y_true, y_pred, sample_weight, multioutput) + _, y_true, y_pred, sample_weight, multioutput = ( + _check_reg_targets_with_floating_dtype( + y_true, y_pred, sample_weight, multioutput, xp=xp + ) + ) + output_errors = _average((y_true - y_pred) ** 2, axis=0, weights=sample_weight) + + if isinstance(multioutput, str): + if multioutput == "raw_values": + return output_errors + elif multioutput == "uniform_average": + # pass None as weights to _average: uniform mean + multioutput = None + + # Average across the outputs (if needed). + # The second call to `_average` should always return + # a scalar array that we convert to a Python float to + # consistently return the same eager evaluated value. + # Therefore, `axis=None`. + mean_squared_error = _average(output_errors, weights=multioutput) + + return float(mean_squared_error) + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "sample_weight": ["array-like", None], + "multioutput": [StrOptions({"raw_values", "uniform_average"}), "array-like"], + }, + prefer_skip_nested_validation=True, +) +def root_mean_squared_error( + y_true, y_pred, *, sample_weight=None, multioutput="uniform_average" +): + """Root mean squared error regression loss. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.4 + + Parameters + ---------- + y_true : array-like of shape (n_samples,) or (n_samples, n_outputs) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs) + Estimated target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + multioutput : {'raw_values', 'uniform_average'} or array-like of shape \ + (n_outputs,), default='uniform_average' + Defines aggregating of multiple output values. + Array-like value defines weights used to average errors. + + 'raw_values' : + Returns a full set of errors in case of multioutput input. + + 'uniform_average' : + Errors of all outputs are averaged with uniform weight. + + Returns + ------- + loss : float or ndarray of floats + A non-negative floating point value (the best value is 0.0), or an + array of floating point values, one for each individual target. + + Examples + -------- + >>> from sklearn.metrics import root_mean_squared_error + >>> y_true = [3, -0.5, 2, 7] + >>> y_pred = [2.5, 0.0, 2, 8] + >>> root_mean_squared_error(y_true, y_pred) + 0.612... + >>> y_true = [[0.5, 1],[-1, 1],[7, -6]] + >>> y_pred = [[0, 2],[-1, 2],[8, -5]] + >>> root_mean_squared_error(y_true, y_pred) + 0.822... + """ + + xp, _ = get_namespace(y_true, y_pred, sample_weight, multioutput) + + output_errors = xp.sqrt( + mean_squared_error( + y_true, y_pred, sample_weight=sample_weight, multioutput="raw_values" + ) + ) + + if isinstance(multioutput, str): + if multioutput == "raw_values": + return output_errors + elif multioutput == "uniform_average": + # pass None as weights to _average: uniform mean + multioutput = None + + # Average across the outputs (if needed). + # The second call to `_average` should always return + # a scalar array that we convert to a Python float to + # consistently return the same eager evaluated value. + # Therefore, `axis=None`. 
+ root_mean_squared_error = _average(output_errors, weights=multioutput) + + return float(root_mean_squared_error) + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "sample_weight": ["array-like", None], + "multioutput": [StrOptions({"raw_values", "uniform_average"}), "array-like"], + }, + prefer_skip_nested_validation=True, +) +def mean_squared_log_error( + y_true, + y_pred, + *, + sample_weight=None, + multioutput="uniform_average", +): + """Mean squared logarithmic error regression loss. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) or (n_samples, n_outputs) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs) + Estimated target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + multioutput : {'raw_values', 'uniform_average'} or array-like of shape \ + (n_outputs,), default='uniform_average' + + Defines aggregating of multiple output values. + Array-like value defines weights used to average errors. + + 'raw_values' : + Returns a full set of errors when the input is of multioutput + format. + + 'uniform_average' : + Errors of all outputs are averaged with uniform weight. + + Returns + ------- + loss : float or ndarray of floats + A non-negative floating point value (the best value is 0.0), or an + array of floating point values, one for each individual target. + + Examples + -------- + >>> from sklearn.metrics import mean_squared_log_error + >>> y_true = [3, 5, 2.5, 7] + >>> y_pred = [2.5, 5, 4, 8] + >>> mean_squared_log_error(y_true, y_pred) + 0.039... + >>> y_true = [[0.5, 1], [1, 2], [7, 6]] + >>> y_pred = [[0.5, 2], [1, 2.5], [8, 8]] + >>> mean_squared_log_error(y_true, y_pred) + 0.044... + >>> mean_squared_log_error(y_true, y_pred, multioutput='raw_values') + array([0.00462428, 0.08377444]) + >>> mean_squared_log_error(y_true, y_pred, multioutput=[0.3, 0.7]) + 0.060... + """ + xp, _ = get_namespace(y_true, y_pred) + + _, y_true, y_pred, sample_weight, multioutput = ( + _check_reg_targets_with_floating_dtype( + y_true, y_pred, sample_weight, multioutput, xp=xp + ) + ) + + if xp.any(y_true <= -1) or xp.any(y_pred <= -1): + raise ValueError( + "Mean Squared Logarithmic Error cannot be used when " + "targets contain values less than or equal to -1." + ) + + return mean_squared_error( + xp.log1p(y_true), + xp.log1p(y_pred), + sample_weight=sample_weight, + multioutput=multioutput, + ) + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "sample_weight": ["array-like", None], + "multioutput": [StrOptions({"raw_values", "uniform_average"}), "array-like"], + }, + prefer_skip_nested_validation=True, +) +def root_mean_squared_log_error( + y_true, y_pred, *, sample_weight=None, multioutput="uniform_average" +): + """Root mean squared logarithmic error regression loss. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.4 + + Parameters + ---------- + y_true : array-like of shape (n_samples,) or (n_samples, n_outputs) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs) + Estimated target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + multioutput : {'raw_values', 'uniform_average'} or array-like of shape \ + (n_outputs,), default='uniform_average' + + Defines aggregating of multiple output values. 
+ Array-like value defines weights used to average errors. + + 'raw_values' : + Returns a full set of errors when the input is of multioutput + format. + + 'uniform_average' : + Errors of all outputs are averaged with uniform weight. + + Returns + ------- + loss : float or ndarray of floats + A non-negative floating point value (the best value is 0.0), or an + array of floating point values, one for each individual target. + + Examples + -------- + >>> from sklearn.metrics import root_mean_squared_log_error + >>> y_true = [3, 5, 2.5, 7] + >>> y_pred = [2.5, 5, 4, 8] + >>> root_mean_squared_log_error(y_true, y_pred) + 0.199... + """ + xp, _ = get_namespace(y_true, y_pred) + + _, y_true, y_pred, sample_weight, multioutput = ( + _check_reg_targets_with_floating_dtype( + y_true, y_pred, sample_weight, multioutput, xp=xp + ) + ) + + if xp.any(y_true <= -1) or xp.any(y_pred <= -1): + raise ValueError( + "Root Mean Squared Logarithmic Error cannot be used when " + "targets contain values less than or equal to -1." + ) + + return root_mean_squared_error( + xp.log1p(y_true), + xp.log1p(y_pred), + sample_weight=sample_weight, + multioutput=multioutput, + ) + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "multioutput": [StrOptions({"raw_values", "uniform_average"}), "array-like"], + "sample_weight": ["array-like", None], + }, + prefer_skip_nested_validation=True, +) +def median_absolute_error( + y_true, y_pred, *, multioutput="uniform_average", sample_weight=None +): + """Median absolute error regression loss. + + Median absolute error output is non-negative floating point. The best value + is 0.0. Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) or (n_samples, n_outputs) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs) + Estimated target values. + + multioutput : {'raw_values', 'uniform_average'} or array-like of shape \ + (n_outputs,), default='uniform_average' + Defines aggregating of multiple output values. Array-like value defines + weights used to average errors. + + 'raw_values' : + Returns a full set of errors in case of multioutput input. + + 'uniform_average' : + Errors of all outputs are averaged with uniform weight. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + .. versionadded:: 0.24 + + Returns + ------- + loss : float or ndarray of floats + If multioutput is 'raw_values', then mean absolute error is returned + for each output separately. + If multioutput is 'uniform_average' or an ndarray of weights, then the + weighted average of all output errors is returned. + + Examples + -------- + >>> from sklearn.metrics import median_absolute_error + >>> y_true = [3, -0.5, 2, 7] + >>> y_pred = [2.5, 0.0, 2, 8] + >>> median_absolute_error(y_true, y_pred) + 0.5 + >>> y_true = [[0.5, 1], [-1, 1], [7, -6]] + >>> y_pred = [[0, 2], [-1, 2], [8, -5]] + >>> median_absolute_error(y_true, y_pred) + 0.75 + >>> median_absolute_error(y_true, y_pred, multioutput='raw_values') + array([0.5, 1. 
]) + >>> median_absolute_error(y_true, y_pred, multioutput=[0.3, 0.7]) + 0.85 + """ + _, y_true, y_pred, sample_weight, multioutput = _check_reg_targets( + y_true, y_pred, sample_weight, multioutput + ) + if sample_weight is None: + output_errors = np.median(np.abs(y_pred - y_true), axis=0) + else: + output_errors = _weighted_percentile( + np.abs(y_pred - y_true), sample_weight=sample_weight + ) + if isinstance(multioutput, str): + if multioutput == "raw_values": + return output_errors + elif multioutput == "uniform_average": + # pass None as weights to np.average: uniform mean + multioutput = None + + return float(np.average(output_errors, weights=multioutput)) + + +def _assemble_r2_explained_variance( + numerator, denominator, n_outputs, multioutput, force_finite, xp, device +): + """Common part used by explained variance score and :math:`R^2` score.""" + dtype = numerator.dtype + + nonzero_denominator = denominator != 0 + + if not force_finite: + # Standard formula, that may lead to NaN or -Inf + output_scores = 1 - (numerator / denominator) + else: + nonzero_numerator = numerator != 0 + # Default = Zero Numerator = perfect predictions. Set to 1.0 + # (note: even if denominator is zero, thus avoiding NaN scores) + output_scores = xp.ones([n_outputs], device=device, dtype=dtype) + # Non-zero Numerator and Non-zero Denominator: use the formula + valid_score = nonzero_denominator & nonzero_numerator + + output_scores[valid_score] = 1 - ( + numerator[valid_score] / denominator[valid_score] + ) + + # Non-zero Numerator and Zero Denominator: + # arbitrary set to 0.0 to avoid -inf scores + output_scores[nonzero_numerator & ~nonzero_denominator] = 0.0 + + if isinstance(multioutput, str): + if multioutput == "raw_values": + # return scores individually + return output_scores + elif multioutput == "uniform_average": + # pass None as weights to _average: uniform mean + avg_weights = None + elif multioutput == "variance_weighted": + avg_weights = denominator + if not xp.any(nonzero_denominator): + # All weights are zero, _average would raise a ZeroDiv error. + # This only happens when all y are constant (or 1-element long) + # Since weights are all equal, fall back to uniform weights. + avg_weights = None + else: + avg_weights = multioutput + + result = _average(output_scores, weights=avg_weights) + if size(result) == 1: + return float(result) + return result + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "sample_weight": ["array-like", None], + "multioutput": [ + StrOptions({"raw_values", "uniform_average", "variance_weighted"}), + "array-like", + ], + "force_finite": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def explained_variance_score( + y_true, + y_pred, + *, + sample_weight=None, + multioutput="uniform_average", + force_finite=True, +): + """Explained variance regression score function. + + Best possible score is 1.0, lower values are worse. + + In the particular case when ``y_true`` is constant, the explained variance + score is not finite: it is either ``NaN`` (perfect predictions) or + ``-Inf`` (imperfect predictions). To prevent such non-finite numbers to + pollute higher-level experiments such as a grid search cross-validation, + by default these cases are replaced with 1.0 (perfect predictions) or 0.0 + (imperfect predictions) respectively. If ``force_finite`` + is set to ``False``, this score falls back on the original :math:`R^2` + definition. + + .. 
note:: + The Explained Variance score is similar to the + :func:`R^2 score `, with the notable difference that it + does not account for systematic offsets in the prediction. Most often + the :func:`R^2 score ` should be preferred. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) or (n_samples, n_outputs) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs) + Estimated target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + multioutput : {'raw_values', 'uniform_average', 'variance_weighted'} or \ + array-like of shape (n_outputs,), default='uniform_average' + Defines aggregating of multiple output scores. + Array-like value defines weights used to average scores. + + 'raw_values' : + Returns a full set of scores in case of multioutput input. + + 'uniform_average' : + Scores of all outputs are averaged with uniform weight. + + 'variance_weighted' : + Scores of all outputs are averaged, weighted by the variances + of each individual output. + + force_finite : bool, default=True + Flag indicating if ``NaN`` and ``-Inf`` scores resulting from constant + data should be replaced with real numbers (``1.0`` if prediction is + perfect, ``0.0`` otherwise). Default is ``True``, a convenient setting + for hyperparameters' search procedures (e.g. grid search + cross-validation). + + .. versionadded:: 1.1 + + Returns + ------- + score : float or ndarray of floats + The explained variance or ndarray if 'multioutput' is 'raw_values'. + + See Also + -------- + r2_score : + Similar metric, but accounting for systematic offsets in + prediction. + + Notes + ----- + This is not a symmetric function. + + Examples + -------- + >>> from sklearn.metrics import explained_variance_score + >>> y_true = [3, -0.5, 2, 7] + >>> y_pred = [2.5, 0.0, 2, 8] + >>> explained_variance_score(y_true, y_pred) + 0.957... + >>> y_true = [[0.5, 1], [-1, 1], [7, -6]] + >>> y_pred = [[0, 2], [-1, 2], [8, -5]] + >>> explained_variance_score(y_true, y_pred, multioutput='uniform_average') + 0.983... 
+ >>> y_true = [-2, -2, -2] + >>> y_pred = [-2, -2, -2] + >>> explained_variance_score(y_true, y_pred) + 1.0 + >>> explained_variance_score(y_true, y_pred, force_finite=False) + nan + >>> y_true = [-2, -2, -2] + >>> y_pred = [-2, -2, -2 + 1e-8] + >>> explained_variance_score(y_true, y_pred) + 0.0 + >>> explained_variance_score(y_true, y_pred, force_finite=False) + -inf + """ + xp, _, device = get_namespace_and_device(y_true, y_pred, sample_weight, multioutput) + + _, y_true, y_pred, sample_weight, multioutput = ( + _check_reg_targets_with_floating_dtype( + y_true, y_pred, sample_weight, multioutput, xp=xp + ) + ) + + y_diff_avg = _average(y_true - y_pred, weights=sample_weight, axis=0) + numerator = _average( + (y_true - y_pred - y_diff_avg) ** 2, weights=sample_weight, axis=0 + ) + + y_true_avg = _average(y_true, weights=sample_weight, axis=0) + denominator = _average((y_true - y_true_avg) ** 2, weights=sample_weight, axis=0) + + return _assemble_r2_explained_variance( + numerator=numerator, + denominator=denominator, + n_outputs=y_true.shape[1], + multioutput=multioutput, + force_finite=force_finite, + xp=xp, + device=device, + ) + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "sample_weight": ["array-like", None], + "multioutput": [ + StrOptions({"raw_values", "uniform_average", "variance_weighted"}), + "array-like", + None, + ], + "force_finite": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def r2_score( + y_true, + y_pred, + *, + sample_weight=None, + multioutput="uniform_average", + force_finite=True, +): + """:math:`R^2` (coefficient of determination) regression score function. + + Best possible score is 1.0 and it can be negative (because the + model can be arbitrarily worse). In the general case when the true y is + non-constant, a constant model that always predicts the average y + disregarding the input features would get a :math:`R^2` score of 0.0. + + In the particular case when ``y_true`` is constant, the :math:`R^2` score + is not finite: it is either ``NaN`` (perfect predictions) or ``-Inf`` + (imperfect predictions). To prevent such non-finite numbers to pollute + higher-level experiments such as a grid search cross-validation, by default + these cases are replaced with 1.0 (perfect predictions) or 0.0 (imperfect + predictions) respectively. You can set ``force_finite`` to ``False`` to + prevent this fix from happening. + + Note: when the prediction residuals have zero mean, the :math:`R^2` score + is identical to the + :func:`Explained Variance score `. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) or (n_samples, n_outputs) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs) + Estimated target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + multioutput : {'raw_values', 'uniform_average', 'variance_weighted'}, \ + array-like of shape (n_outputs,) or None, default='uniform_average' + + Defines aggregating of multiple output scores. + Array-like value defines weights used to average scores. + Default is "uniform_average". + + 'raw_values' : + Returns a full set of scores in case of multioutput input. + + 'uniform_average' : + Scores of all outputs are averaged with uniform weight. + + 'variance_weighted' : + Scores of all outputs are averaged, weighted by the variances + of each individual output. + + .. 
versionchanged:: 0.19 + Default value of multioutput is 'uniform_average'. + + force_finite : bool, default=True + Flag indicating if ``NaN`` and ``-Inf`` scores resulting from constant + data should be replaced with real numbers (``1.0`` if prediction is + perfect, ``0.0`` otherwise). Default is ``True``, a convenient setting + for hyperparameters' search procedures (e.g. grid search + cross-validation). + + .. versionadded:: 1.1 + + Returns + ------- + z : float or ndarray of floats + The :math:`R^2` score or ndarray of scores if 'multioutput' is + 'raw_values'. + + Notes + ----- + This is not a symmetric function. + + Unlike most other scores, :math:`R^2` score may be negative (it need not + actually be the square of a quantity R). + + This metric is not well-defined for single samples and will return a NaN + value if n_samples is less than two. + + References + ---------- + .. [1] `Wikipedia entry on the Coefficient of determination + `_ + + Examples + -------- + >>> from sklearn.metrics import r2_score + >>> y_true = [3, -0.5, 2, 7] + >>> y_pred = [2.5, 0.0, 2, 8] + >>> r2_score(y_true, y_pred) + 0.948... + >>> y_true = [[0.5, 1], [-1, 1], [7, -6]] + >>> y_pred = [[0, 2], [-1, 2], [8, -5]] + >>> r2_score(y_true, y_pred, + ... multioutput='variance_weighted') + 0.938... + >>> y_true = [1, 2, 3] + >>> y_pred = [1, 2, 3] + >>> r2_score(y_true, y_pred) + 1.0 + >>> y_true = [1, 2, 3] + >>> y_pred = [2, 2, 2] + >>> r2_score(y_true, y_pred) + 0.0 + >>> y_true = [1, 2, 3] + >>> y_pred = [3, 2, 1] + >>> r2_score(y_true, y_pred) + -3.0 + >>> y_true = [-2, -2, -2] + >>> y_pred = [-2, -2, -2] + >>> r2_score(y_true, y_pred) + 1.0 + >>> r2_score(y_true, y_pred, force_finite=False) + nan + >>> y_true = [-2, -2, -2] + >>> y_pred = [-2, -2, -2 + 1e-8] + >>> r2_score(y_true, y_pred) + 0.0 + >>> r2_score(y_true, y_pred, force_finite=False) + -inf + """ + xp, _, device_ = get_namespace_and_device( + y_true, y_pred, sample_weight, multioutput + ) + + _, y_true, y_pred, sample_weight, multioutput = ( + _check_reg_targets_with_floating_dtype( + y_true, y_pred, sample_weight, multioutput, xp=xp + ) + ) + + if _num_samples(y_pred) < 2: + msg = "R^2 score is not well-defined with less than two samples." + warnings.warn(msg, UndefinedMetricWarning) + return float("nan") + + if sample_weight is not None: + sample_weight = column_or_1d(sample_weight) + weight = sample_weight[:, None] + else: + weight = 1.0 + + numerator = xp.sum(weight * (y_true - y_pred) ** 2, axis=0) + denominator = xp.sum( + weight * (y_true - _average(y_true, axis=0, weights=sample_weight, xp=xp)) ** 2, + axis=0, + ) + + return _assemble_r2_explained_variance( + numerator=numerator, + denominator=denominator, + n_outputs=y_true.shape[1], + multioutput=multioutput, + force_finite=force_finite, + xp=xp, + device=device_, + ) + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + }, + prefer_skip_nested_validation=True, +) +def max_error(y_true, y_pred): + """ + The max_error metric calculates the maximum residual error. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples,) + Estimated target values. + + Returns + ------- + max_error : float + A positive floating point value (the best value is 0.0). 
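Picking up the note in the `r2_score` docstring above that the score coincides with `explained_variance_score` when the prediction residuals have zero mean, a small check with toy data chosen so the residuals sum to zero:

import numpy as np
from sklearn.metrics import explained_variance_score, r2_score

y_true = [1, 2, 3, 4]
y_pred = [1.5, 1.5, 3.5, 3.5]   # residuals -0.5, 0.5, -0.5, 0.5 sum to zero
# Both scores evaluate to 0.8 on this data.
assert np.isclose(r2_score(y_true, y_pred),
                  explained_variance_score(y_true, y_pred))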
+ + Examples + -------- + >>> from sklearn.metrics import max_error + >>> y_true = [3, 2, 7, 1] + >>> y_pred = [4, 2, 7, 1] + >>> max_error(y_true, y_pred) + 1.0 + """ + xp, _ = get_namespace(y_true, y_pred) + y_type, y_true, y_pred, _, _ = _check_reg_targets( + y_true, y_pred, sample_weight=None, multioutput=None, xp=xp + ) + if y_type == "continuous-multioutput": + raise ValueError("Multioutput not supported in max_error") + return float(xp.max(xp.abs(y_true - y_pred))) + + +def _mean_tweedie_deviance(y_true, y_pred, sample_weight, power): + """Mean Tweedie deviance regression loss.""" + xp, _, device_ = get_namespace_and_device(y_true, y_pred) + p = power + if p < 0: + # 'Extreme stable', y any real number, y_pred > 0 + dev = 2 * ( + xp.pow( + xp.where(y_true > 0, y_true, 0.0), + 2 - p, + ) + / ((1 - p) * (2 - p)) + - y_true * xp.pow(y_pred, 1 - p) / (1 - p) + + xp.pow(y_pred, 2 - p) / (2 - p) + ) + elif p == 0: + # Normal distribution, y and y_pred any real number + dev = (y_true - y_pred) ** 2 + elif p == 1: + # Poisson distribution + dev = 2 * (xlogy(y_true, y_true / y_pred) - y_true + y_pred) + elif p == 2: + # Gamma distribution + dev = 2 * (xp.log(y_pred / y_true) + y_true / y_pred - 1) + else: + dev = 2 * ( + xp.pow(y_true, 2 - p) / ((1 - p) * (2 - p)) + - y_true * xp.pow(y_pred, 1 - p) / (1 - p) + + xp.pow(y_pred, 2 - p) / (2 - p) + ) + return float(_average(dev, weights=sample_weight)) + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "sample_weight": ["array-like", None], + "power": [ + Interval(Real, None, 0, closed="right"), + Interval(Real, 1, None, closed="left"), + ], + }, + prefer_skip_nested_validation=True, +) +def mean_tweedie_deviance(y_true, y_pred, *, sample_weight=None, power=0): + """Mean Tweedie deviance regression loss. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples,) + Estimated target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + power : float, default=0 + Tweedie power parameter. Either power <= 0 or power >= 1. + + The higher `p` the less weight is given to extreme + deviations between true and predicted targets. + + - power < 0: Extreme stable distribution. Requires: y_pred > 0. + - power = 0 : Normal distribution, output corresponds to + mean_squared_error. y_true and y_pred can be any real numbers. + - power = 1 : Poisson distribution. Requires: y_true >= 0 and + y_pred > 0. + - 1 < p < 2 : Compound Poisson distribution. Requires: y_true >= 0 + and y_pred > 0. + - power = 2 : Gamma distribution. Requires: y_true > 0 and y_pred > 0. + - power = 3 : Inverse Gaussian distribution. Requires: y_true > 0 + and y_pred > 0. + - otherwise : Positive stable distribution. Requires: y_true > 0 + and y_pred > 0. + + Returns + ------- + loss : float + A non-negative floating point value (the best value is 0.0). + + Examples + -------- + >>> from sklearn.metrics import mean_tweedie_deviance + >>> y_true = [2, 0, 1, 4] + >>> y_pred = [0.5, 0.5, 2., 2.] + >>> mean_tweedie_deviance(y_true, y_pred, power=1) + 1.4260... 
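One of the `power` bullet points above states that `power=0` corresponds to the squared error. A minimal sketch confirming that reading (toy, strictly positive data so other powers would also be admissible):

import numpy as np
from sklearn.metrics import mean_squared_error, mean_tweedie_deviance

y_true = [2.0, 1.0, 1.5, 4.0]
y_pred = [0.5, 0.5, 2.0, 2.0]
# With power=0 the Tweedie deviance reduces to the ordinary MSE.
assert np.isclose(mean_tweedie_deviance(y_true, y_pred, power=0),
                  mean_squared_error(y_true, y_pred))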
+ """ + xp, _ = get_namespace(y_true, y_pred) + y_type, y_true, y_pred, sample_weight, _ = _check_reg_targets_with_floating_dtype( + y_true, y_pred, sample_weight, multioutput=None, xp=xp + ) + if y_type == "continuous-multioutput": + raise ValueError("Multioutput not supported in mean_tweedie_deviance") + + if sample_weight is not None: + sample_weight = column_or_1d(sample_weight) + sample_weight = sample_weight[:, np.newaxis] + + message = f"Mean Tweedie deviance error with power={power} can only be used on " + if power < 0: + # 'Extreme stable', y any real number, y_pred > 0 + if xp.any(y_pred <= 0): + raise ValueError(message + "strictly positive y_pred.") + elif power == 0: + # Normal, y and y_pred can be any real number + pass + elif 1 <= power < 2: + # Poisson and compound Poisson distribution, y >= 0, y_pred > 0 + if xp.any(y_true < 0) or xp.any(y_pred <= 0): + raise ValueError(message + "non-negative y and strictly positive y_pred.") + elif power >= 2: + # Gamma and Extreme stable distribution, y and y_pred > 0 + if xp.any(y_true <= 0) or xp.any(y_pred <= 0): + raise ValueError(message + "strictly positive y and y_pred.") + else: # pragma: nocover + # Unreachable statement + raise ValueError + + return _mean_tweedie_deviance( + y_true, y_pred, sample_weight=sample_weight, power=power + ) + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "sample_weight": ["array-like", None], + }, + prefer_skip_nested_validation=True, +) +def mean_poisson_deviance(y_true, y_pred, *, sample_weight=None): + """Mean Poisson deviance regression loss. + + Poisson deviance is equivalent to the Tweedie deviance with + the power parameter `power=1`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + Ground truth (correct) target values. Requires y_true >= 0. + + y_pred : array-like of shape (n_samples,) + Estimated target values. Requires y_pred > 0. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + loss : float + A non-negative floating point value (the best value is 0.0). + + Examples + -------- + >>> from sklearn.metrics import mean_poisson_deviance + >>> y_true = [2, 0, 1, 4] + >>> y_pred = [0.5, 0.5, 2., 2.] + >>> mean_poisson_deviance(y_true, y_pred) + 1.4260... + """ + return mean_tweedie_deviance(y_true, y_pred, sample_weight=sample_weight, power=1) + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "sample_weight": ["array-like", None], + }, + prefer_skip_nested_validation=True, +) +def mean_gamma_deviance(y_true, y_pred, *, sample_weight=None): + """Mean Gamma deviance regression loss. + + Gamma deviance is equivalent to the Tweedie deviance with + the power parameter `power=2`. It is invariant to scaling of + the target variable, and measures relative errors. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + Ground truth (correct) target values. Requires y_true > 0. + + y_pred : array-like of shape (n_samples,) + Estimated target values. Requires y_pred > 0. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + loss : float + A non-negative floating point value (the best value is 0.0). + + Examples + -------- + >>> from sklearn.metrics import mean_gamma_deviance + >>> y_true = [2, 0.5, 1, 4] + >>> y_pred = [0.5, 0.5, 2., 2.] + >>> mean_gamma_deviance(y_true, y_pred) + 1.0568... 
+ """ + return mean_tweedie_deviance(y_true, y_pred, sample_weight=sample_weight, power=2) + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "sample_weight": ["array-like", None], + "power": [ + Interval(Real, None, 0, closed="right"), + Interval(Real, 1, None, closed="left"), + ], + }, + prefer_skip_nested_validation=True, +) +def d2_tweedie_score(y_true, y_pred, *, sample_weight=None, power=0): + """ + :math:`D^2` regression score function, fraction of Tweedie deviance explained. + + Best possible score is 1.0 and it can be negative (because the model can be + arbitrarily worse). A model that always uses the empirical mean of `y_true` as + constant prediction, disregarding the input features, gets a D^2 score of 0.0. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.0 + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples,) + Estimated target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + power : float, default=0 + Tweedie power parameter. Either power <= 0 or power >= 1. + + The higher `p` the less weight is given to extreme + deviations between true and predicted targets. + + - power < 0: Extreme stable distribution. Requires: y_pred > 0. + - power = 0 : Normal distribution, output corresponds to r2_score. + y_true and y_pred can be any real numbers. + - power = 1 : Poisson distribution. Requires: y_true >= 0 and + y_pred > 0. + - 1 < p < 2 : Compound Poisson distribution. Requires: y_true >= 0 + and y_pred > 0. + - power = 2 : Gamma distribution. Requires: y_true > 0 and y_pred > 0. + - power = 3 : Inverse Gaussian distribution. Requires: y_true > 0 + and y_pred > 0. + - otherwise : Positive stable distribution. Requires: y_true > 0 + and y_pred > 0. + + Returns + ------- + z : float + The D^2 score. + + Notes + ----- + This is not a symmetric function. + + Like R^2, D^2 score may be negative (it need not actually be the square of + a quantity D). + + This metric is not well-defined for single samples and will return a NaN + value if n_samples is less than two. + + References + ---------- + .. [1] Eq. (3.11) of Hastie, Trevor J., Robert Tibshirani and Martin J. + Wainwright. "Statistical Learning with Sparsity: The Lasso and + Generalizations." (2015). https://hastie.su.domains/StatLearnSparsity/ + + Examples + -------- + >>> from sklearn.metrics import d2_tweedie_score + >>> y_true = [0.5, 1, 2.5, 7] + >>> y_pred = [1, 1, 5, 3.5] + >>> d2_tweedie_score(y_true, y_pred) + 0.285... + >>> d2_tweedie_score(y_true, y_pred, power=1) + 0.487... + >>> d2_tweedie_score(y_true, y_pred, power=2) + 0.630... + >>> d2_tweedie_score(y_true, y_true, power=2) + 1.0 + """ + xp, _ = get_namespace(y_true, y_pred) + + y_type, y_true, y_pred, sample_weight, _ = _check_reg_targets_with_floating_dtype( + y_true, y_pred, sample_weight, multioutput=None, xp=xp + ) + if y_type == "continuous-multioutput": + raise ValueError("Multioutput not supported in d2_tweedie_score") + + if _num_samples(y_pred) < 2: + msg = "D^2 score is not well-defined with less than two samples." 
+ warnings.warn(msg, UndefinedMetricWarning) + return float("nan") + + y_true, y_pred = xp.squeeze(y_true, axis=1), xp.squeeze(y_pred, axis=1) + numerator = mean_tweedie_deviance( + y_true, y_pred, sample_weight=sample_weight, power=power + ) + + y_avg = _average(y_true, weights=sample_weight, xp=xp) + denominator = _mean_tweedie_deviance( + y_true, y_avg, sample_weight=sample_weight, power=power + ) + + return 1 - numerator / denominator + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "sample_weight": ["array-like", None], + "alpha": [Interval(Real, 0, 1, closed="both")], + "multioutput": [ + StrOptions({"raw_values", "uniform_average"}), + "array-like", + ], + }, + prefer_skip_nested_validation=True, +) +def d2_pinball_score( + y_true, y_pred, *, sample_weight=None, alpha=0.5, multioutput="uniform_average" +): + """ + :math:`D^2` regression score function, fraction of pinball loss explained. + + Best possible score is 1.0 and it can be negative (because the model can be + arbitrarily worse). A model that always uses the empirical alpha-quantile of + `y_true` as constant prediction, disregarding the input features, + gets a :math:`D^2` score of 0.0. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.1 + + Parameters + ---------- + y_true : array-like of shape (n_samples,) or (n_samples, n_outputs) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs) + Estimated target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + alpha : float, default=0.5 + Slope of the pinball deviance. It determines the quantile level alpha + for which the pinball deviance and also D2 are optimal. + The default `alpha=0.5` is equivalent to `d2_absolute_error_score`. + + multioutput : {'raw_values', 'uniform_average'} or array-like of shape \ + (n_outputs,), default='uniform_average' + Defines aggregating of multiple output values. + Array-like value defines weights used to average scores. + + 'raw_values' : + Returns a full set of errors in case of multioutput input. + + 'uniform_average' : + Scores of all outputs are averaged with uniform weight. + + Returns + ------- + score : float or ndarray of floats + The :math:`D^2` score with a pinball deviance + or ndarray of scores if `multioutput='raw_values'`. + + Notes + ----- + Like :math:`R^2`, :math:`D^2` score may be negative + (it need not actually be the square of a quantity D). + + This metric is not well-defined for a single point and will return a NaN + value if n_samples is less than two. + + References + ---------- + .. [1] Eq. (7) of `Koenker, Roger; Machado, José A. F. (1999). + "Goodness of Fit and Related Inference Processes for Quantile Regression" + `_ + .. [2] Eq. (3.11) of Hastie, Trevor J., Robert Tibshirani and Martin J. + Wainwright. "Statistical Learning with Sparsity: The Lasso and + Generalizations." (2015). https://hastie.su.domains/StatLearnSparsity/ + + Examples + -------- + >>> from sklearn.metrics import d2_pinball_score + >>> y_true = [1, 2, 3] + >>> y_pred = [1, 3, 3] + >>> d2_pinball_score(y_true, y_pred) + 0.5 + >>> d2_pinball_score(y_true, y_pred, alpha=0.9) + 0.772... + >>> d2_pinball_score(y_true, y_pred, alpha=0.1) + -1.045... 
+ >>> d2_pinball_score(y_true, y_true, alpha=0.1) + 1.0 + """ + _, y_true, y_pred, sample_weight, multioutput = _check_reg_targets( + y_true, y_pred, sample_weight, multioutput + ) + + if _num_samples(y_pred) < 2: + msg = "D^2 score is not well-defined with less than two samples." + warnings.warn(msg, UndefinedMetricWarning) + return float("nan") + + numerator = mean_pinball_loss( + y_true, + y_pred, + sample_weight=sample_weight, + alpha=alpha, + multioutput="raw_values", + ) + + if sample_weight is None: + y_quantile = np.tile( + np.percentile(y_true, q=alpha * 100, axis=0), (len(y_true), 1) + ) + else: + y_quantile = np.tile( + _weighted_percentile( + y_true, sample_weight=sample_weight, percentile_rank=alpha * 100 + ), + (len(y_true), 1), + ) + + denominator = mean_pinball_loss( + y_true, + y_quantile, + sample_weight=sample_weight, + alpha=alpha, + multioutput="raw_values", + ) + + nonzero_numerator = numerator != 0 + nonzero_denominator = denominator != 0 + valid_score = nonzero_numerator & nonzero_denominator + output_scores = np.ones(y_true.shape[1]) + + output_scores[valid_score] = 1 - (numerator[valid_score] / denominator[valid_score]) + output_scores[nonzero_numerator & ~nonzero_denominator] = 0.0 + + if isinstance(multioutput, str): + if multioutput == "raw_values": + # return scores individually + return output_scores + else: # multioutput == "uniform_average" + # passing None as weights to np.average results in uniform mean + avg_weights = None + else: + avg_weights = multioutput + + return float(np.average(output_scores, weights=avg_weights)) + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "sample_weight": ["array-like", None], + "multioutput": [ + StrOptions({"raw_values", "uniform_average"}), + "array-like", + ], + }, + prefer_skip_nested_validation=True, +) +def d2_absolute_error_score( + y_true, y_pred, *, sample_weight=None, multioutput="uniform_average" +): + """ + :math:`D^2` regression score function, fraction of absolute error explained. + + Best possible score is 1.0 and it can be negative (because the model can be + arbitrarily worse). A model that always uses the empirical median of `y_true` + as constant prediction, disregarding the input features, + gets a :math:`D^2` score of 0.0. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.1 + + Parameters + ---------- + y_true : array-like of shape (n_samples,) or (n_samples, n_outputs) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs) + Estimated target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + multioutput : {'raw_values', 'uniform_average'} or array-like of shape \ + (n_outputs,), default='uniform_average' + Defines aggregating of multiple output values. + Array-like value defines weights used to average scores. + + 'raw_values' : + Returns a full set of errors in case of multioutput input. + + 'uniform_average' : + Scores of all outputs are averaged with uniform weight. + + Returns + ------- + score : float or ndarray of floats + The :math:`D^2` score with an absolute error deviance + or ndarray of scores if 'multioutput' is 'raw_values'. + + Notes + ----- + Like :math:`R^2`, :math:`D^2` score may be negative + (it need not actually be the square of a quantity D). + + This metric is not well-defined for single samples and will return a NaN + value if n_samples is less than two. + + References + ---------- + .. [1] Eq. 
(3.11) of Hastie, Trevor J., Robert Tibshirani and Martin J. + Wainwright. "Statistical Learning with Sparsity: The Lasso and + Generalizations." (2015). https://hastie.su.domains/StatLearnSparsity/ + + Examples + -------- + >>> from sklearn.metrics import d2_absolute_error_score + >>> y_true = [3, -0.5, 2, 7] + >>> y_pred = [2.5, 0.0, 2, 8] + >>> d2_absolute_error_score(y_true, y_pred) + 0.764... + >>> y_true = [[0.5, 1], [-1, 1], [7, -6]] + >>> y_pred = [[0, 2], [-1, 2], [8, -5]] + >>> d2_absolute_error_score(y_true, y_pred, multioutput='uniform_average') + 0.691... + >>> d2_absolute_error_score(y_true, y_pred, multioutput='raw_values') + array([0.8125 , 0.57142857]) + >>> y_true = [1, 2, 3] + >>> y_pred = [1, 2, 3] + >>> d2_absolute_error_score(y_true, y_pred) + 1.0 + >>> y_true = [1, 2, 3] + >>> y_pred = [2, 2, 2] + >>> d2_absolute_error_score(y_true, y_pred) + 0.0 + >>> y_true = [1, 2, 3] + >>> y_pred = [3, 2, 1] + >>> d2_absolute_error_score(y_true, y_pred) + -1.0 + """ + return d2_pinball_score( + y_true, y_pred, sample_weight=sample_weight, alpha=0.5, multioutput=multioutput + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_scorer.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/_scorer.py new file mode 100644 index 0000000000000000000000000000000000000000..08e5a20187de7f5c15985ed337603f442bda9fec --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_scorer.py @@ -0,0 +1,1166 @@ +""" +The :mod:`sklearn.metrics.scorer` submodule implements a flexible +interface for model selection and evaluation using +arbitrary score functions. + +A scorer object is a callable that can be passed to +:class:`~sklearn.model_selection.GridSearchCV` or +:func:`sklearn.model_selection.cross_val_score` as the ``scoring`` +parameter, to specify how a model should be evaluated. + +The signature of the call is ``(estimator, X, y)`` where ``estimator`` +is the model to be evaluated, ``X`` is the test data and ``y`` is the +ground truth labeling (or ``None`` in the case of unsupervised models). +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import copy +import warnings +from collections import Counter +from functools import partial +from inspect import signature +from numbers import Integral +from traceback import format_exc + +import numpy as np + +from ..base import is_regressor +from ..utils import Bunch +from ..utils._param_validation import HasMethods, Hidden, StrOptions, validate_params +from ..utils._response import _get_response_values +from ..utils.metadata_routing import ( + MetadataRequest, + MetadataRouter, + MethodMapping, + _MetadataRequester, + _raise_for_params, + _routing_enabled, + get_routing_for_object, + process_routing, +) +from ..utils.validation import _check_response_method +from . 
import ( + accuracy_score, + average_precision_score, + balanced_accuracy_score, + brier_score_loss, + class_likelihood_ratios, + d2_absolute_error_score, + explained_variance_score, + f1_score, + jaccard_score, + log_loss, + matthews_corrcoef, + max_error, + mean_absolute_error, + mean_absolute_percentage_error, + mean_gamma_deviance, + mean_poisson_deviance, + mean_squared_error, + mean_squared_log_error, + median_absolute_error, + precision_score, + r2_score, + recall_score, + roc_auc_score, + root_mean_squared_error, + root_mean_squared_log_error, + top_k_accuracy_score, +) +from .cluster import ( + adjusted_mutual_info_score, + adjusted_rand_score, + completeness_score, + fowlkes_mallows_score, + homogeneity_score, + mutual_info_score, + normalized_mutual_info_score, + rand_score, + v_measure_score, +) + + +def _cached_call(cache, estimator, response_method, *args, **kwargs): + """Call estimator with method and args and kwargs.""" + if cache is not None and response_method in cache: + return cache[response_method] + + result, _ = _get_response_values( + estimator, *args, response_method=response_method, **kwargs + ) + + if cache is not None: + cache[response_method] = result + + return result + + +class _MultimetricScorer: + """Callable for multimetric scoring used to avoid repeated calls + to `predict_proba`, `predict`, and `decision_function`. + + `_MultimetricScorer` will return a dictionary of scores corresponding to + the scorers in the dictionary. Note that `_MultimetricScorer` can be + created with a dictionary with one key (i.e. only one actual scorer). + + Parameters + ---------- + scorers : dict + Dictionary mapping names to callable scorers. + + raise_exc : bool, default=True + Whether to raise the exception in `__call__` or not. If set to `False` + a formatted string of the exception details is passed as result of + the failing scorer. + """ + + def __init__(self, *, scorers, raise_exc=True): + self._scorers = scorers + self._raise_exc = raise_exc + + def __call__(self, estimator, *args, **kwargs): + """Evaluate predicted target values.""" + scores = {} + cache = {} if self._use_cache(estimator) else None + cached_call = partial(_cached_call, cache) + + if _routing_enabled(): + routed_params = process_routing(self, "score", **kwargs) + else: + # Scorers all get the same args, and get all of them except sample_weight. + # Only the ones having `sample_weight` in their signature will receive it. + # This does not work for metadata other than sample_weight, and for those + # users have to enable metadata routing. 
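The `_MultimetricScorer` path above is what users reach indirectly when they pass several metrics to a model-selection helper. A minimal sketch of that public route (the dataset, estimator and metric names here are illustrative, not taken from this diff)::

    from sklearn.datasets import load_iris
    from sklearn.model_selection import cross_validate
    from sklearn.tree import DecisionTreeClassifier

    X, y = load_iris(return_X_y=True)
    results = cross_validate(
        DecisionTreeClassifier(random_state=0),
        X,
        y,
        scoring={"acc": "accuracy", "bal_acc": "balanced_accuracy"},
        cv=3,
    )
    # result keys include fit_time, score_time, test_acc, test_bal_acc
    print(sorted(results))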
+ common_kwargs = { + arg: value for arg, value in kwargs.items() if arg != "sample_weight" + } + routed_params = Bunch( + **{name: Bunch(score=common_kwargs.copy()) for name in self._scorers} + ) + if "sample_weight" in kwargs: + for name, scorer in self._scorers.items(): + if scorer._accept_sample_weight(): + routed_params[name].score["sample_weight"] = kwargs[ + "sample_weight" + ] + + for name, scorer in self._scorers.items(): + try: + if isinstance(scorer, _BaseScorer): + score = scorer._score( + cached_call, estimator, *args, **routed_params.get(name).score + ) + else: + score = scorer(estimator, *args, **routed_params.get(name).score) + scores[name] = score + except Exception as e: + if self._raise_exc: + raise e + else: + scores[name] = format_exc() + return scores + + def __repr__(self): + scorers = ", ".join([f'"{s}"' for s in self._scorers]) + return f"MultiMetricScorer({scorers})" + + def _accept_sample_weight(self): + # TODO(slep006): remove when metadata routing is the only way + return any(scorer._accept_sample_weight() for scorer in self._scorers.values()) + + def _use_cache(self, estimator): + """Return True if using a cache is beneficial, thus when a response method will + be called several time. + """ + if len(self._scorers) == 1: # Only one scorer + return False + + counter = Counter( + [ + _check_response_method(estimator, scorer._response_method).__name__ + for scorer in self._scorers.values() + if isinstance(scorer, _BaseScorer) + ] + ) + if any(val > 1 for val in counter.values()): + # The exact same response method or iterable of response methods + # will be called more than once. + return True + + return False + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.3 + + Returns + ------- + routing : MetadataRouter + A :class:`~utils.metadata_routing.MetadataRouter` encapsulating + routing information. + """ + return MetadataRouter(owner=self.__class__.__name__).add( + **self._scorers, + method_mapping=MethodMapping().add(caller="score", callee="score"), + ) + + +class _BaseScorer(_MetadataRequester): + """Base scorer that is used as `scorer(estimator, X, y_true)`. + + Parameters + ---------- + score_func : callable + The score function to use. It will be called as + `score_func(y_true, y_pred, **kwargs)`. + + sign : int + Either 1 or -1 to returns the score with `sign * score_func(estimator, X, y)`. + Thus, `sign` defined if higher scores are better or worse. + + kwargs : dict + Additional parameters to pass to the score function. + + response_method : str + The method to call on the estimator to get the response values. 
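For context on how a `_BaseScorer`-style object behaves once built, a small sketch using the public `make_scorer` entry point; the classifier and data are placeholders chosen only for illustration::

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import fbeta_score, make_scorer

    X, y = make_classification(random_state=0)
    clf = LogisticRegression(max_iter=1000).fit(X, y)
    ftwo_scorer = make_scorer(fbeta_score, beta=2)
    # the scorer is called as scorer(estimator, X, y); it calls clf.predict(X)
    # internally and applies fbeta_score to the predictions
    print(ftwo_scorer(clf, X, y))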
+ """ + + def __init__(self, score_func, sign, kwargs, response_method="predict"): + self._score_func = score_func + self._sign = sign + self._kwargs = kwargs + self._response_method = response_method + # TODO (1.8): remove in 1.8 (scoring="max_error" has been deprecated in 1.6) + self._deprecation_msg = None + + def _get_pos_label(self): + if "pos_label" in self._kwargs: + return self._kwargs["pos_label"] + score_func_params = signature(self._score_func).parameters + if "pos_label" in score_func_params: + return score_func_params["pos_label"].default + return None + + def _accept_sample_weight(self): + # TODO(slep006): remove when metadata routing is the only way + return "sample_weight" in signature(self._score_func).parameters + + def __repr__(self): + sign_string = "" if self._sign > 0 else ", greater_is_better=False" + response_method_string = f", response_method={self._response_method!r}" + kwargs_string = "".join([f", {k}={v}" for k, v in self._kwargs.items()]) + + return ( + f"make_scorer({self._score_func.__name__}{sign_string}" + f"{response_method_string}{kwargs_string})" + ) + + def __call__(self, estimator, X, y_true, sample_weight=None, **kwargs): + """Evaluate predicted target values for X relative to y_true. + + Parameters + ---------- + estimator : object + Trained estimator to use for scoring. Must have a predict_proba + method; the output of that is used to compute the score. + + X : {array-like, sparse matrix} + Test data that will be fed to estimator.predict. + + y_true : array-like + Gold standard target values for X. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + **kwargs : dict + Other parameters passed to the scorer. Refer to + :func:`set_score_request` for more details. + + Only available if `enable_metadata_routing=True`. See the + :ref:`User Guide `. + + .. versionadded:: 1.3 + + Returns + ------- + score : float + Score function applied to prediction of estimator on X. + """ + # TODO (1.8): remove in 1.8 (scoring="max_error" has been deprecated in 1.6) + if self._deprecation_msg is not None: + warnings.warn( + self._deprecation_msg, category=DeprecationWarning, stacklevel=2 + ) + + _raise_for_params(kwargs, self, None) + + _kwargs = copy.deepcopy(kwargs) + if sample_weight is not None: + _kwargs["sample_weight"] = sample_weight + + return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs) + + def _warn_overlap(self, message, kwargs): + """Warn if there is any overlap between ``self._kwargs`` and ``kwargs``. + + This method is intended to be used to check for overlap between + ``self._kwargs`` and ``kwargs`` passed as metadata. + """ + _kwargs = set() if self._kwargs is None else set(self._kwargs.keys()) + overlap = _kwargs.intersection(kwargs.keys()) + if overlap: + warnings.warn( + f"{message} Overlapping parameters are: {overlap}", UserWarning + ) + + def set_score_request(self, **kwargs): + """Set requested parameters by the scorer. + + Please see :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.3 + + Parameters + ---------- + kwargs : dict + Arguments should be of the form ``param_name=alias``, and `alias` + can be one of ``{True, False, None, str}``. + """ + if not _routing_enabled(): + raise RuntimeError( + "This method is only available when metadata routing is enabled." + " You can enable it using" + " sklearn.set_config(enable_metadata_routing=True)." 
+ ) + + self._warn_overlap( + message=( + "You are setting metadata request for parameters which are " + "already set as kwargs for this metric. These set values will be " + "overridden by passed metadata if provided. Please pass them either " + "as metadata or kwargs to `make_scorer`." + ), + kwargs=kwargs, + ) + self._metadata_request = MetadataRequest(owner=self.__class__.__name__) + for param, alias in kwargs.items(): + self._metadata_request.score.add_request(param=param, alias=alias) + return self + + +class _Scorer(_BaseScorer): + def _score(self, method_caller, estimator, X, y_true, **kwargs): + """Evaluate the response method of `estimator` on `X` and `y_true`. + + Parameters + ---------- + method_caller : callable + Returns predictions given an estimator, method name, and other + arguments, potentially caching results. + + estimator : object + Trained estimator to use for scoring. + + X : {array-like, sparse matrix} + Test data that will be fed to clf.decision_function or + clf.predict_proba. + + y_true : array-like + Gold standard target values for X. These must be class labels, + not decision function values. + + **kwargs : dict + Other parameters passed to the scorer. Refer to + :func:`set_score_request` for more details. + + Returns + ------- + score : float + Score function applied to prediction of estimator on X. + """ + self._warn_overlap( + message=( + "There is an overlap between set kwargs of this scorer instance and" + " passed metadata. Please pass them either as kwargs to `make_scorer`" + " or metadata, but not both." + ), + kwargs=kwargs, + ) + + pos_label = None if is_regressor(estimator) else self._get_pos_label() + response_method = _check_response_method(estimator, self._response_method) + y_pred = method_caller( + estimator, + _get_response_method_name(response_method), + X, + pos_label=pos_label, + ) + + scoring_kwargs = {**self._kwargs, **kwargs} + return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs) + + +@validate_params( + { + "scoring": [str, callable, None], + }, + prefer_skip_nested_validation=True, +) +def get_scorer(scoring): + """Get a scorer from string. + + Read more in the :ref:`User Guide `. + :func:`~sklearn.metrics.get_scorer_names` can be used to retrieve the names + of all available scorers. + + Parameters + ---------- + scoring : str, callable or None + Scoring method as string. If callable it is returned as is. + If None, returns None. + + Returns + ------- + scorer : callable + The scorer. + + Notes + ----- + When passed a string, this function always returns a copy of the scorer + object. Calling `get_scorer` twice for the same scorer results in two + separate scorer objects. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.dummy import DummyClassifier + >>> from sklearn.metrics import get_scorer + >>> X = np.reshape([0, 1, -1, -0.5, 2], (-1, 1)) + >>> y = np.array([0, 1, 1, 0, 1]) + >>> classifier = DummyClassifier(strategy="constant", constant=0).fit(X, y) + >>> accuracy = get_scorer("accuracy") + >>> accuracy(classifier, X, y) + 0.4 + """ + if isinstance(scoring, str): + try: + if scoring == "max_error": + # TODO (1.8): scoring="max_error" has been deprecated in 1.6, + # remove in 1.8 + scorer = max_error_scorer + else: + scorer = copy.deepcopy(_SCORERS[scoring]) + except KeyError: + raise ValueError( + "%r is not a valid scoring value. " + "Use sklearn.metrics.get_scorer_names() " + "to get valid options." 
% scoring + ) + else: + scorer = scoring + return scorer + + +class _PassthroughScorer(_MetadataRequester): + # Passes scoring of estimator's `score` method back to estimator if scoring + # is `None`. + + def __init__(self, estimator): + self._estimator = estimator + + requests = MetadataRequest(owner=self.__class__.__name__) + try: + requests.score = copy.deepcopy(estimator._metadata_request.score) + except AttributeError: + try: + requests.score = copy.deepcopy(estimator._get_default_requests().score) + except AttributeError: + pass + + self._metadata_request = requests + + def __call__(self, estimator, *args, **kwargs): + """Method that wraps estimator.score""" + return estimator.score(*args, **kwargs) + + def __repr__(self): + return f"{self._estimator.__class__}.score" + + def _accept_sample_weight(self): + # TODO(slep006): remove when metadata routing is the only way + return "sample_weight" in signature(self._estimator.score).parameters + + def get_metadata_routing(self): + """Get requested data properties. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.3 + + Returns + ------- + routing : MetadataRouter + A :class:`~utils.metadata_routing.MetadataRouter` encapsulating + routing information. + """ + return get_routing_for_object(self._metadata_request) + + def set_score_request(self, **kwargs): + """Set requested parameters by the scorer. + + Please see :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.5 + + Parameters + ---------- + kwargs : dict + Arguments should be of the form ``param_name=alias``, and `alias` + can be one of ``{True, False, None, str}``. + """ + if not _routing_enabled(): + raise RuntimeError( + "This method is only available when metadata routing is enabled." + " You can enable it using" + " sklearn.set_config(enable_metadata_routing=True)." + ) + + for param, alias in kwargs.items(): + self._metadata_request.score.add_request(param=param, alias=alias) + return self + + +def _check_multimetric_scoring(estimator, scoring): + """Check the scoring parameter in cases when multiple metrics are allowed. + + In addition, multimetric scoring leverages a caching mechanism to not call the same + estimator response method multiple times. Hence, the scorer is modified to only use + a single response method given a list of response methods and the estimator. + + Parameters + ---------- + estimator : sklearn estimator instance + The estimator for which the scoring will be applied. + + scoring : list, tuple or dict + Strategy to evaluate the performance of the cross-validated model on + the test set. + + The possibilities are: + + - a list or tuple of unique strings; + - a callable returning a dictionary where they keys are the metric + names and the values are the metric scores; + - a dictionary with metric names as keys and callables a values. + + See :ref:`multimetric_grid_search` for an example. + + Returns + ------- + scorers_dict : dict + A dict mapping each scorer name to its validated scorer. + """ + err_msg_generic = ( + f"scoring is invalid (got {scoring!r}). Refer to the " + "scoring glossary for details: " + "https://scikit-learn.org/stable/glossary.html#term-scoring" + ) + + if isinstance(scoring, (list, tuple, set)): + err_msg = ( + "The list/tuple elements must be unique strings of predefined scorers. 
" + ) + try: + keys = set(scoring) + except TypeError as e: + raise ValueError(err_msg) from e + + if len(keys) != len(scoring): + raise ValueError( + f"{err_msg} Duplicate elements were found in" + f" the given list. {scoring!r}" + ) + elif len(keys) > 0: + if not all(isinstance(k, str) for k in keys): + if any(callable(k) for k in keys): + raise ValueError( + f"{err_msg} One or more of the elements " + "were callables. Use a dict of score " + "name mapped to the scorer callable. " + f"Got {scoring!r}" + ) + else: + raise ValueError( + f"{err_msg} Non-string types were found " + f"in the given list. Got {scoring!r}" + ) + scorers = { + scorer: check_scoring(estimator, scoring=scorer) for scorer in scoring + } + else: + raise ValueError(f"{err_msg} Empty list was given. {scoring!r}") + + elif isinstance(scoring, dict): + keys = set(scoring) + if not all(isinstance(k, str) for k in keys): + raise ValueError( + "Non-string types were found in the keys of " + f"the given dict. scoring={scoring!r}" + ) + if len(keys) == 0: + raise ValueError(f"An empty dict was passed. {scoring!r}") + scorers = { + key: check_scoring(estimator, scoring=scorer) + for key, scorer in scoring.items() + } + else: + raise ValueError(err_msg_generic) + + return scorers + + +def _get_response_method_name(response_method): + try: + return response_method.__name__ + except AttributeError: + return _get_response_method_name(response_method.func) + + +@validate_params( + { + "score_func": [callable], + "response_method": [ + None, + list, + tuple, + StrOptions({"predict", "predict_proba", "decision_function"}), + Hidden(StrOptions({"default"})), + ], + "greater_is_better": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def make_scorer( + score_func, *, response_method="default", greater_is_better=True, **kwargs +): + """Make a scorer from a performance metric or loss function. + + A scorer is a wrapper around an arbitrary metric or loss function that is called + with the signature `scorer(estimator, X, y_true, **kwargs)`. + + It is accepted in all scikit-learn estimators or functions allowing a `scoring` + parameter. + + The parameter `response_method` allows to specify which method of the estimator + should be used to feed the scoring/loss function. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + score_func : callable + Score function (or loss function) with signature + ``score_func(y, y_pred, **kwargs)``. + + response_method : {"predict_proba", "decision_function", "predict"} or \ + list/tuple of such str, default=None + + Specifies the response method to use get prediction from an estimator + (i.e. :term:`predict_proba`, :term:`decision_function` or + :term:`predict`). Possible choices are: + + - if `str`, it corresponds to the name to the method to return; + - if a list or tuple of `str`, it provides the method names in order of + preference. The method returned corresponds to the first method in + the list and which is implemented by `estimator`. + - if `None`, it is equivalent to `"predict"`. + + .. versionadded:: 1.4 + + .. deprecated:: 1.6 + None is equivalent to 'predict' and is deprecated. It will be removed in + version 1.8. + + greater_is_better : bool, default=True + Whether `score_func` is a score function (default), meaning high is + good, or a loss function, meaning low is good. In the latter case, the + scorer object will sign-flip the outcome of the `score_func`. + + **kwargs : additional arguments + Additional parameters to be passed to `score_func`. 
+ + Returns + ------- + scorer : callable + Callable object that returns a scalar score; greater is better. + + Examples + -------- + >>> from sklearn.metrics import fbeta_score, make_scorer + >>> ftwo_scorer = make_scorer(fbeta_score, beta=2) + >>> ftwo_scorer + make_scorer(fbeta_score, response_method='predict', beta=2) + >>> from sklearn.model_selection import GridSearchCV + >>> from sklearn.svm import LinearSVC + >>> grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, + ... scoring=ftwo_scorer) + """ + sign = 1 if greater_is_better else -1 + + if response_method is None: + warnings.warn( + "response_method=None is deprecated in version 1.6 and will be removed " + "in version 1.8. Leave it to its default value to avoid this warning.", + FutureWarning, + ) + response_method = "predict" + elif response_method == "default": + response_method = "predict" + + return _Scorer(score_func, sign, kwargs, response_method) + + +# Standard regression scores +explained_variance_scorer = make_scorer(explained_variance_score) +r2_scorer = make_scorer(r2_score) +neg_max_error_scorer = make_scorer(max_error, greater_is_better=False) +max_error_scorer = make_scorer(max_error, greater_is_better=False) +# TODO (1.8): remove in 1.8 (scoring="max_error" has been deprecated in 1.6) +deprecation_msg = ( + "Scoring method max_error was renamed to " + "neg_max_error in version 1.6 and will " + "be removed in 1.8." +) +max_error_scorer._deprecation_msg = deprecation_msg +neg_mean_squared_error_scorer = make_scorer(mean_squared_error, greater_is_better=False) +neg_mean_squared_log_error_scorer = make_scorer( + mean_squared_log_error, greater_is_better=False +) +neg_mean_absolute_error_scorer = make_scorer( + mean_absolute_error, greater_is_better=False +) +neg_mean_absolute_percentage_error_scorer = make_scorer( + mean_absolute_percentage_error, greater_is_better=False +) +neg_median_absolute_error_scorer = make_scorer( + median_absolute_error, greater_is_better=False +) +neg_root_mean_squared_error_scorer = make_scorer( + root_mean_squared_error, greater_is_better=False +) +neg_root_mean_squared_log_error_scorer = make_scorer( + root_mean_squared_log_error, greater_is_better=False +) +neg_mean_poisson_deviance_scorer = make_scorer( + mean_poisson_deviance, greater_is_better=False +) + +neg_mean_gamma_deviance_scorer = make_scorer( + mean_gamma_deviance, greater_is_better=False +) +d2_absolute_error_scorer = make_scorer(d2_absolute_error_score) + +# Standard Classification Scores +accuracy_scorer = make_scorer(accuracy_score) +balanced_accuracy_scorer = make_scorer(balanced_accuracy_score) +matthews_corrcoef_scorer = make_scorer(matthews_corrcoef) + + +def positive_likelihood_ratio(y_true, y_pred): + return class_likelihood_ratios(y_true, y_pred, replace_undefined_by=1.0)[0] + + +def negative_likelihood_ratio(y_true, y_pred): + return class_likelihood_ratios(y_true, y_pred, replace_undefined_by=1.0)[1] + + +positive_likelihood_ratio_scorer = make_scorer(positive_likelihood_ratio) +neg_negative_likelihood_ratio_scorer = make_scorer( + negative_likelihood_ratio, greater_is_better=False +) + +# Score functions that need decision values +top_k_accuracy_scorer = make_scorer( + top_k_accuracy_score, + greater_is_better=True, + response_method=("decision_function", "predict_proba"), +) +roc_auc_scorer = make_scorer( + roc_auc_score, + greater_is_better=True, + response_method=("decision_function", "predict_proba"), +) +average_precision_scorer = make_scorer( + average_precision_score, + 
response_method=("decision_function", "predict_proba"), +) +roc_auc_ovo_scorer = make_scorer( + roc_auc_score, response_method="predict_proba", multi_class="ovo" +) +roc_auc_ovo_weighted_scorer = make_scorer( + roc_auc_score, + response_method="predict_proba", + multi_class="ovo", + average="weighted", +) +roc_auc_ovr_scorer = make_scorer( + roc_auc_score, response_method="predict_proba", multi_class="ovr" +) +roc_auc_ovr_weighted_scorer = make_scorer( + roc_auc_score, + response_method="predict_proba", + multi_class="ovr", + average="weighted", +) + +# Score function for probabilistic classification +neg_log_loss_scorer = make_scorer( + log_loss, greater_is_better=False, response_method="predict_proba" +) +neg_brier_score_scorer = make_scorer( + brier_score_loss, greater_is_better=False, response_method="predict_proba" +) +brier_score_loss_scorer = make_scorer( + brier_score_loss, greater_is_better=False, response_method="predict_proba" +) + + +# Clustering scores +adjusted_rand_scorer = make_scorer(adjusted_rand_score) +rand_scorer = make_scorer(rand_score) +homogeneity_scorer = make_scorer(homogeneity_score) +completeness_scorer = make_scorer(completeness_score) +v_measure_scorer = make_scorer(v_measure_score) +mutual_info_scorer = make_scorer(mutual_info_score) +adjusted_mutual_info_scorer = make_scorer(adjusted_mutual_info_score) +normalized_mutual_info_scorer = make_scorer(normalized_mutual_info_score) +fowlkes_mallows_scorer = make_scorer(fowlkes_mallows_score) + + +_SCORERS = dict( + explained_variance=explained_variance_scorer, + r2=r2_scorer, + neg_max_error=neg_max_error_scorer, + matthews_corrcoef=matthews_corrcoef_scorer, + neg_median_absolute_error=neg_median_absolute_error_scorer, + neg_mean_absolute_error=neg_mean_absolute_error_scorer, + neg_mean_absolute_percentage_error=neg_mean_absolute_percentage_error_scorer, + neg_mean_squared_error=neg_mean_squared_error_scorer, + neg_mean_squared_log_error=neg_mean_squared_log_error_scorer, + neg_root_mean_squared_error=neg_root_mean_squared_error_scorer, + neg_root_mean_squared_log_error=neg_root_mean_squared_log_error_scorer, + neg_mean_poisson_deviance=neg_mean_poisson_deviance_scorer, + neg_mean_gamma_deviance=neg_mean_gamma_deviance_scorer, + d2_absolute_error_score=d2_absolute_error_scorer, + accuracy=accuracy_scorer, + top_k_accuracy=top_k_accuracy_scorer, + roc_auc=roc_auc_scorer, + roc_auc_ovr=roc_auc_ovr_scorer, + roc_auc_ovo=roc_auc_ovo_scorer, + roc_auc_ovr_weighted=roc_auc_ovr_weighted_scorer, + roc_auc_ovo_weighted=roc_auc_ovo_weighted_scorer, + balanced_accuracy=balanced_accuracy_scorer, + average_precision=average_precision_scorer, + neg_log_loss=neg_log_loss_scorer, + neg_brier_score=neg_brier_score_scorer, + positive_likelihood_ratio=positive_likelihood_ratio_scorer, + neg_negative_likelihood_ratio=neg_negative_likelihood_ratio_scorer, + # Cluster metrics that use supervised evaluation + adjusted_rand_score=adjusted_rand_scorer, + rand_score=rand_scorer, + homogeneity_score=homogeneity_scorer, + completeness_score=completeness_scorer, + v_measure_score=v_measure_scorer, + mutual_info_score=mutual_info_scorer, + adjusted_mutual_info_score=adjusted_mutual_info_scorer, + normalized_mutual_info_score=normalized_mutual_info_scorer, + fowlkes_mallows_score=fowlkes_mallows_scorer, +) + + +def get_scorer_names(): + """Get the names of all available scorers. + + These names can be passed to :func:`~sklearn.metrics.get_scorer` to + retrieve the scorer object. 
+ + Returns + ------- + list of str + Names of all available scorers. + + Examples + -------- + >>> from sklearn.metrics import get_scorer_names + >>> all_scorers = get_scorer_names() + >>> type(all_scorers) + + >>> all_scorers[:3] + ['accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score'] + >>> "roc_auc" in all_scorers + True + """ + return sorted(_SCORERS.keys()) + + +for name, metric in [ + ("precision", precision_score), + ("recall", recall_score), + ("f1", f1_score), + ("jaccard", jaccard_score), +]: + _SCORERS[name] = make_scorer(metric, average="binary") + for average in ["macro", "micro", "samples", "weighted"]: + qualified_name = "{0}_{1}".format(name, average) + _SCORERS[qualified_name] = make_scorer(metric, pos_label=None, average=average) + + +@validate_params( + { + "estimator": [HasMethods("fit"), None], + "scoring": [ + StrOptions(set(get_scorer_names())), + callable, + list, + set, + tuple, + dict, + None, + ], + "allow_none": ["boolean"], + "raise_exc": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def check_scoring(estimator=None, scoring=None, *, allow_none=False, raise_exc=True): + """Determine scorer from user options. + + A TypeError will be thrown if the estimator cannot be scored. + + Parameters + ---------- + estimator : estimator object implementing 'fit' or None, default=None + The object to use to fit the data. If `None`, then this function may error + depending on `allow_none`. + + scoring : str, callable, list, tuple, set, or dict, default=None + Scorer to use. If `scoring` represents a single score, one can use: + + - a single string (see :ref:`scoring_string_names`); + - a callable (see :ref:`scoring_callable`) that returns a single value; + - `None`, the `estimator`'s + :ref:`default evaluation criterion ` is used. + + If `scoring` represents multiple scores, one can use: + + - a list, tuple or set of unique strings; + - a callable returning a dictionary where the keys are the metric names and the + values are the metric scorers; + - a dictionary with metric names as keys and callables a values. The callables + need to have the signature `callable(estimator, X, y)`. + + allow_none : bool, default=False + Whether to return None or raise an error if no `scoring` is specified and the + estimator has no `score` method. + + raise_exc : bool, default=True + Whether to raise an exception (if a subset of the scorers in multimetric scoring + fails) or to return an error code. + + - If set to `True`, raises the failing scorer's exception. + - If set to `False`, a formatted string of the exception details is passed as + result of the failing scorer(s). + + This applies if `scoring` is list, tuple, set, or dict. Ignored if `scoring` is + a str or a callable. + + .. versionadded:: 1.6 + + Returns + ------- + scoring : callable + A scorer callable object / function with signature ``scorer(estimator, X, y)``. + + Examples + -------- + >>> from sklearn.datasets import load_iris + >>> from sklearn.metrics import check_scoring + >>> from sklearn.tree import DecisionTreeClassifier + >>> X, y = load_iris(return_X_y=True) + >>> classifier = DecisionTreeClassifier(max_depth=2).fit(X, y) + >>> scorer = check_scoring(classifier, scoring='accuracy') + >>> scorer(classifier, X, y) + 0.96... + + >>> from sklearn.metrics import make_scorer, accuracy_score, mean_squared_log_error + >>> X, y = load_iris(return_X_y=True) + >>> y *= -1 + >>> clf = DecisionTreeClassifier().fit(X, y) + >>> scoring = { + ... "accuracy": make_scorer(accuracy_score), + ... 
"mean_squared_log_error": make_scorer(mean_squared_log_error), + ... } + >>> scoring_call = check_scoring(estimator=clf, scoring=scoring, raise_exc=False) + >>> scores = scoring_call(clf, X, y) + >>> scores + {'accuracy': 1.0, 'mean_squared_log_error': 'Traceback ...'} + """ + if isinstance(scoring, str): + return get_scorer(scoring) + if callable(scoring): + # Heuristic to ensure user has not passed a metric + module = getattr(scoring, "__module__", None) + if ( + hasattr(module, "startswith") + and module.startswith("sklearn.metrics.") + and not module.startswith("sklearn.metrics._scorer") + and not module.startswith("sklearn.metrics.tests.") + ): + raise ValueError( + "scoring value %r looks like it is a metric " + "function rather than a scorer. A scorer should " + "require an estimator as its first parameter. " + "Please use `make_scorer` to convert a metric " + "to a scorer." % scoring + ) + return get_scorer(scoring) + if isinstance(scoring, (list, tuple, set, dict)): + scorers = _check_multimetric_scoring(estimator, scoring=scoring) + return _MultimetricScorer(scorers=scorers, raise_exc=raise_exc) + if scoring is None: + if hasattr(estimator, "score"): + return _PassthroughScorer(estimator) + elif allow_none: + return None + else: + raise TypeError( + "If no scoring is specified, the estimator passed should " + "have a 'score' method. The estimator %r does not." % estimator + ) + + +def _threshold_scores_to_class_labels(y_score, threshold, classes, pos_label): + """Threshold `y_score` and return the associated class labels.""" + if pos_label is None: + map_thresholded_score_to_label = np.array([0, 1]) + else: + pos_label_idx = np.flatnonzero(classes == pos_label)[0] + neg_label_idx = np.flatnonzero(classes != pos_label)[0] + map_thresholded_score_to_label = np.array([neg_label_idx, pos_label_idx]) + + return classes[map_thresholded_score_to_label[(y_score >= threshold).astype(int)]] + + +class _CurveScorer(_BaseScorer): + """Scorer taking a continuous response and output a score for each threshold. + + Parameters + ---------- + score_func : callable + The score function to use. It will be called as + `score_func(y_true, y_pred, **kwargs)`. + + sign : int + Either 1 or -1 to returns the score with `sign * score_func(estimator, X, y)`. + Thus, `sign` defined if higher scores are better or worse. + + kwargs : dict + Additional parameters to pass to the score function. + + thresholds : int or array-like + Related to the number of decision thresholds for which we want to compute the + score. If an integer, it will be used to generate `thresholds` thresholds + uniformly distributed between the minimum and maximum predicted scores. If an + array-like, it will be used as the thresholds. + + response_method : str + The method to call on the estimator to get the response values. 
+ """ + + def __init__(self, score_func, sign, kwargs, thresholds, response_method): + super().__init__( + score_func=score_func, + sign=sign, + kwargs=kwargs, + response_method=response_method, + ) + self._thresholds = thresholds + + @classmethod + def from_scorer(cls, scorer, response_method, thresholds): + """Create a continuous scorer from a normal scorer.""" + instance = cls( + score_func=scorer._score_func, + sign=scorer._sign, + response_method=response_method, + thresholds=thresholds, + kwargs=scorer._kwargs, + ) + # transfer the metadata request + instance._metadata_request = scorer._get_metadata_request() + return instance + + def _score(self, method_caller, estimator, X, y_true, **kwargs): + """Evaluate predicted target values for X relative to y_true. + + Parameters + ---------- + method_caller : callable + Returns predictions given an estimator, method name, and other + arguments, potentially caching results. + + estimator : object + Trained estimator to use for scoring. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Test data that will be fed to estimator.predict. + + y_true : array-like of shape (n_samples,) + Gold standard target values for X. + + **kwargs : dict + Other parameters passed to the scorer. Refer to + :func:`set_score_request` for more details. + + Returns + ------- + scores : ndarray of shape (thresholds,) + The scores associated to each threshold. + + potential_thresholds : ndarray of shape (thresholds,) + The potential thresholds used to compute the scores. + """ + pos_label = self._get_pos_label() + y_score = method_caller( + estimator, self._response_method, X, pos_label=pos_label + ) + + scoring_kwargs = {**self._kwargs, **kwargs} + if isinstance(self._thresholds, Integral): + potential_thresholds = np.linspace( + np.min(y_score), np.max(y_score), self._thresholds + ) + else: + potential_thresholds = np.asarray(self._thresholds) + score_thresholds = [ + self._sign + * self._score_func( + y_true, + _threshold_scores_to_class_labels( + y_score, th, estimator.classes_, pos_label + ), + **scoring_kwargs, + ) + for th in potential_thresholds + ] + return np.array(score_thresholds), potential_thresholds diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..76020d80f8eb02a4647dada4415e5286a0bebe59 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/__init__.py @@ -0,0 +1,55 @@ +"""Evaluation metrics for cluster analysis results. + +- Supervised evaluation uses a ground truth class values for each sample. +- Unsupervised evaluation does not use ground truths and measures the "quality" of the + model itself. 
+""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ._bicluster import consensus_score +from ._supervised import ( + adjusted_mutual_info_score, + adjusted_rand_score, + completeness_score, + contingency_matrix, + entropy, + expected_mutual_information, + fowlkes_mallows_score, + homogeneity_completeness_v_measure, + homogeneity_score, + mutual_info_score, + normalized_mutual_info_score, + pair_confusion_matrix, + rand_score, + v_measure_score, +) +from ._unsupervised import ( + calinski_harabasz_score, + davies_bouldin_score, + silhouette_samples, + silhouette_score, +) + +__all__ = [ + "adjusted_mutual_info_score", + "adjusted_rand_score", + "calinski_harabasz_score", + "completeness_score", + "consensus_score", + "contingency_matrix", + "davies_bouldin_score", + "entropy", + "expected_mutual_information", + "fowlkes_mallows_score", + "homogeneity_completeness_v_measure", + "homogeneity_score", + "mutual_info_score", + "normalized_mutual_info_score", + "pair_confusion_matrix", + "rand_score", + "silhouette_samples", + "silhouette_score", + "v_measure_score", +] diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/__pycache__/__init__.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8c69a2d027da84ba1b8addf91ea51b3cc6a80268 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/__pycache__/__init__.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/__pycache__/_bicluster.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/__pycache__/_bicluster.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2f14a46c9f9f89d38b13f343e03698f2b94a604f Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/__pycache__/_bicluster.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/__pycache__/_supervised.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/__pycache__/_supervised.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..447798eca5602bc47102cf654fff33295f94aa54 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/__pycache__/_supervised.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/__pycache__/_unsupervised.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/__pycache__/_unsupervised.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..729e240f5a483c5c76692a99a0476d02a5fd68c5 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/__pycache__/_unsupervised.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/_bicluster.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/_bicluster.py new file mode 100644 index 0000000000000000000000000000000000000000..bb306c025b69466817de26661eaf286ea59024bc --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/_bicluster.py @@ -0,0 +1,114 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numpy as np +from scipy.optimize import linear_sum_assignment + +from ...utils._param_validation import StrOptions, validate_params 
+from ...utils.validation import check_array, check_consistent_length + +__all__ = ["consensus_score"] + + +def _check_rows_and_columns(a, b): + """Unpacks the row and column arrays and checks their shape.""" + check_consistent_length(*a) + check_consistent_length(*b) + checks = lambda x: check_array(x, ensure_2d=False) + a_rows, a_cols = map(checks, a) + b_rows, b_cols = map(checks, b) + return a_rows, a_cols, b_rows, b_cols + + +def _jaccard(a_rows, a_cols, b_rows, b_cols): + """Jaccard coefficient on the elements of the two biclusters.""" + intersection = (a_rows * b_rows).sum() * (a_cols * b_cols).sum() + + a_size = a_rows.sum() * a_cols.sum() + b_size = b_rows.sum() * b_cols.sum() + + return intersection / (a_size + b_size - intersection) + + +def _pairwise_similarity(a, b, similarity): + """Computes pairwise similarity matrix. + + result[i, j] is the Jaccard coefficient of a's bicluster i and b's + bicluster j. + + """ + a_rows, a_cols, b_rows, b_cols = _check_rows_and_columns(a, b) + n_a = a_rows.shape[0] + n_b = b_rows.shape[0] + result = np.array( + [ + [similarity(a_rows[i], a_cols[i], b_rows[j], b_cols[j]) for j in range(n_b)] + for i in range(n_a) + ] + ) + return result + + +@validate_params( + { + "a": [tuple], + "b": [tuple], + "similarity": [callable, StrOptions({"jaccard"})], + }, + prefer_skip_nested_validation=True, +) +def consensus_score(a, b, *, similarity="jaccard"): + """The similarity of two sets of biclusters. + + Similarity between individual biclusters is computed. Then the best + matching between sets is found by solving a linear sum assignment problem, + using a modified Jonker-Volgenant algorithm. + The final score is the sum of similarities divided by the size of + the larger set. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + a : tuple (rows, columns) + Tuple of row and column indicators for a set of biclusters. + + b : tuple (rows, columns) + Another set of biclusters like ``a``. + + similarity : 'jaccard' or callable, default='jaccard' + May be the string "jaccard" to use the Jaccard coefficient, or + any function that takes four arguments, each of which is a 1d + indicator vector: (a_rows, a_columns, b_rows, b_columns). + + Returns + ------- + consensus_score : float + Consensus score, a non-negative value, sum of similarities + divided by size of larger set. + + See Also + -------- + scipy.optimize.linear_sum_assignment : Solve the linear sum assignment problem. + + References + ---------- + * Hochreiter, Bodenhofer, et. al., 2010. `FABIA: factor analysis + for bicluster acquisition + `__. 
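In practice `consensus_score` is usually fed the `biclusters_` of a fitted biclustering estimator; a sketch under that assumption (synthetic data and `SpectralCoclustering` chosen only for illustration)::

    from sklearn.cluster import SpectralCoclustering
    from sklearn.datasets import make_biclusters
    from sklearn.metrics import consensus_score

    data, rows, cols = make_biclusters(shape=(100, 100), n_clusters=3, random_state=0)
    model = SpectralCoclustering(n_clusters=3, random_state=0).fit(data)
    # compare the recovered biclusters against the generated ones
    print(consensus_score(model.biclusters_, (rows, cols)))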
+ + Examples + -------- + >>> from sklearn.metrics import consensus_score + >>> a = ([[True, False], [False, True]], [[False, True], [True, False]]) + >>> b = ([[False, True], [True, False]], [[True, False], [False, True]]) + >>> consensus_score(a, b, similarity='jaccard') + 1.0 + """ + if similarity == "jaccard": + similarity = _jaccard + matrix = _pairwise_similarity(a, b, similarity) + row_indices, col_indices = linear_sum_assignment(1.0 - matrix) + n_a = len(a[0]) + n_b = len(b[0]) + return float(matrix[row_indices, col_indices].sum() / max(n_a, n_b)) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/_expected_mutual_info_fast.pyx b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/_expected_mutual_info_fast.pyx new file mode 100644 index 0000000000000000000000000000000000000000..3d51def36c255b7479fea1ae516fdc47c0c4faeb --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/_expected_mutual_info_fast.pyx @@ -0,0 +1,69 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from libc.math cimport exp, lgamma + +from ...utils._typedefs cimport float64_t, int64_t + +import numpy as np +from scipy.special import gammaln + + +def expected_mutual_information(contingency, int64_t n_samples): + """Calculate the expected mutual information for two labelings.""" + cdef: + float64_t emi = 0 + int64_t n_rows, n_cols + float64_t term2, term3, gln + int64_t[::1] a_view, b_view + float64_t[::1] term1 + float64_t[::1] gln_a, gln_b, gln_Na, gln_Nb, gln_Nnij, log_Nnij + float64_t[::1] log_a, log_b + Py_ssize_t i, j, nij + int64_t start, end + + n_rows, n_cols = contingency.shape + a = np.ravel(contingency.sum(axis=1).astype(np.int64, copy=False)) + b = np.ravel(contingency.sum(axis=0).astype(np.int64, copy=False)) + a_view = a + b_view = b + + # any labelling with zero entropy implies EMI = 0 + if a.size == 1 or b.size == 1: + return 0.0 + + # There are three major terms to the EMI equation, which are multiplied to + # and then summed over varying nij values. + # While nijs[0] will never be used, having it simplifies the indexing. + nijs = np.arange(0, max(np.max(a), np.max(b)) + 1, dtype='float') + nijs[0] = 1 # Stops divide by zero warnings. As its not used, no issue. + # term1 is nij / N + term1 = nijs / n_samples + # term2 is log((N*nij) / (a * b)) == log(N * nij) - log(a * b) + log_a = np.log(a) + log_b = np.log(b) + # term2 uses log(N * nij) = log(N) + log(nij) + log_Nnij = np.log(n_samples) + np.log(nijs) + # term3 is large, and involved many factorials. Calculate these in log + # space to stop overflows. + gln_a = gammaln(a + 1) + gln_b = gammaln(b + 1) + gln_Na = gammaln(n_samples - a + 1) + gln_Nb = gammaln(n_samples - b + 1) + gln_Nnij = gammaln(nijs + 1) + gammaln(n_samples + 1) + + # emi itself is a summation over the various values. + for i in range(n_rows): + for j in range(n_cols): + start = max(1, a_view[i] - n_samples + b_view[j]) + end = min(a_view[i], b_view[j]) + 1 + for nij in range(start, end): + term2 = log_Nnij[nij] - log_a[i] - log_b[j] + # Numerators are positive, denominators are negative. 
+ gln = (gln_a[i] + gln_b[j] + gln_Na[i] + gln_Nb[j] + - gln_Nnij[nij] - lgamma(a_view[i] - nij + 1) + - lgamma(b_view[j] - nij + 1) + - lgamma(n_samples - a_view[i] - b_view[j] + nij + 1)) + term3 = exp(gln) + emi += (term1[nij] * term2 * term3) + return emi diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/_supervised.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/_supervised.py new file mode 100644 index 0000000000000000000000000000000000000000..ccc11d752adbacd4960592e154a2c886276bc3f9 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/_supervised.py @@ -0,0 +1,1314 @@ +"""Utilities to evaluate the clustering performance of models. + +Functions named as *_score return a scalar value to maximize: the higher the +better. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from math import log +from numbers import Real + +import numpy as np +from scipy import sparse as sp + +from ...utils._array_api import _max_precision_float_dtype, get_namespace_and_device +from ...utils._param_validation import Hidden, Interval, StrOptions, validate_params +from ...utils.multiclass import type_of_target +from ...utils.validation import check_array, check_consistent_length +from ._expected_mutual_info_fast import expected_mutual_information + + +def check_clusterings(labels_true, labels_pred): + """Check that the labels arrays are 1D and of same dimension. + + Parameters + ---------- + labels_true : array-like of shape (n_samples,) + The true labels. + + labels_pred : array-like of shape (n_samples,) + The predicted labels. + """ + labels_true = check_array( + labels_true, + ensure_2d=False, + ensure_min_samples=0, + dtype=None, + ) + + labels_pred = check_array( + labels_pred, + ensure_2d=False, + ensure_min_samples=0, + dtype=None, + ) + + type_label = type_of_target(labels_true) + type_pred = type_of_target(labels_pred) + + if "continuous" in (type_pred, type_label): + msg = ( + "Clustering metrics expects discrete values but received" + f" {type_label} values for label, and {type_pred} values " + "for target" + ) + warnings.warn(msg, UserWarning) + + # input checks + if labels_true.ndim != 1: + raise ValueError("labels_true must be 1D: shape is %r" % (labels_true.shape,)) + if labels_pred.ndim != 1: + raise ValueError("labels_pred must be 1D: shape is %r" % (labels_pred.shape,)) + check_consistent_length(labels_true, labels_pred) + + return labels_true, labels_pred + + +def _generalized_average(U, V, average_method): + """Return a particular mean of two numbers.""" + if average_method == "min": + return min(U, V) + elif average_method == "geometric": + return np.sqrt(U * V) + elif average_method == "arithmetic": + return np.mean([U, V]) + elif average_method == "max": + return max(U, V) + else: + raise ValueError( + "'average_method' must be 'min', 'geometric', 'arithmetic', or 'max'" + ) + + +@validate_params( + { + "labels_true": ["array-like", None], + "labels_pred": ["array-like", None], + "eps": [Interval(Real, 0, None, closed="left"), None], + "sparse": ["boolean"], + "dtype": "no_validation", # delegate the validation to SciPy + }, + prefer_skip_nested_validation=True, +) +def contingency_matrix( + labels_true, labels_pred, *, eps=None, sparse=False, dtype=np.int64 +): + """Build a contingency matrix describing the relationship between labels. + + Read more in the :ref:`User Guide `. 
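The Cython `expected_mutual_information` above is consumed by the chance-adjusted mutual information metric; a brief usage sketch of that public function (labelings are illustrative)::

    from sklearn.metrics import adjusted_mutual_info_score, mutual_info_score

    labels_true = [0, 0, 1, 1, 2, 2]
    labels_pred = [0, 0, 1, 1, 1, 2]
    print(mutual_info_score(labels_true, labels_pred))           # raw MI
    print(adjusted_mutual_info_score(labels_true, labels_pred))  # corrected using the expected MI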
+ + Parameters + ---------- + labels_true : array-like of shape (n_samples,) + Ground truth class labels to be used as a reference. + + labels_pred : array-like of shape (n_samples,) + Cluster labels to evaluate. + + eps : float, default=None + If a float, that value is added to all values in the contingency + matrix. This helps to stop NaN propagation. + If ``None``, nothing is adjusted. + + sparse : bool, default=False + If `True`, return a sparse CSR contingency matrix. If `eps` is not + `None` and `sparse` is `True` will raise ValueError. + + .. versionadded:: 0.18 + + dtype : numeric type, default=np.int64 + Output dtype. Ignored if `eps` is not `None`. + + .. versionadded:: 0.24 + + Returns + ------- + contingency : {array-like, sparse}, shape=[n_classes_true, n_classes_pred] + Matrix :math:`C` such that :math:`C_{i, j}` is the number of samples in + true class :math:`i` and in predicted class :math:`j`. If + ``eps is None``, the dtype of this array will be integer unless set + otherwise with the ``dtype`` argument. If ``eps`` is given, the dtype + will be float. + Will be a ``sklearn.sparse.csr_matrix`` if ``sparse=True``. + + Examples + -------- + >>> from sklearn.metrics.cluster import contingency_matrix + >>> labels_true = [0, 0, 1, 1, 2, 2] + >>> labels_pred = [1, 0, 2, 1, 0, 2] + >>> contingency_matrix(labels_true, labels_pred) + array([[1, 1, 0], + [0, 1, 1], + [1, 0, 1]]) + """ + + if eps is not None and sparse: + raise ValueError("Cannot set 'eps' when sparse=True") + + classes, class_idx = np.unique(labels_true, return_inverse=True) + clusters, cluster_idx = np.unique(labels_pred, return_inverse=True) + n_classes = classes.shape[0] + n_clusters = clusters.shape[0] + # Using coo_matrix to accelerate simple histogram calculation, + # i.e. bins are consecutive integers + # Currently, coo_matrix is faster than histogram2d for simple cases + contingency = sp.coo_matrix( + (np.ones(class_idx.shape[0]), (class_idx, cluster_idx)), + shape=(n_classes, n_clusters), + dtype=dtype, + ) + if sparse: + contingency = contingency.tocsr() + contingency.sum_duplicates() + else: + contingency = contingency.toarray() + if eps is not None: + # don't use += as contingency is integer + contingency = contingency + eps + return contingency + + +# clustering measures + + +@validate_params( + { + "labels_true": ["array-like"], + "labels_pred": ["array-like"], + }, + prefer_skip_nested_validation=True, +) +def pair_confusion_matrix(labels_true, labels_pred): + """Pair confusion matrix arising from two clusterings. + + The pair confusion matrix :math:`C` computes a 2 by 2 similarity matrix + between two clusterings by considering all pairs of samples and counting + pairs that are assigned into the same or into different clusters under + the true and predicted clusterings [1]_. + + Considering a pair of samples that is clustered together a positive pair, + then as in binary classification the count of true negatives is + :math:`C_{00}`, false negatives is :math:`C_{10}`, true positives is + :math:`C_{11}` and false positives is :math:`C_{01}`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + labels_true : array-like of shape (n_samples,), dtype=integral + Ground truth class labels to be used as a reference. + + labels_pred : array-like of shape (n_samples,), dtype=integral + Cluster labels to evaluate. + + Returns + ------- + C : ndarray of shape (2, 2), dtype=np.int64 + The contingency matrix. + + See Also + -------- + sklearn.metrics.rand_score : Rand Score. 
+ sklearn.metrics.adjusted_rand_score : Adjusted Rand Score. + sklearn.metrics.adjusted_mutual_info_score : Adjusted Mutual Information. + + References + ---------- + .. [1] :doi:`Hubert, L., Arabie, P. "Comparing partitions." + Journal of Classification 2, 193–218 (1985). + <10.1007/BF01908075>` + + Examples + -------- + Perfectly matching labelings have all non-zero entries on the + diagonal regardless of actual label values: + + >>> from sklearn.metrics.cluster import pair_confusion_matrix + >>> pair_confusion_matrix([0, 0, 1, 1], [1, 1, 0, 0]) + array([[8, 0], + [0, 4]]... + + Labelings that assign all classes members to the same clusters + are complete but may be not always pure, hence penalized, and + have some off-diagonal non-zero entries: + + >>> pair_confusion_matrix([0, 0, 1, 2], [0, 0, 1, 1]) + array([[8, 2], + [0, 2]]... + + Note that the matrix is not symmetric. + """ + labels_true, labels_pred = check_clusterings(labels_true, labels_pred) + n_samples = np.int64(labels_true.shape[0]) + + # Computation using the contingency data + contingency = contingency_matrix( + labels_true, labels_pred, sparse=True, dtype=np.int64 + ) + n_c = np.ravel(contingency.sum(axis=1)) + n_k = np.ravel(contingency.sum(axis=0)) + sum_squares = (contingency.data**2).sum() + C = np.empty((2, 2), dtype=np.int64) + C[1, 1] = sum_squares - n_samples + C[0, 1] = contingency.dot(n_k).sum() - sum_squares + C[1, 0] = contingency.transpose().dot(n_c).sum() - sum_squares + C[0, 0] = n_samples**2 - C[0, 1] - C[1, 0] - sum_squares + return C + + +@validate_params( + { + "labels_true": ["array-like"], + "labels_pred": ["array-like"], + }, + prefer_skip_nested_validation=True, +) +def rand_score(labels_true, labels_pred): + """Rand index. + + The Rand Index computes a similarity measure between two clusterings + by considering all pairs of samples and counting pairs that are + assigned in the same or different clusters in the predicted and + true clusterings [1]_ [2]_. + + The raw RI score [3]_ is: + + .. code-block:: text + + RI = (number of agreeing pairs) / (number of pairs) + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + labels_true : array-like of shape (n_samples,), dtype=integral + Ground truth class labels to be used as a reference. + + labels_pred : array-like of shape (n_samples,), dtype=integral + Cluster labels to evaluate. + + Returns + ------- + RI : float + Similarity score between 0.0 and 1.0, inclusive, 1.0 stands for + perfect match. + + See Also + -------- + adjusted_rand_score: Adjusted Rand Score. + adjusted_mutual_info_score: Adjusted Mutual Information. + + References + ---------- + .. [1] :doi:`Hubert, L., Arabie, P. "Comparing partitions." + Journal of Classification 2, 193–218 (1985). + <10.1007/BF01908075>`. + + .. [2] `Wikipedia: Simple Matching Coefficient + `_ + + .. 
[3] `Wikipedia: Rand Index `_ + + Examples + -------- + Perfectly matching labelings have a score of 1 even + + >>> from sklearn.metrics.cluster import rand_score + >>> rand_score([0, 0, 1, 1], [1, 1, 0, 0]) + 1.0 + + Labelings that assign all classes members to the same clusters + are complete but may not always be pure, hence penalized: + + >>> rand_score([0, 0, 1, 2], [0, 0, 1, 1]) + 0.83 + """ + contingency = pair_confusion_matrix(labels_true, labels_pred) + numerator = contingency.diagonal().sum() + denominator = contingency.sum() + + if numerator == denominator or denominator == 0: + # Special limit cases: no clustering since the data is not split; + # or trivial clustering where each document is assigned a unique + # cluster. These are perfect matches hence return 1.0. + return 1.0 + + return float(numerator / denominator) + + +@validate_params( + { + "labels_true": ["array-like"], + "labels_pred": ["array-like"], + }, + prefer_skip_nested_validation=True, +) +def adjusted_rand_score(labels_true, labels_pred): + """Rand index adjusted for chance. + + The Rand Index computes a similarity measure between two clusterings + by considering all pairs of samples and counting pairs that are + assigned in the same or different clusters in the predicted and + true clusterings. + + The raw RI score is then "adjusted for chance" into the ARI score + using the following scheme:: + + ARI = (RI - Expected_RI) / (max(RI) - Expected_RI) + + The adjusted Rand index is thus ensured to have a value close to + 0.0 for random labeling independently of the number of clusters and + samples and exactly 1.0 when the clusterings are identical (up to + a permutation). The adjusted Rand index is bounded below by -0.5 for + especially discordant clusterings. + + ARI is a symmetric measure:: + + adjusted_rand_score(a, b) == adjusted_rand_score(b, a) + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + labels_true : array-like of shape (n_samples,), dtype=int + Ground truth class labels to be used as a reference. + + labels_pred : array-like of shape (n_samples,), dtype=int + Cluster labels to evaluate. + + Returns + ------- + ARI : float + Similarity score between -0.5 and 1.0. Random labelings have an ARI + close to 0.0. 1.0 stands for perfect match. + + See Also + -------- + adjusted_mutual_info_score : Adjusted Mutual Information. + + References + ---------- + .. [Hubert1985] L. Hubert and P. Arabie, Comparing Partitions, + Journal of Classification 1985 + https://link.springer.com/article/10.1007%2FBF01908075 + + .. [Steinley2004] D. Steinley, Properties of the Hubert-Arabie + adjusted Rand index, Psychological Methods 2004 + + .. [wk] https://en.wikipedia.org/wiki/Rand_index#Adjusted_Rand_index + + .. [Chacon] :doi:`Minimum adjusted Rand index for two clusterings of a given size, + 2022, J. E. Chacón and A. I. 
Rastrojo <10.1007/s11634-022-00491-w>` + + Examples + -------- + Perfectly matching labelings have a score of 1 even + + >>> from sklearn.metrics.cluster import adjusted_rand_score + >>> adjusted_rand_score([0, 0, 1, 1], [0, 0, 1, 1]) + 1.0 + >>> adjusted_rand_score([0, 0, 1, 1], [1, 1, 0, 0]) + 1.0 + + Labelings that assign all classes members to the same clusters + are complete but may not always be pure, hence penalized:: + + >>> adjusted_rand_score([0, 0, 1, 2], [0, 0, 1, 1]) + 0.57 + + ARI is symmetric, so labelings that have pure clusters with members + coming from the same classes but unnecessary splits are penalized:: + + >>> adjusted_rand_score([0, 0, 1, 1], [0, 0, 1, 2]) + 0.57 + + If classes members are completely split across different clusters, the + assignment is totally incomplete, hence the ARI is very low:: + + >>> adjusted_rand_score([0, 0, 0, 0], [0, 1, 2, 3]) + 0.0 + + ARI may take a negative value for especially discordant labelings that + are a worse choice than the expected value of random labels:: + + >>> adjusted_rand_score([0, 0, 1, 1], [0, 1, 0, 1]) + -0.5 + + See :ref:`sphx_glr_auto_examples_cluster_plot_adjusted_for_chance_measures.py` + for a more detailed example. + """ + (tn, fp), (fn, tp) = pair_confusion_matrix(labels_true, labels_pred) + # convert to Python integer types, to avoid overflow or underflow + tn, fp, fn, tp = int(tn), int(fp), int(fn), int(tp) + + # Special cases: empty data or full agreement + if fn == 0 and fp == 0: + return 1.0 + + return 2.0 * (tp * tn - fn * fp) / ((tp + fn) * (fn + tn) + (tp + fp) * (fp + tn)) + + +@validate_params( + { + "labels_true": ["array-like"], + "labels_pred": ["array-like"], + "beta": [Interval(Real, 0, None, closed="left")], + }, + prefer_skip_nested_validation=True, +) +def homogeneity_completeness_v_measure(labels_true, labels_pred, *, beta=1.0): + """Compute the homogeneity and completeness and V-Measure scores at once. + + Those metrics are based on normalized conditional entropy measures of + the clustering labeling to evaluate given the knowledge of a Ground + Truth class labels of the same samples. + + A clustering result satisfies homogeneity if all of its clusters + contain only data points which are members of a single class. + + A clustering result satisfies completeness if all the data points + that are members of a given class are elements of the same cluster. + + Both scores have positive values between 0.0 and 1.0, larger values + being desirable. + + Those 3 metrics are independent of the absolute values of the labels: + a permutation of the class or cluster label values won't change the + score values in any way. + + V-Measure is furthermore symmetric: swapping ``labels_true`` and + ``label_pred`` will give the same score. This does not hold for + homogeneity and completeness. V-Measure is identical to + :func:`normalized_mutual_info_score` with the arithmetic averaging + method. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + labels_true : array-like of shape (n_samples,) + Ground truth class labels to be used as a reference. + + labels_pred : array-like of shape (n_samples,) + Cluster labels to evaluate. + + beta : float, default=1.0 + Ratio of weight attributed to ``homogeneity`` vs ``completeness``. + If ``beta`` is greater than 1, ``completeness`` is weighted more + strongly in the calculation. If ``beta`` is less than 1, + ``homogeneity`` is weighted more strongly. + + Returns + ------- + homogeneity : float + Score between 0.0 and 1.0. 
1.0 stands for perfectly homogeneous labeling. + + completeness : float + Score between 0.0 and 1.0. 1.0 stands for perfectly complete labeling. + + v_measure : float + Harmonic mean of the first two. + + See Also + -------- + homogeneity_score : Homogeneity metric of cluster labeling. + completeness_score : Completeness metric of cluster labeling. + v_measure_score : V-Measure (NMI with arithmetic mean option). + + Examples + -------- + >>> from sklearn.metrics import homogeneity_completeness_v_measure + >>> y_true, y_pred = [0, 0, 1, 1, 2, 2], [0, 0, 1, 2, 2, 2] + >>> homogeneity_completeness_v_measure(y_true, y_pred) + (0.71, 0.771, 0.74) + """ + labels_true, labels_pred = check_clusterings(labels_true, labels_pred) + + if len(labels_true) == 0: + return 1.0, 1.0, 1.0 + + entropy_C = entropy(labels_true) + entropy_K = entropy(labels_pred) + + contingency = contingency_matrix(labels_true, labels_pred, sparse=True) + MI = mutual_info_score(None, None, contingency=contingency) + + homogeneity = MI / (entropy_C) if entropy_C else 1.0 + completeness = MI / (entropy_K) if entropy_K else 1.0 + + if homogeneity + completeness == 0.0: + v_measure_score = 0.0 + else: + v_measure_score = ( + (1 + beta) + * homogeneity + * completeness + / (beta * homogeneity + completeness) + ) + + return float(homogeneity), float(completeness), float(v_measure_score) + + +@validate_params( + { + "labels_true": ["array-like"], + "labels_pred": ["array-like"], + }, + prefer_skip_nested_validation=True, +) +def homogeneity_score(labels_true, labels_pred): + """Homogeneity metric of a cluster labeling given a ground truth. + + A clustering result satisfies homogeneity if all of its clusters + contain only data points which are members of a single class. + + This metric is independent of the absolute values of the labels: + a permutation of the class or cluster label values won't change the + score value in any way. + + This metric is not symmetric: switching ``label_true`` with ``label_pred`` + will return the :func:`completeness_score` which will be different in + general. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + labels_true : array-like of shape (n_samples,) + Ground truth class labels to be used as a reference. + + labels_pred : array-like of shape (n_samples,) + Cluster labels to evaluate. + + Returns + ------- + homogeneity : float + Score between 0.0 and 1.0. 1.0 stands for perfectly homogeneous labeling. + + See Also + -------- + completeness_score : Completeness metric of cluster labeling. + v_measure_score : V-Measure (NMI with arithmetic mean option). + + References + ---------- + + .. [1] `Andrew Rosenberg and Julia Hirschberg, 2007. V-Measure: A + conditional entropy-based external cluster evaluation measure + `_ + + Examples + -------- + + Perfect labelings are homogeneous:: + + >>> from sklearn.metrics.cluster import homogeneity_score + >>> homogeneity_score([0, 0, 1, 1], [1, 1, 0, 0]) + 1.0 + + Non-perfect labelings that further split classes into more clusters can be + perfectly homogeneous:: + + >>> print("%.6f" % homogeneity_score([0, 0, 1, 1], [0, 0, 1, 2])) + 1.000000 + >>> print("%.6f" % homogeneity_score([0, 0, 1, 1], [0, 1, 2, 3])) + 1.000000 + + Clusters that include samples from different classes do not make for an + homogeneous labeling:: + + >>> print("%.6f" % homogeneity_score([0, 0, 1, 1], [0, 1, 0, 1])) + 0.0... + >>> print("%.6f" % homogeneity_score([0, 0, 1, 1], [0, 0, 0, 0])) + 0.0... 
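+
+    As an editorial sketch (not part of the upstream docstring): homogeneity
+    and completeness are duals, so swapping the argument order of one
+    reproduces the other up to floating-point noise::
+
+      >>> from sklearn.metrics.cluster import completeness_score, homogeneity_score
+      >>> y_true, y_pred = [0, 0, 1, 1, 2, 2], [0, 0, 1, 2, 2, 2]
+      >>> h = homogeneity_score(y_true, y_pred)
+      >>> c = completeness_score(y_pred, y_true)
+      >>> round(h, 6) == round(c, 6)
+      True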
+ """ + return homogeneity_completeness_v_measure(labels_true, labels_pred)[0] + + +@validate_params( + { + "labels_true": ["array-like"], + "labels_pred": ["array-like"], + }, + prefer_skip_nested_validation=True, +) +def completeness_score(labels_true, labels_pred): + """Compute completeness metric of a cluster labeling given a ground truth. + + A clustering result satisfies completeness if all the data points + that are members of a given class are elements of the same cluster. + + This metric is independent of the absolute values of the labels: + a permutation of the class or cluster label values won't change the + score value in any way. + + This metric is not symmetric: switching ``label_true`` with ``label_pred`` + will return the :func:`homogeneity_score` which will be different in + general. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + labels_true : array-like of shape (n_samples,) + Ground truth class labels to be used as a reference. + + labels_pred : array-like of shape (n_samples,) + Cluster labels to evaluate. + + Returns + ------- + completeness : float + Score between 0.0 and 1.0. 1.0 stands for perfectly complete labeling. + + See Also + -------- + homogeneity_score : Homogeneity metric of cluster labeling. + v_measure_score : V-Measure (NMI with arithmetic mean option). + + References + ---------- + + .. [1] `Andrew Rosenberg and Julia Hirschberg, 2007. V-Measure: A + conditional entropy-based external cluster evaluation measure + `_ + + Examples + -------- + + Perfect labelings are complete:: + + >>> from sklearn.metrics.cluster import completeness_score + >>> completeness_score([0, 0, 1, 1], [1, 1, 0, 0]) + 1.0 + + Non-perfect labelings that assign all classes members to the same clusters + are still complete:: + + >>> print(completeness_score([0, 0, 1, 1], [0, 0, 0, 0])) + 1.0 + >>> print(completeness_score([0, 1, 2, 3], [0, 0, 1, 1])) + 0.999 + + If classes members are split across different clusters, the + assignment cannot be complete:: + + >>> print(completeness_score([0, 0, 1, 1], [0, 1, 0, 1])) + 0.0 + >>> print(completeness_score([0, 0, 0, 0], [0, 1, 2, 3])) + 0.0 + """ + return homogeneity_completeness_v_measure(labels_true, labels_pred)[1] + + +@validate_params( + { + "labels_true": ["array-like"], + "labels_pred": ["array-like"], + "beta": [Interval(Real, 0, None, closed="left")], + }, + prefer_skip_nested_validation=True, +) +def v_measure_score(labels_true, labels_pred, *, beta=1.0): + """V-measure cluster labeling given a ground truth. + + This score is identical to :func:`normalized_mutual_info_score` with + the ``'arithmetic'`` option for averaging. + + The V-measure is the harmonic mean between homogeneity and completeness:: + + v = (1 + beta) * homogeneity * completeness + / (beta * homogeneity + completeness) + + This metric is independent of the absolute values of the labels: + a permutation of the class or cluster label values won't change the + score value in any way. + + This metric is furthermore symmetric: switching ``label_true`` with + ``label_pred`` will return the same score value. This can be useful to + measure the agreement of two independent label assignments strategies + on the same dataset when the real ground truth is not known. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + labels_true : array-like of shape (n_samples,) + Ground truth class labels to be used as a reference. + + labels_pred : array-like of shape (n_samples,) + Cluster labels to evaluate. 
+ + beta : float, default=1.0 + Ratio of weight attributed to ``homogeneity`` vs ``completeness``. + If ``beta`` is greater than 1, ``completeness`` is weighted more + strongly in the calculation. If ``beta`` is less than 1, + ``homogeneity`` is weighted more strongly. + + Returns + ------- + v_measure : float + Score between 0.0 and 1.0. 1.0 stands for perfectly complete labeling. + + See Also + -------- + homogeneity_score : Homogeneity metric of cluster labeling. + completeness_score : Completeness metric of cluster labeling. + normalized_mutual_info_score : Normalized Mutual Information. + + References + ---------- + + .. [1] `Andrew Rosenberg and Julia Hirschberg, 2007. V-Measure: A + conditional entropy-based external cluster evaluation measure + `_ + + Examples + -------- + Perfect labelings are both homogeneous and complete, hence have score 1.0:: + + >>> from sklearn.metrics.cluster import v_measure_score + >>> v_measure_score([0, 0, 1, 1], [0, 0, 1, 1]) + 1.0 + >>> v_measure_score([0, 0, 1, 1], [1, 1, 0, 0]) + 1.0 + + Labelings that assign all classes members to the same clusters + are complete but not homogeneous, hence penalized:: + + >>> print("%.6f" % v_measure_score([0, 0, 1, 2], [0, 0, 1, 1])) + 0.8 + >>> print("%.6f" % v_measure_score([0, 1, 2, 3], [0, 0, 1, 1])) + 0.67 + + Labelings that have pure clusters with members coming from the same + classes are homogeneous but un-necessary splits harm completeness + and thus penalize V-measure as well:: + + >>> print("%.6f" % v_measure_score([0, 0, 1, 1], [0, 0, 1, 2])) + 0.8 + >>> print("%.6f" % v_measure_score([0, 0, 1, 1], [0, 1, 2, 3])) + 0.67 + + If classes members are completely split across different clusters, + the assignment is totally incomplete, hence the V-Measure is null:: + + >>> print("%.6f" % v_measure_score([0, 0, 0, 0], [0, 1, 2, 3])) + 0.0 + + Clusters that include samples from totally different classes totally + destroy the homogeneity of the labeling, hence:: + + >>> print("%.6f" % v_measure_score([0, 0, 1, 1], [0, 0, 0, 0])) + 0.0 + """ + return homogeneity_completeness_v_measure(labels_true, labels_pred, beta=beta)[2] + + +@validate_params( + { + "labels_true": ["array-like", None], + "labels_pred": ["array-like", None], + "contingency": ["array-like", "sparse matrix", None], + }, + prefer_skip_nested_validation=True, +) +def mutual_info_score(labels_true, labels_pred, *, contingency=None): + """Mutual Information between two clusterings. + + The Mutual Information is a measure of the similarity between two labels + of the same data. Where :math:`|U_i|` is the number of the samples + in cluster :math:`U_i` and :math:`|V_j|` is the number of the + samples in cluster :math:`V_j`, the Mutual Information + between clusterings :math:`U` and :math:`V` is given as: + + .. math:: + + MI(U,V)=\\sum_{i=1}^{|U|} \\sum_{j=1}^{|V|} \\frac{|U_i\\cap V_j|}{N} + \\log\\frac{N|U_i \\cap V_j|}{|U_i||V_j|} + + This metric is independent of the absolute values of the labels: + a permutation of the class or cluster label values won't change the + score value in any way. + + This metric is furthermore symmetric: switching :math:`U` (i.e + ``label_true``) with :math:`V` (i.e. ``label_pred``) will return the + same score value. This can be useful to measure the agreement of two + independent label assignments strategies on the same dataset when the + real ground truth is not known. + + Read more in the :ref:`User Guide `. 
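+
+    As a quick editorial check of the formula above (not upstream text), the
+    score can be reproduced from the contingency counts with plain NumPy; the
+    counts below correspond to the labelings used in the Examples section
+    further down::
+
+      >>> import numpy as np
+      >>> n_ij = np.array([[2, 1], [1, 2]])
+      >>> N, a, b = n_ij.sum(), n_ij.sum(axis=1), n_ij.sum(axis=0)
+      >>> round(float((n_ij / N * np.log(N * n_ij / np.outer(a, b))).sum()), 4)
+      0.0566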
+ + Parameters + ---------- + labels_true : array-like of shape (n_samples,), dtype=integral + A clustering of the data into disjoint subsets, called :math:`U` in + the above formula. + + labels_pred : array-like of shape (n_samples,), dtype=integral + A clustering of the data into disjoint subsets, called :math:`V` in + the above formula. + + contingency : {array-like, sparse matrix} of shape \ + (n_classes_true, n_classes_pred), default=None + A contingency matrix given by the + :func:`~sklearn.metrics.cluster.contingency_matrix` function. If value + is ``None``, it will be computed, otherwise the given value is used, + with ``labels_true`` and ``labels_pred`` ignored. + + Returns + ------- + mi : float + Mutual information, a non-negative value, measured in nats using the + natural logarithm. + + See Also + -------- + adjusted_mutual_info_score : Adjusted against chance Mutual Information. + normalized_mutual_info_score : Normalized Mutual Information. + + Notes + ----- + The logarithm used is the natural logarithm (base-e). + + Examples + -------- + >>> from sklearn.metrics import mutual_info_score + >>> labels_true = [0, 1, 1, 0, 1, 0] + >>> labels_pred = [0, 1, 0, 0, 1, 1] + >>> mutual_info_score(labels_true, labels_pred) + 0.0566 + """ + if contingency is None: + labels_true, labels_pred = check_clusterings(labels_true, labels_pred) + contingency = contingency_matrix(labels_true, labels_pred, sparse=True) + else: + contingency = check_array( + contingency, + accept_sparse=["csr", "csc", "coo"], + dtype=[int, np.int32, np.int64], + ) + + if isinstance(contingency, np.ndarray): + # For an array + nzx, nzy = np.nonzero(contingency) + nz_val = contingency[nzx, nzy] + else: + # For a sparse matrix + nzx, nzy, nz_val = sp.find(contingency) + + contingency_sum = contingency.sum() + pi = np.ravel(contingency.sum(axis=1)) + pj = np.ravel(contingency.sum(axis=0)) + + # Since MI <= min(H(X), H(Y)), any labelling with zero entropy, i.e. containing a + # single cluster, implies MI = 0 + if pi.size == 1 or pj.size == 1: + return 0.0 + + log_contingency_nm = np.log(nz_val) + contingency_nm = nz_val / contingency_sum + # Don't need to calculate the full outer product, just for non-zeroes + outer = pi.take(nzx).astype(np.int64, copy=False) * pj.take(nzy).astype( + np.int64, copy=False + ) + log_outer = -np.log(outer) + log(pi.sum()) + log(pj.sum()) + mi = ( + contingency_nm * (log_contingency_nm - log(contingency_sum)) + + contingency_nm * log_outer + ) + mi = np.where(np.abs(mi) < np.finfo(mi.dtype).eps, 0.0, mi) + return float(np.clip(mi.sum(), 0.0, None)) + + +@validate_params( + { + "labels_true": ["array-like"], + "labels_pred": ["array-like"], + "average_method": [StrOptions({"arithmetic", "max", "min", "geometric"})], + }, + prefer_skip_nested_validation=True, +) +def adjusted_mutual_info_score( + labels_true, labels_pred, *, average_method="arithmetic" +): + """Adjusted Mutual Information between two clusterings. + + Adjusted Mutual Information (AMI) is an adjustment of the Mutual + Information (MI) score to account for chance. It accounts for the fact that + the MI is generally higher for two clusterings with a larger number of + clusters, regardless of whether there is actually more information shared. 
+ For two clusterings :math:`U` and :math:`V`, the AMI is given as:: + + AMI(U, V) = [MI(U, V) - E(MI(U, V))] / [avg(H(U), H(V)) - E(MI(U, V))] + + This metric is independent of the absolute values of the labels: + a permutation of the class or cluster label values won't change the + score value in any way. + + This metric is furthermore symmetric: switching :math:`U` (``label_true``) + with :math:`V` (``labels_pred``) will return the same score value. This can + be useful to measure the agreement of two independent label assignments + strategies on the same dataset when the real ground truth is not known. + + Be mindful that this function is an order of magnitude slower than other + metrics, such as the Adjusted Rand Index. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + labels_true : int array-like of shape (n_samples,) + A clustering of the data into disjoint subsets, called :math:`U` in + the above formula. + + labels_pred : int array-like of shape (n_samples,) + A clustering of the data into disjoint subsets, called :math:`V` in + the above formula. + + average_method : {'min', 'geometric', 'arithmetic', 'max'}, default='arithmetic' + How to compute the normalizer in the denominator. + + .. versionadded:: 0.20 + + .. versionchanged:: 0.22 + The default value of ``average_method`` changed from 'max' to + 'arithmetic'. + + Returns + ------- + ami: float (upperlimited by 1.0) + The AMI returns a value of 1 when the two partitions are identical + (ie perfectly matched). Random partitions (independent labellings) have + an expected AMI around 0 on average hence can be negative. The value is + in adjusted nats (based on the natural logarithm). + + See Also + -------- + adjusted_rand_score : Adjusted Rand Index. + mutual_info_score : Mutual Information (not adjusted for chance). + + References + ---------- + .. [1] `Vinh, Epps, and Bailey, (2010). Information Theoretic Measures for + Clusterings Comparison: Variants, Properties, Normalization and + Correction for Chance, JMLR + `_ + + .. [2] `Wikipedia entry for the Adjusted Mutual Information + `_ + + Examples + -------- + + Perfect labelings are both homogeneous and complete, hence have + score 1.0:: + + >>> from sklearn.metrics.cluster import adjusted_mutual_info_score + >>> adjusted_mutual_info_score([0, 0, 1, 1], [0, 0, 1, 1]) + 1.0 + >>> adjusted_mutual_info_score([0, 0, 1, 1], [1, 1, 0, 0]) + 1.0 + + If classes members are completely split across different clusters, + the assignment is totally in-complete, hence the AMI is null:: + + >>> adjusted_mutual_info_score([0, 0, 0, 0], [0, 1, 2, 3]) + 0.0 + """ + labels_true, labels_pred = check_clusterings(labels_true, labels_pred) + n_samples = labels_true.shape[0] + classes = np.unique(labels_true) + clusters = np.unique(labels_pred) + + # Special limit cases: no clustering since the data is not split. + # It corresponds to both labellings having zero entropy. + # This is a perfect match hence return 1.0. + if ( + classes.shape[0] == clusters.shape[0] == 1 + or classes.shape[0] == clusters.shape[0] == 0 + ): + return 1.0 + # if there is only one class or one cluster return 0.0. 
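+    # Editorial note: when exactly one of the labelings collapses to a single
+    # label it carries zero entropy, so both MI and its expected value are 0;
+    # returning 0.0 here also sidesteps the degenerate normalizer that the
+    # 'min' and 'geometric' averaging methods would otherwise produce below.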
+ elif classes.shape[0] == 1 or clusters.shape[0] == 1: + return 0.0 + + contingency = contingency_matrix(labels_true, labels_pred, sparse=True) + # Calculate the MI for the two clusterings + mi = mutual_info_score(labels_true, labels_pred, contingency=contingency) + # Calculate the expected value for the mutual information + emi = expected_mutual_information(contingency, n_samples) + # Calculate entropy for each labeling + h_true, h_pred = entropy(labels_true), entropy(labels_pred) + normalizer = _generalized_average(h_true, h_pred, average_method) + denominator = normalizer - emi + # Avoid 0.0 / 0.0 when expectation equals maximum, i.e. a perfect match. + # normalizer should always be >= emi, but because of floating-point + # representation, sometimes emi is slightly larger. Correct this + # by preserving the sign. + if denominator < 0: + denominator = min(denominator, -np.finfo("float64").eps) + else: + denominator = max(denominator, np.finfo("float64").eps) + # The same applies analogously to mi and emi. + numerator = mi - emi + if numerator < 0: + numerator = min(numerator, -np.finfo("float64").eps) + else: + numerator = max(numerator, np.finfo("float64").eps) + return float(numerator / denominator) + + +@validate_params( + { + "labels_true": ["array-like"], + "labels_pred": ["array-like"], + "average_method": [StrOptions({"arithmetic", "max", "min", "geometric"})], + }, + prefer_skip_nested_validation=True, +) +def normalized_mutual_info_score( + labels_true, labels_pred, *, average_method="arithmetic" +): + """Normalized Mutual Information between two clusterings. + + Normalized Mutual Information (NMI) is a normalization of the Mutual + Information (MI) score to scale the results between 0 (no mutual + information) and 1 (perfect correlation). In this function, mutual + information is normalized by some generalized mean of ``H(labels_true)`` + and ``H(labels_pred))``, defined by the `average_method`. + + This measure is not adjusted for chance. Therefore + :func:`adjusted_mutual_info_score` might be preferred. + + This metric is independent of the absolute values of the labels: + a permutation of the class or cluster label values won't change the + score value in any way. + + This metric is furthermore symmetric: switching ``label_true`` with + ``label_pred`` will return the same score value. This can be useful to + measure the agreement of two independent label assignments strategies + on the same dataset when the real ground truth is not known. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + labels_true : int array-like of shape (n_samples,) + A clustering of the data into disjoint subsets. + + labels_pred : int array-like of shape (n_samples,) + A clustering of the data into disjoint subsets. + + average_method : {'min', 'geometric', 'arithmetic', 'max'}, default='arithmetic' + How to compute the normalizer in the denominator. + + .. versionadded:: 0.20 + + .. versionchanged:: 0.22 + The default value of ``average_method`` changed from 'geometric' to + 'arithmetic'. + + Returns + ------- + nmi : float + Score between 0.0 and 1.0 in normalized nats (based on the natural + logarithm). 1.0 stands for perfectly complete labeling. + + See Also + -------- + v_measure_score : V-Measure (NMI with arithmetic mean option). + adjusted_rand_score : Adjusted Rand Index. + adjusted_mutual_info_score : Adjusted Mutual Information (adjusted + against chance). 
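+
+    Notes
+    -----
+    As an illustrative sketch added by the editor (not upstream text), the
+    arithmetic-mean normalization can be reproduced by hand from
+    :func:`mutual_info_score` and the label entropies::
+
+      >>> from sklearn.metrics.cluster import entropy, mutual_info_score
+      >>> a, b = [0, 0, 1, 1], [0, 0, 1, 2]
+      >>> mi = mutual_info_score(a, b)
+      >>> round(mi / ((entropy(a) + entropy(b)) / 2), 2)
+      0.8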
+ + Examples + -------- + + Perfect labelings are both homogeneous and complete, hence have + score 1.0:: + + >>> from sklearn.metrics.cluster import normalized_mutual_info_score + >>> normalized_mutual_info_score([0, 0, 1, 1], [0, 0, 1, 1]) + 1.0 + >>> normalized_mutual_info_score([0, 0, 1, 1], [1, 1, 0, 0]) + 1.0 + + If classes members are completely split across different clusters, + the assignment is totally in-complete, hence the NMI is null:: + + >>> normalized_mutual_info_score([0, 0, 0, 0], [0, 1, 2, 3]) + 0.0 + """ + labels_true, labels_pred = check_clusterings(labels_true, labels_pred) + classes = np.unique(labels_true) + clusters = np.unique(labels_pred) + + # Special limit cases: no clustering since the data is not split. + # It corresponds to both labellings having zero entropy. + # This is a perfect match hence return 1.0. + if ( + classes.shape[0] == clusters.shape[0] == 1 + or classes.shape[0] == clusters.shape[0] == 0 + ): + return 1.0 + + contingency = contingency_matrix(labels_true, labels_pred, sparse=True) + contingency = contingency.astype(np.float64, copy=False) + # Calculate the MI for the two clusterings + mi = mutual_info_score(labels_true, labels_pred, contingency=contingency) + + # At this point mi = 0 can't be a perfect match (the special case of a single + # cluster has been dealt with before). Hence, if mi = 0, the nmi must be 0 whatever + # the normalization. + if mi == 0: + return 0.0 + + # Calculate entropy for each labeling + h_true, h_pred = entropy(labels_true), entropy(labels_pred) + + normalizer = _generalized_average(h_true, h_pred, average_method) + return float(mi / normalizer) + + +@validate_params( + { + "labels_true": ["array-like"], + "labels_pred": ["array-like"], + "sparse": ["boolean", Hidden(StrOptions({"deprecated"}))], + }, + prefer_skip_nested_validation=True, +) +def fowlkes_mallows_score(labels_true, labels_pred, *, sparse="deprecated"): + """Measure the similarity of two clusterings of a set of points. + + .. versionadded:: 0.18 + + The Fowlkes-Mallows index (FMI) is defined as the geometric mean of + the precision and recall:: + + FMI = TP / sqrt((TP + FP) * (TP + FN)) + + Where ``TP`` is the number of **True Positive** (i.e. the number of pairs of + points that belong to the same cluster in both ``labels_true`` and + ``labels_pred``), ``FP`` is the number of **False Positive** (i.e. the + number of pairs of points that belong to the same cluster in + ``labels_pred`` but not in ``labels_true``) and ``FN`` is the number of + **False Negative** (i.e. the number of pairs of points that belong to the + same cluster in ``labels_true`` but not in ``labels_pred``). + + The score ranges from 0 to 1. A high value indicates a good similarity + between two clusters. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + labels_true : array-like of shape (n_samples,), dtype=int + A clustering of the data into disjoint subsets. + + labels_pred : array-like of shape (n_samples,), dtype=int + A clustering of the data into disjoint subsets. + + sparse : bool, default=False + Compute contingency matrix internally with sparse matrix. + + .. deprecated:: 1.7 + The ``sparse`` parameter is deprecated and will be removed in 1.9. It has + no effect. + + Returns + ------- + score : float + The resulting Fowlkes-Mallows score. + + References + ---------- + .. [1] `E. B. Fowkles and C. L. Mallows, 1983. "A method for comparing two + hierarchical clusterings". Journal of the American Statistical + Association + `_ + + .. 
[2] `Wikipedia entry for the Fowlkes-Mallows Index + `_ + + Examples + -------- + + Perfect labelings are both homogeneous and complete, hence have + score 1.0:: + + >>> from sklearn.metrics.cluster import fowlkes_mallows_score + >>> fowlkes_mallows_score([0, 0, 1, 1], [0, 0, 1, 1]) + 1.0 + >>> fowlkes_mallows_score([0, 0, 1, 1], [1, 1, 0, 0]) + 1.0 + + If classes members are completely split across different clusters, + the assignment is totally random, hence the FMI is null:: + + >>> fowlkes_mallows_score([0, 0, 0, 0], [0, 1, 2, 3]) + 0.0 + """ + # TODO(1.9): remove the sparse parameter + if sparse != "deprecated": + warnings.warn( + "The 'sparse' parameter was deprecated in 1.7 and will be removed in 1.9. " + "It has no effect. Leave it to its default value to silence this warning.", + FutureWarning, + ) + + labels_true, labels_pred = check_clusterings(labels_true, labels_pred) + (n_samples,) = labels_true.shape + + c = contingency_matrix(labels_true, labels_pred, sparse=True) + c = c.astype(np.int64, copy=False) + tk = np.dot(c.data, c.data) - n_samples + pk = np.sum(np.asarray(c.sum(axis=0)).ravel() ** 2) - n_samples + qk = np.sum(np.asarray(c.sum(axis=1)).ravel() ** 2) - n_samples + return float(np.sqrt(tk / pk) * np.sqrt(tk / qk)) if tk != 0.0 else 0.0 + + +@validate_params( + { + "labels": ["array-like"], + }, + prefer_skip_nested_validation=True, +) +def entropy(labels): + """Calculate the entropy for a labeling. + + Parameters + ---------- + labels : array-like of shape (n_samples,), dtype=int + The labels. + + Returns + ------- + entropy : float + The entropy for a labeling. + + Notes + ----- + The logarithm used is the natural logarithm (base-e). + """ + xp, is_array_api_compliant, device_ = get_namespace_and_device(labels) + labels_len = labels.shape[0] if is_array_api_compliant else len(labels) + if labels_len == 0: + return 1.0 + + pi = xp.astype(xp.unique_counts(labels)[1], _max_precision_float_dtype(xp, device_)) + + # single cluster => zero entropy + if pi.size == 1: + return 0.0 + + pi_sum = xp.sum(pi) + # log(a / b) should be calculated as log(a) - log(b) for + # possible loss of precision + # Always convert the result as a Python scalar (on CPU) instead of a device + # specific scalar array. + return float(-xp.sum((pi / pi_sum) * (xp.log(pi) - log(pi_sum)))) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/_unsupervised.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/_unsupervised.py new file mode 100644 index 0000000000000000000000000000000000000000..38cec419e73f778ecdb7bdac89e090a26cdd794a --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/_unsupervised.py @@ -0,0 +1,463 @@ +"""Unsupervised evaluation metrics.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import functools +from numbers import Integral + +import numpy as np +from scipy.sparse import issparse + +from ...preprocessing import LabelEncoder +from ...utils import _safe_indexing, check_random_state, check_X_y +from ...utils._array_api import _atol_for_type +from ...utils._param_validation import ( + Interval, + StrOptions, + validate_params, +) +from ..pairwise import _VALID_METRICS, pairwise_distances, pairwise_distances_chunked + + +def check_number_of_labels(n_labels, n_samples): + """Check that number of labels are valid. + + Parameters + ---------- + n_labels : int + Number of labels. + + n_samples : int + Number of samples. 
+ """ + if not 1 < n_labels < n_samples: + raise ValueError( + "Number of labels is %d. Valid values are 2 to n_samples - 1 (inclusive)" + % n_labels + ) + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "labels": ["array-like"], + "metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable], + "sample_size": [Interval(Integral, 1, None, closed="left"), None], + "random_state": ["random_state"], + }, + prefer_skip_nested_validation=True, +) +def silhouette_score( + X, labels, *, metric="euclidean", sample_size=None, random_state=None, **kwds +): + """Compute the mean Silhouette Coefficient of all samples. + + The Silhouette Coefficient is calculated using the mean intra-cluster + distance (``a``) and the mean nearest-cluster distance (``b``) for each + sample. The Silhouette Coefficient for a sample is ``(b - a) / max(a, + b)``. To clarify, ``b`` is the distance between a sample and the nearest + cluster that the sample is not a part of. + Note that Silhouette Coefficient is only defined if number of labels + is ``2 <= n_labels <= n_samples - 1``. + + This function returns the mean Silhouette Coefficient over all samples. + To obtain the values for each sample, use :func:`silhouette_samples`. + + The best value is 1 and the worst value is -1. Values near 0 indicate + overlapping clusters. Negative values generally indicate that a sample has + been assigned to the wrong cluster, as a different cluster is more similar. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples_a, n_samples_a) if metric == \ + "precomputed" or (n_samples_a, n_features) otherwise + An array of pairwise distances between samples, or a feature array. + + labels : array-like of shape (n_samples,) + Predicted labels for each sample. + + metric : str or callable, default='euclidean' + The metric to use when calculating distance between instances in a + feature array. If metric is a string, it must be one of the options + allowed by :func:`~sklearn.metrics.pairwise_distances`. If ``X`` is + the distance array itself, use ``metric="precomputed"``. + + sample_size : int, default=None + The size of the sample to use when computing the Silhouette Coefficient + on a random subset of the data. + If ``sample_size is None``, no sampling is used. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for selecting a subset of samples. + Used when ``sample_size is not None``. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + **kwds : optional keyword parameters + Any further parameters are passed directly to the distance function. + If using a scipy.spatial.distance metric, the parameters are still + metric dependent. See the scipy docs for usage examples. + + Returns + ------- + silhouette : float + Mean Silhouette Coefficient for all samples. + + References + ---------- + + .. [1] `Peter J. Rousseeuw (1987). "Silhouettes: a Graphical Aid to the + Interpretation and Validation of Cluster Analysis". Computational + and Applied Mathematics 20: 53-65. + `_ + + .. [2] `Wikipedia entry on the Silhouette Coefficient + `_ + + Examples + -------- + >>> from sklearn.datasets import make_blobs + >>> from sklearn.cluster import KMeans + >>> from sklearn.metrics import silhouette_score + >>> X, y = make_blobs(random_state=42) + >>> kmeans = KMeans(n_clusters=2, random_state=42) + >>> silhouette_score(X, kmeans.fit_predict(X)) + 0.49... 
+ """ + if sample_size is not None: + X, labels = check_X_y(X, labels, accept_sparse=["csc", "csr"]) + random_state = check_random_state(random_state) + indices = random_state.permutation(X.shape[0])[:sample_size] + if metric == "precomputed": + X, labels = X[indices].T[indices].T, labels[indices] + else: + X, labels = X[indices], labels[indices] + return float(np.mean(silhouette_samples(X, labels, metric=metric, **kwds))) + + +def _silhouette_reduce(D_chunk, start, labels, label_freqs): + """Accumulate silhouette statistics for vertical chunk of X. + + Parameters + ---------- + D_chunk : {array-like, sparse matrix} of shape (n_chunk_samples, n_samples) + Precomputed distances for a chunk. If a sparse matrix is provided, + only CSR format is accepted. + start : int + First index in the chunk. + labels : array-like of shape (n_samples,) + Corresponding cluster labels, encoded as {0, ..., n_clusters-1}. + label_freqs : array-like + Distribution of cluster labels in ``labels``. + """ + n_chunk_samples = D_chunk.shape[0] + # accumulate distances from each sample to each cluster + cluster_distances = np.zeros( + (n_chunk_samples, len(label_freqs)), dtype=D_chunk.dtype + ) + + if issparse(D_chunk): + if D_chunk.format != "csr": + raise TypeError( + "Expected CSR matrix. Please pass sparse matrix in CSR format." + ) + for i in range(n_chunk_samples): + indptr = D_chunk.indptr + indices = D_chunk.indices[indptr[i] : indptr[i + 1]] + sample_weights = D_chunk.data[indptr[i] : indptr[i + 1]] + sample_labels = np.take(labels, indices) + cluster_distances[i] += np.bincount( + sample_labels, weights=sample_weights, minlength=len(label_freqs) + ) + else: + for i in range(n_chunk_samples): + sample_weights = D_chunk[i] + sample_labels = labels + cluster_distances[i] += np.bincount( + sample_labels, weights=sample_weights, minlength=len(label_freqs) + ) + + # intra_index selects intra-cluster distances within cluster_distances + end = start + n_chunk_samples + intra_index = (np.arange(n_chunk_samples), labels[start:end]) + # intra_cluster_distances are averaged over cluster size outside this function + intra_cluster_distances = cluster_distances[intra_index] + # of the remaining distances we normalise and extract the minimum + cluster_distances[intra_index] = np.inf + cluster_distances /= label_freqs + inter_cluster_distances = cluster_distances.min(axis=1) + return intra_cluster_distances, inter_cluster_distances + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "labels": ["array-like"], + "metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable], + }, + prefer_skip_nested_validation=True, +) +def silhouette_samples(X, labels, *, metric="euclidean", **kwds): + """Compute the Silhouette Coefficient for each sample. + + The Silhouette Coefficient is a measure of how well samples are clustered + with samples that are similar to themselves. Clustering models with a high + Silhouette Coefficient are said to be dense, where samples in the same + cluster are similar to each other, and well separated, where samples in + different clusters are not very similar to each other. + + The Silhouette Coefficient is calculated using the mean intra-cluster + distance (``a``) and the mean nearest-cluster distance (``b``) for each + sample. The Silhouette Coefficient for a sample is ``(b - a) / max(a, + b)``. + Note that Silhouette Coefficient is only defined if number of labels + is 2 ``<= n_labels <= n_samples - 1``. + + This function returns the Silhouette Coefficient for each sample. 
+ + The best value is 1 and the worst value is -1. Values near 0 indicate + overlapping clusters. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples_a, n_samples_a) if metric == \ + "precomputed" or (n_samples_a, n_features) otherwise + An array of pairwise distances between samples, or a feature array. If + a sparse matrix is provided, CSR format should be favoured avoiding + an additional copy. + + labels : array-like of shape (n_samples,) + Label values for each sample. + + metric : str or callable, default='euclidean' + The metric to use when calculating distance between instances in a + feature array. If metric is a string, it must be one of the options + allowed by :func:`~sklearn.metrics.pairwise_distances`. + If ``X`` is the distance array itself, use "precomputed" as the metric. + Precomputed distance matrices must have 0 along the diagonal. + + **kwds : optional keyword parameters + Any further parameters are passed directly to the distance function. + If using a ``scipy.spatial.distance`` metric, the parameters are still + metric dependent. See the scipy docs for usage examples. + + Returns + ------- + silhouette : array-like of shape (n_samples,) + Silhouette Coefficients for each sample. + + References + ---------- + + .. [1] `Peter J. Rousseeuw (1987). "Silhouettes: a Graphical Aid to the + Interpretation and Validation of Cluster Analysis". Computational + and Applied Mathematics 20: 53-65. + `_ + + .. [2] `Wikipedia entry on the Silhouette Coefficient + `_ + + Examples + -------- + >>> from sklearn.metrics import silhouette_samples + >>> from sklearn.datasets import make_blobs + >>> from sklearn.cluster import KMeans + >>> X, y = make_blobs(n_samples=50, random_state=42) + >>> kmeans = KMeans(n_clusters=3, random_state=42) + >>> labels = kmeans.fit_predict(X) + >>> silhouette_samples(X, labels) + array([...]) + """ + X, labels = check_X_y(X, labels, accept_sparse=["csr"]) + + # Check for non-zero diagonal entries in precomputed distance matrix + if metric == "precomputed": + error_msg = ValueError( + "The precomputed distance matrix contains non-zero " + "elements on the diagonal. Use np.fill_diagonal(X, 0)." 
+ ) + if X.dtype.kind == "f": + atol = _atol_for_type(X.dtype) + + if np.any(np.abs(X.diagonal()) > atol): + raise error_msg + elif np.any(X.diagonal() != 0): # integral dtype + raise error_msg + + le = LabelEncoder() + labels = le.fit_transform(labels) + n_samples = len(labels) + label_freqs = np.bincount(labels) + check_number_of_labels(len(le.classes_), n_samples) + + kwds["metric"] = metric + reduce_func = functools.partial( + _silhouette_reduce, labels=labels, label_freqs=label_freqs + ) + results = zip(*pairwise_distances_chunked(X, reduce_func=reduce_func, **kwds)) + intra_clust_dists, inter_clust_dists = results + intra_clust_dists = np.concatenate(intra_clust_dists) + inter_clust_dists = np.concatenate(inter_clust_dists) + + denom = (label_freqs - 1).take(labels, mode="clip") + with np.errstate(divide="ignore", invalid="ignore"): + intra_clust_dists /= denom + + sil_samples = inter_clust_dists - intra_clust_dists + with np.errstate(divide="ignore", invalid="ignore"): + sil_samples /= np.maximum(intra_clust_dists, inter_clust_dists) + # nan values are for clusters of size 1, and should be 0 + return np.nan_to_num(sil_samples) + + +@validate_params( + { + "X": ["array-like"], + "labels": ["array-like"], + }, + prefer_skip_nested_validation=True, +) +def calinski_harabasz_score(X, labels): + """Compute the Calinski and Harabasz score. + + It is also known as the Variance Ratio Criterion. + + The score is defined as ratio of the sum of between-cluster dispersion and + of within-cluster dispersion. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + A list of ``n_features``-dimensional data points. Each row corresponds + to a single data point. + + labels : array-like of shape (n_samples,) + Predicted labels for each sample. + + Returns + ------- + score : float + The resulting Calinski-Harabasz score. + + References + ---------- + .. [1] `T. Calinski and J. Harabasz, 1974. "A dendrite method for cluster + analysis". Communications in Statistics + `_ + + Examples + -------- + >>> from sklearn.datasets import make_blobs + >>> from sklearn.cluster import KMeans + >>> from sklearn.metrics import calinski_harabasz_score + >>> X, _ = make_blobs(random_state=0) + >>> kmeans = KMeans(n_clusters=3, random_state=0,).fit(X) + >>> calinski_harabasz_score(X, kmeans.labels_) + 114.8... + """ + X, labels = check_X_y(X, labels) + le = LabelEncoder() + labels = le.fit_transform(labels) + + n_samples, _ = X.shape + n_labels = len(le.classes_) + + check_number_of_labels(n_labels, n_samples) + + extra_disp, intra_disp = 0.0, 0.0 + mean = np.mean(X, axis=0) + for k in range(n_labels): + cluster_k = X[labels == k] + mean_k = np.mean(cluster_k, axis=0) + extra_disp += len(cluster_k) * np.sum((mean_k - mean) ** 2) + intra_disp += np.sum((cluster_k - mean_k) ** 2) + + return float( + 1.0 + if intra_disp == 0.0 + else extra_disp * (n_samples - n_labels) / (intra_disp * (n_labels - 1.0)) + ) + + +@validate_params( + { + "X": ["array-like"], + "labels": ["array-like"], + }, + prefer_skip_nested_validation=True, +) +def davies_bouldin_score(X, labels): + """Compute the Davies-Bouldin score. + + The score is defined as the average similarity measure of each cluster with + its most similar cluster, where similarity is the ratio of within-cluster + distances to between-cluster distances. Thus, clusters which are farther + apart and less dispersed will result in a better score. 
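+
+    As a rough editorial check (not upstream text), for the toy data used in
+    the Examples section below the single cluster pair gives a ratio of about
+    0.128, matching the reported score::
+
+      >>> # cluster 0 = {(0, 1), (1, 1)}: mean distance to its centroid (0.5, 1) is 0.5
+      >>> # cluster 1 = {(3, 4)}: zero dispersion; centroid distance is sqrt(2.5**2 + 3**2)
+      >>> round((0.5 + 0.0) / (2.5**2 + 3**2) ** 0.5, 3)
+      0.128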
+ + The minimum score is zero, with lower values indicating better clustering. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.20 + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + A list of ``n_features``-dimensional data points. Each row corresponds + to a single data point. + + labels : array-like of shape (n_samples,) + Predicted labels for each sample. + + Returns + ------- + score: float + The resulting Davies-Bouldin score. + + References + ---------- + .. [1] Davies, David L.; Bouldin, Donald W. (1979). + `"A Cluster Separation Measure" + `__. + IEEE Transactions on Pattern Analysis and Machine Intelligence. + PAMI-1 (2): 224-227 + + Examples + -------- + >>> from sklearn.metrics import davies_bouldin_score + >>> X = [[0, 1], [1, 1], [3, 4]] + >>> labels = [0, 0, 1] + >>> davies_bouldin_score(X, labels) + 0.12... + """ + X, labels = check_X_y(X, labels) + le = LabelEncoder() + labels = le.fit_transform(labels) + n_samples, _ = X.shape + n_labels = len(le.classes_) + check_number_of_labels(n_labels, n_samples) + + intra_dists = np.zeros(n_labels) + centroids = np.zeros((n_labels, len(X[0])), dtype=float) + for k in range(n_labels): + cluster_k = _safe_indexing(X, labels == k) + centroid = cluster_k.mean(axis=0) + centroids[k] = centroid + intra_dists[k] = np.average(pairwise_distances(cluster_k, [centroid])) + + centroid_distances = pairwise_distances(centroids) + + if np.allclose(intra_dists, 0) or np.allclose(centroid_distances, 0): + return 0.0 + + centroid_distances[centroid_distances == 0] = np.inf + combined_intra_dists = intra_dists[:, None] + intra_dists + scores = np.max(combined_intra_dists / centroid_distances, axis=1) + return float(np.mean(scores)) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/meson.build b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..5f25296c7540f289dc74eba4a97ddac5fad9af90 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/meson.build @@ -0,0 +1,6 @@ +py.extension_module( + '_expected_mutual_info_fast', + cython_gen.process('_expected_mutual_info_fast.pyx'), + subdir: 'sklearn/metrics/cluster', + install: true +) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/tests/test_bicluster.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/tests/test_bicluster.py new file mode 100644 index 0000000000000000000000000000000000000000..53f7805100a1313709d1d8868d45071b3066f836 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/tests/test_bicluster.py @@ -0,0 +1,56 @@ +"""Testing for bicluster metrics module""" + +import numpy as np + +from sklearn.metrics import consensus_score +from sklearn.metrics.cluster._bicluster import _jaccard +from sklearn.utils._testing import assert_almost_equal + + +def test_jaccard(): + a1 = np.array([True, True, False, False]) + a2 = np.array([True, True, True, True]) + a3 = np.array([False, True, True, False]) + a4 = np.array([False, False, True, True]) + + assert _jaccard(a1, a1, a1, a1) == 1 + assert _jaccard(a1, a1, a2, a2) == 0.25 + assert _jaccard(a1, a1, a3, a3) == 1.0 / 7 + assert _jaccard(a1, a1, 
a4, a4) == 0 + + +def test_consensus_score(): + a = [[True, True, False, False], [False, False, True, True]] + b = a[::-1] + + assert consensus_score((a, a), (a, a)) == 1 + assert consensus_score((a, a), (b, b)) == 1 + assert consensus_score((a, b), (a, b)) == 1 + assert consensus_score((a, b), (b, a)) == 1 + + assert consensus_score((a, a), (b, a)) == 0 + assert consensus_score((a, a), (a, b)) == 0 + assert consensus_score((b, b), (a, b)) == 0 + assert consensus_score((b, b), (b, a)) == 0 + + +def test_consensus_score_issue2445(): + """Different number of biclusters in A and B""" + a_rows = np.array( + [ + [True, True, False, False], + [False, False, True, True], + [False, False, False, True], + ] + ) + a_cols = np.array( + [ + [True, True, False, False], + [False, False, True, True], + [False, False, False, True], + ] + ) + idx = [0, 2] + s = consensus_score((a_rows, a_cols), (a_rows[idx], a_cols[idx])) + # B contains 2 of the 3 biclusters in A, so score should be 2/3 + assert_almost_equal(s, 2.0 / 3.0) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/tests/test_common.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/tests/test_common.py new file mode 100644 index 0000000000000000000000000000000000000000..a73670fbffce40eabaca55fc177648938cdccb26 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/tests/test_common.py @@ -0,0 +1,234 @@ +from functools import partial +from itertools import chain + +import numpy as np +import pytest + +from sklearn.metrics.cluster import ( + adjusted_mutual_info_score, + adjusted_rand_score, + calinski_harabasz_score, + completeness_score, + davies_bouldin_score, + fowlkes_mallows_score, + homogeneity_score, + mutual_info_score, + normalized_mutual_info_score, + rand_score, + silhouette_score, + v_measure_score, +) +from sklearn.utils._testing import assert_allclose + +# Dictionaries of metrics +# ------------------------ +# The goal of having those dictionaries is to have an easy way to call a +# particular metric and associate a name to each function: +# - SUPERVISED_METRICS: all supervised cluster metrics - (when given a +# ground truth value) +# - UNSUPERVISED_METRICS: all unsupervised cluster metrics +# +# Those dictionaries will be used to test systematically some invariance +# properties, e.g. invariance toward several input layout. +# + +SUPERVISED_METRICS = { + "adjusted_mutual_info_score": adjusted_mutual_info_score, + "adjusted_rand_score": adjusted_rand_score, + "rand_score": rand_score, + "completeness_score": completeness_score, + "homogeneity_score": homogeneity_score, + "mutual_info_score": mutual_info_score, + "normalized_mutual_info_score": normalized_mutual_info_score, + "v_measure_score": v_measure_score, + "fowlkes_mallows_score": fowlkes_mallows_score, +} + +UNSUPERVISED_METRICS = { + "silhouette_score": silhouette_score, + "silhouette_manhattan": partial(silhouette_score, metric="manhattan"), + "calinski_harabasz_score": calinski_harabasz_score, + "davies_bouldin_score": davies_bouldin_score, +} + +# Lists of metrics with common properties +# --------------------------------------- +# Lists of metrics with common properties are used to test systematically some +# functionalities and invariance, e.g. SYMMETRIC_METRICS lists all metrics +# that are symmetric with respect to their input argument y_true and y_pred. +# +# -------------------------------------------------------------------- +# Symmetric with respect to their input arguments y_true and y_pred. 
+# Symmetric metrics only apply to supervised clusters. +SYMMETRIC_METRICS = [ + "adjusted_rand_score", + "rand_score", + "v_measure_score", + "mutual_info_score", + "adjusted_mutual_info_score", + "normalized_mutual_info_score", + "fowlkes_mallows_score", +] + +NON_SYMMETRIC_METRICS = ["homogeneity_score", "completeness_score"] + +# Metrics whose upper bound is 1 +NORMALIZED_METRICS = [ + "adjusted_rand_score", + "rand_score", + "homogeneity_score", + "completeness_score", + "v_measure_score", + "adjusted_mutual_info_score", + "fowlkes_mallows_score", + "normalized_mutual_info_score", +] + + +rng = np.random.RandomState(0) +y1 = rng.randint(3, size=30) +y2 = rng.randint(3, size=30) + + +def test_symmetric_non_symmetric_union(): + assert sorted(SYMMETRIC_METRICS + NON_SYMMETRIC_METRICS) == sorted( + SUPERVISED_METRICS + ) + + +@pytest.mark.parametrize( + "metric_name, y1, y2", [(name, y1, y2) for name in SYMMETRIC_METRICS] +) +def test_symmetry(metric_name, y1, y2): + metric = SUPERVISED_METRICS[metric_name] + assert metric(y1, y2) == pytest.approx(metric(y2, y1)) + + +@pytest.mark.parametrize( + "metric_name, y1, y2", [(name, y1, y2) for name in NON_SYMMETRIC_METRICS] +) +def test_non_symmetry(metric_name, y1, y2): + metric = SUPERVISED_METRICS[metric_name] + assert metric(y1, y2) != pytest.approx(metric(y2, y1)) + + +@pytest.mark.parametrize("metric_name", NORMALIZED_METRICS) +def test_normalized_output(metric_name): + upper_bound_1 = [0, 0, 0, 1, 1, 1] + upper_bound_2 = [0, 0, 0, 1, 1, 1] + metric = SUPERVISED_METRICS[metric_name] + assert metric([0, 0, 0, 1, 1], [0, 0, 0, 1, 2]) > 0.0 + assert metric([0, 0, 1, 1, 2], [0, 0, 1, 1, 1]) > 0.0 + assert metric([0, 0, 0, 1, 2], [0, 1, 1, 1, 1]) < 1.0 + assert metric([0, 0, 0, 1, 2], [0, 1, 1, 1, 1]) < 1.0 + assert metric(upper_bound_1, upper_bound_2) == pytest.approx(1.0) + + lower_bound_1 = [0, 0, 0, 0, 0, 0] + lower_bound_2 = [0, 1, 2, 3, 4, 5] + score = np.array( + [metric(lower_bound_1, lower_bound_2), metric(lower_bound_2, lower_bound_1)] + ) + assert not (score < 0).any() + + +@pytest.mark.parametrize("metric_name", chain(SUPERVISED_METRICS, UNSUPERVISED_METRICS)) +def test_permute_labels(metric_name): + # All clustering metrics do not change score due to permutations of labels + # that is when 0 and 1 exchanged. 
+ y_label = np.array([0, 0, 0, 1, 1, 0, 1]) + y_pred = np.array([1, 0, 1, 0, 1, 1, 0]) + if metric_name in SUPERVISED_METRICS: + metric = SUPERVISED_METRICS[metric_name] + score_1 = metric(y_pred, y_label) + assert_allclose(score_1, metric(1 - y_pred, y_label)) + assert_allclose(score_1, metric(1 - y_pred, 1 - y_label)) + assert_allclose(score_1, metric(y_pred, 1 - y_label)) + else: + metric = UNSUPERVISED_METRICS[metric_name] + X = np.random.randint(10, size=(7, 10)) + score_1 = metric(X, y_pred) + assert_allclose(score_1, metric(X, 1 - y_pred)) + + +@pytest.mark.parametrize("metric_name", chain(SUPERVISED_METRICS, UNSUPERVISED_METRICS)) +# For all clustering metrics Input parameters can be both +# in the form of arrays lists, positive, negative or string +def test_format_invariance(metric_name): + y_true = [0, 0, 0, 0, 1, 1, 1, 1] + y_pred = [0, 1, 2, 3, 4, 5, 6, 7] + + def generate_formats(y): + y = np.array(y) + yield y, "array of ints" + yield y.tolist(), "list of ints" + yield [str(x) + "-a" for x in y.tolist()], "list of strs" + yield ( + np.array([str(x) + "-a" for x in y.tolist()], dtype=object), + "array of strs", + ) + yield y - 1, "including negative ints" + yield y + 1, "strictly positive ints" + + if metric_name in SUPERVISED_METRICS: + metric = SUPERVISED_METRICS[metric_name] + score_1 = metric(y_true, y_pred) + y_true_gen = generate_formats(y_true) + y_pred_gen = generate_formats(y_pred) + for (y_true_fmt, fmt_name), (y_pred_fmt, _) in zip(y_true_gen, y_pred_gen): + assert score_1 == metric(y_true_fmt, y_pred_fmt) + else: + metric = UNSUPERVISED_METRICS[metric_name] + X = np.random.randint(10, size=(8, 10)) + score_1 = metric(X, y_true) + assert score_1 == metric(X.astype(float), y_true) + y_true_gen = generate_formats(y_true) + for y_true_fmt, fmt_name in y_true_gen: + assert score_1 == metric(X, y_true_fmt) + + +@pytest.mark.parametrize("metric", SUPERVISED_METRICS.values()) +def test_single_sample(metric): + # only the supervised metrics support single sample + for i, j in [(0, 0), (0, 1), (1, 0), (1, 1)]: + metric([i], [j]) + + +@pytest.mark.parametrize( + "metric_name, metric_func", dict(SUPERVISED_METRICS, **UNSUPERVISED_METRICS).items() +) +def test_inf_nan_input(metric_name, metric_func): + if metric_name in SUPERVISED_METRICS: + invalids = [ + ([0, 1], [np.inf, np.inf]), + ([0, 1], [np.nan, np.nan]), + ([0, 1], [np.nan, np.inf]), + ] + else: + X = np.random.randint(10, size=(2, 10)) + invalids = [(X, [np.inf, np.inf]), (X, [np.nan, np.nan]), (X, [np.nan, np.inf])] + with pytest.raises(ValueError, match=r"contains (NaN|infinity)"): + for args in invalids: + metric_func(*args) + + +@pytest.mark.parametrize("name", chain(SUPERVISED_METRICS, UNSUPERVISED_METRICS)) +def test_returned_value_consistency(name): + """Ensure that the returned values of all metrics are consistent. + + It can only be a float. It should not be a numpy float64 or float32. 
+ """ + + rng = np.random.RandomState(0) + X = rng.randint(10, size=(20, 10)) + labels_true = rng.randint(0, 3, size=(20,)) + labels_pred = rng.randint(0, 3, size=(20,)) + + if name in SUPERVISED_METRICS: + metric = SUPERVISED_METRICS[name] + score = metric(labels_true, labels_pred) + else: + metric = UNSUPERVISED_METRICS[name] + score = metric(X, labels_pred) + + assert isinstance(score, float) + assert not isinstance(score, (np.float64, np.float32)) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/tests/test_supervised.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/tests/test_supervised.py new file mode 100644 index 0000000000000000000000000000000000000000..7421b726ebe677a6845167b3b268614891b38013 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/tests/test_supervised.py @@ -0,0 +1,522 @@ +import warnings + +import numpy as np +import pytest +from numpy.testing import assert_allclose, assert_array_almost_equal, assert_array_equal + +from sklearn.base import config_context +from sklearn.metrics.cluster import ( + adjusted_mutual_info_score, + adjusted_rand_score, + completeness_score, + contingency_matrix, + entropy, + expected_mutual_information, + fowlkes_mallows_score, + homogeneity_completeness_v_measure, + homogeneity_score, + mutual_info_score, + normalized_mutual_info_score, + pair_confusion_matrix, + rand_score, + v_measure_score, +) +from sklearn.metrics.cluster._supervised import _generalized_average, check_clusterings +from sklearn.utils import assert_all_finite +from sklearn.utils._array_api import ( + _get_namespace_device_dtype_ids, + yield_namespace_device_dtype_combinations, +) +from sklearn.utils._testing import _array_api_for_tests, assert_almost_equal + +score_funcs = [ + adjusted_rand_score, + rand_score, + homogeneity_score, + completeness_score, + v_measure_score, + adjusted_mutual_info_score, + normalized_mutual_info_score, +] + + +@pytest.mark.parametrize("score_func", score_funcs) +def test_error_messages_on_wrong_input(score_func): + expected = r"Found input variables with inconsistent numbers of samples: \[2, 3\]" + with pytest.raises(ValueError, match=expected): + score_func([0, 1], [1, 1, 1]) + + expected = r"labels_true must be 1D: shape is \(2" + with pytest.raises(ValueError, match=expected): + score_func([[0, 1], [1, 0]], [1, 1, 1]) + + expected = r"labels_pred must be 1D: shape is \(2" + with pytest.raises(ValueError, match=expected): + score_func([0, 1, 0], [[1, 1], [0, 0]]) + + +def test_generalized_average(): + a, b = 1, 2 + methods = ["min", "geometric", "arithmetic", "max"] + means = [_generalized_average(a, b, method) for method in methods] + assert means[0] <= means[1] <= means[2] <= means[3] + c, d = 12, 12 + means = [_generalized_average(c, d, method) for method in methods] + assert means[0] == means[1] == means[2] == means[3] + + +@pytest.mark.parametrize("score_func", score_funcs) +def test_perfect_matches(score_func): + assert score_func([], []) == pytest.approx(1.0) + assert score_func([0], [1]) == pytest.approx(1.0) + assert score_func([0, 0, 0], [0, 0, 0]) == pytest.approx(1.0) + assert score_func([0, 1, 0], [42, 7, 42]) == pytest.approx(1.0) + assert score_func([0.0, 1.0, 0.0], [42.0, 7.0, 42.0]) == pytest.approx(1.0) + assert score_func([0.0, 1.0, 2.0], [42.0, 7.0, 2.0]) == pytest.approx(1.0) + assert score_func([0, 1, 2], [42, 7, 2]) == pytest.approx(1.0) + + +@pytest.mark.parametrize( + "score_func", + [ + normalized_mutual_info_score, + adjusted_mutual_info_score, + 
], +) +@pytest.mark.parametrize("average_method", ["min", "geometric", "arithmetic", "max"]) +def test_perfect_matches_with_changing_means(score_func, average_method): + assert score_func([], [], average_method=average_method) == pytest.approx(1.0) + assert score_func([0], [1], average_method=average_method) == pytest.approx(1.0) + assert score_func( + [0, 0, 0], [0, 0, 0], average_method=average_method + ) == pytest.approx(1.0) + assert score_func( + [0, 1, 0], [42, 7, 42], average_method=average_method + ) == pytest.approx(1.0) + assert score_func( + [0.0, 1.0, 0.0], [42.0, 7.0, 42.0], average_method=average_method + ) == pytest.approx(1.0) + assert score_func( + [0.0, 1.0, 2.0], [42.0, 7.0, 2.0], average_method=average_method + ) == pytest.approx(1.0) + assert score_func( + [0, 1, 2], [42, 7, 2], average_method=average_method + ) == pytest.approx(1.0) + # Non-regression tests for: https://github.com/scikit-learn/scikit-learn/issues/30950 + assert score_func([0, 1], [0, 1], average_method=average_method) == pytest.approx( + 1.0 + ) + assert score_func( + [0, 1, 2, 3], [0, 1, 2, 3], average_method=average_method + ) == pytest.approx(1.0) + + +def test_homogeneous_but_not_complete_labeling(): + # homogeneous but not complete clustering + h, c, v = homogeneity_completeness_v_measure([0, 0, 0, 1, 1, 1], [0, 0, 0, 1, 2, 2]) + assert_almost_equal(h, 1.00, 2) + assert_almost_equal(c, 0.69, 2) + assert_almost_equal(v, 0.81, 2) + + +def test_complete_but_not_homogeneous_labeling(): + # complete but not homogeneous clustering + h, c, v = homogeneity_completeness_v_measure([0, 0, 1, 1, 2, 2], [0, 0, 1, 1, 1, 1]) + assert_almost_equal(h, 0.58, 2) + assert_almost_equal(c, 1.00, 2) + assert_almost_equal(v, 0.73, 2) + + +def test_not_complete_and_not_homogeneous_labeling(): + # neither complete nor homogeneous but not so bad either + h, c, v = homogeneity_completeness_v_measure([0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2]) + assert_almost_equal(h, 0.67, 2) + assert_almost_equal(c, 0.42, 2) + assert_almost_equal(v, 0.52, 2) + + +def test_beta_parameter(): + # test for when beta passed to + # homogeneity_completeness_v_measure + # and v_measure_score + beta_test = 0.2 + h_test = 0.67 + c_test = 0.42 + v_test = (1 + beta_test) * h_test * c_test / (beta_test * h_test + c_test) + + h, c, v = homogeneity_completeness_v_measure( + [0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2], beta=beta_test + ) + assert_almost_equal(h, h_test, 2) + assert_almost_equal(c, c_test, 2) + assert_almost_equal(v, v_test, 2) + + v = v_measure_score([0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2], beta=beta_test) + assert_almost_equal(v, v_test, 2) + + +def test_non_consecutive_labels(): + # regression tests for labels with gaps + h, c, v = homogeneity_completeness_v_measure([0, 0, 0, 2, 2, 2], [0, 1, 0, 1, 2, 2]) + assert_almost_equal(h, 0.67, 2) + assert_almost_equal(c, 0.42, 2) + assert_almost_equal(v, 0.52, 2) + + h, c, v = homogeneity_completeness_v_measure([0, 0, 0, 1, 1, 1], [0, 4, 0, 4, 2, 2]) + assert_almost_equal(h, 0.67, 2) + assert_almost_equal(c, 0.42, 2) + assert_almost_equal(v, 0.52, 2) + + ari_1 = adjusted_rand_score([0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2]) + ari_2 = adjusted_rand_score([0, 0, 0, 1, 1, 1], [0, 4, 0, 4, 2, 2]) + assert_almost_equal(ari_1, 0.24, 2) + assert_almost_equal(ari_2, 0.24, 2) + + ri_1 = rand_score([0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2]) + ri_2 = rand_score([0, 0, 0, 1, 1, 1], [0, 4, 0, 4, 2, 2]) + assert_almost_equal(ri_1, 0.66, 2) + assert_almost_equal(ri_2, 0.66, 2) + + +def uniform_labelings_scores(score_func, 
n_samples, k_range, n_runs=10, seed=42): + # Compute score for random uniform cluster labelings + random_labels = np.random.RandomState(seed).randint + scores = np.zeros((len(k_range), n_runs)) + for i, k in enumerate(k_range): + for j in range(n_runs): + labels_a = random_labels(low=0, high=k, size=n_samples) + labels_b = random_labels(low=0, high=k, size=n_samples) + scores[i, j] = score_func(labels_a, labels_b) + return scores + + +def test_adjustment_for_chance(): + # Check that adjusted scores are almost zero on random labels + n_clusters_range = [2, 10, 50, 90] + n_samples = 100 + n_runs = 10 + + scores = uniform_labelings_scores( + adjusted_rand_score, n_samples, n_clusters_range, n_runs + ) + + max_abs_scores = np.abs(scores).max(axis=1) + assert_array_almost_equal(max_abs_scores, [0.02, 0.03, 0.03, 0.02], 2) + + +def test_adjusted_mutual_info_score(): + # Compute the Adjusted Mutual Information and test against known values + labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3]) + labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2]) + # Mutual information + mi = mutual_info_score(labels_a, labels_b) + assert_almost_equal(mi, 0.41022, 5) + # with provided sparse contingency + C = contingency_matrix(labels_a, labels_b, sparse=True) + mi = mutual_info_score(labels_a, labels_b, contingency=C) + assert_almost_equal(mi, 0.41022, 5) + # with provided dense contingency + C = contingency_matrix(labels_a, labels_b) + mi = mutual_info_score(labels_a, labels_b, contingency=C) + assert_almost_equal(mi, 0.41022, 5) + # Expected mutual information + n_samples = C.sum() + emi = expected_mutual_information(C, n_samples) + assert_almost_equal(emi, 0.15042, 5) + # Adjusted mutual information + ami = adjusted_mutual_info_score(labels_a, labels_b) + assert_almost_equal(ami, 0.27821, 5) + ami = adjusted_mutual_info_score([1, 1, 2, 2], [2, 2, 3, 3]) + assert ami == pytest.approx(1.0) + # Test with a very large array + a110 = np.array([list(labels_a) * 110]).flatten() + b110 = np.array([list(labels_b) * 110]).flatten() + ami = adjusted_mutual_info_score(a110, b110) + assert_almost_equal(ami, 0.38, 2) + + +def test_expected_mutual_info_overflow(): + # Test for regression where contingency cell exceeds 2**16 + # leading to overflow in np.outer, resulting in EMI > 1 + assert expected_mutual_information(np.array([[70000]]), 70000) <= 1 + + +def test_int_overflow_mutual_info_fowlkes_mallows_score(): + # Test overflow in mutual_info_classif and fowlkes_mallows_score + x = np.array( + [1] * (52632 + 2529) + + [2] * (14660 + 793) + + [3] * (3271 + 204) + + [4] * (814 + 39) + + [5] * (316 + 20) + ) + y = np.array( + [0] * 52632 + + [1] * 2529 + + [0] * 14660 + + [1] * 793 + + [0] * 3271 + + [1] * 204 + + [0] * 814 + + [1] * 39 + + [0] * 316 + + [1] * 20 + ) + + assert_all_finite(mutual_info_score(x, y)) + assert_all_finite(fowlkes_mallows_score(x, y)) + + +def test_entropy(): + assert_almost_equal(entropy([0, 0, 42.0]), 0.6365141, 5) + assert_almost_equal(entropy([]), 1) + assert entropy([1, 1, 1, 1]) == 0 + + +@pytest.mark.parametrize( + "array_namespace, device, dtype_name", + yield_namespace_device_dtype_combinations(), + ids=_get_namespace_device_dtype_ids, +) +def test_entropy_array_api(array_namespace, device, dtype_name): + xp = _array_api_for_tests(array_namespace, device) + float_labels = xp.asarray(np.asarray([0, 0, 42.0], dtype=dtype_name), device=device) + empty_int32_labels = xp.asarray([], dtype=xp.int32, device=device) + int_labels = xp.asarray([1, 1, 1, 1], 
device=device) + with config_context(array_api_dispatch=True): + assert entropy(float_labels) == pytest.approx(0.6365141, abs=1e-5) + assert entropy(empty_int32_labels) == 1 + assert entropy(int_labels) == 0 + + +def test_contingency_matrix(): + labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3]) + labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2]) + C = contingency_matrix(labels_a, labels_b) + C2 = np.histogram2d(labels_a, labels_b, bins=(np.arange(1, 5), np.arange(1, 5)))[0] + assert_array_almost_equal(C, C2) + C = contingency_matrix(labels_a, labels_b, eps=0.1) + assert_array_almost_equal(C, C2 + 0.1) + + +def test_contingency_matrix_sparse(): + labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3]) + labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2]) + C = contingency_matrix(labels_a, labels_b) + C_sparse = contingency_matrix(labels_a, labels_b, sparse=True).toarray() + assert_array_almost_equal(C, C_sparse) + with pytest.raises(ValueError, match="Cannot set 'eps' when sparse=True"): + contingency_matrix(labels_a, labels_b, eps=1e-10, sparse=True) + + +def test_exactly_zero_info_score(): + # Check numerical stability when information is exactly zero + for i in np.logspace(1, 4, 4).astype(int): + labels_a, labels_b = (np.ones(i, dtype=int), np.arange(i, dtype=int)) + assert normalized_mutual_info_score(labels_a, labels_b) == pytest.approx(0.0) + assert v_measure_score(labels_a, labels_b) == pytest.approx(0.0) + assert adjusted_mutual_info_score(labels_a, labels_b) == 0.0 + assert normalized_mutual_info_score(labels_a, labels_b) == pytest.approx(0.0) + for method in ["min", "geometric", "arithmetic", "max"]: + assert ( + adjusted_mutual_info_score(labels_a, labels_b, average_method=method) + == 0.0 + ) + assert normalized_mutual_info_score( + labels_a, labels_b, average_method=method + ) == pytest.approx(0.0) + + +def test_v_measure_and_mutual_information(seed=36): + # Check relation between v_measure, entropy and mutual information + for i in np.logspace(1, 4, 4).astype(int): + random_state = np.random.RandomState(seed) + labels_a, labels_b = ( + random_state.randint(0, 10, i), + random_state.randint(0, 10, i), + ) + assert_almost_equal( + v_measure_score(labels_a, labels_b), + 2.0 + * mutual_info_score(labels_a, labels_b) + / (entropy(labels_a) + entropy(labels_b)), + 0, + ) + avg = "arithmetic" + assert_almost_equal( + v_measure_score(labels_a, labels_b), + normalized_mutual_info_score(labels_a, labels_b, average_method=avg), + ) + + +def test_fowlkes_mallows_score(): + # General case + score = fowlkes_mallows_score([0, 0, 0, 1, 1, 1], [0, 0, 1, 1, 2, 2]) + assert_almost_equal(score, 4.0 / np.sqrt(12.0 * 6.0)) + + # Perfect match but where the label names changed + perfect_score = fowlkes_mallows_score([0, 0, 0, 1, 1, 1], [1, 1, 1, 0, 0, 0]) + assert_almost_equal(perfect_score, 1.0) + + # Worst case + worst_score = fowlkes_mallows_score([0, 0, 0, 0, 0, 0], [0, 1, 2, 3, 4, 5]) + assert_almost_equal(worst_score, 0.0) + + +def test_fowlkes_mallows_score_properties(): + # handcrafted example + labels_a = np.array([0, 0, 0, 1, 1, 2]) + labels_b = np.array([1, 1, 2, 2, 0, 0]) + expected = 1.0 / np.sqrt((1.0 + 3.0) * (1.0 + 2.0)) + # FMI = TP / sqrt((TP + FP) * (TP + FN)) + + score_original = fowlkes_mallows_score(labels_a, labels_b) + assert_almost_equal(score_original, expected) + + # symmetric property + score_symmetric = fowlkes_mallows_score(labels_b, labels_a) + assert_almost_equal(score_symmetric, 
expected) + + # permutation property + score_permuted = fowlkes_mallows_score((labels_a + 1) % 3, labels_b) + assert_almost_equal(score_permuted, expected) + + # symmetric and permutation(both together) + score_both = fowlkes_mallows_score(labels_b, (labels_a + 2) % 3) + assert_almost_equal(score_both, expected) + + +@pytest.mark.parametrize( + "labels_true, labels_pred", + [ + (["a"] * 6, [1, 1, 0, 0, 1, 1]), + ([1] * 6, [1, 1, 0, 0, 1, 1]), + ([1, 1, 0, 0, 1, 1], ["a"] * 6), + ([1, 1, 0, 0, 1, 1], [1] * 6), + (["a"] * 6, ["a"] * 6), + ], +) +def test_mutual_info_score_positive_constant_label(labels_true, labels_pred): + # Check that MI = 0 when one or both labelling are constant + # non-regression test for #16355 + assert mutual_info_score(labels_true, labels_pred) == 0 + + +def test_check_clustering_error(): + # Test warning message for continuous values + rng = np.random.RandomState(42) + noise = rng.rand(500) + wavelength = np.linspace(0.01, 1, 500) * 1e-6 + msg = ( + "Clustering metrics expects discrete values but received " + "continuous values for label, and continuous values for " + "target" + ) + + with pytest.warns(UserWarning, match=msg): + check_clusterings(wavelength, noise) + + +def test_pair_confusion_matrix_fully_dispersed(): + # edge case: every element is its own cluster + N = 100 + clustering1 = list(range(N)) + clustering2 = clustering1 + expected = np.array([[N * (N - 1), 0], [0, 0]]) + assert_array_equal(pair_confusion_matrix(clustering1, clustering2), expected) + + +def test_pair_confusion_matrix_single_cluster(): + # edge case: only one cluster + N = 100 + clustering1 = np.zeros((N,)) + clustering2 = clustering1 + expected = np.array([[0, 0], [0, N * (N - 1)]]) + assert_array_equal(pair_confusion_matrix(clustering1, clustering2), expected) + + +def test_pair_confusion_matrix(): + # regular case: different non-trivial clusterings + n = 10 + N = n**2 + clustering1 = np.hstack([[i + 1] * n for i in range(n)]) + clustering2 = np.hstack([[i + 1] * (n + 1) for i in range(n)])[:N] + # basic quadratic implementation + expected = np.zeros(shape=(2, 2), dtype=np.int64) + for i in range(len(clustering1)): + for j in range(len(clustering2)): + if i != j: + same_cluster_1 = int(clustering1[i] == clustering1[j]) + same_cluster_2 = int(clustering2[i] == clustering2[j]) + expected[same_cluster_1, same_cluster_2] += 1 + assert_array_equal(pair_confusion_matrix(clustering1, clustering2), expected) + + +@pytest.mark.parametrize( + "clustering1, clustering2", + [(list(range(100)), list(range(100))), (np.zeros((100,)), np.zeros((100,)))], +) +def test_rand_score_edge_cases(clustering1, clustering2): + # edge case 1: every element is its own cluster + # edge case 2: only one cluster + assert_allclose(rand_score(clustering1, clustering2), 1.0) + + +def test_rand_score(): + # regular case: different non-trivial clusterings + clustering1 = [0, 0, 0, 1, 1, 1] + clustering2 = [0, 1, 0, 1, 2, 2] + # pair confusion matrix + D11 = 2 * 2 # ordered pairs (1, 3), (5, 6) + D10 = 2 * 4 # ordered pairs (1, 2), (2, 3), (4, 5), (4, 6) + D01 = 2 * 1 # ordered pair (2, 4) + D00 = 5 * 6 - D11 - D01 - D10 # the remaining pairs + # rand score + expected_numerator = D00 + D11 + expected_denominator = D00 + D01 + D10 + D11 + expected = expected_numerator / expected_denominator + assert_allclose(rand_score(clustering1, clustering2), expected) + + +def test_adjusted_rand_score_overflow(): + """Check that large amount of data will not lead to overflow in + `adjusted_rand_score`. 
+ Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/20305 + """ + rng = np.random.RandomState(0) + y_true = rng.randint(0, 2, 100_000, dtype=np.int8) + y_pred = rng.randint(0, 2, 100_000, dtype=np.int8) + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + adjusted_rand_score(y_true, y_pred) + + +@pytest.mark.parametrize("average_method", ["min", "arithmetic", "geometric", "max"]) +def test_normalized_mutual_info_score_bounded(average_method): + """Check that nmi returns a score between 0 (included) and 1 (excluded + for non-perfect match) + + Non-regression test for issue #13836 + """ + labels1 = [0] * 469 + labels2 = [1] + labels1[1:] + labels3 = [0, 1] + labels1[2:] + + # labels1 is constant. The mutual info between labels1 and any other labelling is 0. + nmi = normalized_mutual_info_score(labels1, labels2, average_method=average_method) + assert nmi == 0 + + # non constant, non perfect matching labels + nmi = normalized_mutual_info_score(labels2, labels3, average_method=average_method) + assert 0 <= nmi < 1 + + +# TODO(1.9): remove +@pytest.mark.parametrize("sparse", [True, False]) +def test_fowlkes_mallows_sparse_deprecated(sparse): + """Check deprecation warning for 'sparse' parameter of fowlkes_mallows_score.""" + with pytest.warns( + FutureWarning, match="The 'sparse' parameter was deprecated in 1.7" + ): + fowlkes_mallows_score([0, 1], [1, 1], sparse=sparse) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/tests/test_unsupervised.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/tests/test_unsupervised.py new file mode 100644 index 0000000000000000000000000000000000000000..a0420bbd406ec873022ee3a6e511c51fafd82f11 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/tests/test_unsupervised.py @@ -0,0 +1,413 @@ +import warnings + +import numpy as np +import pytest +from numpy.testing import assert_allclose +from scipy.sparse import issparse + +from sklearn import datasets +from sklearn.metrics import pairwise_distances +from sklearn.metrics.cluster import ( + calinski_harabasz_score, + davies_bouldin_score, + silhouette_samples, + silhouette_score, +) +from sklearn.metrics.cluster._unsupervised import _silhouette_reduce +from sklearn.utils._testing import assert_array_equal +from sklearn.utils.fixes import ( + CSC_CONTAINERS, + CSR_CONTAINERS, + DOK_CONTAINERS, + LIL_CONTAINERS, +) + + +@pytest.mark.parametrize( + "sparse_container", + [None] + CSR_CONTAINERS + CSC_CONTAINERS + DOK_CONTAINERS + LIL_CONTAINERS, +) +@pytest.mark.parametrize("sample_size", [None, "half"]) +def test_silhouette(sparse_container, sample_size): + # Tests the Silhouette Coefficient. + dataset = datasets.load_iris() + X, y = dataset.data, dataset.target + if sparse_container is not None: + X = sparse_container(X) + sample_size = int(X.shape[0] / 2) if sample_size == "half" else sample_size + + D = pairwise_distances(X, metric="euclidean") + # Given that the actual labels are used, we can assume that S would be positive. + score_precomputed = silhouette_score( + D, y, metric="precomputed", sample_size=sample_size, random_state=0 + ) + score_euclidean = silhouette_score( + X, y, metric="euclidean", sample_size=sample_size, random_state=0 + ) + assert score_precomputed > 0 + assert score_euclidean > 0 + assert score_precomputed == pytest.approx(score_euclidean) + + +def test_cluster_size_1(): + # Assert Silhouette Coefficient == 0 when there is 1 sample in a cluster + # (cluster 0). 
We also test the case where there are identical samples + # as the only members of a cluster (cluster 2). To our knowledge, this case + # is not discussed in reference material, and we choose for it a sample + # score of 1. + X = [[0.0], [1.0], [1.0], [2.0], [3.0], [3.0]] + labels = np.array([0, 1, 1, 1, 2, 2]) + + # Cluster 0: 1 sample -> score of 0 by Rousseeuw's convention + # Cluster 1: intra-cluster = [.5, .5, 1] + # inter-cluster = [1, 1, 1] + # silhouette = [.5, .5, 0] + # Cluster 2: intra-cluster = [0, 0] + # inter-cluster = [arbitrary, arbitrary] + # silhouette = [1., 1.] + + silhouette = silhouette_score(X, labels) + assert not np.isnan(silhouette) + ss = silhouette_samples(X, labels) + assert_array_equal(ss, [0, 0.5, 0.5, 0, 1, 1]) + + +def test_silhouette_paper_example(): + # Explicitly check per-sample results against Rousseeuw (1987) + # Data from Table 1 + lower = [ + 5.58, + 7.00, + 6.50, + 7.08, + 7.00, + 3.83, + 4.83, + 5.08, + 8.17, + 5.83, + 2.17, + 5.75, + 6.67, + 6.92, + 4.92, + 6.42, + 5.00, + 5.58, + 6.00, + 4.67, + 6.42, + 3.42, + 5.50, + 6.42, + 6.42, + 5.00, + 3.92, + 6.17, + 2.50, + 4.92, + 6.25, + 7.33, + 4.50, + 2.25, + 6.33, + 2.75, + 6.08, + 6.67, + 4.25, + 2.67, + 6.00, + 6.17, + 6.17, + 6.92, + 6.17, + 5.25, + 6.83, + 4.50, + 3.75, + 5.75, + 5.42, + 6.08, + 5.83, + 6.67, + 3.67, + 4.75, + 3.00, + 6.08, + 6.67, + 5.00, + 5.58, + 4.83, + 6.17, + 5.67, + 6.50, + 6.92, + ] + D = np.zeros((12, 12)) + D[np.tril_indices(12, -1)] = lower + D += D.T + + names = [ + "BEL", + "BRA", + "CHI", + "CUB", + "EGY", + "FRA", + "IND", + "ISR", + "USA", + "USS", + "YUG", + "ZAI", + ] + + # Data from Figure 2 + labels1 = [1, 1, 2, 2, 1, 1, 2, 1, 1, 2, 2, 1] + expected1 = { + "USA": 0.43, + "BEL": 0.39, + "FRA": 0.35, + "ISR": 0.30, + "BRA": 0.22, + "EGY": 0.20, + "ZAI": 0.19, + "CUB": 0.40, + "USS": 0.34, + "CHI": 0.33, + "YUG": 0.26, + "IND": -0.04, + } + score1 = 0.28 + + # Data from Figure 3 + labels2 = [1, 2, 3, 3, 1, 1, 2, 1, 1, 3, 3, 2] + expected2 = { + "USA": 0.47, + "FRA": 0.44, + "BEL": 0.42, + "ISR": 0.37, + "EGY": 0.02, + "ZAI": 0.28, + "BRA": 0.25, + "IND": 0.17, + "CUB": 0.48, + "USS": 0.44, + "YUG": 0.31, + "CHI": 0.31, + } + score2 = 0.33 + + for labels, expected, score in [ + (labels1, expected1, score1), + (labels2, expected2, score2), + ]: + expected = [expected[name] for name in names] + # we check to 2dp because that's what's in the paper + pytest.approx( + expected, + silhouette_samples(D, np.array(labels), metric="precomputed"), + abs=1e-2, + ) + pytest.approx( + score, silhouette_score(D, np.array(labels), metric="precomputed"), abs=1e-2 + ) + + +def test_correct_labelsize(): + # Assert 1 < n_labels < n_samples + dataset = datasets.load_iris() + X = dataset.data + + # n_labels = n_samples + y = np.arange(X.shape[0]) + err_msg = ( + r"Number of labels is %d\. Valid values are 2 " + r"to n_samples - 1 \(inclusive\)" % len(np.unique(y)) + ) + with pytest.raises(ValueError, match=err_msg): + silhouette_score(X, y) + + # n_labels = 1 + y = np.zeros(X.shape[0]) + err_msg = ( + r"Number of labels is %d\. 
Valid values are 2 " + r"to n_samples - 1 \(inclusive\)" % len(np.unique(y)) + ) + with pytest.raises(ValueError, match=err_msg): + silhouette_score(X, y) + + +def test_non_encoded_labels(): + dataset = datasets.load_iris() + X = dataset.data + labels = dataset.target + assert silhouette_score(X, labels * 2 + 10) == silhouette_score(X, labels) + assert_array_equal( + silhouette_samples(X, labels * 2 + 10), silhouette_samples(X, labels) + ) + + +def test_non_numpy_labels(): + dataset = datasets.load_iris() + X = dataset.data + y = dataset.target + assert silhouette_score(list(X), list(y)) == silhouette_score(X, y) + + +@pytest.mark.parametrize("dtype", (np.float32, np.float64)) +def test_silhouette_nonzero_diag(dtype): + # Make sure silhouette_samples requires diagonal to be zero. + # Non-regression test for #12178 + + # Construct a zero-diagonal matrix + dists = pairwise_distances( + np.array([[0.2, 0.1, 0.12, 1.34, 1.11, 1.6]], dtype=dtype).T + ) + labels = [0, 0, 0, 1, 1, 1] + + # small values on the diagonal are OK + dists[2][2] = np.finfo(dists.dtype).eps * 10 + silhouette_samples(dists, labels, metric="precomputed") + + # values bigger than eps * 100 are not + dists[2][2] = np.finfo(dists.dtype).eps * 1000 + with pytest.raises(ValueError, match="contains non-zero"): + silhouette_samples(dists, labels, metric="precomputed") + + +@pytest.mark.parametrize( + "sparse_container", + CSC_CONTAINERS + CSR_CONTAINERS + DOK_CONTAINERS + LIL_CONTAINERS, +) +def test_silhouette_samples_precomputed_sparse(sparse_container): + """Check that silhouette_samples works for sparse matrices correctly.""" + X = np.array([[0.2, 0.1, 0.1, 0.2, 0.1, 1.6, 0.2, 0.1]], dtype=np.float32).T + y = [0, 0, 0, 0, 1, 1, 1, 1] + pdist_dense = pairwise_distances(X) + pdist_sparse = sparse_container(pdist_dense) + assert issparse(pdist_sparse) + output_with_sparse_input = silhouette_samples(pdist_sparse, y, metric="precomputed") + output_with_dense_input = silhouette_samples(pdist_dense, y, metric="precomputed") + assert_allclose(output_with_sparse_input, output_with_dense_input) + + +@pytest.mark.parametrize( + "sparse_container", + CSC_CONTAINERS + CSR_CONTAINERS + DOK_CONTAINERS + LIL_CONTAINERS, +) +def test_silhouette_samples_euclidean_sparse(sparse_container): + """Check that silhouette_samples works for sparse matrices correctly.""" + X = np.array([[0.2, 0.1, 0.1, 0.2, 0.1, 1.6, 0.2, 0.1]], dtype=np.float32).T + y = [0, 0, 0, 0, 1, 1, 1, 1] + pdist_dense = pairwise_distances(X) + pdist_sparse = sparse_container(pdist_dense) + assert issparse(pdist_sparse) + output_with_sparse_input = silhouette_samples(pdist_sparse, y) + output_with_dense_input = silhouette_samples(pdist_dense, y) + assert_allclose(output_with_sparse_input, output_with_dense_input) + + +@pytest.mark.parametrize( + "sparse_container", CSC_CONTAINERS + DOK_CONTAINERS + LIL_CONTAINERS +) +def test_silhouette_reduce(sparse_container): + """Check for non-CSR input to private method `_silhouette_reduce`.""" + X = np.array([[0.2, 0.1, 0.1, 0.2, 0.1, 1.6, 0.2, 0.1]], dtype=np.float32).T + pdist_dense = pairwise_distances(X) + pdist_sparse = sparse_container(pdist_dense) + y = [0, 0, 0, 0, 1, 1, 1, 1] + label_freqs = np.bincount(y) + with pytest.raises( + TypeError, + match="Expected CSR matrix. 
Please pass sparse matrix in CSR format.", + ): + _silhouette_reduce(pdist_sparse, start=0, labels=y, label_freqs=label_freqs) + + +def assert_raises_on_only_one_label(func): + """Assert message when there is only one label""" + rng = np.random.RandomState(seed=0) + with pytest.raises(ValueError, match="Number of labels is"): + func(rng.rand(10, 2), np.zeros(10)) + + +def assert_raises_on_all_points_same_cluster(func): + """Assert message when all point are in different clusters""" + rng = np.random.RandomState(seed=0) + with pytest.raises(ValueError, match="Number of labels is"): + func(rng.rand(10, 2), np.arange(10)) + + +def test_calinski_harabasz_score(): + assert_raises_on_only_one_label(calinski_harabasz_score) + + assert_raises_on_all_points_same_cluster(calinski_harabasz_score) + + # Assert the value is 1. when all samples are equals + assert 1.0 == calinski_harabasz_score(np.ones((10, 2)), [0] * 5 + [1] * 5) + + # Assert the value is 0. when all the mean cluster are equal + assert 0.0 == calinski_harabasz_score([[-1, -1], [1, 1]] * 10, [0] * 10 + [1] * 10) + + # General case (with non numpy arrays) + X = ( + [[0, 0], [1, 1]] * 5 + + [[3, 3], [4, 4]] * 5 + + [[0, 4], [1, 3]] * 5 + + [[3, 1], [4, 0]] * 5 + ) + labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10 + pytest.approx(calinski_harabasz_score(X, labels), 45 * (40 - 4) / (5 * (4 - 1))) + + +def test_davies_bouldin_score(): + assert_raises_on_only_one_label(davies_bouldin_score) + assert_raises_on_all_points_same_cluster(davies_bouldin_score) + + # Assert the value is 0. when all samples are equals + assert davies_bouldin_score(np.ones((10, 2)), [0] * 5 + [1] * 5) == pytest.approx( + 0.0 + ) + + # Assert the value is 0. when all the mean cluster are equal + assert davies_bouldin_score( + [[-1, -1], [1, 1]] * 10, [0] * 10 + [1] * 10 + ) == pytest.approx(0.0) + + # General case (with non numpy arrays) + X = ( + [[0, 0], [1, 1]] * 5 + + [[3, 3], [4, 4]] * 5 + + [[0, 4], [1, 3]] * 5 + + [[3, 1], [4, 0]] * 5 + ) + labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10 + pytest.approx(davies_bouldin_score(X, labels), 2 * np.sqrt(0.5) / 3) + + # Ensure divide by zero warning is not raised in general case + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + davies_bouldin_score(X, labels) + + # General case - cluster have one sample + X = [[0, 0], [2, 2], [3, 3], [5, 5]] + labels = [0, 0, 1, 2] + pytest.approx(davies_bouldin_score(X, labels), (5.0 / 4) / 3) + + +def test_silhouette_score_integer_precomputed(): + """Check that silhouette_score works for precomputed metrics that are integers. + + Non-regression test for #22107. 
+ """ + result = silhouette_score( + [[0, 1, 2], [1, 0, 1], [2, 1, 0]], [0, 0, 1], metric="precomputed" + ) + assert result == pytest.approx(1 / 6) + + # non-zero on diagonal for ints raises an error + with pytest.raises(ValueError, match="contains non-zero"): + silhouette_score( + [[1, 1, 2], [1, 0, 1], [2, 1, 0]], [0, 0, 1], metric="precomputed" + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/meson.build b/.venv/lib/python3.12/site-packages/sklearn/metrics/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..f0f9894cc6f59a9500a1598c9c9a94d5d6f58429 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/meson.build @@ -0,0 +1,49 @@ +# Metrics is cimported from other subpackages so this is needed for the cimport +# to work +metrics_cython_tree = [ + fs.copyfile('__init__.py') +] +# Some metrics code cimports code from utils, we may as well copy all the necessary files +metrics_cython_tree += utils_cython_tree + +_dist_metrics_pxd = custom_target( + '_dist_metrics_pxd', + output: '_dist_metrics.pxd', + input: '_dist_metrics.pxd.tp', + command: [tempita, '@INPUT@', '-o', '@OUTDIR@'], + # Need to install the generated pxd because it is needed in other subpackages + # Cython code, e.g. sklearn.cluster + install_dir: sklearn_dir / 'metrics', + install: true, +) +metrics_cython_tree += [_dist_metrics_pxd] + +_dist_metrics_pyx = custom_target( + '_dist_metrics_pyx', + output: '_dist_metrics.pyx', + input: '_dist_metrics.pyx.tp', + command: [tempita, '@INPUT@', '-o', '@OUTDIR@'], + # TODO in principle this should go in py.exension_module below. This is + # temporary work-around for dependency issue with .pyx.tp files. For more + # details, see https://github.com/mesonbuild/meson/issues/13212 + depends: metrics_cython_tree, +) + +_dist_metrics = py.extension_module( + '_dist_metrics', + cython_gen.process(_dist_metrics_pyx), + dependencies: [np_dep], + subdir: 'sklearn/metrics', + install: true +) + +py.extension_module( + '_pairwise_fast', + [cython_gen.process('_pairwise_fast.pyx'), metrics_cython_tree], + dependencies: [openmp_dep], + subdir: 'sklearn/metrics', + install: true +) + +subdir('_pairwise_distances_reduction') +subdir('cluster') diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/pairwise.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/pairwise.py new file mode 100644 index 0000000000000000000000000000000000000000..050b58866c8ef589fba008c8444948b30e3416ed --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/pairwise.py @@ -0,0 +1,2675 @@ +"""Metrics for pairwise distances and affinity of sets of samples.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import itertools +import math +import warnings +from functools import partial +from numbers import Integral, Real + +import numpy as np +from joblib import effective_n_jobs +from scipy.sparse import csr_matrix, issparse +from scipy.spatial import distance + +from .. 
import config_context +from ..exceptions import DataConversionWarning +from ..preprocessing import normalize +from ..utils import check_array, gen_batches, gen_even_slices +from ..utils._array_api import ( + _fill_or_add_to_diagonal, + _find_matching_floating_dtype, + _is_numpy_namespace, + _max_precision_float_dtype, + _modify_in_place_if_numpy, + get_namespace, + get_namespace_and_device, +) +from ..utils._chunking import get_chunk_n_rows +from ..utils._mask import _get_mask +from ..utils._missing import is_scalar_nan +from ..utils._param_validation import ( + Hidden, + Interval, + MissingValues, + Options, + StrOptions, + validate_params, +) +from ..utils.deprecation import _deprecate_force_all_finite +from ..utils.extmath import row_norms, safe_sparse_dot +from ..utils.fixes import parse_version, sp_base_version +from ..utils.parallel import Parallel, delayed +from ..utils.validation import _num_samples, check_non_negative +from ._pairwise_distances_reduction import ArgKmin +from ._pairwise_fast import _chi2_kernel_fast, _sparse_manhattan + + +# Utility Functions +def _return_float_dtype(X, Y): + """ + 1. If dtype of X and Y is float32, then dtype float32 is returned. + 2. Else dtype float is returned. + """ + if not issparse(X) and not isinstance(X, np.ndarray): + X = np.asarray(X) + + if Y is None: + Y_dtype = X.dtype + elif not issparse(Y) and not isinstance(Y, np.ndarray): + Y = np.asarray(Y) + Y_dtype = Y.dtype + else: + Y_dtype = Y.dtype + + if X.dtype == Y_dtype == np.float32: + dtype = np.float32 + else: + dtype = float + + return X, Y, dtype + + +def check_pairwise_arrays( + X, + Y, + *, + precomputed=False, + dtype="infer_float", + accept_sparse="csr", + force_all_finite="deprecated", + ensure_all_finite=None, + ensure_2d=True, + copy=False, +): + """Set X and Y appropriately and checks inputs. + + If Y is None, it is set as a pointer to X (i.e. not a copy). + If Y is given, this does not happen. + All distance metrics should use this function first to assert that the + given parameters are correct and safe to use. + + Specifically, this function first ensures that both X and Y are arrays, + then checks that they are at least two dimensional while ensuring that + their elements are floats (or dtype if provided). Finally, the function + checks that the size of the second dimension of the two arrays is equal, or + the equivalent check for a precomputed distance matrix. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples_X, n_features) + + Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features) + + precomputed : bool, default=False + True if X is to be treated as precomputed distances to the samples in + Y. + + dtype : str, type, list of type or None default="infer_float" + Data type required for X and Y. If "infer_float", the dtype will be an + appropriate float type selected by _return_float_dtype. If None, the + dtype of the input is preserved. + + .. versionadded:: 0.18 + + accept_sparse : str, bool or list/tuple of str, default='csr' + String[s] representing allowed sparse matrix formats, such as 'csc', + 'csr', etc. If the input is sparse but not in the allowed format, + it will be converted to the first listed format. True allows the input + to be any format. False means that a sparse matrix input will + raise an error. + + force_all_finite : bool or 'allow-nan', default=True + Whether to raise an error on np.inf, np.nan, pd.NA in array. The + possibilities are: + + - True: Force all values of array to be finite. 
+ - False: accepts np.inf, np.nan, pd.NA in array. + - 'allow-nan': accepts only np.nan and pd.NA values in array. Values + cannot be infinite. + + .. versionadded:: 0.22 + ``force_all_finite`` accepts the string ``'allow-nan'``. + + .. versionchanged:: 0.23 + Accepts `pd.NA` and converts it into `np.nan`. + + .. deprecated:: 1.6 + `force_all_finite` was renamed to `ensure_all_finite` and will be removed + in 1.8. + + ensure_all_finite : bool or 'allow-nan', default=True + Whether to raise an error on np.inf, np.nan, pd.NA in array. The + possibilities are: + + - True: Force all values of array to be finite. + - False: accepts np.inf, np.nan, pd.NA in array. + - 'allow-nan': accepts only np.nan and pd.NA values in array. Values + cannot be infinite. + + .. versionadded:: 1.6 + `force_all_finite` was renamed to `ensure_all_finite`. + + ensure_2d : bool, default=True + Whether to raise an error when the input arrays are not 2-dimensional. Setting + this to `False` is necessary when using a custom metric with certain + non-numerical inputs (e.g. a list of strings). + + .. versionadded:: 1.5 + + copy : bool, default=False + Whether a forced copy will be triggered. If copy=False, a copy might + be triggered by a conversion. + + .. versionadded:: 0.22 + + Returns + ------- + safe_X : {array-like, sparse matrix} of shape (n_samples_X, n_features) + An array equal to X, guaranteed to be a numpy array. + + safe_Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features) + An array equal to Y if Y was not None, guaranteed to be a numpy array. + If Y was None, safe_Y will be a pointer to X. + """ + ensure_all_finite = _deprecate_force_all_finite(force_all_finite, ensure_all_finite) + + xp, _ = get_namespace(X, Y) + if any([issparse(X), issparse(Y)]) or _is_numpy_namespace(xp): + X, Y, dtype_float = _return_float_dtype(X, Y) + else: + dtype_float = _find_matching_floating_dtype(X, Y, xp=xp) + + estimator = "check_pairwise_arrays" + if dtype == "infer_float": + dtype = dtype_float + + if Y is X or Y is None: + X = Y = check_array( + X, + accept_sparse=accept_sparse, + dtype=dtype, + copy=copy, + ensure_all_finite=ensure_all_finite, + estimator=estimator, + ensure_2d=ensure_2d, + ) + else: + X = check_array( + X, + accept_sparse=accept_sparse, + dtype=dtype, + copy=copy, + ensure_all_finite=ensure_all_finite, + estimator=estimator, + ensure_2d=ensure_2d, + ) + Y = check_array( + Y, + accept_sparse=accept_sparse, + dtype=dtype, + copy=copy, + ensure_all_finite=ensure_all_finite, + estimator=estimator, + ensure_2d=ensure_2d, + ) + + if precomputed: + if X.shape[1] != Y.shape[0]: + raise ValueError( + "Precomputed metric requires shape " + "(n_queries, n_indexed). Got (%d, %d) " + "for %d indexed." % (X.shape[0], X.shape[1], Y.shape[0]) + ) + elif ensure_2d and X.shape[1] != Y.shape[1]: + # Only check the number of features if 2d arrays are enforced. Otherwise, + # validation is left to the user for custom metrics. + raise ValueError( + "Incompatible dimension for X and Y matrices: " + "X.shape[1] == %d while Y.shape[1] == %d" % (X.shape[1], Y.shape[1]) + ) + + return X, Y + + +def check_paired_arrays(X, Y): + """Set X and Y appropriately and checks inputs for paired distances. + + All paired distance metrics should use this function first to assert that + the given parameters are correct and safe to use. + + Specifically, this function first ensures that both X and Y are arrays, + then checks that they are at least two dimensional while ensuring that + their elements are floats. 
Finally, the function checks that the size + of the dimensions of the two arrays are equal. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples_X, n_features) + + Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features) + + Returns + ------- + safe_X : {array-like, sparse matrix} of shape (n_samples_X, n_features) + An array equal to X, guaranteed to be a numpy array. + + safe_Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features) + An array equal to Y if Y was not None, guaranteed to be a numpy array. + If Y was None, safe_Y will be a pointer to X. + """ + X, Y = check_pairwise_arrays(X, Y) + if X.shape != Y.shape: + raise ValueError( + "X and Y should be of same shape. They were respectively %r and %r long." + % (X.shape, Y.shape) + ) + return X, Y + + +# Pairwise distances +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "Y": ["array-like", "sparse matrix", None], + "Y_norm_squared": ["array-like", None], + "squared": ["boolean"], + "X_norm_squared": ["array-like", None], + }, + prefer_skip_nested_validation=True, +) +def euclidean_distances( + X, Y=None, *, Y_norm_squared=None, squared=False, X_norm_squared=None +): + """ + Compute the distance matrix between each pair from a feature array X and Y. + + For efficiency reasons, the euclidean distance between a pair of row + vector x and y is computed as:: + + dist(x, y) = sqrt(dot(x, x) - 2 * dot(x, y) + dot(y, y)) + + This formulation has two advantages over other ways of computing distances. + First, it is computationally efficient when dealing with sparse data. + Second, if one argument varies but the other remains unchanged, then + `dot(x, x)` and/or `dot(y, y)` can be pre-computed. + + However, this is not the most precise way of doing this computation, + because this equation potentially suffers from "catastrophic cancellation". + Also, the distance matrix returned by this function may not be exactly + symmetric as required by, e.g., :mod:`scipy.spatial.distance` functions. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples_X, n_features) + An array where each row is a sample and each column is a feature. + + Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), \ + default=None + An array where each row is a sample and each column is a feature. + If `None`, method uses `Y=X`. + + Y_norm_squared : array-like of shape (n_samples_Y,) or (n_samples_Y, 1) \ + or (1, n_samples_Y), default=None + Pre-computed dot-products of vectors in Y (e.g., + ``(Y**2).sum(axis=1)``) + May be ignored in some cases, see the note below. + + squared : bool, default=False + Return squared Euclidean distances. + + X_norm_squared : array-like of shape (n_samples_X,) or (n_samples_X, 1) \ + or (1, n_samples_X), default=None + Pre-computed dot-products of vectors in X (e.g., + ``(X**2).sum(axis=1)``) + May be ignored in some cases, see the note below. + + Returns + ------- + distances : ndarray of shape (n_samples_X, n_samples_Y) + Returns the distances between the row vectors of `X` + and the row vectors of `Y`. + + See Also + -------- + paired_distances : Distances between pairs of elements of X and Y. + + Notes + ----- + To achieve a better accuracy, `X_norm_squared` and `Y_norm_squared` may be + unused if they are passed as `np.float32`. 
+ + Examples + -------- + >>> from sklearn.metrics.pairwise import euclidean_distances + >>> X = [[0, 1], [1, 1]] + >>> # distance between rows of X + >>> euclidean_distances(X, X) + array([[0., 1.], + [1., 0.]]) + >>> # get distance to origin + >>> euclidean_distances(X, [[0, 0]]) + array([[1. ], + [1.41421356]]) + """ + xp, _ = get_namespace(X, Y) + X, Y = check_pairwise_arrays(X, Y) + + if X_norm_squared is not None: + X_norm_squared = check_array(X_norm_squared, ensure_2d=False) + original_shape = X_norm_squared.shape + if X_norm_squared.shape == (X.shape[0],): + X_norm_squared = xp.reshape(X_norm_squared, (-1, 1)) + if X_norm_squared.shape == (1, X.shape[0]): + X_norm_squared = X_norm_squared.T + if X_norm_squared.shape != (X.shape[0], 1): + raise ValueError( + f"Incompatible dimensions for X of shape {X.shape} and " + f"X_norm_squared of shape {original_shape}." + ) + + if Y_norm_squared is not None: + Y_norm_squared = check_array(Y_norm_squared, ensure_2d=False) + original_shape = Y_norm_squared.shape + if Y_norm_squared.shape == (Y.shape[0],): + Y_norm_squared = xp.reshape(Y_norm_squared, (1, -1)) + if Y_norm_squared.shape == (Y.shape[0], 1): + Y_norm_squared = Y_norm_squared.T + if Y_norm_squared.shape != (1, Y.shape[0]): + raise ValueError( + f"Incompatible dimensions for Y of shape {Y.shape} and " + f"Y_norm_squared of shape {original_shape}." + ) + + return _euclidean_distances(X, Y, X_norm_squared, Y_norm_squared, squared) + + +def _euclidean_distances(X, Y, X_norm_squared=None, Y_norm_squared=None, squared=False): + """Computational part of euclidean_distances + + Assumes inputs are already checked. + + If norms are passed as float32, they are unused. If arrays are passed as + float32, norms needs to be recomputed on upcast chunks. + TODO: use a float64 accumulator in row_norms to avoid the latter. + """ + xp, _, device_ = get_namespace_and_device(X, Y) + if X_norm_squared is not None and X_norm_squared.dtype != xp.float32: + XX = xp.reshape(X_norm_squared, (-1, 1)) + elif X.dtype != xp.float32: + XX = row_norms(X, squared=True)[:, None] + else: + XX = None + + if Y is X: + YY = None if XX is None else XX.T + else: + if Y_norm_squared is not None and Y_norm_squared.dtype != xp.float32: + YY = xp.reshape(Y_norm_squared, (1, -1)) + elif Y.dtype != xp.float32: + YY = row_norms(Y, squared=True)[None, :] + else: + YY = None + + if X.dtype == xp.float32 or Y.dtype == xp.float32: + # To minimize precision issues with float32, we compute the distance + # matrix on chunks of X and Y upcast to float64 + distances = _euclidean_distances_upcast(X, XX, Y, YY) + else: + # if dtype is already float64, no need to chunk and upcast + distances = -2 * safe_sparse_dot(X, Y.T, dense_output=True) + distances += XX + distances += YY + + xp_zero = xp.asarray(0, device=device_, dtype=distances.dtype) + distances = _modify_in_place_if_numpy( + xp, xp.maximum, distances, xp_zero, out=distances + ) + + # Ensure that distances between vectors and themselves are set to 0.0. + # This may not be the case due to floating point rounding errors. 
+ if X is Y: + _fill_or_add_to_diagonal(distances, 0, xp=xp, add_value=False) + + if squared: + return distances + + distances = _modify_in_place_if_numpy(xp, xp.sqrt, distances, out=distances) + return distances + + +@validate_params( + { + "X": ["array-like"], + "Y": ["array-like", None], + "squared": ["boolean"], + "missing_values": [MissingValues(numeric_only=True)], + "copy": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def nan_euclidean_distances( + X, Y=None, *, squared=False, missing_values=np.nan, copy=True +): + """Calculate the euclidean distances in the presence of missing values. + + Compute the euclidean distance between each pair of samples in X and Y, + where Y=X is assumed if Y=None. When calculating the distance between a + pair of samples, this formulation ignores feature coordinates with a + missing value in either sample and scales up the weight of the remaining + coordinates: + + .. code-block:: text + + dist(x,y) = sqrt(weight * sq. distance from present coordinates) + + where: + + .. code-block:: text + + weight = Total # of coordinates / # of present coordinates + + For example, the distance between ``[3, na, na, 6]`` and ``[1, na, 4, 5]`` is: + + .. math:: + \\sqrt{\\frac{4}{2}((3-1)^2 + (6-5)^2)} + + If all the coordinates are missing or if there are no common present + coordinates then NaN is returned for that pair. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.22 + + Parameters + ---------- + X : array-like of shape (n_samples_X, n_features) + An array where each row is a sample and each column is a feature. + + Y : array-like of shape (n_samples_Y, n_features), default=None + An array where each row is a sample and each column is a feature. + If `None`, method uses `Y=X`. + + squared : bool, default=False + Return squared Euclidean distances. + + missing_values : np.nan, float or int, default=np.nan + Representation of missing value. + + copy : bool, default=True + Make and use a deep copy of X and Y (if Y exists). + + Returns + ------- + distances : ndarray of shape (n_samples_X, n_samples_Y) + Returns the distances between the row vectors of `X` + and the row vectors of `Y`. + + See Also + -------- + paired_distances : Distances between pairs of elements of X and Y. + + References + ---------- + * John K. Dixon, "Pattern Recognition with Partly Missing Data", + IEEE Transactions on Systems, Man, and Cybernetics, Volume: 9, Issue: + 10, pp. 617 - 621, Oct. 1979. + http://ieeexplore.ieee.org/abstract/document/4310090/ + + Examples + -------- + >>> from sklearn.metrics.pairwise import nan_euclidean_distances + >>> nan = float("NaN") + >>> X = [[0, 1], [1, nan]] + >>> nan_euclidean_distances(X, X) # distance between rows of X + array([[0. , 1.41421356], + [1.41421356, 0. ]]) + + >>> # get distance to origin + >>> nan_euclidean_distances(X, [[0, 0]]) + array([[1. 
], + [1.41421356]]) + """ + + ensure_all_finite = "allow-nan" if is_scalar_nan(missing_values) else True + X, Y = check_pairwise_arrays( + X, Y, accept_sparse=False, ensure_all_finite=ensure_all_finite, copy=copy + ) + # Get missing mask for X + missing_X = _get_mask(X, missing_values) + + # Get missing mask for Y + missing_Y = missing_X if Y is X else _get_mask(Y, missing_values) + + # set missing values to zero + X[missing_X] = 0 + Y[missing_Y] = 0 + + distances = euclidean_distances(X, Y, squared=True) + + # Adjust distances for missing values + XX = X * X + YY = Y * Y + distances -= np.dot(XX, missing_Y.T) + distances -= np.dot(missing_X, YY.T) + + np.clip(distances, 0, None, out=distances) + + if X is Y: + # Ensure that distances between vectors and themselves are set to 0.0. + # This may not be the case due to floating point rounding errors. + np.fill_diagonal(distances, 0.0) + + present_X = 1 - missing_X + present_Y = present_X if Y is X else ~missing_Y + present_count = np.dot(present_X, present_Y.T) + distances[present_count == 0] = np.nan + # avoid divide by zero + np.maximum(1, present_count, out=present_count) + distances /= present_count + distances *= X.shape[1] + + if not squared: + np.sqrt(distances, out=distances) + + return distances + + +def _euclidean_distances_upcast(X, XX=None, Y=None, YY=None, batch_size=None): + """Euclidean distances between X and Y. + + Assumes X and Y have float32 dtype. + Assumes XX and YY have float64 dtype or are None. + + X and Y are upcast to float64 by chunks, which size is chosen to limit + memory increase by approximately 10% (at least 10MiB). + """ + xp, _, device_ = get_namespace_and_device(X, Y) + n_samples_X = X.shape[0] + n_samples_Y = Y.shape[0] + n_features = X.shape[1] + + distances = xp.empty((n_samples_X, n_samples_Y), dtype=xp.float32, device=device_) + + if batch_size is None: + x_density = X.nnz / np.prod(X.shape) if issparse(X) else 1 + y_density = Y.nnz / np.prod(Y.shape) if issparse(Y) else 1 + + # Allow 10% more memory than X, Y and the distance matrix take (at + # least 10MiB) + maxmem = max( + ( + (x_density * n_samples_X + y_density * n_samples_Y) * n_features + + (x_density * n_samples_X * y_density * n_samples_Y) + ) + / 10, + 10 * 2**17, + ) + + # The increase amount of memory in 8-byte blocks is: + # - x_density * batch_size * n_features (copy of chunk of X) + # - y_density * batch_size * n_features (copy of chunk of Y) + # - batch_size * batch_size (chunk of distance matrix) + # Hence x² + (xd+yd)kx = M, where x=batch_size, k=n_features, M=maxmem + # xd=x_density and yd=y_density + tmp = (x_density + y_density) * n_features + batch_size = (-tmp + math.sqrt(tmp**2 + 4 * maxmem)) / 2 + batch_size = max(int(batch_size), 1) + + x_batches = gen_batches(n_samples_X, batch_size) + xp_max_float = _max_precision_float_dtype(xp=xp, device=device_) + for i, x_slice in enumerate(x_batches): + X_chunk = xp.astype(X[x_slice, :], xp_max_float) + if XX is None: + XX_chunk = row_norms(X_chunk, squared=True)[:, None] + else: + XX_chunk = XX[x_slice] + + y_batches = gen_batches(n_samples_Y, batch_size) + + for j, y_slice in enumerate(y_batches): + if X is Y and j < i: + # when X is Y the distance matrix is symmetric so we only need + # to compute half of it. 
+ d = distances[y_slice, x_slice].T + + else: + Y_chunk = xp.astype(Y[y_slice, :], xp_max_float) + if YY is None: + YY_chunk = row_norms(Y_chunk, squared=True)[None, :] + else: + YY_chunk = YY[:, y_slice] + + d = -2 * safe_sparse_dot(X_chunk, Y_chunk.T, dense_output=True) + d += XX_chunk + d += YY_chunk + + distances[x_slice, y_slice] = xp.astype(d, xp.float32, copy=False) + + return distances + + +def _argmin_min_reduce(dist, start): + # `start` is specified in the signature but not used. This is because the higher + # order `pairwise_distances_chunked` function needs reduction functions that are + # passed as argument to have a two arguments signature. + indices = dist.argmin(axis=1) + values = dist[np.arange(dist.shape[0]), indices] + return indices, values + + +def _argmin_reduce(dist, start): + # `start` is specified in the signature but not used. This is because the higher + # order `pairwise_distances_chunked` function needs reduction functions that are + # passed as argument to have a two arguments signature. + return dist.argmin(axis=1) + + +_VALID_METRICS = [ + "euclidean", + "l2", + "l1", + "manhattan", + "cityblock", + "braycurtis", + "canberra", + "chebyshev", + "correlation", + "cosine", + "dice", + "hamming", + "jaccard", + "mahalanobis", + "matching", + "minkowski", + "rogerstanimoto", + "russellrao", + "seuclidean", + "sokalsneath", + "sqeuclidean", + "yule", + "wminkowski", + "nan_euclidean", + "haversine", +] +if sp_base_version < parse_version("1.17"): # pragma: no cover + # Deprecated in SciPy 1.15 and removed in SciPy 1.17 + _VALID_METRICS += ["sokalmichener"] +if sp_base_version < parse_version("1.11"): # pragma: no cover + # Deprecated in SciPy 1.9 and removed in SciPy 1.11 + _VALID_METRICS += ["kulsinski"] +if sp_base_version < parse_version("1.9"): + # Deprecated in SciPy 1.0 and removed in SciPy 1.9 + _VALID_METRICS += ["matching"] + +_NAN_METRICS = ["nan_euclidean"] + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "Y": ["array-like", "sparse matrix"], + "axis": [Options(Integral, {0, 1})], + "metric": [ + StrOptions(set(_VALID_METRICS).union(ArgKmin.valid_metrics())), + callable, + ], + "metric_kwargs": [dict, None], + }, + prefer_skip_nested_validation=False, # metric is not validated yet +) +def pairwise_distances_argmin_min( + X, Y, *, axis=1, metric="euclidean", metric_kwargs=None +): + """Compute minimum distances between one point and a set of points. + + This function computes for each row in X, the index of the row of Y which + is closest (according to the specified distance). The minimal distances are + also returned. + + This is mostly equivalent to calling:: + + (pairwise_distances(X, Y=Y, metric=metric).argmin(axis=axis), + pairwise_distances(X, Y=Y, metric=metric).min(axis=axis)) + + but uses much less memory, and is faster for large arrays. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples_X, n_features) + Array containing points. + + Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features) + Array containing points. + + axis : int, default=1 + Axis along which the argmin and distances are to be computed. + + metric : str or callable, default='euclidean' + Metric to use for distance computation. Any metric from scikit-learn + or :mod:`scipy.spatial.distance` can be used. + + If metric is a callable function, it is called on each + pair of instances (rows) and the resulting value recorded. 
The callable + should take two arrays as input and return one value indicating the + distance between them. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. + + Distance matrices are not supported. + + Valid values for metric are: + + - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', + 'manhattan', 'nan_euclidean'] + + - from :mod:`scipy.spatial.distance`: ['braycurtis', 'canberra', 'chebyshev', + 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', + 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', + 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', + 'yule'] + + See the documentation for :mod:`scipy.spatial.distance` for details on these + metrics. + + .. note:: + `'kulsinski'` is deprecated from SciPy 1.9 and will be removed in SciPy 1.11. + + .. note:: + `'matching'` has been removed in SciPy 1.9 (use `'hamming'` instead). + + metric_kwargs : dict, default=None + Keyword arguments to pass to specified metric function. + + Returns + ------- + argmin : ndarray + Y[argmin[i], :] is the row in Y that is closest to X[i, :]. + + distances : ndarray + The array of minimum distances. `distances[i]` is the distance between + the i-th row in X and the argmin[i]-th row in Y. + + See Also + -------- + pairwise_distances : Distances between every pair of samples of X and Y. + pairwise_distances_argmin : Same as `pairwise_distances_argmin_min` but only + returns the argmins. + + Examples + -------- + >>> from sklearn.metrics.pairwise import pairwise_distances_argmin_min + >>> X = [[0, 0, 0], [1, 1, 1]] + >>> Y = [[1, 0, 0], [1, 1, 0]] + >>> argmin, distances = pairwise_distances_argmin_min(X, Y) + >>> argmin + array([0, 1]) + >>> distances + array([1., 1.]) + """ + ensure_all_finite = "allow-nan" if metric == "nan_euclidean" else True + X, Y = check_pairwise_arrays(X, Y, ensure_all_finite=ensure_all_finite) + + if axis == 0: + X, Y = Y, X + + if metric_kwargs is None: + metric_kwargs = {} + + if ArgKmin.is_usable_for(X, Y, metric): + # This is an adaptor for one "sqeuclidean" specification. + # For this backend, we can directly use "sqeuclidean". + if metric_kwargs.get("squared", False) and metric == "euclidean": + metric = "sqeuclidean" + metric_kwargs = {} + + values, indices = ArgKmin.compute( + X=X, + Y=Y, + k=1, + metric=metric, + metric_kwargs=metric_kwargs, + strategy="auto", + return_distance=True, + ) + values = values.flatten() + indices = indices.flatten() + else: + # Joblib-based backend, which is used when user-defined callable + # are passed for metric. + + # This won't be used in the future once PairwiseDistancesReductions support: + # - DistanceMetrics which work on supposedly binary data + # - CSR-dense and dense-CSR case if 'euclidean' in metric. + + # Turn off check for finiteness because this is costly and because arrays + # have already been validated. 
+ with config_context(assume_finite=True): + indices, values = zip( + *pairwise_distances_chunked( + X, Y, reduce_func=_argmin_min_reduce, metric=metric, **metric_kwargs + ) + ) + indices = np.concatenate(indices) + values = np.concatenate(values) + + return indices, values + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "Y": ["array-like", "sparse matrix"], + "axis": [Options(Integral, {0, 1})], + "metric": [ + StrOptions(set(_VALID_METRICS).union(ArgKmin.valid_metrics())), + callable, + ], + "metric_kwargs": [dict, None], + }, + prefer_skip_nested_validation=False, # metric is not validated yet +) +def pairwise_distances_argmin(X, Y, *, axis=1, metric="euclidean", metric_kwargs=None): + """Compute minimum distances between one point and a set of points. + + This function computes for each row in X, the index of the row of Y which + is closest (according to the specified distance). + + This is mostly equivalent to calling:: + + pairwise_distances(X, Y=Y, metric=metric).argmin(axis=axis) + + but uses much less memory, and is faster for large arrays. + + This function works with dense 2D arrays only. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples_X, n_features) + Array containing points. + + Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features) + Arrays containing points. + + axis : int, default=1 + Axis along which the argmin and distances are to be computed. + + metric : str or callable, default="euclidean" + Metric to use for distance computation. Any metric from scikit-learn + or :mod:`scipy.spatial.distance` can be used. + + If metric is a callable function, it is called on each + pair of instances (rows) and the resulting value recorded. The callable + should take two arrays as input and return one value indicating the + distance between them. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. + + Distance matrices are not supported. + + Valid values for metric are: + + - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', + 'manhattan', 'nan_euclidean'] + + - from :mod:`scipy.spatial.distance`: ['braycurtis', 'canberra', 'chebyshev', + 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', + 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', + 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', + 'yule'] + + See the documentation for :mod:`scipy.spatial.distance` for details on these + metrics. + + .. note:: + `'kulsinski'` is deprecated from SciPy 1.9 and will be removed in SciPy 1.11. + + .. note:: + `'matching'` has been removed in SciPy 1.9 (use `'hamming'` instead). + + metric_kwargs : dict, default=None + Keyword arguments to pass to specified metric function. + + Returns + ------- + argmin : numpy.ndarray + Y[argmin[i], :] is the row in Y that is closest to X[i, :]. + + See Also + -------- + pairwise_distances : Distances between every pair of samples of X and Y. + pairwise_distances_argmin_min : Same as `pairwise_distances_argmin` but also + returns the distances. 
+ + Examples + -------- + >>> from sklearn.metrics.pairwise import pairwise_distances_argmin + >>> X = [[0, 0, 0], [1, 1, 1]] + >>> Y = [[1, 0, 0], [1, 1, 0]] + >>> pairwise_distances_argmin(X, Y) + array([0, 1]) + """ + ensure_all_finite = "allow-nan" if metric == "nan_euclidean" else True + X, Y = check_pairwise_arrays(X, Y, ensure_all_finite=ensure_all_finite) + + if axis == 0: + X, Y = Y, X + + if metric_kwargs is None: + metric_kwargs = {} + + if ArgKmin.is_usable_for(X, Y, metric): + # This is an adaptor for one "sqeuclidean" specification. + # For this backend, we can directly use "sqeuclidean". + if metric_kwargs.get("squared", False) and metric == "euclidean": + metric = "sqeuclidean" + metric_kwargs = {} + + indices = ArgKmin.compute( + X=X, + Y=Y, + k=1, + metric=metric, + metric_kwargs=metric_kwargs, + strategy="auto", + return_distance=False, + ) + indices = indices.flatten() + else: + # Joblib-based backend, which is used when user-defined callable + # are passed for metric. + + # This won't be used in the future once PairwiseDistancesReductions support: + # - DistanceMetrics which work on supposedly binary data + # - CSR-dense and dense-CSR case if 'euclidean' in metric. + + # Turn off check for finiteness because this is costly and because arrays + # have already been validated. + with config_context(assume_finite=True): + indices = np.concatenate( + list( + # This returns a np.ndarray generator whose arrays we need + # to flatten into one. + pairwise_distances_chunked( + X, Y, reduce_func=_argmin_reduce, metric=metric, **metric_kwargs + ) + ) + ) + + return indices + + +@validate_params( + {"X": ["array-like", "sparse matrix"], "Y": ["array-like", "sparse matrix", None]}, + prefer_skip_nested_validation=True, +) +def haversine_distances(X, Y=None): + """Compute the Haversine distance between samples in X and Y. + + The Haversine (or great circle) distance is the angular distance between + two points on the surface of a sphere. The first coordinate of each point + is assumed to be the latitude, the second is the longitude, given + in radians. The dimension of the data must be 2. + + .. math:: + D(x, y) = 2\\arcsin[\\sqrt{\\sin^2((x_{lat} - y_{lat}) / 2) + + \\cos(x_{lat})\\cos(y_{lat})\\ + sin^2((x_{lon} - y_{lon}) / 2)}] + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples_X, 2) + A feature array. + + Y : {array-like, sparse matrix} of shape (n_samples_Y, 2), default=None + An optional second feature array. If `None`, uses `Y=X`. + + Returns + ------- + distances : ndarray of shape (n_samples_X, n_samples_Y) + The distance matrix. + + Notes + ----- + As the Earth is nearly spherical, the haversine formula provides a good + approximation of the distance between two points of the Earth surface, with + a less than 1% error on average. + + Examples + -------- + We want to calculate the distance between the Ezeiza Airport + (Buenos Aires, Argentina) and the Charles de Gaulle Airport (Paris, + France). + + >>> from sklearn.metrics.pairwise import haversine_distances + >>> from math import radians + >>> bsas = [-34.83333, -58.5166646] + >>> paris = [49.0083899664, 2.53844117956] + >>> bsas_in_radians = [radians(_) for _ in bsas] + >>> paris_in_radians = [radians(_) for _ in paris] + >>> result = haversine_distances([bsas_in_radians, paris_in_radians]) + >>> result * 6371000/1000 # multiply by Earth radius to get kilometers + array([[ 0. , 11099.54035582], + [11099.54035582, 0. 
]]) + """ + from ..metrics import DistanceMetric + + return DistanceMetric.get_metric("haversine").pairwise(X, Y) + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "Y": ["array-like", "sparse matrix", None], + }, + prefer_skip_nested_validation=True, +) +def manhattan_distances(X, Y=None): + """Compute the L1 distances between the vectors in X and Y. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples_X, n_features) + An array where each row is a sample and each column is a feature. + + Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None + An array where each row is a sample and each column is a feature. + If `None`, method uses `Y=X`. + + Returns + ------- + distances : ndarray of shape (n_samples_X, n_samples_Y) + Pairwise L1 distances. + + Notes + ----- + When X and/or Y are CSR sparse matrices and they are not already + in canonical format, this function modifies them in-place to + make them canonical. + + Examples + -------- + >>> from sklearn.metrics.pairwise import manhattan_distances + >>> manhattan_distances([[3]], [[3]]) + array([[0.]]) + >>> manhattan_distances([[3]], [[2]]) + array([[1.]]) + >>> manhattan_distances([[2]], [[3]]) + array([[1.]]) + >>> manhattan_distances([[1, 2], [3, 4]],\ + [[1, 2], [0, 3]]) + array([[0., 2.], + [4., 4.]]) + """ + X, Y = check_pairwise_arrays(X, Y) + + if issparse(X) or issparse(Y): + X = csr_matrix(X, copy=False) + Y = csr_matrix(Y, copy=False) + X.sum_duplicates() # this also sorts indices in-place + Y.sum_duplicates() + D = np.zeros((X.shape[0], Y.shape[0])) + _sparse_manhattan(X.data, X.indices, X.indptr, Y.data, Y.indices, Y.indptr, D) + return D + + return distance.cdist(X, Y, "cityblock") + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "Y": ["array-like", "sparse matrix", None], + }, + prefer_skip_nested_validation=True, +) +def cosine_distances(X, Y=None): + """Compute cosine distance between samples in X and Y. + + Cosine distance is defined as 1.0 minus the cosine similarity. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples_X, n_features) + Matrix `X`. + + Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), \ + default=None + Matrix `Y`. + + Returns + ------- + distances : ndarray of shape (n_samples_X, n_samples_Y) + Returns the cosine distance between samples in X and Y. + + See Also + -------- + cosine_similarity : Compute cosine similarity between samples in X and Y. + scipy.spatial.distance.cosine : Dense matrices only. + + Examples + -------- + >>> from sklearn.metrics.pairwise import cosine_distances + >>> X = [[0, 0, 0], [1, 1, 1]] + >>> Y = [[1, 0, 0], [1, 1, 0]] + >>> cosine_distances(X, Y) + array([[1. , 1. ], + [0.422, 0.183]]) + """ + xp, _ = get_namespace(X, Y) + + # 1.0 - cosine_similarity(X, Y) without copy + S = cosine_similarity(X, Y) + S *= -1 + S += 1 + S = xp.clip(S, 0.0, 2.0) + if X is Y or Y is None: + # Ensure that distances between vectors and themselves are set to 0.0. + # This may not be the case due to floating point rounding errors. + _fill_or_add_to_diagonal(S, 0.0, xp, add_value=False) + return S + + +# Paired distances +@validate_params( + {"X": ["array-like", "sparse matrix"], "Y": ["array-like", "sparse matrix"]}, + prefer_skip_nested_validation=True, +) +def paired_euclidean_distances(X, Y): + """Compute the paired euclidean distances between X and Y. 
+ + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input array/matrix X. + + Y : {array-like, sparse matrix} of shape (n_samples, n_features) + Input array/matrix Y. + + Returns + ------- + distances : ndarray of shape (n_samples,) + Output array/matrix containing the calculated paired euclidean + distances. + + Examples + -------- + >>> from sklearn.metrics.pairwise import paired_euclidean_distances + >>> X = [[0, 0, 0], [1, 1, 1]] + >>> Y = [[1, 0, 0], [1, 1, 0]] + >>> paired_euclidean_distances(X, Y) + array([1., 1.]) + """ + X, Y = check_paired_arrays(X, Y) + return row_norms(X - Y) + + +@validate_params( + {"X": ["array-like", "sparse matrix"], "Y": ["array-like", "sparse matrix"]}, + prefer_skip_nested_validation=True, +) +def paired_manhattan_distances(X, Y): + """Compute the paired L1 distances between X and Y. + + Distances are calculated between (X[0], Y[0]), (X[1], Y[1]), ..., + (X[n_samples], Y[n_samples]). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + An array-like where each row is a sample and each column is a feature. + + Y : {array-like, sparse matrix} of shape (n_samples, n_features) + An array-like where each row is a sample and each column is a feature. + + Returns + ------- + distances : ndarray of shape (n_samples,) + L1 paired distances between the row vectors of `X` + and the row vectors of `Y`. + + Examples + -------- + >>> from sklearn.metrics.pairwise import paired_manhattan_distances + >>> import numpy as np + >>> X = np.array([[1, 1, 0], [0, 1, 0], [0, 0, 1]]) + >>> Y = np.array([[0, 1, 0], [0, 0, 1], [0, 0, 0]]) + >>> paired_manhattan_distances(X, Y) + array([1., 2., 1.]) + """ + X, Y = check_paired_arrays(X, Y) + diff = X - Y + if issparse(diff): + diff.data = np.abs(diff.data) + return np.squeeze(np.array(diff.sum(axis=1))) + else: + return np.abs(diff).sum(axis=-1) + + +@validate_params( + {"X": ["array-like", "sparse matrix"], "Y": ["array-like", "sparse matrix"]}, + prefer_skip_nested_validation=True, +) +def paired_cosine_distances(X, Y): + """ + Compute the paired cosine distances between X and Y. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + An array where each row is a sample and each column is a feature. + + Y : {array-like, sparse matrix} of shape (n_samples, n_features) + An array where each row is a sample and each column is a feature. + + Returns + ------- + distances : ndarray of shape (n_samples,) + Returns the distances between the row vectors of `X` + and the row vectors of `Y`, where `distances[i]` is the + distance between `X[i]` and `Y[i]`. + + Notes + ----- + The cosine distance is equivalent to the half the squared + euclidean distance if each sample is normalized to unit norm. 
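A quick numerical check of this note (an illustrative sketch; cosine similarity here is <x, y> / (||x|| * ||y||)):

>>> import numpy as np
>>> from sklearn.preprocessing import normalize
>>> from sklearn.metrics.pairwise import paired_cosine_distances
>>> X = np.array([[1.0, 0.0], [2.0, 2.0]])
>>> Y = np.array([[1.0, 1.0], [0.0, 3.0]])
>>> half_sq_euclidean = 0.5 * np.sum((normalize(X) - normalize(Y)) ** 2, axis=1)
>>> bool(np.allclose(paired_cosine_distances(X, Y), half_sq_euclidean))
True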
+ + Examples + -------- + >>> from sklearn.metrics.pairwise import paired_cosine_distances + >>> X = [[0, 0, 0], [1, 1, 1]] + >>> Y = [[1, 0, 0], [1, 1, 0]] + >>> paired_cosine_distances(X, Y) + array([0.5 , 0.184]) + """ + X, Y = check_paired_arrays(X, Y) + return 0.5 * row_norms(normalize(X) - normalize(Y), squared=True) + + +PAIRED_DISTANCES = { + "cosine": paired_cosine_distances, + "euclidean": paired_euclidean_distances, + "l2": paired_euclidean_distances, + "l1": paired_manhattan_distances, + "manhattan": paired_manhattan_distances, + "cityblock": paired_manhattan_distances, +} + + +@validate_params( + { + "X": ["array-like"], + "Y": ["array-like"], + "metric": [StrOptions(set(PAIRED_DISTANCES)), callable], + }, + prefer_skip_nested_validation=True, +) +def paired_distances(X, Y, *, metric="euclidean", **kwds): + """ + Compute the paired distances between X and Y. + + Compute the distances between (X[0], Y[0]), (X[1], Y[1]), etc... + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Array 1 for distance computation. + + Y : ndarray of shape (n_samples, n_features) + Array 2 for distance computation. + + metric : str or callable, default="euclidean" + The metric to use when calculating distance between instances in a + feature array. If metric is a string, it must be one of the options + specified in PAIRED_DISTANCES, including "euclidean", + "manhattan", or "cosine". + Alternatively, if metric is a callable function, it is called on each + pair of instances (rows) and the resulting value recorded. The callable + should take two arrays from `X` as input and return a value indicating + the distance between them. + + **kwds : dict + Unused parameters. + + Returns + ------- + distances : ndarray of shape (n_samples,) + Returns the distances between the row vectors of `X` + and the row vectors of `Y`. + + See Also + -------- + sklearn.metrics.pairwise_distances : Computes the distance between every pair of + samples. + + Examples + -------- + >>> from sklearn.metrics.pairwise import paired_distances + >>> X = [[0, 1], [1, 1]] + >>> Y = [[0, 1], [2, 1]] + >>> paired_distances(X, Y) + array([0., 1.]) + """ + + if metric in PAIRED_DISTANCES: + func = PAIRED_DISTANCES[metric] + return func(X, Y) + elif callable(metric): + # Check the matrix first (it is usually done by the metric) + X, Y = check_paired_arrays(X, Y) + distances = np.zeros(len(X)) + for i in range(len(X)): + distances[i] = metric(X[i], Y[i]) + return distances + + +# Kernels +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "Y": ["array-like", "sparse matrix", None], + "dense_output": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def linear_kernel(X, Y=None, dense_output=True): + """ + Compute the linear kernel between X and Y. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples_X, n_features) + A feature array. + + Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None + An optional second feature array. If `None`, uses `Y=X`. + + dense_output : bool, default=True + Whether to return dense output even when the input is sparse. If + ``False``, the output is sparse if both input arrays are sparse. + + .. versionadded:: 0.20 + + Returns + ------- + kernel : ndarray of shape (n_samples_X, n_samples_Y) + The Gram matrix of the linear kernel, i.e. `X @ Y.T`. 
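As a small illustration of ``dense_output`` (an illustrative sketch): with two sparse inputs and ``dense_output=False``, the Gram matrix comes back in sparse format.

>>> import numpy as np
>>> from scipy.sparse import csr_matrix, issparse
>>> from sklearn.metrics.pairwise import linear_kernel
>>> X = csr_matrix(np.array([[0.0, 1.0], [2.0, 0.0]]))
>>> K = linear_kernel(X, X, dense_output=False)
>>> issparse(K)
True
>>> K.toarray()
array([[1., 0.],
       [0., 4.]])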
+ + Examples + -------- + >>> from sklearn.metrics.pairwise import linear_kernel + >>> X = [[0, 0, 0], [1, 1, 1]] + >>> Y = [[1, 0, 0], [1, 1, 0]] + >>> linear_kernel(X, Y) + array([[0., 0.], + [1., 2.]]) + """ + X, Y = check_pairwise_arrays(X, Y) + return safe_sparse_dot(X, Y.T, dense_output=dense_output) + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "Y": ["array-like", "sparse matrix", None], + "degree": [Interval(Real, 1, None, closed="left")], + "gamma": [ + Interval(Real, 0, None, closed="left"), + None, + Hidden(np.ndarray), + ], + "coef0": [Interval(Real, None, None, closed="neither")], + }, + prefer_skip_nested_validation=True, +) +def polynomial_kernel(X, Y=None, degree=3, gamma=None, coef0=1): + """ + Compute the polynomial kernel between X and Y. + + .. code-block:: text + + K(X, Y) = (gamma + coef0) ^ degree + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples_X, n_features) + A feature array. + + Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None + An optional second feature array. If `None`, uses `Y=X`. + + degree : float, default=3 + Kernel degree. + + gamma : float, default=None + Coefficient of the vector inner product. If None, defaults to 1.0 / n_features. + + coef0 : float, default=1 + Constant offset added to scaled inner product. + + Returns + ------- + kernel : ndarray of shape (n_samples_X, n_samples_Y) + The polynomial kernel. + + Examples + -------- + >>> from sklearn.metrics.pairwise import polynomial_kernel + >>> X = [[0, 0, 0], [1, 1, 1]] + >>> Y = [[1, 0, 0], [1, 1, 0]] + >>> polynomial_kernel(X, Y, degree=2) + array([[1. , 1. ], + [1.77, 2.77]]) + """ + X, Y = check_pairwise_arrays(X, Y) + if gamma is None: + gamma = 1.0 / X.shape[1] + + K = safe_sparse_dot(X, Y.T, dense_output=True) + K *= gamma + K += coef0 + K **= degree + return K + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "Y": ["array-like", "sparse matrix", None], + "gamma": [ + Interval(Real, 0, None, closed="left"), + None, + Hidden(np.ndarray), + ], + "coef0": [Interval(Real, None, None, closed="neither")], + }, + prefer_skip_nested_validation=True, +) +def sigmoid_kernel(X, Y=None, gamma=None, coef0=1): + """Compute the sigmoid kernel between X and Y. + + .. code-block:: text + + K(X, Y) = tanh(gamma + coef0) + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples_X, n_features) + A feature array. + + Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None + An optional second feature array. If `None`, uses `Y=X`. + + gamma : float, default=None + Coefficient of the vector inner product. If None, defaults to 1.0 / n_features. + + coef0 : float, default=1 + Constant offset added to scaled inner product. + + Returns + ------- + kernel : ndarray of shape (n_samples_X, n_samples_Y) + Sigmoid kernel between two arrays. 
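Written out with the inner product made explicit, the two kernels above are K(x, y) = (gamma <x, y> + coef0) ** degree and K(x, y) = tanh(gamma <x, y> + coef0). A small sketch checking both against a by-hand computation (gamma defaults to 1 / n_features when left as None):

>>> import numpy as np
>>> from sklearn.metrics.pairwise import polynomial_kernel, sigmoid_kernel
>>> x = np.array([[1.0, 1.0, 1.0]])
>>> y = np.array([[1.0, 1.0, 0.0]])
>>> gamma = 1.0 / x.shape[1]
>>> bool(np.allclose(polynomial_kernel(x, y, degree=2),
...                  (gamma * (x @ y.T) + 1) ** 2))
True
>>> bool(np.allclose(sigmoid_kernel(x, y),
...                  np.tanh(gamma * (x @ y.T) + 1)))
True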
+ + Examples + -------- + >>> from sklearn.metrics.pairwise import sigmoid_kernel + >>> X = [[0, 0, 0], [1, 1, 1]] + >>> Y = [[1, 0, 0], [1, 1, 0]] + >>> sigmoid_kernel(X, Y) + array([[0.76, 0.76], + [0.87, 0.93]]) + """ + xp, _ = get_namespace(X, Y) + X, Y = check_pairwise_arrays(X, Y) + if gamma is None: + gamma = 1.0 / X.shape[1] + + K = safe_sparse_dot(X, Y.T, dense_output=True) + K *= gamma + K += coef0 + # compute tanh in-place for numpy + K = _modify_in_place_if_numpy(xp, xp.tanh, K, out=K) + return K + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "Y": ["array-like", "sparse matrix", None], + "gamma": [ + Interval(Real, 0, None, closed="left"), + None, + Hidden(np.ndarray), + ], + }, + prefer_skip_nested_validation=True, +) +def rbf_kernel(X, Y=None, gamma=None): + """Compute the rbf (gaussian) kernel between X and Y. + + .. code-block:: text + + K(x, y) = exp(-gamma ||x-y||^2) + + for each pair of rows x in X and y in Y. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples_X, n_features) + A feature array. + + Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None + An optional second feature array. If `None`, uses `Y=X`. + + gamma : float, default=None + If None, defaults to 1.0 / n_features. + + Returns + ------- + kernel : ndarray of shape (n_samples_X, n_samples_Y) + The RBF kernel. + + Examples + -------- + >>> from sklearn.metrics.pairwise import rbf_kernel + >>> X = [[0, 0, 0], [1, 1, 1]] + >>> Y = [[1, 0, 0], [1, 1, 0]] + >>> rbf_kernel(X, Y) + array([[0.71, 0.51], + [0.51, 0.71]]) + """ + xp, _ = get_namespace(X, Y) + X, Y = check_pairwise_arrays(X, Y) + if gamma is None: + gamma = 1.0 / X.shape[1] + + K = euclidean_distances(X, Y, squared=True) + K *= -gamma + # exponentiate K in-place when using numpy + K = _modify_in_place_if_numpy(xp, xp.exp, K, out=K) + return K + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "Y": ["array-like", "sparse matrix", None], + "gamma": [ + Interval(Real, 0, None, closed="neither"), + Hidden(np.ndarray), + None, + ], + }, + prefer_skip_nested_validation=True, +) +def laplacian_kernel(X, Y=None, gamma=None): + """Compute the laplacian kernel between X and Y. + + The laplacian kernel is defined as: + + .. code-block:: text + + K(x, y) = exp(-gamma ||x-y||_1) + + for each pair of rows x in X and y in Y. + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.17 + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples_X, n_features) + A feature array. + + Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None + An optional second feature array. If `None`, uses `Y=X`. + + gamma : float, default=None + If None, defaults to 1.0 / n_features. Otherwise it should be strictly positive. + + Returns + ------- + kernel : ndarray of shape (n_samples_X, n_samples_Y) + The kernel matrix. 
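Both kernels are an element-wise exponential of a negated, scaled distance matrix, which can be checked directly against the distance helpers defined earlier in this module (an illustrative sketch):

>>> import numpy as np
>>> from sklearn.metrics.pairwise import (euclidean_distances,
...                                       manhattan_distances,
...                                       rbf_kernel, laplacian_kernel)
>>> X = np.array([[0.0, 0.0, 0.0], [1.0, 1.0, 1.0]])
>>> Y = np.array([[1.0, 0.0, 0.0], [1.0, 1.0, 0.0]])
>>> gamma = 0.5
>>> bool(np.allclose(rbf_kernel(X, Y, gamma=gamma),
...                  np.exp(-gamma * euclidean_distances(X, Y, squared=True))))
True
>>> bool(np.allclose(laplacian_kernel(X, Y, gamma=gamma),
...                  np.exp(-gamma * manhattan_distances(X, Y))))
True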
+ + Examples + -------- + >>> from sklearn.metrics.pairwise import laplacian_kernel + >>> X = [[0, 0, 0], [1, 1, 1]] + >>> Y = [[1, 0, 0], [1, 1, 0]] + >>> laplacian_kernel(X, Y) + array([[0.71, 0.51], + [0.51, 0.71]]) + """ + X, Y = check_pairwise_arrays(X, Y) + if gamma is None: + gamma = 1.0 / X.shape[1] + + K = -gamma * manhattan_distances(X, Y) + np.exp(K, K) # exponentiate K in-place + return K + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "Y": ["array-like", "sparse matrix", None], + "dense_output": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def cosine_similarity(X, Y=None, dense_output=True): + """Compute cosine similarity between samples in X and Y. + + Cosine similarity, or the cosine kernel, computes similarity as the + normalized dot product of X and Y: + + .. code-block:: text + + K(X, Y) = / (||X||*||Y||) + + On L2-normalized data, this function is equivalent to linear_kernel. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples_X, n_features) + Input data. + + Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), \ + default=None + Input data. If ``None``, the output will be the pairwise + similarities between all samples in ``X``. + + dense_output : bool, default=True + Whether to return dense output even when the input is sparse. If + ``False``, the output is sparse if both input arrays are sparse. + + .. versionadded:: 0.17 + parameter ``dense_output`` for dense output. + + Returns + ------- + similarities : ndarray or sparse matrix of shape (n_samples_X, n_samples_Y) + Returns the cosine similarity between samples in X and Y. + + Examples + -------- + >>> from sklearn.metrics.pairwise import cosine_similarity + >>> X = [[0, 0, 0], [1, 1, 1]] + >>> Y = [[1, 0, 0], [1, 1, 0]] + >>> cosine_similarity(X, Y) + array([[0. , 0. ], + [0.577, 0.816]]) + """ + X, Y = check_pairwise_arrays(X, Y) + + X_normalized = normalize(X, copy=True) + if X is Y: + Y_normalized = X_normalized + else: + Y_normalized = normalize(Y, copy=True) + + K = safe_sparse_dot(X_normalized, Y_normalized.T, dense_output=dense_output) + + return K + + +@validate_params( + {"X": ["array-like"], "Y": ["array-like", None]}, + prefer_skip_nested_validation=True, +) +def additive_chi2_kernel(X, Y=None): + """Compute the additive chi-squared kernel between observations in X and Y. + + The chi-squared kernel is computed between each pair of rows in X and Y. X + and Y have to be non-negative. This kernel is most commonly applied to + histograms. + + The chi-squared kernel is given by: + + .. code-block:: text + + k(x, y) = -Sum [(x - y)^2 / (x + y)] + + It can be interpreted as a weighted difference per entry. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : array-like of shape (n_samples_X, n_features) + A feature array. + + Y : array-like of shape (n_samples_Y, n_features), default=None + An optional second feature array. If `None`, uses `Y=X`. + + Returns + ------- + kernel : array-like of shape (n_samples_X, n_samples_Y) + The kernel matrix. + + See Also + -------- + chi2_kernel : The exponentiated version of the kernel, which is usually + preferable. + sklearn.kernel_approximation.AdditiveChi2Sampler : A Fourier approximation + to this kernel. + + Notes + ----- + As the negative of a distance, this kernel is only conditionally positive + definite. + + References + ---------- + * Zhang, J. and Marszalek, M. and Lazebnik, S. and Schmid, C. 
+ Local features and kernels for classification of texture and object + categories: A comprehensive study + International Journal of Computer Vision 2007 + https://hal.archives-ouvertes.fr/hal-00171412/document + + Examples + -------- + >>> from sklearn.metrics.pairwise import additive_chi2_kernel + >>> X = [[0, 0, 0], [1, 1, 1]] + >>> Y = [[1, 0, 0], [1, 1, 0]] + >>> additive_chi2_kernel(X, Y) + array([[-1., -2.], + [-2., -1.]]) + """ + xp, _, device_ = get_namespace_and_device(X, Y) + X, Y = check_pairwise_arrays(X, Y, accept_sparse=False) + if xp.any(X < 0): + raise ValueError("X contains negative values.") + if Y is not X and xp.any(Y < 0): + raise ValueError("Y contains negative values.") + + if _is_numpy_namespace(xp): + result = np.zeros((X.shape[0], Y.shape[0]), dtype=X.dtype) + _chi2_kernel_fast(X, Y, result) + return result + else: + dtype = _find_matching_floating_dtype(X, Y, xp=xp) + xb = X[:, None, :] + yb = Y[None, :, :] + nom = -((xb - yb) ** 2) + denom = xb + yb + nom = xp.where(denom == 0, xp.asarray(0, dtype=dtype, device=device_), nom) + denom = xp.where(denom == 0, xp.asarray(1, dtype=dtype, device=device_), denom) + return xp.sum(nom / denom, axis=2) + + +@validate_params( + { + "X": ["array-like"], + "Y": ["array-like", None], + "gamma": [Interval(Real, 0, None, closed="neither"), Hidden(np.ndarray)], + }, + prefer_skip_nested_validation=True, +) +def chi2_kernel(X, Y=None, gamma=1.0): + """Compute the exponential chi-squared kernel between X and Y. + + The chi-squared kernel is computed between each pair of rows in X and Y. X + and Y have to be non-negative. This kernel is most commonly applied to + histograms. + + The chi-squared kernel is given by: + + .. code-block:: text + + k(x, y) = exp(-gamma Sum [(x - y)^2 / (x + y)]) + + It can be interpreted as a weighted difference per entry. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : array-like of shape (n_samples_X, n_features) + A feature array. + + Y : array-like of shape (n_samples_Y, n_features), default=None + An optional second feature array. If `None`, uses `Y=X`. + + gamma : float, default=1 + Scaling parameter of the chi2 kernel. + + Returns + ------- + kernel : ndarray of shape (n_samples_X, n_samples_Y) + The kernel matrix. + + See Also + -------- + additive_chi2_kernel : The additive version of this kernel. + sklearn.kernel_approximation.AdditiveChi2Sampler : A Fourier approximation + to the additive version of this kernel. + + References + ---------- + * Zhang, J. and Marszalek, M. and Lazebnik, S. and Schmid, C. + Local features and kernels for classification of texture and object + categories: A comprehensive study + International Journal of Computer Vision 2007 + https://hal.archives-ouvertes.fr/hal-00171412/document + + Examples + -------- + >>> from sklearn.metrics.pairwise import chi2_kernel + >>> X = [[0, 0, 0], [1, 1, 1]] + >>> Y = [[1, 0, 0], [1, 1, 0]] + >>> chi2_kernel(X, Y) + array([[0.368, 0.135], + [0.135, 0.368]]) + """ + xp, _ = get_namespace(X, Y) + K = additive_chi2_kernel(X, Y) + K *= gamma + if _is_numpy_namespace(xp): + return np.exp(K, out=K) + return xp.exp(K) + + +# Helper functions - distance +PAIRWISE_DISTANCE_FUNCTIONS = { + # If updating this dictionary, update the doc in both distance_metrics() + # and also in pairwise_distances()! 
+ "cityblock": manhattan_distances, + "cosine": cosine_distances, + "euclidean": euclidean_distances, + "haversine": haversine_distances, + "l2": euclidean_distances, + "l1": manhattan_distances, + "manhattan": manhattan_distances, + "precomputed": None, # HACK: precomputed is always allowed, never called + "nan_euclidean": nan_euclidean_distances, +} + + +def distance_metrics(): + """Valid metrics for pairwise_distances. + + This function simply returns the valid pairwise distance metrics. + It exists to allow for a description of the mapping for + each of the valid strings. + + The valid distance metrics, and the function they map to, are: + + =============== ======================================== + metric Function + =============== ======================================== + 'cityblock' metrics.pairwise.manhattan_distances + 'cosine' metrics.pairwise.cosine_distances + 'euclidean' metrics.pairwise.euclidean_distances + 'haversine' metrics.pairwise.haversine_distances + 'l1' metrics.pairwise.manhattan_distances + 'l2' metrics.pairwise.euclidean_distances + 'manhattan' metrics.pairwise.manhattan_distances + 'nan_euclidean' metrics.pairwise.nan_euclidean_distances + =============== ======================================== + + Read more in the :ref:`User Guide `. + + Returns + ------- + distance_metrics : dict + Returns valid metrics for pairwise_distances. + """ + return PAIRWISE_DISTANCE_FUNCTIONS + + +def _dist_wrapper(dist_func, dist_matrix, slice_, *args, **kwargs): + """Write in-place to a slice of a distance matrix.""" + dist_matrix[:, slice_] = dist_func(*args, **kwargs) + + +def _parallel_pairwise(X, Y, func, n_jobs, **kwds): + """Break the pairwise matrix in n_jobs even slices + and compute them using multithreading.""" + + if Y is None: + Y = X + X, Y, dtype = _return_float_dtype(X, Y) + + if effective_n_jobs(n_jobs) == 1: + return func(X, Y, **kwds) + + # enforce a threading backend to prevent data communication overhead + fd = delayed(_dist_wrapper) + ret = np.empty((X.shape[0], Y.shape[0]), dtype=dtype, order="F") + Parallel(backend="threading", n_jobs=n_jobs)( + fd(func, ret, s, X, Y[s], **kwds) + for s in gen_even_slices(_num_samples(Y), effective_n_jobs(n_jobs)) + ) + + if (X is Y or Y is None) and func is euclidean_distances: + # zeroing diagonal for euclidean norm. + # TODO: do it also for other norms. + np.fill_diagonal(ret, 0) + + return ret + + +def _pairwise_callable(X, Y, metric, ensure_all_finite=True, **kwds): + """Handle the callable case for pairwise_{distances,kernels}.""" + X, Y = check_pairwise_arrays( + X, + Y, + dtype=None, + ensure_all_finite=ensure_all_finite, + # No input dimension checking done for custom metrics (left to user) + ensure_2d=False, + ) + + if X is Y: + # Only calculate metric for upper triangle + out = np.zeros((X.shape[0], Y.shape[0]), dtype="float") + iterator = itertools.combinations(range(X.shape[0]), 2) + for i, j in iterator: + # scipy has not yet implemented 1D sparse slices; once implemented this can + # be removed and `arr[ind]` can be simply used. + x = X[[i], :] if issparse(X) else X[i] + y = Y[[j], :] if issparse(Y) else Y[j] + out[i, j] = metric(x, y, **kwds) + + # Make symmetric + # NB: out += out.T will produce incorrect results + out = out + out.T + + # Calculate diagonal + # NB: nonzero diagonals are allowed for both metrics and kernels + for i in range(X.shape[0]): + # scipy has not yet implemented 1D sparse slices; once implemented this can + # be removed and `arr[ind]` can be simply used. 
+ x = X[[i], :] if issparse(X) else X[i] + out[i, i] = metric(x, x, **kwds) + + else: + # Calculate all cells + out = np.empty((X.shape[0], Y.shape[0]), dtype="float") + iterator = itertools.product(range(X.shape[0]), range(Y.shape[0])) + for i, j in iterator: + # scipy has not yet implemented 1D sparse slices; once implemented this can + # be removed and `arr[ind]` can be simply used. + x = X[[i], :] if issparse(X) else X[i] + y = Y[[j], :] if issparse(Y) else Y[j] + out[i, j] = metric(x, y, **kwds) + + return out + + +def _check_chunk_size(reduced, chunk_size): + """Checks chunk is a sequence of expected size or a tuple of same.""" + if reduced is None: + return + is_tuple = isinstance(reduced, tuple) + if not is_tuple: + reduced = (reduced,) + if any(isinstance(r, tuple) or not hasattr(r, "__iter__") for r in reduced): + raise TypeError( + "reduce_func returned %r. Expected sequence(s) of length %d." + % (reduced if is_tuple else reduced[0], chunk_size) + ) + if any(_num_samples(r) != chunk_size for r in reduced): + actual_size = tuple(_num_samples(r) for r in reduced) + raise ValueError( + "reduce_func returned object of length %s. " + "Expected same length as input: %d." + % (actual_size if is_tuple else actual_size[0], chunk_size) + ) + + +def _precompute_metric_params(X, Y, metric=None, **kwds): + """Precompute data-derived metric parameters if not provided.""" + if metric == "seuclidean" and "V" not in kwds: + if X is Y: + V = np.var(X, axis=0, ddof=1) + else: + raise ValueError( + "The 'V' parameter is required for the seuclidean metric " + "when Y is passed." + ) + return {"V": V} + if metric == "mahalanobis" and "VI" not in kwds: + if X is Y: + VI = np.linalg.inv(np.cov(X.T)).T + else: + raise ValueError( + "The 'VI' parameter is required for the mahalanobis metric " + "when Y is passed." + ) + return {"VI": VI} + return {} + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "Y": ["array-like", "sparse matrix", None], + "reduce_func": [callable, None], + "metric": [StrOptions({"precomputed"}.union(_VALID_METRICS)), callable], + "n_jobs": [Integral, None], + "working_memory": [Interval(Real, 0, None, closed="left"), None], + }, + prefer_skip_nested_validation=False, # metric is not validated yet +) +def pairwise_distances_chunked( + X, + Y=None, + *, + reduce_func=None, + metric="euclidean", + n_jobs=None, + working_memory=None, + **kwds, +): + """Generate a distance matrix chunk by chunk with optional reduction. + + In cases where not all of a pairwise distance matrix needs to be + stored at once, this is used to calculate pairwise distances in + ``working_memory``-sized chunks. If ``reduce_func`` is given, it is + run on each chunk and its return values are concatenated into lists, + arrays or sparse matrices. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples_X, n_samples_X) or \ + (n_samples_X, n_features) + Array of pairwise distances between samples, or a feature array. + The shape the array should be (n_samples_X, n_samples_X) if + metric='precomputed' and (n_samples_X, n_features) otherwise. + + Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None + An optional second feature array. Only allowed if + metric != "precomputed". + + reduce_func : callable, default=None + The function which is applied on each chunk of the distance matrix, + reducing it to needed values. 
``reduce_func(D_chunk, start)`` + is called repeatedly, where ``D_chunk`` is a contiguous vertical + slice of the pairwise distance matrix, starting at row ``start``. + It should return one of: None; an array, a list, or a sparse matrix + of length ``D_chunk.shape[0]``; or a tuple of such objects. + Returning None is useful for in-place operations, rather than + reductions. + + If None, pairwise_distances_chunked returns a generator of vertical + chunks of the distance matrix. + + metric : str or callable, default='euclidean' + The metric to use when calculating distance between instances in a + feature array. If metric is a string, it must be one of the options + allowed by :func:`scipy.spatial.distance.pdist` for its metric parameter, + or a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS. + If metric is "precomputed", X is assumed to be a distance matrix. + Alternatively, if metric is a callable function, it is called on + each pair of instances (rows) and the resulting value recorded. + The callable should take two arrays from X as input and return a + value indicating the distance between them. + + n_jobs : int, default=None + The number of jobs to use for the computation. This works by + breaking down the pairwise matrix into n_jobs even slices and + computing them in parallel. + + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + working_memory : float, default=None + The sought maximum memory for temporary distance matrix chunks. + When None (default), the value of + ``sklearn.get_config()['working_memory']`` is used. + + **kwds : optional keyword parameters + Any further parameters are passed directly to the distance function. + If using a :mod:`scipy.spatial.distance` metric, the parameters are still + metric dependent. See the scipy docs for usage examples. + + Yields + ------ + D_chunk : {ndarray, sparse matrix} + A contiguous slice of distance matrix, optionally processed by + ``reduce_func``. + + Examples + -------- + Without reduce_func: + + >>> import numpy as np + >>> from sklearn.metrics import pairwise_distances_chunked + >>> X = np.random.RandomState(0).rand(5, 3) + >>> D_chunk = next(pairwise_distances_chunked(X)) + >>> D_chunk + array([[0. , 0.295, 0.417, 0.197, 0.572], + [0.295, 0. , 0.576, 0.419, 0.764], + [0.417, 0.576, 0. , 0.449, 0.903], + [0.197, 0.419, 0.449, 0. , 0.512], + [0.572, 0.764, 0.903, 0.512, 0. ]]) + + Retrieve all neighbors and average distance within radius r: + + >>> r = .2 + >>> def reduce_func(D_chunk, start): + ... neigh = [np.flatnonzero(d < r) for d in D_chunk] + ... avg_dist = (D_chunk * (D_chunk < r)).mean(axis=1) + ... return neigh, avg_dist + >>> gen = pairwise_distances_chunked(X, reduce_func=reduce_func) + >>> neigh, avg_dist = next(gen) + >>> neigh + [array([0, 3]), array([1]), array([2]), array([0, 3]), array([4])] + >>> avg_dist + array([0.039, 0. , 0. , 0.039, 0. ]) + + Where r is defined per sample, we need to make use of ``start``: + + >>> r = [.2, .4, .4, .3, .1] + >>> def reduce_func(D_chunk, start): + ... neigh = [np.flatnonzero(d < r[i]) + ... for i, d in enumerate(D_chunk, start)] + ... return neigh + >>> neigh = next(pairwise_distances_chunked(X, reduce_func=reduce_func)) + >>> neigh + [array([0, 3]), array([0, 1]), array([2]), array([0, 3]), array([4])] + + Force row-by-row generation by reducing ``working_memory``: + + >>> gen = pairwise_distances_chunked(X, reduce_func=reduce_func, + ... 
working_memory=0) + >>> next(gen) + [array([0, 3])] + >>> next(gen) + [array([0, 1])] + """ + n_samples_X = _num_samples(X) + if metric == "precomputed": + slices = (slice(0, n_samples_X),) + else: + if Y is None: + Y = X + # We get as many rows as possible within our working_memory budget to + # store len(Y) distances in each row of output. + # + # Note: + # - this will get at least 1 row, even if 1 row of distances will + # exceed working_memory. + # - this does not account for any temporary memory usage while + # calculating distances (e.g. difference of vectors in manhattan + # distance. + chunk_n_rows = get_chunk_n_rows( + row_bytes=8 * _num_samples(Y), + max_n_rows=n_samples_X, + working_memory=working_memory, + ) + slices = gen_batches(n_samples_X, chunk_n_rows) + + # precompute data-derived metric params + params = _precompute_metric_params(X, Y, metric=metric, **kwds) + kwds.update(**params) + + for sl in slices: + if sl.start == 0 and sl.stop == n_samples_X: + X_chunk = X # enable optimised paths for X is Y + else: + X_chunk = X[sl] + D_chunk = pairwise_distances(X_chunk, Y, metric=metric, n_jobs=n_jobs, **kwds) + if (X is Y or Y is None) and PAIRWISE_DISTANCE_FUNCTIONS.get( + metric, None + ) is euclidean_distances: + # zeroing diagonal, taking care of aliases of "euclidean", + # i.e. "l2" + D_chunk.flat[sl.start :: _num_samples(X) + 1] = 0 + if reduce_func is not None: + chunk_size = D_chunk.shape[0] + D_chunk = reduce_func(D_chunk, sl.start) + _check_chunk_size(D_chunk, chunk_size) + yield D_chunk + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "Y": ["array-like", "sparse matrix", None], + "metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable], + "n_jobs": [Integral, None], + "force_all_finite": [ + "boolean", + StrOptions({"allow-nan"}), + Hidden(StrOptions({"deprecated"})), + ], + "ensure_all_finite": ["boolean", StrOptions({"allow-nan"}), Hidden(None)], + }, + prefer_skip_nested_validation=True, +) +def pairwise_distances( + X, + Y=None, + metric="euclidean", + *, + n_jobs=None, + force_all_finite="deprecated", + ensure_all_finite=None, + **kwds, +): + """Compute the distance matrix from a feature array X and optional Y. + + This function takes one or two feature arrays or a distance matrix, and returns + a distance matrix. + + - If `X` is a feature array, of shape (n_samples_X, n_features), and: + + - `Y` is `None` and `metric` is not 'precomputed', the pairwise distances + between `X` and itself are returned. + - `Y` is a feature array of shape (n_samples_Y, n_features), the pairwise + distances between `X` and `Y` is returned. + + - If `X` is a distance matrix, of shape (n_samples_X, n_samples_X), `metric` + should be 'precomputed'. `Y` is thus ignored and `X` is returned as is. + + If the input is a collection of non-numeric data (e.g. a list of strings or a + boolean array), a custom metric must be passed. + + This method provides a safe way to take a distance matrix as input, while + preserving compatibility with many other algorithms that take a vector + array. + + Valid values for metric are: + + - From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', + 'manhattan', 'nan_euclidean']. All metrics support sparse matrix + inputs except 'nan_euclidean'. 
+ + - From :mod:`scipy.spatial.distance`: ['braycurtis', 'canberra', 'chebyshev', + 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', + 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', + 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']. + These metrics do not support sparse matrix inputs. + + .. note:: + `'kulsinski'` is deprecated from SciPy 1.9 and will be removed in SciPy 1.11. + + .. note:: + `'matching'` has been removed in SciPy 1.9 (use `'hamming'` instead). + + Note that in the case of 'cityblock', 'cosine' and 'euclidean' (which are + valid :mod:`scipy.spatial.distance` metrics), the scikit-learn implementation + will be used, which is faster and has support for sparse matrices (except + for 'cityblock'). For a verbose description of the metrics from + scikit-learn, see :func:`sklearn.metrics.pairwise.distance_metrics` + function. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples_X, n_samples_X) or \ + (n_samples_X, n_features) + Array of pairwise distances between samples, or a feature array. + The shape of the array should be (n_samples_X, n_samples_X) if + metric == "precomputed" and (n_samples_X, n_features) otherwise. + + Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None + An optional second feature array. Only allowed if + metric != "precomputed". + + metric : str or callable, default='euclidean' + The metric to use when calculating distance between instances in a + feature array. If metric is a string, it must be one of the options + allowed by :func:`scipy.spatial.distance.pdist` for its metric parameter, or + a metric listed in ``pairwise.PAIRWISE_DISTANCE_FUNCTIONS``. + If metric is "precomputed", X is assumed to be a distance matrix. + Alternatively, if metric is a callable function, it is called on each + pair of instances (rows) and the resulting value recorded. The callable + should take two arrays from X as input and return a value indicating + the distance between them. + + n_jobs : int, default=None + The number of jobs to use for the computation. This works by breaking + down the pairwise matrix into n_jobs even slices and computing them + using multithreading. + + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + The "euclidean" and "cosine" metrics rely heavily on BLAS which is already + multithreaded. So, increasing `n_jobs` would likely cause oversubscription + and quickly degrade performance. + + force_all_finite : bool or 'allow-nan', default=True + Whether to raise an error on np.inf, np.nan, pd.NA in array. Ignored + for a metric listed in ``pairwise.PAIRWISE_DISTANCE_FUNCTIONS``. The + possibilities are: + + - True: Force all values of array to be finite. + - False: accepts np.inf, np.nan, pd.NA in array. + - 'allow-nan': accepts only np.nan and pd.NA values in array. Values + cannot be infinite. + + .. versionadded:: 0.22 + ``force_all_finite`` accepts the string ``'allow-nan'``. + + .. versionchanged:: 0.23 + Accepts `pd.NA` and converts it into `np.nan`. + + .. deprecated:: 1.6 + `force_all_finite` was renamed to `ensure_all_finite` and will be removed + in 1.8. + + ensure_all_finite : bool or 'allow-nan', default=True + Whether to raise an error on np.inf, np.nan, pd.NA in array. Ignored + for a metric listed in ``pairwise.PAIRWISE_DISTANCE_FUNCTIONS``. 
The + possibilities are: + + - True: Force all values of array to be finite. + - False: accepts np.inf, np.nan, pd.NA in array. + - 'allow-nan': accepts only np.nan and pd.NA values in array. Values + cannot be infinite. + + .. versionadded:: 1.6 + `force_all_finite` was renamed to `ensure_all_finite`. + + **kwds : optional keyword parameters + Any further parameters are passed directly to the distance function. + If using a scipy.spatial.distance metric, the parameters are still + metric dependent. See the scipy docs for usage examples. + + Returns + ------- + D : ndarray of shape (n_samples_X, n_samples_X) or \ + (n_samples_X, n_samples_Y) + A distance matrix D such that D_{i, j} is the distance between the + ith and jth vectors of the given matrix X, if Y is None. + If Y is not None, then D_{i, j} is the distance between the ith array + from X and the jth array from Y. + + See Also + -------- + pairwise_distances_chunked : Performs the same calculation as this + function, but returns a generator of chunks of the distance matrix, in + order to limit memory usage. + sklearn.metrics.pairwise.paired_distances : Computes the distances between + corresponding elements of two arrays. + + Notes + ----- + If metric is a callable, no restrictions are placed on `X` and `Y` dimensions. + + Examples + -------- + >>> from sklearn.metrics.pairwise import pairwise_distances + >>> X = [[0, 0, 0], [1, 1, 1]] + >>> Y = [[1, 0, 0], [1, 1, 0]] + >>> pairwise_distances(X, Y, metric='sqeuclidean') + array([[1., 2.], + [2., 1.]]) + """ + ensure_all_finite = _deprecate_force_all_finite(force_all_finite, ensure_all_finite) + + if metric == "precomputed": + X, _ = check_pairwise_arrays( + X, Y, precomputed=True, ensure_all_finite=ensure_all_finite + ) + + whom = ( + "`pairwise_distances`. Precomputed distance " + " need to have non-negative values." 
+ ) + check_non_negative(X, whom=whom) + return X + elif metric in PAIRWISE_DISTANCE_FUNCTIONS: + func = PAIRWISE_DISTANCE_FUNCTIONS[metric] + elif callable(metric): + func = partial( + _pairwise_callable, + metric=metric, + ensure_all_finite=ensure_all_finite, + **kwds, + ) + else: + if issparse(X) or issparse(Y): + raise TypeError("scipy distance metrics do not support sparse matrices.") + + dtype = bool if metric in PAIRWISE_BOOLEAN_FUNCTIONS else "infer_float" + + if dtype is bool and (X.dtype != bool or (Y is not None and Y.dtype != bool)): + msg = "Data was converted to boolean for metric %s" % metric + warnings.warn(msg, DataConversionWarning) + + X, Y = check_pairwise_arrays( + X, Y, dtype=dtype, ensure_all_finite=ensure_all_finite + ) + + # precompute data-derived metric params + params = _precompute_metric_params(X, Y, metric=metric, **kwds) + kwds.update(**params) + + if effective_n_jobs(n_jobs) == 1 and X is Y: + return distance.squareform(distance.pdist(X, metric=metric, **kwds)) + func = partial(distance.cdist, metric=metric, **kwds) + + return _parallel_pairwise(X, Y, func, n_jobs, **kwds) + + +# These distances require boolean arrays, when using scipy.spatial.distance +PAIRWISE_BOOLEAN_FUNCTIONS = [ + "dice", + "jaccard", + "rogerstanimoto", + "russellrao", + "sokalsneath", + "yule", +] +if sp_base_version < parse_version("1.17"): + # Deprecated in SciPy 1.15 and removed in SciPy 1.17 + PAIRWISE_BOOLEAN_FUNCTIONS += ["sokalmichener"] +if sp_base_version < parse_version("1.11"): + # Deprecated in SciPy 1.9 and removed in SciPy 1.11 + PAIRWISE_BOOLEAN_FUNCTIONS += ["kulsinski"] +if sp_base_version < parse_version("1.9"): + # Deprecated in SciPy 1.0 and removed in SciPy 1.9 + PAIRWISE_BOOLEAN_FUNCTIONS += ["matching"] + +# Helper functions - distance +PAIRWISE_KERNEL_FUNCTIONS = { + # If updating this dictionary, update the doc in both distance_metrics() + # and also in pairwise_distances()! + "additive_chi2": additive_chi2_kernel, + "chi2": chi2_kernel, + "linear": linear_kernel, + "polynomial": polynomial_kernel, + "poly": polynomial_kernel, + "rbf": rbf_kernel, + "laplacian": laplacian_kernel, + "sigmoid": sigmoid_kernel, + "cosine": cosine_similarity, +} + + +def kernel_metrics(): + """Valid metrics for pairwise_kernels. + + This function simply returns the valid pairwise distance metrics. + It exists, however, to allow for a verbose description of the mapping for + each of the valid strings. + + The valid distance metrics, and the function they map to, are: + =============== ======================================== + metric Function + =============== ======================================== + 'additive_chi2' sklearn.pairwise.additive_chi2_kernel + 'chi2' sklearn.pairwise.chi2_kernel + 'linear' sklearn.pairwise.linear_kernel + 'poly' sklearn.pairwise.polynomial_kernel + 'polynomial' sklearn.pairwise.polynomial_kernel + 'rbf' sklearn.pairwise.rbf_kernel + 'laplacian' sklearn.pairwise.laplacian_kernel + 'sigmoid' sklearn.pairwise.sigmoid_kernel + 'cosine' sklearn.pairwise.cosine_similarity + =============== ======================================== + + Read more in the :ref:`User Guide `. + + Returns + ------- + kernel_metrics : dict + Returns valid metrics for pairwise_kernels. 
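For instance (an illustrative sketch), the mapping can be used to look up the callable behind a kernel name, which is what `pairwise_kernels` dispatches to for string metrics:

>>> from sklearn.metrics.pairwise import kernel_metrics, rbf_kernel
>>> kernel_metrics()["rbf"] is rbf_kernel
True
>>> sorted(kernel_metrics())
['additive_chi2', 'chi2', 'cosine', 'laplacian', 'linear', 'poly', 'polynomial', 'rbf', 'sigmoid']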
+ """ + return PAIRWISE_KERNEL_FUNCTIONS + + +KERNEL_PARAMS = { + "additive_chi2": (), + "chi2": frozenset(["gamma"]), + "cosine": (), + "linear": (), + "poly": frozenset(["gamma", "degree", "coef0"]), + "polynomial": frozenset(["gamma", "degree", "coef0"]), + "rbf": frozenset(["gamma"]), + "laplacian": frozenset(["gamma"]), + "sigmoid": frozenset(["gamma", "coef0"]), +} + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "Y": ["array-like", "sparse matrix", None], + "metric": [ + StrOptions(set(PAIRWISE_KERNEL_FUNCTIONS) | {"precomputed"}), + callable, + ], + "filter_params": ["boolean"], + "n_jobs": [Integral, None], + }, + prefer_skip_nested_validation=True, +) +def pairwise_kernels( + X, Y=None, metric="linear", *, filter_params=False, n_jobs=None, **kwds +): + """Compute the kernel between arrays X and optional array Y. + + This function takes one or two feature arrays or a kernel matrix, and returns + a kernel matrix. + + - If `X` is a feature array, of shape (n_samples_X, n_features), and: + + - `Y` is `None` and `metric` is not 'precomputed', the pairwise kernels + between `X` and itself are returned. + - `Y` is a feature array of shape (n_samples_Y, n_features), the pairwise + kernels between `X` and `Y` is returned. + + - If `X` is a kernel matrix, of shape (n_samples_X, n_samples_X), `metric` + should be 'precomputed'. `Y` is thus ignored and `X` is returned as is. + + This method provides a safe way to take a kernel matrix as input, while + preserving compatibility with many other algorithms that take a vector + array. + + Valid values for metric are: + ['additive_chi2', 'chi2', 'linear', 'poly', 'polynomial', 'rbf', + 'laplacian', 'sigmoid', 'cosine'] + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples_X, n_samples_X) or \ + (n_samples_X, n_features) + Array of pairwise kernels between samples, or a feature array. + The shape of the array should be (n_samples_X, n_samples_X) if + metric == "precomputed" and (n_samples_X, n_features) otherwise. + + Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None + A second feature array only if X has shape (n_samples_X, n_features). + + metric : str or callable, default="linear" + The metric to use when calculating kernel between instances in a + feature array. If metric is a string, it must be one of the metrics + in ``pairwise.PAIRWISE_KERNEL_FUNCTIONS``. + If metric is "precomputed", X is assumed to be a kernel matrix. + Alternatively, if metric is a callable function, it is called on each + pair of instances (rows) and the resulting value recorded. The callable + should take two rows from X as input and return the corresponding + kernel value as a single number. This means that callables from + :mod:`sklearn.metrics.pairwise` are not allowed, as they operate on + matrices, not single samples. Use the string identifying the kernel + instead. + + filter_params : bool, default=False + Whether to filter invalid parameters or not. + + n_jobs : int, default=None + The number of jobs to use for the computation. This works by breaking + down the pairwise matrix into n_jobs even slices and computing them + using multithreading. + + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + **kwds : optional keyword parameters + Any further parameters are passed directly to the kernel function. 
+ + Returns + ------- + K : ndarray of shape (n_samples_X, n_samples_X) or (n_samples_X, n_samples_Y) + A kernel matrix K such that K_{i, j} is the kernel between the + ith and jth vectors of the given matrix X, if Y is None. + If Y is not None, then K_{i, j} is the kernel between the ith array + from X and the jth array from Y. + + Notes + ----- + If metric is a callable, no restrictions are placed on `X` and `Y` dimensions. + + Examples + -------- + >>> from sklearn.metrics.pairwise import pairwise_kernels + >>> X = [[0, 0, 0], [1, 1, 1]] + >>> Y = [[1, 0, 0], [1, 1, 0]] + >>> pairwise_kernels(X, Y, metric='linear') + array([[0., 0.], + [1., 2.]]) + """ + # import GPKernel locally to prevent circular imports + from ..gaussian_process.kernels import Kernel as GPKernel + + if metric == "precomputed": + X, _ = check_pairwise_arrays(X, Y, precomputed=True) + return X + elif isinstance(metric, GPKernel): + func = metric.__call__ + elif metric in PAIRWISE_KERNEL_FUNCTIONS: + if filter_params: + kwds = {k: kwds[k] for k in kwds if k in KERNEL_PARAMS[metric]} + func = PAIRWISE_KERNEL_FUNCTIONS[metric] + elif callable(metric): + func = partial(_pairwise_callable, metric=metric, **kwds) + + return _parallel_pairwise(X, Y, func, n_jobs, **kwds) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_classification.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..b66353e5ecfab4973aca5456473dbb947b86b0a9 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_classification.py @@ -0,0 +1,3397 @@ +import re +import warnings +from functools import partial +from itertools import chain, permutations, product + +import numpy as np +import pytest +from scipy import linalg +from scipy.spatial.distance import hamming as sp_hamming +from scipy.stats import bernoulli + +from sklearn import datasets, svm +from sklearn.datasets import make_multilabel_classification +from sklearn.exceptions import UndefinedMetricWarning +from sklearn.metrics import ( + accuracy_score, + average_precision_score, + balanced_accuracy_score, + brier_score_loss, + class_likelihood_ratios, + classification_report, + cohen_kappa_score, + confusion_matrix, + f1_score, + fbeta_score, + hamming_loss, + hinge_loss, + jaccard_score, + log_loss, + make_scorer, + matthews_corrcoef, + multilabel_confusion_matrix, + precision_recall_fscore_support, + precision_score, + recall_score, + zero_one_loss, +) +from sklearn.metrics._classification import _check_targets, d2_log_loss_score +from sklearn.model_selection import cross_val_score +from sklearn.preprocessing import LabelBinarizer, label_binarize +from sklearn.tree import DecisionTreeClassifier +from sklearn.utils._mocking import MockDataFrame +from sklearn.utils._testing import ( + assert_allclose, + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, + ignore_warnings, +) +from sklearn.utils.extmath import _nanaverage +from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS +from sklearn.utils.validation import check_random_state + +############################################################################### +# Utilities for testing + + +def 
make_prediction(dataset=None, binary=False): + """Make some classification predictions on a toy dataset using a SVC + + If binary is True restrict to a binary classification problem instead of a + multiclass classification problem + """ + + if dataset is None: + # import some data to play with + dataset = datasets.load_iris() + + X = dataset.data + y = dataset.target + + if binary: + # restrict to a binary classification task + X, y = X[y < 2], y[y < 2] + + n_samples, n_features = X.shape + p = np.arange(n_samples) + + rng = check_random_state(37) + rng.shuffle(p) + X, y = X[p], y[p] + half = int(n_samples / 2) + + # add noisy features to make the problem harder and avoid perfect results + rng = np.random.RandomState(0) + X = np.c_[X, rng.randn(n_samples, 200 * n_features)] + + # run classifier, get class probabilities and label predictions + clf = svm.SVC(kernel="linear", probability=True, random_state=0) + y_pred_proba = clf.fit(X[:half], y[:half]).predict_proba(X[half:]) + + if binary: + # only interested in probabilities of the positive case + # XXX: do we really want a special API for the binary case? + y_pred_proba = y_pred_proba[:, 1] + + y_pred = clf.predict(X[half:]) + y_true = y[half:] + return y_true, y_pred, y_pred_proba + + +############################################################################### +# Tests + + +def test_classification_report_dictionary_output(): + # Test performance report with dictionary output + iris = datasets.load_iris() + y_true, y_pred, _ = make_prediction(dataset=iris, binary=False) + + # print classification report with class names + expected_report = { + "setosa": { + "precision": 0.82608695652173914, + "recall": 0.79166666666666663, + "f1-score": 0.8085106382978724, + "support": 24, + }, + "versicolor": { + "precision": 0.33333333333333331, + "recall": 0.096774193548387094, + "f1-score": 0.15000000000000002, + "support": 31, + }, + "virginica": { + "precision": 0.41860465116279072, + "recall": 0.90000000000000002, + "f1-score": 0.57142857142857151, + "support": 20, + }, + "macro avg": { + "f1-score": 0.5099797365754813, + "precision": 0.5260083136726211, + "recall": 0.596146953405018, + "support": 75, + }, + "accuracy": 0.5333333333333333, + "weighted avg": { + "f1-score": 0.47310435663627154, + "precision": 0.5137535108414785, + "recall": 0.5333333333333333, + "support": 75, + }, + } + + report = classification_report( + y_true, + y_pred, + labels=np.arange(len(iris.target_names)), + target_names=iris.target_names, + output_dict=True, + ) + + # assert the 2 dicts are equal. 
+ assert report.keys() == expected_report.keys() + for key in expected_report: + if key == "accuracy": + assert isinstance(report[key], float) + assert report[key] == expected_report[key] + else: + assert report[key].keys() == expected_report[key].keys() + for metric in expected_report[key]: + assert_almost_equal(expected_report[key][metric], report[key][metric]) + + assert isinstance(expected_report["setosa"]["precision"], float) + assert isinstance(expected_report["macro avg"]["precision"], float) + assert isinstance(expected_report["setosa"]["support"], int) + assert isinstance(expected_report["macro avg"]["support"], int) + + +def test_classification_report_output_dict_empty_input(): + report = classification_report(y_true=[], y_pred=[], output_dict=True) + expected_report = { + "accuracy": 0.0, + "macro avg": { + "f1-score": np.nan, + "precision": np.nan, + "recall": np.nan, + "support": 0, + }, + "weighted avg": { + "f1-score": np.nan, + "precision": np.nan, + "recall": np.nan, + "support": 0, + }, + } + assert isinstance(report, dict) + # assert the 2 dicts are equal. + assert report.keys() == expected_report.keys() + for key in expected_report: + if key == "accuracy": + assert isinstance(report[key], float) + assert report[key] == expected_report[key] + else: + assert report[key].keys() == expected_report[key].keys() + for metric in expected_report[key]: + assert_almost_equal(expected_report[key][metric], report[key][metric]) + + +@pytest.mark.parametrize("zero_division", ["warn", 0, 1, np.nan]) +def test_classification_report_zero_division_warning(zero_division): + y_true, y_pred = ["a", "b", "c"], ["a", "b", "d"] + with warnings.catch_warnings(record=True) as record: + classification_report( + y_true, y_pred, zero_division=zero_division, output_dict=True + ) + if zero_division == "warn": + assert len(record) > 1 + for item in record: + msg = "Use `zero_division` parameter to control this behavior." + assert msg in str(item.message) + else: + assert not record + + +@pytest.mark.parametrize( + "labels, show_micro_avg", [([0], True), ([0, 1], False), ([0, 1, 2], False)] +) +def test_classification_report_labels_subset_superset(labels, show_micro_avg): + """Check the behaviour of passing `labels` as a superset or subset of the labels. + WHen a superset, we expect to show the "accuracy" in the report while it should be + the micro-averaging if this is a subset. 
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/27927 + """ + + y_true, y_pred = [0, 1], [0, 1] + + report = classification_report(y_true, y_pred, labels=labels, output_dict=True) + if show_micro_avg: + assert "micro avg" in report + assert "accuracy" not in report + else: # accuracy should be shown + assert "accuracy" in report + assert "micro avg" not in report + + +def test_multilabel_accuracy_score_subset_accuracy(): + # Dense label indicator matrix format + y1 = np.array([[0, 1, 1], [1, 0, 1]]) + y2 = np.array([[0, 0, 1], [1, 0, 1]]) + + assert accuracy_score(y1, y2) == 0.5 + assert accuracy_score(y1, y1) == 1 + assert accuracy_score(y2, y2) == 1 + assert accuracy_score(y2, np.logical_not(y2)) == 0 + assert accuracy_score(y1, np.logical_not(y1)) == 0 + assert accuracy_score(y1, np.zeros(y1.shape)) == 0 + assert accuracy_score(y2, np.zeros(y1.shape)) == 0 + + +def test_precision_recall_f1_score_binary(): + # Test Precision Recall and F1 Score for binary classification task + y_true, y_pred, _ = make_prediction(binary=True) + + # detailed measures for each class + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None) + assert_array_almost_equal(p, [0.73, 0.85], 2) + assert_array_almost_equal(r, [0.88, 0.68], 2) + assert_array_almost_equal(f, [0.80, 0.76], 2) + assert_array_equal(s, [25, 25]) + + # individual scoring function that can be used for grid search: in the + # binary class case the score is the value of the measure for the positive + # class (e.g. label == 1). This is deprecated for average != 'binary'. + for kwargs in [{}, {"average": "binary"}]: + with warnings.catch_warnings(): + warnings.simplefilter("error") + + ps = precision_score(y_true, y_pred, **kwargs) + assert_array_almost_equal(ps, 0.85, 2) + + rs = recall_score(y_true, y_pred, **kwargs) + assert_array_almost_equal(rs, 0.68, 2) + + fs = f1_score(y_true, y_pred, **kwargs) + assert_array_almost_equal(fs, 0.76, 2) + + assert_almost_equal( + fbeta_score(y_true, y_pred, beta=2, **kwargs), + (1 + 2**2) * ps * rs / (2**2 * ps + rs), + 2, + ) + + +@pytest.mark.filterwarnings(r"ignore::sklearn.exceptions.UndefinedMetricWarning") +def test_precision_recall_f_binary_single_class(): + # Test precision, recall and F-scores behave with a single positive or + # negative class + # Such a case may occur with non-stratified cross-validation + assert 1.0 == precision_score([1, 1], [1, 1]) + assert 1.0 == recall_score([1, 1], [1, 1]) + assert 1.0 == f1_score([1, 1], [1, 1]) + assert 1.0 == fbeta_score([1, 1], [1, 1], beta=0) + + assert 0.0 == precision_score([-1, -1], [-1, -1]) + assert 0.0 == recall_score([-1, -1], [-1, -1]) + assert 0.0 == f1_score([-1, -1], [-1, -1]) + assert 0.0 == fbeta_score([-1, -1], [-1, -1], beta=float("inf")) + assert fbeta_score([-1, -1], [-1, -1], beta=float("inf")) == pytest.approx( + fbeta_score([-1, -1], [-1, -1], beta=1e5) + ) + + +@pytest.mark.filterwarnings(r"ignore::sklearn.exceptions.UndefinedMetricWarning") +def test_precision_recall_f_extra_labels(): + # Test handling of explicit additional (not in input) labels to PRF + y_true = [1, 3, 3, 2] + y_pred = [1, 1, 3, 2] + y_true_bin = label_binarize(y_true, classes=np.arange(5)) + y_pred_bin = label_binarize(y_pred, classes=np.arange(5)) + data = [(y_true, y_pred), (y_true_bin, y_pred_bin)] + + for i, (y_true, y_pred) in enumerate(data): + # No average: zeros in array + actual = recall_score(y_true, y_pred, labels=[0, 1, 2, 3, 4], average=None) + assert_array_almost_equal([0.0, 1.0, 1.0, 0.5, 
0.0], actual) + + # Macro average is changed + actual = recall_score(y_true, y_pred, labels=[0, 1, 2, 3, 4], average="macro") + assert_array_almost_equal(np.mean([0.0, 1.0, 1.0, 0.5, 0.0]), actual) + + # No effect otherwise + for average in ["micro", "weighted", "samples"]: + if average == "samples" and i == 0: + continue + assert_almost_equal( + recall_score(y_true, y_pred, labels=[0, 1, 2, 3, 4], average=average), + recall_score(y_true, y_pred, labels=None, average=average), + ) + + # Error when introducing invalid label in multilabel case + # (although it would only affect performance if average='macro'/None) + for average in [None, "macro", "micro", "samples"]: + with pytest.raises(ValueError): + recall_score(y_true_bin, y_pred_bin, labels=np.arange(6), average=average) + with pytest.raises(ValueError): + recall_score( + y_true_bin, y_pred_bin, labels=np.arange(-1, 4), average=average + ) + + # tests non-regression on issue #10307 + y_true = np.array([[0, 1, 1], [1, 0, 0]]) + y_pred = np.array([[1, 1, 1], [1, 0, 1]]) + p, r, f, _ = precision_recall_fscore_support( + y_true, y_pred, average="samples", labels=[0, 1] + ) + assert_almost_equal(np.array([p, r, f]), np.array([3 / 4, 1, 5 / 6])) + + +@pytest.mark.filterwarnings(r"ignore::sklearn.exceptions.UndefinedMetricWarning") +def test_precision_recall_f_ignored_labels(): + # Test a subset of labels may be requested for PRF + y_true = [1, 1, 2, 3] + y_pred = [1, 3, 3, 3] + y_true_bin = label_binarize(y_true, classes=np.arange(5)) + y_pred_bin = label_binarize(y_pred, classes=np.arange(5)) + data = [(y_true, y_pred), (y_true_bin, y_pred_bin)] + + for i, (y_true, y_pred) in enumerate(data): + recall_13 = partial(recall_score, y_true, y_pred, labels=[1, 3]) + recall_all = partial(recall_score, y_true, y_pred, labels=None) + + assert_array_almost_equal([0.5, 1.0], recall_13(average=None)) + assert_almost_equal((0.5 + 1.0) / 2, recall_13(average="macro")) + assert_almost_equal((0.5 * 2 + 1.0 * 1) / 3, recall_13(average="weighted")) + assert_almost_equal(2.0 / 3, recall_13(average="micro")) + + # ensure the above were meaningful tests: + for average in ["macro", "weighted", "micro"]: + assert recall_13(average=average) != recall_all(average=average) + + +def test_average_precision_score_non_binary_class(): + """Test multiclass-multiouptut for `average_precision_score`.""" + y_true = np.array( + [ + [2, 2, 1], + [1, 2, 0], + [0, 1, 2], + [1, 2, 1], + [2, 0, 1], + [1, 2, 1], + ] + ) + y_score = np.array( + [ + [0.7, 0.2, 0.1], + [0.4, 0.3, 0.3], + [0.1, 0.8, 0.1], + [0.2, 0.3, 0.5], + [0.4, 0.4, 0.2], + [0.1, 0.2, 0.7], + ] + ) + err_msg = "multiclass-multioutput format is not supported" + with pytest.raises(ValueError, match=err_msg): + average_precision_score(y_true, y_score, pos_label=2) + + +@pytest.mark.parametrize( + "y_true, y_score", + [ + ( + [0, 0, 1, 2], + np.array( + [ + [0.7, 0.2, 0.1], + [0.4, 0.3, 0.3], + [0.1, 0.8, 0.1], + [0.2, 0.3, 0.5], + ] + ), + ), + ( + [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], + [0, 0.1, 0.1, 0.4, 0.5, 0.6, 0.6, 0.9, 0.9, 1, 1], + ), + ], +) +def test_average_precision_score_duplicate_values(y_true, y_score): + """ + Duplicate values with precision-recall require a different + processing than when computing the AUC of a ROC, because the + precision-recall curve is a decreasing curve + The following situation corresponds to a perfect + test statistic, the average_precision_score should be 1. 
+ """ + assert average_precision_score(y_true, y_score) == 1 + + +@pytest.mark.parametrize( + "y_true, y_score", + [ + ( + [2, 2, 1, 1, 0], + np.array( + [ + [0.2, 0.3, 0.5], + [0.2, 0.3, 0.5], + [0.4, 0.5, 0.3], + [0.4, 0.5, 0.3], + [0.8, 0.5, 0.3], + ] + ), + ), + ( + [0, 1, 1], + [0.5, 0.5, 0.6], + ), + ], +) +def test_average_precision_score_tied_values(y_true, y_score): + # Here if we go from left to right in y_true, the 0 values are + # separated from the 1 values, so it appears that we've + # correctly sorted our classifications. But in fact the first two + # values have the same score (0.5) and so the first two values + # could be swapped around, creating an imperfect sorting. This + # imperfection should come through in the end score, making it less + # than one. + assert average_precision_score(y_true, y_score) != 1.0 + + +def test_precision_recall_f_unused_pos_label(): + # Check warning that pos_label unused when set to non-default value + # but average != 'binary'; even if data is binary. + + msg = ( + r"Note that pos_label \(set to 2\) is " + r"ignored when average != 'binary' \(got 'macro'\). You " + r"may use labels=\[pos_label\] to specify a single " + "positive class." + ) + with pytest.warns(UserWarning, match=msg): + precision_recall_fscore_support( + [1, 2, 1], [1, 2, 2], pos_label=2, average="macro" + ) + + +def test_confusion_matrix_binary(): + # Test confusion matrix - binary classification case + y_true, y_pred, _ = make_prediction(binary=True) + + def test(y_true, y_pred): + cm = confusion_matrix(y_true, y_pred) + assert_array_equal(cm, [[22, 3], [8, 17]]) + + tp, fp, fn, tn = cm.flatten() + num = tp * tn - fp * fn + den = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) + + true_mcc = 0 if den == 0 else num / den + mcc = matthews_corrcoef(y_true, y_pred) + assert_array_almost_equal(mcc, true_mcc, decimal=2) + assert_array_almost_equal(mcc, 0.57, decimal=2) + + test(y_true, y_pred) + test([str(y) for y in y_true], [str(y) for y in y_pred]) + + +def test_multilabel_confusion_matrix_binary(): + # Test multilabel confusion matrix - binary classification case + y_true, y_pred, _ = make_prediction(binary=True) + + def test(y_true, y_pred): + cm = multilabel_confusion_matrix(y_true, y_pred) + assert_array_equal(cm, [[[17, 8], [3, 22]], [[22, 3], [8, 17]]]) + + test(y_true, y_pred) + test([str(y) for y in y_true], [str(y) for y in y_pred]) + + +def test_multilabel_confusion_matrix_multiclass(): + # Test multilabel confusion matrix - multi-class case + y_true, y_pred, _ = make_prediction(binary=False) + + def test(y_true, y_pred, string_type=False): + # compute confusion matrix with default labels introspection + cm = multilabel_confusion_matrix(y_true, y_pred) + assert_array_equal( + cm, [[[47, 4], [5, 19]], [[38, 6], [28, 3]], [[30, 25], [2, 18]]] + ) + + # compute confusion matrix with explicit label ordering + labels = ["0", "2", "1"] if string_type else [0, 2, 1] + cm = multilabel_confusion_matrix(y_true, y_pred, labels=labels) + assert_array_equal( + cm, [[[47, 4], [5, 19]], [[30, 25], [2, 18]], [[38, 6], [28, 3]]] + ) + + # compute confusion matrix with super set of present labels + labels = ["0", "2", "1", "3"] if string_type else [0, 2, 1, 3] + cm = multilabel_confusion_matrix(y_true, y_pred, labels=labels) + assert_array_equal( + cm, + [ + [[47, 4], [5, 19]], + [[30, 25], [2, 18]], + [[38, 6], [28, 3]], + [[75, 0], [0, 0]], + ], + ) + + test(y_true, y_pred) + test([str(y) for y in y_true], [str(y) for y in y_pred], string_type=True) + + 
+@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_multilabel_confusion_matrix_multilabel(csc_container, csr_container): + # Test multilabel confusion matrix - multilabel-indicator case + + y_true = np.array([[1, 0, 1], [0, 1, 0], [1, 1, 0]]) + y_pred = np.array([[1, 0, 0], [0, 1, 1], [0, 0, 1]]) + y_true_csr = csr_container(y_true) + y_pred_csr = csr_container(y_pred) + y_true_csc = csc_container(y_true) + y_pred_csc = csc_container(y_pred) + + # cross test different types + sample_weight = np.array([2, 1, 3]) + real_cm = [[[1, 0], [1, 1]], [[1, 0], [1, 1]], [[0, 2], [1, 0]]] + trues = [y_true, y_true_csr, y_true_csc] + preds = [y_pred, y_pred_csr, y_pred_csc] + + for y_true_tmp in trues: + for y_pred_tmp in preds: + cm = multilabel_confusion_matrix(y_true_tmp, y_pred_tmp) + assert_array_equal(cm, real_cm) + + # test support for samplewise + cm = multilabel_confusion_matrix(y_true, y_pred, samplewise=True) + assert_array_equal(cm, [[[1, 0], [1, 1]], [[1, 1], [0, 1]], [[0, 1], [2, 0]]]) + + # test support for labels + cm = multilabel_confusion_matrix(y_true, y_pred, labels=[2, 0]) + assert_array_equal(cm, [[[0, 2], [1, 0]], [[1, 0], [1, 1]]]) + + # test support for labels with samplewise + cm = multilabel_confusion_matrix(y_true, y_pred, labels=[2, 0], samplewise=True) + assert_array_equal(cm, [[[0, 0], [1, 1]], [[1, 1], [0, 0]], [[0, 1], [1, 0]]]) + + # test support for sample_weight with sample_wise + cm = multilabel_confusion_matrix( + y_true, y_pred, sample_weight=sample_weight, samplewise=True + ) + assert_array_equal(cm, [[[2, 0], [2, 2]], [[1, 1], [0, 1]], [[0, 3], [6, 0]]]) + + +def test_multilabel_confusion_matrix_errors(): + y_true = np.array([[1, 0, 1], [0, 1, 0], [1, 1, 0]]) + y_pred = np.array([[1, 0, 0], [0, 1, 1], [0, 0, 1]]) + + # Bad sample_weight + with pytest.raises(ValueError, match="inconsistent numbers of samples"): + multilabel_confusion_matrix(y_true, y_pred, sample_weight=[1, 2]) + with pytest.raises(ValueError, match="should be a 1d array"): + multilabel_confusion_matrix( + y_true, y_pred, sample_weight=[[1, 2, 3], [2, 3, 4], [3, 4, 5]] + ) + + # Bad labels + err_msg = r"All labels must be in \[0, n labels\)" + with pytest.raises(ValueError, match=err_msg): + multilabel_confusion_matrix(y_true, y_pred, labels=[-1]) + err_msg = r"All labels must be in \[0, n labels\)" + with pytest.raises(ValueError, match=err_msg): + multilabel_confusion_matrix(y_true, y_pred, labels=[3]) + + # Using samplewise outside multilabel + with pytest.raises(ValueError, match="Samplewise metrics"): + multilabel_confusion_matrix([0, 1, 2], [1, 2, 0], samplewise=True) + + # Bad y_type + err_msg = "multiclass-multioutput is not supported" + with pytest.raises(ValueError, match=err_msg): + multilabel_confusion_matrix([[0, 1, 2], [2, 1, 0]], [[1, 2, 0], [1, 0, 2]]) + + +@pytest.mark.parametrize( + "normalize, cm_dtype, expected_results", + [ + ("true", "f", 0.333333333), + ("pred", "f", 0.333333333), + ("all", "f", 0.1111111111), + (None, "i", 2), + ], +) +def test_confusion_matrix_normalize(normalize, cm_dtype, expected_results): + y_test = [0, 1, 2] * 6 + y_pred = list(chain(*permutations([0, 1, 2]))) + cm = confusion_matrix(y_test, y_pred, normalize=normalize) + assert_allclose(cm, expected_results) + assert cm.dtype.kind == cm_dtype + + +def test_confusion_matrix_normalize_single_class(): + y_test = [0, 0, 0, 0, 1, 1, 1, 1] + y_pred = [0, 0, 0, 0, 0, 0, 0, 0] + + cm_true = confusion_matrix(y_test, y_pred, 
normalize="true") + assert cm_true.sum() == pytest.approx(2.0) + + # additionally check that no warnings are raised due to a division by zero + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + cm_pred = confusion_matrix(y_test, y_pred, normalize="pred") + + assert cm_pred.sum() == pytest.approx(1.0) + + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + confusion_matrix(y_pred, y_test, normalize="true") + + +def test_confusion_matrix_single_label(): + """Test `confusion_matrix` warns when only one label found.""" + y_test = [0, 0, 0, 0] + y_pred = [0, 0, 0, 0] + + with pytest.warns(UserWarning, match="A single label was found in"): + confusion_matrix(y_pred, y_test) + + +@pytest.mark.parametrize( + "params, warn_msg", + [ + # When `fp == 0` and `tp != 0`, LR+ is undefined + ( + { + "y_true": np.array([1, 1, 1, 0, 0, 0]), + "y_pred": np.array([1, 1, 1, 0, 0, 0]), + }, + "`positive_likelihood_ratio` is ill-defined and set to `np.nan`.", + ), + # When `fp == 0` and `tp == 0`, LR+ is undefined + ( + { + "y_true": np.array([1, 1, 1, 0, 0, 0]), + "y_pred": np.array([0, 0, 0, 0, 0, 0]), + }, + ( + "No samples were predicted for the positive class and " + "`positive_likelihood_ratio` is set to `np.nan`." + ), + ), + # When `tn == 0`, LR- is undefined + ( + { + "y_true": np.array([1, 1, 1, 0, 0, 0]), + "y_pred": np.array([0, 0, 0, 1, 1, 1]), + }, + "`negative_likelihood_ratio` is ill-defined and set to `np.nan`.", + ), + # When `tp + fn == 0` both ratios are undefined + ( + { + "y_true": np.array([0, 0, 0, 0, 0, 0]), + "y_pred": np.array([1, 1, 1, 0, 0, 0]), + }, + "No samples of the positive class are present in `y_true`.", + ), + ], +) +def test_likelihood_ratios_warnings(params, warn_msg): + # likelihood_ratios must raise warnings when at + # least one of the ratios is ill-defined. 
+ + with pytest.warns(UserWarning, match=warn_msg): + class_likelihood_ratios(**params) + + +@pytest.mark.parametrize( + "params, err_msg", + [ + ( + { + "y_true": np.array([0, 1, 0, 1, 0]), + "y_pred": np.array([1, 1, 0, 0, 2]), + }, + ( + "class_likelihood_ratios only supports binary classification " + "problems, got targets of type: multiclass" + ), + ), + ], +) +def test_likelihood_ratios_errors(params, err_msg): + # likelihood_ratios must raise error when attempting + # non-binary classes to avoid Simpson's paradox + with pytest.raises(ValueError, match=err_msg): + class_likelihood_ratios(**params) + + +def test_likelihood_ratios(): + # Build confusion matrix with tn=9, fp=8, fn=1, tp=2, + # sensitivity=2/3, specificity=9/17, prevalence=3/20, + # LR+=34/24, LR-=17/27 + y_true = np.array([1] * 3 + [0] * 17) + y_pred = np.array([1] * 2 + [0] * 10 + [1] * 8) + + pos, neg = class_likelihood_ratios(y_true, y_pred) + assert_allclose(pos, 34 / 24) + assert_allclose(neg, 17 / 27) + + # Build limit case with y_pred = y_true + pos, neg = class_likelihood_ratios(y_true, y_true) + assert_array_equal(pos, np.nan * 2) + assert_allclose(neg, np.zeros(2), rtol=1e-12) + + # Ignore last 5 samples to get tn=9, fp=3, fn=1, tp=2, + # sensitivity=2/3, specificity=9/12, prevalence=3/20, + # LR+=24/9, LR-=12/27 + sample_weight = np.array([1.0] * 15 + [0.0] * 5) + pos, neg = class_likelihood_ratios(y_true, y_pred, sample_weight=sample_weight) + assert_allclose(pos, 24 / 9) + assert_allclose(neg, 12 / 27) + + +# TODO(1.9): remove test +@pytest.mark.parametrize("raise_warning", [True, False]) +def test_likelihood_ratios_raise_warning_deprecation(raise_warning): + """Test that class_likelihood_ratios raises a `FutureWarning` when `raise_warning` + param is set.""" + y_true = np.array([1, 0]) + y_pred = np.array([1, 0]) + + msg = "`raise_warning` was deprecated in version 1.7 and will be removed in 1.9." 
+ with pytest.warns(FutureWarning, match=msg): + class_likelihood_ratios(y_true, y_pred, raise_warning=raise_warning) + + +def test_likelihood_ratios_replace_undefined_by_worst(): + """Test that class_likelihood_ratios returns the worst scores `1.0` for both LR+ and + LR- when `replace_undefined_by=1` is set.""" + # This data causes fp=0 (0 false positives) in the confusion_matrix and a division + # by zero that affects the positive_likelihood_ratio: + y_true = np.array([1, 1, 0]) + y_pred = np.array([1, 0, 0]) + + positive_likelihood_ratio, _ = class_likelihood_ratios( + y_true, y_pred, replace_undefined_by=1 + ) + assert positive_likelihood_ratio == pytest.approx(1.0) + + # This data causes tn=0 (0 true negatives) in the confusion_matrix and a division + # by zero that affects the negative_likelihood_ratio: + y_true = np.array([1, 0, 0]) + y_pred = np.array([1, 1, 1]) + + _, negative_likelihood_ratio = class_likelihood_ratios( + y_true, y_pred, replace_undefined_by=1 + ) + assert negative_likelihood_ratio == pytest.approx(1.0) + + +@pytest.mark.parametrize( + "replace_undefined_by", + [ + {"LR+": 0.0}, + {"LR-": 0.0}, + {"LR+": -5.0, "LR-": 0.0}, + {"LR+": 1.0, "LR-": "nan"}, + {"LR+": 0.0, "LR-": 0.0}, + {"LR+": 1.0, "LR-": 2.0}, + ], +) +def test_likelihood_ratios_wrong_dict_replace_undefined_by(replace_undefined_by): + """Test that class_likelihood_ratios raises a `ValueError` if the input dict for + `replace_undefined_by` is in the wrong format or contains impossible values.""" + y_true = np.array([1, 0]) + y_pred = np.array([1, 0]) + + msg = "The dictionary passed as `replace_undefined_by` needs to be in the form" + with pytest.raises(ValueError, match=msg): + class_likelihood_ratios( + y_true, y_pred, replace_undefined_by=replace_undefined_by + ) + + +@pytest.mark.parametrize( + "replace_undefined_by, expected", + [ + ({"LR+": 1.0, "LR-": 1.0}, 1.0), + ({"LR+": np.inf, "LR-": 0.0}, np.inf), + ({"LR+": 2.0, "LR-": 0.0}, 2.0), + ({"LR+": np.nan, "LR-": np.nan}, np.nan), + (np.nan, np.nan), + ], +) +def test_likelihood_ratios_replace_undefined_by_0_fp(replace_undefined_by, expected): + """Test that the `replace_undefined_by` param returns the right value for the + positive_likelihood_ratio as defined by the user.""" + # This data causes fp=0 (0 false positives) in the confusion_matrix and a division + # by zero that affects the positive_likelihood_ratio: + y_true = np.array([1, 1, 0]) + y_pred = np.array([1, 0, 0]) + + positive_likelihood_ratio, _ = class_likelihood_ratios( + y_true, y_pred, replace_undefined_by=replace_undefined_by + ) + + if np.isnan(expected): + assert np.isnan(positive_likelihood_ratio) + else: + assert positive_likelihood_ratio == pytest.approx(expected) + + +@pytest.mark.parametrize( + "replace_undefined_by, expected", + [ + ({"LR+": 1.0, "LR-": 1.0}, 1.0), + ({"LR+": np.inf, "LR-": 0.0}, 0.0), + ({"LR+": np.inf, "LR-": 0.5}, 0.5), + ({"LR+": np.nan, "LR-": np.nan}, np.nan), + (np.nan, np.nan), + ], +) +def test_likelihood_ratios_replace_undefined_by_0_tn(replace_undefined_by, expected): + """Test that the `replace_undefined_by` param returns the right value for the + negative_likelihood_ratio as defined by the user.""" + # This data causes tn=0 (0 true negatives) in the confusion_matrix and a division + # by zero that affects the negative_likelihood_ratio: + y_true = np.array([1, 0, 0]) + y_pred = np.array([1, 1, 1]) + + _, negative_likelihood_ratio = class_likelihood_ratios( + y_true, y_pred, replace_undefined_by=replace_undefined_by + ) + + if 
np.isnan(expected): + assert np.isnan(negative_likelihood_ratio) + else: + assert negative_likelihood_ratio == pytest.approx(expected) + + +def test_cohen_kappa(): + # These label vectors reproduce the contingency matrix from Artstein and + # Poesio (2008), Table 1: np.array([[20, 20], [10, 50]]). + y1 = np.array([0] * 40 + [1] * 60) + y2 = np.array([0] * 20 + [1] * 20 + [0] * 10 + [1] * 50) + kappa = cohen_kappa_score(y1, y2) + assert_almost_equal(kappa, 0.348, decimal=3) + assert kappa == cohen_kappa_score(y2, y1) + + # Add spurious labels and ignore them. + y1 = np.append(y1, [2] * 4) + y2 = np.append(y2, [2] * 4) + assert cohen_kappa_score(y1, y2, labels=[0, 1]) == kappa + + assert_almost_equal(cohen_kappa_score(y1, y1), 1.0) + + # Multiclass example: Artstein and Poesio, Table 4. + y1 = np.array([0] * 46 + [1] * 44 + [2] * 10) + y2 = np.array([0] * 52 + [1] * 32 + [2] * 16) + assert_almost_equal(cohen_kappa_score(y1, y2), 0.8013, decimal=4) + + # Weighting example: none, linear, quadratic. + y1 = np.array([0] * 46 + [1] * 44 + [2] * 10) + y2 = np.array([0] * 50 + [1] * 40 + [2] * 10) + assert_almost_equal(cohen_kappa_score(y1, y2), 0.9315, decimal=4) + assert_almost_equal(cohen_kappa_score(y1, y2, weights="linear"), 0.9412, decimal=4) + assert_almost_equal( + cohen_kappa_score(y1, y2, weights="quadratic"), 0.9541, decimal=4 + ) + + +def test_cohen_kappa_score_error_wrong_label(): + """Test that correct error is raised when users pass labels that are not in y1.""" + labels = [1, 2] + y1 = np.array(["a"] * 5 + ["b"] * 5) + y2 = np.array(["b"] * 10) + with pytest.raises( + ValueError, match="At least one label in `labels` must be present in `y1`" + ): + cohen_kappa_score(y1, y2, labels=labels) + + +@pytest.mark.parametrize("zero_division", [0, 1, np.nan]) +@pytest.mark.parametrize("y_true, y_pred", [([0], [0])]) +@pytest.mark.parametrize( + "metric", + [ + f1_score, + partial(fbeta_score, beta=1), + precision_score, + recall_score, + ], +) +def test_zero_division_nan_no_warning(metric, y_true, y_pred, zero_division): + """Check the behaviour of `zero_division` when setting to 0, 1 or np.nan. + No warnings should be raised. + """ + with warnings.catch_warnings(): + warnings.simplefilter("error") + result = metric(y_true, y_pred, zero_division=zero_division) + + if np.isnan(zero_division): + assert np.isnan(result) + else: + assert result == zero_division + + +@pytest.mark.parametrize("y_true, y_pred", [([0], [0])]) +@pytest.mark.parametrize( + "metric", + [ + f1_score, + partial(fbeta_score, beta=1), + precision_score, + recall_score, + ], +) +def test_zero_division_nan_warning(metric, y_true, y_pred): + """Check the behaviour of `zero_division` when setting to "warn". + A `UndefinedMetricWarning` should be raised. + """ + with pytest.warns(UndefinedMetricWarning): + result = metric(y_true, y_pred, zero_division="warn") + assert result == 0.0 + + +def test_matthews_corrcoef_against_numpy_corrcoef(global_random_seed): + rng = np.random.RandomState(global_random_seed) + y_true = rng.randint(0, 2, size=20) + y_pred = rng.randint(0, 2, size=20) + + assert_almost_equal( + matthews_corrcoef(y_true, y_pred), np.corrcoef(y_true, y_pred)[0, 1], 10 + ) + + +def test_matthews_corrcoef_against_jurman(global_random_seed): + # Check that the multiclass matthews_corrcoef agrees with the definition + # presented in Jurman, Riccadonna, Furlanello, (2012). 
A Comparison of MCC + # and CEN Error Measures in MultiClass Prediction + rng = np.random.RandomState(global_random_seed) + y_true = rng.randint(0, 2, size=20) + y_pred = rng.randint(0, 2, size=20) + sample_weight = rng.rand(20) + + C = confusion_matrix(y_true, y_pred, sample_weight=sample_weight) + N = len(C) + cov_ytyp = sum( + [ + C[k, k] * C[m, l] - C[l, k] * C[k, m] + for k in range(N) + for m in range(N) + for l in range(N) + ] + ) + cov_ytyt = sum( + [ + C[:, k].sum() + * np.sum([C[g, f] for f in range(N) for g in range(N) if f != k]) + for k in range(N) + ] + ) + cov_ypyp = np.sum( + [ + C[k, :].sum() + * np.sum([C[f, g] for f in range(N) for g in range(N) if f != k]) + for k in range(N) + ] + ) + mcc_jurman = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp) + mcc_ours = matthews_corrcoef(y_true, y_pred, sample_weight=sample_weight) + + assert_almost_equal(mcc_ours, mcc_jurman, 10) + + +def test_matthews_corrcoef(global_random_seed): + rng = np.random.RandomState(global_random_seed) + y_true = ["a" if i == 0 else "b" for i in rng.randint(0, 2, size=20)] + + # corrcoef of same vectors must be 1 + assert_almost_equal(matthews_corrcoef(y_true, y_true), 1.0) + + # corrcoef, when the two vectors are opposites of each other, should be -1 + y_true_inv = ["b" if i == "a" else "a" for i in y_true] + assert_almost_equal(matthews_corrcoef(y_true, y_true_inv), -1) + + y_true_inv2 = label_binarize(y_true, classes=["a", "b"]) + y_true_inv2 = np.where(y_true_inv2, "a", "b") + assert_almost_equal(matthews_corrcoef(y_true, y_true_inv2), -1) + + # For the zero vector case, the corrcoef cannot be calculated and should + # output 0 + assert_almost_equal(matthews_corrcoef([0, 0, 0, 0], [0, 0, 0, 0]), 0.0) + + # And also for any other vector with 0 variance + assert_almost_equal(matthews_corrcoef(y_true, ["a"] * len(y_true)), 0.0) + + # These two vectors have 0 correlation and hence mcc should be 0 + y_1 = [1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1] + y_2 = [1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1] + assert_almost_equal(matthews_corrcoef(y_1, y_2), 0.0) + + # Check that sample weight is able to selectively exclude + mask = [1] * 10 + [0] * 10 + # Now the first half of the vector elements are alone given a weight of 1 + # and hence the mcc will not be a perfect 0 as in the previous case + with pytest.raises(AssertionError): + assert_almost_equal(matthews_corrcoef(y_1, y_2, sample_weight=mask), 0.0) + + +def test_matthews_corrcoef_multiclass(global_random_seed): + rng = np.random.RandomState(global_random_seed) + ord_a = ord("a") + n_classes = 4 + y_true = [chr(ord_a + i) for i in rng.randint(0, n_classes, size=20)] + + # corrcoef of same vectors must be 1 + assert_almost_equal(matthews_corrcoef(y_true, y_true), 1.0) + + # with multiclass > 2 it is not possible to achieve -1 + y_true = [0, 0, 1, 1, 2, 2] + y_pred_bad = [2, 2, 0, 0, 1, 1] + assert_almost_equal(matthews_corrcoef(y_true, y_pred_bad), -0.5) + + # Maximizing false positives and negatives minimizes the MCC + # The minimum will be different for depending on the input + y_true = [0, 0, 1, 1, 2, 2] + y_pred_min = [1, 1, 0, 0, 0, 0] + assert_almost_equal(matthews_corrcoef(y_true, y_pred_min), -12 / np.sqrt(24 * 16)) + + # Zero variance will result in an mcc of zero + y_true = [0, 1, 2] + y_pred = [3, 3, 3] + assert_almost_equal(matthews_corrcoef(y_true, y_pred), 0.0) + + # Also for ground truth with zero variance + y_true = [3, 3, 3] + y_pred = [0, 1, 2] + assert_almost_equal(matthews_corrcoef(y_true, y_pred), 0.0) + + # 
These two vectors have 0 correlation and hence mcc should be 0 + y_1 = [0, 1, 2, 0, 1, 2, 0, 1, 2] + y_2 = [1, 1, 1, 2, 2, 2, 0, 0, 0] + assert_almost_equal(matthews_corrcoef(y_1, y_2), 0.0) + + # We can test that binary assumptions hold using the multiclass computation + # by masking the weight of samples not in the first two classes + + # Masking the last label should let us get an MCC of -1 + y_true = [0, 0, 1, 1, 2] + y_pred = [1, 1, 0, 0, 2] + sample_weight = [1, 1, 1, 1, 0] + assert_almost_equal( + matthews_corrcoef(y_true, y_pred, sample_weight=sample_weight), -1 + ) + + # For the zero vector case, the corrcoef cannot be calculated and should + # output 0 + y_true = [0, 0, 1, 2] + y_pred = [0, 0, 1, 2] + sample_weight = [1, 1, 0, 0] + assert_almost_equal( + matthews_corrcoef(y_true, y_pred, sample_weight=sample_weight), 0.0 + ) + + +@pytest.mark.parametrize("n_points", [100, 10000]) +def test_matthews_corrcoef_overflow(n_points, global_random_seed): + # https://github.com/scikit-learn/scikit-learn/issues/9622 + rng = np.random.RandomState(global_random_seed) + + def mcc_safe(y_true, y_pred): + conf_matrix = confusion_matrix(y_true, y_pred) + true_pos = conf_matrix[1, 1] + false_pos = conf_matrix[1, 0] + false_neg = conf_matrix[0, 1] + n_points = len(y_true) + pos_rate = (true_pos + false_neg) / n_points + activity = (true_pos + false_pos) / n_points + mcc_numerator = true_pos / n_points - pos_rate * activity + mcc_denominator = activity * pos_rate * (1 - activity) * (1 - pos_rate) + return mcc_numerator / np.sqrt(mcc_denominator) + + def random_ys(n_points): # binary + x_true = rng.random_sample(n_points) + x_pred = x_true + 0.2 * (rng.random_sample(n_points) - 0.5) + y_true = x_true > 0.5 + y_pred = x_pred > 0.5 + return y_true, y_pred + + arr = np.repeat([0.0, 1.0], n_points) # binary + assert_almost_equal(matthews_corrcoef(arr, arr), 1.0) + arr = np.repeat([0.0, 1.0, 2.0], n_points) # multiclass + assert_almost_equal(matthews_corrcoef(arr, arr), 1.0) + + y_true, y_pred = random_ys(n_points) + assert_almost_equal(matthews_corrcoef(y_true, y_true), 1.0) + assert_almost_equal(matthews_corrcoef(y_true, y_pred), mcc_safe(y_true, y_pred)) + + +def test_precision_recall_f1_score_multiclass(): + # Test Precision Recall and F1 Score for multiclass classification task + y_true, y_pred, _ = make_prediction(binary=False) + + # compute scores with default labels introspection + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None) + assert_array_almost_equal(p, [0.83, 0.33, 0.42], 2) + assert_array_almost_equal(r, [0.79, 0.09, 0.90], 2) + assert_array_almost_equal(f, [0.81, 0.15, 0.57], 2) + assert_array_equal(s, [24, 31, 20]) + + # averaging tests + ps = precision_score(y_true, y_pred, pos_label=1, average="micro") + assert_array_almost_equal(ps, 0.53, 2) + + rs = recall_score(y_true, y_pred, average="micro") + assert_array_almost_equal(rs, 0.53, 2) + + fs = f1_score(y_true, y_pred, average="micro") + assert_array_almost_equal(fs, 0.53, 2) + + ps = precision_score(y_true, y_pred, average="macro") + assert_array_almost_equal(ps, 0.53, 2) + + rs = recall_score(y_true, y_pred, average="macro") + assert_array_almost_equal(rs, 0.60, 2) + + fs = f1_score(y_true, y_pred, average="macro") + assert_array_almost_equal(fs, 0.51, 2) + + ps = precision_score(y_true, y_pred, average="weighted") + assert_array_almost_equal(ps, 0.51, 2) + + rs = recall_score(y_true, y_pred, average="weighted") + assert_array_almost_equal(rs, 0.53, 2) + + fs = f1_score(y_true, y_pred, average="weighted") 
+ assert_array_almost_equal(fs, 0.47, 2) + + with pytest.raises(ValueError): + precision_score(y_true, y_pred, average="samples") + with pytest.raises(ValueError): + recall_score(y_true, y_pred, average="samples") + with pytest.raises(ValueError): + f1_score(y_true, y_pred, average="samples") + with pytest.raises(ValueError): + fbeta_score(y_true, y_pred, average="samples", beta=0.5) + + # same prediction but with and explicit label ordering + p, r, f, s = precision_recall_fscore_support( + y_true, y_pred, labels=[0, 2, 1], average=None + ) + assert_array_almost_equal(p, [0.83, 0.41, 0.33], 2) + assert_array_almost_equal(r, [0.79, 0.90, 0.10], 2) + assert_array_almost_equal(f, [0.81, 0.57, 0.15], 2) + assert_array_equal(s, [24, 20, 31]) + + +@pytest.mark.parametrize("average", ["samples", "micro", "macro", "weighted", None]) +def test_precision_refcall_f1_score_multilabel_unordered_labels(average): + # test that labels need not be sorted in the multilabel case + y_true = np.array([[1, 1, 0, 0]]) + y_pred = np.array([[0, 0, 1, 1]]) + p, r, f, s = precision_recall_fscore_support( + y_true, y_pred, labels=[3, 0, 1, 2], warn_for=[], average=average + ) + assert_array_equal(p, 0) + assert_array_equal(r, 0) + assert_array_equal(f, 0) + if average is None: + assert_array_equal(s, [0, 1, 1, 0]) + + +def test_precision_recall_f1_score_binary_averaged(): + y_true = np.array([0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1]) + y_pred = np.array([1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1]) + + # compute scores with default labels introspection + ps, rs, fs, _ = precision_recall_fscore_support(y_true, y_pred, average=None) + p, r, f, _ = precision_recall_fscore_support(y_true, y_pred, average="macro") + assert p == np.mean(ps) + assert r == np.mean(rs) + assert f == np.mean(fs) + p, r, f, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted") + support = np.bincount(y_true) + assert p == np.average(ps, weights=support) + assert r == np.average(rs, weights=support) + assert f == np.average(fs, weights=support) + + +def test_zero_precision_recall(): + # Check that pathological cases do not bring NaNs + + old_error_settings = np.seterr(all="raise") + + try: + y_true = np.array([0, 1, 2, 0, 1, 2]) + y_pred = np.array([2, 0, 1, 1, 2, 0]) + + assert_almost_equal(precision_score(y_true, y_pred, average="macro"), 0.0, 2) + assert_almost_equal(recall_score(y_true, y_pred, average="macro"), 0.0, 2) + assert_almost_equal(f1_score(y_true, y_pred, average="macro"), 0.0, 2) + + finally: + np.seterr(**old_error_settings) + + +def test_confusion_matrix_multiclass_subset_labels(): + # Test confusion matrix - multi-class case with subset of labels + y_true, y_pred, _ = make_prediction(binary=False) + + # compute confusion matrix with only first two labels considered + cm = confusion_matrix(y_true, y_pred, labels=[0, 1]) + assert_array_equal(cm, [[19, 4], [4, 3]]) + + # compute confusion matrix with explicit label ordering for only subset + # of labels + cm = confusion_matrix(y_true, y_pred, labels=[2, 1]) + assert_array_equal(cm, [[18, 2], [24, 3]]) + + # a label not in y_true should result in zeros for that row/column + extra_label = np.max(y_true) + 1 + cm = confusion_matrix(y_true, y_pred, labels=[2, extra_label]) + assert_array_equal(cm, [[18, 0], [0, 0]]) + + +@pytest.mark.parametrize( + "labels, err_msg", + [ + ([], "'labels' should contains at least one label."), + ([3, 4], "At least one label specified must be in y_true"), + ], + ids=["empty list", "unknown labels"], +) +def 
test_confusion_matrix_error(labels, err_msg): + y_true, y_pred, _ = make_prediction(binary=False) + with pytest.raises(ValueError, match=err_msg): + confusion_matrix(y_true, y_pred, labels=labels) + + +@pytest.mark.parametrize( + "labels", (None, [0, 1], [0, 1, 2]), ids=["None", "binary", "multiclass"] +) +def test_confusion_matrix_on_zero_length_input(labels): + expected_n_classes = len(labels) if labels else 0 + expected = np.zeros((expected_n_classes, expected_n_classes), dtype=int) + cm = confusion_matrix([], [], labels=labels) + assert_array_equal(cm, expected) + + +def test_confusion_matrix_dtype(): + y = [0, 1, 1] + weight = np.ones(len(y)) + # confusion_matrix returns int64 by default + cm = confusion_matrix(y, y) + assert cm.dtype == np.int64 + # The dtype of confusion_matrix is always 64 bit + for dtype in [np.bool_, np.int32, np.uint64]: + cm = confusion_matrix(y, y, sample_weight=weight.astype(dtype, copy=False)) + assert cm.dtype == np.int64 + for dtype in [np.float32, np.float64, None, object]: + cm = confusion_matrix(y, y, sample_weight=weight.astype(dtype, copy=False)) + assert cm.dtype == np.float64 + + # np.iinfo(np.uint32).max should be accumulated correctly + weight = np.full(len(y), 4294967295, dtype=np.uint32) + cm = confusion_matrix(y, y, sample_weight=weight) + assert cm[0, 0] == 4294967295 + assert cm[1, 1] == 8589934590 + + # np.iinfo(np.int64).max should cause an overflow + weight = np.full(len(y), 9223372036854775807, dtype=np.int64) + cm = confusion_matrix(y, y, sample_weight=weight) + assert cm[0, 0] == 9223372036854775807 + assert cm[1, 1] == -2 + + +@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"]) +def test_confusion_matrix_pandas_nullable(dtype): + """Checks that confusion_matrix works with pandas nullable dtypes. + + Non-regression test for gh-25635. 
+ """ + pd = pytest.importorskip("pandas") + + y_ndarray = np.array([1, 0, 0, 1, 0, 1, 1, 0, 1]) + y_true = pd.Series(y_ndarray, dtype=dtype) + y_predicted = pd.Series([0, 0, 1, 1, 0, 1, 1, 1, 1], dtype="int64") + + output = confusion_matrix(y_true, y_predicted) + expected_output = confusion_matrix(y_ndarray, y_predicted) + + assert_array_equal(output, expected_output) + + +def test_classification_report_multiclass(): + # Test performance report + iris = datasets.load_iris() + y_true, y_pred, _ = make_prediction(dataset=iris, binary=False) + + # print classification report with class names + expected_report = """\ + precision recall f1-score support + + setosa 0.83 0.79 0.81 24 + versicolor 0.33 0.10 0.15 31 + virginica 0.42 0.90 0.57 20 + + accuracy 0.53 75 + macro avg 0.53 0.60 0.51 75 +weighted avg 0.51 0.53 0.47 75 +""" + report = classification_report( + y_true, + y_pred, + labels=np.arange(len(iris.target_names)), + target_names=iris.target_names, + ) + assert report == expected_report + + +def test_classification_report_multiclass_balanced(): + y_true, y_pred = [0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2] + + expected_report = """\ + precision recall f1-score support + + 0 0.33 0.33 0.33 3 + 1 0.33 0.33 0.33 3 + 2 0.33 0.33 0.33 3 + + accuracy 0.33 9 + macro avg 0.33 0.33 0.33 9 +weighted avg 0.33 0.33 0.33 9 +""" + report = classification_report(y_true, y_pred) + assert report == expected_report + + +def test_classification_report_multiclass_with_label_detection(): + iris = datasets.load_iris() + y_true, y_pred, _ = make_prediction(dataset=iris, binary=False) + + # print classification report with label detection + expected_report = """\ + precision recall f1-score support + + 0 0.83 0.79 0.81 24 + 1 0.33 0.10 0.15 31 + 2 0.42 0.90 0.57 20 + + accuracy 0.53 75 + macro avg 0.53 0.60 0.51 75 +weighted avg 0.51 0.53 0.47 75 +""" + report = classification_report(y_true, y_pred) + assert report == expected_report + + +def test_classification_report_multiclass_with_digits(): + # Test performance report with added digits in floating point values + iris = datasets.load_iris() + y_true, y_pred, _ = make_prediction(dataset=iris, binary=False) + + # print classification report with class names + expected_report = """\ + precision recall f1-score support + + setosa 0.82609 0.79167 0.80851 24 + versicolor 0.33333 0.09677 0.15000 31 + virginica 0.41860 0.90000 0.57143 20 + + accuracy 0.53333 75 + macro avg 0.52601 0.59615 0.50998 75 +weighted avg 0.51375 0.53333 0.47310 75 +""" + report = classification_report( + y_true, + y_pred, + labels=np.arange(len(iris.target_names)), + target_names=iris.target_names, + digits=5, + ) + assert report == expected_report + + +def test_classification_report_multiclass_with_string_label(): + y_true, y_pred, _ = make_prediction(binary=False) + + y_true = np.array(["blue", "green", "red"])[y_true] + y_pred = np.array(["blue", "green", "red"])[y_pred] + + expected_report = """\ + precision recall f1-score support + + blue 0.83 0.79 0.81 24 + green 0.33 0.10 0.15 31 + red 0.42 0.90 0.57 20 + + accuracy 0.53 75 + macro avg 0.53 0.60 0.51 75 +weighted avg 0.51 0.53 0.47 75 +""" + report = classification_report(y_true, y_pred) + assert report == expected_report + + expected_report = """\ + precision recall f1-score support + + a 0.83 0.79 0.81 24 + b 0.33 0.10 0.15 31 + c 0.42 0.90 0.57 20 + + accuracy 0.53 75 + macro avg 0.53 0.60 0.51 75 +weighted avg 0.51 0.53 0.47 75 +""" + report = classification_report(y_true, y_pred, target_names=["a", "b", "c"]) + 
assert report == expected_report + + +def test_classification_report_multiclass_with_unicode_label(): + y_true, y_pred, _ = make_prediction(binary=False) + + labels = np.array(["blue\xa2", "green\xa2", "red\xa2"]) + y_true = labels[y_true] + y_pred = labels[y_pred] + + expected_report = """\ + precision recall f1-score support + + blue\xa2 0.83 0.79 0.81 24 + green\xa2 0.33 0.10 0.15 31 + red\xa2 0.42 0.90 0.57 20 + + accuracy 0.53 75 + macro avg 0.53 0.60 0.51 75 +weighted avg 0.51 0.53 0.47 75 +""" + report = classification_report(y_true, y_pred) + assert report == expected_report + + +def test_classification_report_multiclass_with_long_string_label(): + y_true, y_pred, _ = make_prediction(binary=False) + + labels = np.array(["blue", "green" * 5, "red"]) + y_true = labels[y_true] + y_pred = labels[y_pred] + + expected_report = """\ + precision recall f1-score support + + blue 0.83 0.79 0.81 24 +greengreengreengreengreen 0.33 0.10 0.15 31 + red 0.42 0.90 0.57 20 + + accuracy 0.53 75 + macro avg 0.53 0.60 0.51 75 + weighted avg 0.51 0.53 0.47 75 +""" + + report = classification_report(y_true, y_pred) + assert report == expected_report + + +def test_classification_report_labels_target_names_unequal_length(): + y_true = [0, 0, 2, 0, 0] + y_pred = [0, 2, 2, 0, 0] + target_names = ["class 0", "class 1", "class 2"] + + msg = "labels size, 2, does not match size of target_names, 3" + with pytest.warns(UserWarning, match=msg): + classification_report(y_true, y_pred, labels=[0, 2], target_names=target_names) + + +def test_classification_report_no_labels_target_names_unequal_length(): + y_true = [0, 0, 2, 0, 0] + y_pred = [0, 2, 2, 0, 0] + target_names = ["class 0", "class 1", "class 2"] + + err_msg = ( + "Number of classes, 2, does not " + "match size of target_names, 3. 
" + "Try specifying the labels parameter" + ) + with pytest.raises(ValueError, match=err_msg): + classification_report(y_true, y_pred, target_names=target_names) + + +@pytest.mark.filterwarnings(r"ignore::sklearn.exceptions.UndefinedMetricWarning") +def test_multilabel_classification_report(): + n_classes = 4 + n_samples = 50 + + _, y_true = make_multilabel_classification( + n_features=1, n_samples=n_samples, n_classes=n_classes, random_state=0 + ) + + _, y_pred = make_multilabel_classification( + n_features=1, n_samples=n_samples, n_classes=n_classes, random_state=1 + ) + + expected_report = """\ + precision recall f1-score support + + 0 0.50 0.67 0.57 24 + 1 0.51 0.74 0.61 27 + 2 0.29 0.08 0.12 26 + 3 0.52 0.56 0.54 27 + + micro avg 0.50 0.51 0.50 104 + macro avg 0.45 0.51 0.46 104 +weighted avg 0.45 0.51 0.46 104 + samples avg 0.46 0.42 0.40 104 +""" + + report = classification_report(y_true, y_pred) + assert report == expected_report + + +def test_multilabel_zero_one_loss_subset(): + # Dense label indicator matrix format + y1 = np.array([[0, 1, 1], [1, 0, 1]]) + y2 = np.array([[0, 0, 1], [1, 0, 1]]) + + assert zero_one_loss(y1, y2) == 0.5 + assert zero_one_loss(y1, y1) == 0 + assert zero_one_loss(y2, y2) == 0 + assert zero_one_loss(y2, np.logical_not(y2)) == 1 + assert zero_one_loss(y1, np.logical_not(y1)) == 1 + assert zero_one_loss(y1, np.zeros(y1.shape)) == 1 + assert zero_one_loss(y2, np.zeros(y1.shape)) == 1 + + +def test_multilabel_hamming_loss(): + # Dense label indicator matrix format + y1 = np.array([[0, 1, 1], [1, 0, 1]]) + y2 = np.array([[0, 0, 1], [1, 0, 1]]) + w = np.array([1, 3]) + + assert hamming_loss(y1, y2) == 1 / 6 + assert hamming_loss(y1, y1) == 0 + assert hamming_loss(y2, y2) == 0 + assert hamming_loss(y2, 1 - y2) == 1 + assert hamming_loss(y1, 1 - y1) == 1 + assert hamming_loss(y1, np.zeros(y1.shape)) == 4 / 6 + assert hamming_loss(y2, np.zeros(y1.shape)) == 0.5 + assert hamming_loss(y1, y2, sample_weight=w) == 1.0 / 12 + assert hamming_loss(y1, 1 - y2, sample_weight=w) == 11.0 / 12 + assert hamming_loss(y1, np.zeros_like(y1), sample_weight=w) == 2.0 / 3 + # sp_hamming only works with 1-D arrays + assert hamming_loss(y1[0], y2[0]) == sp_hamming(y1[0], y2[0]) + + +def test_jaccard_score_validation(): + y_true = np.array([0, 1, 0, 1, 1]) + y_pred = np.array([0, 1, 0, 1, 1]) + err_msg = r"pos_label=2 is not a valid label. It should be one of \[0, 1\]" + with pytest.raises(ValueError, match=err_msg): + jaccard_score(y_true, y_pred, average="binary", pos_label=2) + + y_true = np.array([[0, 1, 1], [1, 0, 0]]) + y_pred = np.array([[1, 1, 1], [1, 0, 1]]) + msg1 = ( + r"Target is multilabel-indicator but average='binary'. " + r"Please choose another average setting, one of \[None, " + r"'micro', 'macro', 'weighted', 'samples'\]." + ) + with pytest.raises(ValueError, match=msg1): + jaccard_score(y_true, y_pred, average="binary", pos_label=-1) + + y_true = np.array([0, 1, 1, 0, 2]) + y_pred = np.array([1, 1, 1, 1, 0]) + msg2 = ( + r"Target is multiclass but average='binary'. Please choose " + r"another average setting, one of \[None, 'micro', 'macro', " + r"'weighted'\]." + ) + with pytest.raises(ValueError, match=msg2): + jaccard_score(y_true, y_pred, average="binary") + msg3 = "Samplewise metrics are not available outside of multilabel classification." + with pytest.raises(ValueError, match=msg3): + jaccard_score(y_true, y_pred, average="samples") + + msg = ( + r"Note that pos_label \(set to 3\) is ignored when " + r"average != 'binary' \(got 'micro'\). 
You may use " + r"labels=\[pos_label\] to specify a single positive " + "class." + ) + with pytest.warns(UserWarning, match=msg): + jaccard_score(y_true, y_pred, average="micro", pos_label=3) + + +def test_multilabel_jaccard_score(recwarn): + # Dense label indicator matrix format + y1 = np.array([[0, 1, 1], [1, 0, 1]]) + y2 = np.array([[0, 0, 1], [1, 0, 1]]) + + # size(y1 \inter y2) = [1, 2] + # size(y1 \union y2) = [2, 2] + + assert jaccard_score(y1, y2, average="samples") == 0.75 + assert jaccard_score(y1, y1, average="samples") == 1 + assert jaccard_score(y2, y2, average="samples") == 1 + assert jaccard_score(y2, np.logical_not(y2), average="samples") == 0 + assert jaccard_score(y1, np.logical_not(y1), average="samples") == 0 + assert jaccard_score(y1, np.zeros(y1.shape), average="samples") == 0 + assert jaccard_score(y2, np.zeros(y1.shape), average="samples") == 0 + + y_true = np.array([[0, 1, 1], [1, 0, 0]]) + y_pred = np.array([[1, 1, 1], [1, 0, 1]]) + # average='macro' + assert_almost_equal(jaccard_score(y_true, y_pred, average="macro"), 2.0 / 3) + # average='micro' + assert_almost_equal(jaccard_score(y_true, y_pred, average="micro"), 3.0 / 5) + # average='samples' + assert_almost_equal(jaccard_score(y_true, y_pred, average="samples"), 7.0 / 12) + assert_almost_equal( + jaccard_score(y_true, y_pred, average="samples", labels=[0, 2]), 1.0 / 2 + ) + assert_almost_equal( + jaccard_score(y_true, y_pred, average="samples", labels=[1, 2]), 1.0 / 2 + ) + # average=None + assert_array_equal( + jaccard_score(y_true, y_pred, average=None), np.array([1.0 / 2, 1.0, 1.0 / 2]) + ) + + y_true = np.array([[0, 1, 1], [1, 0, 1]]) + y_pred = np.array([[1, 1, 1], [1, 0, 1]]) + assert_almost_equal(jaccard_score(y_true, y_pred, average="macro"), 5.0 / 6) + # average='weighted' + assert_almost_equal(jaccard_score(y_true, y_pred, average="weighted"), 7.0 / 8) + + msg2 = "Got 4 > 2" + with pytest.raises(ValueError, match=msg2): + jaccard_score(y_true, y_pred, labels=[4], average="macro") + msg3 = "Got -1 < 0" + with pytest.raises(ValueError, match=msg3): + jaccard_score(y_true, y_pred, labels=[-1], average="macro") + + msg = ( + "Jaccard is ill-defined and being set to 0.0 in labels " + "with no true or predicted samples." + ) + + with pytest.warns(UndefinedMetricWarning, match=msg): + assert ( + jaccard_score(np.array([[0, 1]]), np.array([[0, 1]]), average="macro") + == 0.5 + ) + + msg = ( + "Jaccard is ill-defined and being set to 0.0 in samples " + "with no true or predicted labels." 
+ ) + + with pytest.warns(UndefinedMetricWarning, match=msg): + assert ( + jaccard_score( + np.array([[0, 0], [1, 1]]), + np.array([[0, 0], [1, 1]]), + average="samples", + ) + == 0.5 + ) + + assert not list(recwarn) + + +def test_multiclass_jaccard_score(recwarn): + y_true = ["ant", "ant", "cat", "cat", "ant", "cat", "bird", "bird"] + y_pred = ["cat", "ant", "cat", "cat", "ant", "bird", "bird", "cat"] + labels = ["ant", "bird", "cat"] + lb = LabelBinarizer() + lb.fit(labels) + y_true_bin = lb.transform(y_true) + y_pred_bin = lb.transform(y_pred) + multi_jaccard_score = partial(jaccard_score, y_true, y_pred) + bin_jaccard_score = partial(jaccard_score, y_true_bin, y_pred_bin) + multi_labels_list = [ + ["ant", "bird"], + ["ant", "cat"], + ["cat", "bird"], + ["ant"], + ["bird"], + ["cat"], + None, + ] + bin_labels_list = [[0, 1], [0, 2], [2, 1], [0], [1], [2], None] + + # other than average='samples'/'none-samples', test everything else here + for average in ("macro", "weighted", "micro", None): + for m_label, b_label in zip(multi_labels_list, bin_labels_list): + assert_almost_equal( + multi_jaccard_score(average=average, labels=m_label), + bin_jaccard_score(average=average, labels=b_label), + ) + + y_true = np.array([[0, 0], [0, 0], [0, 0]]) + y_pred = np.array([[0, 0], [0, 0], [0, 0]]) + with ignore_warnings(): + assert jaccard_score(y_true, y_pred, average="weighted") == 0 + + assert not list(recwarn) + + +def test_average_binary_jaccard_score(recwarn): + # tp=0, fp=0, fn=1, tn=0 + assert jaccard_score([1], [0], average="binary") == 0.0 + # tp=0, fp=0, fn=0, tn=1 + msg = ( + "Jaccard is ill-defined and being set to 0.0 due to " + "no true or predicted samples" + ) + with pytest.warns(UndefinedMetricWarning, match=msg): + assert jaccard_score([0, 0], [0, 0], average="binary") == 0.0 + + # tp=1, fp=0, fn=0, tn=0 (pos_label=0) + assert jaccard_score([0], [0], pos_label=0, average="binary") == 1.0 + y_true = np.array([1, 0, 1, 1, 0]) + y_pred = np.array([1, 0, 1, 1, 1]) + assert_almost_equal(jaccard_score(y_true, y_pred, average="binary"), 3.0 / 4) + assert_almost_equal( + jaccard_score(y_true, y_pred, average="binary", pos_label=0), 1.0 / 2 + ) + + assert not list(recwarn) + + +def test_jaccard_score_zero_division_warning(): + # check that we raised a warning with default behavior if a zero division + # happens + y_true = np.array([[1, 0, 1], [0, 0, 0]]) + y_pred = np.array([[0, 0, 0], [0, 0, 0]]) + msg = ( + "Jaccard is ill-defined and being set to 0.0 in " + "samples with no true or predicted labels." + " Use `zero_division` parameter to control this behavior." 
+ ) + with pytest.warns(UndefinedMetricWarning, match=msg): + score = jaccard_score(y_true, y_pred, average="samples", zero_division="warn") + assert score == pytest.approx(0.0) + + +@pytest.mark.parametrize("zero_division, expected_score", [(0, 0), (1, 0.5)]) +def test_jaccard_score_zero_division_set_value(zero_division, expected_score): + # check that we don't issue warning by passing the zero_division parameter + y_true = np.array([[1, 0, 1], [0, 0, 0]]) + y_pred = np.array([[0, 0, 0], [0, 0, 0]]) + with warnings.catch_warnings(): + warnings.simplefilter("error", UndefinedMetricWarning) + score = jaccard_score( + y_true, y_pred, average="samples", zero_division=zero_division + ) + assert score == pytest.approx(expected_score) + + +@pytest.mark.filterwarnings(r"ignore::sklearn.exceptions.UndefinedMetricWarning") +def test_precision_recall_f1_score_multilabel_1(): + # Test precision_recall_f1_score on a crafted multilabel example + # First crafted example + + y_true = np.array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 1]]) + y_pred = np.array([[0, 1, 0, 0], [0, 1, 0, 0], [1, 0, 1, 0]]) + + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None) + + # tp = [0, 1, 1, 0] + # fn = [1, 0, 0, 1] + # fp = [1, 1, 0, 0] + # Check per class + + assert_array_almost_equal(p, [0.0, 0.5, 1.0, 0.0], 2) + assert_array_almost_equal(r, [0.0, 1.0, 1.0, 0.0], 2) + assert_array_almost_equal(f, [0.0, 1 / 1.5, 1, 0.0], 2) + assert_array_almost_equal(s, [1, 1, 1, 1], 2) + + f2 = fbeta_score(y_true, y_pred, beta=2, average=None) + support = s + assert_array_almost_equal(f2, [0, 0.83, 1, 0], 2) + + # Check macro + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="macro") + assert_almost_equal(p, 1.5 / 4) + assert_almost_equal(r, 0.5) + assert_almost_equal(f, 2.5 / 1.5 * 0.25) + assert s is None + assert_almost_equal( + fbeta_score(y_true, y_pred, beta=2, average="macro"), np.mean(f2) + ) + + # Check micro + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="micro") + assert_almost_equal(p, 0.5) + assert_almost_equal(r, 0.5) + assert_almost_equal(f, 0.5) + assert s is None + assert_almost_equal( + fbeta_score(y_true, y_pred, beta=2, average="micro"), + (1 + 4) * p * r / (4 * p + r), + ) + + # Check weighted + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="weighted") + assert_almost_equal(p, 1.5 / 4) + assert_almost_equal(r, 0.5) + assert_almost_equal(f, 2.5 / 1.5 * 0.25) + assert s is None + assert_almost_equal( + fbeta_score(y_true, y_pred, beta=2, average="weighted"), + np.average(f2, weights=support), + ) + # Check samples + # |h(x_i) inter y_i | = [0, 1, 1] + # |y_i| = [1, 1, 2] + # |h(x_i)| = [1, 1, 2] + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="samples") + assert_almost_equal(p, 0.5) + assert_almost_equal(r, 0.5) + assert_almost_equal(f, 0.5) + assert s is None + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, average="samples"), 0.5) + + +@pytest.mark.filterwarnings(r"ignore::sklearn.exceptions.UndefinedMetricWarning") +def test_precision_recall_f1_score_multilabel_2(): + # Test precision_recall_f1_score on a crafted multilabel example 2 + # Second crafted example + y_true = np.array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 1, 1, 0]]) + y_pred = np.array([[0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 0, 0]]) + + # tp = [ 0. 1. 0. 0.] + # fp = [ 1. 0. 0. 2.] + # fn = [ 1. 1. 1. 0.] 
+ + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None) + assert_array_almost_equal(p, [0.0, 1.0, 0.0, 0.0], 2) + assert_array_almost_equal(r, [0.0, 0.5, 0.0, 0.0], 2) + assert_array_almost_equal(f, [0.0, 0.66, 0.0, 0.0], 2) + assert_array_almost_equal(s, [1, 2, 1, 0], 2) + + f2 = fbeta_score(y_true, y_pred, beta=2, average=None) + support = s + assert_array_almost_equal(f2, [0, 0.55, 0, 0], 2) + + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="micro") + assert_almost_equal(p, 0.25) + assert_almost_equal(r, 0.25) + assert_almost_equal(f, 2 * 0.25 * 0.25 / 0.5) + assert s is None + assert_almost_equal( + fbeta_score(y_true, y_pred, beta=2, average="micro"), + (1 + 4) * p * r / (4 * p + r), + ) + + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="macro") + assert_almost_equal(p, 0.25) + assert_almost_equal(r, 0.125) + assert_almost_equal(f, 2 / 12) + assert s is None + assert_almost_equal( + fbeta_score(y_true, y_pred, beta=2, average="macro"), np.mean(f2) + ) + + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="weighted") + assert_almost_equal(p, 2 / 4) + assert_almost_equal(r, 1 / 4) + assert_almost_equal(f, 2 / 3 * 2 / 4) + assert s is None + assert_almost_equal( + fbeta_score(y_true, y_pred, beta=2, average="weighted"), + np.average(f2, weights=support), + ) + + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="samples") + # Check samples + # |h(x_i) inter y_i | = [0, 0, 1] + # |y_i| = [1, 1, 2] + # |h(x_i)| = [1, 1, 2] + + assert_almost_equal(p, 1 / 6) + assert_almost_equal(r, 1 / 6) + assert_almost_equal(f, 2 / 4 * 1 / 3) + assert s is None + assert_almost_equal( + fbeta_score(y_true, y_pred, beta=2, average="samples"), 0.1666, 2 + ) + + +@pytest.mark.filterwarnings(r"ignore::sklearn.exceptions.UndefinedMetricWarning") +@pytest.mark.parametrize( + "zero_division, zero_division_expected", + [("warn", 0), (0, 0), (1, 1), (np.nan, np.nan)], +) +def test_precision_recall_f1_score_with_an_empty_prediction( + zero_division, zero_division_expected +): + y_true = np.array([[0, 1, 0, 0], [1, 0, 0, 0], [0, 1, 1, 0]]) + y_pred = np.array([[0, 0, 0, 0], [0, 0, 0, 1], [0, 1, 1, 0]]) + + # true_pos = [ 0. 1. 1. 0.] + # false_pos = [ 0. 0. 0. 1.] + # false_neg = [ 1. 1. 0. 0.] 
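+    #
+    # Illustrative derivation: precision = tp / (tp + fp) is ill-defined for
+    # class 0 (nothing predicted, tp + fp == 0) and recall = tp / (tp + fn) is
+    # ill-defined for class 3 (no true samples, tp + fn == 0); both fall back
+    # to `zero_division`.  The remaining entries are
+    #   precision = [_, 1/1, 1/1, 0/1] and recall = [0/1, 1/2, 1/1, _].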
+ + p, r, f, s = precision_recall_fscore_support( + y_true, y_pred, average=None, zero_division=zero_division + ) + + assert_array_almost_equal(p, [zero_division_expected, 1.0, 1.0, 0.0], 2) + assert_array_almost_equal(r, [0.0, 0.5, 1.0, zero_division_expected], 2) + expected_f = 0 + assert_array_almost_equal(f, [expected_f, 1 / 1.5, 1, expected_f], 2) + assert_array_almost_equal(s, [1, 2, 1, 0], 2) + + f2 = fbeta_score(y_true, y_pred, beta=2, average=None, zero_division=zero_division) + support = s + assert_array_almost_equal(f2, [expected_f, 0.55, 1, expected_f], 2) + + p, r, f, s = precision_recall_fscore_support( + y_true, y_pred, average="macro", zero_division=zero_division + ) + + value_to_sum = 0 if np.isnan(zero_division_expected) else zero_division_expected + values_to_average = 3 + (not np.isnan(zero_division_expected)) + + assert_almost_equal(p, (2 + value_to_sum) / values_to_average) + assert_almost_equal(r, (1.5 + value_to_sum) / values_to_average) + expected_f = (2 / 3 + 1) / 4 + assert_almost_equal(f, expected_f) + assert s is None + assert_almost_equal( + fbeta_score( + y_true, + y_pred, + beta=2, + average="macro", + zero_division=zero_division, + ), + _nanaverage(f2, weights=None), + ) + + p, r, f, s = precision_recall_fscore_support( + y_true, y_pred, average="micro", zero_division=zero_division + ) + assert_almost_equal(p, 2 / 3) + assert_almost_equal(r, 0.5) + assert_almost_equal(f, 2 / 3 / (2 / 3 + 0.5)) + assert s is None + assert_almost_equal( + fbeta_score( + y_true, y_pred, beta=2, average="micro", zero_division=zero_division + ), + (1 + 4) * p * r / (4 * p + r), + ) + + p, r, f, s = precision_recall_fscore_support( + y_true, y_pred, average="weighted", zero_division=zero_division + ) + assert_almost_equal(p, 3 / 4 if zero_division_expected == 0 else 1.0) + assert_almost_equal(r, 0.5) + values_to_average = 4 + assert_almost_equal(f, (2 * 2 / 3 + 1) / values_to_average) + assert s is None + assert_almost_equal( + fbeta_score( + y_true, y_pred, beta=2, average="weighted", zero_division=zero_division + ), + _nanaverage(f2, weights=support), + ) + + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="samples") + # |h(x_i) inter y_i | = [0, 0, 2] + # |y_i| = [1, 1, 2] + # |h(x_i)| = [0, 1, 2] + assert_almost_equal(p, 1 / 3) + assert_almost_equal(r, 1 / 3) + assert_almost_equal(f, 1 / 3) + assert s is None + expected_result = 0.333 + assert_almost_equal( + fbeta_score( + y_true, y_pred, beta=2, average="samples", zero_division=zero_division + ), + expected_result, + 2, + ) + + +@pytest.mark.parametrize("beta", [1]) +@pytest.mark.parametrize("average", ["macro", "micro", "weighted", "samples"]) +@pytest.mark.parametrize("zero_division", [0, 1, np.nan]) +def test_precision_recall_f1_no_labels(beta, average, zero_division): + y_true = np.zeros((20, 3)) + y_pred = np.zeros_like(y_true) + + with warnings.catch_warnings(): + warnings.simplefilter("error") + + p, r, f, s = precision_recall_fscore_support( + y_true, + y_pred, + average=average, + beta=beta, + zero_division=zero_division, + ) + fbeta = fbeta_score( + y_true, + y_pred, + beta=beta, + average=average, + zero_division=zero_division, + ) + assert s is None + + # if zero_division = nan, check that all metrics are nan and exit + if np.isnan(zero_division): + for metric in [p, r, f, fbeta]: + assert np.isnan(metric) + return + + zero_division = float(zero_division) + assert_almost_equal(p, zero_division) + assert_almost_equal(r, zero_division) + assert_almost_equal(f, zero_division) + + 
assert_almost_equal(fbeta, float(zero_division)) + + +@pytest.mark.parametrize("average", ["macro", "micro", "weighted", "samples"]) +def test_precision_recall_f1_no_labels_check_warnings(average): + y_true = np.zeros((20, 3)) + y_pred = np.zeros_like(y_true) + + func = precision_recall_fscore_support + with pytest.warns(UndefinedMetricWarning): + p, r, f, s = func(y_true, y_pred, average=average, beta=1.0) + + assert_almost_equal(p, 0) + assert_almost_equal(r, 0) + assert_almost_equal(f, 0) + assert s is None + + with pytest.warns(UndefinedMetricWarning): + fbeta = fbeta_score(y_true, y_pred, average=average, beta=1.0) + + assert_almost_equal(fbeta, 0) + + +@pytest.mark.parametrize("zero_division", [0, 1, np.nan]) +def test_precision_recall_f1_no_labels_average_none(zero_division): + y_true = np.zeros((20, 3)) + y_pred = np.zeros_like(y_true) + + # tp = [0, 0, 0] + # fn = [0, 0, 0] + # fp = [0, 0, 0] + # support = [0, 0, 0] + # |y_hat_i inter y_i | = [0, 0, 0] + # |y_i| = [0, 0, 0] + # |y_hat_i| = [0, 0, 0] + + with warnings.catch_warnings(): + warnings.simplefilter("error") + + p, r, f, s = precision_recall_fscore_support( + y_true, + y_pred, + average=None, + beta=1.0, + zero_division=zero_division, + ) + fbeta = fbeta_score( + y_true, y_pred, beta=1.0, average=None, zero_division=zero_division + ) + + zero_division = np.float64(zero_division) + assert_array_almost_equal(p, [zero_division, zero_division, zero_division], 2) + assert_array_almost_equal(r, [zero_division, zero_division, zero_division], 2) + assert_array_almost_equal(f, [zero_division, zero_division, zero_division], 2) + assert_array_almost_equal(s, [0, 0, 0], 2) + + assert_array_almost_equal(fbeta, [zero_division, zero_division, zero_division], 2) + + +def test_precision_recall_f1_no_labels_average_none_warn(): + y_true = np.zeros((20, 3)) + y_pred = np.zeros_like(y_true) + + # tp = [0, 0, 0] + # fn = [0, 0, 0] + # fp = [0, 0, 0] + # support = [0, 0, 0] + # |y_hat_i inter y_i | = [0, 0, 0] + # |y_i| = [0, 0, 0] + # |y_hat_i| = [0, 0, 0] + + with pytest.warns(UndefinedMetricWarning): + p, r, f, s = precision_recall_fscore_support( + y_true, y_pred, average=None, beta=1 + ) + + assert_array_almost_equal(p, [0, 0, 0], 2) + assert_array_almost_equal(r, [0, 0, 0], 2) + assert_array_almost_equal(f, [0, 0, 0], 2) + assert_array_almost_equal(s, [0, 0, 0], 2) + + with pytest.warns(UndefinedMetricWarning): + fbeta = fbeta_score(y_true, y_pred, beta=1, average=None) + + assert_array_almost_equal(fbeta, [0, 0, 0], 2) + + +def test_prf_warnings(): + # average of per-label scores + f, w = precision_recall_fscore_support, UndefinedMetricWarning + for average in [None, "weighted", "macro"]: + msg = ( + "Precision is ill-defined and " + "being set to 0.0 in labels with no predicted samples." + " Use `zero_division` parameter to control" + " this behavior." + ) + with pytest.warns(w, match=msg): + f([0, 1, 2], [1, 1, 2], average=average) + + msg = ( + "Recall is ill-defined and " + "being set to 0.0 in labels with no true samples." + " Use `zero_division` parameter to control" + " this behavior." + ) + with pytest.warns(w, match=msg): + f([1, 1, 2], [0, 1, 2], average=average) + + # average of per-sample scores + msg = ( + "Precision is ill-defined and " + "being set to 0.0 in samples with no predicted labels." + " Use `zero_division` parameter to control" + " this behavior." 
+ ) + with pytest.warns(w, match=msg): + f(np.array([[1, 0], [1, 0]]), np.array([[1, 0], [0, 0]]), average="samples") + + msg = ( + "Recall is ill-defined and " + "being set to 0.0 in samples with no true labels." + " Use `zero_division` parameter to control" + " this behavior." + ) + with pytest.warns(w, match=msg): + f(np.array([[1, 0], [0, 0]]), np.array([[1, 0], [1, 0]]), average="samples") + + # single score: micro-average + msg = ( + "Precision is ill-defined and " + "being set to 0.0 due to no predicted samples." + " Use `zero_division` parameter to control" + " this behavior." + ) + with pytest.warns(w, match=msg): + f(np.array([[1, 1], [1, 1]]), np.array([[0, 0], [0, 0]]), average="micro") + + msg = ( + "Recall is ill-defined and " + "being set to 0.0 due to no true samples." + " Use `zero_division` parameter to control" + " this behavior." + ) + with pytest.warns(w, match=msg): + f(np.array([[0, 0], [0, 0]]), np.array([[1, 1], [1, 1]]), average="micro") + + # single positive label + msg = ( + "Precision is ill-defined and " + "being set to 0.0 due to no predicted samples." + " Use `zero_division` parameter to control" + " this behavior." + ) + with pytest.warns(w, match=msg): + f([1, 1], [-1, -1], average="binary") + + msg = ( + "Recall is ill-defined and " + "being set to 0.0 due to no true samples." + " Use `zero_division` parameter to control" + " this behavior." + ) + with pytest.warns(w, match=msg): + f([-1, -1], [1, 1], average="binary") + + with warnings.catch_warnings(record=True) as record: + warnings.simplefilter("always") + precision_recall_fscore_support([0, 0], [0, 0], average="binary") + msg = ( + "F-score is ill-defined and being set to 0.0 due to no true nor " + "predicted samples. Use `zero_division` parameter to control this" + " behavior." + ) + assert str(record.pop().message) == msg + msg = ( + "Recall is ill-defined and " + "being set to 0.0 due to no true samples." + " Use `zero_division` parameter to control" + " this behavior." + ) + assert str(record.pop().message) == msg + msg = ( + "Precision is ill-defined and " + "being set to 0.0 due to no predicted samples." + " Use `zero_division` parameter to control" + " this behavior." 
+ ) + assert str(record.pop().message) == msg + + +@pytest.mark.parametrize("zero_division", [0, 1, np.nan]) +def test_prf_no_warnings_if_zero_division_set(zero_division): + with warnings.catch_warnings(): + warnings.simplefilter("error") + + # average of per-label scores + for average in [None, "weighted", "macro"]: + precision_recall_fscore_support( + [0, 1, 2], [1, 1, 2], average=average, zero_division=zero_division + ) + + precision_recall_fscore_support( + [1, 1, 2], [0, 1, 2], average=average, zero_division=zero_division + ) + + # average of per-sample scores + precision_recall_fscore_support( + np.array([[1, 0], [1, 0]]), + np.array([[1, 0], [0, 0]]), + average="samples", + zero_division=zero_division, + ) + + precision_recall_fscore_support( + np.array([[1, 0], [0, 0]]), + np.array([[1, 0], [1, 0]]), + average="samples", + zero_division=zero_division, + ) + + # single score: micro-average + precision_recall_fscore_support( + np.array([[1, 1], [1, 1]]), + np.array([[0, 0], [0, 0]]), + average="micro", + zero_division=zero_division, + ) + + precision_recall_fscore_support( + np.array([[0, 0], [0, 0]]), + np.array([[1, 1], [1, 1]]), + average="micro", + zero_division=zero_division, + ) + + # single positive label + precision_recall_fscore_support( + [1, 1], [-1, -1], average="binary", zero_division=zero_division + ) + + precision_recall_fscore_support( + [-1, -1], [1, 1], average="binary", zero_division=zero_division + ) + + with warnings.catch_warnings(record=True) as record: + warnings.simplefilter("always") + precision_recall_fscore_support( + [0, 0], [0, 0], average="binary", zero_division=zero_division + ) + assert len(record) == 0 + + +@pytest.mark.parametrize("zero_division", ["warn", 0, 1, np.nan]) +def test_recall_warnings(zero_division): + with warnings.catch_warnings(): + warnings.simplefilter("error") + + recall_score( + np.array([[1, 1], [1, 1]]), + np.array([[0, 0], [0, 0]]), + average="micro", + zero_division=zero_division, + ) + + with warnings.catch_warnings(record=True) as record: + warnings.simplefilter("always") + recall_score( + np.array([[0, 0], [0, 0]]), + np.array([[1, 1], [1, 1]]), + average="micro", + zero_division=zero_division, + ) + if zero_division == "warn": + assert ( + str(record.pop().message) == "Recall is ill-defined and " + "being set to 0.0 due to no true samples." + " Use `zero_division` parameter to control" + " this behavior." + ) + else: + assert len(record) == 0 + + recall_score([0, 0], [0, 0]) + if zero_division == "warn": + assert ( + str(record.pop().message) == "Recall is ill-defined and " + "being set to 0.0 due to no true samples." + " Use `zero_division` parameter to control" + " this behavior." + ) + + +@pytest.mark.parametrize("zero_division", ["warn", 0, 1, np.nan]) +def test_precision_warnings(zero_division): + with warnings.catch_warnings(record=True) as record: + warnings.simplefilter("always") + precision_score( + np.array([[1, 1], [1, 1]]), + np.array([[0, 0], [0, 0]]), + average="micro", + zero_division=zero_division, + ) + if zero_division == "warn": + assert ( + str(record.pop().message) == "Precision is ill-defined and " + "being set to 0.0 due to no predicted samples." + " Use `zero_division` parameter to control" + " this behavior." + ) + else: + assert len(record) == 0 + + precision_score([0, 0], [0, 0]) + if zero_division == "warn": + assert ( + str(record.pop().message) == "Precision is ill-defined and " + "being set to 0.0 due to no predicted samples." + " Use `zero_division` parameter to control" + " this behavior." 
+ ) + + with warnings.catch_warnings(): + warnings.simplefilter("error") + + precision_score( + np.array([[0, 0], [0, 0]]), + np.array([[1, 1], [1, 1]]), + average="micro", + zero_division=zero_division, + ) + + +@pytest.mark.parametrize("zero_division", ["warn", 0, 1, np.nan]) +def test_fscore_warnings(zero_division): + with warnings.catch_warnings(record=True) as record: + warnings.simplefilter("always") + + for score in [f1_score, partial(fbeta_score, beta=2)]: + score( + np.array([[1, 1], [1, 1]]), + np.array([[0, 0], [0, 0]]), + average="micro", + zero_division=zero_division, + ) + assert len(record) == 0 + + score( + np.array([[0, 0], [0, 0]]), + np.array([[1, 1], [1, 1]]), + average="micro", + zero_division=zero_division, + ) + assert len(record) == 0 + + score( + np.array([[0, 0], [0, 0]]), + np.array([[0, 0], [0, 0]]), + average="micro", + zero_division=zero_division, + ) + if zero_division == "warn": + assert ( + str(record.pop().message) == "F-score is ill-defined and " + "being set to 0.0 due to no true nor predicted " + "samples. Use `zero_division` parameter to " + "control this behavior." + ) + else: + assert len(record) == 0 + + +def test_prf_average_binary_data_non_binary(): + # Error if user does not explicitly set non-binary average mode + y_true_mc = [1, 2, 3, 3] + y_pred_mc = [1, 2, 3, 1] + msg_mc = ( + r"Target is multiclass but average='binary'. Please " + r"choose another average setting, one of \[" + r"None, 'micro', 'macro', 'weighted'\]." + ) + y_true_ind = np.array([[0, 1, 1], [1, 0, 0], [0, 0, 1]]) + y_pred_ind = np.array([[0, 1, 0], [1, 0, 0], [0, 0, 1]]) + msg_ind = ( + r"Target is multilabel-indicator but average='binary'. Please " + r"choose another average setting, one of \[" + r"None, 'micro', 'macro', 'weighted', 'samples'\]." + ) + + for y_true, y_pred, msg in [ + (y_true_mc, y_pred_mc, msg_mc), + (y_true_ind, y_pred_ind, msg_ind), + ]: + for metric in [ + precision_score, + recall_score, + f1_score, + partial(fbeta_score, beta=2), + ]: + with pytest.raises(ValueError, match=msg): + metric(y_true, y_pred) + + +def test__check_targets(): + # Check that _check_targets correctly merges target types, squeezes + # output and fails if input lengths differ. 
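+    #
+    # Rough sketch of the contract exercised here (the private helper returns
+    # a (merged_type, y_true, y_pred) triple or raises ValueError):
+    #   _check_targets([0, 1, 1], [1, 0, 1])        # -> ("binary", ...)
+    #   _check_targets([0, 1, 1], [2, 0, 1])        # -> ("multiclass", ...)
+    #   _check_targets([0, 1, 1], [0.1, 0.8, 0.9])  # -> ValueError (binary vs continuous)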
+ IND = "multilabel-indicator" + MC = "multiclass" + BIN = "binary" + CNT = "continuous" + MMC = "multiclass-multioutput" + MCN = "continuous-multioutput" + # all of length 3 + EXAMPLES = [ + (IND, np.array([[0, 1, 1], [1, 0, 0], [0, 0, 1]])), + # must not be considered binary + (IND, np.array([[0, 1], [1, 0], [1, 1]])), + (MC, [2, 3, 1]), + (BIN, [0, 1, 1]), + (CNT, [0.0, 1.5, 1.0]), + (MC, np.array([[2], [3], [1]])), + (BIN, np.array([[0], [1], [1]])), + (CNT, np.array([[0.0], [1.5], [1.0]])), + (MMC, np.array([[0, 2], [1, 3], [2, 3]])), + (MCN, np.array([[0.5, 2.0], [1.1, 3.0], [2.0, 3.0]])), + ] + # expected type given input types, or None for error + # (types will be tried in either order) + EXPECTED = { + (IND, IND): IND, + (MC, MC): MC, + (BIN, BIN): BIN, + (MC, IND): None, + (BIN, IND): None, + (BIN, MC): MC, + # Disallowed types + (CNT, CNT): None, + (MMC, MMC): None, + (MCN, MCN): None, + (IND, CNT): None, + (MC, CNT): None, + (BIN, CNT): None, + (MMC, CNT): None, + (MCN, CNT): None, + (IND, MMC): None, + (MC, MMC): None, + (BIN, MMC): None, + (MCN, MMC): None, + (IND, MCN): None, + (MC, MCN): None, + (BIN, MCN): None, + } + + for (type1, y1), (type2, y2) in product(EXAMPLES, repeat=2): + try: + expected = EXPECTED[type1, type2] + except KeyError: + expected = EXPECTED[type2, type1] + if expected is None: + with pytest.raises(ValueError): + _check_targets(y1, y2) + + if type1 != type2: + err_msg = ( + "Classification metrics can't handle a mix " + "of {0} and {1} targets".format(type1, type2) + ) + with pytest.raises(ValueError, match=err_msg): + _check_targets(y1, y2) + + else: + if type1 not in (BIN, MC, IND): + err_msg = "{0} is not supported".format(type1) + with pytest.raises(ValueError, match=err_msg): + _check_targets(y1, y2) + + else: + merged_type, y1out, y2out = _check_targets(y1, y2) + assert merged_type == expected + if merged_type.startswith("multilabel"): + assert y1out.format == "csr" + assert y2out.format == "csr" + else: + assert_array_equal(y1out, np.squeeze(y1)) + assert_array_equal(y2out, np.squeeze(y2)) + with pytest.raises(ValueError): + _check_targets(y1[:-1], y2) + + # Make sure seq of seq is not supported + y1 = [(1, 2), (0, 2, 3)] + y2 = [(2,), (0, 2)] + msg = ( + "You appear to be using a legacy multi-label data representation. " + "Sequence of sequences are no longer supported; use a binary array" + " or sparse matrix instead - the MultiLabelBinarizer" + " transformer can convert to this format." 
+ ) + with pytest.raises(ValueError, match=msg): + _check_targets(y1, y2) + + +def test__check_targets_multiclass_with_both_y_true_and_y_pred_binary(): + # https://github.com/scikit-learn/scikit-learn/issues/8098 + y_true = [0, 1] + y_pred = [0, -1] + assert _check_targets(y_true, y_pred)[0] == "multiclass" + + +def test_hinge_loss_binary(): + y_true = np.array([-1, 1, 1, -1]) + pred_decision = np.array([-8.5, 0.5, 1.5, -0.3]) + assert hinge_loss(y_true, pred_decision) == 1.2 / 4 + + y_true = np.array([0, 2, 2, 0]) + pred_decision = np.array([-8.5, 0.5, 1.5, -0.3]) + assert hinge_loss(y_true, pred_decision) == 1.2 / 4 + + +def test_hinge_loss_multiclass(): + pred_decision = np.array( + [ + [+0.36, -0.17, -0.58, -0.99], + [-0.54, -0.37, -0.48, -0.58], + [-1.45, -0.58, -0.38, -0.17], + [-0.54, -0.38, -0.48, -0.58], + [-2.36, -0.79, -0.27, +0.24], + [-1.45, -0.58, -0.38, -0.17], + ] + ) + y_true = np.array([0, 1, 2, 1, 3, 2]) + dummy_losses = np.array( + [ + 1 - pred_decision[0][0] + pred_decision[0][1], + 1 - pred_decision[1][1] + pred_decision[1][2], + 1 - pred_decision[2][2] + pred_decision[2][3], + 1 - pred_decision[3][1] + pred_decision[3][2], + 1 - pred_decision[4][3] + pred_decision[4][2], + 1 - pred_decision[5][2] + pred_decision[5][3], + ] + ) + np.clip(dummy_losses, 0, None, out=dummy_losses) + dummy_hinge_loss = np.mean(dummy_losses) + assert hinge_loss(y_true, pred_decision) == dummy_hinge_loss + + +def test_hinge_loss_multiclass_missing_labels_with_labels_none(): + y_true = np.array([0, 1, 2, 2]) + pred_decision = np.array( + [ + [+1.27, 0.034, -0.68, -1.40], + [-1.45, -0.58, -0.38, -0.17], + [-2.36, -0.79, -0.27, +0.24], + [-2.36, -0.79, -0.27, +0.24], + ] + ) + error_message = ( + "Please include all labels in y_true or pass labels as third argument" + ) + with pytest.raises(ValueError, match=error_message): + hinge_loss(y_true, pred_decision) + + +def test_hinge_loss_multiclass_no_consistent_pred_decision_shape(): + # test for inconsistency between multiclass problem and pred_decision + # argument + y_true = np.array([2, 1, 0, 1, 0, 1, 1]) + pred_decision = np.array([0, 1, 2, 1, 0, 2, 1]) + error_message = ( + "The shape of pred_decision cannot be 1d array" + "with a multiclass target. pred_decision shape " + "must be (n_samples, n_classes), that is " + "(7, 3). Got: (7,)" + ) + with pytest.raises(ValueError, match=re.escape(error_message)): + hinge_loss(y_true=y_true, pred_decision=pred_decision) + + # test for inconsistency between pred_decision shape and labels number + pred_decision = np.array([[0, 1], [0, 1], [0, 1], [0, 1], [2, 0], [0, 1], [1, 0]]) + labels = [0, 1, 2] + error_message = ( + "The shape of pred_decision is not " + "consistent with the number of classes. " + "With a multiclass target, pred_decision " + "shape must be (n_samples, n_classes), that is " + "(7, 3). 
Got: (7, 2)" + ) + with pytest.raises(ValueError, match=re.escape(error_message)): + hinge_loss(y_true=y_true, pred_decision=pred_decision, labels=labels) + + +def test_hinge_loss_multiclass_with_missing_labels(): + pred_decision = np.array( + [ + [+0.36, -0.17, -0.58, -0.99], + [-0.55, -0.38, -0.48, -0.58], + [-1.45, -0.58, -0.38, -0.17], + [-0.55, -0.38, -0.48, -0.58], + [-1.45, -0.58, -0.38, -0.17], + ] + ) + y_true = np.array([0, 1, 2, 1, 2]) + labels = np.array([0, 1, 2, 3]) + dummy_losses = np.array( + [ + 1 - pred_decision[0][0] + pred_decision[0][1], + 1 - pred_decision[1][1] + pred_decision[1][2], + 1 - pred_decision[2][2] + pred_decision[2][3], + 1 - pred_decision[3][1] + pred_decision[3][2], + 1 - pred_decision[4][2] + pred_decision[4][3], + ] + ) + np.clip(dummy_losses, 0, None, out=dummy_losses) + dummy_hinge_loss = np.mean(dummy_losses) + assert hinge_loss(y_true, pred_decision, labels=labels) == dummy_hinge_loss + + +def test_hinge_loss_multiclass_missing_labels_only_two_unq_in_y_true(): + # non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/17630 + # check that we can compute the hinge loss when providing an array + # with labels allowing to not have all labels in y_true + pred_decision = np.array( + [ + [+0.36, -0.17, -0.58], + [-0.15, -0.58, -0.48], + [-1.45, -0.58, -0.38], + [-0.55, -0.78, -0.42], + [-1.45, -0.58, -0.38], + ] + ) + y_true = np.array([0, 2, 2, 0, 2]) + labels = np.array([0, 1, 2]) + dummy_losses = np.array( + [ + 1 - pred_decision[0][0] + pred_decision[0][1], + 1 - pred_decision[1][2] + pred_decision[1][0], + 1 - pred_decision[2][2] + pred_decision[2][1], + 1 - pred_decision[3][0] + pred_decision[3][2], + 1 - pred_decision[4][2] + pred_decision[4][1], + ] + ) + np.clip(dummy_losses, 0, None, out=dummy_losses) + dummy_hinge_loss = np.mean(dummy_losses) + assert_almost_equal( + hinge_loss(y_true, pred_decision, labels=labels), dummy_hinge_loss + ) + + +def test_hinge_loss_multiclass_invariance_lists(): + # Currently, invariance of string and integer labels cannot be tested + # in common invariance tests because invariance tests for multiclass + # decision functions is not implemented yet. 
+ y_true = ["blue", "green", "red", "green", "white", "red"] + pred_decision = [ + [+0.36, -0.17, -0.58, -0.99], + [-0.55, -0.38, -0.48, -0.58], + [-1.45, -0.58, -0.38, -0.17], + [-0.55, -0.38, -0.48, -0.58], + [-2.36, -0.79, -0.27, +0.24], + [-1.45, -0.58, -0.38, -0.17], + ] + dummy_losses = np.array( + [ + 1 - pred_decision[0][0] + pred_decision[0][1], + 1 - pred_decision[1][1] + pred_decision[1][2], + 1 - pred_decision[2][2] + pred_decision[2][3], + 1 - pred_decision[3][1] + pred_decision[3][2], + 1 - pred_decision[4][3] + pred_decision[4][2], + 1 - pred_decision[5][2] + pred_decision[5][3], + ] + ) + np.clip(dummy_losses, 0, None, out=dummy_losses) + dummy_hinge_loss = np.mean(dummy_losses) + assert hinge_loss(y_true, pred_decision) == dummy_hinge_loss + + +def test_log_loss(): + # binary case with symbolic labels ("no" < "yes") + y_true = ["no", "no", "no", "yes", "yes", "yes"] + y_pred = np.array( + [[0.5, 0.5], [0.1, 0.9], [0.01, 0.99], [0.9, 0.1], [0.75, 0.25], [0.001, 0.999]] + ) + loss = log_loss(y_true, y_pred) + loss_true = -np.mean(bernoulli.logpmf(np.array(y_true) == "yes", y_pred[:, 1])) + assert_allclose(loss, loss_true) + + # multiclass case; adapted from http://bit.ly/RJJHWA + y_true = [1, 0, 2] + y_pred = [[0.2, 0.7, 0.1], [0.6, 0.2, 0.2], [0.6, 0.1, 0.3]] + loss = log_loss(y_true, y_pred, normalize=True) + assert_allclose(loss, 0.6904911) + + # check that we got all the shapes and axes right + # by doubling the length of y_true and y_pred + y_true *= 2 + y_pred *= 2 + loss = log_loss(y_true, y_pred, normalize=False) + assert_allclose(loss, 0.6904911 * 6) + + # raise error if number of classes are not equal. + y_true = [1, 0, 2] + y_pred = [[0.3, 0.7], [0.6, 0.4], [0.4, 0.6]] + with pytest.raises(ValueError): + log_loss(y_true, y_pred) + + # raise error if labels do not contain all values of y_true + y_true = ["a", "b", "c"] + y_pred = [[0.9, 0.1, 0.0], [0.1, 0.9, 0.0], [0.1, 0.1, 0.8]] + labels = ["a", "c", "d"] + error_str = ( + "y_true contains values {'b'} not belonging to the passed " + "labels ['a', 'c', 'd']." + ) + with pytest.raises(ValueError, match=re.escape(error_str)): + log_loss(y_true, y_pred, labels=labels) + + # case when y_true is a string array object + y_true = ["ham", "spam", "spam", "ham"] + y_pred = [[0.3, 0.7], [0.6, 0.4], [0.4, 0.6], [0.7, 0.3]] + loss = log_loss(y_true, y_pred) + assert_allclose(loss, 0.7469410) + + # test labels option + + y_true = [2, 2] + y_pred = [[0.2, 0.8], [0.6, 0.4]] + y_score = np.array([[0.1, 0.9], [0.1, 0.9]]) + error_str = ( + "y_true contains only one label (2). Please provide the list of all " + "expected class labels explicitly through the labels argument." 
+ ) + with pytest.raises(ValueError, match=re.escape(error_str)): + log_loss(y_true, y_pred) + + y_pred = [[0.2, 0.8], [0.6, 0.4], [0.7, 0.3]] + error_str = "Found input variables with inconsistent numbers of samples: [3, 2]" + with pytest.raises(ValueError, match=re.escape(error_str)): + log_loss(y_true, y_pred) + + # works when the labels argument is used + + true_log_loss = -np.mean(np.log(y_score[:, 1])) + calculated_log_loss = log_loss(y_true, y_score, labels=[1, 2]) + assert_allclose(calculated_log_loss, true_log_loss) + + # ensure labels work when len(np.unique(y_true)) != y_pred.shape[1] + y_true = [1, 2, 2] + y_score2 = [[0.7, 0.1, 0.2], [0.2, 0.7, 0.1], [0.1, 0.7, 0.2]] + loss = log_loss(y_true, y_score2, labels=[1, 2, 3]) + assert_allclose(loss, -np.log(0.7)) + + +@pytest.mark.parametrize("dtype", [np.float64, np.float32, np.float16]) +def test_log_loss_eps(dtype): + """Check the behaviour internal eps that changes depending on the input dtype. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/24315 + """ + y_true = np.array([0, 1], dtype=dtype) + y_pred = np.array([1, 0], dtype=dtype) + + loss = log_loss(y_true, y_pred) + assert np.isfinite(loss) + + +@pytest.mark.parametrize("dtype", [np.float64, np.float32, np.float16]) +def test_log_loss_not_probabilities_warning(dtype): + """Check that log_loss raises a warning when y_pred values don't sum to 1.""" + y_true = np.array([0, 1, 1, 0]) + y_pred = np.array([[0.2, 0.7], [0.6, 0.3], [0.4, 0.7], [0.8, 0.3]], dtype=dtype) + + with pytest.warns(UserWarning, match="The y_prob values do not sum to one."): + log_loss(y_true, y_pred) + + +@pytest.mark.parametrize( + "y_true, y_pred", + [ + ([0, 1, 0], [0, 1, 0]), + ([0, 1, 0], [[1, 0], [0, 1], [1, 0]]), + ([0, 1, 2], [[1, 0, 0], [0, 1, 0], [0, 0, 1]]), + ], +) +def test_log_loss_perfect_predictions(y_true, y_pred): + """Check that log_loss returns 0 for perfect predictions.""" + # Because of the clipping, the result is not exactly 0 + assert log_loss(y_true, y_pred) == pytest.approx(0) + + +def test_log_loss_pandas_input(): + # case when input is a pandas series and dataframe gh-5715 + y_tr = np.array(["ham", "spam", "spam", "ham"]) + y_pr = np.array([[0.3, 0.7], [0.6, 0.4], [0.4, 0.6], [0.7, 0.3]]) + types = [(MockDataFrame, MockDataFrame)] + try: + from pandas import DataFrame, Series + + types.append((Series, DataFrame)) + except ImportError: + pass + for TrueInputType, PredInputType in types: + # y_pred dataframe, y_true series + y_true, y_pred = TrueInputType(y_tr), PredInputType(y_pr) + loss = log_loss(y_true, y_pred) + assert_allclose(loss, 0.7469410) + + +def test_log_loss_warnings(): + expected_message = re.escape( + "Labels passed were ['spam', 'eggs', 'ham']. But this function " + "assumes labels are ordered lexicographically. " + "Pass the ordered labels=['eggs', 'ham', 'spam'] and ensure that " + "the columns of y_prob correspond to this ordering." 
+ ) + with pytest.warns(UserWarning, match=expected_message): + log_loss( + ["eggs", "spam", "ham"], + [[1, 0, 0], [0, 1, 0], [0, 0, 1]], + labels=["spam", "eggs", "ham"], + ) + + +def test_brier_score_loss_binary(): + # Check brier_score_loss function + y_true = np.array([0, 1, 1, 0, 1, 1]) + y_prob = np.array([0.1, 0.8, 0.9, 0.3, 1.0, 0.95]) + true_score = linalg.norm(y_true - y_prob) ** 2 / len(y_true) + + assert_almost_equal(brier_score_loss(y_true, y_true), 0.0) + assert_almost_equal(brier_score_loss(y_true, y_prob), true_score) + assert_almost_equal(brier_score_loss(1.0 + y_true, y_prob), true_score) + assert_almost_equal(brier_score_loss(2 * y_true - 1, y_prob), true_score) + + # check that using (n_samples, 2) y_prob or y_true gives the same score + y_prob_reshaped = np.column_stack((1 - y_prob, y_prob)) + y_true_reshaped = np.column_stack((1 - y_true, y_true)) + assert_almost_equal(brier_score_loss(y_true, y_prob_reshaped), true_score) + assert_almost_equal(brier_score_loss(y_true_reshaped, y_prob_reshaped), true_score) + + # check scale_by_half argument + assert_almost_equal( + brier_score_loss(y_true, y_prob, scale_by_half="auto"), true_score + ) + assert_almost_equal( + brier_score_loss(y_true, y_prob, scale_by_half=True), true_score + ) + assert_almost_equal( + brier_score_loss(y_true, y_prob, scale_by_half=False), 2 * true_score + ) + + # calculate correctly when there's only one class in y_true + assert_almost_equal(brier_score_loss([-1], [0.4]), 0.4**2) + assert_almost_equal(brier_score_loss([0], [0.4]), 0.4**2) + assert_almost_equal(brier_score_loss([1], [0.4]), (1 - 0.4) ** 2) + assert_almost_equal(brier_score_loss(["foo"], [0.4], pos_label="bar"), 0.4**2) + assert_almost_equal( + brier_score_loss(["foo"], [0.4], pos_label="foo"), + (1 - 0.4) ** 2, + ) + + +def test_brier_score_loss_multiclass(): + # test cases for multi-class + assert_almost_equal( + brier_score_loss( + ["eggs", "spam", "ham"], + [[1, 0, 0, 0], [0, 1, 0, 0], [0, 1, 0, 0]], + labels=["eggs", "ham", "spam", "yams"], + ), + 2 / 3, + ) + + assert_almost_equal( + brier_score_loss( + [1, 0, 2], [[0.2, 0.7, 0.1], [0.6, 0.2, 0.2], [0.6, 0.1, 0.3]] + ), + 0.41333333, + ) + + # check perfect predictions for 3 classes + assert_almost_equal( + brier_score_loss( + [0, 1, 2], [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]] + ), + 0, + ) + + # check perfectly incorrect predictions for 3 classes + assert_almost_equal( + brier_score_loss( + [0, 1, 2], [[0.0, 1.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0]] + ), + 2, + ) + + +def test_brier_score_loss_invalid_inputs(): + # binary case + y_true = np.array([0, 1, 1, 0, 1, 1]) + y_prob = np.array([0.1, 0.8, 0.9, 0.3, 1.0, 0.95]) + with pytest.raises(ValueError): + # bad length of y_prob + brier_score_loss(y_true, y_prob[1:]) + with pytest.raises(ValueError): + # y_pred has value greater than 1 + brier_score_loss(y_true, y_prob + 1.0) + with pytest.raises(ValueError): + # y_pred has value less than 0 + brier_score_loss(y_true, y_prob - 1.0) + + # multiclass case + y_true = np.array([1, 0, 2]) + y_prob = np.array([[0.2, 0.7, 0.1], [0.6, 0.2, 0.2], [0.6, 0.1, 0.3]]) + with pytest.raises(ValueError): + # bad length of y_pred + brier_score_loss(y_true, y_prob[1:]) + with pytest.raises(ValueError): + # y_pred has value greater than 1 + brier_score_loss(y_true, y_prob + 1.0) + with pytest.raises(ValueError): + # y_pred has value less than 0 + brier_score_loss(y_true, y_prob - 1.0) + + # raise an error for multiclass y_true and binary y_prob + y_true = np.array([0, 1, 2, 0]) + 
y_prob = np.array([0.8, 0.6, 0.4, 0.2]) + error_message = re.escape( + "The type of the target inferred from y_true is multiclass " + "but should be binary according to the shape of y_prob." + ) + with pytest.raises(ValueError, match=error_message): + brier_score_loss(y_true, y_prob) + + # raise an error for wrong number of classes + y_true = [0, 1, 2] + y_prob = [[1, 0], [0, 1], [0, 1]] + error_message = ( + "y_true and y_prob contain different number of " + "classes: 3 vs 2. Please provide the true " + "labels explicitly through the labels argument. " + "Classes found in " + "y_true: [0 1 2]" + ) + with pytest.raises(ValueError, match=re.escape(error_message)): + brier_score_loss(y_true, y_prob) + + y_true = ["eggs", "spam", "ham"] + y_prob = [[1, 0, 0], [0, 1, 0], [0, 1, 0]] + labels = ["eggs", "spam", "ham", "yams"] + error_message = ( + "The number of classes in labels is different " + "from that in y_prob. Classes found in " + "labels: ['eggs' 'ham' 'spam' 'yams']" + ) + with pytest.raises(ValueError, match=re.escape(error_message)): + brier_score_loss(y_true, y_prob, labels=labels) + + # raise error message when there's only one class in y_true + y_true = ["eggs"] + y_prob = [[0.9, 0.1]] + error_message = ( + "y_true contains only one label (eggs). Please " + "provide the list of all expected class labels explicitly through the " + "labels argument." + ) + with pytest.raises(ValueError, match=re.escape(error_message)): + brier_score_loss(y_true, y_prob) + + # error is fixed when labels is specified + assert_almost_equal(brier_score_loss(y_true, y_prob, labels=["eggs", "ham"]), 0.01) + + +def test_brier_score_loss_warnings(): + expected_message = re.escape( + "Labels passed were ['spam', 'eggs', 'ham']. But this function " + "assumes labels are ordered lexicographically. " + "Pass the ordered labels=['eggs', 'ham', 'spam'] and ensure that " + "the columns of y_prob correspond to this ordering." + ) + with pytest.warns(UserWarning, match=expected_message): + brier_score_loss( + ["eggs", "spam", "ham"], + [ + [1, 0, 0], + [0, 1, 0], + [0, 0, 1], + ], + labels=["spam", "eggs", "ham"], + ) + + +def test_balanced_accuracy_score_unseen(): + msg = "y_pred contains classes not in y_true" + with pytest.warns(UserWarning, match=msg): + balanced_accuracy_score([0, 0, 0], [0, 0, 1]) + + +@pytest.mark.parametrize( + "y_true,y_pred", + [ + (["a", "b", "a", "b"], ["a", "a", "a", "b"]), + (["a", "b", "c", "b"], ["a", "a", "a", "b"]), + (["a", "a", "a", "b"], ["a", "b", "c", "b"]), + ], +) +def test_balanced_accuracy_score(y_true, y_pred): + macro_recall = recall_score( + y_true, y_pred, average="macro", labels=np.unique(y_true) + ) + with ignore_warnings(): + # Warnings are tested in test_balanced_accuracy_score_unseen + balanced = balanced_accuracy_score(y_true, y_pred) + assert balanced == pytest.approx(macro_recall) + adjusted = balanced_accuracy_score(y_true, y_pred, adjusted=True) + chance = balanced_accuracy_score(y_true, np.full_like(y_true, y_true[0])) + assert adjusted == (balanced - chance) / (1 - chance) + + +@pytest.mark.parametrize( + "metric", + [ + jaccard_score, + f1_score, + partial(fbeta_score, beta=0.5), + precision_recall_fscore_support, + precision_score, + recall_score, + brier_score_loss, + ], +) +@pytest.mark.parametrize( + "classes", [(False, True), (0, 1), (0.0, 1.0), ("zero", "one")] +) +def test_classification_metric_pos_label_types(metric, classes): + """Check that the metric works with different types of `pos_label`. 
+ + We can expect `pos_label` to be a bool, an integer, a float, a string. + No error should be raised for those types. + """ + rng = np.random.RandomState(42) + n_samples, pos_label = 10, classes[-1] + y_true = rng.choice(classes, size=n_samples, replace=True) + if metric is brier_score_loss: + # brier score loss requires probabilities + y_pred = rng.uniform(size=n_samples) + else: + y_pred = y_true.copy() + result = metric(y_true, y_pred, pos_label=pos_label) + assert not np.any(np.isnan(result)) + + +@pytest.mark.parametrize( + "y_true, y_pred, expected_score", + [ + (np.array([0, 1]), np.array([1, 0]), 0.0), + (np.array([0, 1]), np.array([0, 1]), 1.0), + (np.array([0, 1]), np.array([0, 0]), 0.0), + (np.array([0, 0]), np.array([0, 0]), 1.0), + ], +) +def test_f1_for_small_binary_inputs_with_zero_division(y_true, y_pred, expected_score): + """Check the behaviour of `zero_division` for f1-score. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/26965 + """ + assert f1_score(y_true, y_pred, zero_division=1.0) == pytest.approx(expected_score) + + +@pytest.mark.parametrize( + "scoring", + [ + make_scorer(f1_score, zero_division=np.nan), + make_scorer(fbeta_score, beta=2, zero_division=np.nan), + make_scorer(precision_score, zero_division=np.nan), + make_scorer(recall_score, zero_division=np.nan), + ], +) +def test_classification_metric_division_by_zero_nan_validaton(scoring): + """Check that we validate `np.nan` properly for classification metrics. + + With `n_jobs=2` in cross-validation, the `np.nan` used for the singleton will be + different in the sub-process and we should not use the `is` operator but + `math.isnan`. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/27563 + """ + X, y = datasets.make_classification(random_state=0) + classifier = DecisionTreeClassifier(max_depth=3, random_state=0).fit(X, y) + cross_val_score(classifier, X, y, scoring=scoring, n_jobs=2, error_score="raise") + + +def test_d2_log_loss_score(): + y_true = [0, 0, 0, 1, 1, 1] + y_true_string = ["no", "no", "no", "yes", "yes", "yes"] + y_pred = np.array( + [ + [0.5, 0.5], + [0.9, 0.1], + [0.4, 0.6], + [0.6, 0.4], + [0.35, 0.65], + [0.01, 0.99], + ] + ) + y_pred_null = np.array( + [ + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + ] + ) + d2_score = d2_log_loss_score(y_true=y_true, y_pred=y_pred) + log_likelihood = log_loss(y_true=y_true, y_pred=y_pred, normalize=False) + log_likelihood_null = log_loss(y_true=y_true, y_pred=y_pred_null, normalize=False) + d2_score_true = 1 - log_likelihood / log_likelihood_null + assert d2_score == pytest.approx(d2_score_true) + + # check that using sample weight also gives the correct d2 score + sample_weight = np.array([2, 1, 3, 4, 3, 1]) + y_pred_null[:, 0] = sample_weight[:3].sum() / sample_weight.sum() + y_pred_null[:, 1] = sample_weight[3:].sum() / sample_weight.sum() + d2_score = d2_log_loss_score( + y_true=y_true, y_pred=y_pred, sample_weight=sample_weight + ) + log_likelihood = log_loss( + y_true=y_true, + y_pred=y_pred, + sample_weight=sample_weight, + normalize=False, + ) + log_likelihood_null = log_loss( + y_true=y_true, + y_pred=y_pred_null, + sample_weight=sample_weight, + normalize=False, + ) + d2_score_true = 1 - log_likelihood / log_likelihood_null + assert d2_score == pytest.approx(d2_score_true) + + # check if good predictions give a relatively higher value for the d2 score + y_pred = np.array( + [ + [0.9, 0.1], + [0.8, 0.2], + [0.9, 0.1], + [0.1, 0.9], + 
[0.2, 0.8], + [0.1, 0.9], + ] + ) + d2_score = d2_log_loss_score(y_true, y_pred) + assert 0.5 < d2_score < 1.0 + # check that a similar value is obtained for string labels + d2_score_string = d2_log_loss_score(y_true_string, y_pred) + assert d2_score_string == pytest.approx(d2_score) + + # check if poor predictions gives a relatively low value for the d2 score + y_pred = np.array( + [ + [0.5, 0.5], + [0.1, 0.9], + [0.1, 0.9], + [0.9, 0.1], + [0.75, 0.25], + [0.1, 0.9], + ] + ) + d2_score = d2_log_loss_score(y_true, y_pred) + assert d2_score < 0 + # check that a similar value is obtained for string labels + d2_score_string = d2_log_loss_score(y_true_string, y_pred) + assert d2_score_string == pytest.approx(d2_score) + + # check if simply using the average of the classes as the predictions + # gives a d2 score of 0 + y_true = [0, 0, 0, 1, 1, 1] + y_pred = np.array( + [ + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + ] + ) + d2_score = d2_log_loss_score(y_true, y_pred) + assert d2_score == 0 + d2_score_string = d2_log_loss_score(y_true_string, y_pred) + assert d2_score_string == 0 + + # check if simply using the average of the classes as the predictions + # gives a d2 score of 0 when the positive class has a higher proportion + y_true = [0, 1, 1, 1] + y_true_string = ["no", "yes", "yes", "yes"] + y_pred = np.array([[0.25, 0.75], [0.25, 0.75], [0.25, 0.75], [0.25, 0.75]]) + d2_score = d2_log_loss_score(y_true, y_pred) + assert d2_score == 0 + d2_score_string = d2_log_loss_score(y_true_string, y_pred) + assert d2_score_string == 0 + sample_weight = [2, 2, 2, 2] + d2_score_with_sample_weight = d2_log_loss_score( + y_true, y_pred, sample_weight=sample_weight + ) + assert d2_score_with_sample_weight == 0 + + # check that the d2 scores seem correct when more than 2 + # labels are specified + y_true = ["high", "high", "low", "neutral"] + sample_weight = [1.4, 0.6, 0.8, 0.2] + + y_pred = np.array( + [ + [0.8, 0.1, 0.1], + [0.8, 0.1, 0.1], + [0.1, 0.8, 0.1], + [0.1, 0.1, 0.8], + ] + ) + d2_score = d2_log_loss_score(y_true, y_pred) + assert 0.5 < d2_score < 1.0 + d2_score = d2_log_loss_score(y_true, y_pred, sample_weight=sample_weight) + assert 0.5 < d2_score < 1.0 + + y_pred = np.array( + [ + [0.2, 0.5, 0.3], + [0.1, 0.7, 0.2], + [0.1, 0.1, 0.8], + [0.2, 0.7, 0.1], + ] + ) + d2_score = d2_log_loss_score(y_true, y_pred) + assert d2_score < 0 + d2_score = d2_log_loss_score(y_true, y_pred, sample_weight=sample_weight) + assert d2_score < 0 + + +def test_d2_log_loss_score_missing_labels(): + """Check that d2_log_loss_score works when not all labels are present in y_true + + non-regression test for https://github.com/scikit-learn/scikit-learn/issues/30713 + """ + y_true = [2, 0, 2, 0] + labels = [0, 1, 2] + sample_weight = [1.4, 0.6, 0.7, 0.3] + y_pred = np.tile([1, 0, 0], (4, 1)) + + log_loss_obs = log_loss(y_true, y_pred, sample_weight=sample_weight, labels=labels) + + # Null model consists of weighted average of the classes. 
+ # Given that the sum of the weights is 3, + # - weighted average of 0s is (0.6 + 0.3) / 3 = 0.3 + # - weighted average of 1s is 0 + # - weighted average of 2s is (1.4 + 0.7) / 3 = 0.7 + y_pred_null = np.tile([0.3, 0, 0.7], (4, 1)) + log_loss_null = log_loss( + y_true, y_pred_null, sample_weight=sample_weight, labels=labels + ) + + expected_d2_score = 1 - log_loss_obs / log_loss_null + d2_score = d2_log_loss_score( + y_true, y_pred, sample_weight=sample_weight, labels=labels + ) + assert_allclose(d2_score, expected_d2_score) + + +def test_d2_log_loss_score_label_order(): + """Check that d2_log_loss_score doesn't depend on the order of the labels.""" + y_true = [2, 0, 2, 0] + y_pred = np.tile([1, 0, 0], (4, 1)) + + d2_score = d2_log_loss_score(y_true, y_pred, labels=[0, 1, 2]) + d2_score_other = d2_log_loss_score(y_true, y_pred, labels=[0, 2, 1]) + + assert_allclose(d2_score, d2_score_other) + + +def test_d2_log_loss_score_raises(): + """Test that d2_log_loss_score raises the appropriate errors on + invalid inputs.""" + y_true = [0, 1, 2] + y_pred = [[0.2, 0.8], [0.5, 0.5], [0.4, 0.6]] + err = "contain different number of classes" + with pytest.raises(ValueError, match=err): + d2_log_loss_score(y_true, y_pred) + + # check error if the number of classes in labels do not match the number + # of classes in y_pred. + y_true = [0, 1, 2] + y_pred = [[0.5, 0.5], [0.5, 0.5], [0.5, 0.5]] + labels = [0, 1, 2] + err = "number of classes in labels is different" + with pytest.raises(ValueError, match=err): + d2_log_loss_score(y_true, y_pred, labels=labels) + + # check error if y_true and y_pred do not have equal lengths + y_true = [0, 1, 2] + y_pred = [[0.5, 0.5, 0.5], [0.6, 0.3, 0.1]] + err = "inconsistent numbers of samples" + with pytest.raises(ValueError, match=err): + d2_log_loss_score(y_true, y_pred) + + # check warning for samples < 2 + y_true = [1] + y_pred = [[0.5, 0.5]] + err = "score is not well-defined" + with pytest.warns(UndefinedMetricWarning, match=err): + d2_log_loss_score(y_true, y_pred) + + # check error when y_true only has 1 label + y_true = [1, 1, 1] + y_pred = [[0.5, 0.5], [0.5, 0.5], [0.5, 0.5]] + err = "y_true contains only one label" + with pytest.raises(ValueError, match=err): + d2_log_loss_score(y_true, y_pred) + + # check error when y_true only has 1 label and labels also has + # only 1 label + y_true = [1, 1, 1] + labels = [1] + y_pred = [[0.5, 0.5], [0.5, 0.5], [0.5, 0.5]] + err = "The labels array needs to contain at least two" + with pytest.raises(ValueError, match=err): + d2_log_loss_score(y_true, y_pred, labels=labels) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_common.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_common.py new file mode 100644 index 0000000000000000000000000000000000000000..39522876e8f24589174fa6ce2f6890ad552e5899 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_common.py @@ -0,0 +1,2348 @@ +import math +from functools import partial +from inspect import signature +from itertools import chain, permutations, product + +import numpy as np +import pytest + +from sklearn._config import config_context +from sklearn.datasets import make_multilabel_classification +from sklearn.exceptions import UndefinedMetricWarning +from sklearn.metrics import ( + accuracy_score, + average_precision_score, + balanced_accuracy_score, + brier_score_loss, + cohen_kappa_score, + confusion_matrix, + coverage_error, + d2_absolute_error_score, + d2_pinball_score, + d2_tweedie_score, + dcg_score, 
+ det_curve, + explained_variance_score, + f1_score, + fbeta_score, + hamming_loss, + hinge_loss, + jaccard_score, + label_ranking_average_precision_score, + label_ranking_loss, + log_loss, + matthews_corrcoef, + max_error, + mean_absolute_error, + mean_absolute_percentage_error, + mean_gamma_deviance, + mean_pinball_loss, + mean_poisson_deviance, + mean_squared_error, + mean_squared_log_error, + mean_tweedie_deviance, + median_absolute_error, + multilabel_confusion_matrix, + ndcg_score, + precision_recall_curve, + precision_score, + r2_score, + recall_score, + roc_auc_score, + roc_curve, + root_mean_squared_error, + root_mean_squared_log_error, + top_k_accuracy_score, + zero_one_loss, +) +from sklearn.metrics._base import _average_binary_score +from sklearn.metrics.pairwise import ( + additive_chi2_kernel, + chi2_kernel, + cosine_distances, + cosine_similarity, + euclidean_distances, + linear_kernel, + paired_cosine_distances, + paired_euclidean_distances, + polynomial_kernel, + rbf_kernel, + sigmoid_kernel, +) +from sklearn.preprocessing import LabelBinarizer +from sklearn.utils import shuffle +from sklearn.utils._array_api import ( + _atol_for_type, + _convert_to_numpy, + _get_namespace_device_dtype_ids, + yield_namespace_device_dtype_combinations, +) +from sklearn.utils._testing import ( + _array_api_for_tests, + assert_allclose, + assert_almost_equal, + assert_array_equal, + assert_array_less, + ignore_warnings, +) +from sklearn.utils.fixes import COO_CONTAINERS, parse_version, sp_version +from sklearn.utils.multiclass import type_of_target +from sklearn.utils.validation import _num_samples, check_random_state + +# Note toward developers about metric testing +# ------------------------------------------- +# It is often possible to write one general test for several metrics: +# +# - invariance properties, e.g. invariance to sample order +# - common behavior for an argument, e.g. the "normalize" with value True +# will return the mean of the metrics and with value False will return +# the sum of the metrics. +# +# In order to improve the overall metric testing, it is a good idea to write +# first a specific test for the given metric and then add a general test for +# all metrics that have the same behavior. +# +# Two types of datastructures are used in order to implement this system: +# dictionaries of metrics and lists of metrics with common properties. +# +# Dictionaries of metrics +# ------------------------ +# The goal of having those dictionaries is to have an easy way to call a +# particular metric and associate a name to each function: +# +# - REGRESSION_METRICS: all regression metrics. +# - CLASSIFICATION_METRICS: all classification metrics +# which compare a ground truth and the estimated targets as returned by a +# classifier. +# - THRESHOLDED_METRICS: all classification metrics which +# compare a ground truth and a score, e.g. estimated probabilities or +# decision function (format might vary) +# +# Those dictionaries will be used to test systematically some invariance +# properties, e.g. invariance toward several input layout. 
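+# As a rough illustration of the pattern, a shared property test can simply
+# loop over one of these dictionaries, e.g.:
+#
+#     for name, metric in REGRESSION_METRICS.items():
+#         perm = rng.permutation(len(y_true))
+#         assert_allclose(metric(y_true, y_pred),
+#                         metric(y_true[perm], y_pred[perm]))
+#
+# (here rng, y_true and y_pred stand for any random state and regression
+# targets; sample-order invariance holds for all regression metrics.)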
+# + +REGRESSION_METRICS = { + "max_error": max_error, + "mean_absolute_error": mean_absolute_error, + "mean_squared_error": mean_squared_error, + "mean_squared_log_error": mean_squared_log_error, + "mean_pinball_loss": mean_pinball_loss, + "median_absolute_error": median_absolute_error, + "mean_absolute_percentage_error": mean_absolute_percentage_error, + "explained_variance_score": explained_variance_score, + "r2_score": partial(r2_score, multioutput="variance_weighted"), + "root_mean_squared_error": root_mean_squared_error, + "root_mean_squared_log_error": root_mean_squared_log_error, + "mean_normal_deviance": partial(mean_tweedie_deviance, power=0), + "mean_poisson_deviance": mean_poisson_deviance, + "mean_gamma_deviance": mean_gamma_deviance, + "mean_compound_poisson_deviance": partial(mean_tweedie_deviance, power=1.4), + "d2_tweedie_score": partial(d2_tweedie_score, power=1.4), + "d2_pinball_score": d2_pinball_score, + "d2_absolute_error_score": d2_absolute_error_score, +} + +CLASSIFICATION_METRICS = { + "accuracy_score": accuracy_score, + "balanced_accuracy_score": balanced_accuracy_score, + "adjusted_balanced_accuracy_score": partial(balanced_accuracy_score, adjusted=True), + "unnormalized_accuracy_score": partial(accuracy_score, normalize=False), + # `confusion_matrix` returns absolute values and hence behaves unnormalized + # . Naming it with an unnormalized_ prefix is necessary for this module to + # skip sample_weight scaling checks which will fail for unnormalized + # metrics. + "unnormalized_confusion_matrix": confusion_matrix, + "normalized_confusion_matrix": lambda *args, **kwargs: ( + confusion_matrix(*args, **kwargs).astype("float") + / confusion_matrix(*args, **kwargs).sum(axis=1)[:, np.newaxis] + ), + "unnormalized_multilabel_confusion_matrix": multilabel_confusion_matrix, + "unnormalized_multilabel_confusion_matrix_sample": partial( + multilabel_confusion_matrix, samplewise=True + ), + "hamming_loss": hamming_loss, + "zero_one_loss": zero_one_loss, + "unnormalized_zero_one_loss": partial(zero_one_loss, normalize=False), + # These are needed to test averaging + "jaccard_score": jaccard_score, + "precision_score": precision_score, + "recall_score": recall_score, + "f1_score": f1_score, + "f2_score": partial(fbeta_score, beta=2), + "f0.5_score": partial(fbeta_score, beta=0.5), + "matthews_corrcoef_score": matthews_corrcoef, + "weighted_f0.5_score": partial(fbeta_score, average="weighted", beta=0.5), + "weighted_f1_score": partial(f1_score, average="weighted"), + "weighted_f2_score": partial(fbeta_score, average="weighted", beta=2), + "weighted_precision_score": partial(precision_score, average="weighted"), + "weighted_recall_score": partial(recall_score, average="weighted"), + "weighted_jaccard_score": partial(jaccard_score, average="weighted"), + "micro_f0.5_score": partial(fbeta_score, average="micro", beta=0.5), + "micro_f1_score": partial(f1_score, average="micro"), + "micro_f2_score": partial(fbeta_score, average="micro", beta=2), + "micro_precision_score": partial(precision_score, average="micro"), + "micro_recall_score": partial(recall_score, average="micro"), + "micro_jaccard_score": partial(jaccard_score, average="micro"), + "macro_f0.5_score": partial(fbeta_score, average="macro", beta=0.5), + "macro_f1_score": partial(f1_score, average="macro"), + "macro_f2_score": partial(fbeta_score, average="macro", beta=2), + "macro_precision_score": partial(precision_score, average="macro"), + "macro_recall_score": partial(recall_score, average="macro"), + 
"macro_jaccard_score": partial(jaccard_score, average="macro"), + "samples_f0.5_score": partial(fbeta_score, average="samples", beta=0.5), + "samples_f1_score": partial(f1_score, average="samples"), + "samples_f2_score": partial(fbeta_score, average="samples", beta=2), + "samples_precision_score": partial(precision_score, average="samples"), + "samples_recall_score": partial(recall_score, average="samples"), + "samples_jaccard_score": partial(jaccard_score, average="samples"), + "cohen_kappa_score": cohen_kappa_score, +} + + +def precision_recall_curve_padded_thresholds(*args, **kwargs): + """ + The dimensions of precision-recall pairs and the threshold array as + returned by the precision_recall_curve do not match. See + func:`sklearn.metrics.precision_recall_curve` + + This prevents implicit conversion of return value triple to an higher + dimensional np.array of dtype('float64') (it will be of dtype('object) + instead). This again is needed for assert_array_equal to work correctly. + + As a workaround we pad the threshold array with NaN values to match + the dimension of precision and recall arrays respectively. + """ + precision, recall, thresholds = precision_recall_curve(*args, **kwargs) + + pad_threshholds = len(precision) - len(thresholds) + + return np.array( + [ + precision, + recall, + np.pad( + thresholds.astype(np.float64), + pad_width=(0, pad_threshholds), + mode="constant", + constant_values=[np.nan], + ), + ] + ) + + +CURVE_METRICS = { + "roc_curve": roc_curve, + "precision_recall_curve": precision_recall_curve_padded_thresholds, + "det_curve": det_curve, +} + +THRESHOLDED_METRICS = { + "coverage_error": coverage_error, + "label_ranking_loss": label_ranking_loss, + "log_loss": log_loss, + "unnormalized_log_loss": partial(log_loss, normalize=False), + "hinge_loss": hinge_loss, + "brier_score_loss": brier_score_loss, + "roc_auc_score": roc_auc_score, # default: average="macro" + "weighted_roc_auc": partial(roc_auc_score, average="weighted"), + "samples_roc_auc": partial(roc_auc_score, average="samples"), + "micro_roc_auc": partial(roc_auc_score, average="micro"), + "ovr_roc_auc": partial(roc_auc_score, average="macro", multi_class="ovr"), + "weighted_ovr_roc_auc": partial( + roc_auc_score, average="weighted", multi_class="ovr" + ), + "ovo_roc_auc": partial(roc_auc_score, average="macro", multi_class="ovo"), + "weighted_ovo_roc_auc": partial( + roc_auc_score, average="weighted", multi_class="ovo" + ), + "partial_roc_auc": partial(roc_auc_score, max_fpr=0.5), + "average_precision_score": average_precision_score, # default: average="macro" + "weighted_average_precision_score": partial( + average_precision_score, average="weighted" + ), + "samples_average_precision_score": partial( + average_precision_score, average="samples" + ), + "micro_average_precision_score": partial(average_precision_score, average="micro"), + "label_ranking_average_precision_score": label_ranking_average_precision_score, + "ndcg_score": ndcg_score, + "dcg_score": dcg_score, + "top_k_accuracy_score": top_k_accuracy_score, +} + +ALL_METRICS = dict() +ALL_METRICS.update(THRESHOLDED_METRICS) +ALL_METRICS.update(CLASSIFICATION_METRICS) +ALL_METRICS.update(REGRESSION_METRICS) +ALL_METRICS.update(CURVE_METRICS) + +# Lists of metrics with common properties +# --------------------------------------- +# Lists of metrics with common properties are used to test systematically some +# functionalities and invariance, e.g. 
SYMMETRIC_METRICS lists all metrics that +# are symmetric with respect to their input argument y_true and y_pred. +# +# When you add a new metric or functionality, check if a general test +# is already written. + +# Those metrics don't support binary inputs +METRIC_UNDEFINED_BINARY = { + "samples_f0.5_score", + "samples_f1_score", + "samples_f2_score", + "samples_precision_score", + "samples_recall_score", + "samples_jaccard_score", + "coverage_error", + "unnormalized_multilabel_confusion_matrix_sample", + "label_ranking_loss", + "label_ranking_average_precision_score", + "dcg_score", + "ndcg_score", +} + +# Those metrics don't support multiclass inputs +METRIC_UNDEFINED_MULTICLASS = { + "micro_roc_auc", + "samples_roc_auc", + "partial_roc_auc", + "roc_auc_score", + "weighted_roc_auc", + "jaccard_score", + # with default average='binary', multiclass is prohibited + "precision_score", + "recall_score", + "f1_score", + "f2_score", + "f0.5_score", + # curves + "roc_curve", + "precision_recall_curve", + "det_curve", +} + +# Metric undefined with "binary" or "multiclass" input +METRIC_UNDEFINED_BINARY_MULTICLASS = METRIC_UNDEFINED_BINARY.union( + METRIC_UNDEFINED_MULTICLASS +) + +# Metrics with an "average" argument +METRICS_WITH_AVERAGING = { + "precision_score", + "recall_score", + "f1_score", + "f2_score", + "f0.5_score", + "jaccard_score", +} + +# Threshold-based metrics with an "average" argument +THRESHOLDED_METRICS_WITH_AVERAGING = { + "roc_auc_score", + "average_precision_score", + "partial_roc_auc", +} + +# Metrics with a "pos_label" argument +METRICS_WITH_POS_LABEL = { + "roc_curve", + "precision_recall_curve", + "det_curve", + "brier_score_loss", + "precision_score", + "recall_score", + "f1_score", + "f2_score", + "f0.5_score", + "jaccard_score", + "average_precision_score", + "weighted_average_precision_score", + "micro_average_precision_score", + "samples_average_precision_score", +} + +# Metrics with a "labels" argument +# TODO: Handle multi_class metrics that has a labels argument as well as a +# decision function argument. 
e.g hinge_loss +METRICS_WITH_LABELS = { + "unnormalized_confusion_matrix", + "normalized_confusion_matrix", + "roc_curve", + "precision_recall_curve", + "det_curve", + "precision_score", + "recall_score", + "f1_score", + "f2_score", + "f0.5_score", + "jaccard_score", + "weighted_f0.5_score", + "weighted_f1_score", + "weighted_f2_score", + "weighted_precision_score", + "weighted_recall_score", + "weighted_jaccard_score", + "micro_f0.5_score", + "micro_f1_score", + "micro_f2_score", + "micro_precision_score", + "micro_recall_score", + "micro_jaccard_score", + "macro_f0.5_score", + "macro_f1_score", + "macro_f2_score", + "macro_precision_score", + "macro_recall_score", + "macro_jaccard_score", + "unnormalized_multilabel_confusion_matrix", + "unnormalized_multilabel_confusion_matrix_sample", + "cohen_kappa_score", + "log_loss", + "brier_score_loss", +} + +# Metrics with a "normalize" option +METRICS_WITH_NORMALIZE_OPTION = { + "accuracy_score", + "top_k_accuracy_score", + "zero_one_loss", +} + +# Threshold-based metrics with "multilabel-indicator" format support +THRESHOLDED_MULTILABEL_METRICS = { + "log_loss", + "unnormalized_log_loss", + "brier_score_loss", + "roc_auc_score", + "weighted_roc_auc", + "samples_roc_auc", + "micro_roc_auc", + "partial_roc_auc", + "average_precision_score", + "weighted_average_precision_score", + "samples_average_precision_score", + "micro_average_precision_score", + "coverage_error", + "label_ranking_loss", + "ndcg_score", + "dcg_score", + "label_ranking_average_precision_score", +} + +# Classification metrics with "multilabel-indicator" format +MULTILABELS_METRICS = { + "accuracy_score", + "unnormalized_accuracy_score", + "hamming_loss", + "zero_one_loss", + "unnormalized_zero_one_loss", + "weighted_f0.5_score", + "weighted_f1_score", + "weighted_f2_score", + "weighted_precision_score", + "weighted_recall_score", + "weighted_jaccard_score", + "macro_f0.5_score", + "macro_f1_score", + "macro_f2_score", + "macro_precision_score", + "macro_recall_score", + "macro_jaccard_score", + "micro_f0.5_score", + "micro_f1_score", + "micro_f2_score", + "micro_precision_score", + "micro_recall_score", + "micro_jaccard_score", + "unnormalized_multilabel_confusion_matrix", + "samples_f0.5_score", + "samples_f1_score", + "samples_f2_score", + "samples_precision_score", + "samples_recall_score", + "samples_jaccard_score", +} + +# Regression metrics with "multioutput-continuous" format support +MULTIOUTPUT_METRICS = { + "mean_absolute_error", + "median_absolute_error", + "mean_squared_error", + "mean_squared_log_error", + "r2_score", + "root_mean_squared_error", + "root_mean_squared_log_error", + "explained_variance_score", + "mean_absolute_percentage_error", + "mean_pinball_loss", + "d2_pinball_score", + "d2_absolute_error_score", +} + +# Symmetric with respect to their input arguments y_true and y_pred +# metric(y_true, y_pred) == metric(y_pred, y_true). 
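+# For example (illustrative, not part of the grouping itself):
+# accuracy_score([0, 1, 1], [1, 1, 0]) == accuracy_score([1, 1, 0], [0, 1, 1])
+# (both equal 1/3), whereas recall_score is asymmetric in general because
+# swapping y_true and y_pred exchanges the roles of precision and recall.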
+SYMMETRIC_METRICS = { + "accuracy_score", + "unnormalized_accuracy_score", + "hamming_loss", + "zero_one_loss", + "unnormalized_zero_one_loss", + "micro_jaccard_score", + "macro_jaccard_score", + "jaccard_score", + "samples_jaccard_score", + "f1_score", + "micro_f1_score", + "macro_f1_score", + "weighted_recall_score", + "mean_squared_log_error", + "root_mean_squared_error", + "root_mean_squared_log_error", + # P = R = F = accuracy in multiclass case + "micro_f0.5_score", + "micro_f1_score", + "micro_f2_score", + "micro_precision_score", + "micro_recall_score", + "matthews_corrcoef_score", + "mean_absolute_error", + "mean_squared_error", + "median_absolute_error", + "max_error", + # Pinball loss is only symmetric for alpha=0.5 which is the default. + "mean_pinball_loss", + "cohen_kappa_score", + "mean_normal_deviance", +} + +# Asymmetric with respect to their input arguments y_true and y_pred +# metric(y_true, y_pred) != metric(y_pred, y_true). +NOT_SYMMETRIC_METRICS = { + "balanced_accuracy_score", + "adjusted_balanced_accuracy_score", + "explained_variance_score", + "r2_score", + "unnormalized_confusion_matrix", + "normalized_confusion_matrix", + "roc_curve", + "precision_recall_curve", + "det_curve", + "precision_score", + "recall_score", + "f2_score", + "f0.5_score", + "weighted_f0.5_score", + "weighted_f1_score", + "weighted_f2_score", + "weighted_precision_score", + "weighted_jaccard_score", + "unnormalized_multilabel_confusion_matrix", + "macro_f0.5_score", + "macro_f2_score", + "macro_precision_score", + "macro_recall_score", + "hinge_loss", + "mean_gamma_deviance", + "mean_poisson_deviance", + "mean_compound_poisson_deviance", + "d2_tweedie_score", + "d2_pinball_score", + "d2_absolute_error_score", + "mean_absolute_percentage_error", +} + + +# No Sample weight support +METRICS_WITHOUT_SAMPLE_WEIGHT = { + "median_absolute_error", + "max_error", + "ovo_roc_auc", + "weighted_ovo_roc_auc", +} + +METRICS_REQUIRE_POSITIVE_Y = { + "mean_poisson_deviance", + "mean_gamma_deviance", + "mean_compound_poisson_deviance", + "d2_tweedie_score", +} + +# Metrics involving y = log(1+x) +METRICS_WITH_LOG1P_Y = { + "mean_squared_log_error", + "root_mean_squared_log_error", +} + + +def _require_positive_targets(y1, y2): + """Make targets strictly positive""" + offset = abs(min(y1.min(), y2.min())) + 1 + y1 += offset + y2 += offset + return y1, y2 + + +def _require_log1p_targets(y1, y2): + """Make targets strictly larger than -1""" + offset = abs(min(y1.min(), y2.min())) - 0.99 + y1 = y1.astype(np.float64) + y2 = y2.astype(np.float64) + y1 += offset + y2 += offset + return y1, y2 + + +def test_symmetry_consistency(): + # We shouldn't forget any metrics + assert ( + SYMMETRIC_METRICS + | NOT_SYMMETRIC_METRICS + | set(THRESHOLDED_METRICS) + | METRIC_UNDEFINED_BINARY_MULTICLASS + ) == set(ALL_METRICS) + + assert (SYMMETRIC_METRICS & NOT_SYMMETRIC_METRICS) == set() + + +@pytest.mark.parametrize("name", sorted(SYMMETRIC_METRICS)) +def test_symmetric_metric(name): + # Test the symmetry of score and loss functions + random_state = check_random_state(0) + y_true = random_state.randint(0, 2, size=(20,)) + y_pred = random_state.randint(0, 2, size=(20,)) + + if name in METRICS_REQUIRE_POSITIVE_Y: + y_true, y_pred = _require_positive_targets(y_true, y_pred) + + elif name in METRICS_WITH_LOG1P_Y: + y_true, y_pred = _require_log1p_targets(y_true, y_pred) + + y_true_bin = random_state.randint(0, 2, size=(20, 25)) + y_pred_bin = random_state.randint(0, 2, size=(20, 25)) + + metric = ALL_METRICS[name] + if name in 
METRIC_UNDEFINED_BINARY: + if name in MULTILABELS_METRICS: + assert_allclose( + metric(y_true_bin, y_pred_bin), + metric(y_pred_bin, y_true_bin), + err_msg="%s is not symmetric" % name, + ) + else: + assert False, "This case is currently unhandled" + else: + assert_allclose( + metric(y_true, y_pred), + metric(y_pred, y_true), + err_msg="%s is not symmetric" % name, + ) + + +@pytest.mark.parametrize("name", sorted(NOT_SYMMETRIC_METRICS)) +def test_not_symmetric_metric(name): + # Test the symmetry of score and loss functions + random_state = check_random_state(0) + metric = ALL_METRICS[name] + + # The metric can be accidentally symmetric on a random draw. + # We run several random draws to check that at least of them + # gives an asymmetric result. + always_symmetric = True + for _ in range(5): + y_true = random_state.randint(0, 2, size=(20,)) + y_pred = random_state.randint(0, 2, size=(20,)) + + if name in METRICS_REQUIRE_POSITIVE_Y: + y_true, y_pred = _require_positive_targets(y_true, y_pred) + + nominal = metric(y_true, y_pred) + swapped = metric(y_pred, y_true) + if not np.allclose(nominal, swapped): + always_symmetric = False + break + + if always_symmetric: + raise ValueError(f"{name} seems to be symmetric") + + +def test_symmetry_tests(): + # check test_symmetric_metric and test_not_symmetric_metric + sym = "accuracy_score" + not_sym = "recall_score" + # test_symmetric_metric passes on a symmetric metric + # but fails on a not symmetric metric + test_symmetric_metric(sym) + with pytest.raises(AssertionError, match=f"{not_sym} is not symmetric"): + test_symmetric_metric(not_sym) + # test_not_symmetric_metric passes on a not symmetric metric + # but fails on a symmetric metric + test_not_symmetric_metric(not_sym) + with pytest.raises(ValueError, match=f"{sym} seems to be symmetric"): + test_not_symmetric_metric(sym) + + +@pytest.mark.parametrize( + "name", sorted(set(ALL_METRICS) - METRIC_UNDEFINED_BINARY_MULTICLASS) +) +def test_sample_order_invariance(name): + random_state = check_random_state(0) + y_true = random_state.randint(0, 2, size=(20,)) + y_pred = random_state.randint(0, 2, size=(20,)) + + if name in METRICS_REQUIRE_POSITIVE_Y: + y_true, y_pred = _require_positive_targets(y_true, y_pred) + elif name in METRICS_WITH_LOG1P_Y: + y_true, y_pred = _require_log1p_targets(y_true, y_pred) + + y_true_shuffle, y_pred_shuffle = shuffle(y_true, y_pred, random_state=0) + + with ignore_warnings(): + metric = ALL_METRICS[name] + assert_allclose( + metric(y_true, y_pred), + metric(y_true_shuffle, y_pred_shuffle), + err_msg="%s is not sample order invariant" % name, + ) + + +def test_sample_order_invariance_multilabel_and_multioutput(): + random_state = check_random_state(0) + + # Generate some data + y_true = random_state.randint(0, 2, size=(20, 25)) + y_pred = random_state.randint(0, 2, size=(20, 25)) + y_score = random_state.uniform(size=y_true.shape) + + # Some metrics (e.g. 
log_loss) require y_score to be probabilities (sum to 1) + y_score /= y_score.sum(axis=1, keepdims=True) + + y_true_shuffle, y_pred_shuffle, y_score_shuffle = shuffle( + y_true, y_pred, y_score, random_state=0 + ) + + for name in MULTILABELS_METRICS: + metric = ALL_METRICS[name] + assert_allclose( + metric(y_true, y_pred), + metric(y_true_shuffle, y_pred_shuffle), + err_msg="%s is not sample order invariant" % name, + ) + + for name in THRESHOLDED_MULTILABEL_METRICS: + metric = ALL_METRICS[name] + assert_allclose( + metric(y_true, y_score), + metric(y_true_shuffle, y_score_shuffle), + err_msg="%s is not sample order invariant" % name, + ) + + for name in MULTIOUTPUT_METRICS: + metric = ALL_METRICS[name] + assert_allclose( + metric(y_true, y_score), + metric(y_true_shuffle, y_score_shuffle), + err_msg="%s is not sample order invariant" % name, + ) + assert_allclose( + metric(y_true, y_pred), + metric(y_true_shuffle, y_pred_shuffle), + err_msg="%s is not sample order invariant" % name, + ) + + +@pytest.mark.parametrize( + "name", sorted(set(ALL_METRICS) - METRIC_UNDEFINED_BINARY_MULTICLASS) +) +def test_format_invariance_with_1d_vectors(name): + random_state = check_random_state(0) + y1 = random_state.randint(0, 2, size=(20,)) + y2 = random_state.randint(0, 2, size=(20,)) + + if name in METRICS_REQUIRE_POSITIVE_Y: + y1, y2 = _require_positive_targets(y1, y2) + elif name in METRICS_WITH_LOG1P_Y: + y1, y2 = _require_log1p_targets(y1, y2) + + y1_list = list(y1) + y2_list = list(y2) + + y1_1d, y2_1d = np.array(y1), np.array(y2) + assert_array_equal(y1_1d.ndim, 1) + assert_array_equal(y2_1d.ndim, 1) + y1_column = np.reshape(y1_1d, (-1, 1)) + y2_column = np.reshape(y2_1d, (-1, 1)) + y1_row = np.reshape(y1_1d, (1, -1)) + y2_row = np.reshape(y2_1d, (1, -1)) + + with ignore_warnings(): + metric = ALL_METRICS[name] + + measure = metric(y1, y2) + + assert_allclose( + metric(y1_list, y2_list), + measure, + err_msg="%s is not representation invariant with list" % name, + ) + + assert_allclose( + metric(y1_1d, y2_1d), + measure, + err_msg="%s is not representation invariant with np-array-1d" % name, + ) + + assert_allclose( + metric(y1_column, y2_column), + measure, + err_msg="%s is not representation invariant with np-array-column" % name, + ) + + # Mix format support + assert_allclose( + metric(y1_1d, y2_list), + measure, + err_msg="%s is not representation invariant with mix np-array-1d and list" + % name, + ) + + assert_allclose( + metric(y1_list, y2_1d), + measure, + err_msg="%s is not representation invariant with mix np-array-1d and list" + % name, + ) + + assert_allclose( + metric(y1_1d, y2_column), + measure, + err_msg=( + "%s is not representation invariant with mix " + "np-array-1d and np-array-column" + ) + % name, + ) + + assert_allclose( + metric(y1_column, y2_1d), + measure, + err_msg=( + "%s is not representation invariant with mix " + "np-array-1d and np-array-column" + ) + % name, + ) + + assert_allclose( + metric(y1_list, y2_column), + measure, + err_msg=( + "%s is not representation invariant with mix list and np-array-column" + ) + % name, + ) + + assert_allclose( + metric(y1_column, y2_list), + measure, + err_msg=( + "%s is not representation invariant with mix list and np-array-column" + ) + % name, + ) + + # These mix representations aren't allowed + with pytest.raises(ValueError): + metric(y1_1d, y2_row) + with pytest.raises(ValueError): + metric(y1_row, y2_1d) + with pytest.raises(ValueError): + metric(y1_list, y2_row) + with pytest.raises(ValueError): + metric(y1_row, y2_list) + 
with pytest.raises(ValueError): + metric(y1_column, y2_row) + with pytest.raises(ValueError): + metric(y1_row, y2_column) + + # NB: We do not test for y1_row, y2_row as these may be + # interpreted as multilabel or multioutput data. + if name not in ( + MULTIOUTPUT_METRICS | THRESHOLDED_MULTILABEL_METRICS | MULTILABELS_METRICS + ): + if "roc_auc" in name: + # for consistency between the `roc_cuve` and `roc_auc_score` + # np.nan is returned and an `UndefinedMetricWarning` is raised + with pytest.warns(UndefinedMetricWarning): + assert math.isnan(metric(y1_row, y2_row)) + else: + with pytest.raises(ValueError): + metric(y1_row, y2_row) + + +@pytest.mark.parametrize( + "name", sorted(set(CLASSIFICATION_METRICS) - METRIC_UNDEFINED_BINARY_MULTICLASS) +) +def test_classification_invariance_string_vs_numbers_labels(name): + # Ensure that classification metrics with string labels are invariant + random_state = check_random_state(0) + y1 = random_state.randint(0, 2, size=(20,)) + y2 = random_state.randint(0, 2, size=(20,)) + + y1_str = np.array(["eggs", "spam"])[y1] + y2_str = np.array(["eggs", "spam"])[y2] + + pos_label_str = "spam" + labels_str = ["eggs", "spam"] + + with ignore_warnings(): + metric = CLASSIFICATION_METRICS[name] + measure_with_number = metric(y1, y2) + + # Ugly, but handle case with a pos_label and label + metric_str = metric + if name in METRICS_WITH_POS_LABEL: + metric_str = partial(metric_str, pos_label=pos_label_str) + + measure_with_str = metric_str(y1_str, y2_str) + + assert_array_equal( + measure_with_number, + measure_with_str, + err_msg="{0} failed string vs number invariance test".format(name), + ) + + measure_with_strobj = metric_str(y1_str.astype("O"), y2_str.astype("O")) + assert_array_equal( + measure_with_number, + measure_with_strobj, + err_msg="{0} failed string object vs number invariance test".format(name), + ) + + if name in METRICS_WITH_LABELS: + metric_str = partial(metric_str, labels=labels_str) + measure_with_str = metric_str(y1_str, y2_str) + assert_array_equal( + measure_with_number, + measure_with_str, + err_msg="{0} failed string vs number invariance test".format(name), + ) + + measure_with_strobj = metric_str(y1_str.astype("O"), y2_str.astype("O")) + assert_array_equal( + measure_with_number, + measure_with_strobj, + err_msg="{0} failed string vs number invariance test".format(name), + ) + + +@pytest.mark.parametrize("name", THRESHOLDED_METRICS) +def test_thresholded_invariance_string_vs_numbers_labels(name): + # Ensure that thresholded metrics with string labels are invariant + random_state = check_random_state(0) + y1 = random_state.randint(0, 2, size=(20,)) + y2 = random_state.randint(0, 2, size=(20,)) + + y1_str = np.array(["eggs", "spam"])[y1] + + pos_label_str = "spam" + + with ignore_warnings(): + metric = THRESHOLDED_METRICS[name] + if name not in METRIC_UNDEFINED_BINARY: + # Ugly, but handle case with a pos_label and label + metric_str = metric + if name in METRICS_WITH_POS_LABEL: + metric_str = partial(metric_str, pos_label=pos_label_str) + + measure_with_number = metric(y1, y2) + measure_with_str = metric_str(y1_str, y2) + assert_array_equal( + measure_with_number, + measure_with_str, + err_msg="{0} failed string vs number invariance test".format(name), + ) + + measure_with_strobj = metric_str(y1_str.astype("O"), y2) + assert_array_equal( + measure_with_number, + measure_with_strobj, + err_msg="{0} failed string object vs number invariance test".format( + name + ), + ) + else: + # TODO those metrics doesn't support string label yet + with 
pytest.raises(ValueError): + metric(y1_str, y2) + with pytest.raises(ValueError): + metric(y1_str.astype("O"), y2) + + +invalids_nan_inf = [ + ([0, 1], [np.inf, np.inf]), + ([0, 1], [np.nan, np.nan]), + ([0, 1], [np.nan, np.inf]), + ([0, 1], [np.inf, 1]), + ([0, 1], [np.nan, 1]), +] + + +@pytest.mark.parametrize( + "metric", chain(THRESHOLDED_METRICS.values(), REGRESSION_METRICS.values()) +) +@pytest.mark.parametrize("y_true, y_score", invalids_nan_inf) +def test_regression_thresholded_inf_nan_input(metric, y_true, y_score): + # Reshape since coverage_error only accepts 2D arrays. + if metric == coverage_error: + y_true = [y_true] + y_score = [y_score] + with pytest.raises(ValueError, match=r"contains (NaN|infinity)"): + metric(y_true, y_score) + + +@pytest.mark.parametrize("metric", CLASSIFICATION_METRICS.values()) +@pytest.mark.parametrize( + "y_true, y_score", + invalids_nan_inf + + + # Add an additional case for classification only + # non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/6809 + [ + ([np.nan, 1, 2], [1, 2, 3]), + ([np.inf, 1, 2], [1, 2, 3]), + ], +) +def test_classification_inf_nan_input(metric, y_true, y_score): + """check that classification metrics raise a message mentioning the + occurrence of non-finite values in the target vectors.""" + if not np.isfinite(y_true).all(): + input_name = "y_true" + if np.isnan(y_true).any(): + unexpected_value = "NaN" + else: + unexpected_value = "infinity or a value too large" + else: + input_name = "y_pred" + if np.isnan(y_score).any(): + unexpected_value = "NaN" + else: + unexpected_value = "infinity or a value too large" + + err_msg = f"Input {input_name} contains {unexpected_value}" + + with pytest.raises(ValueError, match=err_msg): + metric(y_true, y_score) + + +@pytest.mark.parametrize("metric", CLASSIFICATION_METRICS.values()) +def test_classification_binary_continuous_input(metric): + """check that classification metrics raise a message of mixed type data + with continuous/binary target vectors.""" + y_true, y_score = ["a", "b", "a"], [0.1, 0.2, 0.3] + err_msg = ( + "Classification metrics can't handle a mix of binary and continuous targets" + ) + with pytest.raises(ValueError, match=err_msg): + metric(y_true, y_score) + + +def check_single_sample(name): + # Non-regression test: scores should work with a single sample. + # This is important for leave-one-out cross validation. + # Score functions tested are those that formerly called np.squeeze, + # which turns an array of size 1 into a 0-d array (!). 
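+ # For instance, np.squeeze(np.array([1.0])) returns a 0-d array of shape (),
+ # not a length-one 1-d array, which broke downstream shape assumptions.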
+ metric = ALL_METRICS[name] + + # assert that no exception is thrown + if name in METRICS_REQUIRE_POSITIVE_Y: + values = [1, 2] + elif name in METRICS_WITH_LOG1P_Y: + values = [-0.7, 1] + else: + values = [0, 1] + for i, j in product(values, repeat=2): + metric([i], [j]) + + +def check_single_sample_multioutput(name): + metric = ALL_METRICS[name] + for i, j, k, l in product([0, 1], repeat=4): + metric(np.array([[i, j]]), np.array([[k, l]])) + + +# filter many metric specific warnings +@pytest.mark.filterwarnings("ignore") +@pytest.mark.parametrize( + "name", + sorted( + set(ALL_METRICS) + # Those metrics are not always defined with one sample + # or in multiclass classification + - METRIC_UNDEFINED_BINARY_MULTICLASS + - set(THRESHOLDED_METRICS) + ), +) +def test_single_sample(name): + check_single_sample(name) + + +# filter many metric specific warnings +@pytest.mark.filterwarnings("ignore") +@pytest.mark.parametrize("name", sorted(MULTIOUTPUT_METRICS | MULTILABELS_METRICS)) +def test_single_sample_multioutput(name): + check_single_sample_multioutput(name) + + +@pytest.mark.parametrize("name", sorted(MULTIOUTPUT_METRICS)) +def test_multioutput_number_of_output_differ(name): + y_true = np.array([[1, 0, 0, 1], [0, 1, 1, 1], [1, 1, 0, 1]]) + y_pred = np.array([[0, 0], [1, 0], [0, 0]]) + + metric = ALL_METRICS[name] + with pytest.raises(ValueError): + metric(y_true, y_pred) + + +@pytest.mark.parametrize("name", sorted(MULTIOUTPUT_METRICS)) +def test_multioutput_regression_invariance_to_dimension_shuffling(name): + # test invariance to dimension shuffling + random_state = check_random_state(0) + y_true = random_state.uniform(0, 2, size=(20, 5)) + y_pred = random_state.uniform(0, 2, size=(20, 5)) + + metric = ALL_METRICS[name] + error = metric(y_true, y_pred) + + for _ in range(3): + perm = random_state.permutation(y_true.shape[1]) + assert_allclose( + metric(y_true[:, perm], y_pred[:, perm]), + error, + err_msg="%s is not dimension shuffling invariant" % (name), + ) + + +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.UndefinedMetricWarning") +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +def test_multilabel_representation_invariance(coo_container): + # Generate some data + n_classes = 4 + n_samples = 50 + + _, y1 = make_multilabel_classification( + n_features=1, + n_classes=n_classes, + random_state=0, + n_samples=n_samples, + allow_unlabeled=True, + ) + _, y2 = make_multilabel_classification( + n_features=1, + n_classes=n_classes, + random_state=1, + n_samples=n_samples, + allow_unlabeled=True, + ) + + # To make sure at least one empty label is present + y1 = np.vstack([y1, [[0] * n_classes]]) + y2 = np.vstack([y2, [[0] * n_classes]]) + + y1_sparse_indicator = coo_container(y1) + y2_sparse_indicator = coo_container(y2) + + y1_list_array_indicator = list(y1) + y2_list_array_indicator = list(y2) + + y1_list_list_indicator = [list(a) for a in y1_list_array_indicator] + y2_list_list_indicator = [list(a) for a in y2_list_array_indicator] + + for name in MULTILABELS_METRICS: + metric = ALL_METRICS[name] + + # XXX cruel hack to work with partial functions + if isinstance(metric, partial): + metric.__module__ = "tmp" + metric.__name__ = name + + measure = metric(y1, y2) + + # Check representation invariance + assert_allclose( + metric(y1_sparse_indicator, y2_sparse_indicator), + measure, + err_msg=( + "%s failed representation invariance between " + "dense and sparse indicator formats." 
+ ) + % name, + ) + assert_almost_equal( + metric(y1_list_list_indicator, y2_list_list_indicator), + measure, + err_msg=( + "%s failed representation invariance " + "between dense array and list of list " + "indicator formats." + ) + % name, + ) + assert_almost_equal( + metric(y1_list_array_indicator, y2_list_array_indicator), + measure, + err_msg=( + "%s failed representation invariance " + "between dense and list of array " + "indicator formats." + ) + % name, + ) + + +@pytest.mark.parametrize("name", sorted(MULTILABELS_METRICS)) +def test_raise_value_error_multilabel_sequences(name): + # make sure the multilabel-sequence format raises ValueError + multilabel_sequences = [ + [[1], [2], [0, 1]], + [(), (2), (0, 1)], + [[]], + [()], + np.array([[], [1, 2]], dtype="object"), + ] + + metric = ALL_METRICS[name] + for seq in multilabel_sequences: + with pytest.raises(ValueError): + metric(seq, seq) + + +@pytest.mark.parametrize("name", sorted(METRICS_WITH_NORMALIZE_OPTION)) +def test_normalize_option_binary_classification(name): + # Test in the binary case + n_classes = 2 + n_samples = 20 + random_state = check_random_state(0) + + y_true = random_state.randint(0, n_classes, size=(n_samples,)) + y_pred = random_state.randint(0, n_classes, size=(n_samples,)) + y_score = random_state.normal(size=y_true.shape) + + metrics = ALL_METRICS[name] + pred = y_score if name in THRESHOLDED_METRICS else y_pred + measure_normalized = metrics(y_true, pred, normalize=True) + measure_not_normalized = metrics(y_true, pred, normalize=False) + + assert_array_less( + -1.0 * measure_normalized, + 0, + err_msg="We failed to test correctly the normalize option", + ) + + assert_allclose( + measure_normalized, + measure_not_normalized / n_samples, + err_msg=f"Failed with {name}", + ) + + +@pytest.mark.parametrize("name", sorted(METRICS_WITH_NORMALIZE_OPTION)) +def test_normalize_option_multiclass_classification(name): + # Test in the multiclass case + n_classes = 4 + n_samples = 20 + random_state = check_random_state(0) + + y_true = random_state.randint(0, n_classes, size=(n_samples,)) + y_pred = random_state.randint(0, n_classes, size=(n_samples,)) + y_score = random_state.uniform(size=(n_samples, n_classes)) + + metrics = ALL_METRICS[name] + pred = y_score if name in THRESHOLDED_METRICS else y_pred + measure_normalized = metrics(y_true, pred, normalize=True) + measure_not_normalized = metrics(y_true, pred, normalize=False) + + assert_array_less( + -1.0 * measure_normalized, + 0, + err_msg="We failed to test correctly the normalize option", + ) + + assert_allclose( + measure_normalized, + measure_not_normalized / n_samples, + err_msg=f"Failed with {name}", + ) + + +@pytest.mark.parametrize( + "name", sorted(METRICS_WITH_NORMALIZE_OPTION.intersection(MULTILABELS_METRICS)) +) +def test_normalize_option_multilabel_classification(name): + # Test in the multilabel case + n_classes = 4 + n_samples = 100 + random_state = check_random_state(0) + + # for both random_state 0 and 1, y_true and y_pred has at least one + # unlabelled entry + _, y_true = make_multilabel_classification( + n_features=1, + n_classes=n_classes, + random_state=0, + allow_unlabeled=True, + n_samples=n_samples, + ) + _, y_pred = make_multilabel_classification( + n_features=1, + n_classes=n_classes, + random_state=1, + allow_unlabeled=True, + n_samples=n_samples, + ) + + y_score = random_state.uniform(size=y_true.shape) + + # To make sure at least one empty label is present + y_true += [0] * n_classes + y_pred += [0] * n_classes + + metrics = 
ALL_METRICS[name] + pred = y_score if name in THRESHOLDED_METRICS else y_pred + measure_normalized = metrics(y_true, pred, normalize=True) + measure_not_normalized = metrics(y_true, pred, normalize=False) + + assert_array_less( + -1.0 * measure_normalized, + 0, + err_msg="We failed to test correctly the normalize option", + ) + + assert_allclose( + measure_normalized, + measure_not_normalized / n_samples, + err_msg=f"Failed with {name}", + ) + + +def _check_averaging( + metric, y_true, y_pred, y_true_binarize, y_pred_binarize, is_multilabel +): + n_samples, n_classes = y_true_binarize.shape + + # No averaging + label_measure = metric(y_true, y_pred, average=None) + assert_allclose( + label_measure, + [ + metric(y_true_binarize[:, i], y_pred_binarize[:, i]) + for i in range(n_classes) + ], + ) + + # Micro measure + micro_measure = metric(y_true, y_pred, average="micro") + assert_allclose( + micro_measure, metric(y_true_binarize.ravel(), y_pred_binarize.ravel()) + ) + + # Macro measure + macro_measure = metric(y_true, y_pred, average="macro") + assert_allclose(macro_measure, np.mean(label_measure)) + + # Weighted measure + weights = np.sum(y_true_binarize, axis=0, dtype=int) + + if np.sum(weights) != 0: + weighted_measure = metric(y_true, y_pred, average="weighted") + assert_allclose(weighted_measure, np.average(label_measure, weights=weights)) + else: + weighted_measure = metric(y_true, y_pred, average="weighted") + assert_allclose(weighted_measure, 0) + + # Sample measure + if is_multilabel: + sample_measure = metric(y_true, y_pred, average="samples") + assert_allclose( + sample_measure, + np.mean( + [ + metric(y_true_binarize[i], y_pred_binarize[i]) + for i in range(n_samples) + ] + ), + ) + + with pytest.raises(ValueError): + metric(y_true, y_pred, average="unknown") + with pytest.raises(ValueError): + metric(y_true, y_pred, average="garbage") + + +def check_averaging(name, y_true, y_true_binarize, y_pred, y_pred_binarize, y_score): + is_multilabel = type_of_target(y_true).startswith("multilabel") + + metric = ALL_METRICS[name] + + if name in METRICS_WITH_AVERAGING: + _check_averaging( + metric, y_true, y_pred, y_true_binarize, y_pred_binarize, is_multilabel + ) + elif name in THRESHOLDED_METRICS_WITH_AVERAGING: + _check_averaging( + metric, y_true, y_score, y_true_binarize, y_score, is_multilabel + ) + else: + raise ValueError("Metric is not recorded as having an average option") + + +@pytest.mark.parametrize("name", sorted(METRICS_WITH_AVERAGING)) +def test_averaging_multiclass(name): + n_samples, n_classes = 50, 3 + random_state = check_random_state(0) + y_true = random_state.randint(0, n_classes, size=(n_samples,)) + y_pred = random_state.randint(0, n_classes, size=(n_samples,)) + y_score = random_state.uniform(size=(n_samples, n_classes)) + + lb = LabelBinarizer().fit(y_true) + y_true_binarize = lb.transform(y_true) + y_pred_binarize = lb.transform(y_pred) + + check_averaging(name, y_true, y_true_binarize, y_pred, y_pred_binarize, y_score) + + +@pytest.mark.parametrize( + "name", sorted(METRICS_WITH_AVERAGING | THRESHOLDED_METRICS_WITH_AVERAGING) +) +def test_averaging_multilabel(name): + n_samples, n_classes = 40, 5 + _, y = make_multilabel_classification( + n_features=1, + n_classes=n_classes, + random_state=5, + n_samples=n_samples, + allow_unlabeled=False, + ) + y_true = y[:20] + y_pred = y[20:] + y_score = check_random_state(0).normal(size=(20, n_classes)) + y_true_binarize = y_true + y_pred_binarize = y_pred + + check_averaging(name, y_true, y_true_binarize, y_pred, 
y_pred_binarize, y_score) + + +@pytest.mark.parametrize("name", sorted(METRICS_WITH_AVERAGING)) +def test_averaging_multilabel_all_zeroes(name): + y_true = np.zeros((20, 3)) + y_pred = np.zeros((20, 3)) + y_score = np.zeros((20, 3)) + y_true_binarize = y_true + y_pred_binarize = y_pred + + check_averaging(name, y_true, y_true_binarize, y_pred, y_pred_binarize, y_score) + + +def test_averaging_binary_multilabel_all_zeroes(): + y_true = np.zeros((20, 3)) + y_pred = np.zeros((20, 3)) + y_true_binarize = y_true + y_pred_binarize = y_pred + # Test _average_binary_score for weight.sum() == 0 + binary_metric = lambda y_true, y_score, average="macro": _average_binary_score( + precision_score, y_true, y_score, average + ) + _check_averaging( + binary_metric, + y_true, + y_pred, + y_true_binarize, + y_pred_binarize, + is_multilabel=True, + ) + + +@pytest.mark.parametrize("name", sorted(METRICS_WITH_AVERAGING)) +def test_averaging_multilabel_all_ones(name): + y_true = np.ones((20, 3)) + y_pred = np.ones((20, 3)) + y_score = np.ones((20, 3)) + y_true_binarize = y_true + y_pred_binarize = y_pred + + check_averaging(name, y_true, y_true_binarize, y_pred, y_pred_binarize, y_score) + + +def check_sample_weight_invariance(name, metric, y1, y2): + rng = np.random.RandomState(0) + sample_weight = rng.randint(1, 10, size=len(y1)) + + # top_k_accuracy_score always lead to a perfect score for k > 1 in the + # binary case + metric = partial(metric, k=1) if name == "top_k_accuracy_score" else metric + + # check that unit weights gives the same score as no weight + unweighted_score = metric(y1, y2, sample_weight=None) + + assert_allclose( + unweighted_score, + metric(y1, y2, sample_weight=np.ones(shape=len(y1))), + err_msg="For %s sample_weight=None is not equivalent to sample_weight=ones" + % name, + ) + + # check that the weighted and unweighted scores are unequal + weighted_score = metric(y1, y2, sample_weight=sample_weight) + + # use context manager to supply custom error message + with pytest.raises(AssertionError): + assert_allclose(unweighted_score, weighted_score) + raise ValueError( + "Unweighted and weighted scores are unexpectedly " + "almost equal (%s) and (%s) " + "for %s" % (unweighted_score, weighted_score, name) + ) + + # check that sample_weight can be a list + weighted_score_list = metric(y1, y2, sample_weight=sample_weight.tolist()) + assert_allclose( + weighted_score, + weighted_score_list, + err_msg=( + "Weighted scores for array and list " + "sample_weight input are not equal (%s != %s) for %s" + ) + % (weighted_score, weighted_score_list, name), + ) + + # check that integer weights is the same as repeated samples + repeat_weighted_score = metric( + np.repeat(y1, sample_weight, axis=0), + np.repeat(y2, sample_weight, axis=0), + sample_weight=None, + ) + assert_allclose( + weighted_score, + repeat_weighted_score, + err_msg="Weighting %s is not equal to repeating samples" % name, + ) + + # check that ignoring a fraction of the samples is equivalent to setting + # the corresponding weights to zero + sample_weight_subset = sample_weight[1::2] + sample_weight_zeroed = np.copy(sample_weight) + sample_weight_zeroed[::2] = 0 + y1_subset = y1[1::2] + y2_subset = y2[1::2] + weighted_score_subset = metric( + y1_subset, y2_subset, sample_weight=sample_weight_subset + ) + weighted_score_zeroed = metric(y1, y2, sample_weight=sample_weight_zeroed) + assert_allclose( + weighted_score_subset, + weighted_score_zeroed, + err_msg=( + "Zeroing weights does not give the same result as " + "removing the 
corresponding samples (%s != %s) for %s" + ) + % (weighted_score_zeroed, weighted_score_subset, name), + ) + + if not name.startswith("unnormalized"): + # check that the score is invariant under scaling of the weights by a + # common factor + for scaling in [2, 0.3]: + assert_allclose( + weighted_score, + metric(y1, y2, sample_weight=sample_weight * scaling), + err_msg="%s sample_weight is not invariant under scaling" % name, + ) + + # Check that if number of samples in y_true and sample_weight are not + # equal, meaningful error is raised. + error_message = ( + r"Found input variables with inconsistent numbers of " + r"samples: \[{}, {}, {}\]".format( + _num_samples(y1), _num_samples(y2), _num_samples(sample_weight) * 2 + ) + ) + with pytest.raises(ValueError, match=error_message): + metric(y1, y2, sample_weight=np.hstack([sample_weight, sample_weight])) + + +@pytest.mark.parametrize( + "name", + sorted( + set(ALL_METRICS).intersection(set(REGRESSION_METRICS)) + - METRICS_WITHOUT_SAMPLE_WEIGHT + ), +) +def test_regression_sample_weight_invariance(name): + n_samples = 50 + random_state = check_random_state(0) + # regression + y_true = random_state.random_sample(size=(n_samples,)) + y_pred = random_state.random_sample(size=(n_samples,)) + metric = ALL_METRICS[name] + check_sample_weight_invariance(name, metric, y_true, y_pred) + + +@pytest.mark.parametrize( + "name", + sorted( + set(ALL_METRICS).intersection(set(REGRESSION_METRICS)) + - METRICS_WITHOUT_SAMPLE_WEIGHT + ), +) +def test_regression_with_invalid_sample_weight(name): + # Check that `sample_weight` with incorrect length raises error + n_samples = 50 + random_state = check_random_state(0) + y_true = random_state.random_sample(size=(n_samples,)) + y_pred = random_state.random_sample(size=(n_samples,)) + metric = ALL_METRICS[name] + + sample_weight = random_state.random_sample(size=(n_samples - 1,)) + with pytest.raises(ValueError, match="Found input variables with inconsistent"): + metric(y_true, y_pred, sample_weight=sample_weight) + + sample_weight = random_state.random_sample(size=(n_samples * 2,)).reshape( + (n_samples, 2) + ) + with pytest.raises(ValueError, match="Sample weights must be 1D array or scalar"): + metric(y_true, y_pred, sample_weight=sample_weight) + + +@pytest.mark.parametrize( + "name", + sorted( + set(ALL_METRICS) + - set(REGRESSION_METRICS) + - METRICS_WITHOUT_SAMPLE_WEIGHT + - METRIC_UNDEFINED_BINARY + ), +) +def test_binary_sample_weight_invariance(name): + # binary + n_samples = 50 + random_state = check_random_state(0) + y_true = random_state.randint(0, 2, size=(n_samples,)) + y_pred = random_state.randint(0, 2, size=(n_samples,)) + y_score = random_state.random_sample(size=(n_samples,)) + metric = ALL_METRICS[name] + if name in THRESHOLDED_METRICS: + check_sample_weight_invariance(name, metric, y_true, y_score) + else: + check_sample_weight_invariance(name, metric, y_true, y_pred) + + +@pytest.mark.parametrize( + "name", + sorted( + set(ALL_METRICS) + - set(REGRESSION_METRICS) + - METRICS_WITHOUT_SAMPLE_WEIGHT + - METRIC_UNDEFINED_BINARY_MULTICLASS + ), +) +def test_multiclass_sample_weight_invariance(name): + # multiclass + n_samples = 50 + random_state = check_random_state(0) + y_true = random_state.randint(0, 5, size=(n_samples,)) + y_pred = random_state.randint(0, 5, size=(n_samples,)) + y_score = random_state.random_sample(size=(n_samples, 5)) + metric = ALL_METRICS[name] + if name in THRESHOLDED_METRICS: + # softmax + temp = np.exp(-y_score) + y_score_norm = temp / temp.sum(axis=-1).reshape(-1, 1) + 
check_sample_weight_invariance(name, metric, y_true, y_score_norm) + else: + check_sample_weight_invariance(name, metric, y_true, y_pred) + + +@pytest.mark.parametrize( + "name", + sorted( + (MULTILABELS_METRICS | THRESHOLDED_MULTILABEL_METRICS) + - METRICS_WITHOUT_SAMPLE_WEIGHT + ), +) +def test_multilabel_sample_weight_invariance(name): + # multilabel indicator + random_state = check_random_state(0) + _, ya = make_multilabel_classification( + n_features=1, n_classes=10, random_state=0, n_samples=50, allow_unlabeled=False + ) + _, yb = make_multilabel_classification( + n_features=1, n_classes=10, random_state=1, n_samples=50, allow_unlabeled=False + ) + y_true = np.vstack([ya, yb]) + y_pred = np.vstack([ya, ya]) + y_score = random_state.uniform(size=y_true.shape) + + # Some metrics (e.g. log_loss) require y_score to be probabilities (sum to 1) + y_score /= y_score.sum(axis=1, keepdims=True) + + metric = ALL_METRICS[name] + if name in THRESHOLDED_METRICS: + check_sample_weight_invariance(name, metric, y_true, y_score) + else: + check_sample_weight_invariance(name, metric, y_true, y_pred) + + +@pytest.mark.parametrize( + "name", + sorted(MULTIOUTPUT_METRICS - METRICS_WITHOUT_SAMPLE_WEIGHT), +) +def test_multioutput_sample_weight_invariance(name): + random_state = check_random_state(0) + y_true = random_state.uniform(0, 2, size=(20, 5)) + y_pred = random_state.uniform(0, 2, size=(20, 5)) + + metric = ALL_METRICS[name] + check_sample_weight_invariance(name, metric, y_true, y_pred) + + +def test_no_averaging_labels(): + # test labels argument when not using averaging + # in multi-class and multi-label cases + y_true_multilabel = np.array([[1, 1, 0, 0], [1, 1, 0, 0]]) + y_pred_multilabel = np.array([[0, 0, 1, 1], [0, 1, 1, 0]]) + y_true_multiclass = np.array([0, 1, 2]) + y_pred_multiclass = np.array([0, 2, 3]) + labels = np.array([3, 0, 1, 2]) + _, inverse_labels = np.unique(labels, return_inverse=True) + + for name in METRICS_WITH_AVERAGING: + for y_true, y_pred in [ + [y_true_multiclass, y_pred_multiclass], + [y_true_multilabel, y_pred_multilabel], + ]: + if name not in MULTILABELS_METRICS and y_pred.ndim > 1: + continue + + metric = ALL_METRICS[name] + + score_labels = metric(y_true, y_pred, labels=labels, average=None) + score = metric(y_true, y_pred, average=None) + assert_array_equal(score_labels, score[inverse_labels]) + + +@pytest.mark.parametrize( + "name", sorted(MULTILABELS_METRICS - {"unnormalized_multilabel_confusion_matrix"}) +) +def test_multilabel_label_permutations_invariance(name): + random_state = check_random_state(0) + n_samples, n_classes = 20, 4 + + y_true = random_state.randint(0, 2, size=(n_samples, n_classes)) + y_score = random_state.randint(0, 2, size=(n_samples, n_classes)) + + metric = ALL_METRICS[name] + score = metric(y_true, y_score) + + for perm in permutations(range(n_classes), n_classes): + y_score_perm = y_score[:, perm] + y_true_perm = y_true[:, perm] + + current_score = metric(y_true_perm, y_score_perm) + assert_almost_equal(score, current_score) + + +@pytest.mark.parametrize( + "name", sorted(THRESHOLDED_MULTILABEL_METRICS | MULTIOUTPUT_METRICS) +) +def test_thresholded_multilabel_multioutput_permutations_invariance(name): + random_state = check_random_state(0) + n_samples, n_classes = 20, 4 + y_true = random_state.randint(0, 2, size=(n_samples, n_classes)) + y_score = random_state.uniform(size=y_true.shape) + + # Some metrics (e.g. 
log_loss) require y_score to be probabilities (sum to 1) + y_score /= y_score.sum(axis=1, keepdims=True) + + # Makes sure all samples have at least one label. This works around errors + # when running metrics where average="sample" + y_true[y_true.sum(1) == 4, 0] = 0 + y_true[y_true.sum(1) == 0, 0] = 1 + + metric = ALL_METRICS[name] + score = metric(y_true, y_score) + + for perm in permutations(range(n_classes), n_classes): + y_score_perm = y_score[:, perm] + y_true_perm = y_true[:, perm] + + current_score = metric(y_true_perm, y_score_perm) + if metric == mean_absolute_percentage_error: + assert np.isfinite(current_score) + assert current_score > 1e6 + # Here we are not comparing the values in case of MAPE because + # whenever y_true value is exactly zero, the MAPE value doesn't + # signify anything. Thus, in this case we are just expecting + # very large finite value. + else: + assert_almost_equal(score, current_score) + + +@pytest.mark.parametrize( + "name", sorted(set(THRESHOLDED_METRICS) - METRIC_UNDEFINED_BINARY_MULTICLASS) +) +def test_thresholded_metric_permutation_invariance(name): + n_samples, n_classes = 100, 3 + random_state = check_random_state(0) + + y_score = random_state.rand(n_samples, n_classes) + temp = np.exp(-y_score) + y_score = temp / temp.sum(axis=-1).reshape(-1, 1) + y_true = random_state.randint(0, n_classes, size=n_samples) + + metric = ALL_METRICS[name] + score = metric(y_true, y_score) + for perm in permutations(range(n_classes), n_classes): + inverse_perm = np.zeros(n_classes, dtype=int) + inverse_perm[list(perm)] = np.arange(n_classes) + y_score_perm = y_score[:, inverse_perm] + y_true_perm = np.take(perm, y_true) + + current_score = metric(y_true_perm, y_score_perm) + assert_almost_equal(score, current_score) + + +@pytest.mark.parametrize("metric_name", CLASSIFICATION_METRICS) +def test_metrics_consistent_type_error(metric_name): + # check that an understable message is raised when the type between y_true + # and y_pred mismatch + rng = np.random.RandomState(42) + y1 = np.array(["spam"] * 3 + ["eggs"] * 2, dtype=object) + y2 = rng.randint(0, 2, size=y1.size) + + err_msg = "Labels in y_true and y_pred should be of the same type." + with pytest.raises(TypeError, match=err_msg): + CLASSIFICATION_METRICS[metric_name](y1, y2) + + +@pytest.mark.parametrize( + "metric, y_pred_threshold", + [ + (average_precision_score, True), + (brier_score_loss, True), + (f1_score, False), + (partial(fbeta_score, beta=1), False), + (jaccard_score, False), + (precision_recall_curve, True), + (precision_score, False), + (recall_score, False), + (roc_curve, True), + ], +) +@pytest.mark.parametrize("dtype_y_str", [str, object]) +def test_metrics_pos_label_error_str(metric, y_pred_threshold, dtype_y_str): + # check that the error message if `pos_label` is not specified and the + # targets is made of strings. + rng = np.random.RandomState(42) + y1 = np.array(["spam"] * 3 + ["eggs"] * 2, dtype=dtype_y_str) + y2 = rng.randint(0, 2, size=y1.size) + + if not y_pred_threshold: + y2 = np.array(["spam", "eggs"], dtype=dtype_y_str)[y2] + + err_msg_pos_label_None = ( + "y_true takes value in {'eggs', 'spam'} and pos_label is not " + "specified: either make y_true take value in {0, 1} or {-1, 1} or " + "pass pos_label explicit" + ) + err_msg_pos_label_1 = ( + r"pos_label=1 is not a valid label. 
It should be one of \['eggs', 'spam'\]" + ) + + pos_label_default = signature(metric).parameters["pos_label"].default + + err_msg = err_msg_pos_label_1 if pos_label_default == 1 else err_msg_pos_label_None + with pytest.raises(ValueError, match=err_msg): + metric(y1, y2) + + +def check_array_api_metric( + metric, array_namespace, device, dtype_name, a_np, b_np, **metric_kwargs +): + xp = _array_api_for_tests(array_namespace, device) + + a_xp = xp.asarray(a_np, device=device) + b_xp = xp.asarray(b_np, device=device) + + metric_np = metric(a_np, b_np, **metric_kwargs) + + if metric_kwargs.get("sample_weight") is not None: + metric_kwargs["sample_weight"] = xp.asarray( + metric_kwargs["sample_weight"], device=device + ) + + multioutput = metric_kwargs.get("multioutput") + if isinstance(multioutput, np.ndarray): + metric_kwargs["multioutput"] = xp.asarray(multioutput, device=device) + + # When array API dispatch is disabled, and np.asarray works (for example PyTorch + # with CPU device), calling the metric function with such numpy compatible inputs + # should work (albeit by implicitly converting to numpy arrays instead of + # dispatching to the array library). + try: + np.asarray(a_xp) + np.asarray(b_xp) + numpy_as_array_works = True + except (TypeError, RuntimeError, ValueError): + # PyTorch with CUDA device and CuPy raise TypeError consistently. + # array-api-strict chose to raise RuntimeError instead. NumPy raises + # a ValueError if the `__array__` dunder does not return an array. + # Exception type may need to be updated in the future for other libraries. + numpy_as_array_works = False + + if numpy_as_array_works: + metric_xp = metric(a_xp, b_xp, **metric_kwargs) + assert_allclose( + metric_xp, + metric_np, + atol=_atol_for_type(dtype_name), + ) + metric_xp_mixed_1 = metric(a_np, b_xp, **metric_kwargs) + assert_allclose( + metric_xp_mixed_1, + metric_np, + atol=_atol_for_type(dtype_name), + ) + metric_xp_mixed_2 = metric(a_xp, b_np, **metric_kwargs) + assert_allclose( + metric_xp_mixed_2, + metric_np, + atol=_atol_for_type(dtype_name), + ) + + with config_context(array_api_dispatch=True): + metric_xp = metric(a_xp, b_xp, **metric_kwargs) + + assert_allclose( + _convert_to_numpy(xp.asarray(metric_xp), xp), + metric_np, + atol=_atol_for_type(dtype_name), + ) + + +def check_array_api_binary_classification_metric( + metric, array_namespace, device, dtype_name +): + y_true_np = np.array([0, 0, 1, 1]) + y_pred_np = np.array([0, 1, 0, 1]) + + check_array_api_metric( + metric, + array_namespace, + device, + dtype_name, + a_np=y_true_np, + b_np=y_pred_np, + sample_weight=None, + ) + + sample_weight = np.array([0.0, 0.1, 2.0, 1.0], dtype=dtype_name) + + check_array_api_metric( + metric, + array_namespace, + device, + dtype_name, + a_np=y_true_np, + b_np=y_pred_np, + sample_weight=sample_weight, + ) + + +def check_array_api_multiclass_classification_metric( + metric, array_namespace, device, dtype_name +): + y_true_np = np.array([0, 1, 2, 3]) + y_pred_np = np.array([0, 1, 0, 2]) + + additional_params = { + "average": ("micro", "macro", "weighted"), + "beta": (0.2, 0.5, 0.8), + } + metric_kwargs_combinations = _get_metric_kwargs_for_array_api_testing( + metric=metric, + params=additional_params, + ) + for metric_kwargs in metric_kwargs_combinations: + check_array_api_metric( + metric, + array_namespace, + device, + dtype_name, + a_np=y_true_np, + b_np=y_pred_np, + sample_weight=None, + **metric_kwargs, + ) + + sample_weight = np.array([0.0, 0.1, 2.0, 1.0], dtype=dtype_name) + + 
check_array_api_metric( + metric, + array_namespace, + device, + dtype_name, + a_np=y_true_np, + b_np=y_pred_np, + sample_weight=sample_weight, + **metric_kwargs, + ) + + +def check_array_api_multilabel_classification_metric( + metric, array_namespace, device, dtype_name +): + y_true_np = np.array([[1, 1], [0, 1], [0, 0]], dtype=dtype_name) + y_pred_np = np.array([[1, 1], [1, 1], [1, 1]], dtype=dtype_name) + + additional_params = { + "average": ("micro", "macro", "weighted"), + "beta": (0.2, 0.5, 0.8), + } + metric_kwargs_combinations = _get_metric_kwargs_for_array_api_testing( + metric=metric, + params=additional_params, + ) + for metric_kwargs in metric_kwargs_combinations: + check_array_api_metric( + metric, + array_namespace, + device, + dtype_name, + a_np=y_true_np, + b_np=y_pred_np, + sample_weight=None, + **metric_kwargs, + ) + + sample_weight = np.array([0.0, 0.1, 2.0], dtype=dtype_name) + + check_array_api_metric( + metric, + array_namespace, + device, + dtype_name, + a_np=y_true_np, + b_np=y_pred_np, + sample_weight=sample_weight, + **metric_kwargs, + ) + + +def check_array_api_regression_metric(metric, array_namespace, device, dtype_name): + func_name = metric.func.__name__ if isinstance(metric, partial) else metric.__name__ + if func_name == "mean_poisson_deviance" and sp_version < parse_version("1.14.0"): + pytest.skip( + "mean_poisson_deviance's dependency `xlogy` is available as of scipy 1.14.0" + ) + + y_true_np = np.array([2.0, 0.1, 1.0, 4.0], dtype=dtype_name) + y_pred_np = np.array([0.5, 0.5, 2, 2], dtype=dtype_name) + + metric_kwargs = {} + metric_params = signature(metric).parameters + + if "sample_weight" in metric_params: + metric_kwargs["sample_weight"] = None + + check_array_api_metric( + metric, + array_namespace, + device, + dtype_name, + a_np=y_true_np, + b_np=y_pred_np, + **metric_kwargs, + ) + + if "sample_weight" in metric_params: + metric_kwargs["sample_weight"] = np.array( + [0.1, 2.0, 1.5, 0.5], dtype=dtype_name + ) + + check_array_api_metric( + metric, + array_namespace, + device, + dtype_name, + a_np=y_true_np, + b_np=y_pred_np, + **metric_kwargs, + ) + + +def check_array_api_regression_metric_multioutput( + metric, array_namespace, device, dtype_name +): + y_true_np = np.array([[1, 3, 2], [1, 2, 2]], dtype=dtype_name) + y_pred_np = np.array([[1, 4, 4], [1, 1, 1]], dtype=dtype_name) + + check_array_api_metric( + metric, + array_namespace, + device, + dtype_name, + a_np=y_true_np, + b_np=y_pred_np, + sample_weight=None, + ) + + sample_weight = np.array([0.1, 2.0], dtype=dtype_name) + + check_array_api_metric( + metric, + array_namespace, + device, + dtype_name, + a_np=y_true_np, + b_np=y_pred_np, + sample_weight=sample_weight, + ) + + check_array_api_metric( + metric, + array_namespace, + device, + dtype_name, + a_np=y_true_np, + b_np=y_pred_np, + multioutput=np.array([0.1, 0.3, 0.7], dtype=dtype_name), + ) + + check_array_api_metric( + metric, + array_namespace, + device, + dtype_name, + a_np=y_true_np, + b_np=y_pred_np, + multioutput="raw_values", + ) + + +def check_array_api_metric_pairwise(metric, array_namespace, device, dtype_name): + X_np = np.array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]], dtype=dtype_name) + Y_np = np.array([[0.2, 0.3, 0.4], [0.5, 0.6, 0.7]], dtype=dtype_name) + + metric_kwargs = {} + if "dense_output" in signature(metric).parameters: + metric_kwargs["dense_output"] = False + check_array_api_metric( + metric, + array_namespace, + device, + dtype_name, + a_np=X_np, + b_np=Y_np, + **metric_kwargs, + ) + metric_kwargs["dense_output"] = 
True + + check_array_api_metric( + metric, + array_namespace, + device, + dtype_name, + a_np=X_np, + b_np=Y_np, + **metric_kwargs, + ) + + +array_api_metric_checkers = { + accuracy_score: [ + check_array_api_binary_classification_metric, + check_array_api_multiclass_classification_metric, + check_array_api_multilabel_classification_metric, + ], + f1_score: [ + check_array_api_binary_classification_metric, + check_array_api_multiclass_classification_metric, + check_array_api_multilabel_classification_metric, + ], + fbeta_score: [ + check_array_api_multiclass_classification_metric, + check_array_api_multilabel_classification_metric, + ], + jaccard_score: [ + check_array_api_binary_classification_metric, + check_array_api_multiclass_classification_metric, + check_array_api_multilabel_classification_metric, + ], + multilabel_confusion_matrix: [ + check_array_api_binary_classification_metric, + check_array_api_multiclass_classification_metric, + check_array_api_multilabel_classification_metric, + ], + precision_score: [ + check_array_api_binary_classification_metric, + check_array_api_multiclass_classification_metric, + check_array_api_multilabel_classification_metric, + ], + recall_score: [ + check_array_api_binary_classification_metric, + check_array_api_multiclass_classification_metric, + check_array_api_multilabel_classification_metric, + ], + zero_one_loss: [ + check_array_api_binary_classification_metric, + check_array_api_multiclass_classification_metric, + check_array_api_multilabel_classification_metric, + ], + hamming_loss: [ + check_array_api_binary_classification_metric, + check_array_api_multiclass_classification_metric, + check_array_api_multilabel_classification_metric, + ], + mean_tweedie_deviance: [check_array_api_regression_metric], + partial(mean_tweedie_deviance, power=-0.5): [check_array_api_regression_metric], + partial(mean_tweedie_deviance, power=1.5): [check_array_api_regression_metric], + r2_score: [ + check_array_api_regression_metric, + check_array_api_regression_metric_multioutput, + ], + cosine_similarity: [check_array_api_metric_pairwise], + explained_variance_score: [ + check_array_api_regression_metric, + check_array_api_regression_metric_multioutput, + ], + mean_absolute_error: [ + check_array_api_regression_metric, + check_array_api_regression_metric_multioutput, + ], + mean_pinball_loss: [ + check_array_api_regression_metric, + check_array_api_regression_metric_multioutput, + ], + mean_squared_error: [ + check_array_api_regression_metric, + check_array_api_regression_metric_multioutput, + ], + mean_squared_log_error: [ + check_array_api_regression_metric, + check_array_api_regression_metric_multioutput, + ], + d2_tweedie_score: [ + check_array_api_regression_metric, + ], + paired_cosine_distances: [check_array_api_metric_pairwise], + mean_poisson_deviance: [check_array_api_regression_metric], + additive_chi2_kernel: [check_array_api_metric_pairwise], + mean_gamma_deviance: [check_array_api_regression_metric], + max_error: [check_array_api_regression_metric], + mean_absolute_percentage_error: [ + check_array_api_regression_metric, + check_array_api_regression_metric_multioutput, + ], + chi2_kernel: [check_array_api_metric_pairwise], + paired_euclidean_distances: [check_array_api_metric_pairwise], + cosine_distances: [check_array_api_metric_pairwise], + euclidean_distances: [check_array_api_metric_pairwise], + linear_kernel: [check_array_api_metric_pairwise], + polynomial_kernel: [check_array_api_metric_pairwise], + rbf_kernel: 
[check_array_api_metric_pairwise], + root_mean_squared_error: [ + check_array_api_regression_metric, + check_array_api_regression_metric_multioutput, + ], + root_mean_squared_log_error: [ + check_array_api_regression_metric, + check_array_api_regression_metric_multioutput, + ], + sigmoid_kernel: [check_array_api_metric_pairwise], +} + + +def yield_metric_checker_combinations(metric_checkers=array_api_metric_checkers): + for metric, checkers in metric_checkers.items(): + for checker in checkers: + yield metric, checker + + +@pytest.mark.parametrize( + "array_namespace, device, dtype_name", + yield_namespace_device_dtype_combinations(), + ids=_get_namespace_device_dtype_ids, +) +@pytest.mark.parametrize("metric, check_func", yield_metric_checker_combinations()) +def test_array_api_compliance(metric, array_namespace, device, dtype_name, check_func): + check_func(metric, array_namespace, device, dtype_name) + + +@pytest.mark.parametrize("df_lib_name", ["pandas", "polars"]) +@pytest.mark.parametrize("metric_name", sorted(ALL_METRICS)) +def test_metrics_dataframe_series(metric_name, df_lib_name): + df_lib = pytest.importorskip(df_lib_name) + + y_pred = df_lib.Series([0.0, 1.0, 0, 1.0]) + y_true = df_lib.Series([1.0, 0.0, 0.0, 0.0]) + + metric = ALL_METRICS[metric_name] + try: + expected_metric = metric(y_pred.to_numpy(), y_true.to_numpy()) + except ValueError: + pytest.skip(f"{metric_name} can not deal with 1d inputs") + + assert_allclose(metric(y_pred, y_true), expected_metric) + + +def _get_metric_kwargs_for_array_api_testing(metric, params): + """Helper function to enable specifying a variety of additional params and + their corresponding values, so that they can be passed to a metric function + when testing for array api compliance.""" + metric_kwargs_combinations = [{}] + for param, values in params.items(): + if param not in signature(metric).parameters: + continue + + new_combinations = [] + for kwargs in metric_kwargs_combinations: + for value in values: + new_kwargs = kwargs.copy() + new_kwargs[param] = value + new_combinations.append(new_kwargs) + + metric_kwargs_combinations = new_combinations + + return metric_kwargs_combinations + + +@pytest.mark.parametrize("name", sorted(ALL_METRICS)) +def test_returned_value_consistency(name): + """Ensure that the returned values of all metrics are consistent. + + It can either be a float, a numpy array, or a tuple of floats or numpy arrays. + It should not be a numpy float64 or float32. 
+ """ + + rng = np.random.RandomState(0) + y_true = rng.randint(0, 2, size=(20,)) + y_pred = rng.randint(0, 2, size=(20,)) + + if name in METRICS_REQUIRE_POSITIVE_Y: + y_true, y_pred = _require_positive_targets(y_true, y_pred) + + if name in METRIC_UNDEFINED_BINARY: + y_true = rng.randint(0, 2, size=(20, 3)) + y_pred = rng.randint(0, 2, size=(20, 3)) + + metric = ALL_METRICS[name] + score = metric(y_true, y_pred) + + assert isinstance(score, (float, np.ndarray, tuple)) + assert not isinstance(score, (np.float64, np.float32)) + + if isinstance(score, tuple): + assert all(isinstance(v, float) for v in score) or all( + isinstance(v, np.ndarray) for v in score + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_dist_metrics.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_dist_metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..f93d3b984bdb7c218d0517ca9e6c21ec930f96fc --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_dist_metrics.py @@ -0,0 +1,431 @@ +import copy +import itertools +import pickle + +import numpy as np +import pytest +from scipy.spatial.distance import cdist + +from sklearn.metrics import DistanceMetric +from sklearn.metrics._dist_metrics import ( + BOOL_METRICS, + DEPRECATED_METRICS, + DistanceMetric32, + DistanceMetric64, +) +from sklearn.utils import check_random_state +from sklearn.utils._testing import ( + assert_allclose, + create_memmap_backed_data, + ignore_warnings, +) +from sklearn.utils.fixes import CSR_CONTAINERS + + +def dist_func(x1, x2, p): + return np.sum((x1 - x2) ** p) ** (1.0 / p) + + +rng = check_random_state(0) +d = 4 +n1 = 20 +n2 = 25 +X64 = rng.random_sample((n1, d)) +Y64 = rng.random_sample((n2, d)) +X32 = X64.astype("float32") +Y32 = Y64.astype("float32") + +[X_mmap, Y_mmap] = create_memmap_backed_data([X64, Y64]) + +# make boolean arrays: ones and zeros +X_bool = (X64 < 0.3).astype(np.float64) # quite sparse +Y_bool = (Y64 < 0.7).astype(np.float64) # not too sparse + +[X_bool_mmap, Y_bool_mmap] = create_memmap_backed_data([X_bool, Y_bool]) + + +V = rng.random_sample((d, d)) +VI = np.dot(V, V.T) + +METRICS_DEFAULT_PARAMS = [ + ("euclidean", {}), + ("cityblock", {}), + ("minkowski", dict(p=(0.5, 1, 1.5, 2, 3))), + ("chebyshev", {}), + ("seuclidean", dict(V=(rng.random_sample(d),))), + ("mahalanobis", dict(VI=(VI,))), + ("hamming", {}), + ("canberra", {}), + ("braycurtis", {}), + ("minkowski", dict(p=(0.5, 1, 1.5, 3), w=(rng.random_sample(d),))), +] + + +@pytest.mark.parametrize( + "metric_param_grid", METRICS_DEFAULT_PARAMS, ids=lambda params: params[0] +) +@pytest.mark.parametrize("X, Y", [(X64, Y64), (X32, Y32), (X_mmap, Y_mmap)]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_cdist(metric_param_grid, X, Y, csr_container): + metric, param_grid = metric_param_grid + keys = param_grid.keys() + X_csr, Y_csr = csr_container(X), csr_container(Y) + for vals in itertools.product(*param_grid.values()): + kwargs = dict(zip(keys, vals)) + rtol_dict = {} + if metric == "mahalanobis" and X.dtype == np.float32: + # Computation of mahalanobis differs between + # the scipy and scikit-learn implementation. + # Hence, we increase the relative tolerance. 
+ # TODO: Inspect slight numerical discrepancy + # with scipy + rtol_dict = {"rtol": 1e-6} + + D_scipy_cdist = cdist(X, Y, metric, **kwargs) + + dm = DistanceMetric.get_metric(metric, X.dtype, **kwargs) + + # DistanceMetric.pairwise must be consistent for all + # combinations of formats in {sparse, dense}. + D_sklearn = dm.pairwise(X, Y) + assert D_sklearn.flags.c_contiguous + assert_allclose(D_sklearn, D_scipy_cdist, **rtol_dict) + + D_sklearn = dm.pairwise(X_csr, Y_csr) + assert D_sklearn.flags.c_contiguous + assert_allclose(D_sklearn, D_scipy_cdist, **rtol_dict) + + D_sklearn = dm.pairwise(X_csr, Y) + assert D_sklearn.flags.c_contiguous + assert_allclose(D_sklearn, D_scipy_cdist, **rtol_dict) + + D_sklearn = dm.pairwise(X, Y_csr) + assert D_sklearn.flags.c_contiguous + assert_allclose(D_sklearn, D_scipy_cdist, **rtol_dict) + + +@pytest.mark.parametrize("metric", BOOL_METRICS) +@pytest.mark.parametrize( + "X_bool, Y_bool", [(X_bool, Y_bool), (X_bool_mmap, Y_bool_mmap)] +) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_cdist_bool_metric(metric, X_bool, Y_bool, csr_container): + if metric in DEPRECATED_METRICS: + with ignore_warnings(category=DeprecationWarning): + # Some metrics can be deprecated depending on the scipy version. + # But if they are present, we still want to test whether + # scikit-learn gives the same result, whether or not they are + # deprecated. + D_scipy_cdist = cdist(X_bool, Y_bool, metric) + else: + D_scipy_cdist = cdist(X_bool, Y_bool, metric) + + dm = DistanceMetric.get_metric(metric) + D_sklearn = dm.pairwise(X_bool, Y_bool) + assert_allclose(D_sklearn, D_scipy_cdist) + + # DistanceMetric.pairwise must be consistent + # on all combinations of format in {sparse, dense}². + X_bool_csr, Y_bool_csr = csr_container(X_bool), csr_container(Y_bool) + + D_sklearn = dm.pairwise(X_bool, Y_bool) + assert D_sklearn.flags.c_contiguous + assert_allclose(D_sklearn, D_scipy_cdist) + + D_sklearn = dm.pairwise(X_bool_csr, Y_bool_csr) + assert D_sklearn.flags.c_contiguous + assert_allclose(D_sklearn, D_scipy_cdist) + + D_sklearn = dm.pairwise(X_bool, Y_bool_csr) + assert D_sklearn.flags.c_contiguous + assert_allclose(D_sklearn, D_scipy_cdist) + + D_sklearn = dm.pairwise(X_bool_csr, Y_bool) + assert D_sklearn.flags.c_contiguous + assert_allclose(D_sklearn, D_scipy_cdist) + + +@pytest.mark.parametrize( + "metric_param_grid", METRICS_DEFAULT_PARAMS, ids=lambda params: params[0] +) +@pytest.mark.parametrize("X", [X64, X32, X_mmap]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_pdist(metric_param_grid, X, csr_container): + metric, param_grid = metric_param_grid + keys = param_grid.keys() + X_csr = csr_container(X) + for vals in itertools.product(*param_grid.values()): + kwargs = dict(zip(keys, vals)) + rtol_dict = {} + if metric == "mahalanobis" and X.dtype == np.float32: + # Computation of mahalanobis differs between + # the scipy and scikit-learn implementation. + # Hence, we increase the relative tolerance. 
+ # TODO: Inspect slight numerical discrepancy + # with scipy + rtol_dict = {"rtol": 1e-6} + + D_scipy_pdist = cdist(X, X, metric, **kwargs) + + dm = DistanceMetric.get_metric(metric, X.dtype, **kwargs) + D_sklearn = dm.pairwise(X) + assert D_sklearn.flags.c_contiguous + assert_allclose(D_sklearn, D_scipy_pdist, **rtol_dict) + + D_sklearn_csr = dm.pairwise(X_csr) + assert D_sklearn.flags.c_contiguous + assert_allclose(D_sklearn_csr, D_scipy_pdist, **rtol_dict) + + D_sklearn_csr = dm.pairwise(X_csr, X_csr) + assert D_sklearn.flags.c_contiguous + assert_allclose(D_sklearn_csr, D_scipy_pdist, **rtol_dict) + + +@pytest.mark.parametrize( + "metric_param_grid", METRICS_DEFAULT_PARAMS, ids=lambda params: params[0] +) +def test_distance_metrics_dtype_consistency(metric_param_grid): + # DistanceMetric must return similar distances for both float32 and float64 + # input data. + metric, param_grid = metric_param_grid + keys = param_grid.keys() + + # Choose rtol to make sure that this test is robust to changes in the random + # seed in the module-level test data generation code. + rtol = 1e-5 + + for vals in itertools.product(*param_grid.values()): + kwargs = dict(zip(keys, vals)) + dm64 = DistanceMetric.get_metric(metric, np.float64, **kwargs) + dm32 = DistanceMetric.get_metric(metric, np.float32, **kwargs) + + D64 = dm64.pairwise(X64) + D32 = dm32.pairwise(X32) + + assert D64.dtype == np.float64 + assert D32.dtype == np.float32 + + # assert_allclose introspects the dtype of the input arrays to decide + # which rtol value to use by default but in this case we know that D32 + # is not computed with the same precision so we set rtol manually. + assert_allclose(D64, D32, rtol=rtol) + + D64 = dm64.pairwise(X64, Y64) + D32 = dm32.pairwise(X32, Y32) + assert_allclose(D64, D32, rtol=rtol) + + +@pytest.mark.parametrize("metric", BOOL_METRICS) +@pytest.mark.parametrize("X_bool", [X_bool, X_bool_mmap]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_pdist_bool_metrics(metric, X_bool, csr_container): + if metric in DEPRECATED_METRICS: + with ignore_warnings(category=DeprecationWarning): + # Some metrics can be deprecated depending on the scipy version. + # But if they are present, we still want to test whether + # scikit-learn gives the same result, whether or not they are + # deprecated. 
+ D_scipy_pdist = cdist(X_bool, X_bool, metric) + else: + D_scipy_pdist = cdist(X_bool, X_bool, metric) + + dm = DistanceMetric.get_metric(metric) + D_sklearn = dm.pairwise(X_bool) + assert_allclose(D_sklearn, D_scipy_pdist) + + X_bool_csr = csr_container(X_bool) + D_sklearn = dm.pairwise(X_bool_csr) + assert_allclose(D_sklearn, D_scipy_pdist) + + +@pytest.mark.parametrize("writable_kwargs", [True, False]) +@pytest.mark.parametrize( + "metric_param_grid", METRICS_DEFAULT_PARAMS, ids=lambda params: params[0] +) +@pytest.mark.parametrize("X", [X64, X32]) +def test_pickle(writable_kwargs, metric_param_grid, X): + metric, param_grid = metric_param_grid + keys = param_grid.keys() + for vals in itertools.product(*param_grid.values()): + if any(isinstance(val, np.ndarray) for val in vals): + vals = copy.deepcopy(vals) + for val in vals: + if isinstance(val, np.ndarray): + val.setflags(write=writable_kwargs) + kwargs = dict(zip(keys, vals)) + dm = DistanceMetric.get_metric(metric, X.dtype, **kwargs) + D1 = dm.pairwise(X) + dm2 = pickle.loads(pickle.dumps(dm)) + D2 = dm2.pairwise(X) + assert_allclose(D1, D2) + + +@pytest.mark.parametrize("metric", BOOL_METRICS) +@pytest.mark.parametrize("X_bool", [X_bool, X_bool_mmap]) +def test_pickle_bool_metrics(metric, X_bool): + dm = DistanceMetric.get_metric(metric) + D1 = dm.pairwise(X_bool) + dm2 = pickle.loads(pickle.dumps(dm)) + D2 = dm2.pairwise(X_bool) + assert_allclose(D1, D2) + + +@pytest.mark.parametrize("X, Y", [(X64, Y64), (X32, Y32), (X_mmap, Y_mmap)]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_haversine_metric(X, Y, csr_container): + # The Haversine DistanceMetric only works on 2 features. + X = np.asarray(X[:, :2]) + Y = np.asarray(Y[:, :2]) + + X_csr, Y_csr = csr_container(X), csr_container(Y) + + # Haversine is not supported by scipy.special.distance.{cdist,pdist} + # So we reimplement it to have a reference. 
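+ # The reference implemented below is the standard haversine formula for
+ # points given as (latitude, longitude) in radians:
+ #   d = 2 * arcsin(sqrt(sin((lat1 - lat2) / 2) ** 2
+ #                       + cos(lat1) * cos(lat2) * sin((lon1 - lon2) / 2) ** 2))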
+ def haversine_slow(x1, x2): + return 2 * np.arcsin( + np.sqrt( + np.sin(0.5 * (x1[0] - x2[0])) ** 2 + + np.cos(x1[0]) * np.cos(x2[0]) * np.sin(0.5 * (x1[1] - x2[1])) ** 2 + ) + ) + + D_reference = np.zeros((X_csr.shape[0], Y_csr.shape[0])) + for i, xi in enumerate(X): + for j, yj in enumerate(Y): + D_reference[i, j] = haversine_slow(xi, yj) + + haversine = DistanceMetric.get_metric("haversine", X.dtype) + + D_sklearn = haversine.pairwise(X, Y) + assert_allclose( + haversine.dist_to_rdist(D_sklearn), np.sin(0.5 * D_reference) ** 2, rtol=1e-6 + ) + + assert_allclose(D_sklearn, D_reference) + + D_sklearn = haversine.pairwise(X_csr, Y_csr) + assert D_sklearn.flags.c_contiguous + assert_allclose(D_sklearn, D_reference) + + D_sklearn = haversine.pairwise(X_csr, Y) + assert D_sklearn.flags.c_contiguous + assert_allclose(D_sklearn, D_reference) + + D_sklearn = haversine.pairwise(X, Y_csr) + assert D_sklearn.flags.c_contiguous + assert_allclose(D_sklearn, D_reference) + + +def test_pyfunc_metric(): + X = np.random.random((10, 3)) + + euclidean = DistanceMetric.get_metric("euclidean") + pyfunc = DistanceMetric.get_metric("pyfunc", func=dist_func, p=2) + + # Check if both callable metric and predefined metric initialized + # DistanceMetric object is picklable + euclidean_pkl = pickle.loads(pickle.dumps(euclidean)) + pyfunc_pkl = pickle.loads(pickle.dumps(pyfunc)) + + D1 = euclidean.pairwise(X) + D2 = pyfunc.pairwise(X) + + D1_pkl = euclidean_pkl.pairwise(X) + D2_pkl = pyfunc_pkl.pairwise(X) + + assert_allclose(D1, D2) + assert_allclose(D1_pkl, D2_pkl) + + +def test_input_data_size(): + # Regression test for #6288 + # Previously, a metric requiring a particular input dimension would fail + def custom_metric(x, y): + assert x.shape[0] == 3 + return np.sum((x - y) ** 2) + + rng = check_random_state(0) + X = rng.rand(10, 3) + + pyfunc = DistanceMetric.get_metric("pyfunc", func=custom_metric) + eucl = DistanceMetric.get_metric("euclidean") + assert_allclose(pyfunc.pairwise(X), eucl.pairwise(X) ** 2) + + +def test_readonly_kwargs(): + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/21685 + + rng = check_random_state(0) + + weights = rng.rand(100) + VI = rng.rand(10, 10) + weights.setflags(write=False) + VI.setflags(write=False) + + # Those distances metrics have to support readonly buffers. + DistanceMetric.get_metric("seuclidean", V=weights) + DistanceMetric.get_metric("mahalanobis", VI=VI) + + +@pytest.mark.parametrize( + "w, err_type, err_msg", + [ + (np.array([1, 1.5, -13]), ValueError, "w cannot contain negative weights"), + (np.array([1, 1.5, np.nan]), ValueError, "w contains NaN"), + *[ + ( + csr_container([[1, 1.5, 1]]), + TypeError, + "Sparse data was passed for w, but dense data is required", + ) + for csr_container in CSR_CONTAINERS + ], + (np.array(["a", "b", "c"]), ValueError, "could not convert string to float"), + (np.array([]), ValueError, "a minimum of 1 is required"), + ], +) +def test_minkowski_metric_validate_weights_values(w, err_type, err_msg): + with pytest.raises(err_type, match=err_msg): + DistanceMetric.get_metric("minkowski", p=3, w=w) + + +def test_minkowski_metric_validate_weights_size(): + w2 = rng.random_sample(d + 1) + dm = DistanceMetric.get_metric("minkowski", p=3, w=w2) + msg = ( + "MinkowskiDistance: the size of w must match " + f"the number of features \\({X64.shape[1]}\\). " + f"Currently len\\(w\\)={w2.shape[0]}." 
+ ) + with pytest.raises(ValueError, match=msg): + dm.pairwise(X64, Y64) + + +@pytest.mark.parametrize("metric, metric_kwargs", METRICS_DEFAULT_PARAMS) +@pytest.mark.parametrize("dtype", (np.float32, np.float64)) +def test_get_metric_dtype(metric, metric_kwargs, dtype): + specialized_cls = { + np.float32: DistanceMetric32, + np.float64: DistanceMetric64, + }[dtype] + + # We don't need the entire grid, just one for a sanity check + metric_kwargs = {k: v[0] for k, v in metric_kwargs.items()} + generic_type = type(DistanceMetric.get_metric(metric, dtype, **metric_kwargs)) + specialized_type = type(specialized_cls.get_metric(metric, **metric_kwargs)) + + assert generic_type is specialized_type + + +def test_get_metric_bad_dtype(): + dtype = np.int32 + msg = r"Unexpected dtype .* provided. Please select a dtype from" + with pytest.raises(ValueError, match=msg): + DistanceMetric.get_metric("manhattan", dtype) + + +def test_minkowski_metric_validate_bad_p_parameter(): + msg = "p must be greater than 0" + with pytest.raises(ValueError, match=msg): + DistanceMetric.get_metric("minkowski", p=0) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_pairwise.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_pairwise.py new file mode 100644 index 0000000000000000000000000000000000000000..4c1ba4b2f7d5280235ed2038ac2bd933db4b701d --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_pairwise.py @@ -0,0 +1,1683 @@ +import warnings +from types import GeneratorType + +import numpy as np +import pytest +from numpy import linalg +from scipy.sparse import issparse +from scipy.spatial.distance import ( + cdist, + cityblock, + cosine, + minkowski, + pdist, + squareform, +) + +from sklearn import config_context +from sklearn.exceptions import DataConversionWarning +from sklearn.metrics.pairwise import ( + PAIRED_DISTANCES, + PAIRWISE_BOOLEAN_FUNCTIONS, + PAIRWISE_DISTANCE_FUNCTIONS, + PAIRWISE_KERNEL_FUNCTIONS, + _euclidean_distances_upcast, + additive_chi2_kernel, + check_paired_arrays, + check_pairwise_arrays, + chi2_kernel, + cosine_distances, + cosine_similarity, + euclidean_distances, + haversine_distances, + laplacian_kernel, + linear_kernel, + manhattan_distances, + nan_euclidean_distances, + paired_cosine_distances, + paired_distances, + paired_euclidean_distances, + paired_manhattan_distances, + pairwise_distances, + pairwise_distances_argmin, + pairwise_distances_argmin_min, + pairwise_distances_chunked, + pairwise_kernels, + polynomial_kernel, + rbf_kernel, + sigmoid_kernel, +) +from sklearn.preprocessing import normalize +from sklearn.utils._testing import ( + assert_allclose, + assert_almost_equal, + assert_array_equal, + ignore_warnings, +) +from sklearn.utils.fixes import ( + BSR_CONTAINERS, + COO_CONTAINERS, + CSC_CONTAINERS, + CSR_CONTAINERS, + DOK_CONTAINERS, +) +from sklearn.utils.parallel import Parallel, delayed + + +def test_pairwise_distances_for_dense_data(global_dtype): + # Test the pairwise_distance helper function. + rng = np.random.RandomState(0) + + # Euclidean distance should be equivalent to calling the function. + X = rng.random_sample((5, 4)).astype(global_dtype, copy=False) + S = pairwise_distances(X, metric="euclidean") + S2 = euclidean_distances(X) + assert_allclose(S, S2) + assert S.dtype == S2.dtype == global_dtype + + # Euclidean distance, with Y != X. 
+ Y = rng.random_sample((2, 4)).astype(global_dtype, copy=False) + S = pairwise_distances(X, Y, metric="euclidean") + S2 = euclidean_distances(X, Y) + assert_allclose(S, S2) + assert S.dtype == S2.dtype == global_dtype + + # Check to ensure NaNs work with pairwise_distances. + X_masked = rng.random_sample((5, 4)).astype(global_dtype, copy=False) + Y_masked = rng.random_sample((2, 4)).astype(global_dtype, copy=False) + X_masked[0, 0] = np.nan + Y_masked[0, 0] = np.nan + S_masked = pairwise_distances(X_masked, Y_masked, metric="nan_euclidean") + S2_masked = nan_euclidean_distances(X_masked, Y_masked) + assert_allclose(S_masked, S2_masked) + assert S_masked.dtype == S2_masked.dtype == global_dtype + + # Test with tuples as X and Y + X_tuples = tuple([tuple([v for v in row]) for row in X]) + Y_tuples = tuple([tuple([v for v in row]) for row in Y]) + S2 = pairwise_distances(X_tuples, Y_tuples, metric="euclidean") + assert_allclose(S, S2) + assert S.dtype == S2.dtype == global_dtype + + # Test haversine distance + # The data should be valid latitude and longitude + # haversine converts to float64 currently so we don't check dtypes. + X = rng.random_sample((5, 2)).astype(global_dtype, copy=False) + X[:, 0] = (X[:, 0] - 0.5) * 2 * np.pi / 2 + X[:, 1] = (X[:, 1] - 0.5) * 2 * np.pi + S = pairwise_distances(X, metric="haversine") + S2 = haversine_distances(X) + assert_allclose(S, S2) + + # Test haversine distance, with Y != X + Y = rng.random_sample((2, 2)).astype(global_dtype, copy=False) + Y[:, 0] = (Y[:, 0] - 0.5) * 2 * np.pi / 2 + Y[:, 1] = (Y[:, 1] - 0.5) * 2 * np.pi + S = pairwise_distances(X, Y, metric="haversine") + S2 = haversine_distances(X, Y) + assert_allclose(S, S2) + + # "cityblock" uses scikit-learn metric, cityblock (function) is + # scipy.spatial. + # The metric functions from scipy converts to float64 so we don't check the dtypes. + S = pairwise_distances(X, metric="cityblock") + S2 = pairwise_distances(X, metric=cityblock) + assert S.shape[0] == S.shape[1] + assert S.shape[0] == X.shape[0] + assert_allclose(S, S2) + + # The manhattan metric should be equivalent to cityblock. + S = pairwise_distances(X, Y, metric="manhattan") + S2 = pairwise_distances(X, Y, metric=cityblock) + assert S.shape[0] == X.shape[0] + assert S.shape[1] == Y.shape[0] + assert_allclose(S, S2) + + # Test cosine as a string metric versus cosine callable + # The string "cosine" uses sklearn.metric, + # while the function cosine is scipy.spatial + S = pairwise_distances(X, Y, metric="cosine") + S2 = pairwise_distances(X, Y, metric=cosine) + assert S.shape[0] == X.shape[0] + assert S.shape[1] == Y.shape[0] + assert_allclose(S, S2) + + +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +@pytest.mark.parametrize("bsr_container", BSR_CONTAINERS) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_pairwise_distances_for_sparse_data( + coo_container, csc_container, bsr_container, csr_container, global_dtype +): + # Test the pairwise_distance helper function. + rng = np.random.RandomState(0) + X = rng.random_sample((5, 4)).astype(global_dtype, copy=False) + Y = rng.random_sample((2, 4)).astype(global_dtype, copy=False) + + # Test with sparse X and Y, + # currently only supported for Euclidean, L1 and cosine. 
+ X_sparse = csr_container(X) + Y_sparse = csr_container(Y) + + S = pairwise_distances(X_sparse, Y_sparse, metric="euclidean") + S2 = euclidean_distances(X_sparse, Y_sparse) + assert_allclose(S, S2) + assert S.dtype == S2.dtype == global_dtype + + S = pairwise_distances(X_sparse, Y_sparse, metric="cosine") + S2 = cosine_distances(X_sparse, Y_sparse) + assert_allclose(S, S2) + assert S.dtype == S2.dtype == global_dtype + + S = pairwise_distances(X_sparse, csc_container(Y), metric="manhattan") + S2 = manhattan_distances(bsr_container(X), coo_container(Y)) + assert_allclose(S, S2) + if global_dtype == np.float64: + assert S.dtype == S2.dtype == global_dtype + else: + # TODO Fix manhattan_distances to preserve dtype. + # currently pairwise_distances uses manhattan_distances but converts the result + # back to the input dtype + with pytest.raises(AssertionError): + assert S.dtype == S2.dtype == global_dtype + + S2 = manhattan_distances(X, Y) + assert_allclose(S, S2) + if global_dtype == np.float64: + assert S.dtype == S2.dtype == global_dtype + else: + # TODO Fix manhattan_distances to preserve dtype. + # currently pairwise_distances uses manhattan_distances but converts the result + # back to the input dtype + with pytest.raises(AssertionError): + assert S.dtype == S2.dtype == global_dtype + + # Test with scipy.spatial.distance metric, with a kwd + kwds = {"p": 2.0} + S = pairwise_distances(X, Y, metric="minkowski", **kwds) + S2 = pairwise_distances(X, Y, metric=minkowski, **kwds) + assert_allclose(S, S2) + + # same with Y = None + kwds = {"p": 2.0} + S = pairwise_distances(X, metric="minkowski", **kwds) + S2 = pairwise_distances(X, metric=minkowski, **kwds) + assert_allclose(S, S2) + + # Test that scipy distance metrics throw an error if sparse matrix given + with pytest.raises(TypeError): + pairwise_distances(X_sparse, metric="minkowski") + with pytest.raises(TypeError): + pairwise_distances(X, Y_sparse, metric="minkowski") + + +# Some scipy metrics are deprecated (depending on the scipy version) but we +# still want to test them. 
+@ignore_warnings(category=DeprecationWarning) +@pytest.mark.parametrize("metric", PAIRWISE_BOOLEAN_FUNCTIONS) +def test_pairwise_boolean_distance(metric): + # test that we convert to boolean arrays for boolean distances + rng = np.random.RandomState(0) + X = rng.randn(5, 4) + Y = X.copy() + Y[0, 0] = 1 - Y[0, 0] + + # ignore conversion to boolean in pairwise_distances + with ignore_warnings(category=DataConversionWarning): + for Z in [Y, None]: + res = pairwise_distances(X, Z, metric=metric) + np.nan_to_num(res, nan=0, posinf=0, neginf=0, copy=False) + assert np.sum(res != 0) == 0 + + # non-boolean arrays are converted to boolean for boolean + # distance metrics with a data conversion warning + msg = "Data was converted to boolean for metric %s" % metric + with pytest.warns(DataConversionWarning, match=msg): + pairwise_distances(X, metric=metric) + + # Check that the warning is raised if X is boolean by Y is not boolean: + with pytest.warns(DataConversionWarning, match=msg): + pairwise_distances(X.astype(bool), Y=Y, metric=metric) + + # Check that no warning is raised if X is already boolean and Y is None: + with warnings.catch_warnings(): + warnings.simplefilter("error", DataConversionWarning) + pairwise_distances(X.astype(bool), metric=metric) + + +def test_no_data_conversion_warning(): + # No warnings issued if metric is not a boolean distance function + rng = np.random.RandomState(0) + X = rng.randn(5, 4) + with warnings.catch_warnings(): + warnings.simplefilter("error", DataConversionWarning) + pairwise_distances(X, metric="minkowski") + + +@pytest.mark.parametrize("func", [pairwise_distances, pairwise_kernels]) +def test_pairwise_precomputed(func): + # Test correct shape + with pytest.raises(ValueError, match=".* shape .*"): + func(np.zeros((5, 3)), metric="precomputed") + # with two args + with pytest.raises(ValueError, match=".* shape .*"): + func(np.zeros((5, 3)), np.zeros((4, 4)), metric="precomputed") + # even if shape[1] agrees (although thus second arg is spurious) + with pytest.raises(ValueError, match=".* shape .*"): + func(np.zeros((5, 3)), np.zeros((4, 3)), metric="precomputed") + + # Test not copied (if appropriate dtype) + S = np.zeros((5, 5)) + S2 = func(S, metric="precomputed") + assert S is S2 + # with two args + S = np.zeros((5, 3)) + S2 = func(S, np.zeros((3, 3)), metric="precomputed") + assert S is S2 + + # Test always returns float dtype + S = func(np.array([[1]], dtype="int"), metric="precomputed") + assert "f" == S.dtype.kind + + # Test converts list to array-like + S = func([[1.0]], metric="precomputed") + assert isinstance(S, np.ndarray) + + +def test_pairwise_precomputed_non_negative(): + # Test non-negative values + with pytest.raises(ValueError, match=".* non-negative values.*"): + pairwise_distances(np.full((5, 5), -1), metric="precomputed") + + +_minkowski_kwds = {"w": np.arange(1, 5).astype("double", copy=False), "p": 1} + + +def callable_rbf_kernel(x, y, **kwds): + # Callable version of pairwise.rbf_kernel. 
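+ # pairwise_kernels (and pairwise_distances) evaluate a callable metric on one
+ # pair of 1d samples at a time, so the rows are promoted to 2d arrays here and
+ # the resulting single-element kernel matrix is unpacked back to a scalar.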
+ K = rbf_kernel(np.atleast_2d(x), np.atleast_2d(y), **kwds) + # unpack the output since this is a scalar packed in a 0-dim array + return K.item() + + +@pytest.mark.parametrize( + "func, metric, kwds", + [ + (pairwise_distances, "euclidean", {}), + ( + pairwise_distances, + minkowski, + _minkowski_kwds, + ), + ( + pairwise_distances, + "minkowski", + _minkowski_kwds, + ), + (pairwise_kernels, "polynomial", {"degree": 1}), + (pairwise_kernels, callable_rbf_kernel, {"gamma": 0.1}), + ], +) +@pytest.mark.parametrize("dtype", [np.float64, np.float32, int]) +def test_pairwise_parallel(func, metric, kwds, dtype): + rng = np.random.RandomState(0) + X = np.array(5 * rng.random_sample((5, 4)), dtype=dtype) + Y = np.array(5 * rng.random_sample((3, 4)), dtype=dtype) + + S = func(X, metric=metric, n_jobs=1, **kwds) + S2 = func(X, metric=metric, n_jobs=2, **kwds) + assert_allclose(S, S2) + + S = func(X, Y, metric=metric, n_jobs=1, **kwds) + S2 = func(X, Y, metric=metric, n_jobs=2, **kwds) + assert_allclose(S, S2) + + +def test_pairwise_callable_nonstrict_metric(): + # paired_distances should allow callable metric where metric(x, x) != 0 + # Knowing that the callable is a strict metric would allow the diagonal to + # be left uncalculated and set to 0. + assert pairwise_distances([[1.0]], metric=lambda x, y: 5)[0, 0] == 5 + + +# Test with all metrics that should be in PAIRWISE_KERNEL_FUNCTIONS. +@pytest.mark.parametrize( + "metric", + ["rbf", "laplacian", "sigmoid", "polynomial", "linear", "chi2", "additive_chi2"], +) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_pairwise_kernels(metric, csr_container): + # Test the pairwise_kernels helper function. + + rng = np.random.RandomState(0) + X = rng.random_sample((5, 4)) + Y = rng.random_sample((2, 4)) + function = PAIRWISE_KERNEL_FUNCTIONS[metric] + # Test with Y=None + K1 = pairwise_kernels(X, metric=metric) + K2 = function(X) + assert_allclose(K1, K2) + # Test with Y=Y + K1 = pairwise_kernels(X, Y=Y, metric=metric) + K2 = function(X, Y=Y) + assert_allclose(K1, K2) + # Test with tuples as X and Y + X_tuples = tuple([tuple([v for v in row]) for row in X]) + Y_tuples = tuple([tuple([v for v in row]) for row in Y]) + K2 = pairwise_kernels(X_tuples, Y_tuples, metric=metric) + assert_allclose(K1, K2) + + # Test with sparse X and Y + X_sparse = csr_container(X) + Y_sparse = csr_container(Y) + if metric in ["chi2", "additive_chi2"]: + # these don't support sparse matrices yet + return + K1 = pairwise_kernels(X_sparse, Y=Y_sparse, metric=metric) + assert_allclose(K1, K2) + + +def test_pairwise_kernels_callable(): + # Test the pairwise_kernels helper function + # with a callable function, with given keywords. 
+ rng = np.random.RandomState(0) + X = rng.random_sample((5, 4)) + Y = rng.random_sample((2, 4)) + + metric = callable_rbf_kernel + kwds = {"gamma": 0.1} + K1 = pairwise_kernels(X, Y=Y, metric=metric, **kwds) + K2 = rbf_kernel(X, Y=Y, **kwds) + assert_allclose(K1, K2) + + # callable function, X=Y + K1 = pairwise_kernels(X, Y=X, metric=metric, **kwds) + K2 = rbf_kernel(X, Y=X, **kwds) + assert_allclose(K1, K2) + + +def test_pairwise_kernels_filter_param(): + rng = np.random.RandomState(0) + X = rng.random_sample((5, 4)) + Y = rng.random_sample((2, 4)) + K = rbf_kernel(X, Y, gamma=0.1) + params = {"gamma": 0.1, "blabla": ":)"} + K2 = pairwise_kernels(X, Y, metric="rbf", filter_params=True, **params) + assert_allclose(K, K2) + + with pytest.raises(TypeError): + pairwise_kernels(X, Y, metric="rbf", **params) + + +@pytest.mark.parametrize("metric, func", PAIRED_DISTANCES.items()) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_paired_distances(metric, func, csr_container): + # Test the pairwise_distance helper function. + rng = np.random.RandomState(0) + # Euclidean distance should be equivalent to calling the function. + X = rng.random_sample((5, 4)) + # Euclidean distance, with Y != X. + Y = rng.random_sample((5, 4)) + + S = paired_distances(X, Y, metric=metric) + S2 = func(X, Y) + assert_allclose(S, S2) + S3 = func(csr_container(X), csr_container(Y)) + assert_allclose(S, S3) + if metric in PAIRWISE_DISTANCE_FUNCTIONS: + # Check the pairwise_distances implementation + # gives the same value + distances = PAIRWISE_DISTANCE_FUNCTIONS[metric](X, Y) + distances = np.diag(distances) + assert_allclose(distances, S) + + +def test_paired_distances_callable(global_dtype): + # Test the paired_distance helper function + # with the callable implementation + rng = np.random.RandomState(0) + # Euclidean distance should be equivalent to calling the function. + X = rng.random_sample((5, 4)).astype(global_dtype, copy=False) + # Euclidean distance, with Y != X. 
+ Y = rng.random_sample((5, 4)).astype(global_dtype, copy=False) + + S = paired_distances(X, Y, metric="manhattan") + S2 = paired_distances(X, Y, metric=lambda x, y: np.abs(x - y).sum(axis=0)) + assert_allclose(S, S2) + + # Test that a value error is raised when the lengths of X and Y should not + # differ + Y = rng.random_sample((3, 4)) + with pytest.raises(ValueError): + paired_distances(X, Y) + + +@pytest.mark.parametrize("dok_container", DOK_CONTAINERS) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_pairwise_distances_argmin_min(dok_container, csr_container, global_dtype): + # Check pairwise minimum distances computation for any metric + X = np.asarray([[0], [1]], dtype=global_dtype) + Y = np.asarray([[-2], [3]], dtype=global_dtype) + + Xsp = dok_container(X) + Ysp = csr_container(Y, dtype=global_dtype) + + expected_idx = [0, 1] + expected_vals = [2, 2] + expected_vals_sq = [4, 4] + + # euclidean metric + idx, vals = pairwise_distances_argmin_min(X, Y, metric="euclidean") + idx2 = pairwise_distances_argmin(X, Y, metric="euclidean") + assert_allclose(idx, expected_idx) + assert_allclose(idx2, expected_idx) + assert_allclose(vals, expected_vals) + # sparse matrix case + idxsp, valssp = pairwise_distances_argmin_min(Xsp, Ysp, metric="euclidean") + idxsp2 = pairwise_distances_argmin(Xsp, Ysp, metric="euclidean") + assert_allclose(idxsp, expected_idx) + assert_allclose(idxsp2, expected_idx) + assert_allclose(valssp, expected_vals) + # We don't want np.matrix here + assert type(idxsp) == np.ndarray + assert type(valssp) == np.ndarray + + # Squared Euclidean metric + idx, vals = pairwise_distances_argmin_min(X, Y, metric="sqeuclidean") + idx2, vals2 = pairwise_distances_argmin_min( + X, Y, metric="euclidean", metric_kwargs={"squared": True} + ) + idx3 = pairwise_distances_argmin(X, Y, metric="sqeuclidean") + idx4 = pairwise_distances_argmin( + X, Y, metric="euclidean", metric_kwargs={"squared": True} + ) + + assert_allclose(vals, expected_vals_sq) + assert_allclose(vals2, expected_vals_sq) + + assert_allclose(idx, expected_idx) + assert_allclose(idx2, expected_idx) + assert_allclose(idx3, expected_idx) + assert_allclose(idx4, expected_idx) + + # Non-euclidean scikit-learn metric + idx, vals = pairwise_distances_argmin_min(X, Y, metric="manhattan") + idx2 = pairwise_distances_argmin(X, Y, metric="manhattan") + assert_allclose(idx, expected_idx) + assert_allclose(idx2, expected_idx) + assert_allclose(vals, expected_vals) + # sparse matrix case + idxsp, valssp = pairwise_distances_argmin_min(Xsp, Ysp, metric="manhattan") + idxsp2 = pairwise_distances_argmin(Xsp, Ysp, metric="manhattan") + assert_allclose(idxsp, expected_idx) + assert_allclose(idxsp2, expected_idx) + assert_allclose(valssp, expected_vals) + + # Non-euclidean Scipy distance (callable) + idx, vals = pairwise_distances_argmin_min( + X, Y, metric=minkowski, metric_kwargs={"p": 2} + ) + assert_allclose(idx, expected_idx) + assert_allclose(vals, expected_vals) + + # Non-euclidean Scipy distance (string) + idx, vals = pairwise_distances_argmin_min( + X, Y, metric="minkowski", metric_kwargs={"p": 2} + ) + assert_allclose(idx, expected_idx) + assert_allclose(vals, expected_vals) + + # Compare with naive implementation + rng = np.random.RandomState(0) + X = rng.randn(97, 149) + Y = rng.randn(111, 149) + + dist = pairwise_distances(X, Y, metric="manhattan") + dist_orig_ind = dist.argmin(axis=0) + dist_orig_val = dist[dist_orig_ind, range(len(dist_orig_ind))] + + dist_chunked_ind, dist_chunked_val = 
pairwise_distances_argmin_min( + X, Y, axis=0, metric="manhattan" + ) + assert_allclose(dist_orig_ind, dist_chunked_ind, rtol=1e-7) + assert_allclose(dist_orig_val, dist_chunked_val, rtol=1e-7) + + # Changing the axis and permuting datasets must give the same results + argmin_0, dist_0 = pairwise_distances_argmin_min(X, Y, axis=0) + argmin_1, dist_1 = pairwise_distances_argmin_min(Y, X, axis=1) + + assert_allclose(dist_0, dist_1) + assert_array_equal(argmin_0, argmin_1) + + argmin_0, dist_0 = pairwise_distances_argmin_min(X, X, axis=0) + argmin_1, dist_1 = pairwise_distances_argmin_min(X, X, axis=1) + + assert_allclose(dist_0, dist_1) + assert_array_equal(argmin_0, argmin_1) + + # Changing the axis and permuting datasets must give the same results + argmin_0 = pairwise_distances_argmin(X, Y, axis=0) + argmin_1 = pairwise_distances_argmin(Y, X, axis=1) + + assert_array_equal(argmin_0, argmin_1) + + argmin_0 = pairwise_distances_argmin(X, X, axis=0) + argmin_1 = pairwise_distances_argmin(X, X, axis=1) + + assert_array_equal(argmin_0, argmin_1) + + # F-contiguous arrays must be supported and must return identical results. + argmin_C_contiguous = pairwise_distances_argmin(X, Y) + argmin_F_contiguous = pairwise_distances_argmin( + np.asfortranarray(X), np.asfortranarray(Y) + ) + + assert_array_equal(argmin_C_contiguous, argmin_F_contiguous) + + +def _reduce_func(dist, start): + return dist[:, :100] + + +def test_pairwise_distances_chunked_reduce(global_dtype): + rng = np.random.RandomState(0) + X = rng.random_sample((400, 4)).astype(global_dtype, copy=False) + # Reduced Euclidean distance + S = pairwise_distances(X)[:, :100] + S_chunks = pairwise_distances_chunked( + X, None, reduce_func=_reduce_func, working_memory=2**-16 + ) + assert isinstance(S_chunks, GeneratorType) + S_chunks = list(S_chunks) + assert len(S_chunks) > 1 + assert S_chunks[0].dtype == X.dtype + + # atol is for diagonal where S is explicitly zeroed on the diagonal + assert_allclose(np.vstack(S_chunks), S, atol=1e-7) + + +def test_pairwise_distances_chunked_reduce_none(global_dtype): + # check that the reduce func is allowed to return None + rng = np.random.RandomState(0) + X = rng.random_sample((10, 4)).astype(global_dtype, copy=False) + S_chunks = pairwise_distances_chunked( + X, None, reduce_func=lambda dist, start: None, working_memory=2**-16 + ) + assert isinstance(S_chunks, GeneratorType) + S_chunks = list(S_chunks) + assert len(S_chunks) > 1 + assert all(chunk is None for chunk in S_chunks) + + +@pytest.mark.parametrize( + "good_reduce", + [ + lambda D, start: list(D), + lambda D, start: np.array(D), + lambda D, start: (list(D), list(D)), + ] + + [ + lambda D, start, scipy_csr_type=scipy_csr_type: scipy_csr_type(D) + for scipy_csr_type in CSR_CONTAINERS + ] + + [ + lambda D, start, scipy_dok_type=scipy_dok_type: ( + scipy_dok_type(D), + np.array(D), + list(D), + ) + for scipy_dok_type in DOK_CONTAINERS + ], +) +def test_pairwise_distances_chunked_reduce_valid(good_reduce): + X = np.arange(10).reshape(-1, 1) + S_chunks = pairwise_distances_chunked( + X, None, reduce_func=good_reduce, working_memory=64 + ) + next(S_chunks) + + +@pytest.mark.parametrize( + ("bad_reduce", "err_type", "message"), + [ + ( + lambda D, s: np.concatenate([D, D[-1:]]), + ValueError, + r"length 11\..* input: 10\.", + ), + ( + lambda D, s: (D, np.concatenate([D, D[-1:]])), + ValueError, + r"length \(10, 11\)\..* input: 10\.", + ), + (lambda D, s: (D[:9], D), ValueError, r"length \(9, 10\)\..* input: 10\."), + ( + lambda D, s: 7, + TypeError, + 
r"returned 7\. Expected sequence\(s\) of length 10\.", + ), + ( + lambda D, s: (7, 8), + TypeError, + r"returned \(7, 8\)\. Expected sequence\(s\) of length 10\.", + ), + ( + lambda D, s: (np.arange(10), 9), + TypeError, + r", 9\)\. Expected sequence\(s\) of length 10\.", + ), + ], +) +def test_pairwise_distances_chunked_reduce_invalid( + global_dtype, bad_reduce, err_type, message +): + X = np.arange(10).reshape(-1, 1).astype(global_dtype, copy=False) + S_chunks = pairwise_distances_chunked( + X, None, reduce_func=bad_reduce, working_memory=64 + ) + with pytest.raises(err_type, match=message): + next(S_chunks) + + +def check_pairwise_distances_chunked(X, Y, working_memory, metric="euclidean"): + gen = pairwise_distances_chunked(X, Y, working_memory=working_memory, metric=metric) + assert isinstance(gen, GeneratorType) + blockwise_distances = list(gen) + Y = X if Y is None else Y + min_block_mib = len(Y) * 8 * 2**-20 + + for block in blockwise_distances: + memory_used = block.nbytes + assert memory_used <= max(working_memory, min_block_mib) * 2**20 + + blockwise_distances = np.vstack(blockwise_distances) + S = pairwise_distances(X, Y, metric=metric) + assert_allclose(blockwise_distances, S, atol=1e-7) + + +@pytest.mark.parametrize("metric", ("euclidean", "l2", "sqeuclidean")) +def test_pairwise_distances_chunked_diagonal(metric, global_dtype): + rng = np.random.RandomState(0) + X = rng.normal(size=(1000, 10), scale=1e10).astype(global_dtype, copy=False) + chunks = list(pairwise_distances_chunked(X, working_memory=1, metric=metric)) + assert len(chunks) > 1 + assert_allclose(np.diag(np.vstack(chunks)), 0, rtol=1e-10) + + +@pytest.mark.parametrize("metric", ("euclidean", "l2", "sqeuclidean")) +def test_parallel_pairwise_distances_diagonal(metric, global_dtype): + rng = np.random.RandomState(0) + X = rng.normal(size=(1000, 10), scale=1e10).astype(global_dtype, copy=False) + distances = pairwise_distances(X, metric=metric, n_jobs=2) + assert_allclose(np.diag(distances), 0, atol=1e-10) + + +@pytest.mark.filterwarnings("ignore:Could not adhere to working_memory config") +def test_pairwise_distances_chunked(global_dtype): + # Test the pairwise_distance helper function. + rng = np.random.RandomState(0) + # Euclidean distance should be equivalent to calling the function. + X = rng.random_sample((200, 4)).astype(global_dtype, copy=False) + check_pairwise_distances_chunked(X, None, working_memory=1, metric="euclidean") + # Test small amounts of memory + for power in range(-16, 0): + check_pairwise_distances_chunked( + X, None, working_memory=2**power, metric="euclidean" + ) + # X as list + check_pairwise_distances_chunked( + X.tolist(), None, working_memory=1, metric="euclidean" + ) + # Euclidean distance, with Y != X. + Y = rng.random_sample((100, 4)).astype(global_dtype, copy=False) + check_pairwise_distances_chunked(X, Y, working_memory=1, metric="euclidean") + check_pairwise_distances_chunked( + X.tolist(), Y.tolist(), working_memory=1, metric="euclidean" + ) + # absurdly large working_memory + check_pairwise_distances_chunked(X, Y, working_memory=10000, metric="euclidean") + # "cityblock" uses scikit-learn metric, cityblock (function) is + # scipy.spatial. 
+ check_pairwise_distances_chunked(X, Y, working_memory=1, metric="cityblock") + + # Test precomputed returns all at once + D = pairwise_distances(X) + gen = pairwise_distances_chunked(D, working_memory=2**-16, metric="precomputed") + assert isinstance(gen, GeneratorType) + assert next(gen) is D + with pytest.raises(StopIteration): + next(gen) + + +@pytest.mark.parametrize( + "x_array_constr", + [np.array] + CSR_CONTAINERS, + ids=["dense"] + [container.__name__ for container in CSR_CONTAINERS], +) +@pytest.mark.parametrize( + "y_array_constr", + [np.array] + CSR_CONTAINERS, + ids=["dense"] + [container.__name__ for container in CSR_CONTAINERS], +) +def test_euclidean_distances_known_result(x_array_constr, y_array_constr): + # Check the pairwise Euclidean distances computation on known result + X = x_array_constr([[0]]) + Y = y_array_constr([[1], [2]]) + D = euclidean_distances(X, Y) + assert_allclose(D, [[1.0, 2.0]]) + + +@pytest.mark.parametrize( + "y_array_constr", + [np.array] + CSR_CONTAINERS, + ids=["dense"] + [container.__name__ for container in CSR_CONTAINERS], +) +def test_euclidean_distances_with_norms(global_dtype, y_array_constr): + # check that we still get the right answers with {X,Y}_norm_squared + # and that we get a wrong answer with wrong {X,Y}_norm_squared + rng = np.random.RandomState(0) + X = rng.random_sample((10, 10)).astype(global_dtype, copy=False) + Y = rng.random_sample((20, 10)).astype(global_dtype, copy=False) + + # norms will only be used if their dtype is float64 + X_norm_sq = (X.astype(np.float64) ** 2).sum(axis=1).reshape(1, -1) + Y_norm_sq = (Y.astype(np.float64) ** 2).sum(axis=1).reshape(1, -1) + + Y = y_array_constr(Y) + + D1 = euclidean_distances(X, Y) + D2 = euclidean_distances(X, Y, X_norm_squared=X_norm_sq) + D3 = euclidean_distances(X, Y, Y_norm_squared=Y_norm_sq) + D4 = euclidean_distances(X, Y, X_norm_squared=X_norm_sq, Y_norm_squared=Y_norm_sq) + assert_allclose(D2, D1) + assert_allclose(D3, D1) + assert_allclose(D4, D1) + + # check we get the wrong answer with wrong {X,Y}_norm_squared + wrong_D = euclidean_distances( + X, + Y, + X_norm_squared=np.zeros_like(X_norm_sq), + Y_norm_squared=np.zeros_like(Y_norm_sq), + ) + with pytest.raises(AssertionError): + assert_allclose(wrong_D, D1) + + +@pytest.mark.parametrize("symmetric", [True, False]) +def test_euclidean_distances_float32_norms(global_random_seed, symmetric): + # Non-regression test for #27621 + rng = np.random.RandomState(global_random_seed) + X = rng.random_sample((10, 10)) + Y = X if symmetric else rng.random_sample((20, 10)) + X_norm_sq = (X.astype(np.float32) ** 2).sum(axis=1).reshape(1, -1) + Y_norm_sq = (Y.astype(np.float32) ** 2).sum(axis=1).reshape(1, -1) + D1 = euclidean_distances(X, Y) + D2 = euclidean_distances(X, Y, X_norm_squared=X_norm_sq) + D3 = euclidean_distances(X, Y, Y_norm_squared=Y_norm_sq) + D4 = euclidean_distances(X, Y, X_norm_squared=X_norm_sq, Y_norm_squared=Y_norm_sq) + assert_allclose(D2, D1) + assert_allclose(D3, D1) + assert_allclose(D4, D1) + + +def test_euclidean_distances_norm_shapes(): + # Check all accepted shapes for the norms or appropriate error messages. 
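+ # The precomputed squared norms may be passed as 1d arrays of shape
+ # (n_samples,), as column vectors of shape (n_samples, 1) or as row vectors of
+ # shape (1, n_samples); all three must give the same distances, while norms
+ # with a mismatched number of samples must raise an informative error.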
+ rng = np.random.RandomState(0) + X = rng.random_sample((10, 10)) + Y = rng.random_sample((20, 10)) + + X_norm_squared = (X**2).sum(axis=1) + Y_norm_squared = (Y**2).sum(axis=1) + + D1 = euclidean_distances( + X, Y, X_norm_squared=X_norm_squared, Y_norm_squared=Y_norm_squared + ) + D2 = euclidean_distances( + X, + Y, + X_norm_squared=X_norm_squared.reshape(-1, 1), + Y_norm_squared=Y_norm_squared.reshape(-1, 1), + ) + D3 = euclidean_distances( + X, + Y, + X_norm_squared=X_norm_squared.reshape(1, -1), + Y_norm_squared=Y_norm_squared.reshape(1, -1), + ) + + assert_allclose(D2, D1) + assert_allclose(D3, D1) + + with pytest.raises(ValueError, match="Incompatible dimensions for X"): + euclidean_distances(X, Y, X_norm_squared=X_norm_squared[:5]) + with pytest.raises(ValueError, match="Incompatible dimensions for Y"): + euclidean_distances(X, Y, Y_norm_squared=Y_norm_squared[:5]) + + +@pytest.mark.parametrize( + "x_array_constr", + [np.array] + CSR_CONTAINERS, + ids=["dense"] + [container.__name__ for container in CSR_CONTAINERS], +) +@pytest.mark.parametrize( + "y_array_constr", + [np.array] + CSR_CONTAINERS, + ids=["dense"] + [container.__name__ for container in CSR_CONTAINERS], +) +def test_euclidean_distances(global_dtype, x_array_constr, y_array_constr): + # check that euclidean distances gives same result as scipy cdist + # when X and Y != X are provided + rng = np.random.RandomState(0) + X = rng.random_sample((100, 10)).astype(global_dtype, copy=False) + X[X < 0.8] = 0 + Y = rng.random_sample((10, 10)).astype(global_dtype, copy=False) + Y[Y < 0.8] = 0 + + expected = cdist(X, Y) + + X = x_array_constr(X) + Y = y_array_constr(Y) + distances = euclidean_distances(X, Y) + + # the default rtol=1e-7 is too close to the float32 precision + # and fails due to rounding errors. + assert_allclose(distances, expected, rtol=1e-6) + assert distances.dtype == global_dtype + + +@pytest.mark.parametrize( + "x_array_constr", + [np.array] + CSR_CONTAINERS, + ids=["dense"] + [container.__name__ for container in CSR_CONTAINERS], +) +def test_euclidean_distances_sym(global_dtype, x_array_constr): + # check that euclidean distances gives same result as scipy pdist + # when only X is provided + rng = np.random.RandomState(0) + X = rng.random_sample((100, 10)).astype(global_dtype, copy=False) + X[X < 0.8] = 0 + + expected = squareform(pdist(X)) + + X = x_array_constr(X) + distances = euclidean_distances(X) + + # the default rtol=1e-7 is too close to the float32 precision + # and fails due to rounding errors. 
+ assert_allclose(distances, expected, rtol=1e-6) + assert distances.dtype == global_dtype + + +@pytest.mark.parametrize("batch_size", [None, 5, 7, 101]) +@pytest.mark.parametrize( + "x_array_constr", + [np.array] + CSR_CONTAINERS, + ids=["dense"] + [container.__name__ for container in CSR_CONTAINERS], +) +@pytest.mark.parametrize( + "y_array_constr", + [np.array] + CSR_CONTAINERS, + ids=["dense"] + [container.__name__ for container in CSR_CONTAINERS], +) +def test_euclidean_distances_upcast(batch_size, x_array_constr, y_array_constr): + # check batches handling when Y != X (#13910) + rng = np.random.RandomState(0) + X = rng.random_sample((100, 10)).astype(np.float32) + X[X < 0.8] = 0 + Y = rng.random_sample((10, 10)).astype(np.float32) + Y[Y < 0.8] = 0 + + expected = cdist(X, Y) + + X = x_array_constr(X) + Y = y_array_constr(Y) + distances = _euclidean_distances_upcast(X, Y=Y, batch_size=batch_size) + distances = np.sqrt(np.maximum(distances, 0)) + + # the default rtol=1e-7 is too close to the float32 precision + # and fails due to rounding errors. + assert_allclose(distances, expected, rtol=1e-6) + + +@pytest.mark.parametrize("batch_size", [None, 5, 7, 101]) +@pytest.mark.parametrize( + "x_array_constr", + [np.array] + CSR_CONTAINERS, + ids=["dense"] + [container.__name__ for container in CSR_CONTAINERS], +) +def test_euclidean_distances_upcast_sym(batch_size, x_array_constr): + # check batches handling when X is Y (#13910) + rng = np.random.RandomState(0) + X = rng.random_sample((100, 10)).astype(np.float32) + X[X < 0.8] = 0 + + expected = squareform(pdist(X)) + + X = x_array_constr(X) + distances = _euclidean_distances_upcast(X, Y=X, batch_size=batch_size) + distances = np.sqrt(np.maximum(distances, 0)) + + # the default rtol=1e-7 is too close to the float32 precision + # and fails due to rounding errors. + assert_allclose(distances, expected, rtol=1e-6) + + +@pytest.mark.parametrize( + "dtype, eps, rtol", + [ + (np.float32, 1e-4, 1e-5), + pytest.param( + np.float64, + 1e-8, + 0.99, + marks=pytest.mark.xfail(reason="failing due to lack of precision"), + ), + ], +) +@pytest.mark.parametrize("dim", [1, 1000000]) +def test_euclidean_distances_extreme_values(dtype, eps, rtol, dim): + # check that euclidean distances is correct with float32 input thanks to + # upcasting. On float64 there are still precision issues. + X = np.array([[1.0] * dim], dtype=dtype) + Y = np.array([[1.0 + eps] * dim], dtype=dtype) + + distances = euclidean_distances(X, Y) + expected = cdist(X, Y) + + assert_allclose(distances, expected, rtol=1e-5) + + +@pytest.mark.parametrize("squared", [True, False]) +def test_nan_euclidean_distances_equal_to_euclidean_distance(squared): + # with no nan values + rng = np.random.RandomState(1337) + X = rng.randn(3, 4) + Y = rng.randn(4, 4) + + normal_distance = euclidean_distances(X, Y=Y, squared=squared) + nan_distance = nan_euclidean_distances(X, Y=Y, squared=squared) + assert_allclose(normal_distance, nan_distance) + + +@pytest.mark.parametrize("X", [np.array([[np.inf, 0]]), np.array([[0, -np.inf]])]) +@pytest.mark.parametrize("Y", [np.array([[np.inf, 0]]), np.array([[0, -np.inf]]), None]) +def test_nan_euclidean_distances_infinite_values(X, Y): + with pytest.raises(ValueError) as excinfo: + nan_euclidean_distances(X, Y=Y) + + exp_msg = "Input contains infinity or a value too large for dtype('float64')." 
+ assert exp_msg == str(excinfo.value) + + +@pytest.mark.parametrize( + "X, X_diag, missing_value", + [ + (np.array([[0, 1], [1, 0]]), np.sqrt(2), np.nan), + (np.array([[0, 1], [1, np.nan]]), np.sqrt(2), np.nan), + (np.array([[np.nan, 1], [1, np.nan]]), np.nan, np.nan), + (np.array([[np.nan, 1], [np.nan, 0]]), np.sqrt(2), np.nan), + (np.array([[0, np.nan], [1, np.nan]]), np.sqrt(2), np.nan), + (np.array([[0, 1], [1, 0]]), np.sqrt(2), -1), + (np.array([[0, 1], [1, -1]]), np.sqrt(2), -1), + (np.array([[-1, 1], [1, -1]]), np.nan, -1), + (np.array([[-1, 1], [-1, 0]]), np.sqrt(2), -1), + (np.array([[0, -1], [1, -1]]), np.sqrt(2), -1), + ], +) +def test_nan_euclidean_distances_2x2(X, X_diag, missing_value): + exp_dist = np.array([[0.0, X_diag], [X_diag, 0]]) + + dist = nan_euclidean_distances(X, missing_values=missing_value) + assert_allclose(exp_dist, dist) + + dist_sq = nan_euclidean_distances(X, squared=True, missing_values=missing_value) + assert_allclose(exp_dist**2, dist_sq) + + dist_two = nan_euclidean_distances(X, X, missing_values=missing_value) + assert_allclose(exp_dist, dist_two) + + dist_two_copy = nan_euclidean_distances(X, X.copy(), missing_values=missing_value) + assert_allclose(exp_dist, dist_two_copy) + + +@pytest.mark.parametrize("missing_value", [np.nan, -1]) +def test_nan_euclidean_distances_complete_nan(missing_value): + X = np.array([[missing_value, missing_value], [0, 1]]) + + exp_dist = np.array([[np.nan, np.nan], [np.nan, 0]]) + + dist = nan_euclidean_distances(X, missing_values=missing_value) + assert_allclose(exp_dist, dist) + + dist = nan_euclidean_distances(X, X.copy(), missing_values=missing_value) + assert_allclose(exp_dist, dist) + + +@pytest.mark.parametrize("missing_value", [np.nan, -1]) +def test_nan_euclidean_distances_not_trival(missing_value): + X = np.array( + [ + [1.0, missing_value, 3.0, 4.0, 2.0], + [missing_value, 4.0, 6.0, 1.0, missing_value], + [3.0, missing_value, missing_value, missing_value, 1.0], + ] + ) + + Y = np.array( + [ + [missing_value, 7.0, 7.0, missing_value, 2.0], + [missing_value, missing_value, 5.0, 4.0, 7.0], + [missing_value, missing_value, missing_value, 4.0, 5.0], + ] + ) + + # Check for symmetry + D1 = nan_euclidean_distances(X, Y, missing_values=missing_value) + D2 = nan_euclidean_distances(Y, X, missing_values=missing_value) + + assert_almost_equal(D1, D2.T) + + # Check with explicit formula and squared=True + assert_allclose( + nan_euclidean_distances( + X[:1], Y[:1], squared=True, missing_values=missing_value + ), + [[5.0 / 2.0 * ((7 - 3) ** 2 + (2 - 2) ** 2)]], + ) + + # Check with explicit formula and squared=False + assert_allclose( + nan_euclidean_distances( + X[1:2], Y[1:2], squared=False, missing_values=missing_value + ), + [[np.sqrt(5.0 / 2.0 * ((6 - 5) ** 2 + (1 - 4) ** 2))]], + ) + + # Check when Y = X is explicitly passed + D3 = nan_euclidean_distances(X, missing_values=missing_value) + D4 = nan_euclidean_distances(X, X, missing_values=missing_value) + D5 = nan_euclidean_distances(X, X.copy(), missing_values=missing_value) + assert_allclose(D3, D4) + assert_allclose(D4, D5) + + # Check copy = True against copy = False + D6 = nan_euclidean_distances(X, Y, copy=True) + D7 = nan_euclidean_distances(X, Y, copy=False) + assert_allclose(D6, D7) + + +@pytest.mark.parametrize("missing_value", [np.nan, -1]) +def test_nan_euclidean_distances_one_feature_match_positive(missing_value): + # First feature is the only feature that is non-nan and in both + # samples. 
The result of `nan_euclidean_distances` with squared=True + # should be non-negative. The non-squared version should all be close to 0. + X = np.array( + [ + [-122.27, 648.0, missing_value, 37.85], + [-122.27, missing_value, 2.34701493, missing_value], + ] + ) + + dist_squared = nan_euclidean_distances( + X, missing_values=missing_value, squared=True + ) + assert np.all(dist_squared >= 0) + + dist = nan_euclidean_distances(X, missing_values=missing_value, squared=False) + assert_allclose(dist, 0.0) + + +def test_cosine_distances(): + # Check the pairwise Cosine distances computation + rng = np.random.RandomState(1337) + x = np.abs(rng.rand(910)) + XA = np.vstack([x, x]) + D = cosine_distances(XA) + assert_allclose(D, [[0.0, 0.0], [0.0, 0.0]], atol=1e-10) + # check that all elements are in [0, 2] + assert np.all(D >= 0.0) + assert np.all(D <= 2.0) + # check that diagonal elements are equal to 0 + assert_allclose(D[np.diag_indices_from(D)], [0.0, 0.0]) + + XB = np.vstack([x, -x]) + D2 = cosine_distances(XB) + # check that all elements are in [0, 2] + assert np.all(D2 >= 0.0) + assert np.all(D2 <= 2.0) + # check that diagonal elements are equal to 0 and non diagonal to 2 + assert_allclose(D2, [[0.0, 2.0], [2.0, 0.0]]) + + # check large random matrix + X = np.abs(rng.rand(1000, 5000)) + D = cosine_distances(X) + # check that diagonal elements are equal to 0 + assert_allclose(D[np.diag_indices_from(D)], [0.0] * D.shape[0]) + assert np.all(D >= 0.0) + assert np.all(D <= 2.0) + + +def test_haversine_distances(): + # Check haversine distance with distances computation + def slow_haversine_distances(x, y): + diff_lat = y[0] - x[0] + diff_lon = y[1] - x[1] + a = np.sin(diff_lat / 2) ** 2 + ( + np.cos(x[0]) * np.cos(y[0]) * np.sin(diff_lon / 2) ** 2 + ) + c = 2 * np.arcsin(np.sqrt(a)) + return c + + rng = np.random.RandomState(0) + X = rng.random_sample((5, 2)) + Y = rng.random_sample((10, 2)) + D1 = np.array([[slow_haversine_distances(x, y) for y in Y] for x in X]) + D2 = haversine_distances(X, Y) + assert_allclose(D1, D2) + # Test haversine distance does not accept X where n_feature != 2 + X = rng.random_sample((10, 3)) + err_msg = "Haversine distance only valid in 2 dimensions" + with pytest.raises(ValueError, match=err_msg): + haversine_distances(X) + + +# Paired distances + + +def test_paired_euclidean_distances(): + # Check the paired Euclidean distances computation + X = [[0], [0]] + Y = [[1], [2]] + D = paired_euclidean_distances(X, Y) + assert_allclose(D, [1.0, 2.0]) + + +def test_paired_manhattan_distances(): + # Check the paired manhattan distances computation + X = [[0], [0]] + Y = [[1], [2]] + D = paired_manhattan_distances(X, Y) + assert_allclose(D, [1.0, 2.0]) + + +def test_paired_cosine_distances(): + # Check the paired manhattan distances computation + X = [[0], [0]] + Y = [[1], [2]] + D = paired_cosine_distances(X, Y) + assert_allclose(D, [0.5, 0.5]) + + +def test_chi_square_kernel(): + rng = np.random.RandomState(0) + X = rng.random_sample((5, 4)) + Y = rng.random_sample((10, 4)) + K_add = additive_chi2_kernel(X, Y) + gamma = 0.1 + K = chi2_kernel(X, Y, gamma=gamma) + assert K.dtype == float + for i, x in enumerate(X): + for j, y in enumerate(Y): + chi2 = -np.sum((x - y) ** 2 / (x + y)) + chi2_exp = np.exp(gamma * chi2) + assert_almost_equal(K_add[i, j], chi2) + assert_almost_equal(K[i, j], chi2_exp) + + # check diagonal is ones for data with itself + K = chi2_kernel(Y) + assert_array_equal(np.diag(K), 1) + # check off-diagonal is < 1 but > 0: + assert np.all(K > 0) + assert 
np.all(K - np.diag(np.diag(K)) < 1) + # check that float32 is preserved + X = rng.random_sample((5, 4)).astype(np.float32) + Y = rng.random_sample((10, 4)).astype(np.float32) + K = chi2_kernel(X, Y) + assert K.dtype == np.float32 + + # check integer type gets converted, + # check that zeros are handled + X = rng.random_sample((10, 4)).astype(np.int32) + K = chi2_kernel(X, X) + assert np.isfinite(K).all() + assert K.dtype == float + + # check that kernel of similar things is greater than dissimilar ones + X = [[0.3, 0.7], [1.0, 0]] + Y = [[0, 1], [0.9, 0.1]] + K = chi2_kernel(X, Y) + assert K[0, 0] > K[0, 1] + assert K[1, 1] > K[1, 0] + + # test negative input + with pytest.raises(ValueError): + chi2_kernel([[0, -1]]) + with pytest.raises(ValueError): + chi2_kernel([[0, -1]], [[-1, -1]]) + with pytest.raises(ValueError): + chi2_kernel([[0, 1]], [[-1, -1]]) + + # different n_features in X and Y + with pytest.raises(ValueError): + chi2_kernel([[0, 1]], [[0.2, 0.2, 0.6]]) + + +@pytest.mark.parametrize( + "kernel", + ( + linear_kernel, + polynomial_kernel, + rbf_kernel, + laplacian_kernel, + sigmoid_kernel, + cosine_similarity, + ), +) +def test_kernel_symmetry(kernel): + # Valid kernels should be symmetric + rng = np.random.RandomState(0) + X = rng.random_sample((5, 4)) + K = kernel(X, X) + assert_allclose(K, K.T, 15) + + +@pytest.mark.parametrize( + "kernel", + ( + linear_kernel, + polynomial_kernel, + rbf_kernel, + laplacian_kernel, + sigmoid_kernel, + cosine_similarity, + ), +) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_kernel_sparse(kernel, csr_container): + rng = np.random.RandomState(0) + X = rng.random_sample((5, 4)) + X_sparse = csr_container(X) + K = kernel(X, X) + K2 = kernel(X_sparse, X_sparse) + assert_allclose(K, K2) + + +def test_linear_kernel(): + rng = np.random.RandomState(0) + X = rng.random_sample((5, 4)) + K = linear_kernel(X, X) + # the diagonal elements of a linear kernel are their squared norm + assert_allclose(K.flat[::6], [linalg.norm(x) ** 2 for x in X]) + + +def test_rbf_kernel(): + rng = np.random.RandomState(0) + X = rng.random_sample((5, 4)) + K = rbf_kernel(X, X) + # the diagonal elements of a rbf kernel are 1 + assert_allclose(K.flat[::6], np.ones(5)) + + +def test_laplacian_kernel(): + rng = np.random.RandomState(0) + X = rng.random_sample((5, 4)) + K = laplacian_kernel(X, X) + # the diagonal elements of a laplacian kernel are 1 + assert_allclose(np.diag(K), np.ones(5)) + + # off-diagonal elements are < 1 but > 0: + assert np.all(K > 0) + assert np.all(K - np.diag(np.diag(K)) < 1) + + +@pytest.mark.parametrize( + "metric, pairwise_func", + [("linear", linear_kernel), ("cosine", cosine_similarity)], +) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_pairwise_similarity_sparse_output(metric, pairwise_func, csr_container): + rng = np.random.RandomState(0) + X = rng.random_sample((5, 4)) + Y = rng.random_sample((3, 4)) + Xcsr = csr_container(X) + Ycsr = csr_container(Y) + + # should be sparse + K1 = pairwise_func(Xcsr, Ycsr, dense_output=False) + assert issparse(K1) + + # should be dense, and equal to K1 + K2 = pairwise_func(X, Y, dense_output=True) + assert not issparse(K2) + assert_allclose(K1.toarray(), K2) + + # show the kernel output equal to the sparse.toarray() + K3 = pairwise_kernels(X, Y=Y, metric=metric) + assert_allclose(K1.toarray(), K3) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_cosine_similarity(csr_container): + # Test the cosine_similarity. 
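+ # Cosine similarity of two vectors is their dot product once each row is
+ # scaled to unit L2 norm, so it must coincide with a linear kernel computed on
+ # L2-normalized inputs, for dense and sparse data alike.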
+ + rng = np.random.RandomState(0) + X = rng.random_sample((5, 4)) + Y = rng.random_sample((3, 4)) + Xcsr = csr_container(X) + Ycsr = csr_container(Y) + + for X_, Y_ in ((X, None), (X, Y), (Xcsr, None), (Xcsr, Ycsr)): + # Test that the cosine is kernel is equal to a linear kernel when data + # has been previously normalized by L2-norm. + K1 = pairwise_kernels(X_, Y=Y_, metric="cosine") + X_ = normalize(X_) + if Y_ is not None: + Y_ = normalize(Y_) + K2 = pairwise_kernels(X_, Y=Y_, metric="linear") + assert_allclose(K1, K2) + + +def test_check_dense_matrices(): + # Ensure that pairwise array check works for dense matrices. + # Check that if XB is None, XB is returned as reference to XA + XA = np.resize(np.arange(40), (5, 8)) + XA_checked, XB_checked = check_pairwise_arrays(XA, None) + assert XA_checked is XB_checked + assert_array_equal(XA, XA_checked) + + +def test_check_XB_returned(): + # Ensure that if XA and XB are given correctly, they return as equal. + # Check that if XB is not None, it is returned equal. + # Note that the second dimension of XB is the same as XA. + XA = np.resize(np.arange(40), (5, 8)) + XB = np.resize(np.arange(32), (4, 8)) + XA_checked, XB_checked = check_pairwise_arrays(XA, XB) + assert_array_equal(XA, XA_checked) + assert_array_equal(XB, XB_checked) + + XB = np.resize(np.arange(40), (5, 8)) + XA_checked, XB_checked = check_paired_arrays(XA, XB) + assert_array_equal(XA, XA_checked) + assert_array_equal(XB, XB_checked) + + +def test_check_different_dimensions(): + # Ensure an error is raised if the dimensions are different. + XA = np.resize(np.arange(45), (5, 9)) + XB = np.resize(np.arange(32), (4, 8)) + with pytest.raises(ValueError): + check_pairwise_arrays(XA, XB) + + XB = np.resize(np.arange(4 * 9), (4, 9)) + with pytest.raises(ValueError): + check_paired_arrays(XA, XB) + + +def test_check_invalid_dimensions(): + # Ensure an error is raised on 1D input arrays. + # The modified tests are not 1D. In the old test, the array was internally + # converted to 2D anyways + XA = np.arange(45).reshape(9, 5) + XB = np.arange(32).reshape(4, 8) + with pytest.raises(ValueError): + check_pairwise_arrays(XA, XB) + XA = np.arange(45).reshape(9, 5) + XB = np.arange(32).reshape(4, 8) + with pytest.raises(ValueError): + check_pairwise_arrays(XA, XB) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_check_sparse_arrays(csr_container): + # Ensures that checks return valid sparse matrices. + rng = np.random.RandomState(0) + XA = rng.random_sample((5, 4)) + XA_sparse = csr_container(XA) + XB = rng.random_sample((5, 4)) + XB_sparse = csr_container(XB) + XA_checked, XB_checked = check_pairwise_arrays(XA_sparse, XB_sparse) + # compare their difference because testing csr matrices for + # equality with '==' does not work as expected. + assert issparse(XA_checked) + assert abs(XA_sparse - XA_checked).sum() == 0 + assert issparse(XB_checked) + assert abs(XB_sparse - XB_checked).sum() == 0 + + XA_checked, XA_2_checked = check_pairwise_arrays(XA_sparse, XA_sparse) + assert issparse(XA_checked) + assert abs(XA_sparse - XA_checked).sum() == 0 + assert issparse(XA_2_checked) + assert abs(XA_2_checked - XA_checked).sum() == 0 + + +def tuplify(X): + # Turns a numpy matrix (any n-dimensional array) into tuples. + s = X.shape + if len(s) > 1: + # Tuplify each sub-array in the input. + return tuple(tuplify(row) for row in X) + else: + # Single dimension input, just return tuple of contents. 
+ return tuple(r for r in X) + + +def test_check_tuple_input(): + # Ensures that checks return valid tuples. + rng = np.random.RandomState(0) + XA = rng.random_sample((5, 4)) + XA_tuples = tuplify(XA) + XB = rng.random_sample((5, 4)) + XB_tuples = tuplify(XB) + XA_checked, XB_checked = check_pairwise_arrays(XA_tuples, XB_tuples) + assert_array_equal(XA_tuples, XA_checked) + assert_array_equal(XB_tuples, XB_checked) + + +def test_check_preserve_type(): + # Ensures that type float32 is preserved. + XA = np.resize(np.arange(40), (5, 8)).astype(np.float32) + XB = np.resize(np.arange(40), (5, 8)).astype(np.float32) + + XA_checked, XB_checked = check_pairwise_arrays(XA, None) + assert XA_checked.dtype == np.float32 + + # both float32 + XA_checked, XB_checked = check_pairwise_arrays(XA, XB) + assert XA_checked.dtype == np.float32 + assert XB_checked.dtype == np.float32 + + # mismatched A + XA_checked, XB_checked = check_pairwise_arrays(XA.astype(float), XB) + assert XA_checked.dtype == float + assert XB_checked.dtype == float + + # mismatched B + XA_checked, XB_checked = check_pairwise_arrays(XA, XB.astype(float)) + assert XA_checked.dtype == float + assert XB_checked.dtype == float + + +@pytest.mark.parametrize("n_jobs", [1, 2]) +@pytest.mark.parametrize("metric", ["seuclidean", "mahalanobis"]) +@pytest.mark.parametrize( + "dist_function", [pairwise_distances, pairwise_distances_chunked] +) +def test_pairwise_distances_data_derived_params(n_jobs, metric, dist_function): + # check that pairwise_distances give the same result in sequential and + # parallel, when metric has data-derived parameters. + with config_context(working_memory=0.1): # to have more than 1 chunk + rng = np.random.RandomState(0) + X = rng.random_sample((100, 10)) + + expected_dist = squareform(pdist(X, metric=metric)) + dist = np.vstack(tuple(dist_function(X, metric=metric, n_jobs=n_jobs))) + + assert_allclose(dist, expected_dist) + + +@pytest.mark.parametrize("metric", ["seuclidean", "mahalanobis"]) +def test_pairwise_distances_data_derived_params_error(metric): + # check that pairwise_distances raises an error when Y is passed but + # metric has data-derived params that are not provided by the user. 
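# A short illustration (assuming the documented behaviour of pairwise_distances
# forwarding extra keyword arguments to the metric) of how callers avoid the
# error checked below: when both X and Y are given, data-derived parameters
# such as 'V' for "seuclidean" must be supplied explicitly.
import numpy as np
from sklearn.metrics import pairwise_distances

_rng = np.random.RandomState(0)
_X, _Y = _rng.random_sample((10, 3)), _rng.random_sample((12, 3))
_V = np.var(np.vstack([_X, _Y]), axis=0, ddof=1)  # per-feature variances
_D = pairwise_distances(_X, _Y, metric="seuclidean", V=_V)  # no ValueError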
+ rng = np.random.RandomState(0) + X = rng.random_sample((100, 10)) + Y = rng.random_sample((100, 10)) + + with pytest.raises( + ValueError, + match=rf"The '(V|VI)' parameter is required for the {metric} metric", + ): + pairwise_distances(X, Y, metric=metric) + + +@pytest.mark.parametrize( + "metric", + [ + "braycurtis", + "canberra", + "chebyshev", + "correlation", + "hamming", + "mahalanobis", + "minkowski", + "seuclidean", + "sqeuclidean", + "cityblock", + "cosine", + "euclidean", + ], +) +@pytest.mark.parametrize("y_is_x", [True, False], ids=["Y is X", "Y is not X"]) +def test_numeric_pairwise_distances_datatypes(metric, global_dtype, y_is_x): + # Check that pairwise distances gives the same result as pdist and cdist + # regardless of input datatype when using any scipy metric for comparing + # numeric vectors + # + # This test is necessary because pairwise_distances used to throw an + # error when using metric='seuclidean' and the input data was not + # of type np.float64 (#15730) + + rng = np.random.RandomState(0) + + X = rng.random_sample((5, 4)).astype(global_dtype, copy=False) + + params = {} + if y_is_x: + Y = X + expected_dist = squareform(pdist(X, metric=metric)) + else: + Y = rng.random_sample((5, 4)).astype(global_dtype, copy=False) + expected_dist = cdist(X, Y, metric=metric) + # precompute parameters for seuclidean & mahalanobis when x is not y + if metric == "seuclidean": + params = {"V": np.var(np.vstack([X, Y]), axis=0, ddof=1, dtype=np.float64)} + elif metric == "mahalanobis": + params = {"VI": np.linalg.inv(np.cov(np.vstack([X, Y]).T)).T} + + dist = pairwise_distances(X, Y, metric=metric, **params) + + assert_allclose(dist, expected_dist) + + +@pytest.mark.parametrize( + "pairwise_distances_func", + [pairwise_distances, pairwise_distances_argmin, pairwise_distances_argmin_min], +) +def test_nan_euclidean_support(pairwise_distances_func): + """Check that `nan_euclidean` is lenient with `nan` values.""" + + X = [[0, 1], [1, np.nan], [2, 3], [3, 5]] + output = pairwise_distances_func(X, X, metric="nan_euclidean") + + assert not np.isnan(output).any() + + +def test_nan_euclidean_constant_input_argmin(): + """Check that the behavior of constant input is the same in the case of + full of nan vector and full of zero vector. + """ + + X_nan = [[np.nan, np.nan], [np.nan, np.nan], [np.nan, np.nan]] + argmin_nan = pairwise_distances_argmin(X_nan, X_nan, metric="nan_euclidean") + + X_const = [[0, 0], [0, 0], [0, 0]] + argmin_const = pairwise_distances_argmin(X_const, X_const, metric="nan_euclidean") + + assert_allclose(argmin_nan, argmin_const) + + +@pytest.mark.parametrize( + "X,Y,expected_distance", + [ + ( + ["a", "ab", "abc"], + None, + [[0.0, 1.0, 2.0], [1.0, 0.0, 1.0], [2.0, 1.0, 0.0]], + ), + ( + ["a", "ab", "abc"], + ["a", "ab"], + [[0.0, 1.0], [1.0, 0.0], [2.0, 1.0]], + ), + ], +) +def test_pairwise_dist_custom_metric_for_string(X, Y, expected_distance): + """Check pairwise_distances with lists of strings as input.""" + + def dummy_string_similarity(x, y): + return np.abs(len(x) - len(y)) + + actual_distance = pairwise_distances(X=X, Y=Y, metric=dummy_string_similarity) + assert_allclose(actual_distance, expected_distance) + + +def test_pairwise_dist_custom_metric_for_bool(): + """Check that pairwise_distances does not convert boolean input to float + when using a custom metric. 
+ """ + + def dummy_bool_dist(v1, v2): + # dummy distance func using `&` and thus relying on the input data being boolean + return 1 - (v1 & v2).sum() / (v1 | v2).sum() + + X = np.array([[1, 0, 0, 0], [1, 0, 1, 0], [1, 1, 1, 1]], dtype=bool) + + expected_distance = np.array( + [ + [0.0, 0.5, 0.75], + [0.5, 0.0, 0.5], + [0.75, 0.5, 0.0], + ] + ) + + actual_distance = pairwise_distances(X=X, metric=dummy_bool_dist) + assert_allclose(actual_distance, expected_distance) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sparse_manhattan_readonly_dataset(csr_container): + # Non-regression test for: https://github.com/scikit-learn/scikit-learn/issues/7981 + matrices1 = [csr_container(np.ones((5, 5)))] + matrices2 = [csr_container(np.ones((5, 5)))] + # Joblib memory maps datasets which makes them read-only. + # The following call was reporting as failing in #7981, but this must pass. + Parallel(n_jobs=2, max_nbytes=0)( + delayed(manhattan_distances)(m1, m2) for m1, m2 in zip(matrices1, matrices2) + ) + + +# TODO(1.8): remove +def test_force_all_finite_rename_warning(): + X = np.random.uniform(size=(10, 10)) + Y = np.random.uniform(size=(10, 10)) + + msg = "'force_all_finite' was renamed to 'ensure_all_finite'" + + with pytest.warns(FutureWarning, match=msg): + check_pairwise_arrays(X, Y, force_all_finite=True) + + with pytest.warns(FutureWarning, match=msg): + pairwise_distances(X, Y, force_all_finite=True) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_pairwise_distances_reduction.py new file mode 100644 index 0000000000000000000000000000000000000000..0ea6d5d094d5602ce4e3d4161b398d42c65677e6 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -0,0 +1,1643 @@ +import itertools +import re +import warnings +from functools import partial + +import numpy as np +import pytest +from scipy.spatial.distance import cdist + +from sklearn.metrics import euclidean_distances, pairwise_distances +from sklearn.metrics._pairwise_distances_reduction import ( + ArgKmin, + ArgKminClassMode, + BaseDistancesReductionDispatcher, + RadiusNeighbors, + RadiusNeighborsClassMode, + sqeuclidean_row_norms, +) +from sklearn.utils._testing import ( + assert_allclose, + assert_array_equal, + create_memmap_backed_data, +) +from sklearn.utils.fixes import CSR_CONTAINERS +from sklearn.utils.parallel import _get_threadpool_controller + +# Common supported metric between scipy.spatial.distance.cdist +# and BaseDistanceReductionDispatcher. +# This allows constructing tests to check consistency of results +# of concrete BaseDistanceReductionDispatcher on some metrics using APIs +# from scipy and numpy. +CDIST_PAIRWISE_DISTANCES_REDUCTION_COMMON_METRICS = [ + "braycurtis", + "canberra", + "chebyshev", + "cityblock", + "euclidean", + "minkowski", + "seuclidean", +] + + +def _get_metric_params_list(metric: str, n_features: int, seed: int = 1): + """Return list of dummy DistanceMetric kwargs for tests.""" + + # Distinguishing on cases not to compute unneeded datastructures. 
+ rng = np.random.RandomState(seed) + + if metric == "minkowski": + minkowski_kwargs = [ + dict(p=1.5), + dict(p=2), + dict(p=3), + dict(p=np.inf), + dict(p=3, w=rng.rand(n_features)), + ] + + return minkowski_kwargs + + if metric == "seuclidean": + return [dict(V=rng.rand(n_features))] + + # Case of: "euclidean", "manhattan", "chebyshev", "haversine" or any other metric. + # In those cases, no kwargs is needed. + return [{}] + + +def assert_same_distances_for_common_neighbors( + query_idx, + dist_row_a, + dist_row_b, + indices_row_a, + indices_row_b, + rtol, + atol, +): + """Check that the distances of common neighbors are equal up to tolerance. + + This does not check if there are missing neighbors in either result set. + Missingness is handled by assert_no_missing_neighbors. + """ + # Compute a mapping from indices to distances for each result set and + # check that the computed neighbors with matching indices are within + # the expected distance tolerance. + indices_to_dist_a = dict(zip(indices_row_a, dist_row_a)) + indices_to_dist_b = dict(zip(indices_row_b, dist_row_b)) + + common_indices = set(indices_row_a).intersection(set(indices_row_b)) + for idx in common_indices: + dist_a = indices_to_dist_a[idx] + dist_b = indices_to_dist_b[idx] + try: + assert_allclose(dist_a, dist_b, rtol=rtol, atol=atol) + except AssertionError as e: + # Wrap exception to provide more context while also including + # the original exception with the computed absolute and + # relative differences. + raise AssertionError( + f"Query vector with index {query_idx} lead to different distances" + f" for common neighbor with index {idx}:" + f" dist_a={dist_a} vs dist_b={dist_b} (with atol={atol} and" + f" rtol={rtol})" + ) from e + + +def assert_no_missing_neighbors( + query_idx, + dist_row_a, + dist_row_b, + indices_row_a, + indices_row_b, + threshold, +): + """Compare the indices of neighbors in two results sets. + + Any neighbor index with a distance below the precision threshold should + match one in the other result set. We ignore the last few neighbors beyond + the threshold as those can typically be missing due to rounding errors. + + For radius queries, the threshold is just the radius minus the expected + precision level. + + For k-NN queries, it is the maximum distance to the k-th neighbor minus the + expected precision level. + """ + mask_a = dist_row_a < threshold + mask_b = dist_row_b < threshold + missing_from_b = np.setdiff1d(indices_row_a[mask_a], indices_row_b) + missing_from_a = np.setdiff1d(indices_row_b[mask_b], indices_row_a) + if len(missing_from_a) > 0 or len(missing_from_b) > 0: + raise AssertionError( + f"Query vector with index {query_idx} lead to mismatched result indices:\n" + f"neighbors in b missing from a: {missing_from_a}\n" + f"neighbors in a missing from b: {missing_from_b}\n" + f"dist_row_a={dist_row_a}\n" + f"dist_row_b={dist_row_b}\n" + f"indices_row_a={indices_row_a}\n" + f"indices_row_b={indices_row_b}\n" + ) + + +def assert_compatible_argkmin_results( + neighbors_dists_a, + neighbors_dists_b, + neighbors_indices_a, + neighbors_indices_b, + rtol=1e-5, + atol=1e-6, +): + """Assert that argkmin results are valid up to rounding errors. + + This function asserts that the results of argkmin queries are valid up to: + - rounding error tolerance on distance values; + - permutations of indices for distances values that differ up to the + expected precision level. + + Furthermore, the distances must be sorted. 
+ + To be used for testing neighbors queries on float32 datasets: we accept + neighbors rank swaps only if they are caused by small rounding errors on + the distance computations. + """ + is_sorted = lambda a: np.all(a[:-1] <= a[1:]) + + assert ( + neighbors_dists_a.shape + == neighbors_dists_b.shape + == neighbors_indices_a.shape + == neighbors_indices_b.shape + ), "Arrays of results have incompatible shapes." + + n_queries, _ = neighbors_dists_a.shape + + # Asserting equality results one row at a time + for query_idx in range(n_queries): + dist_row_a = neighbors_dists_a[query_idx] + dist_row_b = neighbors_dists_b[query_idx] + indices_row_a = neighbors_indices_a[query_idx] + indices_row_b = neighbors_indices_b[query_idx] + + assert is_sorted(dist_row_a), f"Distances aren't sorted on row {query_idx}" + assert is_sorted(dist_row_b), f"Distances aren't sorted on row {query_idx}" + + assert_same_distances_for_common_neighbors( + query_idx, + dist_row_a, + dist_row_b, + indices_row_a, + indices_row_b, + rtol, + atol, + ) + + # Check that any neighbor with distances below the rounding error + # threshold have matching indices. The threshold is the distance to the + # k-th neighbors minus the expected precision level: + # + # (1 - rtol) * dist_k - atol + # + # Where dist_k is defined as the maximum distance to the kth-neighbor + # among the two result sets. This way of defining the threshold is + # stricter than taking the minimum of the two. + threshold = (1 - rtol) * np.maximum( + np.max(dist_row_a), np.max(dist_row_b) + ) - atol + assert_no_missing_neighbors( + query_idx, + dist_row_a, + dist_row_b, + indices_row_a, + indices_row_b, + threshold, + ) + + +def _non_trivial_radius( + *, + X=None, + Y=None, + metric=None, + precomputed_dists=None, + expected_n_neighbors=10, + n_subsampled_queries=10, + **metric_kwargs, +): + # Find a non-trivial radius using a small subsample of the pairwise + # distances between X and Y: we want to return around expected_n_neighbors + # on average. Yielding too many results would make the test slow (because + # checking the results is expensive for large result sets), yielding 0 most + # of the time would make the test useless. + assert precomputed_dists is not None or metric is not None, ( + "Either metric or precomputed_dists must be provided." + ) + + if precomputed_dists is None: + assert X is not None + assert Y is not None + sampled_dists = pairwise_distances(X, Y, metric=metric, **metric_kwargs) + else: + sampled_dists = precomputed_dists[:n_subsampled_queries].copy() + sampled_dists.sort(axis=1) + return sampled_dists[:, expected_n_neighbors].mean() + + +def assert_compatible_radius_results( + neighbors_dists_a, + neighbors_dists_b, + neighbors_indices_a, + neighbors_indices_b, + radius, + check_sorted=True, + rtol=1e-5, + atol=1e-6, +): + """Assert that radius neighborhood results are valid up to: + + - relative and absolute tolerance on computed distance values + - permutations of indices for distances values that differ up to + a precision level + - missing or extra last elements if their distance is + close to the radius + + To be used for testing neighbors queries on float32 datasets: we + accept neighbors rank swaps only if they are caused by small + rounding errors on the distance computations. + + Input arrays must be sorted w.r.t distances. 
+ """ + is_sorted = lambda a: np.all(a[:-1] <= a[1:]) + + assert ( + len(neighbors_dists_a) + == len(neighbors_dists_b) + == len(neighbors_indices_a) + == len(neighbors_indices_b) + ) + + n_queries = len(neighbors_dists_a) + + # Asserting equality of results one vector at a time + for query_idx in range(n_queries): + dist_row_a = neighbors_dists_a[query_idx] + dist_row_b = neighbors_dists_b[query_idx] + indices_row_a = neighbors_indices_a[query_idx] + indices_row_b = neighbors_indices_b[query_idx] + + if check_sorted: + assert is_sorted(dist_row_a), f"Distances aren't sorted on row {query_idx}" + assert is_sorted(dist_row_b), f"Distances aren't sorted on row {query_idx}" + + assert len(dist_row_a) == len(indices_row_a) + assert len(dist_row_b) == len(indices_row_b) + + # Check that all distances are within the requested radius + if len(dist_row_a) > 0: + max_dist_a = np.max(dist_row_a) + assert max_dist_a <= radius, ( + f"Largest returned distance {max_dist_a} not within requested" + f" radius {radius} on row {query_idx}" + ) + if len(dist_row_b) > 0: + max_dist_b = np.max(dist_row_b) + assert max_dist_b <= radius, ( + f"Largest returned distance {max_dist_b} not within requested" + f" radius {radius} on row {query_idx}" + ) + + assert_same_distances_for_common_neighbors( + query_idx, + dist_row_a, + dist_row_b, + indices_row_a, + indices_row_b, + rtol, + atol, + ) + + threshold = (1 - rtol) * radius - atol + assert_no_missing_neighbors( + query_idx, + dist_row_a, + dist_row_b, + indices_row_a, + indices_row_b, + threshold, + ) + + +FLOAT32_TOLS = { + "atol": 1e-7, + "rtol": 1e-5, +} +FLOAT64_TOLS = { + "atol": 1e-9, + "rtol": 1e-7, +} +ASSERT_RESULT = { + (ArgKmin, np.float64): partial(assert_compatible_argkmin_results, **FLOAT64_TOLS), + (ArgKmin, np.float32): partial(assert_compatible_argkmin_results, **FLOAT32_TOLS), + ( + RadiusNeighbors, + np.float64, + ): partial(assert_compatible_radius_results, **FLOAT64_TOLS), + ( + RadiusNeighbors, + np.float32, + ): partial(assert_compatible_radius_results, **FLOAT32_TOLS), +} + + +def test_assert_compatible_argkmin_results(): + atol = 1e-7 + rtol = 0.0 + tols = dict(atol=atol, rtol=rtol) + + eps = atol / 3 + _1m = 1.0 - eps + _1p = 1.0 + eps + + _6_1m = 6.1 - eps + _6_1p = 6.1 + eps + + ref_dist = np.array( + [ + [1.2, 2.5, _6_1m, 6.1, _6_1p], + [_1m, _1m, 1, _1p, _1p], + ] + ) + ref_indices = np.array( + [ + [1, 2, 3, 4, 5], + [6, 7, 8, 9, 10], + ] + ) + + # Sanity check: compare the reference results to themselves. + assert_compatible_argkmin_results( + ref_dist, ref_dist, ref_indices, ref_indices, rtol + ) + + # Apply valid permutation on indices: the last 3 points are all very close + # to one another so we accept any permutation on their rankings. + assert_compatible_argkmin_results( + np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]), + np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]), + np.array([[1, 2, 3, 4, 5]]), + np.array([[1, 2, 5, 4, 3]]), + **tols, + ) + + # The last few indices do not necessarily have to match because of the rounding + # errors on the distances: there could be tied results at the boundary. + assert_compatible_argkmin_results( + np.array([[1.2, 2.5, 3.0, 6.1, _6_1p]]), + np.array([[1.2, 2.5, 3.0, _6_1m, 6.1]]), + np.array([[1, 2, 3, 4, 5]]), + np.array([[1, 2, 3, 6, 7]]), + **tols, + ) + + # All points have close distances so any ranking permutation + # is valid for this query result. 
+ assert_compatible_argkmin_results( + np.array([[_1m, 1, _1p, _1p, _1p]]), + np.array([[1, 1, 1, 1, _1p]]), + np.array([[7, 6, 8, 10, 9]]), + np.array([[6, 9, 7, 8, 10]]), + **tols, + ) + + # They could also be nearly truncation of very large nearly tied result + # sets hence all indices can also be distinct in this case: + assert_compatible_argkmin_results( + np.array([[_1m, 1, _1p, _1p, _1p]]), + np.array([[_1m, 1, 1, 1, _1p]]), + np.array([[34, 30, 8, 12, 24]]), + np.array([[42, 1, 21, 13, 3]]), + **tols, + ) + + # Apply invalid permutation on indices: permuting the ranks of the 2 + # nearest neighbors is invalid because the distance values are too + # different. + msg = re.escape( + "Query vector with index 0 lead to different distances for common neighbor with" + " index 1: dist_a=1.2 vs dist_b=2.5" + ) + with pytest.raises(AssertionError, match=msg): + assert_compatible_argkmin_results( + np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]), + np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]), + np.array([[1, 2, 3, 4, 5]]), + np.array([[2, 1, 3, 4, 5]]), + **tols, + ) + + # Detect missing indices within the expected precision level, even when the + # distances match exactly. + msg = re.escape( + "neighbors in b missing from a: [12]\nneighbors in a missing from b: [1]" + ) + with pytest.raises(AssertionError, match=msg): + assert_compatible_argkmin_results( + np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]), + np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]), + np.array([[1, 2, 3, 4, 5]]), + np.array([[12, 2, 4, 11, 3]]), + **tols, + ) + + # Detect missing indices outside the expected precision level. + msg = re.escape( + "neighbors in b missing from a: []\nneighbors in a missing from b: [3]" + ) + with pytest.raises(AssertionError, match=msg): + assert_compatible_argkmin_results( + np.array([[_1m, 1.0, _6_1m, 6.1, _6_1p]]), + np.array([[1.0, 1.0, _6_1m, 6.1, 7]]), + np.array([[1, 2, 3, 4, 5]]), + np.array([[2, 1, 4, 5, 12]]), + **tols, + ) + + # Detect missing indices outside the expected precision level, in the other + # direction: + msg = re.escape( + "neighbors in b missing from a: [5]\nneighbors in a missing from b: []" + ) + with pytest.raises(AssertionError, match=msg): + assert_compatible_argkmin_results( + np.array([[_1m, 1.0, _6_1m, 6.1, 7]]), + np.array([[1.0, 1.0, _6_1m, 6.1, _6_1p]]), + np.array([[1, 2, 3, 4, 12]]), + np.array([[2, 1, 5, 3, 4]]), + **tols, + ) + + # Distances aren't properly sorted + msg = "Distances aren't sorted on row 0" + with pytest.raises(AssertionError, match=msg): + assert_compatible_argkmin_results( + np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]), + np.array([[2.5, 1.2, _6_1m, 6.1, _6_1p]]), + np.array([[1, 2, 3, 4, 5]]), + np.array([[2, 1, 4, 5, 3]]), + **tols, + ) + + +@pytest.mark.parametrize("check_sorted", [True, False]) +def test_assert_compatible_radius_results(check_sorted): + atol = 1e-7 + rtol = 0.0 + tols = dict(atol=atol, rtol=rtol) + + eps = atol / 3 + _1m = 1.0 - eps + _1p = 1.0 + eps + _6_1m = 6.1 - eps + _6_1p = 6.1 + eps + + ref_dist = [ + np.array([1.2, 2.5, _6_1m, 6.1, _6_1p]), + np.array([_1m, 1, _1p, _1p]), + ] + + ref_indices = [ + np.array([1, 2, 3, 4, 5]), + np.array([6, 7, 8, 9]), + ] + + # Sanity check: compare the reference results to themselves. 
+ assert_compatible_radius_results( + ref_dist, + ref_dist, + ref_indices, + ref_indices, + radius=7.0, + check_sorted=check_sorted, + **tols, + ) + + # Apply valid permutation on indices + assert_compatible_radius_results( + np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]), + np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]), + np.array([np.array([1, 2, 3, 4, 5])]), + np.array([np.array([1, 2, 4, 5, 3])]), + radius=7.0, + check_sorted=check_sorted, + **tols, + ) + assert_compatible_radius_results( + np.array([np.array([_1m, _1m, 1, _1p, _1p])]), + np.array([np.array([_1m, _1m, 1, _1p, _1p])]), + np.array([np.array([6, 7, 8, 9, 10])]), + np.array([np.array([6, 9, 7, 8, 10])]), + radius=7.0, + check_sorted=check_sorted, + **tols, + ) + + # Apply invalid permutation on indices + msg = re.escape( + "Query vector with index 0 lead to different distances for common neighbor with" + " index 1: dist_a=1.2 vs dist_b=2.5" + ) + with pytest.raises(AssertionError, match=msg): + assert_compatible_radius_results( + np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]), + np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]), + np.array([np.array([1, 2, 3, 4, 5])]), + np.array([np.array([2, 1, 3, 4, 5])]), + radius=7.0, + check_sorted=check_sorted, + **tols, + ) + + # Having extra last or missing elements is valid if they are in the + # tolerated rounding error range: [(1 - rtol) * radius - atol, radius] + assert_compatible_radius_results( + np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p, _6_1p])]), + np.array([np.array([1.2, 2.5, _6_1m, 6.1])]), + np.array([np.array([1, 2, 3, 4, 5, 7])]), + np.array([np.array([1, 2, 3, 6])]), + radius=_6_1p, + check_sorted=check_sorted, + **tols, + ) + + # Any discrepancy outside the tolerated rounding error range is invalid and + # indicates a missing neighbor in one of the result sets. 
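# A worked instance (using the same atol/rtol as this test) of the tolerated
# band at the radius boundary: with rtol=0 and atol=1e-7 the threshold is
# (1 - rtol) * radius - atol, so only neighbors whose distance falls inside the
# tiny interval [threshold, radius] may legitimately differ between result sets.
_radius, _rtol, _atol = 6.1, 0.0, 1e-7
_threshold = (1 - _rtol) * _radius - _atol
assert 0 < _radius - _threshold <= 1.5e-7  # the band is roughly 1e-7 wide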
+ msg = re.escape( + "Query vector with index 0 lead to mismatched result indices:\nneighbors in b" + " missing from a: []\nneighbors in a missing from b: [3]" + ) + with pytest.raises(AssertionError, match=msg): + assert_compatible_radius_results( + np.array([np.array([1.2, 2.5, 6])]), + np.array([np.array([1.2, 2.5])]), + np.array([np.array([1, 2, 3])]), + np.array([np.array([1, 2])]), + radius=6.1, + check_sorted=check_sorted, + **tols, + ) + msg = re.escape( + "Query vector with index 0 lead to mismatched result indices:\nneighbors in b" + " missing from a: [4]\nneighbors in a missing from b: [2]" + ) + with pytest.raises(AssertionError, match=msg): + assert_compatible_radius_results( + np.array([np.array([1.2, 2.1, 2.5])]), + np.array([np.array([1.2, 2, 2.5])]), + np.array([np.array([1, 2, 3])]), + np.array([np.array([1, 4, 3])]), + radius=6.1, + check_sorted=check_sorted, + **tols, + ) + + # Radius upper bound is strictly checked + msg = re.escape( + "Largest returned distance 6.100000033333333 not within requested radius 6.1 on" + " row 0" + ) + with pytest.raises(AssertionError, match=msg): + assert_compatible_radius_results( + np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]), + np.array([np.array([1.2, 2.5, _6_1m, 6.1, 6.1])]), + np.array([np.array([1, 2, 3, 4, 5])]), + np.array([np.array([2, 1, 4, 5, 3])]), + radius=6.1, + check_sorted=check_sorted, + **tols, + ) + with pytest.raises(AssertionError, match=msg): + assert_compatible_radius_results( + np.array([np.array([1.2, 2.5, _6_1m, 6.1, 6.1])]), + np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]), + np.array([np.array([1, 2, 3, 4, 5])]), + np.array([np.array([2, 1, 4, 5, 3])]), + radius=6.1, + check_sorted=check_sorted, + **tols, + ) + + if check_sorted: + # Distances aren't properly sorted + msg = "Distances aren't sorted on row 0" + with pytest.raises(AssertionError, match=msg): + assert_compatible_radius_results( + np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]), + np.array([np.array([2.5, 1.2, _6_1m, 6.1, _6_1p])]), + np.array([np.array([1, 2, 3, 4, 5])]), + np.array([np.array([2, 1, 4, 5, 3])]), + radius=_6_1p, + check_sorted=True, + **tols, + ) + else: + assert_compatible_radius_results( + np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]), + np.array([np.array([2.5, 1.2, _6_1m, 6.1, _6_1p])]), + np.array([np.array([1, 2, 3, 4, 5])]), + np.array([np.array([2, 1, 4, 5, 3])]), + radius=_6_1p, + check_sorted=False, + **tols, + ) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_pairwise_distances_reduction_is_usable_for(csr_container): + rng = np.random.RandomState(0) + X = rng.rand(100, 10) + Y = rng.rand(100, 10) + X_csr = csr_container(X) + Y_csr = csr_container(Y) + metric = "manhattan" + + # Must be usable for all possible pair of {dense, sparse} datasets + assert BaseDistancesReductionDispatcher.is_usable_for(X, Y, metric) + assert BaseDistancesReductionDispatcher.is_usable_for(X_csr, Y_csr, metric) + assert BaseDistancesReductionDispatcher.is_usable_for(X_csr, Y, metric) + assert BaseDistancesReductionDispatcher.is_usable_for(X, Y_csr, metric) + + assert BaseDistancesReductionDispatcher.is_usable_for( + X.astype(np.float64), Y.astype(np.float64), metric + ) + + assert BaseDistancesReductionDispatcher.is_usable_for( + X.astype(np.float32), Y.astype(np.float32), metric + ) + + assert not BaseDistancesReductionDispatcher.is_usable_for( + X.astype(np.int64), Y.astype(np.int64), metric + ) + + assert not BaseDistancesReductionDispatcher.is_usable_for(X, Y, metric="pyfunc") + assert not 
BaseDistancesReductionDispatcher.is_usable_for( + X.astype(np.float32), Y, metric + ) + assert not BaseDistancesReductionDispatcher.is_usable_for( + X, Y.astype(np.int32), metric + ) + + # F-ordered arrays are not supported + assert not BaseDistancesReductionDispatcher.is_usable_for( + np.asfortranarray(X), Y, metric + ) + + assert BaseDistancesReductionDispatcher.is_usable_for(X_csr, Y, metric="euclidean") + assert BaseDistancesReductionDispatcher.is_usable_for( + X, Y_csr, metric="sqeuclidean" + ) + + # FIXME: the current Cython implementation is too slow for a large number of + # features. We temporarily disable it to fallback on SciPy's implementation. + # See: https://github.com/scikit-learn/scikit-learn/issues/28191 + assert not BaseDistancesReductionDispatcher.is_usable_for( + X_csr, Y_csr, metric="sqeuclidean" + ) + assert not BaseDistancesReductionDispatcher.is_usable_for( + X_csr, Y_csr, metric="euclidean" + ) + + # CSR matrices without non-zeros elements aren't currently supported + # TODO: support CSR matrices without non-zeros elements + X_csr_0_nnz = csr_container(X * 0) + assert not BaseDistancesReductionDispatcher.is_usable_for(X_csr_0_nnz, Y, metric) + + # CSR matrices with int64 indices and indptr (e.g. large nnz, or large n_features) + # aren't supported as of now. + # See: https://github.com/scikit-learn/scikit-learn/issues/23653 + # TODO: support CSR matrices with int64 indices and indptr + X_csr_int64 = csr_container(X) + X_csr_int64.indices = X_csr_int64.indices.astype(np.int64) + assert not BaseDistancesReductionDispatcher.is_usable_for(X_csr_int64, Y, metric) + + +def test_argkmin_factory_method_wrong_usages(): + rng = np.random.RandomState(1) + X = rng.rand(100, 10) + Y = rng.rand(100, 10) + k = 5 + metric = "euclidean" + + msg = ( + "Only float64 or float32 datasets pairs are supported at this time, " + "got: X.dtype=float32 and Y.dtype=float64" + ) + with pytest.raises(ValueError, match=msg): + ArgKmin.compute(X=X.astype(np.float32), Y=Y, k=k, metric=metric) + + msg = ( + "Only float64 or float32 datasets pairs are supported at this time, " + "got: X.dtype=float64 and Y.dtype=int32" + ) + with pytest.raises(ValueError, match=msg): + ArgKmin.compute(X=X, Y=Y.astype(np.int32), k=k, metric=metric) + + with pytest.raises(ValueError, match="k == -1, must be >= 1."): + ArgKmin.compute(X=X, Y=Y, k=-1, metric=metric) + + with pytest.raises(ValueError, match="k == 0, must be >= 1."): + ArgKmin.compute(X=X, Y=Y, k=0, metric=metric) + + with pytest.raises(ValueError, match="Unrecognized metric"): + ArgKmin.compute(X=X, Y=Y, k=k, metric="wrong metric") + + with pytest.raises( + ValueError, match=r"Buffer has wrong number of dimensions \(expected 2, got 1\)" + ): + ArgKmin.compute(X=np.array([1.0, 2.0]), Y=Y, k=k, metric=metric) + + with pytest.raises(ValueError, match="ndarray is not C-contiguous"): + ArgKmin.compute(X=np.asfortranarray(X), Y=Y, k=k, metric=metric) + + # A UserWarning must be raised in this case. + unused_metric_kwargs = {"p": 3} + + message = r"Some metric_kwargs have been passed \({'p': 3}\) but" + + with pytest.warns(UserWarning, match=message): + ArgKmin.compute( + X=X, Y=Y, k=k, metric=metric, metric_kwargs=unused_metric_kwargs + ) + + # A UserWarning must be raised in this case. 
+ metric_kwargs = { + "p": 3, # unused + "Y_norm_squared": sqeuclidean_row_norms(Y, num_threads=2), + } + + message = r"Some metric_kwargs have been passed \({'p': 3, 'Y_norm_squared'" + + with pytest.warns(UserWarning, match=message): + ArgKmin.compute(X=X, Y=Y, k=k, metric=metric, metric_kwargs=metric_kwargs) + + # No user warning must be raised in this case. + metric_kwargs = { + "X_norm_squared": sqeuclidean_row_norms(X, num_threads=2), + } + with warnings.catch_warnings(): + warnings.simplefilter("error", category=UserWarning) + ArgKmin.compute(X=X, Y=Y, k=k, metric=metric, metric_kwargs=metric_kwargs) + + # No user warning must be raised in this case. + metric_kwargs = { + "X_norm_squared": sqeuclidean_row_norms(X, num_threads=2), + "Y_norm_squared": sqeuclidean_row_norms(Y, num_threads=2), + } + with warnings.catch_warnings(): + warnings.simplefilter("error", category=UserWarning) + ArgKmin.compute(X=X, Y=Y, k=k, metric=metric, metric_kwargs=metric_kwargs) + + +def test_argkmin_classmode_factory_method_wrong_usages(): + rng = np.random.RandomState(1) + X = rng.rand(100, 10) + Y = rng.rand(100, 10) + k = 5 + metric = "manhattan" + + weights = "uniform" + Y_labels = rng.randint(low=0, high=10, size=100) + unique_Y_labels = np.unique(Y_labels) + + msg = ( + "Only float64 or float32 datasets pairs are supported at this time, " + "got: X.dtype=float32 and Y.dtype=float64" + ) + with pytest.raises(ValueError, match=msg): + ArgKminClassMode.compute( + X=X.astype(np.float32), + Y=Y, + k=k, + metric=metric, + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + ) + + msg = ( + "Only float64 or float32 datasets pairs are supported at this time, " + "got: X.dtype=float64 and Y.dtype=int32" + ) + with pytest.raises(ValueError, match=msg): + ArgKminClassMode.compute( + X=X, + Y=Y.astype(np.int32), + k=k, + metric=metric, + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + ) + + with pytest.raises(ValueError, match="k == -1, must be >= 1."): + ArgKminClassMode.compute( + X=X, + Y=Y, + k=-1, + metric=metric, + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + ) + + with pytest.raises(ValueError, match="k == 0, must be >= 1."): + ArgKminClassMode.compute( + X=X, + Y=Y, + k=0, + metric=metric, + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + ) + + with pytest.raises(ValueError, match="Unrecognized metric"): + ArgKminClassMode.compute( + X=X, + Y=Y, + k=k, + metric="wrong metric", + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + ) + + with pytest.raises( + ValueError, match=r"Buffer has wrong number of dimensions \(expected 2, got 1\)" + ): + ArgKminClassMode.compute( + X=np.array([1.0, 2.0]), + Y=Y, + k=k, + metric=metric, + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + ) + + with pytest.raises(ValueError, match="ndarray is not C-contiguous"): + ArgKminClassMode.compute( + X=np.asfortranarray(X), + Y=Y, + k=k, + metric=metric, + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + ) + + non_existent_weights_strategy = "non_existent_weights_strategy" + message = ( + "Only the 'uniform' or 'distance' weights options are supported at this time. " + f"Got: weights='{non_existent_weights_strategy}'." 
+ ) + with pytest.raises(ValueError, match=message): + ArgKminClassMode.compute( + X=X, + Y=Y, + k=k, + metric=metric, + weights=non_existent_weights_strategy, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + ) + + # TODO: introduce assertions on UserWarnings once the Euclidean specialisation + # of ArgKminClassMode is supported. + + +def test_radius_neighbors_factory_method_wrong_usages(): + rng = np.random.RandomState(1) + X = rng.rand(100, 10) + Y = rng.rand(100, 10) + radius = 5 + metric = "euclidean" + + msg = ( + "Only float64 or float32 datasets pairs are supported at this time, " + "got: X.dtype=float32 and Y.dtype=float64" + ) + with pytest.raises( + ValueError, + match=msg, + ): + RadiusNeighbors.compute( + X=X.astype(np.float32), Y=Y, radius=radius, metric=metric + ) + + msg = ( + "Only float64 or float32 datasets pairs are supported at this time, " + "got: X.dtype=float64 and Y.dtype=int32" + ) + with pytest.raises( + ValueError, + match=msg, + ): + RadiusNeighbors.compute(X=X, Y=Y.astype(np.int32), radius=radius, metric=metric) + + with pytest.raises(ValueError, match="radius == -1.0, must be >= 0."): + RadiusNeighbors.compute(X=X, Y=Y, radius=-1, metric=metric) + + with pytest.raises(ValueError, match="Unrecognized metric"): + RadiusNeighbors.compute(X=X, Y=Y, radius=radius, metric="wrong metric") + + with pytest.raises( + ValueError, match=r"Buffer has wrong number of dimensions \(expected 2, got 1\)" + ): + RadiusNeighbors.compute( + X=np.array([1.0, 2.0]), Y=Y, radius=radius, metric=metric + ) + + with pytest.raises(ValueError, match="ndarray is not C-contiguous"): + RadiusNeighbors.compute( + X=np.asfortranarray(X), Y=Y, radius=radius, metric=metric + ) + + unused_metric_kwargs = {"p": 3} + + # A UserWarning must be raised in this case. + message = r"Some metric_kwargs have been passed \({'p': 3}\) but" + + with pytest.warns(UserWarning, match=message): + RadiusNeighbors.compute( + X=X, Y=Y, radius=radius, metric=metric, metric_kwargs=unused_metric_kwargs + ) + + # A UserWarning must be raised in this case. + metric_kwargs = { + "p": 3, # unused + "Y_norm_squared": sqeuclidean_row_norms(Y, num_threads=2), + } + + message = r"Some metric_kwargs have been passed \({'p': 3, 'Y_norm_squared'" + + with pytest.warns(UserWarning, match=message): + RadiusNeighbors.compute( + X=X, Y=Y, radius=radius, metric=metric, metric_kwargs=metric_kwargs + ) + + # No user warning must be raised in this case. + metric_kwargs = { + "X_norm_squared": sqeuclidean_row_norms(X, num_threads=2), + "Y_norm_squared": sqeuclidean_row_norms(Y, num_threads=2), + } + with warnings.catch_warnings(): + warnings.simplefilter("error", category=UserWarning) + RadiusNeighbors.compute( + X=X, Y=Y, radius=radius, metric=metric, metric_kwargs=metric_kwargs + ) + + # No user warning must be raised in this case. 
+ metric_kwargs = { + "X_norm_squared": sqeuclidean_row_norms(X, num_threads=2), + } + with warnings.catch_warnings(): + warnings.simplefilter("error", category=UserWarning) + RadiusNeighbors.compute( + X=X, Y=Y, radius=radius, metric=metric, metric_kwargs=metric_kwargs + ) + + +def test_radius_neighbors_classmode_factory_method_wrong_usages(): + rng = np.random.RandomState(1) + X = rng.rand(100, 10) + Y = rng.rand(100, 10) + radius = 5 + metric = "manhattan" + weights = "uniform" + Y_labels = rng.randint(low=0, high=10, size=100) + unique_Y_labels = np.unique(Y_labels) + + msg = ( + "Only float64 or float32 datasets pairs are supported at this time, " + "got: X.dtype=float32 and Y.dtype=float64" + ) + with pytest.raises(ValueError, match=msg): + RadiusNeighborsClassMode.compute( + X=X.astype(np.float32), + Y=Y, + radius=radius, + metric=metric, + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + outlier_label=None, + ) + + msg = ( + "Only float64 or float32 datasets pairs are supported at this time, " + "got: X.dtype=float64 and Y.dtype=int32" + ) + with pytest.raises(ValueError, match=msg): + RadiusNeighborsClassMode.compute( + X=X, + Y=Y.astype(np.int32), + radius=radius, + metric=metric, + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + outlier_label=None, + ) + + with pytest.raises(ValueError, match="radius == -1.0, must be >= 0."): + RadiusNeighborsClassMode.compute( + X=X, + Y=Y, + radius=-1, + metric=metric, + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + outlier_label=None, + ) + + with pytest.raises(ValueError, match="Unrecognized metric"): + RadiusNeighborsClassMode.compute( + X=X, + Y=Y, + radius=-1, + metric="wrong_metric", + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + outlier_label=None, + ) + + with pytest.raises( + ValueError, match=r"Buffer has wrong number of dimensions \(expected 2, got 1\)" + ): + RadiusNeighborsClassMode.compute( + X=np.array([1.0, 2.0]), + Y=Y, + radius=radius, + metric=metric, + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + outlier_label=None, + ) + + with pytest.raises(ValueError, match="ndarray is not C-contiguous"): + RadiusNeighborsClassMode.compute( + X=np.asfortranarray(X), + Y=Y, + radius=radius, + metric=metric, + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + outlier_label=None, + ) + + non_existent_weights_strategy = "non_existent_weights_strategy" + msg = ( + "Only the 'uniform' or 'distance' weights options are supported at this time. " + f"Got: weights='{non_existent_weights_strategy}'." 
+ ) + with pytest.raises(ValueError, match=msg): + RadiusNeighborsClassMode.compute( + X=X, + Y=Y, + radius=radius, + metric="wrong_metric", + weights=non_existent_weights_strategy, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + outlier_label=None, + ) + + +@pytest.mark.parametrize("Dispatcher", [ArgKmin, RadiusNeighbors]) +@pytest.mark.parametrize("dtype", [np.float64, np.float32]) +def test_chunk_size_agnosticism( + global_random_seed, + Dispatcher, + dtype, + n_features=100, +): + """Check that results do not depend on the chunk size.""" + rng = np.random.RandomState(global_random_seed) + spread = 100 + n_samples_X, n_samples_Y = rng.choice([97, 100, 101, 500], size=2, replace=False) + X = rng.rand(n_samples_X, n_features).astype(dtype) * spread + Y = rng.rand(n_samples_Y, n_features).astype(dtype) * spread + + if Dispatcher is ArgKmin: + parameter = 10 + check_parameters = {} + compute_parameters = {} + else: + radius = _non_trivial_radius(X=X, Y=Y, metric="euclidean") + parameter = radius + check_parameters = {"radius": radius} + compute_parameters = {"sort_results": True} + + ref_dist, ref_indices = Dispatcher.compute( + X, + Y, + parameter, + chunk_size=256, # default + metric="manhattan", + return_distance=True, + **compute_parameters, + ) + + dist, indices = Dispatcher.compute( + X, + Y, + parameter, + chunk_size=41, + metric="manhattan", + return_distance=True, + **compute_parameters, + ) + + ASSERT_RESULT[(Dispatcher, dtype)]( + ref_dist, dist, ref_indices, indices, **check_parameters + ) + + +@pytest.mark.parametrize("Dispatcher", [ArgKmin, RadiusNeighbors]) +@pytest.mark.parametrize("dtype", [np.float64, np.float32]) +def test_n_threads_agnosticism( + global_random_seed, + Dispatcher, + dtype, + n_features=100, +): + """Check that results do not depend on the number of threads.""" + rng = np.random.RandomState(global_random_seed) + n_samples_X, n_samples_Y = rng.choice([97, 100, 101, 500], size=2, replace=False) + spread = 100 + X = rng.rand(n_samples_X, n_features).astype(dtype) * spread + Y = rng.rand(n_samples_Y, n_features).astype(dtype) * spread + + if Dispatcher is ArgKmin: + parameter = 10 + check_parameters = {} + compute_parameters = {} + else: + radius = _non_trivial_radius(X=X, Y=Y, metric="euclidean") + parameter = radius + check_parameters = {"radius": radius} + compute_parameters = {"sort_results": True} + + ref_dist, ref_indices = Dispatcher.compute( + X, + Y, + parameter, + chunk_size=25, # make sure we use multiple threads + return_distance=True, + **compute_parameters, + ) + + with _get_threadpool_controller().limit(limits=1, user_api="openmp"): + dist, indices = Dispatcher.compute( + X, + Y, + parameter, + chunk_size=25, + return_distance=True, + **compute_parameters, + ) + + ASSERT_RESULT[(Dispatcher, dtype)]( + ref_dist, dist, ref_indices, indices, **check_parameters + ) + + +@pytest.mark.parametrize( + "Dispatcher, dtype", + [ + (ArgKmin, np.float64), + (RadiusNeighbors, np.float32), + (ArgKmin, np.float32), + (RadiusNeighbors, np.float64), + ], +) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_format_agnosticism( + global_random_seed, + Dispatcher, + dtype, + csr_container, +): + """Check that results do not depend on the format (dense, sparse) of the input.""" + rng = np.random.RandomState(global_random_seed) + spread = 100 + n_samples, n_features = 100, 100 + + X = rng.rand(n_samples, n_features).astype(dtype) * spread + Y = rng.rand(n_samples, n_features).astype(dtype) * spread + + X_csr = csr_container(X) + Y_csr = 
csr_container(Y) + + if Dispatcher is ArgKmin: + parameter = 10 + check_parameters = {} + compute_parameters = {} + else: + # Adjusting the radius to ensure that the expected results is neither + # trivially empty nor too large. + radius = _non_trivial_radius(X=X, Y=Y, metric="euclidean") + parameter = radius + check_parameters = {"radius": radius} + compute_parameters = {"sort_results": True} + + dist_dense, indices_dense = Dispatcher.compute( + X, + Y, + parameter, + chunk_size=50, + return_distance=True, + **compute_parameters, + ) + + for _X, _Y in itertools.product((X, X_csr), (Y, Y_csr)): + if _X is X and _Y is Y: + continue + dist, indices = Dispatcher.compute( + _X, + _Y, + parameter, + chunk_size=50, + return_distance=True, + **compute_parameters, + ) + ASSERT_RESULT[(Dispatcher, dtype)]( + dist_dense, + dist, + indices_dense, + indices, + **check_parameters, + ) + + +@pytest.mark.parametrize("Dispatcher", [ArgKmin, RadiusNeighbors]) +def test_strategies_consistency( + global_random_seed, + global_dtype, + Dispatcher, + n_features=10, +): + """Check that the results do not depend on the strategy used.""" + rng = np.random.RandomState(global_random_seed) + metric = rng.choice( + np.array( + [ + "euclidean", + "minkowski", + "manhattan", + "haversine", + ], + dtype=object, + ) + ) + n_samples_X, n_samples_Y = rng.choice([97, 100, 101, 500], size=2, replace=False) + spread = 100 + X = rng.rand(n_samples_X, n_features).astype(global_dtype) * spread + Y = rng.rand(n_samples_Y, n_features).astype(global_dtype) * spread + + # Haversine distance only accepts 2D data + if metric == "haversine": + X = np.ascontiguousarray(X[:, :2]) + Y = np.ascontiguousarray(Y[:, :2]) + + if Dispatcher is ArgKmin: + parameter = 10 + check_parameters = {} + compute_parameters = {} + else: + radius = _non_trivial_radius(X=X, Y=Y, metric=metric) + parameter = radius + check_parameters = {"radius": radius} + compute_parameters = {"sort_results": True} + + dist_par_X, indices_par_X = Dispatcher.compute( + X, + Y, + parameter, + metric=metric, + # Taking the first + metric_kwargs=_get_metric_params_list( + metric, n_features, seed=global_random_seed + )[0], + # To be sure to use parallelization + chunk_size=n_samples_X // 4, + strategy="parallel_on_X", + return_distance=True, + **compute_parameters, + ) + + dist_par_Y, indices_par_Y = Dispatcher.compute( + X, + Y, + parameter, + metric=metric, + # Taking the first + metric_kwargs=_get_metric_params_list( + metric, n_features, seed=global_random_seed + )[0], + # To be sure to use parallelization + chunk_size=n_samples_Y // 4, + strategy="parallel_on_Y", + return_distance=True, + **compute_parameters, + ) + + ASSERT_RESULT[(Dispatcher, global_dtype)]( + dist_par_X, dist_par_Y, indices_par_X, indices_par_Y, **check_parameters + ) + + +# "Concrete Dispatchers"-specific tests + + +@pytest.mark.parametrize("metric", CDIST_PAIRWISE_DISTANCES_REDUCTION_COMMON_METRICS) +@pytest.mark.parametrize("strategy", ("parallel_on_X", "parallel_on_Y")) +@pytest.mark.parametrize("dtype", [np.float64, np.float32]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_pairwise_distances_argkmin( + global_random_seed, + metric, + strategy, + dtype, + csr_container, + n_queries=5, + n_samples=100, + k=10, +): + rng = np.random.RandomState(global_random_seed) + n_features = rng.choice([50, 500]) + translation = rng.choice([0, 1e6]) + spread = 1000 + X = translation + rng.rand(n_queries, n_features).astype(dtype) * spread + Y = translation + rng.rand(n_samples, 
n_features).astype(dtype) * spread + + X_csr = csr_container(X) + Y_csr = csr_container(Y) + + # Haversine distance only accepts 2D data + if metric == "haversine": + X = np.ascontiguousarray(X[:, :2]) + Y = np.ascontiguousarray(Y[:, :2]) + + metric_kwargs = _get_metric_params_list(metric, n_features)[0] + + # Reference for argkmin results + if metric == "euclidean": + # Compare to scikit-learn GEMM optimized implementation + dist_matrix = euclidean_distances(X, Y) + else: + dist_matrix = cdist(X, Y, metric=metric, **metric_kwargs) + # Taking argkmin (indices of the k smallest values) + argkmin_indices_ref = np.argsort(dist_matrix, axis=1)[:, :k] + # Getting the associated distances + argkmin_distances_ref = np.zeros(argkmin_indices_ref.shape, dtype=np.float64) + for row_idx in range(argkmin_indices_ref.shape[0]): + argkmin_distances_ref[row_idx] = dist_matrix[ + row_idx, argkmin_indices_ref[row_idx] + ] + + for _X, _Y in itertools.product((X, X_csr), (Y, Y_csr)): + argkmin_distances, argkmin_indices = ArgKmin.compute( + _X, + _Y, + k, + metric=metric, + metric_kwargs=metric_kwargs, + return_distance=True, + # So as to have more than a chunk, forcing parallelism. + chunk_size=n_samples // 4, + strategy=strategy, + ) + + ASSERT_RESULT[(ArgKmin, dtype)]( + argkmin_distances, + argkmin_distances_ref, + argkmin_indices, + argkmin_indices_ref, + ) + + +@pytest.mark.parametrize("metric", CDIST_PAIRWISE_DISTANCES_REDUCTION_COMMON_METRICS) +@pytest.mark.parametrize("strategy", ("parallel_on_X", "parallel_on_Y")) +@pytest.mark.parametrize("dtype", [np.float64, np.float32]) +def test_pairwise_distances_radius_neighbors( + global_random_seed, + metric, + strategy, + dtype, + n_queries=5, + n_samples=100, +): + rng = np.random.RandomState(global_random_seed) + n_features = rng.choice([50, 500]) + translation = rng.choice([0, 1e6]) + spread = 1000 + X = translation + rng.rand(n_queries, n_features).astype(dtype) * spread + Y = translation + rng.rand(n_samples, n_features).astype(dtype) * spread + + metric_kwargs = _get_metric_params_list( + metric, n_features, seed=global_random_seed + )[0] + + # Reference for argkmin results + if metric == "euclidean": + # Compare to scikit-learn GEMM optimized implementation + dist_matrix = euclidean_distances(X, Y) + else: + dist_matrix = cdist(X, Y, metric=metric, **metric_kwargs) + + radius = _non_trivial_radius(precomputed_dists=dist_matrix) + + # Getting the neighbors for a given radius + neigh_indices_ref = [] + neigh_distances_ref = [] + + for row in dist_matrix: + ind = np.arange(row.shape[0])[row <= radius] + dist = row[ind] + + sort = np.argsort(dist) + ind, dist = ind[sort], dist[sort] + + neigh_indices_ref.append(ind) + neigh_distances_ref.append(dist) + + neigh_distances, neigh_indices = RadiusNeighbors.compute( + X, + Y, + radius, + metric=metric, + metric_kwargs=metric_kwargs, + return_distance=True, + # So as to have more than a chunk, forcing parallelism. 
+ chunk_size=n_samples // 4, + strategy=strategy, + sort_results=True, + ) + + ASSERT_RESULT[(RadiusNeighbors, dtype)]( + neigh_distances, neigh_distances_ref, neigh_indices, neigh_indices_ref, radius + ) + + +@pytest.mark.parametrize("Dispatcher", [ArgKmin, RadiusNeighbors]) +@pytest.mark.parametrize("metric", ["manhattan", "euclidean"]) +@pytest.mark.parametrize("dtype", [np.float64, np.float32]) +def test_memmap_backed_data( + metric, + Dispatcher, + dtype, +): + """Check that the results do not depend on the datasets writability.""" + rng = np.random.RandomState(0) + spread = 100 + n_samples, n_features = 128, 10 + X = rng.rand(n_samples, n_features).astype(dtype) * spread + Y = rng.rand(n_samples, n_features).astype(dtype) * spread + + # Create read only datasets + X_mm, Y_mm = create_memmap_backed_data([X, Y]) + + if Dispatcher is ArgKmin: + parameter = 10 + check_parameters = {} + compute_parameters = {} + else: + # Scaling the radius slightly with the numbers of dimensions + radius = 10 ** np.log(n_features) + parameter = radius + check_parameters = {"radius": radius} + compute_parameters = {"sort_results": True} + + ref_dist, ref_indices = Dispatcher.compute( + X, + Y, + parameter, + metric=metric, + return_distance=True, + **compute_parameters, + ) + + dist_mm, indices_mm = Dispatcher.compute( + X_mm, + Y_mm, + parameter, + metric=metric, + return_distance=True, + **compute_parameters, + ) + + ASSERT_RESULT[(Dispatcher, dtype)]( + ref_dist, dist_mm, ref_indices, indices_mm, **check_parameters + ) + + +@pytest.mark.parametrize("dtype", [np.float64, np.float32]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sqeuclidean_row_norms( + global_random_seed, + dtype, + csr_container, +): + rng = np.random.RandomState(global_random_seed) + spread = 100 + n_samples = rng.choice([97, 100, 101, 1000]) + n_features = rng.choice([5, 10, 100]) + num_threads = rng.choice([1, 2, 8]) + X = rng.rand(n_samples, n_features).astype(dtype) * spread + + X_csr = csr_container(X) + + sq_row_norm_reference = np.linalg.norm(X, axis=1) ** 2 + sq_row_norm = sqeuclidean_row_norms(X, num_threads=num_threads) + + sq_row_norm_csr = sqeuclidean_row_norms(X_csr, num_threads=num_threads) + + assert_allclose(sq_row_norm_reference, sq_row_norm) + assert_allclose(sq_row_norm_reference, sq_row_norm_csr) + + with pytest.raises(ValueError): + X = np.asfortranarray(X) + sqeuclidean_row_norms(X, num_threads=num_threads) + + +def test_argkmin_classmode_strategy_consistent(): + rng = np.random.RandomState(1) + X = rng.rand(100, 10) + Y = rng.rand(100, 10) + k = 5 + metric = "manhattan" + + weights = "uniform" + Y_labels = rng.randint(low=0, high=10, size=100) + unique_Y_labels = np.unique(Y_labels) + results_X = ArgKminClassMode.compute( + X=X, + Y=Y, + k=k, + metric=metric, + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + strategy="parallel_on_X", + ) + results_Y = ArgKminClassMode.compute( + X=X, + Y=Y, + k=k, + metric=metric, + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + strategy="parallel_on_Y", + ) + assert_array_equal(results_X, results_Y) + + +@pytest.mark.parametrize("outlier_label", [None, 0, 3, 6, 9]) +def test_radius_neighbors_classmode_strategy_consistent(outlier_label): + rng = np.random.RandomState(1) + X = rng.rand(100, 10) + Y = rng.rand(100, 10) + radius = 5 + metric = "manhattan" + + weights = "uniform" + Y_labels = rng.randint(low=0, high=10, size=100) + unique_Y_labels = np.unique(Y_labels) + results_X = 
RadiusNeighborsClassMode.compute( + X=X, + Y=Y, + radius=radius, + metric=metric, + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + outlier_label=outlier_label, + strategy="parallel_on_X", + ) + results_Y = RadiusNeighborsClassMode.compute( + X=X, + Y=Y, + radius=radius, + metric=metric, + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + outlier_label=outlier_label, + strategy="parallel_on_Y", + ) + assert_allclose(results_X, results_Y) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_ranking.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_ranking.py new file mode 100644 index 0000000000000000000000000000000000000000..7d740249f8aba4d5a87ecd2d6a16087557335d9a --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_ranking.py @@ -0,0 +1,2270 @@ +import math +import re + +import numpy as np +import pytest +from scipy import stats + +from sklearn import datasets, svm +from sklearn.datasets import make_multilabel_classification +from sklearn.exceptions import UndefinedMetricWarning +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import ( + accuracy_score, + auc, + average_precision_score, + coverage_error, + dcg_score, + det_curve, + label_ranking_average_precision_score, + label_ranking_loss, + ndcg_score, + precision_recall_curve, + roc_auc_score, + roc_curve, + top_k_accuracy_score, +) +from sklearn.metrics._ranking import _dcg_sample_scores, _ndcg_sample_scores +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import label_binarize +from sklearn.random_projection import _sparse_random_matrix +from sklearn.utils._testing import ( + _convert_container, + assert_allclose, + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) +from sklearn.utils.extmath import softmax +from sklearn.utils.fixes import CSR_CONTAINERS +from sklearn.utils.validation import ( + check_array, + check_consistent_length, + check_random_state, +) + +############################################################################### +# Utilities for testing + +CURVE_FUNCS = [ + det_curve, + precision_recall_curve, + roc_curve, +] + + +def make_prediction(dataset=None, binary=False): + """Make some classification predictions on a toy dataset using a SVC + + If binary is True restrict to a binary classification problem instead of a + multiclass classification problem + """ + + if dataset is None: + # import some data to play with + dataset = datasets.load_iris() + + X = dataset.data + y = dataset.target + + if binary: + # restrict to a binary classification task + X, y = X[y < 2], y[y < 2] + + n_samples, n_features = X.shape + p = np.arange(n_samples) + + rng = check_random_state(37) + rng.shuffle(p) + X, y = X[p], y[p] + half = int(n_samples / 2) + + # add noisy features to make the problem harder and avoid perfect results + rng = np.random.RandomState(0) + X = np.c_[X, rng.randn(n_samples, 200 * n_features)] + + # run classifier, get class probabilities and label predictions + clf = svm.SVC(kernel="linear", probability=True, random_state=0) + y_score = clf.fit(X[:half], y[:half]).predict_proba(X[half:]) + + if binary: + # only interested in probabilities of the positive case + # XXX: do we really want a special API for the binary case? 
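# A quick sketch (toy data, purely illustrative) of why column 1 is selected
# just below: predict_proba returns one column per entry of clf.classes_, so
# for a binary {0, 1} problem the positive-class probabilities sit in column 1.
import numpy as np
from sklearn.linear_model import LogisticRegression

_X_demo = np.array([[0.0], [1.0], [2.0], [3.0]])
_y_demo = np.array([0, 0, 1, 1])
_clf = LogisticRegression().fit(_X_demo, _y_demo)
assert _clf.predict_proba(_X_demo).shape == (4, 2)
assert list(_clf.classes_) == [0, 1]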
+ y_score = y_score[:, 1] + + y_pred = clf.predict(X[half:]) + y_true = y[half:] + return y_true, y_pred, y_score + + +############################################################################### +# Tests + + +def _auc(y_true, y_score): + """Alternative implementation to check for correctness of + `roc_auc_score`.""" + pos_label = np.unique(y_true)[1] + + # Count the number of times positive samples are correctly ranked above + # negative samples. + pos = y_score[y_true == pos_label] + neg = y_score[y_true != pos_label] + diff_matrix = pos.reshape(1, -1) - neg.reshape(-1, 1) + n_correct = np.sum(diff_matrix > 0) + + return n_correct / float(len(pos) * len(neg)) + + +def _average_precision(y_true, y_score): + """Alternative implementation to check for correctness of + `average_precision_score`. + + Note that this implementation fails on some edge cases. + For example, for constant predictions e.g. [0.5, 0.5, 0.5], + y_true = [1, 0, 0] returns an average precision of 0.33... + but y_true = [0, 0, 1] returns 1.0. + """ + pos_label = np.unique(y_true)[1] + n_pos = np.sum(y_true == pos_label) + order = np.argsort(y_score)[::-1] + y_score = y_score[order] + y_true = y_true[order] + + score = 0 + for i in range(len(y_score)): + if y_true[i] == pos_label: + # Compute precision up to document i + # i.e, percentage of relevant documents up to document i. + prec = 0 + for j in range(0, i + 1): + if y_true[j] == pos_label: + prec += 1.0 + prec /= i + 1.0 + score += prec + + return score / n_pos + + +def _average_precision_slow(y_true, y_score): + """A second alternative implementation of average precision that closely + follows the Wikipedia article's definition (see References). This should + give identical results as `average_precision_score` for all inputs. + + References + ---------- + .. [1] `Wikipedia entry for the Average precision + `_ + """ + precision, recall, threshold = precision_recall_curve(y_true, y_score) + precision = list(reversed(precision)) + recall = list(reversed(recall)) + average_precision = 0 + for i in range(1, len(precision)): + average_precision += precision[i] * (recall[i] - recall[i - 1]) + return average_precision + + +def _partial_roc_auc_score(y_true, y_predict, max_fpr): + """Alternative implementation to check for correctness of `roc_auc_score` + with `max_fpr` set. 
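+    The partial AUC is rescaled with formula (5) from McClish (1989) so that
+    a chance-level classifier maps to 0.5 and a perfect one to 1.0; for
+    example, with max_fpr=0.5 the chance-level area is 0.125 and the maximum
+    attainable area is 0.5.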
+ """ + + def _partial_roc(y_true, y_predict, max_fpr): + fpr, tpr, _ = roc_curve(y_true, y_predict) + new_fpr = fpr[fpr <= max_fpr] + new_fpr = np.append(new_fpr, max_fpr) + new_tpr = tpr[fpr <= max_fpr] + idx_out = np.argmax(fpr > max_fpr) + idx_in = idx_out - 1 + x_interp = [fpr[idx_in], fpr[idx_out]] + y_interp = [tpr[idx_in], tpr[idx_out]] + new_tpr = np.append(new_tpr, np.interp(max_fpr, x_interp, y_interp)) + return (new_fpr, new_tpr) + + new_fpr, new_tpr = _partial_roc(y_true, y_predict, max_fpr) + partial_auc = auc(new_fpr, new_tpr) + + # Formula (5) from McClish 1989 + fpr1 = 0 + fpr2 = max_fpr + min_area = 0.5 * (fpr2 - fpr1) * (fpr2 + fpr1) + max_area = fpr2 - fpr1 + return 0.5 * (1 + (partial_auc - min_area) / (max_area - min_area)) + + +@pytest.mark.parametrize("drop", [True, False]) +def test_roc_curve(drop): + # Test Area under Receiver Operating Characteristic (ROC) curve + y_true, _, y_score = make_prediction(binary=True) + expected_auc = _auc(y_true, y_score) + + fpr, tpr, thresholds = roc_curve(y_true, y_score, drop_intermediate=drop) + roc_auc = auc(fpr, tpr) + assert_array_almost_equal(roc_auc, expected_auc, decimal=2) + assert_almost_equal(roc_auc, roc_auc_score(y_true, y_score)) + assert fpr.shape == tpr.shape + assert fpr.shape == thresholds.shape + + +def test_roc_curve_end_points(): + # Make sure that roc_curve returns a curve start at 0 and ending and + # 1 even in corner cases + rng = np.random.RandomState(0) + y_true = np.array([0] * 50 + [1] * 50) + y_pred = rng.randint(3, size=100) + fpr, tpr, thr = roc_curve(y_true, y_pred, drop_intermediate=True) + assert fpr[0] == 0 + assert fpr[-1] == 1 + assert fpr.shape == tpr.shape + assert fpr.shape == thr.shape + + +def test_roc_returns_consistency(): + # Test whether the returned threshold matches up with tpr + # make small toy dataset + y_true, _, y_score = make_prediction(binary=True) + fpr, tpr, thresholds = roc_curve(y_true, y_score) + + # use the given thresholds to determine the tpr + tpr_correct = [] + for t in thresholds: + tp = np.sum((y_score >= t) & y_true) + p = np.sum(y_true) + tpr_correct.append(1.0 * tp / p) + + # compare tpr and tpr_correct to see if the thresholds' order was correct + assert_array_almost_equal(tpr, tpr_correct, decimal=2) + assert fpr.shape == tpr.shape + assert fpr.shape == thresholds.shape + + +def test_roc_curve_multi(): + # roc_curve not applicable for multi-class problems + y_true, _, y_score = make_prediction(binary=False) + + with pytest.raises(ValueError): + roc_curve(y_true, y_score) + + +def test_roc_curve_confidence(): + # roc_curve for confidence scores + y_true, _, y_score = make_prediction(binary=True) + + fpr, tpr, thresholds = roc_curve(y_true, y_score - 0.5) + roc_auc = auc(fpr, tpr) + assert_array_almost_equal(roc_auc, 0.90, decimal=2) + assert fpr.shape == tpr.shape + assert fpr.shape == thresholds.shape + + +def test_roc_curve_hard(): + # roc_curve for hard decisions + y_true, pred, y_score = make_prediction(binary=True) + + # always predict one + trivial_pred = np.ones(y_true.shape) + fpr, tpr, thresholds = roc_curve(y_true, trivial_pred) + roc_auc = auc(fpr, tpr) + assert_array_almost_equal(roc_auc, 0.50, decimal=2) + assert fpr.shape == tpr.shape + assert fpr.shape == thresholds.shape + + # always predict zero + trivial_pred = np.zeros(y_true.shape) + fpr, tpr, thresholds = roc_curve(y_true, trivial_pred) + roc_auc = auc(fpr, tpr) + assert_array_almost_equal(roc_auc, 0.50, decimal=2) + assert fpr.shape == tpr.shape + assert fpr.shape == thresholds.shape + + # 
hard decisions + fpr, tpr, thresholds = roc_curve(y_true, pred) + roc_auc = auc(fpr, tpr) + assert_array_almost_equal(roc_auc, 0.78, decimal=2) + assert fpr.shape == tpr.shape + assert fpr.shape == thresholds.shape + + +def test_roc_curve_one_label(): + y_true = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + y_pred = [0, 1, 0, 1, 0, 1, 0, 1, 0, 1] + # assert there are warnings + expected_message = ( + "No negative samples in y_true, false positive value should be meaningless" + ) + with pytest.warns(UndefinedMetricWarning, match=expected_message): + fpr, tpr, thresholds = roc_curve(y_true, y_pred) + + # all true labels, all fpr should be nan + assert_array_equal(fpr, np.full(len(thresholds), np.nan)) + assert fpr.shape == tpr.shape + assert fpr.shape == thresholds.shape + + # assert there are warnings + expected_message = ( + "No positive samples in y_true, true positive value should be meaningless" + ) + with pytest.warns(UndefinedMetricWarning, match=expected_message): + fpr, tpr, thresholds = roc_curve([1 - x for x in y_true], y_pred) + # all negative labels, all tpr should be nan + assert_array_equal(tpr, np.full(len(thresholds), np.nan)) + assert fpr.shape == tpr.shape + assert fpr.shape == thresholds.shape + + +def test_roc_curve_toydata(): + # Binary classification + y_true = [0, 1] + y_score = [0, 1] + tpr, fpr, _ = roc_curve(y_true, y_score) + roc_auc = roc_auc_score(y_true, y_score) + assert_array_almost_equal(tpr, [0, 0, 1]) + assert_array_almost_equal(fpr, [0, 1, 1]) + assert_almost_equal(roc_auc, 1.0) + + y_true = [0, 1] + y_score = [1, 0] + tpr, fpr, _ = roc_curve(y_true, y_score) + roc_auc = roc_auc_score(y_true, y_score) + assert_array_almost_equal(tpr, [0, 1, 1]) + assert_array_almost_equal(fpr, [0, 0, 1]) + assert_almost_equal(roc_auc, 0.0) + + y_true = [1, 0] + y_score = [1, 1] + tpr, fpr, _ = roc_curve(y_true, y_score) + roc_auc = roc_auc_score(y_true, y_score) + assert_array_almost_equal(tpr, [0, 1]) + assert_array_almost_equal(fpr, [0, 1]) + assert_almost_equal(roc_auc, 0.5) + + y_true = [1, 0] + y_score = [1, 0] + tpr, fpr, _ = roc_curve(y_true, y_score) + roc_auc = roc_auc_score(y_true, y_score) + assert_array_almost_equal(tpr, [0, 0, 1]) + assert_array_almost_equal(fpr, [0, 1, 1]) + assert_almost_equal(roc_auc, 1.0) + + y_true = [1, 0] + y_score = [0.5, 0.5] + tpr, fpr, _ = roc_curve(y_true, y_score) + roc_auc = roc_auc_score(y_true, y_score) + assert_array_almost_equal(tpr, [0, 1]) + assert_array_almost_equal(fpr, [0, 1]) + assert_almost_equal(roc_auc, 0.5) + + # case with no positive samples + y_true = [0, 0] + y_score = [0.25, 0.75] + # assert UndefinedMetricWarning because of no positive sample in y_true + expected_message = ( + "No positive samples in y_true, true positive value should be meaningless" + ) + with pytest.warns(UndefinedMetricWarning, match=expected_message): + tpr, fpr, _ = roc_curve(y_true, y_score) + assert_array_almost_equal(tpr, [0.0, 0.5, 1.0]) + assert_array_almost_equal(fpr, [np.nan, np.nan, np.nan]) + expected_message = ( + "Only one class is present in y_true. " + "ROC AUC score is not defined in that case." 
+ ) + with pytest.warns(UndefinedMetricWarning, match=expected_message): + auc = roc_auc_score(y_true, y_score) + assert math.isnan(auc) + + # case with no negative samples + y_true = [1, 1] + y_score = [0.25, 0.75] + # assert UndefinedMetricWarning because of no negative sample in y_true + expected_message = ( + "No negative samples in y_true, false positive value should be meaningless" + ) + with pytest.warns(UndefinedMetricWarning, match=expected_message): + tpr, fpr, _ = roc_curve(y_true, y_score) + assert_array_almost_equal(tpr, [np.nan, np.nan, np.nan]) + assert_array_almost_equal(fpr, [0.0, 0.5, 1.0]) + expected_message = ( + "Only one class is present in y_true. " + "ROC AUC score is not defined in that case." + ) + with pytest.warns(UndefinedMetricWarning, match=expected_message): + auc = roc_auc_score(y_true, y_score) + assert math.isnan(auc) + + # Multi-label classification task + y_true = np.array([[0, 1], [0, 1]]) + y_score = np.array([[0, 1], [0, 1]]) + with pytest.warns(UndefinedMetricWarning, match=expected_message): + roc_auc_score(y_true, y_score, average="macro") + with pytest.warns(UndefinedMetricWarning, match=expected_message): + roc_auc_score(y_true, y_score, average="weighted") + assert_almost_equal(roc_auc_score(y_true, y_score, average="samples"), 1.0) + assert_almost_equal(roc_auc_score(y_true, y_score, average="micro"), 1.0) + + y_true = np.array([[0, 1], [0, 1]]) + y_score = np.array([[0, 1], [1, 0]]) + with pytest.warns(UndefinedMetricWarning, match=expected_message): + roc_auc_score(y_true, y_score, average="macro") + with pytest.warns(UndefinedMetricWarning, match=expected_message): + roc_auc_score(y_true, y_score, average="weighted") + assert_almost_equal(roc_auc_score(y_true, y_score, average="samples"), 0.5) + assert_almost_equal(roc_auc_score(y_true, y_score, average="micro"), 0.5) + + y_true = np.array([[1, 0], [0, 1]]) + y_score = np.array([[0, 1], [1, 0]]) + assert_almost_equal(roc_auc_score(y_true, y_score, average="macro"), 0) + assert_almost_equal(roc_auc_score(y_true, y_score, average="weighted"), 0) + assert_almost_equal(roc_auc_score(y_true, y_score, average="samples"), 0) + assert_almost_equal(roc_auc_score(y_true, y_score, average="micro"), 0) + + y_true = np.array([[1, 0], [0, 1]]) + y_score = np.array([[0.5, 0.5], [0.5, 0.5]]) + assert_almost_equal(roc_auc_score(y_true, y_score, average="macro"), 0.5) + assert_almost_equal(roc_auc_score(y_true, y_score, average="weighted"), 0.5) + assert_almost_equal(roc_auc_score(y_true, y_score, average="samples"), 0.5) + assert_almost_equal(roc_auc_score(y_true, y_score, average="micro"), 0.5) + + +def test_roc_curve_drop_intermediate(): + # Test that drop_intermediate drops the correct thresholds + y_true = [0, 0, 0, 0, 1, 1] + y_score = [0.0, 0.2, 0.5, 0.6, 0.7, 1.0] + tpr, fpr, thresholds = roc_curve(y_true, y_score, drop_intermediate=True) + assert_array_almost_equal(thresholds, [np.inf, 1.0, 0.7, 0.0]) + + # Test dropping thresholds with repeating scores + y_true = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1] + y_score = [0.0, 0.1, 0.6, 0.6, 0.7, 0.8, 0.9, 0.6, 0.7, 0.8, 0.9, 0.9, 1.0] + tpr, fpr, thresholds = roc_curve(y_true, y_score, drop_intermediate=True) + assert_array_almost_equal(thresholds, [np.inf, 1.0, 0.9, 0.7, 0.6, 0.0]) + + +def test_roc_curve_fpr_tpr_increasing(): + # Ensure that fpr and tpr returned by roc_curve are increasing. + # Construct an edge case with float y_score and sample_weight + # when some adjacent values of fpr and tpr are actually the same. 
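+    # With fractional sample weights the cumulative sums behind fpr/tpr are
+    # floats, so this guards against rounding making them locally decreasing.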
+ y_true = [0, 0, 1, 1, 1] + y_score = [0.1, 0.7, 0.3, 0.4, 0.5] + sample_weight = np.repeat(0.2, 5) + fpr, tpr, _ = roc_curve(y_true, y_score, sample_weight=sample_weight) + assert (np.diff(fpr) < 0).sum() == 0 + assert (np.diff(tpr) < 0).sum() == 0 + + +def test_auc(): + # Test Area Under Curve (AUC) computation + x = [0, 1] + y = [0, 1] + assert_array_almost_equal(auc(x, y), 0.5) + x = [1, 0] + y = [0, 1] + assert_array_almost_equal(auc(x, y), 0.5) + x = [1, 0, 0] + y = [0, 1, 1] + assert_array_almost_equal(auc(x, y), 0.5) + x = [0, 1] + y = [1, 1] + assert_array_almost_equal(auc(x, y), 1) + x = [0, 0.5, 1] + y = [0, 0.5, 1] + assert_array_almost_equal(auc(x, y), 0.5) + + +def test_auc_errors(): + # Incompatible shapes + with pytest.raises(ValueError): + auc([0.0, 0.5, 1.0], [0.1, 0.2]) + + # Too few x values + with pytest.raises(ValueError): + auc([0.0], [0.1]) + + # x is not in order + x = [2, 1, 3, 4] + y = [5, 6, 7, 8] + error_message = "x is neither increasing nor decreasing : {}".format(np.array(x)) + with pytest.raises(ValueError, match=re.escape(error_message)): + auc(x, y) + + +@pytest.mark.parametrize( + "y_true, labels", + [ + (np.array([0, 1, 0, 2]), [0, 1, 2]), + (np.array([0, 1, 0, 2]), None), + (["a", "b", "a", "c"], ["a", "b", "c"]), + (["a", "b", "a", "c"], None), + ], +) +def test_multiclass_ovo_roc_auc_toydata(y_true, labels): + # Tests the one-vs-one multiclass ROC AUC algorithm + # on a small example, representative of an expected use case. + y_scores = np.array( + [[0.1, 0.8, 0.1], [0.3, 0.4, 0.3], [0.35, 0.5, 0.15], [0, 0.2, 0.8]] + ) + + # Used to compute the expected output. + # Consider labels 0 and 1: + # positive label is 0, negative label is 1 + score_01 = roc_auc_score([1, 0, 1], [0.1, 0.3, 0.35]) + # positive label is 1, negative label is 0 + score_10 = roc_auc_score([0, 1, 0], [0.8, 0.4, 0.5]) + average_score_01 = (score_01 + score_10) / 2 + + # Consider labels 0 and 2: + score_02 = roc_auc_score([1, 1, 0], [0.1, 0.35, 0]) + score_20 = roc_auc_score([0, 0, 1], [0.1, 0.15, 0.8]) + average_score_02 = (score_02 + score_20) / 2 + + # Consider labels 1 and 2: + score_12 = roc_auc_score([1, 0], [0.4, 0.2]) + score_21 = roc_auc_score([0, 1], [0.3, 0.8]) + average_score_12 = (score_12 + score_21) / 2 + + # Unweighted, one-vs-one multiclass ROC AUC algorithm + ovo_unweighted_score = (average_score_01 + average_score_02 + average_score_12) / 3 + assert_almost_equal( + roc_auc_score(y_true, y_scores, labels=labels, multi_class="ovo"), + ovo_unweighted_score, + ) + + # Weighted, one-vs-one multiclass ROC AUC algorithm + # Each term is weighted by the prevalence for the positive label. + pair_scores = [average_score_01, average_score_02, average_score_12] + prevalence = [0.75, 0.75, 0.50] + ovo_weighted_score = np.average(pair_scores, weights=prevalence) + assert_almost_equal( + roc_auc_score( + y_true, y_scores, labels=labels, multi_class="ovo", average="weighted" + ), + ovo_weighted_score, + ) + + # Check that average=None raises NotImplemented error + error_message = "average=None is not implemented for multi_class='ovo'." 
+ with pytest.raises(NotImplementedError, match=error_message): + roc_auc_score(y_true, y_scores, labels=labels, multi_class="ovo", average=None) + + +@pytest.mark.parametrize( + "y_true, labels", + [ + (np.array([0, 2, 0, 2]), [0, 1, 2]), + (np.array(["a", "d", "a", "d"]), ["a", "b", "d"]), + ], +) +def test_multiclass_ovo_roc_auc_toydata_binary(y_true, labels): + # Tests the one-vs-one multiclass ROC AUC algorithm for binary y_true + # + # on a small example, representative of an expected use case. + y_scores = np.array( + [[0.2, 0.0, 0.8], [0.6, 0.0, 0.4], [0.55, 0.0, 0.45], [0.4, 0.0, 0.6]] + ) + + # Used to compute the expected output. + # Consider labels 0 and 1: + # positive label is 0, negative label is 1 + score_01 = roc_auc_score([1, 0, 1, 0], [0.2, 0.6, 0.55, 0.4]) + # positive label is 1, negative label is 0 + score_10 = roc_auc_score([0, 1, 0, 1], [0.8, 0.4, 0.45, 0.6]) + ovo_score = (score_01 + score_10) / 2 + + assert_almost_equal( + roc_auc_score(y_true, y_scores, labels=labels, multi_class="ovo"), ovo_score + ) + + # Weighted, one-vs-one multiclass ROC AUC algorithm + assert_almost_equal( + roc_auc_score( + y_true, y_scores, labels=labels, multi_class="ovo", average="weighted" + ), + ovo_score, + ) + + +@pytest.mark.parametrize( + "y_true, labels", + [ + (np.array([0, 1, 2, 2]), None), + (["a", "b", "c", "c"], None), + ([0, 1, 2, 2], [0, 1, 2]), + (["a", "b", "c", "c"], ["a", "b", "c"]), + ], +) +def test_multiclass_ovr_roc_auc_toydata(y_true, labels): + # Tests the unweighted, one-vs-rest multiclass ROC AUC algorithm + # on a small example, representative of an expected use case. + y_scores = np.array( + [[1.0, 0.0, 0.0], [0.1, 0.5, 0.4], [0.1, 0.1, 0.8], [0.3, 0.3, 0.4]] + ) + # Compute the expected result by individually computing the 'one-vs-rest' + # ROC AUC scores for classes 0, 1, and 2. 
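+    # In the one-vs-rest scheme each class is taken as positive in turn:
+    # y_true is binarised against the remaining classes and scored with the
+    # corresponding column of y_scores.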
+ out_0 = roc_auc_score([1, 0, 0, 0], y_scores[:, 0]) + out_1 = roc_auc_score([0, 1, 0, 0], y_scores[:, 1]) + out_2 = roc_auc_score([0, 0, 1, 1], y_scores[:, 2]) + assert_almost_equal( + roc_auc_score(y_true, y_scores, multi_class="ovr", labels=labels, average=None), + [out_0, out_1, out_2], + ) + + # Compute unweighted results (default behaviour is average="macro") + result_unweighted = (out_0 + out_1 + out_2) / 3.0 + assert_almost_equal( + roc_auc_score(y_true, y_scores, multi_class="ovr", labels=labels), + result_unweighted, + ) + + # Tests the weighted, one-vs-rest multiclass ROC AUC algorithm + # on the same input (Provost & Domingos, 2000) + result_weighted = out_0 * 0.25 + out_1 * 0.25 + out_2 * 0.5 + assert_almost_equal( + roc_auc_score( + y_true, y_scores, multi_class="ovr", labels=labels, average="weighted" + ), + result_weighted, + ) + + +@pytest.mark.parametrize( + "multi_class, average", + [ + ("ovr", "macro"), + ("ovr", "micro"), + ("ovo", "macro"), + ], +) +def test_perfect_imperfect_chance_multiclass_roc_auc(multi_class, average): + y_true = np.array([3, 1, 2, 0]) + + # Perfect classifier (from a ranking point of view) has roc_auc_score = 1.0 + y_perfect = [ + [0.0, 0.0, 0.0, 1.0], + [0.0, 1.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.0], + [0.75, 0.05, 0.05, 0.15], + ] + assert_almost_equal( + roc_auc_score(y_true, y_perfect, multi_class=multi_class, average=average), + 1.0, + ) + + # Imperfect classifier has roc_auc_score < 1.0 + y_imperfect = [ + [0.0, 0.0, 0.0, 1.0], + [0.0, 1.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.0], + [0.0, 0.0, 0.0, 1.0], + ] + assert ( + roc_auc_score(y_true, y_imperfect, multi_class=multi_class, average=average) + < 1.0 + ) + + # Chance level classifier has roc_auc_score = 5.0 + y_chance = 0.25 * np.ones((4, 4)) + assert roc_auc_score( + y_true, y_chance, multi_class=multi_class, average=average + ) == pytest.approx(0.5) + + +def test_micro_averaged_ovr_roc_auc(global_random_seed): + seed = global_random_seed + # Let's generate a set of random predictions and matching true labels such + # that the predictions are not perfect. To make the problem more interesting, + # we use an imbalanced class distribution (by using different parameters + # in the Dirichlet prior (conjugate prior of the multinomial distribution). 
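+    # Dirichlet draws are probability vectors (each row sums to one), so they
+    # can be used directly as multiclass probability estimates.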
+ y_pred = stats.dirichlet.rvs([2.0, 1.0, 0.5], size=1000, random_state=seed) + y_true = np.asarray( + [ + stats.multinomial.rvs(n=1, p=y_pred_i, random_state=seed).argmax() + for y_pred_i in y_pred + ] + ) + y_onehot = label_binarize(y_true, classes=[0, 1, 2]) + fpr, tpr, _ = roc_curve(y_onehot.ravel(), y_pred.ravel()) + roc_auc_by_hand = auc(fpr, tpr) + roc_auc_auto = roc_auc_score(y_true, y_pred, multi_class="ovr", average="micro") + assert roc_auc_by_hand == pytest.approx(roc_auc_auto) + + +@pytest.mark.parametrize( + "msg, y_true, labels", + [ + ("Parameter 'labels' must be unique", np.array([0, 1, 2, 2]), [0, 2, 0]), + ( + "Parameter 'labels' must be unique", + np.array(["a", "b", "c", "c"]), + ["a", "a", "b"], + ), + ( + ( + "Number of classes in y_true not equal to the number of columns " + "in 'y_score'" + ), + np.array([0, 2, 0, 2]), + None, + ), + ( + "Parameter 'labels' must be ordered", + np.array(["a", "b", "c", "c"]), + ["a", "c", "b"], + ), + ( + ( + "Number of given labels, 2, not equal to the number of columns in " + "'y_score', 3" + ), + np.array([0, 1, 2, 2]), + [0, 1], + ), + ( + ( + "Number of given labels, 2, not equal to the number of columns in " + "'y_score', 3" + ), + np.array(["a", "b", "c", "c"]), + ["a", "b"], + ), + ( + ( + "Number of given labels, 4, not equal to the number of columns in " + "'y_score', 3" + ), + np.array([0, 1, 2, 2]), + [0, 1, 2, 3], + ), + ( + ( + "Number of given labels, 4, not equal to the number of columns in " + "'y_score', 3" + ), + np.array(["a", "b", "c", "c"]), + ["a", "b", "c", "d"], + ), + ( + "'y_true' contains labels not in parameter 'labels'", + np.array(["a", "b", "c", "e"]), + ["a", "b", "c"], + ), + ( + "'y_true' contains labels not in parameter 'labels'", + np.array(["a", "b", "c", "d"]), + ["a", "b", "c"], + ), + ( + "'y_true' contains labels not in parameter 'labels'", + np.array([0, 1, 2, 3]), + [0, 1, 2], + ), + ], +) +@pytest.mark.parametrize("multi_class", ["ovo", "ovr"]) +def test_roc_auc_score_multiclass_labels_error(msg, y_true, labels, multi_class): + y_scores = np.array( + [[0.1, 0.8, 0.1], [0.3, 0.4, 0.3], [0.35, 0.5, 0.15], [0, 0.2, 0.8]] + ) + + with pytest.raises(ValueError, match=msg): + roc_auc_score(y_true, y_scores, labels=labels, multi_class=multi_class) + + +@pytest.mark.parametrize( + "msg, kwargs", + [ + ( + ( + r"average must be one of \('macro', 'weighted', None\) for " + r"multiclass problems" + ), + {"average": "samples", "multi_class": "ovo"}, + ), + ( + ( + r"average must be one of \('micro', 'macro', 'weighted', None\) for " + r"multiclass problems" + ), + {"average": "samples", "multi_class": "ovr"}, + ), + ( + ( + r"sample_weight is not supported for multiclass one-vs-one " + r"ROC AUC, 'sample_weight' must be None in this case" + ), + {"multi_class": "ovo", "sample_weight": []}, + ), + ( + ( + r"Partial AUC computation not available in multiclass setting, " + r"'max_fpr' must be set to `None`, received `max_fpr=0.5` " + r"instead" + ), + {"multi_class": "ovo", "max_fpr": 0.5}, + ), + (r"multi_class must be in \('ovo', 'ovr'\)", {}), + ], +) +def test_roc_auc_score_multiclass_error(msg, kwargs): + # Test that roc_auc_score function returns an error when trying + # to compute multiclass AUC for parameters where an output + # is not defined. 
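+    # softmax is applied below because the multiclass modes of roc_auc_score
+    # expect probability estimates whose rows sum to one.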
+ rng = check_random_state(404) + y_score = rng.rand(20, 3) + y_prob = softmax(y_score) + y_true = rng.randint(0, 3, size=20) + with pytest.raises(ValueError, match=msg): + roc_auc_score(y_true, y_prob, **kwargs) + + +def test_auc_score_non_binary_class(): + # Test that roc_auc_score function returns an error when trying + # to compute AUC for non-binary class values. + rng = check_random_state(404) + y_pred = rng.rand(10) + # y_true contains only one class value + y_true = np.zeros(10, dtype="int") + warn_message = ( + "Only one class is present in y_true. " + "ROC AUC score is not defined in that case." + ) + with pytest.warns(UndefinedMetricWarning, match=warn_message): + roc_auc_score(y_true, y_pred) + y_true = np.ones(10, dtype="int") + with pytest.warns(UndefinedMetricWarning, match=warn_message): + roc_auc_score(y_true, y_pred) + y_true = np.full(10, -1, dtype="int") + with pytest.warns(UndefinedMetricWarning, match=warn_message): + roc_auc_score(y_true, y_pred) + + +@pytest.mark.parametrize("curve_func", CURVE_FUNCS) +def test_binary_clf_curve_multiclass_error(curve_func): + rng = check_random_state(404) + y_true = rng.randint(0, 3, size=10) + y_pred = rng.rand(10) + msg = "multiclass format is not supported" + with pytest.raises(ValueError, match=msg): + curve_func(y_true, y_pred) + + +@pytest.mark.parametrize("curve_func", CURVE_FUNCS) +def test_binary_clf_curve_implicit_pos_label(curve_func): + # Check that using string class labels raises an informative + # error for any supported string dtype: + msg = ( + "y_true takes value in {'a', 'b'} and pos_label is " + "not specified: either make y_true take " + "value in {0, 1} or {-1, 1} or pass pos_label " + "explicitly." + ) + with pytest.raises(ValueError, match=msg): + curve_func(np.array(["a", "b"], dtype="= 0 and y_score.max() <= 1 else 0 + y_pred = (y_score > threshold).astype(np.int64) if k == 1 else y_true + + score = top_k_accuracy_score(y_true, y_score, k=k) + score_acc = accuracy_score(y_true, y_pred) + + assert score == score_acc == pytest.approx(true_score) + + +@pytest.mark.parametrize( + "y_true, true_score, labels", + [ + (np.array([0, 1, 1, 2]), 0.75, [0, 1, 2, 3]), + (np.array([0, 1, 1, 1]), 0.5, [0, 1, 2, 3]), + (np.array([1, 1, 1, 1]), 0.5, [0, 1, 2, 3]), + (np.array(["a", "e", "e", "a"]), 0.75, ["a", "b", "d", "e"]), + ], +) +@pytest.mark.parametrize("labels_as_ndarray", [True, False]) +def test_top_k_accuracy_score_multiclass_with_labels( + y_true, true_score, labels, labels_as_ndarray +): + """Test when labels and y_score are multiclass.""" + if labels_as_ndarray: + labels = np.asarray(labels) + y_score = np.array( + [ + [0.4, 0.3, 0.2, 0.1], + [0.1, 0.3, 0.4, 0.2], + [0.4, 0.1, 0.2, 0.3], + [0.3, 0.2, 0.4, 0.1], + ] + ) + + score = top_k_accuracy_score(y_true, y_score, k=2, labels=labels) + assert score == pytest.approx(true_score) + + +def test_top_k_accuracy_score_increasing(): + # Make sure increasing k leads to a higher score + X, y = datasets.make_classification( + n_classes=10, n_samples=1000, n_informative=10, random_state=0 + ) + + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + + clf = LogisticRegression(random_state=0) + clf.fit(X_train, y_train) + + for X, y in zip((X_train, X_test), (y_train, y_test)): + scores = [ + top_k_accuracy_score(y, clf.predict_proba(X), k=k) for k in range(2, 10) + ] + + assert np.all(np.diff(scores) > 0) + + +@pytest.mark.parametrize( + "y_true, k, true_score", + [ + ([0, 1, 2, 3], 1, 0.25), + ([0, 1, 2, 3], 2, 0.5), + ([0, 1, 2, 3], 3, 1), + 
], +) +def test_top_k_accuracy_score_ties(y_true, k, true_score): + # Make sure highest indices labels are chosen first in case of ties + y_score = np.array( + [ + [5, 5, 7, 0], + [1, 5, 5, 5], + [0, 0, 3, 3], + [1, 1, 1, 1], + ] + ) + assert top_k_accuracy_score(y_true, y_score, k=k) == pytest.approx(true_score) + + +@pytest.mark.parametrize( + "y_true, k", + [ + ([0, 1, 2, 3], 4), + ([0, 1, 2, 3], 5), + ], +) +def test_top_k_accuracy_score_warning(y_true, k): + y_score = np.array( + [ + [0.4, 0.3, 0.2, 0.1], + [0.1, 0.4, 0.3, 0.2], + [0.2, 0.1, 0.4, 0.3], + [0.3, 0.2, 0.1, 0.4], + ] + ) + expected_message = ( + r"'k' \(\d+\) greater than or equal to 'n_classes' \(\d+\) will result in a " + "perfect score and is therefore meaningless." + ) + with pytest.warns(UndefinedMetricWarning, match=expected_message): + score = top_k_accuracy_score(y_true, y_score, k=k) + assert score == 1 + + +@pytest.mark.parametrize( + "y_true, y_score, labels, msg", + [ + ( + [0, 0.57, 1, 2], + [ + [0.2, 0.1, 0.7], + [0.4, 0.3, 0.3], + [0.3, 0.4, 0.3], + [0.4, 0.5, 0.1], + ], + None, + "y type must be 'binary' or 'multiclass', got 'continuous'", + ), + ( + [0, 1, 2, 3], + [ + [0.2, 0.1, 0.7], + [0.4, 0.3, 0.3], + [0.3, 0.4, 0.3], + [0.4, 0.5, 0.1], + ], + None, + r"Number of classes in 'y_true' \(4\) not equal to the number of " + r"classes in 'y_score' \(3\).", + ), + ( + ["c", "c", "a", "b"], + [ + [0.2, 0.1, 0.7], + [0.4, 0.3, 0.3], + [0.3, 0.4, 0.3], + [0.4, 0.5, 0.1], + ], + ["a", "b", "c", "c"], + "Parameter 'labels' must be unique.", + ), + ( + ["c", "c", "a", "b"], + [ + [0.2, 0.1, 0.7], + [0.4, 0.3, 0.3], + [0.3, 0.4, 0.3], + [0.4, 0.5, 0.1], + ], + ["a", "c", "b"], + "Parameter 'labels' must be ordered.", + ), + ( + [0, 0, 1, 2], + [ + [0.2, 0.1, 0.7], + [0.4, 0.3, 0.3], + [0.3, 0.4, 0.3], + [0.4, 0.5, 0.1], + ], + [0, 1, 2, 3], + r"Number of given labels \(4\) not equal to the number of classes in " + r"'y_score' \(3\).", + ), + ( + [0, 0, 1, 2], + [ + [0.2, 0.1, 0.7], + [0.4, 0.3, 0.3], + [0.3, 0.4, 0.3], + [0.4, 0.5, 0.1], + ], + [0, 1, 3], + "'y_true' contains labels not in parameter 'labels'.", + ), + ( + [0, 1], + [[0.5, 0.2, 0.2], [0.3, 0.4, 0.2]], + None, + ( + "`y_true` is binary while y_score is 2d with 3 classes. If" + " `y_true` does not contain all the labels, `labels` must be provided" + ), + ), + ], +) +def test_top_k_accuracy_score_error(y_true, y_score, labels, msg): + with pytest.raises(ValueError, match=msg): + top_k_accuracy_score(y_true, y_score, k=2, labels=labels) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_label_ranking_avg_precision_score_should_allow_csr_matrix_for_y_true_input( + csr_container, +): + # Test that label_ranking_avg_precision_score accept sparse y_true. + # Non-regression test for #22575 + y_true = csr_container([[1, 0, 0], [0, 0, 1]]) + y_score = np.array([[0.5, 0.9, 0.6], [0, 0, 1]]) + result = label_ranking_average_precision_score(y_true, y_score) + assert result == pytest.approx(2 / 3) + + +@pytest.mark.parametrize( + "metric", [average_precision_score, det_curve, precision_recall_curve, roc_curve] +) +@pytest.mark.parametrize( + "classes", [(False, True), (0, 1), (0.0, 1.0), ("zero", "one")] +) +def test_ranking_metric_pos_label_types(metric, classes): + """Check that the metric works with different types of `pos_label`. + + We can expect `pos_label` to be a bool, an integer, a float, a string. + No error should be raised for those types. 
+ """ + rng = np.random.RandomState(42) + n_samples, pos_label = 10, classes[-1] + y_true = rng.choice(classes, size=n_samples, replace=True) + y_proba = rng.rand(n_samples) + result = metric(y_true, y_proba, pos_label=pos_label) + if isinstance(result, float): + assert not np.isnan(result) + else: + metric_1, metric_2, thresholds = result + assert not np.isnan(metric_1).any() + assert not np.isnan(metric_2).any() + assert not np.isnan(thresholds).any() + + +def test_roc_curve_with_probablity_estimates(global_random_seed): + """Check that thresholds do not exceed 1.0 when `y_score` is a probability + estimate. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/26193 + """ + rng = np.random.RandomState(global_random_seed) + y_true = rng.randint(0, 2, size=10) + y_score = rng.rand(10) + _, _, thresholds = roc_curve(y_true, y_score) + assert np.isinf(thresholds[0]) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_regression.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_regression.py new file mode 100644 index 0000000000000000000000000000000000000000..396ae5d0ffae143e333f14861dc839931326a030 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_regression.py @@ -0,0 +1,636 @@ +from itertools import product + +import numpy as np +import pytest +from numpy.testing import assert_allclose +from scipy import optimize +from scipy.special import factorial, xlogy + +from sklearn.dummy import DummyRegressor +from sklearn.exceptions import UndefinedMetricWarning +from sklearn.metrics import ( + d2_absolute_error_score, + d2_pinball_score, + d2_tweedie_score, + explained_variance_score, + make_scorer, + max_error, + mean_absolute_error, + mean_absolute_percentage_error, + mean_pinball_loss, + mean_squared_error, + mean_squared_log_error, + mean_tweedie_deviance, + median_absolute_error, + r2_score, + root_mean_squared_error, + root_mean_squared_log_error, +) +from sklearn.metrics._regression import _check_reg_targets +from sklearn.model_selection import GridSearchCV +from sklearn.utils._testing import ( + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) + + +def test_regression_metrics(n_samples=50): + y_true = np.arange(n_samples) + y_pred = y_true + 1 + y_pred_2 = y_true - 1 + + assert_almost_equal(mean_squared_error(y_true, y_pred), 1.0) + assert_almost_equal( + mean_squared_log_error(y_true, y_pred), + mean_squared_error(np.log(1 + y_true), np.log(1 + y_pred)), + ) + assert_almost_equal(mean_absolute_error(y_true, y_pred), 1.0) + assert_almost_equal(mean_pinball_loss(y_true, y_pred), 0.5) + assert_almost_equal(mean_pinball_loss(y_true, y_pred_2), 0.5) + assert_almost_equal(mean_pinball_loss(y_true, y_pred, alpha=0.4), 0.6) + assert_almost_equal(mean_pinball_loss(y_true, y_pred_2, alpha=0.4), 0.4) + assert_almost_equal(median_absolute_error(y_true, y_pred), 1.0) + mape = mean_absolute_percentage_error(y_true, y_pred) + assert np.isfinite(mape) + assert mape > 1e6 + assert_almost_equal(max_error(y_true, y_pred), 1.0) + assert_almost_equal(r2_score(y_true, y_pred), 0.995, 2) + assert_almost_equal(r2_score(y_true, y_pred, force_finite=False), 0.995, 2) + assert_almost_equal(explained_variance_score(y_true, y_pred), 1.0) + assert_almost_equal( + explained_variance_score(y_true, y_pred, force_finite=False), 1.0 + ) + assert_almost_equal( + mean_tweedie_deviance(y_true, y_pred, power=0), + mean_squared_error(y_true, y_pred), + ) + assert_almost_equal( + 
d2_tweedie_score(y_true, y_pred, power=0), r2_score(y_true, y_pred) + ) + dev_median = np.abs(y_true - np.median(y_true)).sum() + assert_array_almost_equal( + d2_absolute_error_score(y_true, y_pred), + 1 - np.abs(y_true - y_pred).sum() / dev_median, + ) + alpha = 0.2 + pinball_loss = lambda y_true, y_pred, alpha: alpha * np.maximum( + y_true - y_pred, 0 + ) + (1 - alpha) * np.maximum(y_pred - y_true, 0) + y_quantile = np.percentile(y_true, q=alpha * 100) + assert_almost_equal( + d2_pinball_score(y_true, y_pred, alpha=alpha), + 1 + - pinball_loss(y_true, y_pred, alpha).sum() + / pinball_loss(y_true, y_quantile, alpha).sum(), + ) + assert_almost_equal( + d2_absolute_error_score(y_true, y_pred), + d2_pinball_score(y_true, y_pred, alpha=0.5), + ) + + # Tweedie deviance needs positive y_pred, except for p=0, + # p>=2 needs positive y_true + # results evaluated by sympy + y_true = np.arange(1, 1 + n_samples) + y_pred = 2 * y_true + n = n_samples + assert_almost_equal( + mean_tweedie_deviance(y_true, y_pred, power=-1), + 5 / 12 * n * (n**2 + 2 * n + 1), + ) + assert_almost_equal( + mean_tweedie_deviance(y_true, y_pred, power=1), (n + 1) * (1 - np.log(2)) + ) + assert_almost_equal( + mean_tweedie_deviance(y_true, y_pred, power=2), 2 * np.log(2) - 1 + ) + assert_almost_equal( + mean_tweedie_deviance(y_true, y_pred, power=3 / 2), + ((6 * np.sqrt(2) - 8) / n) * np.sqrt(y_true).sum(), + ) + assert_almost_equal( + mean_tweedie_deviance(y_true, y_pred, power=3), np.sum(1 / y_true) / (4 * n) + ) + + dev_mean = 2 * np.mean(xlogy(y_true, 2 * y_true / (n + 1))) + assert_almost_equal( + d2_tweedie_score(y_true, y_pred, power=1), + 1 - (n + 1) * (1 - np.log(2)) / dev_mean, + ) + + dev_mean = 2 * np.log((n + 1) / 2) - 2 / n * np.log(factorial(n)) + assert_almost_equal( + d2_tweedie_score(y_true, y_pred, power=2), 1 - (2 * np.log(2) - 1) / dev_mean + ) + + +def test_root_mean_squared_error_multioutput_raw_value(): + # non-regression test for + # https://github.com/scikit-learn/scikit-learn/pull/16323 + mse = mean_squared_error([[1]], [[10]], multioutput="raw_values") + rmse = root_mean_squared_error([[1]], [[10]], multioutput="raw_values") + assert np.sqrt(mse) == pytest.approx(rmse) + + +def test_multioutput_regression(): + y_true = np.array([[1, 0, 0, 1], [0, 1, 1, 1], [1, 1, 0, 1]]) + y_pred = np.array([[0, 0, 0, 1], [1, 0, 1, 1], [0, 0, 0, 1]]) + + error = mean_squared_error(y_true, y_pred) + assert_almost_equal(error, (1.0 / 3 + 2.0 / 3 + 2.0 / 3) / 4.0) + + error = root_mean_squared_error(y_true, y_pred) + assert_almost_equal(error, 0.454, decimal=2) + + error = mean_squared_log_error(y_true, y_pred) + assert_almost_equal(error, 0.200, decimal=2) + + error = root_mean_squared_log_error(y_true, y_pred) + assert_almost_equal(error, 0.315, decimal=2) + + # mean_absolute_error and mean_squared_error are equal because + # it is a binary problem. 
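+    # (with 0/1 targets every elementwise error is 0 or 1, so |e| == e**2)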
+ error = mean_absolute_error(y_true, y_pred) + assert_almost_equal(error, (1.0 + 2.0 / 3) / 4.0) + + error = mean_pinball_loss(y_true, y_pred) + assert_almost_equal(error, (1.0 + 2.0 / 3) / 8.0) + + error = np.around(mean_absolute_percentage_error(y_true, y_pred), decimals=2) + assert np.isfinite(error) + assert error > 1e6 + error = median_absolute_error(y_true, y_pred) + assert_almost_equal(error, (1.0 + 1.0) / 4.0) + + error = r2_score(y_true, y_pred, multioutput="variance_weighted") + assert_almost_equal(error, 1.0 - 5.0 / 2) + error = r2_score(y_true, y_pred, multioutput="uniform_average") + assert_almost_equal(error, -0.875) + + score = d2_pinball_score(y_true, y_pred, alpha=0.5, multioutput="raw_values") + raw_expected_score = [ + 1 + - np.abs(y_true[:, i] - y_pred[:, i]).sum() + / np.abs(y_true[:, i] - np.median(y_true[:, i])).sum() + for i in range(y_true.shape[1]) + ] + # in the last case, the denominator vanishes and hence we get nan, + # but since the numerator vanishes as well the expected score is 1.0 + raw_expected_score = np.where(np.isnan(raw_expected_score), 1, raw_expected_score) + assert_array_almost_equal(score, raw_expected_score) + + score = d2_pinball_score(y_true, y_pred, alpha=0.5, multioutput="uniform_average") + assert_almost_equal(score, raw_expected_score.mean()) + # constant `y_true` with force_finite=True leads to 1. or 0. + yc = [5.0, 5.0] + error = r2_score(yc, [5.0, 5.0], multioutput="variance_weighted") + assert_almost_equal(error, 1.0) + error = r2_score(yc, [5.0, 5.1], multioutput="variance_weighted") + assert_almost_equal(error, 0.0) + + # Setting force_finite=False results in the nan for 4th output propagating + error = r2_score( + y_true, y_pred, multioutput="variance_weighted", force_finite=False + ) + assert_almost_equal(error, np.nan) + error = r2_score(y_true, y_pred, multioutput="uniform_average", force_finite=False) + assert_almost_equal(error, np.nan) + + # Dropping the 4th output to check `force_finite=False` for nominal + y_true = y_true[:, :-1] + y_pred = y_pred[:, :-1] + error = r2_score(y_true, y_pred, multioutput="variance_weighted") + error2 = r2_score( + y_true, y_pred, multioutput="variance_weighted", force_finite=False + ) + assert_almost_equal(error, error2) + error = r2_score(y_true, y_pred, multioutput="uniform_average") + error2 = r2_score(y_true, y_pred, multioutput="uniform_average", force_finite=False) + assert_almost_equal(error, error2) + + # constant `y_true` with force_finite=False leads to NaN or -Inf. 
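+    # With a constant y_true the total sum of squares is zero: R^2 becomes
+    # 0/0 (NaN) for a perfect constant prediction and -inf otherwise.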
+ error = r2_score( + yc, [5.0, 5.0], multioutput="variance_weighted", force_finite=False + ) + assert_almost_equal(error, np.nan) + error = r2_score( + yc, [5.0, 6.0], multioutput="variance_weighted", force_finite=False + ) + assert_almost_equal(error, -np.inf) + + +def test_regression_metrics_at_limits(): + # Single-sample case + # Note: for r2 and d2_tweedie see also test_regression_single_sample + assert_almost_equal(mean_squared_error([0.0], [0.0]), 0.0) + assert_almost_equal(root_mean_squared_error([0.0], [0.0]), 0.0) + assert_almost_equal(mean_squared_log_error([0.0], [0.0]), 0.0) + assert_almost_equal(mean_absolute_error([0.0], [0.0]), 0.0) + assert_almost_equal(mean_pinball_loss([0.0], [0.0]), 0.0) + assert_almost_equal(mean_absolute_percentage_error([0.0], [0.0]), 0.0) + assert_almost_equal(median_absolute_error([0.0], [0.0]), 0.0) + assert_almost_equal(max_error([0.0], [0.0]), 0.0) + assert_almost_equal(explained_variance_score([0.0], [0.0]), 1.0) + + # Perfect cases + assert_almost_equal(r2_score([0.0, 1], [0.0, 1]), 1.0) + assert_almost_equal(d2_pinball_score([0.0, 1], [0.0, 1]), 1.0) + + # Non-finite cases + # R² and explained variance have a fix by default for non-finite cases + for s in (r2_score, explained_variance_score): + assert_almost_equal(s([0, 0], [1, -1]), 0.0) + assert_almost_equal(s([0, 0], [1, -1], force_finite=False), -np.inf) + assert_almost_equal(s([1, 1], [1, 1]), 1.0) + assert_almost_equal(s([1, 1], [1, 1], force_finite=False), np.nan) + msg = ( + "Mean Squared Logarithmic Error cannot be used when " + "targets contain values less than or equal to -1." + ) + with pytest.raises(ValueError, match=msg): + mean_squared_log_error([-1.0], [-1.0]) + msg = ( + "Mean Squared Logarithmic Error cannot be used when " + "targets contain values less than or equal to -1." + ) + with pytest.raises(ValueError, match=msg): + mean_squared_log_error([1.0, 2.0, 3.0], [1.0, -2.0, 3.0]) + msg = ( + "Mean Squared Logarithmic Error cannot be used when " + "targets contain values less than or equal to -1." + ) + with pytest.raises(ValueError, match=msg): + mean_squared_log_error([1.0, -2.0, 3.0], [1.0, 2.0, 3.0]) + msg = ( + "Mean Squared Logarithmic Error cannot be used when " + "targets contain values less than or equal to -1." + ) + with pytest.raises(ValueError, match=msg): + root_mean_squared_log_error([1.0, -2.0, 3.0], [1.0, 2.0, 3.0]) + msg = ( + "Root Mean Squared Logarithmic Error cannot be used when " + "targets contain values less than or equal to -1." + ) + + # Tweedie deviance error + power = -1.2 + assert_allclose( + mean_tweedie_deviance([0], [1.0], power=power), 2 / (2 - power), rtol=1e-3 + ) + msg = "can only be used on strictly positive y_pred." + with pytest.raises(ValueError, match=msg): + mean_tweedie_deviance([0.0], [0.0], power=power) + with pytest.raises(ValueError, match=msg): + d2_tweedie_score([0.0] * 2, [0.0] * 2, power=power) + + assert_almost_equal(mean_tweedie_deviance([0.0], [0.0], power=0), 0.0, 2) + + power = 1.0 + msg = "only be used on non-negative y and strictly positive y_pred." + with pytest.raises(ValueError, match=msg): + mean_tweedie_deviance([0.0], [0.0], power=power) + with pytest.raises(ValueError, match=msg): + d2_tweedie_score([0.0] * 2, [0.0] * 2, power=power) + + power = 1.5 + assert_allclose(mean_tweedie_deviance([0.0], [1.0], power=power), 2 / (2 - power)) + msg = "only be used on non-negative y and strictly positive y_pred." 
+ with pytest.raises(ValueError, match=msg): + mean_tweedie_deviance([0.0], [0.0], power=power) + with pytest.raises(ValueError, match=msg): + d2_tweedie_score([0.0] * 2, [0.0] * 2, power=power) + + power = 2.0 + assert_allclose(mean_tweedie_deviance([1.0], [1.0], power=power), 0.00, atol=1e-8) + msg = "can only be used on strictly positive y and y_pred." + with pytest.raises(ValueError, match=msg): + mean_tweedie_deviance([0.0], [0.0], power=power) + with pytest.raises(ValueError, match=msg): + d2_tweedie_score([0.0] * 2, [0.0] * 2, power=power) + + power = 3.0 + assert_allclose(mean_tweedie_deviance([1.0], [1.0], power=power), 0.00, atol=1e-8) + msg = "can only be used on strictly positive y and y_pred." + with pytest.raises(ValueError, match=msg): + mean_tweedie_deviance([0.0], [0.0], power=power) + with pytest.raises(ValueError, match=msg): + d2_tweedie_score([0.0] * 2, [0.0] * 2, power=power) + + +def test__check_reg_targets(): + # All of length 3 + EXAMPLES = [ + ("continuous", [1, 2, 3], 1), + ("continuous", [[1], [2], [3]], 1), + ("continuous-multioutput", [[1, 1], [2, 2], [3, 1]], 2), + ("continuous-multioutput", [[5, 1], [4, 2], [3, 1]], 2), + ("continuous-multioutput", [[1, 3, 4], [2, 2, 2], [3, 1, 1]], 3), + ] + + for (type1, y1, n_out1), (type2, y2, n_out2) in product(EXAMPLES, repeat=2): + if type1 == type2 and n_out1 == n_out2: + y_type, y_check1, y_check2, _, _ = _check_reg_targets( + y1, y2, sample_weight=None, multioutput=None + ) + assert type1 == y_type + if type1 == "continuous": + assert_array_equal(y_check1, np.reshape(y1, (-1, 1))) + assert_array_equal(y_check2, np.reshape(y2, (-1, 1))) + else: + assert_array_equal(y_check1, y1) + assert_array_equal(y_check2, y2) + else: + with pytest.raises(ValueError): + _check_reg_targets(y1, y2, sample_weight=None, multioutput=None) + + +def test__check_reg_targets_exception(): + invalid_multioutput = "this_value_is_not_valid" + expected_message = ( + "Allowed 'multioutput' string values are.+You provided multioutput={!r}".format( + invalid_multioutput + ) + ) + with pytest.raises(ValueError, match=expected_message): + _check_reg_targets([1, 2, 3], [[1], [2], [3]], None, invalid_multioutput) + + +def test_regression_multioutput_array(): + y_true = [[1, 2], [2.5, -1], [4.5, 3], [5, 7]] + y_pred = [[1, 1], [2, -1], [5, 4], [5, 6.5]] + + mse = mean_squared_error(y_true, y_pred, multioutput="raw_values") + mae = mean_absolute_error(y_true, y_pred, multioutput="raw_values") + + pbl = mean_pinball_loss(y_true, y_pred, multioutput="raw_values") + mape = mean_absolute_percentage_error(y_true, y_pred, multioutput="raw_values") + r = r2_score(y_true, y_pred, multioutput="raw_values") + evs = explained_variance_score(y_true, y_pred, multioutput="raw_values") + d2ps = d2_pinball_score(y_true, y_pred, alpha=0.5, multioutput="raw_values") + evs2 = explained_variance_score( + y_true, y_pred, multioutput="raw_values", force_finite=False + ) + + assert_array_almost_equal(mse, [0.125, 0.5625], decimal=2) + assert_array_almost_equal(mae, [0.25, 0.625], decimal=2) + assert_array_almost_equal(pbl, [0.25 / 2, 0.625 / 2], decimal=2) + assert_array_almost_equal(mape, [0.0778, 0.2262], decimal=2) + assert_array_almost_equal(r, [0.95, 0.93], decimal=2) + assert_array_almost_equal(evs, [0.95, 0.93], decimal=2) + assert_array_almost_equal(d2ps, [0.833, 0.722], decimal=2) + assert_array_almost_equal(evs2, [0.95, 0.93], decimal=2) + + # mean_absolute_error and mean_squared_error are equal because + # it is a binary problem. 
+ y_true = [[0, 0]] * 4 + y_pred = [[1, 1]] * 4 + mse = mean_squared_error(y_true, y_pred, multioutput="raw_values") + mae = mean_absolute_error(y_true, y_pred, multioutput="raw_values") + pbl = mean_pinball_loss(y_true, y_pred, multioutput="raw_values") + r = r2_score(y_true, y_pred, multioutput="raw_values") + d2ps = d2_pinball_score(y_true, y_pred, multioutput="raw_values") + assert_array_almost_equal(mse, [1.0, 1.0], decimal=2) + assert_array_almost_equal(mae, [1.0, 1.0], decimal=2) + assert_array_almost_equal(pbl, [0.5, 0.5], decimal=2) + assert_array_almost_equal(r, [0.0, 0.0], decimal=2) + assert_array_almost_equal(d2ps, [0.0, 0.0], decimal=2) + + r = r2_score([[0, -1], [0, 1]], [[2, 2], [1, 1]], multioutput="raw_values") + assert_array_almost_equal(r, [0, -3.5], decimal=2) + assert np.mean(r) == r2_score( + [[0, -1], [0, 1]], [[2, 2], [1, 1]], multioutput="uniform_average" + ) + evs = explained_variance_score( + [[0, -1], [0, 1]], [[2, 2], [1, 1]], multioutput="raw_values" + ) + assert_array_almost_equal(evs, [0, -1.25], decimal=2) + evs2 = explained_variance_score( + [[0, -1], [0, 1]], + [[2, 2], [1, 1]], + multioutput="raw_values", + force_finite=False, + ) + assert_array_almost_equal(evs2, [-np.inf, -1.25], decimal=2) + + # Checking for the condition in which both numerator and denominator is + # zero. + y_true = [[1, 3], [1, 2]] + y_pred = [[1, 4], [1, 1]] + r2 = r2_score(y_true, y_pred, multioutput="raw_values") + assert_array_almost_equal(r2, [1.0, -3.0], decimal=2) + assert np.mean(r2) == r2_score(y_true, y_pred, multioutput="uniform_average") + r22 = r2_score(y_true, y_pred, multioutput="raw_values", force_finite=False) + assert_array_almost_equal(r22, [np.nan, -3.0], decimal=2) + assert_almost_equal( + np.mean(r22), + r2_score(y_true, y_pred, multioutput="uniform_average", force_finite=False), + ) + + evs = explained_variance_score(y_true, y_pred, multioutput="raw_values") + assert_array_almost_equal(evs, [1.0, -3.0], decimal=2) + assert np.mean(evs) == explained_variance_score(y_true, y_pred) + d2ps = d2_pinball_score(y_true, y_pred, alpha=0.5, multioutput="raw_values") + assert_array_almost_equal(d2ps, [1.0, -1.0], decimal=2) + evs2 = explained_variance_score( + y_true, y_pred, multioutput="raw_values", force_finite=False + ) + assert_array_almost_equal(evs2, [np.nan, -3.0], decimal=2) + assert_almost_equal( + np.mean(evs2), explained_variance_score(y_true, y_pred, force_finite=False) + ) + + # Handling msle separately as it does not accept negative inputs. 
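+    # mean_squared_log_error(y, y_pred) is the MSE of log(1 + y)-transformed
+    # targets, which is what the per-output comparison below verifies.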
+ y_true = np.array([[0.5, 1], [1, 2], [7, 6]]) + y_pred = np.array([[0.5, 2], [1, 2.5], [8, 8]]) + msle = mean_squared_log_error(y_true, y_pred, multioutput="raw_values") + msle2 = mean_squared_error( + np.log(1 + y_true), np.log(1 + y_pred), multioutput="raw_values" + ) + assert_array_almost_equal(msle, msle2, decimal=2) + + +def test_regression_custom_weights(): + y_true = [[1, 2], [2.5, -1], [4.5, 3], [5, 7]] + y_pred = [[1, 1], [2, -1], [5, 4], [5, 6.5]] + + msew = mean_squared_error(y_true, y_pred, multioutput=[0.4, 0.6]) + rmsew = root_mean_squared_error(y_true, y_pred, multioutput=[0.4, 0.6]) + maew = mean_absolute_error(y_true, y_pred, multioutput=[0.4, 0.6]) + mapew = mean_absolute_percentage_error(y_true, y_pred, multioutput=[0.4, 0.6]) + rw = r2_score(y_true, y_pred, multioutput=[0.4, 0.6]) + evsw = explained_variance_score(y_true, y_pred, multioutput=[0.4, 0.6]) + d2psw = d2_pinball_score(y_true, y_pred, alpha=0.5, multioutput=[0.4, 0.6]) + evsw2 = explained_variance_score( + y_true, y_pred, multioutput=[0.4, 0.6], force_finite=False + ) + + assert_almost_equal(msew, 0.39, decimal=2) + assert_almost_equal(rmsew, 0.59, decimal=2) + assert_almost_equal(maew, 0.475, decimal=3) + assert_almost_equal(mapew, 0.1668, decimal=2) + assert_almost_equal(rw, 0.94, decimal=2) + assert_almost_equal(evsw, 0.94, decimal=2) + assert_almost_equal(d2psw, 0.766, decimal=2) + assert_almost_equal(evsw2, 0.94, decimal=2) + + # Handling msle separately as it does not accept negative inputs. + y_true = np.array([[0.5, 1], [1, 2], [7, 6]]) + y_pred = np.array([[0.5, 2], [1, 2.5], [8, 8]]) + msle = mean_squared_log_error(y_true, y_pred, multioutput=[0.3, 0.7]) + msle2 = mean_squared_error( + np.log(1 + y_true), np.log(1 + y_pred), multioutput=[0.3, 0.7] + ) + assert_almost_equal(msle, msle2, decimal=2) + + +@pytest.mark.parametrize("metric", [r2_score, d2_tweedie_score, d2_pinball_score]) +def test_regression_single_sample(metric): + y_true = [0] + y_pred = [1] + warning_msg = "not well-defined with less than two samples." + + # Trigger the warning + with pytest.warns(UndefinedMetricWarning, match=warning_msg): + score = metric(y_true, y_pred) + assert np.isnan(score) + + +def test_tweedie_deviance_continuity(global_random_seed): + n_samples = 100 + + rng = np.random.RandomState(global_random_seed) + + y_true = rng.rand(n_samples) + 0.1 + y_pred = rng.rand(n_samples) + 0.1 + + assert_allclose( + mean_tweedie_deviance(y_true, y_pred, power=0 - 1e-10), + mean_tweedie_deviance(y_true, y_pred, power=0), + ) + + # Ws we get closer to the limit, with 1e-12 difference the + # tolerance to pass the below check increases. There are likely + # numerical precision issues on the edges of different definition + # regions. 
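+    # The Tweedie deviance is defined piecewise, with special cases at
+    # power 0, 1 and 2; these checks confirm the formula is continuous in
+    # `power` at those special values.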
+ assert_allclose( + mean_tweedie_deviance(y_true, y_pred, power=1 + 1e-10), + mean_tweedie_deviance(y_true, y_pred, power=1), + rtol=1e-5, + ) + + assert_allclose( + mean_tweedie_deviance(y_true, y_pred, power=2 - 1e-10), + mean_tweedie_deviance(y_true, y_pred, power=2), + rtol=1e-5, + ) + + assert_allclose( + mean_tweedie_deviance(y_true, y_pred, power=2 + 1e-10), + mean_tweedie_deviance(y_true, y_pred, power=2), + rtol=1e-5, + ) + + +def test_mean_absolute_percentage_error(global_random_seed): + random_number_generator = np.random.RandomState(global_random_seed) + y_true = random_number_generator.exponential(size=100) + y_pred = 1.2 * y_true + assert mean_absolute_percentage_error(y_true, y_pred) == pytest.approx(0.2) + + +@pytest.mark.parametrize( + "distribution", ["normal", "lognormal", "exponential", "uniform"] +) +@pytest.mark.parametrize("target_quantile", [0.05, 0.5, 0.75]) +def test_mean_pinball_loss_on_constant_predictions( + distribution, target_quantile, global_random_seed +): + if not hasattr(np, "quantile"): + pytest.skip( + "This test requires a more recent version of numpy " + "with support for np.quantile." + ) + + # Check that the pinball loss is minimized by the empirical quantile. + n_samples = 3000 + rng = np.random.RandomState(global_random_seed) + data = getattr(rng, distribution)(size=n_samples) + + # Compute the best possible pinball loss for any constant predictor: + best_pred = np.quantile(data, target_quantile) + best_constant_pred = np.full(n_samples, fill_value=best_pred) + best_pbl = mean_pinball_loss(data, best_constant_pred, alpha=target_quantile) + + # Evaluate the loss on a grid of quantiles + candidate_predictions = np.quantile(data, np.linspace(0, 1, 100)) + for pred in candidate_predictions: + # Compute the pinball loss of a constant predictor: + constant_pred = np.full(n_samples, fill_value=pred) + pbl = mean_pinball_loss(data, constant_pred, alpha=target_quantile) + + # Check that the loss of this constant predictor is greater or equal + # than the loss of using the optimal quantile (up to machine + # precision): + assert pbl >= best_pbl - np.finfo(np.float64).eps + + # Check that the value of the pinball loss matches the analytical + # formula. + expected_pbl = (pred - data[data < pred]).sum() * (1 - target_quantile) + ( + data[data >= pred] - pred + ).sum() * target_quantile + expected_pbl /= n_samples + assert_almost_equal(expected_pbl, pbl) + + # Check that we can actually recover the target_quantile by minimizing the + # pinball loss w.r.t. the constant prediction quantile. + def objective_func(x): + constant_pred = np.full(n_samples, fill_value=x) + return mean_pinball_loss(data, constant_pred, alpha=target_quantile) + + result = optimize.minimize(objective_func, data.mean()) + assert result.success + # The minimum is not unique with limited data, hence the large tolerance. + # For the normal distribution and the 0.5 quantile, the expected result is close to + # 0, hence the additional use of absolute tolerance. + assert_allclose(result.x, best_pred, rtol=1e-1, atol=1e-3) + assert result.fun == pytest.approx(best_pbl) + + +def test_dummy_quantile_parameter_tuning(global_random_seed): + # Integration test to check that it is possible to use the pinball loss to + # tune the hyperparameter of a quantile regressor. This is conceptually + # similar to the previous test but using the scikit-learn estimator and + # scoring API instead. 
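+    # make_scorer(..., greater_is_better=False) flips the sign of the loss,
+    # so GridSearchCV, which maximises the score, ends up minimising the
+    # pinball loss for the requested quantile.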
+ n_samples = 1000 + rng = np.random.RandomState(global_random_seed) + X = rng.normal(size=(n_samples, 5)) # Ignored + y = rng.exponential(size=n_samples) + + all_quantiles = [0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95] + for alpha in all_quantiles: + neg_mean_pinball_loss = make_scorer( + mean_pinball_loss, + alpha=alpha, + greater_is_better=False, + ) + regressor = DummyRegressor(strategy="quantile", quantile=0.25) + grid_search = GridSearchCV( + regressor, + param_grid=dict(quantile=all_quantiles), + scoring=neg_mean_pinball_loss, + ).fit(X, y) + + assert grid_search.best_params_["quantile"] == pytest.approx(alpha) + + +def test_pinball_loss_relation_with_mae(global_random_seed): + # Test that mean_pinball loss with alpha=0.5 if half of mean absolute error + rng = np.random.RandomState(global_random_seed) + n = 100 + y_true = rng.normal(size=n) + y_pred = y_true.copy() + rng.uniform(n) + assert ( + mean_absolute_error(y_true, y_pred) + == mean_pinball_loss(y_true, y_pred, alpha=0.5) * 2 + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_score_objects.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_score_objects.py new file mode 100644 index 0000000000000000000000000000000000000000..672ed8ae7eecc593e0aa02e76e7158c9f01e67e4 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_score_objects.py @@ -0,0 +1,1665 @@ +import numbers +import pickle +import warnings +from copy import deepcopy +from functools import partial + +import joblib +import numpy as np +import pytest +from numpy.testing import assert_allclose + +from sklearn import config_context +from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.cluster import KMeans +from sklearn.datasets import ( + load_diabetes, + make_blobs, + make_classification, + make_multilabel_classification, + make_regression, +) +from sklearn.linear_model import LogisticRegression, Perceptron, Ridge +from sklearn.metrics import ( + accuracy_score, + average_precision_score, + balanced_accuracy_score, + brier_score_loss, + check_scoring, + f1_score, + fbeta_score, + get_scorer, + get_scorer_names, + jaccard_score, + log_loss, + make_scorer, + matthews_corrcoef, + precision_score, + r2_score, + recall_score, + roc_auc_score, + top_k_accuracy_score, +) +from sklearn.metrics import cluster as cluster_module +from sklearn.metrics._scorer import ( + _check_multimetric_scoring, + _CurveScorer, + _MultimetricScorer, + _PassthroughScorer, + _Scorer, +) +from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split +from sklearn.multiclass import OneVsRestClassifier +from sklearn.neighbors import KNeighborsClassifier +from sklearn.pipeline import make_pipeline +from sklearn.svm import LinearSVC +from sklearn.tests.metadata_routing_common import ( + assert_request_is_empty, +) +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor +from sklearn.utils._testing import ( + assert_almost_equal, + assert_array_equal, + ignore_warnings, +) +from sklearn.utils.metadata_routing import MetadataRouter, MethodMapping + +REGRESSION_SCORERS = [ + "d2_absolute_error_score", + "explained_variance", + "r2", + "neg_mean_absolute_error", + "neg_mean_squared_error", + "neg_mean_absolute_percentage_error", + "neg_mean_squared_log_error", + "neg_median_absolute_error", + "neg_root_mean_squared_error", + "neg_root_mean_squared_log_error", + "mean_absolute_error", + "mean_absolute_percentage_error", + "mean_squared_error", + "median_absolute_error", + 
"neg_max_error", + "neg_mean_poisson_deviance", + "neg_mean_gamma_deviance", +] + +CLF_SCORERS = [ + "accuracy", + "balanced_accuracy", + "top_k_accuracy", + "f1", + "f1_weighted", + "f1_macro", + "f1_micro", + "roc_auc", + "average_precision", + "precision", + "precision_weighted", + "precision_macro", + "precision_micro", + "recall", + "recall_weighted", + "recall_macro", + "recall_micro", + "neg_log_loss", + "neg_brier_score", + "jaccard", + "jaccard_weighted", + "jaccard_macro", + "jaccard_micro", + "roc_auc_ovr", + "roc_auc_ovo", + "roc_auc_ovr_weighted", + "roc_auc_ovo_weighted", + "matthews_corrcoef", + "positive_likelihood_ratio", + "neg_negative_likelihood_ratio", +] + +# All supervised cluster scorers (They behave like classification metric) +CLUSTER_SCORERS = [ + "adjusted_rand_score", + "rand_score", + "homogeneity_score", + "completeness_score", + "v_measure_score", + "mutual_info_score", + "adjusted_mutual_info_score", + "normalized_mutual_info_score", + "fowlkes_mallows_score", +] + +MULTILABEL_ONLY_SCORERS = [ + "precision_samples", + "recall_samples", + "f1_samples", + "jaccard_samples", +] + +REQUIRE_POSITIVE_Y_SCORERS = ["neg_mean_poisson_deviance", "neg_mean_gamma_deviance"] + + +def _require_positive_y(y): + """Make targets strictly positive""" + offset = abs(y.min()) + 1 + y = y + offset + return y + + +def _make_estimators(X_train, y_train, y_ml_train): + # Make estimators that make sense to test various scoring methods + sensible_regr = DecisionTreeRegressor(random_state=0) + # some of the regressions scorers require strictly positive input. + sensible_regr.fit(X_train, _require_positive_y(y_train)) + sensible_clf = DecisionTreeClassifier(random_state=0) + sensible_clf.fit(X_train, y_train) + sensible_ml_clf = DecisionTreeClassifier(random_state=0) + sensible_ml_clf.fit(X_train, y_ml_train) + return dict( + [(name, sensible_regr) for name in REGRESSION_SCORERS] + + [(name, sensible_clf) for name in CLF_SCORERS] + + [(name, sensible_clf) for name in CLUSTER_SCORERS] + + [(name, sensible_ml_clf) for name in MULTILABEL_ONLY_SCORERS] + ) + + +@pytest.fixture(scope="module") +def memmap_data_and_estimators(tmp_path_factory): + temp_folder = tmp_path_factory.mktemp("sklearn_test_score_objects") + X, y = make_classification(n_samples=30, n_features=5, random_state=0) + _, y_ml = make_multilabel_classification(n_samples=X.shape[0], random_state=0) + filename = temp_folder / "test_data.pkl" + joblib.dump((X, y, y_ml), filename) + X_mm, y_mm, y_ml_mm = joblib.load(filename, mmap_mode="r") + estimators = _make_estimators(X_mm, y_mm, y_ml_mm) + + yield X_mm, y_mm, y_ml_mm, estimators + + +class EstimatorWithFit(BaseEstimator): + """Dummy estimator to test scoring validators""" + + def fit(self, X, y): + return self + + +class EstimatorWithFitAndScore(BaseEstimator): + """Dummy estimator to test scoring validators""" + + def fit(self, X, y): + return self + + def score(self, X, y): + return 1.0 + + +class EstimatorWithFitAndPredict(BaseEstimator): + """Dummy estimator to test scoring validators""" + + def fit(self, X, y): + self.y = y + return self + + def predict(self, X): + return self.y + + +class DummyScorer: + """Dummy scorer that always returns 1.""" + + def __call__(self, est, X, y): + return 1 + + +def test_all_scorers_repr(): + # Test that all scorers have a working repr + for name in get_scorer_names(): + repr(get_scorer(name)) + + +def check_scoring_validator_for_single_metric_usecases(scoring_validator): + # Test all branches of single metric usecases + estimator = 
EstimatorWithFitAndScore() + estimator.fit([[1]], [1]) + scorer = scoring_validator(estimator) + assert isinstance(scorer, _PassthroughScorer) + assert_almost_equal(scorer(estimator, [[1]], [1]), 1.0) + + estimator = EstimatorWithFitAndPredict() + estimator.fit([[1]], [1]) + pattern = ( + r"If no scoring is specified, the estimator passed should have" + r" a 'score' method\. The estimator .* does not\." + ) + with pytest.raises(TypeError, match=pattern): + scoring_validator(estimator) + + scorer = scoring_validator(estimator, scoring="accuracy") + assert_almost_equal(scorer(estimator, [[1]], [1]), 1.0) + + estimator = EstimatorWithFit() + scorer = scoring_validator(estimator, scoring="accuracy") + assert isinstance(scorer, _Scorer) + assert scorer._response_method == "predict" + + # Test the allow_none parameter for check_scoring alone + if scoring_validator is check_scoring: + estimator = EstimatorWithFit() + scorer = scoring_validator(estimator, allow_none=True) + assert scorer is None + + +@pytest.mark.parametrize( + "scoring", + ( + ("accuracy",), + ["precision"], + {"acc": "accuracy", "precision": "precision"}, + ("accuracy", "precision"), + ["precision", "accuracy"], + { + "accuracy": make_scorer(accuracy_score), + "precision": make_scorer(precision_score), + }, + ), + ids=[ + "single_tuple", + "single_list", + "dict_str", + "multi_tuple", + "multi_list", + "dict_callable", + ], +) +def test_check_scoring_and_check_multimetric_scoring(scoring): + check_scoring_validator_for_single_metric_usecases(check_scoring) + # To make sure the check_scoring is correctly applied to the constituent + # scorers + + estimator = LinearSVC(random_state=0) + estimator.fit([[1], [2], [3]], [1, 1, 0]) + + scorers = _check_multimetric_scoring(estimator, scoring) + assert isinstance(scorers, dict) + assert sorted(scorers.keys()) == sorted(list(scoring)) + assert all([isinstance(scorer, _Scorer) for scorer in list(scorers.values())]) + assert all(scorer._response_method == "predict" for scorer in scorers.values()) + + if "acc" in scoring: + assert_almost_equal( + scorers["acc"](estimator, [[1], [2], [3]], [1, 0, 0]), 2.0 / 3.0 + ) + if "accuracy" in scoring: + assert_almost_equal( + scorers["accuracy"](estimator, [[1], [2], [3]], [1, 0, 0]), 2.0 / 3.0 + ) + if "precision" in scoring: + assert_almost_equal( + scorers["precision"](estimator, [[1], [2], [3]], [1, 0, 0]), 0.5 + ) + + +@pytest.mark.parametrize( + "scoring, msg", + [ + ( + (make_scorer(precision_score), make_scorer(accuracy_score)), + "One or more of the elements were callables", + ), + ([5], "Non-string types were found"), + ((make_scorer(precision_score),), "One or more of the elements were callables"), + ((), "Empty list was given"), + (("f1", "f1"), "Duplicate elements were found"), + ({4: "accuracy"}, "Non-string types were found in the keys"), + ({}, "An empty dict was passed"), + ], + ids=[ + "tuple of callables", + "list of int", + "tuple of one callable", + "empty tuple", + "non-unique str", + "non-string key dict", + "empty dict", + ], +) +def test_check_scoring_and_check_multimetric_scoring_errors(scoring, msg): + # Make sure it raises errors when scoring parameter is not valid. + # More weird corner cases are tested at test_validation.py + estimator = EstimatorWithFitAndPredict() + estimator.fit([[1]], [1]) + + with pytest.raises(ValueError, match=msg): + _check_multimetric_scoring(estimator, scoring=scoring) + + +def test_check_scoring_gridsearchcv(): + # test that check_scoring works on GridSearchCV and pipeline. 
+ # slightly redundant non-regression test. + + grid = GridSearchCV(LinearSVC(), param_grid={"C": [0.1, 1]}, cv=3) + scorer = check_scoring(grid, scoring="f1") + assert isinstance(scorer, _Scorer) + assert scorer._response_method == "predict" + + pipe = make_pipeline(LinearSVC()) + scorer = check_scoring(pipe, scoring="f1") + assert isinstance(scorer, _Scorer) + assert scorer._response_method == "predict" + + # check that cross_val_score definitely calls the scorer + # and doesn't make any assumptions about the estimator apart from having a + # fit. + scores = cross_val_score( + EstimatorWithFit(), [[1], [2], [3]], [1, 0, 1], scoring=DummyScorer(), cv=3 + ) + assert_array_equal(scores, 1) + + +@pytest.mark.parametrize( + "scorer_name, metric", + [ + ("f1", f1_score), + ("f1_weighted", partial(f1_score, average="weighted")), + ("f1_macro", partial(f1_score, average="macro")), + ("f1_micro", partial(f1_score, average="micro")), + ("precision", precision_score), + ("precision_weighted", partial(precision_score, average="weighted")), + ("precision_macro", partial(precision_score, average="macro")), + ("precision_micro", partial(precision_score, average="micro")), + ("recall", recall_score), + ("recall_weighted", partial(recall_score, average="weighted")), + ("recall_macro", partial(recall_score, average="macro")), + ("recall_micro", partial(recall_score, average="micro")), + ("jaccard", jaccard_score), + ("jaccard_weighted", partial(jaccard_score, average="weighted")), + ("jaccard_macro", partial(jaccard_score, average="macro")), + ("jaccard_micro", partial(jaccard_score, average="micro")), + ("top_k_accuracy", top_k_accuracy_score), + ("matthews_corrcoef", matthews_corrcoef), + ], +) +def test_classification_binary_scores(scorer_name, metric): + # check consistency between score and scorer for scores supporting + # binary classification. + X, y = make_blobs(random_state=0, centers=2) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + clf = LinearSVC(random_state=0) + clf.fit(X_train, y_train) + + score = get_scorer(scorer_name)(clf, X_test, y_test) + expected_score = metric(y_test, clf.predict(X_test)) + assert_almost_equal(score, expected_score) + + +@pytest.mark.parametrize( + "scorer_name, metric", + [ + ("accuracy", accuracy_score), + ("balanced_accuracy", balanced_accuracy_score), + ("f1_weighted", partial(f1_score, average="weighted")), + ("f1_macro", partial(f1_score, average="macro")), + ("f1_micro", partial(f1_score, average="micro")), + ("precision_weighted", partial(precision_score, average="weighted")), + ("precision_macro", partial(precision_score, average="macro")), + ("precision_micro", partial(precision_score, average="micro")), + ("recall_weighted", partial(recall_score, average="weighted")), + ("recall_macro", partial(recall_score, average="macro")), + ("recall_micro", partial(recall_score, average="micro")), + ("jaccard_weighted", partial(jaccard_score, average="weighted")), + ("jaccard_macro", partial(jaccard_score, average="macro")), + ("jaccard_micro", partial(jaccard_score, average="micro")), + ], +) +def test_classification_multiclass_scores(scorer_name, metric): + # check consistency between score and scorer for scores supporting + # multiclass classification. 
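The convention these consistency tests rely on, reduced to its simplest form (an illustrative sketch, not part of the vendored file): a metric compares two label arrays, while a scorer is called on the fitted estimator together with the raw data and computes the predictions itself.

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score, get_scorer

    X, y = make_classification(random_state=0)
    clf = LogisticRegression().fit(X, y)
    # scorer(estimator, X, y) should agree with metric(y, estimator.predict(X))
    assert get_scorer("accuracy")(clf, X, y) == accuracy_score(y, clf.predict(X))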
+ X, y = make_classification( + n_classes=3, n_informative=3, n_samples=30, random_state=0 + ) + + # use `stratify` = y to ensure train and test sets capture all classes + X_train, X_test, y_train, y_test = train_test_split( + X, y, random_state=0, stratify=y + ) + + clf = DecisionTreeClassifier(random_state=0) + clf.fit(X_train, y_train) + score = get_scorer(scorer_name)(clf, X_test, y_test) + expected_score = metric(y_test, clf.predict(X_test)) + assert score == pytest.approx(expected_score) + + +def test_custom_scorer_pickling(): + # test that custom scorer can be pickled + X, y = make_blobs(random_state=0, centers=2) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + clf = LinearSVC(random_state=0) + clf.fit(X_train, y_train) + + scorer = make_scorer(fbeta_score, beta=2) + score1 = scorer(clf, X_test, y_test) + unpickled_scorer = pickle.loads(pickle.dumps(scorer)) + score2 = unpickled_scorer(clf, X_test, y_test) + assert score1 == pytest.approx(score2) + + # smoke test the repr: + repr(fbeta_score) + + +def test_regression_scorers(): + # Test regression scorers. + diabetes = load_diabetes() + X, y = diabetes.data, diabetes.target + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + clf = Ridge() + clf.fit(X_train, y_train) + score1 = get_scorer("r2")(clf, X_test, y_test) + score2 = r2_score(y_test, clf.predict(X_test)) + assert_almost_equal(score1, score2) + + +def test_thresholded_scorers(): + # Test scorers that take thresholds. + X, y = make_blobs(random_state=0, centers=2) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + clf = LogisticRegression(random_state=0) + clf.fit(X_train, y_train) + score1 = get_scorer("roc_auc")(clf, X_test, y_test) + score2 = roc_auc_score(y_test, clf.decision_function(X_test)) + score3 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]) + assert_almost_equal(score1, score2) + assert_almost_equal(score1, score3) + + logscore = get_scorer("neg_log_loss")(clf, X_test, y_test) + logloss = log_loss(y_test, clf.predict_proba(X_test)) + assert_almost_equal(-logscore, logloss) + + # same for an estimator without decision_function + clf = DecisionTreeClassifier() + clf.fit(X_train, y_train) + score1 = get_scorer("roc_auc")(clf, X_test, y_test) + score2 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]) + assert_almost_equal(score1, score2) + + # test with a regressor (no decision_function) + reg = DecisionTreeRegressor() + reg.fit(X_train, y_train) + err_msg = "DecisionTreeRegressor has none of the following attributes" + with pytest.raises(AttributeError, match=err_msg): + get_scorer("roc_auc")(reg, X_test, y_test) + + # Test that an exception is raised on more than two classes + X, y = make_blobs(random_state=0, centers=3) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + clf.fit(X_train, y_train) + with pytest.raises(ValueError, match="multi_class must be in \\('ovo', 'ovr'\\)"): + get_scorer("roc_auc")(clf, X_test, y_test) + + # test error is raised with a single class present in model + # (predict_proba shape is not suitable for binary auc) + X, y = make_blobs(random_state=0, centers=2) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + clf = DecisionTreeClassifier() + clf.fit(X_train, np.zeros_like(y_train)) + with pytest.raises(ValueError, match="need classifier with two classes"): + get_scorer("roc_auc")(clf, X_test, y_test) + + # for proba scorers + with pytest.raises(ValueError, match="need classifier with 
two classes"): + get_scorer("neg_log_loss")(clf, X_test, y_test) + + +def test_thresholded_scorers_multilabel_indicator_data(): + # Test that the scorer work with multilabel-indicator format + # for multilabel and multi-output multi-class classifier + X, y = make_multilabel_classification(allow_unlabeled=False, random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + + # Multi-output multi-class predict_proba + clf = DecisionTreeClassifier() + clf.fit(X_train, y_train) + y_proba = clf.predict_proba(X_test) + score1 = get_scorer("roc_auc")(clf, X_test, y_test) + score2 = roc_auc_score(y_test, np.vstack([p[:, -1] for p in y_proba]).T) + assert_almost_equal(score1, score2) + + # Multilabel predict_proba + clf = OneVsRestClassifier(DecisionTreeClassifier()) + clf.fit(X_train, y_train) + score1 = get_scorer("roc_auc")(clf, X_test, y_test) + score2 = roc_auc_score(y_test, clf.predict_proba(X_test)) + assert_almost_equal(score1, score2) + + # Multilabel decision function + clf = OneVsRestClassifier(LinearSVC(random_state=0)) + clf.fit(X_train, y_train) + score1 = get_scorer("roc_auc")(clf, X_test, y_test) + score2 = roc_auc_score(y_test, clf.decision_function(X_test)) + assert_almost_equal(score1, score2) + + +def test_supervised_cluster_scorers(): + # Test clustering scorers against gold standard labeling. + X, y = make_blobs(random_state=0, centers=2) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + km = KMeans(n_clusters=3, n_init="auto") + km.fit(X_train) + for name in CLUSTER_SCORERS: + score1 = get_scorer(name)(km, X_test, y_test) + score2 = getattr(cluster_module, name)(y_test, km.predict(X_test)) + assert_almost_equal(score1, score2) + + +def test_raises_on_score_list(): + # Test that when a list of scores is returned, we raise proper errors. + X, y = make_blobs(random_state=0) + f1_scorer_no_average = make_scorer(f1_score, average=None) + clf = DecisionTreeClassifier() + with pytest.raises(ValueError): + cross_val_score(clf, X, y, scoring=f1_scorer_no_average) + grid_search = GridSearchCV( + clf, scoring=f1_scorer_no_average, param_grid={"max_depth": [1, 2]} + ) + with pytest.raises(ValueError): + grid_search.fit(X, y) + + +def test_classification_scorer_sample_weight(): + # Test that classification scorers support sample_weight or raise sensible + # errors + + # Unlike the metrics invariance test, in the scorer case it's harder + # to ensure that, on the classifier output, weighted and unweighted + # scores really should be unequal. + X, y = make_classification(random_state=0) + _, y_ml = make_multilabel_classification(n_samples=X.shape[0], random_state=0) + split = train_test_split(X, y, y_ml, random_state=0) + X_train, X_test, y_train, y_test, y_ml_train, y_ml_test = split + + sample_weight = np.ones_like(y_test) + sample_weight[:10] = 0 + + # get sensible estimators for each metric + estimator = _make_estimators(X_train, y_train, y_ml_train) + + for name in get_scorer_names(): + scorer = get_scorer(name) + if name in REGRESSION_SCORERS: + # skip the regression scores + continue + if name == "top_k_accuracy": + # in the binary case k > 1 will always lead to a perfect score + scorer._kwargs = {"k": 1} + if name in MULTILABEL_ONLY_SCORERS: + target = y_ml_test + else: + target = y_test + try: + weighted = scorer( + estimator[name], X_test, target, sample_weight=sample_weight + ) + ignored = scorer(estimator[name], X_test[10:], target[10:]) + unweighted = scorer(estimator[name], X_test, target) + # this should not raise. 
sample_weight should be ignored if None. + _ = scorer(estimator[name], X_test[:10], target[:10], sample_weight=None) + assert weighted != unweighted, ( + f"scorer {name} behaves identically when called with " + f"sample weights: {weighted} vs {unweighted}" + ) + assert_almost_equal( + weighted, + ignored, + err_msg=( + f"scorer {name} behaves differently " + "when ignoring samples and setting " + f"sample_weight to 0: {weighted} vs {ignored}" + ), + ) + + except TypeError as e: + assert "sample_weight" in str(e), ( + f"scorer {name} raises unhelpful exception when called " + f"with sample weights: {e}" + ) + + +def test_regression_scorer_sample_weight(): + # Test that regression scorers support sample_weight or raise sensible + # errors + + # Odd number of test samples req for neg_median_absolute_error + X, y = make_regression(n_samples=101, n_features=20, random_state=0) + y = _require_positive_y(y) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + + sample_weight = np.ones_like(y_test) + # Odd number req for neg_median_absolute_error + sample_weight[:11] = 0 + + reg = DecisionTreeRegressor(random_state=0) + reg.fit(X_train, y_train) + + for name in get_scorer_names(): + scorer = get_scorer(name) + if name not in REGRESSION_SCORERS: + # skip classification scorers + continue + try: + weighted = scorer(reg, X_test, y_test, sample_weight=sample_weight) + ignored = scorer(reg, X_test[11:], y_test[11:]) + unweighted = scorer(reg, X_test, y_test) + assert weighted != unweighted, ( + f"scorer {name} behaves identically when called with " + f"sample weights: {weighted} vs {unweighted}" + ) + assert_almost_equal( + weighted, + ignored, + err_msg=( + f"scorer {name} behaves differently " + "when ignoring samples and setting " + f"sample_weight to 0: {weighted} vs {ignored}" + ), + ) + + except TypeError as e: + assert "sample_weight" in str(e), ( + f"scorer {name} raises unhelpful exception when called " + f"with sample weights: {e}" + ) + + +@pytest.mark.parametrize("name", get_scorer_names()) +def test_scorer_memmap_input(name, memmap_data_and_estimators): + # Non-regression test for #6147: some score functions would + # return singleton memmap when computed on memmap data instead of scalar + # float values. 
+ X_mm, y_mm, y_ml_mm, estimators = memmap_data_and_estimators + + if name in REQUIRE_POSITIVE_Y_SCORERS: + y_mm_1 = _require_positive_y(y_mm) + y_ml_mm_1 = _require_positive_y(y_ml_mm) + else: + y_mm_1, y_ml_mm_1 = y_mm, y_ml_mm + + # UndefinedMetricWarning for P / R scores + with ignore_warnings(): + scorer, estimator = get_scorer(name), estimators[name] + if name in MULTILABEL_ONLY_SCORERS: + score = scorer(estimator, X_mm, y_ml_mm_1) + else: + score = scorer(estimator, X_mm, y_mm_1) + assert isinstance(score, numbers.Number), name + + +def test_scoring_is_not_metric(): + with pytest.raises(ValueError, match="make_scorer"): + check_scoring(LogisticRegression(), scoring=f1_score) + with pytest.raises(ValueError, match="make_scorer"): + check_scoring(LogisticRegression(), scoring=roc_auc_score) + with pytest.raises(ValueError, match="make_scorer"): + check_scoring(Ridge(), scoring=r2_score) + with pytest.raises(ValueError, match="make_scorer"): + check_scoring(KMeans(), scoring=cluster_module.adjusted_rand_score) + with pytest.raises(ValueError, match="make_scorer"): + check_scoring(KMeans(), scoring=cluster_module.rand_score) + + +def test_deprecated_scorer(): + X, y = make_regression(n_samples=10, n_features=1, random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + reg = DecisionTreeRegressor() + reg.fit(X_train, y_train) + deprecated_scorer = get_scorer("max_error") + with pytest.warns(DeprecationWarning): + deprecated_scorer(reg, X_test, y_test) + + +@pytest.mark.parametrize( + ( + "scorers,expected_predict_count," + "expected_predict_proba_count,expected_decision_func_count" + ), + [ + ( + { + "a1": "accuracy", + "a2": "accuracy", + "ll1": "neg_log_loss", + "ll2": "neg_log_loss", + "ra1": "roc_auc", + "ra2": "roc_auc", + }, + 1, + 1, + 1, + ), + (["roc_auc", "accuracy"], 1, 0, 1), + (["neg_log_loss", "accuracy"], 1, 1, 0), + ], +) +def test_multimetric_scorer_calls_method_once( + scorers, + expected_predict_count, + expected_predict_proba_count, + expected_decision_func_count, +): + X, y = np.array([[1], [1], [0], [0], [0]]), np.array([0, 1, 1, 1, 0]) + pos_proba = np.random.rand(X.shape[0]) + proba = np.c_[1 - pos_proba, pos_proba] + + class MyClassifier(ClassifierMixin, BaseEstimator): + def __init__(self): + self._expected_predict_count = 0 + self._expected_predict_proba_count = 0 + self._expected_decision_function_count = 0 + + def fit(self, X, y): + self.classes_ = np.unique(y) + return self + + def predict(self, X): + self._expected_predict_count += 1 + return y + + def predict_proba(self, X): + self._expected_predict_proba_count += 1 + return proba + + def decision_function(self, X): + self._expected_decision_function_count += 1 + return pos_proba + + mock_est = MyClassifier().fit(X, y) + scorer_dict = _check_multimetric_scoring(LogisticRegression(), scorers) + multi_scorer = _MultimetricScorer(scorers=scorer_dict) + results = multi_scorer(mock_est, X, y) + + assert set(scorers) == set(results) # compare dict keys + + assert mock_est._expected_predict_count == expected_predict_count + assert mock_est._expected_predict_proba_count == expected_predict_proba_count + assert mock_est._expected_decision_function_count == expected_decision_func_count + + +@pytest.mark.parametrize( + "scorers", + [ + (["roc_auc", "neg_log_loss"]), + ( + { + "roc_auc": make_scorer( + roc_auc_score, + response_method=["predict_proba", "decision_function"], + ), + "neg_log_loss": make_scorer(log_loss, response_method="predict_proba"), + } + ), + ], +) +def 
test_multimetric_scorer_calls_method_once_classifier_no_decision(scorers): + predict_proba_call_cnt = 0 + + class MockKNeighborsClassifier(KNeighborsClassifier): + def predict_proba(self, X): + nonlocal predict_proba_call_cnt + predict_proba_call_cnt += 1 + return super().predict_proba(X) + + X, y = np.array([[1], [1], [0], [0], [0]]), np.array([0, 1, 1, 1, 0]) + + # no decision function + clf = MockKNeighborsClassifier(n_neighbors=1) + clf.fit(X, y) + + scorer_dict = _check_multimetric_scoring(clf, scorers) + scorer = _MultimetricScorer(scorers=scorer_dict) + scorer(clf, X, y) + + assert predict_proba_call_cnt == 1 + + +def test_multimetric_scorer_calls_method_once_regressor_threshold(): + predict_called_cnt = 0 + + class MockDecisionTreeRegressor(DecisionTreeRegressor): + def predict(self, X): + nonlocal predict_called_cnt + predict_called_cnt += 1 + return super().predict(X) + + X, y = np.array([[1], [1], [0], [0], [0]]), np.array([0, 1, 1, 1, 0]) + + # no decision function + clf = MockDecisionTreeRegressor() + clf.fit(X, y) + + scorers = {"neg_mse": "neg_mean_squared_error", "r2": "r2"} + scorer_dict = _check_multimetric_scoring(clf, scorers) + scorer = _MultimetricScorer(scorers=scorer_dict) + scorer(clf, X, y) + + assert predict_called_cnt == 1 + + +def test_multimetric_scorer_sanity_check(): + # scoring dictionary returned is the same as calling each scorer separately + scorers = { + "a1": "accuracy", + "a2": "accuracy", + "ll1": "neg_log_loss", + "ll2": "neg_log_loss", + "ra1": "roc_auc", + "ra2": "roc_auc", + } + + X, y = make_classification(random_state=0) + + clf = DecisionTreeClassifier() + clf.fit(X, y) + + scorer_dict = _check_multimetric_scoring(clf, scorers) + multi_scorer = _MultimetricScorer(scorers=scorer_dict) + + result = multi_scorer(clf, X, y) + + separate_scores = { + name: get_scorer(name)(clf, X, y) + for name in ["accuracy", "neg_log_loss", "roc_auc"] + } + + for key, value in result.items(): + score_name = scorers[key] + assert_allclose(value, separate_scores[score_name]) + + +@pytest.mark.parametrize("raise_exc", [True, False]) +def test_multimetric_scorer_exception_handling(raise_exc): + """Check that the calling of the `_MultimetricScorer` returns + exception messages in the result dict for the failing scorers + in case of `raise_exc` is `False` and if `raise_exc` is `True`, + then the proper exception is raised. + """ + scorers = { + "failing_1": "neg_mean_squared_log_error", + "non_failing": "neg_median_absolute_error", + "failing_2": "neg_mean_squared_log_error", + } + + X, y = make_classification( + n_samples=50, n_features=2, n_redundant=0, random_state=0 + ) + # neg_mean_squared_log_error fails if y contains values less than or equal to -1 + y *= -1 + + clf = DecisionTreeClassifier().fit(X, y) + + scorer_dict = _check_multimetric_scoring(clf, scorers) + multi_scorer = _MultimetricScorer(scorers=scorer_dict, raise_exc=raise_exc) + + error_msg = ( + "Mean Squared Logarithmic Error cannot be used when " + "targets contain values less than or equal to -1." 
+ ) + + if raise_exc: + with pytest.raises(ValueError, match=error_msg): + multi_scorer(clf, X, y) + else: + result = multi_scorer(clf, X, y) + + exception_message_1 = result["failing_1"] + score = result["non_failing"] + exception_message_2 = result["failing_2"] + + assert isinstance(exception_message_1, str) and error_msg in exception_message_1 + assert isinstance(score, float) + assert isinstance(exception_message_2, str) and error_msg in exception_message_2 + + +@pytest.mark.parametrize( + "scorer_name, metric", + [ + ("roc_auc_ovr", partial(roc_auc_score, multi_class="ovr")), + ("roc_auc_ovo", partial(roc_auc_score, multi_class="ovo")), + ( + "roc_auc_ovr_weighted", + partial(roc_auc_score, multi_class="ovr", average="weighted"), + ), + ( + "roc_auc_ovo_weighted", + partial(roc_auc_score, multi_class="ovo", average="weighted"), + ), + ], +) +def test_multiclass_roc_proba_scorer(scorer_name, metric): + scorer = get_scorer(scorer_name) + X, y = make_classification( + n_classes=3, n_informative=3, n_samples=20, random_state=0 + ) + lr = LogisticRegression().fit(X, y) + y_proba = lr.predict_proba(X) + expected_score = metric(y, y_proba) + + assert scorer(lr, X, y) == pytest.approx(expected_score) + + +def test_multiclass_roc_proba_scorer_label(): + scorer = make_scorer( + roc_auc_score, + multi_class="ovo", + labels=[0, 1, 2], + response_method="predict_proba", + ) + X, y = make_classification( + n_classes=3, n_informative=3, n_samples=20, random_state=0 + ) + lr = LogisticRegression().fit(X, y) + y_proba = lr.predict_proba(X) + + y_binary = y == 0 + expected_score = roc_auc_score( + y_binary, y_proba, multi_class="ovo", labels=[0, 1, 2] + ) + + assert scorer(lr, X, y_binary) == pytest.approx(expected_score) + + +@pytest.mark.parametrize( + "scorer_name", + ["roc_auc_ovr", "roc_auc_ovo", "roc_auc_ovr_weighted", "roc_auc_ovo_weighted"], +) +def test_multiclass_roc_no_proba_scorer_errors(scorer_name): + # Perceptron has no predict_proba + scorer = get_scorer(scorer_name) + X, y = make_classification( + n_classes=3, n_informative=3, n_samples=20, random_state=0 + ) + lr = Perceptron().fit(X, y) + msg = "Perceptron has none of the following attributes: predict_proba." + with pytest.raises(AttributeError, match=msg): + scorer(lr, X, y) + + +@pytest.fixture +def string_labeled_classification_problem(): + """Train a classifier on binary problem with string target. + + The classifier is trained on a binary classification problem where the + minority class of interest has a string label that is intentionally not the + greatest class label using the lexicographic order. In this case, "cancer" + is the positive label, and `classifier.classes_` is + `["cancer", "not cancer"]`. + + In addition, the dataset is imbalanced to better identify problems when + using non-symmetric performance metrics such as f1-score, average precision + and so on. + + Returns + ------- + classifier : estimator object + Trained classifier on the binary problem. + X_test : ndarray of shape (n_samples, n_features) + Data to be used as testing set in tests. + y_test : ndarray of shape (n_samples,), dtype=object + Binary target where labels are strings. + y_pred : ndarray of shape (n_samples,), dtype=object + Prediction of `classifier` when predicting for `X_test`. + y_pred_proba : ndarray of shape (n_samples, 2), dtype=np.float64 + Probabilities of `classifier` when predicting for `X_test`. + y_pred_decision : ndarray of shape (n_samples,), dtype=np.float64 + Decision function values of `classifier` when predicting on `X_test`. 
+ """ + from sklearn.datasets import load_breast_cancer + from sklearn.utils import shuffle + + X, y = load_breast_cancer(return_X_y=True) + # create an highly imbalanced classification task + idx_positive = np.flatnonzero(y == 1) + idx_negative = np.flatnonzero(y == 0) + idx_selected = np.hstack([idx_negative, idx_positive[:25]]) + X, y = X[idx_selected], y[idx_selected] + X, y = shuffle(X, y, random_state=42) + # only use 2 features to make the problem even harder + X = X[:, :2] + y = np.array(["cancer" if c == 1 else "not cancer" for c in y], dtype=object) + X_train, X_test, y_train, y_test = train_test_split( + X, + y, + stratify=y, + random_state=0, + ) + classifier = LogisticRegression().fit(X_train, y_train) + y_pred = classifier.predict(X_test) + y_pred_proba = classifier.predict_proba(X_test) + y_pred_decision = classifier.decision_function(X_test) + + return classifier, X_test, y_test, y_pred, y_pred_proba, y_pred_decision + + +def test_average_precision_pos_label(string_labeled_classification_problem): + # check that _Scorer will lead to the right score when passing + # `pos_label`. Currently, only `average_precision_score` is defined to + # be such a scorer. + ( + clf, + X_test, + y_test, + _, + y_pred_proba, + y_pred_decision, + ) = string_labeled_classification_problem + + pos_label = "cancer" + # we need to select the positive column or reverse the decision values + y_pred_proba = y_pred_proba[:, 0] + y_pred_decision = y_pred_decision * -1 + assert clf.classes_[0] == pos_label + + # check that when calling the scoring function, probability estimates and + # decision values lead to the same results + ap_proba = average_precision_score(y_test, y_pred_proba, pos_label=pos_label) + ap_decision_function = average_precision_score( + y_test, y_pred_decision, pos_label=pos_label + ) + assert ap_proba == pytest.approx(ap_decision_function) + + # create a scorer which would require to pass a `pos_label` + # check that it fails if `pos_label` is not provided + average_precision_scorer = make_scorer( + average_precision_score, + response_method=("decision_function", "predict_proba"), + ) + err_msg = "pos_label=1 is not a valid label. It should be one of " + with pytest.raises(ValueError, match=err_msg): + average_precision_scorer(clf, X_test, y_test) + + # otherwise, the scorer should give the same results than calling the + # scoring function + average_precision_scorer = make_scorer( + average_precision_score, + response_method=("decision_function", "predict_proba"), + pos_label=pos_label, + ) + ap_scorer = average_precision_scorer(clf, X_test, y_test) + + assert ap_scorer == pytest.approx(ap_proba) + + # The above scorer call is using `clf.decision_function`. We will force + # it to use `clf.predict_proba`. + clf_without_predict_proba = deepcopy(clf) + + def _predict_proba(self, X): + raise NotImplementedError + + clf_without_predict_proba.predict_proba = partial( + _predict_proba, clf_without_predict_proba + ) + # sanity check + with pytest.raises(NotImplementedError): + clf_without_predict_proba.predict_proba(X_test) + + ap_scorer = average_precision_scorer(clf_without_predict_proba, X_test, y_test) + assert ap_scorer == pytest.approx(ap_proba) + + +def test_brier_score_loss_pos_label(string_labeled_classification_problem): + # check that _Scorer leads to the right score when `pos_label` is + # provided. Currently only the `brier_score_loss` is defined to be such + # a scorer. 
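For context on why column 0 of predict_proba (rather than the usual column 1) appears in these pos_label tests: with string targets the class of interest is not necessarily the last entry of classifier.classes_, so the probability column has to be looked up explicitly. A short sketch reusing the fixture's classifier and test data (names assumed from the fixture above, illustrative only):

    pos_label = "cancer"
    col = list(clf.classes_).index(pos_label)      # 0 here, since "cancer" < "not cancer"
    proba_pos = clf.predict_proba(X_test)[:, col]  # probabilities of the positive class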
+ clf, X_test, y_test, _, y_pred_proba, _ = string_labeled_classification_problem + + pos_label = "cancer" + assert clf.classes_[0] == pos_label + + # brier score loss is symmetric + brier_pos_cancer = brier_score_loss(y_test, y_pred_proba[:, 0], pos_label="cancer") + brier_pos_not_cancer = brier_score_loss( + y_test, y_pred_proba[:, 1], pos_label="not cancer" + ) + assert brier_pos_cancer == pytest.approx(brier_pos_not_cancer) + + brier_scorer = make_scorer( + brier_score_loss, + response_method="predict_proba", + pos_label=pos_label, + ) + assert brier_scorer(clf, X_test, y_test) == pytest.approx(brier_pos_cancer) + + +@pytest.mark.parametrize( + "score_func", [f1_score, precision_score, recall_score, jaccard_score] +) +def test_non_symmetric_metric_pos_label( + score_func, string_labeled_classification_problem +): + # check that _Scorer leads to the right score when `pos_label` is + # provided. We check for all possible metric supported. + # Note: At some point we may end up having "scorer tags". + clf, X_test, y_test, y_pred, _, _ = string_labeled_classification_problem + + pos_label = "cancer" + assert clf.classes_[0] == pos_label + + score_pos_cancer = score_func(y_test, y_pred, pos_label="cancer") + score_pos_not_cancer = score_func(y_test, y_pred, pos_label="not cancer") + + assert score_pos_cancer != pytest.approx(score_pos_not_cancer) + + scorer = make_scorer(score_func, pos_label=pos_label) + assert scorer(clf, X_test, y_test) == pytest.approx(score_pos_cancer) + + +@pytest.mark.parametrize( + "scorer", + [ + make_scorer( + average_precision_score, + response_method=("decision_function", "predict_proba"), + pos_label="xxx", + ), + make_scorer(brier_score_loss, response_method="predict_proba", pos_label="xxx"), + make_scorer(f1_score, pos_label="xxx"), + ], + ids=["non-thresholded scorer", "probability scorer", "thresholded scorer"], +) +def test_scorer_select_proba_error(scorer): + # check that we raise the proper error when passing an unknown + # pos_label + X, y = make_classification( + n_classes=2, n_informative=3, n_samples=20, random_state=0 + ) + lr = LogisticRegression().fit(X, y) + assert scorer._kwargs["pos_label"] not in np.unique(y).tolist() + + err_msg = "is not a valid label" + with pytest.raises(ValueError, match=err_msg): + scorer(lr, X, y) + + +def test_get_scorer_return_copy(): + # test that get_scorer returns a copy + assert get_scorer("roc_auc") is not get_scorer("roc_auc") + + +def test_scorer_no_op_multiclass_select_proba(): + # check that calling a _Scorer on a multiclass problem do not raise + # even if `y_true` would be binary during the scoring. + # `_select_proba_binary` should not be called in this case. + X, y = make_classification( + n_classes=3, n_informative=3, n_samples=20, random_state=0 + ) + lr = LogisticRegression().fit(X, y) + + mask_last_class = y == lr.classes_[-1] + X_test, y_test = X[~mask_last_class], y[~mask_last_class] + assert_array_equal(np.unique(y_test), lr.classes_[:-1]) + + scorer = make_scorer( + roc_auc_score, + response_method="predict_proba", + multi_class="ovo", + labels=lr.classes_, + ) + scorer(lr, X_test, y_test) + + +@pytest.mark.parametrize("name", get_scorer_names()) +def test_scorer_set_score_request_raises(name): + """Test that set_score_request is only available when feature flag is on.""" + # Make sure they expose the routing methods. 
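A compact illustration of the flag-gated behaviour covered by this test and the routing tests that follow (editorial sketch, not part of the diff): the same call that raises without the feature flag is permitted inside the config context.

    from sklearn import config_context
    from sklearn.metrics import get_scorer

    acc_scorer = get_scorer("accuracy")
    # acc_scorer.set_score_request(sample_weight=True) would raise RuntimeError here
    with config_context(enable_metadata_routing=True):
        acc_scorer.set_score_request(sample_weight=True)  # allowed when routing is enabled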
+ scorer = get_scorer(name) + with pytest.raises(RuntimeError, match="This method is only available"): + scorer.set_score_request() + + +@pytest.mark.parametrize("name", get_scorer_names(), ids=get_scorer_names()) +@config_context(enable_metadata_routing=True) +def test_scorer_metadata_request(name): + """Testing metadata requests for scorers. + + This test checks many small things in a large test, to reduce the + boilerplate required for each section. + """ + # Make sure they expose the routing methods. + scorer = get_scorer(name) + assert hasattr(scorer, "set_score_request") + assert hasattr(scorer, "get_metadata_routing") + + # Check that by default no metadata is requested. + assert_request_is_empty(scorer.get_metadata_routing()) + + weighted_scorer = scorer.set_score_request(sample_weight=True) + # set_score_request should mutate the instance, rather than returning a + # new instance + assert weighted_scorer is scorer + + # make sure the scorer doesn't request anything on methods other than + # `score`, and that the requested value on `score` is correct. + assert_request_is_empty(weighted_scorer.get_metadata_routing(), exclude="score") + assert ( + weighted_scorer.get_metadata_routing().score.requests["sample_weight"] is True + ) + + # make sure putting the scorer in a router doesn't request anything by + # default + router = MetadataRouter(owner="test").add( + scorer=get_scorer(name), + method_mapping=MethodMapping().add(caller="score", callee="score"), + ) + # make sure `sample_weight` is refused if passed. + with pytest.raises(TypeError, match="got unexpected argument"): + router.validate_metadata(params={"sample_weight": 1}, method="score") + # make sure `sample_weight` is not routed even if passed. + routed_params = router.route_params(params={"sample_weight": 1}, caller="score") + assert not routed_params.scorer.score + + # make sure putting weighted_scorer in a router requests sample_weight + router = MetadataRouter(owner="test").add( + scorer=weighted_scorer, + method_mapping=MethodMapping().add(caller="score", callee="score"), + ) + router.validate_metadata(params={"sample_weight": 1}, method="score") + routed_params = router.route_params(params={"sample_weight": 1}, caller="score") + assert list(routed_params.scorer.score.keys()) == ["sample_weight"] + + +@config_context(enable_metadata_routing=True) +def test_metadata_kwarg_conflict(): + """This test makes sure the right warning is raised if the user passes + some metadata both as a constructor to make_scorer, and during __call__. 
+ """ + X, y = make_classification( + n_classes=3, n_informative=3, n_samples=20, random_state=0 + ) + lr = LogisticRegression().fit(X, y) + + scorer = make_scorer( + roc_auc_score, + response_method="predict_proba", + multi_class="ovo", + labels=lr.classes_, + ) + with pytest.warns(UserWarning, match="already set as kwargs"): + scorer.set_score_request(labels=True) + + with pytest.warns(UserWarning, match="There is an overlap"): + scorer(lr, X, y, labels=lr.classes_) + + +@config_context(enable_metadata_routing=True) +def test_PassthroughScorer_set_score_request(): + """Test that _PassthroughScorer.set_score_request adds the correct metadata request + on itself and doesn't change its estimator's routing.""" + est = LogisticRegression().set_score_request(sample_weight="estimator_weights") + # make a `_PassthroughScorer` with `check_scoring`: + scorer = check_scoring(est, None) + assert ( + scorer.get_metadata_routing().score.requests["sample_weight"] + == "estimator_weights" + ) + + scorer.set_score_request(sample_weight="scorer_weights") + assert ( + scorer.get_metadata_routing().score.requests["sample_weight"] + == "scorer_weights" + ) + + # making sure changing the passthrough object doesn't affect the estimator. + assert ( + est.get_metadata_routing().score.requests["sample_weight"] + == "estimator_weights" + ) + + +def test_PassthroughScorer_set_score_request_raises_without_routing_enabled(): + """Test that _PassthroughScorer.set_score_request raises if metadata routing is + disabled.""" + scorer = check_scoring(LogisticRegression(), None) + msg = "This method is only available when metadata routing is enabled." + + with pytest.raises(RuntimeError, match=msg): + scorer.set_score_request(sample_weight="my_weights") + + +@config_context(enable_metadata_routing=True) +def test_multimetric_scoring_metadata_routing(): + # Test that _MultimetricScorer properly routes metadata. + def score1(y_true, y_pred): + return 1 + + def score2(y_true, y_pred, sample_weight="test"): + # make sure sample_weight is not passed + assert sample_weight == "test" + return 1 + + def score3(y_true, y_pred, sample_weight=None): + # make sure sample_weight is passed + assert sample_weight is not None + return 1 + + scorers = { + "score1": make_scorer(score1), + "score2": make_scorer(score2).set_score_request(sample_weight=False), + "score3": make_scorer(score3).set_score_request(sample_weight=True), + } + + X, y = make_classification( + n_samples=50, n_features=2, n_redundant=0, random_state=0 + ) + + clf = DecisionTreeClassifier().fit(X, y) + + scorer_dict = _check_multimetric_scoring(clf, scorers) + multi_scorer = _MultimetricScorer(scorers=scorer_dict) + # This passes since routing is done. + multi_scorer(clf, X, y, sample_weight=1) + + +@config_context(enable_metadata_routing=False) +def test_multimetric_scoring_kwargs(): + # Test that _MultimetricScorer correctly forwards kwargs + # to the scorers when metadata routing is disabled. + # `sample_weight` is only forwarded to the scorers that accept it. + # Other arguments are forwarded to all scorers. 
+ def score1(y_true, y_pred, common_arg=None): + # make sure common_arg is passed + assert common_arg is not None + return 1 + + def score2(y_true, y_pred, common_arg=None, sample_weight=None): + # make sure common_arg is passed + assert common_arg is not None + # make sure sample_weight is passed + assert sample_weight is not None + return 1 + + scorers = { + "score1": make_scorer(score1), + "score2": make_scorer(score2), + } + + X, y = make_classification( + n_samples=50, n_features=2, n_redundant=0, random_state=0 + ) + + clf = DecisionTreeClassifier().fit(X, y) + + scorer_dict = _check_multimetric_scoring(clf, scorers) + multi_scorer = _MultimetricScorer(scorers=scorer_dict) + multi_scorer(clf, X, y, common_arg=1, sample_weight=1) + + +def test_kwargs_without_metadata_routing_error(): + # Test that kwargs are not supported in scorers if metadata routing is not + # enabled. + # TODO: remove when enable_metadata_routing is deprecated + def score(y_true, y_pred, param=None): + return 1 # pragma: no cover + + X, y = make_classification( + n_samples=50, n_features=2, n_redundant=0, random_state=0 + ) + + clf = DecisionTreeClassifier().fit(X, y) + scorer = make_scorer(score) + with config_context(enable_metadata_routing=False): + with pytest.raises( + ValueError, match="is only supported if enable_metadata_routing=True" + ): + scorer(clf, X, y, param="blah") + + +def test_get_scorer_multilabel_indicator(): + """Check that our scorer deal with multi-label indicator matrices. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/26817 + """ + X, Y = make_multilabel_classification(n_samples=72, n_classes=3, random_state=0) + X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0) + + estimator = KNeighborsClassifier().fit(X_train, Y_train) + + score = get_scorer("average_precision")(estimator, X_test, Y_test) + assert score > 0.8 + + +@pytest.mark.parametrize( + "scorer, expected_repr", + [ + ( + get_scorer("accuracy"), + "make_scorer(accuracy_score, response_method='predict')", + ), + ( + get_scorer("neg_log_loss"), + ( + "make_scorer(log_loss, greater_is_better=False," + " response_method='predict_proba')" + ), + ), + ( + get_scorer("roc_auc"), + ( + "make_scorer(roc_auc_score, response_method=" + "('decision_function', 'predict_proba'))" + ), + ), + ( + make_scorer(fbeta_score, beta=2), + "make_scorer(fbeta_score, response_method='predict', beta=2)", + ), + ], +) +def test_make_scorer_repr(scorer, expected_repr): + """Check the representation of the scorer.""" + assert repr(scorer) == expected_repr + + +@pytest.mark.parametrize("pass_estimator", [True, False]) +def test_get_scorer_multimetric(pass_estimator): + """Check that check_scoring is compatible with multi-metric configurations.""" + X, y = make_classification(n_samples=150, n_features=10, random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + clf = LogisticRegression(random_state=0) + + if pass_estimator: + check_scoring_ = check_scoring + else: + check_scoring_ = partial(check_scoring, clf) + + clf.fit(X_train, y_train) + + y_pred = clf.predict(X_test) + y_proba = clf.predict_proba(X_test) + + expected_results = { + "r2": r2_score(y_test, y_pred), + "roc_auc": roc_auc_score(y_test, y_proba[:, 1]), + "accuracy": accuracy_score(y_test, y_pred), + } + + for container in [set, list, tuple]: + scoring = check_scoring_(scoring=container(["r2", "roc_auc", "accuracy"])) + result = scoring(clf, X_test, y_test) + + assert result.keys() == 
expected_results.keys() + for name in result: + assert result[name] == pytest.approx(expected_results[name]) + + def double_accuracy(y_true, y_pred): + return 2 * accuracy_score(y_true, y_pred) + + custom_scorer = make_scorer(double_accuracy, response_method="predict") + + # dict with different names + dict_scoring = check_scoring_( + scoring={ + "my_r2": "r2", + "my_roc_auc": "roc_auc", + "double_accuracy": custom_scorer, + } + ) + dict_result = dict_scoring(clf, X_test, y_test) + assert len(dict_result) == 3 + assert dict_result["my_r2"] == pytest.approx(expected_results["r2"]) + assert dict_result["my_roc_auc"] == pytest.approx(expected_results["roc_auc"]) + assert dict_result["double_accuracy"] == pytest.approx( + 2 * expected_results["accuracy"] + ) + + +def test_multimetric_scorer_repr(): + """Check repr for multimetric scorer""" + multi_metric_scorer = check_scoring(scoring=["accuracy", "r2"]) + + assert str(multi_metric_scorer) == 'MultiMetricScorer("accuracy", "r2")' + + +def test_check_scoring_multimetric_raise_exc(): + """Test that check_scoring returns error code for a subset of scorers in + multimetric scoring if raise_exc=False and raises otherwise.""" + + def raising_scorer(estimator, X, y): + raise ValueError("That doesn't work.") + + X, y = make_classification(n_samples=150, n_features=10, random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + clf = LogisticRegression().fit(X_train, y_train) + + # "raising_scorer" is raising ValueError and should return an string representation + # of the error of the last scorer: + scoring = { + "accuracy": make_scorer(accuracy_score), + "raising_scorer": raising_scorer, + } + scoring_call = check_scoring(estimator=clf, scoring=scoring, raise_exc=False) + scores = scoring_call(clf, X_test, y_test) + assert "That doesn't work." in scores["raising_scorer"] + + # should raise an error + scoring_call = check_scoring(estimator=clf, scoring=scoring, raise_exc=True) + err_msg = "That doesn't work." + with pytest.raises(ValueError, match=err_msg): + scores = scoring_call(clf, X_test, y_test) + + +@pytest.mark.parametrize("enable_metadata_routing", [True, False]) +def test_metadata_routing_multimetric_metadata_routing(enable_metadata_routing): + """Test multimetric scorer works with and without metadata routing enabled when + there is no actual metadata to pass. + + Non-regression test for https://github.com/scikit-learn/scikit-learn/issues/28256 + """ + X, y = make_classification(n_samples=50, n_features=10, random_state=0) + estimator = EstimatorWithFitAndPredict().fit(X, y) + + multimetric_scorer = _MultimetricScorer(scorers={"acc": get_scorer("accuracy")}) + with config_context(enable_metadata_routing=enable_metadata_routing): + multimetric_scorer(estimator, X, y) + + +def test_curve_scorer(): + """Check the behaviour of the `_CurveScorer` class.""" + X, y = make_classification(random_state=0) + estimator = LogisticRegression().fit(X, y) + curve_scorer = _CurveScorer( + balanced_accuracy_score, + sign=1, + response_method="predict_proba", + thresholds=10, + kwargs={}, + ) + scores, thresholds = curve_scorer(estimator, X, y) + + assert thresholds.shape == scores.shape + # check that the thresholds are probabilities with extreme values close to 0 and 1. + # they are not exactly 0 and 1 because they are the extremum of the + # `estimator.predict_proba(X)` values. 
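A rough sketch of what the comment above describes, assuming the threshold grid is an evenly spaced span of the observed positive-class probabilities (the exact construction is internal to `_CurveScorer` and may differ):

    import numpy as np
    proba_pos = estimator.predict_proba(X)[:, 1]                  # names as in this test
    grid = np.linspace(proba_pos.min(), proba_pos.max(), num=10)  # 10 candidate thresholds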
+ assert 0 <= thresholds.min() <= 0.01 + assert 0.99 <= thresholds.max() <= 1 + # balanced accuracy should be between 0.5 and 1 when it is not adjusted + assert 0.5 <= scores.min() <= 1 + + # check that passing kwargs to the scorer works + curve_scorer = _CurveScorer( + balanced_accuracy_score, + sign=1, + response_method="predict_proba", + thresholds=10, + kwargs={"adjusted": True}, + ) + scores, thresholds = curve_scorer(estimator, X, y) + + # balanced accuracy should be between 0.5 and 1 when it is not adjusted + assert 0 <= scores.min() <= 0.5 + + # check that we can inverse the sign of the score when dealing with `neg_*` scorer + curve_scorer = _CurveScorer( + balanced_accuracy_score, + sign=-1, + response_method="predict_proba", + thresholds=10, + kwargs={"adjusted": True}, + ) + scores, thresholds = curve_scorer(estimator, X, y) + + assert all(scores <= 0) + + +def test_curve_scorer_pos_label(global_random_seed): + """Check that we propagate properly the `pos_label` parameter to the scorer.""" + n_samples = 30 + X, y = make_classification( + n_samples=n_samples, weights=[0.9, 0.1], random_state=global_random_seed + ) + estimator = LogisticRegression().fit(X, y) + + curve_scorer = _CurveScorer( + recall_score, + sign=1, + response_method="predict_proba", + thresholds=10, + kwargs={"pos_label": 1}, + ) + scores_pos_label_1, thresholds_pos_label_1 = curve_scorer(estimator, X, y) + + curve_scorer = _CurveScorer( + recall_score, + sign=1, + response_method="predict_proba", + thresholds=10, + kwargs={"pos_label": 0}, + ) + scores_pos_label_0, thresholds_pos_label_0 = curve_scorer(estimator, X, y) + + # Since `pos_label` is forwarded to the curve_scorer, the thresholds are not equal. + assert not (thresholds_pos_label_1 == thresholds_pos_label_0).all() + # The min-max range for the thresholds is defined by the probabilities of the + # `pos_label` class (the column of `predict_proba`). + y_pred = estimator.predict_proba(X) + assert thresholds_pos_label_0.min() == pytest.approx(y_pred.min(axis=0)[0]) + assert thresholds_pos_label_0.max() == pytest.approx(y_pred.max(axis=0)[0]) + assert thresholds_pos_label_1.min() == pytest.approx(y_pred.min(axis=0)[1]) + assert thresholds_pos_label_1.max() == pytest.approx(y_pred.max(axis=0)[1]) + + # The recall cannot be negative and `pos_label=1` should have a higher recall + # since there is less samples to be considered. + assert 0.0 < scores_pos_label_0.min() < scores_pos_label_1.min() + assert scores_pos_label_0.max() == pytest.approx(1.0) + assert scores_pos_label_1.max() == pytest.approx(1.0) + + +# TODO(1.8): remove +def test_make_scorer_reponse_method_default_warning(): + with pytest.warns(FutureWarning, match="response_method=None is deprecated"): + make_scorer(accuracy_score, response_method=None) + + # No warning is raised if response_method is left to its default value + # because the future default value has the same effect as the current one. 
+ with warnings.catch_warnings(): + warnings.simplefilter("error", FutureWarning) + make_scorer(accuracy_score) diff --git a/.venv/lib/python3.12/site-packages/sklearn/mixture/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/mixture/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c27263a0ed74381a7c8dad4d6488eba570eb49b8 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/mixture/__init__.py @@ -0,0 +1,9 @@ +"""Mixture modeling algorithms.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ._bayesian_mixture import BayesianGaussianMixture +from ._gaussian_mixture import GaussianMixture + +__all__ = ["BayesianGaussianMixture", "GaussianMixture"] diff --git a/.venv/lib/python3.12/site-packages/sklearn/mixture/_base.py b/.venv/lib/python3.12/site-packages/sklearn/mixture/_base.py new file mode 100644 index 0000000000000000000000000000000000000000..f66344a2847533629f52ddb10a4e819144cc8cfe --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/mixture/_base.py @@ -0,0 +1,571 @@ +"""Base class for mixture models.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from abc import ABCMeta, abstractmethod +from numbers import Integral, Real +from time import time + +import numpy as np +from scipy.special import logsumexp + +from .. import cluster +from ..base import BaseEstimator, DensityMixin, _fit_context +from ..cluster import kmeans_plusplus +from ..exceptions import ConvergenceWarning +from ..utils import check_random_state +from ..utils._param_validation import Interval, StrOptions +from ..utils.validation import check_is_fitted, validate_data + + +def _check_shape(param, param_shape, name): + """Validate the shape of the input parameter 'param'. + + Parameters + ---------- + param : array + + param_shape : tuple + + name : str + """ + param = np.array(param) + if param.shape != param_shape: + raise ValueError( + "The parameter '%s' should have the shape of %s, but got %s" + % (name, param_shape, param.shape) + ) + + +class BaseMixture(DensityMixin, BaseEstimator, metaclass=ABCMeta): + """Base class for mixture models. + + This abstract class specifies an interface for all mixture classes and + provides basic common methods for mixture models. + """ + + _parameter_constraints: dict = { + "n_components": [Interval(Integral, 1, None, closed="left")], + "tol": [Interval(Real, 0.0, None, closed="left")], + "reg_covar": [Interval(Real, 0.0, None, closed="left")], + "max_iter": [Interval(Integral, 0, None, closed="left")], + "n_init": [Interval(Integral, 1, None, closed="left")], + "init_params": [ + StrOptions({"kmeans", "random", "random_from_data", "k-means++"}) + ], + "random_state": ["random_state"], + "warm_start": ["boolean"], + "verbose": ["verbose"], + "verbose_interval": [Interval(Integral, 1, None, closed="left")], + } + + def __init__( + self, + n_components, + tol, + reg_covar, + max_iter, + n_init, + init_params, + random_state, + warm_start, + verbose, + verbose_interval, + ): + self.n_components = n_components + self.tol = tol + self.reg_covar = reg_covar + self.max_iter = max_iter + self.n_init = n_init + self.init_params = init_params + self.random_state = random_state + self.warm_start = warm_start + self.verbose = verbose + self.verbose_interval = verbose_interval + + @abstractmethod + def _check_parameters(self, X): + """Check initial parameters of the derived class. 
+ + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + """ + pass + + def _initialize_parameters(self, X, random_state): + """Initialize the model parameters. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + + random_state : RandomState + A random number generator instance that controls the random seed + used for the method chosen to initialize the parameters. + """ + n_samples, _ = X.shape + + if self.init_params == "kmeans": + resp = np.zeros((n_samples, self.n_components), dtype=X.dtype) + label = ( + cluster.KMeans( + n_clusters=self.n_components, n_init=1, random_state=random_state + ) + .fit(X) + .labels_ + ) + resp[np.arange(n_samples), label] = 1 + elif self.init_params == "random": + resp = np.asarray( + random_state.uniform(size=(n_samples, self.n_components)), dtype=X.dtype + ) + resp /= resp.sum(axis=1)[:, np.newaxis] + elif self.init_params == "random_from_data": + resp = np.zeros((n_samples, self.n_components), dtype=X.dtype) + indices = random_state.choice( + n_samples, size=self.n_components, replace=False + ) + resp[indices, np.arange(self.n_components)] = 1 + elif self.init_params == "k-means++": + resp = np.zeros((n_samples, self.n_components), dtype=X.dtype) + _, indices = kmeans_plusplus( + X, + self.n_components, + random_state=random_state, + ) + resp[indices, np.arange(self.n_components)] = 1 + + self._initialize(X, resp) + + @abstractmethod + def _initialize(self, X, resp): + """Initialize the model parameters of the derived class. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + + resp : array-like of shape (n_samples, n_components) + """ + pass + + def fit(self, X, y=None): + """Estimate model parameters with the EM algorithm. + + The method fits the model ``n_init`` times and sets the parameters with + which the model has the largest likelihood or lower bound. Within each + trial, the method iterates between E-step and M-step for ``max_iter`` + times until the change of likelihood or lower bound is less than + ``tol``, otherwise, a ``ConvergenceWarning`` is raised. + If ``warm_start`` is ``True``, then ``n_init`` is ignored and a single + initialization is performed upon the first call. Upon consecutive + calls, training starts where it left off. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + List of n_features-dimensional data points. Each row + corresponds to a single data point. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + The fitted mixture. + """ + # parameters are validated in fit_predict + self.fit_predict(X, y) + return self + + @_fit_context(prefer_skip_nested_validation=True) + def fit_predict(self, X, y=None): + """Estimate model parameters using X and predict the labels for X. + + The method fits the model n_init times and sets the parameters with + which the model has the largest likelihood or lower bound. Within each + trial, the method iterates between E-step and M-step for `max_iter` + times until the change of likelihood or lower bound is less than + `tol`, otherwise, a :class:`~sklearn.exceptions.ConvergenceWarning` is + raised. After fitting, it predicts the most probable label for the + input data points. + + .. versionadded:: 0.20 + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + List of n_features-dimensional data points. Each row + corresponds to a single data point. 
+ + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + labels : array, shape (n_samples,) + Component labels. + """ + X = validate_data(self, X, dtype=[np.float64, np.float32], ensure_min_samples=2) + if X.shape[0] < self.n_components: + raise ValueError( + "Expected n_samples >= n_components " + f"but got n_components = {self.n_components}, " + f"n_samples = {X.shape[0]}" + ) + self._check_parameters(X) + + # if we enable warm_start, we will have a unique initialisation + do_init = not (self.warm_start and hasattr(self, "converged_")) + n_init = self.n_init if do_init else 1 + + max_lower_bound = -np.inf + best_lower_bounds = [] + self.converged_ = False + + random_state = check_random_state(self.random_state) + + n_samples, _ = X.shape + for init in range(n_init): + self._print_verbose_msg_init_beg(init) + + if do_init: + self._initialize_parameters(X, random_state) + + lower_bound = -np.inf if do_init else self.lower_bound_ + current_lower_bounds = [] + + if self.max_iter == 0: + best_params = self._get_parameters() + best_n_iter = 0 + else: + converged = False + for n_iter in range(1, self.max_iter + 1): + prev_lower_bound = lower_bound + + log_prob_norm, log_resp = self._e_step(X) + self._m_step(X, log_resp) + lower_bound = self._compute_lower_bound(log_resp, log_prob_norm) + current_lower_bounds.append(lower_bound) + + change = lower_bound - prev_lower_bound + self._print_verbose_msg_iter_end(n_iter, change) + + if abs(change) < self.tol: + converged = True + break + + self._print_verbose_msg_init_end(lower_bound, converged) + + if lower_bound > max_lower_bound or max_lower_bound == -np.inf: + max_lower_bound = lower_bound + best_params = self._get_parameters() + best_n_iter = n_iter + best_lower_bounds = current_lower_bounds + self.converged_ = converged + + # Should only warn about convergence if max_iter > 0, otherwise + # the user is assumed to have used 0-iters initialization + # to get the initial means. + if not self.converged_ and self.max_iter > 0: + warnings.warn( + ( + "Best performing initialization did not converge. " + "Try different init parameters, or increase max_iter, " + "tol, or check for degenerate data." + ), + ConvergenceWarning, + ) + + self._set_parameters(best_params) + self.n_iter_ = best_n_iter + self.lower_bound_ = max_lower_bound + self.lower_bounds_ = best_lower_bounds + + # Always do a final e-step to guarantee that the labels returned by + # fit_predict(X) are always consistent with fit(X).predict(X) + # for any value of max_iter and tol (and any random_state). + _, log_resp = self._e_step(X) + + return log_resp.argmax(axis=1) + + def _e_step(self, X): + """E step. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + + Returns + ------- + log_prob_norm : float + Mean of the logarithms of the probabilities of each sample in X + + log_responsibility : array, shape (n_samples, n_components) + Logarithm of the posterior probabilities (or responsibilities) of + the point of each sample in X. + """ + log_prob_norm, log_resp = self._estimate_log_prob_resp(X) + return np.mean(log_prob_norm), log_resp + + @abstractmethod + def _m_step(self, X, log_resp): + """M step. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + + log_resp : array-like of shape (n_samples, n_components) + Logarithm of the posterior probabilities (or responsibilities) of + the point of each sample in X. 
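# Hedged check (synthetic data assumed) of the guarantee implemented by the
# final E-step in ``fit_predict`` above: with identical settings,
# ``fit_predict(X)`` agrees with ``fit(X).predict(X)``.
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.mixture import GaussianMixture

X, _ = make_blobs(n_samples=200, centers=2, random_state=1)
labels_a = GaussianMixture(n_components=2, random_state=0).fit_predict(X)
labels_b = GaussianMixture(n_components=2, random_state=0).fit(X).predict(X)
assert np.array_equal(labels_a, labels_b)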
+ """ + pass + + @abstractmethod + def _get_parameters(self): + pass + + @abstractmethod + def _set_parameters(self, params): + pass + + def score_samples(self, X): + """Compute the log-likelihood of each sample. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + List of n_features-dimensional data points. Each row + corresponds to a single data point. + + Returns + ------- + log_prob : array, shape (n_samples,) + Log-likelihood of each sample in `X` under the current model. + """ + check_is_fitted(self) + X = validate_data(self, X, reset=False) + + return logsumexp(self._estimate_weighted_log_prob(X), axis=1) + + def score(self, X, y=None): + """Compute the per-sample average log-likelihood of the given data X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_dimensions) + List of n_features-dimensional data points. Each row + corresponds to a single data point. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + log_likelihood : float + Log-likelihood of `X` under the Gaussian mixture model. + """ + return self.score_samples(X).mean() + + def predict(self, X): + """Predict the labels for the data samples in X using trained model. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + List of n_features-dimensional data points. Each row + corresponds to a single data point. + + Returns + ------- + labels : array, shape (n_samples,) + Component labels. + """ + check_is_fitted(self) + X = validate_data(self, X, reset=False) + return self._estimate_weighted_log_prob(X).argmax(axis=1) + + def predict_proba(self, X): + """Evaluate the components' density for each sample. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + List of n_features-dimensional data points. Each row + corresponds to a single data point. + + Returns + ------- + resp : array, shape (n_samples, n_components) + Density of each Gaussian component for each sample in X. + """ + check_is_fitted(self) + X = validate_data(self, X, reset=False) + _, log_resp = self._estimate_log_prob_resp(X) + return np.exp(log_resp) + + def sample(self, n_samples=1): + """Generate random samples from the fitted Gaussian distribution. + + Parameters + ---------- + n_samples : int, default=1 + Number of samples to generate. + + Returns + ------- + X : array, shape (n_samples, n_features) + Randomly generated sample. + + y : array, shape (nsamples,) + Component labels. + """ + check_is_fitted(self) + + if n_samples < 1: + raise ValueError( + "Invalid value for 'n_samples': %d . The sampling requires at " + "least one sample." 
% (self.n_components) + ) + + _, n_features = self.means_.shape + rng = check_random_state(self.random_state) + n_samples_comp = rng.multinomial(n_samples, self.weights_) + + if self.covariance_type == "full": + X = np.vstack( + [ + rng.multivariate_normal(mean, covariance, int(sample)) + for (mean, covariance, sample) in zip( + self.means_, self.covariances_, n_samples_comp + ) + ] + ) + elif self.covariance_type == "tied": + X = np.vstack( + [ + rng.multivariate_normal(mean, self.covariances_, int(sample)) + for (mean, sample) in zip(self.means_, n_samples_comp) + ] + ) + else: + X = np.vstack( + [ + mean + + rng.standard_normal(size=(sample, n_features)) + * np.sqrt(covariance) + for (mean, covariance, sample) in zip( + self.means_, self.covariances_, n_samples_comp + ) + ] + ) + + y = np.concatenate( + [np.full(sample, j, dtype=int) for j, sample in enumerate(n_samples_comp)] + ) + + return (X, y) + + def _estimate_weighted_log_prob(self, X): + """Estimate the weighted log-probabilities, log P(X | Z) + log weights. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + + Returns + ------- + weighted_log_prob : array, shape (n_samples, n_component) + """ + return self._estimate_log_prob(X) + self._estimate_log_weights() + + @abstractmethod + def _estimate_log_weights(self): + """Estimate log-weights in EM algorithm, E[ log pi ] in VB algorithm. + + Returns + ------- + log_weight : array, shape (n_components, ) + """ + pass + + @abstractmethod + def _estimate_log_prob(self, X): + """Estimate the log-probabilities log P(X | Z). + + Compute the log-probabilities per each component for each sample. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + + Returns + ------- + log_prob : array, shape (n_samples, n_component) + """ + pass + + def _estimate_log_prob_resp(self, X): + """Estimate log probabilities and responsibilities for each sample. + + Compute the log probabilities, weighted log probabilities per + component and responsibilities for each sample in X with respect to + the current state of the model. 
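# Illustrative sketch (assumed data) tying together the scoring, posterior
# and sampling methods defined above.
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.mixture import GaussianMixture

X, _ = make_blobs(n_samples=200, centers=2, random_state=2)
gm = GaussianMixture(n_components=2, random_state=0).fit(X)

resp = gm.predict_proba(X)                        # (n_samples, n_components)
assert np.allclose(resp.sum(axis=1), 1.0)         # responsibilities sum to 1
assert np.isclose(gm.score(X), gm.score_samples(X).mean())

X_new, comp = gm.sample(n_samples=10)             # multinomial split over weights_
print(X_new.shape, np.bincount(comp, minlength=2))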
+ + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + + Returns + ------- + log_prob_norm : array, shape (n_samples,) + log p(X) + + log_responsibilities : array, shape (n_samples, n_components) + logarithm of the responsibilities + """ + weighted_log_prob = self._estimate_weighted_log_prob(X) + log_prob_norm = logsumexp(weighted_log_prob, axis=1) + with np.errstate(under="ignore"): + # ignore underflow + log_resp = weighted_log_prob - log_prob_norm[:, np.newaxis] + return log_prob_norm, log_resp + + def _print_verbose_msg_init_beg(self, n_init): + """Print verbose message on initialization.""" + if self.verbose == 1: + print("Initialization %d" % n_init) + elif self.verbose >= 2: + print("Initialization %d" % n_init) + self._init_prev_time = time() + self._iter_prev_time = self._init_prev_time + + def _print_verbose_msg_iter_end(self, n_iter, diff_ll): + """Print verbose message on initialization.""" + if n_iter % self.verbose_interval == 0: + if self.verbose == 1: + print(" Iteration %d" % n_iter) + elif self.verbose >= 2: + cur_time = time() + print( + " Iteration %d\t time lapse %.5fs\t ll change %.5f" + % (n_iter, cur_time - self._iter_prev_time, diff_ll) + ) + self._iter_prev_time = cur_time + + def _print_verbose_msg_init_end(self, lb, init_has_converged): + """Print verbose message on the end of iteration.""" + converged_msg = "converged" if init_has_converged else "did not converge" + if self.verbose == 1: + print(f"Initialization {converged_msg}.") + elif self.verbose >= 2: + t = time() - self._init_prev_time + print( + f"Initialization {converged_msg}. time lapse {t:.5f}s\t lower bound" + f" {lb:.5f}." + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/mixture/_bayesian_mixture.py b/.venv/lib/python3.12/site-packages/sklearn/mixture/_bayesian_mixture.py new file mode 100644 index 0000000000000000000000000000000000000000..57220186faf61694f0945a276bc60254ba861bd5 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/mixture/_bayesian_mixture.py @@ -0,0 +1,891 @@ +"""Bayesian Gaussian Mixture Model.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import math +from numbers import Real + +import numpy as np +from scipy.special import betaln, digamma, gammaln + +from ..utils import check_array +from ..utils._param_validation import Interval, StrOptions +from ._base import BaseMixture, _check_shape +from ._gaussian_mixture import ( + _check_precision_matrix, + _check_precision_positivity, + _compute_log_det_cholesky, + _compute_precision_cholesky, + _estimate_gaussian_parameters, + _estimate_log_gaussian_prob, +) + + +def _log_dirichlet_norm(dirichlet_concentration): + """Compute the log of the Dirichlet distribution normalization term. + + Parameters + ---------- + dirichlet_concentration : array-like of shape (n_samples,) + The parameters values of the Dirichlet distribution. + + Returns + ------- + log_dirichlet_norm : float + The log normalization of the Dirichlet distribution. + """ + return gammaln(np.sum(dirichlet_concentration)) - np.sum( + gammaln(dirichlet_concentration) + ) + + +def _log_wishart_norm(degrees_of_freedom, log_det_precisions_chol, n_features): + """Compute the log of the Wishart distribution normalization term. + + Parameters + ---------- + degrees_of_freedom : array-like of shape (n_components,) + The number of degrees of freedom on the covariance Wishart + distributions. 
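# Hedged numeric sketch of the normalisation in ``_estimate_log_prob_resp``
# above: responsibilities are the weighted log-probabilities normalised per
# sample with ``logsumexp``.  Toy values assumed.
import numpy as np
from scipy.special import logsumexp

weighted_log_prob = np.log(np.array([[0.2, 0.6], [0.1, 0.1]]))
log_prob_norm = logsumexp(weighted_log_prob, axis=1)
log_resp = weighted_log_prob - log_prob_norm[:, np.newaxis]
assert np.allclose(np.exp(log_resp).sum(axis=1), 1.0)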
+ + log_det_precision_chol : array-like of shape (n_components,) + The determinant of the precision matrix for each component. + + n_features : int + The number of features. + + Return + ------ + log_wishart_norm : array-like of shape (n_components,) + The log normalization of the Wishart distribution. + """ + # To simplify the computation we have removed the np.log(np.pi) term + return -( + degrees_of_freedom * log_det_precisions_chol + + degrees_of_freedom * n_features * 0.5 * math.log(2.0) + + np.sum( + gammaln(0.5 * (degrees_of_freedom - np.arange(n_features)[:, np.newaxis])), + 0, + ) + ) + + +class BayesianGaussianMixture(BaseMixture): + """Variational Bayesian estimation of a Gaussian mixture. + + This class allows to infer an approximate posterior distribution over the + parameters of a Gaussian mixture distribution. The effective number of + components can be inferred from the data. + + This class implements two types of prior for the weights distribution: a + finite mixture model with Dirichlet distribution and an infinite mixture + model with the Dirichlet Process. In practice Dirichlet Process inference + algorithm is approximated and uses a truncated distribution with a fixed + maximum number of components (called the Stick-breaking representation). + The number of components actually used almost always depends on the data. + + .. versionadded:: 0.18 + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int, default=1 + The number of mixture components. Depending on the data and the value + of the `weight_concentration_prior` the model can decide to not use + all the components by setting some component `weights_` to values very + close to zero. The number of effective components is therefore smaller + than n_components. + + covariance_type : {'full', 'tied', 'diag', 'spherical'}, default='full' + String describing the type of covariance parameters to use. + Must be one of: + + - 'full' (each component has its own general covariance matrix), + - 'tied' (all components share the same general covariance matrix), + - 'diag' (each component has its own diagonal covariance matrix), + - 'spherical' (each component has its own single variance). + + tol : float, default=1e-3 + The convergence threshold. EM iterations will stop when the + lower bound average gain on the likelihood (of the training data with + respect to the model) is below this threshold. + + reg_covar : float, default=1e-6 + Non-negative regularization added to the diagonal of covariance. + Allows to assure that the covariance matrices are all positive. + + max_iter : int, default=100 + The number of EM iterations to perform. + + n_init : int, default=1 + The number of initializations to perform. The result with the highest + lower bound value on the likelihood is kept. + + init_params : {'kmeans', 'k-means++', 'random', 'random_from_data'}, \ + default='kmeans' + The method used to initialize the weights, the means and the + covariances. String must be one of: + + - 'kmeans': responsibilities are initialized using kmeans. + - 'k-means++': use the k-means++ method to initialize. + - 'random': responsibilities are initialized randomly. + - 'random_from_data': initial means are randomly selected data points. + + .. versionchanged:: v1.1 + `init_params` now accepts 'random_from_data' and 'k-means++' as + initialization methods. 
+ + weight_concentration_prior_type : {'dirichlet_process', 'dirichlet_distribution'}, \ + default='dirichlet_process' + String describing the type of the weight concentration prior. + + weight_concentration_prior : float or None, default=None + The dirichlet concentration of each component on the weight + distribution (Dirichlet). This is commonly called gamma in the + literature. The higher concentration puts more mass in + the center and will lead to more components being active, while a lower + concentration parameter will lead to more mass at the edge of the + mixture weights simplex. The value of the parameter must be greater + than 0. If it is None, it's set to ``1. / n_components``. + + mean_precision_prior : float or None, default=None + The precision prior on the mean distribution (Gaussian). + Controls the extent of where means can be placed. Larger + values concentrate the cluster means around `mean_prior`. + The value of the parameter must be greater than 0. + If it is None, it is set to 1. + + mean_prior : array-like, shape (n_features,), default=None + The prior on the mean distribution (Gaussian). + If it is None, it is set to the mean of X. + + degrees_of_freedom_prior : float or None, default=None + The prior of the number of degrees of freedom on the covariance + distributions (Wishart). If it is None, it's set to `n_features`. + + covariance_prior : float or array-like, default=None + The prior on the covariance distribution (Wishart). + If it is None, the emiprical covariance prior is initialized using the + covariance of X. The shape depends on `covariance_type`:: + + (n_features, n_features) if 'full', + (n_features, n_features) if 'tied', + (n_features) if 'diag', + float if 'spherical' + + random_state : int, RandomState instance or None, default=None + Controls the random seed given to the method chosen to initialize the + parameters (see `init_params`). + In addition, it controls the generation of random samples from the + fitted distribution (see the method `sample`). + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + warm_start : bool, default=False + If 'warm_start' is True, the solution of the last fitting is used as + initialization for the next call of fit(). This can speed up + convergence when fit is called several times on similar problems. + See :term:`the Glossary `. + + verbose : int, default=0 + Enable verbose output. If 1 then it prints the current + initialization and each iteration step. If greater than 1 then + it prints also the log probability and the time needed + for each step. + + verbose_interval : int, default=10 + Number of iteration done before the next print. + + Attributes + ---------- + weights_ : array-like of shape (n_components,) + The weights of each mixture components. + + means_ : array-like of shape (n_components, n_features) + The mean of each mixture component. + + covariances_ : array-like + The covariance of each mixture component. + The shape depends on `covariance_type`:: + + (n_components,) if 'spherical', + (n_features, n_features) if 'tied', + (n_components, n_features) if 'diag', + (n_components, n_features, n_features) if 'full' + + precisions_ : array-like + The precision matrices for each component in the mixture. A precision + matrix is the inverse of a covariance matrix. A covariance matrix is + symmetric positive definite so the mixture of Gaussian can be + equivalently parameterized by the precision matrices. 
Storing the + precision matrices instead of the covariance matrices makes it more + efficient to compute the log-likelihood of new samples at test time. + The shape depends on ``covariance_type``:: + + (n_components,) if 'spherical', + (n_features, n_features) if 'tied', + (n_components, n_features) if 'diag', + (n_components, n_features, n_features) if 'full' + + precisions_cholesky_ : array-like + The cholesky decomposition of the precision matrices of each mixture + component. A precision matrix is the inverse of a covariance matrix. + A covariance matrix is symmetric positive definite so the mixture of + Gaussian can be equivalently parameterized by the precision matrices. + Storing the precision matrices instead of the covariance matrices makes + it more efficient to compute the log-likelihood of new samples at test + time. The shape depends on ``covariance_type``:: + + (n_components,) if 'spherical', + (n_features, n_features) if 'tied', + (n_components, n_features) if 'diag', + (n_components, n_features, n_features) if 'full' + + converged_ : bool + True when convergence of the best fit of inference was reached, False otherwise. + + n_iter_ : int + Number of step used by the best fit of inference to reach the + convergence. + + lower_bound_ : float + Lower bound value on the model evidence (of the training data) of the + best fit of inference. + + lower_bounds_ : array-like of shape (`n_iter_`,) + The list of lower bound values on the model evidence from each iteration + of the best fit of inference. + + weight_concentration_prior_ : tuple or float + The dirichlet concentration of each component on the weight + distribution (Dirichlet). The type depends on + ``weight_concentration_prior_type``:: + + (float, float) if 'dirichlet_process' (Beta parameters), + float if 'dirichlet_distribution' (Dirichlet parameters). + + The higher concentration puts more mass in + the center and will lead to more components being active, while a lower + concentration parameter will lead to more mass at the edge of the + simplex. + + weight_concentration_ : array-like of shape (n_components,) + The dirichlet concentration of each component on the weight + distribution (Dirichlet). + + mean_precision_prior_ : float + The precision prior on the mean distribution (Gaussian). + Controls the extent of where means can be placed. + Larger values concentrate the cluster means around `mean_prior`. + If mean_precision_prior is set to None, `mean_precision_prior_` is set + to 1. + + mean_precision_ : array-like of shape (n_components,) + The precision of each components on the mean distribution (Gaussian). + + mean_prior_ : array-like of shape (n_features,) + The prior on the mean distribution (Gaussian). + + degrees_of_freedom_prior_ : float + The prior of the number of degrees of freedom on the covariance + distributions (Wishart). + + degrees_of_freedom_ : array-like of shape (n_components,) + The number of degrees of freedom of each components in the model. + + covariance_prior_ : float or array-like + The prior on the covariance distribution (Wishart). + The shape depends on `covariance_type`:: + + (n_features, n_features) if 'full', + (n_features, n_features) if 'tied', + (n_features) if 'diag', + float if 'spherical' + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. 
versionadded:: 1.0 + + See Also + -------- + GaussianMixture : Finite Gaussian mixture fit with EM. + + References + ---------- + + .. [1] `Bishop, Christopher M. (2006). "Pattern recognition and machine + learning". Vol. 4 No. 4. New York: Springer. + `_ + + .. [2] `Hagai Attias. (2000). "A Variational Bayesian Framework for + Graphical Models". In Advances in Neural Information Processing + Systems 12. + `_ + + .. [3] `Blei, David M. and Michael I. Jordan. (2006). "Variational + inference for Dirichlet process mixtures". Bayesian analysis 1.1 + `_ + + Examples + -------- + >>> import numpy as np + >>> from sklearn.mixture import BayesianGaussianMixture + >>> X = np.array([[1, 2], [1, 4], [1, 0], [4, 2], [12, 4], [10, 7]]) + >>> bgm = BayesianGaussianMixture(n_components=2, random_state=42).fit(X) + >>> bgm.means_ + array([[2.49 , 2.29], + [8.45, 4.52 ]]) + >>> bgm.predict([[0, 0], [9, 3]]) + array([0, 1]) + """ + + _parameter_constraints: dict = { + **BaseMixture._parameter_constraints, + "covariance_type": [StrOptions({"spherical", "tied", "diag", "full"})], + "weight_concentration_prior_type": [ + StrOptions({"dirichlet_process", "dirichlet_distribution"}) + ], + "weight_concentration_prior": [ + None, + Interval(Real, 0.0, None, closed="neither"), + ], + "mean_precision_prior": [None, Interval(Real, 0.0, None, closed="neither")], + "mean_prior": [None, "array-like"], + "degrees_of_freedom_prior": [None, Interval(Real, 0.0, None, closed="neither")], + "covariance_prior": [ + None, + "array-like", + Interval(Real, 0.0, None, closed="neither"), + ], + } + + def __init__( + self, + *, + n_components=1, + covariance_type="full", + tol=1e-3, + reg_covar=1e-6, + max_iter=100, + n_init=1, + init_params="kmeans", + weight_concentration_prior_type="dirichlet_process", + weight_concentration_prior=None, + mean_precision_prior=None, + mean_prior=None, + degrees_of_freedom_prior=None, + covariance_prior=None, + random_state=None, + warm_start=False, + verbose=0, + verbose_interval=10, + ): + super().__init__( + n_components=n_components, + tol=tol, + reg_covar=reg_covar, + max_iter=max_iter, + n_init=n_init, + init_params=init_params, + random_state=random_state, + warm_start=warm_start, + verbose=verbose, + verbose_interval=verbose_interval, + ) + + self.covariance_type = covariance_type + self.weight_concentration_prior_type = weight_concentration_prior_type + self.weight_concentration_prior = weight_concentration_prior + self.mean_precision_prior = mean_precision_prior + self.mean_prior = mean_prior + self.degrees_of_freedom_prior = degrees_of_freedom_prior + self.covariance_prior = covariance_prior + + def _check_parameters(self, X): + """Check that the parameters are well defined. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + """ + self._check_weights_parameters() + self._check_means_parameters(X) + self._check_precision_parameters(X) + self._checkcovariance_prior_parameter(X) + + def _check_weights_parameters(self): + """Check the parameter of the Dirichlet distribution.""" + if self.weight_concentration_prior is None: + self.weight_concentration_prior_ = 1.0 / self.n_components + else: + self.weight_concentration_prior_ = self.weight_concentration_prior + + def _check_means_parameters(self, X): + """Check the parameters of the Gaussian distribution. 
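# Hedged sketch (synthetic data and threshold assumed) of the behaviour
# described in the docstring above: with a Dirichlet-process prior the model
# can drive the weights of surplus components toward zero.
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.mixture import BayesianGaussianMixture

X, _ = make_blobs(n_samples=400, centers=2, random_state=4)
bgm = BayesianGaussianMixture(
    n_components=6,
    weight_concentration_prior_type="dirichlet_process",
    weight_concentration_prior=1e-2,
    max_iter=500,
    random_state=0,
).fit(X)
print(np.round(bgm.weights_, 3))
print("effective components:", int((bgm.weights_ > 1e-2).sum()))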
+ + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + """ + _, n_features = X.shape + + if self.mean_precision_prior is None: + self.mean_precision_prior_ = 1.0 + else: + self.mean_precision_prior_ = self.mean_precision_prior + + if self.mean_prior is None: + self.mean_prior_ = X.mean(axis=0) + else: + self.mean_prior_ = check_array( + self.mean_prior, dtype=[np.float64, np.float32], ensure_2d=False + ) + _check_shape(self.mean_prior_, (n_features,), "means") + + def _check_precision_parameters(self, X): + """Check the prior parameters of the precision distribution. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + """ + _, n_features = X.shape + + if self.degrees_of_freedom_prior is None: + self.degrees_of_freedom_prior_ = n_features + elif self.degrees_of_freedom_prior > n_features - 1.0: + self.degrees_of_freedom_prior_ = self.degrees_of_freedom_prior + else: + raise ValueError( + "The parameter 'degrees_of_freedom_prior' " + "should be greater than %d, but got %.3f." + % (n_features - 1, self.degrees_of_freedom_prior) + ) + + def _checkcovariance_prior_parameter(self, X): + """Check the `covariance_prior_`. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + """ + _, n_features = X.shape + + if self.covariance_prior is None: + self.covariance_prior_ = { + "full": np.atleast_2d(np.cov(X.T)), + "tied": np.atleast_2d(np.cov(X.T)), + "diag": np.var(X, axis=0, ddof=1), + "spherical": np.var(X, axis=0, ddof=1).mean(), + }[self.covariance_type] + + elif self.covariance_type in ["full", "tied"]: + self.covariance_prior_ = check_array( + self.covariance_prior, dtype=[np.float64, np.float32], ensure_2d=False + ) + _check_shape( + self.covariance_prior_, + (n_features, n_features), + "%s covariance_prior" % self.covariance_type, + ) + _check_precision_matrix(self.covariance_prior_, self.covariance_type) + elif self.covariance_type == "diag": + self.covariance_prior_ = check_array( + self.covariance_prior, dtype=[np.float64, np.float32], ensure_2d=False + ) + _check_shape( + self.covariance_prior_, + (n_features,), + "%s covariance_prior" % self.covariance_type, + ) + _check_precision_positivity(self.covariance_prior_, self.covariance_type) + # spherical case + else: + self.covariance_prior_ = self.covariance_prior + + def _initialize(self, X, resp): + """Initialization of the mixture parameters. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + + resp : array-like of shape (n_samples, n_components) + """ + nk, xk, sk = _estimate_gaussian_parameters( + X, resp, self.reg_covar, self.covariance_type + ) + + self._estimate_weights(nk) + self._estimate_means(nk, xk) + self._estimate_precisions(nk, xk, sk) + + def _estimate_weights(self, nk): + """Estimate the parameters of the Dirichlet distribution. + + Parameters + ---------- + nk : array-like of shape (n_components,) + """ + if self.weight_concentration_prior_type == "dirichlet_process": + # For dirichlet process weight_concentration will be a tuple + # containing the two parameters of the beta distribution + self.weight_concentration_ = ( + 1.0 + nk, + ( + self.weight_concentration_prior_ + + np.hstack((np.cumsum(nk[::-1])[-2::-1], 0)) + ), + ) + else: + # case Variational Gaussian mixture with dirichlet distribution + self.weight_concentration_ = self.weight_concentration_prior_ + nk + + def _estimate_means(self, nk, xk): + """Estimate the parameters of the Gaussian distribution. 
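# Hedged sketch of the stick-breaking construction used above for the
# 'dirichlet_process' prior: expected mixture weights are recovered from the
# two beta parameters produced by ``_estimate_weights`` (the same formula is
# applied later in ``_set_parameters``).  Toy numbers assumed.
import numpy as np

a = np.array([5.0, 3.0, 1.0])                 # first beta parameters
b = np.array([4.0, 1.0, 0.5])                 # second beta parameters
tmp = b / (a + b)
weights = a / (a + b) * np.hstack((1, np.cumprod(tmp[:-1])))
weights /= weights.sum()
print(np.round(weights, 3))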
+ + Parameters + ---------- + nk : array-like of shape (n_components,) + + xk : array-like of shape (n_components, n_features) + """ + self.mean_precision_ = self.mean_precision_prior_ + nk + self.means_ = ( + self.mean_precision_prior_ * self.mean_prior_ + nk[:, np.newaxis] * xk + ) / self.mean_precision_[:, np.newaxis] + + def _estimate_precisions(self, nk, xk, sk): + """Estimate the precisions parameters of the precision distribution. + + Parameters + ---------- + nk : array-like of shape (n_components,) + + xk : array-like of shape (n_components, n_features) + + sk : array-like + The shape depends of `covariance_type`: + 'full' : (n_components, n_features, n_features) + 'tied' : (n_features, n_features) + 'diag' : (n_components, n_features) + 'spherical' : (n_components,) + """ + { + "full": self._estimate_wishart_full, + "tied": self._estimate_wishart_tied, + "diag": self._estimate_wishart_diag, + "spherical": self._estimate_wishart_spherical, + }[self.covariance_type](nk, xk, sk) + + self.precisions_cholesky_ = _compute_precision_cholesky( + self.covariances_, self.covariance_type + ) + + def _estimate_wishart_full(self, nk, xk, sk): + """Estimate the full Wishart distribution parameters. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + + nk : array-like of shape (n_components,) + + xk : array-like of shape (n_components, n_features) + + sk : array-like of shape (n_components, n_features, n_features) + """ + _, n_features = xk.shape + + # Warning : in some Bishop book, there is a typo on the formula 10.63 + # `degrees_of_freedom_k = degrees_of_freedom_0 + Nk` is + # the correct formula + self.degrees_of_freedom_ = self.degrees_of_freedom_prior_ + nk + + self.covariances_ = np.empty((self.n_components, n_features, n_features)) + + for k in range(self.n_components): + diff = xk[k] - self.mean_prior_ + self.covariances_[k] = ( + self.covariance_prior_ + + nk[k] * sk[k] + + nk[k] + * self.mean_precision_prior_ + / self.mean_precision_[k] + * np.outer(diff, diff) + ) + + # Contrary to the original bishop book, we normalize the covariances + self.covariances_ /= self.degrees_of_freedom_[:, np.newaxis, np.newaxis] + + def _estimate_wishart_tied(self, nk, xk, sk): + """Estimate the tied Wishart distribution parameters. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + + nk : array-like of shape (n_components,) + + xk : array-like of shape (n_components, n_features) + + sk : array-like of shape (n_features, n_features) + """ + _, n_features = xk.shape + + # Warning : in some Bishop book, there is a typo on the formula 10.63 + # `degrees_of_freedom_k = degrees_of_freedom_0 + Nk` + # is the correct formula + self.degrees_of_freedom_ = ( + self.degrees_of_freedom_prior_ + nk.sum() / self.n_components + ) + + diff = xk - self.mean_prior_ + self.covariances_ = ( + self.covariance_prior_ + + sk * nk.sum() / self.n_components + + self.mean_precision_prior_ + / self.n_components + * np.dot((nk / self.mean_precision_) * diff.T, diff) + ) + + # Contrary to the original bishop book, we normalize the covariances + self.covariances_ /= self.degrees_of_freedom_ + + def _estimate_wishart_diag(self, nk, xk, sk): + """Estimate the diag Wishart distribution parameters. 
+ + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + + nk : array-like of shape (n_components,) + + xk : array-like of shape (n_components, n_features) + + sk : array-like of shape (n_components, n_features) + """ + _, n_features = xk.shape + + # Warning : in some Bishop book, there is a typo on the formula 10.63 + # `degrees_of_freedom_k = degrees_of_freedom_0 + Nk` + # is the correct formula + self.degrees_of_freedom_ = self.degrees_of_freedom_prior_ + nk + + diff = xk - self.mean_prior_ + self.covariances_ = self.covariance_prior_ + nk[:, np.newaxis] * ( + sk + + (self.mean_precision_prior_ / self.mean_precision_)[:, np.newaxis] + * np.square(diff) + ) + + # Contrary to the original bishop book, we normalize the covariances + self.covariances_ /= self.degrees_of_freedom_[:, np.newaxis] + + def _estimate_wishart_spherical(self, nk, xk, sk): + """Estimate the spherical Wishart distribution parameters. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + + nk : array-like of shape (n_components,) + + xk : array-like of shape (n_components, n_features) + + sk : array-like of shape (n_components,) + """ + _, n_features = xk.shape + + # Warning : in some Bishop book, there is a typo on the formula 10.63 + # `degrees_of_freedom_k = degrees_of_freedom_0 + Nk` + # is the correct formula + self.degrees_of_freedom_ = self.degrees_of_freedom_prior_ + nk + + diff = xk - self.mean_prior_ + self.covariances_ = self.covariance_prior_ + nk * ( + sk + + self.mean_precision_prior_ + / self.mean_precision_ + * np.mean(np.square(diff), 1) + ) + + # Contrary to the original bishop book, we normalize the covariances + self.covariances_ /= self.degrees_of_freedom_ + + def _m_step(self, X, log_resp): + """M step. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + + log_resp : array-like of shape (n_samples, n_components) + Logarithm of the posterior probabilities (or responsibilities) of + the point of each sample in X. + """ + n_samples, _ = X.shape + + nk, xk, sk = _estimate_gaussian_parameters( + X, np.exp(log_resp), self.reg_covar, self.covariance_type + ) + self._estimate_weights(nk) + self._estimate_means(nk, xk) + self._estimate_precisions(nk, xk, sk) + + def _estimate_log_weights(self): + if self.weight_concentration_prior_type == "dirichlet_process": + digamma_sum = digamma( + self.weight_concentration_[0] + self.weight_concentration_[1] + ) + digamma_a = digamma(self.weight_concentration_[0]) + digamma_b = digamma(self.weight_concentration_[1]) + return ( + digamma_a + - digamma_sum + + np.hstack((0, np.cumsum(digamma_b - digamma_sum)[:-1])) + ) + else: + # case Variational Gaussian mixture with dirichlet distribution + return digamma(self.weight_concentration_) - digamma( + np.sum(self.weight_concentration_) + ) + + def _estimate_log_prob(self, X): + _, n_features = X.shape + # We remove `n_features * np.log(self.degrees_of_freedom_)` because + # the precision matrix is normalized + log_gauss = _estimate_log_gaussian_prob( + X, self.means_, self.precisions_cholesky_, self.covariance_type + ) - 0.5 * n_features * np.log(self.degrees_of_freedom_) + + log_lambda = n_features * np.log(2.0) + np.sum( + digamma( + 0.5 + * (self.degrees_of_freedom_ - np.arange(0, n_features)[:, np.newaxis]) + ), + 0, + ) + + return log_gauss + 0.5 * (log_lambda - n_features / self.mean_precision_) + + def _compute_lower_bound(self, log_resp, log_prob_norm): + """Estimate the lower bound of the model. 
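# Hedged numeric sketch of the 'dirichlet_distribution' branch of
# ``_estimate_log_weights`` above: E[log pi_k] under a Dirichlet is
# digamma(alpha_k) - digamma(sum(alpha)).  Toy concentrations assumed.
import numpy as np
from scipy.special import digamma

alpha = np.array([4.0, 2.0, 1.0])             # weight_concentration_
log_w = digamma(alpha) - digamma(alpha.sum())
print(np.round(np.exp(log_w), 3))             # geometric-mean weights, sum < 1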
+ + The lower bound on the likelihood (of the training data with respect to + the model) is used to detect the convergence and has to increase at + each iteration. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + + log_resp : array, shape (n_samples, n_components) + Logarithm of the posterior probabilities (or responsibilities) of + the point of each sample in X. + + log_prob_norm : float + Logarithm of the probability of each sample in X. + + Returns + ------- + lower_bound : float + """ + # Contrary to the original formula, we have done some simplification + # and removed all the constant terms. + (n_features,) = self.mean_prior_.shape + + # We removed `.5 * n_features * np.log(self.degrees_of_freedom_)` + # because the precision matrix is normalized. + log_det_precisions_chol = _compute_log_det_cholesky( + self.precisions_cholesky_, self.covariance_type, n_features + ) - 0.5 * n_features * np.log(self.degrees_of_freedom_) + + if self.covariance_type == "tied": + log_wishart = self.n_components * np.float64( + _log_wishart_norm( + self.degrees_of_freedom_, log_det_precisions_chol, n_features + ) + ) + else: + log_wishart = np.sum( + _log_wishart_norm( + self.degrees_of_freedom_, log_det_precisions_chol, n_features + ) + ) + + if self.weight_concentration_prior_type == "dirichlet_process": + log_norm_weight = -np.sum( + betaln(self.weight_concentration_[0], self.weight_concentration_[1]) + ) + else: + log_norm_weight = _log_dirichlet_norm(self.weight_concentration_) + + return ( + -np.sum(np.exp(log_resp) * log_resp) + - log_wishart + - log_norm_weight + - 0.5 * n_features * np.sum(np.log(self.mean_precision_)) + ) + + def _get_parameters(self): + return ( + self.weight_concentration_, + self.mean_precision_, + self.means_, + self.degrees_of_freedom_, + self.covariances_, + self.precisions_cholesky_, + ) + + def _set_parameters(self, params): + ( + self.weight_concentration_, + self.mean_precision_, + self.means_, + self.degrees_of_freedom_, + self.covariances_, + self.precisions_cholesky_, + ) = params + + # Weights computation + if self.weight_concentration_prior_type == "dirichlet_process": + weight_dirichlet_sum = ( + self.weight_concentration_[0] + self.weight_concentration_[1] + ) + tmp = self.weight_concentration_[1] / weight_dirichlet_sum + self.weights_ = ( + self.weight_concentration_[0] + / weight_dirichlet_sum + * np.hstack((1, np.cumprod(tmp[:-1]))) + ) + self.weights_ /= np.sum(self.weights_) + else: + self.weights_ = self.weight_concentration_ / np.sum( + self.weight_concentration_ + ) + + # Precisions matrices computation + if self.covariance_type == "full": + self.precisions_ = np.array( + [ + np.dot(prec_chol, prec_chol.T) + for prec_chol in self.precisions_cholesky_ + ] + ) + + elif self.covariance_type == "tied": + self.precisions_ = np.dot( + self.precisions_cholesky_, self.precisions_cholesky_.T + ) + else: + self.precisions_ = self.precisions_cholesky_**2 diff --git a/.venv/lib/python3.12/site-packages/sklearn/mixture/_gaussian_mixture.py b/.venv/lib/python3.12/site-packages/sklearn/mixture/_gaussian_mixture.py new file mode 100644 index 0000000000000000000000000000000000000000..c4bdd3a0d68c81c73bcf6d606cf09bdd52aff66c --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/mixture/_gaussian_mixture.py @@ -0,0 +1,934 @@ +"""Gaussian Mixture Model.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numpy as np +from scipy import linalg + +from ..utils import check_array +from 
..utils._param_validation import StrOptions +from ..utils.extmath import row_norms +from ._base import BaseMixture, _check_shape + +############################################################################### +# Gaussian mixture shape checkers used by the GaussianMixture class + + +def _check_weights(weights, n_components): + """Check the user provided 'weights'. + + Parameters + ---------- + weights : array-like of shape (n_components,) + The proportions of components of each mixture. + + n_components : int + Number of components. + + Returns + ------- + weights : array, shape (n_components,) + """ + weights = check_array(weights, dtype=[np.float64, np.float32], ensure_2d=False) + _check_shape(weights, (n_components,), "weights") + + # check range + if any(np.less(weights, 0.0)) or any(np.greater(weights, 1.0)): + raise ValueError( + "The parameter 'weights' should be in the range " + "[0, 1], but got max value %.5f, min value %.5f" + % (np.min(weights), np.max(weights)) + ) + + # check normalization + atol = 1e-6 if weights.dtype == np.float32 else 1e-8 + if not np.allclose(np.abs(1.0 - np.sum(weights)), 0.0, atol=atol): + raise ValueError( + "The parameter 'weights' should be normalized, but got sum(weights) = %.5f" + % np.sum(weights) + ) + return weights + + +def _check_means(means, n_components, n_features): + """Validate the provided 'means'. + + Parameters + ---------- + means : array-like of shape (n_components, n_features) + The centers of the current components. + + n_components : int + Number of components. + + n_features : int + Number of features. + + Returns + ------- + means : array, (n_components, n_features) + """ + means = check_array(means, dtype=[np.float64, np.float32], ensure_2d=False) + _check_shape(means, (n_components, n_features), "means") + return means + + +def _check_precision_positivity(precision, covariance_type): + """Check a precision vector is positive-definite.""" + if np.any(np.less_equal(precision, 0.0)): + raise ValueError("'%s precision' should be positive" % covariance_type) + + +def _check_precision_matrix(precision, covariance_type): + """Check a precision matrix is symmetric and positive-definite.""" + if not ( + np.allclose(precision, precision.T) and np.all(linalg.eigvalsh(precision) > 0.0) + ): + raise ValueError( + "'%s precision' should be symmetric, positive-definite" % covariance_type + ) + + +def _check_precisions_full(precisions, covariance_type): + """Check the precision matrices are symmetric and positive-definite.""" + for prec in precisions: + _check_precision_matrix(prec, covariance_type) + + +def _check_precisions(precisions, covariance_type, n_components, n_features): + """Validate user provided precisions. + + Parameters + ---------- + precisions : array-like + 'full' : shape of (n_components, n_features, n_features) + 'tied' : shape of (n_features, n_features) + 'diag' : shape of (n_components, n_features) + 'spherical' : shape of (n_components,) + + covariance_type : str + + n_components : int + Number of components. + + n_features : int + Number of features. 
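# Hedged numeric sketch of the validation above: a 'tied' precision matrix
# must be symmetric with strictly positive eigenvalues.  Toy matrices assumed.
import numpy as np
from scipy import linalg

good = np.array([[2.0, 0.3], [0.3, 1.0]])
bad = np.array([[1.0, 2.0], [2.0, 1.0]])      # symmetric but indefinite

for prec in (good, bad):
    ok = np.allclose(prec, prec.T) and np.all(linalg.eigvalsh(prec) > 0.0)
    print(ok)                                 # True, then False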
+ + Returns + ------- + precisions : array + """ + precisions = check_array( + precisions, + dtype=[np.float64, np.float32], + ensure_2d=False, + allow_nd=covariance_type == "full", + ) + + precisions_shape = { + "full": (n_components, n_features, n_features), + "tied": (n_features, n_features), + "diag": (n_components, n_features), + "spherical": (n_components,), + } + _check_shape( + precisions, precisions_shape[covariance_type], "%s precision" % covariance_type + ) + + _check_precisions = { + "full": _check_precisions_full, + "tied": _check_precision_matrix, + "diag": _check_precision_positivity, + "spherical": _check_precision_positivity, + } + _check_precisions[covariance_type](precisions, covariance_type) + return precisions + + +############################################################################### +# Gaussian mixture parameters estimators (used by the M-Step) + + +def _estimate_gaussian_covariances_full(resp, X, nk, means, reg_covar): + """Estimate the full covariance matrices. + + Parameters + ---------- + resp : array-like of shape (n_samples, n_components) + + X : array-like of shape (n_samples, n_features) + + nk : array-like of shape (n_components,) + + means : array-like of shape (n_components, n_features) + + reg_covar : float + + Returns + ------- + covariances : array, shape (n_components, n_features, n_features) + The covariance matrix of the current components. + """ + n_components, n_features = means.shape + covariances = np.empty((n_components, n_features, n_features), dtype=X.dtype) + for k in range(n_components): + diff = X - means[k] + covariances[k] = np.dot(resp[:, k] * diff.T, diff) / nk[k] + covariances[k].flat[:: n_features + 1] += reg_covar + return covariances + + +def _estimate_gaussian_covariances_tied(resp, X, nk, means, reg_covar): + """Estimate the tied covariance matrix. + + Parameters + ---------- + resp : array-like of shape (n_samples, n_components) + + X : array-like of shape (n_samples, n_features) + + nk : array-like of shape (n_components,) + + means : array-like of shape (n_components, n_features) + + reg_covar : float + + Returns + ------- + covariance : array, shape (n_features, n_features) + The tied covariance matrix of the components. + """ + avg_X2 = np.dot(X.T, X) + avg_means2 = np.dot(nk * means.T, means) + covariance = avg_X2 - avg_means2 + covariance /= nk.sum() + covariance.flat[:: len(covariance) + 1] += reg_covar + return covariance + + +def _estimate_gaussian_covariances_diag(resp, X, nk, means, reg_covar): + """Estimate the diagonal covariance vectors. + + Parameters + ---------- + responsibilities : array-like of shape (n_samples, n_components) + + X : array-like of shape (n_samples, n_features) + + nk : array-like of shape (n_components,) + + means : array-like of shape (n_components, n_features) + + reg_covar : float + + Returns + ------- + covariances : array, shape (n_components, n_features) + The covariance vector of the current components. + """ + avg_X2 = np.dot(resp.T, X * X) / nk[:, np.newaxis] + avg_means2 = means**2 + return avg_X2 - avg_means2 + reg_covar + + +def _estimate_gaussian_covariances_spherical(resp, X, nk, means, reg_covar): + """Estimate the spherical variance values. 
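# Hedged sketch mirroring ``_estimate_gaussian_covariances_full`` above for a
# single component: a responsibility-weighted scatter matrix with
# ``reg_covar`` added to the diagonal.  Toy responsibilities assumed.
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(50, 2))
resp = rng.dirichlet(alpha=[1.0, 1.0], size=50)     # (n_samples, 2 components)
reg_covar = 1e-6

nk = resp.sum(axis=0) + 10 * np.finfo(resp.dtype).eps
means = resp.T @ X / nk[:, np.newaxis]
diff = X - means[0]
cov_0 = (resp[:, 0] * diff.T) @ diff / nk[0]
cov_0[np.diag_indices_from(cov_0)] += reg_covar
print(cov_0)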
+ + Parameters + ---------- + responsibilities : array-like of shape (n_samples, n_components) + + X : array-like of shape (n_samples, n_features) + + nk : array-like of shape (n_components,) + + means : array-like of shape (n_components, n_features) + + reg_covar : float + + Returns + ------- + variances : array, shape (n_components,) + The variance values of each components. + """ + return _estimate_gaussian_covariances_diag(resp, X, nk, means, reg_covar).mean(1) + + +def _estimate_gaussian_parameters(X, resp, reg_covar, covariance_type): + """Estimate the Gaussian distribution parameters. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The input data array. + + resp : array-like of shape (n_samples, n_components) + The responsibilities for each data sample in X. + + reg_covar : float + The regularization added to the diagonal of the covariance matrices. + + covariance_type : {'full', 'tied', 'diag', 'spherical'} + The type of precision matrices. + + Returns + ------- + nk : array-like of shape (n_components,) + The numbers of data samples in the current components. + + means : array-like of shape (n_components, n_features) + The centers of the current components. + + covariances : array-like + The covariance matrix of the current components. + The shape depends of the covariance_type. + """ + nk = resp.sum(axis=0) + 10 * np.finfo(resp.dtype).eps + means = np.dot(resp.T, X) / nk[:, np.newaxis] + covariances = { + "full": _estimate_gaussian_covariances_full, + "tied": _estimate_gaussian_covariances_tied, + "diag": _estimate_gaussian_covariances_diag, + "spherical": _estimate_gaussian_covariances_spherical, + }[covariance_type](resp, X, nk, means, reg_covar) + return nk, means, covariances + + +def _compute_precision_cholesky(covariances, covariance_type): + """Compute the Cholesky decomposition of the precisions. + + Parameters + ---------- + covariances : array-like + The covariance matrix of the current components. + The shape depends of the covariance_type. + + covariance_type : {'full', 'tied', 'diag', 'spherical'} + The type of precision matrices. + + Returns + ------- + precisions_cholesky : array-like + The cholesky decomposition of sample precisions of the current + components. The shape depends of the covariance_type. + """ + estimate_precision_error_message = ( + "Fitting the mixture model failed because some components have " + "ill-defined empirical covariance (for instance caused by singleton " + "or collapsed samples). Try to decrease the number of components, " + "increase reg_covar, or scale the input data." + ) + dtype = covariances.dtype + if dtype == np.float32: + estimate_precision_error_message += ( + " The numerical accuracy can also be improved by passing float64" + " data instead of float32." 
+ ) + + if covariance_type == "full": + n_components, n_features, _ = covariances.shape + precisions_chol = np.empty((n_components, n_features, n_features), dtype=dtype) + for k, covariance in enumerate(covariances): + try: + cov_chol = linalg.cholesky(covariance, lower=True) + except linalg.LinAlgError: + raise ValueError(estimate_precision_error_message) + precisions_chol[k] = linalg.solve_triangular( + cov_chol, np.eye(n_features, dtype=dtype), lower=True + ).T + elif covariance_type == "tied": + _, n_features = covariances.shape + try: + cov_chol = linalg.cholesky(covariances, lower=True) + except linalg.LinAlgError: + raise ValueError(estimate_precision_error_message) + precisions_chol = linalg.solve_triangular( + cov_chol, np.eye(n_features, dtype=dtype), lower=True + ).T + else: + if np.any(np.less_equal(covariances, 0.0)): + raise ValueError(estimate_precision_error_message) + precisions_chol = 1.0 / np.sqrt(covariances) + return precisions_chol + + +def _flipudlr(array): + """Reverse the rows and columns of an array.""" + return np.flipud(np.fliplr(array)) + + +def _compute_precision_cholesky_from_precisions(precisions, covariance_type): + r"""Compute the Cholesky decomposition of precisions using precisions themselves. + + As implemented in :func:`_compute_precision_cholesky`, the `precisions_cholesky_` is + an upper-triangular matrix for each Gaussian component, which can be expressed as + the $UU^T$ factorization of the precision matrix for each Gaussian component, where + $U$ is an upper-triangular matrix. + + In order to use the Cholesky decomposition to get $UU^T$, the precision matrix + $\Lambda$ needs to be permutated such that its rows and columns are reversed, which + can be done by applying a similarity transformation with an exchange matrix $J$, + where the 1 elements reside on the anti-diagonal and all other elements are 0. In + particular, the Cholesky decomposition of the transformed precision matrix is + $J\Lambda J=LL^T$, where $L$ is a lower-triangular matrix. Because $\Lambda=UU^T$ + and $J=J^{-1}=J^T$, the `precisions_cholesky_` for each Gaussian component can be + expressed as $JLJ$. + + Refer to #26415 for details. + + Parameters + ---------- + precisions : array-like + The precision matrix of the current components. + The shape depends on the covariance_type. + + covariance_type : {'full', 'tied', 'diag', 'spherical'} + The type of precision matrices. + + Returns + ------- + precisions_cholesky : array-like + The cholesky decomposition of sample precisions of the current + components. The shape depends on the covariance_type. + """ + if covariance_type == "full": + precisions_cholesky = np.array( + [ + _flipudlr(linalg.cholesky(_flipudlr(precision), lower=True)) + for precision in precisions + ] + ) + elif covariance_type == "tied": + precisions_cholesky = _flipudlr( + linalg.cholesky(_flipudlr(precisions), lower=True) + ) + else: + precisions_cholesky = np.sqrt(precisions) + return precisions_cholesky + + +############################################################################### +# Gaussian mixture probability estimators +def _compute_log_det_cholesky(matrix_chol, covariance_type, n_features): + """Compute the log-det of the cholesky decomposition of matrices. + + Parameters + ---------- + matrix_chol : array-like + Cholesky decompositions of the matrices. 
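# Hedged numeric check of the convention implemented above for 'full'
# covariances: the stored Cholesky factor U satisfies
# precision = U @ U.T = inv(covariance).  Toy covariance assumed.
import numpy as np
from scipy import linalg

cov = np.array([[2.0, 0.4], [0.4, 1.0]])
cov_chol = linalg.cholesky(cov, lower=True)
prec_chol = linalg.solve_triangular(cov_chol, np.eye(2), lower=True).T
assert np.allclose(prec_chol @ prec_chol.T, np.linalg.inv(cov))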
+ 'full' : shape of (n_components, n_features, n_features) + 'tied' : shape of (n_features, n_features) + 'diag' : shape of (n_components, n_features) + 'spherical' : shape of (n_components,) + + covariance_type : {'full', 'tied', 'diag', 'spherical'} + + n_features : int + Number of features. + + Returns + ------- + log_det_precision_chol : array-like of shape (n_components,) + The determinant of the precision matrix for each component. + """ + if covariance_type == "full": + n_components, _, _ = matrix_chol.shape + log_det_chol = np.sum( + np.log(matrix_chol.reshape(n_components, -1)[:, :: n_features + 1]), axis=1 + ) + + elif covariance_type == "tied": + log_det_chol = np.sum(np.log(np.diag(matrix_chol))) + + elif covariance_type == "diag": + log_det_chol = np.sum(np.log(matrix_chol), axis=1) + + else: + log_det_chol = n_features * np.log(matrix_chol) + + return log_det_chol + + +def _estimate_log_gaussian_prob(X, means, precisions_chol, covariance_type): + """Estimate the log Gaussian probability. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + + means : array-like of shape (n_components, n_features) + + precisions_chol : array-like + Cholesky decompositions of the precision matrices. + 'full' : shape of (n_components, n_features, n_features) + 'tied' : shape of (n_features, n_features) + 'diag' : shape of (n_components, n_features) + 'spherical' : shape of (n_components,) + + covariance_type : {'full', 'tied', 'diag', 'spherical'} + + Returns + ------- + log_prob : array, shape (n_samples, n_components) + """ + n_samples, n_features = X.shape + n_components, _ = means.shape + # The determinant of the precision matrix from the Cholesky decomposition + # corresponds to the negative half of the determinant of the full precision + # matrix. + # In short: det(precision_chol) = - det(precision) / 2 + log_det = _compute_log_det_cholesky(precisions_chol, covariance_type, n_features) + + if covariance_type == "full": + log_prob = np.empty((n_samples, n_components), dtype=X.dtype) + for k, (mu, prec_chol) in enumerate(zip(means, precisions_chol)): + y = np.dot(X, prec_chol) - np.dot(mu, prec_chol) + log_prob[:, k] = np.sum(np.square(y), axis=1) + + elif covariance_type == "tied": + log_prob = np.empty((n_samples, n_components), dtype=X.dtype) + for k, mu in enumerate(means): + y = np.dot(X, precisions_chol) - np.dot(mu, precisions_chol) + log_prob[:, k] = np.sum(np.square(y), axis=1) + + elif covariance_type == "diag": + precisions = precisions_chol**2 + log_prob = ( + np.sum((means**2 * precisions), 1) + - 2.0 * np.dot(X, (means * precisions).T) + + np.dot(X**2, precisions.T) + ) + + elif covariance_type == "spherical": + precisions = precisions_chol**2 + log_prob = ( + np.sum(means**2, 1) * precisions + - 2 * np.dot(X, means.T * precisions) + + np.outer(row_norms(X, squared=True), precisions) + ) + # Since we are using the precision of the Cholesky decomposition, + # `- 0.5 * log_det_precision` becomes `+ log_det_precision_chol` + return -0.5 * (n_features * np.log(2 * np.pi).astype(X.dtype) + log_prob) + log_det + + +class GaussianMixture(BaseMixture): + """Gaussian Mixture. + + Representation of a Gaussian mixture model probability distribution. + This class allows to estimate the parameters of a Gaussian mixture + distribution. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.18 + + Parameters + ---------- + n_components : int, default=1 + The number of mixture components. 
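# Hedged cross-check of ``_estimate_log_gaussian_prob`` above against scipy's
# reference density.  These are private helpers, imported here purely for
# illustration; toy parameters assumed.
import numpy as np
from scipy.stats import multivariate_normal
from sklearn.mixture._gaussian_mixture import (
    _compute_precision_cholesky,
    _estimate_log_gaussian_prob,
)

X = np.array([[0.0, 0.0], [1.0, 2.0]])
means = np.array([[0.5, 0.5]])
covs = np.array([[[1.5, 0.2], [0.2, 0.8]]])

prec_chol = _compute_precision_cholesky(covs, "full")
log_prob = _estimate_log_gaussian_prob(X, means, prec_chol, "full")
assert np.allclose(log_prob[:, 0], multivariate_normal(means[0], covs[0]).logpdf(X))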
+ + covariance_type : {'full', 'tied', 'diag', 'spherical'}, default='full' + String describing the type of covariance parameters to use. + Must be one of: + + - 'full': each component has its own general covariance matrix. + - 'tied': all components share the same general covariance matrix. + - 'diag': each component has its own diagonal covariance matrix. + - 'spherical': each component has its own single variance. + + For an example of using `covariance_type`, refer to + :ref:`sphx_glr_auto_examples_mixture_plot_gmm_selection.py`. + + tol : float, default=1e-3 + The convergence threshold. EM iterations will stop when the + lower bound average gain is below this threshold. + + reg_covar : float, default=1e-6 + Non-negative regularization added to the diagonal of covariance. + Allows to assure that the covariance matrices are all positive. + + max_iter : int, default=100 + The number of EM iterations to perform. + + n_init : int, default=1 + The number of initializations to perform. The best results are kept. + + init_params : {'kmeans', 'k-means++', 'random', 'random_from_data'}, \ + default='kmeans' + The method used to initialize the weights, the means and the + precisions. + String must be one of: + + - 'kmeans' : responsibilities are initialized using kmeans. + - 'k-means++' : use the k-means++ method to initialize. + - 'random' : responsibilities are initialized randomly. + - 'random_from_data' : initial means are randomly selected data points. + + .. versionchanged:: v1.1 + `init_params` now accepts 'random_from_data' and 'k-means++' as + initialization methods. + + weights_init : array-like of shape (n_components, ), default=None + The user-provided initial weights. + If it is None, weights are initialized using the `init_params` method. + + means_init : array-like of shape (n_components, n_features), default=None + The user-provided initial means, + If it is None, means are initialized using the `init_params` method. + + precisions_init : array-like, default=None + The user-provided initial precisions (inverse of the covariance + matrices). + If it is None, precisions are initialized using the 'init_params' + method. + The shape depends on 'covariance_type':: + + (n_components,) if 'spherical', + (n_features, n_features) if 'tied', + (n_components, n_features) if 'diag', + (n_components, n_features, n_features) if 'full' + + random_state : int, RandomState instance or None, default=None + Controls the random seed given to the method chosen to initialize the + parameters (see `init_params`). + In addition, it controls the generation of random samples from the + fitted distribution (see the method `sample`). + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + warm_start : bool, default=False + If 'warm_start' is True, the solution of the last fitting is used as + initialization for the next call of fit(). This can speed up + convergence when fit is called several times on similar problems. + In that case, 'n_init' is ignored and only a single initialization + occurs upon the first call. + See :term:`the Glossary `. + + verbose : int, default=0 + Enable verbose output. If 1 then it prints the current + initialization and each iteration step. If greater than 1 then + it prints also the log probability and the time needed + for each step. + + verbose_interval : int, default=10 + Number of iteration done before the next print. 
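# Hedged sketch (synthetic data assumed) of the ``warm_start`` behaviour
# documented above: the second ``fit`` resumes from the previous solution
# instead of re-initialising.
from sklearn.datasets import make_blobs
from sklearn.mixture import GaussianMixture

X, _ = make_blobs(n_samples=300, centers=3, random_state=5)
gm = GaussianMixture(n_components=3, warm_start=True, max_iter=5, random_state=0)
gm.fit(X)                      # first call performs the initialisation
gm.fit(X)                      # later calls continue from the fitted parameters
print(gm.converged_, gm.n_iter_)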
+ + Attributes + ---------- + weights_ : array-like of shape (n_components,) + The weights of each mixture components. + + means_ : array-like of shape (n_components, n_features) + The mean of each mixture component. + + covariances_ : array-like + The covariance of each mixture component. + The shape depends on `covariance_type`:: + + (n_components,) if 'spherical', + (n_features, n_features) if 'tied', + (n_components, n_features) if 'diag', + (n_components, n_features, n_features) if 'full' + + For an example of using covariances, refer to + :ref:`sphx_glr_auto_examples_mixture_plot_gmm_covariances.py`. + + precisions_ : array-like + The precision matrices for each component in the mixture. A precision + matrix is the inverse of a covariance matrix. A covariance matrix is + symmetric positive definite so the mixture of Gaussian can be + equivalently parameterized by the precision matrices. Storing the + precision matrices instead of the covariance matrices makes it more + efficient to compute the log-likelihood of new samples at test time. + The shape depends on `covariance_type`:: + + (n_components,) if 'spherical', + (n_features, n_features) if 'tied', + (n_components, n_features) if 'diag', + (n_components, n_features, n_features) if 'full' + + precisions_cholesky_ : array-like + The cholesky decomposition of the precision matrices of each mixture + component. A precision matrix is the inverse of a covariance matrix. + A covariance matrix is symmetric positive definite so the mixture of + Gaussian can be equivalently parameterized by the precision matrices. + Storing the precision matrices instead of the covariance matrices makes + it more efficient to compute the log-likelihood of new samples at test + time. The shape depends on `covariance_type`:: + + (n_components,) if 'spherical', + (n_features, n_features) if 'tied', + (n_components, n_features) if 'diag', + (n_components, n_features, n_features) if 'full' + + converged_ : bool + True when convergence of the best fit of EM was reached, False otherwise. + + n_iter_ : int + Number of step used by the best fit of EM to reach the convergence. + + lower_bound_ : float + Lower bound value on the log-likelihood (of the training data with + respect to the model) of the best fit of EM. + + lower_bounds_ : array-like of shape (`n_iter_`,) + The list of lower bound values on the log-likelihood from each + iteration of the best fit of EM. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + BayesianGaussianMixture : Gaussian mixture model fit with a variational + inference. 
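+        Unlike this estimator, it can infer the effective number of
+        components from the data by shrinking the weights of superfluous
+        components towards zero.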
+ + Examples + -------- + >>> import numpy as np + >>> from sklearn.mixture import GaussianMixture + >>> X = np.array([[1, 2], [1, 4], [1, 0], [10, 2], [10, 4], [10, 0]]) + >>> gm = GaussianMixture(n_components=2, random_state=0).fit(X) + >>> gm.means_ + array([[10., 2.], + [ 1., 2.]]) + >>> gm.predict([[0, 0], [12, 3]]) + array([1, 0]) + + For a comparison of Gaussian Mixture with other clustering algorithms, see + :ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py` + """ + + _parameter_constraints: dict = { + **BaseMixture._parameter_constraints, + "covariance_type": [StrOptions({"full", "tied", "diag", "spherical"})], + "weights_init": ["array-like", None], + "means_init": ["array-like", None], + "precisions_init": ["array-like", None], + } + + def __init__( + self, + n_components=1, + *, + covariance_type="full", + tol=1e-3, + reg_covar=1e-6, + max_iter=100, + n_init=1, + init_params="kmeans", + weights_init=None, + means_init=None, + precisions_init=None, + random_state=None, + warm_start=False, + verbose=0, + verbose_interval=10, + ): + super().__init__( + n_components=n_components, + tol=tol, + reg_covar=reg_covar, + max_iter=max_iter, + n_init=n_init, + init_params=init_params, + random_state=random_state, + warm_start=warm_start, + verbose=verbose, + verbose_interval=verbose_interval, + ) + + self.covariance_type = covariance_type + self.weights_init = weights_init + self.means_init = means_init + self.precisions_init = precisions_init + + def _check_parameters(self, X): + """Check the Gaussian mixture parameters are well defined.""" + _, n_features = X.shape + + if self.weights_init is not None: + self.weights_init = _check_weights(self.weights_init, self.n_components) + + if self.means_init is not None: + self.means_init = _check_means( + self.means_init, self.n_components, n_features + ) + + if self.precisions_init is not None: + self.precisions_init = _check_precisions( + self.precisions_init, + self.covariance_type, + self.n_components, + n_features, + ) + + def _initialize_parameters(self, X, random_state): + # If all the initial parameters are all provided, then there is no need to run + # the initialization. + compute_resp = ( + self.weights_init is None + or self.means_init is None + or self.precisions_init is None + ) + if compute_resp: + super()._initialize_parameters(X, random_state) + else: + self._initialize(X, None) + + def _initialize(self, X, resp): + """Initialization of the Gaussian mixture parameters. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + + resp : array-like of shape (n_samples, n_components) + """ + n_samples, _ = X.shape + weights, means, covariances = None, None, None + if resp is not None: + weights, means, covariances = _estimate_gaussian_parameters( + X, resp, self.reg_covar, self.covariance_type + ) + if self.weights_init is None: + weights /= n_samples + + self.weights_ = weights if self.weights_init is None else self.weights_init + self.means_ = means if self.means_init is None else self.means_init + + if self.precisions_init is None: + self.covariances_ = covariances + self.precisions_cholesky_ = _compute_precision_cholesky( + covariances, self.covariance_type + ) + else: + self.precisions_cholesky_ = _compute_precision_cholesky_from_precisions( + self.precisions_init, self.covariance_type + ) + + def _m_step(self, X, log_resp): + """M step. 
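+
+        Re-estimates the weights, the means and the covariances from the
+        responsibilities computed during the E step, then refreshes the
+        Cholesky factors of the precision matrices accordingly.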
+ + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + + log_resp : array-like of shape (n_samples, n_components) + Logarithm of the posterior probabilities (or responsibilities) of + the point of each sample in X. + """ + self.weights_, self.means_, self.covariances_ = _estimate_gaussian_parameters( + X, np.exp(log_resp), self.reg_covar, self.covariance_type + ) + self.weights_ /= self.weights_.sum() + self.precisions_cholesky_ = _compute_precision_cholesky( + self.covariances_, self.covariance_type + ) + + def _estimate_log_prob(self, X): + return _estimate_log_gaussian_prob( + X, self.means_, self.precisions_cholesky_, self.covariance_type + ) + + def _estimate_log_weights(self): + return np.log(self.weights_) + + def _compute_lower_bound(self, _, log_prob_norm): + return log_prob_norm + + def _get_parameters(self): + return ( + self.weights_, + self.means_, + self.covariances_, + self.precisions_cholesky_, + ) + + def _set_parameters(self, params): + ( + self.weights_, + self.means_, + self.covariances_, + self.precisions_cholesky_, + ) = params + + # Attributes computation + _, n_features = self.means_.shape + + dtype = self.precisions_cholesky_.dtype + if self.covariance_type == "full": + self.precisions_ = np.empty_like(self.precisions_cholesky_) + for k, prec_chol in enumerate(self.precisions_cholesky_): + self.precisions_[k] = np.dot(prec_chol, prec_chol.T) + + elif self.covariance_type == "tied": + self.precisions_ = np.dot( + self.precisions_cholesky_, self.precisions_cholesky_.T + ) + else: + self.precisions_ = self.precisions_cholesky_**2 + + def _n_parameters(self): + """Return the number of free parameters in the model.""" + _, n_features = self.means_.shape + if self.covariance_type == "full": + cov_params = self.n_components * n_features * (n_features + 1) / 2.0 + elif self.covariance_type == "diag": + cov_params = self.n_components * n_features + elif self.covariance_type == "tied": + cov_params = n_features * (n_features + 1) / 2.0 + elif self.covariance_type == "spherical": + cov_params = self.n_components + mean_params = n_features * self.n_components + return int(cov_params + mean_params + self.n_components - 1) + + def bic(self, X): + """Bayesian information criterion for the current model on the input X. + + You can refer to this :ref:`mathematical section ` for more + details regarding the formulation of the BIC used. + + For an example of GMM selection using `bic` information criterion, + refer to :ref:`sphx_glr_auto_examples_mixture_plot_gmm_selection.py`. + + Parameters + ---------- + X : array of shape (n_samples, n_dimensions) + The input samples. + + Returns + ------- + bic : float + The lower the better. + """ + return -2 * self.score(X) * X.shape[0] + self._n_parameters() * np.log( + X.shape[0] + ) + + def aic(self, X): + """Akaike information criterion for the current model on the input X. + + You can refer to this :ref:`mathematical section ` for more + details regarding the formulation of the AIC used. + + Parameters + ---------- + X : array of shape (n_samples, n_dimensions) + The input samples. + + Returns + ------- + aic : float + The lower the better. 
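+            Computed as ``-2 * log_likelihood + 2 * n_parameters``, where the
+            log-likelihood is the total log-likelihood of `X` under the fitted
+            model, i.e. ``self.score(X) * X.shape[0]``.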
+ """ + return -2 * self.score(X) * X.shape[0] + 2 * self._n_parameters() diff --git a/.venv/lib/python3.12/site-packages/sklearn/mixture/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/mixture/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/mixture/tests/test_bayesian_mixture.py b/.venv/lib/python3.12/site-packages/sklearn/mixture/tests/test_bayesian_mixture.py new file mode 100644 index 0000000000000000000000000000000000000000..d36543903cb87b07ea1a1c75b9a69aa63bd7dbff --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/mixture/tests/test_bayesian_mixture.py @@ -0,0 +1,464 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import copy + +import numpy as np +import pytest +from scipy.special import gammaln + +from sklearn.exceptions import NotFittedError +from sklearn.metrics.cluster import adjusted_rand_score +from sklearn.mixture import BayesianGaussianMixture +from sklearn.mixture._bayesian_mixture import _log_dirichlet_norm, _log_wishart_norm +from sklearn.mixture.tests.test_gaussian_mixture import RandomData +from sklearn.utils._testing import ( + assert_almost_equal, + assert_array_equal, +) + +COVARIANCE_TYPE = ["full", "tied", "diag", "spherical"] +PRIOR_TYPE = ["dirichlet_process", "dirichlet_distribution"] + + +def test_log_dirichlet_norm(): + rng = np.random.RandomState(0) + + weight_concentration = rng.rand(2) + expected_norm = gammaln(np.sum(weight_concentration)) - np.sum( + gammaln(weight_concentration) + ) + predected_norm = _log_dirichlet_norm(weight_concentration) + + assert_almost_equal(expected_norm, predected_norm) + + +def test_log_wishart_norm(): + rng = np.random.RandomState(0) + + n_components, n_features = 5, 2 + degrees_of_freedom = np.abs(rng.rand(n_components)) + 1.0 + log_det_precisions_chol = n_features * np.log(range(2, 2 + n_components)) + + expected_norm = np.empty(5) + for k, (degrees_of_freedom_k, log_det_k) in enumerate( + zip(degrees_of_freedom, log_det_precisions_chol) + ): + expected_norm[k] = -( + degrees_of_freedom_k * (log_det_k + 0.5 * n_features * np.log(2.0)) + + np.sum( + gammaln( + 0.5 + * (degrees_of_freedom_k - np.arange(0, n_features)[:, np.newaxis]) + ), + 0, + ) + ).item() + predected_norm = _log_wishart_norm( + degrees_of_freedom, log_det_precisions_chol, n_features + ) + + assert_almost_equal(expected_norm, predected_norm) + + +def test_bayesian_mixture_weights_prior_initialisation(): + rng = np.random.RandomState(0) + n_samples, n_components, n_features = 10, 5, 2 + X = rng.rand(n_samples, n_features) + + # Check correct init for a given value of weight_concentration_prior + weight_concentration_prior = rng.rand() + bgmm = BayesianGaussianMixture( + weight_concentration_prior=weight_concentration_prior, random_state=rng + ).fit(X) + assert_almost_equal(weight_concentration_prior, bgmm.weight_concentration_prior_) + + # Check correct init for the default value of weight_concentration_prior + bgmm = BayesianGaussianMixture(n_components=n_components, random_state=rng).fit(X) + assert_almost_equal(1.0 / n_components, bgmm.weight_concentration_prior_) + + +def test_bayesian_mixture_mean_prior_initialisation(): + rng = np.random.RandomState(0) + n_samples, n_components, n_features = 10, 3, 2 + X = rng.rand(n_samples, n_features) + + # Check correct init for a given value of mean_precision_prior + mean_precision_prior = rng.rand() + bgmm = 
BayesianGaussianMixture( + mean_precision_prior=mean_precision_prior, random_state=rng + ).fit(X) + assert_almost_equal(mean_precision_prior, bgmm.mean_precision_prior_) + + # Check correct init for the default value of mean_precision_prior + bgmm = BayesianGaussianMixture(random_state=rng).fit(X) + assert_almost_equal(1.0, bgmm.mean_precision_prior_) + + # Check correct init for a given value of mean_prior + mean_prior = rng.rand(n_features) + bgmm = BayesianGaussianMixture( + n_components=n_components, mean_prior=mean_prior, random_state=rng + ).fit(X) + assert_almost_equal(mean_prior, bgmm.mean_prior_) + + # Check correct init for the default value of bemean_priorta + bgmm = BayesianGaussianMixture(n_components=n_components, random_state=rng).fit(X) + assert_almost_equal(X.mean(axis=0), bgmm.mean_prior_) + + +def test_bayesian_mixture_precisions_prior_initialisation(): + rng = np.random.RandomState(0) + n_samples, n_features = 10, 2 + X = rng.rand(n_samples, n_features) + + # Check raise message for a bad value of degrees_of_freedom_prior + bad_degrees_of_freedom_prior_ = n_features - 1.0 + bgmm = BayesianGaussianMixture( + degrees_of_freedom_prior=bad_degrees_of_freedom_prior_, random_state=rng + ) + msg = ( + "The parameter 'degrees_of_freedom_prior' should be greater than" + f" {n_features - 1}, but got {bad_degrees_of_freedom_prior_:.3f}." + ) + with pytest.raises(ValueError, match=msg): + bgmm.fit(X) + + # Check correct init for a given value of degrees_of_freedom_prior + degrees_of_freedom_prior = rng.rand() + n_features - 1.0 + bgmm = BayesianGaussianMixture( + degrees_of_freedom_prior=degrees_of_freedom_prior, random_state=rng + ).fit(X) + assert_almost_equal(degrees_of_freedom_prior, bgmm.degrees_of_freedom_prior_) + + # Check correct init for the default value of degrees_of_freedom_prior + degrees_of_freedom_prior_default = n_features + bgmm = BayesianGaussianMixture( + degrees_of_freedom_prior=degrees_of_freedom_prior_default, random_state=rng + ).fit(X) + assert_almost_equal( + degrees_of_freedom_prior_default, bgmm.degrees_of_freedom_prior_ + ) + + # Check correct init for a given value of covariance_prior + covariance_prior = { + "full": np.cov(X.T, bias=1) + 10, + "tied": np.cov(X.T, bias=1) + 5, + "diag": np.diag(np.atleast_2d(np.cov(X.T, bias=1))) + 3, + "spherical": rng.rand(), + } + + bgmm = BayesianGaussianMixture(random_state=rng) + for cov_type in ["full", "tied", "diag", "spherical"]: + bgmm.covariance_type = cov_type + bgmm.covariance_prior = covariance_prior[cov_type] + bgmm.fit(X) + assert_almost_equal(covariance_prior[cov_type], bgmm.covariance_prior_) + + # Check correct init for the default value of covariance_prior + covariance_prior_default = { + "full": np.atleast_2d(np.cov(X.T)), + "tied": np.atleast_2d(np.cov(X.T)), + "diag": np.var(X, axis=0, ddof=1), + "spherical": np.var(X, axis=0, ddof=1).mean(), + } + + bgmm = BayesianGaussianMixture(random_state=0) + for cov_type in ["full", "tied", "diag", "spherical"]: + bgmm.covariance_type = cov_type + bgmm.fit(X) + assert_almost_equal(covariance_prior_default[cov_type], bgmm.covariance_prior_) + + +def test_bayesian_mixture_check_is_fitted(): + rng = np.random.RandomState(0) + n_samples, n_features = 10, 2 + + # Check raise message + bgmm = BayesianGaussianMixture(random_state=rng) + X = rng.rand(n_samples, n_features) + + msg = "This BayesianGaussianMixture instance is not fitted yet." 
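+    # `score` must raise before `fit` has been called.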
+ with pytest.raises(ValueError, match=msg): + bgmm.score(X) + + +def test_bayesian_mixture_weights(): + rng = np.random.RandomState(0) + n_samples, n_features = 10, 2 + + X = rng.rand(n_samples, n_features) + + # Case Dirichlet distribution for the weight concentration prior type + bgmm = BayesianGaussianMixture( + weight_concentration_prior_type="dirichlet_distribution", + n_components=3, + random_state=rng, + ).fit(X) + + expected_weights = bgmm.weight_concentration_ / np.sum(bgmm.weight_concentration_) + assert_almost_equal(expected_weights, bgmm.weights_) + assert_almost_equal(np.sum(bgmm.weights_), 1.0) + + # Case Dirichlet process for the weight concentration prior type + dpgmm = BayesianGaussianMixture( + weight_concentration_prior_type="dirichlet_process", + n_components=3, + random_state=rng, + ).fit(X) + weight_dirichlet_sum = ( + dpgmm.weight_concentration_[0] + dpgmm.weight_concentration_[1] + ) + tmp = dpgmm.weight_concentration_[1] / weight_dirichlet_sum + expected_weights = ( + dpgmm.weight_concentration_[0] + / weight_dirichlet_sum + * np.hstack((1, np.cumprod(tmp[:-1]))) + ) + expected_weights /= np.sum(expected_weights) + assert_almost_equal(expected_weights, dpgmm.weights_) + assert_almost_equal(np.sum(dpgmm.weights_), 1.0) + + +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +def test_monotonic_likelihood(): + # We check that each step of the each step of variational inference without + # regularization improve monotonically the training set of the bound + rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=20) + n_components = rand_data.n_components + + for prior_type in PRIOR_TYPE: + for covar_type in COVARIANCE_TYPE: + X = rand_data.X[covar_type] + bgmm = BayesianGaussianMixture( + weight_concentration_prior_type=prior_type, + n_components=2 * n_components, + covariance_type=covar_type, + warm_start=True, + max_iter=1, + random_state=rng, + tol=1e-3, + ) + current_lower_bound = -np.inf + # Do one training iteration at a time so we can make sure that the + # training log likelihood increases after each iteration. + for _ in range(600): + prev_lower_bound = current_lower_bound + current_lower_bound = bgmm.fit(X).lower_bound_ + assert current_lower_bound >= prev_lower_bound + + if bgmm.converged_: + break + assert bgmm.converged_ + + +def test_compare_covar_type(): + # We can compare the 'full' precision with the other cov_type if we apply + # 1 iter of the M-step (done during _initialize_parameters). 
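+    # The expected covariance of a component is `covariances_` scaled by
+    # `degrees_of_freedom_`; each restricted covariance type is compared
+    # against the corresponding reduction of the 'full' estimate (mean over
+    # components for 'tied', diagonal for 'diag', mean of the diagonal for
+    # 'spherical').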
+ rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=7) + X = rand_data.X["full"] + n_components = rand_data.n_components + + for prior_type in PRIOR_TYPE: + # Computation of the full_covariance + bgmm = BayesianGaussianMixture( + weight_concentration_prior_type=prior_type, + n_components=2 * n_components, + covariance_type="full", + max_iter=1, + random_state=0, + tol=1e-7, + ) + bgmm._check_parameters(X) + bgmm._initialize_parameters(X, np.random.RandomState(0)) + full_covariances = ( + bgmm.covariances_ * bgmm.degrees_of_freedom_[:, np.newaxis, np.newaxis] + ) + + # Check tied_covariance = mean(full_covariances, 0) + bgmm = BayesianGaussianMixture( + weight_concentration_prior_type=prior_type, + n_components=2 * n_components, + covariance_type="tied", + max_iter=1, + random_state=0, + tol=1e-7, + ) + bgmm._check_parameters(X) + bgmm._initialize_parameters(X, np.random.RandomState(0)) + + tied_covariance = bgmm.covariances_ * bgmm.degrees_of_freedom_ + assert_almost_equal(tied_covariance, np.mean(full_covariances, 0)) + + # Check diag_covariance = diag(full_covariances) + bgmm = BayesianGaussianMixture( + weight_concentration_prior_type=prior_type, + n_components=2 * n_components, + covariance_type="diag", + max_iter=1, + random_state=0, + tol=1e-7, + ) + bgmm._check_parameters(X) + bgmm._initialize_parameters(X, np.random.RandomState(0)) + + diag_covariances = bgmm.covariances_ * bgmm.degrees_of_freedom_[:, np.newaxis] + assert_almost_equal( + diag_covariances, np.array([np.diag(cov) for cov in full_covariances]) + ) + + # Check spherical_covariance = np.mean(diag_covariances, 0) + bgmm = BayesianGaussianMixture( + weight_concentration_prior_type=prior_type, + n_components=2 * n_components, + covariance_type="spherical", + max_iter=1, + random_state=0, + tol=1e-7, + ) + bgmm._check_parameters(X) + bgmm._initialize_parameters(X, np.random.RandomState(0)) + + spherical_covariances = bgmm.covariances_ * bgmm.degrees_of_freedom_ + assert_almost_equal(spherical_covariances, np.mean(diag_covariances, 1)) + + +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +def test_check_covariance_precision(): + # We check that the dot product of the covariance and the precision + # matrices is identity. 
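+    # For the 'diag' and 'spherical' parameterizations the check reduces to
+    # an element-wise product equal to one.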
+ rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=7) + n_components, n_features = 2 * rand_data.n_components, 2 + + # Computation of the full_covariance + bgmm = BayesianGaussianMixture( + n_components=n_components, max_iter=100, random_state=rng, tol=1e-3, reg_covar=0 + ) + for covar_type in COVARIANCE_TYPE: + bgmm.covariance_type = covar_type + bgmm.fit(rand_data.X[covar_type]) + + if covar_type == "full": + for covar, precision in zip(bgmm.covariances_, bgmm.precisions_): + assert_almost_equal(np.dot(covar, precision), np.eye(n_features)) + elif covar_type == "tied": + assert_almost_equal( + np.dot(bgmm.covariances_, bgmm.precisions_), np.eye(n_features) + ) + + elif covar_type == "diag": + assert_almost_equal( + bgmm.covariances_ * bgmm.precisions_, + np.ones((n_components, n_features)), + ) + + else: + assert_almost_equal( + bgmm.covariances_ * bgmm.precisions_, np.ones(n_components) + ) + + +def test_invariant_translation(): + # We check here that adding a constant in the data change correctly the + # parameters of the mixture + rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=100) + n_components = 2 * rand_data.n_components + + for prior_type in PRIOR_TYPE: + for covar_type in COVARIANCE_TYPE: + X = rand_data.X[covar_type] + bgmm1 = BayesianGaussianMixture( + weight_concentration_prior_type=prior_type, + n_components=n_components, + max_iter=100, + random_state=0, + tol=1e-3, + reg_covar=0, + ).fit(X) + bgmm2 = BayesianGaussianMixture( + weight_concentration_prior_type=prior_type, + n_components=n_components, + max_iter=100, + random_state=0, + tol=1e-3, + reg_covar=0, + ).fit(X + 100) + + assert_almost_equal(bgmm1.means_, bgmm2.means_ - 100) + assert_almost_equal(bgmm1.weights_, bgmm2.weights_) + assert_almost_equal(bgmm1.covariances_, bgmm2.covariances_) + + +@pytest.mark.filterwarnings("ignore:.*did not converge.*") +@pytest.mark.parametrize( + "seed, max_iter, tol", + [ + (0, 2, 1e-7), # strict non-convergence + (1, 2, 1e-1), # loose non-convergence + (3, 300, 1e-7), # strict convergence + (4, 300, 1e-1), # loose convergence + ], +) +def test_bayesian_mixture_fit_predict(seed, max_iter, tol): + rng = np.random.RandomState(seed) + rand_data = RandomData(rng, n_samples=50, scale=7) + n_components = 2 * rand_data.n_components + + for covar_type in COVARIANCE_TYPE: + bgmm1 = BayesianGaussianMixture( + n_components=n_components, + max_iter=max_iter, + random_state=rng, + tol=tol, + reg_covar=0, + ) + bgmm1.covariance_type = covar_type + bgmm2 = copy.deepcopy(bgmm1) + X = rand_data.X[covar_type] + + Y_pred1 = bgmm1.fit(X).predict(X) + Y_pred2 = bgmm2.fit_predict(X) + assert_array_equal(Y_pred1, Y_pred2) + + +def test_bayesian_mixture_fit_predict_n_init(): + # Check that fit_predict is equivalent to fit.predict, when n_init > 1 + X = np.random.RandomState(0).randn(50, 5) + gm = BayesianGaussianMixture(n_components=5, n_init=10, random_state=0) + y_pred1 = gm.fit_predict(X) + y_pred2 = gm.predict(X) + assert_array_equal(y_pred1, y_pred2) + + +def test_bayesian_mixture_predict_predict_proba(): + # this is the same test as test_gaussian_mixture_predict_predict_proba() + rng = np.random.RandomState(0) + rand_data = RandomData(rng) + for prior_type in PRIOR_TYPE: + for covar_type in COVARIANCE_TYPE: + X = rand_data.X[covar_type] + Y = rand_data.Y + bgmm = BayesianGaussianMixture( + n_components=rand_data.n_components, + random_state=rng, + weight_concentration_prior_type=prior_type, + covariance_type=covar_type, + ) + + # Check a warning message arrive 
if we don't do fit + msg = ( + "This BayesianGaussianMixture instance is not fitted yet. " + "Call 'fit' with appropriate arguments before using this " + "estimator." + ) + with pytest.raises(NotFittedError, match=msg): + bgmm.predict(X) + + bgmm.fit(X) + Y_pred = bgmm.predict(X) + Y_pred_proba = bgmm.predict_proba(X).argmax(axis=1) + assert_array_equal(Y_pred, Y_pred_proba) + assert adjusted_rand_score(Y, Y_pred) >= 0.95 diff --git a/.venv/lib/python3.12/site-packages/sklearn/mixture/tests/test_gaussian_mixture.py b/.venv/lib/python3.12/site-packages/sklearn/mixture/tests/test_gaussian_mixture.py new file mode 100644 index 0000000000000000000000000000000000000000..488a2ab147e8362eede842f64f4787fda47b9159 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/mixture/tests/test_gaussian_mixture.py @@ -0,0 +1,1473 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import copy +import itertools +import re +import sys +import warnings +from io import StringIO +from unittest.mock import Mock + +import numpy as np +import pytest +from scipy import linalg, stats + +import sklearn +from sklearn.cluster import KMeans +from sklearn.covariance import EmpiricalCovariance +from sklearn.datasets import make_spd_matrix +from sklearn.exceptions import ConvergenceWarning, NotFittedError +from sklearn.metrics.cluster import adjusted_rand_score +from sklearn.mixture import GaussianMixture +from sklearn.mixture._gaussian_mixture import ( + _compute_log_det_cholesky, + _compute_precision_cholesky, + _estimate_gaussian_covariances_diag, + _estimate_gaussian_covariances_full, + _estimate_gaussian_covariances_spherical, + _estimate_gaussian_covariances_tied, + _estimate_gaussian_parameters, +) +from sklearn.utils._testing import ( + assert_allclose, + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) +from sklearn.utils.extmath import fast_logdet + +COVARIANCE_TYPE = ["full", "tied", "diag", "spherical"] + + +def generate_data( + n_samples, n_features, weights, means, precisions, covariance_type, dtype=np.float64 +): + rng = np.random.RandomState(0) + + X = [] + if covariance_type == "spherical": + for _, (w, m, c) in enumerate(zip(weights, means, precisions["spherical"])): + X.append( + rng.multivariate_normal( + m, c * np.eye(n_features), int(np.round(w * n_samples)) + ).astype(dtype) + ) + if covariance_type == "diag": + for _, (w, m, c) in enumerate(zip(weights, means, precisions["diag"])): + X.append( + rng.multivariate_normal( + m, np.diag(c), int(np.round(w * n_samples)) + ).astype(dtype) + ) + if covariance_type == "tied": + for _, (w, m) in enumerate(zip(weights, means)): + X.append( + rng.multivariate_normal( + m, precisions["tied"], int(np.round(w * n_samples)) + ).astype(dtype) + ) + if covariance_type == "full": + for _, (w, m, c) in enumerate(zip(weights, means, precisions["full"])): + X.append( + rng.multivariate_normal(m, c, int(np.round(w * n_samples))).astype( + dtype + ) + ) + + X = np.vstack(X) + return X + + +class RandomData: + def __init__( + self, + rng, + n_samples=200, + n_components=2, + n_features=2, + scale=50, + dtype=np.float64, + ): + self.n_samples = n_samples + self.n_components = n_components + self.n_features = n_features + + self.weights = rng.rand(n_components).astype(dtype) + self.weights = self.weights.astype(dtype) / self.weights.sum() + self.means = rng.rand(n_components, n_features).astype(dtype) * scale + self.covariances = { + "spherical": 0.5 + rng.rand(n_components).astype(dtype), + "diag": (0.5 + 
rng.rand(n_components, n_features).astype(dtype)) ** 2, + "tied": make_spd_matrix(n_features, random_state=rng).astype(dtype), + "full": np.array( + [ + make_spd_matrix(n_features, random_state=rng).astype(dtype) * 0.5 + for _ in range(n_components) + ] + ), + } + self.precisions = { + "spherical": 1.0 / self.covariances["spherical"], + "diag": 1.0 / self.covariances["diag"], + "tied": linalg.inv(self.covariances["tied"]), + "full": np.array( + [linalg.inv(covariance) for covariance in self.covariances["full"]] + ), + } + + self.X = dict( + zip( + COVARIANCE_TYPE, + [ + generate_data( + n_samples, + n_features, + self.weights, + self.means, + self.covariances, + covar_type, + dtype=dtype, + ) + for covar_type in COVARIANCE_TYPE + ], + ) + ) + self.Y = np.hstack( + [ + np.full(int(np.round(w * n_samples)), k, dtype=int) + for k, w in enumerate(self.weights) + ] + ) + + +def test_gaussian_mixture_attributes(): + # test bad parameters + rng = np.random.RandomState(0) + X = rng.rand(10, 2) + + # test good parameters + n_components, tol, n_init, max_iter, reg_covar = 2, 1e-4, 3, 30, 1e-1 + covariance_type, init_params = "full", "random" + gmm = GaussianMixture( + n_components=n_components, + tol=tol, + n_init=n_init, + max_iter=max_iter, + reg_covar=reg_covar, + covariance_type=covariance_type, + init_params=init_params, + ).fit(X) + + assert gmm.n_components == n_components + assert gmm.covariance_type == covariance_type + assert gmm.tol == tol + assert gmm.reg_covar == reg_covar + assert gmm.max_iter == max_iter + assert gmm.n_init == n_init + assert gmm.init_params == init_params + + +def test_check_weights(): + rng = np.random.RandomState(0) + rand_data = RandomData(rng) + + n_components = rand_data.n_components + X = rand_data.X["full"] + + g = GaussianMixture(n_components=n_components) + + # Check bad shape + weights_bad_shape = rng.rand(n_components, 1) + g.weights_init = weights_bad_shape + msg = re.escape( + "The parameter 'weights' should have the shape of " + f"({n_components},), but got {weights_bad_shape.shape}" + ) + with pytest.raises(ValueError, match=msg): + g.fit(X) + + # Check bad range + weights_bad_range = rng.rand(n_components) + 1 + g.weights_init = weights_bad_range + msg = re.escape( + "The parameter 'weights' should be in the range [0, 1], but got" + f" max value {np.min(weights_bad_range):.5f}, " + f"min value {np.max(weights_bad_range):.5f}" + ) + with pytest.raises(ValueError, match=msg): + g.fit(X) + + # Check bad normalization + weights_bad_norm = rng.rand(n_components) + weights_bad_norm = weights_bad_norm / (weights_bad_norm.sum() + 1) + g.weights_init = weights_bad_norm + msg = re.escape( + "The parameter 'weights' should be normalized, " + f"but got sum(weights) = {np.sum(weights_bad_norm):.5f}" + ) + with pytest.raises(ValueError, match=msg): + g.fit(X) + + # Check good weights matrix + weights = rand_data.weights + g = GaussianMixture(weights_init=weights, n_components=n_components) + g.fit(X) + assert_array_equal(weights, g.weights_init) + + +def test_check_means(): + rng = np.random.RandomState(0) + rand_data = RandomData(rng) + + n_components, n_features = rand_data.n_components, rand_data.n_features + X = rand_data.X["full"] + + g = GaussianMixture(n_components=n_components) + + # Check means bad shape + means_bad_shape = rng.rand(n_components + 1, n_features) + g.means_init = means_bad_shape + msg = "The parameter 'means' should have the shape of " + with pytest.raises(ValueError, match=msg): + g.fit(X) + + # Check good means matrix + means = 
rand_data.means + g.means_init = means + g.fit(X) + assert_array_equal(means, g.means_init) + + +def test_check_precisions(): + rng = np.random.RandomState(0) + rand_data = RandomData(rng) + + n_components, n_features = rand_data.n_components, rand_data.n_features + + # Define the bad precisions for each covariance_type + precisions_bad_shape = { + "full": np.ones((n_components + 1, n_features, n_features)), + "tied": np.ones((n_features + 1, n_features + 1)), + "diag": np.ones((n_components + 1, n_features)), + "spherical": np.ones((n_components + 1)), + } + + # Define not positive-definite precisions + precisions_not_pos = np.ones((n_components, n_features, n_features)) + precisions_not_pos[0] = np.eye(n_features) + precisions_not_pos[0, 0, 0] = -1.0 + + precisions_not_positive = { + "full": precisions_not_pos, + "tied": precisions_not_pos[0], + "diag": np.full((n_components, n_features), -1.0), + "spherical": np.full(n_components, -1.0), + } + + not_positive_errors = { + "full": "symmetric, positive-definite", + "tied": "symmetric, positive-definite", + "diag": "positive", + "spherical": "positive", + } + + for covar_type in COVARIANCE_TYPE: + X = RandomData(rng).X[covar_type] + g = GaussianMixture( + n_components=n_components, covariance_type=covar_type, random_state=rng + ) + + # Check precisions with bad shapes + g.precisions_init = precisions_bad_shape[covar_type] + msg = f"The parameter '{covar_type} precision' should have the shape of" + with pytest.raises(ValueError, match=msg): + g.fit(X) + + # Check not positive precisions + g.precisions_init = precisions_not_positive[covar_type] + msg = f"'{covar_type} precision' should be {not_positive_errors[covar_type]}" + with pytest.raises(ValueError, match=msg): + g.fit(X) + + # Check the correct init of precisions_init + g.precisions_init = rand_data.precisions[covar_type] + g.fit(X) + assert_array_equal(rand_data.precisions[covar_type], g.precisions_init) + + +def test_suffstat_sk_full(): + # compare the precision matrix compute from the + # EmpiricalCovariance.covariance fitted on X*sqrt(resp) + # with _sufficient_sk_full, n_components=1 + rng = np.random.RandomState(0) + n_samples, n_features = 500, 2 + + # special case 1, assuming data is "centered" + X = rng.rand(n_samples, n_features) + resp = rng.rand(n_samples, 1) + X_resp = np.sqrt(resp) * X + nk = np.array([n_samples]) + xk = np.zeros((1, n_features)) + covars_pred = _estimate_gaussian_covariances_full(resp, X, nk, xk, 0) + ecov = EmpiricalCovariance(assume_centered=True) + ecov.fit(X_resp) + assert_almost_equal(ecov.error_norm(covars_pred[0], norm="frobenius"), 0) + assert_almost_equal(ecov.error_norm(covars_pred[0], norm="spectral"), 0) + + # check the precision computation + precs_chol_pred = _compute_precision_cholesky(covars_pred, "full") + precs_pred = np.array([np.dot(prec, prec.T) for prec in precs_chol_pred]) + precs_est = np.array([linalg.inv(cov) for cov in covars_pred]) + assert_array_almost_equal(precs_est, precs_pred) + + # special case 2, assuming resp are all ones + resp = np.ones((n_samples, 1)) + nk = np.array([n_samples]) + xk = X.mean(axis=0).reshape((1, -1)) + covars_pred = _estimate_gaussian_covariances_full(resp, X, nk, xk, 0) + ecov = EmpiricalCovariance(assume_centered=False) + ecov.fit(X) + assert_almost_equal(ecov.error_norm(covars_pred[0], norm="frobenius"), 0) + assert_almost_equal(ecov.error_norm(covars_pred[0], norm="spectral"), 0) + + # check the precision computation + precs_chol_pred = _compute_precision_cholesky(covars_pred, "full") + 
precs_pred = np.array([np.dot(prec, prec.T) for prec in precs_chol_pred]) + precs_est = np.array([linalg.inv(cov) for cov in covars_pred]) + assert_array_almost_equal(precs_est, precs_pred) + + +def test_suffstat_sk_tied(): + # use equation Nk * Sk / N = S_tied + rng = np.random.RandomState(0) + n_samples, n_features, n_components = 500, 2, 2 + + resp = rng.rand(n_samples, n_components) + resp = resp / resp.sum(axis=1)[:, np.newaxis] + X = rng.rand(n_samples, n_features) + nk = resp.sum(axis=0) + xk = np.dot(resp.T, X) / nk[:, np.newaxis] + + covars_pred_full = _estimate_gaussian_covariances_full(resp, X, nk, xk, 0) + covars_pred_full = ( + np.sum(nk[:, np.newaxis, np.newaxis] * covars_pred_full, 0) / n_samples + ) + + covars_pred_tied = _estimate_gaussian_covariances_tied(resp, X, nk, xk, 0) + + ecov = EmpiricalCovariance() + ecov.covariance_ = covars_pred_full + assert_almost_equal(ecov.error_norm(covars_pred_tied, norm="frobenius"), 0) + assert_almost_equal(ecov.error_norm(covars_pred_tied, norm="spectral"), 0) + + # check the precision computation + precs_chol_pred = _compute_precision_cholesky(covars_pred_tied, "tied") + precs_pred = np.dot(precs_chol_pred, precs_chol_pred.T) + precs_est = linalg.inv(covars_pred_tied) + assert_array_almost_equal(precs_est, precs_pred) + + +def test_suffstat_sk_diag(): + # test against 'full' case + rng = np.random.RandomState(0) + n_samples, n_features, n_components = 500, 2, 2 + + resp = rng.rand(n_samples, n_components) + resp = resp / resp.sum(axis=1)[:, np.newaxis] + X = rng.rand(n_samples, n_features) + nk = resp.sum(axis=0) + xk = np.dot(resp.T, X) / nk[:, np.newaxis] + covars_pred_full = _estimate_gaussian_covariances_full(resp, X, nk, xk, 0) + covars_pred_diag = _estimate_gaussian_covariances_diag(resp, X, nk, xk, 0) + + ecov = EmpiricalCovariance() + for cov_full, cov_diag in zip(covars_pred_full, covars_pred_diag): + ecov.covariance_ = np.diag(np.diag(cov_full)) + cov_diag = np.diag(cov_diag) + assert_almost_equal(ecov.error_norm(cov_diag, norm="frobenius"), 0) + assert_almost_equal(ecov.error_norm(cov_diag, norm="spectral"), 0) + + # check the precision computation + precs_chol_pred = _compute_precision_cholesky(covars_pred_diag, "diag") + assert_almost_equal(covars_pred_diag, 1.0 / precs_chol_pred**2) + + +def test_gaussian_suffstat_sk_spherical(global_dtype): + # computing spherical covariance equals to the variance of one-dimension + # data after flattening, n_components=1 + rng = np.random.RandomState(0) + n_samples, n_features = 500, 2 + + X = rng.rand(n_samples, n_features).astype(global_dtype) + X = X - X.mean() + resp = np.ones((n_samples, 1), dtype=global_dtype) + nk = np.array([n_samples], dtype=global_dtype) + xk = X.mean() + covars_pred_spherical = _estimate_gaussian_covariances_spherical(resp, X, nk, xk, 0) + covars_pred_spherical2 = np.dot(X.flatten().T, X.flatten()) / ( + n_features * n_samples + ) + assert_almost_equal(covars_pred_spherical, covars_pred_spherical2) + assert covars_pred_spherical.dtype == global_dtype + + # check the precision computation + precs_chol_pred = _compute_precision_cholesky(covars_pred_spherical, "spherical") + assert_almost_equal(covars_pred_spherical, 1.0 / precs_chol_pred**2) + assert precs_chol_pred.dtype == global_dtype + + +def test_compute_log_det_cholesky(global_dtype): + n_features = 2 + rand_data = RandomData(np.random.RandomState(0), dtype=global_dtype) + + for covar_type in COVARIANCE_TYPE: + covariance = rand_data.covariances[covar_type] + + if covar_type == "full": + predected_det = 
np.array([linalg.det(cov) for cov in covariance]) + elif covar_type == "tied": + predected_det = linalg.det(covariance) + elif covar_type == "diag": + predected_det = np.array([np.prod(cov) for cov in covariance]) + elif covar_type == "spherical": + predected_det = covariance**n_features + + # We compute the cholesky decomposition of the covariance matrix + assert covariance.dtype == global_dtype + expected_det = _compute_log_det_cholesky( + _compute_precision_cholesky(covariance, covar_type), + covar_type, + n_features=n_features, + ) + assert_array_almost_equal(expected_det, -0.5 * np.log(predected_det)) + assert expected_det.dtype == global_dtype + + +def _naive_lmvnpdf_diag(X, means, covars): + resp = np.empty((len(X), len(means))) + stds = np.sqrt(covars) + for i, (mean, std) in enumerate(zip(means, stds)): + resp[:, i] = stats.norm.logpdf(X, mean, std).sum(axis=1) + return resp + + +def test_gaussian_mixture_log_probabilities(): + from sklearn.mixture._gaussian_mixture import _estimate_log_gaussian_prob + + # test against with _naive_lmvnpdf_diag + rng = np.random.RandomState(0) + rand_data = RandomData(rng) + n_samples = 500 + n_features = rand_data.n_features + n_components = rand_data.n_components + + means = rand_data.means + covars_diag = rng.rand(n_components, n_features) + X = rng.rand(n_samples, n_features) + log_prob_naive = _naive_lmvnpdf_diag(X, means, covars_diag) + + # full covariances + precs_full = np.array([np.diag(1.0 / np.sqrt(x)) for x in covars_diag]) + + log_prob = _estimate_log_gaussian_prob(X, means, precs_full, "full") + assert_array_almost_equal(log_prob, log_prob_naive) + + # diag covariances + precs_chol_diag = 1.0 / np.sqrt(covars_diag) + log_prob = _estimate_log_gaussian_prob(X, means, precs_chol_diag, "diag") + assert_array_almost_equal(log_prob, log_prob_naive) + + # tied + covars_tied = np.array([x for x in covars_diag]).mean(axis=0) + precs_tied = np.diag(np.sqrt(1.0 / covars_tied)) + + log_prob_naive = _naive_lmvnpdf_diag(X, means, [covars_tied] * n_components) + log_prob = _estimate_log_gaussian_prob(X, means, precs_tied, "tied") + + assert_array_almost_equal(log_prob, log_prob_naive) + + # spherical + covars_spherical = covars_diag.mean(axis=1) + precs_spherical = 1.0 / np.sqrt(covars_diag.mean(axis=1)) + log_prob_naive = _naive_lmvnpdf_diag( + X, means, [[k] * n_features for k in covars_spherical] + ) + log_prob = _estimate_log_gaussian_prob(X, means, precs_spherical, "spherical") + assert_array_almost_equal(log_prob, log_prob_naive) + + +# skip tests on weighted_log_probabilities, log_weights + + +def test_gaussian_mixture_estimate_log_prob_resp(): + # test whether responsibilities are normalized + rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=5) + n_samples = rand_data.n_samples + n_features = rand_data.n_features + n_components = rand_data.n_components + + X = rng.rand(n_samples, n_features) + for covar_type in COVARIANCE_TYPE: + weights = rand_data.weights + means = rand_data.means + precisions = rand_data.precisions[covar_type] + g = GaussianMixture( + n_components=n_components, + random_state=rng, + weights_init=weights, + means_init=means, + precisions_init=precisions, + covariance_type=covar_type, + ) + g.fit(X) + resp = g.predict_proba(X) + assert_array_almost_equal(resp.sum(axis=1), np.ones(n_samples)) + assert_array_equal(g.weights_init, weights) + assert_array_equal(g.means_init, means) + assert_array_equal(g.precisions_init, precisions) + + +def test_gaussian_mixture_predict_predict_proba(): + rng = 
np.random.RandomState(0) + rand_data = RandomData(rng) + for covar_type in COVARIANCE_TYPE: + X = rand_data.X[covar_type] + Y = rand_data.Y + g = GaussianMixture( + n_components=rand_data.n_components, + random_state=rng, + weights_init=rand_data.weights, + means_init=rand_data.means, + precisions_init=rand_data.precisions[covar_type], + covariance_type=covar_type, + ) + + # Check a warning message arrive if we don't do fit + msg = ( + "This GaussianMixture instance is not fitted yet. Call 'fit' " + "with appropriate arguments before using this estimator." + ) + with pytest.raises(NotFittedError, match=msg): + g.predict(X) + + g.fit(X) + Y_pred = g.predict(X) + Y_pred_proba = g.predict_proba(X).argmax(axis=1) + assert_array_equal(Y_pred, Y_pred_proba) + assert adjusted_rand_score(Y, Y_pred) > 0.95 + + +@pytest.mark.filterwarnings("ignore:.*did not converge.*") +@pytest.mark.parametrize( + "seed, max_iter, tol", + [ + (0, 2, 1e-7), # strict non-convergence + (1, 2, 1e-1), # loose non-convergence + (3, 300, 1e-7), # strict convergence + (4, 300, 1e-1), # loose convergence + ], +) +def test_gaussian_mixture_fit_predict(seed, max_iter, tol, global_dtype): + rng = np.random.RandomState(seed) + rand_data = RandomData(rng, dtype=global_dtype) + for covar_type in COVARIANCE_TYPE: + X = rand_data.X[covar_type] + Y = rand_data.Y + g = GaussianMixture( + n_components=rand_data.n_components, + random_state=rng, + weights_init=rand_data.weights, + means_init=rand_data.means, + precisions_init=rand_data.precisions[covar_type], + covariance_type=covar_type, + max_iter=max_iter, + tol=tol, + ) + + # check if fit_predict(X) is equivalent to fit(X).predict(X) + f = copy.deepcopy(g) + Y_pred1 = f.fit(X).predict(X) + Y_pred2 = g.fit_predict(X) + assert_array_equal(Y_pred1, Y_pred2) + assert adjusted_rand_score(Y, Y_pred2) > 0.95 + assert g.means_.dtype == global_dtype + assert g.weights_.dtype == global_dtype + assert g.precisions_.dtype == global_dtype + + +def test_gaussian_mixture_fit_predict_n_init(): + # Check that fit_predict is equivalent to fit.predict, when n_init > 1 + X = np.random.RandomState(0).randn(1000, 5) + gm = GaussianMixture(n_components=5, n_init=5, random_state=0) + y_pred1 = gm.fit_predict(X) + y_pred2 = gm.predict(X) + assert_array_equal(y_pred1, y_pred2) + + +def test_gaussian_mixture_fit(global_dtype): + # recover the ground truth + rng = np.random.RandomState(0) + rand_data = RandomData(rng, dtype=global_dtype) + n_features = rand_data.n_features + n_components = rand_data.n_components + + for covar_type in COVARIANCE_TYPE: + X = rand_data.X[covar_type] + g = GaussianMixture( + n_components=n_components, + n_init=20, + reg_covar=0, + random_state=rng, + covariance_type=covar_type, + ) + g.fit(X) + + # needs more data to pass the test with rtol=1e-7 + assert_allclose( + np.sort(g.weights_), np.sort(rand_data.weights), rtol=0.1, atol=1e-2 + ) + + arg_idx1 = g.means_[:, 0].argsort() + arg_idx2 = rand_data.means[:, 0].argsort() + assert_allclose( + g.means_[arg_idx1], rand_data.means[arg_idx2], rtol=0.1, atol=1e-2 + ) + + if covar_type == "full": + prec_pred = g.precisions_ + prec_test = rand_data.precisions["full"] + elif covar_type == "tied": + prec_pred = np.array([g.precisions_] * n_components) + prec_test = np.array([rand_data.precisions["tied"]] * n_components) + elif covar_type == "spherical": + prec_pred = np.array([np.eye(n_features) * c for c in g.precisions_]) + prec_test = np.array( + [np.eye(n_features) * c for c in rand_data.precisions["spherical"]] + ) + elif covar_type 
== "diag": + prec_pred = np.array([np.diag(d) for d in g.precisions_]) + prec_test = np.array([np.diag(d) for d in rand_data.precisions["diag"]]) + + arg_idx1 = np.trace(prec_pred, axis1=1, axis2=2).argsort() + arg_idx2 = np.trace(prec_test, axis1=1, axis2=2).argsort() + for k, h in zip(arg_idx1, arg_idx2): + ecov = EmpiricalCovariance() + ecov.covariance_ = prec_test[h] + # the accuracy depends on the number of data and randomness, rng + assert_allclose(ecov.error_norm(prec_pred[k]), 0, atol=0.15) + + assert g.means_.dtype == global_dtype + assert g.covariances_.dtype == global_dtype + assert g.precisions_.dtype == global_dtype + + +def test_gaussian_mixture_fit_best_params(): + rng = np.random.RandomState(0) + rand_data = RandomData(rng) + n_components = rand_data.n_components + n_init = 10 + for covar_type in COVARIANCE_TYPE: + X = rand_data.X[covar_type] + g = GaussianMixture( + n_components=n_components, + n_init=1, + reg_covar=0, + random_state=rng, + covariance_type=covar_type, + ) + ll = [] + for _ in range(n_init): + g.fit(X) + ll.append(g.score(X)) + ll = np.array(ll) + g_best = GaussianMixture( + n_components=n_components, + n_init=n_init, + reg_covar=0, + random_state=rng, + covariance_type=covar_type, + ) + g_best.fit(X) + assert_almost_equal(ll.min(), g_best.score(X)) + + +def test_gaussian_mixture_fit_convergence_warning(): + rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=1) + n_components = rand_data.n_components + max_iter = 1 + for covar_type in COVARIANCE_TYPE: + X = rand_data.X[covar_type] + g = GaussianMixture( + n_components=n_components, + n_init=1, + max_iter=max_iter, + reg_covar=0, + random_state=rng, + covariance_type=covar_type, + ) + msg = ( + "Best performing initialization did not converge. " + "Try different init parameters, or increase max_iter, " + "tol, or check for degenerate data." + ) + with pytest.warns(ConvergenceWarning, match=msg): + g.fit(X) + + +def test_multiple_init(): + # Test that multiple inits does not much worse than a single one + rng = np.random.RandomState(0) + n_samples, n_features, n_components = 50, 5, 2 + X = rng.randn(n_samples, n_features) + for cv_type in COVARIANCE_TYPE: + train1 = ( + GaussianMixture( + n_components=n_components, covariance_type=cv_type, random_state=0 + ) + .fit(X) + .score(X) + ) + train2 = ( + GaussianMixture( + n_components=n_components, + covariance_type=cv_type, + random_state=0, + n_init=5, + ) + .fit(X) + .score(X) + ) + assert train2 >= train1 + + +def test_gaussian_mixture_n_parameters(): + # Test that the right number of parameters is estimated + rng = np.random.RandomState(0) + n_samples, n_features, n_components = 50, 5, 2 + X = rng.randn(n_samples, n_features) + n_params = {"spherical": 13, "diag": 21, "tied": 26, "full": 41} + for cv_type in COVARIANCE_TYPE: + g = GaussianMixture( + n_components=n_components, covariance_type=cv_type, random_state=rng + ).fit(X) + assert g._n_parameters() == n_params[cv_type] + + +def test_bic_1d_1component(): + # Test all of the covariance_types return the same BIC score for + # 1-dimensional, 1 component fits. 
+ rng = np.random.RandomState(0) + n_samples, n_dim, n_components = 100, 1, 1 + X = rng.randn(n_samples, n_dim) + bic_full = ( + GaussianMixture( + n_components=n_components, covariance_type="full", random_state=rng + ) + .fit(X) + .bic(X) + ) + for covariance_type in ["tied", "diag", "spherical"]: + bic = ( + GaussianMixture( + n_components=n_components, + covariance_type=covariance_type, + random_state=rng, + ) + .fit(X) + .bic(X) + ) + assert_almost_equal(bic_full, bic) + + +def test_gaussian_mixture_aic_bic(): + # Test the aic and bic criteria + rng = np.random.RandomState(0) + n_samples, n_features, n_components = 50, 3, 2 + X = rng.randn(n_samples, n_features) + # standard gaussian entropy + sgh = 0.5 * ( + fast_logdet(np.cov(X.T, bias=1)) + n_features * (1 + np.log(2 * np.pi)) + ) + for cv_type in COVARIANCE_TYPE: + g = GaussianMixture( + n_components=n_components, + covariance_type=cv_type, + random_state=rng, + max_iter=200, + ) + g.fit(X) + aic = 2 * n_samples * sgh + 2 * g._n_parameters() + bic = 2 * n_samples * sgh + np.log(n_samples) * g._n_parameters() + bound = n_features / np.sqrt(n_samples) + assert (g.aic(X) - aic) / n_samples < bound + assert (g.bic(X) - bic) / n_samples < bound + + +def test_gaussian_mixture_verbose(): + rng = np.random.RandomState(0) + rand_data = RandomData(rng) + n_components = rand_data.n_components + for covar_type in COVARIANCE_TYPE: + X = rand_data.X[covar_type] + g = GaussianMixture( + n_components=n_components, + n_init=1, + reg_covar=0, + random_state=rng, + covariance_type=covar_type, + verbose=1, + ) + h = GaussianMixture( + n_components=n_components, + n_init=1, + reg_covar=0, + random_state=rng, + covariance_type=covar_type, + verbose=2, + ) + old_stdout = sys.stdout + sys.stdout = StringIO() + try: + g.fit(X) + h.fit(X) + finally: + sys.stdout = old_stdout + + +@pytest.mark.filterwarnings("ignore:.*did not converge.*") +@pytest.mark.parametrize("seed", (0, 1, 2)) +def test_warm_start(seed): + random_state = seed + rng = np.random.RandomState(random_state) + n_samples, n_features, n_components = 500, 2, 2 + X = rng.rand(n_samples, n_features) + + # Assert the warm_start give the same result for the same number of iter + g = GaussianMixture( + n_components=n_components, + n_init=1, + max_iter=2, + reg_covar=0, + random_state=random_state, + warm_start=False, + ) + h = GaussianMixture( + n_components=n_components, + n_init=1, + max_iter=1, + reg_covar=0, + random_state=random_state, + warm_start=True, + ) + + g.fit(X) + score1 = h.fit(X).score(X) + score2 = h.fit(X).score(X) + + assert_almost_equal(g.weights_, h.weights_) + assert_almost_equal(g.means_, h.means_) + assert_almost_equal(g.precisions_, h.precisions_) + assert score2 > score1 + + # Assert that by using warm_start we can converge to a good solution + g = GaussianMixture( + n_components=n_components, + n_init=1, + max_iter=5, + reg_covar=0, + random_state=random_state, + warm_start=False, + tol=1e-6, + ) + h = GaussianMixture( + n_components=n_components, + n_init=1, + max_iter=5, + reg_covar=0, + random_state=random_state, + warm_start=True, + tol=1e-6, + ) + + g.fit(X) + assert not g.converged_ + + h.fit(X) + # depending on the data there is large variability in the number of + # refit necessary to converge due to the complete randomness of the + # data + for _ in range(1000): + h.fit(X) + if h.converged_: + break + assert h.converged_ + + +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +def test_convergence_detected_with_warm_start(): + # We check 
that convergence is detected when warm_start=True + rng = np.random.RandomState(0) + rand_data = RandomData(rng) + n_components = rand_data.n_components + X = rand_data.X["full"] + + for max_iter in (1, 2, 50): + gmm = GaussianMixture( + n_components=n_components, + warm_start=True, + max_iter=max_iter, + random_state=rng, + ) + for _ in range(100): + gmm.fit(X) + if gmm.converged_: + break + assert gmm.converged_ + assert max_iter >= gmm.n_iter_ + + +def test_score(global_dtype): + covar_type = "full" + rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=7, dtype=global_dtype) + n_components = rand_data.n_components + X = rand_data.X[covar_type] + assert X.dtype == global_dtype + + # Check the error message if we don't call fit + gmm1 = GaussianMixture( + n_components=n_components, + n_init=1, + max_iter=1, + reg_covar=0, + random_state=rng, + covariance_type=covar_type, + ) + msg = ( + "This GaussianMixture instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this estimator." + ) + with pytest.raises(NotFittedError, match=msg): + gmm1.score(X) + + # Check score value + with warnings.catch_warnings(): + warnings.simplefilter("ignore", ConvergenceWarning) + gmm1.fit(X) + + assert gmm1.means_.dtype == global_dtype + assert gmm1.covariances_.dtype == global_dtype + + gmm_score = gmm1.score(X) + gmm_score_proba = gmm1.score_samples(X).mean() + assert_almost_equal(gmm_score, gmm_score_proba) + assert gmm_score_proba.dtype == global_dtype + + # Check if the score increase + gmm2 = GaussianMixture( + n_components=n_components, + n_init=1, + reg_covar=0, + random_state=rng, + covariance_type=covar_type, + ).fit(X) + assert gmm2.score(X) > gmm1.score(X) + + +def test_score_samples(): + covar_type = "full" + rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=7) + n_components = rand_data.n_components + X = rand_data.X[covar_type] + + # Check the error message if we don't call fit + gmm = GaussianMixture( + n_components=n_components, + n_init=1, + reg_covar=0, + random_state=rng, + covariance_type=covar_type, + ) + msg = ( + "This GaussianMixture instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this estimator." + ) + with pytest.raises(NotFittedError, match=msg): + gmm.score_samples(X) + + gmm_score_samples = gmm.fit(X).score_samples(X) + assert gmm_score_samples.shape[0] == rand_data.n_samples + + +def test_monotonic_likelihood(): + # We check that each step of the EM without regularization improve + # monotonically the training set likelihood + rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=7) + n_components = rand_data.n_components + + for covar_type in COVARIANCE_TYPE: + X = rand_data.X[covar_type] + gmm = GaussianMixture( + n_components=n_components, + covariance_type=covar_type, + reg_covar=0, + warm_start=True, + max_iter=1, + random_state=rng, + tol=1e-7, + ) + current_log_likelihood = -np.inf + with warnings.catch_warnings(): + warnings.simplefilter("ignore", ConvergenceWarning) + # Do one training iteration at a time so we can make sure that the + # training log likelihood increases after each iteration. + for _ in range(600): + prev_log_likelihood = current_log_likelihood + current_log_likelihood = gmm.fit(X).score(X) + assert current_log_likelihood >= prev_log_likelihood + + if gmm.converged_: + break + + assert gmm.converged_ + + +def test_regularisation(): + # We train the GaussianMixture on degenerate data by defining two clusters + # of a 0 covariance. 
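+    # With reg_covar=0 the empirical covariances are singular, so `fit`
+    # must raise the "ill-defined empirical covariance" error; a small
+    # positive reg_covar makes the same fit succeed.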
+ rng = np.random.RandomState(0) + n_samples, n_features = 10, 5 + + X = np.vstack( + (np.ones((n_samples // 2, n_features)), np.zeros((n_samples // 2, n_features))) + ) + + for covar_type in COVARIANCE_TYPE: + gmm = GaussianMixture( + n_components=n_samples, + reg_covar=0, + covariance_type=covar_type, + random_state=rng, + ) + + with warnings.catch_warnings(): + warnings.simplefilter("ignore", RuntimeWarning) + msg = re.escape( + "Fitting the mixture model failed because some components have" + " ill-defined empirical covariance (for instance caused by " + "singleton or collapsed samples). Try to decrease the number " + "of components, increase reg_covar, or scale the input data." + ) + with pytest.raises(ValueError, match=msg): + gmm.fit(X) + + gmm.set_params(reg_covar=1e-6).fit(X) + + +@pytest.mark.parametrize("covar_type", COVARIANCE_TYPE) +def test_fitted_precision_covariance_concistency(covar_type, global_dtype): + rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=7, dtype=global_dtype) + n_components = rand_data.n_components + + X = rand_data.X[covar_type] + gmm = GaussianMixture( + n_components=n_components, + covariance_type=covar_type, + random_state=rng, + n_init=5, + ) + gmm.fit(X) + assert gmm.precisions_.dtype == global_dtype + assert gmm.covariances_.dtype == global_dtype + if covar_type == "full": + for prec, covar in zip(gmm.precisions_, gmm.covariances_): + assert_array_almost_equal(linalg.inv(prec), covar) + elif covar_type == "tied": + assert_array_almost_equal(linalg.inv(gmm.precisions_), gmm.covariances_) + else: + assert_array_almost_equal(gmm.precisions_, 1.0 / gmm.covariances_) + + +def test_sample(): + rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=7, n_components=3) + n_features, n_components = rand_data.n_features, rand_data.n_components + + for covar_type in COVARIANCE_TYPE: + X = rand_data.X[covar_type] + + gmm = GaussianMixture( + n_components=n_components, covariance_type=covar_type, random_state=rng + ) + # To sample we need that GaussianMixture is fitted + msg = "This GaussianMixture instance is not fitted" + with pytest.raises(NotFittedError, match=msg): + gmm.sample(0) + gmm.fit(X) + + msg = "Invalid value for 'n_samples'" + with pytest.raises(ValueError, match=msg): + gmm.sample(0) + + # Just to make sure the class samples correctly + n_samples = 20000 + X_s, y_s = gmm.sample(n_samples) + + for k in range(n_components): + if covar_type == "full": + assert_array_almost_equal( + gmm.covariances_[k], np.cov(X_s[y_s == k].T), decimal=1 + ) + elif covar_type == "tied": + assert_array_almost_equal( + gmm.covariances_, np.cov(X_s[y_s == k].T), decimal=1 + ) + elif covar_type == "diag": + assert_array_almost_equal( + gmm.covariances_[k], np.diag(np.cov(X_s[y_s == k].T)), decimal=1 + ) + else: + assert_array_almost_equal( + gmm.covariances_[k], + np.var(X_s[y_s == k] - gmm.means_[k]), + decimal=1, + ) + + means_s = np.array([np.mean(X_s[y_s == k], 0) for k in range(n_components)]) + assert_array_almost_equal(gmm.means_, means_s, decimal=1) + + # Check shapes of sampled data, see + # https://github.com/scikit-learn/scikit-learn/issues/7701 + assert X_s.shape == (n_samples, n_features) + + for sample_size in range(1, 100): + X_s, _ = gmm.sample(sample_size) + assert X_s.shape == (sample_size, n_features) + + +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +def test_init(): + # We check that by increasing the n_init number we have a better solution + for random_state in range(15): + rand_data = 
RandomData( + np.random.RandomState(random_state), n_samples=50, scale=1 + ) + n_components = rand_data.n_components + X = rand_data.X["full"] + + gmm1 = GaussianMixture( + n_components=n_components, n_init=1, max_iter=1, random_state=random_state + ).fit(X) + gmm2 = GaussianMixture( + n_components=n_components, n_init=10, max_iter=1, random_state=random_state + ).fit(X) + + assert gmm2.lower_bound_ >= gmm1.lower_bound_ + + +def test_gaussian_mixture_setting_best_params(): + """`GaussianMixture`'s best_parameters, `n_iter_` and `lower_bound_` + must be set appropriately in the case of divergence. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/18216 + """ + rnd = np.random.RandomState(0) + n_samples = 30 + X = rnd.uniform(size=(n_samples, 3)) + + # following initialization parameters were found to lead to divergence + means_init = np.array( + [ + [0.670637869618158, 0.21038256107384043, 0.12892629765485303], + [0.09394051075844147, 0.5759464955561779, 0.929296197576212], + [0.5033230372781258, 0.9569852381759425, 0.08654043447295741], + [0.18578301420435747, 0.5531158970919143, 0.19388943970532435], + [0.4548589928173794, 0.35182513658825276, 0.568146063202464], + [0.609279894978321, 0.7929063819678847, 0.9620097270828052], + ] + ) + precisions_init = np.array( + [ + 999999.999604483, + 999999.9990869573, + 553.7603944542167, + 204.78596008931834, + 15.867423501783637, + 85.4595728389735, + ] + ) + weights_init = [ + 0.03333333333333341, + 0.03333333333333341, + 0.06666666666666674, + 0.06666666666666674, + 0.7000000000000001, + 0.10000000000000007, + ] + + gmm = GaussianMixture( + covariance_type="spherical", + reg_covar=0, + means_init=means_init, + weights_init=weights_init, + random_state=rnd, + n_components=len(weights_init), + precisions_init=precisions_init, + max_iter=1, + ) + # ensure that no error is thrown during fit + gmm.fit(X) + + # check that the fit did not converge + assert not gmm.converged_ + + # check that parameters are set for gmm + for attr in [ + "weights_", + "means_", + "covariances_", + "precisions_cholesky_", + "n_iter_", + "lower_bound_", + "lower_bounds_", + ]: + assert hasattr(gmm, attr) + + +@pytest.mark.parametrize( + "init_params", ["random", "random_from_data", "k-means++", "kmeans"] +) +def test_init_means_not_duplicated(init_params, global_random_seed): + # Check that all initialisations provide not duplicated starting means + rng = np.random.RandomState(global_random_seed) + rand_data = RandomData(rng, scale=5) + n_components = rand_data.n_components + X = rand_data.X["full"] + + gmm = GaussianMixture( + n_components=n_components, init_params=init_params, random_state=rng, max_iter=0 + ) + gmm.fit(X) + + means = gmm.means_ + for i_mean, j_mean in itertools.combinations(means, r=2): + assert not np.allclose(i_mean, j_mean) + + +@pytest.mark.parametrize( + "init_params", ["random", "random_from_data", "k-means++", "kmeans"] +) +def test_means_for_all_inits(init_params, global_random_seed, global_dtype): + # Check fitted means properties for all initializations + rng = np.random.RandomState(global_random_seed) + rand_data = RandomData(rng, scale=5, dtype=global_dtype) + n_components = rand_data.n_components + X = rand_data.X["full"] + + gmm = GaussianMixture( + n_components=n_components, init_params=init_params, random_state=rng + ) + gmm.fit(X) + + assert gmm.means_.shape == (n_components, X.shape[1]) + assert np.all(X.min(axis=0) <= gmm.means_) + assert np.all(gmm.means_ <= X.max(axis=0)) + assert gmm.converged_ + 
assert gmm.means_.dtype == global_dtype + assert gmm.covariances_.dtype == global_dtype + assert gmm.weights_.dtype == global_dtype + + +def test_max_iter_zero(): + # Check that max_iter=0 returns initialisation as expected + # Pick arbitrary initial means and check equal to max_iter=0 + rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=5) + n_components = rand_data.n_components + X = rand_data.X["full"] + means_init = [[20, 30], [30, 25]] + gmm = GaussianMixture( + n_components=n_components, + random_state=rng, + means_init=means_init, + tol=1e-06, + max_iter=0, + ) + gmm.fit(X) + + assert_allclose(gmm.means_, means_init) + + +def test_gaussian_mixture_precisions_init_diag(global_dtype): + """Check that we properly initialize `precision_cholesky_` when we manually + provide the precision matrix. + + In this regard, we check the consistency between estimating the precision + matrix and providing the same precision matrix as initialization. It should + lead to the same results with the same number of iterations. + + If the initialization is wrong then the number of iterations will increase. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/16944 + """ + # generate a toy dataset + n_samples = 300 + rng = np.random.RandomState(0) + shifted_gaussian = rng.randn(n_samples, 2) + np.array([20, 20]) + C = np.array([[0.0, -0.7], [3.5, 0.7]]) + stretched_gaussian = np.dot(rng.randn(n_samples, 2), C) + X = np.vstack([shifted_gaussian, stretched_gaussian]).astype(global_dtype) + + # common parameters to check the consistency of precision initialization + n_components, covariance_type, reg_covar, random_state = 2, "diag", 1e-6, 0 + + # execute the manual initialization to compute the precision matrix: + # - run KMeans to have an initial guess + # - estimate the covariance + # - compute the precision matrix from the estimated covariance + resp = np.zeros((X.shape[0], n_components)).astype(global_dtype) + label = ( + KMeans(n_clusters=n_components, n_init=1, random_state=random_state) + .fit(X) + .labels_ + ) + resp[np.arange(X.shape[0]), label] = 1 + _, _, covariance = _estimate_gaussian_parameters( + X, resp, reg_covar=reg_covar, covariance_type=covariance_type + ) + assert covariance.dtype == global_dtype + precisions_init = 1 / covariance + + gm_with_init = GaussianMixture( + n_components=n_components, + covariance_type=covariance_type, + reg_covar=reg_covar, + precisions_init=precisions_init, + random_state=random_state, + ).fit(X) + assert gm_with_init.means_.dtype == global_dtype + assert gm_with_init.covariances_.dtype == global_dtype + assert gm_with_init.precisions_cholesky_.dtype == global_dtype + + gm_without_init = GaussianMixture( + n_components=n_components, + covariance_type=covariance_type, + reg_covar=reg_covar, + random_state=random_state, + ).fit(X) + assert gm_without_init.means_.dtype == global_dtype + assert gm_without_init.covariances_.dtype == global_dtype + assert gm_without_init.precisions_cholesky_.dtype == global_dtype + + assert gm_without_init.n_iter_ == gm_with_init.n_iter_ + assert_allclose( + gm_with_init.precisions_cholesky_, gm_without_init.precisions_cholesky_ + ) + + +def _generate_data(seed, n_samples, n_features, n_components, dtype=np.float64): + """Randomly generate samples and responsibilities.""" + rs = np.random.RandomState(seed) + X = rs.random_sample((n_samples, n_features)).astype(dtype) + resp = rs.random_sample((n_samples, n_components)).astype(dtype) + resp /= resp.sum(axis=1)[:, np.newaxis] + return X, 
resp + + +def _calculate_precisions(X, resp, covariance_type): + """Calculate precision matrix of X and its Cholesky decomposition + for the given covariance type. + """ + reg_covar = 1e-6 + weights, means, covariances = _estimate_gaussian_parameters( + X, resp, reg_covar, covariance_type + ) + precisions_cholesky = _compute_precision_cholesky(covariances, covariance_type) + + _, n_components = resp.shape + # Instantiate a `GaussianMixture` model in order to use its + # `_set_parameters` method to return the `precisions_` and + # `precisions_cholesky_` from matching the `covariance_type` + # provided. + gmm = GaussianMixture(n_components=n_components, covariance_type=covariance_type) + params = (weights, means, covariances, precisions_cholesky) + gmm._set_parameters(params) + return gmm.precisions_, gmm.precisions_cholesky_ + + +@pytest.mark.parametrize("covariance_type", COVARIANCE_TYPE) +def test_gaussian_mixture_precisions_init( + covariance_type, global_random_seed, global_dtype +): + """Non-regression test for #26415.""" + + X, resp = _generate_data( + seed=global_random_seed, + n_samples=100, + n_features=3, + n_components=4, + dtype=global_dtype, + ) + + precisions_init, desired_precisions_cholesky = _calculate_precisions( + X, resp, covariance_type + ) + assert precisions_init.dtype == global_dtype + assert desired_precisions_cholesky.dtype == global_dtype + + gmm = GaussianMixture( + covariance_type=covariance_type, precisions_init=precisions_init + ) + gmm._initialize(X, resp) + actual_precisions_cholesky = gmm.precisions_cholesky_ + assert_allclose(actual_precisions_cholesky, desired_precisions_cholesky) + + +def test_gaussian_mixture_single_component_stable(): + """ + Non-regression test for #23032 ensuring 1-component GM works on only a + few samples. + """ + rng = np.random.RandomState(0) + X = rng.multivariate_normal(np.zeros(2), np.identity(2), size=3) + gm = GaussianMixture(n_components=1) + gm.fit(X).sample() + + +def test_gaussian_mixture_all_init_does_not_estimate_gaussian_parameters( + monkeypatch, + global_random_seed, +): + """When all init parameters are provided, the Gaussian parameters + are not estimated. + + Non-regression test for gh-26015. + """ + + mock = Mock(side_effect=_estimate_gaussian_parameters) + monkeypatch.setattr( + sklearn.mixture._gaussian_mixture, "_estimate_gaussian_parameters", mock + ) + + rng = np.random.RandomState(global_random_seed) + rand_data = RandomData(rng) + + gm = GaussianMixture( + n_components=rand_data.n_components, + weights_init=rand_data.weights, + means_init=rand_data.means, + precisions_init=rand_data.precisions["full"], + random_state=rng, + ) + gm.fit(rand_data.X["full"]) + # The initial gaussian parameters are not estimated. They are estimated for every + # m_step. 
+ assert mock.call_count == gm.n_iter_ diff --git a/.venv/lib/python3.12/site-packages/sklearn/mixture/tests/test_mixture.py b/.venv/lib/python3.12/site-packages/sklearn/mixture/tests/test_mixture.py new file mode 100644 index 0000000000000000000000000000000000000000..9c98d150f06a8c7685d24c083e2ed2866f17c8ca --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/mixture/tests/test_mixture.py @@ -0,0 +1,30 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numpy as np +import pytest + +from sklearn.mixture import BayesianGaussianMixture, GaussianMixture + + +@pytest.mark.parametrize("estimator", [GaussianMixture(), BayesianGaussianMixture()]) +def test_gaussian_mixture_n_iter(estimator): + # check that n_iter_ is the number of iterations performed. + rng = np.random.RandomState(0) + X = rng.rand(10, 5) + max_iter = 1 + estimator.set_params(max_iter=max_iter) + estimator.fit(X) + assert estimator.n_iter_ == max_iter + + +@pytest.mark.parametrize("estimator", [GaussianMixture(), BayesianGaussianMixture()]) +def test_mixture_n_components_greater_than_n_samples_error(estimator): + """Check error when n_components > n_samples""" + rng = np.random.RandomState(0) + X = rng.rand(10, 5) + estimator.set_params(n_components=12) + + msg = "Expected n_samples >= n_components" + with pytest.raises(ValueError, match=msg): + estimator.fit(X) diff --git a/.venv/lib/python3.12/site-packages/sklearn/model_selection/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/model_selection/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8eb0ef772c552fc6e2171acc13c1e98966a1cfb4 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/model_selection/__init__.py @@ -0,0 +1,99 @@ +"""Tools for model selection, such as cross validation and hyper-parameter tuning.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import typing + +from ._classification_threshold import ( + FixedThresholdClassifier, + TunedThresholdClassifierCV, +) +from ._plot import LearningCurveDisplay, ValidationCurveDisplay +from ._search import GridSearchCV, ParameterGrid, ParameterSampler, RandomizedSearchCV +from ._split import ( + BaseCrossValidator, + BaseShuffleSplit, + GroupKFold, + GroupShuffleSplit, + KFold, + LeaveOneGroupOut, + LeaveOneOut, + LeavePGroupsOut, + LeavePOut, + PredefinedSplit, + RepeatedKFold, + RepeatedStratifiedKFold, + ShuffleSplit, + StratifiedGroupKFold, + StratifiedKFold, + StratifiedShuffleSplit, + TimeSeriesSplit, + check_cv, + train_test_split, +) +from ._validation import ( + cross_val_predict, + cross_val_score, + cross_validate, + learning_curve, + permutation_test_score, + validation_curve, +) + +if typing.TYPE_CHECKING: + # Avoid errors in type checkers (e.g. mypy) for experimental estimators. + # TODO: remove this check once the estimator is no longer experimental.
+ from ._search_successive_halving import ( # noqa: F401 + HalvingGridSearchCV, + HalvingRandomSearchCV, + ) + + +__all__ = [ + "BaseCrossValidator", + "BaseShuffleSplit", + "FixedThresholdClassifier", + "GridSearchCV", + "GroupKFold", + "GroupShuffleSplit", + "KFold", + "LearningCurveDisplay", + "LeaveOneGroupOut", + "LeaveOneOut", + "LeavePGroupsOut", + "LeavePOut", + "ParameterGrid", + "ParameterSampler", + "PredefinedSplit", + "RandomizedSearchCV", + "RepeatedKFold", + "RepeatedStratifiedKFold", + "ShuffleSplit", + "StratifiedGroupKFold", + "StratifiedKFold", + "StratifiedShuffleSplit", + "TimeSeriesSplit", + "TunedThresholdClassifierCV", + "ValidationCurveDisplay", + "check_cv", + "cross_val_predict", + "cross_val_score", + "cross_validate", + "learning_curve", + "permutation_test_score", + "train_test_split", + "validation_curve", +] + + +# TODO: remove this check once the estimator is no longer experimental. +def __getattr__(name): + if name in {"HalvingGridSearchCV", "HalvingRandomSearchCV"}: + raise ImportError( + f"{name} is experimental and the API might change without any " + "deprecation cycle. To use it, you need to explicitly import " + "enable_halving_search_cv:\n" + "from sklearn.experimental import enable_halving_search_cv" + ) + raise AttributeError(f"module {__name__} has no attribute {name}") diff --git a/.venv/lib/python3.12/site-packages/sklearn/model_selection/_classification_threshold.py b/.venv/lib/python3.12/site-packages/sklearn/model_selection/_classification_threshold.py new file mode 100644 index 0000000000000000000000000000000000000000..c68ed38b8819d989d0ec838840b5b5406eec7e57 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/model_selection/_classification_threshold.py @@ -0,0 +1,889 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from collections.abc import MutableMapping +from numbers import Integral, Real + +import numpy as np + +from ..base import ( + BaseEstimator, + ClassifierMixin, + MetaEstimatorMixin, + _fit_context, + clone, +) +from ..exceptions import NotFittedError +from ..metrics import ( + check_scoring, + get_scorer_names, +) +from ..metrics._scorer import ( + _CurveScorer, + _threshold_scores_to_class_labels, +) +from ..utils import _safe_indexing, get_tags +from ..utils._param_validation import HasMethods, Interval, RealNotInt, StrOptions +from ..utils._response import _get_response_values_binary +from ..utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + _raise_for_params, + process_routing, +) +from ..utils.metaestimators import available_if +from ..utils.multiclass import type_of_target +from ..utils.parallel import Parallel, delayed +from ..utils.validation import ( + _check_method_params, + _estimator_has, + _num_samples, + check_is_fitted, + indexable, +) +from ._split import StratifiedShuffleSplit, check_cv + + +def _check_is_fitted(estimator): + try: + check_is_fitted(estimator.estimator) + except NotFittedError: + check_is_fitted(estimator, "estimator_") + + +class BaseThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): + """Base class for binary classifiers that set a non-default decision threshold. + + In this base class, we define the following interface: + + - the validation of common parameters in `fit`; + - the different prediction methods that can be used with the classifier. + + .. 
versionadded:: 1.5 + + Parameters + ---------- + estimator : estimator instance + The binary classifier, fitted or not, for which we want to optimize + the decision threshold used during `predict`. + + response_method : {"auto", "decision_function", "predict_proba"}, default="auto" + Methods by the classifier `estimator` corresponding to the + decision function for which we want to find a threshold. It can be: + + * if `"auto"`, it will try to invoke, for each classifier, + `"predict_proba"` or `"decision_function"` in that order. + * otherwise, one of `"predict_proba"` or `"decision_function"`. + If the method is not implemented by the classifier, it will raise an + error. + """ + + _parameter_constraints: dict = { + "estimator": [ + HasMethods(["fit", "predict_proba"]), + HasMethods(["fit", "decision_function"]), + ], + "response_method": [StrOptions({"auto", "predict_proba", "decision_function"})], + } + + def __init__(self, estimator, *, response_method="auto"): + self.estimator = estimator + self.response_method = response_method + + def _get_response_method(self): + """Define the response method.""" + if self.response_method == "auto": + response_method = ["predict_proba", "decision_function"] + else: + response_method = self.response_method + return response_method + + @_fit_context( + # *ThresholdClassifier*.estimator is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y, **params): + """Fit the classifier. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) + Target values. + + **params : dict + Parameters to pass to the `fit` method of the underlying + classifier. + + Returns + ------- + self : object + Returns an instance of self. + """ + _raise_for_params(params, self, None) + + X, y = indexable(X, y) + + y_type = type_of_target(y, input_name="y") + if y_type != "binary": + raise ValueError( + f"Only binary classification is supported. Unknown label type: {y_type}" + ) + + self._fit(X, y, **params) + + if hasattr(self.estimator_, "n_features_in_"): + self.n_features_in_ = self.estimator_.n_features_in_ + if hasattr(self.estimator_, "feature_names_in_"): + self.feature_names_in_ = self.estimator_.feature_names_in_ + + return self + + @property + def classes_(self): + """Classes labels.""" + return self.estimator_.classes_ + + @available_if(_estimator_has("predict_proba")) + def predict_proba(self, X): + """Predict class probabilities for `X` using the fitted estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + probabilities : ndarray of shape (n_samples, n_classes) + The class probabilities of the input samples. + """ + _check_is_fitted(self) + estimator = getattr(self, "estimator_", self.estimator) + return estimator.predict_proba(X) + + @available_if(_estimator_has("predict_log_proba")) + def predict_log_proba(self, X): + """Predict logarithm class probabilities for `X` using the fitted estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + log_probabilities : ndarray of shape (n_samples, n_classes) + The logarithm class probabilities of the input samples. 
+ """ + _check_is_fitted(self) + estimator = getattr(self, "estimator_", self.estimator) + return estimator.predict_log_proba(X) + + @available_if(_estimator_has("decision_function")) + def decision_function(self, X): + """Decision function for samples in `X` using the fitted estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + decisions : ndarray of shape (n_samples,) + The decision function computed the fitted estimator. + """ + _check_is_fitted(self) + estimator = getattr(self, "estimator_", self.estimator) + return estimator.decision_function(X) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.classifier_tags.multi_class = False + tags.input_tags.sparse = get_tags(self.estimator).input_tags.sparse + return tags + + +class FixedThresholdClassifier(BaseThresholdClassifier): + """Binary classifier that manually sets the decision threshold. + + This classifier allows to change the default decision threshold used for + converting posterior probability estimates (i.e. output of `predict_proba`) or + decision scores (i.e. output of `decision_function`) into a class label. + + Here, the threshold is not optimized and is set to a constant value. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.5 + + Parameters + ---------- + estimator : estimator instance + The binary classifier, fitted or not, for which we want to optimize + the decision threshold used during `predict`. + + threshold : {"auto"} or float, default="auto" + The decision threshold to use when converting posterior probability estimates + (i.e. output of `predict_proba`) or decision scores (i.e. output of + `decision_function`) into a class label. When `"auto"`, the threshold is set + to 0.5 if `predict_proba` is used as `response_method`, otherwise it is set to + 0 (i.e. the default threshold for `decision_function`). + + pos_label : int, float, bool or str, default=None + The label of the positive class. Used to process the output of the + `response_method` method. When `pos_label=None`, if `y_true` is in `{-1, 1}` or + `{0, 1}`, `pos_label` is set to 1, otherwise an error will be raised. + + response_method : {"auto", "decision_function", "predict_proba"}, default="auto" + Methods by the classifier `estimator` corresponding to the + decision function for which we want to find a threshold. It can be: + + * if `"auto"`, it will try to invoke `"predict_proba"` or `"decision_function"` + in that order. + * otherwise, one of `"predict_proba"` or `"decision_function"`. + If the method is not implemented by the classifier, it will raise an + error. + + Attributes + ---------- + estimator_ : estimator instance + The fitted classifier used when predicting. + + classes_ : ndarray of shape (n_classes,) + The class labels. + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying estimator exposes such an attribute when fit. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Only defined if the + underlying estimator exposes such an attribute when fit. + + See Also + -------- + sklearn.model_selection.TunedThresholdClassifierCV : Classifier that post-tunes + the decision threshold based on some metrics and using cross-validation. + sklearn.calibration.CalibratedClassifierCV : Estimator that calibrates + probabilities. 
+ + Examples + -------- + >>> from sklearn.datasets import make_classification + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.metrics import confusion_matrix + >>> from sklearn.model_selection import FixedThresholdClassifier, train_test_split + >>> X, y = make_classification( + ... n_samples=1_000, weights=[0.9, 0.1], class_sep=0.8, random_state=42 + ... ) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, stratify=y, random_state=42 + ... ) + >>> classifier = LogisticRegression(random_state=0).fit(X_train, y_train) + >>> print(confusion_matrix(y_test, classifier.predict(X_test))) + [[217 7] + [ 19 7]] + >>> classifier_other_threshold = FixedThresholdClassifier( + ... classifier, threshold=0.1, response_method="predict_proba" + ... ).fit(X_train, y_train) + >>> print(confusion_matrix(y_test, classifier_other_threshold.predict(X_test))) + [[184 40] + [ 6 20]] + """ + + _parameter_constraints: dict = { + **BaseThresholdClassifier._parameter_constraints, + "threshold": [StrOptions({"auto"}), Real], + "pos_label": [Real, str, "boolean", None], + } + + def __init__( + self, + estimator, + *, + threshold="auto", + pos_label=None, + response_method="auto", + ): + super().__init__(estimator=estimator, response_method=response_method) + self.pos_label = pos_label + self.threshold = threshold + + @property + def classes_(self): + if estimator := getattr(self, "estimator_", None): + return estimator.classes_ + try: + check_is_fitted(self.estimator) + return self.estimator.classes_ + except NotFittedError: + raise AttributeError( + "The underlying estimator is not fitted yet." + ) from NotFittedError + + def _fit(self, X, y, **params): + """Fit the classifier. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) + Target values. + + **params : dict + Parameters to pass to the `fit` method of the underlying + classifier. + + Returns + ------- + self : object + Returns an instance of self. + """ + routed_params = process_routing(self, "fit", **params) + self.estimator_ = clone(self.estimator).fit(X, y, **routed_params.estimator.fit) + return self + + def predict(self, X): + """Predict the target of new samples. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The samples, as accepted by `estimator.predict`. + + Returns + ------- + class_labels : ndarray of shape (n_samples,) + The predicted class. + """ + _check_is_fitted(self) + + estimator = getattr(self, "estimator_", self.estimator) + + y_score, _, response_method_used = _get_response_values_binary( + estimator, + X, + self._get_response_method(), + pos_label=self.pos_label, + return_response_method_used=True, + ) + + if self.threshold == "auto": + decision_threshold = 0.5 if response_method_used == "predict_proba" else 0.0 + else: + decision_threshold = self.threshold + + return _threshold_scores_to_class_labels( + y_score, decision_threshold, self.classes_, self.pos_label + ) + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. 
+ """ + router = MetadataRouter(owner=self.__class__.__name__).add( + estimator=self.estimator, + method_mapping=MethodMapping().add(callee="fit", caller="fit"), + ) + return router + + +def _fit_and_score_over_thresholds( + classifier, + X, + y, + *, + fit_params, + train_idx, + val_idx, + curve_scorer, + score_params, +): + """Fit a classifier and compute the scores for different decision thresholds. + + Parameters + ---------- + classifier : estimator instance + The classifier to fit and use for scoring. If `classifier` is already fitted, + it will be used as is. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The entire dataset. + + y : array-like of shape (n_samples,) + The entire target vector. + + fit_params : dict + Parameters to pass to the `fit` method of the underlying classifier. + + train_idx : ndarray of shape (n_train_samples,) or None + The indices of the training set. If `None`, `classifier` is expected to be + already fitted. + + val_idx : ndarray of shape (n_val_samples,) + The indices of the validation set used to score `classifier`. If `train_idx`, + the entire set will be used. + + curve_scorer : scorer instance + The scorer taking `classifier` and the validation set as input and outputting + decision thresholds and scores as a curve. Note that this is different from + the usual scorer that outputs a single score value as `curve_scorer` + outputs a single score value for each threshold. + + score_params : dict + Parameters to pass to the `score` method of the underlying scorer. + + Returns + ------- + scores : ndarray of shape (thresholds,) or tuple of such arrays + The scores computed for each decision threshold. When TPR/TNR or precision/ + recall are computed, `scores` is a tuple of two arrays. + + potential_thresholds : ndarray of shape (thresholds,) + The decision thresholds used to compute the scores. They are returned in + ascending order. + """ + + if train_idx is not None: + X_train, X_val = _safe_indexing(X, train_idx), _safe_indexing(X, val_idx) + y_train, y_val = _safe_indexing(y, train_idx), _safe_indexing(y, val_idx) + fit_params_train = _check_method_params(X, fit_params, indices=train_idx) + score_params_val = _check_method_params(X, score_params, indices=val_idx) + classifier.fit(X_train, y_train, **fit_params_train) + else: # prefit estimator, only a validation set is provided + X_val, y_val, score_params_val = X, y, score_params + + return curve_scorer(classifier, X_val, y_val, **score_params_val) + + +def _mean_interpolated_score(target_thresholds, cv_thresholds, cv_scores): + """Compute the mean interpolated score across folds by defining common thresholds. + + Parameters + ---------- + target_thresholds : ndarray of shape (thresholds,) + The thresholds to use to compute the mean score. + + cv_thresholds : ndarray of shape (n_folds, thresholds_fold) + The thresholds used to compute the scores for each fold. + + cv_scores : ndarray of shape (n_folds, thresholds_fold) + The scores computed for each threshold for each fold. + + Returns + ------- + mean_score : ndarray of shape (thresholds,) + The mean score across all folds for each target threshold. + """ + return np.mean( + [ + np.interp(target_thresholds, split_thresholds, split_score) + for split_thresholds, split_score in zip(cv_thresholds, cv_scores) + ], + axis=0, + ) + + +class TunedThresholdClassifierCV(BaseThresholdClassifier): + """Classifier that post-tunes the decision threshold using cross-validation. 
+ + This estimator post-tunes the decision threshold (cut-off point) that is + used for converting posterior probability estimates (i.e. output of + `predict_proba`) or decision scores (i.e. output of `decision_function`) + into a class label. The tuning is done by optimizing a binary metric, + potentially constrained by another metric. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.5 + + Parameters + ---------- + estimator : estimator instance + The classifier, fitted or not, for which we want to optimize + the decision threshold used during `predict`. + + scoring : str or callable, default="balanced_accuracy" + The objective metric to be optimized. Can be one of: + + - str: string associated to a scoring function for binary classification, + see :ref:`scoring_string_names` for options. + - callable: a scorer callable object (e.g., function) with signature + ``scorer(estimator, X, y)``. See :ref:`scoring_callable` for details. + + response_method : {"auto", "decision_function", "predict_proba"}, default="auto" + Methods by the classifier `estimator` corresponding to the + decision function for which we want to find a threshold. It can be: + + * if `"auto"`, it will try to invoke, for each classifier, + `"predict_proba"` or `"decision_function"` in that order. + * otherwise, one of `"predict_proba"` or `"decision_function"`. + If the method is not implemented by the classifier, it will raise an + error. + + thresholds : int or array-like, default=100 + The number of decision thresholds to use when discretizing the output of the + classifier `method`. Pass an array-like to manually specify the thresholds + to use. + + cv : int, float, cross-validation generator, iterable or "prefit", default=None + Determines the cross-validation splitting strategy to train the classifier. + Possible inputs for cv are: + + * `None`, to use the default 5-fold stratified K-fold cross validation; + * An integer number, to specify the number of folds in a stratified k-fold; + * A float number, to specify a single shuffle split. The floating number should + be in (0, 1) and represent the size of the validation set; + * An object to be used as a cross-validation generator; + * An iterable yielding train, test splits; + * `"prefit"`, to bypass the cross-validation. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. warning:: + Using `cv="prefit"` and passing the same dataset for fitting `estimator` + and tuning the cut-off point is subject to undesired overfitting. You can + refer to :ref:`TunedThresholdClassifierCV_no_cv` for an example. + + This option should only be used when the set used to fit `estimator` is + different from the one used to tune the cut-off point (by calling + :meth:`TunedThresholdClassifierCV.fit`). + + refit : bool, default=True + Whether or not to refit the classifier on the entire training set once + the decision threshold has been found. + Note that forcing `refit=False` on cross-validation having more + than a single split will raise an error. Similarly, `refit=True` in + conjunction with `cv="prefit"` will raise an error. + + n_jobs : int, default=None + The number of jobs to run in parallel. When `cv` represents a + cross-validation strategy, the fitting and scoring on each data split + is done in parallel. ``None`` means 1 unless in a + :obj:`joblib.parallel_backend` context. ``-1`` means using all + processors. See :term:`Glossary ` for more details.
+ + random_state : int, RandomState instance or None, default=None + Controls the randomness of cross-validation when `cv` is a float. + See :term:`Glossary `. + + store_cv_results : bool, default=False + Whether to store all scores and thresholds computed during the cross-validation + process. + + Attributes + ---------- + estimator_ : estimator instance + The fitted classifier used when predicting. + + best_threshold_ : float + The new decision threshold. + + best_score_ : float or None + The optimal score of the objective metric, evaluated at `best_threshold_`. + + cv_results_ : dict or None + A dictionary containing the scores and thresholds computed during the + cross-validation process. Only exist if `store_cv_results=True`. The + keys are `"thresholds"` and `"scores"`. + + classes_ : ndarray of shape (n_classes,) + The class labels. + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying estimator exposes such an attribute when fit. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Only defined if the + underlying estimator exposes such an attribute when fit. + + See Also + -------- + sklearn.model_selection.FixedThresholdClassifier : Classifier that uses a + constant threshold. + sklearn.calibration.CalibratedClassifierCV : Estimator that calibrates + probabilities. + + Examples + -------- + >>> from sklearn.datasets import make_classification + >>> from sklearn.ensemble import RandomForestClassifier + >>> from sklearn.metrics import classification_report + >>> from sklearn.model_selection import TunedThresholdClassifierCV, train_test_split + >>> X, y = make_classification( + ... n_samples=1_000, weights=[0.9, 0.1], class_sep=0.8, random_state=42 + ... ) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, stratify=y, random_state=42 + ... ) + >>> classifier = RandomForestClassifier(random_state=0).fit(X_train, y_train) + >>> print(classification_report(y_test, classifier.predict(X_test))) + precision recall f1-score support + + 0 0.94 0.99 0.96 224 + 1 0.80 0.46 0.59 26 + + accuracy 0.93 250 + macro avg 0.87 0.72 0.77 250 + weighted avg 0.93 0.93 0.92 250 + + >>> classifier_tuned = TunedThresholdClassifierCV( + ... classifier, scoring="balanced_accuracy" + ... ).fit(X_train, y_train) + >>> print( + ... f"Cut-off point found at {classifier_tuned.best_threshold_:.3f}" + ... 
) + Cut-off point found at 0.342 + >>> print(classification_report(y_test, classifier_tuned.predict(X_test))) + precision recall f1-score support + + 0 0.96 0.95 0.96 224 + 1 0.61 0.65 0.63 26 + + accuracy 0.92 250 + macro avg 0.78 0.80 0.79 250 + weighted avg 0.92 0.92 0.92 250 + + """ + + _parameter_constraints: dict = { + **BaseThresholdClassifier._parameter_constraints, + "scoring": [ + StrOptions(set(get_scorer_names())), + callable, + MutableMapping, + ], + "thresholds": [Interval(Integral, 1, None, closed="left"), "array-like"], + "cv": [ + "cv_object", + StrOptions({"prefit"}), + Interval(RealNotInt, 0.0, 1.0, closed="neither"), + ], + "refit": ["boolean"], + "n_jobs": [Integral, None], + "random_state": ["random_state"], + "store_cv_results": ["boolean"], + } + + def __init__( + self, + estimator, + *, + scoring="balanced_accuracy", + response_method="auto", + thresholds=100, + cv=None, + refit=True, + n_jobs=None, + random_state=None, + store_cv_results=False, + ): + super().__init__(estimator=estimator, response_method=response_method) + self.scoring = scoring + self.thresholds = thresholds + self.cv = cv + self.refit = refit + self.n_jobs = n_jobs + self.random_state = random_state + self.store_cv_results = store_cv_results + + def _fit(self, X, y, **params): + """Fit the classifier and post-tune the decision threshold. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) + Target values. + + **params : dict + Parameters to pass to the `fit` method of the underlying + classifier and to the `scoring` scorer. + + Returns + ------- + self : object + Returns an instance of self. + """ + if isinstance(self.cv, Real) and 0 < self.cv < 1: + cv = StratifiedShuffleSplit( + n_splits=1, test_size=self.cv, random_state=self.random_state + ) + elif self.cv == "prefit": + if self.refit is True: + raise ValueError("When cv='prefit', refit cannot be True.") + try: + check_is_fitted(self.estimator, "classes_") + except NotFittedError as exc: + raise NotFittedError( + """When cv='prefit', `estimator` must be fitted.""" + ) from exc + cv = self.cv + else: + cv = check_cv(self.cv, y=y, classifier=True) + if self.refit is False and cv.get_n_splits() > 1: + raise ValueError("When cv has several folds, refit cannot be False.") + + routed_params = process_routing(self, "fit", **params) + self._curve_scorer = self._get_curve_scorer() + + # in the following block, we: + # - define the final classifier `self.estimator_` and train it if necessary + # - define `classifier` to be used to post-tune the decision threshold + # - define `split` to be used to fit/score `classifier` + if cv == "prefit": + self.estimator_ = self.estimator + classifier = self.estimator_ + splits = [(None, range(_num_samples(X)))] + else: + self.estimator_ = clone(self.estimator) + classifier = clone(self.estimator) + splits = cv.split(X, y, **routed_params.splitter.split) + + if self.refit: + # train on the whole dataset + X_train, y_train, fit_params_train = X, y, routed_params.estimator.fit + else: + # single split cross-validation + train_idx, _ = next(cv.split(X, y, **routed_params.splitter.split)) + X_train = _safe_indexing(X, train_idx) + y_train = _safe_indexing(y, train_idx) + fit_params_train = _check_method_params( + X, routed_params.estimator.fit, indices=train_idx + ) + + self.estimator_.fit(X_train, y_train, **fit_params_train) + + cv_scores, cv_thresholds = zip( + *Parallel(n_jobs=self.n_jobs)( + 
delayed(_fit_and_score_over_thresholds)( + clone(classifier) if cv != "prefit" else classifier, + X, + y, + fit_params=routed_params.estimator.fit, + train_idx=train_idx, + val_idx=val_idx, + curve_scorer=self._curve_scorer, + score_params=routed_params.scorer.score, + ) + for train_idx, val_idx in splits + ) + ) + + if any(np.isclose(th[0], th[-1]) for th in cv_thresholds): + raise ValueError( + "The provided estimator makes constant predictions. Therefore, it is " + "impossible to optimize the decision threshold." + ) + + # find the global min and max thresholds across all folds + min_threshold = min( + split_thresholds.min() for split_thresholds in cv_thresholds + ) + max_threshold = max( + split_thresholds.max() for split_thresholds in cv_thresholds + ) + if isinstance(self.thresholds, Integral): + decision_thresholds = np.linspace( + min_threshold, max_threshold, num=self.thresholds + ) + else: + decision_thresholds = np.asarray(self.thresholds) + + objective_scores = _mean_interpolated_score( + decision_thresholds, cv_thresholds, cv_scores + ) + best_idx = objective_scores.argmax() + self.best_score_ = objective_scores[best_idx] + self.best_threshold_ = decision_thresholds[best_idx] + if self.store_cv_results: + self.cv_results_ = { + "thresholds": decision_thresholds, + "scores": objective_scores, + } + + return self + + def predict(self, X): + """Predict the target of new samples. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The samples, as accepted by `estimator.predict`. + + Returns + ------- + class_labels : ndarray of shape (n_samples,) + The predicted class. + """ + check_is_fitted(self, "estimator_") + pos_label = self._curve_scorer._get_pos_label() + y_score, _ = _get_response_values_binary( + self.estimator_, + X, + self._get_response_method(), + pos_label=pos_label, + ) + + return _threshold_scores_to_class_labels( + y_score, self.best_threshold_, self.classes_, pos_label + ) + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. 
+ """ + router = ( + MetadataRouter(owner=self.__class__.__name__) + .add( + estimator=self.estimator, + method_mapping=MethodMapping().add(callee="fit", caller="fit"), + ) + .add( + splitter=self.cv, + method_mapping=MethodMapping().add(callee="split", caller="fit"), + ) + .add( + scorer=self._get_curve_scorer(), + method_mapping=MethodMapping().add(callee="score", caller="fit"), + ) + ) + return router + + def _get_curve_scorer(self): + """Get the curve scorer based on the objective metric used.""" + scoring = check_scoring(self.estimator, scoring=self.scoring) + curve_scorer = _CurveScorer.from_scorer( + scoring, self._get_response_method(), self.thresholds + ) + return curve_scorer diff --git a/.venv/lib/python3.12/site-packages/sklearn/model_selection/_plot.py b/.venv/lib/python3.12/site-packages/sklearn/model_selection/_plot.py new file mode 100644 index 0000000000000000000000000000000000000000..a69c8f455bd417b97c716c473304bfdc041d85c5 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/model_selection/_plot.py @@ -0,0 +1,885 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numpy as np + +from ..utils._optional_dependencies import check_matplotlib_support +from ..utils._plotting import _interval_max_min_ratio, _validate_score_name +from ._validation import learning_curve, validation_curve + + +class _BaseCurveDisplay: + def _plot_curve( + self, + x_data, + *, + ax=None, + negate_score=False, + score_name=None, + score_type="test", + std_display_style="fill_between", + line_kw=None, + fill_between_kw=None, + errorbar_kw=None, + ): + check_matplotlib_support(f"{self.__class__.__name__}.plot") + + import matplotlib.pyplot as plt + + if ax is None: + _, ax = plt.subplots() + + if negate_score: + train_scores, test_scores = -self.train_scores, -self.test_scores + else: + train_scores, test_scores = self.train_scores, self.test_scores + + if std_display_style not in ("errorbar", "fill_between", None): + raise ValueError( + f"Unknown std_display_style: {std_display_style}. Should be one of" + " 'errorbar', 'fill_between', or None." + ) + + if score_type not in ("test", "train", "both"): + raise ValueError( + f"Unknown score_type: {score_type}. Should be one of 'test', " + "'train', or 'both'." 
+ ) + + if score_type == "train": + scores = {"Train": train_scores} + elif score_type == "test": + scores = {"Test": test_scores} + else: # score_type == "both" + scores = {"Train": train_scores, "Test": test_scores} + + if std_display_style in ("fill_between", None): + # plot the mean score + if line_kw is None: + line_kw = {} + + self.lines_ = [] + for line_label, score in scores.items(): + self.lines_.append( + *ax.plot( + x_data, + score.mean(axis=1), + label=line_label, + **line_kw, + ) + ) + self.errorbar_ = None + self.fill_between_ = None # overwritten below by fill_between + + if std_display_style == "errorbar": + if errorbar_kw is None: + errorbar_kw = {} + + self.errorbar_ = [] + for line_label, score in scores.items(): + self.errorbar_.append( + ax.errorbar( + x_data, + score.mean(axis=1), + score.std(axis=1), + label=line_label, + **errorbar_kw, + ) + ) + self.lines_, self.fill_between_ = None, None + elif std_display_style == "fill_between": + if fill_between_kw is None: + fill_between_kw = {} + default_fill_between_kw = {"alpha": 0.5} + fill_between_kw = {**default_fill_between_kw, **fill_between_kw} + + self.fill_between_ = [] + for line_label, score in scores.items(): + self.fill_between_.append( + ax.fill_between( + x_data, + score.mean(axis=1) - score.std(axis=1), + score.mean(axis=1) + score.std(axis=1), + **fill_between_kw, + ) + ) + + score_name = self.score_name if score_name is None else score_name + + ax.legend() + + # We found that a ratio, smaller or bigger than 5, between the largest and + # smallest gap of the x values is a good indicator to choose between linear + # and log scale. + if _interval_max_min_ratio(x_data) > 5: + xscale = "symlog" if x_data.min() <= 0 else "log" + else: + xscale = "linear" + + ax.set_xscale(xscale) + ax.set_ylabel(f"{score_name}") + + self.ax_ = ax + self.figure_ = ax.figure + + +class LearningCurveDisplay(_BaseCurveDisplay): + """Learning Curve visualization. + + It is recommended to use + :meth:`~sklearn.model_selection.LearningCurveDisplay.from_estimator` to + create a :class:`~sklearn.model_selection.LearningCurveDisplay` instance. + All parameters are stored as attributes. + + Read more in the :ref:`User Guide ` for general information + about the visualization API and + :ref:`detailed documentation ` regarding the learning + curve visualization. + + .. versionadded:: 1.2 + + Parameters + ---------- + train_sizes : ndarray of shape (n_unique_ticks,) + Numbers of training examples that has been used to generate the + learning curve. + + train_scores : ndarray of shape (n_ticks, n_cv_folds) + Scores on training sets. + + test_scores : ndarray of shape (n_ticks, n_cv_folds) + Scores on test set. + + score_name : str, default=None + The name of the score used in `learning_curve`. It will override the name + inferred from the `scoring` parameter. If `score` is `None`, we use `"Score"` if + `negate_score` is `False` and `"Negative score"` otherwise. If `scoring` is a + string or a callable, we infer the name. We replace `_` by spaces and capitalize + the first letter. We remove `neg_` and replace it by `"Negative"` if + `negate_score` is `False` or just remove it otherwise. + + Attributes + ---------- + ax_ : matplotlib Axes + Axes with the learning curve. + + figure_ : matplotlib Figure + Figure containing the learning curve. + + errorbar_ : list of matplotlib Artist or None + When the `std_display_style` is `"errorbar"`, this is a list of + `matplotlib.container.ErrorbarContainer` objects. 
If another style is + used, `errorbar_` is `None`. + + lines_ : list of matplotlib Artist or None + When the `std_display_style` is `"fill_between"`, this is a list of + `matplotlib.lines.Line2D` objects corresponding to the mean train and + test scores. If another style is used, `line_` is `None`. + + fill_between_ : list of matplotlib Artist or None + When the `std_display_style` is `"fill_between"`, this is a list of + `matplotlib.collections.PolyCollection` objects. If another style is + used, `fill_between_` is `None`. + + See Also + -------- + sklearn.model_selection.learning_curve : Compute the learning curve. + + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import load_iris + >>> from sklearn.model_selection import LearningCurveDisplay, learning_curve + >>> from sklearn.tree import DecisionTreeClassifier + >>> X, y = load_iris(return_X_y=True) + >>> tree = DecisionTreeClassifier(random_state=0) + >>> train_sizes, train_scores, test_scores = learning_curve( + ... tree, X, y) + >>> display = LearningCurveDisplay(train_sizes=train_sizes, + ... train_scores=train_scores, test_scores=test_scores, score_name="Score") + >>> display.plot() + <...> + >>> plt.show() + """ + + def __init__(self, *, train_sizes, train_scores, test_scores, score_name=None): + self.train_sizes = train_sizes + self.train_scores = train_scores + self.test_scores = test_scores + self.score_name = score_name + + def plot( + self, + ax=None, + *, + negate_score=False, + score_name=None, + score_type="both", + std_display_style="fill_between", + line_kw=None, + fill_between_kw=None, + errorbar_kw=None, + ): + """Plot visualization. + + Parameters + ---------- + ax : matplotlib Axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + negate_score : bool, default=False + Whether or not to negate the scores obtained through + :func:`~sklearn.model_selection.learning_curve`. This is + particularly useful when using the error denoted by `neg_*` in + `scikit-learn`. + + score_name : str, default=None + The name of the score used to decorate the y-axis of the plot. It will + override the name inferred from the `scoring` parameter. If `score` is + `None`, we use `"Score"` if `negate_score` is `False` and `"Negative score"` + otherwise. If `scoring` is a string or a callable, we infer the name. We + replace `_` by spaces and capitalize the first letter. We remove `neg_` and + replace it by `"Negative"` if `negate_score` is + `False` or just remove it otherwise. + + score_type : {"test", "train", "both"}, default="both" + The type of score to plot. Can be one of `"test"`, `"train"`, or + `"both"`. + + std_display_style : {"errorbar", "fill_between"} or None, default="fill_between" + The style used to display the score standard deviation around the + mean score. If None, no standard deviation representation is + displayed. + + line_kw : dict, default=None + Additional keyword arguments passed to the `plt.plot` used to draw + the mean score. + + fill_between_kw : dict, default=None + Additional keyword arguments passed to the `plt.fill_between` used + to draw the score standard deviation. + + errorbar_kw : dict, default=None + Additional keyword arguments passed to the `plt.errorbar` used to + draw mean score and standard deviation score. + + Returns + ------- + display : :class:`~sklearn.model_selection.LearningCurveDisplay` + Object that stores computed values. 
+ """ + self._plot_curve( + self.train_sizes, + ax=ax, + negate_score=negate_score, + score_name=score_name, + score_type=score_type, + std_display_style=std_display_style, + line_kw=line_kw, + fill_between_kw=fill_between_kw, + errorbar_kw=errorbar_kw, + ) + self.ax_.set_xlabel("Number of samples in the training set") + return self + + @classmethod + def from_estimator( + cls, + estimator, + X, + y, + *, + groups=None, + train_sizes=np.linspace(0.1, 1.0, 5), + cv=None, + scoring=None, + exploit_incremental_learning=False, + n_jobs=None, + pre_dispatch="all", + verbose=0, + shuffle=False, + random_state=None, + error_score=np.nan, + fit_params=None, + ax=None, + negate_score=False, + score_name=None, + score_type="both", + std_display_style="fill_between", + line_kw=None, + fill_between_kw=None, + errorbar_kw=None, + ): + """Create a learning curve display from an estimator. + + Read more in the :ref:`User Guide ` for general + information about the visualization API and :ref:`detailed + documentation ` regarding the learning curve + visualization. + + Parameters + ---------- + estimator : object type that implements the "fit" and "predict" methods + An object of that type which is cloned for each validation. + + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None + Target relative to X for classification or regression; + None for unsupervised learning. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. Only used in conjunction with a "Group" :term:`cv` + instance (e.g., :class:`GroupKFold`). + + train_sizes : array-like of shape (n_ticks,), \ + default=np.linspace(0.1, 1.0, 5) + Relative or absolute numbers of training examples that will be used + to generate the learning curve. If the dtype is float, it is + regarded as a fraction of the maximum size of the training set + (that is determined by the selected validation method), i.e. it has + to be within (0, 1]. Otherwise it is interpreted as absolute sizes + of the training sets. Note that for classification the number of + samples usually have to be big enough to contain at least one + sample from each class. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross validation, + - int, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For int/None inputs, if the estimator is a classifier and `y` is + either binary or multiclass, + :class:`~sklearn.model_selection.StratifiedKFold` is used. In all + other cases, :class:`~sklearn.model_selection.KFold` is used. These + splitters are instantiated with `shuffle=False` so the splits will + be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + scoring : str or callable, default=None + The scoring method to use when calculating the learning curve. Options: + + - str: see :ref:`scoring_string_names` for options. + - callable: a scorer callable object (e.g., function) with signature + ``scorer(estimator, X, y)``. See :ref:`scoring_callable` for details. 
+ - `None`: the `estimator`'s + :ref:`default evaluation criterion ` is used. + + exploit_incremental_learning : bool, default=False + If the estimator supports incremental learning, this will be + used to speed up fitting for different training set sizes. + + n_jobs : int, default=None + Number of jobs to run in parallel. Training the estimator and + computing the score are parallelized over the different training + and test sets. `None` means 1 unless in a + :obj:`joblib.parallel_backend` context. `-1` means using all + processors. See :term:`Glossary ` for more details. + + pre_dispatch : int or str, default='all' + Number of predispatched jobs for parallel execution (default is + all). The option can reduce the allocated memory. The str can + be an expression like '2*n_jobs'. + + verbose : int, default=0 + Controls the verbosity: the higher, the more messages. + + shuffle : bool, default=False + Whether to shuffle training data before taking prefixes of it + based on`train_sizes`. + + random_state : int, RandomState instance or None, default=None + Used when `shuffle` is True. Pass an int for reproducible + output across multiple function calls. + See :term:`Glossary `. + + error_score : 'raise' or numeric, default=np.nan + Value to assign to the score if an error occurs in estimator + fitting. If set to 'raise', the error is raised. If a numeric value + is given, FitFailedWarning is raised. + + fit_params : dict, default=None + Parameters to pass to the fit method of the estimator. + + ax : matplotlib Axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + negate_score : bool, default=False + Whether or not to negate the scores obtained through + :func:`~sklearn.model_selection.learning_curve`. This is + particularly useful when using the error denoted by `neg_*` in + `scikit-learn`. + + score_name : str, default=None + The name of the score used to decorate the y-axis of the plot. It will + override the name inferred from the `scoring` parameter. If `score` is + `None`, we use `"Score"` if `negate_score` is `False` and `"Negative score"` + otherwise. If `scoring` is a string or a callable, we infer the name. We + replace `_` by spaces and capitalize the first letter. We remove `neg_` and + replace it by `"Negative"` if `negate_score` is + `False` or just remove it otherwise. + + score_type : {"test", "train", "both"}, default="both" + The type of score to plot. Can be one of `"test"`, `"train"`, or + `"both"`. + + std_display_style : {"errorbar", "fill_between"} or None, default="fill_between" + The style used to display the score standard deviation around the + mean score. If `None`, no representation of the standard deviation + is displayed. + + line_kw : dict, default=None + Additional keyword arguments passed to the `plt.plot` used to draw + the mean score. + + fill_between_kw : dict, default=None + Additional keyword arguments passed to the `plt.fill_between` used + to draw the score standard deviation. + + errorbar_kw : dict, default=None + Additional keyword arguments passed to the `plt.errorbar` used to + draw mean score and standard deviation score. + + Returns + ------- + display : :class:`~sklearn.model_selection.LearningCurveDisplay` + Object that stores computed values. 
+ + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import load_iris + >>> from sklearn.model_selection import LearningCurveDisplay + >>> from sklearn.tree import DecisionTreeClassifier + >>> X, y = load_iris(return_X_y=True) + >>> tree = DecisionTreeClassifier(random_state=0) + >>> LearningCurveDisplay.from_estimator(tree, X, y) + <...> + >>> plt.show() + """ + check_matplotlib_support(f"{cls.__name__}.from_estimator") + + score_name = _validate_score_name(score_name, scoring, negate_score) + + train_sizes, train_scores, test_scores = learning_curve( + estimator, + X, + y, + groups=groups, + train_sizes=train_sizes, + cv=cv, + scoring=scoring, + exploit_incremental_learning=exploit_incremental_learning, + n_jobs=n_jobs, + pre_dispatch=pre_dispatch, + verbose=verbose, + shuffle=shuffle, + random_state=random_state, + error_score=error_score, + return_times=False, + fit_params=fit_params, + ) + + viz = cls( + train_sizes=train_sizes, + train_scores=train_scores, + test_scores=test_scores, + score_name=score_name, + ) + return viz.plot( + ax=ax, + negate_score=negate_score, + score_type=score_type, + std_display_style=std_display_style, + line_kw=line_kw, + fill_between_kw=fill_between_kw, + errorbar_kw=errorbar_kw, + ) + + +class ValidationCurveDisplay(_BaseCurveDisplay): + """Validation Curve visualization. + + It is recommended to use + :meth:`~sklearn.model_selection.ValidationCurveDisplay.from_estimator` to + create a :class:`~sklearn.model_selection.ValidationCurveDisplay` instance. + All parameters are stored as attributes. + + Read more in the :ref:`User Guide ` for general information + about the visualization API and :ref:`detailed documentation + ` regarding the validation curve visualization. + + .. versionadded:: 1.3 + + Parameters + ---------- + param_name : str + Name of the parameter that has been varied. + + param_range : array-like of shape (n_ticks,) + The values of the parameter that have been evaluated. + + train_scores : ndarray of shape (n_ticks, n_cv_folds) + Scores on training sets. + + test_scores : ndarray of shape (n_ticks, n_cv_folds) + Scores on test set. + + score_name : str, default=None + The name of the score used in `validation_curve`. It will override the name + inferred from the `scoring` parameter. If `score` is `None`, we use `"Score"` if + `negate_score` is `False` and `"Negative score"` otherwise. If `scoring` is a + string or a callable, we infer the name. We replace `_` by spaces and capitalize + the first letter. We remove `neg_` and replace it by `"Negative"` if + `negate_score` is `False` or just remove it otherwise. + + Attributes + ---------- + ax_ : matplotlib Axes + Axes with the validation curve. + + figure_ : matplotlib Figure + Figure containing the validation curve. + + errorbar_ : list of matplotlib Artist or None + When the `std_display_style` is `"errorbar"`, this is a list of + `matplotlib.container.ErrorbarContainer` objects. If another style is + used, `errorbar_` is `None`. + + lines_ : list of matplotlib Artist or None + When the `std_display_style` is `"fill_between"`, this is a list of + `matplotlib.lines.Line2D` objects corresponding to the mean train and + test scores. If another style is used, `line_` is `None`. + + fill_between_ : list of matplotlib Artist or None + When the `std_display_style` is `"fill_between"`, this is a list of + `matplotlib.collections.PolyCollection` objects. If another style is + used, `fill_between_` is `None`. 
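# A minimal sketch of the fitted attributes documented above (`ax_`, `figure_`,
# `lines_`, `fill_between_`, `errorbar_`): they expose the Matplotlib artists so
# the rendered curve can be tweaked after plotting. The estimator, dataset and
# parameter range are arbitrary illustrative choices.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import ValidationCurveDisplay

X, y = make_classification(n_samples=500, random_state=0)
display = ValidationCurveDisplay.from_estimator(
    LogisticRegression(max_iter=1_000),
    X,
    y,
    param_name="C",
    param_range=np.logspace(-3, 3, 7),
    std_display_style="fill_between",
)
# With "fill_between", the mean curves live in `lines_` and the shaded bands in
# `fill_between_`; `errorbar_` is None for this style.
for line in display.lines_:
    line.set_linewidth(2)
display.ax_.set_xscale("log")
display.figure_.suptitle("Validation curve for LogisticRegression")
plt.show()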
+ + See Also + -------- + sklearn.model_selection.validation_curve : Compute the validation curve. + + Examples + -------- + >>> import numpy as np + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.model_selection import ValidationCurveDisplay, validation_curve + >>> from sklearn.linear_model import LogisticRegression + >>> X, y = make_classification(n_samples=1_000, random_state=0) + >>> logistic_regression = LogisticRegression() + >>> param_name, param_range = "C", np.logspace(-8, 3, 10) + >>> train_scores, test_scores = validation_curve( + ... logistic_regression, X, y, param_name=param_name, param_range=param_range + ... ) + >>> display = ValidationCurveDisplay( + ... param_name=param_name, param_range=param_range, + ... train_scores=train_scores, test_scores=test_scores, score_name="Score" + ... ) + >>> display.plot() + <...> + >>> plt.show() + """ + + def __init__( + self, *, param_name, param_range, train_scores, test_scores, score_name=None + ): + self.param_name = param_name + self.param_range = param_range + self.train_scores = train_scores + self.test_scores = test_scores + self.score_name = score_name + + def plot( + self, + ax=None, + *, + negate_score=False, + score_name=None, + score_type="both", + std_display_style="fill_between", + line_kw=None, + fill_between_kw=None, + errorbar_kw=None, + ): + """Plot visualization. + + Parameters + ---------- + ax : matplotlib Axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + negate_score : bool, default=False + Whether or not to negate the scores obtained through + :func:`~sklearn.model_selection.validation_curve`. This is + particularly useful when using the error denoted by `neg_*` in + `scikit-learn`. + + score_name : str, default=None + The name of the score used to decorate the y-axis of the plot. It will + override the name inferred from the `scoring` parameter. If `score` is + `None`, we use `"Score"` if `negate_score` is `False` and `"Negative score"` + otherwise. If `scoring` is a string or a callable, we infer the name. We + replace `_` by spaces and capitalize the first letter. We remove `neg_` and + replace it by `"Negative"` if `negate_score` is + `False` or just remove it otherwise. + + score_type : {"test", "train", "both"}, default="both" + The type of score to plot. Can be one of `"test"`, `"train"`, or + `"both"`. + + std_display_style : {"errorbar", "fill_between"} or None, default="fill_between" + The style used to display the score standard deviation around the + mean score. If None, no standard deviation representation is + displayed. + + line_kw : dict, default=None + Additional keyword arguments passed to the `plt.plot` used to draw + the mean score. + + fill_between_kw : dict, default=None + Additional keyword arguments passed to the `plt.fill_between` used + to draw the score standard deviation. + + errorbar_kw : dict, default=None + Additional keyword arguments passed to the `plt.errorbar` used to + draw mean score and standard deviation score. + + Returns + ------- + display : :class:`~sklearn.model_selection.ValidationCurveDisplay` + Object that stores computed values. 
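# A minimal sketch of the `plot` options documented above, reusing scores that
# were already computed with `validation_curve`. The classifier, dataset and
# parameter range are arbitrary illustrative choices.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import ValidationCurveDisplay, validation_curve
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=500, random_state=0)
param_name, param_range = "max_depth", np.arange(1, 11)
train_scores, test_scores = validation_curve(
    DecisionTreeClassifier(random_state=0),
    X,
    y,
    param_name=param_name,
    param_range=param_range,
)
display = ValidationCurveDisplay(
    param_name=param_name,
    param_range=param_range,
    train_scores=train_scores,
    test_scores=test_scores,
    score_name="Accuracy",
)
# Show only the test curve, with error bars instead of a shaded band.
display.plot(score_type="test", std_display_style="errorbar")
plt.show()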
+ """ + self._plot_curve( + self.param_range, + ax=ax, + negate_score=negate_score, + score_name=score_name, + score_type=score_type, + std_display_style=std_display_style, + line_kw=line_kw, + fill_between_kw=fill_between_kw, + errorbar_kw=errorbar_kw, + ) + self.ax_.set_xlabel(f"{self.param_name}") + return self + + @classmethod + def from_estimator( + cls, + estimator, + X, + y, + *, + param_name, + param_range, + groups=None, + cv=None, + scoring=None, + n_jobs=None, + pre_dispatch="all", + verbose=0, + error_score=np.nan, + fit_params=None, + ax=None, + negate_score=False, + score_name=None, + score_type="both", + std_display_style="fill_between", + line_kw=None, + fill_between_kw=None, + errorbar_kw=None, + ): + """Create a validation curve display from an estimator. + + Read more in the :ref:`User Guide ` for general + information about the visualization API and :ref:`detailed + documentation ` regarding the validation curve + visualization. + + Parameters + ---------- + estimator : object type that implements the "fit" and "predict" methods + An object of that type which is cloned for each validation. + + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None + Target relative to X for classification or regression; + None for unsupervised learning. + + param_name : str + Name of the parameter that will be varied. + + param_range : array-like of shape (n_values,) + The values of the parameter that will be evaluated. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. Only used in conjunction with a "Group" :term:`cv` + instance (e.g., :class:`GroupKFold`). + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross validation, + - int, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For int/None inputs, if the estimator is a classifier and `y` is + either binary or multiclass, + :class:`~sklearn.model_selection.StratifiedKFold` is used. In all + other cases, :class:`~sklearn.model_selection.KFold` is used. These + splitters are instantiated with `shuffle=False` so the splits will + be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + scoring : str or callable, default=None + Scoring method to use when computing the validation curve. Options: + + - str: see :ref:`scoring_string_names` for options. + - callable: a scorer callable object (e.g., function) with signature + ``scorer(estimator, X, y)``. See :ref:`scoring_callable` for details. + - `None`: the `estimator`'s + :ref:`default evaluation criterion ` is used. + + n_jobs : int, default=None + Number of jobs to run in parallel. Training the estimator and + computing the score are parallelized over the different training + and test sets. `None` means 1 unless in a + :obj:`joblib.parallel_backend` context. `-1` means using all + processors. See :term:`Glossary ` for more details. + + pre_dispatch : int or str, default='all' + Number of predispatched jobs for parallel execution (default is + all). The option can reduce the allocated memory. 
The str can + be an expression like '2*n_jobs'. + + verbose : int, default=0 + Controls the verbosity: the higher, the more messages. + + error_score : 'raise' or numeric, default=np.nan + Value to assign to the score if an error occurs in estimator + fitting. If set to 'raise', the error is raised. If a numeric value + is given, FitFailedWarning is raised. + + fit_params : dict, default=None + Parameters to pass to the fit method of the estimator. + + ax : matplotlib Axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + negate_score : bool, default=False + Whether or not to negate the scores obtained through + :func:`~sklearn.model_selection.validation_curve`. This is + particularly useful when using the error denoted by `neg_*` in + `scikit-learn`. + + score_name : str, default=None + The name of the score used to decorate the y-axis of the plot. It will + override the name inferred from the `scoring` parameter. If `score` is + `None`, we use `"Score"` if `negate_score` is `False` and `"Negative score"` + otherwise. If `scoring` is a string or a callable, we infer the name. We + replace `_` by spaces and capitalize the first letter. We remove `neg_` and + replace it by `"Negative"` if `negate_score` is + `False` or just remove it otherwise. + + score_type : {"test", "train", "both"}, default="both" + The type of score to plot. Can be one of `"test"`, `"train"`, or + `"both"`. + + std_display_style : {"errorbar", "fill_between"} or None, default="fill_between" + The style used to display the score standard deviation around the + mean score. If `None`, no representation of the standard deviation + is displayed. + + line_kw : dict, default=None + Additional keyword arguments passed to the `plt.plot` used to draw + the mean score. + + fill_between_kw : dict, default=None + Additional keyword arguments passed to the `plt.fill_between` used + to draw the score standard deviation. + + errorbar_kw : dict, default=None + Additional keyword arguments passed to the `plt.errorbar` used to + draw mean score and standard deviation score. + + Returns + ------- + display : :class:`~sklearn.model_selection.ValidationCurveDisplay` + Object that stores computed values. + + Examples + -------- + >>> import numpy as np + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.model_selection import ValidationCurveDisplay + >>> from sklearn.linear_model import LogisticRegression + >>> X, y = make_classification(n_samples=1_000, random_state=0) + >>> logistic_regression = LogisticRegression() + >>> param_name, param_range = "C", np.logspace(-8, 3, 10) + >>> ValidationCurveDisplay.from_estimator( + ... logistic_regression, X, y, param_name=param_name, + ... param_range=param_range, + ... 
) + <...> + >>> plt.show() + """ + check_matplotlib_support(f"{cls.__name__}.from_estimator") + + score_name = _validate_score_name(score_name, scoring, negate_score) + + train_scores, test_scores = validation_curve( + estimator, + X, + y, + param_name=param_name, + param_range=param_range, + groups=groups, + cv=cv, + scoring=scoring, + n_jobs=n_jobs, + pre_dispatch=pre_dispatch, + verbose=verbose, + error_score=error_score, + fit_params=fit_params, + ) + + viz = cls( + param_name=param_name, + param_range=np.asarray(param_range), + train_scores=train_scores, + test_scores=test_scores, + score_name=score_name, + ) + return viz.plot( + ax=ax, + negate_score=negate_score, + score_type=score_type, + std_display_style=std_display_style, + line_kw=line_kw, + fill_between_kw=fill_between_kw, + errorbar_kw=errorbar_kw, + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/model_selection/_search.py b/.venv/lib/python3.12/site-packages/sklearn/model_selection/_search.py new file mode 100644 index 0000000000000000000000000000000000000000..5bd3f81195631da3fd21b8c3db95dcfc3df258fc --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/model_selection/_search.py @@ -0,0 +1,1996 @@ +""" +The :mod:`sklearn.model_selection._search` includes utilities to fine-tune the +parameters of an estimator. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numbers +import operator +import time +import warnings +from abc import ABCMeta, abstractmethod +from collections import defaultdict +from collections.abc import Iterable, Mapping, Sequence +from copy import deepcopy +from functools import partial, reduce +from inspect import signature +from itertools import product + +import numpy as np +from numpy.ma import MaskedArray +from scipy.stats import rankdata + +from ..base import BaseEstimator, MetaEstimatorMixin, _fit_context, clone, is_classifier +from ..exceptions import NotFittedError +from ..metrics import check_scoring +from ..metrics._scorer import ( + _check_multimetric_scoring, + _MultimetricScorer, + get_scorer_names, +) +from ..utils import Bunch, check_random_state +from ..utils._param_validation import HasMethods, Interval, StrOptions +from ..utils._repr_html.estimator import _VisualBlock +from ..utils._tags import get_tags +from ..utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + _raise_for_params, + _routing_enabled, + process_routing, +) +from ..utils.metaestimators import available_if +from ..utils.parallel import Parallel, delayed +from ..utils.random import sample_without_replacement +from ..utils.validation import _check_method_params, check_is_fitted, indexable +from ._split import check_cv +from ._validation import ( + _aggregate_score_dicts, + _fit_and_score, + _insert_error_scores, + _normalize_score_results, + _warn_or_raise_about_fit_failures, +) + +__all__ = ["GridSearchCV", "ParameterGrid", "ParameterSampler", "RandomizedSearchCV"] + + +class ParameterGrid: + """Grid of parameters with a discrete number of values for each. + + Can be used to iterate over parameter value combinations with the + Python built-in function iter. + The order of the generated parameter combinations is deterministic. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + param_grid : dict of str to sequence, or sequence of such + The parameter grid to explore, as a dictionary mapping estimator + parameters to sequences of allowed values. + + An empty dict signifies default parameters. 
+ + A sequence of dicts signifies a sequence of grids to search, and is + useful to avoid exploring parameter combinations that make no sense + or have no effect. See the examples below. + + Examples + -------- + >>> from sklearn.model_selection import ParameterGrid + >>> param_grid = {'a': [1, 2], 'b': [True, False]} + >>> list(ParameterGrid(param_grid)) == ( + ... [{'a': 1, 'b': True}, {'a': 1, 'b': False}, + ... {'a': 2, 'b': True}, {'a': 2, 'b': False}]) + True + + >>> grid = [{'kernel': ['linear']}, {'kernel': ['rbf'], 'gamma': [1, 10]}] + >>> list(ParameterGrid(grid)) == [{'kernel': 'linear'}, + ... {'kernel': 'rbf', 'gamma': 1}, + ... {'kernel': 'rbf', 'gamma': 10}] + True + >>> ParameterGrid(grid)[1] == {'kernel': 'rbf', 'gamma': 1} + True + + See Also + -------- + GridSearchCV : Uses :class:`ParameterGrid` to perform a full parallelized + parameter search. + """ + + def __init__(self, param_grid): + if not isinstance(param_grid, (Mapping, Iterable)): + raise TypeError( + f"Parameter grid should be a dict or a list, got: {param_grid!r} of" + f" type {type(param_grid).__name__}" + ) + + if isinstance(param_grid, Mapping): + # wrap dictionary in a singleton list to support either dict + # or list of dicts + param_grid = [param_grid] + + # check if all entries are dictionaries of lists + for grid in param_grid: + if not isinstance(grid, dict): + raise TypeError(f"Parameter grid is not a dict ({grid!r})") + for key, value in grid.items(): + if isinstance(value, np.ndarray) and value.ndim > 1: + raise ValueError( + f"Parameter array for {key!r} should be one-dimensional, got:" + f" {value!r} with shape {value.shape}" + ) + if isinstance(value, str) or not isinstance( + value, (np.ndarray, Sequence) + ): + raise TypeError( + f"Parameter grid for parameter {key!r} needs to be a list or a" + f" numpy array, but got {value!r} (of type " + f"{type(value).__name__}) instead. Single values " + "need to be wrapped in a list with one element." + ) + if len(value) == 0: + raise ValueError( + f"Parameter grid for parameter {key!r} need " + f"to be a non-empty sequence, got: {value!r}" + ) + + self.param_grid = param_grid + + def __iter__(self): + """Iterate over the points in the grid. + + Returns + ------- + params : iterator over dict of str to any + Yields dictionaries mapping each estimator parameter to one of its + allowed values. + """ + for p in self.param_grid: + # Always sort the keys of a dictionary, for reproducibility + items = sorted(p.items()) + if not items: + yield {} + else: + keys, values = zip(*items) + for v in product(*values): + params = dict(zip(keys, v)) + yield params + + def __len__(self): + """Number of points on the grid.""" + # Product function that can handle iterables (np.prod can't). + product = partial(reduce, operator.mul) + return sum( + product(len(v) for v in p.values()) if p else 1 for p in self.param_grid + ) + + def __getitem__(self, ind): + """Get the parameters that would be ``ind``th in iteration + + Parameters + ---------- + ind : int + The iteration index + + Returns + ------- + params : dict of str to any + Equal to list(self)[ind] + """ + # This is used to make discrete sampling without replacement memory + # efficient. 
+ for sub_grid in self.param_grid: + # XXX: could memoize information used here + if not sub_grid: + if ind == 0: + return {} + else: + ind -= 1 + continue + + # Reverse so most frequent cycling parameter comes first + keys, values_lists = zip(*sorted(sub_grid.items())[::-1]) + sizes = [len(v_list) for v_list in values_lists] + total = np.prod(sizes) + + if ind >= total: + # Try the next grid + ind -= total + else: + out = {} + for key, v_list, n in zip(keys, values_lists, sizes): + ind, offset = divmod(ind, n) + out[key] = v_list[offset] + return out + + raise IndexError("ParameterGrid index out of range") + + +class ParameterSampler: + """Generator on parameters sampled from given distributions. + + Non-deterministic iterable over random candidate combinations for hyper- + parameter search. If all parameters are presented as a list, + sampling without replacement is performed. If at least one parameter + is given as a distribution, sampling with replacement is used. + It is highly recommended to use continuous distributions for continuous + parameters. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + param_distributions : dict + Dictionary with parameters names (`str`) as keys and distributions + or lists of parameters to try. Distributions must provide a ``rvs`` + method for sampling (such as those from scipy.stats.distributions). + If a list is given, it is sampled uniformly. + If a list of dicts is given, first a dict is sampled uniformly, and + then a parameter is sampled using that dict as above. + + n_iter : int + Number of parameter settings that are produced. + + random_state : int, RandomState instance or None, default=None + Pseudo random number generator state used for random uniform sampling + from lists of possible values instead of scipy.stats distributions. + Pass an int for reproducible output across multiple + function calls. + See :term:`Glossary `. + + Returns + ------- + params : dict of str to any + **Yields** dictionaries mapping each estimator parameter to + as sampled value. + + Examples + -------- + >>> from sklearn.model_selection import ParameterSampler + >>> from scipy.stats.distributions import expon + >>> import numpy as np + >>> rng = np.random.RandomState(0) + >>> param_grid = {'a':[1, 2], 'b': expon()} + >>> param_list = list(ParameterSampler(param_grid, n_iter=4, + ... random_state=rng)) + >>> rounded_list = [dict((k, round(v, 6)) for (k, v) in d.items()) + ... for d in param_list] + >>> rounded_list == [{'b': 0.89856, 'a': 1}, + ... {'b': 0.923223, 'a': 1}, + ... {'b': 1.878964, 'a': 2}, + ... 
{'b': 1.038159, 'a': 2}] + True + """ + + def __init__(self, param_distributions, n_iter, *, random_state=None): + if not isinstance(param_distributions, (Mapping, Iterable)): + raise TypeError( + "Parameter distribution is not a dict or a list," + f" got: {param_distributions!r} of type " + f"{type(param_distributions).__name__}" + ) + + if isinstance(param_distributions, Mapping): + # wrap dictionary in a singleton list to support either dict + # or list of dicts + param_distributions = [param_distributions] + + for dist in param_distributions: + if not isinstance(dist, dict): + raise TypeError( + "Parameter distribution is not a dict ({!r})".format(dist) + ) + for key in dist: + if not isinstance(dist[key], Iterable) and not hasattr( + dist[key], "rvs" + ): + raise TypeError( + f"Parameter grid for parameter {key!r} is not iterable " + f"or a distribution (value={dist[key]})" + ) + self.n_iter = n_iter + self.random_state = random_state + self.param_distributions = param_distributions + + def _is_all_lists(self): + return all( + all(not hasattr(v, "rvs") for v in dist.values()) + for dist in self.param_distributions + ) + + def __iter__(self): + rng = check_random_state(self.random_state) + + # if all distributions are given as lists, we want to sample without + # replacement + if self._is_all_lists(): + # look up sampled parameter settings in parameter grid + param_grid = ParameterGrid(self.param_distributions) + grid_size = len(param_grid) + n_iter = self.n_iter + + if grid_size < n_iter: + warnings.warn( + "The total space of parameters %d is smaller " + "than n_iter=%d. Running %d iterations. For exhaustive " + "searches, use GridSearchCV." % (grid_size, self.n_iter, grid_size), + UserWarning, + ) + n_iter = grid_size + for i in sample_without_replacement(grid_size, n_iter, random_state=rng): + yield param_grid[i] + + else: + for _ in range(self.n_iter): + dist = rng.choice(self.param_distributions) + # Always sort the keys of a dictionary, for reproducibility + items = sorted(dist.items()) + params = dict() + for k, v in items: + if hasattr(v, "rvs"): + params[k] = v.rvs(random_state=rng) + else: + params[k] = v[rng.randint(len(v))] + yield params + + def __len__(self): + """Number of points that will be sampled.""" + if self._is_all_lists(): + grid_size = len(ParameterGrid(self.param_distributions)) + return min(self.n_iter, grid_size) + else: + return self.n_iter + + +def _check_refit(search_cv, attr): + if not search_cv.refit: + raise AttributeError( + f"This {type(search_cv).__name__} instance was initialized with " + f"`refit=False`. {attr} is available only after refitting on the best " + "parameters. You can refit an estimator manually using the " + "`best_params_` attribute" + ) + + +def _search_estimator_has(attr): + """Check if we can delegate a method to the underlying estimator. + + Calling a prediction method will only be available if `refit=True`. In + such case, we check first the fitted best estimator. If it is not + fitted, we check the unfitted estimator. + + Checking the unfitted estimator allows to use `hasattr` on the `SearchCV` + instance even before calling `fit`. 
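# A minimal sketch of the `ParameterSampler` sampling behaviour described above
# (the parameter values and distributions are arbitrary illustrative choices).
# With lists only, the grid is sampled without replacement; with at least one
# distribution, sampling is with replacement.
import numpy as np
from scipy.stats import loguniform
from sklearn.model_selection import ParameterSampler

rng = np.random.RandomState(0)

# All lists: at most len(grid) distinct settings, no duplicates.
all_lists = {"kernel": ["linear", "rbf"], "C": [1, 10]}
print(list(ParameterSampler(all_lists, n_iter=3, random_state=rng)))

# One continuous distribution: n_iter settings drawn with replacement.
with_dist = {"kernel": ["rbf"], "C": loguniform(1e-2, 1e2)}
print(len(list(ParameterSampler(with_dist, n_iter=5, random_state=rng))))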
+ """ + + def check(self): + _check_refit(self, attr) + if hasattr(self, "best_estimator_"): + # raise an AttributeError if `attr` does not exist + getattr(self.best_estimator_, attr) + return True + # raise an AttributeError if `attr` does not exist + getattr(self.estimator, attr) + return True + + return check + + +def _yield_masked_array_for_each_param(candidate_params): + """ + Yield a masked array for each candidate param. + + `candidate_params` is a sequence of params which were used in + a `GridSearchCV`. We use masked arrays for the results, as not + all params are necessarily present in each element of + `candidate_params`. For example, if using `GridSearchCV` with + a `SVC` model, then one might search over params like: + + - kernel=["rbf"], gamma=[0.1, 1] + - kernel=["poly"], degree=[1, 2] + + and then param `'gamma'` would not be present in entries of + `candidate_params` corresponding to `kernel='poly'`. + """ + n_candidates = len(candidate_params) + param_results = defaultdict(dict) + + for cand_idx, params in enumerate(candidate_params): + for name, value in params.items(): + param_results["param_%s" % name][cand_idx] = value + + for key, param_result in param_results.items(): + param_list = list(param_result.values()) + try: + arr = np.array(param_list) + except ValueError: + # This can happen when param_list contains lists of different + # lengths, for example: + # param_list=[[1], [2, 3]] + arr_dtype = np.dtype(object) + else: + # There are two cases when we don't use the automatically inferred + # dtype when creating the array and we use object instead: + # - string dtype + # - when array.ndim > 1, that means that param_list was something + # like a list of same-size sequences, which gets turned into a + # multi-dimensional array but we want a 1d array + arr_dtype = arr.dtype if arr.dtype.kind != "U" and arr.ndim == 1 else object + + # Use one MaskedArray and mask all the places where the param is not + # applicable for that candidate (which may not contain all the params). 
+ ma = MaskedArray(np.empty(n_candidates, dtype=arr_dtype), mask=True) + for index, value in param_result.items(): + # Setting the value at an index unmasks that index + ma[index] = value + yield (key, ma) + + +class BaseSearchCV(MetaEstimatorMixin, BaseEstimator, metaclass=ABCMeta): + """Abstract base class for hyper parameter search with cross-validation.""" + + _parameter_constraints: dict = { + "estimator": [HasMethods(["fit"])], + "scoring": [ + StrOptions(set(get_scorer_names())), + callable, + list, + tuple, + dict, + None, + ], + "n_jobs": [numbers.Integral, None], + "refit": ["boolean", str, callable], + "cv": ["cv_object"], + "verbose": ["verbose"], + "pre_dispatch": [numbers.Integral, str], + "error_score": [StrOptions({"raise"}), numbers.Real], + "return_train_score": ["boolean"], + } + + @abstractmethod + def __init__( + self, + estimator, + *, + scoring=None, + n_jobs=None, + refit=True, + cv=None, + verbose=0, + pre_dispatch="2*n_jobs", + error_score=np.nan, + return_train_score=True, + ): + self.scoring = scoring + self.estimator = estimator + self.n_jobs = n_jobs + self.refit = refit + self.cv = cv + self.verbose = verbose + self.pre_dispatch = pre_dispatch + self.error_score = error_score + self.return_train_score = return_train_score + + @property + # TODO(1.8) remove this property + def _estimator_type(self): + return self.estimator._estimator_type + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + sub_estimator_tags = get_tags(self.estimator) + tags.estimator_type = sub_estimator_tags.estimator_type + tags.classifier_tags = deepcopy(sub_estimator_tags.classifier_tags) + tags.regressor_tags = deepcopy(sub_estimator_tags.regressor_tags) + # allows cross-validation to see 'precomputed' metrics + tags.input_tags.pairwise = sub_estimator_tags.input_tags.pairwise + tags.input_tags.sparse = sub_estimator_tags.input_tags.sparse + tags.array_api_support = sub_estimator_tags.array_api_support + return tags + + def score(self, X, y=None, **params): + """Return the score on the given data, if the estimator has been refit. + + This uses the score defined by ``scoring`` where provided, and the + ``best_estimator_.score`` method otherwise. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Input data, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples, n_output) \ + or (n_samples,), default=None + Target relative to X for classification or regression; + None for unsupervised learning. + + **params : dict + Parameters to be passed to the underlying scorer(s). + + .. versionadded:: 1.4 + Only available if `enable_metadata_routing=True`. See + :ref:`Metadata Routing User Guide ` for more + details. + + Returns + ------- + score : float + The score defined by ``scoring`` if provided, and the + ``best_estimator_.score`` method otherwise. 
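# A minimal sketch of `score` on a refitted search, using the concrete
# `GridSearchCV` subclass defined later in this module (the estimator, data and
# grid are arbitrary illustrative choices).
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC

X, y = make_classification(n_samples=300, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

search = GridSearchCV(SVC(), {"C": [0.1, 1, 10]}, scoring="accuracy", refit=True)
search.fit(X_train, y_train)
# Evaluates `best_estimator_` with the `scoring` callable; with multi-metric
# scoring, the metric named by `refit` would be used instead.
print(search.score(X_test, y_test))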
+ """ + _check_refit(self, "score") + check_is_fitted(self) + + _raise_for_params(params, self, "score") + + if _routing_enabled(): + score_params = process_routing(self, "score", **params).scorer["score"] + else: + score_params = dict() + + if self.scorer_ is None: + raise ValueError( + "No score function explicitly defined, " + "and the estimator doesn't provide one %s" % self.best_estimator_ + ) + if isinstance(self.scorer_, dict): + if self.multimetric_: + scorer = self.scorer_[self.refit] + else: + scorer = self.scorer_ + return scorer(self.best_estimator_, X, y, **score_params) + + # callable + score = self.scorer_(self.best_estimator_, X, y, **score_params) + if self.multimetric_: + score = score[self.refit] + return score + + @available_if(_search_estimator_has("score_samples")) + def score_samples(self, X): + """Call score_samples on the estimator with the best found parameters. + + Only available if ``refit=True`` and the underlying estimator supports + ``score_samples``. + + .. versionadded:: 0.24 + + Parameters + ---------- + X : iterable + Data to predict on. Must fulfill input requirements + of the underlying estimator. + + Returns + ------- + y_score : ndarray of shape (n_samples,) + The ``best_estimator_.score_samples`` method. + """ + check_is_fitted(self) + return self.best_estimator_.score_samples(X) + + @available_if(_search_estimator_has("predict")) + def predict(self, X): + """Call predict on the estimator with the best found parameters. + + Only available if ``refit=True`` and the underlying estimator supports + ``predict``. + + Parameters + ---------- + X : indexable, length n_samples + Must fulfill the input assumptions of the + underlying estimator. + + Returns + ------- + y_pred : ndarray of shape (n_samples,) + The predicted labels or values for `X` based on the estimator with + the best found parameters. + """ + check_is_fitted(self) + return self.best_estimator_.predict(X) + + @available_if(_search_estimator_has("predict_proba")) + def predict_proba(self, X): + """Call predict_proba on the estimator with the best found parameters. + + Only available if ``refit=True`` and the underlying estimator supports + ``predict_proba``. + + Parameters + ---------- + X : indexable, length n_samples + Must fulfill the input assumptions of the + underlying estimator. + + Returns + ------- + y_pred : ndarray of shape (n_samples,) or (n_samples, n_classes) + Predicted class probabilities for `X` based on the estimator with + the best found parameters. The order of the classes corresponds + to that in the fitted attribute :term:`classes_`. + """ + check_is_fitted(self) + return self.best_estimator_.predict_proba(X) + + @available_if(_search_estimator_has("predict_log_proba")) + def predict_log_proba(self, X): + """Call predict_log_proba on the estimator with the best found parameters. + + Only available if ``refit=True`` and the underlying estimator supports + ``predict_log_proba``. + + Parameters + ---------- + X : indexable, length n_samples + Must fulfill the input assumptions of the + underlying estimator. + + Returns + ------- + y_pred : ndarray of shape (n_samples,) or (n_samples, n_classes) + Predicted class log-probabilities for `X` based on the estimator + with the best found parameters. The order of the classes + corresponds to that in the fitted attribute :term:`classes_`. 
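# A minimal sketch of the prediction-method delegation documented above: the
# methods are exposed on the search object only when the underlying estimator
# has them and `refit` is enabled (estimator and data are arbitrary
# illustrative choices).
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

X, y = make_classification(n_samples=200, random_state=0)

search = GridSearchCV(LogisticRegression(max_iter=1_000), {"C": [0.1, 1.0]})
print(hasattr(search, "predict_proba"))  # True: LogisticRegression supports it

no_refit = GridSearchCV(LogisticRegression(), {"C": [0.1, 1.0]}, refit=False)
print(hasattr(no_refit, "predict_proba"))  # False: refit is disabled

search.fit(X, y)
print(search.predict_proba(X[:3]).shape)  # delegated to best_estimator_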
+ """ + check_is_fitted(self) + return self.best_estimator_.predict_log_proba(X) + + @available_if(_search_estimator_has("decision_function")) + def decision_function(self, X): + """Call decision_function on the estimator with the best found parameters. + + Only available if ``refit=True`` and the underlying estimator supports + ``decision_function``. + + Parameters + ---------- + X : indexable, length n_samples + Must fulfill the input assumptions of the + underlying estimator. + + Returns + ------- + y_score : ndarray of shape (n_samples,) or (n_samples, n_classes) \ + or (n_samples, n_classes * (n_classes-1) / 2) + Result of the decision function for `X` based on the estimator with + the best found parameters. + """ + check_is_fitted(self) + return self.best_estimator_.decision_function(X) + + @available_if(_search_estimator_has("transform")) + def transform(self, X): + """Call transform on the estimator with the best found parameters. + + Only available if the underlying estimator supports ``transform`` and + ``refit=True``. + + Parameters + ---------- + X : indexable, length n_samples + Must fulfill the input assumptions of the + underlying estimator. + + Returns + ------- + Xt : {ndarray, sparse matrix} of shape (n_samples, n_features) + `X` transformed in the new space based on the estimator with + the best found parameters. + """ + check_is_fitted(self) + return self.best_estimator_.transform(X) + + @available_if(_search_estimator_has("inverse_transform")) + def inverse_transform(self, X): + """Call inverse_transform on the estimator with the best found params. + + Only available if the underlying estimator implements + ``inverse_transform`` and ``refit=True``. + + Parameters + ---------- + X : indexable, length n_samples + Must fulfill the input assumptions of the + underlying estimator. + + Returns + ------- + X_original : {ndarray, sparse matrix} of shape (n_samples, n_features) + Result of the `inverse_transform` function for `X` based on the + estimator with the best found parameters. + """ + check_is_fitted(self) + return self.best_estimator_.inverse_transform(X) + + @property + def n_features_in_(self): + """Number of features seen during :term:`fit`. + + Only available when `refit=True`. + """ + # For consistency with other estimators we raise a AttributeError so + # that hasattr() fails if the search estimator isn't fitted. + try: + check_is_fitted(self) + except NotFittedError as nfe: + raise AttributeError( + "{} object has no n_features_in_ attribute.".format( + self.__class__.__name__ + ) + ) from nfe + + return self.best_estimator_.n_features_in_ + + @property + def classes_(self): + """Class labels. + + Only available when `refit=True` and the estimator is a classifier. + """ + _search_estimator_has("classes_")(self) + return self.best_estimator_.classes_ + + def _run_search(self, evaluate_candidates): + """Repeatedly calls `evaluate_candidates` to conduct a search. + + This method, implemented in sub-classes, makes it possible to + customize the scheduling of evaluations: GridSearchCV and + RandomizedSearchCV schedule evaluations for their whole parameter + search space at once but other more sequential approaches are also + possible: for instance is possible to iteratively schedule evaluations + for new regions of the parameter search space based on previously + collected evaluation results. This makes it possible to implement + Bayesian optimization or more generally sequential model-based + optimization by deriving from the BaseSearchCV abstract base class. 
+ For example, Successive Halving is implemented by calling + `evaluate_candidates` multiples times (once per iteration of the SH + process), each time passing a different set of candidates with `X` + and `y` of increasing sizes. + + Parameters + ---------- + evaluate_candidates : callable + This callback accepts: + - a list of candidates, where each candidate is a dict of + parameter settings. + - an optional `cv` parameter which can be used to e.g. + evaluate candidates on different dataset splits, or + evaluate candidates on subsampled data (as done in the + Successive Halving estimators). By default, the original + `cv` parameter is used, and it is available as a private + `_checked_cv_orig` attribute. + - an optional `more_results` dict. Each key will be added to + the `cv_results_` attribute. Values should be lists of + length `n_candidates` + + It returns a dict of all results so far, formatted like + ``cv_results_``. + + Important note (relevant whether the default cv is used or not): + in randomized splitters, and unless the random_state parameter of + cv was set to an int, calling cv.split() multiple times will + yield different splits. Since cv.split() is called in + evaluate_candidates, this means that candidates will be evaluated + on different splits each time evaluate_candidates is called. This + might be a methodological issue depending on the search strategy + that you're implementing. To prevent randomized splitters from + being used, you may use _split._yields_constant_splits() + + Examples + -------- + + :: + + def _run_search(self, evaluate_candidates): + 'Try C=0.1 only if C=1 is better than C=10' + all_results = evaluate_candidates([{'C': 1}, {'C': 10}]) + score = all_results['mean_test_score'] + if score[0] < score[1]: + evaluate_candidates([{'C': 0.1}]) + """ + raise NotImplementedError("_run_search not implemented.") + + def _check_refit_for_multimetric(self, scores): + """Check `refit` is compatible with `scores` is valid""" + multimetric_refit_msg = ( + "For multi-metric scoring, the parameter refit must be set to a " + "scorer key or a callable to refit an estimator with the best " + "parameter setting on the whole data and make the best_* " + "attributes available for that metric. If this is not needed, " + f"refit should be set to False explicitly. {self.refit!r} was " + "passed." + ) + + valid_refit_dict = isinstance(self.refit, str) and self.refit in scores + + if ( + self.refit is not False + and not valid_refit_dict + and not callable(self.refit) + ): + raise ValueError(multimetric_refit_msg) + + @staticmethod + def _select_best_index(refit, refit_metric, results): + """Select index of the best combination of hyperparemeters.""" + if callable(refit): + # If callable, refit is expected to return the index of the best + # parameter set. + best_index = refit(results) + if not isinstance(best_index, numbers.Integral): + raise TypeError("best_index_ returned is not an integer") + if best_index < 0 or best_index >= len(results["params"]): + raise IndexError("best_index_ index out of range") + else: + best_index = results[f"rank_test_{refit_metric}"].argmin() + return best_index + + def _get_scorers(self): + """Get the scorer(s) to be used. + + This is used in ``fit`` and ``get_metadata_routing``. 
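# A minimal sketch of a callable `refit`, matching the contract enforced by
# `_select_best_index` above: it receives `cv_results_` and must return the
# integer index of the chosen candidate. The "smallest C within one standard
# deviation of the best score" rule is an arbitrary illustrative choice.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV


def simplest_within_one_std(cv_results_):
    mean = np.asarray(cv_results_["mean_test_score"])
    std = np.asarray(cv_results_["std_test_score"])
    good_enough = np.flatnonzero(mean >= mean.max() - std[mean.argmax()])
    # Among good-enough candidates, prefer the smallest (most regularized) C.
    cs = np.asarray([p["C"] for p in cv_results_["params"]])
    return int(good_enough[np.argmin(cs[good_enough])])


X, y = make_classification(n_samples=300, random_state=0)
search = GridSearchCV(
    LogisticRegression(max_iter=1_000),
    {"C": [0.01, 0.1, 1.0, 10.0]},
    refit=simplest_within_one_std,
)
search.fit(X, y)
# With a callable refit, `best_index_` and `best_params_` are set but
# `best_score_` is not.
print(search.best_index_, search.best_params_)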
+ + Returns + ------- + scorers, refit_metric + """ + refit_metric = "score" + + if callable(self.scoring): + scorers = self.scoring + elif self.scoring is None or isinstance(self.scoring, str): + scorers = check_scoring(self.estimator, self.scoring) + else: + scorers = _check_multimetric_scoring(self.estimator, self.scoring) + self._check_refit_for_multimetric(scorers) + refit_metric = self.refit + scorers = _MultimetricScorer( + scorers=scorers, raise_exc=(self.error_score == "raise") + ) + + return scorers, refit_metric + + def _check_scorers_accept_sample_weight(self): + # TODO(slep006): remove when metadata routing is the only way + scorers, _ = self._get_scorers() + # In the multimetric case, warn the user for each scorer separately + if isinstance(scorers, _MultimetricScorer): + for name, scorer in scorers._scorers.items(): + if not scorer._accept_sample_weight(): + warnings.warn( + f"The scoring {name}={scorer} does not support sample_weight, " + "which may lead to statistically incorrect results when " + f"fitting {self} with sample_weight. " + ) + return scorers._accept_sample_weight() + # In most cases, scorers is a Scorer object + # But it's a function when user passes scoring=function + if hasattr(scorers, "_accept_sample_weight"): + accept = scorers._accept_sample_weight() + else: + accept = "sample_weight" in signature(scorers).parameters + if not accept: + warnings.warn( + f"The scoring {scorers} does not support sample_weight, " + "which may lead to statistically incorrect results when " + f"fitting {self} with sample_weight. " + ) + return accept + + def _get_routed_params_for_fit(self, params): + """Get the parameters to be used for routing. + + This is a method instead of a snippet in ``fit`` since it's used twice, + here in ``fit``, and in ``HalvingRandomSearchCV.fit``. + """ + if _routing_enabled(): + routed_params = process_routing(self, "fit", **params) + else: + params = params.copy() + groups = params.pop("groups", None) + routed_params = Bunch( + estimator=Bunch(fit=params), + splitter=Bunch(split={"groups": groups}), + scorer=Bunch(score={}), + ) + # NOTE: sample_weight is forwarded to the scorer if sample_weight + # is not None and scorers accept sample_weight. For _MultimetricScorer, + # sample_weight is forwarded if any scorer accepts sample_weight + if ( + params.get("sample_weight") is not None + and self._check_scorers_accept_sample_weight() + ): + routed_params.scorer.score["sample_weight"] = params["sample_weight"] + return routed_params + + @_fit_context( + # *SearchCV.estimator is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y=None, **params): + """Run fit with all sets of parameters. + + Parameters + ---------- + + X : array-like of shape (n_samples, n_features) or (n_samples, n_samples) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. For precomputed kernel or + distance matrix, the expected shape of X is (n_samples, n_samples). + + y : array-like of shape (n_samples, n_output) \ + or (n_samples,), default=None + Target relative to X for classification or regression; + None for unsupervised learning. + + **params : dict of str -> object + Parameters passed to the ``fit`` method of the estimator, the scorer, + and the CV splitter. + + If a fit parameter is an array-like whose length is equal to + `num_samples` then it will be split by cross-validation along with + `X` and `y`. 
For example, the :term:`sample_weight` parameter is + split because `len(sample_weights) = len(X)`. However, this behavior + does not apply to `groups` which is passed to the splitter configured + via the `cv` parameter of the constructor. Thus, `groups` is used + *to perform the split* and determines which samples are + assigned to the each side of the a split. + + Returns + ------- + self : object + Instance of fitted estimator. + """ + estimator = self.estimator + scorers, refit_metric = self._get_scorers() + + X, y = indexable(X, y) + params = _check_method_params(X, params=params) + + routed_params = self._get_routed_params_for_fit(params) + + cv_orig = check_cv(self.cv, y, classifier=is_classifier(estimator)) + n_splits = cv_orig.get_n_splits(X, y, **routed_params.splitter.split) + + base_estimator = clone(self.estimator) + + parallel = Parallel(n_jobs=self.n_jobs, pre_dispatch=self.pre_dispatch) + + fit_and_score_kwargs = dict( + scorer=scorers, + fit_params=routed_params.estimator.fit, + score_params=routed_params.scorer.score, + return_train_score=self.return_train_score, + return_n_test_samples=True, + return_times=True, + return_parameters=False, + error_score=self.error_score, + verbose=self.verbose, + ) + results = {} + with parallel: + all_candidate_params = [] + all_out = [] + all_more_results = defaultdict(list) + + def evaluate_candidates(candidate_params, cv=None, more_results=None): + cv = cv or cv_orig + candidate_params = list(candidate_params) + n_candidates = len(candidate_params) + + if self.verbose > 0: + print( + "Fitting {0} folds for each of {1} candidates," + " totalling {2} fits".format( + n_splits, n_candidates, n_candidates * n_splits + ) + ) + + out = parallel( + delayed(_fit_and_score)( + clone(base_estimator), + X, + y, + train=train, + test=test, + parameters=parameters, + split_progress=(split_idx, n_splits), + candidate_progress=(cand_idx, n_candidates), + **fit_and_score_kwargs, + ) + for (cand_idx, parameters), (split_idx, (train, test)) in product( + enumerate(candidate_params), + enumerate(cv.split(X, y, **routed_params.splitter.split)), + ) + ) + + if len(out) < 1: + raise ValueError( + "No fits were performed. " + "Was the CV iterator empty? " + "Were there no candidates?" + ) + elif len(out) != n_candidates * n_splits: + raise ValueError( + "cv.split and cv.get_n_splits returned " + "inconsistent results. Expected {} " + "splits, got {}".format(n_splits, len(out) // n_candidates) + ) + + _warn_or_raise_about_fit_failures(out, self.error_score) + + # For callable self.scoring, the return type is only know after + # calling. If the return type is a dictionary, the error scores + # can now be inserted with the correct key. The type checking + # of out will be done in `_insert_error_scores`. 
+ if callable(self.scoring): + _insert_error_scores(out, self.error_score) + + all_candidate_params.extend(candidate_params) + all_out.extend(out) + + if more_results is not None: + for key, value in more_results.items(): + all_more_results[key].extend(value) + + nonlocal results + results = self._format_results( + all_candidate_params, n_splits, all_out, all_more_results + ) + + return results + + self._run_search(evaluate_candidates) + + # multimetric is determined here because in the case of a callable + # self.scoring the return type is only known after calling + first_test_score = all_out[0]["test_scores"] + self.multimetric_ = isinstance(first_test_score, dict) + + # check refit_metric now for a callable scorer that is multimetric + if callable(self.scoring) and self.multimetric_: + self._check_refit_for_multimetric(first_test_score) + refit_metric = self.refit + + # For multi-metric evaluation, store the best_index_, best_params_ and + # best_score_ iff refit is one of the scorer names + # In single metric evaluation, refit_metric is "score" + if self.refit or not self.multimetric_: + self.best_index_ = self._select_best_index( + self.refit, refit_metric, results + ) + if not callable(self.refit): + # With a non-custom callable, we can select the best score + # based on the best index + self.best_score_ = results[f"mean_test_{refit_metric}"][ + self.best_index_ + ] + self.best_params_ = results["params"][self.best_index_] + + if self.refit: + # here we clone the estimator as well as the parameters, since + # sometimes the parameters themselves might be estimators, e.g. + # when we search over different estimators in a pipeline. + # ref: https://github.com/scikit-learn/scikit-learn/pull/26786 + self.best_estimator_ = clone(base_estimator).set_params( + **clone(self.best_params_, safe=False) + ) + + refit_start_time = time.time() + if y is not None: + self.best_estimator_.fit(X, y, **routed_params.estimator.fit) + else: + self.best_estimator_.fit(X, **routed_params.estimator.fit) + refit_end_time = time.time() + self.refit_time_ = refit_end_time - refit_start_time + + if hasattr(self.best_estimator_, "feature_names_in_"): + self.feature_names_in_ = self.best_estimator_.feature_names_in_ + + # Store the only scorer not as a dict for single metric evaluation + if isinstance(scorers, _MultimetricScorer): + self.scorer_ = scorers._scorers + else: + self.scorer_ = scorers + + self.cv_results_ = results + self.n_splits_ = n_splits + + return self + + def _format_results(self, candidate_params, n_splits, out, more_results=None): + n_candidates = len(candidate_params) + out = _aggregate_score_dicts(out) + + results = dict(more_results or {}) + for key, val in results.items(): + # each value is a list (as per evaluate_candidate's convention) + # we convert it to an array for consistency with the other keys + results[key] = np.asarray(val) + + def _store(key_name, array, weights=None, splits=False, rank=False): + """A small helper to store the scores/times to the cv_results_""" + # When iterated first by splits, then by parameters + # We want `array` to have `n_candidates` rows and `n_splits` cols. 
+ array = np.array(array, dtype=np.float64).reshape(n_candidates, n_splits) + if splits: + for split_idx in range(n_splits): + # Uses closure to alter the results + results["split%d_%s" % (split_idx, key_name)] = array[:, split_idx] + + array_means = np.average(array, axis=1, weights=weights) + results["mean_%s" % key_name] = array_means + + if key_name.startswith(("train_", "test_")) and np.any( + ~np.isfinite(array_means) + ): + warnings.warn( + ( + f"One or more of the {key_name.split('_')[0]} scores " + f"are non-finite: {array_means}" + ), + category=UserWarning, + ) + + # Weighted std is not directly available in numpy + array_stds = np.sqrt( + np.average( + (array - array_means[:, np.newaxis]) ** 2, axis=1, weights=weights + ) + ) + results["std_%s" % key_name] = array_stds + + if rank: + # When the fit/scoring fails `array_means` contains NaNs, we + # will exclude them from the ranking process and consider them + # as tied with the worst performers. + if np.isnan(array_means).all(): + # All fit/scoring routines failed. + rank_result = np.ones_like(array_means, dtype=np.int32) + else: + min_array_means = np.nanmin(array_means) - 1 + array_means = np.nan_to_num(array_means, nan=min_array_means) + rank_result = rankdata(-array_means, method="min").astype( + np.int32, copy=False + ) + results["rank_%s" % key_name] = rank_result + + _store("fit_time", out["fit_time"]) + _store("score_time", out["score_time"]) + # Store a list of param dicts at the key 'params' + for param, ma in _yield_masked_array_for_each_param(candidate_params): + results[param] = ma + results["params"] = candidate_params + + test_scores_dict = _normalize_score_results(out["test_scores"]) + if self.return_train_score: + train_scores_dict = _normalize_score_results(out["train_scores"]) + + for scorer_name in test_scores_dict: + # Computed the (weighted) mean and std for test scores alone + _store( + "test_%s" % scorer_name, + test_scores_dict[scorer_name], + splits=True, + rank=True, + weights=None, + ) + if self.return_train_score: + _store( + "train_%s" % scorer_name, + train_scores_dict[scorer_name], + splits=True, + ) + + return results + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.4 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. + """ + router = MetadataRouter(owner=self.__class__.__name__) + router.add( + estimator=self.estimator, + method_mapping=MethodMapping().add(caller="fit", callee="fit"), + ) + + scorer, _ = self._get_scorers() + router.add( + scorer=scorer, + method_mapping=MethodMapping() + .add(caller="score", callee="score") + .add(caller="fit", callee="score"), + ) + router.add( + splitter=self.cv, + method_mapping=MethodMapping().add(caller="fit", callee="split"), + ) + return router + + def _sk_visual_block_(self): + if hasattr(self, "best_estimator_"): + key, estimator = "best_estimator_", self.best_estimator_ + else: + key, estimator = "estimator", self.estimator + + return _VisualBlock( + "parallel", + [estimator], + names=[f"{key}: {estimator.__class__.__name__}"], + name_details=[str(estimator)], + ) + + +class GridSearchCV(BaseSearchCV): + """Exhaustive search over specified parameter values for an estimator. + + Important members are fit, predict. + + GridSearchCV implements a "fit" and a "score" method. 
+ It also implements "score_samples", "predict", "predict_proba", + "decision_function", "transform" and "inverse_transform" if they are + implemented in the estimator used. + + The parameters of the estimator used to apply these methods are optimized + by cross-validated grid-search over a parameter grid. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimator : estimator object + This is assumed to implement the scikit-learn estimator interface. + Either estimator needs to provide a ``score`` function, + or ``scoring`` must be passed. + + param_grid : dict or list of dictionaries + Dictionary with parameters names (`str`) as keys and lists of + parameter settings to try as values, or a list of such + dictionaries, in which case the grids spanned by each dictionary + in the list are explored. This enables searching over any sequence + of parameter settings. + + scoring : str, callable, list, tuple or dict, default=None + Strategy to evaluate the performance of the cross-validated model on + the test set. + + If `scoring` represents a single score, one can use: + + - a single string (see :ref:`scoring_string_names`); + - a callable (see :ref:`scoring_callable`) that returns a single value; + - `None`, the `estimator`'s + :ref:`default evaluation criterion ` is used. + + If `scoring` represents multiple scores, one can use: + + - a list or tuple of unique strings; + - a callable returning a dictionary where the keys are the metric + names and the values are the metric scores; + - a dictionary with metric names as keys and callables as values. + + See :ref:`multimetric_grid_search` for an example. + + n_jobs : int, default=None + Number of jobs to run in parallel. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + .. versionchanged:: v0.20 + `n_jobs` default changed from 1 to None + + refit : bool, str, or callable, default=True + Refit an estimator using the best found parameters on the whole + dataset. + + For multiple metric evaluation, this needs to be a `str` denoting the + scorer that would be used to find the best parameters for refitting + the estimator at the end. + + Where there are considerations other than maximum score in + choosing a best estimator, ``refit`` can be set to a function which + returns the selected ``best_index_`` given ``cv_results_``. In that + case, the ``best_estimator_`` and ``best_params_`` will be set + according to the returned ``best_index_`` while the ``best_score_`` + attribute will not be available. + + The refitted estimator is made available at the ``best_estimator_`` + attribute and permits using ``predict`` directly on this + ``GridSearchCV`` instance. + + Also for multiple metric evaluation, the attributes ``best_index_``, + ``best_score_`` and ``best_params_`` will only be available if + ``refit`` is set and all of them will be determined w.r.t this specific + scorer. + + See ``scoring`` parameter to know more about multiple metric + evaluation. + + See :ref:`sphx_glr_auto_examples_model_selection_plot_grid_search_digits.py` + to see how to design a custom selection strategy using a callable + via `refit`. + + See :ref:`this example + ` + for an example of how to use ``refit=callable`` to balance model + complexity and cross-validated score. + + .. versionchanged:: 0.20 + Support for callable added. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. 
+ Possible inputs for cv are: + + - None, to use the default 5-fold cross validation, + - integer, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if the estimator is a classifier and ``y`` is + either binary or multiclass, :class:`StratifiedKFold` is used. In all + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + ``cv`` default value if None changed from 3-fold to 5-fold. + + verbose : int + Controls the verbosity: the higher, the more messages. + + - >1 : the computation time for each fold and parameter candidate is + displayed; + - >2 : the score is also displayed; + - >3 : the fold and candidate parameter indexes are also displayed + together with the starting time of the computation. + + pre_dispatch : int, or str, default='2*n_jobs' + Controls the number of jobs that get dispatched during parallel + execution. Reducing this number can be useful to avoid an + explosion of memory consumption when more jobs get dispatched + than CPUs can process. This parameter can be: + + - None, in which case all the jobs are immediately created and spawned. Use + this for lightweight and fast-running jobs, to avoid delays due to on-demand + spawning of the jobs + - An int, giving the exact number of total jobs that are spawned + - A str, giving an expression as a function of n_jobs, as in '2*n_jobs' + + error_score : 'raise' or numeric, default=np.nan + Value to assign to the score if an error occurs in estimator fitting. + If set to 'raise', the error is raised. If a numeric value is given, + FitFailedWarning is raised. This parameter does not affect the refit + step, which will always raise the error. + + return_train_score : bool, default=False + If ``False``, the ``cv_results_`` attribute will not include training + scores. + Computing training scores is used to get insights on how different + parameter settings impact the overfitting/underfitting trade-off. + However computing the scores on the training set can be computationally + expensive and is not strictly required to select the parameters that + yield the best generalization performance. + + .. versionadded:: 0.19 + + .. versionchanged:: 0.21 + Default value was changed from ``True`` to ``False`` + + Attributes + ---------- + cv_results_ : dict of numpy (masked) ndarrays + A dict with keys as column headers and values as columns, that can be + imported into a pandas ``DataFrame``. 
+ + For instance the below given table + + +------------+-----------+------------+-----------------+---+---------+ + |param_kernel|param_gamma|param_degree|split0_test_score|...|rank_t...| + +============+===========+============+=================+===+=========+ + | 'poly' | -- | 2 | 0.80 |...| 2 | + +------------+-----------+------------+-----------------+---+---------+ + | 'poly' | -- | 3 | 0.70 |...| 4 | + +------------+-----------+------------+-----------------+---+---------+ + | 'rbf' | 0.1 | -- | 0.80 |...| 3 | + +------------+-----------+------------+-----------------+---+---------+ + | 'rbf' | 0.2 | -- | 0.93 |...| 1 | + +------------+-----------+------------+-----------------+---+---------+ + + will be represented by a ``cv_results_`` dict of:: + + { + 'param_kernel': masked_array(data = ['poly', 'poly', 'rbf', 'rbf'], + mask = [False False False False]...) + 'param_gamma': masked_array(data = [-- -- 0.1 0.2], + mask = [ True True False False]...), + 'param_degree': masked_array(data = [2.0 3.0 -- --], + mask = [False False True True]...), + 'split0_test_score' : [0.80, 0.70, 0.80, 0.93], + 'split1_test_score' : [0.82, 0.50, 0.70, 0.78], + 'mean_test_score' : [0.81, 0.60, 0.75, 0.85], + 'std_test_score' : [0.01, 0.10, 0.05, 0.08], + 'rank_test_score' : [2, 4, 3, 1], + 'split0_train_score' : [0.80, 0.92, 0.70, 0.93], + 'split1_train_score' : [0.82, 0.55, 0.70, 0.87], + 'mean_train_score' : [0.81, 0.74, 0.70, 0.90], + 'std_train_score' : [0.01, 0.19, 0.00, 0.03], + 'mean_fit_time' : [0.73, 0.63, 0.43, 0.49], + 'std_fit_time' : [0.01, 0.02, 0.01, 0.01], + 'mean_score_time' : [0.01, 0.06, 0.04, 0.04], + 'std_score_time' : [0.00, 0.00, 0.00, 0.01], + 'params' : [{'kernel': 'poly', 'degree': 2}, ...], + } + + NOTE + + The key ``'params'`` is used to store a list of parameter + settings dicts for all the parameter candidates. + + The ``mean_fit_time``, ``std_fit_time``, ``mean_score_time`` and + ``std_score_time`` are all in seconds. + + For multi-metric evaluation, the scores for all the scorers are + available in the ``cv_results_`` dict at the keys ending with that + scorer's name (``'_'``) instead of ``'_score'`` shown + above. ('split0_test_precision', 'mean_train_precision' etc.) + + best_estimator_ : estimator + Estimator that was chosen by the search, i.e. estimator + which gave highest score (or smallest loss if specified) + on the left out data. Not available if ``refit=False``. + + See ``refit`` parameter for more information on allowed values. + + best_score_ : float + Mean cross-validated score of the best_estimator + + For multi-metric evaluation, this is present only if ``refit`` is + specified. + + This attribute is not available if ``refit`` is a function. + + best_params_ : dict + Parameter setting that gave the best results on the hold out data. + + For multi-metric evaluation, this is present only if ``refit`` is + specified. + + best_index_ : int + The index (of the ``cv_results_`` arrays) which corresponds to the best + candidate parameter setting. + + The dict at ``search.cv_results_['params'][search.best_index_]`` gives + the parameter setting for the best model, that gives the highest + mean score (``search.best_score_``). + + For multi-metric evaluation, this is present only if ``refit`` is + specified. + + scorer_ : function or a dict + Scorer function used on the held out data to choose the best + parameters for the model. + + For multi-metric evaluation, this attribute holds the validated + ``scoring`` dict which maps the scorer key to the scorer callable. 
+ + n_splits_ : int + The number of cross-validation splits (folds/iterations). + + refit_time_ : float + Seconds used for refitting the best model on the whole dataset. + + This is present only if ``refit`` is not False. + + .. versionadded:: 0.20 + + multimetric_ : bool + Whether or not the scorers compute several metrics. + + classes_ : ndarray of shape (n_classes,) + The classes labels. This is present only if ``refit`` is specified and + the underlying estimator is a classifier. + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if + `best_estimator_` is defined (see the documentation for the `refit` + parameter for more details) and that `best_estimator_` exposes + `n_features_in_` when fit. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Only defined if + `best_estimator_` is defined (see the documentation for the `refit` + parameter for more details) and that `best_estimator_` exposes + `feature_names_in_` when fit. + + .. versionadded:: 1.0 + + See Also + -------- + ParameterGrid : Generates all the combinations of a hyperparameter grid. + train_test_split : Utility function to split the data into a development + set usable for fitting a GridSearchCV instance and an evaluation set + for its final evaluation. + sklearn.metrics.make_scorer : Make a scorer from a performance metric or + loss function. + + Notes + ----- + The parameters selected are those that maximize the score of the left out + data, unless an explicit score is passed in which case it is used instead. + + If `n_jobs` was set to a value higher than one, the data is copied for each + point in the grid (and not `n_jobs` times). This is done for efficiency + reasons if individual jobs take very little time, but may raise errors if + the dataset is large and not enough memory is available. A workaround in + this case is to set `pre_dispatch`. Then, the memory is copied only + `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 * + n_jobs`. + + Examples + -------- + >>> from sklearn import svm, datasets + >>> from sklearn.model_selection import GridSearchCV + >>> iris = datasets.load_iris() + >>> parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]} + >>> svc = svm.SVC() + >>> clf = GridSearchCV(svc, parameters) + >>> clf.fit(iris.data, iris.target) + GridSearchCV(estimator=SVC(), + param_grid={'C': [1, 10], 'kernel': ('linear', 'rbf')}) + >>> sorted(clf.cv_results_.keys()) + ['mean_fit_time', 'mean_score_time', 'mean_test_score',... + 'param_C', 'param_kernel', 'params',... + 'rank_test_score', 'split0_test_score',... + 'split2_test_score', ... 
+ 'std_fit_time', 'std_score_time', 'std_test_score'] + """ + + _parameter_constraints: dict = { + **BaseSearchCV._parameter_constraints, + "param_grid": [dict, list], + } + + def __init__( + self, + estimator, + param_grid, + *, + scoring=None, + n_jobs=None, + refit=True, + cv=None, + verbose=0, + pre_dispatch="2*n_jobs", + error_score=np.nan, + return_train_score=False, + ): + super().__init__( + estimator=estimator, + scoring=scoring, + n_jobs=n_jobs, + refit=refit, + cv=cv, + verbose=verbose, + pre_dispatch=pre_dispatch, + error_score=error_score, + return_train_score=return_train_score, + ) + self.param_grid = param_grid + + def _run_search(self, evaluate_candidates): + """Search all candidates in param_grid""" + evaluate_candidates(ParameterGrid(self.param_grid)) + + +class RandomizedSearchCV(BaseSearchCV): + """Randomized search on hyper parameters. + + RandomizedSearchCV implements a "fit" and a "score" method. + It also implements "score_samples", "predict", "predict_proba", + "decision_function", "transform" and "inverse_transform" if they are + implemented in the estimator used. + + The parameters of the estimator used to apply these methods are optimized + by cross-validated search over parameter settings. + + In contrast to GridSearchCV, not all parameter values are tried out, but + rather a fixed number of parameter settings is sampled from the specified + distributions. The number of parameter settings that are tried is + given by n_iter. + + If all parameters are presented as a list, + sampling without replacement is performed. If at least one parameter + is given as a distribution, sampling with replacement is used. + It is highly recommended to use continuous distributions for continuous + parameters. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.14 + + Parameters + ---------- + estimator : estimator object + An object of that type is instantiated for each grid point. + This is assumed to implement the scikit-learn estimator interface. + Either estimator needs to provide a ``score`` function, + or ``scoring`` must be passed. + + param_distributions : dict or list of dicts + Dictionary with parameters names (`str`) as keys and distributions + or lists of parameters to try. Distributions must provide a ``rvs`` + method for sampling (such as those from scipy.stats.distributions). + If a list is given, it is sampled uniformly. + If a list of dicts is given, first a dict is sampled uniformly, and + then a parameter is sampled using that dict as above. + + n_iter : int, default=10 + Number of parameter settings that are sampled. n_iter trades + off runtime vs quality of the solution. + + scoring : str, callable, list, tuple or dict, default=None + Strategy to evaluate the performance of the cross-validated model on + the test set. + + If `scoring` represents a single score, one can use: + + - a single string (see :ref:`scoring_string_names`); + - a callable (see :ref:`scoring_callable`) that returns a single value; + - `None`, the `estimator`'s + :ref:`default evaluation criterion ` is used. + + If `scoring` represents multiple scores, one can use: + + - a list or tuple of unique strings; + - a callable returning a dictionary where the keys are the metric + names and the values are the metric scores; + - a dictionary with metric names as keys and callables as values. + + See :ref:`multimetric_grid_search` for an example. + + If None, the estimator's score method is used. + + n_jobs : int, default=None + Number of jobs to run in parallel. 
+ ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + .. versionchanged:: v0.20 + `n_jobs` default changed from 1 to None + + refit : bool, str, or callable, default=True + Refit an estimator using the best found parameters on the whole + dataset. + + For multiple metric evaluation, this needs to be a `str` denoting the + scorer that would be used to find the best parameters for refitting + the estimator at the end. + + Where there are considerations other than maximum score in + choosing a best estimator, ``refit`` can be set to a function which + returns the selected ``best_index_`` given the ``cv_results_``. In that + case, the ``best_estimator_`` and ``best_params_`` will be set + according to the returned ``best_index_`` while the ``best_score_`` + attribute will not be available. + + The refitted estimator is made available at the ``best_estimator_`` + attribute and permits using ``predict`` directly on this + ``RandomizedSearchCV`` instance. + + Also for multiple metric evaluation, the attributes ``best_index_``, + ``best_score_`` and ``best_params_`` will only be available if + ``refit`` is set and all of them will be determined w.r.t this specific + scorer. + + See ``scoring`` parameter to know more about multiple metric + evaluation. + + See :ref:`this example + ` + for an example of how to use ``refit=callable`` to balance model + complexity and cross-validated score. + + .. versionchanged:: 0.20 + Support for callable added. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross validation, + - integer, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if the estimator is a classifier and ``y`` is + either binary or multiclass, :class:`StratifiedKFold` is used. In all + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + ``cv`` default value if None changed from 3-fold to 5-fold. + + verbose : int + Controls the verbosity: the higher, the more messages. + + - >1 : the computation time for each fold and parameter candidate is + displayed; + - >2 : the score is also displayed; + - >3 : the fold and candidate parameter indexes are also displayed + together with the starting time of the computation. + + pre_dispatch : int, or str, default='2*n_jobs' + Controls the number of jobs that get dispatched during parallel + execution. Reducing this number can be useful to avoid an + explosion of memory consumption when more jobs get dispatched + than CPUs can process. This parameter can be: + + - None, in which case all the jobs are immediately created and spawned. Use + this for lightweight and fast-running jobs, to avoid delays due to on-demand + spawning of the jobs + - An int, giving the exact number of total jobs that are spawned + - A str, giving an expression as a function of n_jobs, as in '2*n_jobs' + + random_state : int, RandomState instance or None, default=None + Pseudo random number generator state used for random uniform sampling + from lists of possible values instead of scipy.stats distributions. 
+ Pass an int for reproducible output across multiple + function calls. + See :term:`Glossary `. + + error_score : 'raise' or numeric, default=np.nan + Value to assign to the score if an error occurs in estimator fitting. + If set to 'raise', the error is raised. If a numeric value is given, + FitFailedWarning is raised. This parameter does not affect the refit + step, which will always raise the error. + + return_train_score : bool, default=False + If ``False``, the ``cv_results_`` attribute will not include training + scores. + Computing training scores is used to get insights on how different + parameter settings impact the overfitting/underfitting trade-off. + However computing the scores on the training set can be computationally + expensive and is not strictly required to select the parameters that + yield the best generalization performance. + + .. versionadded:: 0.19 + + .. versionchanged:: 0.21 + Default value was changed from ``True`` to ``False`` + + Attributes + ---------- + cv_results_ : dict of numpy (masked) ndarrays + A dict with keys as column headers and values as columns, that can be + imported into a pandas ``DataFrame``. + + For instance the below given table + + +--------------+-------------+-------------------+---+---------------+ + | param_kernel | param_gamma | split0_test_score |...|rank_test_score| + +==============+=============+===================+===+===============+ + | 'rbf' | 0.1 | 0.80 |...| 1 | + +--------------+-------------+-------------------+---+---------------+ + | 'rbf' | 0.2 | 0.84 |...| 3 | + +--------------+-------------+-------------------+---+---------------+ + | 'rbf' | 0.3 | 0.70 |...| 2 | + +--------------+-------------+-------------------+---+---------------+ + + will be represented by a ``cv_results_`` dict of:: + + { + 'param_kernel' : masked_array(data = ['rbf', 'rbf', 'rbf'], + mask = False), + 'param_gamma' : masked_array(data = [0.1 0.2 0.3], mask = False), + 'split0_test_score' : [0.80, 0.84, 0.70], + 'split1_test_score' : [0.82, 0.50, 0.70], + 'mean_test_score' : [0.81, 0.67, 0.70], + 'std_test_score' : [0.01, 0.24, 0.00], + 'rank_test_score' : [1, 3, 2], + 'split0_train_score' : [0.80, 0.92, 0.70], + 'split1_train_score' : [0.82, 0.55, 0.70], + 'mean_train_score' : [0.81, 0.74, 0.70], + 'std_train_score' : [0.01, 0.19, 0.00], + 'mean_fit_time' : [0.73, 0.63, 0.43], + 'std_fit_time' : [0.01, 0.02, 0.01], + 'mean_score_time' : [0.01, 0.06, 0.04], + 'std_score_time' : [0.00, 0.00, 0.00], + 'params' : [{'kernel' : 'rbf', 'gamma' : 0.1}, ...], + } + + NOTE + + The key ``'params'`` is used to store a list of parameter + settings dicts for all the parameter candidates. + + The ``mean_fit_time``, ``std_fit_time``, ``mean_score_time`` and + ``std_score_time`` are all in seconds. + + For multi-metric evaluation, the scores for all the scorers are + available in the ``cv_results_`` dict at the keys ending with that + scorer's name (``'_'``) instead of ``'_score'`` shown + above. ('split0_test_precision', 'mean_train_precision' etc.) + + best_estimator_ : estimator + Estimator that was chosen by the search, i.e. estimator + which gave highest score (or smallest loss if specified) + on the left out data. Not available if ``refit=False``. + + For multi-metric evaluation, this attribute is present only if + ``refit`` is specified. + + See ``refit`` parameter for more information on allowed values. + + best_score_ : float + Mean cross-validated score of the best_estimator. 
+ + For multi-metric evaluation, this is not available if ``refit`` is + ``False``. See ``refit`` parameter for more information. + + This attribute is not available if ``refit`` is a function. + + best_params_ : dict + Parameter setting that gave the best results on the hold out data. + + For multi-metric evaluation, this is not available if ``refit`` is + ``False``. See ``refit`` parameter for more information. + + best_index_ : int + The index (of the ``cv_results_`` arrays) which corresponds to the best + candidate parameter setting. + + The dict at ``search.cv_results_['params'][search.best_index_]`` gives + the parameter setting for the best model, that gives the highest + mean score (``search.best_score_``). + + For multi-metric evaluation, this is not available if ``refit`` is + ``False``. See ``refit`` parameter for more information. + + scorer_ : function or a dict + Scorer function used on the held out data to choose the best + parameters for the model. + + For multi-metric evaluation, this attribute holds the validated + ``scoring`` dict which maps the scorer key to the scorer callable. + + n_splits_ : int + The number of cross-validation splits (folds/iterations). + + refit_time_ : float + Seconds used for refitting the best model on the whole dataset. + + This is present only if ``refit`` is not False. + + .. versionadded:: 0.20 + + multimetric_ : bool + Whether or not the scorers compute several metrics. + + classes_ : ndarray of shape (n_classes,) + The classes labels. This is present only if ``refit`` is specified and + the underlying estimator is a classifier. + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if + `best_estimator_` is defined (see the documentation for the `refit` + parameter for more details) and that `best_estimator_` exposes + `n_features_in_` when fit. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Only defined if + `best_estimator_` is defined (see the documentation for the `refit` + parameter for more details) and that `best_estimator_` exposes + `feature_names_in_` when fit. + + .. versionadded:: 1.0 + + See Also + -------- + GridSearchCV : Does exhaustive search over a grid of parameters. + ParameterSampler : A generator over parameter settings, constructed from + param_distributions. + + Notes + ----- + The parameters selected are those that maximize the score of the held-out + data, according to the scoring parameter. + + If `n_jobs` was set to a value higher than one, the data is copied for each + parameter setting(and not `n_jobs` times). This is done for efficiency + reasons if individual jobs take very little time, but may raise errors if + the dataset is large and not enough memory is available. A workaround in + this case is to set `pre_dispatch`. Then, the memory is copied only + `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 * + n_jobs`. + + Examples + -------- + >>> from sklearn.datasets import load_iris + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.model_selection import RandomizedSearchCV + >>> from scipy.stats import uniform + >>> iris = load_iris() + >>> logistic = LogisticRegression(solver='saga', tol=1e-2, max_iter=200, + ... random_state=0) + >>> distributions = dict(C=uniform(loc=0, scale=4), + ... 
penalty=['l2', 'l1']) + >>> clf = RandomizedSearchCV(logistic, distributions, random_state=0) + >>> search = clf.fit(iris.data, iris.target) + >>> search.best_params_ + {'C': np.float64(2.195...), 'penalty': 'l1'} + """ + + _parameter_constraints: dict = { + **BaseSearchCV._parameter_constraints, + "param_distributions": [dict, list], + "n_iter": [Interval(numbers.Integral, 1, None, closed="left")], + "random_state": ["random_state"], + } + + def __init__( + self, + estimator, + param_distributions, + *, + n_iter=10, + scoring=None, + n_jobs=None, + refit=True, + cv=None, + verbose=0, + pre_dispatch="2*n_jobs", + random_state=None, + error_score=np.nan, + return_train_score=False, + ): + self.param_distributions = param_distributions + self.n_iter = n_iter + self.random_state = random_state + super().__init__( + estimator=estimator, + scoring=scoring, + n_jobs=n_jobs, + refit=refit, + cv=cv, + verbose=verbose, + pre_dispatch=pre_dispatch, + error_score=error_score, + return_train_score=return_train_score, + ) + + def _run_search(self, evaluate_candidates): + """Search n_iter candidates from param_distributions""" + evaluate_candidates( + ParameterSampler( + self.param_distributions, self.n_iter, random_state=self.random_state + ) + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/model_selection/_search_successive_halving.py b/.venv/lib/python3.12/site-packages/sklearn/model_selection/_search_successive_halving.py new file mode 100644 index 0000000000000000000000000000000000000000..bcd9a83e6dc4394c1ab75713a4373dd0709e90cf --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/model_selection/_search_successive_halving.py @@ -0,0 +1,1095 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from abc import abstractmethod +from math import ceil, floor, log +from numbers import Integral, Real + +import numpy as np + +from ..base import _fit_context, is_classifier +from ..metrics._scorer import get_scorer_names +from ..utils import resample +from ..utils._param_validation import Interval, StrOptions +from ..utils.multiclass import check_classification_targets +from ..utils.validation import _num_samples, validate_data +from . 
import ParameterGrid, ParameterSampler +from ._search import BaseSearchCV +from ._split import _yields_constant_splits, check_cv + +__all__ = ["HalvingGridSearchCV", "HalvingRandomSearchCV"] + + +class _SubsampleMetaSplitter: + """Splitter that subsamples a given fraction of the dataset""" + + def __init__(self, *, base_cv, fraction, subsample_test, random_state): + self.base_cv = base_cv + self.fraction = fraction + self.subsample_test = subsample_test + self.random_state = random_state + + def split(self, X, y, **kwargs): + for train_idx, test_idx in self.base_cv.split(X, y, **kwargs): + train_idx = resample( + train_idx, + replace=False, + random_state=self.random_state, + n_samples=int(self.fraction * len(train_idx)), + ) + if self.subsample_test: + test_idx = resample( + test_idx, + replace=False, + random_state=self.random_state, + n_samples=int(self.fraction * len(test_idx)), + ) + yield train_idx, test_idx + + +def _top_k(results, k, itr): + # Return the best candidates of a given iteration + iteration, mean_test_score, params = ( + np.asarray(a) + for a in (results["iter"], results["mean_test_score"], results["params"]) + ) + iter_indices = np.flatnonzero(iteration == itr) + scores = mean_test_score[iter_indices] + # argsort() places NaNs at the end of the array so we move NaNs to the + # front of the array so the last `k` items are the those with the + # highest scores. + sorted_indices = np.roll(np.argsort(scores), np.count_nonzero(np.isnan(scores))) + return np.array(params[iter_indices][sorted_indices[-k:]]) + + +class BaseSuccessiveHalving(BaseSearchCV): + """Implements successive halving. + + Ref: + Almost optimal exploration in multi-armed bandits, ICML 13 + Zohar Karnin, Tomer Koren, Oren Somekh + """ + + _parameter_constraints: dict = { + **BaseSearchCV._parameter_constraints, + # overwrite `scoring` since multi-metrics are not supported + "scoring": [StrOptions(set(get_scorer_names())), callable, None], + "random_state": ["random_state"], + "max_resources": [ + Interval(Integral, 0, None, closed="neither"), + StrOptions({"auto"}), + ], + "min_resources": [ + Interval(Integral, 0, None, closed="neither"), + StrOptions({"exhaust", "smallest"}), + ], + "resource": [str], + "factor": [Interval(Real, 0, None, closed="neither")], + "aggressive_elimination": ["boolean"], + } + _parameter_constraints.pop("pre_dispatch") # not used in this class + + def __init__( + self, + estimator, + *, + scoring=None, + n_jobs=None, + refit=True, + cv=5, + verbose=0, + random_state=None, + error_score=np.nan, + return_train_score=True, + max_resources="auto", + min_resources="exhaust", + resource="n_samples", + factor=3, + aggressive_elimination=False, + ): + super().__init__( + estimator, + scoring=scoring, + n_jobs=n_jobs, + refit=refit, + cv=cv, + verbose=verbose, + error_score=error_score, + return_train_score=return_train_score, + ) + + self.random_state = random_state + self.max_resources = max_resources + self.resource = resource + self.factor = factor + self.min_resources = min_resources + self.aggressive_elimination = aggressive_elimination + + def _check_input_parameters(self, X, y, split_params): + # We need to enforce that successive calls to cv.split() yield the same + # splits: see https://github.com/scikit-learn/scikit-learn/issues/15149 + if not _yields_constant_splits(self._checked_cv_orig): + raise ValueError( + "The cv parameter must yield consistent folds across " + "calls to split(). Set its random_state to an int, or set " + "shuffle=False." 
+ ) + + if ( + self.resource != "n_samples" + and self.resource not in self.estimator.get_params() + ): + raise ValueError( + f"Cannot use resource={self.resource} which is not supported " + f"by estimator {self.estimator.__class__.__name__}" + ) + + if isinstance(self, HalvingRandomSearchCV): + if self.min_resources == self.n_candidates == "exhaust": + # for n_candidates=exhaust to work, we need to know what + # min_resources is. Similarly min_resources=exhaust needs to + # know the actual number of candidates. + raise ValueError( + "n_candidates and min_resources cannot be both set to 'exhaust'." + ) + + self.min_resources_ = self.min_resources + if self.min_resources_ in ("smallest", "exhaust"): + if self.resource == "n_samples": + n_splits = self._checked_cv_orig.get_n_splits(X, y, **split_params) + # please see https://gph.is/1KjihQe for a justification + magic_factor = 2 + self.min_resources_ = n_splits * magic_factor + if is_classifier(self.estimator): + y = validate_data(self, X="no_validation", y=y) + check_classification_targets(y) + n_classes = np.unique(y).shape[0] + self.min_resources_ *= n_classes + else: + self.min_resources_ = 1 + # if 'exhaust', min_resources_ might be set to a higher value later + # in _run_search + + self.max_resources_ = self.max_resources + if self.max_resources_ == "auto": + if not self.resource == "n_samples": + raise ValueError( + "resource can only be 'n_samples' when max_resources='auto'" + ) + self.max_resources_ = _num_samples(X) + + if self.min_resources_ > self.max_resources_: + raise ValueError( + f"min_resources_={self.min_resources_} is greater " + f"than max_resources_={self.max_resources_}." + ) + + if self.min_resources_ == 0: + raise ValueError( + f"min_resources_={self.min_resources_}: you might have passed " + "an empty dataset X." + ) + + @staticmethod + def _select_best_index(refit, refit_metric, results): + """Custom refit callable to return the index of the best candidate. + + We want the best candidate out of the last iteration. By default + BaseSearchCV would return the best candidate out of all iterations. + + Currently, we only support for a single metric thus `refit` and + `refit_metric` are not required. + """ + last_iter = np.max(results["iter"]) + last_iter_indices = np.flatnonzero(results["iter"] == last_iter) + + test_scores = results["mean_test_score"][last_iter_indices] + # If all scores are NaNs there is no way to pick between them, + # so we (arbitrarily) declare the zero'th entry the best one + if np.isnan(test_scores).all(): + best_idx = 0 + else: + best_idx = np.nanargmax(test_scores) + + return last_iter_indices[best_idx] + + @_fit_context( + # Halving*SearchCV.estimator is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y=None, **params): + """Run fit with all sets of parameters. + + Parameters + ---------- + + X : array-like, shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like, shape (n_samples,) or (n_samples, n_output), optional + Target relative to X for classification or regression; + None for unsupervised learning. + + **params : dict of string -> object + Parameters passed to the ``fit`` method of the estimator. + + Returns + ------- + self : object + Instance of fitted estimator. 
+ """ + self._checked_cv_orig = check_cv( + self.cv, y, classifier=is_classifier(self.estimator) + ) + + routed_params = self._get_routed_params_for_fit(params) + self._check_input_parameters( + X=X, y=y, split_params=routed_params.splitter.split + ) + + self._n_samples_orig = _num_samples(X) + + super().fit(X, y=y, **params) + + # Set best_score_: BaseSearchCV does not set it, as refit is a callable + self.best_score_ = self.cv_results_["mean_test_score"][self.best_index_] + + return self + + def _run_search(self, evaluate_candidates): + candidate_params = self._generate_candidate_params() + + if self.resource != "n_samples" and any( + self.resource in candidate for candidate in candidate_params + ): + # Can only check this now since we need the candidates list + raise ValueError( + f"Cannot use parameter {self.resource} as the resource since " + "it is part of the searched parameters." + ) + + # n_required_iterations is the number of iterations needed so that the + # last iterations evaluates less than `factor` candidates. + n_required_iterations = 1 + floor(log(len(candidate_params), self.factor)) + + if self.min_resources == "exhaust": + # To exhaust the resources, we want to start with the biggest + # min_resources possible so that the last (required) iteration + # uses as many resources as possible + last_iteration = n_required_iterations - 1 + self.min_resources_ = max( + self.min_resources_, + self.max_resources_ // self.factor**last_iteration, + ) + + # n_possible_iterations is the number of iterations that we can + # actually do starting from min_resources and without exceeding + # max_resources. Depending on max_resources and the number of + # candidates, this may be higher or smaller than + # n_required_iterations. + n_possible_iterations = 1 + floor( + log(self.max_resources_ // self.min_resources_, self.factor) + ) + + if self.aggressive_elimination: + n_iterations = n_required_iterations + else: + n_iterations = min(n_possible_iterations, n_required_iterations) + + if self.verbose: + print(f"n_iterations: {n_iterations}") + print(f"n_required_iterations: {n_required_iterations}") + print(f"n_possible_iterations: {n_possible_iterations}") + print(f"min_resources_: {self.min_resources_}") + print(f"max_resources_: {self.max_resources_}") + print(f"aggressive_elimination: {self.aggressive_elimination}") + print(f"factor: {self.factor}") + + self.n_resources_ = [] + self.n_candidates_ = [] + + for itr in range(n_iterations): + power = itr # default + if self.aggressive_elimination: + # this will set n_resources to the initial value (i.e. the + # value of n_resources at the first iteration) for as many + # iterations as needed (while candidates are being + # eliminated), and then go on as usual. 
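+ #
+ # Illustrative worked example (comment added for clarity, not part of
+ # upstream scikit-learn): assume factor=3, 20 candidates, and
+ # n_required_iterations = 1 + floor(log(20, 3)) = 3, but only
+ # n_possible_iterations = 2 because max_resources_ is small. With
+ # aggressive_elimination, iterations 0 and 1 both get
+ # power = max(0, itr - 3 + 2) = 0, i.e. they re-run at min_resources_
+ # while the candidate pool shrinks 20 -> 7 -> 3; the final iteration
+ # gets power = 1 and evaluates the remaining 3 (<= factor) candidates
+ # with factor * min_resources_ resources, staying within max_resources_.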
+ power = max(0, itr - n_required_iterations + n_possible_iterations) + + n_resources = int(self.factor**power * self.min_resources_) + # guard, probably not needed + n_resources = min(n_resources, self.max_resources_) + self.n_resources_.append(n_resources) + + n_candidates = len(candidate_params) + self.n_candidates_.append(n_candidates) + + if self.verbose: + print("-" * 10) + print(f"iter: {itr}") + print(f"n_candidates: {n_candidates}") + print(f"n_resources: {n_resources}") + + if self.resource == "n_samples": + # subsampling will be done in cv.split() + cv = _SubsampleMetaSplitter( + base_cv=self._checked_cv_orig, + fraction=n_resources / self._n_samples_orig, + subsample_test=True, + random_state=self.random_state, + ) + + else: + # Need copy so that the n_resources of next iteration does + # not overwrite + candidate_params = [c.copy() for c in candidate_params] + for candidate in candidate_params: + candidate[self.resource] = n_resources + cv = self._checked_cv_orig + + more_results = { + "iter": [itr] * n_candidates, + "n_resources": [n_resources] * n_candidates, + } + + results = evaluate_candidates( + candidate_params, cv, more_results=more_results + ) + + n_candidates_to_keep = ceil(n_candidates / self.factor) + candidate_params = _top_k(results, n_candidates_to_keep, itr) + + self.n_remaining_candidates_ = len(candidate_params) + self.n_required_iterations_ = n_required_iterations + self.n_possible_iterations_ = n_possible_iterations + self.n_iterations_ = n_iterations + + @abstractmethod + def _generate_candidate_params(self): + pass + + +class HalvingGridSearchCV(BaseSuccessiveHalving): + """Search over specified parameter values with successive halving. + + The search strategy starts evaluating all the candidates with a small + amount of resources and iteratively selects the best candidates, using + more and more resources. + + Read more in the :ref:`User guide `. + + .. note:: + + This estimator is still **experimental** for now: the predictions + and the API might change without any deprecation cycle. To use it, + you need to explicitly import ``enable_halving_search_cv``:: + + >>> # explicitly require this experimental feature + >>> from sklearn.experimental import enable_halving_search_cv # noqa + >>> # now you can import normally from model_selection + >>> from sklearn.model_selection import HalvingGridSearchCV + + Parameters + ---------- + estimator : estimator object + This is assumed to implement the scikit-learn estimator interface. + Either estimator needs to provide a ``score`` function, + or ``scoring`` must be passed. + + param_grid : dict or list of dictionaries + Dictionary with parameters names (string) as keys and lists of + parameter settings to try as values, or a list of such + dictionaries, in which case the grids spanned by each dictionary + in the list are explored. This enables searching over any sequence + of parameter settings. + + factor : int or float, default=3 + The 'halving' parameter, which determines the proportion of candidates + that are selected for each subsequent iteration. For example, + ``factor=3`` means that only one third of the candidates are selected. + + resource : ``'n_samples'`` or str, default='n_samples' + Defines the resource that increases with each iteration. By default, + the resource is the number of samples. It can also be set to any + parameter of the base estimator that accepts positive integer + values, e.g. 'n_iterations' or 'n_estimators' for a gradient + boosting estimator. 
In this case ``max_resources`` cannot be 'auto' + and must be set explicitly. + + max_resources : int, default='auto' + The maximum amount of resource that any candidate is allowed to use + for a given iteration. By default, this is set to ``n_samples`` when + ``resource='n_samples'`` (default), else an error is raised. + + min_resources : {'exhaust', 'smallest'} or int, default='exhaust' + The minimum amount of resource that any candidate is allowed to use + for a given iteration. Equivalently, this defines the amount of + resources `r0` that are allocated for each candidate at the first + iteration. + + - 'smallest' is a heuristic that sets `r0` to a small value: + + - ``n_splits * 2`` when ``resource='n_samples'`` for a regression problem + - ``n_classes * n_splits * 2`` when ``resource='n_samples'`` for a + classification problem + - ``1`` when ``resource != 'n_samples'`` + + - 'exhaust' will set `r0` such that the **last** iteration uses as + much resources as possible. Namely, the last iteration will use the + highest value smaller than ``max_resources`` that is a multiple of + both ``min_resources`` and ``factor``. In general, using 'exhaust' + leads to a more accurate estimator, but is slightly more time + consuming. + + Note that the amount of resources used at each iteration is always a + multiple of ``min_resources``. + + aggressive_elimination : bool, default=False + This is only relevant in cases where there isn't enough resources to + reduce the remaining candidates to at most `factor` after the last + iteration. If ``True``, then the search process will 'replay' the + first iteration for as long as needed until the number of candidates + is small enough. This is ``False`` by default, which means that the + last iteration may evaluate more than ``factor`` candidates. See + :ref:`aggressive_elimination` for more details. + + cv : int, cross-validation generator or iterable, default=5 + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - integer, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if the estimator is a classifier and ``y`` is + either binary or multiclass, :class:`StratifiedKFold` is used. In all + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. note:: + Due to implementation details, the folds produced by `cv` must be + the same across multiple calls to `cv.split()`. For + built-in `scikit-learn` iterators, this can be achieved by + deactivating shuffling (`shuffle=False`), or by setting the + `cv`'s `random_state` parameter to an integer. + + scoring : str or callable, default=None + Scoring method to use to evaluate the predictions on the test set. + + - str: see :ref:`scoring_string_names` for options. + - callable: a scorer callable object (e.g., function) with signature + ``scorer(estimator, X, y)``. See :ref:`scoring_callable` for details. + - `None`: the `estimator`'s + :ref:`default evaluation criterion ` is used. + + refit : bool or callable, default=True + Refit an estimator using the best found parameters on the whole + dataset. 
+ + Where there are considerations other than maximum score in + choosing a best estimator, ``refit`` can be set to a function which + returns the selected ``best_index_`` given ``cv_results_``. In that + case, the ``best_estimator_`` and ``best_params_`` will be set + according to the returned ``best_index_`` while the ``best_score_`` + attribute will not be available. + + The refitted estimator is made available at the ``best_estimator_`` + attribute and permits using ``predict`` directly on this + ``HalvingGridSearchCV`` instance. + + See :ref:`this example + ` + for an example of how to use ``refit=callable`` to balance model + complexity and cross-validated score. + + error_score : 'raise' or numeric + Value to assign to the score if an error occurs in estimator fitting. + If set to 'raise', the error is raised. If a numeric value is given, + FitFailedWarning is raised. This parameter does not affect the refit + step, which will always raise the error. Default is ``np.nan``. + + return_train_score : bool, default=False + If ``False``, the ``cv_results_`` attribute will not include training + scores. + Computing training scores is used to get insights on how different + parameter settings impact the overfitting/underfitting trade-off. + However computing the scores on the training set can be computationally + expensive and is not strictly required to select the parameters that + yield the best generalization performance. + + random_state : int, RandomState instance or None, default=None + Pseudo random number generator state used for subsampling the dataset + when `resources != 'n_samples'`. Ignored otherwise. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + n_jobs : int or None, default=None + Number of jobs to run in parallel. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + verbose : int + Controls the verbosity: the higher, the more messages. + + Attributes + ---------- + n_resources_ : list of int + The amount of resources used at each iteration. + + n_candidates_ : list of int + The number of candidate parameters that were evaluated at each + iteration. + + n_remaining_candidates_ : int + The number of candidate parameters that are left after the last + iteration. It corresponds to `ceil(n_candidates[-1] / factor)` + + max_resources_ : int + The maximum number of resources that any candidate is allowed to use + for a given iteration. Note that since the number of resources used + at each iteration must be a multiple of ``min_resources_``, the + actual number of resources used at the last iteration may be smaller + than ``max_resources_``. + + min_resources_ : int + The amount of resources that are allocated for each candidate at the + first iteration. + + n_iterations_ : int + The actual number of iterations that were run. This is equal to + ``n_required_iterations_`` if ``aggressive_elimination`` is ``True``. + Else, this is equal to ``min(n_possible_iterations_, + n_required_iterations_)``. + + n_possible_iterations_ : int + The number of iterations that are possible starting with + ``min_resources_`` resources and without exceeding + ``max_resources_``. + + n_required_iterations_ : int + The number of iterations that are required to end up with less than + ``factor`` candidates at the last iteration, starting with + ``min_resources_`` resources. 
This will be smaller than + ``n_possible_iterations_`` when there isn't enough resources. + + cv_results_ : dict of numpy (masked) ndarrays + A dict with keys as column headers and values as columns, that can be + imported into a pandas ``DataFrame``. It contains lots of information + for analysing the results of a search. + Please refer to the :ref:`User guide` + for details. + + best_estimator_ : estimator or dict + Estimator that was chosen by the search, i.e. estimator + which gave highest score (or smallest loss if specified) + on the left out data. Not available if ``refit=False``. + + best_score_ : float + Mean cross-validated score of the best_estimator. + + best_params_ : dict + Parameter setting that gave the best results on the hold out data. + + best_index_ : int + The index (of the ``cv_results_`` arrays) which corresponds to the best + candidate parameter setting. + + The dict at ``search.cv_results_['params'][search.best_index_]`` gives + the parameter setting for the best model, that gives the highest + mean score (``search.best_score_``). + + scorer_ : function or a dict + Scorer function used on the held out data to choose the best + parameters for the model. + + n_splits_ : int + The number of cross-validation splits (folds/iterations). + + refit_time_ : float + Seconds used for refitting the best model on the whole dataset. + + This is present only if ``refit`` is not False. + + multimetric_ : bool + Whether or not the scorers compute several metrics. + + classes_ : ndarray of shape (n_classes,) + The classes labels. This is present only if ``refit`` is specified and + the underlying estimator is a classifier. + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if + `best_estimator_` is defined (see the documentation for the `refit` + parameter for more details) and that `best_estimator_` exposes + `n_features_in_` when fit. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Only defined if + `best_estimator_` is defined (see the documentation for the `refit` + parameter for more details) and that `best_estimator_` exposes + `feature_names_in_` when fit. + + .. versionadded:: 1.0 + + See Also + -------- + :class:`HalvingRandomSearchCV`: + Random search over a set of parameters using successive halving. + + Notes + ----- + The parameters selected are those that maximize the score of the held-out + data, according to the scoring parameter. + + All parameter combinations scored with a NaN will share the lowest rank. + + Examples + -------- + + >>> from sklearn.datasets import load_iris + >>> from sklearn.ensemble import RandomForestClassifier + >>> from sklearn.experimental import enable_halving_search_cv # noqa + >>> from sklearn.model_selection import HalvingGridSearchCV + ... + >>> X, y = load_iris(return_X_y=True) + >>> clf = RandomForestClassifier(random_state=0) + ... + >>> param_grid = {"max_depth": [3, None], + ... "min_samples_split": [5, 10]} + >>> search = HalvingGridSearchCV(clf, param_grid, resource='n_estimators', + ... max_resources=10, + ... 
random_state=0).fit(X, y) + >>> search.best_params_ # doctest: +SKIP + {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 9} + """ + + _parameter_constraints: dict = { + **BaseSuccessiveHalving._parameter_constraints, + "param_grid": [dict, list], + } + + def __init__( + self, + estimator, + param_grid, + *, + factor=3, + resource="n_samples", + max_resources="auto", + min_resources="exhaust", + aggressive_elimination=False, + cv=5, + scoring=None, + refit=True, + error_score=np.nan, + return_train_score=True, + random_state=None, + n_jobs=None, + verbose=0, + ): + super().__init__( + estimator, + scoring=scoring, + n_jobs=n_jobs, + refit=refit, + verbose=verbose, + cv=cv, + random_state=random_state, + error_score=error_score, + return_train_score=return_train_score, + max_resources=max_resources, + resource=resource, + factor=factor, + min_resources=min_resources, + aggressive_elimination=aggressive_elimination, + ) + self.param_grid = param_grid + + def _generate_candidate_params(self): + return ParameterGrid(self.param_grid) + + +class HalvingRandomSearchCV(BaseSuccessiveHalving): + """Randomized search on hyper parameters. + + The search strategy starts evaluating all the candidates with a small + amount of resources and iteratively selects the best candidates, using more + and more resources. + + The candidates are sampled at random from the parameter space and the + number of sampled candidates is determined by ``n_candidates``. + + Read more in the :ref:`User guide`. + + .. note:: + + This estimator is still **experimental** for now: the predictions + and the API might change without any deprecation cycle. To use it, + you need to explicitly import ``enable_halving_search_cv``:: + + >>> # explicitly require this experimental feature + >>> from sklearn.experimental import enable_halving_search_cv # noqa + >>> # now you can import normally from model_selection + >>> from sklearn.model_selection import HalvingRandomSearchCV + + Parameters + ---------- + estimator : estimator object + This is assumed to implement the scikit-learn estimator interface. + Either estimator needs to provide a ``score`` function, + or ``scoring`` must be passed. + + param_distributions : dict or list of dicts + Dictionary with parameters names (`str`) as keys and distributions + or lists of parameters to try. Distributions must provide a ``rvs`` + method for sampling (such as those from scipy.stats.distributions). + If a list is given, it is sampled uniformly. + If a list of dicts is given, first a dict is sampled uniformly, and + then a parameter is sampled using that dict as above. + + n_candidates : "exhaust" or int, default="exhaust" + The number of candidate parameters to sample, at the first + iteration. Using 'exhaust' will sample enough candidates so that the + last iteration uses as many resources as possible, based on + `min_resources`, `max_resources` and `factor`. In this case, + `min_resources` cannot be 'exhaust'. + + factor : int or float, default=3 + The 'halving' parameter, which determines the proportion of candidates + that are selected for each subsequent iteration. For example, + ``factor=3`` means that only one third of the candidates are selected. + + resource : ``'n_samples'`` or str, default='n_samples' + Defines the resource that increases with each iteration. By default, + the resource is the number of samples. It can also be set to any + parameter of the base estimator that accepts positive integer + values, e.g. 
'n_iterations' or 'n_estimators' for a gradient + boosting estimator. In this case ``max_resources`` cannot be 'auto' + and must be set explicitly. + + max_resources : int, default='auto' + The maximum number of resources that any candidate is allowed to use + for a given iteration. By default, this is set ``n_samples`` when + ``resource='n_samples'`` (default), else an error is raised. + + min_resources : {'exhaust', 'smallest'} or int, default='smallest' + The minimum amount of resource that any candidate is allowed to use + for a given iteration. Equivalently, this defines the amount of + resources `r0` that are allocated for each candidate at the first + iteration. + + - 'smallest' is a heuristic that sets `r0` to a small value: + + - ``n_splits * 2`` when ``resource='n_samples'`` for a regression problem + - ``n_classes * n_splits * 2`` when ``resource='n_samples'`` for a + classification problem + - ``1`` when ``resource != 'n_samples'`` + + - 'exhaust' will set `r0` such that the **last** iteration uses as + much resources as possible. Namely, the last iteration will use the + highest value smaller than ``max_resources`` that is a multiple of + both ``min_resources`` and ``factor``. In general, using 'exhaust' + leads to a more accurate estimator, but is slightly more time + consuming. 'exhaust' isn't available when `n_candidates='exhaust'`. + + Note that the amount of resources used at each iteration is always a + multiple of ``min_resources``. + + aggressive_elimination : bool, default=False + This is only relevant in cases where there isn't enough resources to + reduce the remaining candidates to at most `factor` after the last + iteration. If ``True``, then the search process will 'replay' the + first iteration for as long as needed until the number of candidates + is small enough. This is ``False`` by default, which means that the + last iteration may evaluate more than ``factor`` candidates. See + :ref:`aggressive_elimination` for more details. + + cv : int, cross-validation generator or an iterable, default=5 + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - integer, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if the estimator is a classifier and ``y`` is + either binary or multiclass, :class:`StratifiedKFold` is used. In all + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. note:: + Due to implementation details, the folds produced by `cv` must be + the same across multiple calls to `cv.split()`. For + built-in `scikit-learn` iterators, this can be achieved by + deactivating shuffling (`shuffle=False`), or by setting the + `cv`'s `random_state` parameter to an integer. + + scoring : str or callable, default=None + Scoring method to use to evaluate the predictions on the test set. + + - str: see :ref:`scoring_string_names` for options. + - callable: a scorer callable object (e.g., function) with signature + ``scorer(estimator, X, y)``. See :ref:`scoring_callable` for details. + - `None`: the `estimator`'s + :ref:`default evaluation criterion ` is used. + + refit : bool or callable, default=True + Refit an estimator using the best found parameters on the whole + dataset. 
+ + Where there are considerations other than maximum score in + choosing a best estimator, ``refit`` can be set to a function which + returns the selected ``best_index_`` given ``cv_results_``. In that + case, the ``best_estimator_`` and ``best_params_`` will be set + according to the returned ``best_index_`` while the ``best_score_`` + attribute will not be available. + + The refitted estimator is made available at the ``best_estimator_`` + attribute and permits using ``predict`` directly on this + ``HalvingRandomSearchCV`` instance. + + See :ref:`this example + ` + for an example of how to use ``refit=callable`` to balance model + complexity and cross-validated score. + + error_score : 'raise' or numeric + Value to assign to the score if an error occurs in estimator fitting. + If set to 'raise', the error is raised. If a numeric value is given, + FitFailedWarning is raised. This parameter does not affect the refit + step, which will always raise the error. Default is ``np.nan``. + + return_train_score : bool, default=False + If ``False``, the ``cv_results_`` attribute will not include training + scores. + Computing training scores is used to get insights on how different + parameter settings impact the overfitting/underfitting trade-off. + However computing the scores on the training set can be computationally + expensive and is not strictly required to select the parameters that + yield the best generalization performance. + + random_state : int, RandomState instance or None, default=None + Pseudo random number generator state used for subsampling the dataset + when `resources != 'n_samples'`. Also used for random uniform + sampling from lists of possible values instead of scipy.stats + distributions. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + n_jobs : int or None, default=None + Number of jobs to run in parallel. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + verbose : int + Controls the verbosity: the higher, the more messages. + + Attributes + ---------- + n_resources_ : list of int + The amount of resources used at each iteration. + + n_candidates_ : list of int + The number of candidate parameters that were evaluated at each + iteration. + + n_remaining_candidates_ : int + The number of candidate parameters that are left after the last + iteration. It corresponds to `ceil(n_candidates[-1] / factor)` + + max_resources_ : int + The maximum number of resources that any candidate is allowed to use + for a given iteration. Note that since the number of resources used at + each iteration must be a multiple of ``min_resources_``, the actual + number of resources used at the last iteration may be smaller than + ``max_resources_``. + + min_resources_ : int + The amount of resources that are allocated for each candidate at the + first iteration. + + n_iterations_ : int + The actual number of iterations that were run. This is equal to + ``n_required_iterations_`` if ``aggressive_elimination`` is ``True``. + Else, this is equal to ``min(n_possible_iterations_, + n_required_iterations_)``. + + n_possible_iterations_ : int + The number of iterations that are possible starting with + ``min_resources_`` resources and without exceeding + ``max_resources_``. 
+ + n_required_iterations_ : int + The number of iterations that are required to end up with less than + ``factor`` candidates at the last iteration, starting with + ``min_resources_`` resources. This will be smaller than + ``n_possible_iterations_`` when there isn't enough resources. + + cv_results_ : dict of numpy (masked) ndarrays + A dict with keys as column headers and values as columns, that can be + imported into a pandas ``DataFrame``. It contains lots of information + for analysing the results of a search. + Please refer to the :ref:`User guide` + for details. + + best_estimator_ : estimator or dict + Estimator that was chosen by the search, i.e. estimator + which gave highest score (or smallest loss if specified) + on the left out data. Not available if ``refit=False``. + + best_score_ : float + Mean cross-validated score of the best_estimator. + + best_params_ : dict + Parameter setting that gave the best results on the hold out data. + + best_index_ : int + The index (of the ``cv_results_`` arrays) which corresponds to the best + candidate parameter setting. + + The dict at ``search.cv_results_['params'][search.best_index_]`` gives + the parameter setting for the best model, that gives the highest + mean score (``search.best_score_``). + + scorer_ : function or a dict + Scorer function used on the held out data to choose the best + parameters for the model. + + n_splits_ : int + The number of cross-validation splits (folds/iterations). + + refit_time_ : float + Seconds used for refitting the best model on the whole dataset. + + This is present only if ``refit`` is not False. + + multimetric_ : bool + Whether or not the scorers compute several metrics. + + classes_ : ndarray of shape (n_classes,) + The classes labels. This is present only if ``refit`` is specified and + the underlying estimator is a classifier. + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if + `best_estimator_` is defined (see the documentation for the `refit` + parameter for more details) and that `best_estimator_` exposes + `n_features_in_` when fit. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Only defined if + `best_estimator_` is defined (see the documentation for the `refit` + parameter for more details) and that `best_estimator_` exposes + `feature_names_in_` when fit. + + .. versionadded:: 1.0 + + See Also + -------- + :class:`HalvingGridSearchCV`: + Search over a grid of parameters using successive halving. + + Notes + ----- + The parameters selected are those that maximize the score of the held-out + data, according to the scoring parameter. + + All parameter combinations scored with a NaN will share the lowest rank. + + Examples + -------- + + >>> from sklearn.datasets import load_iris + >>> from sklearn.ensemble import RandomForestClassifier + >>> from sklearn.experimental import enable_halving_search_cv # noqa + >>> from sklearn.model_selection import HalvingRandomSearchCV + >>> from scipy.stats import randint + >>> import numpy as np + ... + >>> X, y = load_iris(return_X_y=True) + >>> clf = RandomForestClassifier(random_state=0) + >>> np.random.seed(0) + ... + >>> param_distributions = {"max_depth": [3, None], + ... "min_samples_split": randint(2, 11)} + >>> search = HalvingRandomSearchCV(clf, param_distributions, + ... resource='n_estimators', + ... max_resources=10, + ... 
random_state=0).fit(X, y) + >>> search.best_params_ # doctest: +SKIP + {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 9} + """ + + _parameter_constraints: dict = { + **BaseSuccessiveHalving._parameter_constraints, + "param_distributions": [dict, list], + "n_candidates": [ + Interval(Integral, 0, None, closed="neither"), + StrOptions({"exhaust"}), + ], + } + + def __init__( + self, + estimator, + param_distributions, + *, + n_candidates="exhaust", + factor=3, + resource="n_samples", + max_resources="auto", + min_resources="smallest", + aggressive_elimination=False, + cv=5, + scoring=None, + refit=True, + error_score=np.nan, + return_train_score=True, + random_state=None, + n_jobs=None, + verbose=0, + ): + super().__init__( + estimator, + scoring=scoring, + n_jobs=n_jobs, + refit=refit, + verbose=verbose, + cv=cv, + random_state=random_state, + error_score=error_score, + return_train_score=return_train_score, + max_resources=max_resources, + resource=resource, + factor=factor, + min_resources=min_resources, + aggressive_elimination=aggressive_elimination, + ) + self.param_distributions = param_distributions + self.n_candidates = n_candidates + + def _generate_candidate_params(self): + n_candidates_first_iter = self.n_candidates + if n_candidates_first_iter == "exhaust": + # This will generate enough candidate so that the last iteration + # uses as much resources as possible + n_candidates_first_iter = self.max_resources_ // self.min_resources_ + return ParameterSampler( + self.param_distributions, + n_candidates_first_iter, + random_state=self.random_state, + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/model_selection/_split.py b/.venv/lib/python3.12/site-packages/sklearn/model_selection/_split.py new file mode 100644 index 0000000000000000000000000000000000000000..640b7f6eee2f02c0f7f22d89b8d9523d36ddc27f --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/model_selection/_split.py @@ -0,0 +1,3055 @@ +""" +The :mod:`sklearn.model_selection._split` module includes classes and +functions to split the data based on a preset strategy. 
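# Illustrative usage sketch (toy setup assumed): after fitting, the bookkeeping
# attributes documented above show how many candidates were kept and how much
# of the budgeted resource each halving iteration consumed.
from scipy.stats import randint
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingRandomSearchCV

X, y = load_iris(return_X_y=True)
param_distributions = {"max_depth": [3, None], "min_samples_split": randint(2, 11)}
search = HalvingRandomSearchCV(
    RandomForestClassifier(random_state=0),
    param_distributions,
    resource="n_estimators",
    max_resources=30,          # must be explicit when resource != 'n_samples'
    random_state=0,
).fit(X, y)

print(search.n_iterations_)    # number of halving iterations that actually ran
print(search.n_candidates_)    # candidates evaluated at each iteration
print(search.n_resources_)     # resource budget used at each iteration
print(search.best_params_)     # includes the tuned 'n_estimators' value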
+""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numbers +import warnings +from abc import ABCMeta, abstractmethod +from collections import defaultdict +from collections.abc import Iterable +from inspect import signature +from itertools import chain, combinations +from math import ceil, floor + +import numpy as np +from scipy.special import comb + +from ..utils import ( + _safe_indexing, + check_random_state, + indexable, + metadata_routing, +) +from ..utils._array_api import ( + _convert_to_numpy, + ensure_common_namespace_device, + get_namespace, +) +from ..utils._param_validation import Interval, RealNotInt, validate_params +from ..utils.extmath import _approximate_mode +from ..utils.metadata_routing import _MetadataRequester +from ..utils.multiclass import type_of_target +from ..utils.validation import _num_samples, check_array, column_or_1d + +__all__ = [ + "BaseCrossValidator", + "GroupKFold", + "GroupShuffleSplit", + "KFold", + "LeaveOneGroupOut", + "LeaveOneOut", + "LeavePGroupsOut", + "LeavePOut", + "PredefinedSplit", + "RepeatedKFold", + "RepeatedStratifiedKFold", + "ShuffleSplit", + "StratifiedGroupKFold", + "StratifiedKFold", + "StratifiedShuffleSplit", + "check_cv", + "train_test_split", +] + + +class _UnsupportedGroupCVMixin: + """Mixin for splitters that do not support Groups.""" + + def split(self, X, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,) + The target variable for supervised learning problems. + + groups : object + Always ignored, exists for compatibility. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + if groups is not None: + warnings.warn( + f"The groups parameter is ignored by {self.__class__.__name__}", + UserWarning, + ) + return super().split(X, y, groups=groups) + + +class GroupsConsumerMixin(_MetadataRequester): + """A Mixin to ``groups`` by default. + + This Mixin makes the object to request ``groups`` by default as ``True``. + + .. versionadded:: 1.3 + """ + + __metadata_request__split = {"groups": True} + + +class BaseCrossValidator(_MetadataRequester, metaclass=ABCMeta): + """Base class for all cross-validators. + + Implementations must define `_iter_test_masks` or `_iter_test_indices`. + """ + + # This indicates that by default CV splitters don't have a "groups" kwarg, + # unless indicated by inheriting from ``GroupsConsumerMixin``. + # This also prevents ``set_split_request`` to be generated for splitters + # which don't support ``groups``. + __metadata_request__split = {"groups": metadata_routing.UNUSED} + + def split(self, X, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,) + The target variable for supervised learning problems. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. + + Yields + ------ + train : ndarray + The training set indices for that split. 
+ + test : ndarray + The testing set indices for that split. + """ + X, y, groups = indexable(X, y, groups) + indices = np.arange(_num_samples(X)) + for test_index in self._iter_test_masks(X, y, groups): + train_index = indices[np.logical_not(test_index)] + test_index = indices[test_index] + yield train_index, test_index + + # Since subclasses must implement either _iter_test_masks or + # _iter_test_indices, neither can be abstract. + def _iter_test_masks(self, X=None, y=None, groups=None): + """Generates boolean masks corresponding to test sets. + + By default, delegates to _iter_test_indices(X, y, groups) + """ + for test_index in self._iter_test_indices(X, y, groups): + test_mask = np.zeros(_num_samples(X), dtype=bool) + test_mask[test_index] = True + yield test_mask + + def _iter_test_indices(self, X=None, y=None, groups=None): + """Generates integer indices corresponding to test sets.""" + raise NotImplementedError + + @abstractmethod + def get_n_splits(self, X=None, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator.""" + + def __repr__(self): + return _build_repr(self) + + +class LeaveOneOut(_UnsupportedGroupCVMixin, BaseCrossValidator): + """Leave-One-Out cross-validator. + + Provides train/test indices to split data in train/test sets. Each + sample is used once as a test set (singleton) while the remaining + samples form the training set. + + Note: ``LeaveOneOut()`` is equivalent to ``KFold(n_splits=n)`` and + ``LeavePOut(p=1)`` where ``n`` is the number of samples. + + Due to the high number of test sets (which is the same as the + number of samples) this cross-validation method can be very costly. + For large datasets one should favor :class:`KFold`, :class:`ShuffleSplit` + or :class:`StratifiedKFold`. + + Read more in the :ref:`User Guide `. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import LeaveOneOut + >>> X = np.array([[1, 2], [3, 4]]) + >>> y = np.array([1, 2]) + >>> loo = LeaveOneOut() + >>> loo.get_n_splits(X) + 2 + >>> print(loo) + LeaveOneOut() + >>> for i, (train_index, test_index) in enumerate(loo.split(X)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... print(f" Test: index={test_index}") + Fold 0: + Train: index=[1] + Test: index=[0] + Fold 1: + Train: index=[0] + Test: index=[1] + + See Also + -------- + LeaveOneGroupOut : For splitting the data according to explicit, + domain-specific stratification of the dataset. + GroupKFold : K-fold iterator variant with non-overlapping groups. + """ + + def _iter_test_indices(self, X, y=None, groups=None): + n_samples = _num_samples(X) + if n_samples <= 1: + raise ValueError( + "Cannot perform LeaveOneOut with n_samples={}.".format(n_samples) + ) + return range(n_samples) + + def get_n_splits(self, X, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : object + Always ignored, exists for compatibility. + + groups : object + Always ignored, exists for compatibility. + + Returns + ------- + n_splits : int + Returns the number of splitting iterations in the cross-validator. + """ + if X is None: + raise ValueError("The 'X' parameter should not be None.") + return _num_samples(X) + + +class LeavePOut(_UnsupportedGroupCVMixin, BaseCrossValidator): + """Leave-P-Out cross-validator. 
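# Illustrative sketch (iris assumed as toy data): LeaveOneOut yields one split
# per sample, so the number of model fits equals n_samples, which is why the
# docstring above recommends KFold or ShuffleSplit for large datasets.
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import LeaveOneOut, cross_val_score

X, y = load_iris(return_X_y=True)
loo = LeaveOneOut()
print(loo.get_n_splits(X))     # 150, one fit per sample
scores = cross_val_score(LogisticRegression(max_iter=1000), X, y, cv=loo)
print(scores.mean())           # each split scores a single held-out sample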
+ + Provides train/test indices to split data in train/test sets. This results + in testing on all distinct samples of size p, while the remaining n - p + samples form the training set in each iteration. + + Note: ``LeavePOut(p)`` is NOT equivalent to + ``KFold(n_splits=n_samples // p)`` which creates non-overlapping test sets. + + Due to the high number of iterations which grows combinatorically with the + number of samples this cross-validation method can be very costly. For + large datasets one should favor :class:`KFold`, :class:`StratifiedKFold` + or :class:`ShuffleSplit`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + p : int + Size of the test sets. Must be strictly less than the number of + samples. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import LeavePOut + >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) + >>> y = np.array([1, 2, 3, 4]) + >>> lpo = LeavePOut(2) + >>> lpo.get_n_splits(X) + 6 + >>> print(lpo) + LeavePOut(p=2) + >>> for i, (train_index, test_index) in enumerate(lpo.split(X)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... print(f" Test: index={test_index}") + Fold 0: + Train: index=[2 3] + Test: index=[0 1] + Fold 1: + Train: index=[1 3] + Test: index=[0 2] + Fold 2: + Train: index=[1 2] + Test: index=[0 3] + Fold 3: + Train: index=[0 3] + Test: index=[1 2] + Fold 4: + Train: index=[0 2] + Test: index=[1 3] + Fold 5: + Train: index=[0 1] + Test: index=[2 3] + """ + + def __init__(self, p): + self.p = p + + def _iter_test_indices(self, X, y=None, groups=None): + n_samples = _num_samples(X) + if n_samples <= self.p: + raise ValueError( + "p={} must be strictly less than the number of samples={}".format( + self.p, n_samples + ) + ) + for combination in combinations(range(n_samples), self.p): + yield np.array(combination) + + def get_n_splits(self, X, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : object + Always ignored, exists for compatibility. + + groups : object + Always ignored, exists for compatibility. + """ + if X is None: + raise ValueError("The 'X' parameter should not be None.") + return int(comb(_num_samples(X), self.p, exact=True)) + + +class _BaseKFold(BaseCrossValidator, metaclass=ABCMeta): + """Base class for K-Fold cross-validators and TimeSeriesSplit.""" + + @abstractmethod + def __init__(self, n_splits, *, shuffle, random_state): + if not isinstance(n_splits, numbers.Integral): + raise ValueError( + "The number of folds must be of Integral type. " + "%s of type %s was passed." % (n_splits, type(n_splits)) + ) + n_splits = int(n_splits) + + if n_splits <= 1: + raise ValueError( + "k-fold cross-validation requires at least one" + " train/test split by setting n_splits=2 or more," + " got n_splits={0}.".format(n_splits) + ) + + if not isinstance(shuffle, bool): + raise TypeError("shuffle must be True or False; got {0}".format(shuffle)) + + if not shuffle and random_state is not None: # None is the default + raise ValueError( + ( + "Setting a random_state has no effect since shuffle is " + "False. You should leave " + "random_state to its default (None), or set shuffle=True." 
+ ), + ) + + self.n_splits = n_splits + self.shuffle = shuffle + self.random_state = random_state + + def split(self, X, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,), default=None + The target variable for supervised learning problems. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + X, y, groups = indexable(X, y, groups) + n_samples = _num_samples(X) + if self.n_splits > n_samples: + raise ValueError( + ( + "Cannot have number of splits n_splits={0} greater" + " than the number of samples: n_samples={1}." + ).format(self.n_splits, n_samples) + ) + + for train, test in super().split(X, y, groups): + yield train, test + + def get_n_splits(self, X=None, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator. + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + + y : object + Always ignored, exists for compatibility. + + groups : object + Always ignored, exists for compatibility. + + Returns + ------- + n_splits : int + Returns the number of splitting iterations in the cross-validator. + """ + return self.n_splits + + +class KFold(_UnsupportedGroupCVMixin, _BaseKFold): + """K-Fold cross-validator. + + Provides train/test indices to split data in train/test sets. Split + dataset into k consecutive folds (without shuffling by default). + + Each fold is then used once as a validation while the k - 1 remaining + folds form the training set. + + Read more in the :ref:`User Guide `. + + For visualisation of cross-validation behaviour and + comparison between common scikit-learn split methods + refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` + + Parameters + ---------- + n_splits : int, default=5 + Number of folds. Must be at least 2. + + .. versionchanged:: 0.22 + ``n_splits`` default value changed from 3 to 5. + + shuffle : bool, default=False + Whether to shuffle the data before splitting into batches. + Note that the samples within each split will not be shuffled. + + random_state : int, RandomState instance or None, default=None + When `shuffle` is True, `random_state` affects the ordering of the + indices, which controls the randomness of each fold. Otherwise, this + parameter has no effect. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import KFold + >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) + >>> y = np.array([1, 2, 3, 4]) + >>> kf = KFold(n_splits=2) + >>> kf.get_n_splits(X) + 2 + >>> print(kf) + KFold(n_splits=2, random_state=None, shuffle=False) + >>> for i, (train_index, test_index) in enumerate(kf.split(X)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... 
print(f" Test: index={test_index}") + Fold 0: + Train: index=[2 3] + Test: index=[0 1] + Fold 1: + Train: index=[0 1] + Test: index=[2 3] + + Notes + ----- + The first ``n_samples % n_splits`` folds have size + ``n_samples // n_splits + 1``, other folds have size + ``n_samples // n_splits``, where ``n_samples`` is the number of samples. + + Randomized CV splitters may return different results for each call of + split. You can make the results identical by setting `random_state` + to an integer. + + See Also + -------- + StratifiedKFold : Takes class information into account to avoid building + folds with imbalanced class distributions (for binary or multiclass + classification tasks). + + GroupKFold : K-fold iterator variant with non-overlapping groups. + + RepeatedKFold : Repeats K-Fold n times. + """ + + def __init__(self, n_splits=5, *, shuffle=False, random_state=None): + super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state) + + def _iter_test_indices(self, X, y=None, groups=None): + n_samples = _num_samples(X) + indices = np.arange(n_samples) + if self.shuffle: + check_random_state(self.random_state).shuffle(indices) + + n_splits = self.n_splits + fold_sizes = np.full(n_splits, n_samples // n_splits, dtype=int) + fold_sizes[: n_samples % n_splits] += 1 + current = 0 + for fold_size in fold_sizes: + start, stop = current, current + fold_size + yield indices[start:stop] + current = stop + + +class GroupKFold(GroupsConsumerMixin, _BaseKFold): + """K-fold iterator variant with non-overlapping groups. + + Each group will appear exactly once in the test set across all folds (the + number of distinct groups has to be at least equal to the number of folds). + + The folds are approximately balanced in the sense that the number of + samples is approximately the same in each test fold when `shuffle` is True. + + Read more in the :ref:`User Guide `. + + For visualisation of cross-validation behaviour and + comparison between common scikit-learn split methods + refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` + + Parameters + ---------- + n_splits : int, default=5 + Number of folds. Must be at least 2. + + .. versionchanged:: 0.22 + ``n_splits`` default value changed from 3 to 5. + + shuffle : bool, default=False + Whether to shuffle the groups before splitting into batches. + Note that the samples within each split will not be shuffled. + + .. versionadded:: 1.6 + + random_state : int, RandomState instance or None, default=None + When `shuffle` is True, `random_state` affects the ordering of the + indices, which controls the randomness of each fold. Otherwise, this + parameter has no effect. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + .. versionadded:: 1.6 + + Notes + ----- + Groups appear in an arbitrary order throughout the folds. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import GroupKFold + >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]]) + >>> y = np.array([1, 2, 3, 4, 5, 6]) + >>> groups = np.array([0, 0, 2, 2, 3, 3]) + >>> group_kfold = GroupKFold(n_splits=2) + >>> group_kfold.get_n_splits(X, y, groups) + 2 + >>> print(group_kfold) + GroupKFold(n_splits=2, random_state=None, shuffle=False) + >>> for i, (train_index, test_index) in enumerate(group_kfold.split(X, y, groups)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}, group={groups[train_index]}") + ... 
print(f" Test: index={test_index}, group={groups[test_index]}") + Fold 0: + Train: index=[2 3], group=[2 2] + Test: index=[0 1 4 5], group=[0 0 3 3] + Fold 1: + Train: index=[0 1 4 5], group=[0 0 3 3] + Test: index=[2 3], group=[2 2] + + See Also + -------- + LeaveOneGroupOut : For splitting the data according to explicit + domain-specific stratification of the dataset. + + StratifiedKFold : Takes class information into account to avoid building + folds with imbalanced class proportions (for binary or multiclass + classification tasks). + """ + + def __init__(self, n_splits=5, *, shuffle=False, random_state=None): + super().__init__(n_splits, shuffle=shuffle, random_state=random_state) + + def _iter_test_indices(self, X, y, groups): + if groups is None: + raise ValueError("The 'groups' parameter should not be None.") + groups = check_array(groups, input_name="groups", ensure_2d=False, dtype=None) + + unique_groups, group_idx = np.unique(groups, return_inverse=True) + n_groups = len(unique_groups) + + if self.n_splits > n_groups: + raise ValueError( + "Cannot have number of splits n_splits=%d greater" + " than the number of groups: %d." % (self.n_splits, n_groups) + ) + + if self.shuffle: + # Split and shuffle unique groups across n_splits + rng = check_random_state(self.random_state) + unique_groups = rng.permutation(unique_groups) + split_groups = np.array_split(unique_groups, self.n_splits) + + for test_group_ids in split_groups: + test_mask = np.isin(groups, test_group_ids) + yield np.where(test_mask)[0] + + else: + # Weight groups by their number of occurrences + n_samples_per_group = np.bincount(group_idx) + + # Distribute the most frequent groups first + indices = np.argsort(n_samples_per_group)[::-1] + n_samples_per_group = n_samples_per_group[indices] + + # Total weight of each fold + n_samples_per_fold = np.zeros(self.n_splits) + + # Mapping from group index to fold index + group_to_fold = np.zeros(len(unique_groups)) + + # Distribute samples by adding the largest weight to the lightest fold + for group_index, weight in enumerate(n_samples_per_group): + lightest_fold = np.argmin(n_samples_per_fold) + n_samples_per_fold[lightest_fold] += weight + group_to_fold[indices[group_index]] = lightest_fold + + indices = group_to_fold[group_idx] + + for f in range(self.n_splits): + yield np.where(indices == f)[0] + + def split(self, X, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,), default=None + The target variable for supervised learning problems. + + groups : array-like of shape (n_samples,) + Group labels for the samples used while splitting the dataset into + train/test set. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + return super().split(X, y, groups) + + +class StratifiedKFold(_BaseKFold): + """Class-wise stratified K-Fold cross-validator. + + Provides train/test indices to split data in train/test sets. + + This cross-validation object is a variation of KFold that returns + stratified folds. The folds are made by preserving the percentage of + samples for each class in `y` in a binary or multiclass classification + setting. + + Read more in the :ref:`User Guide `. 
+ + For visualisation of cross-validation behaviour and + comparison between common scikit-learn split methods + refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` + + .. note:: + + Stratification on the class label solves an engineering problem rather + than a statistical one. See :ref:`stratification` for more details. + + Parameters + ---------- + n_splits : int, default=5 + Number of folds. Must be at least 2. + + .. versionchanged:: 0.22 + ``n_splits`` default value changed from 3 to 5. + + shuffle : bool, default=False + Whether to shuffle each class's samples before splitting into batches. + Note that the samples within each split will not be shuffled. + + random_state : int, RandomState instance or None, default=None + When `shuffle` is True, `random_state` affects the ordering of the + indices, which controls the randomness of each fold for each class. + Otherwise, leave `random_state` as `None`. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import StratifiedKFold + >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) + >>> y = np.array([0, 0, 1, 1]) + >>> skf = StratifiedKFold(n_splits=2) + >>> skf.get_n_splits(X, y) + 2 + >>> print(skf) + StratifiedKFold(n_splits=2, random_state=None, shuffle=False) + >>> for i, (train_index, test_index) in enumerate(skf.split(X, y)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... print(f" Test: index={test_index}") + Fold 0: + Train: index=[1 3] + Test: index=[0 2] + Fold 1: + Train: index=[0 2] + Test: index=[1 3] + + Notes + ----- + The implementation is designed to: + + * Generate test sets such that all contain the same distribution of + classes, or as close as possible. + * Be invariant to class label: relabelling ``y = ["Happy", "Sad"]`` to + ``y = [1, 0]`` should not change the indices generated. + * Preserve order dependencies in the dataset ordering, when + ``shuffle=False``: all samples from class k in some test set were + contiguous in y, or separated in y by samples from classes other than k. + * Generate test sets where the smallest and largest differ by at most one + sample. + + .. versionchanged:: 0.22 + The previous implementation did not follow the last constraint. + + See Also + -------- + RepeatedStratifiedKFold : Repeats Stratified K-Fold n times. + """ + + def __init__(self, n_splits=5, *, shuffle=False, random_state=None): + super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state) + + def _make_test_folds(self, X, y=None): + rng = check_random_state(self.random_state) + # XXX: as of now, cross-validation splitters only operate in NumPy-land + # without attempting to leverage array API namespace features. However + # they might be fed by array API inputs, e.g. in CV-enabled estimators so + # we need the following explicit conversion: + xp, is_array_api = get_namespace(y) + if is_array_api: + y = _convert_to_numpy(y, xp) + else: + y = np.asarray(y) + type_of_target_y = type_of_target(y) + allowed_target_types = ("binary", "multiclass") + if type_of_target_y not in allowed_target_types: + raise ValueError( + "Supported target types are: {}. Got {!r} instead.".format( + allowed_target_types, type_of_target_y + ) + ) + + y = column_or_1d(y) + + _, y_idx, y_inv = np.unique(y, return_index=True, return_inverse=True) + # y_inv encodes y according to lexicographic order. 
We invert y_idx to + # map the classes so that they are encoded by order of appearance: + # 0 represents the first label appearing in y, 1 the second, etc. + _, class_perm = np.unique(y_idx, return_inverse=True) + y_encoded = class_perm[y_inv] + + n_classes = len(y_idx) + y_counts = np.bincount(y_encoded) + min_groups = np.min(y_counts) + if np.all(self.n_splits > y_counts): + raise ValueError( + "n_splits=%d cannot be greater than the" + " number of members in each class." % (self.n_splits) + ) + if self.n_splits > min_groups: + warnings.warn( + "The least populated class in y has only %d" + " members, which is less than n_splits=%d." + % (min_groups, self.n_splits), + UserWarning, + ) + + # Determine the optimal number of samples from each class in each fold, + # using round robin over the sorted y. (This can be done direct from + # counts, but that code is unreadable.) + y_order = np.sort(y_encoded) + allocation = np.asarray( + [ + np.bincount(y_order[i :: self.n_splits], minlength=n_classes) + for i in range(self.n_splits) + ] + ) + + # To maintain the data order dependencies as best as possible within + # the stratification constraint, we assign samples from each class in + # blocks (and then mess that up when shuffle=True). + test_folds = np.empty(len(y), dtype="i") + for k in range(n_classes): + # since the kth column of allocation stores the number of samples + # of class k in each test set, this generates blocks of fold + # indices corresponding to the allocation for class k. + folds_for_class = np.arange(self.n_splits).repeat(allocation[:, k]) + if self.shuffle: + rng.shuffle(folds_for_class) + test_folds[y_encoded == k] = folds_for_class + return test_folds + + def _iter_test_masks(self, X, y=None, groups=None): + test_folds = self._make_test_folds(X, y) + for i in range(self.n_splits): + yield test_folds == i + + def split(self, X, y, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + Note that providing ``y`` is sufficient to generate the splits and + hence ``np.zeros(n_samples)`` may be used as a placeholder for + ``X`` instead of actual training data. + + y : array-like of shape (n_samples,) + The target variable for supervised learning problems. + Stratification is done based on the y labels. + + groups : object + Always ignored, exists for compatibility. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + + Notes + ----- + Randomized CV splitters may return different results for each call of + split. You can make the results identical by setting `random_state` + to an integer. + """ + if groups is not None: + warnings.warn( + f"The groups parameter is ignored by {self.__class__.__name__}", + UserWarning, + ) + y = check_array(y, input_name="y", ensure_2d=False, dtype=None) + return super().split(X, y, groups) + + +class StratifiedGroupKFold(GroupsConsumerMixin, _BaseKFold): + """Class-wise stratified K-Fold iterator variant with non-overlapping groups. + + This cross-validation object is a variation of StratifiedKFold attempts to + return stratified folds with non-overlapping groups. The folds are made by + preserving the percentage of samples for each class in `y` in a binary or + multiclass classification setting. 
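# Illustrative sketch (imbalanced toy labels assumed): each StratifiedKFold
# test fold produced by the class above preserves the overall class
# proportions as closely as possible.
import numpy as np
from sklearn.model_selection import StratifiedKFold

y = np.array([0] * 80 + [1] * 20)          # 80% / 20% class balance
X = np.zeros((100, 2))

for _, test in StratifiedKFold(n_splits=5).split(X, y):
    print(np.bincount(y[test]))            # [16  4] in every fold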
+ + Each group will appear exactly once in the test set across all folds (the + number of distinct groups has to be at least equal to the number of folds). + + The difference between :class:`GroupKFold` + and `StratifiedGroupKFold` is that + the former attempts to create balanced folds such that the number of + distinct groups is approximately the same in each fold, whereas + `StratifiedGroupKFold` attempts to create folds which preserve the + percentage of samples for each class as much as possible given the + constraint of non-overlapping groups between splits. + + Read more in the :ref:`User Guide `. + + For visualisation of cross-validation behaviour and + comparison between common scikit-learn split methods + refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` + + .. note:: + + Stratification on the class label solves an engineering problem rather + than a statistical one. See :ref:`stratification` for more details. + + Parameters + ---------- + n_splits : int, default=5 + Number of folds. Must be at least 2. + + shuffle : bool, default=False + Whether to shuffle each class's samples before splitting into batches. + Note that the samples within each split will not be shuffled. + This implementation can only shuffle groups that have approximately the + same y distribution, no global shuffle will be performed. + + random_state : int or RandomState instance, default=None + When `shuffle` is True, `random_state` affects the ordering of the + indices, which controls the randomness of each fold for each class. + Otherwise, leave `random_state` as `None`. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import StratifiedGroupKFold + >>> X = np.ones((17, 2)) + >>> y = np.array([0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]) + >>> groups = np.array([1, 1, 2, 2, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6, 7, 8, 8]) + >>> sgkf = StratifiedGroupKFold(n_splits=3) + >>> sgkf.get_n_splits(X, y) + 3 + >>> print(sgkf) + StratifiedGroupKFold(n_splits=3, random_state=None, shuffle=False) + >>> for i, (train_index, test_index) in enumerate(sgkf.split(X, y, groups)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... print(f" group={groups[train_index]}") + ... print(f" Test: index={test_index}") + ... print(f" group={groups[test_index]}") + Fold 0: + Train: index=[ 0 1 2 3 7 8 9 10 11 15 16] + group=[1 1 2 2 4 5 5 5 5 8 8] + Test: index=[ 4 5 6 12 13 14] + group=[3 3 3 6 6 7] + Fold 1: + Train: index=[ 4 5 6 7 8 9 10 11 12 13 14] + group=[3 3 3 4 5 5 5 5 6 6 7] + Test: index=[ 0 1 2 3 15 16] + group=[1 1 2 2 8 8] + Fold 2: + Train: index=[ 0 1 2 3 4 5 6 12 13 14 15 16] + group=[1 1 2 2 3 3 3 6 6 7 8 8] + Test: index=[ 7 8 9 10 11] + group=[4 5 5 5 5] + + Notes + ----- + The implementation is designed to: + + * Mimic the behavior of StratifiedKFold as much as possible for trivial + groups (e.g. when each group contains only one sample). + * Be invariant to class label: relabelling ``y = ["Happy", "Sad"]`` to + ``y = [1, 0]`` should not change the indices generated. + * Stratify based on samples as much as possible while keeping + non-overlapping groups constraint. That means that in some cases when + there is a small number of groups containing a large number of samples + the stratification will not be possible and the behavior will be close + to GroupKFold. 
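# Illustrative sketch reusing the toy data from the docstring above: with
# groups of mixed class make-up, StratifiedGroupKFold tries to keep the class
# counts of each test fold close to the overall 11:6 ratio, while GroupKFold
# only balances fold sizes; both keep every group within a single fold.
import numpy as np
from sklearn.model_selection import GroupKFold, StratifiedGroupKFold

y = np.array([0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])
groups = np.array([1, 1, 2, 2, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6, 7, 8, 8])
X = np.zeros((len(y), 2))

for name, cv in [("GroupKFold", GroupKFold(n_splits=3)),
                 ("StratifiedGroupKFold", StratifiedGroupKFold(n_splits=3))]:
    per_fold = [np.bincount(y[test], minlength=2)
                for _, test in cv.split(X, y, groups)]
    print(name, per_fold)      # [n_class0, n_class1] for each test fold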
+ + See also + -------- + StratifiedKFold: Takes class information into account to build folds which + retain class distributions (for binary or multiclass classification + tasks). + + GroupKFold: K-fold iterator variant with non-overlapping groups. + """ + + def __init__(self, n_splits=5, shuffle=False, random_state=None): + super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state) + + def _iter_test_indices(self, X, y, groups): + # Implementation is based on this kaggle kernel: + # https://www.kaggle.com/jakubwasikowski/stratified-group-k-fold-cross-validation + # and is subject to the Apache 2.0 License. You may obtain a copy of the + # License at http://www.apache.org/licenses/LICENSE-2.0 + # Changelist: + # - Refactored function to a class following scikit-learn KFold + # interface. + # - Added heuristic for assigning group to the least populated fold in + # cases when all other criteria are equal + # - Switch from using python ``Counter`` to ``np.unique`` to get class + # distribution + # - Added scikit-learn checks for input: checking that target is binary + # or multiclass, checking passed random state, checking that number + # of splits is less than number of members in each class, checking + # that least populated class has more members than there are splits. + rng = check_random_state(self.random_state) + y = np.asarray(y) + type_of_target_y = type_of_target(y) + allowed_target_types = ("binary", "multiclass") + if type_of_target_y not in allowed_target_types: + raise ValueError( + "Supported target types are: {}. Got {!r} instead.".format( + allowed_target_types, type_of_target_y + ) + ) + + y = column_or_1d(y) + _, y_inv, y_cnt = np.unique(y, return_inverse=True, return_counts=True) + if np.all(self.n_splits > y_cnt): + raise ValueError( + "n_splits=%d cannot be greater than the" + " number of members in each class." % (self.n_splits) + ) + n_smallest_class = np.min(y_cnt) + if self.n_splits > n_smallest_class: + warnings.warn( + "The least populated class in y has only %d" + " members, which is less than n_splits=%d."
+ % (n_smallest_class, self.n_splits), + UserWarning, + ) + n_classes = len(y_cnt) + + _, groups_inv, groups_cnt = np.unique( + groups, return_inverse=True, return_counts=True + ) + y_counts_per_group = np.zeros((len(groups_cnt), n_classes)) + for class_idx, group_idx in zip(y_inv, groups_inv): + y_counts_per_group[group_idx, class_idx] += 1 + + y_counts_per_fold = np.zeros((self.n_splits, n_classes)) + groups_per_fold = defaultdict(set) + + if self.shuffle: + rng.shuffle(y_counts_per_group) + + # Stable sort to keep shuffled order for groups with the same + # class distribution variance + sorted_groups_idx = np.argsort( + -np.std(y_counts_per_group, axis=1), kind="mergesort" + ) + + for group_idx in sorted_groups_idx: + group_y_counts = y_counts_per_group[group_idx] + best_fold = self._find_best_fold( + y_counts_per_fold=y_counts_per_fold, + y_cnt=y_cnt, + group_y_counts=group_y_counts, + ) + y_counts_per_fold[best_fold] += group_y_counts + groups_per_fold[best_fold].add(group_idx) + + for i in range(self.n_splits): + test_indices = [ + idx + for idx, group_idx in enumerate(groups_inv) + if group_idx in groups_per_fold[i] + ] + yield test_indices + + def _find_best_fold(self, y_counts_per_fold, y_cnt, group_y_counts): + best_fold = None + min_eval = np.inf + min_samples_in_fold = np.inf + for i in range(self.n_splits): + y_counts_per_fold[i] += group_y_counts + # Summarise the distribution over classes in each proposed fold + std_per_class = np.std(y_counts_per_fold / y_cnt.reshape(1, -1), axis=0) + y_counts_per_fold[i] -= group_y_counts + fold_eval = np.mean(std_per_class) + samples_in_fold = np.sum(y_counts_per_fold[i]) + is_current_fold_better = fold_eval < min_eval or ( + np.isclose(fold_eval, min_eval) + and samples_in_fold < min_samples_in_fold + ) + if is_current_fold_better: + min_eval = fold_eval + min_samples_in_fold = samples_in_fold + best_fold = i + return best_fold + + +class TimeSeriesSplit(_BaseKFold): + """Time Series cross-validator. + + Provides train/test indices to split time-ordered data, where other + cross-validation methods are inappropriate, as they would lead to training + on future data and evaluating on past data. + To ensure comparable metrics across folds, samples must be equally spaced. + Once this condition is met, each test set covers the same time duration, + while the train set size accumulates data from previous splits. + + This cross-validation object is a variation of :class:`KFold`. + In the k-th split, it returns the first k folds as the train set and the + (k+1)-th fold as the test set. + + Note that, unlike standard cross-validation methods, successive + training sets are supersets of those that come before them. + + Read more in the :ref:`User Guide `. + + For visualisation of cross-validation behaviour and + comparison between common scikit-learn split methods + refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` + + .. versionadded:: 0.18 + + Parameters + ---------- + n_splits : int, default=5 + Number of splits. Must be at least 2. + + .. versionchanged:: 0.22 + ``n_splits`` default value changed from 3 to 5. + + max_train_size : int, default=None + Maximum size for a single training set. + + test_size : int, default=None + Used to limit the size of the test set. Defaults to + ``n_samples // (n_splits + 1)``, which is the maximum allowed value + with ``gap=0``. + + .. versionadded:: 0.24 + + gap : int, default=0 + Number of samples to exclude from the end of each train set before + the test set. + + .. 
versionadded:: 0.24 + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import TimeSeriesSplit + >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]]) + >>> y = np.array([1, 2, 3, 4, 5, 6]) + >>> tscv = TimeSeriesSplit() + >>> print(tscv) + TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None) + >>> for i, (train_index, test_index) in enumerate(tscv.split(X)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... print(f" Test: index={test_index}") + Fold 0: + Train: index=[0] + Test: index=[1] + Fold 1: + Train: index=[0 1] + Test: index=[2] + Fold 2: + Train: index=[0 1 2] + Test: index=[3] + Fold 3: + Train: index=[0 1 2 3] + Test: index=[4] + Fold 4: + Train: index=[0 1 2 3 4] + Test: index=[5] + >>> # Fix test_size to 2 with 12 samples + >>> X = np.random.randn(12, 2) + >>> y = np.random.randint(0, 2, 12) + >>> tscv = TimeSeriesSplit(n_splits=3, test_size=2) + >>> for i, (train_index, test_index) in enumerate(tscv.split(X)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... print(f" Test: index={test_index}") + Fold 0: + Train: index=[0 1 2 3 4 5] + Test: index=[6 7] + Fold 1: + Train: index=[0 1 2 3 4 5 6 7] + Test: index=[8 9] + Fold 2: + Train: index=[0 1 2 3 4 5 6 7 8 9] + Test: index=[10 11] + >>> # Add in a 2 period gap + >>> tscv = TimeSeriesSplit(n_splits=3, test_size=2, gap=2) + >>> for i, (train_index, test_index) in enumerate(tscv.split(X)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... print(f" Test: index={test_index}") + Fold 0: + Train: index=[0 1 2 3] + Test: index=[6 7] + Fold 1: + Train: index=[0 1 2 3 4 5] + Test: index=[8 9] + Fold 2: + Train: index=[0 1 2 3 4 5 6 7] + Test: index=[10 11] + + For a more extended example see + :ref:`sphx_glr_auto_examples_applications_plot_cyclical_feature_engineering.py`. + + Notes + ----- + The training set has size ``i * n_samples // (n_splits + 1) + + n_samples % (n_splits + 1)`` in the ``i`` th split, + with a test set of size ``n_samples//(n_splits + 1)`` by default, + where ``n_samples`` is the number of samples. Note that this + formula is only valid when ``test_size`` and ``max_train_size`` are + left to their default values. + """ + + def __init__(self, n_splits=5, *, max_train_size=None, test_size=None, gap=0): + super().__init__(n_splits, shuffle=False, random_state=None) + self.max_train_size = max_train_size + self.test_size = test_size + self.gap = gap + + def split(self, X, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,) + Always ignored, exists for compatibility. + + groups : array-like of shape (n_samples,) + Always ignored, exists for compatibility. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + if groups is not None: + warnings.warn( + f"The groups parameter is ignored by {self.__class__.__name__}", + UserWarning, + ) + return self._split(X) + + def _split(self, X): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. 
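# Illustrative sketch (toy data assumed): ``max_train_size`` turns the
# expanding window shown in the examples above into a rolling window of at
# most that many samples, as described for the ``max_train_size`` parameter.
import numpy as np
from sklearn.model_selection import TimeSeriesSplit

X = np.zeros((8, 1))
tscv = TimeSeriesSplit(n_splits=3, test_size=1, max_train_size=3)
for train, test in tscv.split(X):
    print(train, test)
# [2 3 4] [5]
# [3 4 5] [6]
# [4 5 6] [7]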
+ + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + (X,) = indexable(X) + n_samples = _num_samples(X) + n_splits = self.n_splits + n_folds = n_splits + 1 + gap = self.gap + test_size = ( + self.test_size if self.test_size is not None else n_samples // n_folds + ) + + # Make sure we have enough samples for the given split parameters + if n_folds > n_samples: + raise ValueError( + f"Cannot have number of folds={n_folds} greater" + f" than the number of samples={n_samples}." + ) + if n_samples - gap - (test_size * n_splits) <= 0: + raise ValueError( + f"Too many splits={n_splits} for number of samples" + f"={n_samples} with test_size={test_size} and gap={gap}." + ) + + indices = np.arange(n_samples) + test_starts = range(n_samples - n_splits * test_size, n_samples, test_size) + + for test_start in test_starts: + train_end = test_start - gap + if self.max_train_size and self.max_train_size < train_end: + yield ( + indices[train_end - self.max_train_size : train_end], + indices[test_start : test_start + test_size], + ) + else: + yield ( + indices[:train_end], + indices[test_start : test_start + test_size], + ) + + +class LeaveOneGroupOut(GroupsConsumerMixin, BaseCrossValidator): + """Leave One Group Out cross-validator. + + Provides train/test indices to split data such that each training set is + comprised of all samples except ones belonging to one specific group. + Arbitrary domain specific group information is provided as an array of integers + that encodes the group of each sample. + + For instance the groups could be the year of collection of the samples + and thus allow for cross-validation against time-based splits. + + Read more in the :ref:`User Guide `. + + Notes + ----- + Splits are ordered according to the index of the group left out. The first + split has testing set consisting of the group whose index in `groups` is + lowest, and so on. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import LeaveOneGroupOut + >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) + >>> y = np.array([1, 2, 1, 2]) + >>> groups = np.array([1, 1, 2, 2]) + >>> logo = LeaveOneGroupOut() + >>> logo.get_n_splits(X, y, groups) + 2 + >>> logo.get_n_splits(groups=groups) # 'groups' is always required + 2 + >>> print(logo) + LeaveOneGroupOut() + >>> for i, (train_index, test_index) in enumerate(logo.split(X, y, groups)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}, group={groups[train_index]}") + ... print(f" Test: index={test_index}, group={groups[test_index]}") + Fold 0: + Train: index=[2 3], group=[2 2] + Test: index=[0 1], group=[1 1] + Fold 1: + Train: index=[0 1], group=[1 1] + Test: index=[2 3], group=[2 2] + + See also + -------- + GroupKFold: K-fold iterator variant with non-overlapping groups. + """ + + def _iter_test_masks(self, X, y, groups): + if groups is None: + raise ValueError("The 'groups' parameter should not be None.") + # We make a copy of groups to avoid side-effects during iteration + groups = check_array( + groups, input_name="groups", copy=True, ensure_2d=False, dtype=None + ) + unique_groups = np.unique(groups) + if len(unique_groups) <= 1: + raise ValueError( + "The groups parameter contains fewer than 2 unique groups " + "(%s). LeaveOneGroupOut expects at least 2." 
% unique_groups + ) + for i in unique_groups: + yield groups == i + + def get_n_splits(self, X=None, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator. + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + + y : object + Always ignored, exists for compatibility. + + groups : array-like of shape (n_samples,) + Group labels for the samples used while splitting the dataset into + train/test set. This 'groups' parameter must always be specified to + calculate the number of splits, though the other parameters can be + omitted. + + Returns + ------- + n_splits : int + Returns the number of splitting iterations in the cross-validator. + """ + if groups is None: + raise ValueError("The 'groups' parameter should not be None.") + groups = check_array(groups, input_name="groups", ensure_2d=False, dtype=None) + return len(np.unique(groups)) + + def split(self, X, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,), default=None + The target variable for supervised learning problems. + + groups : array-like of shape (n_samples,) + Group labels for the samples used while splitting the dataset into + train/test set. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + return super().split(X, y, groups) + + +class LeavePGroupsOut(GroupsConsumerMixin, BaseCrossValidator): + """Leave P Group(s) Out cross-validator. + + Provides train/test indices to split data according to a third-party + provided group. This group information can be used to encode arbitrary + domain specific stratifications of the samples as integers. + + For instance the groups could be the year of collection of the samples + and thus allow for cross-validation against time-based splits. + + The difference between LeavePGroupsOut and LeaveOneGroupOut is that + the former builds the test sets with all the samples assigned to + ``p`` different values of the groups while the latter uses samples + all assigned the same groups. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_groups : int + Number of groups (``p``) to leave out in the test split. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import LeavePGroupsOut + >>> X = np.array([[1, 2], [3, 4], [5, 6]]) + >>> y = np.array([1, 2, 1]) + >>> groups = np.array([1, 2, 3]) + >>> lpgo = LeavePGroupsOut(n_groups=2) + >>> lpgo.get_n_splits(X, y, groups) + 3 + >>> lpgo.get_n_splits(groups=groups) # 'groups' is always required + 3 + >>> print(lpgo) + LeavePGroupsOut(n_groups=2) + >>> for i, (train_index, test_index) in enumerate(lpgo.split(X, y, groups)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}, group={groups[train_index]}") + ... print(f" Test: index={test_index}, group={groups[test_index]}") + Fold 0: + Train: index=[2], group=[3] + Test: index=[0 1], group=[1 2] + Fold 1: + Train: index=[1], group=[2] + Test: index=[0 2], group=[1 3] + Fold 2: + Train: index=[0], group=[1] + Test: index=[1 2], group=[2 3] + + See Also + -------- + GroupKFold : K-fold iterator variant with non-overlapping groups. 
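# Illustrative sketch (toy groups assumed): LeavePGroupsOut builds one split
# per combination of ``n_groups`` distinct group values, so the number of
# splits is "n_unique_groups choose n_groups", whereas LeaveOneGroupOut above
# yields exactly one split per group.
from math import comb
import numpy as np
from sklearn.model_selection import LeaveOneGroupOut, LeavePGroupsOut

groups = np.array([1, 1, 2, 2, 3, 3, 4, 4, 5, 5])
print(LeaveOneGroupOut().get_n_splits(groups=groups))           # 5
print(LeavePGroupsOut(n_groups=2).get_n_splits(groups=groups))  # 10
print(comb(5, 2))                                               # 10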
+ """ + + def __init__(self, n_groups): + self.n_groups = n_groups + + def _iter_test_masks(self, X, y, groups): + if groups is None: + raise ValueError("The 'groups' parameter should not be None.") + groups = check_array( + groups, input_name="groups", copy=True, ensure_2d=False, dtype=None + ) + unique_groups = np.unique(groups) + if self.n_groups >= len(unique_groups): + raise ValueError( + "The groups parameter contains fewer than (or equal to) " + "n_groups (%d) numbers of unique groups (%s). LeavePGroupsOut " + "expects that at least n_groups + 1 (%d) unique groups be " + "present" % (self.n_groups, unique_groups, self.n_groups + 1) + ) + combi = combinations(range(len(unique_groups)), self.n_groups) + for indices in combi: + test_index = np.zeros(_num_samples(X), dtype=bool) + for l in unique_groups[np.array(indices)]: + test_index[groups == l] = True + yield test_index + + def get_n_splits(self, X=None, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator. + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + + y : object + Always ignored, exists for compatibility. + + groups : array-like of shape (n_samples,) + Group labels for the samples used while splitting the dataset into + train/test set. This 'groups' parameter must always be specified to + calculate the number of splits, though the other parameters can be + omitted. + + Returns + ------- + n_splits : int + Returns the number of splitting iterations in the cross-validator. + """ + if groups is None: + raise ValueError("The 'groups' parameter should not be None.") + groups = check_array(groups, input_name="groups", ensure_2d=False, dtype=None) + return int(comb(len(np.unique(groups)), self.n_groups, exact=True)) + + def split(self, X, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,), default=None + The target variable for supervised learning problems. + + groups : array-like of shape (n_samples,) + Group labels for the samples used while splitting the dataset into + train/test set. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + return super().split(X, y, groups) + + +class _RepeatedSplits(_MetadataRequester, metaclass=ABCMeta): + """Repeated splits for an arbitrary randomized CV splitter. + + Repeats splits for cross-validators n times with different randomization + in each repetition. + + Parameters + ---------- + cv : callable + Cross-validator class. + + n_repeats : int, default=10 + Number of times cross-validator needs to be repeated. + + random_state : int, RandomState instance or None, default=None + Passes `random_state` to the arbitrary repeating cross validator. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + **cvargs : additional params + Constructor parameters for cv. Must not contain random_state + and shuffle. + """ + + # This indicates that by default CV splitters don't have a "groups" kwarg, + # unless indicated by inheriting from ``GroupsConsumerMixin``. + # This also prevents ``set_split_request`` to be generated for splitters + # which don't support ``groups``. 
+ __metadata_request__split = {"groups": metadata_routing.UNUSED} + + def __init__(self, cv, *, n_repeats=10, random_state=None, **cvargs): + if not isinstance(n_repeats, numbers.Integral): + raise ValueError("Number of repetitions must be of Integral type.") + + if n_repeats <= 0: + raise ValueError("Number of repetitions must be greater than 0.") + + if any(key in cvargs for key in ("random_state", "shuffle")): + raise ValueError("cvargs must not contain random_state or shuffle.") + + self.cv = cv + self.n_repeats = n_repeats + self.random_state = random_state + self.cvargs = cvargs + + def split(self, X, y=None, groups=None): + """Generates indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,) + The target variable for supervised learning problems. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + n_repeats = self.n_repeats + rng = check_random_state(self.random_state) + + for idx in range(n_repeats): + cv = self.cv(random_state=rng, shuffle=True, **self.cvargs) + for train_index, test_index in cv.split(X, y, groups): + yield train_index, test_index + + def get_n_splits(self, X=None, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator. + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + ``np.zeros(n_samples)`` may be used as a placeholder. + + y : object + Always ignored, exists for compatibility. + ``np.zeros(n_samples)`` may be used as a placeholder. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. + + Returns + ------- + n_splits : int + Returns the number of splitting iterations in the cross-validator. + """ + rng = check_random_state(self.random_state) + cv = self.cv(random_state=rng, shuffle=True, **self.cvargs) + return cv.get_n_splits(X, y, groups) * self.n_repeats + + def __repr__(self): + return _build_repr(self) + + +class RepeatedKFold(_UnsupportedGroupCVMixin, _RepeatedSplits): + """Repeated K-Fold cross validator. + + Repeats K-Fold `n_repeats` times with different randomization in each repetition. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_splits : int, default=5 + Number of folds. Must be at least 2. + + n_repeats : int, default=10 + Number of times cross-validator needs to be repeated. + + random_state : int, RandomState instance or None, default=None + Controls the randomness of each repeated cross-validation instance. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import RepeatedKFold + >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) + >>> y = np.array([0, 0, 1, 1]) + >>> rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=2652124) + >>> rkf.get_n_splits(X, y) + 4 + >>> print(rkf) + RepeatedKFold(n_repeats=2, n_splits=2, random_state=2652124) + >>> for i, (train_index, test_index) in enumerate(rkf.split(X)): + ... print(f"Fold {i}:") + ... 
print(f" Train: index={train_index}") + ... print(f" Test: index={test_index}") + ... + Fold 0: + Train: index=[0 1] + Test: index=[2 3] + Fold 1: + Train: index=[2 3] + Test: index=[0 1] + Fold 2: + Train: index=[1 2] + Test: index=[0 3] + Fold 3: + Train: index=[0 3] + Test: index=[1 2] + + Notes + ----- + Randomized CV splitters may return different results for each call of + split. You can make the results identical by setting `random_state` + to an integer. + + See Also + -------- + RepeatedStratifiedKFold : Repeats Stratified K-Fold n times. + """ + + def __init__(self, *, n_splits=5, n_repeats=10, random_state=None): + super().__init__( + KFold, n_repeats=n_repeats, random_state=random_state, n_splits=n_splits + ) + + +class RepeatedStratifiedKFold(_UnsupportedGroupCVMixin, _RepeatedSplits): + """Repeated class-wise stratified K-Fold cross validator. + + Repeats Stratified K-Fold n times with different randomization in each + repetition. + + Read more in the :ref:`User Guide `. + + .. note:: + + Stratification on the class label solves an engineering problem rather + than a statistical one. See :ref:`stratification` for more details. + + Parameters + ---------- + n_splits : int, default=5 + Number of folds. Must be at least 2. + + n_repeats : int, default=10 + Number of times cross-validator needs to be repeated. + + random_state : int, RandomState instance or None, default=None + Controls the generation of the random states for each repetition. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import RepeatedStratifiedKFold + >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) + >>> y = np.array([0, 0, 1, 1]) + >>> rskf = RepeatedStratifiedKFold(n_splits=2, n_repeats=2, + ... random_state=36851234) + >>> rskf.get_n_splits(X, y) + 4 + >>> print(rskf) + RepeatedStratifiedKFold(n_repeats=2, n_splits=2, random_state=36851234) + >>> for i, (train_index, test_index) in enumerate(rskf.split(X, y)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... print(f" Test: index={test_index}") + ... + Fold 0: + Train: index=[1 2] + Test: index=[0 3] + Fold 1: + Train: index=[0 3] + Test: index=[1 2] + Fold 2: + Train: index=[1 3] + Test: index=[0 2] + Fold 3: + Train: index=[0 2] + Test: index=[1 3] + + Notes + ----- + Randomized CV splitters may return different results for each call of + split. You can make the results identical by setting `random_state` + to an integer. + + See Also + -------- + RepeatedKFold : Repeats K-Fold n times. + """ + + def __init__(self, *, n_splits=5, n_repeats=10, random_state=None): + super().__init__( + StratifiedKFold, + n_repeats=n_repeats, + random_state=random_state, + n_splits=n_splits, + ) + + def split(self, X, y, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + Note that providing ``y`` is sufficient to generate the splits and + hence ``np.zeros(n_samples)`` may be used as a placeholder for + ``X`` instead of actual training data. + + y : array-like of shape (n_samples,) + The target variable for supervised learning problems. + Stratification is done based on the y labels. + + groups : object + Always ignored, exists for compatibility. 
+ + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + + Notes + ----- + Randomized CV splitters may return different results for each call of + split. You can make the results identical by setting `random_state` + to an integer. + """ + y = check_array(y, input_name="y", ensure_2d=False, dtype=None) + return super().split(X, y, groups=groups) + + +class BaseShuffleSplit(_MetadataRequester, metaclass=ABCMeta): + """Base class for *ShuffleSplit. + + Parameters + ---------- + n_splits : int, default=10 + Number of re-shuffling & splitting iterations. + + test_size : float or int, default=None + If float, should be between 0.0 and 1.0 and represent the proportion + of the dataset to include in the test split. If int, represents the + absolute number of test samples. If None, the value is set to the + complement of the train size. If ``train_size`` is also None, it will + be set to 0.1. + + train_size : float or int, default=None + If float, should be between 0.0 and 1.0 and represent the + proportion of the dataset to include in the train split. If + int, represents the absolute number of train samples. If None, + the value is automatically set to the complement of the test size. + + random_state : int, RandomState instance or None, default=None + Controls the randomness of the training and testing indices produced. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + """ + + # This indicates that by default CV splitters don't have a "groups" kwarg, + # unless indicated by inheriting from ``GroupsConsumerMixin``. + # This also prevents ``set_split_request`` to be generated for splitters + # which don't support ``groups``. + __metadata_request__split = {"groups": metadata_routing.UNUSED} + + def __init__( + self, n_splits=10, *, test_size=None, train_size=None, random_state=None + ): + self.n_splits = n_splits + self.test_size = test_size + self.train_size = train_size + self.random_state = random_state + self._default_test_size = 0.1 + + def split(self, X, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,) + The target variable for supervised learning problems. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + + Notes + ----- + Randomized CV splitters may return different results for each call of + split. You can make the results identical by setting `random_state` + to an integer. 
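+
+        Within a single split the train and test indices never overlap: each
+        iteration of ``_iter_indices`` draws one random permutation and takes
+        disjoint slices of it for the test and train sets.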
+ """ + X, y, groups = indexable(X, y, groups) + for train, test in self._iter_indices(X, y, groups): + yield train, test + + def _iter_indices(self, X, y=None, groups=None): + """Generate (train, test) indices""" + n_samples = _num_samples(X) + n_train, n_test = _validate_shuffle_split( + n_samples, + self.test_size, + self.train_size, + default_test_size=self._default_test_size, + ) + + rng = check_random_state(self.random_state) + for i in range(self.n_splits): + # random partition + permutation = rng.permutation(n_samples) + ind_test = permutation[:n_test] + ind_train = permutation[n_test : (n_test + n_train)] + yield ind_train, ind_test + + def get_n_splits(self, X=None, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator. + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + + y : object + Always ignored, exists for compatibility. + + groups : object + Always ignored, exists for compatibility. + + Returns + ------- + n_splits : int + Returns the number of splitting iterations in the cross-validator. + """ + return self.n_splits + + def __repr__(self): + return _build_repr(self) + + +class ShuffleSplit(_UnsupportedGroupCVMixin, BaseShuffleSplit): + """Random permutation cross-validator. + + Yields indices to split data into training and test sets. + + Note: contrary to other cross-validation strategies, random splits + do not guarantee that test sets across all folds will be mutually exclusive, + and might include overlapping samples. However, this is still very likely for + sizeable datasets. + + Read more in the :ref:`User Guide `. + + For visualisation of cross-validation behaviour and + comparison between common scikit-learn split methods + refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` + + Parameters + ---------- + n_splits : int, default=10 + Number of re-shuffling & splitting iterations. + + test_size : float or int, default=None + If float, should be between 0.0 and 1.0 and represent the proportion + of the dataset to include in the test split. If int, represents the + absolute number of test samples. If None, the value is set to the + complement of the train size. If ``train_size`` is also None, it will + be set to 0.1. + + train_size : float or int, default=None + If float, should be between 0.0 and 1.0 and represent the + proportion of the dataset to include in the train split. If + int, represents the absolute number of train samples. If None, + the value is automatically set to the complement of the test size. + + random_state : int, RandomState instance or None, default=None + Controls the randomness of the training and testing indices produced. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import ShuffleSplit + >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [3, 4], [5, 6]]) + >>> y = np.array([1, 2, 1, 2, 1, 2]) + >>> rs = ShuffleSplit(n_splits=5, test_size=.25, random_state=0) + >>> rs.get_n_splits(X) + 5 + >>> print(rs) + ShuffleSplit(n_splits=5, random_state=0, test_size=0.25, train_size=None) + >>> for i, (train_index, test_index) in enumerate(rs.split(X)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... 
print(f" Test: index={test_index}") + Fold 0: + Train: index=[1 3 0 4] + Test: index=[5 2] + Fold 1: + Train: index=[4 0 2 5] + Test: index=[1 3] + Fold 2: + Train: index=[1 2 4 0] + Test: index=[3 5] + Fold 3: + Train: index=[3 4 1 0] + Test: index=[5 2] + Fold 4: + Train: index=[3 5 1 0] + Test: index=[2 4] + >>> # Specify train and test size + >>> rs = ShuffleSplit(n_splits=5, train_size=0.5, test_size=.25, + ... random_state=0) + >>> for i, (train_index, test_index) in enumerate(rs.split(X)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... print(f" Test: index={test_index}") + Fold 0: + Train: index=[1 3 0] + Test: index=[5 2] + Fold 1: + Train: index=[4 0 2] + Test: index=[1 3] + Fold 2: + Train: index=[1 2 4] + Test: index=[3 5] + Fold 3: + Train: index=[3 4 1] + Test: index=[5 2] + Fold 4: + Train: index=[3 5 1] + Test: index=[2 4] + """ + + def __init__( + self, n_splits=10, *, test_size=None, train_size=None, random_state=None + ): + super().__init__( + n_splits=n_splits, + test_size=test_size, + train_size=train_size, + random_state=random_state, + ) + self._default_test_size = 0.1 + + +class GroupShuffleSplit(GroupsConsumerMixin, BaseShuffleSplit): + """Shuffle-Group(s)-Out cross-validation iterator. + + Provides randomized train/test indices to split data according to a + third-party provided group. This group information can be used to encode + arbitrary domain specific stratifications of the samples as integers. + + For instance the groups could be the year of collection of the samples + and thus allow for cross-validation against time-based splits. + + The difference between :class:`LeavePGroupsOut` and ``GroupShuffleSplit`` is that + the former generates splits using all subsets of size ``p`` unique groups, + whereas ``GroupShuffleSplit`` generates a user-determined number of random + test splits, each with a user-determined fraction of unique groups. + + For example, a less computationally intensive alternative to + ``LeavePGroupsOut(p=10)`` would be + ``GroupShuffleSplit(test_size=10, n_splits=100)``. + + Contrary to other cross-validation strategies, the random splits + do not guarantee that test sets across all folds will be mutually exclusive, + and might include overlapping samples. However, this is still very likely for + sizeable datasets. + + Note: The parameters ``test_size`` and ``train_size`` refer to groups, and + not to samples as in :class:`ShuffleSplit`. + + Read more in the :ref:`User Guide `. + + For visualisation of cross-validation behaviour and + comparison between common scikit-learn split methods + refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` + + Parameters + ---------- + n_splits : int, default=5 + Number of re-shuffling & splitting iterations. + + test_size : float, int, default=None + If float, should be between 0.0 and 1.0 and represent the proportion + of groups to include in the test split (rounded up). If int, + represents the absolute number of test groups. If None, the value is + set to the complement of the train size. If ``train_size`` is also None, + it will be set to 0.2. + + train_size : float or int, default=None + If float, should be between 0.0 and 1.0 and represent the + proportion of the groups to include in the train split. If + int, represents the absolute number of train groups. If None, + the value is automatically set to the complement of the test size. 
+ + random_state : int, RandomState instance or None, default=None + Controls the randomness of the training and testing indices produced. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import GroupShuffleSplit + >>> X = np.ones(shape=(8, 2)) + >>> y = np.ones(shape=(8, 1)) + >>> groups = np.array([1, 1, 2, 2, 2, 3, 3, 3]) + >>> print(groups.shape) + (8,) + >>> gss = GroupShuffleSplit(n_splits=2, train_size=.7, random_state=42) + >>> gss.get_n_splits() + 2 + >>> print(gss) + GroupShuffleSplit(n_splits=2, random_state=42, test_size=None, train_size=0.7) + >>> for i, (train_index, test_index) in enumerate(gss.split(X, y, groups)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}, group={groups[train_index]}") + ... print(f" Test: index={test_index}, group={groups[test_index]}") + Fold 0: + Train: index=[2 3 4 5 6 7], group=[2 2 2 3 3 3] + Test: index=[0 1], group=[1 1] + Fold 1: + Train: index=[0 1 5 6 7], group=[1 1 3 3 3] + Test: index=[2 3 4], group=[2 2 2] + + See Also + -------- + ShuffleSplit : Shuffles samples to create independent test/train sets. + + LeavePGroupsOut : Train set leaves out all possible subsets of `p` groups. + """ + + def __init__( + self, n_splits=5, *, test_size=None, train_size=None, random_state=None + ): + super().__init__( + n_splits=n_splits, + test_size=test_size, + train_size=train_size, + random_state=random_state, + ) + self._default_test_size = 0.2 + + def _iter_indices(self, X, y, groups): + if groups is None: + raise ValueError("The 'groups' parameter should not be None.") + groups = check_array(groups, input_name="groups", ensure_2d=False, dtype=None) + classes, group_indices = np.unique(groups, return_inverse=True) + for group_train, group_test in super()._iter_indices(X=classes): + # these are the indices of classes in the partition + # invert them into data indices + + train = np.flatnonzero(np.isin(group_indices, group_train)) + test = np.flatnonzero(np.isin(group_indices, group_test)) + + yield train, test + + def split(self, X, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,), default=None + The target variable for supervised learning problems. + + groups : array-like of shape (n_samples,) + Group labels for the samples used while splitting the dataset into + train/test set. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + + Notes + ----- + Randomized CV splitters may return different results for each call of + split. You can make the results identical by setting `random_state` + to an integer. + """ + return super().split(X, y, groups) + + +class StratifiedShuffleSplit(BaseShuffleSplit): + """Class-wise stratified ShuffleSplit cross-validator. + + Provides train/test indices to split data in train/test sets. + + This cross-validation object is a merge of :class:`StratifiedKFold` and + :class:`ShuffleSplit`, which returns stratified randomized folds. The folds + are made by preserving the percentage of samples for each class in `y` in a + binary or multiclass classification setting. 
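+    Because the per-class train and test counts must be integers, the class
+    proportions are preserved only approximately; each split allocates these
+    counts with ``_approximate_mode`` (see ``_iter_indices`` below).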
+ + Note: like the :class:`ShuffleSplit` strategy, stratified random splits + do not guarantee that test sets across all folds will be mutually exclusive, + and might include overlapping samples. However, this is still very likely for + sizeable datasets. + + Read more in the :ref:`User Guide `. + + For visualisation of cross-validation behaviour and + comparison between common scikit-learn split methods + refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` + + .. note:: + + Stratification on the class label solves an engineering problem rather + than a statistical one. See :ref:`stratification` for more details. + + Parameters + ---------- + n_splits : int, default=10 + Number of re-shuffling & splitting iterations. + + test_size : float or int, default=None + If float, should be between 0.0 and 1.0 and represent the proportion + of the dataset to include in the test split. If int, represents the + absolute number of test samples. If None, the value is set to the + complement of the train size. If ``train_size`` is also None, it will + be set to 0.1. + + train_size : float or int, default=None + If float, should be between 0.0 and 1.0 and represent the + proportion of the dataset to include in the train split. If + int, represents the absolute number of train samples. If None, + the value is automatically set to the complement of the test size. + + random_state : int, RandomState instance or None, default=None + Controls the randomness of the training and testing indices produced. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import StratifiedShuffleSplit + >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]]) + >>> y = np.array([0, 0, 0, 1, 1, 1]) + >>> sss = StratifiedShuffleSplit(n_splits=5, test_size=0.5, random_state=0) + >>> sss.get_n_splits(X, y) + 5 + >>> print(sss) + StratifiedShuffleSplit(n_splits=5, random_state=0, ...) + >>> for i, (train_index, test_index) in enumerate(sss.split(X, y)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... print(f" Test: index={test_index}") + Fold 0: + Train: index=[5 2 3] + Test: index=[4 1 0] + Fold 1: + Train: index=[5 1 4] + Test: index=[0 2 3] + Fold 2: + Train: index=[5 0 2] + Test: index=[4 3 1] + Fold 3: + Train: index=[4 1 0] + Test: index=[2 3 5] + Fold 4: + Train: index=[0 5 1] + Test: index=[3 4 2] + """ + + def __init__( + self, n_splits=10, *, test_size=None, train_size=None, random_state=None + ): + super().__init__( + n_splits=n_splits, + test_size=test_size, + train_size=train_size, + random_state=random_state, + ) + self._default_test_size = 0.1 + + def _iter_indices(self, X, y, groups=None): + n_samples = _num_samples(X) + y = check_array(y, input_name="y", ensure_2d=False, dtype=None) + n_train, n_test = _validate_shuffle_split( + n_samples, + self.test_size, + self.train_size, + default_test_size=self._default_test_size, + ) + + # Convert to numpy as not all operations are supported by the Array API. 
+ # `y` is probably never a very large array, which means that converting it + # should be cheap + xp, _ = get_namespace(y) + y = _convert_to_numpy(y, xp=xp) + + if y.ndim == 2: + # for multi-label y, map each distinct row to a string repr + # using join because str(row) uses an ellipsis if len(row) > 1000 + y = np.array([" ".join(row.astype("str")) for row in y]) + + classes, y_indices = np.unique(y, return_inverse=True) + n_classes = classes.shape[0] + + class_counts = np.bincount(y_indices) + if np.min(class_counts) < 2: + raise ValueError( + "The least populated class in y has only 1" + " member, which is too few. The minimum" + " number of groups for any class cannot" + " be less than 2." + ) + + if n_train < n_classes: + raise ValueError( + "The train_size = %d should be greater or " + "equal to the number of classes = %d" % (n_train, n_classes) + ) + if n_test < n_classes: + raise ValueError( + "The test_size = %d should be greater or " + "equal to the number of classes = %d" % (n_test, n_classes) + ) + + # Find the sorted list of instances for each class: + # (np.unique above performs a sort, so code is O(n logn) already) + class_indices = np.split( + np.argsort(y_indices, kind="mergesort"), np.cumsum(class_counts)[:-1] + ) + + rng = check_random_state(self.random_state) + + for _ in range(self.n_splits): + # if there are ties in the class-counts, we want + # to make sure to break them anew in each iteration + n_i = _approximate_mode(class_counts, n_train, rng) + class_counts_remaining = class_counts - n_i + t_i = _approximate_mode(class_counts_remaining, n_test, rng) + + train = [] + test = [] + + for i in range(n_classes): + permutation = rng.permutation(class_counts[i]) + perm_indices_class_i = class_indices[i].take(permutation, mode="clip") + + train.extend(perm_indices_class_i[: n_i[i]]) + test.extend(perm_indices_class_i[n_i[i] : n_i[i] + t_i[i]]) + + train = rng.permutation(train) + test = rng.permutation(test) + + yield train, test + + def split(self, X, y, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + Note that providing ``y`` is sufficient to generate the splits and + hence ``np.zeros(n_samples)`` may be used as a placeholder for + ``X`` instead of actual training data. + + y : array-like of shape (n_samples,) or (n_samples, n_labels) + The target variable for supervised learning problems. + Stratification is done based on the y labels. + + groups : object + Always ignored, exists for compatibility. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + + Notes + ----- + Randomized CV splitters may return different results for each call of + split. You can make the results identical by setting `random_state` + to an integer. + """ + if groups is not None: + warnings.warn( + f"The groups parameter is ignored by {self.__class__.__name__}", + UserWarning, + ) + y = check_array(y, input_name="y", ensure_2d=False, dtype=None) + return super().split(X, y, groups) + + +def _validate_shuffle_split(n_samples, test_size, train_size, default_test_size=None): + """ + Validation helper to check if the train/test sizes are meaningful w.r.t. the + size of the data (n_samples). 
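+
+    For example (illustrative values): with ``n_samples=10``, ``test_size=0.25``
+    and ``train_size=None``, this resolves to ``n_test = ceil(0.25 * 10) = 3``
+    and ``n_train = 10 - 3 = 7`` (the complement of the test size).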
+ """ + if test_size is None and train_size is None: + test_size = default_test_size + + test_size_type = np.asarray(test_size).dtype.kind + train_size_type = np.asarray(train_size).dtype.kind + + if (test_size_type == "i" and (test_size >= n_samples or test_size <= 0)) or ( + test_size_type == "f" and (test_size <= 0 or test_size >= 1) + ): + raise ValueError( + "test_size={0} should be either positive and smaller" + " than the number of samples {1} or a float in the " + "(0, 1) range".format(test_size, n_samples) + ) + + if (train_size_type == "i" and (train_size >= n_samples or train_size <= 0)) or ( + train_size_type == "f" and (train_size <= 0 or train_size >= 1) + ): + raise ValueError( + "train_size={0} should be either positive and smaller" + " than the number of samples {1} or a float in the " + "(0, 1) range".format(train_size, n_samples) + ) + + if train_size is not None and train_size_type not in ("i", "f"): + raise ValueError("Invalid value for train_size: {}".format(train_size)) + if test_size is not None and test_size_type not in ("i", "f"): + raise ValueError("Invalid value for test_size: {}".format(test_size)) + + if train_size_type == "f" and test_size_type == "f" and train_size + test_size > 1: + raise ValueError( + "The sum of test_size and train_size = {}, should be in the (0, 1)" + " range. Reduce test_size and/or train_size.".format(train_size + test_size) + ) + + if test_size_type == "f": + n_test = ceil(test_size * n_samples) + elif test_size_type == "i": + n_test = float(test_size) + + if train_size_type == "f": + n_train = floor(train_size * n_samples) + elif train_size_type == "i": + n_train = float(train_size) + + if train_size is None: + n_train = n_samples - n_test + elif test_size is None: + n_test = n_samples - n_train + + if n_train + n_test > n_samples: + raise ValueError( + "The sum of train_size and test_size = %d, " + "should be smaller than the number of " + "samples %d. Reduce test_size and/or " + "train_size." % (n_train + n_test, n_samples) + ) + + n_train, n_test = int(n_train), int(n_test) + + if n_train == 0: + raise ValueError( + "With n_samples={}, test_size={} and train_size={}, the " + "resulting train set will be empty. Adjust any of the " + "aforementioned parameters.".format(n_samples, test_size, train_size) + ) + + return n_train, n_test + + +class PredefinedSplit(BaseCrossValidator): + """Predefined split cross-validator. + + Provides train/test indices to split data into train/test sets using a + predefined scheme specified by the user with the ``test_fold`` parameter. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.16 + + Parameters + ---------- + test_fold : array-like of shape (n_samples,) + The entry ``test_fold[i]`` represents the index of the test set that + sample ``i`` belongs to. It is possible to exclude sample ``i`` from + any test set (i.e. include sample ``i`` in every training set) by + setting ``test_fold[i]`` equal to -1. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import PredefinedSplit + >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) + >>> y = np.array([0, 0, 1, 1]) + >>> test_fold = [0, 1, -1, 1] + >>> ps = PredefinedSplit(test_fold) + >>> ps.get_n_splits() + 2 + >>> print(ps) + PredefinedSplit(test_fold=array([ 0, 1, -1, 1])) + >>> for i, (train_index, test_index) in enumerate(ps.split()): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... 
print(f" Test: index={test_index}") + Fold 0: + Train: index=[1 2 3] + Test: index=[0] + Fold 1: + Train: index=[0 2] + Test: index=[1 3] + """ + + def __init__(self, test_fold): + self.test_fold = np.array(test_fold, dtype=int) + self.test_fold = column_or_1d(self.test_fold) + self.unique_folds = np.unique(self.test_fold) + self.unique_folds = self.unique_folds[self.unique_folds != -1] + + def split(self, X=None, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + + y : object + Always ignored, exists for compatibility. + + groups : object + Always ignored, exists for compatibility. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + if groups is not None: + warnings.warn( + f"The groups parameter is ignored by {self.__class__.__name__}", + UserWarning, + ) + return self._split() + + def _split(self): + """Generate indices to split data into training and test set. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + ind = np.arange(len(self.test_fold)) + for test_index in self._iter_test_masks(): + train_index = ind[np.logical_not(test_index)] + test_index = ind[test_index] + yield train_index, test_index + + def _iter_test_masks(self): + """Generates boolean masks corresponding to test sets.""" + for f in self.unique_folds: + test_index = np.where(self.test_fold == f)[0] + test_mask = np.zeros(len(self.test_fold), dtype=bool) + test_mask[test_index] = True + yield test_mask + + def get_n_splits(self, X=None, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator. + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + + y : object + Always ignored, exists for compatibility. + + groups : object + Always ignored, exists for compatibility. + + Returns + ------- + n_splits : int + Returns the number of splitting iterations in the cross-validator. + """ + return len(self.unique_folds) + + +class _CVIterableWrapper(BaseCrossValidator): + """Wrapper class for old style cv objects and iterables.""" + + def __init__(self, cv): + self.cv = list(cv) + + def get_n_splits(self, X=None, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator. + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + + y : object + Always ignored, exists for compatibility. + + groups : object + Always ignored, exists for compatibility. + + Returns + ------- + n_splits : int + Returns the number of splitting iterations in the cross-validator. + """ + return len(self.cv) + + def split(self, X=None, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + + y : object + Always ignored, exists for compatibility. + + groups : object + Always ignored, exists for compatibility. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + for train, test in self.cv: + yield train, test + + +def check_cv(cv=5, y=None, *, classifier=False): + """Input checker utility for building a cross-validator. 
+ + Parameters + ---------- + cv : int, cross-validation generator, iterable or None, default=5 + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + - None, to use the default 5-fold cross validation, + - integer, to specify the number of folds. + - :term:`CV splitter`, + - An iterable that generates (train, test) splits as arrays of indices. + + For integer/None inputs, if classifier is True and ``y`` is either + binary or multiclass, :class:`StratifiedKFold` is used. In all other + cases, :class:`KFold` is used. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + ``cv`` default value changed from 3-fold to 5-fold. + + y : array-like, default=None + The target variable for supervised learning problems. + + classifier : bool, default=False + Whether the task is a classification task, in which case + stratified KFold will be used. + + Returns + ------- + checked_cv : a cross-validator instance. + The return value is a cross-validator which generates the train/test + splits via the ``split`` method. + + Examples + -------- + >>> from sklearn.model_selection import check_cv + >>> check_cv(cv=5, y=None, classifier=False) + KFold(...) + >>> check_cv(cv=5, y=[1, 1, 0, 0, 0, 0], classifier=True) + StratifiedKFold(...) + """ + cv = 5 if cv is None else cv + if isinstance(cv, numbers.Integral): + if ( + classifier + and (y is not None) + and (type_of_target(y, input_name="y") in ("binary", "multiclass")) + ): + return StratifiedKFold(cv) + else: + return KFold(cv) + + if not hasattr(cv, "split") or isinstance(cv, str): + if not isinstance(cv, Iterable) or isinstance(cv, str): + raise ValueError( + "Expected cv as an integer, cross-validation " + "object (from sklearn.model_selection) " + "or an iterable. Got %s." % cv + ) + return _CVIterableWrapper(cv) + + return cv # New style cv objects are passed without any modification + + +@validate_params( + { + "test_size": [ + Interval(RealNotInt, 0, 1, closed="neither"), + Interval(numbers.Integral, 1, None, closed="left"), + None, + ], + "train_size": [ + Interval(RealNotInt, 0, 1, closed="neither"), + Interval(numbers.Integral, 1, None, closed="left"), + None, + ], + "random_state": ["random_state"], + "shuffle": ["boolean"], + "stratify": ["array-like", None], + }, + prefer_skip_nested_validation=True, +) +def train_test_split( + *arrays, + test_size=None, + train_size=None, + random_state=None, + shuffle=True, + stratify=None, +): + """Split arrays or matrices into random train and test subsets. + + Quick utility that wraps input validation, + ``next(ShuffleSplit().split(X, y))``, and application to input data + into a single call for splitting (and optionally subsampling) data into a + one-liner. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + *arrays : sequence of indexables with same length / shape[0] + Allowed inputs are lists, numpy arrays, scipy-sparse + matrices or pandas dataframes. + + test_size : float or int, default=None + If float, should be between 0.0 and 1.0 and represent the proportion + of the dataset to include in the test split. If int, represents the + absolute number of test samples. If None, the value is set to the + complement of the train size. If ``train_size`` is also None, it will + be set to 0.25. + + train_size : float or int, default=None + If float, should be between 0.0 and 1.0 and represent the + proportion of the dataset to include in the train split. 
If + int, represents the absolute number of train samples. If None, + the value is automatically set to the complement of the test size. + + random_state : int, RandomState instance or None, default=None + Controls the shuffling applied to the data before applying the split. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + shuffle : bool, default=True + Whether or not to shuffle the data before splitting. If shuffle=False + then stratify must be None. + + stratify : array-like, default=None + If not None, data is split in a stratified fashion, using this as + the class labels. + Read more in the :ref:`User Guide `. + + Returns + ------- + splitting : list, length=2 * len(arrays) + List containing train-test split of inputs. + + .. versionadded:: 0.16 + If the input is sparse, the output will be a + ``scipy.sparse.csr_matrix``. Else, output type is the same as the + input type. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import train_test_split + >>> X, y = np.arange(10).reshape((5, 2)), range(5) + >>> X + array([[0, 1], + [2, 3], + [4, 5], + [6, 7], + [8, 9]]) + >>> list(y) + [0, 1, 2, 3, 4] + + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, test_size=0.33, random_state=42) + ... + >>> X_train + array([[4, 5], + [0, 1], + [6, 7]]) + >>> y_train + [2, 0, 3] + >>> X_test + array([[2, 3], + [8, 9]]) + >>> y_test + [1, 4] + + >>> train_test_split(y, shuffle=False) + [[0, 1, 2], [3, 4]] + + >>> from sklearn import datasets + >>> iris = datasets.load_iris(as_frame=True) + >>> X, y = iris['data'], iris['target'] + >>> X.head() + sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) + 0 5.1 3.5 1.4 0.2 + 1 4.9 3.0 1.4 0.2 + 2 4.7 3.2 1.3 0.2 + 3 4.6 3.1 1.5 0.2 + 4 5.0 3.6 1.4 0.2 + >>> y.head() + 0 0 + 1 0 + 2 0 + 3 0 + 4 0 + ... + + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, test_size=0.33, random_state=42) + ... + >>> X_train.head() + sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) + 96 5.7 2.9 4.2 1.3 + 105 7.6 3.0 6.6 2.1 + 66 5.6 3.0 4.5 1.5 + 0 5.1 3.5 1.4 0.2 + 122 7.7 2.8 6.7 2.0 + >>> y_train.head() + 96 1 + 105 2 + 66 1 + 0 0 + 122 2 + ... + >>> X_test.head() + sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) + 73 6.1 2.8 4.7 1.2 + 18 5.7 3.8 1.7 0.3 + 118 7.7 2.6 6.9 2.3 + 78 6.0 2.9 4.5 1.5 + 76 6.8 2.8 4.8 1.4 + >>> y_test.head() + 73 1 + 18 0 + 118 2 + 78 1 + 76 1 + ... + """ + n_arrays = len(arrays) + if n_arrays == 0: + raise ValueError("At least one array required as input") + + arrays = indexable(*arrays) + + n_samples = _num_samples(arrays[0]) + n_train, n_test = _validate_shuffle_split( + n_samples, test_size, train_size, default_test_size=0.25 + ) + + if shuffle is False: + if stratify is not None: + raise ValueError( + "Stratified train/test split is not implemented for shuffle=False" + ) + + train = np.arange(n_train) + test = np.arange(n_train, n_train + n_test) + + else: + if stratify is not None: + CVClass = StratifiedShuffleSplit + else: + CVClass = ShuffleSplit + + cv = CVClass(test_size=n_test, train_size=n_train, random_state=random_state) + + train, test = next(cv.split(X=arrays[0], y=stratify)) + + train, test = ensure_common_namespace_device(arrays[0], train, test) + + return list( + chain.from_iterable( + (_safe_indexing(a, train), _safe_indexing(a, test)) for a in arrays + ) + ) + + +# Tell nose that train_test_split is not a test. 
+# (Needed for external libraries that may use nose.) +# Use setattr to avoid mypy errors when monkeypatching. +setattr(train_test_split, "__test__", False) + + +def _pprint(params, offset=0, printer=repr): + """Pretty print the dictionary 'params' + + Parameters + ---------- + params : dict + The dictionary to pretty print + + offset : int, default=0 + The offset in characters to add at the begin of each line. + + printer : callable, default=repr + The function to convert entries to strings, typically + the builtin str or repr + + """ + # Do a multi-line justified repr: + options = np.get_printoptions() + np.set_printoptions(precision=5, threshold=64, edgeitems=2) + params_list = list() + this_line_length = offset + line_sep = ",\n" + (1 + offset // 2) * " " + for i, (k, v) in enumerate(sorted(params.items())): + if isinstance(v, float): + # use str for representing floating point numbers + # this way we get consistent representation across + # architectures and versions. + this_repr = "%s=%s" % (k, str(v)) + else: + # use repr of the rest + this_repr = "%s=%s" % (k, printer(v)) + if len(this_repr) > 500: + this_repr = this_repr[:300] + "..." + this_repr[-100:] + if i > 0: + if this_line_length + len(this_repr) >= 75 or "\n" in this_repr: + params_list.append(line_sep) + this_line_length = len(line_sep) + else: + params_list.append(", ") + this_line_length += 2 + params_list.append(this_repr) + this_line_length += len(this_repr) + + np.set_printoptions(**options) + lines = "".join(params_list) + # Strip trailing space to avoid nightmare in doctests + lines = "\n".join(l.rstrip(" ") for l in lines.split("\n")) + return lines + + +def _build_repr(self): + # XXX This is copied from BaseEstimator's get_params + cls = self.__class__ + init = getattr(cls.__init__, "deprecated_original", cls.__init__) + # Ignore varargs, kw and default values and pop self + init_signature = signature(init) + # Consider the constructor parameters excluding 'self' + if init is object.__init__: + args = [] + else: + args = sorted( + [ + p.name + for p in init_signature.parameters.values() + if p.name != "self" and p.kind != p.VAR_KEYWORD + ] + ) + class_name = self.__class__.__name__ + params = dict() + for key in args: + # We need deprecation warnings to always be on in order to + # catch deprecated param values. + # This is set in utils/__init__.py but it gets overwritten + # when running under python3 somehow. + warnings.simplefilter("always", FutureWarning) + try: + with warnings.catch_warnings(record=True) as w: + value = getattr(self, key, None) + if value is None and hasattr(self, "cvargs"): + value = self.cvargs.get(key, None) + if len(w) and w[0].category is FutureWarning: + # if the parameter is deprecated, don't show it + continue + finally: + warnings.filters.pop(0) + params[key] = value + + return "%s(%s)" % (class_name, _pprint(params, offset=len(class_name))) + + +def _yields_constant_splits(cv): + # Return True if calling cv.split() always returns the same splits + # We assume that if a cv doesn't have a shuffle parameter, it shuffles by + # default (e.g. ShuffleSplit). If it actually doesn't shuffle (e.g. 
+ # LeaveOneOut), then it won't have a random_state parameter anyway, in + # which case it will default to 0, leading to output=True + shuffle = getattr(cv, "shuffle", True) + random_state = getattr(cv, "random_state", 0) + return isinstance(random_state, numbers.Integral) or not shuffle diff --git a/.venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py b/.venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py new file mode 100644 index 0000000000000000000000000000000000000000..c5a1406e6c2a50e70e366be0fd199795eeb60417 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py @@ -0,0 +1,2530 @@ +""" +The :mod:`sklearn.model_selection._validation` module includes classes and +functions to validate the model. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numbers +import time +import warnings +from collections import Counter +from contextlib import suppress +from functools import partial +from numbers import Real +from traceback import format_exc + +import numpy as np +import scipy.sparse as sp +from joblib import logger + +from ..base import clone, is_classifier +from ..exceptions import FitFailedWarning, UnsetMetadataPassedError +from ..metrics import check_scoring, get_scorer_names +from ..metrics._scorer import _MultimetricScorer +from ..preprocessing import LabelEncoder +from ..utils import Bunch, _safe_indexing, check_random_state, indexable +from ..utils._array_api import device, get_namespace +from ..utils._param_validation import ( + HasMethods, + Integral, + Interval, + StrOptions, + validate_params, +) +from ..utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + _routing_enabled, + process_routing, +) +from ..utils.metaestimators import _safe_split +from ..utils.parallel import Parallel, delayed +from ..utils.validation import _check_method_params, _num_samples +from ._split import check_cv + +__all__ = [ + "cross_val_predict", + "cross_val_score", + "cross_validate", + "learning_curve", + "permutation_test_score", + "validation_curve", +] + + +def _check_params_groups_deprecation(fit_params, params, groups, version): + """A helper function to check deprecations on `groups` and `fit_params`. + + # TODO(SLEP6): To be removed when set_config(enable_metadata_routing=False) is not + # possible. + """ + if params is not None and fit_params is not None: + raise ValueError( + "`params` and `fit_params` cannot both be provided. Pass parameters " + "via `params`. `fit_params` is deprecated and will be removed in " + f"version {version}." + ) + elif fit_params is not None: + warnings.warn( + ( + "`fit_params` is deprecated and will be removed in version {version}. " + "Pass parameters via `params` instead." + ), + FutureWarning, + ) + params = fit_params + + params = {} if params is None else params + + _check_groups_routing_disabled(groups) + + return params + + +# TODO(SLEP6): To be removed when set_config(enable_metadata_routing=False) is not +# possible. +def _check_groups_routing_disabled(groups): + if groups is not None and _routing_enabled(): + raise ValueError( + "`groups` can only be passed if metadata routing is not enabled via" + " `sklearn.set_config(enable_metadata_routing=True)`. When routing is" + " enabled, pass `groups` alongside other metadata via the `params` argument" + " instead." 
+ ) + + +@validate_params( + { + "estimator": [HasMethods("fit")], + "X": ["array-like", "sparse matrix"], + "y": ["array-like", None], + "groups": ["array-like", None], + "scoring": [ + StrOptions(set(get_scorer_names())), + callable, + list, + tuple, + dict, + None, + ], + "cv": ["cv_object"], + "n_jobs": [Integral, None], + "verbose": ["verbose"], + "params": [dict, None], + "pre_dispatch": [Integral, str], + "return_train_score": ["boolean"], + "return_estimator": ["boolean"], + "return_indices": ["boolean"], + "error_score": [StrOptions({"raise"}), Real], + }, + prefer_skip_nested_validation=False, # estimator is not validated yet +) +def cross_validate( + estimator, + X, + y=None, + *, + groups=None, + scoring=None, + cv=None, + n_jobs=None, + verbose=0, + params=None, + pre_dispatch="2*n_jobs", + return_train_score=False, + return_estimator=False, + return_indices=False, + error_score=np.nan, +): + """Evaluate metric(s) by cross-validation and also record fit/score times. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimator : estimator object implementing 'fit' + The object to use to fit the data. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to fit. Can be for example a list, or an array. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None + The target variable to try to predict in the case of + supervised learning. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. Only used in conjunction with a "Group" :term:`cv` + instance (e.g., :class:`GroupKFold`). + + .. versionchanged:: 1.4 + ``groups`` can only be passed if metadata routing is not enabled + via ``sklearn.set_config(enable_metadata_routing=True)``. When routing + is enabled, pass ``groups`` alongside other metadata via the ``params`` + argument instead. E.g.: + ``cross_validate(..., params={'groups': groups})``. + + scoring : str, callable, list, tuple, or dict, default=None + Strategy to evaluate the performance of the `estimator` across cross-validation + splits. + + If `scoring` represents a single score, one can use: + + - a single string (see :ref:`scoring_string_names`); + - a callable (see :ref:`scoring_callable`) that returns a single value. + - `None`, the `estimator`'s + :ref:`default evaluation criterion ` is used. + + If `scoring` represents multiple scores, one can use: + + - a list or tuple of unique strings; + - a callable returning a dictionary where the keys are the metric + names and the values are the metric scores; + - a dictionary with metric names as keys and callables a values. + + See :ref:`multimetric_grid_search` for an example. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross validation, + - int, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For int/None inputs, if the estimator is a classifier and ``y`` is + either binary or multiclass, :class:`StratifiedKFold` is used. In all + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. 
versionchanged:: 0.22 + ``cv`` default value if None changed from 3-fold to 5-fold. + + n_jobs : int, default=None + Number of jobs to run in parallel. Training the estimator and computing + the score are parallelized over the cross-validation splits. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + verbose : int, default=0 + The verbosity level. + + params : dict, default=None + Parameters to pass to the underlying estimator's ``fit``, the scorer, + and the CV splitter. + + .. versionadded:: 1.4 + + pre_dispatch : int or str, default='2*n_jobs' + Controls the number of jobs that get dispatched during parallel + execution. Reducing this number can be useful to avoid an + explosion of memory consumption when more jobs get dispatched + than CPUs can process. This parameter can be: + + - An int, giving the exact number of total jobs that are spawned + - A str, giving an expression as a function of n_jobs, as in '2*n_jobs' + + return_train_score : bool, default=False + Whether to include train scores. + Computing training scores is used to get insights on how different + parameter settings impact the overfitting/underfitting trade-off. + However computing the scores on the training set can be computationally + expensive and is not strictly required to select the parameters that + yield the best generalization performance. + + .. versionadded:: 0.19 + + .. versionchanged:: 0.21 + Default value was changed from ``True`` to ``False`` + + return_estimator : bool, default=False + Whether to return the estimators fitted on each split. + + .. versionadded:: 0.20 + + return_indices : bool, default=False + Whether to return the train-test indices selected for each split. + + .. versionadded:: 1.3 + + error_score : 'raise' or numeric, default=np.nan + Value to assign to the score if an error occurs in estimator fitting. + If set to 'raise', the error is raised. + If a numeric value is given, FitFailedWarning is raised. + + .. versionadded:: 0.20 + + Returns + ------- + scores : dict of float arrays of shape (n_splits,) + Array of scores of the estimator for each run of the cross validation. + + A dict of arrays containing the score/time arrays for each scorer is + returned. The possible keys for this ``dict`` are: + + ``test_score`` + The score array for test scores on each cv split. + Suffix ``_score`` in ``test_score`` changes to a specific + metric like ``test_r2`` or ``test_auc`` if there are + multiple scoring metrics in the scoring parameter. + ``train_score`` + The score array for train scores on each cv split. + Suffix ``_score`` in ``train_score`` changes to a specific + metric like ``train_r2`` or ``train_auc`` if there are + multiple scoring metrics in the scoring parameter. + This is available only if ``return_train_score`` parameter + is ``True``. + ``fit_time`` + The time for fitting the estimator on the train + set for each cv split. + ``score_time`` + The time for scoring the estimator on the test set for each + cv split. (Note: time for scoring on the train set is not + included even if ``return_train_score`` is set to ``True``). + ``estimator`` + The estimator objects for each cv split. + This is available only if ``return_estimator`` parameter + is set to ``True``. + ``indices`` + The train/test positional indices for each cv split. 
A dictionary + is returned where the keys are either `"train"` or `"test"` + and the associated values are a list of integer-dtyped NumPy + arrays with the indices. Available only if `return_indices=True`. + + See Also + -------- + cross_val_score : Run cross-validation for single metric evaluation. + + cross_val_predict : Get predictions from each split of cross-validation for + diagnostic purposes. + + sklearn.metrics.make_scorer : Make a scorer from a performance metric or + loss function. + + Examples + -------- + >>> from sklearn import datasets, linear_model + >>> from sklearn.model_selection import cross_validate + >>> from sklearn.metrics import make_scorer + >>> from sklearn.metrics import confusion_matrix + >>> from sklearn.svm import LinearSVC + >>> diabetes = datasets.load_diabetes() + >>> X = diabetes.data[:150] + >>> y = diabetes.target[:150] + >>> lasso = linear_model.Lasso() + + Single metric evaluation using ``cross_validate`` + + >>> cv_results = cross_validate(lasso, X, y, cv=3) + >>> sorted(cv_results.keys()) + ['fit_time', 'score_time', 'test_score'] + >>> cv_results['test_score'] + array([0.3315057 , 0.08022103, 0.03531816]) + + Multiple metric evaluation using ``cross_validate`` + (please refer the ``scoring`` parameter doc for more information) + + >>> scores = cross_validate(lasso, X, y, cv=3, + ... scoring=('r2', 'neg_mean_squared_error'), + ... return_train_score=True) + >>> print(scores['test_neg_mean_squared_error']) + [-3635.5 -3573.3 -6114.7] + >>> print(scores['train_r2']) + [0.28009951 0.3908844 0.22784907] + """ + _check_groups_routing_disabled(groups) + + X, y = indexable(X, y) + params = {} if params is None else params + cv = check_cv(cv, y, classifier=is_classifier(estimator)) + + scorers = check_scoring( + estimator, scoring=scoring, raise_exc=(error_score == "raise") + ) + + if _routing_enabled(): + # For estimators, a MetadataRouter is created in get_metadata_routing + # methods. For these router methods, we create the router to use + # `process_routing` on it. + router = ( + MetadataRouter(owner="cross_validate") + .add( + splitter=cv, + method_mapping=MethodMapping().add(caller="fit", callee="split"), + ) + .add( + estimator=estimator, + # TODO(SLEP6): also pass metadata to the predict method for + # scoring? + method_mapping=MethodMapping().add(caller="fit", callee="fit"), + ) + .add( + scorer=scorers, + method_mapping=MethodMapping().add(caller="fit", callee="score"), + ) + ) + try: + routed_params = process_routing(router, "fit", **params) + except UnsetMetadataPassedError as e: + # The default exception would mention `fit` since in the above + # `process_routing` code, we pass `fit` as the caller. However, + # the user is not calling `fit` directly, so we change the message + # to make it more suitable for this case. + raise UnsetMetadataPassedError( + message=str(e).replace("cross_validate.fit", "cross_validate"), + unrequested_params=e.unrequested_params, + routed_params=e.routed_params, + ) + else: + routed_params = Bunch() + routed_params.splitter = Bunch(split={"groups": groups}) + routed_params.estimator = Bunch(fit=params) + routed_params.scorer = Bunch(score={}) + + indices = cv.split(X, y, **routed_params.splitter.split) + if return_indices: + # materialize the indices since we need to store them in the returned dict + indices = list(indices) + + # We clone the estimator to make sure that all the folds are + # independent, and that it is pickle-able. 
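+    # Each (train, test) pair is dispatched to ``_fit_and_score`` on a fresh
+    # clone; the per-split result dicts are then merged by
+    # ``_aggregate_score_dicts`` and exposed as arrays keyed ``fit_time``,
+    # ``score_time`` and ``test_<metric>`` (plus ``train_<metric>`` when
+    # ``return_train_score=True``).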
+ parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch) + results = parallel( + delayed(_fit_and_score)( + clone(estimator), + X, + y, + scorer=scorers, + train=train, + test=test, + verbose=verbose, + parameters=None, + fit_params=routed_params.estimator.fit, + score_params=routed_params.scorer.score, + return_train_score=return_train_score, + return_times=True, + return_estimator=return_estimator, + error_score=error_score, + ) + for train, test in indices + ) + + _warn_or_raise_about_fit_failures(results, error_score) + + # For callable scoring, the return type is only know after calling. If the + # return type is a dictionary, the error scores can now be inserted with + # the correct key. + if callable(scoring): + _insert_error_scores(results, error_score) + + results = _aggregate_score_dicts(results) + + ret = {} + ret["fit_time"] = results["fit_time"] + ret["score_time"] = results["score_time"] + + if return_estimator: + ret["estimator"] = results["estimator"] + + if return_indices: + ret["indices"] = {} + ret["indices"]["train"], ret["indices"]["test"] = zip(*indices) + + test_scores_dict = _normalize_score_results(results["test_scores"]) + if return_train_score: + train_scores_dict = _normalize_score_results(results["train_scores"]) + + for name in test_scores_dict: + ret["test_%s" % name] = test_scores_dict[name] + if return_train_score: + key = "train_%s" % name + ret[key] = train_scores_dict[name] + + return ret + + +def _insert_error_scores(results, error_score): + """Insert error in `results` by replacing them inplace with `error_score`. + + This only applies to multimetric scores because `_fit_and_score` will + handle the single metric case. + """ + successful_score = None + failed_indices = [] + for i, result in enumerate(results): + if result["fit_error"] is not None: + failed_indices.append(i) + elif successful_score is None: + successful_score = result["test_scores"] + + if isinstance(successful_score, dict): + formatted_error = {name: error_score for name in successful_score} + for i in failed_indices: + results[i]["test_scores"] = formatted_error.copy() + if "train_scores" in results[i]: + results[i]["train_scores"] = formatted_error.copy() + + +def _normalize_score_results(scores, scaler_score_key="score"): + """Creates a scoring dictionary based on the type of `scores`""" + if isinstance(scores[0], dict): + # multimetric scoring + return _aggregate_score_dicts(scores) + # scaler + return {scaler_score_key: scores} + + +def _warn_or_raise_about_fit_failures(results, error_score): + fit_errors = [ + result["fit_error"] for result in results if result["fit_error"] is not None + ] + if fit_errors: + num_failed_fits = len(fit_errors) + num_fits = len(results) + fit_errors_counter = Counter(fit_errors) + delimiter = "-" * 80 + "\n" + fit_errors_summary = "\n".join( + f"{delimiter}{n} fits failed with the following error:\n{error}" + for error, n in fit_errors_counter.items() + ) + + if num_failed_fits == num_fits: + all_fits_failed_message = ( + f"\nAll the {num_fits} fits failed.\n" + "It is very likely that your model is misconfigured.\n" + "You can try to debug the error by setting error_score='raise'.\n\n" + f"Below are more details about the failures:\n{fit_errors_summary}" + ) + raise ValueError(all_fits_failed_message) + + else: + some_fits_failed_message = ( + f"\n{num_failed_fits} fits failed out of a total of {num_fits}.\n" + "The score on these train-test partitions for these parameters" + f" will be set to {error_score}.\n" + "If these 
failures are not expected, you can try to debug them " + "by setting error_score='raise'.\n\n" + f"Below are more details about the failures:\n{fit_errors_summary}" + ) + warnings.warn(some_fits_failed_message, FitFailedWarning) + + +@validate_params( + { + "estimator": [HasMethods("fit")], + "X": ["array-like", "sparse matrix"], + "y": ["array-like", None], + "groups": ["array-like", None], + "scoring": [StrOptions(set(get_scorer_names())), callable, None], + "cv": ["cv_object"], + "n_jobs": [Integral, None], + "verbose": ["verbose"], + "params": [dict, None], + "pre_dispatch": [Integral, str, None], + "error_score": [StrOptions({"raise"}), Real], + }, + prefer_skip_nested_validation=False, # estimator is not validated yet +) +def cross_val_score( + estimator, + X, + y=None, + *, + groups=None, + scoring=None, + cv=None, + n_jobs=None, + verbose=0, + params=None, + pre_dispatch="2*n_jobs", + error_score=np.nan, +): + """Evaluate a score by cross-validation. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimator : estimator object implementing 'fit' + The object to use to fit the data. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to fit. Can be for example a list, or an array. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs), \ + default=None + The target variable to try to predict in the case of + supervised learning. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. Only used in conjunction with a "Group" :term:`cv` + instance (e.g., :class:`GroupKFold`). + + .. versionchanged:: 1.4 + ``groups`` can only be passed if metadata routing is not enabled + via ``sklearn.set_config(enable_metadata_routing=True)``. When routing + is enabled, pass ``groups`` alongside other metadata via the ``params`` + argument instead. E.g.: + ``cross_val_score(..., params={'groups': groups})``. + + scoring : str or callable, default=None + Strategy to evaluate the performance of the `estimator` across cross-validation + splits. + + - str: see :ref:`scoring_string_names` for options. + - callable: a scorer callable object (e.g., function) with signature + ``scorer(estimator, X, y)``, which should return only a single value. + See :ref:`scoring_callable` for details. + - `None`: the `estimator`'s + :ref:`default evaluation criterion ` is used. + + Similar to the use of `scoring` in :func:`cross_validate` but only a + single metric is permitted. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - `None`, to use the default 5-fold cross validation, + - int, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable that generates (train, test) splits as arrays of indices. + + For `int`/`None` inputs, if the estimator is a classifier and `y` is + either binary or multiclass, :class:`StratifiedKFold` is used. In all + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + `cv` default value if `None` changed from 3-fold to 5-fold. + + n_jobs : int, default=None + Number of jobs to run in parallel. 
Training the estimator and computing + the score are parallelized over the cross-validation splits. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + verbose : int, default=0 + The verbosity level. + + params : dict, default=None + Parameters to pass to the underlying estimator's ``fit``, the scorer, + and the CV splitter. + + .. versionadded:: 1.4 + + pre_dispatch : int or str, default='2*n_jobs' + Controls the number of jobs that get dispatched during parallel + execution. Reducing this number can be useful to avoid an + explosion of memory consumption when more jobs get dispatched + than CPUs can process. This parameter can be: + + - ``None``, in which case all the jobs are immediately created and spawned. Use + this for lightweight and fast-running jobs, to avoid delays due to on-demand + spawning of the jobs + - An int, giving the exact number of total jobs that are spawned + - A str, giving an expression as a function of n_jobs, as in '2*n_jobs' + + error_score : 'raise' or numeric, default=np.nan + Value to assign to the score if an error occurs in estimator fitting. + If set to 'raise', the error is raised. + If a numeric value is given, FitFailedWarning is raised. + + .. versionadded:: 0.20 + + Returns + ------- + scores : ndarray of float of shape=(len(list(cv)),) + Array of scores of the estimator for each run of the cross validation. + + See Also + -------- + cross_validate : To run cross-validation on multiple metrics and also to + return train scores, fit times and score times. + + cross_val_predict : Get predictions from each split of cross-validation for + diagnostic purposes. + + sklearn.metrics.make_scorer : Make a scorer from a performance metric or + loss function. + + Examples + -------- + >>> from sklearn import datasets, linear_model + >>> from sklearn.model_selection import cross_val_score + >>> diabetes = datasets.load_diabetes() + >>> X = diabetes.data[:150] + >>> y = diabetes.target[:150] + >>> lasso = linear_model.Lasso() + >>> print(cross_val_score(lasso, X, y, cv=3)) + [0.3315057 0.08022103 0.03531816] + """ + # To ensure multimetric format is not supported + scorer = check_scoring(estimator, scoring=scoring) + + cv_results = cross_validate( + estimator=estimator, + X=X, + y=y, + groups=groups, + scoring={"score": scorer}, + cv=cv, + n_jobs=n_jobs, + verbose=verbose, + params=params, + pre_dispatch=pre_dispatch, + error_score=error_score, + ) + return cv_results["test_score"] + + +def _fit_and_score( + estimator, + X, + y, + *, + scorer, + train, + test, + verbose, + parameters, + fit_params, + score_params, + return_train_score=False, + return_parameters=False, + return_n_test_samples=False, + return_times=False, + return_estimator=False, + split_progress=None, + candidate_progress=None, + error_score=np.nan, +): + """Fit estimator and compute scores for a given dataset split. + + Parameters + ---------- + estimator : estimator object implementing 'fit' + The object to use to fit the data. + + X : array-like of shape (n_samples, n_features) + The data to fit. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None + The target variable to try to predict in the case of + supervised learning. + + scorer : A single callable or dict mapping scorer name to the callable + If it is a single callable, the return value for ``train_scores`` and + ``test_scores`` is a single float. 
+ + For a dict, it should be one mapping the scorer name to the scorer + callable object / function. + + The callable object / fn should have signature + ``scorer(estimator, X, y)``. + + train : array-like of shape (n_train_samples,) + Indices of training samples. + + test : array-like of shape (n_test_samples,) + Indices of test samples. + + verbose : int + The verbosity level. + + error_score : 'raise' or numeric, default=np.nan + Value to assign to the score if an error occurs in estimator fitting. + If set to 'raise', the error is raised. + If a numeric value is given, FitFailedWarning is raised. + + parameters : dict or None + Parameters to be set on the estimator. + + fit_params : dict or None + Parameters that will be passed to ``estimator.fit``. + + score_params : dict or None + Parameters that will be passed to the scorer. + + return_train_score : bool, default=False + Compute and return score on training set. + + return_parameters : bool, default=False + Return parameters that has been used for the estimator. + + split_progress : {list, tuple} of int, default=None + A list or tuple of format (, ). + + candidate_progress : {list, tuple} of int, default=None + A list or tuple of format + (, ). + + return_n_test_samples : bool, default=False + Whether to return the ``n_test_samples``. + + return_times : bool, default=False + Whether to return the fit/score times. + + return_estimator : bool, default=False + Whether to return the fitted estimator. + + Returns + ------- + result : dict with the following attributes + train_scores : dict of scorer name -> float + Score on training set (for all the scorers), + returned only if `return_train_score` is `True`. + test_scores : dict of scorer name -> float + Score on testing set (for all the scorers). + n_test_samples : int + Number of test samples. + fit_time : float + Time spent for fitting in seconds. + score_time : float + Time spent for scoring in seconds. + parameters : dict or None + The parameters that have been evaluated. + estimator : estimator object + The fitted estimator. + fit_error : str or None + Traceback str if the fit failed, None if the fit succeeded. + """ + xp, _ = get_namespace(X) + X_device = device(X) + + # Make sure that we can fancy index X even if train and test are provided + # as NumPy arrays by NumPy only cross-validation splitters. + train, test = xp.asarray(train, device=X_device), xp.asarray(test, device=X_device) + + if not isinstance(error_score, numbers.Number) and error_score != "raise": + raise ValueError( + "error_score must be the string 'raise' or a numeric value. 
" + "(Hint: if using 'raise', please make sure that it has been " + "spelled correctly.)" + ) + + progress_msg = "" + if verbose > 2: + if split_progress is not None: + progress_msg = f" {split_progress[0] + 1}/{split_progress[1]}" + if candidate_progress and verbose > 9: + progress_msg += f"; {candidate_progress[0] + 1}/{candidate_progress[1]}" + + if verbose > 1: + if parameters is None: + params_msg = "" + else: + sorted_keys = sorted(parameters) # Ensure deterministic o/p + params_msg = ", ".join(f"{k}={parameters[k]}" for k in sorted_keys) + if verbose > 9: + start_msg = f"[CV{progress_msg}] START {params_msg}" + print(f"{start_msg}{(80 - len(start_msg)) * '.'}") + + # Adjust length of sample weights + fit_params = fit_params if fit_params is not None else {} + fit_params = _check_method_params(X, params=fit_params, indices=train) + score_params = score_params if score_params is not None else {} + score_params_train = _check_method_params(X, params=score_params, indices=train) + score_params_test = _check_method_params(X, params=score_params, indices=test) + + if parameters is not None: + # here we clone the parameters, since sometimes the parameters + # themselves might be estimators, e.g. when we search over different + # estimators in a pipeline. + # ref: https://github.com/scikit-learn/scikit-learn/pull/26786 + estimator = estimator.set_params(**clone(parameters, safe=False)) + + start_time = time.time() + + X_train, y_train = _safe_split(estimator, X, y, train) + X_test, y_test = _safe_split(estimator, X, y, test, train) + + result = {} + try: + if y_train is None: + estimator.fit(X_train, **fit_params) + else: + estimator.fit(X_train, y_train, **fit_params) + + except Exception: + # Note fit time as time until error + fit_time = time.time() - start_time + score_time = 0.0 + if error_score == "raise": + raise + elif isinstance(error_score, numbers.Number): + if isinstance(scorer, _MultimetricScorer): + test_scores = {name: error_score for name in scorer._scorers} + if return_train_score: + train_scores = test_scores.copy() + else: + test_scores = error_score + if return_train_score: + train_scores = error_score + result["fit_error"] = format_exc() + else: + result["fit_error"] = None + + fit_time = time.time() - start_time + test_scores = _score( + estimator, X_test, y_test, scorer, score_params_test, error_score + ) + score_time = time.time() - start_time - fit_time + if return_train_score: + train_scores = _score( + estimator, X_train, y_train, scorer, score_params_train, error_score + ) + + if verbose > 1: + total_time = score_time + fit_time + end_msg = f"[CV{progress_msg}] END " + result_msg = params_msg + (";" if params_msg else "") + if verbose > 2: + if isinstance(test_scores, dict): + for scorer_name in sorted(test_scores): + result_msg += f" {scorer_name}: (" + if return_train_score: + scorer_scores = train_scores[scorer_name] + result_msg += f"train={scorer_scores:.3f}, " + result_msg += f"test={test_scores[scorer_name]:.3f})" + else: + result_msg += ", score=" + if return_train_score: + result_msg += f"(train={train_scores:.3f}, test={test_scores:.3f})" + else: + result_msg += f"{test_scores:.3f}" + result_msg += f" total time={logger.short_format_time(total_time)}" + + # Right align the result_msg + end_msg += "." 
* (80 - len(end_msg) - len(result_msg)) + end_msg += result_msg + print(end_msg) + + result["test_scores"] = test_scores + if return_train_score: + result["train_scores"] = train_scores + if return_n_test_samples: + result["n_test_samples"] = _num_samples(X_test) + if return_times: + result["fit_time"] = fit_time + result["score_time"] = score_time + if return_parameters: + result["parameters"] = parameters + if return_estimator: + result["estimator"] = estimator + return result + + +def _score(estimator, X_test, y_test, scorer, score_params, error_score="raise"): + """Compute the score(s) of an estimator on a given test set. + + Will return a dict of floats if `scorer` is a _MultiMetricScorer, otherwise a single + float is returned. + """ + score_params = {} if score_params is None else score_params + + try: + if y_test is None: + scores = scorer(estimator, X_test, **score_params) + else: + scores = scorer(estimator, X_test, y_test, **score_params) + except Exception: + if isinstance(scorer, _MultimetricScorer): + # If `_MultimetricScorer` raises exception, the `error_score` + # parameter is equal to "raise". + raise + else: + if error_score == "raise": + raise + else: + scores = error_score + warnings.warn( + ( + "Scoring failed. The score on this train-test partition for " + f"these parameters will be set to {error_score}. Details: \n" + f"{format_exc()}" + ), + UserWarning, + ) + + # Check non-raised error messages in `_MultimetricScorer` + if isinstance(scorer, _MultimetricScorer): + exception_messages = [ + (name, str_e) for name, str_e in scores.items() if isinstance(str_e, str) + ] + if exception_messages: + # error_score != "raise" + for name, str_e in exception_messages: + scores[name] = error_score + warnings.warn( + ( + "Scoring failed. The score on this train-test partition for " + f"these parameters will be set to {error_score}. Details: \n" + f"{str_e}" + ), + UserWarning, + ) + + error_msg = "scoring must return a number, got %s (%s) instead. (scorer=%s)" + if isinstance(scores, dict): + for name, score in scores.items(): + if hasattr(score, "item"): + with suppress(ValueError): + # e.g. unwrap memmapped scalars + score = score.item() + if not isinstance(score, numbers.Number): + raise ValueError(error_msg % (score, type(score), name)) + scores[name] = score + else: # scalar + if hasattr(scores, "item"): + with suppress(ValueError): + # e.g. unwrap memmapped scalars + scores = scores.item() + if not isinstance(scores, numbers.Number): + raise ValueError(error_msg % (scores, type(scores), scorer)) + return scores + + +@validate_params( + { + "estimator": [HasMethods(["fit", "predict"])], + "X": ["array-like", "sparse matrix"], + "y": ["array-like", "sparse matrix", None], + "groups": ["array-like", None], + "cv": ["cv_object"], + "n_jobs": [Integral, None], + "verbose": ["verbose"], + "params": [dict, None], + "pre_dispatch": [Integral, str, None], + "method": [ + StrOptions( + { + "predict", + "predict_proba", + "predict_log_proba", + "decision_function", + } + ) + ], + }, + prefer_skip_nested_validation=False, # estimator is not validated yet +) +def cross_val_predict( + estimator, + X, + y=None, + *, + groups=None, + cv=None, + n_jobs=None, + verbose=0, + params=None, + pre_dispatch="2*n_jobs", + method="predict", +): + """Generate cross-validated estimates for each input data point. + + The data is split according to the cv parameter. 
Each sample belongs + to exactly one test set, and its prediction is computed with an + estimator fitted on the corresponding training set. + + Passing these predictions into an evaluation metric may not be a valid + way to measure generalization performance. Results can differ from + :func:`cross_validate` and :func:`cross_val_score` unless all tests sets + have equal size and the metric decomposes over samples. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimator : estimator + The estimator instance to use to fit the data. It must implement a `fit` + method and the method given by the `method` parameter. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to fit. Can be, for example a list, or an array at least 2d. + + y : {array-like, sparse matrix} of shape (n_samples,) or (n_samples, n_outputs), \ + default=None + The target variable to try to predict in the case of + supervised learning. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. Only used in conjunction with a "Group" :term:`cv` + instance (e.g., :class:`GroupKFold`). + + .. versionchanged:: 1.4 + ``groups`` can only be passed if metadata routing is not enabled + via ``sklearn.set_config(enable_metadata_routing=True)``. When routing + is enabled, pass ``groups`` alongside other metadata via the ``params`` + argument instead. E.g.: + ``cross_val_predict(..., params={'groups': groups})``. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross validation, + - int, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable that generates (train, test) splits as arrays of indices. + + For int/None inputs, if the estimator is a classifier and ``y`` is + either binary or multiclass, :class:`StratifiedKFold` is used. In all + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + ``cv`` default value if None changed from 3-fold to 5-fold. + + n_jobs : int, default=None + Number of jobs to run in parallel. Training the estimator and + predicting are parallelized over the cross-validation splits. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + verbose : int, default=0 + The verbosity level. + + params : dict, default=None + Parameters to pass to the underlying estimator's ``fit`` and the CV + splitter. + + .. versionadded:: 1.4 + + pre_dispatch : int or str, default='2*n_jobs' + Controls the number of jobs that get dispatched during parallel + execution. Reducing this number can be useful to avoid an + explosion of memory consumption when more jobs get dispatched + than CPUs can process. This parameter can be: + + - None, in which case all the jobs are immediately created and spawned. 
Use + this for lightweight and fast-running jobs, to avoid delays due to on-demand + spawning of the jobs + - An int, giving the exact number of total jobs that are spawned + - A str, giving an expression as a function of n_jobs, as in '2*n_jobs' + + method : {'predict', 'predict_proba', 'predict_log_proba', \ + 'decision_function'}, default='predict' + The method to be invoked by `estimator`. + + Returns + ------- + predictions : ndarray + This is the result of calling `method`. Shape: + + - When `method` is 'predict' and in special case where `method` is + 'decision_function' and the target is binary: (n_samples,) + - When `method` is one of {'predict_proba', 'predict_log_proba', + 'decision_function'} (unless special case above): + (n_samples, n_classes) + - If `estimator` is :term:`multioutput`, an extra dimension + 'n_outputs' is added to the end of each shape above. + + See Also + -------- + cross_val_score : Calculate score for each CV split. + cross_validate : Calculate one or more scores and timings for each CV + split. + + Notes + ----- + In the case that one or more classes are absent in a training portion, a + default score needs to be assigned to all instances for that class if + ``method`` produces columns per class, as in {'decision_function', + 'predict_proba', 'predict_log_proba'}. For ``predict_proba`` this value is + 0. In order to ensure finite output, we approximate negative infinity by + the minimum finite float value for the dtype in other cases. + + Examples + -------- + >>> from sklearn import datasets, linear_model + >>> from sklearn.model_selection import cross_val_predict + >>> diabetes = datasets.load_diabetes() + >>> X = diabetes.data[:150] + >>> y = diabetes.target[:150] + >>> lasso = linear_model.Lasso() + >>> y_pred = cross_val_predict(lasso, X, y, cv=3) + + For a detailed example of using ``cross_val_predict`` to visualize + prediction errors, please see + :ref:`sphx_glr_auto_examples_model_selection_plot_cv_predict.py`. + """ + _check_groups_routing_disabled(groups) + X, y = indexable(X, y) + params = {} if params is None else params + + if _routing_enabled(): + # For estimators, a MetadataRouter is created in get_metadata_routing + # methods. For these router methods, we create the router to use + # `process_routing` on it. + router = ( + MetadataRouter(owner="cross_val_predict") + .add( + splitter=cv, + method_mapping=MethodMapping().add(caller="fit", callee="split"), + ) + .add( + estimator=estimator, + # TODO(SLEP6): also pass metadata for the predict method. + method_mapping=MethodMapping().add(caller="fit", callee="fit"), + ) + ) + try: + routed_params = process_routing(router, "fit", **params) + except UnsetMetadataPassedError as e: + # The default exception would mention `fit` since in the above + # `process_routing` code, we pass `fit` as the caller. However, + # the user is not calling `fit` directly, so we change the message + # to make it more suitable for this case. 
+ raise UnsetMetadataPassedError( + message=str(e).replace("cross_val_predict.fit", "cross_val_predict"), + unrequested_params=e.unrequested_params, + routed_params=e.routed_params, + ) + else: + routed_params = Bunch() + routed_params.splitter = Bunch(split={"groups": groups}) + routed_params.estimator = Bunch(fit=params) + + cv = check_cv(cv, y, classifier=is_classifier(estimator)) + splits = list(cv.split(X, y, **routed_params.splitter.split)) + + test_indices = np.concatenate([test for _, test in splits]) + if not _check_is_permutation(test_indices, _num_samples(X)): + raise ValueError("cross_val_predict only works for partitions") + + # If classification methods produce multiple columns of output, + # we need to manually encode classes to ensure consistent column ordering. + encode = ( + method in ["decision_function", "predict_proba", "predict_log_proba"] + and y is not None + ) + if encode: + y = np.asarray(y) + if y.ndim == 1: + le = LabelEncoder() + y = le.fit_transform(y) + elif y.ndim == 2: + y_enc = np.zeros_like(y, dtype=int) + for i_label in range(y.shape[1]): + y_enc[:, i_label] = LabelEncoder().fit_transform(y[:, i_label]) + y = y_enc + + # We clone the estimator to make sure that all the folds are + # independent, and that it is pickle-able. + parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch) + predictions = parallel( + delayed(_fit_and_predict)( + clone(estimator), + X, + y, + train, + test, + routed_params.estimator.fit, + method, + ) + for train, test in splits + ) + + inv_test_indices = np.empty(len(test_indices), dtype=int) + inv_test_indices[test_indices] = np.arange(len(test_indices)) + + if sp.issparse(predictions[0]): + predictions = sp.vstack(predictions, format=predictions[0].format) + elif encode and isinstance(predictions[0], list): + # `predictions` is a list of method outputs from each fold. + # If each of those is also a list, then treat this as a + # multioutput-multiclass task. We need to separately concatenate + # the method outputs for each label into an `n_labels` long list. + n_labels = y.shape[1] + concat_pred = [] + for i_label in range(n_labels): + label_preds = np.concatenate([p[i_label] for p in predictions]) + concat_pred.append(label_preds) + predictions = concat_pred + else: + predictions = np.concatenate(predictions) + + if isinstance(predictions, list): + return [p[inv_test_indices] for p in predictions] + else: + return predictions[inv_test_indices] + + +def _fit_and_predict(estimator, X, y, train, test, fit_params, method): + """Fit estimator and predict values for a given dataset split. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimator : estimator object implementing 'fit' and 'predict' + The object to use to fit the data. + + X : array-like of shape (n_samples, n_features) + The data to fit. + + .. versionchanged:: 0.20 + X is only required to be an object with finite length or shape now + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None + The target variable to try to predict in the case of + supervised learning. + + train : array-like of shape (n_train_samples,) + Indices of training samples. + + test : array-like of shape (n_test_samples,) + Indices of test samples. + + fit_params : dict or None + Parameters that will be passed to ``estimator.fit``. + + method : str + Invokes the passed method name of the passed estimator. 
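+        For instance, ``method='predict_proba'`` makes this helper fit the
+        estimator on the training indices and then call
+        ``estimator.predict_proba`` on the test portion of ``X``.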
+ + Returns + ------- + predictions : sequence + Result of calling 'estimator.method' + """ + # Adjust length of sample weights + fit_params = fit_params if fit_params is not None else {} + fit_params = _check_method_params(X, params=fit_params, indices=train) + + X_train, y_train = _safe_split(estimator, X, y, train) + X_test, _ = _safe_split(estimator, X, y, test, train) + + if y_train is None: + estimator.fit(X_train, **fit_params) + else: + estimator.fit(X_train, y_train, **fit_params) + func = getattr(estimator, method) + predictions = func(X_test) + + encode = ( + method in ["decision_function", "predict_proba", "predict_log_proba"] + and y is not None + ) + + if encode: + if isinstance(predictions, list): + predictions = [ + _enforce_prediction_order( + estimator.classes_[i_label], + predictions[i_label], + n_classes=len(set(y[:, i_label])), + method=method, + ) + for i_label in range(len(predictions)) + ] + else: + # A 2D y array should be a binary label indicator matrix + n_classes = len(set(y)) if y.ndim == 1 else y.shape[1] + predictions = _enforce_prediction_order( + estimator.classes_, predictions, n_classes, method + ) + return predictions + + +def _enforce_prediction_order(classes, predictions, n_classes, method): + """Ensure that prediction arrays have correct column order + + When doing cross-validation, if one or more classes are + not present in the subset of data used for training, + then the output prediction array might not have the same + columns as other folds. Use the list of class names + (assumed to be ints) to enforce the correct column order. + + Note that `classes` is the list of classes in this fold + (a subset of the classes in the full training set) + and `n_classes` is the number of classes in the full training set. + """ + if n_classes != len(classes): + recommendation = ( + "To fix this, use a cross-validation " + "technique resulting in properly " + "stratified folds" + ) + warnings.warn( + "Number of classes in training fold ({}) does " + "not match total number of classes ({}). " + "Results may not be appropriate for your use case. " + "{}".format(len(classes), n_classes, recommendation), + RuntimeWarning, + ) + if method == "decision_function": + if predictions.ndim == 2 and predictions.shape[1] != len(classes): + # This handles the case when the shape of predictions + # does not match the number of classes used to train + # it with. This case is found when sklearn.svm.SVC is + # set to `decision_function_shape='ovo'`. + raise ValueError( + "Output shape {} of {} does not match " + "number of classes ({}) in fold. " + "Irregular decision_function outputs " + "are not currently supported by " + "cross_val_predict".format(predictions.shape, method, len(classes)) + ) + if len(classes) <= 2: + # In this special case, `predictions` contains a 1D array. + raise ValueError( + "Only {} class/es in training fold, but {} " + "in overall dataset. This " + "is not supported for decision_function " + "with imbalanced folds. 
{}".format( + len(classes), n_classes, recommendation + ) + ) + + float_min = np.finfo(predictions.dtype).min + default_values = { + "decision_function": float_min, + "predict_log_proba": float_min, + "predict_proba": 0, + } + predictions_for_all_classes = np.full( + (_num_samples(predictions), n_classes), + default_values[method], + dtype=predictions.dtype, + ) + predictions_for_all_classes[:, classes] = predictions + predictions = predictions_for_all_classes + return predictions + + +def _check_is_permutation(indices, n_samples): + """Check whether indices is a reordering of the array np.arange(n_samples) + + Parameters + ---------- + indices : ndarray + int array to test + n_samples : int + number of expected elements + + Returns + ------- + is_partition : bool + True iff sorted(indices) is np.arange(n) + """ + if len(indices) != n_samples: + return False + hit = np.zeros(n_samples, dtype=bool) + hit[indices] = True + if not np.all(hit): + return False + return True + + +@validate_params( + { + "estimator": [HasMethods("fit")], + "X": ["array-like", "sparse matrix"], + "y": ["array-like", None], + "groups": ["array-like", None], + "cv": ["cv_object"], + "n_permutations": [Interval(Integral, 1, None, closed="left")], + "n_jobs": [Integral, None], + "random_state": ["random_state"], + "verbose": ["verbose"], + "scoring": [StrOptions(set(get_scorer_names())), callable, None], + "fit_params": [dict, None], + "params": [dict, None], + }, + prefer_skip_nested_validation=False, # estimator is not validated yet +) +def permutation_test_score( + estimator, + X, + y, + *, + groups=None, + cv=None, + n_permutations=100, + n_jobs=None, + random_state=0, + verbose=0, + scoring=None, + fit_params=None, + params=None, +): + """Evaluate the significance of a cross-validated score with permutations. + + Permutes targets to generate 'randomized data' and compute the empirical + p-value against the null hypothesis that features and targets are + independent. + + The p-value represents the fraction of randomized data sets where the + estimator performed as well or better than on the original data. A small + p-value suggests that there is a real dependency between features and + targets which has been used by the estimator to give good predictions. + A large p-value may be due to lack of real dependency between features + and targets or the estimator was not able to use the dependency to + give good predictions. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimator : estimator object implementing 'fit' + The object to use to fit the data. + + X : array-like of shape at least 2D + The data to fit. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None + The target variable to try to predict in the case of + supervised learning. + + groups : array-like of shape (n_samples,), default=None + Labels to constrain permutation within groups, i.e. ``y`` values + are permuted among samples with the same group identifier. + When not specified, ``y`` values are permuted among all samples. + + When a grouped cross-validator is used, the group labels are + also passed on to the ``split`` method of the cross-validator. The + cross-validator uses them for grouping the samples while splitting + the dataset into train/test set. + + .. versionchanged:: 1.6 + ``groups`` can only be passed if metadata routing is not enabled + via ``sklearn.set_config(enable_metadata_routing=True)``. 
When routing + is enabled, pass ``groups`` alongside other metadata via the ``params`` + argument instead. E.g.: + ``permutation_test_score(..., params={'groups': groups})``. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - `None`, to use the default 5-fold cross validation, + - int, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For `int`/`None` inputs, if the estimator is a classifier and `y` is + either binary or multiclass, :class:`StratifiedKFold` is used. In all + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + `cv` default value if `None` changed from 3-fold to 5-fold. + + n_permutations : int, default=100 + Number of times to permute ``y``. + + n_jobs : int, default=None + Number of jobs to run in parallel. Training the estimator and computing + the cross-validated score are parallelized over the permutations. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + random_state : int, RandomState instance or None, default=0 + Pass an int for reproducible output for permutation of + ``y`` values among samples. See :term:`Glossary `. + + verbose : int, default=0 + The verbosity level. + + scoring : str or callable, default=None + Scoring method to use to evaluate the predictions on the validation set. + + - str: see :ref:`scoring_string_names` for options. + - callable: a scorer callable object (e.g., function) with signature + ``scorer(estimator, X, y)``, which should return only a single value. + See :ref:`scoring_callable` for details. + - `None`: the `estimator`'s + :ref:`default evaluation criterion ` is used. + + fit_params : dict, default=None + Parameters to pass to the fit method of the estimator. + + .. deprecated:: 1.6 + This parameter is deprecated and will be removed in version 1.6. Use + ``params`` instead. + + params : dict, default=None + Parameters to pass to the `fit` method of the estimator, the scorer + and the cv splitter. + + - If `enable_metadata_routing=False` (default): Parameters directly passed to + the `fit` method of the estimator. + + - If `enable_metadata_routing=True`: Parameters safely routed to the `fit` + method of the estimator, `cv` object and `scorer`. See :ref:`Metadata Routing + User Guide ` for more details. + + .. versionadded:: 1.6 + + Returns + ------- + score : float + The true score without permuting targets. + + permutation_scores : array of shape (n_permutations,) + The scores obtained for each permutations. + + pvalue : float + The p-value, which approximates the probability that the score would + be obtained by chance. This is calculated as: + + `(C + 1) / (n_permutations + 1)` + + Where C is the number of permutations whose score >= the true score. + + The best possible p-value is 1/(n_permutations + 1), the worst is 1.0. + + Notes + ----- + This function implements Test 1 in: + + Ojala and Garriga. `Permutation Tests for Studying Classifier Performance + `_. The + Journal of Machine Learning Research (2010) vol. 
11 + + Examples + -------- + >>> from sklearn.datasets import make_classification + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.model_selection import permutation_test_score + >>> X, y = make_classification(random_state=0) + >>> estimator = LogisticRegression() + >>> score, permutation_scores, pvalue = permutation_test_score( + ... estimator, X, y, random_state=0 + ... ) + >>> print(f"Original Score: {score:.3f}") + Original Score: 0.810 + >>> print( + ... f"Permutation Scores: {permutation_scores.mean():.3f} +/- " + ... f"{permutation_scores.std():.3f}" + ... ) + Permutation Scores: 0.505 +/- 0.057 + >>> print(f"P-value: {pvalue:.3f}") + P-value: 0.010 + """ + params = _check_params_groups_deprecation(fit_params, params, groups, "1.8") + + X, y, groups = indexable(X, y, groups) + + cv = check_cv(cv, y, classifier=is_classifier(estimator)) + scorer = check_scoring(estimator, scoring=scoring) + random_state = check_random_state(random_state) + + if _routing_enabled(): + router = ( + MetadataRouter(owner="permutation_test_score") + .add( + estimator=estimator, + # TODO(SLEP6): also pass metadata to the predict method for + # scoring? + method_mapping=MethodMapping().add(caller="fit", callee="fit"), + ) + .add( + splitter=cv, + method_mapping=MethodMapping().add(caller="fit", callee="split"), + ) + .add( + scorer=scorer, + method_mapping=MethodMapping().add(caller="fit", callee="score"), + ) + ) + + try: + routed_params = process_routing(router, "fit", **params) + except UnsetMetadataPassedError as e: + # The default exception would mention `fit` since in the above + # `process_routing` code, we pass `fit` as the caller. However, + # the user is not calling `fit` directly, so we change the message + # to make it more suitable for this case. + raise UnsetMetadataPassedError( + message=str(e).replace( + "permutation_test_score.fit", "permutation_test_score" + ), + unrequested_params=e.unrequested_params, + routed_params=e.routed_params, + ) + + else: + routed_params = Bunch() + routed_params.estimator = Bunch(fit=params) + routed_params.splitter = Bunch(split={"groups": groups}) + routed_params.scorer = Bunch(score={}) + + # We clone the estimator to make sure that all the folds are + # independent, and that it is pickle-able. 
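+    # Sketch of the computation below: the cross-validated score on the
+    # original `y` is compared with the scores obtained on `n_permutations`
+    # shuffled copies of `y`, and the p-value is
+    #
+    #     (np.sum(permutation_scores >= score) + 1) / (n_permutations + 1)
+    #
+    # i.e. the formula documented in the docstring above.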
+ score = _permutation_test_score( + clone(estimator), + X, + y, + cv, + scorer, + split_params=routed_params.splitter.split, + fit_params=routed_params.estimator.fit, + score_params=routed_params.scorer.score, + ) + permutation_scores = Parallel(n_jobs=n_jobs, verbose=verbose)( + delayed(_permutation_test_score)( + clone(estimator), + X, + _shuffle(y, groups, random_state), + cv, + scorer, + split_params=routed_params.splitter.split, + fit_params=routed_params.estimator.fit, + score_params=routed_params.scorer.score, + ) + for _ in range(n_permutations) + ) + permutation_scores = np.array(permutation_scores) + pvalue = (np.sum(permutation_scores >= score) + 1.0) / (n_permutations + 1) + return score, permutation_scores, pvalue + + +def _permutation_test_score( + estimator, X, y, cv, scorer, split_params, fit_params, score_params +): + """Auxiliary function for permutation_test_score""" + # Adjust length of sample weights + fit_params = fit_params if fit_params is not None else {} + score_params = score_params if score_params is not None else {} + + avg_score = [] + for train, test in cv.split(X, y, **split_params): + X_train, y_train = _safe_split(estimator, X, y, train) + X_test, y_test = _safe_split(estimator, X, y, test, train) + fit_params_train = _check_method_params(X, params=fit_params, indices=train) + score_params_test = _check_method_params(X, params=score_params, indices=test) + estimator.fit(X_train, y_train, **fit_params_train) + avg_score.append(scorer(estimator, X_test, y_test, **score_params_test)) + return np.mean(avg_score) + + +def _shuffle(y, groups, random_state): + """Return a shuffled copy of y eventually shuffle among same groups.""" + if groups is None: + indices = random_state.permutation(len(y)) + else: + indices = np.arange(len(groups)) + for group in np.unique(groups): + this_mask = groups == group + indices[this_mask] = random_state.permutation(indices[this_mask]) + return _safe_indexing(y, indices) + + +@validate_params( + { + "estimator": [HasMethods(["fit"])], + "X": ["array-like", "sparse matrix"], + "y": ["array-like", None], + "groups": ["array-like", None], + "train_sizes": ["array-like"], + "cv": ["cv_object"], + "scoring": [StrOptions(set(get_scorer_names())), callable, None], + "exploit_incremental_learning": ["boolean"], + "n_jobs": [Integral, None], + "pre_dispatch": [Integral, str], + "verbose": ["verbose"], + "shuffle": ["boolean"], + "random_state": ["random_state"], + "error_score": [StrOptions({"raise"}), Real], + "return_times": ["boolean"], + "fit_params": [dict, None], + "params": [dict, None], + }, + prefer_skip_nested_validation=False, # estimator is not validated yet +) +def learning_curve( + estimator, + X, + y, + *, + groups=None, + train_sizes=np.linspace(0.1, 1.0, 5), + cv=None, + scoring=None, + exploit_incremental_learning=False, + n_jobs=None, + pre_dispatch="all", + verbose=0, + shuffle=False, + random_state=None, + error_score=np.nan, + return_times=False, + fit_params=None, + params=None, +): + """Learning curve. + + Determines cross-validated training and test scores for different training + set sizes. + + A cross-validation generator splits the whole dataset k times in training + and test data. Subsets of the training set with varying sizes will be used + to train the estimator and a score for each training subset size and the + test set will be computed. Afterwards, the scores will be averaged over + all k runs for each training subset size. + + Read more in the :ref:`User Guide `. 
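+
+    For instance, with the default ``train_sizes=np.linspace(0.1, 1.0, 5)``
+    and 5-fold cross-validation on 100 samples, each training fold holds 80
+    samples, so estimators are fitted on 8, 26, 44, 62 and 80 samples within
+    each fold before being scored on the corresponding test fold.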
+ + Parameters + ---------- + estimator : object type that implements the "fit" method + An object of that type which is cloned for each validation. It must + also implement "predict" unless `scoring` is a callable that doesn't + rely on "predict" to compute a score. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None + Target relative to X for classification or regression; + None for unsupervised learning. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. Only used in conjunction with a "Group" :term:`cv` + instance (e.g., :class:`GroupKFold`). + + .. versionchanged:: 1.6 + ``groups`` can only be passed if metadata routing is not enabled + via ``sklearn.set_config(enable_metadata_routing=True)``. When routing + is enabled, pass ``groups`` alongside other metadata via the ``params`` + argument instead. E.g.: + ``learning_curve(..., params={'groups': groups})``. + + train_sizes : array-like of shape (n_ticks,), \ + default=np.linspace(0.1, 1.0, 5) + Relative or absolute numbers of training examples that will be used to + generate the learning curve. If the dtype is float, it is regarded as a + fraction of the maximum size of the training set (that is determined + by the selected validation method), i.e. it has to be within (0, 1]. + Otherwise it is interpreted as absolute sizes of the training sets. + Note that for classification the number of samples usually has to + be big enough to contain at least one sample from each class. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross validation, + - int, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For int/None inputs, if the estimator is a classifier and ``y`` is + either binary or multiclass, :class:`StratifiedKFold` is used. In all + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + ``cv`` default value if None changed from 3-fold to 5-fold. + + scoring : str or callable, default=None + Scoring method to use to evaluate the training and test sets. + + - str: see :ref:`scoring_string_names` for options. + - callable: a scorer callable object (e.g., function) with signature + ``scorer(estimator, X, y)``. See :ref:`scoring_callable` for details. + - `None`: the `estimator`'s + :ref:`default evaluation criterion ` is used. + + exploit_incremental_learning : bool, default=False + If the estimator supports incremental learning, this will be + used to speed up fitting for different training set sizes. + + n_jobs : int, default=None + Number of jobs to run in parallel. Training the estimator and computing + the score are parallelized over the different training and test sets. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. 
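+        For example, 5-fold cross-validation combined with five training-set
+        sizes gives 25 fit/score tasks for the default (non-incremental) path
+        to distribute across workers.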
+ + pre_dispatch : int or str, default='all' + Number of predispatched jobs for parallel execution (default is + all). The option can reduce the allocated memory. The str can + be an expression like '2*n_jobs'. + + verbose : int, default=0 + Controls the verbosity: the higher, the more messages. + + shuffle : bool, default=False + Whether to shuffle training data before taking prefixes of it + based on``train_sizes``. + + random_state : int, RandomState instance or None, default=None + Used when ``shuffle`` is True. Pass an int for reproducible + output across multiple function calls. + See :term:`Glossary `. + + error_score : 'raise' or numeric, default=np.nan + Value to assign to the score if an error occurs in estimator fitting. + If set to 'raise', the error is raised. + If a numeric value is given, FitFailedWarning is raised. + + .. versionadded:: 0.20 + + return_times : bool, default=False + Whether to return the fit and score times. + + fit_params : dict, default=None + Parameters to pass to the fit method of the estimator. + + .. deprecated:: 1.6 + This parameter is deprecated and will be removed in version 1.8. Use + ``params`` instead. + + params : dict, default=None + Parameters to pass to the `fit` method of the estimator and to the scorer. + + - If `enable_metadata_routing=False` (default): Parameters directly passed to + the `fit` method of the estimator. + + - If `enable_metadata_routing=True`: Parameters safely routed to the `fit` + method of the estimator. See :ref:`Metadata Routing User Guide + ` for more details. + + .. versionadded:: 1.6 + + Returns + ------- + train_sizes_abs : array of shape (n_unique_ticks,) + Numbers of training examples that has been used to generate the + learning curve. Note that the number of ticks might be less + than n_ticks because duplicate entries will be removed. + + train_scores : array of shape (n_ticks, n_cv_folds) + Scores on training sets. + + test_scores : array of shape (n_ticks, n_cv_folds) + Scores on test set. + + fit_times : array of shape (n_ticks, n_cv_folds) + Times spent for fitting in seconds. Only present if ``return_times`` + is True. + + score_times : array of shape (n_ticks, n_cv_folds) + Times spent for scoring in seconds. Only present if ``return_times`` + is True. + + See Also + -------- + LearningCurveDisplay.from_estimator : Plot a learning curve using an + estimator and data. + + Examples + -------- + >>> from sklearn.datasets import make_classification + >>> from sklearn.tree import DecisionTreeClassifier + >>> from sklearn.model_selection import learning_curve + >>> X, y = make_classification(n_samples=100, n_features=10, random_state=42) + >>> tree = DecisionTreeClassifier(max_depth=4, random_state=42) + >>> train_size_abs, train_scores, test_scores = learning_curve( + ... tree, X, y, train_sizes=[0.3, 0.6, 0.9] + ... ) + >>> for train_size, cv_train_scores, cv_test_scores in zip( + ... train_size_abs, train_scores, test_scores + ... ): + ... print(f"{train_size} samples were used to train the model") + ... print(f"The average train accuracy is {cv_train_scores.mean():.2f}") + ... 
print(f"The average test accuracy is {cv_test_scores.mean():.2f}") + 24 samples were used to train the model + The average train accuracy is 1.00 + The average test accuracy is 0.85 + 48 samples were used to train the model + The average train accuracy is 1.00 + The average test accuracy is 0.90 + 72 samples were used to train the model + The average train accuracy is 1.00 + The average test accuracy is 0.93 + """ + if exploit_incremental_learning and not hasattr(estimator, "partial_fit"): + raise ValueError( + "An estimator must support the partial_fit interface " + "to exploit incremental learning" + ) + + params = _check_params_groups_deprecation(fit_params, params, groups, "1.8") + + X, y, groups = indexable(X, y, groups) + + cv = check_cv(cv, y, classifier=is_classifier(estimator)) + + scorer = check_scoring(estimator, scoring=scoring) + + if _routing_enabled(): + router = ( + MetadataRouter(owner="learning_curve") + .add( + estimator=estimator, + # TODO(SLEP6): also pass metadata to the predict method for + # scoring? + method_mapping=MethodMapping() + .add(caller="fit", callee="fit") + .add(caller="fit", callee="partial_fit"), + ) + .add( + splitter=cv, + method_mapping=MethodMapping().add(caller="fit", callee="split"), + ) + .add( + scorer=scorer, + method_mapping=MethodMapping().add(caller="fit", callee="score"), + ) + ) + + try: + routed_params = process_routing(router, "fit", **params) + except UnsetMetadataPassedError as e: + # The default exception would mention `fit` since in the above + # `process_routing` code, we pass `fit` as the caller. However, + # the user is not calling `fit` directly, so we change the message + # to make it more suitable for this case. + raise UnsetMetadataPassedError( + message=str(e).replace("learning_curve.fit", "learning_curve"), + unrequested_params=e.unrequested_params, + routed_params=e.routed_params, + ) + + else: + routed_params = Bunch() + routed_params.estimator = Bunch(fit=params, partial_fit=params) + routed_params.splitter = Bunch(split={"groups": groups}) + routed_params.scorer = Bunch(score={}) + + # Store cv as list as we will be iterating over the list multiple times + cv_iter = list(cv.split(X, y, **routed_params.splitter.split)) + + n_max_training_samples = len(cv_iter[0][0]) + # Because the lengths of folds can be significantly different, it is + # not guaranteed that we use all of the available training data when we + # use the first 'n_max_training_samples' samples. 
+ train_sizes_abs = _translate_train_sizes(train_sizes, n_max_training_samples) + n_unique_ticks = train_sizes_abs.shape[0] + if verbose > 0: + print("[learning_curve] Training set sizes: " + str(train_sizes_abs)) + + parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch, verbose=verbose) + + if shuffle: + rng = check_random_state(random_state) + cv_iter = ((rng.permutation(train), test) for train, test in cv_iter) + + if exploit_incremental_learning: + classes = np.unique(y) if is_classifier(estimator) else None + out = parallel( + delayed(_incremental_fit_estimator)( + clone(estimator), + X, + y, + classes, + train, + test, + train_sizes_abs, + scorer, + return_times, + error_score=error_score, + fit_params=routed_params.estimator.partial_fit, + score_params=routed_params.scorer.score, + ) + for train, test in cv_iter + ) + out = np.asarray(out).transpose((2, 1, 0)) + else: + train_test_proportions = [] + for train, test in cv_iter: + for n_train_samples in train_sizes_abs: + train_test_proportions.append((train[:n_train_samples], test)) + + results = parallel( + delayed(_fit_and_score)( + clone(estimator), + X, + y, + scorer=scorer, + train=train, + test=test, + verbose=verbose, + parameters=None, + fit_params=routed_params.estimator.fit, + score_params=routed_params.scorer.score, + return_train_score=True, + error_score=error_score, + return_times=return_times, + ) + for train, test in train_test_proportions + ) + _warn_or_raise_about_fit_failures(results, error_score) + results = _aggregate_score_dicts(results) + train_scores = results["train_scores"].reshape(-1, n_unique_ticks).T + test_scores = results["test_scores"].reshape(-1, n_unique_ticks).T + out = [train_scores, test_scores] + + if return_times: + fit_times = results["fit_time"].reshape(-1, n_unique_ticks).T + score_times = results["score_time"].reshape(-1, n_unique_ticks).T + out.extend([fit_times, score_times]) + + ret = train_sizes_abs, out[0], out[1] + + if return_times: + ret = ret + (out[2], out[3]) + + return ret + + +def _translate_train_sizes(train_sizes, n_max_training_samples): + """Determine absolute sizes of training subsets and validate 'train_sizes'. + + Examples: + _translate_train_sizes([0.5, 1.0], 10) -> [5, 10] + _translate_train_sizes([5, 10], 10) -> [5, 10] + + Parameters + ---------- + train_sizes : array-like of shape (n_ticks,) + Numbers of training examples that will be used to generate the + learning curve. If the dtype is float, it is regarded as a + fraction of 'n_max_training_samples', i.e. it has to be within (0, 1]. + + n_max_training_samples : int + Maximum number of training samples (upper bound of 'train_sizes'). + + Returns + ------- + train_sizes_abs : array of shape (n_unique_ticks,) + Numbers of training examples that will be used to generate the + learning curve. Note that the number of ticks might be less + than n_ticks because duplicate entries will be removed. + """ + train_sizes_abs = np.asarray(train_sizes) + n_ticks = train_sizes_abs.shape[0] + n_min_required_samples = np.min(train_sizes_abs) + n_max_required_samples = np.max(train_sizes_abs) + if np.issubdtype(train_sizes_abs.dtype, np.floating): + if n_min_required_samples <= 0.0 or n_max_required_samples > 1.0: + raise ValueError( + "train_sizes has been interpreted as fractions " + "of the maximum number of training samples and " + "must be within (0, 1], but is within [%f, %f]." 
+ % (n_min_required_samples, n_max_required_samples) + ) + train_sizes_abs = (train_sizes_abs * n_max_training_samples).astype( + dtype=int, copy=False + ) + train_sizes_abs = np.clip(train_sizes_abs, 1, n_max_training_samples) + else: + if ( + n_min_required_samples <= 0 + or n_max_required_samples > n_max_training_samples + ): + raise ValueError( + "train_sizes has been interpreted as absolute " + "numbers of training samples and must be within " + "(0, %d], but is within [%d, %d]." + % ( + n_max_training_samples, + n_min_required_samples, + n_max_required_samples, + ) + ) + + train_sizes_abs = np.unique(train_sizes_abs) + if n_ticks > train_sizes_abs.shape[0]: + warnings.warn( + "Removed duplicate entries from 'train_sizes'. Number " + "of ticks will be less than the size of " + "'train_sizes': %d instead of %d." % (train_sizes_abs.shape[0], n_ticks), + RuntimeWarning, + ) + + return train_sizes_abs + + +def _incremental_fit_estimator( + estimator, + X, + y, + classes, + train, + test, + train_sizes, + scorer, + return_times, + error_score, + fit_params, + score_params, +): + """Train estimator on training subsets incrementally and compute scores.""" + train_scores, test_scores, fit_times, score_times = [], [], [], [] + partitions = zip(train_sizes, np.split(train, train_sizes)[:-1]) + if fit_params is None: + fit_params = {} + if classes is None: + partial_fit_func = partial(estimator.partial_fit, **fit_params) + else: + partial_fit_func = partial(estimator.partial_fit, classes=classes, **fit_params) + score_params = score_params if score_params is not None else {} + score_params_train = _check_method_params(X, params=score_params, indices=train) + score_params_test = _check_method_params(X, params=score_params, indices=test) + + for n_train_samples, partial_train in partitions: + train_subset = train[:n_train_samples] + X_train, y_train = _safe_split(estimator, X, y, train_subset) + X_partial_train, y_partial_train = _safe_split(estimator, X, y, partial_train) + X_test, y_test = _safe_split(estimator, X, y, test, train_subset) + start_fit = time.time() + if y_partial_train is None: + partial_fit_func(X_partial_train) + else: + partial_fit_func(X_partial_train, y_partial_train) + fit_time = time.time() - start_fit + fit_times.append(fit_time) + + start_score = time.time() + + test_scores.append( + _score( + estimator, + X_test, + y_test, + scorer, + score_params=score_params_test, + error_score=error_score, + ) + ) + train_scores.append( + _score( + estimator, + X_train, + y_train, + scorer, + score_params=score_params_train, + error_score=error_score, + ) + ) + score_time = time.time() - start_score + score_times.append(score_time) + + ret = ( + (train_scores, test_scores, fit_times, score_times) + if return_times + else (train_scores, test_scores) + ) + + return np.array(ret).T + + +@validate_params( + { + "estimator": [HasMethods(["fit"])], + "X": ["array-like", "sparse matrix"], + "y": ["array-like", None], + "param_name": [str], + "param_range": ["array-like"], + "groups": ["array-like", None], + "cv": ["cv_object"], + "scoring": [StrOptions(set(get_scorer_names())), callable, None], + "n_jobs": [Integral, None], + "pre_dispatch": [Integral, str], + "verbose": ["verbose"], + "error_score": [StrOptions({"raise"}), Real], + "fit_params": [dict, None], + "params": [dict, None], + }, + prefer_skip_nested_validation=False, # estimator is not validated yet +) +def validation_curve( + estimator, + X, + y, + *, + param_name, + param_range, + groups=None, + cv=None, + scoring=None, + 
n_jobs=None, + pre_dispatch="all", + verbose=0, + error_score=np.nan, + fit_params=None, + params=None, +): + """Validation curve. + + Determine training and test scores for varying parameter values. + + Compute scores for an estimator with different values of a specified + parameter. This is similar to grid search with one parameter. However, this + will also compute training scores and is merely a utility for plotting the + results. + + Read more in the :ref:`User Guide <validation_curve>`. + + Parameters + ---------- + estimator : object type that implements the "fit" method + An object of that type which is cloned for each validation. It must + also implement "predict" unless `scoring` is a callable that doesn't + rely on "predict" to compute a score. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None + Target relative to X for classification or regression; + None for unsupervised learning. + + param_name : str + Name of the parameter that will be varied. + + param_range : array-like of shape (n_values,) + The values of the parameter that will be evaluated. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. Only used in conjunction with a "Group" :term:`cv` + instance (e.g., :class:`GroupKFold`). + + .. versionchanged:: 1.6 + ``groups`` can only be passed if metadata routing is not enabled + via ``sklearn.set_config(enable_metadata_routing=True)``. When routing + is enabled, pass ``groups`` alongside other metadata via the ``params`` + argument instead. E.g.: + ``validation_curve(..., params={'groups': groups})``. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross validation, + - int, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For int/None inputs, if the estimator is a classifier and ``y`` is + either binary or multiclass, :class:`StratifiedKFold` is used. In all + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. + + Refer :ref:`User Guide <cross_validation>` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + ``cv`` default value if None changed from 3-fold to 5-fold. + + scoring : str or callable, default=None + Scoring method to use to evaluate the training and test sets. + + - str: see :ref:`scoring_string_names` for options. + - callable: a scorer callable object (e.g., function) with signature + ``scorer(estimator, X, y)``. See :ref:`scoring_callable` for details. + - `None`: the `estimator`'s + :ref:`default evaluation criterion <scoring_api_overview>` is used. + + n_jobs : int, default=None + Number of jobs to run in parallel. Training the estimator and computing + the score are parallelized over the combinations of each parameter + value and each cross-validation split. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary <n_jobs>` + for more details. + + pre_dispatch : int or str, default='all' + Number of predispatched jobs for parallel execution (default is + all).
The option can reduce the allocated memory. The str can + be an expression like '2*n_jobs'. + + verbose : int, default=0 + Controls the verbosity: the higher, the more messages. + + error_score : 'raise' or numeric, default=np.nan + Value to assign to the score if an error occurs in estimator fitting. + If set to 'raise', the error is raised. + If a numeric value is given, FitFailedWarning is raised. + + .. versionadded:: 0.20 + + fit_params : dict, default=None + Parameters to pass to the fit method of the estimator. + + .. deprecated:: 1.6 + This parameter is deprecated and will be removed in version 1.8. Use + ``params`` instead. + + params : dict, default=None + Parameters to pass to the estimator, scorer and cross-validation object. + + - If `enable_metadata_routing=False` (default): Parameters directly passed to + the `fit` method of the estimator. + + - If `enable_metadata_routing=True`: Parameters safely routed to the `fit` + method of the estimator, to the scorer and to the cross-validation object. + See :ref:`Metadata Routing User Guide ` for more details. + + .. versionadded:: 1.6 + + Returns + ------- + train_scores : array of shape (n_ticks, n_cv_folds) + Scores on training sets. + + test_scores : array of shape (n_ticks, n_cv_folds) + Scores on test set. + + See Also + -------- + ValidationCurveDisplay.from_estimator : Plot the validation curve + given an estimator, the data, and the parameter to vary. + + Notes + ----- + See :ref:`sphx_glr_auto_examples_model_selection_plot_train_error_vs_test_error.py` + + Examples + -------- + >>> import numpy as np + >>> from sklearn.datasets import make_classification + >>> from sklearn.model_selection import validation_curve + >>> from sklearn.linear_model import LogisticRegression + >>> X, y = make_classification(n_samples=1_000, random_state=0) + >>> logistic_regression = LogisticRegression() + >>> param_name, param_range = "C", np.logspace(-8, 3, 10) + >>> train_scores, test_scores = validation_curve( + ... logistic_regression, X, y, param_name=param_name, param_range=param_range + ... ) + >>> print(f"The average train accuracy is {train_scores.mean():.2f}") + The average train accuracy is 0.81 + >>> print(f"The average test accuracy is {test_scores.mean():.2f}") + The average test accuracy is 0.81 + """ + params = _check_params_groups_deprecation(fit_params, params, groups, "1.8") + X, y, groups = indexable(X, y, groups) + + cv = check_cv(cv, y, classifier=is_classifier(estimator)) + scorer = check_scoring(estimator, scoring=scoring) + + if _routing_enabled(): + router = ( + MetadataRouter(owner="validation_curve") + .add( + estimator=estimator, + method_mapping=MethodMapping().add(caller="fit", callee="fit"), + ) + .add( + splitter=cv, + method_mapping=MethodMapping().add(caller="fit", callee="split"), + ) + .add( + scorer=scorer, + method_mapping=MethodMapping().add(caller="fit", callee="score"), + ) + ) + + try: + routed_params = process_routing(router, "fit", **params) + except UnsetMetadataPassedError as e: + # The default exception would mention `fit` since in the above + # `process_routing` code, we pass `fit` as the caller. However, + # the user is not calling `fit` directly, so we change the message + # to make it more suitable for this case. 
+ raise UnsetMetadataPassedError( + message=str(e).replace("validation_curve.fit", "validation_curve"), + unrequested_params=e.unrequested_params, + routed_params=e.routed_params, + ) + + else: + routed_params = Bunch() + routed_params.estimator = Bunch(fit=params) + routed_params.splitter = Bunch(split={"groups": groups}) + routed_params.scorer = Bunch(score={}) + + parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch, verbose=verbose) + results = parallel( + delayed(_fit_and_score)( + clone(estimator), + X, + y, + scorer=scorer, + train=train, + test=test, + verbose=verbose, + parameters={param_name: v}, + fit_params=routed_params.estimator.fit, + score_params=routed_params.scorer.score, + return_train_score=True, + error_score=error_score, + ) + # NOTE do not change order of iteration to allow one time cv splitters + for train, test in cv.split(X, y, **routed_params.splitter.split) + for v in param_range + ) + n_params = len(param_range) + + results = _aggregate_score_dicts(results) + train_scores = results["train_scores"].reshape(-1, n_params).T + test_scores = results["test_scores"].reshape(-1, n_params).T + + return train_scores, test_scores + + +def _aggregate_score_dicts(scores): + """Aggregate the list of dict to dict of np ndarray + + The aggregated output of _aggregate_score_dicts will be a list of dict + of form [{'prec': 0.1, 'acc':1.0}, {'prec': 0.1, 'acc':1.0}, ...] + Convert it to a dict of array {'prec': np.array([0.1 ...]), ...} + + Parameters + ---------- + + scores : list of dict + List of dicts of the scores for all scorers. This is a flat list, + assumed originally to be of row major order. + + Example + ------- + + >>> scores = [{'a': 1, 'b':10}, {'a': 2, 'b':2}, {'a': 3, 'b':3}, + ... {'a': 10, 'b': 10}] # doctest: +SKIP + >>> _aggregate_score_dicts(scores) # doctest: +SKIP + {'a': array([1, 2, 3, 10]), + 'b': array([10, 2, 3, 10])} + """ + return { + key: ( + np.asarray([score[key] for score in scores]) + if isinstance(scores[0][key], numbers.Number) + else [score[key] for score in scores] + ) + for key in scores[0] + } diff --git a/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/common.py b/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/common.py new file mode 100644 index 0000000000000000000000000000000000000000..54a993db76933a5e710f0ddd20a4efd0118ecf95 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/common.py @@ -0,0 +1,24 @@ +""" +Common utilities for testing model selection. 
+""" + +import numpy as np + +from sklearn.model_selection import KFold + + +class OneTimeSplitter: + """A wrapper to make KFold single entry cv iterator""" + + def __init__(self, n_splits=4, n_samples=99): + self.n_splits = n_splits + self.n_samples = n_samples + self.indices = iter(KFold(n_splits=n_splits).split(np.ones(n_samples))) + + def split(self, X=None, y=None, groups=None): + """Split can be called only once""" + for index in self.indices: + yield index + + def get_n_splits(self, X=None, y=None, groups=None): + return self.n_splits diff --git a/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/test_classification_threshold.py b/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/test_classification_threshold.py new file mode 100644 index 0000000000000000000000000000000000000000..1ba4dcea369748622d366df0477e3b7911873593 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/test_classification_threshold.py @@ -0,0 +1,618 @@ +import numpy as np +import pytest + +from sklearn import config_context +from sklearn.base import clone +from sklearn.datasets import ( + load_breast_cancer, + load_iris, + make_classification, + make_multilabel_classification, +) +from sklearn.dummy import DummyClassifier +from sklearn.ensemble import GradientBoostingClassifier +from sklearn.exceptions import NotFittedError +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import ( + balanced_accuracy_score, + f1_score, + fbeta_score, + make_scorer, +) +from sklearn.metrics._scorer import _CurveScorer +from sklearn.model_selection import ( + FixedThresholdClassifier, + StratifiedShuffleSplit, + TunedThresholdClassifierCV, +) +from sklearn.model_selection._classification_threshold import ( + _fit_and_score_over_thresholds, +) +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.svm import SVC +from sklearn.tree import DecisionTreeClassifier +from sklearn.utils._mocking import CheckingClassifier +from sklearn.utils._testing import ( + _convert_container, + assert_allclose, + assert_array_equal, +) + + +def test_fit_and_score_over_thresholds_curve_scorers(): + """Check that `_fit_and_score_over_thresholds` returns thresholds in ascending order + for the different accepted curve scorers.""" + X, y = make_classification(n_samples=100, random_state=0) + train_idx, val_idx = np.arange(50), np.arange(50, 100) + classifier = LogisticRegression() + + curve_scorer = _CurveScorer( + score_func=balanced_accuracy_score, + sign=1, + response_method="predict_proba", + thresholds=10, + kwargs={}, + ) + scores, thresholds = _fit_and_score_over_thresholds( + classifier, + X, + y, + fit_params={}, + train_idx=train_idx, + val_idx=val_idx, + curve_scorer=curve_scorer, + score_params={}, + ) + + assert np.all(thresholds[:-1] <= thresholds[1:]) + assert isinstance(scores, np.ndarray) + assert np.logical_and(scores >= 0, scores <= 1).all() + + +def test_fit_and_score_over_thresholds_prefit(): + """Check the behaviour with a prefit classifier.""" + X, y = make_classification(n_samples=100, random_state=0) + + # `train_idx is None` to indicate that the classifier is prefit + train_idx, val_idx = None, np.arange(50, 100) + classifier = DecisionTreeClassifier(random_state=0).fit(X, y) + # make sure that the classifier memorized the full dataset such that + # we get perfect predictions and thus match the expected score + assert classifier.score(X[val_idx], y[val_idx]) == pytest.approx(1.0) + + 
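+    # Editor's note (comment added in this copy, not upstream scikit-learn): with
+    # the memorizing tree above and only two candidate thresholds, the lowest
+    # threshold predicts the positive class for every validation sample (balanced
+    # accuracy 0.5) while the highest threshold reproduces the memorized labels
+    # (balanced accuracy 1.0), which is what `assert_allclose(scores, [0.5, 1.0])`
+    # below verifies.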
curve_scorer = _CurveScorer( + score_func=balanced_accuracy_score, + sign=1, + response_method="predict_proba", + thresholds=2, + kwargs={}, + ) + scores, thresholds = _fit_and_score_over_thresholds( + classifier, + X, + y, + fit_params={}, + train_idx=train_idx, + val_idx=val_idx, + curve_scorer=curve_scorer, + score_params={}, + ) + assert np.all(thresholds[:-1] <= thresholds[1:]) + assert_allclose(scores, [0.5, 1.0]) + + +@config_context(enable_metadata_routing=True) +def test_fit_and_score_over_thresholds_sample_weight(): + """Check that we dispatch the sample-weight to fit and score the classifier.""" + X, y = load_iris(return_X_y=True) + X, y = X[:100], y[:100] # only 2 classes + + # create a dataset and repeat twice the sample of class #0 + X_repeated, y_repeated = np.vstack([X, X[y == 0]]), np.hstack([y, y[y == 0]]) + # create a sample weight vector that is equivalent to the repeated dataset + sample_weight = np.ones_like(y) + sample_weight[:50] *= 2 + + classifier = LogisticRegression() + train_repeated_idx = np.arange(X_repeated.shape[0]) + val_repeated_idx = np.arange(X_repeated.shape[0]) + curve_scorer = _CurveScorer( + score_func=balanced_accuracy_score, + sign=1, + response_method="predict_proba", + thresholds=10, + kwargs={}, + ) + scores_repeated, thresholds_repeated = _fit_and_score_over_thresholds( + classifier, + X_repeated, + y_repeated, + fit_params={}, + train_idx=train_repeated_idx, + val_idx=val_repeated_idx, + curve_scorer=curve_scorer, + score_params={}, + ) + + train_idx, val_idx = np.arange(X.shape[0]), np.arange(X.shape[0]) + scores, thresholds = _fit_and_score_over_thresholds( + classifier.set_fit_request(sample_weight=True), + X, + y, + fit_params={"sample_weight": sample_weight}, + train_idx=train_idx, + val_idx=val_idx, + curve_scorer=curve_scorer.set_score_request(sample_weight=True), + score_params={"sample_weight": sample_weight}, + ) + + assert_allclose(thresholds_repeated, thresholds) + assert_allclose(scores_repeated, scores) + + +@pytest.mark.parametrize("fit_params_type", ["list", "array"]) +@config_context(enable_metadata_routing=True) +def test_fit_and_score_over_thresholds_fit_params(fit_params_type): + """Check that we pass `fit_params` to the classifier when calling `fit`.""" + X, y = make_classification(n_samples=100, random_state=0) + fit_params = { + "a": _convert_container(y, fit_params_type), + "b": _convert_container(y, fit_params_type), + } + + classifier = CheckingClassifier(expected_fit_params=["a", "b"], random_state=0) + classifier.set_fit_request(a=True, b=True) + train_idx, val_idx = np.arange(50), np.arange(50, 100) + + curve_scorer = _CurveScorer( + score_func=balanced_accuracy_score, + sign=1, + response_method="predict_proba", + thresholds=10, + kwargs={}, + ) + _fit_and_score_over_thresholds( + classifier, + X, + y, + fit_params=fit_params, + train_idx=train_idx, + val_idx=val_idx, + curve_scorer=curve_scorer, + score_params={}, + ) + + +@pytest.mark.parametrize( + "data", + [ + make_classification(n_classes=3, n_clusters_per_class=1, random_state=0), + make_multilabel_classification(random_state=0), + ], +) +def test_tuned_threshold_classifier_no_binary(data): + """Check that we raise an informative error message for non-binary problem.""" + err_msg = "Only binary classification is supported." 
+ with pytest.raises(ValueError, match=err_msg): + TunedThresholdClassifierCV(LogisticRegression()).fit(*data) + + +@pytest.mark.parametrize( + "params, err_type, err_msg", + [ + ( + {"cv": "prefit", "refit": True}, + ValueError, + "When cv='prefit', refit cannot be True.", + ), + ( + {"cv": 10, "refit": False}, + ValueError, + "When cv has several folds, refit cannot be False.", + ), + ( + {"cv": "prefit", "refit": False}, + NotFittedError, + "`estimator` must be fitted.", + ), + ], +) +def test_tuned_threshold_classifier_conflict_cv_refit(params, err_type, err_msg): + """Check that we raise an informative error message when `cv` and `refit` + cannot be used together. + """ + X, y = make_classification(n_samples=100, random_state=0) + with pytest.raises(err_type, match=err_msg): + TunedThresholdClassifierCV(LogisticRegression(), **params).fit(X, y) + + +@pytest.mark.parametrize( + "estimator", + [LogisticRegression(), SVC(), GradientBoostingClassifier(n_estimators=4)], +) +@pytest.mark.parametrize( + "response_method", ["predict_proba", "predict_log_proba", "decision_function"] +) +@pytest.mark.parametrize( + "ThresholdClassifier", [FixedThresholdClassifier, TunedThresholdClassifierCV] +) +def test_threshold_classifier_estimator_response_methods( + ThresholdClassifier, estimator, response_method +): + """Check that `TunedThresholdClassifierCV` exposes the same response methods as the + underlying estimator. + """ + X, y = make_classification(n_samples=100, random_state=0) + + model = ThresholdClassifier(estimator=estimator) + assert hasattr(model, response_method) == hasattr(estimator, response_method) + + model.fit(X, y) + assert hasattr(model, response_method) == hasattr(estimator, response_method) + + if hasattr(model, response_method): + y_pred_cutoff = getattr(model, response_method)(X) + y_pred_underlying_estimator = getattr(model.estimator_, response_method)(X) + + assert_allclose(y_pred_cutoff, y_pred_underlying_estimator) + + +@pytest.mark.parametrize( + "response_method", ["auto", "decision_function", "predict_proba"] +) +def test_tuned_threshold_classifier_without_constraint_value(response_method): + """Check that `TunedThresholdClassifierCV` is optimizing a given objective + metric.""" + X, y = load_breast_cancer(return_X_y=True) + # remove feature to degrade performances + X = X[:, :5] + + # make the problem completely imbalanced such that the balanced accuracy is low + indices_pos = np.flatnonzero(y == 1) + indices_pos = indices_pos[: indices_pos.size // 50] + indices_neg = np.flatnonzero(y == 0) + + X = np.vstack([X[indices_neg], X[indices_pos]]) + y = np.hstack([y[indices_neg], y[indices_pos]]) + + lr = make_pipeline(StandardScaler(), LogisticRegression()).fit(X, y) + thresholds = 100 + model = TunedThresholdClassifierCV( + estimator=lr, + scoring="balanced_accuracy", + response_method=response_method, + thresholds=thresholds, + store_cv_results=True, + ) + score_optimized = balanced_accuracy_score(y, model.fit(X, y).predict(X)) + score_baseline = balanced_accuracy_score(y, lr.predict(X)) + assert score_optimized > score_baseline + assert model.cv_results_["thresholds"].shape == (thresholds,) + assert model.cv_results_["scores"].shape == (thresholds,) + + +def test_tuned_threshold_classifier_metric_with_parameter(): + """Check that we can pass a metric with a parameter in addition check that + `f_beta` with `beta=1` is equivalent to `f1` and different from `f_beta` with + `beta=2`. 
+ """ + X, y = load_breast_cancer(return_X_y=True) + lr = make_pipeline(StandardScaler(), LogisticRegression()).fit(X, y) + model_fbeta_1 = TunedThresholdClassifierCV( + estimator=lr, scoring=make_scorer(fbeta_score, beta=1) + ).fit(X, y) + model_fbeta_2 = TunedThresholdClassifierCV( + estimator=lr, scoring=make_scorer(fbeta_score, beta=2) + ).fit(X, y) + model_f1 = TunedThresholdClassifierCV( + estimator=lr, scoring=make_scorer(f1_score) + ).fit(X, y) + + assert model_fbeta_1.best_threshold_ == pytest.approx(model_f1.best_threshold_) + assert model_fbeta_1.best_threshold_ != pytest.approx(model_fbeta_2.best_threshold_) + + +@pytest.mark.parametrize( + "response_method", ["auto", "decision_function", "predict_proba"] +) +@pytest.mark.parametrize( + "metric", + [ + make_scorer(balanced_accuracy_score), + make_scorer(f1_score, pos_label="cancer"), + ], +) +def test_tuned_threshold_classifier_with_string_targets(response_method, metric): + """Check that targets represented by str are properly managed. + Also, check with several metrics to be sure that `pos_label` is properly + dispatched. + """ + X, y = load_breast_cancer(return_X_y=True) + # Encode numeric targets by meaningful strings. We purposely designed the class + # names such that the `pos_label` is the first alphabetically sorted class and thus + # encoded as 0. + classes = np.array(["cancer", "healthy"], dtype=object) + y = classes[y] + model = TunedThresholdClassifierCV( + estimator=make_pipeline(StandardScaler(), LogisticRegression()), + scoring=metric, + response_method=response_method, + thresholds=100, + ).fit(X, y) + assert_array_equal(model.classes_, np.sort(classes)) + y_pred = model.predict(X) + assert_array_equal(np.unique(y_pred), np.sort(classes)) + + +@pytest.mark.parametrize("with_sample_weight", [True, False]) +@config_context(enable_metadata_routing=True) +def test_tuned_threshold_classifier_refit(with_sample_weight, global_random_seed): + """Check the behaviour of the `refit` parameter.""" + rng = np.random.RandomState(global_random_seed) + X, y = make_classification(n_samples=100, random_state=0) + if with_sample_weight: + sample_weight = rng.randn(X.shape[0]) + sample_weight = np.abs(sample_weight, out=sample_weight) + else: + sample_weight = None + + # check that `estimator_` if fitted on the full dataset when `refit=True` + estimator = LogisticRegression().set_fit_request(sample_weight=True) + model = TunedThresholdClassifierCV(estimator, refit=True).fit( + X, y, sample_weight=sample_weight + ) + + assert model.estimator_ is not estimator + estimator.fit(X, y, sample_weight=sample_weight) + assert_allclose(model.estimator_.coef_, estimator.coef_) + assert_allclose(model.estimator_.intercept_, estimator.intercept_) + + # check that `estimator_` was not altered when `refit=False` and `cv="prefit"` + estimator = LogisticRegression().set_fit_request(sample_weight=True) + estimator.fit(X, y, sample_weight=sample_weight) + coef = estimator.coef_.copy() + model = TunedThresholdClassifierCV(estimator, cv="prefit", refit=False).fit( + X, y, sample_weight=sample_weight + ) + + assert model.estimator_ is estimator + assert_allclose(model.estimator_.coef_, coef) + + # check that we train `estimator_` on the training split of a given cross-validation + estimator = LogisticRegression().set_fit_request(sample_weight=True) + cv = [ + (np.arange(50), np.arange(50, 100)), + ] # single split + model = TunedThresholdClassifierCV(estimator, cv=cv, refit=False).fit( + X, y, sample_weight=sample_weight + ) + + assert model.estimator_ 
is not estimator + if with_sample_weight: + sw_train = sample_weight[cv[0][0]] + else: + sw_train = None + estimator.fit(X[cv[0][0]], y[cv[0][0]], sample_weight=sw_train) + assert_allclose(model.estimator_.coef_, estimator.coef_) + + +@pytest.mark.parametrize("fit_params_type", ["list", "array"]) +@config_context(enable_metadata_routing=True) +def test_tuned_threshold_classifier_fit_params(fit_params_type): + """Check that we pass `fit_params` to the classifier when calling `fit`.""" + X, y = make_classification(n_samples=100, random_state=0) + fit_params = { + "a": _convert_container(y, fit_params_type), + "b": _convert_container(y, fit_params_type), + } + + classifier = CheckingClassifier(expected_fit_params=["a", "b"], random_state=0) + classifier.set_fit_request(a=True, b=True) + model = TunedThresholdClassifierCV(classifier) + model.fit(X, y, **fit_params) + + +@config_context(enable_metadata_routing=True) +def test_tuned_threshold_classifier_cv_zeros_sample_weights_equivalence(): + """Check that passing removing some sample from the dataset `X` is + equivalent to passing a `sample_weight` with a factor 0.""" + X, y = load_iris(return_X_y=True) + # Scale the data to avoid any convergence issue + X = StandardScaler().fit_transform(X) + # Only use 2 classes and select samples such that 2-fold cross-validation + # split will lead to an equivalence with a `sample_weight` of 0 + X = np.vstack((X[:40], X[50:90])) + y = np.hstack((y[:40], y[50:90])) + sample_weight = np.zeros_like(y) + sample_weight[::2] = 1 + + estimator = LogisticRegression().set_fit_request(sample_weight=True) + model_without_weights = TunedThresholdClassifierCV(estimator, cv=2) + model_with_weights = clone(model_without_weights) + + model_with_weights.fit(X, y, sample_weight=sample_weight) + model_without_weights.fit(X[::2], y[::2]) + + assert_allclose( + model_with_weights.estimator_.coef_, model_without_weights.estimator_.coef_ + ) + + y_pred_with_weights = model_with_weights.predict_proba(X) + y_pred_without_weights = model_without_weights.predict_proba(X) + assert_allclose(y_pred_with_weights, y_pred_without_weights) + + +def test_tuned_threshold_classifier_thresholds_array(): + """Check that we can pass an array to `thresholds` and it is used as candidate + threshold internally.""" + X, y = make_classification(random_state=0) + estimator = LogisticRegression() + thresholds = np.linspace(0, 1, 11) + tuned_model = TunedThresholdClassifierCV( + estimator, + thresholds=thresholds, + response_method="predict_proba", + store_cv_results=True, + ).fit(X, y) + assert_allclose(tuned_model.cv_results_["thresholds"], thresholds) + + +@pytest.mark.parametrize("store_cv_results", [True, False]) +def test_tuned_threshold_classifier_store_cv_results(store_cv_results): + """Check that if `cv_results_` exists depending on `store_cv_results`.""" + X, y = make_classification(random_state=0) + estimator = LogisticRegression() + tuned_model = TunedThresholdClassifierCV( + estimator, store_cv_results=store_cv_results + ).fit(X, y) + if store_cv_results: + assert hasattr(tuned_model, "cv_results_") + else: + assert not hasattr(tuned_model, "cv_results_") + + +def test_tuned_threshold_classifier_cv_float(): + """Check the behaviour when `cv` is set to a float.""" + X, y = make_classification(random_state=0) + + # case where `refit=False` and cv is a float: the underlying estimator will be fit + # on the training set given by a ShuffleSplit. We check that we get the same model + # coefficients. 
+ test_size = 0.3 + estimator = LogisticRegression() + tuned_model = TunedThresholdClassifierCV( + estimator, cv=test_size, refit=False, random_state=0 + ).fit(X, y) + tuned_model.fit(X, y) + + cv = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=0) + train_idx, val_idx = next(cv.split(X, y)) + cloned_estimator = clone(estimator).fit(X[train_idx], y[train_idx]) + + assert_allclose(tuned_model.estimator_.coef_, cloned_estimator.coef_) + + # case where `refit=True`, then the underlying estimator is fitted on the full + # dataset. + tuned_model.set_params(refit=True).fit(X, y) + cloned_estimator = clone(estimator).fit(X, y) + + assert_allclose(tuned_model.estimator_.coef_, cloned_estimator.coef_) + + +def test_tuned_threshold_classifier_error_constant_predictor(): + """Check that we raise a ValueError if the underlying classifier returns constant + probabilities such that we cannot find any threshold. + """ + X, y = make_classification(random_state=0) + estimator = DummyClassifier(strategy="constant", constant=1) + tuned_model = TunedThresholdClassifierCV(estimator, response_method="predict_proba") + err_msg = "The provided estimator makes constant predictions" + with pytest.raises(ValueError, match=err_msg): + tuned_model.fit(X, y) + + +@pytest.mark.parametrize( + "response_method", ["auto", "predict_proba", "decision_function"] +) +def test_fixed_threshold_classifier_equivalence_default(response_method): + """Check that `FixedThresholdClassifier` has the same behaviour as the vanilla + classifier. + """ + X, y = make_classification(random_state=0) + classifier = LogisticRegression().fit(X, y) + classifier_default_threshold = FixedThresholdClassifier( + estimator=clone(classifier), response_method=response_method + ) + classifier_default_threshold.fit(X, y) + + # emulate the response method that should take into account the `pos_label` + if response_method in ("auto", "predict_proba"): + y_score = classifier_default_threshold.predict_proba(X)[:, 1] + threshold = 0.5 + else: # response_method == "decision_function" + y_score = classifier_default_threshold.decision_function(X) + threshold = 0.0 + + y_pred_lr = (y_score >= threshold).astype(int) + assert_allclose(classifier_default_threshold.predict(X), y_pred_lr) + + +@pytest.mark.parametrize( + "response_method, threshold", [("predict_proba", 0.7), ("decision_function", 2.0)] +) +@pytest.mark.parametrize("pos_label", [0, 1]) +def test_fixed_threshold_classifier(response_method, threshold, pos_label): + """Check that applying `predict` lead to the same prediction as applying the + threshold to the output of the response method. 
+ """ + X, y = make_classification(n_samples=50, random_state=0) + logistic_regression = LogisticRegression().fit(X, y) + model = FixedThresholdClassifier( + estimator=clone(logistic_regression), + threshold=threshold, + response_method=response_method, + pos_label=pos_label, + ).fit(X, y) + + # check that the underlying estimator is the same + assert_allclose(model.estimator_.coef_, logistic_regression.coef_) + + # emulate the response method that should take into account the `pos_label` + if response_method == "predict_proba": + y_score = model.predict_proba(X)[:, pos_label] + else: # response_method == "decision_function" + y_score = model.decision_function(X) + y_score = y_score if pos_label == 1 else -y_score + + # create a mapping from boolean values to class labels + map_to_label = np.array([0, 1]) if pos_label == 1 else np.array([1, 0]) + y_pred_lr = map_to_label[(y_score >= threshold).astype(int)] + assert_allclose(model.predict(X), y_pred_lr) + + for method in ("predict_proba", "predict_log_proba", "decision_function"): + assert_allclose( + getattr(model, method)(X), getattr(logistic_regression, method)(X) + ) + assert_allclose( + getattr(model.estimator_, method)(X), + getattr(logistic_regression, method)(X), + ) + + +@config_context(enable_metadata_routing=True) +def test_fixed_threshold_classifier_metadata_routing(): + """Check that everything works with metadata routing.""" + X, y = make_classification(random_state=0) + sample_weight = np.ones_like(y) + sample_weight[::2] = 2 + classifier = LogisticRegression().set_fit_request(sample_weight=True) + classifier.fit(X, y, sample_weight=sample_weight) + classifier_default_threshold = FixedThresholdClassifier(estimator=clone(classifier)) + classifier_default_threshold.fit(X, y, sample_weight=sample_weight) + assert_allclose(classifier_default_threshold.estimator_.coef_, classifier.coef_) + + +@pytest.mark.parametrize( + "method", ["predict_proba", "decision_function", "predict", "predict_log_proba"] +) +def test_fixed_threshold_classifier_fitted_estimator(method): + """Check that if the underlying estimator is already fitted, no fit is required.""" + X, y = make_classification(random_state=0) + classifier = LogisticRegression().fit(X, y) + fixed_threshold_classifier = FixedThresholdClassifier(estimator=classifier) + # This should not raise an error + getattr(fixed_threshold_classifier, method)(X) + + +def test_fixed_threshold_classifier_classes_(): + """Check that the classes_ attribute is properly set.""" + X, y = make_classification(random_state=0) + with pytest.raises( + AttributeError, match="The underlying estimator is not fitted yet." 
+ ): + FixedThresholdClassifier(estimator=LogisticRegression()).classes_ + + classifier = LogisticRegression().fit(X, y) + fixed_threshold_classifier = FixedThresholdClassifier(estimator=classifier) + assert_array_equal(fixed_threshold_classifier.classes_, classifier.classes_) diff --git a/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/test_plot.py b/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/test_plot.py new file mode 100644 index 0000000000000000000000000000000000000000..4e884755174545ababe24d423fd84cf7882104cb --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/test_plot.py @@ -0,0 +1,572 @@ +import numpy as np +import pytest + +from sklearn.datasets import load_iris +from sklearn.model_selection import ( + LearningCurveDisplay, + ValidationCurveDisplay, + learning_curve, + validation_curve, +) +from sklearn.tree import DecisionTreeClassifier +from sklearn.utils import shuffle +from sklearn.utils._testing import assert_allclose, assert_array_equal + + +@pytest.fixture +def data(): + return shuffle(*load_iris(return_X_y=True), random_state=0) + + +@pytest.mark.parametrize( + "params, err_type, err_msg", + [ + ({"std_display_style": "invalid"}, ValueError, "Unknown std_display_style:"), + ({"score_type": "invalid"}, ValueError, "Unknown score_type:"), + ], +) +@pytest.mark.parametrize( + "CurveDisplay, specific_params", + [ + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + (LearningCurveDisplay, {"train_sizes": [0.3, 0.6, 0.9]}), + ], +) +def test_curve_display_parameters_validation( + pyplot, data, params, err_type, err_msg, CurveDisplay, specific_params +): + """Check that we raise a proper error when passing invalid parameters.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + with pytest.raises(err_type, match=err_msg): + CurveDisplay.from_estimator(estimator, X, y, **specific_params, **params) + + +def test_learning_curve_display_default_usage(pyplot, data): + """Check the default usage of the LearningCurveDisplay class.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + train_sizes = [0.3, 0.6, 0.9] + display = LearningCurveDisplay.from_estimator( + estimator, X, y, train_sizes=train_sizes + ) + + import matplotlib as mpl + + assert display.errorbar_ is None + + assert isinstance(display.lines_, list) + for line in display.lines_: + assert isinstance(line, mpl.lines.Line2D) + + assert isinstance(display.fill_between_, list) + for fill in display.fill_between_: + assert isinstance(fill, mpl.collections.PolyCollection) + assert fill.get_alpha() == 0.5 + + assert display.score_name == "Score" + assert display.ax_.get_xlabel() == "Number of samples in the training set" + assert display.ax_.get_ylabel() == "Score" + + _, legend_labels = display.ax_.get_legend_handles_labels() + assert legend_labels == ["Train", "Test"] + + train_sizes_abs, train_scores, test_scores = learning_curve( + estimator, X, y, train_sizes=train_sizes + ) + + assert_array_equal(display.train_sizes, train_sizes_abs) + assert_allclose(display.train_scores, train_scores) + assert_allclose(display.test_scores, test_scores) + + +def test_validation_curve_display_default_usage(pyplot, data): + """Check the default usage of the ValidationCurveDisplay class.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + param_name, param_range = "max_depth", [1, 3, 5] + display = ValidationCurveDisplay.from_estimator( + estimator, X, y, param_name=param_name, 
param_range=param_range + ) + + import matplotlib as mpl + + assert display.errorbar_ is None + + assert isinstance(display.lines_, list) + for line in display.lines_: + assert isinstance(line, mpl.lines.Line2D) + + assert isinstance(display.fill_between_, list) + for fill in display.fill_between_: + assert isinstance(fill, mpl.collections.PolyCollection) + assert fill.get_alpha() == 0.5 + + assert display.score_name == "Score" + assert display.ax_.get_xlabel() == f"{param_name}" + assert display.ax_.get_ylabel() == "Score" + + _, legend_labels = display.ax_.get_legend_handles_labels() + assert legend_labels == ["Train", "Test"] + + train_scores, test_scores = validation_curve( + estimator, X, y, param_name=param_name, param_range=param_range + ) + + assert_array_equal(display.param_range, param_range) + assert_allclose(display.train_scores, train_scores) + assert_allclose(display.test_scores, test_scores) + + +@pytest.mark.parametrize( + "CurveDisplay, specific_params", + [ + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + (LearningCurveDisplay, {"train_sizes": [0.3, 0.6, 0.9]}), + ], +) +def test_curve_display_negate_score(pyplot, data, CurveDisplay, specific_params): + """Check the behaviour of the `negate_score` parameter calling `from_estimator` and + `plot`. + """ + X, y = data + estimator = DecisionTreeClassifier(max_depth=1, random_state=0) + + negate_score = False + display = CurveDisplay.from_estimator( + estimator, X, y, **specific_params, negate_score=negate_score + ) + + positive_scores = display.lines_[0].get_data()[1] + assert (positive_scores >= 0).all() + assert display.ax_.get_ylabel() == "Score" + + negate_score = True + display = CurveDisplay.from_estimator( + estimator, X, y, **specific_params, negate_score=negate_score + ) + + negative_scores = display.lines_[0].get_data()[1] + assert (negative_scores <= 0).all() + assert_allclose(negative_scores, -positive_scores) + assert display.ax_.get_ylabel() == "Negative score" + + negate_score = False + display = CurveDisplay.from_estimator( + estimator, X, y, **specific_params, negate_score=negate_score + ) + assert display.ax_.get_ylabel() == "Score" + display.plot(negate_score=not negate_score) + assert display.ax_.get_ylabel() == "Score" + assert (display.lines_[0].get_data()[1] < 0).all() + + +@pytest.mark.parametrize( + "score_name, ylabel", [(None, "Score"), ("Accuracy", "Accuracy")] +) +@pytest.mark.parametrize( + "CurveDisplay, specific_params", + [ + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + (LearningCurveDisplay, {"train_sizes": [0.3, 0.6, 0.9]}), + ], +) +def test_curve_display_score_name( + pyplot, data, score_name, ylabel, CurveDisplay, specific_params +): + """Check that we can overwrite the default score name shown on the y-axis.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + display = CurveDisplay.from_estimator( + estimator, X, y, **specific_params, score_name=score_name + ) + + assert display.ax_.get_ylabel() == ylabel + X, y = data + estimator = DecisionTreeClassifier(max_depth=1, random_state=0) + + display = CurveDisplay.from_estimator( + estimator, X, y, **specific_params, score_name=score_name + ) + + assert display.score_name == ylabel + + +@pytest.mark.parametrize("std_display_style", (None, "errorbar")) +def test_learning_curve_display_score_type(pyplot, data, std_display_style): + """Check the behaviour of setting the `score_type` parameter.""" + X, y = data + estimator = 
DecisionTreeClassifier(random_state=0) + + train_sizes = [0.3, 0.6, 0.9] + train_sizes_abs, train_scores, test_scores = learning_curve( + estimator, X, y, train_sizes=train_sizes + ) + + score_type = "train" + display = LearningCurveDisplay.from_estimator( + estimator, + X, + y, + train_sizes=train_sizes, + score_type=score_type, + std_display_style=std_display_style, + ) + + _, legend_label = display.ax_.get_legend_handles_labels() + assert legend_label == ["Train"] + + if std_display_style is None: + assert len(display.lines_) == 1 + assert display.errorbar_ is None + x_data, y_data = display.lines_[0].get_data() + else: + assert display.lines_ is None + assert len(display.errorbar_) == 1 + x_data, y_data = display.errorbar_[0].lines[0].get_data() + + assert_array_equal(x_data, train_sizes_abs) + assert_allclose(y_data, train_scores.mean(axis=1)) + + score_type = "test" + display = LearningCurveDisplay.from_estimator( + estimator, + X, + y, + train_sizes=train_sizes, + score_type=score_type, + std_display_style=std_display_style, + ) + + _, legend_label = display.ax_.get_legend_handles_labels() + assert legend_label == ["Test"] + + if std_display_style is None: + assert len(display.lines_) == 1 + assert display.errorbar_ is None + x_data, y_data = display.lines_[0].get_data() + else: + assert display.lines_ is None + assert len(display.errorbar_) == 1 + x_data, y_data = display.errorbar_[0].lines[0].get_data() + + assert_array_equal(x_data, train_sizes_abs) + assert_allclose(y_data, test_scores.mean(axis=1)) + + score_type = "both" + display = LearningCurveDisplay.from_estimator( + estimator, + X, + y, + train_sizes=train_sizes, + score_type=score_type, + std_display_style=std_display_style, + ) + + _, legend_label = display.ax_.get_legend_handles_labels() + assert legend_label == ["Train", "Test"] + + if std_display_style is None: + assert len(display.lines_) == 2 + assert display.errorbar_ is None + x_data_train, y_data_train = display.lines_[0].get_data() + x_data_test, y_data_test = display.lines_[1].get_data() + else: + assert display.lines_ is None + assert len(display.errorbar_) == 2 + x_data_train, y_data_train = display.errorbar_[0].lines[0].get_data() + x_data_test, y_data_test = display.errorbar_[1].lines[0].get_data() + + assert_array_equal(x_data_train, train_sizes_abs) + assert_allclose(y_data_train, train_scores.mean(axis=1)) + assert_array_equal(x_data_test, train_sizes_abs) + assert_allclose(y_data_test, test_scores.mean(axis=1)) + + +@pytest.mark.parametrize("std_display_style", (None, "errorbar")) +def test_validation_curve_display_score_type(pyplot, data, std_display_style): + """Check the behaviour of setting the `score_type` parameter.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + param_name, param_range = "max_depth", [1, 3, 5] + train_scores, test_scores = validation_curve( + estimator, X, y, param_name=param_name, param_range=param_range + ) + + score_type = "train" + display = ValidationCurveDisplay.from_estimator( + estimator, + X, + y, + param_name=param_name, + param_range=param_range, + score_type=score_type, + std_display_style=std_display_style, + ) + + _, legend_label = display.ax_.get_legend_handles_labels() + assert legend_label == ["Train"] + + if std_display_style is None: + assert len(display.lines_) == 1 + assert display.errorbar_ is None + x_data, y_data = display.lines_[0].get_data() + else: + assert display.lines_ is None + assert len(display.errorbar_) == 1 + x_data, y_data = display.errorbar_[0].lines[0].get_data() + 
+ assert_array_equal(x_data, param_range) + assert_allclose(y_data, train_scores.mean(axis=1)) + + score_type = "test" + display = ValidationCurveDisplay.from_estimator( + estimator, + X, + y, + param_name=param_name, + param_range=param_range, + score_type=score_type, + std_display_style=std_display_style, + ) + + _, legend_label = display.ax_.get_legend_handles_labels() + assert legend_label == ["Test"] + + if std_display_style is None: + assert len(display.lines_) == 1 + assert display.errorbar_ is None + x_data, y_data = display.lines_[0].get_data() + else: + assert display.lines_ is None + assert len(display.errorbar_) == 1 + x_data, y_data = display.errorbar_[0].lines[0].get_data() + + assert_array_equal(x_data, param_range) + assert_allclose(y_data, test_scores.mean(axis=1)) + + score_type = "both" + display = ValidationCurveDisplay.from_estimator( + estimator, + X, + y, + param_name=param_name, + param_range=param_range, + score_type=score_type, + std_display_style=std_display_style, + ) + + _, legend_label = display.ax_.get_legend_handles_labels() + assert legend_label == ["Train", "Test"] + + if std_display_style is None: + assert len(display.lines_) == 2 + assert display.errorbar_ is None + x_data_train, y_data_train = display.lines_[0].get_data() + x_data_test, y_data_test = display.lines_[1].get_data() + else: + assert display.lines_ is None + assert len(display.errorbar_) == 2 + x_data_train, y_data_train = display.errorbar_[0].lines[0].get_data() + x_data_test, y_data_test = display.errorbar_[1].lines[0].get_data() + + assert_array_equal(x_data_train, param_range) + assert_allclose(y_data_train, train_scores.mean(axis=1)) + assert_array_equal(x_data_test, param_range) + assert_allclose(y_data_test, test_scores.mean(axis=1)) + + +@pytest.mark.parametrize( + "CurveDisplay, specific_params, expected_xscale", + [ + ( + ValidationCurveDisplay, + {"param_name": "max_depth", "param_range": np.arange(1, 5)}, + "linear", + ), + (LearningCurveDisplay, {"train_sizes": np.linspace(0.1, 0.9, num=5)}, "linear"), + ( + ValidationCurveDisplay, + { + "param_name": "max_depth", + "param_range": np.round(np.logspace(0, 2, num=5)).astype(np.int64), + }, + "log", + ), + (LearningCurveDisplay, {"train_sizes": np.logspace(-1, 0, num=5)}, "log"), + ], +) +def test_curve_display_xscale_auto( + pyplot, data, CurveDisplay, specific_params, expected_xscale +): + """Check the behaviour of the x-axis scaling depending on the data provided.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + display = CurveDisplay.from_estimator(estimator, X, y, **specific_params) + assert display.ax_.get_xscale() == expected_xscale + + +@pytest.mark.parametrize( + "CurveDisplay, specific_params", + [ + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + (LearningCurveDisplay, {"train_sizes": [0.3, 0.6, 0.9]}), + ], +) +def test_curve_display_std_display_style(pyplot, data, CurveDisplay, specific_params): + """Check the behaviour of the parameter `std_display_style`.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + import matplotlib as mpl + + std_display_style = None + display = CurveDisplay.from_estimator( + estimator, + X, + y, + **specific_params, + std_display_style=std_display_style, + ) + + assert len(display.lines_) == 2 + for line in display.lines_: + assert isinstance(line, mpl.lines.Line2D) + assert display.errorbar_ is None + assert display.fill_between_ is None + _, legend_label = display.ax_.get_legend_handles_labels() + assert 
len(legend_label) == 2 + + std_display_style = "fill_between" + display = CurveDisplay.from_estimator( + estimator, + X, + y, + **specific_params, + std_display_style=std_display_style, + ) + + assert len(display.lines_) == 2 + for line in display.lines_: + assert isinstance(line, mpl.lines.Line2D) + assert display.errorbar_ is None + assert len(display.fill_between_) == 2 + for fill_between in display.fill_between_: + assert isinstance(fill_between, mpl.collections.PolyCollection) + _, legend_label = display.ax_.get_legend_handles_labels() + assert len(legend_label) == 2 + + std_display_style = "errorbar" + display = CurveDisplay.from_estimator( + estimator, + X, + y, + **specific_params, + std_display_style=std_display_style, + ) + + assert display.lines_ is None + assert len(display.errorbar_) == 2 + for errorbar in display.errorbar_: + assert isinstance(errorbar, mpl.container.ErrorbarContainer) + assert display.fill_between_ is None + _, legend_label = display.ax_.get_legend_handles_labels() + assert len(legend_label) == 2 + + +@pytest.mark.parametrize( + "CurveDisplay, specific_params", + [ + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + (LearningCurveDisplay, {"train_sizes": [0.3, 0.6, 0.9]}), + ], +) +def test_curve_display_plot_kwargs(pyplot, data, CurveDisplay, specific_params): + """Check the behaviour of the different plotting keyword arguments: `line_kw`, + `fill_between_kw`, and `errorbar_kw`.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + std_display_style = "fill_between" + line_kw = {"color": "red"} + fill_between_kw = {"color": "red", "alpha": 1.0} + display = CurveDisplay.from_estimator( + estimator, + X, + y, + **specific_params, + std_display_style=std_display_style, + line_kw=line_kw, + fill_between_kw=fill_between_kw, + ) + + assert display.lines_[0].get_color() == "red" + assert_allclose( + display.fill_between_[0].get_facecolor(), + [[1.0, 0.0, 0.0, 1.0]], # trust me, it's red + ) + + std_display_style = "errorbar" + errorbar_kw = {"color": "red"} + display = CurveDisplay.from_estimator( + estimator, + X, + y, + **specific_params, + std_display_style=std_display_style, + errorbar_kw=errorbar_kw, + ) + + assert display.errorbar_[0].lines[0].get_color() == "red" + + +@pytest.mark.parametrize( + "param_range, xscale", + [([5, 10, 15], "linear"), ([-50, 5, 50, 500], "symlog"), ([5, 50, 500], "log")], +) +def test_validation_curve_xscale_from_param_range_provided_as_a_list( + pyplot, data, param_range, xscale +): + """Check the induced xscale from the provided param_range values.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + param_name = "max_depth" + display = ValidationCurveDisplay.from_estimator( + estimator, + X, + y, + param_name=param_name, + param_range=param_range, + ) + + assert display.ax_.get_xscale() == xscale + + +@pytest.mark.parametrize( + "Display, params", + [ + (LearningCurveDisplay, {}), + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + ], +) +def test_subclassing_displays(pyplot, data, Display, params): + """Check that named constructors return the correct type when subclassed. 
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/pull/27675 + """ + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + class SubclassOfDisplay(Display): + pass + + display = SubclassOfDisplay.from_estimator(estimator, X, y, **params) + assert isinstance(display, SubclassOfDisplay) diff --git a/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/test_search.py b/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/test_search.py new file mode 100644 index 0000000000000000000000000000000000000000..7888dd2d1766b411549f29c995ac9bd58595158d --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/test_search.py @@ -0,0 +1,2966 @@ +"""Test the search module""" + +import pickle +import re +import sys +import warnings +from collections.abc import Iterable, Sized +from functools import partial +from io import StringIO +from itertools import chain, product +from types import GeneratorType + +import numpy as np +import pytest +from scipy.stats import bernoulli, expon, uniform + +from sklearn import config_context +from sklearn.base import BaseEstimator, ClassifierMixin, clone, is_classifier +from sklearn.cluster import KMeans +from sklearn.compose import ColumnTransformer +from sklearn.datasets import ( + make_blobs, + make_classification, + make_multilabel_classification, +) +from sklearn.discriminant_analysis import LinearDiscriminantAnalysis +from sklearn.dummy import DummyClassifier +from sklearn.ensemble import HistGradientBoostingClassifier +from sklearn.exceptions import FitFailedWarning +from sklearn.experimental import enable_halving_search_cv # noqa: F401 +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.impute import SimpleImputer +from sklearn.linear_model import ( + LinearRegression, + LogisticRegression, + Ridge, + SGDClassifier, +) +from sklearn.metrics import ( + accuracy_score, + confusion_matrix, + f1_score, + make_scorer, + r2_score, + recall_score, + roc_auc_score, +) +from sklearn.metrics.pairwise import euclidean_distances +from sklearn.model_selection import ( + GridSearchCV, + GroupKFold, + GroupShuffleSplit, + HalvingGridSearchCV, + KFold, + LeaveOneGroupOut, + LeavePGroupsOut, + ParameterGrid, + ParameterSampler, + RandomizedSearchCV, + StratifiedKFold, + StratifiedShuffleSplit, + train_test_split, +) +from sklearn.model_selection._search import ( + BaseSearchCV, + _yield_masked_array_for_each_param, +) +from sklearn.model_selection.tests.common import OneTimeSplitter +from sklearn.naive_bayes import ComplementNB +from sklearn.neighbors import KernelDensity, KNeighborsClassifier, LocalOutlierFactor +from sklearn.pipeline import Pipeline, make_pipeline +from sklearn.preprocessing import ( + OneHotEncoder, + OrdinalEncoder, + SplineTransformer, + StandardScaler, +) +from sklearn.svm import SVC, LinearSVC +from sklearn.tests.metadata_routing_common import ( + ConsumingScorer, + _Registry, + check_recorded_metadata, +) +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor +from sklearn.utils._array_api import ( + _get_namespace_device_dtype_ids, + yield_namespace_device_dtype_combinations, +) +from sklearn.utils._mocking import CheckingClassifier, MockDataFrame +from sklearn.utils._testing import ( + MinimalClassifier, + MinimalRegressor, + MinimalTransformer, + _array_api_for_tests, + assert_allclose, + assert_allclose_dense_sparse, + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, + set_random_state, +) +from 
sklearn.utils.estimator_checks import _enforce_estimator_tags_y +from sklearn.utils.fixes import CSR_CONTAINERS +from sklearn.utils.validation import _num_samples + + +# Neither of the following two estimators inherit from BaseEstimator, +# to test hyperparameter search on user-defined classifiers. +class MockClassifier(ClassifierMixin, BaseEstimator): + """Dummy classifier to test the parameter search algorithms""" + + def __init__(self, foo_param=0): + self.foo_param = foo_param + + def fit(self, X, Y): + assert len(X) == len(Y) + self.classes_ = np.unique(Y) + return self + + def predict(self, T): + return T.shape[0] + + def transform(self, X): + return X + self.foo_param + + def inverse_transform(self, X): + return X - self.foo_param + + predict_proba = predict + predict_log_proba = predict + decision_function = predict + + def score(self, X=None, Y=None): + if self.foo_param > 1: + score = 1.0 + else: + score = 0.0 + return score + + def get_params(self, deep=False): + return {"foo_param": self.foo_param} + + def set_params(self, **params): + self.foo_param = params["foo_param"] + return self + + +class LinearSVCNoScore(LinearSVC): + """A LinearSVC classifier that has no score method.""" + + @property + def score(self): + raise AttributeError + + +X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]]) +y = np.array([1, 1, 2, 2]) + + +def assert_grid_iter_equals_getitem(grid): + assert list(grid) == [grid[i] for i in range(len(grid))] + + +@pytest.mark.parametrize("klass", [ParameterGrid, partial(ParameterSampler, n_iter=10)]) +@pytest.mark.parametrize( + "input, error_type, error_message", + [ + (0, TypeError, r"Parameter .* a dict or a list, got: 0 of type int"), + ([{"foo": [0]}, 0], TypeError, r"Parameter .* is not a dict \(0\)"), + ( + {"foo": 0}, + TypeError, + r"Parameter (grid|distribution) for parameter 'foo' (is not|needs to be) " + r"(a list or a numpy array|iterable or a distribution).*", + ), + ], +) +def test_validate_parameter_input(klass, input, error_type, error_message): + with pytest.raises(error_type, match=error_message): + klass(input) + + +def test_parameter_grid(): + # Test basic properties of ParameterGrid. 
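+    # Editor's note (comment added in this copy, not upstream scikit-learn):
+    # ParameterGrid expands a dict of value lists into the full cross-product of
+    # parameter settings, e.g. ParameterGrid({"foo": [1, 2, 3]}) yields {"foo": 1},
+    # {"foo": 2} and {"foo": 3}; it also supports len() and integer indexing, which
+    # the assertions below exercise.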
+ params1 = {"foo": [1, 2, 3]} + grid1 = ParameterGrid(params1) + assert isinstance(grid1, Iterable) + assert isinstance(grid1, Sized) + assert len(grid1) == 3 + assert_grid_iter_equals_getitem(grid1) + + params2 = {"foo": [4, 2], "bar": ["ham", "spam", "eggs"]} + grid2 = ParameterGrid(params2) + assert len(grid2) == 6 + + # loop to assert we can iterate over the grid multiple times + for i in range(2): + # tuple + chain transforms {"a": 1, "b": 2} to ("a", 1, "b", 2) + points = set(tuple(chain(*(sorted(p.items())))) for p in grid2) + assert points == set( + ("bar", x, "foo", y) for x, y in product(params2["bar"], params2["foo"]) + ) + assert_grid_iter_equals_getitem(grid2) + + # Special case: empty grid (useful to get default estimator settings) + empty = ParameterGrid({}) + assert len(empty) == 1 + assert list(empty) == [{}] + assert_grid_iter_equals_getitem(empty) + with pytest.raises(IndexError): + empty[1] + + has_empty = ParameterGrid([{"C": [1, 10]}, {}, {"C": [0.5]}]) + assert len(has_empty) == 4 + assert list(has_empty) == [{"C": 1}, {"C": 10}, {}, {"C": 0.5}] + assert_grid_iter_equals_getitem(has_empty) + + +def test_grid_search(): + # Test that the best estimator contains the right value for foo_param + clf = MockClassifier() + grid_search = GridSearchCV(clf, {"foo_param": [1, 2, 3]}, cv=2, verbose=3) + # make sure it selects the smallest parameter in case of ties + old_stdout = sys.stdout + sys.stdout = StringIO() + grid_search.fit(X, y) + sys.stdout = old_stdout + assert grid_search.best_estimator_.foo_param == 2 + + assert_array_equal(grid_search.cv_results_["param_foo_param"].data, [1, 2, 3]) + + # Smoke test the score etc: + grid_search.score(X, y) + grid_search.predict_proba(X) + grid_search.decision_function(X) + grid_search.transform(X) + + # Test exception handling on scoring + grid_search.scoring = "sklearn" + with pytest.raises(ValueError): + grid_search.fit(X, y) + + +def test_grid_search_pipeline_steps(): + # check that parameters that are estimators are cloned before fitting + pipe = Pipeline([("regressor", LinearRegression())]) + param_grid = {"regressor": [LinearRegression(), Ridge()]} + grid_search = GridSearchCV(pipe, param_grid, cv=2) + grid_search.fit(X, y) + regressor_results = grid_search.cv_results_["param_regressor"] + assert isinstance(regressor_results[0], LinearRegression) + assert isinstance(regressor_results[1], Ridge) + assert not hasattr(regressor_results[0], "coef_") + assert not hasattr(regressor_results[1], "coef_") + assert regressor_results[0] is not grid_search.best_estimator_ + assert regressor_results[1] is not grid_search.best_estimator_ + # check that we didn't modify the parameter grid that was passed + assert not hasattr(param_grid["regressor"][0], "coef_") + assert not hasattr(param_grid["regressor"][1], "coef_") + + +@pytest.mark.parametrize("SearchCV", [GridSearchCV, RandomizedSearchCV]) +def test_SearchCV_with_fit_params(SearchCV): + X = np.arange(100).reshape(10, 10) + y = np.array([0] * 5 + [1] * 5) + clf = CheckingClassifier(expected_fit_params=["spam", "eggs"]) + searcher = SearchCV(clf, {"foo_param": [1, 2, 3]}, cv=2, error_score="raise") + + # The CheckingClassifier generates an assertion error if + # a parameter is missing or has length != len(X). + err_msg = r"Expected fit parameter\(s\) \['eggs'\] not seen." 
+ with pytest.raises(AssertionError, match=err_msg): + searcher.fit(X, y, spam=np.ones(10)) + + err_msg = "Fit parameter spam has length 1; expected" + with pytest.raises(AssertionError, match=err_msg): + searcher.fit(X, y, spam=np.ones(1), eggs=np.zeros(10)) + searcher.fit(X, y, spam=np.ones(10), eggs=np.zeros(10)) + + +def test_grid_search_no_score(): + # Test grid-search on classifier that has no score function. + clf = LinearSVC(random_state=0) + X, y = make_blobs(random_state=0, centers=2) + Cs = [0.1, 1, 10] + clf_no_score = LinearSVCNoScore(random_state=0) + grid_search = GridSearchCV(clf, {"C": Cs}, scoring="accuracy") + grid_search.fit(X, y) + + grid_search_no_score = GridSearchCV(clf_no_score, {"C": Cs}, scoring="accuracy") + # smoketest grid search + grid_search_no_score.fit(X, y) + + # check that best params are equal + assert grid_search_no_score.best_params_ == grid_search.best_params_ + # check that we can call score and that it gives the correct result + assert grid_search.score(X, y) == grid_search_no_score.score(X, y) + + # giving no scoring function raises an error + grid_search_no_score = GridSearchCV(clf_no_score, {"C": Cs}) + with pytest.raises(TypeError, match="no scoring"): + grid_search_no_score.fit([[1]]) + + +def test_grid_search_score_method(): + X, y = make_classification(n_samples=100, n_classes=2, flip_y=0.2, random_state=0) + clf = LinearSVC(random_state=0) + grid = {"C": [0.1]} + + search_no_scoring = GridSearchCV(clf, grid, scoring=None).fit(X, y) + search_accuracy = GridSearchCV(clf, grid, scoring="accuracy").fit(X, y) + search_no_score_method_auc = GridSearchCV( + LinearSVCNoScore(), grid, scoring="roc_auc" + ).fit(X, y) + search_auc = GridSearchCV(clf, grid, scoring="roc_auc").fit(X, y) + + # Check warning only occurs in situation where behavior changed: + # estimator requires score method to compete with scoring parameter + score_no_scoring = search_no_scoring.score(X, y) + score_accuracy = search_accuracy.score(X, y) + score_no_score_auc = search_no_score_method_auc.score(X, y) + score_auc = search_auc.score(X, y) + + # ensure the test is sane + assert score_auc < 1.0 + assert score_accuracy < 1.0 + assert score_auc != score_accuracy + + assert_almost_equal(score_accuracy, score_no_scoring) + assert_almost_equal(score_auc, score_no_score_auc) + + +def test_grid_search_groups(): + # Check if ValueError (when groups is None) propagates to GridSearchCV + # And also check if groups is correctly passed to the cv object + rng = np.random.RandomState(0) + + X, y = make_classification(n_samples=15, n_classes=2, random_state=0) + groups = rng.randint(0, 3, 15) + + clf = LinearSVC(random_state=0) + grid = {"C": [1]} + + group_cvs = [ + LeaveOneGroupOut(), + LeavePGroupsOut(2), + GroupKFold(n_splits=3), + GroupShuffleSplit(), + ] + error_msg = "The 'groups' parameter should not be None." 
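+    # Group-aware splitters must raise this error when `groups` is omitted and
+    # succeed when it is provided; the non-group splitters checked afterwards
+    # must fit without any `groups` argument.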
+ for cv in group_cvs: + gs = GridSearchCV(clf, grid, cv=cv) + with pytest.raises(ValueError, match=error_msg): + gs.fit(X, y) + gs.fit(X, y, groups=groups) + + non_group_cvs = [StratifiedKFold(), StratifiedShuffleSplit()] + for cv in non_group_cvs: + gs = GridSearchCV(clf, grid, cv=cv) + # Should not raise an error + gs.fit(X, y) + + +def test_classes__property(): + # Test that classes_ property matches best_estimator_.classes_ + X = np.arange(100).reshape(10, 10) + y = np.array([0] * 5 + [1] * 5) + Cs = [0.1, 1, 10] + + grid_search = GridSearchCV(LinearSVC(random_state=0), {"C": Cs}) + grid_search.fit(X, y) + assert_array_equal(grid_search.best_estimator_.classes_, grid_search.classes_) + + # Test that regressors do not have a classes_ attribute + grid_search = GridSearchCV(Ridge(), {"alpha": [1.0, 2.0]}) + grid_search.fit(X, y) + assert not hasattr(grid_search, "classes_") + + # Test that the grid searcher has no classes_ attribute before it's fit + grid_search = GridSearchCV(LinearSVC(random_state=0), {"C": Cs}) + assert not hasattr(grid_search, "classes_") + + # Test that the grid searcher has no classes_ attribute without a refit + grid_search = GridSearchCV(LinearSVC(random_state=0), {"C": Cs}, refit=False) + grid_search.fit(X, y) + assert not hasattr(grid_search, "classes_") + + +def test_trivial_cv_results_attr(): + # Test search over a "grid" with only one point. + clf = MockClassifier() + grid_search = GridSearchCV(clf, {"foo_param": [1]}, cv=2) + grid_search.fit(X, y) + assert hasattr(grid_search, "cv_results_") + + random_search = RandomizedSearchCV(clf, {"foo_param": [0]}, n_iter=1, cv=2) + random_search.fit(X, y) + assert hasattr(grid_search, "cv_results_") + + +def test_no_refit(): + # Test that GSCV can be used for model selection alone without refitting + clf = MockClassifier() + for scoring in [None, ["accuracy", "precision"]]: + grid_search = GridSearchCV(clf, {"foo_param": [1, 2, 3]}, refit=False, cv=2) + grid_search.fit(X, y) + assert ( + not hasattr(grid_search, "best_estimator_") + and hasattr(grid_search, "best_index_") + and hasattr(grid_search, "best_params_") + ) + + # Make sure the functions predict/transform etc. raise meaningful + # error messages + for fn_name in ( + "predict", + "predict_proba", + "predict_log_proba", + "transform", + "inverse_transform", + ): + outer_msg = f"has no attribute '{fn_name}'" + inner_msg = ( + f"`refit=False`. 
{fn_name} is available only after " + "refitting on the best parameters" + ) + with pytest.raises(AttributeError, match=outer_msg) as exec_info: + getattr(grid_search, fn_name)(X) + + assert isinstance(exec_info.value.__cause__, AttributeError) + assert inner_msg in str(exec_info.value.__cause__) + + # Test that an invalid refit param raises appropriate error messages + error_msg = ( + "For multi-metric scoring, the parameter refit must be set to a scorer key" + ) + for refit in [True, "recall", "accuracy"]: + with pytest.raises(ValueError, match=error_msg): + GridSearchCV( + clf, {}, refit=refit, scoring={"acc": "accuracy", "prec": "precision"} + ).fit(X, y) + + +def test_grid_search_error(): + # Test that grid search will capture errors on data with different length + X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) + + clf = LinearSVC() + cv = GridSearchCV(clf, {"C": [0.1, 1.0]}) + with pytest.raises(ValueError): + cv.fit(X_[:180], y_) + + +def test_grid_search_one_grid_point(): + X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) + param_dict = {"C": [1.0], "kernel": ["rbf"], "gamma": [0.1]} + + clf = SVC(gamma="auto") + cv = GridSearchCV(clf, param_dict) + cv.fit(X_, y_) + + clf = SVC(C=1.0, kernel="rbf", gamma=0.1) + clf.fit(X_, y_) + + assert_array_equal(clf.dual_coef_, cv.best_estimator_.dual_coef_) + + +def test_grid_search_when_param_grid_includes_range(): + # Test that the best estimator contains the right value for foo_param + clf = MockClassifier() + grid_search = None + grid_search = GridSearchCV(clf, {"foo_param": range(1, 4)}, cv=2) + grid_search.fit(X, y) + assert grid_search.best_estimator_.foo_param == 2 + + +def test_grid_search_bad_param_grid(): + X, y = make_classification(n_samples=10, n_features=5, random_state=0) + param_dict = {"C": 1} + clf = SVC(gamma="auto") + error_msg = re.escape( + "Parameter grid for parameter 'C' needs to be a list or " + "a numpy array, but got 1 (of type int) instead. Single " + "values need to be wrapped in a list with one element." + ) + search = GridSearchCV(clf, param_dict) + with pytest.raises(TypeError, match=error_msg): + search.fit(X, y) + + param_dict = {"C": []} + clf = SVC() + error_msg = re.escape( + "Parameter grid for parameter 'C' need to be a non-empty sequence, got: []" + ) + search = GridSearchCV(clf, param_dict) + with pytest.raises(ValueError, match=error_msg): + search.fit(X, y) + + param_dict = {"C": "1,2,3"} + clf = SVC(gamma="auto") + error_msg = re.escape( + "Parameter grid for parameter 'C' needs to be a list or a numpy array, " + "but got '1,2,3' (of type str) instead. Single values need to be " + "wrapped in a list with one element." 
+ ) + search = GridSearchCV(clf, param_dict) + with pytest.raises(TypeError, match=error_msg): + search.fit(X, y) + + param_dict = {"C": np.ones((3, 2))} + clf = SVC() + search = GridSearchCV(clf, param_dict) + with pytest.raises(ValueError): + search.fit(X, y) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_grid_search_sparse(csr_container): + # Test that grid search works with both dense and sparse matrices + X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) + + clf = LinearSVC() + cv = GridSearchCV(clf, {"C": [0.1, 1.0]}) + cv.fit(X_[:180], y_[:180]) + y_pred = cv.predict(X_[180:]) + C = cv.best_estimator_.C + + X_ = csr_container(X_) + clf = LinearSVC() + cv = GridSearchCV(clf, {"C": [0.1, 1.0]}) + cv.fit(X_[:180].tocoo(), y_[:180]) + y_pred2 = cv.predict(X_[180:]) + C2 = cv.best_estimator_.C + + assert np.mean(y_pred == y_pred2) >= 0.9 + assert C == C2 + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_grid_search_sparse_scoring(csr_container): + X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) + + clf = LinearSVC() + cv = GridSearchCV(clf, {"C": [0.1, 1.0]}, scoring="f1") + cv.fit(X_[:180], y_[:180]) + y_pred = cv.predict(X_[180:]) + C = cv.best_estimator_.C + + X_ = csr_container(X_) + clf = LinearSVC() + cv = GridSearchCV(clf, {"C": [0.1, 1.0]}, scoring="f1") + cv.fit(X_[:180], y_[:180]) + y_pred2 = cv.predict(X_[180:]) + C2 = cv.best_estimator_.C + + assert_array_equal(y_pred, y_pred2) + assert C == C2 + # Smoke test the score + # np.testing.assert_allclose(f1_score(cv.predict(X_[:180]), y[:180]), + # cv.score(X_[:180], y[:180])) + + # test loss where greater is worse + def f1_loss(y_true_, y_pred_): + return -f1_score(y_true_, y_pred_) + + F1Loss = make_scorer(f1_loss, greater_is_better=False) + cv = GridSearchCV(clf, {"C": [0.1, 1.0]}, scoring=F1Loss) + cv.fit(X_[:180], y_[:180]) + y_pred3 = cv.predict(X_[180:]) + C3 = cv.best_estimator_.C + + assert C == C3 + assert_array_equal(y_pred, y_pred3) + + +def test_grid_search_precomputed_kernel(): + # Test that grid search works when the input features are given in the + # form of a precomputed kernel matrix + X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) + + # compute the training kernel matrix corresponding to the linear kernel + K_train = np.dot(X_[:180], X_[:180].T) + y_train = y_[:180] + + clf = SVC(kernel="precomputed") + cv = GridSearchCV(clf, {"C": [0.1, 1.0]}) + cv.fit(K_train, y_train) + + assert cv.best_score_ >= 0 + + # compute the test kernel matrix + K_test = np.dot(X_[180:], X_[:180].T) + y_test = y_[180:] + + y_pred = cv.predict(K_test) + + assert np.mean(y_pred == y_test) >= 0 + + # test error is raised when the precomputed kernel is not array-like + # or sparse + with pytest.raises(ValueError): + cv.fit(K_train.tolist(), y_train) + + +def test_grid_search_precomputed_kernel_error_nonsquare(): + # Test that grid search returns an error with a non-square precomputed + # training kernel matrix + K_train = np.zeros((10, 20)) + y_train = np.ones((10,)) + clf = SVC(kernel="precomputed") + cv = GridSearchCV(clf, {"C": [0.1, 1.0]}) + with pytest.raises(ValueError): + cv.fit(K_train, y_train) + + +class BrokenClassifier(BaseEstimator): + """Broken classifier that cannot be fit twice""" + + def __init__(self, parameter=None): + self.parameter = parameter + + def fit(self, X, y): + assert not hasattr(self, "has_been_fit_") + self.has_been_fit_ = True + + def predict(self, X): + return 
np.zeros(X.shape[0]) + + +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.UndefinedMetricWarning") +def test_refit(): + # Regression test for bug in refitting + # Simulates re-fitting a broken estimator; this used to break with + # sparse SVMs. + X = np.arange(100).reshape(10, 10) + y = np.array([0] * 5 + [1] * 5) + + clf = GridSearchCV( + BrokenClassifier(), [{"parameter": [0, 1]}], scoring="precision", refit=True + ) + clf.fit(X, y) + + +def test_refit_callable(): + """ + Test refit=callable, which adds flexibility in identifying the + "best" estimator. + """ + + def refit_callable(cv_results): + """ + A dummy function tests `refit=callable` interface. + Return the index of a model that has the least + `mean_test_score`. + """ + # Fit a dummy clf with `refit=True` to get a list of keys in + # clf.cv_results_. + X, y = make_classification(n_samples=100, n_features=4, random_state=42) + clf = GridSearchCV( + LinearSVC(random_state=42), + {"C": [0.01, 0.1, 1]}, + scoring="precision", + refit=True, + ) + clf.fit(X, y) + # Ensure that `best_index_ != 0` for this dummy clf + assert clf.best_index_ != 0 + + # Assert every key matches those in `cv_results` + for key in clf.cv_results_.keys(): + assert key in cv_results + + return cv_results["mean_test_score"].argmin() + + X, y = make_classification(n_samples=100, n_features=4, random_state=42) + clf = GridSearchCV( + LinearSVC(random_state=42), + {"C": [0.01, 0.1, 1]}, + scoring="precision", + refit=refit_callable, + ) + clf.fit(X, y) + + assert clf.best_index_ == 0 + # Ensure `best_score_` is disabled when using `refit=callable` + assert not hasattr(clf, "best_score_") + + +def test_refit_callable_invalid_type(): + """ + Test implementation catches the errors when 'best_index_' returns an + invalid result. + """ + + def refit_callable_invalid_type(cv_results): + """ + A dummy function tests when returned 'best_index_' is not integer. + """ + return None + + X, y = make_classification(n_samples=100, n_features=4, random_state=42) + + clf = GridSearchCV( + LinearSVC(random_state=42), + {"C": [0.1, 1]}, + scoring="precision", + refit=refit_callable_invalid_type, + ) + with pytest.raises(TypeError, match="best_index_ returned is not an integer"): + clf.fit(X, y) + + +@pytest.mark.parametrize("out_bound_value", [-1, 2]) +@pytest.mark.parametrize("search_cv", [RandomizedSearchCV, GridSearchCV]) +def test_refit_callable_out_bound(out_bound_value, search_cv): + """ + Test implementation catches the errors when 'best_index_' returns an + out of bound result. + """ + + def refit_callable_out_bound(cv_results): + """ + A dummy function tests when returned 'best_index_' is out of bounds. + """ + return out_bound_value + + X, y = make_classification(n_samples=100, n_features=4, random_state=42) + + clf = search_cv( + LinearSVC(random_state=42), + {"C": [0.1, 1]}, + scoring="precision", + refit=refit_callable_out_bound, + ) + with pytest.raises(IndexError, match="best_index_ index out of range"): + clf.fit(X, y) + + +def test_refit_callable_multi_metric(): + """ + Test refit=callable in multiple metric evaluation setting + """ + + def refit_callable(cv_results): + """ + A dummy function tests `refit=callable` interface. + Return the index of a model that has the least + `mean_test_prec`. 
+ """ + assert "mean_test_prec" in cv_results + return cv_results["mean_test_prec"].argmin() + + X, y = make_classification(n_samples=100, n_features=4, random_state=42) + scoring = {"Accuracy": make_scorer(accuracy_score), "prec": "precision"} + clf = GridSearchCV( + LinearSVC(random_state=42), + {"C": [0.01, 0.1, 1]}, + scoring=scoring, + refit=refit_callable, + ) + clf.fit(X, y) + + assert clf.best_index_ == 0 + # Ensure `best_score_` is disabled when using `refit=callable` + assert not hasattr(clf, "best_score_") + + +def test_gridsearch_nd(): + # Pass X as list in GridSearchCV + X_4d = np.arange(10 * 5 * 3 * 2).reshape(10, 5, 3, 2) + y_3d = np.arange(10 * 7 * 11).reshape(10, 7, 11) + + def check_X(x): + return x.shape[1:] == (5, 3, 2) + + def check_y(x): + return x.shape[1:] == (7, 11) + + clf = CheckingClassifier( + check_X=check_X, + check_y=check_y, + methods_to_check=["fit"], + ) + grid_search = GridSearchCV(clf, {"foo_param": [1, 2, 3]}) + grid_search.fit(X_4d, y_3d).score(X, y) + assert hasattr(grid_search, "cv_results_") + + +def test_X_as_list(): + # Pass X as list in GridSearchCV + X = np.arange(100).reshape(10, 10) + y = np.array([0] * 5 + [1] * 5) + + clf = CheckingClassifier( + check_X=lambda x: isinstance(x, list), + methods_to_check=["fit"], + ) + cv = KFold(n_splits=3) + grid_search = GridSearchCV(clf, {"foo_param": [1, 2, 3]}, cv=cv) + grid_search.fit(X.tolist(), y).score(X, y) + assert hasattr(grid_search, "cv_results_") + + +def test_y_as_list(): + # Pass y as list in GridSearchCV + X = np.arange(100).reshape(10, 10) + y = np.array([0] * 5 + [1] * 5) + + clf = CheckingClassifier( + check_y=lambda x: isinstance(x, list), + methods_to_check=["fit"], + ) + cv = KFold(n_splits=3) + grid_search = GridSearchCV(clf, {"foo_param": [1, 2, 3]}, cv=cv) + grid_search.fit(X, y.tolist()).score(X, y) + assert hasattr(grid_search, "cv_results_") + + +def test_pandas_input(): + # check cross_val_score doesn't destroy pandas dataframe + types = [(MockDataFrame, MockDataFrame)] + try: + from pandas import DataFrame, Series + + types.append((DataFrame, Series)) + except ImportError: + pass + + X = np.arange(100).reshape(10, 10) + y = np.array([0] * 5 + [1] * 5) + + for InputFeatureType, TargetType in types: + # X dataframe, y series + X_df, y_ser = InputFeatureType(X), TargetType(y) + + def check_df(x): + return isinstance(x, InputFeatureType) + + def check_series(x): + return isinstance(x, TargetType) + + clf = CheckingClassifier(check_X=check_df, check_y=check_series) + + grid_search = GridSearchCV(clf, {"foo_param": [1, 2, 3]}) + grid_search.fit(X_df, y_ser).score(X_df, y_ser) + grid_search.predict(X_df) + assert hasattr(grid_search, "cv_results_") + + +def test_unsupervised_grid_search(): + # test grid-search with unsupervised estimator + X, y = make_blobs(n_samples=50, random_state=0) + km = KMeans(random_state=0, init="random", n_init=1) + + # Multi-metric evaluation unsupervised + scoring = ["adjusted_rand_score", "fowlkes_mallows_score"] + for refit in ["adjusted_rand_score", "fowlkes_mallows_score"]: + grid_search = GridSearchCV( + km, param_grid=dict(n_clusters=[2, 3, 4]), scoring=scoring, refit=refit + ) + grid_search.fit(X, y) + # Both ARI and FMS can find the right number :) + assert grid_search.best_params_["n_clusters"] == 3 + + # Single metric evaluation unsupervised + grid_search = GridSearchCV( + km, param_grid=dict(n_clusters=[2, 3, 4]), scoring="fowlkes_mallows_score" + ) + grid_search.fit(X, y) + assert grid_search.best_params_["n_clusters"] == 3 + + # Now without a 
score, and without y + grid_search = GridSearchCV(km, param_grid=dict(n_clusters=[2, 3, 4])) + grid_search.fit(X) + assert grid_search.best_params_["n_clusters"] == 4 + + +def test_gridsearch_no_predict(): + # test grid-search with an estimator without predict. + # slight duplication of a test from KDE + def custom_scoring(estimator, X): + return 42 if estimator.bandwidth == 0.1 else 0 + + X, _ = make_blobs(cluster_std=0.1, random_state=1, centers=[[0, 1], [1, 0], [0, 0]]) + search = GridSearchCV( + KernelDensity(), + param_grid=dict(bandwidth=[0.01, 0.1, 1]), + scoring=custom_scoring, + ) + search.fit(X) + assert search.best_params_["bandwidth"] == 0.1 + assert search.best_score_ == 42 + + +def test_param_sampler(): + # test basic properties of param sampler + param_distributions = {"kernel": ["rbf", "linear"], "C": uniform(0, 1)} + sampler = ParameterSampler( + param_distributions=param_distributions, n_iter=10, random_state=0 + ) + samples = [x for x in sampler] + assert len(samples) == 10 + for sample in samples: + assert sample["kernel"] in ["rbf", "linear"] + assert 0 <= sample["C"] <= 1 + + # test that repeated calls yield identical parameters + param_distributions = {"C": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]} + sampler = ParameterSampler( + param_distributions=param_distributions, n_iter=3, random_state=0 + ) + assert [x for x in sampler] == [x for x in sampler] + + param_distributions = {"C": uniform(0, 1)} + sampler = ParameterSampler( + param_distributions=param_distributions, n_iter=10, random_state=0 + ) + assert [x for x in sampler] == [x for x in sampler] + + +def check_cv_results_array_types( + search, param_keys, score_keys, expected_cv_results_kinds +): + # Check if the search `cv_results`'s array are of correct types + cv_results = search.cv_results_ + assert all(isinstance(cv_results[param], np.ma.MaskedArray) for param in param_keys) + assert { + key: cv_results[key].dtype.kind for key in param_keys + } == expected_cv_results_kinds + assert not any(isinstance(cv_results[key], np.ma.MaskedArray) for key in score_keys) + assert all( + cv_results[key].dtype == np.float64 + for key in score_keys + if not key.startswith("rank") + ) + + scorer_keys = search.scorer_.keys() if search.multimetric_ else ["score"] + + for key in scorer_keys: + assert cv_results["rank_test_%s" % key].dtype == np.int32 + + +def check_cv_results_keys(cv_results, param_keys, score_keys, n_cand, extra_keys=()): + # Test the search.cv_results_ contains all the required results + all_keys = param_keys + score_keys + extra_keys + assert_array_equal(sorted(cv_results.keys()), sorted(all_keys + ("params",))) + assert all(cv_results[key].shape == (n_cand,) for key in param_keys + score_keys) + + +def test_grid_search_cv_results(): + X, y = make_classification(n_samples=50, n_features=4, random_state=42) + + n_grid_points = 6 + params = [ + dict( + kernel=[ + "rbf", + ], + C=[1, 10], + gamma=[0.1, 1], + ), + dict( + kernel=[ + "poly", + ], + degree=[1, 2], + ), + ] + + param_keys = ("param_C", "param_degree", "param_gamma", "param_kernel") + score_keys = ( + "mean_test_score", + "mean_train_score", + "rank_test_score", + "split0_test_score", + "split1_test_score", + "split2_test_score", + "split0_train_score", + "split1_train_score", + "split2_train_score", + "std_test_score", + "std_train_score", + "mean_fit_time", + "std_fit_time", + "mean_score_time", + "std_score_time", + ) + n_candidates = n_grid_points + + search = GridSearchCV(SVC(), cv=3, param_grid=params, return_train_score=True) + search.fit(X, y) + 
cv_results = search.cv_results_
+    # Check if score and timing are reasonable
+    assert all(cv_results["rank_test_score"] >= 1)
+    assert all(
+        np.all(cv_results[k] >= 0) for k in score_keys if k != "rank_test_score"
+    )
+    assert all(
+        np.all(cv_results[k] <= 1)
+        for k in score_keys
+        if "time" not in k and k != "rank_test_score"
+    )
+    # Check cv_results structure
+    expected_cv_results_kinds = {
+        "param_C": "i",
+        "param_degree": "i",
+        "param_gamma": "f",
+        "param_kernel": "O",
+    }
+    check_cv_results_array_types(
+        search, param_keys, score_keys, expected_cv_results_kinds
+    )
+    check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates)
+    # Check masking
+    cv_results = search.cv_results_
+
+    poly_results = [
+        (
+            cv_results["param_C"].mask[i]
+            and cv_results["param_gamma"].mask[i]
+            and not cv_results["param_degree"].mask[i]
+        )
+        for i in range(n_candidates)
+        if cv_results["param_kernel"][i] == "poly"
+    ]
+    assert all(poly_results)
+    assert len(poly_results) == 2
+
+    rbf_results = [
+        (
+            not cv_results["param_C"].mask[i]
+            and not cv_results["param_gamma"].mask[i]
+            and cv_results["param_degree"].mask[i]
+        )
+        for i in range(n_candidates)
+        if cv_results["param_kernel"][i] == "rbf"
+    ]
+    assert all(rbf_results)
+    assert len(rbf_results) == 4
+
+
+def test_random_search_cv_results():
+    X, y = make_classification(n_samples=50, n_features=4, random_state=42)
+
+    n_search_iter = 30
+
+    params = [
+        {"kernel": ["rbf"], "C": expon(scale=10), "gamma": expon(scale=0.1)},
+        {"kernel": ["poly"], "degree": [2, 3]},
+    ]
+    param_keys = ("param_C", "param_degree", "param_gamma", "param_kernel")
+    score_keys = (
+        "mean_test_score",
+        "mean_train_score",
+        "rank_test_score",
+        "split0_test_score",
+        "split1_test_score",
+        "split2_test_score",
+        "split0_train_score",
+        "split1_train_score",
+        "split2_train_score",
+        "std_test_score",
+        "std_train_score",
+        "mean_fit_time",
+        "std_fit_time",
+        "mean_score_time",
+        "std_score_time",
+    )
+    n_candidates = n_search_iter
+
+    search = RandomizedSearchCV(
+        SVC(),
+        n_iter=n_search_iter,
+        cv=3,
+        param_distributions=params,
+        return_train_score=True,
+    )
+    search.fit(X, y)
+    cv_results = search.cv_results_
+    # Check results structure
+    expected_cv_results_kinds = {
+        "param_C": "f",
+        "param_degree": "i",
+        "param_gamma": "f",
+        "param_kernel": "O",
+    }
+    check_cv_results_array_types(
+        search, param_keys, score_keys, expected_cv_results_kinds
+    )
+    check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates)
+    assert all(
+        (
+            cv_results["param_C"].mask[i]
+            and cv_results["param_gamma"].mask[i]
+            and not cv_results["param_degree"].mask[i]
+        )
+        for i in range(n_candidates)
+        if cv_results["param_kernel"][i] == "poly"
+    )
+    assert all(
+        (
+            not cv_results["param_C"].mask[i]
+            and not cv_results["param_gamma"].mask[i]
+            and cv_results["param_degree"].mask[i]
+        )
+        for i in range(n_candidates)
+        if cv_results["param_kernel"][i] == "rbf"
+    )
+
+
+@pytest.mark.parametrize(
+    "SearchCV, specialized_params",
+    [
+        (GridSearchCV, {"param_grid": {"C": [1, 10]}}),
+        (RandomizedSearchCV, {"param_distributions": {"C": [1, 10]}, "n_iter": 2}),
+    ],
+)
+def test_search_default_iid(SearchCV, specialized_params):
+    # Check that cv_results_ test scores are aggregated with an unweighted mean
+    # and std across folds, even when the folds have different test set sizes.
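+    # Concretely: with the two complementary folds constructed below, the
+    # classifier scores 1 on one split and 1/3 on the other, so the unweighted
+    # aggregates checked at the end are mean = (1 + 1/3) / 2 = 2/3 and std = 1/3.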
+ # noise-free simple 2d-data + X, y = make_blobs( + centers=[[0, 0], [1, 0], [0, 1], [1, 1]], + random_state=0, + cluster_std=0.1, + shuffle=False, + n_samples=80, + ) + # split dataset into two folds that are not iid + # first one contains data of all 4 blobs, second only from two. + mask = np.ones(X.shape[0], dtype=bool) + mask[np.where(y == 1)[0][::2]] = 0 + mask[np.where(y == 2)[0][::2]] = 0 + # this leads to perfect classification on one fold and a score of 1/3 on + # the other + # create "cv" for splits + cv = [[mask, ~mask], [~mask, mask]] + + common_params = {"estimator": SVC(), "cv": cv, "return_train_score": True} + search = SearchCV(**common_params, **specialized_params) + search.fit(X, y) + + test_cv_scores = np.array( + [ + search.cv_results_["split%d_test_score" % s][0] + for s in range(search.n_splits_) + ] + ) + test_mean = search.cv_results_["mean_test_score"][0] + test_std = search.cv_results_["std_test_score"][0] + + train_cv_scores = np.array( + [ + search.cv_results_["split%d_train_score" % s][0] + for s in range(search.n_splits_) + ] + ) + train_mean = search.cv_results_["mean_train_score"][0] + train_std = search.cv_results_["std_train_score"][0] + + assert search.cv_results_["param_C"][0] == 1 + # scores are the same as above + assert_allclose(test_cv_scores, [1, 1.0 / 3.0]) + assert_allclose(train_cv_scores, [1, 1]) + # Unweighted mean/std is used + assert test_mean == pytest.approx(np.mean(test_cv_scores)) + assert test_std == pytest.approx(np.std(test_cv_scores)) + + # For the train scores, we do not take a weighted mean irrespective of + # i.i.d. or not + assert train_mean == pytest.approx(1) + assert train_std == pytest.approx(0) + + +def test_grid_search_cv_results_multimetric(): + X, y = make_classification(n_samples=50, n_features=4, random_state=42) + + n_splits = 3 + params = [ + dict( + kernel=[ + "rbf", + ], + C=[1, 10], + gamma=[0.1, 1], + ), + dict( + kernel=[ + "poly", + ], + degree=[1, 2], + ), + ] + + grid_searches = [] + for scoring in ( + {"accuracy": make_scorer(accuracy_score), "recall": make_scorer(recall_score)}, + "accuracy", + "recall", + ): + grid_search = GridSearchCV( + SVC(), cv=n_splits, param_grid=params, scoring=scoring, refit=False + ) + grid_search.fit(X, y) + grid_searches.append(grid_search) + + compare_cv_results_multimetric_with_single(*grid_searches) + + +def test_random_search_cv_results_multimetric(): + X, y = make_classification(n_samples=50, n_features=4, random_state=42) + + n_splits = 3 + n_search_iter = 30 + + # Scipy 0.12's stats dists do not accept seed, hence we use param grid + params = dict(C=np.logspace(-4, 1, 3), gamma=np.logspace(-5, 0, 3, base=0.1)) + for refit in (True, False): + random_searches = [] + for scoring in (("accuracy", "recall"), "accuracy", "recall"): + # If True, for multi-metric pass refit='accuracy' + if refit: + probability = True + refit = "accuracy" if isinstance(scoring, tuple) else refit + else: + probability = False + clf = SVC(probability=probability, random_state=42) + random_search = RandomizedSearchCV( + clf, + n_iter=n_search_iter, + cv=n_splits, + param_distributions=params, + scoring=scoring, + refit=refit, + random_state=0, + ) + random_search.fit(X, y) + random_searches.append(random_search) + + compare_cv_results_multimetric_with_single(*random_searches) + compare_refit_methods_when_refit_with_acc( + random_searches[0], random_searches[1], refit + ) + + +def compare_cv_results_multimetric_with_single(search_multi, search_acc, search_rec): + """Compare multi-metric cv_results 
with the ensemble of multiple + single metric cv_results from single metric grid/random search""" + + assert search_multi.multimetric_ + assert_array_equal(sorted(search_multi.scorer_), ("accuracy", "recall")) + + cv_results_multi = search_multi.cv_results_ + cv_results_acc_rec = { + re.sub("_score$", "_accuracy", k): v for k, v in search_acc.cv_results_.items() + } + cv_results_acc_rec.update( + {re.sub("_score$", "_recall", k): v for k, v in search_rec.cv_results_.items()} + ) + + # Check if score and timing are reasonable, also checks if the keys + # are present + assert all( + ( + np.all(cv_results_multi[k] <= 1) + for k in ( + "mean_score_time", + "std_score_time", + "mean_fit_time", + "std_fit_time", + ) + ) + ) + + # Compare the keys, other than time keys, among multi-metric and + # single metric grid search results. np.testing.assert_equal performs a + # deep nested comparison of the two cv_results dicts + np.testing.assert_equal( + {k: v for k, v in cv_results_multi.items() if not k.endswith("_time")}, + {k: v for k, v in cv_results_acc_rec.items() if not k.endswith("_time")}, + ) + + +def compare_refit_methods_when_refit_with_acc(search_multi, search_acc, refit): + """Compare refit multi-metric search methods with single metric methods""" + assert search_acc.refit == refit + if refit: + assert search_multi.refit == "accuracy" + else: + assert not search_multi.refit + return # search cannot predict/score without refit + + X, y = make_blobs(n_samples=100, n_features=4, random_state=42) + for method in ("predict", "predict_proba", "predict_log_proba"): + assert_almost_equal( + getattr(search_multi, method)(X), getattr(search_acc, method)(X) + ) + assert_almost_equal(search_multi.score(X, y), search_acc.score(X, y)) + for key in ("best_index_", "best_score_", "best_params_"): + assert getattr(search_multi, key) == getattr(search_acc, key) + + +@pytest.mark.parametrize( + "search_cv", + [ + RandomizedSearchCV( + estimator=DecisionTreeClassifier(), + param_distributions={"max_depth": [5, 10]}, + ), + GridSearchCV( + estimator=DecisionTreeClassifier(), param_grid={"max_depth": [5, 10]} + ), + ], +) +def test_search_cv_score_samples_error(search_cv): + X, y = make_blobs(n_samples=100, n_features=4, random_state=42) + search_cv.fit(X, y) + + # Make sure to error out when underlying estimator does not implement + # the method `score_samples` + outer_msg = f"'{search_cv.__class__.__name__}' has no attribute 'score_samples'" + inner_msg = "'DecisionTreeClassifier' object has no attribute 'score_samples'" + + with pytest.raises(AttributeError, match=outer_msg) as exec_info: + search_cv.score_samples(X) + assert isinstance(exec_info.value.__cause__, AttributeError) + assert inner_msg == str(exec_info.value.__cause__) + + +def test_unsupported_sample_weight_scorer(): + """Checks that fitting with sample_weight raises a warning if the scorer does not + support sample_weight""" + + def fake_score_func(y_true, y_pred): + "Fake scoring function that does not support sample_weight" + return 0.5 + + fake_scorer = make_scorer(fake_score_func) + + X, y = make_classification(n_samples=10, n_features=4, random_state=42) + sw = np.ones_like(y) + search_cv = GridSearchCV(estimator=LogisticRegression(), param_grid={"C": [1, 10]}) + # function + search_cv.set_params(scoring=fake_score_func) + with pytest.warns(UserWarning, match="does not support sample_weight"): + search_cv.fit(X, y, sample_weight=sw) + # scorer + search_cv.set_params(scoring=fake_scorer) + with pytest.warns(UserWarning, match="does not 
support sample_weight"): + search_cv.fit(X, y, sample_weight=sw) + # multi-metric evaluation + search_cv.set_params( + scoring=dict(fake=fake_scorer, accuracy="accuracy"), refit=False + ) + # only fake scorer does not support sample_weight + with pytest.warns( + UserWarning, match=r"The scoring fake=.* does not support sample_weight" + ): + search_cv.fit(X, y, sample_weight=sw) + + +@pytest.mark.parametrize( + "estimator", + [ + GridSearchCV(estimator=LogisticRegression(), param_grid={"C": [1, 10, 100]}), + RandomizedSearchCV( + estimator=Ridge(), param_distributions={"alpha": [1, 0.1, 0.01]} + ), + ], +) +def test_search_cv_sample_weight_equivalence(estimator): + estimator_weighted = clone(estimator) + estimator_repeated = clone(estimator) + set_random_state(estimator_weighted, random_state=0) + set_random_state(estimator_repeated, random_state=0) + + rng = np.random.RandomState(42) + n_classes = 3 + n_samples_per_group = 30 + n_groups = 4 + n_samples = n_groups * n_samples_per_group + X = rng.rand(n_samples, n_samples * 2) + y = rng.randint(0, n_classes, size=n_samples) + sw = rng.randint(0, 5, size=n_samples) + # we use groups with LeaveOneGroupOut to ensure that + # the splits are the same in the repeated/weighted datasets + groups = np.tile(np.arange(n_groups), n_samples_per_group) + + X_weighted = X + y_weighted = y + groups_weighted = groups + splits_weighted = list(LeaveOneGroupOut().split(X_weighted, groups=groups_weighted)) + estimator_weighted.set_params(cv=splits_weighted) + # repeat samples according to weights + X_repeated = X_weighted.repeat(repeats=sw, axis=0) + y_repeated = y_weighted.repeat(repeats=sw) + groups_repeated = groups_weighted.repeat(repeats=sw) + splits_repeated = list(LeaveOneGroupOut().split(X_repeated, groups=groups_repeated)) + estimator_repeated.set_params(cv=splits_repeated) + + y_weighted = _enforce_estimator_tags_y(estimator_weighted, y_weighted) + y_repeated = _enforce_estimator_tags_y(estimator_repeated, y_repeated) + + estimator_repeated.fit(X_repeated, y=y_repeated, sample_weight=None) + estimator_weighted.fit(X_weighted, y=y_weighted, sample_weight=sw) + + # check that scores stored in cv_results_ + # are equal for the weighted/repeated datasets + score_keys = [ + key for key in estimator_repeated.cv_results_ if key.endswith("score") + ] + for key in score_keys: + s1 = estimator_repeated.cv_results_[key] + s2 = estimator_weighted.cv_results_[key] + err_msg = f"{key} values are not equal for weighted/repeated datasets" + assert_allclose(s1, s2, err_msg=err_msg) + + for key in ["best_score_", "best_index_"]: + s1 = getattr(estimator_repeated, key) + s2 = getattr(estimator_weighted, key) + err_msg = f"{key} values are not equal for weighted/repeated datasets" + assert_almost_equal(s1, s2, err_msg=err_msg) + + for method in ["predict_proba", "decision_function", "predict", "transform"]: + if hasattr(estimator, method): + s1 = getattr(estimator_repeated, method)(X) + s2 = getattr(estimator_weighted, method)(X) + err_msg = ( + f"Comparing the output of {method} revealed that fitting " + "with `sample_weight` is not equivalent to fitting with removed " + "or repeated data points." 
+ ) + assert_allclose_dense_sparse(s1, s2, err_msg=err_msg) + + +@pytest.mark.parametrize( + "search_cv", + [ + RandomizedSearchCV( + estimator=LocalOutlierFactor(novelty=True), + param_distributions={"n_neighbors": [5, 10]}, + scoring="precision", + ), + GridSearchCV( + estimator=LocalOutlierFactor(novelty=True), + param_grid={"n_neighbors": [5, 10]}, + scoring="precision", + ), + ], +) +def test_search_cv_score_samples_method(search_cv): + # Set parameters + rng = np.random.RandomState(42) + n_samples = 300 + outliers_fraction = 0.15 + n_outliers = int(outliers_fraction * n_samples) + n_inliers = n_samples - n_outliers + + # Create dataset + X = make_blobs( + n_samples=n_inliers, + n_features=2, + centers=[[0, 0], [0, 0]], + cluster_std=0.5, + random_state=0, + )[0] + # Add some noisy points + X = np.concatenate([X, rng.uniform(low=-6, high=6, size=(n_outliers, 2))], axis=0) + + # Define labels to be able to score the estimator with `search_cv` + y_true = np.array([1] * n_samples) + y_true[-n_outliers:] = -1 + + # Fit on data + search_cv.fit(X, y_true) + + # Verify that the stand alone estimator yields the same results + # as the ones obtained with *SearchCV + assert_allclose( + search_cv.score_samples(X), search_cv.best_estimator_.score_samples(X) + ) + + +def test_search_cv_results_rank_tie_breaking(): + X, y = make_blobs(n_samples=50, random_state=42) + + # The two C values are close enough to give similar models + # which would result in a tie of their mean cv-scores + param_grid = {"C": [1, 1.001, 0.001]} + + grid_search = GridSearchCV(SVC(), param_grid=param_grid, return_train_score=True) + random_search = RandomizedSearchCV( + SVC(), n_iter=3, param_distributions=param_grid, return_train_score=True + ) + + for search in (grid_search, random_search): + search.fit(X, y) + cv_results = search.cv_results_ + # Check tie breaking strategy - + # Check that there is a tie in the mean scores between + # candidates 1 and 2 alone + assert_almost_equal( + cv_results["mean_test_score"][0], cv_results["mean_test_score"][1] + ) + assert_almost_equal( + cv_results["mean_train_score"][0], cv_results["mean_train_score"][1] + ) + assert not np.allclose( + cv_results["mean_test_score"][1], cv_results["mean_test_score"][2] + ) + assert not np.allclose( + cv_results["mean_train_score"][1], cv_results["mean_train_score"][2] + ) + # 'min' rank should be assigned to the tied candidates + assert_almost_equal(search.cv_results_["rank_test_score"], [1, 1, 3]) + + +def test_search_cv_results_none_param(): + X, y = [[1], [2], [3], [4], [5]], [0, 0, 0, 0, 1] + estimators = (DecisionTreeRegressor(), DecisionTreeClassifier()) + est_parameters = {"random_state": [0, None]} + cv = KFold() + + for est in estimators: + grid_search = GridSearchCV( + est, + est_parameters, + cv=cv, + ).fit(X, y) + assert_array_equal(grid_search.cv_results_["param_random_state"], [0, None]) + + +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.FitFailedWarning") +def test_search_cv_timing(): + svc = LinearSVC(random_state=0) + + X = [ + [ + 1, + ], + [ + 2, + ], + [ + 3, + ], + [ + 4, + ], + ] + y = [0, 1, 1, 0] + + gs = GridSearchCV(svc, {"C": [0, 1]}, cv=2, error_score=0) + rs = RandomizedSearchCV(svc, {"C": [0, 1]}, cv=2, error_score=0, n_iter=2) + + for search in (gs, rs): + search.fit(X, y) + for key in ["mean_fit_time", "std_fit_time"]: + # NOTE The precision of time.time in windows is not high + # enough for the fit/score times to be non-zero for trivial X and y + assert np.all(search.cv_results_[key] >= 0) + assert 
np.all(search.cv_results_[key] < 1) + + for key in ["mean_score_time", "std_score_time"]: + assert search.cv_results_[key][1] >= 0 + assert search.cv_results_[key][0] == 0.0 + assert np.all(search.cv_results_[key] < 1) + + assert hasattr(search, "refit_time_") + assert isinstance(search.refit_time_, float) + assert search.refit_time_ >= 0 + + +def test_grid_search_correct_score_results(): + # test that correct scores are used + n_splits = 3 + clf = LinearSVC(random_state=0) + X, y = make_blobs(random_state=0, centers=2) + Cs = [0.1, 1, 10] + for score in ["f1", "roc_auc"]: + grid_search = GridSearchCV(clf, {"C": Cs}, scoring=score, cv=n_splits) + cv_results = grid_search.fit(X, y).cv_results_ + + # Test scorer names + result_keys = list(cv_results.keys()) + expected_keys = ("mean_test_score", "rank_test_score") + tuple( + "split%d_test_score" % cv_i for cv_i in range(n_splits) + ) + assert all(np.isin(expected_keys, result_keys)) + + cv = StratifiedKFold(n_splits=n_splits) + n_splits = grid_search.n_splits_ + for candidate_i, C in enumerate(Cs): + clf.set_params(C=C) + cv_scores = np.array( + [ + grid_search.cv_results_["split%d_test_score" % s][candidate_i] + for s in range(n_splits) + ] + ) + for i, (train, test) in enumerate(cv.split(X, y)): + clf.fit(X[train], y[train]) + if score == "f1": + correct_score = f1_score(y[test], clf.predict(X[test])) + elif score == "roc_auc": + dec = clf.decision_function(X[test]) + correct_score = roc_auc_score(y[test], dec) + assert_almost_equal(correct_score, cv_scores[i]) + + +def test_pickle(): + # Test that a fit search can be pickled + clf = MockClassifier() + grid_search = GridSearchCV(clf, {"foo_param": [1, 2, 3]}, refit=True, cv=2) + grid_search.fit(X, y) + grid_search_pickled = pickle.loads(pickle.dumps(grid_search)) + assert_array_almost_equal(grid_search.predict(X), grid_search_pickled.predict(X)) + + random_search = RandomizedSearchCV( + clf, {"foo_param": [1, 2, 3]}, refit=True, n_iter=3, cv=2 + ) + random_search.fit(X, y) + random_search_pickled = pickle.loads(pickle.dumps(random_search)) + assert_array_almost_equal( + random_search.predict(X), random_search_pickled.predict(X) + ) + + +def test_grid_search_with_multioutput_data(): + # Test search with multi-output estimator + + X, y = make_multilabel_classification(return_indicator=True, random_state=0) + + est_parameters = {"max_depth": [1, 2, 3, 4]} + cv = KFold() + + estimators = [ + DecisionTreeRegressor(random_state=0), + DecisionTreeClassifier(random_state=0), + ] + + # Test with grid search cv + for est in estimators: + grid_search = GridSearchCV(est, est_parameters, cv=cv) + grid_search.fit(X, y) + res_params = grid_search.cv_results_["params"] + for cand_i in range(len(res_params)): + est.set_params(**res_params[cand_i]) + + for i, (train, test) in enumerate(cv.split(X, y)): + est.fit(X[train], y[train]) + correct_score = est.score(X[test], y[test]) + assert_almost_equal( + correct_score, + grid_search.cv_results_["split%d_test_score" % i][cand_i], + ) + + # Test with a randomized search + for est in estimators: + random_search = RandomizedSearchCV(est, est_parameters, cv=cv, n_iter=3) + random_search.fit(X, y) + res_params = random_search.cv_results_["params"] + for cand_i in range(len(res_params)): + est.set_params(**res_params[cand_i]) + + for i, (train, test) in enumerate(cv.split(X, y)): + est.fit(X[train], y[train]) + correct_score = est.score(X[test], y[test]) + assert_almost_equal( + correct_score, + random_search.cv_results_["split%d_test_score" % i][cand_i], + ) + + +def 
test_predict_proba_disabled(): + # Test predict_proba when disabled on estimator. + X = np.arange(20).reshape(5, -1) + y = [0, 0, 1, 1, 1] + clf = SVC(probability=False) + gs = GridSearchCV(clf, {}, cv=2).fit(X, y) + assert not hasattr(gs, "predict_proba") + + +def test_grid_search_allows_nans(): + # Test GridSearchCV with SimpleImputer + X = np.arange(20, dtype=np.float64).reshape(5, -1) + X[2, :] = np.nan + y = [0, 0, 1, 1, 1] + p = Pipeline( + [ + ("imputer", SimpleImputer(strategy="mean", missing_values=np.nan)), + ("classifier", MockClassifier()), + ] + ) + GridSearchCV(p, {"classifier__foo_param": [1, 2, 3]}, cv=2).fit(X, y) + + +class FailingClassifier(BaseEstimator): + """Classifier that raises a ValueError on fit()""" + + FAILING_PARAMETER = 2 + + def __init__(self, parameter=None): + self.parameter = parameter + + def fit(self, X, y=None): + if self.parameter == FailingClassifier.FAILING_PARAMETER: + raise ValueError("Failing classifier failed as required") + + def predict(self, X): + return np.zeros(X.shape[0]) + + def score(self, X=None, Y=None): + return 0.0 + + +def test_grid_search_failing_classifier(): + # GridSearchCV with on_error != 'raise' + # Ensures that a warning is raised and score reset where appropriate. + + X, y = make_classification(n_samples=20, n_features=10, random_state=0) + + clf = FailingClassifier() + + # refit=False because we only want to check that errors caused by fits + # to individual folds will be caught and warnings raised instead. If + # refit was done, then an exception would be raised on refit and not + # caught by grid_search (expected behavior), and this would cause an + # error in this test. + gs = GridSearchCV( + clf, + [{"parameter": [0, 1, 2]}], + scoring="accuracy", + refit=False, + error_score=0.0, + ) + + warning_message = re.compile( + "5 fits failed.+total of 15.+The score on these" + r" train-test partitions for these parameters will be set to 0\.0.+" + "5 fits failed with the following error.+ValueError.+Failing classifier failed" + " as required", + flags=re.DOTALL, + ) + with pytest.warns(FitFailedWarning, match=warning_message): + gs.fit(X, y) + n_candidates = len(gs.cv_results_["params"]) + + # Ensure that grid scores were set to zero as required for those fits + # that are expected to fail. 
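+    # With the grid [0, 1, 2], only parameter == FailingClassifier.FAILING_PARAMETER
+    # (i.e. 2) triggers the failure, so exactly one candidate should have all of its
+    # per-split scores replaced by the error_score.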
+ def get_cand_scores(i): + return np.array( + [gs.cv_results_["split%d_test_score" % s][i] for s in range(gs.n_splits_)] + ) + + assert all( + ( + np.all(get_cand_scores(cand_i) == 0.0) + for cand_i in range(n_candidates) + if gs.cv_results_["param_parameter"][cand_i] + == FailingClassifier.FAILING_PARAMETER + ) + ) + + gs = GridSearchCV( + clf, + [{"parameter": [0, 1, 2]}], + scoring="accuracy", + refit=False, + error_score=float("nan"), + ) + warning_message = re.compile( + "5 fits failed.+total of 15.+The score on these" + r" train-test partitions for these parameters will be set to nan.+" + "5 fits failed with the following error.+ValueError.+Failing classifier failed" + " as required", + flags=re.DOTALL, + ) + with pytest.warns(FitFailedWarning, match=warning_message): + gs.fit(X, y) + n_candidates = len(gs.cv_results_["params"]) + assert all( + np.all(np.isnan(get_cand_scores(cand_i))) + for cand_i in range(n_candidates) + if gs.cv_results_["param_parameter"][cand_i] + == FailingClassifier.FAILING_PARAMETER + ) + + ranks = gs.cv_results_["rank_test_score"] + + # Check that succeeded estimators have lower ranks + assert ranks[0] <= 2 and ranks[1] <= 2 + # Check that failed estimator has the highest rank + assert ranks[clf.FAILING_PARAMETER] == 3 + assert gs.best_index_ != clf.FAILING_PARAMETER + + +def test_grid_search_classifier_all_fits_fail(): + X, y = make_classification(n_samples=20, n_features=10, random_state=0) + + clf = FailingClassifier() + + gs = GridSearchCV( + clf, + [{"parameter": [FailingClassifier.FAILING_PARAMETER] * 3}], + error_score=0.0, + ) + + warning_message = re.compile( + ( + "All the 15 fits failed.+15 fits failed with the following" + " error.+ValueError.+Failing classifier failed as required" + ), + flags=re.DOTALL, + ) + with pytest.raises(ValueError, match=warning_message): + gs.fit(X, y) + + +def test_grid_search_failing_classifier_raise(): + # GridSearchCV with on_error == 'raise' raises the error + + X, y = make_classification(n_samples=20, n_features=10, random_state=0) + + clf = FailingClassifier() + + # refit=False because we want to test the behaviour of the grid search part + gs = GridSearchCV( + clf, + [{"parameter": [0, 1, 2]}], + scoring="accuracy", + refit=False, + error_score="raise", + ) + + # FailingClassifier issues a ValueError so this is what we look for. + with pytest.raises(ValueError): + gs.fit(X, y) + + +def test_parameters_sampler_replacement(): + # raise warning if n_iter is bigger than total parameter space + params = [ + {"first": [0, 1], "second": ["a", "b", "c"]}, + {"third": ["two", "values"]}, + ] + sampler = ParameterSampler(params, n_iter=9) + n_iter = 9 + grid_size = 8 + expected_warning = ( + "The total space of parameters %d is smaller " + "than n_iter=%d. Running %d iterations. For " + "exhaustive searches, use GridSearchCV." 
% (grid_size, n_iter, grid_size) + ) + with pytest.warns(UserWarning, match=expected_warning): + list(sampler) + + # degenerates to GridSearchCV if n_iter the same as grid_size + sampler = ParameterSampler(params, n_iter=8) + samples = list(sampler) + assert len(samples) == 8 + for values in ParameterGrid(params): + assert values in samples + assert len(ParameterSampler(params, n_iter=1000)) == 8 + + # test sampling without replacement in a large grid + params = {"a": range(10), "b": range(10), "c": range(10)} + sampler = ParameterSampler(params, n_iter=99, random_state=42) + samples = list(sampler) + assert len(samples) == 99 + hashable_samples = ["a%db%dc%d" % (p["a"], p["b"], p["c"]) for p in samples] + assert len(set(hashable_samples)) == 99 + + # doesn't go into infinite loops + params_distribution = {"first": bernoulli(0.5), "second": ["a", "b", "c"]} + sampler = ParameterSampler(params_distribution, n_iter=7) + samples = list(sampler) + assert len(samples) == 7 + + +def test_stochastic_gradient_loss_param(): + # Make sure the predict_proba works when loss is specified + # as one of the parameters in the param_grid. + param_grid = { + "loss": ["log_loss"], + } + X = np.arange(24).reshape(6, -1) + y = [0, 0, 0, 1, 1, 1] + clf = GridSearchCV( + estimator=SGDClassifier(loss="hinge"), param_grid=param_grid, cv=3 + ) + + # When the estimator is not fitted, `predict_proba` is not available as the + # loss is 'hinge'. + assert not hasattr(clf, "predict_proba") + clf.fit(X, y) + clf.predict_proba(X) + clf.predict_log_proba(X) + + # Make sure `predict_proba` is not available when setting loss=['hinge'] + # in param_grid + param_grid = { + "loss": ["hinge"], + } + clf = GridSearchCV( + estimator=SGDClassifier(loss="hinge"), param_grid=param_grid, cv=3 + ) + assert not hasattr(clf, "predict_proba") + clf.fit(X, y) + assert not hasattr(clf, "predict_proba") + + +def test_search_train_scores_set_to_false(): + X = np.arange(6).reshape(6, -1) + y = [0, 0, 0, 1, 1, 1] + clf = LinearSVC(random_state=0) + + gs = GridSearchCV(clf, param_grid={"C": [0.1, 0.2]}, cv=3) + gs.fit(X, y) + + +def test_grid_search_cv_splits_consistency(): + # Check if a one time iterable is accepted as a cv parameter. 
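+    # A one-time iterable of (train, test) splits can be consumed only once, so the
+    # search must materialize the splits a single time and reuse them for every
+    # candidate; the comparisons against equivalent KFold-based searches below
+    # check exactly that.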
+ n_samples = 100 + n_splits = 5 + X, y = make_classification(n_samples=n_samples, random_state=0) + + gs = GridSearchCV( + LinearSVC(random_state=0), + param_grid={"C": [0.1, 0.2, 0.3]}, + cv=OneTimeSplitter(n_splits=n_splits, n_samples=n_samples), + return_train_score=True, + ) + gs.fit(X, y) + + gs2 = GridSearchCV( + LinearSVC(random_state=0), + param_grid={"C": [0.1, 0.2, 0.3]}, + cv=KFold(n_splits=n_splits), + return_train_score=True, + ) + gs2.fit(X, y) + + # Give generator as a cv parameter + assert isinstance( + KFold(n_splits=n_splits, shuffle=True, random_state=0).split(X, y), + GeneratorType, + ) + gs3 = GridSearchCV( + LinearSVC(random_state=0), + param_grid={"C": [0.1, 0.2, 0.3]}, + cv=KFold(n_splits=n_splits, shuffle=True, random_state=0).split(X, y), + return_train_score=True, + ) + gs3.fit(X, y) + + gs4 = GridSearchCV( + LinearSVC(random_state=0), + param_grid={"C": [0.1, 0.2, 0.3]}, + cv=KFold(n_splits=n_splits, shuffle=True, random_state=0), + return_train_score=True, + ) + gs4.fit(X, y) + + def _pop_time_keys(cv_results): + for key in ( + "mean_fit_time", + "std_fit_time", + "mean_score_time", + "std_score_time", + ): + cv_results.pop(key) + return cv_results + + # Check if generators are supported as cv and + # that the splits are consistent + np.testing.assert_equal( + _pop_time_keys(gs3.cv_results_), _pop_time_keys(gs4.cv_results_) + ) + + # OneTimeSplitter is a non-re-entrant cv where split can be called only + # once if ``cv.split`` is called once per param setting in GridSearchCV.fit + # the 2nd and 3rd parameter will not be evaluated as no train/test indices + # will be generated for the 2nd and subsequent cv.split calls. + # This is a check to make sure cv.split is not called once per param + # setting. + np.testing.assert_equal( + {k: v for k, v in gs.cv_results_.items() if not k.endswith("_time")}, + {k: v for k, v in gs2.cv_results_.items() if not k.endswith("_time")}, + ) + + # Check consistency of folds across the parameters + gs = GridSearchCV( + LinearSVC(random_state=0), + param_grid={"C": [0.1, 0.1, 0.2, 0.2]}, + cv=KFold(n_splits=n_splits, shuffle=True), + return_train_score=True, + ) + gs.fit(X, y) + + # As the first two param settings (C=0.1) and the next two param + # settings (C=0.2) are same, the test and train scores must also be + # same as long as the same train/test indices are generated for all + # the cv splits, for both param setting + for score_type in ("train", "test"): + per_param_scores = {} + for param_i in range(4): + per_param_scores[param_i] = [ + gs.cv_results_["split%d_%s_score" % (s, score_type)][param_i] + for s in range(5) + ] + + assert_array_almost_equal(per_param_scores[0], per_param_scores[1]) + assert_array_almost_equal(per_param_scores[2], per_param_scores[3]) + + +def test_transform_inverse_transform_round_trip(): + clf = MockClassifier() + grid_search = GridSearchCV(clf, {"foo_param": [1, 2, 3]}, cv=2, verbose=3) + + grid_search.fit(X, y) + X_round_trip = grid_search.inverse_transform(grid_search.transform(X)) + assert_array_equal(X, X_round_trip) + + +def test_custom_run_search(): + def check_results(results, gscv): + exp_results = gscv.cv_results_ + assert sorted(results.keys()) == sorted(exp_results) + for k in results: + if not k.endswith("_time"): + # XXX: results['params'] is a list :| + results[k] = np.asanyarray(results[k]) + if results[k].dtype.kind == "O": + assert_array_equal( + exp_results[k], results[k], err_msg="Checking " + k + ) + else: + assert_allclose(exp_results[k], results[k], err_msg="Checking " + 
k) + + def fit_grid(param_grid): + return GridSearchCV(clf, param_grid, return_train_score=True).fit(X, y) + + class CustomSearchCV(BaseSearchCV): + def __init__(self, estimator, **kwargs): + super().__init__(estimator, **kwargs) + + def _run_search(self, evaluate): + results = evaluate([{"max_depth": 1}, {"max_depth": 2}]) + check_results(results, fit_grid({"max_depth": [1, 2]})) + results = evaluate([{"min_samples_split": 5}, {"min_samples_split": 10}]) + check_results( + results, + fit_grid([{"max_depth": [1, 2]}, {"min_samples_split": [5, 10]}]), + ) + + # Using regressor to make sure each score differs + clf = DecisionTreeRegressor(random_state=0) + X, y = make_classification(n_samples=100, n_informative=4, random_state=0) + mycv = CustomSearchCV(clf, return_train_score=True).fit(X, y) + gscv = fit_grid([{"max_depth": [1, 2]}, {"min_samples_split": [5, 10]}]) + + results = mycv.cv_results_ + check_results(results, gscv) + for attr in dir(gscv): + if ( + attr[0].islower() + and attr[-1:] == "_" + and attr + not in { + "cv_results_", + "best_estimator_", + "refit_time_", + "classes_", + "scorer_", + } + ): + assert getattr(gscv, attr) == getattr(mycv, attr), ( + "Attribute %s not equal" % attr + ) + + +def test__custom_fit_no_run_search(): + class NoRunSearchSearchCV(BaseSearchCV): + def __init__(self, estimator, **kwargs): + super().__init__(estimator, **kwargs) + + def fit(self, X, y=None, groups=None, **fit_params): + return self + + # this should not raise any exceptions + NoRunSearchSearchCV(SVC()).fit(X, y) + + class BadSearchCV(BaseSearchCV): + def __init__(self, estimator, **kwargs): + super().__init__(estimator, **kwargs) + + with pytest.raises(NotImplementedError, match="_run_search not implemented."): + # this should raise a NotImplementedError + BadSearchCV(SVC()).fit(X, y) + + +def test_empty_cv_iterator_error(): + # Use global X, y + + # create cv + cv = KFold(n_splits=3).split(X) + + # pop all of it, this should cause the expected ValueError + [u for u in cv] + # cv is empty now + + train_size = 100 + ridge = RandomizedSearchCV(Ridge(), {"alpha": [1e-3, 1e-2, 1e-1]}, cv=cv, n_jobs=4) + + # assert that this raises an error + with pytest.raises( + ValueError, + match=( + "No fits were performed. " + "Was the CV iterator empty\\? " + "Were there no candidates\\?" + ), + ): + ridge.fit(X[:train_size], y[:train_size]) + + +def test_random_search_bad_cv(): + # Use global X, y + + class BrokenKFold(KFold): + def get_n_splits(self, *args, **kw): + return 1 + + # create bad cv + cv = BrokenKFold(n_splits=3) + + train_size = 100 + ridge = RandomizedSearchCV(Ridge(), {"alpha": [1e-3, 1e-2, 1e-1]}, cv=cv, n_jobs=4) + + # assert that this raises an error + with pytest.raises( + ValueError, + match=( + "cv.split and cv.get_n_splits returned " + "inconsistent results. 
Expected \\d+ " + "splits, got \\d+" + ), + ): + ridge.fit(X[:train_size], y[:train_size]) + + +@pytest.mark.parametrize("return_train_score", [False, True]) +@pytest.mark.parametrize( + "SearchCV, specialized_params", + [ + (GridSearchCV, {"param_grid": {"max_depth": [2, 3, 5, 8]}}), + ( + RandomizedSearchCV, + {"param_distributions": {"max_depth": [2, 3, 5, 8]}, "n_iter": 4}, + ), + ], +) +def test_searchcv_raise_warning_with_non_finite_score( + SearchCV, specialized_params, return_train_score +): + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/10529 + # Check that we raise a UserWarning when a non-finite score is + # computed in the SearchCV + X, y = make_classification(n_classes=2, random_state=0) + + class FailingScorer: + """Scorer that will fail for some split but not all.""" + + def __init__(self): + self.n_counts = 0 + + def __call__(self, estimator, X, y): + self.n_counts += 1 + if self.n_counts % 5 == 0: + return np.nan + return 1 + + grid = SearchCV( + DecisionTreeClassifier(), + scoring=FailingScorer(), + cv=3, + return_train_score=return_train_score, + **specialized_params, + ) + + with pytest.warns(UserWarning) as warn_msg: + grid.fit(X, y) + + set_with_warning = ["test", "train"] if return_train_score else ["test"] + assert len(warn_msg) == len(set_with_warning) + for msg, dataset in zip(warn_msg, set_with_warning): + assert f"One or more of the {dataset} scores are non-finite" in str(msg.message) + + # all non-finite scores should be equally ranked last + last_rank = grid.cv_results_["rank_test_score"].max() + non_finite_mask = np.isnan(grid.cv_results_["mean_test_score"]) + assert_array_equal(grid.cv_results_["rank_test_score"][non_finite_mask], last_rank) + # all finite scores should be better ranked than the non-finite scores + assert np.all(grid.cv_results_["rank_test_score"][~non_finite_mask] < last_rank) + + +def test_callable_multimetric_confusion_matrix(): + # Test callable with many metrics inserts the correct names and metrics + # into the search cv object + def custom_scorer(clf, X, y): + y_pred = clf.predict(X) + cm = confusion_matrix(y, y_pred) + return {"tn": cm[0, 0], "fp": cm[0, 1], "fn": cm[1, 0], "tp": cm[1, 1]} + + X, y = make_classification(n_samples=40, n_features=4, random_state=42) + est = LinearSVC(random_state=42) + search = GridSearchCV(est, {"C": [0.1, 1]}, scoring=custom_scorer, refit="fp") + + search.fit(X, y) + + score_names = ["tn", "fp", "fn", "tp"] + for name in score_names: + assert "mean_test_{}".format(name) in search.cv_results_ + + y_pred = search.predict(X) + cm = confusion_matrix(y, y_pred) + assert search.score(X, y) == pytest.approx(cm[0, 1]) + + +def test_callable_multimetric_same_as_list_of_strings(): + # Test callable multimetric is the same as a list of strings + def custom_scorer(est, X, y): + y_pred = est.predict(X) + return { + "recall": recall_score(y, y_pred), + "accuracy": accuracy_score(y, y_pred), + } + + X, y = make_classification(n_samples=40, n_features=4, random_state=42) + est = LinearSVC(random_state=42) + search_callable = GridSearchCV( + est, {"C": [0.1, 1]}, scoring=custom_scorer, refit="recall" + ) + search_str = GridSearchCV( + est, {"C": [0.1, 1]}, scoring=["recall", "accuracy"], refit="recall" + ) + + search_callable.fit(X, y) + search_str.fit(X, y) + + assert search_callable.best_score_ == pytest.approx(search_str.best_score_) + assert search_callable.best_index_ == search_str.best_index_ + assert search_callable.score(X, y) == pytest.approx(search_str.score(X, y)) + + 
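+# Illustrative sketch (not itself a test) of the callable-scorer contract
+# exercised by the surrounding tests: the callable receives the fitted
+# estimator and the evaluation data and returns either a single float
+# (single-metric) or a dict of named floats (multi-metric); in the
+# multi-metric case, a string `refit` must name one of the returned keys, e.g.:
+#
+#     def scorer(estimator, X, y):
+#         y_pred = estimator.predict(X)
+#         return {"acc": accuracy_score(y, y_pred), "rec": recall_score(y, y_pred)}
+#
+#     GridSearchCV(LinearSVC(), {"C": [0.1, 1]}, scoring=scorer, refit="acc")
+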
+def test_callable_single_metric_same_as_single_string(): + # Tests callable scorer is the same as scoring with a single string + def custom_scorer(est, X, y): + y_pred = est.predict(X) + return recall_score(y, y_pred) + + X, y = make_classification(n_samples=40, n_features=4, random_state=42) + est = LinearSVC(random_state=42) + search_callable = GridSearchCV( + est, {"C": [0.1, 1]}, scoring=custom_scorer, refit=True + ) + search_str = GridSearchCV(est, {"C": [0.1, 1]}, scoring="recall", refit="recall") + search_list_str = GridSearchCV( + est, {"C": [0.1, 1]}, scoring=["recall"], refit="recall" + ) + search_callable.fit(X, y) + search_str.fit(X, y) + search_list_str.fit(X, y) + + assert search_callable.best_score_ == pytest.approx(search_str.best_score_) + assert search_callable.best_index_ == search_str.best_index_ + assert search_callable.score(X, y) == pytest.approx(search_str.score(X, y)) + + assert search_list_str.best_score_ == pytest.approx(search_str.best_score_) + assert search_list_str.best_index_ == search_str.best_index_ + assert search_list_str.score(X, y) == pytest.approx(search_str.score(X, y)) + + +def test_callable_multimetric_error_on_invalid_key(): + # Raises when the callable scorer does not return a dict with `refit` key. + def bad_scorer(est, X, y): + return {"bad_name": 1} + + X, y = make_classification(n_samples=40, n_features=4, random_state=42) + clf = GridSearchCV( + LinearSVC(random_state=42), + {"C": [0.1, 1]}, + scoring=bad_scorer, + refit="good_name", + ) + + msg = ( + "For multi-metric scoring, the parameter refit must be set to a " + "scorer key or a callable to refit" + ) + with pytest.raises(ValueError, match=msg): + clf.fit(X, y) + + +def test_callable_multimetric_error_failing_clf(): + # Warns when there is an estimator the fails to fit with a float + # error_score + def custom_scorer(est, X, y): + return {"acc": 1} + + X, y = make_classification(n_samples=20, n_features=10, random_state=0) + + clf = FailingClassifier() + gs = GridSearchCV( + clf, + [{"parameter": [0, 1, 2]}], + scoring=custom_scorer, + refit=False, + error_score=0.1, + ) + + warning_message = re.compile( + "5 fits failed.+total of 15.+The score on these" + r" train-test partitions for these parameters will be set to 0\.1", + flags=re.DOTALL, + ) + with pytest.warns(FitFailedWarning, match=warning_message): + gs.fit(X, y) + + assert_allclose(gs.cv_results_["mean_test_acc"], [1, 1, 0.1]) + + +def test_callable_multimetric_clf_all_fits_fail(): + # Warns and raises when all estimator fails to fit. 
+ def custom_scorer(est, X, y): + return {"acc": 1} + + X, y = make_classification(n_samples=20, n_features=10, random_state=0) + + clf = FailingClassifier() + + gs = GridSearchCV( + clf, + [{"parameter": [FailingClassifier.FAILING_PARAMETER] * 3}], + scoring=custom_scorer, + refit=False, + error_score=0.1, + ) + + individual_fit_error_message = "ValueError: Failing classifier failed as required" + error_message = re.compile( + ( + "All the 15 fits failed.+your model is misconfigured.+" + f"{individual_fit_error_message}" + ), + flags=re.DOTALL, + ) + + with pytest.raises(ValueError, match=error_message): + gs.fit(X, y) + + +def test_n_features_in(): + # make sure grid search and random search delegate n_features_in to the + # best estimator + n_features = 4 + X, y = make_classification(n_features=n_features) + gbdt = HistGradientBoostingClassifier() + param_grid = {"max_iter": [3, 4]} + gs = GridSearchCV(gbdt, param_grid) + rs = RandomizedSearchCV(gbdt, param_grid, n_iter=1) + assert not hasattr(gs, "n_features_in_") + assert not hasattr(rs, "n_features_in_") + gs.fit(X, y) + rs.fit(X, y) + assert gs.n_features_in_ == n_features + assert rs.n_features_in_ == n_features + + +@pytest.mark.parametrize("pairwise", [True, False]) +def test_search_cv_pairwise_property_delegated_to_base_estimator(pairwise): + """ + Test implementation of BaseSearchCV has the pairwise tag + which matches the pairwise tag of its estimator. + This test make sure pairwise tag is delegated to the base estimator. + + Non-regression test for issue #13920. + """ + + class TestEstimator(BaseEstimator): + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.pairwise = pairwise + return tags + + est = TestEstimator() + attr_message = "BaseSearchCV pairwise tag must match estimator" + cv = GridSearchCV(est, {"n_neighbors": [10]}) + assert pairwise == cv.__sklearn_tags__().input_tags.pairwise, attr_message + + +def test_search_cv__pairwise_property_delegated_to_base_estimator(): + """ + Test implementation of BaseSearchCV has the pairwise property + which matches the pairwise tag of its estimator. + This test make sure pairwise tag is delegated to the base estimator. + + Non-regression test for issue #13920. + """ + + class EstimatorPairwise(BaseEstimator): + def __init__(self, pairwise=True): + self.pairwise = pairwise + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.pairwise = self.pairwise + return tags + + est = EstimatorPairwise() + attr_message = "BaseSearchCV _pairwise property must match estimator" + + for _pairwise_setting in [True, False]: + est.set_params(pairwise=_pairwise_setting) + cv = GridSearchCV(est, {"n_neighbors": [10]}) + assert _pairwise_setting == cv.__sklearn_tags__().input_tags.pairwise, ( + attr_message + ) + + +def test_search_cv_pairwise_property_equivalence_of_precomputed(): + """ + Test implementation of BaseSearchCV has the pairwise tag + which matches the pairwise tag of its estimator. + This test ensures the equivalence of 'precomputed'. + + Non-regression test for issue #13920. 
+ """ + n_samples = 50 + n_splits = 2 + X, y = make_classification(n_samples=n_samples, random_state=0) + grid_params = {"n_neighbors": [10]} + + # defaults to euclidean metric (minkowski p = 2) + clf = KNeighborsClassifier() + cv = GridSearchCV(clf, grid_params, cv=n_splits) + cv.fit(X, y) + preds_original = cv.predict(X) + + # precompute euclidean metric to validate pairwise is working + X_precomputed = euclidean_distances(X) + clf = KNeighborsClassifier(metric="precomputed") + cv = GridSearchCV(clf, grid_params, cv=n_splits) + cv.fit(X_precomputed, y) + preds_precomputed = cv.predict(X_precomputed) + + attr_message = "GridSearchCV not identical with precomputed metric" + assert (preds_original == preds_precomputed).all(), attr_message + + +@pytest.mark.parametrize( + "SearchCV, param_search", + [(GridSearchCV, {"a": [0.1, 0.01]}), (RandomizedSearchCV, {"a": uniform(1, 3)})], +) +def test_scalar_fit_param(SearchCV, param_search): + # unofficially sanctioned tolerance for scalar values in fit_params + # non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/15805 + class TestEstimator(ClassifierMixin, BaseEstimator): + def __init__(self, a=None): + self.a = a + + def fit(self, X, y, r=None): + self.r_ = r + + def predict(self, X): + return np.zeros(shape=(len(X))) + + model = SearchCV(TestEstimator(), param_search) + X, y = make_classification(random_state=42) + model.fit(X, y, r=42) + assert model.best_estimator_.r_ == 42 + + +@pytest.mark.parametrize( + "SearchCV, param_search", + [ + (GridSearchCV, {"alpha": [0.1, 0.01]}), + (RandomizedSearchCV, {"alpha": uniform(0.01, 0.1)}), + ], +) +def test_scalar_fit_param_compat(SearchCV, param_search): + # check support for scalar values in fit_params, for instance in LightGBM + # that do not exactly respect the scikit-learn API contract but that we do + # not want to break without an explicit deprecation cycle and API + # recommendations for implementing early stopping with a user provided + # validation set. non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/15805 + X_train, X_valid, y_train, y_valid = train_test_split( + *make_classification(random_state=42), random_state=42 + ) + + class _FitParamClassifier(SGDClassifier): + def fit( + self, + X, + y, + sample_weight=None, + tuple_of_arrays=None, + scalar_param=None, + callable_param=None, + ): + super().fit(X, y, sample_weight=sample_weight) + assert scalar_param > 0 + assert callable(callable_param) + + # The tuple of arrays should be preserved as tuple. + assert isinstance(tuple_of_arrays, tuple) + assert tuple_of_arrays[0].ndim == 2 + assert tuple_of_arrays[1].ndim == 1 + return self + + def _fit_param_callable(): + pass + + model = SearchCV(_FitParamClassifier(), param_search) + + # NOTE: `fit_params` should be data dependent (e.g. `sample_weight`) which + # is not the case for the following parameters. But this abuse is common in + # popular third-party libraries and we should tolerate this behavior for + # now and be careful not to break support for those without following + # proper deprecation cycle. + fit_params = { + "tuple_of_arrays": (X_valid, y_valid), + "callable_param": _fit_param_callable, + "scalar_param": 42, + } + model.fit(X_train, y_train, **fit_params) + + +# FIXME: Replace this test with a full `check_estimator` once we have API only +# checks. 
+@pytest.mark.filterwarnings("ignore:The total space of parameters 4 is") +@pytest.mark.parametrize("SearchCV", [GridSearchCV, RandomizedSearchCV]) +@pytest.mark.parametrize("Predictor", [MinimalRegressor, MinimalClassifier]) +def test_search_cv_using_minimal_compatible_estimator(SearchCV, Predictor): + # Check that third-party library can run tests without inheriting from + # BaseEstimator. + rng = np.random.RandomState(0) + X, y = rng.randn(25, 2), np.array([0] * 5 + [1] * 20) + + model = Pipeline( + [("transformer", MinimalTransformer()), ("predictor", Predictor())] + ) + + params = { + "transformer__param": [1, 10], + "predictor__parama": [1, 10], + } + search = SearchCV(model, params, error_score="raise") + search.fit(X, y) + + assert search.best_params_.keys() == params.keys() + + y_pred = search.predict(X) + if is_classifier(search): + assert_array_equal(y_pred, 1) + assert search.score(X, y) == pytest.approx(accuracy_score(y, y_pred)) + else: + assert_allclose(y_pred, y.mean()) + assert search.score(X, y) == pytest.approx(r2_score(y, y_pred)) + + +@pytest.mark.parametrize("return_train_score", [True, False]) +def test_search_cv_verbose_3(capsys, return_train_score): + """Check that search cv with verbose>2 shows the score for single + metrics. non-regression test for #19658.""" + X, y = make_classification(n_samples=100, n_classes=2, flip_y=0.2, random_state=0) + clf = LinearSVC(random_state=0) + grid = {"C": [0.1]} + + GridSearchCV( + clf, + grid, + scoring="accuracy", + verbose=3, + cv=3, + return_train_score=return_train_score, + ).fit(X, y) + captured = capsys.readouterr().out + if return_train_score: + match = re.findall(r"score=\(train=[\d\.]+, test=[\d.]+\)", captured) + else: + match = re.findall(r"score=[\d\.]+", captured) + assert len(match) == 3 + + +@pytest.mark.parametrize( + "SearchCV, param_search", + [ + (GridSearchCV, "param_grid"), + (RandomizedSearchCV, "param_distributions"), + (HalvingGridSearchCV, "param_grid"), + ], +) +def test_search_estimator_param(SearchCV, param_search): + # test that SearchCV object doesn't change the object given in the parameter grid + X, y = make_classification(random_state=42) + + params = {"clf": [LinearSVC()], "clf__C": [0.01]} + orig_C = params["clf"][0].C + + pipe = Pipeline([("trs", MinimalTransformer()), ("clf", None)]) + + param_grid_search = {param_search: params} + gs = SearchCV(pipe, refit=True, cv=2, scoring="accuracy", **param_grid_search).fit( + X, y + ) + + # testing that the original object in params is not changed + assert params["clf"][0].C == orig_C + # testing that the GS is setting the parameter of the step correctly + assert gs.best_estimator_.named_steps["clf"].C == 0.01 + + +def test_search_with_2d_array(): + parameter_grid = { + "vect__ngram_range": ((1, 1), (1, 2)), # unigrams or bigrams + "vect__norm": ("l1", "l2"), + } + pipeline = Pipeline( + [ + ("vect", TfidfVectorizer()), + ("clf", ComplementNB()), + ] + ) + random_search = RandomizedSearchCV( + estimator=pipeline, + param_distributions=parameter_grid, + n_iter=3, + random_state=0, + n_jobs=2, + verbose=1, + cv=3, + ) + data_train = ["one", "two", "three", "four", "five"] + data_target = [0, 0, 1, 0, 1] + random_search.fit(data_train, data_target) + result = random_search.cv_results_["param_vect__ngram_range"] + expected_data = np.empty(3, dtype=object) + expected_data[:] = [(1, 2), (1, 2), (1, 1)] + np.testing.assert_array_equal(result.data, expected_data) + + +def test_search_html_repr(): + """Test different HTML representations for GridSearchCV.""" 
+ X, y = make_classification(random_state=42) + + pipeline = Pipeline([("scale", StandardScaler()), ("clf", DummyClassifier())]) + param_grid = {"clf": [DummyClassifier(), LogisticRegression()]} + + # Unfitted shows the original pipeline + search_cv = GridSearchCV(pipeline, param_grid=param_grid, refit=False) + with config_context(display="diagram"): + repr_html = search_cv._repr_html_() + assert "
<pre>DummyClassifier()</pre>
" in repr_html + + # Fitted with `refit=False` shows the original pipeline + search_cv.fit(X, y) + with config_context(display="diagram"): + repr_html = search_cv._repr_html_() + assert "
<pre>DummyClassifier()</pre>
" in repr_html + + # Fitted with `refit=True` shows the best estimator + search_cv = GridSearchCV(pipeline, param_grid=param_grid, refit=True) + search_cv.fit(X, y) + with config_context(display="diagram"): + repr_html = search_cv._repr_html_() + assert "
<pre>DummyClassifier()</pre>
" not in repr_html + assert "
<pre>LogisticRegression()</pre>
" in repr_html + + +# Metadata Routing Tests +# ====================== + + +@pytest.mark.parametrize( + "SearchCV, param_search", + [ + (GridSearchCV, "param_grid"), + (RandomizedSearchCV, "param_distributions"), + ], +) +@config_context(enable_metadata_routing=True) +def test_multi_metric_search_forwards_metadata(SearchCV, param_search): + """Test that *SearchCV forwards metadata correctly when passed multiple metrics.""" + X, y = make_classification(random_state=42) + n_samples = _num_samples(X) + rng = np.random.RandomState(0) + score_weights = rng.rand(n_samples) + score_metadata = rng.rand(n_samples) + + est = LinearSVC() + param_grid_search = {param_search: {"C": [1]}} + + scorer_registry = _Registry() + scorer = ConsumingScorer(registry=scorer_registry).set_score_request( + sample_weight="score_weights", metadata="score_metadata" + ) + scoring = dict(my_scorer=scorer, accuracy="accuracy") + SearchCV(est, refit="accuracy", cv=2, scoring=scoring, **param_grid_search).fit( + X, y, score_weights=score_weights, score_metadata=score_metadata + ) + assert len(scorer_registry) + for _scorer in scorer_registry: + check_recorded_metadata( + obj=_scorer, + method="score", + parent="_score", + split_params=("sample_weight", "metadata"), + sample_weight=score_weights, + metadata=score_metadata, + ) + + +@pytest.mark.parametrize( + "SearchCV, param_search", + [ + (GridSearchCV, "param_grid"), + (RandomizedSearchCV, "param_distributions"), + (HalvingGridSearchCV, "param_grid"), + ], +) +def test_score_rejects_params_with_no_routing_enabled(SearchCV, param_search): + """*SearchCV should reject **params when metadata routing is not enabled + since this is added only when routing is enabled.""" + X, y = make_classification(random_state=42) + est = LinearSVC() + param_grid_search = {param_search: {"C": [1]}} + + gs = SearchCV(est, cv=2, **param_grid_search).fit(X, y) + + with pytest.raises(ValueError, match="is only supported if"): + gs.score(X, y, metadata=1) + + +# End of Metadata Routing Tests +# ============================= + + +def test_cv_results_dtype_issue_29074(): + """Non-regression test for https://github.com/scikit-learn/scikit-learn/issues/29074""" + + class MetaEstimator(BaseEstimator, ClassifierMixin): + def __init__( + self, + base_clf, + parameter1=None, + parameter2=None, + parameter3=None, + parameter4=None, + ): + self.base_clf = base_clf + self.parameter1 = parameter1 + self.parameter2 = parameter2 + self.parameter3 = parameter3 + self.parameter4 = parameter4 + + def fit(self, X, y=None): + self.base_clf.fit(X, y) + return self + + def score(self, X, y): + return self.base_clf.score(X, y) + + # Values of param_grid are such that np.result_type gives slightly + # different errors, in particular ValueError and TypeError + param_grid = { + "parameter1": [None, {"option": "A"}, {"option": "B"}], + "parameter2": [None, [1, 2]], + "parameter3": [{"a": 1}], + "parameter4": ["str1", "str2"], + } + grid_search = GridSearchCV( + estimator=MetaEstimator(LogisticRegression()), + param_grid=param_grid, + cv=3, + ) + + X, y = make_blobs(random_state=0) + grid_search.fit(X, y) + for param in param_grid: + assert grid_search.cv_results_[f"param_{param}"].dtype == object + + +def test_search_with_estimators_issue_29157(): + """Check cv_results_ for estimators with a `dtype` parameter, e.g. 
OneHotEncoder.""" + pd = pytest.importorskip("pandas") + df = pd.DataFrame( + { + "numeric_1": [1, 2, 3, 4, 5], + "object_1": ["a", "a", "a", "a", "a"], + "target": [1.0, 4.1, 2.0, 3.0, 1.0], + } + ) + X = df.drop("target", axis=1) + y = df["target"] + enc = ColumnTransformer( + [("enc", OneHotEncoder(sparse_output=False), ["object_1"])], + remainder="passthrough", + ) + pipe = Pipeline( + [ + ("enc", enc), + ("regressor", LinearRegression()), + ] + ) + grid_params = { + "enc__enc": [ + OneHotEncoder(sparse_output=False), + OrdinalEncoder(), + ] + } + grid_search = GridSearchCV(pipe, grid_params, cv=2) + grid_search.fit(X, y) + assert grid_search.cv_results_["param_enc__enc"].dtype == object + + +def test_cv_results_multi_size_array(): + """Check that GridSearchCV works with params that are arrays of different sizes. + + Non-regression test for #29277. + """ + n_features = 10 + X, y = make_classification(n_features=10) + + spline_reg_pipe = make_pipeline( + SplineTransformer(extrapolation="periodic"), + LogisticRegression(), + ) + + n_knots_list = [n_features * i for i in [10, 11, 12]] + knots_list = [ + np.linspace(0, np.pi * 2, n_knots).reshape((-1, n_features)) + for n_knots in n_knots_list + ] + spline_reg_pipe_cv = GridSearchCV( + estimator=spline_reg_pipe, + param_grid={ + "splinetransformer__knots": knots_list, + }, + ) + + spline_reg_pipe_cv.fit(X, y) + assert ( + spline_reg_pipe_cv.cv_results_["param_splinetransformer__knots"].dtype == object + ) + + +@pytest.mark.parametrize( + "array_namespace, device, dtype", + yield_namespace_device_dtype_combinations(), + ids=_get_namespace_device_dtype_ids, +) +@pytest.mark.parametrize("SearchCV", [GridSearchCV, RandomizedSearchCV]) +def test_array_api_search_cv_classifier(SearchCV, array_namespace, device, dtype): + xp = _array_api_for_tests(array_namespace, device) + + X = np.arange(100).reshape((10, 10)) + X_np = X.astype(dtype) + X_xp = xp.asarray(X_np, device=device) + + # y should always be an integer, no matter what `dtype` is + y_np = np.array([0] * 5 + [1] * 5) + y_xp = xp.asarray(y_np, device=device) + + with config_context(array_api_dispatch=True): + searcher = SearchCV( + LinearDiscriminantAnalysis(), + {"tol": [1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7]}, + cv=2, + error_score="raise", + ) + searcher.fit(X_xp, y_xp) + searcher.score(X_xp, y_xp) + + +# Construct these outside the tests so that the same object is used +# for both input and `expected` +one_hot_encoder = OneHotEncoder() +ordinal_encoder = OrdinalEncoder() + +# If we construct this directly via `MaskedArray`, the list of tuples +# gets auto-converted to a 2D array. 
+ma_with_tuples = np.ma.MaskedArray(np.empty(2), mask=True, dtype=object) # type: ignore[var-annotated] +ma_with_tuples[0] = (1, 2) +ma_with_tuples[1] = (3, 4) + + +@pytest.mark.parametrize( + ("candidate_params", "expected"), + [ + pytest.param( + [{"foo": 1}, {"foo": 2}], + [ + ("param_foo", np.ma.MaskedArray(np.array([1, 2]))), + ], + id="simple numeric, single param", + ), + pytest.param( + [{"foo": 1, "bar": 3}, {"foo": 2, "bar": 4}, {"foo": 3}], + [ + ("param_foo", np.ma.MaskedArray(np.array([1, 2, 3]))), + ( + "param_bar", + np.ma.MaskedArray(np.array([3, 4, 0]), mask=[False, False, True]), + ), + ], + id="simple numeric, one param is missing in one round", + ), + pytest.param( + [{"foo": [[1], [2], [3]]}, {"foo": [[1], [2]]}], + [ + ( + "param_foo", + np.ma.MaskedArray([[[1], [2], [3]], [[1], [2]]], dtype=object), + ), + ], + id="lists of different lengths", + ), + pytest.param( + [{"foo": (1, 2)}, {"foo": (3, 4)}], + [ + ( + "param_foo", + ma_with_tuples, + ), + ], + id="lists tuples", + ), + pytest.param( + [{"foo": ordinal_encoder}, {"foo": one_hot_encoder}], + [ + ( + "param_foo", + np.ma.MaskedArray([ordinal_encoder, one_hot_encoder], dtype=object), + ), + ], + id="estimators", + ), + ], +) +def test_yield_masked_array_for_each_param(candidate_params, expected): + result = list(_yield_masked_array_for_each_param(candidate_params)) + for (key, value), (expected_key, expected_value) in zip(result, expected): + assert key == expected_key + assert value.dtype == expected_value.dtype + np.testing.assert_array_equal(value, expected_value) + np.testing.assert_array_equal(value.mask, expected_value.mask) + + +def test_yield_masked_array_no_runtime_warning(): + # non-regression test for https://github.com/scikit-learn/scikit-learn/issues/29929 + candidate_params = [{"param": i} for i in range(1000)] + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + list(_yield_masked_array_for_each_param(candidate_params)) diff --git a/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/test_split.py b/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/test_split.py new file mode 100644 index 0000000000000000000000000000000000000000..0f31055d9b7f959c36888efbd4adae01f0a06822 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/test_split.py @@ -0,0 +1,2102 @@ +"""Test the split module""" + +import re +import warnings +from itertools import combinations, combinations_with_replacement, permutations + +import numpy as np +import pytest +from scipy import stats +from scipy.sparse import issparse +from scipy.special import comb + +from sklearn import config_context +from sklearn.datasets import load_digits, make_classification +from sklearn.dummy import DummyClassifier +from sklearn.model_selection import ( + GridSearchCV, + GroupKFold, + GroupShuffleSplit, + KFold, + LeaveOneGroupOut, + LeaveOneOut, + LeavePGroupsOut, + LeavePOut, + PredefinedSplit, + RepeatedKFold, + RepeatedStratifiedKFold, + ShuffleSplit, + StratifiedGroupKFold, + StratifiedKFold, + StratifiedShuffleSplit, + TimeSeriesSplit, + check_cv, + cross_val_score, + train_test_split, +) +from sklearn.model_selection._split import ( + _build_repr, + _validate_shuffle_split, + _yields_constant_splits, +) +from sklearn.svm import SVC +from sklearn.tests.metadata_routing_common import assert_request_is_empty +from sklearn.utils._array_api import ( + _convert_to_numpy, + _get_namespace_device_dtype_ids, + get_namespace, + 
yield_namespace_device_dtype_combinations, +) +from sklearn.utils._array_api import ( + device as array_api_device, +) +from sklearn.utils._mocking import MockDataFrame +from sklearn.utils._testing import ( + assert_allclose, + assert_array_almost_equal, + assert_array_equal, + ignore_warnings, +) +from sklearn.utils.estimator_checks import ( + _array_api_for_tests, +) +from sklearn.utils.fixes import COO_CONTAINERS, CSC_CONTAINERS, CSR_CONTAINERS +from sklearn.utils.validation import _num_samples + +NO_GROUP_SPLITTERS = [ + KFold(), + StratifiedKFold(), + TimeSeriesSplit(), + LeaveOneOut(), + LeavePOut(p=2), + ShuffleSplit(), + StratifiedShuffleSplit(test_size=0.5), + PredefinedSplit([1, 1, 2, 2]), + RepeatedKFold(), + RepeatedStratifiedKFold(), +] + +GROUP_SPLITTERS = [ + GroupKFold(), + LeavePGroupsOut(n_groups=1), + StratifiedGroupKFold(), + LeaveOneGroupOut(), + GroupShuffleSplit(), +] +GROUP_SPLITTER_NAMES = set(splitter.__class__.__name__ for splitter in GROUP_SPLITTERS) + +ALL_SPLITTERS = NO_GROUP_SPLITTERS + GROUP_SPLITTERS # type: ignore[list-item] + +SPLITTERS_REQUIRING_TARGET = [ + StratifiedKFold(), + StratifiedShuffleSplit(), + RepeatedStratifiedKFold(), +] + +X = np.ones(10) +y = np.arange(10) // 2 +test_groups = ( + np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]), + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]), + np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2]), + np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]), + [1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3], + ["1", "1", "1", "1", "2", "2", "2", "3", "3", "3", "3", "3"], +) +digits = load_digits() + +pytestmark = pytest.mark.filterwarnings( + "error:The groups parameter:UserWarning:sklearn.*" +) + + +def _split(splitter, X, y, groups): + if splitter.__class__.__name__ in GROUP_SPLITTER_NAMES: + return splitter.split(X, y, groups=groups) + else: + return splitter.split(X, y) + + +def test_cross_validator_with_default_params(): + n_samples = 4 + n_unique_groups = 4 + n_splits = 2 + p = 2 + n_shuffle_splits = 10 # (the default value) + + X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) + X_1d = np.array([1, 2, 3, 4]) + y = np.array([1, 1, 2, 2]) + groups = np.array([1, 2, 3, 4]) + loo = LeaveOneOut() + lpo = LeavePOut(p) + kf = KFold(n_splits) + skf = StratifiedKFold(n_splits) + lolo = LeaveOneGroupOut() + lopo = LeavePGroupsOut(p) + ss = ShuffleSplit(random_state=0) + ps = PredefinedSplit([1, 1, 2, 2]) # n_splits = np of unique folds = 2 + sgkf = StratifiedGroupKFold(n_splits) + + loo_repr = "LeaveOneOut()" + lpo_repr = "LeavePOut(p=2)" + kf_repr = "KFold(n_splits=2, random_state=None, shuffle=False)" + skf_repr = "StratifiedKFold(n_splits=2, random_state=None, shuffle=False)" + lolo_repr = "LeaveOneGroupOut()" + lopo_repr = "LeavePGroupsOut(n_groups=2)" + ss_repr = ( + "ShuffleSplit(n_splits=10, random_state=0, test_size=None, train_size=None)" + ) + ps_repr = "PredefinedSplit(test_fold=array([1, 1, 2, 2]))" + sgkf_repr = "StratifiedGroupKFold(n_splits=2, random_state=None, shuffle=False)" + + n_splits_expected = [ + n_samples, + comb(n_samples, p), + n_splits, + n_splits, + n_unique_groups, + comb(n_unique_groups, p), + n_shuffle_splits, + 2, + n_splits, + ] + + for i, (cv, cv_repr) in enumerate( + zip( + [loo, lpo, kf, skf, lolo, lopo, ss, ps, sgkf], + [ + loo_repr, + lpo_repr, + kf_repr, + skf_repr, + lolo_repr, + lopo_repr, + ss_repr, + ps_repr, + sgkf_repr, + ], + ) + ): + # Test if get_n_splits works correctly + assert n_splits_expected[i] == cv.get_n_splits(X, y, groups) + + # Test if the cross-validator 
works as expected even if + # the data is 1d + np.testing.assert_equal( + list(_split(cv, X, y, groups)), list(_split(cv, X_1d, y, groups)) + ) + # Test that train, test indices returned are integers + for train, test in _split(cv, X, y, groups): + assert np.asarray(train).dtype.kind == "i" + assert np.asarray(test).dtype.kind == "i" + + # Test if the repr works without any errors + assert cv_repr == repr(cv) + + # ValueError for get_n_splits methods + msg = "The 'X' parameter should not be None." + with pytest.raises(ValueError, match=msg): + loo.get_n_splits(None, y, groups) + with pytest.raises(ValueError, match=msg): + lpo.get_n_splits(None, y, groups) + + +def test_2d_y(): + # smoke test for 2d y and multi-label + n_samples = 30 + rng = np.random.RandomState(1) + X = rng.randint(0, 3, size=(n_samples, 2)) + y = rng.randint(0, 3, size=(n_samples,)) + y_2d = y.reshape(-1, 1) + y_multilabel = rng.randint(0, 2, size=(n_samples, 3)) + groups = rng.randint(0, 3, size=(n_samples,)) + splitters = [ + LeaveOneOut(), + LeavePOut(p=2), + KFold(), + StratifiedKFold(), + RepeatedKFold(), + RepeatedStratifiedKFold(), + StratifiedGroupKFold(), + ShuffleSplit(), + StratifiedShuffleSplit(test_size=0.5), + GroupShuffleSplit(), + LeaveOneGroupOut(), + LeavePGroupsOut(n_groups=2), + GroupKFold(n_splits=3), + TimeSeriesSplit(), + PredefinedSplit(test_fold=groups), + ] + for splitter in splitters: + list(_split(splitter, X, y, groups=groups)) + list(_split(splitter, X, y_2d, groups=groups)) + try: + list(_split(splitter, X, y_multilabel, groups=groups)) + except ValueError as e: + allowed_target_types = ("binary", "multiclass") + msg = "Supported target types are: {}. Got 'multilabel".format( + allowed_target_types + ) + assert msg in str(e) + + +def check_valid_split(train, test, n_samples=None): + # Use python sets to get more informative assertion failure messages + train, test = set(train), set(test) + + # Train and test split should not overlap + assert train.intersection(test) == set() + + if n_samples is not None: + # Check that the union of train an test split cover all the indices + assert train.union(test) == set(range(n_samples)) + + +def check_cv_coverage(cv, X, y, groups, expected_n_splits): + n_samples = _num_samples(X) + # Check that a all the samples appear at least once in a test fold + assert cv.get_n_splits(X, y, groups) == expected_n_splits + + collected_test_samples = set() + iterations = 0 + for train, test in cv.split(X, y, groups): + check_valid_split(train, test, n_samples=n_samples) + iterations += 1 + collected_test_samples.update(test) + + # Check that the accumulated test samples cover the whole dataset + assert iterations == expected_n_splits + if n_samples is not None: + assert collected_test_samples == set(range(n_samples)) + + +def test_kfold_valueerrors(): + X1 = np.array([[1, 2], [3, 4], [5, 6]]) + X2 = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]) + # Check that errors are raised if there is not enough samples + (ValueError, next, KFold(4).split(X1)) + + # Check that a warning is raised if the least populated class has too few + # members. 
+ y = np.array([3, 3, -1, -1, 3]) + + skf_3 = StratifiedKFold(3) + with pytest.warns(Warning, match="The least populated class"): + next(skf_3.split(X2, y)) + + sgkf_3 = StratifiedGroupKFold(3) + naive_groups = np.arange(len(y)) + with pytest.warns(Warning, match="The least populated class"): + next(sgkf_3.split(X2, y, naive_groups)) + + # Check that despite the warning the folds are still computed even + # though all the classes are not necessarily represented at on each + # side of the split at each split + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + check_cv_coverage(skf_3, X2, y, groups=None, expected_n_splits=3) + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + check_cv_coverage(sgkf_3, X2, y, groups=naive_groups, expected_n_splits=3) + + # Check that errors are raised if all n_groups for individual + # classes are less than n_splits. + y = np.array([3, 3, -1, -1, 2]) + + with pytest.raises(ValueError): + next(skf_3.split(X2, y)) + with pytest.raises(ValueError): + next(sgkf_3.split(X2, y)) + + # Error when number of folds is <= 1 + with pytest.raises(ValueError): + KFold(0) + with pytest.raises(ValueError): + KFold(1) + error_string = "k-fold cross-validation requires at least one train/test split" + with pytest.raises(ValueError, match=error_string): + StratifiedKFold(0) + with pytest.raises(ValueError, match=error_string): + StratifiedKFold(1) + with pytest.raises(ValueError, match=error_string): + StratifiedGroupKFold(0) + with pytest.raises(ValueError, match=error_string): + StratifiedGroupKFold(1) + + # When n_splits is not integer: + with pytest.raises(ValueError): + KFold(1.5) + with pytest.raises(ValueError): + KFold(2.0) + with pytest.raises(ValueError): + StratifiedKFold(1.5) + with pytest.raises(ValueError): + StratifiedKFold(2.0) + with pytest.raises(ValueError): + StratifiedGroupKFold(1.5) + with pytest.raises(ValueError): + StratifiedGroupKFold(2.0) + + # When shuffle is not a bool: + with pytest.raises(TypeError): + KFold(n_splits=4, shuffle=None) + + +def test_kfold_indices(): + # Check all indices are returned in the test folds + X1 = np.ones(18) + kf = KFold(3) + check_cv_coverage(kf, X1, y=None, groups=None, expected_n_splits=3) + + # Check all indices are returned in the test folds even when equal-sized + # folds are not possible + X2 = np.ones(17) + kf = KFold(3) + check_cv_coverage(kf, X2, y=None, groups=None, expected_n_splits=3) + + # Check if get_n_splits returns the number of folds + assert 5 == KFold(5).get_n_splits(X2) + + +def test_kfold_no_shuffle(): + # Manually check that KFold preserves the data ordering on toy datasets + X2 = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]] + + splits = KFold(2).split(X2[:-1]) + train, test = next(splits) + assert_array_equal(test, [0, 1]) + assert_array_equal(train, [2, 3]) + + train, test = next(splits) + assert_array_equal(test, [2, 3]) + assert_array_equal(train, [0, 1]) + + splits = KFold(2).split(X2) + train, test = next(splits) + assert_array_equal(test, [0, 1, 2]) + assert_array_equal(train, [3, 4]) + + train, test = next(splits) + assert_array_equal(test, [3, 4]) + assert_array_equal(train, [0, 1, 2]) + + +def test_stratified_kfold_no_shuffle(): + # Manually check that StratifiedKFold preserves the data ordering as much + # as possible on toy datasets in order to avoid hiding sample dependencies + # when possible + X, y = np.ones(4), [1, 1, 0, 0] + splits = StratifiedKFold(2).split(X, y) + train, test = next(splits) + assert_array_equal(test, [0, 2]) + 
assert_array_equal(train, [1, 3]) + + train, test = next(splits) + assert_array_equal(test, [1, 3]) + assert_array_equal(train, [0, 2]) + + X, y = np.ones(7), [1, 1, 1, 0, 0, 0, 0] + splits = StratifiedKFold(2).split(X, y) + train, test = next(splits) + assert_array_equal(test, [0, 1, 3, 4]) + assert_array_equal(train, [2, 5, 6]) + + train, test = next(splits) + assert_array_equal(test, [2, 5, 6]) + assert_array_equal(train, [0, 1, 3, 4]) + + # Check if get_n_splits returns the number of folds + assert 5 == StratifiedKFold(5).get_n_splits(X, y) + + # Make sure string labels are also supported + X = np.ones(7) + y1 = ["1", "1", "1", "0", "0", "0", "0"] + y2 = [1, 1, 1, 0, 0, 0, 0] + np.testing.assert_equal( + list(StratifiedKFold(2).split(X, y1)), list(StratifiedKFold(2).split(X, y2)) + ) + + # Check equivalence to KFold + y = [0, 1, 0, 1, 0, 1, 0, 1] + X = np.ones_like(y) + np.testing.assert_equal( + list(StratifiedKFold(3).split(X, y)), list(KFold(3).split(X, y)) + ) + + +@pytest.mark.parametrize("shuffle", [False, True]) +@pytest.mark.parametrize("k", [4, 5, 6, 7, 8, 9, 10]) +@pytest.mark.parametrize("kfold", [StratifiedKFold, StratifiedGroupKFold]) +def test_stratified_kfold_ratios(k, shuffle, kfold): + # Check that stratified kfold preserves class ratios in individual splits + # Repeat with shuffling turned off and on + n_samples = 1000 + X = np.ones(n_samples) + y = np.array( + [4] * int(0.10 * n_samples) + + [0] * int(0.89 * n_samples) + + [1] * int(0.01 * n_samples) + ) + # ensure perfect stratification with StratifiedGroupKFold + groups = np.arange(len(y)) + distr = np.bincount(y) / len(y) + + test_sizes = [] + random_state = None if not shuffle else 0 + skf = kfold(k, random_state=random_state, shuffle=shuffle) + for train, test in _split(skf, X, y, groups=groups): + assert_allclose(np.bincount(y[train]) / len(train), distr, atol=0.02) + assert_allclose(np.bincount(y[test]) / len(test), distr, atol=0.02) + test_sizes.append(len(test)) + assert np.ptp(test_sizes) <= 1 + + +@pytest.mark.parametrize("shuffle", [False, True]) +@pytest.mark.parametrize("k", [4, 6, 7]) +@pytest.mark.parametrize("kfold", [StratifiedKFold, StratifiedGroupKFold]) +def test_stratified_kfold_label_invariance(k, shuffle, kfold): + # Check that stratified kfold gives the same indices regardless of labels + n_samples = 100 + y = np.array( + [2] * int(0.10 * n_samples) + + [0] * int(0.89 * n_samples) + + [1] * int(0.01 * n_samples) + ) + X = np.ones(len(y)) + # ensure perfect stratification with StratifiedGroupKFold + groups = np.arange(len(y)) + + def get_splits(y): + random_state = None if not shuffle else 0 + return [ + (list(train), list(test)) + for train, test in _split( + kfold(k, random_state=random_state, shuffle=shuffle), + X, + y, + groups=groups, + ) + ] + + splits_base = get_splits(y) + for perm in permutations([0, 1, 2]): + y_perm = np.take(perm, y) + splits_perm = get_splits(y_perm) + assert splits_perm == splits_base + + +def test_kfold_balance(): + # Check that KFold returns folds with balanced sizes + for i in range(11, 17): + kf = KFold(5).split(X=np.ones(i)) + sizes = [len(test) for _, test in kf] + + assert (np.max(sizes) - np.min(sizes)) <= 1 + assert np.sum(sizes) == i + + +@pytest.mark.parametrize("kfold", [StratifiedKFold, StratifiedGroupKFold]) +def test_stratifiedkfold_balance(kfold): + # Check that KFold returns folds with balanced sizes (only when + # stratification is possible) + # Repeat with shuffling turned off and on + X = np.ones(17) + y = [0] * 3 + [1] * 14 + # ensure perfect 
stratification with StratifiedGroupKFold + groups = np.arange(len(y)) + + for shuffle in (True, False): + cv = kfold(3, shuffle=shuffle) + for i in range(11, 17): + skf = _split(cv, X[:i], y[:i], groups[:i]) + sizes = [len(test) for _, test in skf] + + assert (np.max(sizes) - np.min(sizes)) <= 1 + assert np.sum(sizes) == i + + +def test_shuffle_kfold(): + # Check the indices are shuffled properly + kf = KFold(3) + kf2 = KFold(3, shuffle=True, random_state=0) + kf3 = KFold(3, shuffle=True, random_state=1) + + X = np.ones(300) + + all_folds = np.zeros(300) + for (tr1, te1), (tr2, te2), (tr3, te3) in zip( + kf.split(X), kf2.split(X), kf3.split(X) + ): + for tr_a, tr_b in combinations((tr1, tr2, tr3), 2): + # Assert that there is no complete overlap + assert len(np.intersect1d(tr_a, tr_b)) != len(tr1) + + # Set all test indices in successive iterations of kf2 to 1 + all_folds[te2] = 1 + + # Check that all indices are returned in the different test folds + assert sum(all_folds) == 300 + + +@pytest.mark.parametrize("kfold", [KFold, StratifiedKFold, StratifiedGroupKFold]) +def test_shuffle_kfold_stratifiedkfold_reproducibility(kfold): + X = np.ones(15) # Divisible by 3 + y = [0] * 7 + [1] * 8 + groups_1 = np.arange(len(y)) + X2 = np.ones(16) # Not divisible by 3 + y2 = [0] * 8 + [1] * 8 + groups_2 = np.arange(len(y2)) + + # Check that when the shuffle is True, multiple split calls produce the + # same split when random_state is int + kf = kfold(3, shuffle=True, random_state=0) + + np.testing.assert_equal( + list(_split(kf, X, y, groups_1)), list(_split(kf, X, y, groups_1)) + ) + + # Check that when the shuffle is True, multiple split calls often + # (not always) produce different splits when random_state is + # RandomState instance or None + kf = kfold(3, shuffle=True, random_state=np.random.RandomState(0)) + for data in zip((X, X2), (y, y2), (groups_1, groups_2)): + # Test if the two splits are different cv + for (_, test_a), (_, test_b) in zip(_split(kf, *data), _split(kf, *data)): + # cv.split(...) 
returns an array of tuples, each tuple + # consisting of an array with train indices and test indices + # Ensure that the splits for data are not same + # when random state is not set + with pytest.raises(AssertionError): + np.testing.assert_array_equal(test_a, test_b) + + +def test_shuffle_stratifiedkfold(): + # Check that shuffling is happening when requested, and for proper + # sample coverage + X_40 = np.ones(40) + y = [0] * 20 + [1] * 20 + kf0 = StratifiedKFold(5, shuffle=True, random_state=0) + kf1 = StratifiedKFold(5, shuffle=True, random_state=1) + for (_, test0), (_, test1) in zip(kf0.split(X_40, y), kf1.split(X_40, y)): + assert set(test0) != set(test1) + check_cv_coverage(kf0, X_40, y, groups=None, expected_n_splits=5) + + # Ensure that we shuffle each class's samples with different + # random_state in StratifiedKFold + # See https://github.com/scikit-learn/scikit-learn/pull/13124 + X = np.arange(10) + y = [0] * 5 + [1] * 5 + kf1 = StratifiedKFold(5, shuffle=True, random_state=0) + kf2 = StratifiedKFold(5, shuffle=True, random_state=1) + test_set1 = sorted([tuple(s[1]) for s in kf1.split(X, y)]) + test_set2 = sorted([tuple(s[1]) for s in kf2.split(X, y)]) + assert test_set1 != test_set2 + + +def test_shuffle_groupkfold(): + # Check that shuffling is happening when requested, and for proper + # sample coverage + X = np.ones(40) + y = [0] * 20 + [1] * 20 + groups = np.arange(40) // 3 + gkf0 = GroupKFold(4, shuffle=True, random_state=0) + gkf1 = GroupKFold(4, shuffle=True, random_state=1) + + # Check that the groups are shuffled differently + test_groups0 = [ + set(groups[test_idx]) for _, test_idx in gkf0.split(X, None, groups) + ] + test_groups1 = [ + set(groups[test_idx]) for _, test_idx in gkf1.split(X, None, groups) + ] + for g0, g1 in zip(test_groups0, test_groups1): + assert g0 != g1, "Test groups should differ with different random states" + + # Check coverage and splits + check_cv_coverage(gkf0, X, y, groups, expected_n_splits=4) + check_cv_coverage(gkf1, X, y, groups, expected_n_splits=4) + + +def test_kfold_can_detect_dependent_samples_on_digits(): # see #2372 + # The digits samples are dependent: they are apparently grouped by authors + # although we don't have any information on the groups segment locations + # for this data. We can highlight this fact by computing k-fold cross- + # validation with and without shuffling: we observe that the shuffling case + # wrongly makes the IID assumption and is therefore too optimistic: it + # estimates a much higher accuracy (around 0.93) than that the non + # shuffling variant (around 0.81). 
+ + X, y = digits.data[:600], digits.target[:600] + model = SVC(C=10, gamma=0.005) + + n_splits = 3 + + cv = KFold(n_splits=n_splits, shuffle=False) + mean_score = cross_val_score(model, X, y, cv=cv).mean() + assert 0.92 > mean_score + assert mean_score > 0.80 + + # Shuffling the data artificially breaks the dependency and hides the + # overfitting of the model with regards to the writing style of the authors + # by yielding a seriously overestimated score: + + cv = KFold(n_splits, shuffle=True, random_state=0) + mean_score = cross_val_score(model, X, y, cv=cv).mean() + assert mean_score > 0.92 + + cv = KFold(n_splits, shuffle=True, random_state=1) + mean_score = cross_val_score(model, X, y, cv=cv).mean() + assert mean_score > 0.92 + + # Similarly, StratifiedKFold should try to shuffle the data as little + # as possible (while respecting the balanced class constraints) + # and thus be able to detect the dependency by not overestimating + # the CV score either. As the digits dataset is approximately balanced + # the estimated mean score is close to the score measured with + # non-shuffled KFold + + cv = StratifiedKFold(n_splits) + mean_score = cross_val_score(model, X, y, cv=cv).mean() + assert 0.94 > mean_score + assert mean_score > 0.80 + + +def test_stratified_group_kfold_trivial(): + sgkf = StratifiedGroupKFold(n_splits=3) + # Trivial example - groups with the same distribution + y = np.array([1] * 6 + [0] * 12) + X = np.ones_like(y).reshape(-1, 1) + groups = np.asarray((1, 2, 3, 4, 5, 6, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6)) + distr = np.bincount(y) / len(y) + test_sizes = [] + for train, test in sgkf.split(X, y, groups): + # check group constraint + assert np.intersect1d(groups[train], groups[test]).size == 0 + # check y distribution + assert_allclose(np.bincount(y[train]) / len(train), distr, atol=0.02) + assert_allclose(np.bincount(y[test]) / len(test), distr, atol=0.02) + test_sizes.append(len(test)) + assert np.ptp(test_sizes) <= 1 + + +def test_stratified_group_kfold_approximate(): + # Not perfect stratification (even though it is possible) because of + # iteration over groups + sgkf = StratifiedGroupKFold(n_splits=3) + y = np.array([1] * 6 + [0] * 12) + X = np.ones_like(y).reshape(-1, 1) + groups = np.array([1, 2, 3, 3, 4, 4, 1, 1, 2, 2, 3, 4, 5, 5, 5, 6, 6, 6]) + expected = np.asarray([[0.833, 0.166], [0.666, 0.333], [0.5, 0.5]]) + test_sizes = [] + for (train, test), expect_dist in zip(sgkf.split(X, y, groups), expected): + # check group constraint + assert np.intersect1d(groups[train], groups[test]).size == 0 + split_dist = np.bincount(y[test]) / len(test) + assert_allclose(split_dist, expect_dist, atol=0.001) + test_sizes.append(len(test)) + assert np.ptp(test_sizes) <= 1 + + +@pytest.mark.parametrize( + "y, groups, expected", + [ + ( + np.array([0] * 6 + [1] * 6), + np.array([1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6]), + np.asarray([[0.5, 0.5], [0.5, 0.5], [0.5, 0.5]]), + ), + ( + np.array([0] * 9 + [1] * 3), + np.array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 5, 6]), + np.asarray([[0.75, 0.25], [0.75, 0.25], [0.75, 0.25]]), + ), + ], +) +def test_stratified_group_kfold_homogeneous_groups(y, groups, expected): + sgkf = StratifiedGroupKFold(n_splits=3) + X = np.ones_like(y).reshape(-1, 1) + for (train, test), expect_dist in zip(sgkf.split(X, y, groups), expected): + # check group constraint + assert np.intersect1d(groups[train], groups[test]).size == 0 + split_dist = np.bincount(y[test]) / len(test) + assert_allclose(split_dist, expect_dist, atol=0.001) + + 
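The StratifiedGroupKFold tests above check two properties at once: no group may straddle the train/test boundary, and class proportions in each test fold should stay close to the overall distribution. A small self-contained usage sketch of that behaviour, with synthetic data that is not taken from the tests:

import numpy as np
from sklearn.model_selection import StratifiedGroupKFold

rng = np.random.RandomState(0)
y = rng.choice([0, 1], size=60, p=[0.7, 0.3])
X = np.ones((60, 1))
groups = rng.randint(0, 12, size=60)  # 12 groups spread over 60 samples

sgkf = StratifiedGroupKFold(n_splits=3)
for train_idx, test_idx in sgkf.split(X, y, groups=groups):
    # Group constraint: no group appears on both sides of a split.
    assert np.intersect1d(groups[train_idx], groups[test_idx]).size == 0
    # Class proportions per test fold approximate the overall distribution.
    print(np.bincount(y[test_idx]) / len(test_idx))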
+@pytest.mark.parametrize("cls_distr", [(0.4, 0.6), (0.3, 0.7), (0.2, 0.8), (0.8, 0.2)]) +@pytest.mark.parametrize("n_groups", [5, 30, 70]) +def test_stratified_group_kfold_against_group_kfold(cls_distr, n_groups): + # Check that given sufficient amount of samples StratifiedGroupKFold + # produces better stratified folds than regular GroupKFold + n_splits = 5 + sgkf = StratifiedGroupKFold(n_splits=n_splits) + gkf = GroupKFold(n_splits=n_splits) + rng = np.random.RandomState(0) + n_points = 1000 + y = rng.choice(2, size=n_points, p=cls_distr) + X = np.ones_like(y).reshape(-1, 1) + g = rng.choice(n_groups, n_points) + sgkf_folds = sgkf.split(X, y, groups=g) + gkf_folds = gkf.split(X, y, groups=g) + sgkf_entr = 0 + gkf_entr = 0 + for (sgkf_train, sgkf_test), (_, gkf_test) in zip(sgkf_folds, gkf_folds): + # check group constraint + assert np.intersect1d(g[sgkf_train], g[sgkf_test]).size == 0 + sgkf_distr = np.bincount(y[sgkf_test]) / len(sgkf_test) + gkf_distr = np.bincount(y[gkf_test]) / len(gkf_test) + sgkf_entr += stats.entropy(sgkf_distr, qk=cls_distr) + gkf_entr += stats.entropy(gkf_distr, qk=cls_distr) + sgkf_entr /= n_splits + gkf_entr /= n_splits + assert sgkf_entr <= gkf_entr + + +def test_shuffle_split(): + ss1 = ShuffleSplit(test_size=0.2, random_state=0).split(X) + ss2 = ShuffleSplit(test_size=2, random_state=0).split(X) + ss3 = ShuffleSplit(test_size=np.int32(2), random_state=0).split(X) + ss4 = ShuffleSplit(test_size=2, random_state=0).split(X) + for t1, t2, t3, t4 in zip(ss1, ss2, ss3, ss4): + assert_array_equal(t1[0], t2[0]) + assert_array_equal(t2[0], t3[0]) + assert_array_equal(t3[0], t4[0]) + assert_array_equal(t1[1], t2[1]) + assert_array_equal(t2[1], t3[1]) + assert_array_equal(t3[1], t4[1]) + + +@pytest.mark.parametrize("split_class", [ShuffleSplit, StratifiedShuffleSplit]) +@pytest.mark.parametrize( + "train_size, exp_train, exp_test", [(None, 9, 1), (8, 8, 2), (0.8, 8, 2)] +) +def test_shuffle_split_default_test_size(split_class, train_size, exp_train, exp_test): + # Check that the default value has the expected behavior, i.e. 0.1 if both + # unspecified or complement train_size unless both are specified. + X = np.ones(10) + y = np.ones(10) + + X_train, X_test = next(split_class(train_size=train_size).split(X, y)) + + assert len(X_train) == exp_train + assert len(X_test) == exp_test + + +@pytest.mark.parametrize( + "train_size, exp_train, exp_test", [(None, 8, 2), (7, 7, 3), (0.7, 7, 3)] +) +def test_group_shuffle_split_default_test_size(train_size, exp_train, exp_test): + # Check that the default value has the expected behavior, i.e. 0.2 if both + # unspecified or complement train_size unless both are specified. 
+ X = np.ones(10) + y = np.ones(10) + groups = range(10) + + X_train, X_test = next(GroupShuffleSplit(train_size=train_size).split(X, y, groups)) + + assert len(X_train) == exp_train + assert len(X_test) == exp_test + + +def test_stratified_shuffle_split_init(): + X = np.arange(7) + y = np.asarray([0, 1, 1, 1, 2, 2, 2]) + # Check that error is raised if there is a class with only one sample + with pytest.raises(ValueError): + next(StratifiedShuffleSplit(3, test_size=0.2).split(X, y)) + + # Check that error is raised if the test set size is smaller than n_classes + with pytest.raises(ValueError): + next(StratifiedShuffleSplit(3, test_size=2).split(X, y)) + # Check that error is raised if the train set size is smaller than + # n_classes + with pytest.raises(ValueError): + next(StratifiedShuffleSplit(3, test_size=3, train_size=2).split(X, y)) + + X = np.arange(9) + y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2]) + + # Train size or test size too small + with pytest.raises(ValueError): + next(StratifiedShuffleSplit(train_size=2).split(X, y)) + with pytest.raises(ValueError): + next(StratifiedShuffleSplit(test_size=2).split(X, y)) + + +def test_stratified_shuffle_split_respects_test_size(): + y = np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2]) + test_size = 5 + train_size = 10 + sss = StratifiedShuffleSplit( + 6, test_size=test_size, train_size=train_size, random_state=0 + ).split(np.ones(len(y)), y) + for train, test in sss: + assert len(train) == train_size + assert len(test) == test_size + + +def test_stratified_shuffle_split_iter(): + ys = [ + np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]), + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]), + np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2] * 2), + np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]), + np.array([-1] * 800 + [1] * 50), + np.concatenate([[i] * (100 + i) for i in range(11)]), + [1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3], + ["1", "1", "1", "1", "2", "2", "2", "3", "3", "3", "3", "3"], + ] + + for y in ys: + sss = StratifiedShuffleSplit(6, test_size=0.33, random_state=0).split( + np.ones(len(y)), y + ) + y = np.asanyarray(y) # To make it indexable for y[train] + # this is how test-size is computed internally + # in _validate_shuffle_split + test_size = np.ceil(0.33 * len(y)) + train_size = len(y) - test_size + for train, test in sss: + assert_array_equal(np.unique(y[train]), np.unique(y[test])) + # Checks if folds keep classes proportions + p_train = np.bincount(np.unique(y[train], return_inverse=True)[1]) / float( + len(y[train]) + ) + p_test = np.bincount(np.unique(y[test], return_inverse=True)[1]) / float( + len(y[test]) + ) + assert_array_almost_equal(p_train, p_test, 1) + assert len(train) + len(test) == y.size + assert len(train) == train_size + assert len(test) == test_size + assert_array_equal(np.intersect1d(train, test), []) + + +def test_stratified_shuffle_split_even(): + # Test the StratifiedShuffleSplit, indices are drawn with a + # equal chance + n_folds = 5 + n_splits = 1000 + + def assert_counts_are_ok(idx_counts, p): + # Here we test that the distribution of the counts + # per index is close enough to a binomial + threshold = 0.05 / n_splits + bf = stats.binom(n_splits, p) + for count in idx_counts: + prob = bf.pmf(count) + assert prob > threshold, ( + "An index is not drawn with chance corresponding to even draws" + ) + + for n_samples in (6, 22): + groups = np.array((n_samples // 2) * [0, 1]) + splits = StratifiedShuffleSplit( + n_splits=n_splits, test_size=1.0 / n_folds, random_state=0 + ) + + 
train_counts = [0] * n_samples + test_counts = [0] * n_samples + n_splits_actual = 0 + for train, test in splits.split(X=np.ones(n_samples), y=groups): + n_splits_actual += 1 + for counter, ids in [(train_counts, train), (test_counts, test)]: + for id in ids: + counter[id] += 1 + assert n_splits_actual == n_splits + + n_train, n_test = _validate_shuffle_split( + n_samples, test_size=1.0 / n_folds, train_size=1.0 - (1.0 / n_folds) + ) + + assert len(train) == n_train + assert len(test) == n_test + assert len(set(train).intersection(test)) == 0 + + group_counts = np.unique(groups) + assert splits.test_size == 1.0 / n_folds + assert n_train + n_test == len(groups) + assert len(group_counts) == 2 + ex_test_p = float(n_test) / n_samples + ex_train_p = float(n_train) / n_samples + + assert_counts_are_ok(train_counts, ex_train_p) + assert_counts_are_ok(test_counts, ex_test_p) + + +def test_stratified_shuffle_split_overlap_train_test_bug(): + # See https://github.com/scikit-learn/scikit-learn/issues/6121 for + # the original bug report + y = [0, 1, 2, 3] * 3 + [4, 5] * 5 + X = np.ones_like(y) + + sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=0) + + train, test = next(sss.split(X=X, y=y)) + + # no overlap + assert_array_equal(np.intersect1d(train, test), []) + + # complete partition + assert_array_equal(np.union1d(train, test), np.arange(len(y))) + + +def test_stratified_shuffle_split_multilabel(): + # fix for issue 9037 + for y in [ + np.array([[0, 1], [1, 0], [1, 0], [0, 1]]), + np.array([[0, 1], [1, 1], [1, 1], [0, 1]]), + ]: + X = np.ones_like(y) + sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=0) + train, test = next(sss.split(X=X, y=y)) + y_train = y[train] + y_test = y[test] + + # no overlap + assert_array_equal(np.intersect1d(train, test), []) + + # complete partition + assert_array_equal(np.union1d(train, test), np.arange(len(y))) + + # correct stratification of entire rows + # (by design, here y[:, 0] uniquely determines the entire row of y) + expected_ratio = np.mean(y[:, 0]) + assert expected_ratio == np.mean(y_train[:, 0]) + assert expected_ratio == np.mean(y_test[:, 0]) + + +def test_stratified_shuffle_split_multilabel_many_labels(): + # fix in PR #9922: for multilabel data with > 1000 labels, str(row) + # truncates with an ellipsis for elements in positions 4 through + # len(row) - 4, so labels were not being correctly split using the powerset + # method for transforming a multilabel problem to a multiclass one; this + # test checks that this problem is fixed. + row_with_many_zeros = [1, 0, 1] + [0] * 1000 + [1, 0, 1] + row_with_many_ones = [1, 0, 1] + [1] * 1000 + [1, 0, 1] + y = np.array([row_with_many_zeros] * 10 + [row_with_many_ones] * 100) + X = np.ones_like(y) + + sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=0) + train, test = next(sss.split(X=X, y=y)) + y_train = y[train] + y_test = y[test] + + # correct stratification of entire rows + # (by design, here y[:, 4] uniquely determines the entire row of y) + expected_ratio = np.mean(y[:, 4]) + assert expected_ratio == np.mean(y_train[:, 4]) + assert expected_ratio == np.mean(y_test[:, 4]) + + +def test_predefinedsplit_with_kfold_split(): + # Check that PredefinedSplit can reproduce a split generated by Kfold. 
+ folds = np.full(10, -1.0) + kf_train = [] + kf_test = [] + for i, (train_ind, test_ind) in enumerate(KFold(5, shuffle=True).split(X)): + kf_train.append(train_ind) + kf_test.append(test_ind) + folds[test_ind] = i + ps = PredefinedSplit(folds) + # n_splits is simply the no of unique folds + assert len(np.unique(folds)) == ps.get_n_splits() + ps_train, ps_test = zip(*ps.split()) + assert_array_equal(ps_train, kf_train) + assert_array_equal(ps_test, kf_test) + + +def test_group_shuffle_split(): + for groups_i in test_groups: + X = y = np.ones(len(groups_i)) + n_splits = 6 + test_size = 1.0 / 3 + slo = GroupShuffleSplit(n_splits, test_size=test_size, random_state=0) + + # Make sure the repr works + repr(slo) + + # Test that the length is correct + assert slo.get_n_splits(X, y, groups=groups_i) == n_splits + + l_unique = np.unique(groups_i) + l = np.asarray(groups_i) + + for train, test in slo.split(X, y, groups=groups_i): + # First test: no train group is in the test set and vice versa + l_train_unique = np.unique(l[train]) + l_test_unique = np.unique(l[test]) + assert not np.any(np.isin(l[train], l_test_unique)) + assert not np.any(np.isin(l[test], l_train_unique)) + + # Second test: train and test add up to all the data + assert l[train].size + l[test].size == l.size + + # Third test: train and test are disjoint + assert_array_equal(np.intersect1d(train, test), []) + + # Fourth test: + # unique train and test groups are correct, +- 1 for rounding error + assert abs(len(l_test_unique) - round(test_size * len(l_unique))) <= 1 + assert ( + abs(len(l_train_unique) - round((1.0 - test_size) * len(l_unique))) <= 1 + ) + + +def test_leave_one_p_group_out(): + logo = LeaveOneGroupOut() + lpgo_1 = LeavePGroupsOut(n_groups=1) + lpgo_2 = LeavePGroupsOut(n_groups=2) + + # Make sure the repr works + assert repr(logo) == "LeaveOneGroupOut()" + assert repr(lpgo_1) == "LeavePGroupsOut(n_groups=1)" + assert repr(lpgo_2) == "LeavePGroupsOut(n_groups=2)" + assert repr(LeavePGroupsOut(n_groups=3)) == "LeavePGroupsOut(n_groups=3)" + + for j, (cv, p_groups_out) in enumerate(((logo, 1), (lpgo_1, 1), (lpgo_2, 2))): + for i, groups_i in enumerate(test_groups): + n_groups = len(np.unique(groups_i)) + n_splits = n_groups if p_groups_out == 1 else n_groups * (n_groups - 1) / 2 + X = y = np.ones(len(groups_i)) + + # Test that the length is correct + assert cv.get_n_splits(X, y, groups=groups_i) == n_splits + + groups_arr = np.asarray(groups_i) + + # Split using the original list / array / list of string groups_i + for train, test in cv.split(X, y, groups=groups_i): + # First test: no train group is in the test set and vice versa + assert_array_equal( + np.intersect1d(groups_arr[train], groups_arr[test]).tolist(), [] + ) + + # Second test: train and test add up to all the data + assert len(train) + len(test) == len(groups_i) + + # Third test: + # The number of groups in test must be equal to p_groups_out + assert np.unique(groups_arr[test]).shape[0], p_groups_out + + # check get_n_splits() with dummy parameters + assert logo.get_n_splits(None, None, ["a", "b", "c", "b", "c"]) == 3 + assert logo.get_n_splits(groups=[1.0, 1.1, 1.0, 1.2]) == 3 + assert lpgo_2.get_n_splits(None, None, np.arange(4)) == 6 + assert lpgo_1.get_n_splits(groups=np.arange(4)) == 4 + + # raise ValueError if a `groups` parameter is illegal + with pytest.raises(ValueError): + logo.get_n_splits(None, None, [0.0, np.nan, 0.0]) + with pytest.raises(ValueError): + lpgo_2.get_n_splits(None, None, [0.0, np.inf, 0.0]) + + msg = "The 'groups' parameter 
should not be None." + with pytest.raises(ValueError, match=msg): + logo.get_n_splits(None, None, None) + with pytest.raises(ValueError, match=msg): + lpgo_1.get_n_splits(None, None, None) + + +def test_leave_group_out_changing_groups(): + # Check that LeaveOneGroupOut and LeavePGroupsOut work normally if + # the groups variable is changed before calling split + groups = np.array([0, 1, 2, 1, 1, 2, 0, 0]) + X = np.ones(len(groups)) + groups_changing = np.array(groups, copy=True) + lolo = LeaveOneGroupOut().split(X, groups=groups) + lolo_changing = LeaveOneGroupOut().split(X, groups=groups) + lplo = LeavePGroupsOut(n_groups=2).split(X, groups=groups) + lplo_changing = LeavePGroupsOut(n_groups=2).split(X, groups=groups) + groups_changing[:] = 0 + for llo, llo_changing in [(lolo, lolo_changing), (lplo, lplo_changing)]: + for (train, test), (train_chan, test_chan) in zip(llo, llo_changing): + assert_array_equal(train, train_chan) + assert_array_equal(test, test_chan) + + # n_splits = no of 2 (p) group combinations of the unique groups = 3C2 = 3 + assert 3 == LeavePGroupsOut(n_groups=2).get_n_splits(X, y=X, groups=groups) + # n_splits = no of unique groups (C(uniq_lbls, 1) = n_unique_groups) + assert 3 == LeaveOneGroupOut().get_n_splits(X, y=X, groups=groups) + + +def test_leave_group_out_order_dependence(): + # Check that LeaveOneGroupOut orders the splits according to the index + # of the group left out. + groups = np.array([2, 2, 0, 0, 1, 1]) + X = np.ones(len(groups)) + + splits = iter(LeaveOneGroupOut().split(X, groups=groups)) + + expected_indices = [ + ([0, 1, 4, 5], [2, 3]), + ([0, 1, 2, 3], [4, 5]), + ([2, 3, 4, 5], [0, 1]), + ] + + for expected_train, expected_test in expected_indices: + train, test = next(splits) + assert_array_equal(train, expected_train) + assert_array_equal(test, expected_test) + + +def test_leave_one_p_group_out_error_on_fewer_number_of_groups(): + X = y = groups = np.ones(0) + msg = re.escape("Found array with 0 sample(s)") + with pytest.raises(ValueError, match=msg): + next(LeaveOneGroupOut().split(X, y, groups)) + + X = y = groups = np.ones(1) + msg = re.escape( + f"The groups parameter contains fewer than 2 unique groups ({groups})." + " LeaveOneGroupOut expects at least 2." + ) + with pytest.raises(ValueError, match=msg): + next(LeaveOneGroupOut().split(X, y, groups)) + + X = y = groups = np.ones(1) + msg = re.escape( + "The groups parameter contains fewer than (or equal to) n_groups " + f"(3) numbers of unique groups ({groups}). LeavePGroupsOut expects " + "that at least n_groups + 1 (4) unique groups " + "be present" + ) + with pytest.raises(ValueError, match=msg): + next(LeavePGroupsOut(n_groups=3).split(X, y, groups)) + + X = y = groups = np.arange(3) + msg = re.escape( + "The groups parameter contains fewer than (or equal to) n_groups " + f"(3) numbers of unique groups ({groups}). 
LeavePGroupsOut expects " + "that at least n_groups + 1 (4) unique groups " + "be present" + ) + with pytest.raises(ValueError, match=msg): + next(LeavePGroupsOut(n_groups=3).split(X, y, groups)) + + +def test_repeated_cv_value_errors(): + # n_repeats is not integer or <= 0 + for cv in (RepeatedKFold, RepeatedStratifiedKFold): + with pytest.raises(ValueError): + cv(n_repeats=0) + with pytest.raises(ValueError): + cv(n_repeats=1.5) + + +@pytest.mark.parametrize("RepeatedCV", [RepeatedKFold, RepeatedStratifiedKFold]) +def test_repeated_cv_repr(RepeatedCV): + n_splits, n_repeats = 2, 6 + repeated_cv = RepeatedCV(n_splits=n_splits, n_repeats=n_repeats) + repeated_cv_repr = "{}(n_repeats=6, n_splits=2, random_state=None)".format( + repeated_cv.__class__.__name__ + ) + assert repeated_cv_repr == repr(repeated_cv) + + +def test_repeated_kfold_determinstic_split(): + X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]] + random_state = 258173307 + rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=random_state) + + # split should produce same and deterministic splits on + # each call + for _ in range(3): + splits = rkf.split(X) + train, test = next(splits) + assert_array_equal(train, [2, 4]) + assert_array_equal(test, [0, 1, 3]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 3]) + assert_array_equal(test, [2, 4]) + + train, test = next(splits) + assert_array_equal(train, [0, 1]) + assert_array_equal(test, [2, 3, 4]) + + train, test = next(splits) + assert_array_equal(train, [2, 3, 4]) + assert_array_equal(test, [0, 1]) + + with pytest.raises(StopIteration): + next(splits) + + +def test_get_n_splits_for_repeated_kfold(): + n_splits = 3 + n_repeats = 4 + rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats) + expected_n_splits = n_splits * n_repeats + assert expected_n_splits == rkf.get_n_splits() + + +def test_get_n_splits_for_repeated_stratified_kfold(): + n_splits = 3 + n_repeats = 4 + rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats) + expected_n_splits = n_splits * n_repeats + assert expected_n_splits == rskf.get_n_splits() + + +def test_repeated_stratified_kfold_determinstic_split(): + X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]] + y = [1, 1, 1, 0, 0] + random_state = 1944695409 + rskf = RepeatedStratifiedKFold(n_splits=2, n_repeats=2, random_state=random_state) + + # split should produce same and deterministic splits on + # each call + for _ in range(3): + splits = rskf.split(X, y) + train, test = next(splits) + assert_array_equal(train, [1, 4]) + assert_array_equal(test, [0, 2, 3]) + + train, test = next(splits) + assert_array_equal(train, [0, 2, 3]) + assert_array_equal(test, [1, 4]) + + train, test = next(splits) + assert_array_equal(train, [2, 3]) + assert_array_equal(test, [0, 1, 4]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 4]) + assert_array_equal(test, [2, 3]) + + with pytest.raises(StopIteration): + next(splits) + + +def test_train_test_split_errors(): + pytest.raises(ValueError, train_test_split) + + pytest.raises(ValueError, train_test_split, range(3), train_size=1.1) + + pytest.raises(ValueError, train_test_split, range(3), test_size=0.6, train_size=0.6) + pytest.raises( + ValueError, + train_test_split, + range(3), + test_size=np.float32(0.6), + train_size=np.float32(0.6), + ) + pytest.raises(ValueError, train_test_split, range(3), test_size="wrong_type") + pytest.raises(ValueError, train_test_split, range(3), test_size=2, train_size=4) + pytest.raises(TypeError, train_test_split, range(3), some_argument=1.1) 
+ pytest.raises(ValueError, train_test_split, range(3), range(42)) + pytest.raises(ValueError, train_test_split, range(10), shuffle=False, stratify=True) + + with pytest.raises( + ValueError, + match=r"train_size=11 should be either positive and " + r"smaller than the number of samples 10 or a " + r"float in the \(0, 1\) range", + ): + train_test_split(range(10), train_size=11, test_size=1) + + +@pytest.mark.parametrize( + "train_size, exp_train, exp_test", [(None, 7, 3), (8, 8, 2), (0.8, 8, 2)] +) +def test_train_test_split_default_test_size(train_size, exp_train, exp_test): + # Check that the default value has the expected behavior, i.e. complement + # train_size unless both are specified. + X_train, X_test = train_test_split(X, train_size=train_size) + + assert len(X_train) == exp_train + assert len(X_test) == exp_test + + +@pytest.mark.parametrize( + "array_namespace, device, dtype_name", + yield_namespace_device_dtype_combinations(), + ids=_get_namespace_device_dtype_ids, +) +@pytest.mark.parametrize( + "shuffle,stratify", + ( + (True, None), + (True, np.hstack((np.ones(6), np.zeros(4)))), + # stratification only works with shuffling + (False, None), + ), +) +def test_array_api_train_test_split( + shuffle, stratify, array_namespace, device, dtype_name +): + xp = _array_api_for_tests(array_namespace, device) + + X = np.arange(100).reshape((10, 10)) + y = np.arange(10) + + X_np = X.astype(dtype_name) + X_xp = xp.asarray(X_np, device=device) + + y_np = y.astype(dtype_name) + y_xp = xp.asarray(y_np, device=device) + + X_train_np, X_test_np, y_train_np, y_test_np = train_test_split( + X_np, y, random_state=0, shuffle=shuffle, stratify=stratify + ) + with config_context(array_api_dispatch=True): + if stratify is not None: + stratify_xp = xp.asarray(stratify) + else: + stratify_xp = stratify + X_train_xp, X_test_xp, y_train_xp, y_test_xp = train_test_split( + X_xp, y_xp, shuffle=shuffle, stratify=stratify_xp, random_state=0 + ) + + # Check that namespace is preserved, has to happen with + # array_api_dispatch enabled. 
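+ # Illustrative note (added commentary, not part of the original test): under
+ # config_context(array_api_dispatch=True), train_test_split operates directly on the
+ # array API inputs, so its outputs are expected to stay in the caller's namespace
+ # (e.g. torch or array_api_strict) and on the caller's device instead of being
+ # converted to NumPy; the assertions below check exactly that, and _convert_to_numpy
+ # is only used at the end to compare values against the plain NumPy run.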
+ assert get_namespace(X_train_xp)[0] == get_namespace(X_xp)[0] + assert get_namespace(X_test_xp)[0] == get_namespace(X_xp)[0] + assert get_namespace(y_train_xp)[0] == get_namespace(y_xp)[0] + assert get_namespace(y_test_xp)[0] == get_namespace(y_xp)[0] + + # Check device and dtype is preserved on output + assert array_api_device(X_train_xp) == array_api_device(X_xp) + assert array_api_device(y_train_xp) == array_api_device(y_xp) + assert array_api_device(X_test_xp) == array_api_device(X_xp) + assert array_api_device(y_test_xp) == array_api_device(y_xp) + + assert X_train_xp.dtype == X_xp.dtype + assert y_train_xp.dtype == y_xp.dtype + assert X_test_xp.dtype == X_xp.dtype + assert y_test_xp.dtype == y_xp.dtype + + assert_allclose( + _convert_to_numpy(X_train_xp, xp=xp), + X_train_np, + ) + assert_allclose( + _convert_to_numpy(X_test_xp, xp=xp), + X_test_np, + ) + + +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +def test_train_test_split(coo_container): + X = np.arange(100).reshape((10, 10)) + X_s = coo_container(X) + y = np.arange(10) + + # simple test + split = train_test_split(X, y, test_size=None, train_size=0.5) + X_train, X_test, y_train, y_test = split + assert len(y_test) == len(y_train) + # test correspondence of X and y + assert_array_equal(X_train[:, 0], y_train * 10) + assert_array_equal(X_test[:, 0], y_test * 10) + + # don't convert lists to anything else by default + split = train_test_split(X, X_s, y.tolist()) + X_train, X_test, X_s_train, X_s_test, y_train, y_test = split + assert isinstance(y_train, list) + assert isinstance(y_test, list) + + # allow nd-arrays + X_4d = np.arange(10 * 5 * 3 * 2).reshape(10, 5, 3, 2) + y_3d = np.arange(10 * 7 * 11).reshape(10, 7, 11) + split = train_test_split(X_4d, y_3d) + assert split[0].shape == (7, 5, 3, 2) + assert split[1].shape == (3, 5, 3, 2) + assert split[2].shape == (7, 7, 11) + assert split[3].shape == (3, 7, 11) + + # test stratification option + y = np.array([1, 1, 1, 1, 2, 2, 2, 2]) + for test_size, exp_test_size in zip([2, 4, 0.25, 0.5, 0.75], [2, 4, 2, 4, 6]): + train, test = train_test_split( + y, test_size=test_size, stratify=y, random_state=0 + ) + assert len(test) == exp_test_size + assert len(test) + len(train) == len(y) + # check the 1:1 ratio of ones and twos in the data is preserved + assert np.sum(train == 1) == np.sum(train == 2) + + # test unshuffled split + y = np.arange(10) + for test_size in [2, 0.2]: + train, test = train_test_split(y, shuffle=False, test_size=test_size) + assert_array_equal(test, [8, 9]) + assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7]) + + +def test_train_test_split_32bit_overflow(): + """Check for integer overflow on 32-bit platforms. 
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/20774 + """ + + # A number 'n' big enough for expression 'n * n * train_size' to cause + # an overflow for signed 32-bit integer + big_number = 100000 + + # Definition of 'y' is a part of reproduction - population for at least + # one class should be in the same order of magnitude as size of X + X = np.arange(big_number) + y = X > (0.99 * big_number) + + split = train_test_split(X, y, stratify=y, train_size=0.25) + X_train, X_test, y_train, y_test = split + + assert X_train.size + X_test.size == big_number + assert y_train.size + y_test.size == big_number + + +def test_train_test_split_pandas(): + # check train_test_split doesn't destroy pandas dataframe + types = [MockDataFrame] + try: + from pandas import DataFrame + + types.append(DataFrame) + except ImportError: + pass + for InputFeatureType in types: + # X dataframe + X_df = InputFeatureType(X) + X_train, X_test = train_test_split(X_df) + assert isinstance(X_train, InputFeatureType) + assert isinstance(X_test, InputFeatureType) + + +@pytest.mark.parametrize( + "sparse_container", COO_CONTAINERS + CSC_CONTAINERS + CSR_CONTAINERS +) +def test_train_test_split_sparse(sparse_container): + # check that train_test_split converts scipy sparse matrices + # to csr, as stated in the documentation + X = np.arange(100).reshape((10, 10)) + X_s = sparse_container(X) + X_train, X_test = train_test_split(X_s) + assert issparse(X_train) and X_train.format == "csr" + assert issparse(X_test) and X_test.format == "csr" + + +def test_train_test_split_mock_pandas(): + # X mock dataframe + X_df = MockDataFrame(X) + X_train, X_test = train_test_split(X_df) + assert isinstance(X_train, MockDataFrame) + assert isinstance(X_test, MockDataFrame) + X_train_arr, X_test_arr = train_test_split(X_df) + + +def test_train_test_split_list_input(): + # Check that when y is a list / list of string labels, it works. + X = np.ones(7) + y1 = ["1"] * 4 + ["0"] * 3 + y2 = np.hstack((np.ones(4), np.zeros(3))) + y3 = y2.tolist() + + for stratify in (True, False): + X_train1, X_test1, y_train1, y_test1 = train_test_split( + X, y1, stratify=y1 if stratify else None, random_state=0 + ) + X_train2, X_test2, y_train2, y_test2 = train_test_split( + X, y2, stratify=y2 if stratify else None, random_state=0 + ) + X_train3, X_test3, y_train3, y_test3 = train_test_split( + X, y3, stratify=y3 if stratify else None, random_state=0 + ) + + np.testing.assert_equal(X_train1, X_train2) + np.testing.assert_equal(y_train2, y_train3) + np.testing.assert_equal(X_test1, X_test3) + np.testing.assert_equal(y_test3, y_test2) + + +@pytest.mark.parametrize( + "test_size, train_size", + [(2.0, None), (1.0, None), (0.1, 0.95), (None, 1j), (11, None), (10, None), (8, 3)], +) +def test_shufflesplit_errors(test_size, train_size): + with pytest.raises(ValueError): + next(ShuffleSplit(test_size=test_size, train_size=train_size).split(X)) + + +def test_shufflesplit_reproducible(): + # Check that iterating twice on the ShuffleSplit gives the same + # sequence of train-test when the random_state is given + ss = ShuffleSplit(random_state=21) + assert_array_equal([a for a, b in ss.split(X)], [a for a, b in ss.split(X)]) + + +def test_stratifiedshufflesplit_list_input(): + # Check that when y is a list / list of string labels, it works. 
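+ # Illustrative note (added commentary, not part of the original test): stratification
+ # depends only on label identity, not on dtype, so the string labels y1 and the
+ # numeric labels y2/y3 below are expected to yield identical index splits for a
+ # fixed random_state.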
+ sss = StratifiedShuffleSplit(test_size=2, random_state=42) + X = np.ones(7) + y1 = ["1"] * 4 + ["0"] * 3 + y2 = np.hstack((np.ones(4), np.zeros(3))) + y3 = y2.tolist() + + np.testing.assert_equal(list(sss.split(X, y1)), list(sss.split(X, y2))) + np.testing.assert_equal(list(sss.split(X, y3)), list(sss.split(X, y2))) + + +def test_train_test_split_allow_nans(): + # Check that train_test_split allows input data with NaNs + X = np.arange(200, dtype=np.float64).reshape(10, -1) + X[2, :] = np.nan + y = np.repeat([0, 1], X.shape[0] / 2) + train_test_split(X, y, test_size=0.2, random_state=42) + + +def test_check_cv(): + X = np.ones(9) + cv = check_cv(3, classifier=False) + # Use numpy.testing.assert_equal which recursively compares + # lists of lists + np.testing.assert_equal(list(KFold(3).split(X)), list(cv.split(X))) + + y_binary = np.array([0, 1, 0, 1, 0, 0, 1, 1, 1]) + cv = check_cv(3, y_binary, classifier=True) + np.testing.assert_equal( + list(StratifiedKFold(3).split(X, y_binary)), list(cv.split(X, y_binary)) + ) + + y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2]) + cv = check_cv(3, y_multiclass, classifier=True) + np.testing.assert_equal( + list(StratifiedKFold(3).split(X, y_multiclass)), list(cv.split(X, y_multiclass)) + ) + # also works with 2d multiclass + y_multiclass_2d = y_multiclass.reshape(-1, 1) + cv = check_cv(3, y_multiclass_2d, classifier=True) + np.testing.assert_equal( + list(StratifiedKFold(3).split(X, y_multiclass_2d)), + list(cv.split(X, y_multiclass_2d)), + ) + + assert not np.all( + next(StratifiedKFold(3).split(X, y_multiclass_2d))[0] + == next(KFold(3).split(X, y_multiclass_2d))[0] + ) + + X = np.ones(5) + y_multilabel = np.array( + [[0, 0, 0, 0], [0, 1, 1, 0], [0, 0, 0, 1], [1, 1, 0, 1], [0, 0, 1, 0]] + ) + cv = check_cv(3, y_multilabel, classifier=True) + np.testing.assert_equal(list(KFold(3).split(X)), list(cv.split(X))) + + y_multioutput = np.array([[1, 2], [0, 3], [0, 0], [3, 1], [2, 0]]) + cv = check_cv(3, y_multioutput, classifier=True) + np.testing.assert_equal(list(KFold(3).split(X)), list(cv.split(X))) + + with pytest.raises(ValueError): + check_cv(cv="lolo") + + +def test_cv_iterable_wrapper(): + kf_iter = KFold().split(X, y) + kf_iter_wrapped = check_cv(kf_iter) + # Since the wrapped iterable is enlisted and stored, + # split can be called any number of times to produce + # consistent results. 
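+ # Illustrative note (added commentary, not part of the original test): check_cv wraps
+ # a raw (train, test) generator in an iterable wrapper that stores the splits in a
+ # list, which is why the generator exhausted above can still be replayed. A rough
+ # sketch of that behaviour, assuming a plain generator `gen` of index pairs:
+ #     wrapped = check_cv(gen)
+ #     first = list(wrapped.split(X, y))
+ #     second = list(wrapped.split(X, y))  # identical to `first`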
+ np.testing.assert_equal( + list(kf_iter_wrapped.split(X, y)), list(kf_iter_wrapped.split(X, y)) + ) + # If the splits are randomized, successive calls to split yields different + # results + kf_randomized_iter = KFold(shuffle=True, random_state=0).split(X, y) + kf_randomized_iter_wrapped = check_cv(kf_randomized_iter) + # numpy's assert_array_equal properly compares nested lists + np.testing.assert_equal( + list(kf_randomized_iter_wrapped.split(X, y)), + list(kf_randomized_iter_wrapped.split(X, y)), + ) + + try: + splits_are_equal = True + np.testing.assert_equal( + list(kf_iter_wrapped.split(X, y)), + list(kf_randomized_iter_wrapped.split(X, y)), + ) + except AssertionError: + splits_are_equal = False + assert not splits_are_equal, ( + "If the splits are randomized, " + "successive calls to split should yield different results" + ) + + +@pytest.mark.parametrize("kfold", [GroupKFold, StratifiedGroupKFold]) +@pytest.mark.parametrize("shuffle", [True, False]) +def test_group_kfold(kfold, shuffle, global_random_seed): + rng = np.random.RandomState(global_random_seed) + + # Parameters of the test + n_groups = 15 + n_samples = 1000 + n_splits = 5 + + X = y = np.ones(n_samples) + + # Construct the test data + tolerance = 0.05 * n_samples # 5 percent error allowed + groups = rng.randint(0, n_groups, n_samples) + + ideal_n_groups_per_fold = n_samples // n_splits + + len(np.unique(groups)) + # Get the test fold indices from the test set indices of each fold + folds = np.zeros(n_samples) + random_state = None if not shuffle else global_random_seed + lkf = kfold(n_splits=n_splits, shuffle=shuffle, random_state=random_state) + for i, (_, test) in enumerate(lkf.split(X, y, groups)): + folds[test] = i + + # Check that folds have approximately the same size + assert len(folds) == len(groups) + for i in np.unique(folds): + assert tolerance >= abs(sum(folds == i) - ideal_n_groups_per_fold) + + # Check that each group appears only in 1 fold + for group in np.unique(groups): + assert len(np.unique(folds[groups == group])) == 1 + + # Check that no group is on both sides of the split + groups = np.asarray(groups, dtype=object) + for train, test in lkf.split(X, y, groups): + assert len(np.intersect1d(groups[train], groups[test])) == 0 + + # Construct the test data + groups = np.array( + [ + "Albert", + "Jean", + "Bertrand", + "Michel", + "Jean", + "Francis", + "Robert", + "Michel", + "Rachel", + "Lois", + "Michelle", + "Bernard", + "Marion", + "Laura", + "Jean", + "Rachel", + "Franck", + "John", + "Gael", + "Anna", + "Alix", + "Robert", + "Marion", + "David", + "Tony", + "Abel", + "Becky", + "Madmood", + "Cary", + "Mary", + "Alexandre", + "David", + "Francis", + "Barack", + "Abdoul", + "Rasha", + "Xi", + "Silvia", + ] + ) + + n_groups = len(np.unique(groups)) + n_samples = len(groups) + n_splits = 5 + tolerance = 0.05 * n_samples # 5 percent error allowed + ideal_n_groups_per_fold = n_samples // n_splits + + X = y = np.ones(n_samples) + + # Get the test fold indices from the test set indices of each fold + folds = np.zeros(n_samples) + for i, (_, test) in enumerate(lkf.split(X, y, groups)): + folds[test] = i + + # Check that folds have approximately the same size + assert len(folds) == len(groups) + if not shuffle: + for i in np.unique(folds): + assert tolerance >= abs(sum(folds == i) - ideal_n_groups_per_fold) + + # Check that each group appears only in 1 fold + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + for group in np.unique(groups): + assert 
len(np.unique(folds[groups == group])) == 1 + + # Check that no group is on both sides of the split + groups = np.asarray(groups, dtype=object) + for train, test in lkf.split(X, y, groups): + assert len(np.intersect1d(groups[train], groups[test])) == 0 + + # groups can also be a list + # use a new instance for reproducibility when shuffle=True + lkf_copy = kfold(n_splits=n_splits, shuffle=shuffle, random_state=random_state) + cv_iter = list(lkf.split(X, y, groups.tolist())) + for (train1, test1), (train2, test2) in zip(lkf_copy.split(X, y, groups), cv_iter): + assert_array_equal(train1, train2) + assert_array_equal(test1, test2) + + # Should fail if there are more folds than groups + groups = np.array([1, 1, 1, 2, 2]) + X = y = np.ones(len(groups)) + with pytest.raises(ValueError, match="Cannot have number of splits.*greater"): + next(GroupKFold(n_splits=3).split(X, y, groups)) + + +def test_time_series_cv(): + X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14]] + + # Should fail if there are more folds than samples + with pytest.raises(ValueError, match="Cannot have number of folds.*greater"): + next(TimeSeriesSplit(n_splits=7).split(X)) + + tscv = TimeSeriesSplit(2) + + # Manually check that Time Series CV preserves the data + # ordering on toy datasets + splits = tscv.split(X[:-1]) + train, test = next(splits) + assert_array_equal(train, [0, 1]) + assert_array_equal(test, [2, 3]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3]) + assert_array_equal(test, [4, 5]) + + splits = TimeSeriesSplit(2).split(X) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2]) + assert_array_equal(test, [3, 4]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3, 4]) + assert_array_equal(test, [5, 6]) + + # Check get_n_splits returns the correct number of splits + splits = TimeSeriesSplit(2).split(X) + n_splits_actual = len(list(splits)) + assert n_splits_actual == tscv.get_n_splits() + assert n_splits_actual == 2 + + +def _check_time_series_max_train_size(splits, check_splits, max_train_size): + for (train, test), (check_train, check_test) in zip(splits, check_splits): + assert_array_equal(test, check_test) + assert len(check_train) <= max_train_size + suffix_start = max(len(train) - max_train_size, 0) + assert_array_equal(check_train, train[suffix_start:]) + + +def test_time_series_max_train_size(): + X = np.zeros((6, 1)) + splits = TimeSeriesSplit(n_splits=3).split(X) + check_splits = TimeSeriesSplit(n_splits=3, max_train_size=3).split(X) + _check_time_series_max_train_size(splits, check_splits, max_train_size=3) + + # Test for the case where the size of a fold is greater than max_train_size + check_splits = TimeSeriesSplit(n_splits=3, max_train_size=2).split(X) + _check_time_series_max_train_size(splits, check_splits, max_train_size=2) + + # Test for the case where the size of each fold is less than max_train_size + check_splits = TimeSeriesSplit(n_splits=3, max_train_size=5).split(X) + _check_time_series_max_train_size(splits, check_splits, max_train_size=2) + + +def test_time_series_test_size(): + X = np.zeros((10, 1)) + + # Test alone + splits = TimeSeriesSplit(n_splits=3, test_size=3).split(X) + + train, test = next(splits) + assert_array_equal(train, [0]) + assert_array_equal(test, [1, 2, 3]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3]) + assert_array_equal(test, [4, 5, 6]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6]) + assert_array_equal(test, [7, 8, 9]) + + # 
Test with max_train_size + splits = TimeSeriesSplit(n_splits=2, test_size=2, max_train_size=4).split(X) + + train, test = next(splits) + assert_array_equal(train, [2, 3, 4, 5]) + assert_array_equal(test, [6, 7]) + + train, test = next(splits) + assert_array_equal(train, [4, 5, 6, 7]) + assert_array_equal(test, [8, 9]) + + # Should fail with not enough data points for configuration + with pytest.raises(ValueError, match="Too many splits.*with test_size"): + splits = TimeSeriesSplit(n_splits=5, test_size=2).split(X) + next(splits) + + +def test_time_series_gap(): + X = np.zeros((10, 1)) + + # Test alone + splits = TimeSeriesSplit(n_splits=2, gap=2).split(X) + + train, test = next(splits) + assert_array_equal(train, [0, 1]) + assert_array_equal(test, [4, 5, 6]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3, 4]) + assert_array_equal(test, [7, 8, 9]) + + # Test with max_train_size + splits = TimeSeriesSplit(n_splits=3, gap=2, max_train_size=2).split(X) + + train, test = next(splits) + assert_array_equal(train, [0, 1]) + assert_array_equal(test, [4, 5]) + + train, test = next(splits) + assert_array_equal(train, [2, 3]) + assert_array_equal(test, [6, 7]) + + train, test = next(splits) + assert_array_equal(train, [4, 5]) + assert_array_equal(test, [8, 9]) + + # Test with test_size + splits = TimeSeriesSplit(n_splits=2, gap=2, max_train_size=4, test_size=2).split(X) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3]) + assert_array_equal(test, [6, 7]) + + train, test = next(splits) + assert_array_equal(train, [2, 3, 4, 5]) + assert_array_equal(test, [8, 9]) + + # Test with additional test_size + splits = TimeSeriesSplit(n_splits=2, gap=2, test_size=3).split(X) + + train, test = next(splits) + assert_array_equal(train, [0, 1]) + assert_array_equal(test, [4, 5, 6]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3, 4]) + assert_array_equal(test, [7, 8, 9]) + + # Verify proper error is thrown + with pytest.raises(ValueError, match="Too many splits.*and gap"): + splits = TimeSeriesSplit(n_splits=4, gap=2).split(X) + next(splits) + + +@ignore_warnings +def test_nested_cv(): + # Test if nested cross validation works with different combinations of cv + rng = np.random.RandomState(0) + + X, y = make_classification(n_samples=15, n_classes=2, random_state=0) + groups = rng.randint(0, 5, 15) + + cvs = [ + LeaveOneGroupOut(), + StratifiedKFold(n_splits=2), + LeaveOneOut(), + GroupKFold(n_splits=3), + StratifiedKFold(), + StratifiedGroupKFold(), + StratifiedShuffleSplit(n_splits=3, random_state=0), + ] + + for inner_cv, outer_cv in combinations_with_replacement(cvs, 2): + gs = GridSearchCV( + DummyClassifier(), + param_grid={"strategy": ["stratified", "most_frequent"]}, + cv=inner_cv, + error_score="raise", + ) + cross_val_score( + gs, X=X, y=y, groups=groups, cv=outer_cv, params={"groups": groups} + ) + + +def test_build_repr(): + class MockSplitter: + def __init__(self, a, b=0, c=None): + self.a = a + self.b = b + self.c = c + + def __repr__(self): + return _build_repr(self) + + assert repr(MockSplitter(5, 6)) == "MockSplitter(a=5, b=6, c=None)" + + +@pytest.mark.parametrize( + "CVSplitter", (ShuffleSplit, GroupShuffleSplit, StratifiedShuffleSplit) +) +def test_shuffle_split_empty_trainset(CVSplitter): + cv = CVSplitter(test_size=0.99) + X, y = [[1]], [0] # 1 sample + with pytest.raises( + ValueError, + match=( + "With n_samples=1, test_size=0.99 and train_size=None, " + "the resulting train set will be empty" + ), + ): + next(_split(cv, X, y, 
groups=[1])) + + +def test_train_test_split_empty_trainset(): + (X,) = [[1]] # 1 sample + with pytest.raises( + ValueError, + match=( + "With n_samples=1, test_size=0.99 and train_size=None, " + "the resulting train set will be empty" + ), + ): + train_test_split(X, test_size=0.99) + + X = [[1], [1], [1]] # 3 samples, ask for more than 2 thirds + with pytest.raises( + ValueError, + match=( + "With n_samples=3, test_size=0.67 and train_size=None, " + "the resulting train set will be empty" + ), + ): + train_test_split(X, test_size=0.67) + + +def test_leave_one_out_empty_trainset(): + # LeaveOneGroup out expect at least 2 groups so no need to check + cv = LeaveOneOut() + X, y = [[1]], [0] # 1 sample + with pytest.raises(ValueError, match="Cannot perform LeaveOneOut with n_samples=1"): + next(cv.split(X, y)) + + +def test_leave_p_out_empty_trainset(): + # No need to check LeavePGroupsOut + cv = LeavePOut(p=2) + X, y = [[1], [2]], [0, 3] # 2 samples + with pytest.raises( + ValueError, match="p=2 must be strictly less than the number of samples=2" + ): + next(cv.split(X, y)) + + +@pytest.mark.parametrize( + "Klass", (KFold, StratifiedKFold, StratifiedGroupKFold, GroupKFold) +) +def test_random_state_shuffle_false(Klass): + # passing a non-default random_state when shuffle=False makes no sense + with pytest.raises(ValueError, match="has no effect since shuffle is False"): + Klass(3, shuffle=False, random_state=0) + + +@pytest.mark.parametrize( + "cv, expected", + [ + (KFold(), True), + (KFold(shuffle=True, random_state=123), True), + (StratifiedKFold(), True), + (StratifiedKFold(shuffle=True, random_state=123), True), + (StratifiedGroupKFold(shuffle=True, random_state=123), True), + (StratifiedGroupKFold(), True), + (RepeatedKFold(random_state=123), True), + (RepeatedStratifiedKFold(random_state=123), True), + (ShuffleSplit(random_state=123), True), + (GroupShuffleSplit(random_state=123), True), + (StratifiedShuffleSplit(random_state=123), True), + (GroupKFold(), True), + (GroupKFold(shuffle=True, random_state=123), True), + (TimeSeriesSplit(), True), + (LeaveOneOut(), True), + (LeaveOneGroupOut(), True), + (LeavePGroupsOut(n_groups=2), True), + (LeavePOut(p=2), True), + (KFold(shuffle=True, random_state=None), False), + (KFold(shuffle=True, random_state=None), False), + (StratifiedKFold(shuffle=True, random_state=np.random.RandomState(0)), False), + (StratifiedKFold(shuffle=True, random_state=np.random.RandomState(0)), False), + (RepeatedKFold(random_state=None), False), + (RepeatedKFold(random_state=np.random.RandomState(0)), False), + (RepeatedStratifiedKFold(random_state=None), False), + (RepeatedStratifiedKFold(random_state=np.random.RandomState(0)), False), + (ShuffleSplit(random_state=None), False), + (ShuffleSplit(random_state=np.random.RandomState(0)), False), + (GroupShuffleSplit(random_state=None), False), + (GroupShuffleSplit(random_state=np.random.RandomState(0)), False), + (StratifiedShuffleSplit(random_state=None), False), + (StratifiedShuffleSplit(random_state=np.random.RandomState(0)), False), + ], +) +def test_yields_constant_splits(cv, expected): + assert _yields_constant_splits(cv) == expected + + +@pytest.mark.parametrize("cv", ALL_SPLITTERS, ids=[str(cv) for cv in ALL_SPLITTERS]) +def test_splitter_get_metadata_routing(cv): + """Check get_metadata_routing returns the correct MetadataRouter.""" + assert hasattr(cv, "get_metadata_routing") + metadata = cv.get_metadata_routing() + if cv in GROUP_SPLITTERS: + assert metadata.split.requests["groups"] is True + elif cv in 
NO_GROUP_SPLITTERS: + assert not metadata.split.requests + + assert_request_is_empty(metadata, exclude=["split"]) + + +@pytest.mark.parametrize("cv", ALL_SPLITTERS, ids=[str(cv) for cv in ALL_SPLITTERS]) +def test_splitter_set_split_request(cv): + """Check set_split_request is defined for group splitters and not for others.""" + if cv in GROUP_SPLITTERS: + assert hasattr(cv, "set_split_request") + elif cv in NO_GROUP_SPLITTERS: + assert not hasattr(cv, "set_split_request") + + +@pytest.mark.parametrize("cv", NO_GROUP_SPLITTERS, ids=str) +def test_no_group_splitters_warns_with_groups(cv): + msg = f"The groups parameter is ignored by {cv.__class__.__name__}" + + n_samples = 30 + rng = np.random.RandomState(1) + X = rng.randint(0, 3, size=(n_samples, 2)) + y = rng.randint(0, 3, size=(n_samples,)) + groups = rng.randint(0, 3, size=(n_samples,)) + + with pytest.warns(UserWarning, match=msg): + cv.split(X, y, groups=groups) + + +@pytest.mark.parametrize( + "cv", SPLITTERS_REQUIRING_TARGET, ids=[str(cv) for cv in SPLITTERS_REQUIRING_TARGET] +) +def test_stratified_splitter_without_y(cv): + msg = "missing 1 required positional argument: 'y'" + with pytest.raises(TypeError, match=msg): + cv.split(X) diff --git a/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/test_successive_halving.py b/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/test_successive_halving.py new file mode 100644 index 0000000000000000000000000000000000000000..bdfab45b4f7ca337ce7e3ce92df517a00bc42a8e --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/test_successive_halving.py @@ -0,0 +1,853 @@ +from math import ceil + +import numpy as np +import pytest +from scipy.stats import expon, norm, randint + +from sklearn.datasets import make_classification +from sklearn.dummy import DummyClassifier +from sklearn.experimental import enable_halving_search_cv # noqa: F401 +from sklearn.model_selection import ( + GroupKFold, + GroupShuffleSplit, + HalvingGridSearchCV, + HalvingRandomSearchCV, + KFold, + LeaveOneGroupOut, + LeavePGroupsOut, + ShuffleSplit, + StratifiedKFold, + StratifiedShuffleSplit, +) +from sklearn.model_selection._search_successive_halving import ( + _SubsampleMetaSplitter, + _top_k, +) +from sklearn.model_selection.tests.test_search import ( + check_cv_results_array_types, + check_cv_results_keys, +) +from sklearn.svm import SVC, LinearSVC + + +class FastClassifier(DummyClassifier): + """Dummy classifier that accepts parameters a, b, ... z. 
+ + These parameter don't affect the predictions and are useful for fast + grid searching.""" + + # update the constraints such that we accept all parameters from a to z + _parameter_constraints: dict = { + **DummyClassifier._parameter_constraints, + **{chr(key): "no_validation" for key in range(ord("a"), ord("z") + 1)}, + } + + def __init__( + self, strategy="stratified", random_state=None, constant=None, **kwargs + ): + super().__init__( + strategy=strategy, random_state=random_state, constant=constant + ) + + def get_params(self, deep=False): + params = super().get_params(deep=deep) + for char in range(ord("a"), ord("z") + 1): + params[chr(char)] = "whatever" + return params + + +class SometimesFailClassifier(DummyClassifier): + def __init__( + self, + strategy="stratified", + random_state=None, + constant=None, + n_estimators=10, + fail_fit=False, + fail_predict=False, + a=0, + ): + self.fail_fit = fail_fit + self.fail_predict = fail_predict + self.n_estimators = n_estimators + self.a = a + + super().__init__( + strategy=strategy, random_state=random_state, constant=constant + ) + + def fit(self, X, y): + if self.fail_fit: + raise Exception("fitting failed") + return super().fit(X, y) + + def predict(self, X): + if self.fail_predict: + raise Exception("predict failed") + return super().predict(X) + + +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.FitFailedWarning") +@pytest.mark.filterwarnings("ignore:Scoring failed:UserWarning") +@pytest.mark.filterwarnings("ignore:One or more of the:UserWarning") +@pytest.mark.parametrize("HalvingSearch", (HalvingGridSearchCV, HalvingRandomSearchCV)) +@pytest.mark.parametrize("fail_at", ("fit", "predict")) +def test_nan_handling(HalvingSearch, fail_at): + """Check the selection of the best scores in presence of failure represented by + NaN values.""" + n_samples = 1_000 + X, y = make_classification(n_samples=n_samples, random_state=0) + + search = HalvingSearch( + SometimesFailClassifier(), + {f"fail_{fail_at}": [False, True], "a": range(3)}, + resource="n_estimators", + max_resources=6, + min_resources=1, + factor=2, + ) + + search.fit(X, y) + + # estimators that failed during fit/predict should always rank lower + # than ones where the fit/predict succeeded + assert not search.best_params_[f"fail_{fail_at}"] + scores = search.cv_results_["mean_test_score"] + ranks = search.cv_results_["rank_test_score"] + + # some scores should be NaN + assert np.isnan(scores).any() + + unique_nan_ranks = np.unique(ranks[np.isnan(scores)]) + # all NaN scores should have the same rank + assert unique_nan_ranks.shape[0] == 1 + # NaNs should have the lowest rank + assert (unique_nan_ranks[0] >= ranks).all() + + +@pytest.mark.parametrize("Est", (HalvingGridSearchCV, HalvingRandomSearchCV)) +@pytest.mark.parametrize( + ( + "aggressive_elimination," + "max_resources," + "expected_n_iterations," + "expected_n_required_iterations," + "expected_n_possible_iterations," + "expected_n_remaining_candidates," + "expected_n_candidates," + "expected_n_resources," + ), + [ + # notice how it loops at the beginning + # also, the number of candidates evaluated at the last iteration is + # <= factor + (True, "limited", 4, 4, 3, 1, [60, 20, 7, 3], [20, 20, 60, 180]), + # no aggressive elimination: we end up with less iterations, and + # the number of candidates at the last iter is > factor, which isn't + # ideal + (False, "limited", 3, 4, 3, 3, [60, 20, 7], [20, 60, 180]), + # # When the amount of resource isn't limited, aggressive_elimination + # # has no effect. 
Here the default min_resources='exhaust' will take + # # over. + (True, "unlimited", 4, 4, 4, 1, [60, 20, 7, 3], [37, 111, 333, 999]), + (False, "unlimited", 4, 4, 4, 1, [60, 20, 7, 3], [37, 111, 333, 999]), + ], +) +def test_aggressive_elimination( + Est, + aggressive_elimination, + max_resources, + expected_n_iterations, + expected_n_required_iterations, + expected_n_possible_iterations, + expected_n_remaining_candidates, + expected_n_candidates, + expected_n_resources, +): + # Test the aggressive_elimination parameter. + + n_samples = 1000 + X, y = make_classification(n_samples=n_samples, random_state=0) + param_grid = {"a": ("l1", "l2"), "b": list(range(30))} + base_estimator = FastClassifier() + + if max_resources == "limited": + max_resources = 180 + else: + max_resources = n_samples + + sh = Est( + base_estimator, + param_grid, + aggressive_elimination=aggressive_elimination, + max_resources=max_resources, + factor=3, + ) + sh.set_params(verbose=True) # just for test coverage + + if Est is HalvingRandomSearchCV: + # same number of candidates as with the grid + sh.set_params(n_candidates=2 * 30, min_resources="exhaust") + + sh.fit(X, y) + + assert sh.n_iterations_ == expected_n_iterations + assert sh.n_required_iterations_ == expected_n_required_iterations + assert sh.n_possible_iterations_ == expected_n_possible_iterations + assert sh.n_resources_ == expected_n_resources + assert sh.n_candidates_ == expected_n_candidates + assert sh.n_remaining_candidates_ == expected_n_remaining_candidates + assert ceil(sh.n_candidates_[-1] / sh.factor) == sh.n_remaining_candidates_ + + +@pytest.mark.parametrize("Est", (HalvingGridSearchCV, HalvingRandomSearchCV)) +@pytest.mark.parametrize( + ( + "min_resources," + "max_resources," + "expected_n_iterations," + "expected_n_possible_iterations," + "expected_n_resources," + ), + [ + # with enough resources + ("smallest", "auto", 2, 4, [20, 60]), + # with enough resources but min_resources set manually + (50, "auto", 2, 3, [50, 150]), + # without enough resources, only one iteration can be done + ("smallest", 30, 1, 1, [20]), + # with exhaust: use as much resources as possible at the last iter + ("exhaust", "auto", 2, 2, [333, 999]), + ("exhaust", 1000, 2, 2, [333, 999]), + ("exhaust", 999, 2, 2, [333, 999]), + ("exhaust", 600, 2, 2, [200, 600]), + ("exhaust", 599, 2, 2, [199, 597]), + ("exhaust", 300, 2, 2, [100, 300]), + ("exhaust", 60, 2, 2, [20, 60]), + ("exhaust", 50, 1, 1, [20]), + ("exhaust", 20, 1, 1, [20]), + ], +) +def test_min_max_resources( + Est, + min_resources, + max_resources, + expected_n_iterations, + expected_n_possible_iterations, + expected_n_resources, +): + # Test the min_resources and max_resources parameters, and how they affect + # the number of resources used at each iteration + n_samples = 1000 + X, y = make_classification(n_samples=n_samples, random_state=0) + param_grid = {"a": [1, 2], "b": [1, 2, 3]} + base_estimator = FastClassifier() + + sh = Est( + base_estimator, + param_grid, + factor=3, + min_resources=min_resources, + max_resources=max_resources, + ) + if Est is HalvingRandomSearchCV: + sh.set_params(n_candidates=6) # same number as with the grid + + sh.fit(X, y) + + expected_n_required_iterations = 2 # given 6 combinations and factor = 3 + assert sh.n_iterations_ == expected_n_iterations + assert sh.n_required_iterations_ == expected_n_required_iterations + assert sh.n_possible_iterations_ == expected_n_possible_iterations + assert sh.n_resources_ == expected_n_resources + if min_resources == "exhaust": + assert 
sh.n_possible_iterations_ == sh.n_iterations_ == len(sh.n_resources_) + + +@pytest.mark.parametrize("Est", (HalvingRandomSearchCV, HalvingGridSearchCV)) +@pytest.mark.parametrize( + "max_resources, n_iterations, n_possible_iterations", + [ + ("auto", 5, 9), # all resources are used + (1024, 5, 9), + (700, 5, 8), + (512, 5, 8), + (511, 5, 7), + (32, 4, 4), + (31, 3, 3), + (16, 3, 3), + (4, 1, 1), # max_resources == min_resources, only one iteration is + # possible + ], +) +def test_n_iterations(Est, max_resources, n_iterations, n_possible_iterations): + # test the number of actual iterations that were run depending on + # max_resources + + n_samples = 1024 + X, y = make_classification(n_samples=n_samples, random_state=1) + param_grid = {"a": [1, 2], "b": list(range(10))} + base_estimator = FastClassifier() + factor = 2 + + sh = Est( + base_estimator, + param_grid, + cv=2, + factor=factor, + max_resources=max_resources, + min_resources=4, + ) + if Est is HalvingRandomSearchCV: + sh.set_params(n_candidates=20) # same as for HalvingGridSearchCV + sh.fit(X, y) + assert sh.n_required_iterations_ == 5 + assert sh.n_iterations_ == n_iterations + assert sh.n_possible_iterations_ == n_possible_iterations + + +@pytest.mark.parametrize("Est", (HalvingRandomSearchCV, HalvingGridSearchCV)) +def test_resource_parameter(Est): + # Test the resource parameter + + n_samples = 1000 + X, y = make_classification(n_samples=n_samples, random_state=0) + param_grid = {"a": [1, 2], "b": list(range(10))} + base_estimator = FastClassifier() + sh = Est(base_estimator, param_grid, cv=2, resource="c", max_resources=10, factor=3) + sh.fit(X, y) + assert set(sh.n_resources_) == set([1, 3, 9]) + for r_i, params, param_c in zip( + sh.cv_results_["n_resources"], + sh.cv_results_["params"], + sh.cv_results_["param_c"], + ): + assert r_i == params["c"] == param_c + + with pytest.raises( + ValueError, match="Cannot use resource=1234 which is not supported " + ): + sh = HalvingGridSearchCV( + base_estimator, param_grid, cv=2, resource="1234", max_resources=10 + ) + sh.fit(X, y) + + with pytest.raises( + ValueError, + match=( + "Cannot use parameter c as the resource since it is part " + "of the searched parameters." 
+ ), + ): + param_grid = {"a": [1, 2], "b": [1, 2], "c": [1, 3]} + sh = HalvingGridSearchCV( + base_estimator, param_grid, cv=2, resource="c", max_resources=10 + ) + sh.fit(X, y) + + +@pytest.mark.parametrize( + "max_resources, n_candidates, expected_n_candidates", + [ + (512, "exhaust", 128), # generate exactly as much as needed + (32, "exhaust", 8), + (32, 8, 8), + (32, 7, 7), # ask for less than what we could + (32, 9, 9), # ask for more than 'reasonable' + ], +) +def test_random_search(max_resources, n_candidates, expected_n_candidates): + # Test random search and make sure the number of generated candidates is + # as expected + + n_samples = 1024 + X, y = make_classification(n_samples=n_samples, random_state=0) + param_grid = {"a": norm, "b": norm} + base_estimator = FastClassifier() + sh = HalvingRandomSearchCV( + base_estimator, + param_grid, + n_candidates=n_candidates, + cv=2, + max_resources=max_resources, + factor=2, + min_resources=4, + ) + sh.fit(X, y) + assert sh.n_candidates_[0] == expected_n_candidates + if n_candidates == "exhaust": + # Make sure 'exhaust' makes the last iteration use as much resources as + # we can + assert sh.n_resources_[-1] == max_resources + + +@pytest.mark.parametrize( + "param_distributions, expected_n_candidates", + [ + ({"a": [1, 2]}, 2), # all lists, sample less than n_candidates + ({"a": randint(1, 3)}, 10), # not all list, respect n_candidates + ], +) +def test_random_search_discrete_distributions( + param_distributions, expected_n_candidates +): + # Make sure random search samples the appropriate number of candidates when + # we ask for more than what's possible. How many parameters are sampled + # depends whether the distributions are 'all lists' or not (see + # ParameterSampler for details). This is somewhat redundant with the checks + # in ParameterSampler but interaction bugs were discovered during + # development of SH + + n_samples = 1024 + X, y = make_classification(n_samples=n_samples, random_state=0) + base_estimator = FastClassifier() + sh = HalvingRandomSearchCV(base_estimator, param_distributions, n_candidates=10) + sh.fit(X, y) + assert sh.n_candidates_[0] == expected_n_candidates + + +@pytest.mark.parametrize("Est", (HalvingGridSearchCV, HalvingRandomSearchCV)) +@pytest.mark.parametrize( + "params, expected_error_message", + [ + ( + {"resource": "not_a_parameter"}, + "Cannot use resource=not_a_parameter which is not supported", + ), + ( + {"resource": "a", "max_resources": 100}, + "Cannot use parameter a as the resource since it is part of", + ), + ( + {"max_resources": "auto", "resource": "b"}, + "resource can only be 'n_samples' when max_resources='auto'", + ), + ( + {"min_resources": 15, "max_resources": 14}, + "min_resources_=15 is greater than max_resources_=14", + ), + ({"cv": KFold(shuffle=True)}, "must yield consistent folds"), + ({"cv": ShuffleSplit()}, "must yield consistent folds"), + ], +) +def test_input_errors(Est, params, expected_error_message): + base_estimator = FastClassifier() + param_grid = {"a": [1]} + X, y = make_classification(100) + + sh = Est(base_estimator, param_grid, **params) + + with pytest.raises(ValueError, match=expected_error_message): + sh.fit(X, y) + + +@pytest.mark.parametrize( + "params, expected_error_message", + [ + ( + {"n_candidates": "exhaust", "min_resources": "exhaust"}, + "cannot be both set to 'exhaust'", + ), + ], +) +def test_input_errors_randomized(params, expected_error_message): + # tests specific to HalvingRandomSearchCV + + base_estimator = FastClassifier() + param_grid = 
{"a": [1]} + X, y = make_classification(100) + + sh = HalvingRandomSearchCV(base_estimator, param_grid, **params) + + with pytest.raises(ValueError, match=expected_error_message): + sh.fit(X, y) + + +@pytest.mark.parametrize( + "fraction, subsample_test, expected_train_size, expected_test_size", + [ + (0.5, True, 40, 10), + (0.5, False, 40, 20), + (0.2, True, 16, 4), + (0.2, False, 16, 20), + ], +) +def test_subsample_splitter_shapes( + fraction, subsample_test, expected_train_size, expected_test_size +): + # Make sure splits returned by SubsampleMetaSplitter are of appropriate + # size + + n_samples = 100 + X, y = make_classification(n_samples) + cv = _SubsampleMetaSplitter( + base_cv=KFold(5), + fraction=fraction, + subsample_test=subsample_test, + random_state=None, + ) + + for train, test in cv.split(X, y): + assert train.shape[0] == expected_train_size + assert test.shape[0] == expected_test_size + if subsample_test: + assert train.shape[0] + test.shape[0] == int(n_samples * fraction) + else: + assert test.shape[0] == n_samples // cv.base_cv.get_n_splits() + + +@pytest.mark.parametrize("subsample_test", (True, False)) +def test_subsample_splitter_determinism(subsample_test): + # Make sure _SubsampleMetaSplitter is consistent across calls to split(): + # - we're OK having training sets differ (they're always sampled with a + # different fraction anyway) + # - when we don't subsample the test set, we want it to be always the same. + # This check is the most important. This is ensured by the determinism + # of the base_cv. + + # Note: we could force both train and test splits to be always the same if + # we drew an int seed in _SubsampleMetaSplitter.__init__ + + n_samples = 100 + X, y = make_classification(n_samples) + cv = _SubsampleMetaSplitter( + base_cv=KFold(5), fraction=0.5, subsample_test=subsample_test, random_state=None + ) + + folds_a = list(cv.split(X, y, groups=None)) + folds_b = list(cv.split(X, y, groups=None)) + + for (train_a, test_a), (train_b, test_b) in zip(folds_a, folds_b): + assert not np.all(train_a == train_b) + + if subsample_test: + assert not np.all(test_a == test_b) + else: + assert np.all(test_a == test_b) + assert np.all(X[test_a] == X[test_b]) + + +@pytest.mark.parametrize( + "k, itr, expected", + [ + (1, 0, ["c"]), + (2, 0, ["a", "c"]), + (4, 0, ["d", "b", "a", "c"]), + (10, 0, ["d", "b", "a", "c"]), + (1, 1, ["e"]), + (2, 1, ["f", "e"]), + (10, 1, ["f", "e"]), + (1, 2, ["i"]), + (10, 2, ["g", "h", "i"]), + ], +) +def test_top_k(k, itr, expected): + results = { # this isn't a 'real world' result dict + "iter": [0, 0, 0, 0, 1, 1, 2, 2, 2], + "mean_test_score": [4, 3, 5, 1, 11, 10, 5, 6, 9], + "params": ["a", "b", "c", "d", "e", "f", "g", "h", "i"], + } + got = _top_k(results, k=k, itr=itr) + assert np.all(got == expected) + + +@pytest.mark.parametrize("Est", (HalvingRandomSearchCV, HalvingGridSearchCV)) +def test_cv_results(Est): + # test that the cv_results_ matches correctly the logic of the + # tournament: in particular that the candidates continued in each + # successive iteration are those that were best in the previous iteration + pd = pytest.importorskip("pandas") + + rng = np.random.RandomState(0) + + n_samples = 1000 + X, y = make_classification(n_samples=n_samples, random_state=0) + param_grid = {"a": ("l1", "l2"), "b": list(range(30))} + base_estimator = FastClassifier() + + # generate random scores: we want to avoid ties, which would otherwise + # mess with the ordering and make testing harder + def scorer(est, X, y): + return rng.rand() + + sh 
= Est(base_estimator, param_grid, factor=2, scoring=scorer) + if Est is HalvingRandomSearchCV: + # same number of candidates as with the grid + sh.set_params(n_candidates=2 * 30, min_resources="exhaust") + + sh.fit(X, y) + + # non-regression check for + # https://github.com/scikit-learn/scikit-learn/issues/19203 + assert isinstance(sh.cv_results_["iter"], np.ndarray) + assert isinstance(sh.cv_results_["n_resources"], np.ndarray) + + cv_results_df = pd.DataFrame(sh.cv_results_) + + # just make sure we don't have ties + assert len(cv_results_df["mean_test_score"].unique()) == len(cv_results_df) + + cv_results_df["params_str"] = cv_results_df["params"].apply(str) + table = cv_results_df.pivot( + index="params_str", columns="iter", values="mean_test_score" + ) + + # table looks like something like this: + # iter 0 1 2 3 4 5 + # params_str + # {'a': 'l2', 'b': 23} 0.75 NaN NaN NaN NaN NaN + # {'a': 'l1', 'b': 30} 0.90 0.875 NaN NaN NaN NaN + # {'a': 'l1', 'b': 0} 0.75 NaN NaN NaN NaN NaN + # {'a': 'l2', 'b': 3} 0.85 0.925 0.9125 0.90625 NaN NaN + # {'a': 'l1', 'b': 5} 0.80 NaN NaN NaN NaN NaN + # ... + + # where a NaN indicates that the candidate wasn't evaluated at a given + # iteration, because it wasn't part of the top-K at some previous + # iteration. We here make sure that candidates that aren't in the top-k at + # any given iteration are indeed not evaluated at the subsequent + # iterations. + nan_mask = pd.isna(table) + n_iter = sh.n_iterations_ + for it in range(n_iter - 1): + already_discarded_mask = nan_mask[it] + + # make sure that if a candidate is already discarded, we don't evaluate + # it later + assert ( + already_discarded_mask & nan_mask[it + 1] == already_discarded_mask + ).all() + + # make sure that the number of discarded candidate is correct + discarded_now_mask = ~already_discarded_mask & nan_mask[it + 1] + kept_mask = ~already_discarded_mask & ~discarded_now_mask + assert kept_mask.sum() == sh.n_candidates_[it + 1] + + # make sure that all discarded candidates have a lower score than the + # kept candidates + discarded_max_score = table[it].where(discarded_now_mask).max() + kept_min_score = table[it].where(kept_mask).min() + assert discarded_max_score < kept_min_score + + # We now make sure that the best candidate is chosen only from the last + # iteration. + # We also make sure this is true even if there were higher scores in + # earlier rounds (this isn't generally the case, but worth ensuring it's + # possible). + + last_iter = cv_results_df["iter"].max() + idx_best_last_iter = cv_results_df[cv_results_df["iter"] == last_iter][ + "mean_test_score" + ].idxmax() + idx_best_all_iters = cv_results_df["mean_test_score"].idxmax() + + assert sh.best_params_ == cv_results_df.iloc[idx_best_last_iter]["params"] + assert ( + cv_results_df.iloc[idx_best_last_iter]["mean_test_score"] + < cv_results_df.iloc[idx_best_all_iters]["mean_test_score"] + ) + assert ( + cv_results_df.iloc[idx_best_last_iter]["params"] + != cv_results_df.iloc[idx_best_all_iters]["params"] + ) + + +@pytest.mark.parametrize("Est", (HalvingGridSearchCV, HalvingRandomSearchCV)) +def test_base_estimator_inputs(Est): + # make sure that the base estimators are passed the correct parameters and + # number of samples at each iteration. 
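+ # Illustrative note (added commentary, not part of the original test): with factor=2,
+ # successive halving keeps roughly the best half of the candidates at each iteration
+ # while doubling the per-candidate budget (here, the number of samples), e.g.
+ # 60 -> 30 -> 15 -> ... candidates; the exact resource counts depend on
+ # min_resources/max_resources. The bookkeeping subclass below records the sample
+ # counts seen by fit/predict so this can be checked against cv_results_.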
+ pd = pytest.importorskip("pandas") + + passed_n_samples_fit = [] + passed_n_samples_predict = [] + passed_params = [] + + class FastClassifierBookKeeping(FastClassifier): + def fit(self, X, y): + passed_n_samples_fit.append(X.shape[0]) + return super().fit(X, y) + + def predict(self, X): + passed_n_samples_predict.append(X.shape[0]) + return super().predict(X) + + def set_params(self, **params): + passed_params.append(params) + return super().set_params(**params) + + n_samples = 1024 + n_splits = 2 + X, y = make_classification(n_samples=n_samples, random_state=0) + param_grid = {"a": ("l1", "l2"), "b": list(range(30))} + base_estimator = FastClassifierBookKeeping() + + sh = Est( + base_estimator, + param_grid, + factor=2, + cv=n_splits, + return_train_score=False, + refit=False, + ) + if Est is HalvingRandomSearchCV: + # same number of candidates as with the grid + sh.set_params(n_candidates=2 * 30, min_resources="exhaust") + + sh.fit(X, y) + + assert len(passed_n_samples_fit) == len(passed_n_samples_predict) + passed_n_samples = [ + x + y for (x, y) in zip(passed_n_samples_fit, passed_n_samples_predict) + ] + + # Lists are of length n_splits * n_iter * n_candidates_at_i. + # Each chunk of size n_splits corresponds to the n_splits folds for the + # same candidate at the same iteration, so they contain equal values. We + # subsample such that the lists are of length n_iter * n_candidates_at_it + passed_n_samples = passed_n_samples[::n_splits] + passed_params = passed_params[::n_splits] + + cv_results_df = pd.DataFrame(sh.cv_results_) + + assert len(passed_params) == len(passed_n_samples) == len(cv_results_df) + + uniques, counts = np.unique(passed_n_samples, return_counts=True) + assert (sh.n_resources_ == uniques).all() + assert (sh.n_candidates_ == counts).all() + + assert (cv_results_df["params"] == passed_params).all() + assert (cv_results_df["n_resources"] == passed_n_samples).all() + + +@pytest.mark.parametrize("Est", (HalvingGridSearchCV, HalvingRandomSearchCV)) +def test_groups_support(Est): + # Check if ValueError (when groups is None) propagates to + # HalvingGridSearchCV and HalvingRandomSearchCV + # And also check if groups is correctly passed to the cv object + rng = np.random.RandomState(0) + + X, y = make_classification(n_samples=50, n_classes=2, random_state=0) + groups = rng.randint(0, 3, 50) + + clf = LinearSVC(random_state=0) + grid = {"C": [1]} + + group_cvs = [ + LeaveOneGroupOut(), + LeavePGroupsOut(2), + GroupKFold(n_splits=3), + GroupShuffleSplit(random_state=0), + ] + error_msg = "The 'groups' parameter should not be None." + for cv in group_cvs: + gs = Est(clf, grid, cv=cv, random_state=0) + with pytest.raises(ValueError, match=error_msg): + gs.fit(X, y) + gs.fit(X, y, groups=groups) + + non_group_cvs = [StratifiedKFold(), StratifiedShuffleSplit(random_state=0)] + for cv in non_group_cvs: + gs = Est(clf, grid, cv=cv) + # Should not raise an error + gs.fit(X, y) + + +@pytest.mark.parametrize("SearchCV", [HalvingRandomSearchCV, HalvingGridSearchCV]) +def test_min_resources_null(SearchCV): + """Check that we raise an error if the minimum resources is set to 0.""" + base_estimator = FastClassifier() + param_grid = {"a": [1]} + X = np.empty(0).reshape(0, 3) + + search = SearchCV(base_estimator, param_grid, min_resources="smallest") + + err_msg = "min_resources_=0: you might have passed an empty dataset X." 
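+ # Illustrative note (added commentary; the exact heuristic described here is an
+ # assumption): with min_resources="smallest" the starting budget is derived from the
+ # cross-validation layout and, for classifiers, the number of classes (roughly
+ # 2 * n_splits * n_classes), so an empty dataset leaves min_resources_ at 0 and
+ # fit() is expected to raise the error matched below.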
+ with pytest.raises(ValueError, match=err_msg): + search.fit(X, []) + + +@pytest.mark.parametrize("SearchCV", [HalvingGridSearchCV, HalvingRandomSearchCV]) +def test_select_best_index(SearchCV): + """Check the selection strategy of the halving search.""" + results = { # this isn't a 'real world' result dict + "iter": np.array([0, 0, 0, 0, 1, 1, 2, 2, 2]), + "mean_test_score": np.array([4, 3, 5, 1, 11, 10, 5, 6, 9]), + "params": np.array(["a", "b", "c", "d", "e", "f", "g", "h", "i"]), + } + + # we expect the index of 'i' + best_index = SearchCV._select_best_index(None, None, results) + assert best_index == 8 + + +def test_halving_random_search_list_of_dicts(): + """Check the behaviour of the `HalvingRandomSearchCV` with `param_distribution` + being a list of dictionary. + """ + X, y = make_classification(n_samples=150, n_features=4, random_state=42) + + params = [ + {"kernel": ["rbf"], "C": expon(scale=10), "gamma": expon(scale=0.1)}, + {"kernel": ["poly"], "degree": [2, 3]}, + ] + param_keys = ( + "param_C", + "param_degree", + "param_gamma", + "param_kernel", + ) + score_keys = ( + "mean_test_score", + "mean_train_score", + "rank_test_score", + "split0_test_score", + "split1_test_score", + "split2_test_score", + "split0_train_score", + "split1_train_score", + "split2_train_score", + "std_test_score", + "std_train_score", + "mean_fit_time", + "std_fit_time", + "mean_score_time", + "std_score_time", + ) + extra_keys = ("n_resources", "iter") + + search = HalvingRandomSearchCV( + SVC(), cv=3, param_distributions=params, return_train_score=True, random_state=0 + ) + search.fit(X, y) + n_candidates = sum(search.n_candidates_) + cv_results = search.cv_results_ + # Check results structure + check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates, extra_keys) + expected_cv_results_kinds = { + "param_C": "f", + "param_degree": "i", + "param_gamma": "f", + "param_kernel": "O", + } + check_cv_results_array_types( + search, param_keys, score_keys, expected_cv_results_kinds + ) + + assert all( + ( + cv_results["param_C"].mask[i] + and cv_results["param_gamma"].mask[i] + and not cv_results["param_degree"].mask[i] + ) + for i in range(n_candidates) + if cv_results["param_kernel"][i] == "poly" + ) + assert all( + ( + not cv_results["param_C"].mask[i] + and not cv_results["param_gamma"].mask[i] + and cv_results["param_degree"].mask[i] + ) + for i in range(n_candidates) + if cv_results["param_kernel"][i] == "rbf" + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/test_validation.py b/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/test_validation.py new file mode 100644 index 0000000000000000000000000000000000000000..c20131b8d3f387d32a9abe5cbf80a6387e0017f3 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/test_validation.py @@ -0,0 +1,2739 @@ +"""Test the validation module""" + +import os +import re +import sys +import tempfile +import warnings +from functools import partial +from io import StringIO +from time import sleep + +import numpy as np +import pytest +from scipy.sparse import issparse + +from sklearn import config_context +from sklearn.base import BaseEstimator, ClassifierMixin, clone +from sklearn.cluster import KMeans +from sklearn.datasets import ( + load_diabetes, + load_digits, + load_iris, + make_classification, + make_multilabel_classification, + make_regression, +) +from sklearn.ensemble import RandomForestClassifier +from sklearn.exceptions import FitFailedWarning, UnsetMetadataPassedError 
+from sklearn.impute import SimpleImputer +from sklearn.linear_model import ( + LogisticRegression, + PassiveAggressiveClassifier, + Ridge, + RidgeClassifier, + SGDClassifier, +) +from sklearn.metrics import ( + accuracy_score, + check_scoring, + confusion_matrix, + explained_variance_score, + make_scorer, + mean_squared_error, + precision_recall_fscore_support, + precision_score, + r2_score, +) +from sklearn.metrics._scorer import _MultimetricScorer +from sklearn.model_selection import ( + GridSearchCV, + GroupKFold, + GroupShuffleSplit, + KFold, + LeaveOneGroupOut, + LeaveOneOut, + LeavePGroupsOut, + ShuffleSplit, + StratifiedKFold, + cross_val_predict, + cross_val_score, + cross_validate, + learning_curve, + permutation_test_score, + validation_curve, +) +from sklearn.model_selection._validation import ( + _check_is_permutation, + _fit_and_score, + _score, +) +from sklearn.model_selection.tests.common import OneTimeSplitter +from sklearn.model_selection.tests.test_search import FailingClassifier +from sklearn.multiclass import OneVsRestClassifier +from sklearn.neighbors import KNeighborsClassifier +from sklearn.neural_network import MLPRegressor +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import LabelEncoder, scale +from sklearn.svm import SVC, LinearSVC +from sklearn.tests.metadata_routing_common import ( + ConsumingClassifier, + ConsumingScorer, + ConsumingSplitter, + _Registry, + check_recorded_metadata, +) +from sklearn.utils import shuffle +from sklearn.utils._mocking import CheckingClassifier, MockDataFrame +from sklearn.utils._testing import ( + assert_allclose, + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) +from sklearn.utils.fixes import COO_CONTAINERS, CSR_CONTAINERS +from sklearn.utils.validation import _num_samples + + +class MockImprovingEstimator(BaseEstimator): + """Dummy classifier to test the learning curve""" + + def __init__(self, n_max_train_sizes): + self.n_max_train_sizes = n_max_train_sizes + self.train_sizes = 0 + self.X_subset = None + + def fit(self, X_subset, y_subset=None): + self.X_subset = X_subset + self.train_sizes = X_subset.shape[0] + return self + + def predict(self, X): + raise NotImplementedError + + def score(self, X=None, Y=None): + # training score becomes worse (2 -> 1), test error better (0 -> 1) + if self._is_training_data(X): + return 2.0 - float(self.train_sizes) / self.n_max_train_sizes + else: + return float(self.train_sizes) / self.n_max_train_sizes + + def _is_training_data(self, X): + return X is self.X_subset + + +class MockIncrementalImprovingEstimator(MockImprovingEstimator): + """Dummy classifier that provides partial_fit""" + + def __init__(self, n_max_train_sizes, expected_fit_params=None): + super().__init__(n_max_train_sizes) + self.x = None + self.expected_fit_params = expected_fit_params + + def _is_training_data(self, X): + return self.x in X + + def partial_fit(self, X, y=None, **params): + self.train_sizes += X.shape[0] + self.x = X[0] + if self.expected_fit_params: + missing = set(self.expected_fit_params) - set(params) + if missing: + raise AssertionError( + f"Expected fit parameter(s) {list(missing)} not seen." + ) + for key, value in params.items(): + if key in self.expected_fit_params and _num_samples( + value + ) != _num_samples(X): + raise AssertionError( + f"Fit parameter {key} has length {_num_samples(value)}" + f"; expected {_num_samples(X)}." 
+ ) + + +class MockEstimatorWithParameter(BaseEstimator): + """Dummy classifier to test the validation curve""" + + def __init__(self, param=0.5): + self.X_subset = None + self.param = param + + def fit(self, X_subset, y_subset): + self.X_subset = X_subset + self.train_sizes = X_subset.shape[0] + return self + + def predict(self, X): + raise NotImplementedError + + def score(self, X=None, y=None): + return self.param if self._is_training_data(X) else 1 - self.param + + def _is_training_data(self, X): + return X is self.X_subset + + +class MockEstimatorWithSingleFitCallAllowed(MockEstimatorWithParameter): + """Dummy classifier that disallows repeated calls of fit method""" + + def fit(self, X_subset, y_subset): + assert not hasattr(self, "fit_called_"), "fit is called the second time" + self.fit_called_ = True + return super().fit(X_subset, y_subset) + + def predict(self, X): + raise NotImplementedError + + +class MockClassifier(ClassifierMixin, BaseEstimator): + """Dummy classifier to test the cross-validation""" + + def __init__(self, a=0, allow_nd=False): + self.a = a + self.allow_nd = allow_nd + + def fit( + self, + X, + Y=None, + sample_weight=None, + class_prior=None, + sparse_sample_weight=None, + sparse_param=None, + dummy_int=None, + dummy_str=None, + dummy_obj=None, + callback=None, + ): + """The dummy arguments are to test that this fit function can + accept non-array arguments through cross-validation, such as: + - int + - str (this is actually array-like) + - object + - function + """ + self.dummy_int = dummy_int + self.dummy_str = dummy_str + self.dummy_obj = dummy_obj + if callback is not None: + callback(self) + + if self.allow_nd: + X = X.reshape(len(X), -1) + if X.ndim >= 3 and not self.allow_nd: + raise ValueError("X cannot be d") + if sample_weight is not None: + assert sample_weight.shape[0] == X.shape[0], ( + "MockClassifier extra fit_param " + "sample_weight.shape[0] is {0}, should be {1}".format( + sample_weight.shape[0], X.shape[0] + ) + ) + if class_prior is not None: + assert class_prior.shape[0] == len(np.unique(y)), ( + "MockClassifier extra fit_param class_prior.shape[0]" + " is {0}, should be {1}".format(class_prior.shape[0], len(np.unique(y))) + ) + if sparse_sample_weight is not None: + fmt = ( + "MockClassifier extra fit_param sparse_sample_weight" + ".shape[0] is {0}, should be {1}" + ) + assert sparse_sample_weight.shape[0] == X.shape[0], fmt.format( + sparse_sample_weight.shape[0], X.shape[0] + ) + if sparse_param is not None: + fmt = ( + "MockClassifier extra fit_param sparse_param.shape " + "is ({0}, {1}), should be ({2}, {3})" + ) + assert sparse_param.shape == P.shape, fmt.format( + sparse_param.shape[0], + sparse_param.shape[1], + P.shape[0], + P.shape[1], + ) + self.classes_ = np.unique(y) + return self + + def predict(self, T): + if self.allow_nd: + T = T.reshape(len(T), -1) + return T[:, 0] + + def predict_proba(self, T): + return T + + def score(self, X=None, Y=None): + return 1.0 / (1 + np.abs(self.a)) + + def get_params(self, deep=False): + return {"a": self.a, "allow_nd": self.allow_nd} + + +# XXX: use 2D array, since 1D X is being detected as a single sample in +# check_consistent_length +X = np.ones((15, 2)) +y = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 6]) +# The number of samples per class needs to be > n_splits, +# for StratifiedKFold(n_splits=3) +y2 = np.array([1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3]) +P = np.eye(5) + + +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +def test_cross_val_score(coo_container): + 
clf = MockClassifier() + X_sparse = coo_container(X) + + for a in range(-10, 10): + clf.a = a + # Smoke test + scores = cross_val_score(clf, X, y2) + assert_array_equal(scores, clf.score(X, y2)) + + # test with multioutput y + multioutput_y = np.column_stack([y2, y2[::-1]]) + scores = cross_val_score(clf, X_sparse, multioutput_y) + assert_array_equal(scores, clf.score(X_sparse, multioutput_y)) + + scores = cross_val_score(clf, X_sparse, y2) + assert_array_equal(scores, clf.score(X_sparse, y2)) + + # test with multioutput y + scores = cross_val_score(clf, X_sparse, multioutput_y) + assert_array_equal(scores, clf.score(X_sparse, multioutput_y)) + + # test with X and y as list + list_check = lambda x: isinstance(x, list) + clf = CheckingClassifier(check_X=list_check) + scores = cross_val_score(clf, X.tolist(), y2.tolist(), cv=3) + + clf = CheckingClassifier(check_y=list_check) + scores = cross_val_score(clf, X, y2.tolist(), cv=3) + + # test with 3d X and + X_3d = X[:, :, np.newaxis] + clf = MockClassifier(allow_nd=True) + scores = cross_val_score(clf, X_3d, y2) + + clf = MockClassifier(allow_nd=False) + with pytest.raises(ValueError): + cross_val_score(clf, X_3d, y2, error_score="raise") + + +def test_cross_validate_many_jobs(): + # regression test for #12154: cv='warn' with n_jobs>1 trigger a copy of + # the parameters leading to a failure in check_cv due to cv is 'warn' + # instead of cv == 'warn'. + X, y = load_iris(return_X_y=True) + clf = SVC(gamma="auto") + grid = GridSearchCV(clf, param_grid={"C": [1, 10]}) + cross_validate(grid, X, y, n_jobs=2) + + +def test_cross_validate_invalid_scoring_param(): + X, y = make_classification(random_state=0) + estimator = MockClassifier() + + # Test the errors + error_message_regexp = ".*must be unique strings.*" + + # List/tuple of callables should raise a message advising users to use + # dict of names to callables mapping + with pytest.raises(ValueError, match=error_message_regexp): + cross_validate( + estimator, + X, + y, + scoring=(make_scorer(precision_score), make_scorer(accuracy_score)), + ) + with pytest.raises(ValueError, match=error_message_regexp): + cross_validate(estimator, X, y, scoring=(make_scorer(precision_score),)) + + # So should empty lists/tuples + with pytest.raises(ValueError, match=error_message_regexp + "Empty list.*"): + cross_validate(estimator, X, y, scoring=()) + + # So should duplicated entries + with pytest.raises(ValueError, match=error_message_regexp + "Duplicate.*"): + cross_validate(estimator, X, y, scoring=("f1_micro", "f1_micro")) + + # Nested Lists should raise a generic error message + with pytest.raises(ValueError, match=error_message_regexp): + cross_validate(estimator, X, y, scoring=[[make_scorer(precision_score)]]) + + # Empty dict should raise invalid scoring error + with pytest.raises(ValueError, match="An empty dict"): + cross_validate(estimator, X, y, scoring=(dict())) + + multiclass_scorer = make_scorer(precision_recall_fscore_support) + + # Multiclass Scorers that return multiple values are not supported yet + # the warning message we're expecting to see + warning_message = ( + "Scoring failed. The score on this train-test " + f"partition for these parameters will be set to {np.nan}. 
" + "Details: \n" + ) + + with pytest.warns(UserWarning, match=warning_message): + cross_validate(estimator, X, y, scoring=multiclass_scorer) + + with pytest.warns(UserWarning, match=warning_message): + cross_validate(estimator, X, y, scoring={"foo": multiclass_scorer}) + + +def test_cross_validate_nested_estimator(): + # Non-regression test to ensure that nested + # estimators are properly returned in a list + # https://github.com/scikit-learn/scikit-learn/pull/17745 + (X, y) = load_iris(return_X_y=True) + pipeline = Pipeline( + [ + ("imputer", SimpleImputer()), + ("classifier", MockClassifier()), + ] + ) + + results = cross_validate(pipeline, X, y, return_estimator=True) + estimators = results["estimator"] + + assert isinstance(estimators, list) + assert all(isinstance(estimator, Pipeline) for estimator in estimators) + + +@pytest.mark.parametrize("use_sparse", [False, True]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_cross_validate(use_sparse: bool, csr_container): + # Compute train and test mse/r2 scores + cv = KFold() + + # Regression + X_reg, y_reg = make_regression(n_samples=30, random_state=0) + reg = Ridge(random_state=0) + + # Classification + X_clf, y_clf = make_classification(n_samples=30, random_state=0) + clf = SVC(kernel="linear", random_state=0) + + if use_sparse: + X_reg = csr_container(X_reg) + X_clf = csr_container(X_clf) + + for X, y, est in ((X_reg, y_reg, reg), (X_clf, y_clf, clf)): + # It's okay to evaluate regression metrics on classification too + mse_scorer = check_scoring(est, scoring="neg_mean_squared_error") + r2_scorer = check_scoring(est, scoring="r2") + train_mse_scores = [] + test_mse_scores = [] + train_r2_scores = [] + test_r2_scores = [] + fitted_estimators = [] + + for train, test in cv.split(X, y): + est = clone(est).fit(X[train], y[train]) + train_mse_scores.append(mse_scorer(est, X[train], y[train])) + train_r2_scores.append(r2_scorer(est, X[train], y[train])) + test_mse_scores.append(mse_scorer(est, X[test], y[test])) + test_r2_scores.append(r2_scorer(est, X[test], y[test])) + fitted_estimators.append(est) + + train_mse_scores = np.array(train_mse_scores) + test_mse_scores = np.array(test_mse_scores) + train_r2_scores = np.array(train_r2_scores) + test_r2_scores = np.array(test_r2_scores) + fitted_estimators = np.array(fitted_estimators) + + scores = ( + train_mse_scores, + test_mse_scores, + train_r2_scores, + test_r2_scores, + fitted_estimators, + ) + + # To ensure that the test does not suffer from + # large statistical fluctuations due to slicing small datasets, + # we pass the cross-validation instance + check_cross_validate_single_metric(est, X, y, scores, cv) + check_cross_validate_multi_metric(est, X, y, scores, cv) + + +def check_cross_validate_single_metric(clf, X, y, scores, cv): + ( + train_mse_scores, + test_mse_scores, + train_r2_scores, + test_r2_scores, + fitted_estimators, + ) = scores + # Test single metric evaluation when scoring is string or singleton list + for return_train_score, dict_len in ((True, 4), (False, 3)): + # Single metric passed as a string + if return_train_score: + mse_scores_dict = cross_validate( + clf, + X, + y, + scoring="neg_mean_squared_error", + return_train_score=True, + cv=cv, + ) + assert_array_almost_equal(mse_scores_dict["train_score"], train_mse_scores) + else: + mse_scores_dict = cross_validate( + clf, + X, + y, + scoring="neg_mean_squared_error", + return_train_score=False, + cv=cv, + ) + assert isinstance(mse_scores_dict, dict) + assert len(mse_scores_dict) == dict_len + 
assert_array_almost_equal(mse_scores_dict["test_score"], test_mse_scores) + + # Single metric passed as a list + if return_train_score: + # It must be True by default - deprecated + r2_scores_dict = cross_validate( + clf, X, y, scoring=["r2"], return_train_score=True, cv=cv + ) + assert_array_almost_equal(r2_scores_dict["train_r2"], train_r2_scores, True) + else: + r2_scores_dict = cross_validate( + clf, X, y, scoring=["r2"], return_train_score=False, cv=cv + ) + assert isinstance(r2_scores_dict, dict) + assert len(r2_scores_dict) == dict_len + assert_array_almost_equal(r2_scores_dict["test_r2"], test_r2_scores) + + # Test return_estimator option + mse_scores_dict = cross_validate( + clf, X, y, scoring="neg_mean_squared_error", return_estimator=True, cv=cv + ) + for k, est in enumerate(mse_scores_dict["estimator"]): + est_coef = est.coef_.copy() + if issparse(est_coef): + est_coef = est_coef.toarray() + + fitted_est_coef = fitted_estimators[k].coef_.copy() + if issparse(fitted_est_coef): + fitted_est_coef = fitted_est_coef.toarray() + + assert_almost_equal(est_coef, fitted_est_coef) + assert_almost_equal(est.intercept_, fitted_estimators[k].intercept_) + + +def check_cross_validate_multi_metric(clf, X, y, scores, cv): + # Test multimetric evaluation when scoring is a list / dict + ( + train_mse_scores, + test_mse_scores, + train_r2_scores, + test_r2_scores, + fitted_estimators, + ) = scores + + def custom_scorer(clf, X, y): + y_pred = clf.predict(X) + return { + "r2": r2_score(y, y_pred), + "neg_mean_squared_error": -mean_squared_error(y, y_pred), + } + + all_scoring = ( + ("r2", "neg_mean_squared_error"), + { + "r2": make_scorer(r2_score), + "neg_mean_squared_error": "neg_mean_squared_error", + }, + custom_scorer, + ) + + keys_sans_train = { + "test_r2", + "test_neg_mean_squared_error", + "fit_time", + "score_time", + } + keys_with_train = keys_sans_train.union( + {"train_r2", "train_neg_mean_squared_error"} + ) + + for return_train_score in (True, False): + for scoring in all_scoring: + if return_train_score: + # return_train_score must be True by default - deprecated + cv_results = cross_validate( + clf, X, y, scoring=scoring, return_train_score=True, cv=cv + ) + assert_array_almost_equal(cv_results["train_r2"], train_r2_scores) + assert_array_almost_equal( + cv_results["train_neg_mean_squared_error"], train_mse_scores + ) + else: + cv_results = cross_validate( + clf, X, y, scoring=scoring, return_train_score=False, cv=cv + ) + assert isinstance(cv_results, dict) + assert set(cv_results.keys()) == ( + keys_with_train if return_train_score else keys_sans_train + ) + assert_array_almost_equal(cv_results["test_r2"], test_r2_scores) + assert_array_almost_equal( + cv_results["test_neg_mean_squared_error"], test_mse_scores + ) + + # Make sure all the arrays are of np.ndarray type + assert isinstance(cv_results["test_r2"], np.ndarray) + assert isinstance(cv_results["test_neg_mean_squared_error"], np.ndarray) + assert isinstance(cv_results["fit_time"], np.ndarray) + assert isinstance(cv_results["score_time"], np.ndarray) + + # Ensure all the times are within sane limits + assert np.all(cv_results["fit_time"] >= 0) + assert np.all(cv_results["fit_time"] < 10) + assert np.all(cv_results["score_time"] >= 0) + assert np.all(cv_results["score_time"] < 10) + + +def test_cross_val_score_predict_groups(): + # Check if ValueError (when groups is None) propagates to cross_val_score + # and cross_val_predict + # And also check if groups is correctly passed to the cv object + X, y = 
make_classification(n_samples=20, n_classes=2, random_state=0) + + clf = SVC(kernel="linear") + + group_cvs = [ + LeaveOneGroupOut(), + LeavePGroupsOut(2), + GroupKFold(), + GroupShuffleSplit(), + ] + error_message = "The 'groups' parameter should not be None." + for cv in group_cvs: + with pytest.raises(ValueError, match=error_message): + cross_val_score(estimator=clf, X=X, y=y, cv=cv) + with pytest.raises(ValueError, match=error_message): + cross_val_predict(estimator=clf, X=X, y=y, cv=cv) + + +def test_cross_val_score_pandas(): + # check cross_val_score doesn't destroy pandas dataframe + types = [(MockDataFrame, MockDataFrame)] + try: + from pandas import DataFrame, Series + + types.append((Series, DataFrame)) + except ImportError: + pass + for TargetType, InputFeatureType in types: + # X dataframe, y series + # 3 fold cross val is used so we need at least 3 samples per class + X_df, y_ser = InputFeatureType(X), TargetType(y2) + check_df = lambda x: isinstance(x, InputFeatureType) + check_series = lambda x: isinstance(x, TargetType) + clf = CheckingClassifier(check_X=check_df, check_y=check_series) + cross_val_score(clf, X_df, y_ser, cv=3) + + +def test_cross_val_score_mask(): + # test that cross_val_score works with boolean masks + svm = SVC(kernel="linear") + iris = load_iris() + X, y = iris.data, iris.target + kfold = KFold(5) + scores_indices = cross_val_score(svm, X, y, cv=kfold) + kfold = KFold(5) + cv_masks = [] + for train, test in kfold.split(X, y): + mask_train = np.zeros(len(y), dtype=bool) + mask_test = np.zeros(len(y), dtype=bool) + mask_train[train] = 1 + mask_test[test] = 1 + cv_masks.append((mask_train, mask_test)) + scores_masks = cross_val_score(svm, X, y, cv=cv_masks) + assert_array_equal(scores_indices, scores_masks) + + +def test_cross_val_score_precomputed(): + # test for svm with precomputed kernel + svm = SVC(kernel="precomputed") + iris = load_iris() + X, y = iris.data, iris.target + linear_kernel = np.dot(X, X.T) + score_precomputed = cross_val_score(svm, linear_kernel, y) + svm = SVC(kernel="linear") + score_linear = cross_val_score(svm, X, y) + assert_array_almost_equal(score_precomputed, score_linear) + + # test with callable + svm = SVC(kernel=lambda x, y: np.dot(x, y.T)) + score_callable = cross_val_score(svm, X, y) + assert_array_almost_equal(score_precomputed, score_callable) + + # Error raised for non-square X + svm = SVC(kernel="precomputed") + with pytest.raises(ValueError): + cross_val_score(svm, X, y) + + # test error is raised when the precomputed kernel is not array-like + # or sparse + with pytest.raises(ValueError): + cross_val_score(svm, linear_kernel.tolist(), y) + + +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +def test_cross_val_score_fit_params(coo_container): + clf = MockClassifier() + n_samples = X.shape[0] + n_classes = len(np.unique(y)) + + W_sparse = coo_container( + (np.array([1]), (np.array([1]), np.array([0]))), shape=(15, 1) + ) + P_sparse = coo_container(np.eye(5)) + + DUMMY_INT = 42 + DUMMY_STR = "42" + DUMMY_OBJ = object() + + def assert_fit_params(clf): + # Function to test that the values are passed correctly to the + # classifier arguments for non-array type + + assert clf.dummy_int == DUMMY_INT + assert clf.dummy_str == DUMMY_STR + assert clf.dummy_obj == DUMMY_OBJ + + fit_params = { + "sample_weight": np.ones(n_samples), + "class_prior": np.full(n_classes, 1.0 / n_classes), + "sparse_sample_weight": W_sparse, + "sparse_param": P_sparse, + "dummy_int": DUMMY_INT, + "dummy_str": DUMMY_STR, + "dummy_obj": DUMMY_OBJ, + 
"callback": assert_fit_params, + } + cross_val_score(clf, X, y2, params=fit_params) + + +def test_cross_val_score_score_func(): + clf = MockClassifier() + _score_func_args = [] + + def score_func(y_test, y_predict): + _score_func_args.append((y_test, y_predict)) + return 1.0 + + with warnings.catch_warnings(record=True): + scoring = make_scorer(score_func) + score = cross_val_score(clf, X, y, scoring=scoring, cv=3) + assert_array_equal(score, [1.0, 1.0, 1.0]) + # Test that score function is called only 3 times (for cv=3) + assert len(_score_func_args) == 3 + + +def test_cross_val_score_with_score_func_classification(): + iris = load_iris() + clf = SVC(kernel="linear") + + # Default score (should be the accuracy score) + scores = cross_val_score(clf, iris.data, iris.target) + assert_array_almost_equal(scores, [0.97, 1.0, 0.97, 0.97, 1.0], 2) + + # Correct classification score (aka. zero / one score) - should be the + # same as the default estimator score + zo_scores = cross_val_score(clf, iris.data, iris.target, scoring="accuracy") + assert_array_almost_equal(zo_scores, [0.97, 1.0, 0.97, 0.97, 1.0], 2) + + # F1 score (class are balanced so f1_score should be equal to zero/one + # score + f1_scores = cross_val_score(clf, iris.data, iris.target, scoring="f1_weighted") + assert_array_almost_equal(f1_scores, [0.97, 1.0, 0.97, 0.97, 1.0], 2) + + +def test_cross_val_score_with_score_func_regression(): + X, y = make_regression(n_samples=30, n_features=20, n_informative=5, random_state=0) + reg = Ridge() + + # Default score of the Ridge regression estimator + scores = cross_val_score(reg, X, y) + assert_array_almost_equal(scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2) + + # R2 score (aka. determination coefficient) - should be the + # same as the default estimator score + r2_scores = cross_val_score(reg, X, y, scoring="r2") + assert_array_almost_equal(r2_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2) + + # Mean squared error; this is a loss function, so "scores" are negative + neg_mse_scores = cross_val_score(reg, X, y, scoring="neg_mean_squared_error") + expected_neg_mse = np.array([-763.07, -553.16, -274.38, -273.26, -1681.99]) + assert_array_almost_equal(neg_mse_scores, expected_neg_mse, 2) + + # Explained variance + scoring = make_scorer(explained_variance_score) + ev_scores = cross_val_score(reg, X, y, scoring=scoring) + assert_array_almost_equal(ev_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2) + + +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +def test_permutation_score(coo_container): + iris = load_iris() + X = iris.data + X_sparse = coo_container(X) + y = iris.target + svm = SVC(kernel="linear") + cv = StratifiedKFold(2) + + score, scores, pvalue = permutation_test_score( + svm, X, y, n_permutations=30, cv=cv, scoring="accuracy" + ) + assert score > 0.9 + assert_almost_equal(pvalue, 0.0, 1) + + score_group, _, pvalue_group = permutation_test_score( + svm, + X, + y, + n_permutations=30, + cv=cv, + scoring="accuracy", + groups=np.ones(y.size), + random_state=0, + ) + assert score_group == score + assert pvalue_group == pvalue + + # check that we obtain the same results with a sparse representation + svm_sparse = SVC(kernel="linear") + cv_sparse = StratifiedKFold(2) + score_group, _, pvalue_group = permutation_test_score( + svm_sparse, + X_sparse, + y, + n_permutations=30, + cv=cv_sparse, + scoring="accuracy", + groups=np.ones(y.size), + random_state=0, + ) + + assert score_group == score + assert pvalue_group == pvalue + + # test with custom scoring object + def custom_score(y_true, 
y_pred): + return ((y_true == y_pred).sum() - (y_true != y_pred).sum()) / y_true.shape[0] + + scorer = make_scorer(custom_score) + score, _, pvalue = permutation_test_score( + svm, X, y, n_permutations=100, scoring=scorer, cv=cv, random_state=0 + ) + assert_almost_equal(score, 0.93, 2) + assert_almost_equal(pvalue, 0.01, 3) + + # set random y + y = np.mod(np.arange(len(y)), 3) + + score, scores, pvalue = permutation_test_score( + svm, X, y, n_permutations=30, cv=cv, scoring="accuracy" + ) + + assert score < 0.5 + assert pvalue > 0.2 + + +def test_permutation_test_score_allow_nans(): + # Check that permutation_test_score allows input data with NaNs + X = np.arange(200, dtype=np.float64).reshape(10, -1) + X[2, :] = np.nan + y = np.repeat([0, 1], X.shape[0] / 2) + p = Pipeline( + [ + ("imputer", SimpleImputer(strategy="mean", missing_values=np.nan)), + ("classifier", MockClassifier()), + ] + ) + permutation_test_score(p, X, y) + + +def test_permutation_test_score_params(): + X = np.arange(100).reshape(10, 10) + y = np.array([0] * 5 + [1] * 5) + clf = CheckingClassifier(expected_sample_weight=True) + + err_msg = r"Expected sample_weight to be passed" + with pytest.raises(AssertionError, match=err_msg): + permutation_test_score(clf, X, y) + + err_msg = r"sample_weight.shape == \(1,\), expected \(8,\)!" + with pytest.raises(ValueError, match=err_msg): + permutation_test_score(clf, X, y, params={"sample_weight": np.ones(1)}) + permutation_test_score(clf, X, y, params={"sample_weight": np.ones(10)}) + + +def test_cross_val_score_allow_nans(): + # Check that cross_val_score allows input data with NaNs + X = np.arange(200, dtype=np.float64).reshape(10, -1) + X[2, :] = np.nan + y = np.repeat([0, 1], X.shape[0] / 2) + p = Pipeline( + [ + ("imputer", SimpleImputer(strategy="mean", missing_values=np.nan)), + ("classifier", MockClassifier()), + ] + ) + cross_val_score(p, X, y) + + +def test_cross_val_score_multilabel(): + X = np.array( + [ + [-3, 4], + [2, 4], + [3, 3], + [0, 2], + [-3, 1], + [-2, 1], + [0, 0], + [-2, -1], + [-1, -2], + [1, -2], + ] + ) + y = np.array( + [[1, 1], [0, 1], [0, 1], [0, 1], [1, 1], [0, 1], [1, 0], [1, 1], [1, 0], [0, 0]] + ) + clf = KNeighborsClassifier(n_neighbors=1) + scoring_micro = make_scorer(precision_score, average="micro") + scoring_macro = make_scorer(precision_score, average="macro") + scoring_samples = make_scorer(precision_score, average="samples") + score_micro = cross_val_score(clf, X, y, scoring=scoring_micro) + score_macro = cross_val_score(clf, X, y, scoring=scoring_macro) + score_samples = cross_val_score(clf, X, y, scoring=scoring_samples) + assert_almost_equal(score_micro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 3]) + assert_almost_equal(score_macro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4]) + assert_almost_equal(score_samples, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4]) + + +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +def test_cross_val_predict(coo_container): + X, y = load_diabetes(return_X_y=True) + cv = KFold() + + est = Ridge() + + # Naive loop (should be same as cross_val_predict): + preds2 = np.zeros_like(y) + for train, test in cv.split(X, y): + est.fit(X[train], y[train]) + preds2[test] = est.predict(X[test]) + + preds = cross_val_predict(est, X, y, cv=cv) + assert_array_almost_equal(preds, preds2) + + preds = cross_val_predict(est, X, y) + assert len(preds) == len(y) + + cv = LeaveOneOut() + preds = cross_val_predict(est, X, y, cv=cv) + assert len(preds) == len(y) + + Xsp = X.copy() + Xsp *= Xsp > np.median(Xsp) + Xsp = coo_container(Xsp) + preds = 
cross_val_predict(est, Xsp, y) + assert_array_almost_equal(len(preds), len(y)) + + preds = cross_val_predict(KMeans(n_init="auto"), X) + assert len(preds) == len(y) + + class BadCV: + def split(self, X, y=None, groups=None): + for i in range(4): + yield np.array([0, 1, 2, 3]), np.array([4, 5, 6, 7, 8]) + + with pytest.raises(ValueError): + cross_val_predict(est, X, y, cv=BadCV()) + + X, y = load_iris(return_X_y=True) + + warning_message = ( + r"Number of classes in training fold \(2\) does " + r"not match total number of classes \(3\). " + "Results may not be appropriate for your use case." + ) + with pytest.warns(RuntimeWarning, match=warning_message): + cross_val_predict( + LogisticRegression(solver="liblinear"), + X, + y, + method="predict_proba", + cv=KFold(2), + ) + + +def test_cross_val_predict_decision_function_shape(): + X, y = make_classification(n_classes=2, n_samples=50, random_state=0) + + preds = cross_val_predict(LogisticRegression(), X, y, method="decision_function") + assert preds.shape == (50,) + + X, y = load_iris(return_X_y=True) + + preds = cross_val_predict(LogisticRegression(), X, y, method="decision_function") + assert preds.shape == (150, 3) + + # This specifically tests imbalanced splits for binary + # classification with decision_function. This is only + # applicable to classifiers that can be fit on a single + # class. + X = X[:100] + y = y[:100] + error_message = ( + "Only 1 class/es in training fold," + " but 2 in overall dataset. This" + " is not supported for decision_function" + " with imbalanced folds. To fix " + "this, use a cross-validation technique " + "resulting in properly stratified folds" + ) + with pytest.raises(ValueError, match=error_message): + cross_val_predict( + RidgeClassifier(), X, y, method="decision_function", cv=KFold(2) + ) + + X, y = load_digits(return_X_y=True) + est = SVC(kernel="linear", decision_function_shape="ovo") + + preds = cross_val_predict(est, X, y, method="decision_function") + assert preds.shape == (1797, 45) + + ind = np.argsort(y) + X, y = X[ind], y[ind] + error_message_regexp = ( + r"Output shape \(599L?, 21L?\) of " + "decision_function does not match number of " + r"classes \(7\) in fold. 
Irregular " + "decision_function .*" + ) + with pytest.raises(ValueError, match=error_message_regexp): + cross_val_predict(est, X, y, cv=KFold(n_splits=3), method="decision_function") + + +def test_cross_val_predict_predict_proba_shape(): + X, y = make_classification(n_classes=2, n_samples=50, random_state=0) + + preds = cross_val_predict(LogisticRegression(), X, y, method="predict_proba") + assert preds.shape == (50, 2) + + X, y = load_iris(return_X_y=True) + + preds = cross_val_predict(LogisticRegression(), X, y, method="predict_proba") + assert preds.shape == (150, 3) + + +def test_cross_val_predict_predict_log_proba_shape(): + X, y = make_classification(n_classes=2, n_samples=50, random_state=0) + + preds = cross_val_predict(LogisticRegression(), X, y, method="predict_log_proba") + assert preds.shape == (50, 2) + + X, y = load_iris(return_X_y=True) + + preds = cross_val_predict(LogisticRegression(), X, y, method="predict_log_proba") + assert preds.shape == (150, 3) + + +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +def test_cross_val_predict_input_types(coo_container): + iris = load_iris() + X, y = iris.data, iris.target + X_sparse = coo_container(X) + multioutput_y = np.column_stack([y, y[::-1]]) + + clf = Ridge(fit_intercept=False, random_state=0) + # 3 fold cv is used --> at least 3 samples per class + # Smoke test + predictions = cross_val_predict(clf, X, y) + assert predictions.shape == (150,) + + # test with multioutput y + predictions = cross_val_predict(clf, X_sparse, multioutput_y) + assert predictions.shape == (150, 2) + + predictions = cross_val_predict(clf, X_sparse, y) + assert_array_equal(predictions.shape, (150,)) + + # test with multioutput y + predictions = cross_val_predict(clf, X_sparse, multioutput_y) + assert_array_equal(predictions.shape, (150, 2)) + + # test with X and y as list + list_check = lambda x: isinstance(x, list) + clf = CheckingClassifier(check_X=list_check) + predictions = cross_val_predict(clf, X.tolist(), y.tolist()) + + clf = CheckingClassifier(check_y=list_check) + predictions = cross_val_predict(clf, X, y.tolist()) + + # test with X and y as list and non empty method + predictions = cross_val_predict( + LogisticRegression(), + X.tolist(), + y.tolist(), + method="decision_function", + ) + predictions = cross_val_predict( + LogisticRegression(), + X, + y.tolist(), + method="decision_function", + ) + + # test with 3d X and + X_3d = X[:, :, np.newaxis] + check_3d = lambda x: x.ndim == 3 + clf = CheckingClassifier(check_X=check_3d) + predictions = cross_val_predict(clf, X_3d, y) + assert_array_equal(predictions.shape, (150,)) + + +def test_cross_val_predict_pandas(): + # check cross_val_score doesn't destroy pandas dataframe + types = [(MockDataFrame, MockDataFrame)] + try: + from pandas import DataFrame, Series + + types.append((Series, DataFrame)) + except ImportError: + pass + for TargetType, InputFeatureType in types: + # X dataframe, y series + X_df, y_ser = InputFeatureType(X), TargetType(y2) + check_df = lambda x: isinstance(x, InputFeatureType) + check_series = lambda x: isinstance(x, TargetType) + clf = CheckingClassifier(check_X=check_df, check_y=check_series) + cross_val_predict(clf, X_df, y_ser, cv=3) + + +def test_cross_val_predict_unbalanced(): + X, y = make_classification( + n_samples=100, + n_features=2, + n_redundant=0, + n_informative=2, + n_clusters_per_class=1, + random_state=1, + ) + # Change the first sample to a new class + y[0] = 2 + clf = LogisticRegression(random_state=1) + cv = StratifiedKFold(n_splits=2) + 
train, test = list(cv.split(X, y)) + yhat_proba = cross_val_predict(clf, X, y, cv=cv, method="predict_proba") + assert y[test[0]][0] == 2 # sanity check for further assertions + assert np.all(yhat_proba[test[0]][:, 2] == 0) + assert np.all(yhat_proba[test[0]][:, 0:1] > 0) + assert np.all(yhat_proba[test[1]] > 0) + assert_array_almost_equal(yhat_proba.sum(axis=1), np.ones(y.shape), decimal=12) + + +def test_cross_val_predict_y_none(): + # ensure that cross_val_predict works when y is None + mock_classifier = MockClassifier() + rng = np.random.RandomState(42) + X = rng.rand(100, 10) + y_hat = cross_val_predict(mock_classifier, X, y=None, cv=5, method="predict") + assert_allclose(X[:, 0], y_hat) + y_hat_proba = cross_val_predict( + mock_classifier, X, y=None, cv=5, method="predict_proba" + ) + assert_allclose(X, y_hat_proba) + + +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +def test_cross_val_score_sparse_fit_params(coo_container): + iris = load_iris() + X, y = iris.data, iris.target + clf = MockClassifier() + fit_params = {"sparse_sample_weight": coo_container(np.eye(X.shape[0]))} + a = cross_val_score(clf, X, y, params=fit_params, cv=3) + assert_array_equal(a, np.ones(3)) + + +def test_learning_curve(): + n_samples = 30 + n_splits = 3 + X, y = make_classification( + n_samples=n_samples, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + estimator = MockImprovingEstimator(n_samples * ((n_splits - 1) / n_splits)) + for shuffle_train in [False, True]: + with warnings.catch_warnings(record=True) as w: + ( + train_sizes, + train_scores, + test_scores, + fit_times, + score_times, + ) = learning_curve( + estimator, + X, + y, + cv=KFold(n_splits=n_splits), + train_sizes=np.linspace(0.1, 1.0, 10), + shuffle=shuffle_train, + return_times=True, + ) + if len(w) > 0: + raise RuntimeError("Unexpected warning: %r" % w[0].message) + assert train_scores.shape == (10, 3) + assert test_scores.shape == (10, 3) + assert fit_times.shape == (10, 3) + assert score_times.shape == (10, 3) + assert_array_equal(train_sizes, np.linspace(2, 20, 10)) + assert_array_almost_equal(train_scores.mean(axis=1), np.linspace(1.9, 1.0, 10)) + assert_array_almost_equal(test_scores.mean(axis=1), np.linspace(0.1, 1.0, 10)) + + # Cannot use assert_array_almost_equal for fit and score times because + # the values are hardware-dependant + assert fit_times.dtype == "float64" + assert score_times.dtype == "float64" + + # Test a custom cv splitter that can iterate only once + with warnings.catch_warnings(record=True) as w: + train_sizes2, train_scores2, test_scores2 = learning_curve( + estimator, + X, + y, + cv=OneTimeSplitter(n_splits=n_splits, n_samples=n_samples), + train_sizes=np.linspace(0.1, 1.0, 10), + shuffle=shuffle_train, + ) + if len(w) > 0: + raise RuntimeError("Unexpected warning: %r" % w[0].message) + assert_array_almost_equal(train_scores2, train_scores) + assert_array_almost_equal(test_scores2, test_scores) + + +def test_learning_curve_unsupervised(): + X, _ = make_classification( + n_samples=30, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + estimator = MockImprovingEstimator(20) + train_sizes, train_scores, test_scores = learning_curve( + estimator, X, y=None, cv=3, train_sizes=np.linspace(0.1, 1.0, 10) + ) + assert_array_equal(train_sizes, np.linspace(2, 20, 10)) + assert_array_almost_equal(train_scores.mean(axis=1), np.linspace(1.9, 1.0, 10)) + 
assert_array_almost_equal(test_scores.mean(axis=1), np.linspace(0.1, 1.0, 10)) + + +def test_learning_curve_verbose(): + X, y = make_classification( + n_samples=30, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + estimator = MockImprovingEstimator(20) + + old_stdout = sys.stdout + sys.stdout = StringIO() + try: + train_sizes, train_scores, test_scores = learning_curve( + estimator, X, y, cv=3, verbose=1 + ) + finally: + out = sys.stdout.getvalue() + sys.stdout.close() + sys.stdout = old_stdout + + assert "[learning_curve]" in out + + +def test_learning_curve_incremental_learning_not_possible(): + X, y = make_classification( + n_samples=2, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + # The mockup does not have partial_fit() + estimator = MockImprovingEstimator(1) + with pytest.raises(ValueError): + learning_curve(estimator, X, y, exploit_incremental_learning=True) + + +def test_learning_curve_incremental_learning(): + X, y = make_classification( + n_samples=30, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + estimator = MockIncrementalImprovingEstimator(20) + for shuffle_train in [False, True]: + train_sizes, train_scores, test_scores = learning_curve( + estimator, + X, + y, + cv=3, + exploit_incremental_learning=True, + train_sizes=np.linspace(0.1, 1.0, 10), + shuffle=shuffle_train, + ) + assert_array_equal(train_sizes, np.linspace(2, 20, 10)) + assert_array_almost_equal(train_scores.mean(axis=1), np.linspace(1.9, 1.0, 10)) + assert_array_almost_equal(test_scores.mean(axis=1), np.linspace(0.1, 1.0, 10)) + + +def test_learning_curve_incremental_learning_unsupervised(): + X, _ = make_classification( + n_samples=30, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + estimator = MockIncrementalImprovingEstimator(20) + train_sizes, train_scores, test_scores = learning_curve( + estimator, + X, + y=None, + cv=3, + exploit_incremental_learning=True, + train_sizes=np.linspace(0.1, 1.0, 10), + ) + assert_array_equal(train_sizes, np.linspace(2, 20, 10)) + assert_array_almost_equal(train_scores.mean(axis=1), np.linspace(1.9, 1.0, 10)) + assert_array_almost_equal(test_scores.mean(axis=1), np.linspace(0.1, 1.0, 10)) + + +def test_learning_curve_batch_and_incremental_learning_are_equal(): + X, y = make_classification( + n_samples=30, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + train_sizes = np.linspace(0.2, 1.0, 5) + estimator = PassiveAggressiveClassifier(max_iter=1, tol=None, shuffle=False) + + train_sizes_inc, train_scores_inc, test_scores_inc = learning_curve( + estimator, + X, + y, + train_sizes=train_sizes, + cv=3, + exploit_incremental_learning=True, + ) + train_sizes_batch, train_scores_batch, test_scores_batch = learning_curve( + estimator, + X, + y, + cv=3, + train_sizes=train_sizes, + exploit_incremental_learning=False, + ) + + assert_array_equal(train_sizes_inc, train_sizes_batch) + assert_array_almost_equal( + train_scores_inc.mean(axis=1), train_scores_batch.mean(axis=1) + ) + assert_array_almost_equal( + test_scores_inc.mean(axis=1), test_scores_batch.mean(axis=1) + ) + + +def test_learning_curve_n_sample_range_out_of_bounds(): + X, y = make_classification( + n_samples=30, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + 
n_clusters_per_class=1, + random_state=0, + ) + estimator = MockImprovingEstimator(20) + with pytest.raises(ValueError): + learning_curve(estimator, X, y, cv=3, train_sizes=[0, 1]) + with pytest.raises(ValueError): + learning_curve(estimator, X, y, cv=3, train_sizes=[0.0, 1.0]) + with pytest.raises(ValueError): + learning_curve(estimator, X, y, cv=3, train_sizes=[0.1, 1.1]) + with pytest.raises(ValueError): + learning_curve(estimator, X, y, cv=3, train_sizes=[0, 20]) + with pytest.raises(ValueError): + learning_curve(estimator, X, y, cv=3, train_sizes=[1, 21]) + + +def test_learning_curve_remove_duplicate_sample_sizes(): + X, y = make_classification( + n_samples=3, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + estimator = MockImprovingEstimator(2) + warning_message = ( + "Removed duplicate entries from 'train_sizes'. Number of ticks " + "will be less than the size of 'train_sizes': 2 instead of 3." + ) + with pytest.warns(RuntimeWarning, match=warning_message): + train_sizes, _, _ = learning_curve( + estimator, X, y, cv=3, train_sizes=np.linspace(0.33, 1.0, 3) + ) + assert_array_equal(train_sizes, [1, 2]) + + +def test_learning_curve_with_boolean_indices(): + X, y = make_classification( + n_samples=30, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + estimator = MockImprovingEstimator(20) + cv = KFold(n_splits=3) + train_sizes, train_scores, test_scores = learning_curve( + estimator, X, y, cv=cv, train_sizes=np.linspace(0.1, 1.0, 10) + ) + assert_array_equal(train_sizes, np.linspace(2, 20, 10)) + assert_array_almost_equal(train_scores.mean(axis=1), np.linspace(1.9, 1.0, 10)) + assert_array_almost_equal(test_scores.mean(axis=1), np.linspace(0.1, 1.0, 10)) + + +def test_learning_curve_with_shuffle(): + # Following test case was designed this way to verify the code + # changes made in pull request: #7506. + X = np.array( + [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + [11, 12], + [13, 14], + [15, 16], + [17, 18], + [19, 20], + [7, 8], + [9, 10], + [11, 12], + [13, 14], + [15, 16], + [17, 18], + ] + ) + y = np.array([1, 1, 1, 2, 3, 4, 1, 1, 2, 3, 4, 1, 2, 3, 4]) + groups = np.array([1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 4, 4, 4, 4]) + # Splits on these groups fail without shuffle as the first iteration + # of the learning curve doesn't contain label 4 in the training set. 
+ estimator = PassiveAggressiveClassifier(max_iter=5, tol=None, shuffle=False) + + cv = GroupKFold(n_splits=2) + train_sizes_batch, train_scores_batch, test_scores_batch = learning_curve( + estimator, + X, + y, + cv=cv, + n_jobs=1, + train_sizes=np.linspace(0.3, 1.0, 3), + groups=groups, + shuffle=True, + random_state=2, + ) + assert_array_almost_equal( + train_scores_batch.mean(axis=1), np.array([0.75, 0.3, 0.36111111]) + ) + assert_array_almost_equal( + test_scores_batch.mean(axis=1), np.array([0.36111111, 0.25, 0.25]) + ) + with pytest.raises(ValueError): + learning_curve( + estimator, + X, + y, + cv=cv, + n_jobs=1, + train_sizes=np.linspace(0.3, 1.0, 3), + groups=groups, + error_score="raise", + ) + + train_sizes_inc, train_scores_inc, test_scores_inc = learning_curve( + estimator, + X, + y, + cv=cv, + n_jobs=1, + train_sizes=np.linspace(0.3, 1.0, 3), + groups=groups, + shuffle=True, + random_state=2, + exploit_incremental_learning=True, + ) + assert_array_almost_equal( + train_scores_inc.mean(axis=1), train_scores_batch.mean(axis=1) + ) + assert_array_almost_equal( + test_scores_inc.mean(axis=1), test_scores_batch.mean(axis=1) + ) + + +def test_learning_curve_params(): + X = np.arange(100).reshape(10, 10) + y = np.array([0] * 5 + [1] * 5) + clf = CheckingClassifier(expected_sample_weight=True) + + err_msg = r"Expected sample_weight to be passed" + with pytest.raises(AssertionError, match=err_msg): + learning_curve(clf, X, y, error_score="raise") + + err_msg = r"sample_weight.shape == \(1,\), expected \(2,\)!" + with pytest.raises(ValueError, match=err_msg): + learning_curve( + clf, X, y, error_score="raise", params={"sample_weight": np.ones(1)} + ) + learning_curve( + clf, X, y, error_score="raise", params={"sample_weight": np.ones(10)} + ) + + +def test_learning_curve_incremental_learning_params(): + X, y = make_classification( + n_samples=30, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + estimator = MockIncrementalImprovingEstimator(20, ["sample_weight"]) + err_msg = r"Expected fit parameter\(s\) \['sample_weight'\] not seen." 
+ with pytest.raises(AssertionError, match=err_msg): + learning_curve( + estimator, + X, + y, + cv=3, + exploit_incremental_learning=True, + train_sizes=np.linspace(0.1, 1.0, 10), + error_score="raise", + ) + + err_msg = "Fit parameter sample_weight has length 3; expected" + with pytest.raises(AssertionError, match=err_msg): + learning_curve( + estimator, + X, + y, + cv=3, + exploit_incremental_learning=True, + train_sizes=np.linspace(0.1, 1.0, 10), + error_score="raise", + params={"sample_weight": np.ones(3)}, + ) + + learning_curve( + estimator, + X, + y, + cv=3, + exploit_incremental_learning=True, + train_sizes=np.linspace(0.1, 1.0, 10), + error_score="raise", + params={"sample_weight": np.ones(2)}, + ) + + +def test_validation_curve(): + X, y = make_classification( + n_samples=2, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + param_range = np.linspace(0, 1, 10) + with warnings.catch_warnings(record=True) as w: + train_scores, test_scores = validation_curve( + MockEstimatorWithParameter(), + X, + y, + param_name="param", + param_range=param_range, + cv=2, + ) + if len(w) > 0: + raise RuntimeError("Unexpected warning: %r" % w[0].message) + + assert_array_almost_equal(train_scores.mean(axis=1), param_range) + assert_array_almost_equal(test_scores.mean(axis=1), 1 - param_range) + + +def test_validation_curve_clone_estimator(): + X, y = make_classification( + n_samples=2, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + + param_range = np.linspace(1, 0, 10) + _, _ = validation_curve( + MockEstimatorWithSingleFitCallAllowed(), + X, + y, + param_name="param", + param_range=param_range, + cv=2, + ) + + +def test_validation_curve_cv_splits_consistency(): + n_samples = 100 + n_splits = 5 + X, y = make_classification(n_samples=100, random_state=0) + + scores1 = validation_curve( + SVC(kernel="linear", random_state=0), + X, + y, + param_name="C", + param_range=[0.1, 0.1, 0.2, 0.2], + cv=OneTimeSplitter(n_splits=n_splits, n_samples=n_samples), + ) + # The OneTimeSplitter is a non-re-entrant cv splitter. Unless, the + # `split` is called for each parameter, the following should produce + # identical results for param setting 1 and param setting 2 as both have + # the same C value. + assert_array_almost_equal(*np.vsplit(np.hstack(scores1)[(0, 2, 1, 3), :], 2)) + + scores2 = validation_curve( + SVC(kernel="linear", random_state=0), + X, + y, + param_name="C", + param_range=[0.1, 0.1, 0.2, 0.2], + cv=KFold(n_splits=n_splits, shuffle=True), + ) + + # For scores2, compare the 1st and 2nd parameter's scores + # (Since the C value for 1st two param setting is 0.1, they must be + # consistent unless the train test folds differ between the param settings) + assert_array_almost_equal(*np.vsplit(np.hstack(scores2)[(0, 2, 1, 3), :], 2)) + + scores3 = validation_curve( + SVC(kernel="linear", random_state=0), + X, + y, + param_name="C", + param_range=[0.1, 0.1, 0.2, 0.2], + cv=KFold(n_splits=n_splits), + ) + + # OneTimeSplitter is basically unshuffled KFold(n_splits=5). Sanity check. 
+ assert_array_almost_equal(np.array(scores3), np.array(scores1)) + + +def test_validation_curve_params(): + X = np.arange(100).reshape(10, 10) + y = np.array([0] * 5 + [1] * 5) + clf = CheckingClassifier(expected_sample_weight=True) + + err_msg = r"Expected sample_weight to be passed" + with pytest.raises(AssertionError, match=err_msg): + validation_curve( + clf, + X, + y, + param_name="foo_param", + param_range=[1, 2, 3], + error_score="raise", + ) + + err_msg = r"sample_weight.shape == \(1,\), expected \(8,\)!" + with pytest.raises(ValueError, match=err_msg): + validation_curve( + clf, + X, + y, + param_name="foo_param", + param_range=[1, 2, 3], + error_score="raise", + params={"sample_weight": np.ones(1)}, + ) + validation_curve( + clf, + X, + y, + param_name="foo_param", + param_range=[1, 2, 3], + error_score="raise", + params={"sample_weight": np.ones(10)}, + ) + + +def test_check_is_permutation(): + rng = np.random.RandomState(0) + p = np.arange(100) + rng.shuffle(p) + assert _check_is_permutation(p, 100) + assert not _check_is_permutation(np.delete(p, 23), 100) + + p[0] = 23 + assert not _check_is_permutation(p, 100) + + # Check if the additional duplicate indices are caught + assert not _check_is_permutation(np.hstack((p, 0)), 100) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_cross_val_predict_sparse_prediction(csr_container): + # check that cross_val_predict gives same result for sparse and dense input + X, y = make_multilabel_classification( + n_classes=2, + n_labels=1, + allow_unlabeled=False, + return_indicator=True, + random_state=1, + ) + X_sparse = csr_container(X) + y_sparse = csr_container(y) + classif = OneVsRestClassifier(SVC(kernel="linear")) + preds = cross_val_predict(classif, X, y, cv=10) + preds_sparse = cross_val_predict(classif, X_sparse, y_sparse, cv=10) + preds_sparse = preds_sparse.toarray() + assert_array_almost_equal(preds_sparse, preds) + + +def check_cross_val_predict_binary(est, X, y, method): + """Helper for tests of cross_val_predict with binary classification""" + cv = KFold(n_splits=3, shuffle=False) + + # Generate expected outputs + if y.ndim == 1: + exp_shape = (len(X),) if method == "decision_function" else (len(X), 2) + else: + exp_shape = y.shape + expected_predictions = np.zeros(exp_shape) + for train, test in cv.split(X, y): + est = clone(est).fit(X[train], y[train]) + expected_predictions[test] = getattr(est, method)(X[test]) + + # Check actual outputs for several representations of y + for tg in [y, y + 1, y - 2, y.astype("str")]: + assert_allclose( + cross_val_predict(est, X, tg, method=method, cv=cv), expected_predictions + ) + + +def check_cross_val_predict_multiclass(est, X, y, method): + """Helper for tests of cross_val_predict with multiclass classification""" + cv = KFold(n_splits=3, shuffle=False) + + # Generate expected outputs + float_min = np.finfo(np.float64).min + default_values = { + "decision_function": float_min, + "predict_log_proba": float_min, + "predict_proba": 0, + } + expected_predictions = np.full( + (len(X), len(set(y))), default_values[method], dtype=np.float64 + ) + _, y_enc = np.unique(y, return_inverse=True) + for train, test in cv.split(X, y_enc): + est = clone(est).fit(X[train], y_enc[train]) + fold_preds = getattr(est, method)(X[test]) + i_cols_fit = np.unique(y_enc[train]) + expected_predictions[np.ix_(test, i_cols_fit)] = fold_preds + + # Check actual outputs for several representations of y + for tg in [y, y + 1, y - 2, y.astype("str")]: + assert_allclose( + cross_val_predict(est, 
X, tg, method=method, cv=cv), expected_predictions + ) + + +def check_cross_val_predict_multilabel(est, X, y, method): + """Check the output of cross_val_predict for 2D targets using + Estimators which provide a predictions as a list with one + element per class. + """ + cv = KFold(n_splits=3, shuffle=False) + + # Create empty arrays of the correct size to hold outputs + float_min = np.finfo(np.float64).min + default_values = { + "decision_function": float_min, + "predict_log_proba": float_min, + "predict_proba": 0, + } + n_targets = y.shape[1] + expected_preds = [] + for i_col in range(n_targets): + n_classes_in_label = len(set(y[:, i_col])) + if n_classes_in_label == 2 and method == "decision_function": + exp_shape = (len(X),) + else: + exp_shape = (len(X), n_classes_in_label) + expected_preds.append( + np.full(exp_shape, default_values[method], dtype=np.float64) + ) + + # Generate expected outputs + y_enc_cols = [ + np.unique(y[:, i], return_inverse=True)[1][:, np.newaxis] + for i in range(y.shape[1]) + ] + y_enc = np.concatenate(y_enc_cols, axis=1) + for train, test in cv.split(X, y_enc): + est = clone(est).fit(X[train], y_enc[train]) + fold_preds = getattr(est, method)(X[test]) + for i_col in range(n_targets): + fold_cols = np.unique(y_enc[train][:, i_col]) + if expected_preds[i_col].ndim == 1: + # Decision function with <=2 classes + expected_preds[i_col][test] = fold_preds[i_col] + else: + idx = np.ix_(test, fold_cols) + expected_preds[i_col][idx] = fold_preds[i_col] + + # Check actual outputs for several representations of y + for tg in [y, y + 1, y - 2, y.astype("str")]: + cv_predict_output = cross_val_predict(est, X, tg, method=method, cv=cv) + assert len(cv_predict_output) == len(expected_preds) + for i in range(len(cv_predict_output)): + assert_allclose(cv_predict_output[i], expected_preds[i]) + + +def check_cross_val_predict_with_method_binary(est): + # This test includes the decision_function with two classes. + # This is a special case: it has only one column of output. + X, y = make_classification(n_classes=2, random_state=0) + for method in ["decision_function", "predict_proba", "predict_log_proba"]: + check_cross_val_predict_binary(est, X, y, method) + + +def check_cross_val_predict_with_method_multiclass(est): + iris = load_iris() + X, y = iris.data, iris.target + X, y = shuffle(X, y, random_state=0) + for method in ["decision_function", "predict_proba", "predict_log_proba"]: + check_cross_val_predict_multiclass(est, X, y, method) + + +def test_cross_val_predict_with_method(): + check_cross_val_predict_with_method_binary(LogisticRegression()) + check_cross_val_predict_with_method_multiclass(LogisticRegression()) + + +def test_cross_val_predict_method_checking(): + # Regression test for issue #9639. Tests that cross_val_predict does not + # check estimator methods (e.g. 
predict_proba) before fitting + iris = load_iris() + X, y = iris.data, iris.target + X, y = shuffle(X, y, random_state=0) + for method in ["decision_function", "predict_proba", "predict_log_proba"]: + est = SGDClassifier(loss="log_loss", random_state=2) + check_cross_val_predict_multiclass(est, X, y, method) + + +def test_gridsearchcv_cross_val_predict_with_method(): + iris = load_iris() + X, y = iris.data, iris.target + X, y = shuffle(X, y, random_state=0) + est = GridSearchCV(LogisticRegression(random_state=42), {"C": [0.1, 1]}, cv=2) + for method in ["decision_function", "predict_proba", "predict_log_proba"]: + check_cross_val_predict_multiclass(est, X, y, method) + + +def test_cross_val_predict_with_method_multilabel_ovr(): + # OVR does multilabel predictions, but only arrays of + # binary indicator columns. The output of predict_proba + # is a 2D array with shape (n_samples, n_classes). + n_samp = 100 + n_classes = 4 + X, y = make_multilabel_classification( + n_samples=n_samp, n_labels=3, n_classes=n_classes, n_features=5, random_state=42 + ) + est = OneVsRestClassifier(LogisticRegression(solver="liblinear", random_state=0)) + for method in ["predict_proba", "decision_function"]: + check_cross_val_predict_binary(est, X, y, method=method) + + +class RFWithDecisionFunction(RandomForestClassifier): + # None of the current multioutput-multiclass estimators have + # decision function methods. Create a mock decision function + # to test the cross_val_predict function's handling of this case. + def decision_function(self, X): + probs = self.predict_proba(X) + msg = "This helper should only be used on multioutput-multiclass tasks" + assert isinstance(probs, list), msg + probs = [p[:, -1] if p.shape[1] == 2 else p for p in probs] + return probs + + +def test_cross_val_predict_with_method_multilabel_rf(): + # The RandomForest allows multiple classes in each label. + # Output of predict_proba is a list of outputs of predict_proba + # for each individual label. + n_classes = 4 + X, y = make_multilabel_classification( + n_samples=100, n_labels=3, n_classes=n_classes, n_features=5, random_state=42 + ) + y[:, 0] += y[:, 1] # Put three classes in the first column + for method in ["predict_proba", "predict_log_proba", "decision_function"]: + est = RFWithDecisionFunction(n_estimators=5, random_state=0) + with warnings.catch_warnings(): + # Suppress "RuntimeWarning: divide by zero encountered in log" + warnings.simplefilter("ignore") + check_cross_val_predict_multilabel(est, X, y, method=method) + + +def test_cross_val_predict_with_method_rare_class(): + # Test a multiclass problem where one class will be missing from + # one of the CV training sets. + rng = np.random.RandomState(0) + X = rng.normal(0, 1, size=(14, 10)) + y = np.array([0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 3]) + est = LogisticRegression() + for method in ["predict_proba", "predict_log_proba", "decision_function"]: + with warnings.catch_warnings(): + # Suppress warning about too few examples of a class + warnings.simplefilter("ignore") + check_cross_val_predict_multiclass(est, X, y, method) + + +def test_cross_val_predict_with_method_multilabel_rf_rare_class(): + # The RandomForest allows anything for the contents of the labels. + # Output of predict_proba is a list of outputs of predict_proba + # for each individual label. + # In this test, the first label has a class with a single example. + # We'll have one CV fold where the training data don't include it. 
+ rng = np.random.RandomState(0) + X = rng.normal(0, 1, size=(5, 10)) + y = np.array([[0, 0], [1, 1], [2, 1], [0, 1], [1, 0]]) + for method in ["predict_proba", "predict_log_proba"]: + est = RFWithDecisionFunction(n_estimators=5, random_state=0) + with warnings.catch_warnings(): + # Suppress "RuntimeWarning: divide by zero encountered in log" + warnings.simplefilter("ignore") + check_cross_val_predict_multilabel(est, X, y, method=method) + + +def get_expected_predictions(X, y, cv, classes, est, method): + expected_predictions = np.zeros([len(y), classes]) + func = getattr(est, method) + + for train, test in cv.split(X, y): + est.fit(X[train], y[train]) + expected_predictions_ = func(X[test]) + # To avoid 2 dimensional indexing + if method == "predict_proba": + exp_pred_test = np.zeros((len(test), classes)) + else: + exp_pred_test = np.full( + (len(test), classes), np.finfo(expected_predictions.dtype).min + ) + exp_pred_test[:, est.classes_] = expected_predictions_ + expected_predictions[test] = exp_pred_test + + return expected_predictions + + +def test_cross_val_predict_class_subset(): + X = np.arange(200).reshape(100, 2) + y = np.array([x // 10 for x in range(100)]) + classes = 10 + + kfold3 = KFold(n_splits=3) + kfold4 = KFold(n_splits=4) + + le = LabelEncoder() + + methods = ["decision_function", "predict_proba", "predict_log_proba"] + for method in methods: + est = LogisticRegression() + + # Test with n_splits=3 + predictions = cross_val_predict(est, X, y, method=method, cv=kfold3) + + # Runs a naive loop (should be same as cross_val_predict): + expected_predictions = get_expected_predictions( + X, y, kfold3, classes, est, method + ) + assert_array_almost_equal(expected_predictions, predictions) + + # Test with n_splits=4 + predictions = cross_val_predict(est, X, y, method=method, cv=kfold4) + expected_predictions = get_expected_predictions( + X, y, kfold4, classes, est, method + ) + assert_array_almost_equal(expected_predictions, predictions) + + # Testing unordered labels + y = shuffle(np.repeat(range(10), 10), random_state=0) + predictions = cross_val_predict(est, X, y, method=method, cv=kfold3) + y = le.fit_transform(y) + expected_predictions = get_expected_predictions( + X, y, kfold3, classes, est, method + ) + assert_array_almost_equal(expected_predictions, predictions) + + +def test_score_memmap(): + # Ensure a scalar score of memmap type is accepted + iris = load_iris() + X, y = iris.data, iris.target + clf = MockClassifier() + tf = tempfile.NamedTemporaryFile(mode="wb", delete=False) + tf.write(b"Hello world!!!!!") + tf.close() + scores = np.memmap(tf.name, dtype=np.float64) + score = np.memmap(tf.name, shape=(), mode="r", dtype=np.float64) + try: + cross_val_score(clf, X, y, scoring=lambda est, X, y: score) + with pytest.raises(ValueError): + cross_val_score(clf, X, y, scoring=lambda est, X, y: scores) + finally: + # Best effort to release the mmap file handles before deleting the + # backing file under Windows + scores, score = None, None + for _ in range(3): + try: + os.unlink(tf.name) + break + except OSError: + sleep(1.0) + + +def test_permutation_test_score_pandas(): + # check permutation_test_score doesn't destroy pandas dataframe + types = [(MockDataFrame, MockDataFrame)] + try: + from pandas import DataFrame, Series + + types.append((Series, DataFrame)) + except ImportError: + pass + for TargetType, InputFeatureType in types: + # X dataframe, y series + iris = load_iris() + X, y = iris.data, iris.target + X_df, y_ser = InputFeatureType(X), TargetType(y) + check_df = 
lambda x: isinstance(x, InputFeatureType) + check_series = lambda x: isinstance(x, TargetType) + clf = CheckingClassifier(check_X=check_df, check_y=check_series) + permutation_test_score(clf, X_df, y_ser) + + +def test_fit_and_score_failing(): + # Create a failing classifier to deliberately fail + failing_clf = FailingClassifier(FailingClassifier.FAILING_PARAMETER) + # dummy X data + X = np.arange(1, 10) + train, test = np.arange(0, 5), np.arange(5, 9) + fit_and_score_args = dict( + estimator=failing_clf, + X=X, + y=None, + scorer=dict(), + train=train, + test=test, + verbose=0, + parameters=None, + fit_params=None, + score_params=None, + ) + # passing error score to trigger the warning message + fit_and_score_args["error_score"] = "raise" + # check if exception was raised, with default error_score='raise' + with pytest.raises(ValueError, match="Failing classifier failed as required"): + _fit_and_score(**fit_and_score_args) + + assert failing_clf.score() == 0.0 # FailingClassifier coverage + + +def test_fit_and_score_working(): + X, y = make_classification(n_samples=30, random_state=0) + clf = SVC(kernel="linear", random_state=0) + train, test = next(ShuffleSplit().split(X)) + # Test return_parameters option + fit_and_score_args = dict( + estimator=clf, + X=X, + y=y, + scorer=dict(), + train=train, + test=test, + verbose=0, + parameters={"max_iter": 100, "tol": 0.1}, + fit_params=None, + score_params=None, + return_parameters=True, + ) + result = _fit_and_score(**fit_and_score_args) + assert result["parameters"] == fit_and_score_args["parameters"] + + +class DataDependentFailingClassifier(BaseEstimator): + def __init__(self, max_x_value=None): + self.max_x_value = max_x_value + + def fit(self, X, y=None): + num_values_too_high = (X > self.max_x_value).sum() + if num_values_too_high: + raise ValueError( + f"Classifier fit failed with {num_values_too_high} values too high" + ) + + def score(self, X=None, Y=None): + return 0.0 + + +@pytest.mark.parametrize("error_score", [np.nan, 0]) +def test_cross_validate_some_failing_fits_warning(error_score): + # Create a failing classifier to deliberately fail + failing_clf = DataDependentFailingClassifier(max_x_value=8) + # dummy X data + X = np.arange(1, 10) + y = np.ones(9) + # passing error score to trigger the warning message + cross_validate_args = [failing_clf, X, y] + cross_validate_kwargs = {"cv": 3, "error_score": error_score} + # check if the warning message type is as expected + + individual_fit_error_message = ( + "ValueError: Classifier fit failed with 1 values too high" + ) + warning_message = re.compile( + ( + "2 fits failed.+total of 3.+The score on these" + " train-test partitions for these parameters will be set to" + f" {cross_validate_kwargs['error_score']}.+{individual_fit_error_message}" + ), + flags=re.DOTALL, + ) + + with pytest.warns(FitFailedWarning, match=warning_message): + cross_validate(*cross_validate_args, **cross_validate_kwargs) + + +@pytest.mark.parametrize("error_score", [np.nan, 0]) +def test_cross_validate_all_failing_fits_error(error_score): + # Create a failing classifier to deliberately fail + failing_clf = FailingClassifier(FailingClassifier.FAILING_PARAMETER) + # dummy X data + X = np.arange(1, 10) + y = np.ones(9) + + cross_validate_args = [failing_clf, X, y] + cross_validate_kwargs = {"cv": 7, "error_score": error_score} + + individual_fit_error_message = "ValueError: Failing classifier failed as required" + error_message = re.compile( + ( + "All the 7 fits failed.+your model is misconfigured.+" + 
f"{individual_fit_error_message}" + ), + flags=re.DOTALL, + ) + + with pytest.raises(ValueError, match=error_message): + cross_validate(*cross_validate_args, **cross_validate_kwargs) + + +def _failing_scorer(estimator, X, y, error_msg): + raise ValueError(error_msg) + + +@pytest.mark.filterwarnings("ignore:lbfgs failed to converge") +@pytest.mark.parametrize("error_score", [np.nan, 0, "raise"]) +def test_cross_val_score_failing_scorer(error_score): + # check that an estimator can fail during scoring in `cross_val_score` and + # that we can optionally replaced it with `error_score` + X, y = load_iris(return_X_y=True) + clf = LogisticRegression(max_iter=5).fit(X, y) + + error_msg = "This scorer is supposed to fail!!!" + failing_scorer = partial(_failing_scorer, error_msg=error_msg) + + if error_score == "raise": + with pytest.raises(ValueError, match=error_msg): + cross_val_score( + clf, X, y, cv=3, scoring=failing_scorer, error_score=error_score + ) + else: + warning_msg = ( + "Scoring failed. The score on this train-test partition for " + f"these parameters will be set to {error_score}" + ) + with pytest.warns(UserWarning, match=warning_msg): + scores = cross_val_score( + clf, X, y, cv=3, scoring=failing_scorer, error_score=error_score + ) + assert_allclose(scores, error_score) + + +@pytest.mark.filterwarnings("ignore:lbfgs failed to converge") +@pytest.mark.parametrize("error_score", [np.nan, 0, "raise"]) +@pytest.mark.parametrize("return_train_score", [True, False]) +@pytest.mark.parametrize("with_multimetric", [False, True]) +def test_cross_validate_failing_scorer( + error_score, return_train_score, with_multimetric +): + # Check that an estimator can fail during scoring in `cross_validate` and + # that we can optionally replace it with `error_score`. In the multimetric + # case also check the result of a non-failing scorer where the other scorers + # are failing. + X, y = load_iris(return_X_y=True) + clf = LogisticRegression(max_iter=5).fit(X, y) + + error_msg = "This scorer is supposed to fail!!!" + failing_scorer = partial(_failing_scorer, error_msg=error_msg) + if with_multimetric: + non_failing_scorer = make_scorer(mean_squared_error) + scoring = { + "score_1": failing_scorer, + "score_2": non_failing_scorer, + "score_3": failing_scorer, + } + else: + scoring = failing_scorer + + if error_score == "raise": + with pytest.raises(ValueError, match=error_msg): + cross_validate( + clf, + X, + y, + cv=3, + scoring=scoring, + return_train_score=return_train_score, + error_score=error_score, + ) + else: + warning_msg = ( + "Scoring failed. The score on this train-test partition for " + f"these parameters will be set to {error_score}" + ) + with pytest.warns(UserWarning, match=warning_msg): + results = cross_validate( + clf, + X, + y, + cv=3, + scoring=scoring, + return_train_score=return_train_score, + error_score=error_score, + ) + for key in results: + if "_score" in key: + if "_score_2" in key: + # check the test (and optionally train) score for the + # scorer that should be non-failing + for i in results[key]: + assert isinstance(i, float) + else: + # check the test (and optionally train) score for all + # scorers that should be assigned to `error_score`. + assert_allclose(results[key], error_score) + + +def three_params_scorer(i, j, k): + return 3.4213 + + +@pytest.mark.parametrize( + "train_score, scorer, verbose, split_prg, cdt_prg, expected", + [ + ( + False, + three_params_scorer, + 2, + (1, 3), + (0, 1), + r"\[CV\] END ...................................................." 
+ r" total time= 0.\ds", + ), + ( + True, + _MultimetricScorer( + scorers={"sc1": three_params_scorer, "sc2": three_params_scorer} + ), + 3, + (1, 3), + (0, 1), + r"\[CV 2/3\] END sc1: \(train=3.421, test=3.421\) sc2: " + r"\(train=3.421, test=3.421\) total time= 0.\ds", + ), + ( + False, + _MultimetricScorer( + scorers={"sc1": three_params_scorer, "sc2": three_params_scorer} + ), + 10, + (1, 3), + (0, 1), + r"\[CV 2/3; 1/1\] END ....... sc1: \(test=3.421\) sc2: \(test=3.421\)" + r" total time= 0.\ds", + ), + ], +) +def test_fit_and_score_verbosity( + capsys, train_score, scorer, verbose, split_prg, cdt_prg, expected +): + X, y = make_classification(n_samples=30, random_state=0) + clf = SVC(kernel="linear", random_state=0) + train, test = next(ShuffleSplit().split(X)) + + # test print without train score + fit_and_score_args = dict( + estimator=clf, + X=X, + y=y, + scorer=scorer, + train=train, + test=test, + verbose=verbose, + parameters=None, + fit_params=None, + score_params=None, + return_train_score=train_score, + split_progress=split_prg, + candidate_progress=cdt_prg, + ) + _fit_and_score(**fit_and_score_args) + out, _ = capsys.readouterr() + outlines = out.split("\n") + if len(outlines) > 2: + assert re.match(expected, outlines[1]) + else: + assert re.match(expected, outlines[0]) + + +def test_score(): + error_message = "scoring must return a number, got None" + + def two_params_scorer(estimator, X_test): + return None + + with pytest.raises(ValueError, match=error_message): + _score( + estimator=None, + X_test=None, + y_test=None, + scorer=two_params_scorer, + score_params=None, + error_score=np.nan, + ) + + +def test_callable_multimetric_confusion_matrix_cross_validate(): + def custom_scorer(clf, X, y): + y_pred = clf.predict(X) + cm = confusion_matrix(y, y_pred) + return {"tn": cm[0, 0], "fp": cm[0, 1], "fn": cm[1, 0], "tp": cm[1, 1]} + + X, y = make_classification(n_samples=40, n_features=4, random_state=42) + est = LinearSVC(random_state=42) + est.fit(X, y) + cv_results = cross_validate(est, X, y, cv=5, scoring=custom_scorer) + + score_names = ["tn", "fp", "fn", "tp"] + for name in score_names: + assert "test_{}".format(name) in cv_results + + +def test_learning_curve_partial_fit_regressors(): + """Check that regressors with partial_fit is supported. + + Non-regression test for #22981. 
+ """ + X, y = make_regression(random_state=42) + + # Does not error + learning_curve(MLPRegressor(), X, y, exploit_incremental_learning=True, cv=2) + + +def test_learning_curve_some_failing_fits_warning(global_random_seed): + """Checks for fit failures in `learning_curve` and raises the required warning""" + + X, y = make_classification( + n_samples=30, + n_classes=3, + n_informative=6, + shuffle=False, + random_state=global_random_seed, + ) + # sorting the target to trigger SVC error on the 2 first splits because a single + # class is present + sorted_idx = np.argsort(y) + X, y = X[sorted_idx], y[sorted_idx] + + svc = SVC() + warning_message = "10 fits failed out of a total of 25" + + with pytest.warns(FitFailedWarning, match=warning_message): + _, train_score, test_score, *_ = learning_curve( + svc, X, y, cv=5, error_score=np.nan + ) + + # the first 2 splits should lead to warnings and thus np.nan scores + for idx in range(2): + assert np.isnan(train_score[idx]).all() + assert np.isnan(test_score[idx]).all() + + for idx in range(2, train_score.shape[0]): + assert not np.isnan(train_score[idx]).any() + assert not np.isnan(test_score[idx]).any() + + +def test_cross_validate_return_indices(global_random_seed): + """Check the behaviour of `return_indices` in `cross_validate`.""" + X, y = load_iris(return_X_y=True) + X = scale(X) # scale features for better convergence + estimator = LogisticRegression() + + cv = KFold(n_splits=3, shuffle=True, random_state=global_random_seed) + cv_results = cross_validate(estimator, X, y, cv=cv, n_jobs=2, return_indices=False) + assert "indices" not in cv_results + + cv_results = cross_validate(estimator, X, y, cv=cv, n_jobs=2, return_indices=True) + assert "indices" in cv_results + train_indices = cv_results["indices"]["train"] + test_indices = cv_results["indices"]["test"] + assert len(train_indices) == cv.n_splits + assert len(test_indices) == cv.n_splits + + assert_array_equal([indices.size for indices in train_indices], 100) + assert_array_equal([indices.size for indices in test_indices], 50) + + for split_idx, (expected_train_idx, expected_test_idx) in enumerate(cv.split(X, y)): + assert_array_equal(train_indices[split_idx], expected_train_idx) + assert_array_equal(test_indices[split_idx], expected_test_idx) + + +# Tests for metadata routing in cross_val* and in *curve +# ====================================================== + + +# TODO(1.8): remove `learning_curve`, `validation_curve` and `permutation_test_score`. 
+@pytest.mark.parametrize( + "func, extra_args", + [ + (learning_curve, {}), + (permutation_test_score, {}), + (validation_curve, {"param_name": "alpha", "param_range": np.array([1])}), + ], +) +def test_fit_param_deprecation(func, extra_args): + """Check that we warn about deprecating `fit_params`.""" + with pytest.warns(FutureWarning, match="`fit_params` is deprecated"): + func( + estimator=ConsumingClassifier(), X=X, y=y, cv=2, fit_params={}, **extra_args + ) + + with pytest.raises( + ValueError, match="`params` and `fit_params` cannot both be provided" + ): + func( + estimator=ConsumingClassifier(), + X=X, + y=y, + fit_params={}, + params={}, + **extra_args, + ) + + +@pytest.mark.parametrize( + "func, extra_args", + [ + (cross_validate, {}), + (cross_val_score, {}), + (cross_val_predict, {}), + (learning_curve, {}), + (permutation_test_score, {}), + (validation_curve, {"param_name": "alpha", "param_range": np.array([1])}), + ], +) +@config_context(enable_metadata_routing=True) +def test_groups_with_routing_validation(func, extra_args): + """Check that we raise an error if `groups` are passed to the cv method instead + of `params` when metadata routing is enabled. + """ + with pytest.raises(ValueError, match="`groups` can only be passed if"): + func( + estimator=ConsumingClassifier(), + X=X, + y=y, + groups=[], + **extra_args, + ) + + +@pytest.mark.parametrize( + "func, extra_args", + [ + (cross_validate, {}), + (cross_val_score, {}), + (cross_val_predict, {}), + (learning_curve, {}), + (permutation_test_score, {}), + (validation_curve, {"param_name": "alpha", "param_range": np.array([1])}), + ], +) +@config_context(enable_metadata_routing=True) +def test_cross_validate_params_none(func, extra_args): + """Test that no errors are raised when passing `params=None`, which is the + default value. 
+ Non-regression test for: https://github.com/scikit-learn/scikit-learn/issues/30447 + """ + X, y = make_classification(n_samples=100, n_classes=2, random_state=0) + func(estimator=ConsumingClassifier(), X=X, y=y, **extra_args) + + +@pytest.mark.parametrize( + "func, extra_args", + [ + (cross_validate, {}), + (cross_val_score, {}), + (cross_val_predict, {}), + (learning_curve, {}), + (permutation_test_score, {}), + (validation_curve, {"param_name": "alpha", "param_range": np.array([1])}), + ], +) +@config_context(enable_metadata_routing=True) +def test_passed_unrequested_metadata(func, extra_args): + """Check that we raise an error when passing metadata that is not + requested.""" + + err_msg = re.escape( + "[metadata] are passed but are not explicitly set as requested or not " + "requested for ConsumingClassifier.fit, which is used within" + ) + with pytest.raises(UnsetMetadataPassedError, match=err_msg): + func( + estimator=ConsumingClassifier(), + X=X, + y=y2, + params=dict(metadata=[]), + **extra_args, + ) + + # cross_val_predict doesn't use scoring + if func == cross_val_predict: + return + + err_msg = re.escape( + "[metadata] are passed but are not explicitly set as requested or not " + "requested for ConsumingClassifier.score, which is used within" + ) + with pytest.raises(UnsetMetadataPassedError, match=err_msg): + func( + estimator=ConsumingClassifier() + .set_fit_request(metadata=True) + .set_partial_fit_request(metadata=True), + X=X, + y=y2, + params=dict(metadata=[]), + **extra_args, + ) + + +@pytest.mark.parametrize( + "func, extra_args", + [ + (cross_validate, {}), + (cross_val_score, {}), + (cross_val_predict, {}), + (learning_curve, {}), + (permutation_test_score, {}), + (validation_curve, {"param_name": "alpha", "param_range": np.array([1])}), + ], +) +@config_context(enable_metadata_routing=True) +def test_validation_functions_routing(func, extra_args): + """Check that the respective cv method is properly dispatching the metadata + to the consumer.""" + scorer_registry = _Registry() + scorer = ConsumingScorer(registry=scorer_registry).set_score_request( + sample_weight="score_weights", metadata="score_metadata" + ) + splitter_registry = _Registry() + splitter = ConsumingSplitter(registry=splitter_registry).set_split_request( + groups="split_groups", metadata="split_metadata" + ) + estimator_registry = _Registry() + estimator = ConsumingClassifier(registry=estimator_registry).set_fit_request( + sample_weight="fit_sample_weight", metadata="fit_metadata" + ) + + n_samples = _num_samples(X) + rng = np.random.RandomState(0) + score_weights = rng.rand(n_samples) + score_metadata = rng.rand(n_samples) + split_groups = rng.randint(0, 3, n_samples) + split_metadata = rng.rand(n_samples) + fit_sample_weight = rng.rand(n_samples) + fit_metadata = rng.rand(n_samples) + + scoring_args = { + cross_validate: dict(scoring=dict(my_scorer=scorer, accuracy="accuracy")), + cross_val_score: dict(scoring=scorer), + learning_curve: dict(scoring=scorer), + validation_curve: dict(scoring=scorer), + permutation_test_score: dict(scoring=scorer), + cross_val_predict: dict(), + } + + params = dict( + split_groups=split_groups, + split_metadata=split_metadata, + fit_sample_weight=fit_sample_weight, + fit_metadata=fit_metadata, + ) + + if func is not cross_val_predict: + params.update( + score_weights=score_weights, + score_metadata=score_metadata, + ) + + func( + estimator, + X=X, + y=y, + cv=splitter, + **scoring_args[func], + **extra_args, + params=params, + ) + + if func is not cross_val_predict: + 
# cross_val_predict doesn't need a scorer + assert len(scorer_registry) + for _scorer in scorer_registry: + check_recorded_metadata( + obj=_scorer, + method="score", + parent=func.__name__, + split_params=("sample_weight", "metadata"), + sample_weight=score_weights, + metadata=score_metadata, + ) + + assert len(splitter_registry) + for _splitter in splitter_registry: + check_recorded_metadata( + obj=_splitter, + method="split", + parent=func.__name__, + groups=split_groups, + metadata=split_metadata, + ) + + assert len(estimator_registry) + for _estimator in estimator_registry: + check_recorded_metadata( + obj=_estimator, + method="fit", + parent=func.__name__, + split_params=("sample_weight", "metadata"), + sample_weight=fit_sample_weight, + metadata=fit_metadata, + ) + + +@config_context(enable_metadata_routing=True) +def test_learning_curve_exploit_incremental_learning_routing(): + """Test that learning_curve routes metadata to the estimator correctly while + partial_fitting it with `exploit_incremental_learning=True`.""" + + n_samples = _num_samples(X) + rng = np.random.RandomState(0) + fit_sample_weight = rng.rand(n_samples) + fit_metadata = rng.rand(n_samples) + + estimator_registry = _Registry() + estimator = ConsumingClassifier( + registry=estimator_registry + ).set_partial_fit_request( + sample_weight="fit_sample_weight", metadata="fit_metadata" + ) + + learning_curve( + estimator, + X=X, + y=y, + cv=ConsumingSplitter(), + exploit_incremental_learning=True, + params=dict(fit_sample_weight=fit_sample_weight, fit_metadata=fit_metadata), + ) + + assert len(estimator_registry) + for _estimator in estimator_registry: + check_recorded_metadata( + obj=_estimator, + method="partial_fit", + parent="learning_curve", + split_params=("sample_weight", "metadata"), + sample_weight=fit_sample_weight, + metadata=fit_metadata, + ) + + +# End of metadata routing tests +# ============================= diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4e0de99f5e7e37bb92643ad29f3c859c689d4918 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/__init__.py @@ -0,0 +1,42 @@ +"""The k-nearest neighbors algorithms.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ._ball_tree import BallTree +from ._base import VALID_METRICS, VALID_METRICS_SPARSE, sort_graph_by_row_values +from ._classification import KNeighborsClassifier, RadiusNeighborsClassifier +from ._graph import ( + KNeighborsTransformer, + RadiusNeighborsTransformer, + kneighbors_graph, + radius_neighbors_graph, +) +from ._kd_tree import KDTree +from ._kde import KernelDensity +from ._lof import LocalOutlierFactor +from ._nca import NeighborhoodComponentsAnalysis +from ._nearest_centroid import NearestCentroid +from ._regression import KNeighborsRegressor, RadiusNeighborsRegressor +from ._unsupervised import NearestNeighbors + +__all__ = [ + "VALID_METRICS", + "VALID_METRICS_SPARSE", + "BallTree", + "KDTree", + "KNeighborsClassifier", + "KNeighborsRegressor", + "KNeighborsTransformer", + "KernelDensity", + "LocalOutlierFactor", + "NearestCentroid", + "NearestNeighbors", + "NeighborhoodComponentsAnalysis", + "RadiusNeighborsClassifier", + "RadiusNeighborsRegressor", + "RadiusNeighborsTransformer", + "kneighbors_graph", + "radius_neighbors_graph", + "sort_graph_by_row_values", +] diff --git 
a/.venv/lib/python3.12/site-packages/sklearn/neighbors/_ball_tree.pyx.tp b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_ball_tree.pyx.tp new file mode 100644 index 0000000000000000000000000000000000000000..44d876187c54f370a6acaa72645c39371526fac8 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_ball_tree.pyx.tp @@ -0,0 +1,284 @@ +{{py: + +# Generated file: _ball_tree.pyx + +implementation_specific_values = [ + # The values are arranged as follows: + # + # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE + # + ('64', 'float64_t', 'np.float64'), + ('32', 'float32_t', 'np.float32') +] + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +}} + + +__all__ = ['BallTree', 'BallTree64', 'BallTree32'] + +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +DOC_DICT{{name_suffix}} = { + 'BinaryTree': 'BallTree{{name_suffix}}', + 'binary_tree': 'ball_tree{{name_suffix}}', +} + +VALID_METRICS{{name_suffix}} = [ + 'BrayCurtisDistance{{name_suffix}}', + 'CanberraDistance{{name_suffix}}', + 'ChebyshevDistance{{name_suffix}}', + 'DiceDistance{{name_suffix}}', + 'EuclideanDistance{{name_suffix}}', + 'HammingDistance{{name_suffix}}', + 'HaversineDistance{{name_suffix}}', + 'JaccardDistance{{name_suffix}}', + 'MahalanobisDistance{{name_suffix}}', + 'ManhattanDistance{{name_suffix}}', + 'MinkowskiDistance{{name_suffix}}', + 'PyFuncDistance{{name_suffix}}', + 'RogersTanimotoDistance{{name_suffix}}', + 'RussellRaoDistance{{name_suffix}}', + 'SEuclideanDistance{{name_suffix}}', + 'SokalMichenerDistance{{name_suffix}}', + 'SokalSneathDistance{{name_suffix}}', + 'WMinkowskiDistance{{name_suffix}}', +] + +{{endfor}} + +include "_binary_tree.pxi" + +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +# Inherit BallTree{{name_suffix}} from BinaryTree{{name_suffix}} +cdef class BallTree{{name_suffix}}(BinaryTree{{name_suffix}}): + __doc__ = CLASS_DOC.format(**DOC_DICT{{name_suffix}}) + pass + +{{endfor}} + + +#---------------------------------------------------------------------- +# The functions below specialized the Binary Tree as a Ball Tree +# +# Note that these functions use the concept of "reduced distance". +# The reduced distance, defined for some metrics, is a quantity which +# is more efficient to compute than the distance, but preserves the +# relative rankings of the true distance. For example, the reduced +# distance for the Euclidean metric is the squared-euclidean distance. +# For some metrics, the reduced distance is simply the distance. 
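#
# As a concrete example, for the Euclidean metric the reduced distance
# between p = (0, 0) and q = (3, 4) is 3**2 + 4**2 = 25, while the true
# distance is sqrt(25) = 5.  Since sqrt is monotonic, comparisons and
# pruning decisions based on reduced distances agree with those based on
# true distances; converting back to a true distance (as done with
# _rdist_to_dist when setting the node radius below) is only needed when
# a distance is actually reported.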
+ +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +cdef int allocate_data{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t n_nodes, + intp_t n_features, +) except -1: + """Allocate arrays needed for the KD Tree""" + tree.node_bounds = np.zeros((1, n_nodes, n_features), dtype={{INPUT_DTYPE}}) + return 0 + + +cdef int init_node{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + NodeData_t[::1] node_data, + intp_t i_node, + intp_t idx_start, + intp_t idx_end, +) except -1: + """Initialize the node for the dataset stored in tree.data""" + cdef intp_t n_features = tree.data.shape[1] + cdef intp_t n_points = idx_end - idx_start + + cdef intp_t i, j + cdef float64_t radius + cdef const {{INPUT_DTYPE_t}} *this_pt + + cdef intp_t* idx_array = &tree.idx_array[0] + cdef const {{INPUT_DTYPE_t}}* data = &tree.data[0, 0] + cdef {{INPUT_DTYPE_t}}* centroid = &tree.node_bounds[0, i_node, 0] + + cdef bint with_sample_weight = tree.sample_weight is not None + cdef const {{INPUT_DTYPE_t}}* sample_weight + cdef float64_t sum_weight_node + if with_sample_weight: + sample_weight = &tree.sample_weight[0] + + # determine Node centroid + for j in range(n_features): + centroid[j] = 0 + + if with_sample_weight: + sum_weight_node = 0 + for i in range(idx_start, idx_end): + sum_weight_node += sample_weight[idx_array[i]] + this_pt = data + n_features * idx_array[i] + for j from 0 <= j < n_features: + centroid[j] += this_pt[j] * sample_weight[idx_array[i]] + + for j in range(n_features): + centroid[j] /= sum_weight_node + else: + for i in range(idx_start, idx_end): + this_pt = data + n_features * idx_array[i] + for j from 0 <= j < n_features: + centroid[j] += this_pt[j] + + for j in range(n_features): + centroid[j] /= n_points + + # determine Node radius + radius = 0 + for i in range(idx_start, idx_end): + radius = fmax(radius, + tree.rdist(centroid, + data + n_features * idx_array[i], + n_features)) + + node_data[i_node].radius = tree.dist_metric._rdist_to_dist(radius) + node_data[i_node].idx_start = idx_start + node_data[i_node].idx_end = idx_end + return 0 + + +cdef inline float64_t min_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, +) except -1 nogil: + """Compute the minimum distance between a point and a node""" + cdef float64_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0], + tree.data.shape[1]) + return fmax(0, dist_pt - tree.node_data[i_node].radius) + + +cdef inline float64_t max_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, +) except -1: + """Compute the maximum distance between a point and a node""" + cdef float64_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0], + tree.data.shape[1]) + return dist_pt + tree.node_data[i_node].radius + + +cdef inline int min_max_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, + float64_t* min_dist, + float64_t* max_dist, +) except -1 nogil: + """Compute the minimum and maximum distance between a point and a node""" + cdef float64_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0], + tree.data.shape[1]) + cdef float64_t rad = tree.node_data[i_node].radius + min_dist[0] = fmax(0, dist_pt - rad) + max_dist[0] = dist_pt + rad + return 0 + + +cdef inline float64_t min_rdist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, +) except -1 nogil: + """Compute the minimum reduced-distance between a 
point and a node""" + if tree.euclidean: + return euclidean_dist_to_rdist{{name_suffix}}( + min_dist{{name_suffix}}(tree, i_node, pt) + ) + else: + return tree.dist_metric._dist_to_rdist( + min_dist{{name_suffix}}(tree, i_node, pt) + ) + + +cdef inline float64_t max_rdist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, +) except -1: + """Compute the maximum reduced-distance between a point and a node""" + if tree.euclidean: + return euclidean_dist_to_rdist{{name_suffix}}( + max_dist{{name_suffix}}(tree, i_node, pt) + ) + else: + return tree.dist_metric._dist_to_rdist( + max_dist{{name_suffix}}(tree, i_node, pt) + ) + + +cdef inline float64_t min_dist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """compute the minimum distance between two nodes""" + cdef float64_t dist_pt = tree1.dist(&tree2.node_bounds[0, i_node2, 0], + &tree1.node_bounds[0, i_node1, 0], + tree1.data.shape[1]) + return fmax(0, (dist_pt - tree1.node_data[i_node1].radius + - tree2.node_data[i_node2].radius)) + + +cdef inline float64_t max_dist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """compute the maximum distance between two nodes""" + cdef float64_t dist_pt = tree1.dist(&tree2.node_bounds[0, i_node2, 0], + &tree1.node_bounds[0, i_node1, 0], + tree1.data.shape[1]) + return (dist_pt + tree1.node_data[i_node1].radius + + tree2.node_data[i_node2].radius) + + +cdef inline float64_t min_rdist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """compute the minimum reduced distance between two nodes""" + if tree1.euclidean: + return euclidean_dist_to_rdist{{name_suffix}}( + min_dist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + else: + return tree1.dist_metric._dist_to_rdist( + min_dist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + + +cdef inline float64_t max_rdist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """compute the maximum reduced distance between two nodes""" + if tree1.euclidean: + return euclidean_dist_to_rdist{{name_suffix}}( + max_dist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + else: + return tree1.dist_metric._dist_to_rdist( + max_dist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + +{{endfor}} + + +class BallTree(BallTree64): + __doc__ = CLASS_DOC.format(BinaryTree="BallTree") + pass diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/_base.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_base.py new file mode 100644 index 0000000000000000000000000000000000000000..767eee1358aa873808ab7796d080cea06bae97bc --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_base.py @@ -0,0 +1,1404 @@ +"""Base and mixin classes for nearest neighbors.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import itertools +import numbers +import warnings +from abc import ABCMeta, abstractmethod +from functools import partial +from numbers import Integral, Real + +import numpy as np +from joblib import effective_n_jobs +from scipy.sparse import csr_matrix, issparse + +from ..base import BaseEstimator, MultiOutputMixin, is_classifier +from ..exceptions import 
DataConversionWarning, EfficiencyWarning +from ..metrics import DistanceMetric, pairwise_distances_chunked +from ..metrics._pairwise_distances_reduction import ( + ArgKmin, + RadiusNeighbors, +) +from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS +from ..utils import ( + check_array, + gen_even_slices, + get_tags, +) +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.fixes import parse_version, sp_base_version +from ..utils.multiclass import check_classification_targets +from ..utils.parallel import Parallel, delayed +from ..utils.validation import _to_object_array, check_is_fitted, validate_data +from ._ball_tree import BallTree +from ._kd_tree import KDTree + +SCIPY_METRICS = [ + "braycurtis", + "canberra", + "chebyshev", + "correlation", + "cosine", + "dice", + "hamming", + "jaccard", + "mahalanobis", + "minkowski", + "rogerstanimoto", + "russellrao", + "seuclidean", + "sokalsneath", + "sqeuclidean", + "yule", +] +if sp_base_version < parse_version("1.17"): + # Deprecated in SciPy 1.15 and removed in SciPy 1.17 + SCIPY_METRICS += ["sokalmichener"] +if sp_base_version < parse_version("1.11"): + # Deprecated in SciPy 1.9 and removed in SciPy 1.11 + SCIPY_METRICS += ["kulsinski"] +if sp_base_version < parse_version("1.9"): + # Deprecated in SciPy 1.0 and removed in SciPy 1.9 + SCIPY_METRICS += ["matching"] + +VALID_METRICS = dict( + ball_tree=BallTree.valid_metrics, + kd_tree=KDTree.valid_metrics, + # The following list comes from the + # sklearn.metrics.pairwise doc string + brute=sorted(set(PAIRWISE_DISTANCE_FUNCTIONS).union(SCIPY_METRICS)), +) + +VALID_METRICS_SPARSE = dict( + ball_tree=[], + kd_tree=[], + brute=(PAIRWISE_DISTANCE_FUNCTIONS.keys() - {"haversine", "nan_euclidean"}), +) + + +def _get_weights(dist, weights): + """Get the weights from an array of distances and a parameter ``weights``. + + Assume weights have already been validated. + + Parameters + ---------- + dist : ndarray + The input distances. + + weights : {'uniform', 'distance'}, callable or None + The kind of weighting used. + + Returns + ------- + weights_arr : array of the same shape as ``dist`` + If ``weights == 'uniform'``, then returns None. + """ + if weights in (None, "uniform"): + return None + + if weights == "distance": + # if user attempts to classify a point that was zero distance from one + # or more training points, those training points are weighted as 1.0 + # and the other points as 0.0 + if dist.dtype is np.dtype(object): + for point_dist_i, point_dist in enumerate(dist): + # check if point_dist is iterable + # (ex: RadiusNeighborClassifier.predict may set an element of + # dist to 1e-6 to represent an 'outlier') + if hasattr(point_dist, "__contains__") and 0.0 in point_dist: + dist[point_dist_i] = point_dist == 0.0 + else: + dist[point_dist_i] = 1.0 / point_dist + else: + with np.errstate(divide="ignore"): + dist = 1.0 / dist + inf_mask = np.isinf(dist) + inf_row = np.any(inf_mask, axis=1) + dist[inf_row] = inf_mask[inf_row] + return dist + + if callable(weights): + return weights(dist) + + +def _is_sorted_by_data(graph): + """Return whether the graph's non-zero entries are sorted by data. + + The non-zero entries are stored in graph.data and graph.indices. + For each row (or sample), the non-zero entries can be either: + - sorted by indices, as after graph.sort_indices(); + - sorted by data, as after _check_precomputed(graph); + - not sorted. 
+ + Parameters + ---------- + graph : sparse matrix of shape (n_samples, n_samples) + Neighbors graph as given by `kneighbors_graph` or + `radius_neighbors_graph`. Matrix should be of format CSR format. + + Returns + ------- + res : bool + Whether input graph is sorted by data. + """ + assert graph.format == "csr" + out_of_order = graph.data[:-1] > graph.data[1:] + line_change = np.unique(graph.indptr[1:-1] - 1) + line_change = line_change[line_change < out_of_order.shape[0]] + return out_of_order.sum() == out_of_order[line_change].sum() + + +def _check_precomputed(X): + """Check precomputed distance matrix. + + If the precomputed distance matrix is sparse, it checks that the non-zero + entries are sorted by distances. If not, the matrix is copied and sorted. + + Parameters + ---------- + X : {sparse matrix, array-like}, (n_samples, n_samples) + Distance matrix to other samples. X may be a sparse matrix, in which + case only non-zero elements may be considered neighbors. + + Returns + ------- + X : {sparse matrix, array-like}, (n_samples, n_samples) + Distance matrix to other samples. X may be a sparse matrix, in which + case only non-zero elements may be considered neighbors. + """ + if not issparse(X): + X = check_array(X, ensure_non_negative=True, input_name="X") + return X + else: + graph = X + + if graph.format not in ("csr", "csc", "coo", "lil"): + raise TypeError( + "Sparse matrix in {!r} format is not supported due to " + "its handling of explicit zeros".format(graph.format) + ) + copied = graph.format != "csr" + graph = check_array( + graph, + accept_sparse="csr", + ensure_non_negative=True, + input_name="precomputed distance matrix", + ) + graph = sort_graph_by_row_values(graph, copy=not copied, warn_when_not_sorted=True) + + return graph + + +@validate_params( + { + "graph": ["sparse matrix"], + "copy": ["boolean"], + "warn_when_not_sorted": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def sort_graph_by_row_values(graph, copy=False, warn_when_not_sorted=True): + """Sort a sparse graph such that each row is stored with increasing values. + + .. versionadded:: 1.2 + + Parameters + ---------- + graph : sparse matrix of shape (n_samples, n_samples) + Distance matrix to other samples, where only non-zero elements are + considered neighbors. Matrix is converted to CSR format if not already. + + copy : bool, default=False + If True, the graph is copied before sorting. If False, the sorting is + performed inplace. If the graph is not of CSR format, `copy` must be + True to allow the conversion to CSR format, otherwise an error is + raised. + + warn_when_not_sorted : bool, default=True + If True, a :class:`~sklearn.exceptions.EfficiencyWarning` is raised + when the input graph is not sorted by row values. + + Returns + ------- + graph : sparse matrix of shape (n_samples, n_samples) + Distance matrix to other samples, where only non-zero elements are + considered neighbors. Matrix is in CSR format. + + Examples + -------- + >>> from scipy.sparse import csr_matrix + >>> from sklearn.neighbors import sort_graph_by_row_values + >>> X = csr_matrix( + ... [[0., 3., 1.], + ... [3., 0., 2.], + ... [1., 2., 0.]]) + >>> X.data + array([3., 1., 3., 2., 1., 2.]) + >>> X_ = sort_graph_by_row_values(X) + >>> X_.data + array([1., 3., 2., 3., 1., 2.]) + """ + if graph.format == "csr" and _is_sorted_by_data(graph): + return graph + + if warn_when_not_sorted: + warnings.warn( + ( + "Precomputed sparse input was not sorted by row values. 
Use the" + " function sklearn.neighbors.sort_graph_by_row_values to sort the input" + " by row values, with warn_when_not_sorted=False to remove this" + " warning." + ), + EfficiencyWarning, + ) + + if graph.format not in ("csr", "csc", "coo", "lil"): + raise TypeError( + f"Sparse matrix in {graph.format!r} format is not supported due to " + "its handling of explicit zeros" + ) + elif graph.format != "csr": + if not copy: + raise ValueError( + "The input graph is not in CSR format. Use copy=True to allow " + "the conversion to CSR format." + ) + graph = graph.asformat("csr") + elif copy: # csr format with copy=True + graph = graph.copy() + + row_nnz = np.diff(graph.indptr) + if row_nnz.max() == row_nnz.min(): + # if each sample has the same number of provided neighbors + n_samples = graph.shape[0] + distances = graph.data.reshape(n_samples, -1) + + order = np.argsort(distances, kind="mergesort") + order += np.arange(n_samples)[:, None] * row_nnz[0] + order = order.ravel() + graph.data = graph.data[order] + graph.indices = graph.indices[order] + + else: + for start, stop in zip(graph.indptr, graph.indptr[1:]): + order = np.argsort(graph.data[start:stop], kind="mergesort") + graph.data[start:stop] = graph.data[start:stop][order] + graph.indices[start:stop] = graph.indices[start:stop][order] + + return graph + + +def _kneighbors_from_graph(graph, n_neighbors, return_distance): + """Decompose a nearest neighbors sparse graph into distances and indices. + + Parameters + ---------- + graph : sparse matrix of shape (n_samples, n_samples) + Neighbors graph as given by `kneighbors_graph` or + `radius_neighbors_graph`. Matrix should be of format CSR format. + + n_neighbors : int + Number of neighbors required for each sample. + + return_distance : bool + Whether or not to return the distances. + + Returns + ------- + neigh_dist : ndarray of shape (n_samples, n_neighbors) + Distances to nearest neighbors. Only present if `return_distance=True`. + + neigh_ind : ndarray of shape (n_samples, n_neighbors) + Indices of nearest neighbors. + """ + n_samples = graph.shape[0] + assert graph.format == "csr" + + # number of neighbors by samples + row_nnz = np.diff(graph.indptr) + row_nnz_min = row_nnz.min() + if n_neighbors is not None and row_nnz_min < n_neighbors: + raise ValueError( + "%d neighbors per samples are required, but some samples have only" + " %d neighbors in precomputed graph matrix. Decrease number of " + "neighbors used or recompute the graph with more neighbors." + % (n_neighbors, row_nnz_min) + ) + + def extract(a): + # if each sample has the same number of provided neighbors + if row_nnz.max() == row_nnz_min: + return a.reshape(n_samples, -1)[:, :n_neighbors] + else: + idx = np.tile(np.arange(n_neighbors), (n_samples, 1)) + idx += graph.indptr[:-1, None] + return a.take(idx, mode="clip").reshape(n_samples, n_neighbors) + + if return_distance: + return extract(graph.data), extract(graph.indices) + else: + return extract(graph.indices) + + +def _radius_neighbors_from_graph(graph, radius, return_distance): + """Decompose a nearest neighbors sparse graph into distances and indices. + + Parameters + ---------- + graph : sparse matrix of shape (n_samples, n_samples) + Neighbors graph as given by `kneighbors_graph` or + `radius_neighbors_graph`. Matrix should be of format CSR format. + + radius : float + Radius of neighborhoods which should be strictly positive. + + return_distance : bool + Whether or not to return the distances. 
+ + Returns + ------- + neigh_dist : ndarray of shape (n_samples,) of arrays + Distances to nearest neighbors. Only present if `return_distance=True`. + + neigh_ind : ndarray of shape (n_samples,) of arrays + Indices of nearest neighbors. + """ + assert graph.format == "csr" + + no_filter_needed = bool(graph.data.max() <= radius) + + if no_filter_needed: + data, indices, indptr = graph.data, graph.indices, graph.indptr + else: + mask = graph.data <= radius + if return_distance: + data = np.compress(mask, graph.data) + indices = np.compress(mask, graph.indices) + indptr = np.concatenate(([0], np.cumsum(mask)))[graph.indptr] + + indices = indices.astype(np.intp, copy=no_filter_needed) + + if return_distance: + neigh_dist = _to_object_array(np.split(data, indptr[1:-1])) + neigh_ind = _to_object_array(np.split(indices, indptr[1:-1])) + + if return_distance: + return neigh_dist, neigh_ind + else: + return neigh_ind + + +class NeighborsBase(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta): + """Base class for nearest neighbors estimators.""" + + _parameter_constraints: dict = { + "n_neighbors": [Interval(Integral, 1, None, closed="left"), None], + "radius": [Interval(Real, 0, None, closed="both"), None], + "algorithm": [StrOptions({"auto", "ball_tree", "kd_tree", "brute"})], + "leaf_size": [Interval(Integral, 1, None, closed="left")], + "p": [Interval(Real, 0, None, closed="right"), None], + "metric": [StrOptions(set(itertools.chain(*VALID_METRICS.values()))), callable], + "metric_params": [dict, None], + "n_jobs": [Integral, None], + } + + @abstractmethod + def __init__( + self, + n_neighbors=None, + radius=None, + algorithm="auto", + leaf_size=30, + metric="minkowski", + p=2, + metric_params=None, + n_jobs=None, + ): + self.n_neighbors = n_neighbors + self.radius = radius + self.algorithm = algorithm + self.leaf_size = leaf_size + self.metric = metric + self.metric_params = metric_params + self.p = p + self.n_jobs = n_jobs + + def _check_algorithm_metric(self): + if self.algorithm == "auto": + if self.metric == "precomputed": + alg_check = "brute" + elif ( + callable(self.metric) + or self.metric in VALID_METRICS["ball_tree"] + or isinstance(self.metric, DistanceMetric) + ): + alg_check = "ball_tree" + else: + alg_check = "brute" + else: + alg_check = self.algorithm + + if callable(self.metric): + if self.algorithm == "kd_tree": + # callable metric is only valid for brute force and ball_tree + raise ValueError( + "kd_tree does not support callable metric '%s'" + "Function call overhead will result" + "in very poor performance." % self.metric + ) + elif self.metric not in VALID_METRICS[alg_check] and not isinstance( + self.metric, DistanceMetric + ): + raise ValueError( + "Metric '%s' not valid. Use " + "sorted(sklearn.neighbors.VALID_METRICS['%s']) " + "to get valid options. " + "Metric can also be a callable function." % (self.metric, alg_check) + ) + + if self.metric_params is not None and "p" in self.metric_params: + if self.p is not None: + warnings.warn( + ( + "Parameter p is found in metric_params. " + "The corresponding parameter from __init__ " + "is ignored." 
+ ), + SyntaxWarning, + stacklevel=3, + ) + + def _fit(self, X, y=None): + ensure_all_finite = "allow-nan" if get_tags(self).input_tags.allow_nan else True + if self.__sklearn_tags__().target_tags.required: + if not isinstance(X, (KDTree, BallTree, NeighborsBase)): + X, y = validate_data( + self, + X, + y, + accept_sparse="csr", + multi_output=True, + order="C", + ensure_all_finite=ensure_all_finite, + ) + + if is_classifier(self): + # Classification targets require a specific format + if y.ndim == 1 or (y.ndim == 2 and y.shape[1] == 1): + if y.ndim != 1: + warnings.warn( + ( + "A column-vector y was passed when a " + "1d array was expected. Please change " + "the shape of y to (n_samples,), for " + "example using ravel()." + ), + DataConversionWarning, + stacklevel=2, + ) + + self.outputs_2d_ = False + y = y.reshape((-1, 1)) + else: + self.outputs_2d_ = True + + check_classification_targets(y) + self.classes_ = [] + # Using `dtype=np.intp` is necessary since `np.bincount` + # (called in _classification.py) fails when dealing + # with a float64 array on 32bit systems. + self._y = np.empty(y.shape, dtype=np.intp) + for k in range(self._y.shape[1]): + classes, self._y[:, k] = np.unique(y[:, k], return_inverse=True) + self.classes_.append(classes) + + if not self.outputs_2d_: + self.classes_ = self.classes_[0] + self._y = self._y.ravel() + else: + self._y = y + + else: + if not isinstance(X, (KDTree, BallTree, NeighborsBase)): + X = validate_data( + self, + X, + ensure_all_finite=ensure_all_finite, + accept_sparse="csr", + order="C", + ) + + self._check_algorithm_metric() + if self.metric_params is None: + self.effective_metric_params_ = {} + else: + self.effective_metric_params_ = self.metric_params.copy() + + effective_p = self.effective_metric_params_.get("p", self.p) + if self.metric == "minkowski": + self.effective_metric_params_["p"] = effective_p + + self.effective_metric_ = self.metric + # For minkowski distance, use more efficient methods where available + if self.metric == "minkowski": + p = self.effective_metric_params_.pop("p", 2) + w = self.effective_metric_params_.pop("w", None) + + if p == 1 and w is None: + self.effective_metric_ = "manhattan" + elif p == 2 and w is None: + self.effective_metric_ = "euclidean" + elif p == np.inf and w is None: + self.effective_metric_ = "chebyshev" + else: + # Use the generic minkowski metric, possibly weighted. + self.effective_metric_params_["p"] = p + self.effective_metric_params_["w"] = w + + if isinstance(X, NeighborsBase): + self._fit_X = X._fit_X + self._tree = X._tree + self._fit_method = X._fit_method + self.n_samples_fit_ = X.n_samples_fit_ + return self + + elif isinstance(X, BallTree): + self._fit_X = X.data + self._tree = X + self._fit_method = "ball_tree" + self.n_samples_fit_ = X.data.shape[0] + return self + + elif isinstance(X, KDTree): + self._fit_X = X.data + self._tree = X + self._fit_method = "kd_tree" + self.n_samples_fit_ = X.data.shape[0] + return self + + if self.metric == "precomputed": + X = _check_precomputed(X) + # Precomputed matrix X must be squared + if X.shape[0] != X.shape[1]: + raise ValueError( + "Precomputed matrix must be square." 
+ " Input is a {}x{} matrix.".format(X.shape[0], X.shape[1]) + ) + self.n_features_in_ = X.shape[1] + + n_samples = X.shape[0] + if n_samples == 0: + raise ValueError("n_samples must be greater than 0") + + if issparse(X): + if self.algorithm not in ("auto", "brute"): + warnings.warn("cannot use tree with sparse input: using brute force") + + if ( + self.effective_metric_ not in VALID_METRICS_SPARSE["brute"] + and not callable(self.effective_metric_) + and not isinstance(self.effective_metric_, DistanceMetric) + ): + raise ValueError( + "Metric '%s' not valid for sparse input. " + "Use sorted(sklearn.neighbors." + "VALID_METRICS_SPARSE['brute']) " + "to get valid options. " + "Metric can also be a callable function." % (self.effective_metric_) + ) + self._fit_X = X.copy() + self._tree = None + self._fit_method = "brute" + self.n_samples_fit_ = X.shape[0] + return self + + self._fit_method = self.algorithm + self._fit_X = X + self.n_samples_fit_ = X.shape[0] + + if self._fit_method == "auto": + # A tree approach is better for small number of neighbors or small + # number of features, with KDTree generally faster when available + if ( + self.metric == "precomputed" + or self._fit_X.shape[1] > 15 + or ( + self.n_neighbors is not None + and self.n_neighbors >= self._fit_X.shape[0] // 2 + ) + ): + self._fit_method = "brute" + else: + if ( + self.effective_metric_ == "minkowski" + and self.effective_metric_params_["p"] < 1 + ): + self._fit_method = "brute" + elif ( + self.effective_metric_ == "minkowski" + and self.effective_metric_params_.get("w") is not None + ): + # 'minkowski' with weights is not supported by KDTree but is + # supported byBallTree. + self._fit_method = "ball_tree" + elif self.effective_metric_ in VALID_METRICS["kd_tree"]: + self._fit_method = "kd_tree" + elif ( + callable(self.effective_metric_) + or self.effective_metric_ in VALID_METRICS["ball_tree"] + ): + self._fit_method = "ball_tree" + else: + self._fit_method = "brute" + + if ( + self.effective_metric_ == "minkowski" + and self.effective_metric_params_["p"] < 1 + ): + # For 0 < p < 1 Minkowski distances aren't valid distance + # metric as they do not satisfy triangular inequality: + # they are semi-metrics. + # algorithm="kd_tree" and algorithm="ball_tree" can't be used because + # KDTree and BallTree require a proper distance metric to work properly. + # However, the brute-force algorithm supports semi-metrics. + if self._fit_method == "brute": + warnings.warn( + "Mind that for 0 < p < 1, Minkowski metrics are not distance" + " metrics. Continuing the execution with `algorithm='brute'`." + ) + else: # self._fit_method in ("kd_tree", "ball_tree") + raise ValueError( + f'algorithm="{self._fit_method}" does not support 0 < p < 1 for ' + "the Minkowski metric. To resolve this problem either " + 'set p >= 1 or algorithm="brute".' + ) + + if self._fit_method == "ball_tree": + self._tree = BallTree( + X, + self.leaf_size, + metric=self.effective_metric_, + **self.effective_metric_params_, + ) + elif self._fit_method == "kd_tree": + if ( + self.effective_metric_ == "minkowski" + and self.effective_metric_params_.get("w") is not None + ): + raise ValueError( + "algorithm='kd_tree' is not valid for " + "metric='minkowski' with a weight parameter 'w': " + "try algorithm='ball_tree' " + "or algorithm='brute' instead." 
+ ) + self._tree = KDTree( + X, + self.leaf_size, + metric=self.effective_metric_, + **self.effective_metric_params_, + ) + elif self._fit_method == "brute": + self._tree = None + + return self + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + # For cross-validation routines to split data correctly + tags.input_tags.pairwise = self.metric == "precomputed" + # when input is precomputed metric values, all those values need to be positive + tags.input_tags.positive_only = tags.input_tags.pairwise + tags.input_tags.allow_nan = self.metric == "nan_euclidean" + return tags + + +class KNeighborsMixin: + """Mixin for k-neighbors searches.""" + + def _kneighbors_reduce_func(self, dist, start, n_neighbors, return_distance): + """Reduce a chunk of distances to the nearest neighbors. + + Callback to :func:`sklearn.metrics.pairwise.pairwise_distances_chunked` + + Parameters + ---------- + dist : ndarray of shape (n_samples_chunk, n_samples) + The distance matrix. + + start : int + The index in X which the first row of dist corresponds to. + + n_neighbors : int + Number of neighbors required for each sample. + + return_distance : bool + Whether or not to return the distances. + + Returns + ------- + dist : array of shape (n_samples_chunk, n_neighbors) + Returned only if `return_distance=True`. + + neigh : array of shape (n_samples_chunk, n_neighbors) + The neighbors indices. + """ + sample_range = np.arange(dist.shape[0])[:, None] + neigh_ind = np.argpartition(dist, n_neighbors - 1, axis=1) + neigh_ind = neigh_ind[:, :n_neighbors] + # argpartition doesn't guarantee sorted order, so we sort again + neigh_ind = neigh_ind[sample_range, np.argsort(dist[sample_range, neigh_ind])] + if return_distance: + if self.effective_metric_ == "euclidean": + result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind + else: + result = dist[sample_range, neigh_ind], neigh_ind + else: + result = neigh_ind + return result + + def kneighbors(self, X=None, n_neighbors=None, return_distance=True): + """Find the K-neighbors of a point. + + Returns indices of and distances to the neighbors of each point. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed', default=None + The query point or points. + If not provided, neighbors of each indexed point are returned. + In this case, the query point is not considered its own neighbor. + + n_neighbors : int, default=None + Number of neighbors required for each sample. The default is the + value passed to the constructor. + + return_distance : bool, default=True + Whether or not to return the distances. + + Returns + ------- + neigh_dist : ndarray of shape (n_queries, n_neighbors) + Array representing the lengths to points, only present if + return_distance=True. + + neigh_ind : ndarray of shape (n_queries, n_neighbors) + Indices of the nearest points in the population matrix. 
+ + Examples + -------- + In the following example, we construct a NearestNeighbors + class from an array representing our data set and ask who's + the closest point to [1,1,1] + + >>> samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]] + >>> from sklearn.neighbors import NearestNeighbors + >>> neigh = NearestNeighbors(n_neighbors=1) + >>> neigh.fit(samples) + NearestNeighbors(n_neighbors=1) + >>> print(neigh.kneighbors([[1., 1., 1.]])) + (array([[0.5]]), array([[2]])) + + As you can see, it returns [[0.5]], and [[2]], which means that the + element is at distance 0.5 and is the third element of samples + (indexes start at 0). You can also query for multiple points: + + >>> X = [[0., 1., 0.], [1., 0., 1.]] + >>> neigh.kneighbors(X, return_distance=False) + array([[1], + [2]]...) + """ + check_is_fitted(self) + + if n_neighbors is None: + n_neighbors = self.n_neighbors + elif n_neighbors <= 0: + raise ValueError("Expected n_neighbors > 0. Got %d" % n_neighbors) + elif not isinstance(n_neighbors, numbers.Integral): + raise TypeError( + "n_neighbors does not take %s value, enter integer value" + % type(n_neighbors) + ) + + ensure_all_finite = "allow-nan" if get_tags(self).input_tags.allow_nan else True + query_is_train = X is None + if query_is_train: + X = self._fit_X + # Include an extra neighbor to account for the sample itself being + # returned, which is removed later + n_neighbors += 1 + else: + if self.metric == "precomputed": + X = _check_precomputed(X) + else: + X = validate_data( + self, + X, + ensure_all_finite=ensure_all_finite, + accept_sparse="csr", + reset=False, + order="C", + ) + + n_samples_fit = self.n_samples_fit_ + if n_neighbors > n_samples_fit: + if query_is_train: + n_neighbors -= 1 # ok to modify inplace because an error is raised + inequality_str = "n_neighbors < n_samples_fit" + else: + inequality_str = "n_neighbors <= n_samples_fit" + raise ValueError( + f"Expected {inequality_str}, but " + f"n_neighbors = {n_neighbors}, n_samples_fit = {n_samples_fit}, " + f"n_samples = {X.shape[0]}" # include n_samples for common tests + ) + + n_jobs = effective_n_jobs(self.n_jobs) + chunked_results = None + use_pairwise_distances_reductions = ( + self._fit_method == "brute" + and ArgKmin.is_usable_for( + X if X is not None else self._fit_X, self._fit_X, self.effective_metric_ + ) + ) + if use_pairwise_distances_reductions: + results = ArgKmin.compute( + X=X, + Y=self._fit_X, + k=n_neighbors, + metric=self.effective_metric_, + metric_kwargs=self.effective_metric_params_, + strategy="auto", + return_distance=return_distance, + ) + + elif ( + self._fit_method == "brute" and self.metric == "precomputed" and issparse(X) + ): + results = _kneighbors_from_graph( + X, n_neighbors=n_neighbors, return_distance=return_distance + ) + + elif self._fit_method == "brute": + # Joblib-based backend, which is used when user-defined callable + # are passed for metric. + + # This won't be used in the future once PairwiseDistancesReductions + # support: + # - DistanceMetrics which work on supposedly binary data + # - CSR-dense and dense-CSR case if 'euclidean' in metric. 
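# For illustration, a minimal standalone NumPy sketch of the per-chunk
# reduction performed below (not part of this module): argpartition moves
# the n_neighbors smallest distances of each row to the front in linear
# time, and only those candidates are then sorted by distance.
#
# >>> import numpy as np
# >>> dist = np.array([[4.0, 1.0, 3.0, 2.0],
# ...                  [0.0, 5.0, 2.0, 7.0]])
# >>> n_neighbors = 2
# >>> rows = np.arange(dist.shape[0])[:, None]
# >>> cand = np.argpartition(dist, n_neighbors - 1, axis=1)[:, :n_neighbors]
# >>> neigh_ind = cand[rows, np.argsort(dist[rows, cand], axis=1)]
# >>> neigh_ind
# array([[1, 3],
#        [0, 2]])
# >>> dist[rows, neigh_ind]
# array([[1., 2.],
#        [0., 2.]])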
+ reduce_func = partial( + self._kneighbors_reduce_func, + n_neighbors=n_neighbors, + return_distance=return_distance, + ) + + # for efficiency, use squared euclidean distances + if self.effective_metric_ == "euclidean": + kwds = {"squared": True} + else: + kwds = self.effective_metric_params_ + + chunked_results = list( + pairwise_distances_chunked( + X, + self._fit_X, + reduce_func=reduce_func, + metric=self.effective_metric_, + n_jobs=n_jobs, + **kwds, + ) + ) + + elif self._fit_method in ["ball_tree", "kd_tree"]: + if issparse(X): + raise ValueError( + "%s does not work with sparse matrices. Densify the data, " + "or set algorithm='brute'" % self._fit_method + ) + chunked_results = Parallel(n_jobs, prefer="threads")( + delayed(self._tree.query)(X[s], n_neighbors, return_distance) + for s in gen_even_slices(X.shape[0], n_jobs) + ) + else: + raise ValueError("internal: _fit_method not recognized") + + if chunked_results is not None: + if return_distance: + neigh_dist, neigh_ind = zip(*chunked_results) + results = np.vstack(neigh_dist), np.vstack(neigh_ind) + else: + results = np.vstack(chunked_results) + + if not query_is_train: + return results + else: + # If the query data is the same as the indexed data, we would like + # to ignore the first nearest neighbor of every sample, i.e + # the sample itself. + if return_distance: + neigh_dist, neigh_ind = results + else: + neigh_ind = results + + n_queries, _ = X.shape + sample_range = np.arange(n_queries)[:, None] + sample_mask = neigh_ind != sample_range + + # Corner case: When the number of duplicates are more + # than the number of neighbors, the first NN will not + # be the sample, but a duplicate. + # In that case mask the first duplicate. + dup_gr_nbrs = np.all(sample_mask, axis=1) + sample_mask[:, 0][dup_gr_nbrs] = False + neigh_ind = np.reshape(neigh_ind[sample_mask], (n_queries, n_neighbors - 1)) + + if return_distance: + neigh_dist = np.reshape( + neigh_dist[sample_mask], (n_queries, n_neighbors - 1) + ) + return neigh_dist, neigh_ind + return neigh_ind + + def kneighbors_graph(self, X=None, n_neighbors=None, mode="connectivity"): + """Compute the (weighted) graph of k-Neighbors for points in X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed', default=None + The query point or points. + If not provided, neighbors of each indexed point are returned. + In this case, the query point is not considered its own neighbor. + For ``metric='precomputed'`` the shape should be + (n_queries, n_indexed). Otherwise the shape should be + (n_queries, n_features). + + n_neighbors : int, default=None + Number of neighbors for each sample. The default is the value + passed to the constructor. + + mode : {'connectivity', 'distance'}, default='connectivity' + Type of returned matrix: 'connectivity' will return the + connectivity matrix with ones and zeros, in 'distance' the + edges are distances between points, type of distance + depends on the selected metric parameter in + NearestNeighbors class. + + Returns + ------- + A : sparse-matrix of shape (n_queries, n_samples_fit) + `n_samples_fit` is the number of samples in the fitted data. + `A[i, j]` gives the weight of the edge connecting `i` to `j`. + The matrix is of CSR format. + + See Also + -------- + NearestNeighbors.radius_neighbors_graph : Compute the (weighted) graph + of Neighbors for points in X. 
+ + Examples + -------- + >>> X = [[0], [3], [1]] + >>> from sklearn.neighbors import NearestNeighbors + >>> neigh = NearestNeighbors(n_neighbors=2) + >>> neigh.fit(X) + NearestNeighbors(n_neighbors=2) + >>> A = neigh.kneighbors_graph(X) + >>> A.toarray() + array([[1., 0., 1.], + [0., 1., 1.], + [1., 0., 1.]]) + """ + check_is_fitted(self) + if n_neighbors is None: + n_neighbors = self.n_neighbors + + # check the input only in self.kneighbors + + # construct CSR matrix representation of the k-NN graph + if mode == "connectivity": + A_ind = self.kneighbors(X, n_neighbors, return_distance=False) + n_queries = A_ind.shape[0] + A_data = np.ones(n_queries * n_neighbors) + + elif mode == "distance": + A_data, A_ind = self.kneighbors(X, n_neighbors, return_distance=True) + A_data = np.ravel(A_data) + + else: + raise ValueError( + 'Unsupported mode, must be one of "connectivity", ' + f'or "distance" but got "{mode}" instead' + ) + + n_queries = A_ind.shape[0] + n_samples_fit = self.n_samples_fit_ + n_nonzero = n_queries * n_neighbors + A_indptr = np.arange(0, n_nonzero + 1, n_neighbors) + + kneighbors_graph = csr_matrix( + (A_data, A_ind.ravel(), A_indptr), shape=(n_queries, n_samples_fit) + ) + + return kneighbors_graph + + +class RadiusNeighborsMixin: + """Mixin for radius-based neighbors searches.""" + + def _radius_neighbors_reduce_func(self, dist, start, radius, return_distance): + """Reduce a chunk of distances to the nearest neighbors. + + Callback to :func:`sklearn.metrics.pairwise.pairwise_distances_chunked` + + Parameters + ---------- + dist : ndarray of shape (n_samples_chunk, n_samples) + The distance matrix. + + start : int + The index in X which the first row of dist corresponds to. + + radius : float + The radius considered when making the nearest neighbors search. + + return_distance : bool + Whether or not to return the distances. + + Returns + ------- + dist : list of ndarray of shape (n_samples_chunk,) + Returned only if `return_distance=True`. + + neigh : list of ndarray of shape (n_samples_chunk,) + The neighbors indices. + """ + neigh_ind = [np.where(d <= radius)[0] for d in dist] + + if return_distance: + if self.effective_metric_ == "euclidean": + dist = [np.sqrt(d[neigh_ind[i]]) for i, d in enumerate(dist)] + else: + dist = [d[neigh_ind[i]] for i, d in enumerate(dist)] + results = dist, neigh_ind + else: + results = neigh_ind + return results + + def radius_neighbors( + self, X=None, radius=None, return_distance=True, sort_results=False + ): + """Find the neighbors within a given radius of a point or points. + + Return the indices and distances of each point from the dataset + lying in a ball with size ``radius`` around the points of the query + array. Points lying on the boundary are included in the results. + + The result points are *not* necessarily sorted by distance to their + query point. + + Parameters + ---------- + X : {array-like, sparse matrix} of (n_samples, n_features), default=None + The query point or points. + If not provided, neighbors of each indexed point are returned. + In this case, the query point is not considered its own neighbor. + + radius : float, default=None + Limiting distance of neighbors to return. The default is the value + passed to the constructor. + + return_distance : bool, default=True + Whether or not to return the distances. + + sort_results : bool, default=False + If True, the distances and indices will be sorted by increasing + distances before being returned. If False, the results may not + be sorted. 
If `return_distance=False`, setting `sort_results=True` + will result in an error. + + .. versionadded:: 0.22 + + Returns + ------- + neigh_dist : ndarray of shape (n_samples,) of arrays + Array representing the distances to each point, only present if + `return_distance=True`. The distance values are computed according + to the ``metric`` constructor parameter. + + neigh_ind : ndarray of shape (n_samples,) of arrays + An array of arrays of indices of the approximate nearest points + from the population matrix that lie within a ball of size + ``radius`` around the query points. + + Notes + ----- + Because the number of neighbors of each point is not necessarily + equal, the results for multiple query points cannot be fit in a + standard data array. + For efficiency, `radius_neighbors` returns arrays of objects, where + each object is a 1D array of indices or distances. + + Examples + -------- + In the following example, we construct a NeighborsClassifier + class from an array representing our data set and ask who's + the closest point to [1, 1, 1]: + + >>> import numpy as np + >>> samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]] + >>> from sklearn.neighbors import NearestNeighbors + >>> neigh = NearestNeighbors(radius=1.6) + >>> neigh.fit(samples) + NearestNeighbors(radius=1.6) + >>> rng = neigh.radius_neighbors([[1., 1., 1.]]) + >>> print(np.asarray(rng[0][0])) + [1.5 0.5] + >>> print(np.asarray(rng[1][0])) + [1 2] + + The first array returned contains the distances to all points which + are closer than 1.6, while the second array returned contains their + indices. In general, multiple points can be queried at the same time. + """ + check_is_fitted(self) + + if sort_results and not return_distance: + raise ValueError("return_distance must be True if sort_results is True.") + + ensure_all_finite = "allow-nan" if get_tags(self).input_tags.allow_nan else True + query_is_train = X is None + if query_is_train: + X = self._fit_X + else: + if self.metric == "precomputed": + X = _check_precomputed(X) + else: + X = validate_data( + self, + X, + ensure_all_finite=ensure_all_finite, + accept_sparse="csr", + reset=False, + order="C", + ) + + if radius is None: + radius = self.radius + + use_pairwise_distances_reductions = ( + self._fit_method == "brute" + and RadiusNeighbors.is_usable_for( + X if X is not None else self._fit_X, self._fit_X, self.effective_metric_ + ) + ) + + if use_pairwise_distances_reductions: + results = RadiusNeighbors.compute( + X=X, + Y=self._fit_X, + radius=radius, + metric=self.effective_metric_, + metric_kwargs=self.effective_metric_params_, + strategy="auto", + return_distance=return_distance, + sort_results=sort_results, + ) + + elif ( + self._fit_method == "brute" and self.metric == "precomputed" and issparse(X) + ): + results = _radius_neighbors_from_graph( + X, radius=radius, return_distance=return_distance + ) + + elif self._fit_method == "brute": + # Joblib-based backend, which is used when user-defined callable + # are passed for metric. + + # This won't be used in the future once PairwiseDistancesReductions + # support: + # - DistanceMetrics which work on supposedly binary data + # - CSR-dense and dense-CSR case if 'euclidean' in metric. 
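# For illustration, a minimal standalone NumPy sketch of the radius
# reduction used below (not part of this module): with squared euclidean
# distances the radius is squared as well, which selects exactly the same
# neighbors because x <= r and x**2 <= r**2 agree for x, r >= 0; square
# roots are only taken for the neighbors that are actually returned.
#
# >>> import numpy as np
# >>> sq_dist = np.array([[0.0, 1.0, 4.0, 9.0]])   # squared distances, one query row
# >>> radius = 2.0
# >>> [np.where(row <= radius ** 2)[0] for row in sq_dist]
# [array([0, 1, 2])]
# >>> [np.sqrt(row[row <= radius ** 2]) for row in sq_dist]   # true distances
# [array([0., 1., 2.])]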
+ + # for efficiency, use squared euclidean distances + if self.effective_metric_ == "euclidean": + radius *= radius + kwds = {"squared": True} + else: + kwds = self.effective_metric_params_ + + reduce_func = partial( + self._radius_neighbors_reduce_func, + radius=radius, + return_distance=return_distance, + ) + + chunked_results = pairwise_distances_chunked( + X, + self._fit_X, + reduce_func=reduce_func, + metric=self.effective_metric_, + n_jobs=self.n_jobs, + **kwds, + ) + if return_distance: + neigh_dist_chunks, neigh_ind_chunks = zip(*chunked_results) + neigh_dist_list = list(itertools.chain.from_iterable(neigh_dist_chunks)) + neigh_ind_list = list(itertools.chain.from_iterable(neigh_ind_chunks)) + neigh_dist = _to_object_array(neigh_dist_list) + neigh_ind = _to_object_array(neigh_ind_list) + results = neigh_dist, neigh_ind + else: + neigh_ind_list = list(itertools.chain.from_iterable(chunked_results)) + results = _to_object_array(neigh_ind_list) + + if sort_results: + for ii in range(len(neigh_dist)): + order = np.argsort(neigh_dist[ii], kind="mergesort") + neigh_ind[ii] = neigh_ind[ii][order] + neigh_dist[ii] = neigh_dist[ii][order] + results = neigh_dist, neigh_ind + + elif self._fit_method in ["ball_tree", "kd_tree"]: + if issparse(X): + raise ValueError( + "%s does not work with sparse matrices. Densify the data, " + "or set algorithm='brute'" % self._fit_method + ) + + n_jobs = effective_n_jobs(self.n_jobs) + delayed_query = delayed(self._tree.query_radius) + chunked_results = Parallel(n_jobs, prefer="threads")( + delayed_query(X[s], radius, return_distance, sort_results=sort_results) + for s in gen_even_slices(X.shape[0], n_jobs) + ) + if return_distance: + neigh_ind, neigh_dist = tuple(zip(*chunked_results)) + results = np.hstack(neigh_dist), np.hstack(neigh_ind) + else: + results = np.hstack(chunked_results) + else: + raise ValueError("internal: _fit_method not recognized") + + if not query_is_train: + return results + else: + # If the query data is the same as the indexed data, we would like + # to ignore the first nearest neighbor of every sample, i.e + # the sample itself. + if return_distance: + neigh_dist, neigh_ind = results + else: + neigh_ind = results + + for ind, ind_neighbor in enumerate(neigh_ind): + mask = ind_neighbor != ind + + neigh_ind[ind] = ind_neighbor[mask] + if return_distance: + neigh_dist[ind] = neigh_dist[ind][mask] + + if return_distance: + return neigh_dist, neigh_ind + return neigh_ind + + def radius_neighbors_graph( + self, X=None, radius=None, mode="connectivity", sort_results=False + ): + """Compute the (weighted) graph of Neighbors for points in X. + + Neighborhoods are restricted the points at a distance lower than + radius. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features), default=None + The query point or points. + If not provided, neighbors of each indexed point are returned. + In this case, the query point is not considered its own neighbor. + + radius : float, default=None + Radius of neighborhoods. The default is the value passed to the + constructor. + + mode : {'connectivity', 'distance'}, default='connectivity' + Type of returned matrix: 'connectivity' will return the + connectivity matrix with ones and zeros, in 'distance' the + edges are distances between points, type of distance + depends on the selected metric parameter in + NearestNeighbors class. + + sort_results : bool, default=False + If True, in each row of the result, the non-zero entries will be + sorted by increasing distances. 
If False, the non-zero entries may + not be sorted. Only used with mode='distance'. + + .. versionadded:: 0.22 + + Returns + ------- + A : sparse-matrix of shape (n_queries, n_samples_fit) + `n_samples_fit` is the number of samples in the fitted data. + `A[i, j]` gives the weight of the edge connecting `i` to `j`. + The matrix is of CSR format. + + See Also + -------- + kneighbors_graph : Compute the (weighted) graph of k-Neighbors for + points in X. + + Examples + -------- + >>> X = [[0], [3], [1]] + >>> from sklearn.neighbors import NearestNeighbors + >>> neigh = NearestNeighbors(radius=1.5) + >>> neigh.fit(X) + NearestNeighbors(radius=1.5) + >>> A = neigh.radius_neighbors_graph(X) + >>> A.toarray() + array([[1., 0., 1.], + [0., 1., 0.], + [1., 0., 1.]]) + """ + check_is_fitted(self) + + # check the input only in self.radius_neighbors + + if radius is None: + radius = self.radius + + # construct CSR matrix representation of the NN graph + if mode == "connectivity": + A_ind = self.radius_neighbors(X, radius, return_distance=False) + A_data = None + elif mode == "distance": + dist, A_ind = self.radius_neighbors( + X, radius, return_distance=True, sort_results=sort_results + ) + A_data = np.concatenate(list(dist)) + else: + raise ValueError( + 'Unsupported mode, must be one of "connectivity", ' + f'or "distance" but got "{mode}" instead' + ) + + n_queries = A_ind.shape[0] + n_samples_fit = self.n_samples_fit_ + n_neighbors = np.array([len(a) for a in A_ind]) + A_ind = np.concatenate(list(A_ind)) + if A_data is None: + A_data = np.ones(len(A_ind)) + A_indptr = np.concatenate((np.zeros(1, dtype=int), np.cumsum(n_neighbors))) + + return csr_matrix((A_data, A_ind, A_indptr), shape=(n_queries, n_samples_fit)) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = self.metric == "nan_euclidean" + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/_binary_tree.pxi.tp b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_binary_tree.pxi.tp new file mode 100644 index 0000000000000000000000000000000000000000..de3bcb0e5d916d3153b7d41c8c975927385b8aac --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_binary_tree.pxi.tp @@ -0,0 +1,2478 @@ +{{py: + +# Generated file: _binary_tree.pxi + +implementation_specific_values = [ + # The values are arranged as follows: + # + # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE, NPY_TYPE + # + ('64', 'float64_t', 'np.float64', 'cnp.NPY_DOUBLE'), + ('32', 'float32_t', 'np.float32', 'cnp.NPY_FLOAT') +] + +# KD Tree and Ball Tree +# ===================== +# +# _binary_tree.pxi is generated and is then literally Cython included in +# ball_tree.pyx and kd_tree.pyx. See ball_tree.pyx.tp and kd_tree.pyx.tp. + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause +}} + + +# KD Tree and Ball Tree +# ===================== +# +# The routines here are the core algorithms of the KDTree and BallTree +# structures. If Cython supported polymorphism, we would be able to +# create a subclass and derive KDTree and BallTree from it. Because +# polymorphism is not an option, we use this single BinaryTree class +# as a literal include to avoid duplicating the entire file. +# +# A series of functions are implemented in kd_tree.pyx and ball_tree.pyx +# which use the information here to calculate the lower and upper bounds +# between a node and a point, and between two nodes. 
These functions are +# used here, and are all that are needed to differentiate between the two +# tree types. +# +# Description of Binary Tree Algorithms +# ------------------------------------- +# A binary tree can be thought of as a collection of nodes. The top node +# contains all the points. The next level consists of two nodes with half +# the points in each, and this continues recursively. Each node contains +# metadata which allow fast computation of distance bounds: in the case of +# a ball tree, the metadata is a center and a radius. In the case of a +# KD tree, the metadata is the minimum and maximum bound along each dimension. +# +# In a typical KD Tree or Ball Tree implementation, the nodes are implemented +# as dynamically allocated structures with pointers linking them. Here we +# take a different approach, storing all relevant data in a set of arrays +# so that the entire tree object can be saved in a pickle file. For efficiency, +# the data can be stored in such a way that explicit pointers are not +# necessary: for node data stored at index i, the two child nodes are at +# index (2 * i + 1) and (2 * i + 2); the parent node is (i - 1) // 2 +# (where // indicates integer division). +# +# The data arrays used here are as follows: +# data : the [n_samples x n_features] array of data from which the tree +# is built +# idx_array : the length n_samples array used to keep track of the indices +# of data within each node. Each node has values idx_start and +# idx_end: the points within the node are given by (using numpy +# syntax) data[idx_array[idx_start:idx_end]]. +# node_data : the length n_nodes array of structures which store the node +# indices, node radii, and leaf information for each node. +# node_bounds : the [* x n_nodes x n_features] array containing the node +# bound information. For ball tree, the first dimension is 1, and +# each row contains the centroid of the node. For kd tree, the first +# dimension is 2 and the rows for each point contain the arrays of +# lower bounds and upper bounds in each direction. +# +# The lack of dynamic allocation means the number of nodes must be computed +# before the building of the tree. This can be done assuming the points are +# divided equally between child nodes at each step; although this removes +# some flexibility in tree creation, it ensures a balanced tree and ensures +# that the number of nodes required can be computed beforehand. Given a +# specified leaf_size (the minimum number of points in any node), it is +# possible to show that a balanced tree will have +# +# n_levels = 1 + max(0, floor(log2((n_samples - 1) / leaf_size))) +# +# in order to satisfy +# +# leaf_size <= min(n_points) <= 2 * leaf_size +# +# with the exception of the special case where n_samples < leaf_size. +# for a given number of levels, the number of nodes in the tree is given by +# +# n_nodes = 2 ** n_levels - 1 +# +# both these results can be straightforwardly shown by induction. The +# following code uses these values in the construction of the tree. +# +# Distance Metrics +# ---------------- +# For flexibility, the trees can be built using a variety of distance metrics. +# The metrics are described in the DistanceMetric class: the standard +# Euclidean distance is the default, and is inlined to be faster than other +# metrics. In addition, each metric defines both a distance and a +# "reduced distance", which is often faster to compute, and is therefore +# used in the query architecture whenever possible. 
(For example, in the +# case of the standard Euclidean distance, the reduced distance is the +# squared-distance). +# +# Implementation Notes +# -------------------- +# This implementation uses the common object-oriented approach of having an +# abstract base class which is extended by the KDTree and BallTree +# specializations. +# +# The BinaryTree "base class" is defined here and then subclassed in the BallTree +# and KDTree pyx files. These files include implementations of the +# "abstract" methods. + +# Necessary Helper Functions +# -------------------------- +# These are the names and descriptions of the "abstract" functions which are +# defined in kd_tree.pyx and ball_tree.pyx: + +# cdef int allocate_data(BinaryTree tree, intp_t n_nodes, intp_t n_features): +# """Allocate arrays needed for the KD Tree""" + +# cdef int init_node(BinaryTree tree, intp_t i_node, +# intp_t idx_start, intp_t idx_end): +# """Initialize the node for the dataset stored in tree.data""" + +# cdef float64_t min_rdist(BinaryTree tree, intp_t i_node, float64_t* pt): +# """Compute the minimum reduced-distance between a point and a node""" + +# cdef float64_t min_dist(BinaryTree tree, intp_t i_node, float64_t* pt): +# """Compute the minimum distance between a point and a node""" + +# cdef float64_t max_rdist(BinaryTree tree, intp_t i_node, float64_t* pt): +# """Compute the maximum reduced-distance between a point and a node""" + +# cdef float64_t max_dist(BinaryTree tree, intp_t i_node, float64_t* pt): +# """Compute the maximum distance between a point and a node""" + +# cdef inline int min_max_dist(BinaryTree tree, intp_t i_node, float64_t* pt, +# float64_t* min_dist, float64_t* max_dist): +# """Compute the minimum and maximum distance between a point and a node""" + +# cdef inline float64_t min_rdist_dual(BinaryTree tree1, intp_t i_node1, +# BinaryTree tree2, intp_t i_node2): +# """Compute the minimum reduced distance between two nodes""" + +# cdef inline float64_t min_dist_dual(BinaryTree tree1, intp_t i_node1, +# BinaryTree tree2, intp_t i_node2): +# """Compute the minimum distance between two nodes""" + +# cdef inline float64_t max_rdist_dual(BinaryTree tree1, intp_t i_node1, +# BinaryTree tree2, intp_t i_node2): +# """Compute the maximum reduced distance between two nodes""" + +# cdef inline float64_t max_dist_dual(BinaryTree tree1, intp_t i_node1, +# BinaryTree tree2, intp_t i_node2): +# """Compute the maximum distance between two nodes""" + +cimport numpy as cnp +from cython cimport floating +from libc.math cimport fabs, sqrt, exp, cos, pow, log, lgamma +from libc.math cimport fmin, fmax +from libc.stdlib cimport calloc, malloc, free +from libc.string cimport memcpy + +import numpy as np +import warnings + +from ..metrics._dist_metrics cimport ( + DistanceMetric, + DistanceMetric64, + DistanceMetric32, + euclidean_dist64, + euclidean_dist32, + euclidean_rdist64, + euclidean_rdist32, + euclidean_dist_to_rdist64, + euclidean_dist_to_rdist32, +) + +from ._partition_nodes cimport partition_node_indices + +from ..utils import check_array +from ..utils._typedefs cimport float32_t, float64_t, intp_t +from ..utils._heap cimport heap_push +from ..utils._sorting cimport simultaneous_sort as _simultaneous_sort + +cnp.import_array() + + +# TODO: use cnp.PyArray_ENABLEFLAGS when Cython>=3.0 is used. 
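# For illustration, a standalone sketch of the implicit array layout and the
# node-count arithmetic described in the comments above (not part of this
# module): children of node i sit at 2*i + 1 and 2*i + 2, the parent at
# (i - 1) // 2, and the node count follows from the number of levels.
#
# >>> import numpy as np
# >>> children = lambda i: (2 * i + 1, 2 * i + 2)
# >>> parent = lambda i: (i - 1) // 2
# >>> children(0), parent(2)
# ((1, 2), 0)
# >>> n_samples, leaf_size = 1000, 40
# >>> n_levels = 1 + max(0, int(np.floor(np.log2((n_samples - 1) / leaf_size))))
# >>> n_nodes = 2 ** n_levels - 1
# >>> n_levels, n_nodes
# (5, 31)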
+cdef extern from "numpy/arrayobject.h": + void PyArray_ENABLEFLAGS(cnp.ndarray arr, int flags) + + +# some handy constants +cdef float64_t INF = np.inf +cdef float64_t NEG_INF = -np.inf +cdef float64_t PI = np.pi +cdef float64_t ROOT_2PI = sqrt(2 * PI) +cdef float64_t LOG_PI = log(PI) +cdef float64_t LOG_2PI = log(2 * PI) + + +# Some compound datatypes used below: +cdef struct NodeHeapData_t: + float64_t val + intp_t i1 + intp_t i2 + +# build the corresponding numpy dtype for NodeHeapData +cdef NodeHeapData_t nhd_tmp +NodeHeapData = np.asarray((&nhd_tmp)).dtype + +cdef struct NodeData_t: + intp_t idx_start + intp_t idx_end + intp_t is_leaf + float64_t radius + +# build the corresponding numpy dtype for NodeData +cdef NodeData_t nd_tmp +NodeData = np.asarray((&nd_tmp)).dtype + + +###################################################################### +# Define doc strings, substituting the appropriate class name using +# the DOC_DICT variable defined in the pyx files. +CLASS_DOC = """{BinaryTree} for fast generalized N-point problems + +Read more in the :ref:`User Guide `. + +Parameters +---------- +X : array-like of shape (n_samples, n_features) + n_samples is the number of points in the data set, and + n_features is the dimension of the parameter space. + Note: if X is a C-contiguous array of doubles then data will + not be copied. Otherwise, an internal copy will be made. + +leaf_size : positive int, default=40 + Number of points at which to switch to brute-force. Changing + leaf_size will not affect the results of a query, but can + significantly impact the speed of a query and the memory required + to store the constructed tree. The amount of memory needed to + store the tree scales as approximately n_samples / leaf_size. + For a specified ``leaf_size``, a leaf node is guaranteed to + satisfy ``leaf_size <= n_points <= 2 * leaf_size``, except in + the case that ``n_samples < leaf_size``. + +metric : str or DistanceMetric64 object, default='minkowski' + Metric to use for distance computation. Default is "minkowski", which + results in the standard Euclidean distance when p = 2. + A list of valid metrics for {BinaryTree} is given by the attribute + `valid_metrics`. + See the documentation of `scipy.spatial.distance + `_ and + the metrics listed in :class:`~sklearn.metrics.pairwise.distance_metrics` for + more information on any distance metric. + +Additional keywords are passed to the distance metric class. +Note: Callable functions in the metric parameter are NOT supported for KDTree +and Ball Tree. Function call overhead will result in very poor performance. + +Attributes +---------- +data : memory view + The training data +valid_metrics: list of str + List of valid distance metrics. + +Examples +-------- +Query for k-nearest neighbors + + >>> import numpy as np + >>> from sklearn.neighbors import {BinaryTree} + >>> rng = np.random.RandomState(0) + >>> X = rng.random_sample((10, 3)) # 10 points in 3 dimensions + >>> tree = {BinaryTree}(X, leaf_size=2) # doctest: +SKIP + >>> dist, ind = tree.query(X[:1], k=3) # doctest: +SKIP + >>> print(ind) # indices of 3 closest neighbors + [0 3 1] + >>> print(dist) # distances to 3 closest neighbors + [ 0. 0.19662693 0.29473397] + +Pickle and Unpickle a tree. Note that the state of the tree is saved in the +pickle operation: the tree needs not be rebuilt upon unpickling. 
+ + >>> import numpy as np + >>> import pickle + >>> rng = np.random.RandomState(0) + >>> X = rng.random_sample((10, 3)) # 10 points in 3 dimensions + >>> tree = {BinaryTree}(X, leaf_size=2) # doctest: +SKIP + >>> s = pickle.dumps(tree) # doctest: +SKIP + >>> tree_copy = pickle.loads(s) # doctest: +SKIP + >>> dist, ind = tree_copy.query(X[:1], k=3) # doctest: +SKIP + >>> print(ind) # indices of 3 closest neighbors + [0 3 1] + >>> print(dist) # distances to 3 closest neighbors + [ 0. 0.19662693 0.29473397] + +Query for neighbors within a given radius + + >>> import numpy as np + >>> rng = np.random.RandomState(0) + >>> X = rng.random_sample((10, 3)) # 10 points in 3 dimensions + >>> tree = {BinaryTree}(X, leaf_size=2) # doctest: +SKIP + >>> print(tree.query_radius(X[:1], r=0.3, count_only=True)) + 3 + >>> ind = tree.query_radius(X[:1], r=0.3) # doctest: +SKIP + >>> print(ind) # indices of neighbors within distance 0.3 + [3 0 1] + + +Compute a gaussian kernel density estimate: + + >>> import numpy as np + >>> rng = np.random.RandomState(42) + >>> X = rng.random_sample((100, 3)) + >>> tree = {BinaryTree}(X) # doctest: +SKIP + >>> tree.kernel_density(X[:3], h=0.1, kernel='gaussian') + array([ 6.94114649, 7.83281226, 7.2071716 ]) + +Compute a two-point auto-correlation function + + >>> import numpy as np + >>> rng = np.random.RandomState(0) + >>> X = rng.random_sample((30, 3)) + >>> r = np.linspace(0, 1, 5) + >>> tree = {BinaryTree}(X) # doctest: +SKIP + >>> tree.two_point_correlation(X, r) + array([ 30, 62, 278, 580, 820]) + +""" + + +###################################################################### +# Utility functions +cdef float64_t logaddexp(float64_t x1, float64_t x2): + """logaddexp(x1, x2) -> log(exp(x1) + exp(x2))""" + cdef float64_t a = fmax(x1, x2) + if a == NEG_INF: + return NEG_INF + else: + return a + log(exp(x1 - a) + exp(x2 - a)) + +cdef float64_t logsubexp(float64_t x1, float64_t x2): + """logsubexp(x1, x2) -> log(exp(x1) - exp(x2))""" + if x1 <= x2: + return NEG_INF + else: + return x1 + log(1 - exp(x2 - x1)) + + +###################################################################### +# Kernel functions +# +# Note: Kernels assume dist is non-negative and h is positive +# All kernel functions are normalized such that K(0, h) = 1. +# The fully normalized kernel is: +# K = exp[kernel_norm(h, d, kernel) + compute_kernel(dist, h, kernel)] +# The code only works with non-negative kernels: i.e. K(d, h) >= 0 +# for all valid d and h. Note that for precision, the log of both +# the kernel and kernel norm is returned. 
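# For illustration, a standalone sketch (not part of this module) of how the
# log-norm and log-kernel combine for the gaussian kernel: in d dimensions
# the log normalization is -(d/2)*log(2*pi) - d*log(h), so the fully
# normalized kernel recovers the isotropic normal density with bandwidth h.
#
# >>> import numpy as np
# >>> d, h, dist = 1, 0.5, 0.3
# >>> log_norm = -0.5 * d * np.log(2 * np.pi) - d * np.log(h)
# >>> log_kernel = -0.5 * dist ** 2 / h ** 2
# >>> K = np.exp(log_norm + log_kernel)
# >>> expected = np.exp(-dist ** 2 / (2 * h ** 2)) / (h * np.sqrt(2 * np.pi))
# >>> bool(np.isclose(K, expected))
# True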
+cdef enum KernelType: + GAUSSIAN_KERNEL = 1 + TOPHAT_KERNEL = 2 + EPANECHNIKOV_KERNEL = 3 + EXPONENTIAL_KERNEL = 4 + LINEAR_KERNEL = 5 + COSINE_KERNEL = 6 + + +cdef inline float64_t log_gaussian_kernel(float64_t dist, float64_t h): + """log of the gaussian kernel for bandwidth h (unnormalized)""" + return -0.5 * (dist * dist) / (h * h) + + +cdef inline float64_t log_tophat_kernel(float64_t dist, float64_t h): + """log of the tophat kernel for bandwidth h (unnormalized)""" + if dist < h: + return 0.0 + else: + return NEG_INF + + +cdef inline float64_t log_epanechnikov_kernel(float64_t dist, float64_t h): + """log of the epanechnikov kernel for bandwidth h (unnormalized)""" + if dist < h: + return log(1.0 - (dist * dist) / (h * h)) + else: + return NEG_INF + + +cdef inline float64_t log_exponential_kernel(float64_t dist, float64_t h): + """log of the exponential kernel for bandwidth h (unnormalized)""" + return -dist / h + + +cdef inline float64_t log_linear_kernel(float64_t dist, float64_t h): + """log of the linear kernel for bandwidth h (unnormalized)""" + if dist < h: + return log(1 - dist / h) + else: + return NEG_INF + + +cdef inline float64_t log_cosine_kernel(float64_t dist, float64_t h): + """log of the cosine kernel for bandwidth h (unnormalized)""" + if dist < h: + return log(cos(0.5 * PI * dist / h)) + else: + return NEG_INF + + +cdef inline float64_t compute_log_kernel(float64_t dist, float64_t h, + KernelType kernel): + """Given a KernelType enumeration, compute the appropriate log-kernel""" + if kernel == GAUSSIAN_KERNEL: + return log_gaussian_kernel(dist, h) + elif kernel == TOPHAT_KERNEL: + return log_tophat_kernel(dist, h) + elif kernel == EPANECHNIKOV_KERNEL: + return log_epanechnikov_kernel(dist, h) + elif kernel == EXPONENTIAL_KERNEL: + return log_exponential_kernel(dist, h) + elif kernel == LINEAR_KERNEL: + return log_linear_kernel(dist, h) + elif kernel == COSINE_KERNEL: + return log_cosine_kernel(dist, h) + + +# ------------------------------------------------------------ +# Kernel norms are defined via the volume element V_n +# and surface element S_(n-1) of an n-sphere. +cdef float64_t logVn(intp_t n): + """V_n = pi^(n/2) / gamma(n/2 - 1)""" + return 0.5 * n * LOG_PI - lgamma(0.5 * n + 1) + + +cdef float64_t logSn(intp_t n): + """V_(n+1) = int_0^1 S_n r^n dr""" + return LOG_2PI + logVn(n - 1) + + +cdef float64_t _log_kernel_norm(float64_t h, intp_t d, + KernelType kernel) except -1: + """Given a KernelType enumeration, compute the kernel normalization. + + h is the bandwidth, d is the dimension. + """ + cdef float64_t tmp, factor = 0 + cdef intp_t k + if kernel == GAUSSIAN_KERNEL: + factor = 0.5 * d * LOG_2PI + elif kernel == TOPHAT_KERNEL: + factor = logVn(d) + elif kernel == EPANECHNIKOV_KERNEL: + factor = logVn(d) + log(2. / (d + 2.)) + elif kernel == EXPONENTIAL_KERNEL: + factor = logSn(d - 1) + lgamma(d) + elif kernel == LINEAR_KERNEL: + factor = logVn(d) - log(d + 1.) + elif kernel == COSINE_KERNEL: + # this is derived from a chain rule integration + factor = 0 + tmp = 2. / PI + for k in range(1, d + 1, 2): + factor += tmp + tmp *= -(d - k) * (d - k - 1) * (2. / PI) ** 2 + factor = log(factor) + logSn(d - 1) + else: + raise ValueError("Kernel code not recognized") + return -factor - d * log(h) + + +def kernel_norm(h, d, kernel, return_log=False): + """Given a string specification of a kernel, compute the normalization. + + Parameters + ---------- + h : float + The bandwidth of the kernel. 
+ d : int + The dimension of the space in which the kernel norm is computed. + kernel : str + The kernel identifier. Must be one of + ['gaussian'|'tophat'|'epanechnikov'| + 'exponential'|'linear'|'cosine'] + return_log : bool, default=False + If True, return the log of the kernel norm. Otherwise, return the + kernel norm. + Returns + ------- + knorm or log_knorm : float + the kernel norm or logarithm of the kernel norm. + """ + if kernel == 'gaussian': + result = _log_kernel_norm(h, d, GAUSSIAN_KERNEL) + elif kernel == 'tophat': + result = _log_kernel_norm(h, d, TOPHAT_KERNEL) + elif kernel == 'epanechnikov': + result = _log_kernel_norm(h, d, EPANECHNIKOV_KERNEL) + elif kernel == 'exponential': + result = _log_kernel_norm(h, d, EXPONENTIAL_KERNEL) + elif kernel == 'linear': + result = _log_kernel_norm(h, d, LINEAR_KERNEL) + elif kernel == 'cosine': + result = _log_kernel_norm(h, d, COSINE_KERNEL) + else: + raise ValueError('kernel not recognized') + + if return_log: + return result + else: + return np.exp(result) + +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE, NPY_TYPE in implementation_specific_values}} + +cdef class NeighborsHeap{{name_suffix}}: + """A max-heap structure to keep track of distances/indices of neighbors + + This implements an efficient pre-allocated set of fixed-size heaps + for chasing neighbors, holding both an index and a distance. + When any row of the heap is full, adding an additional point will push + the furthest point off the heap. + + Parameters + ---------- + n_pts : int + the number of heaps to use + n_nbrs : int + the size of each heap. + """ + cdef {{INPUT_DTYPE_t}}[:, ::1] distances + cdef intp_t[:, ::1] indices + + def __cinit__(self): + # One-element arrays are used as placeholders to prevent + # any problem due to potential access to those attributes + # (e.g. assigning to NULL or a to value in another segment). + self.distances = np.zeros((1, 1), dtype={{INPUT_DTYPE}}, order='C') + self.indices = np.zeros((1, 1), dtype=np.intp, order='C') + + def __init__(self, n_pts, n_nbrs): + self.distances = np.full( + (n_pts, n_nbrs), np.inf, dtype={{INPUT_DTYPE}}, order='C' + ) + self.indices = np.zeros((n_pts, n_nbrs), dtype=np.intp, order='C') + + def get_arrays(self, sort=True): + """Get the arrays of distances and indices within the heap. + + If sort=True, then simultaneously sort the indices and distances, + so the closer points are listed first. 
+ """ + if sort: + self._sort() + return self.distances.base, self.indices.base + + cdef inline float64_t largest(self, intp_t row) except -1 nogil: + """Return the largest distance in the given row""" + return self.distances[row, 0] + + def push(self, intp_t row, float64_t val, intp_t i_val): + return self._push(row, val, i_val) + + cdef int _push(self, intp_t row, float64_t val, + intp_t i_val) except -1 nogil: + """push (val, i_val) into the given row""" + return heap_push( + values=&self.distances[row, 0], + indices=&self.indices[row, 0], + size=self.distances.shape[1], + val=val, + val_idx=i_val, + ) + + cdef int _sort(self) except -1: + """simultaneously sort the distances and indices""" + cdef intp_t row + for row in range(self.distances.shape[0]): + _simultaneous_sort( + dist=&self.distances[row, 0], + idx=&self.indices[row, 0], + size=self.distances.shape[1], + ) + return 0 + +{{endfor}} + +#------------------------------------------------------------ +# find_node_split_dim: +# this computes the equivalent of +# j_max = np.argmax(np.max(data, 0) - np.min(data, 0)) +cdef intp_t find_node_split_dim(const floating* data, + const intp_t* node_indices, + intp_t n_features, + intp_t n_points) except -1: + """Find the dimension with the largest spread. + + Parameters + ---------- + data : double pointer + Pointer to a 2D array of the training data, of shape [N, n_features]. + N must be greater than any of the values in node_indices. + node_indices : int pointer + Pointer to a 1D array of length n_points. This lists the indices of + each of the points within the current node. + + Returns + ------- + i_max : int + The index of the feature (dimension) within the node that has the + largest spread. + + Notes + ----- + In numpy, this operation is equivalent to + + def find_node_split_dim(data, node_indices): + return np.argmax(data[node_indices].max(0) - data[node_indices].min(0)) + + The cython version is much more efficient in both computation and memory. + """ + cdef float64_t min_val, max_val, val, spread, max_spread + cdef intp_t i, j, j_max + + j_max = 0 + max_spread = 0 + + for j in range(n_features): + max_val = data[node_indices[0] * n_features + j] + min_val = max_val + for i in range(1, n_points): + val = data[node_indices[i] * n_features + j] + max_val = fmax(max_val, val) + min_val = fmin(min_val, val) + spread = max_val - min_val + if spread > max_spread: + max_spread = spread + j_max = j + return j_max + + +###################################################################### +# NodeHeap : min-heap used to keep track of nodes during +# breadth-first query +cdef inline void swap_nodes(NodeHeapData_t* arr, intp_t i1, intp_t i2): + cdef NodeHeapData_t tmp = arr[i1] + arr[i1] = arr[i2] + arr[i2] = tmp + + +cdef class NodeHeap: + """NodeHeap + + This is a min-heap implementation for keeping track of nodes + during a breadth-first search. Unlike the NeighborsHeap above, + the NodeHeap does not have a fixed size and must be able to grow + as elements are added. + + Internally, the data is stored in a simple binary heap which meets + the min heap condition: + + heap[i].val < min(heap[2 * i + 1].val, heap[2 * i + 2].val) + """ + cdef NodeHeapData_t[:] data + cdef intp_t n + + def __cinit__(self): + # A one-elements array is used as a placeholder to prevent + # any problem due to potential access to this attribute + # (e.g. assigning to NULL or a to value in another segment). 
+ self.data = np.zeros(1, dtype=NodeHeapData, order='C') + + def __init__(self, size_guess=100): + size_guess = max(size_guess, 1) # need space for at least one item + self.data = np.zeros(size_guess, dtype=NodeHeapData, order='C') + self.n = size_guess + self.clear() + + cdef int resize(self, intp_t new_size) except -1: + """Resize the heap to be either larger or smaller""" + cdef: + NodeHeapData_t *data_ptr + NodeHeapData_t *new_data_ptr + intp_t i + intp_t size = self.data.shape[0] + NodeHeapData_t[:] new_data = np.zeros( + new_size, + dtype=NodeHeapData, + ) + + if size > 0 and new_size > 0: + data_ptr = &self.data[0] + new_data_ptr = &new_data[0] + for i in range(min(size, new_size)): + new_data_ptr[i] = data_ptr[i] + + if new_size < size: + self.n = new_size + + self.data = new_data + return 0 + + cdef int push(self, NodeHeapData_t data) except -1: + """Push a new item onto the heap""" + cdef intp_t i, i_parent + cdef NodeHeapData_t* data_arr + self.n += 1 + if self.n > self.data.shape[0]: + self.resize(2 * self.n) + + # put the new element at the end, + # and then perform swaps until the heap is in order + data_arr = &self.data[0] + i = self.n - 1 + data_arr[i] = data + + while i > 0: + i_parent = (i - 1) // 2 + if data_arr[i_parent].val <= data_arr[i].val: + break + else: + swap_nodes(data_arr, i, i_parent) + i = i_parent + return 0 + + cdef NodeHeapData_t peek(self): + """Peek at the root of the heap, without removing it""" + return self.data[0] + + cdef NodeHeapData_t pop(self): + """Remove the root of the heap, and update the remaining nodes""" + if self.n == 0: + raise ValueError('cannot pop on empty heap') + + cdef intp_t i, i_child1, i_child2, i_swap + cdef NodeHeapData_t* data_arr = &self.data[0] + cdef NodeHeapData_t popped_element = data_arr[0] + + # pop off the first element, move the last element to the front, + # and then perform swaps until the heap is back in order + data_arr[0] = data_arr[self.n - 1] + self.n -= 1 + + i = 0 + + while (i < self.n): + i_child1 = 2 * i + 1 + i_child2 = 2 * i + 2 + i_swap = 0 + + if i_child2 < self.n: + if data_arr[i_child1].val <= data_arr[i_child2].val: + i_swap = i_child1 + else: + i_swap = i_child2 + elif i_child1 < self.n: + i_swap = i_child1 + else: + break + + if (i_swap > 0) and (data_arr[i_swap].val <= data_arr[i].val): + swap_nodes(data_arr, i, i_swap) + i = i_swap + else: + break + + return popped_element + + cdef void clear(self): + """Clear the heap""" + self.n = 0 + + +###################################################################### +# newObj function +# this is a helper function for pickling +def newObj(obj): + return obj.__new__(obj) + + +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE, NPY_TYPE in implementation_specific_values}} + +###################################################################### +# define the reverse mapping of VALID_METRICS{{name_suffix}} +from sklearn.metrics._dist_metrics import get_valid_metric_ids +VALID_METRIC_IDS{{name_suffix}} = get_valid_metric_ids(VALID_METRICS{{name_suffix}}) + + +###################################################################### +# Binary Tree class +cdef class BinaryTree{{name_suffix}}: + + cdef readonly const {{INPUT_DTYPE_t}}[:, ::1] data + cdef readonly const {{INPUT_DTYPE_t}}[::1] sample_weight + cdef public float64_t sum_weight + + # TODO: idx_array and node_bounds must not be const, but this change needs + # to happen in a way which preserves pickling + # See also: https://github.com/cython/cython/issues/5639 + cdef public const intp_t[::1] idx_array + 
cdef public const NodeData_t[::1] node_data + cdef public const {{INPUT_DTYPE_t}}[:, :, ::1] node_bounds + + cdef intp_t leaf_size + cdef intp_t n_levels + cdef intp_t n_nodes + + cdef DistanceMetric{{name_suffix}} dist_metric + cdef int euclidean + + # variables to keep track of building & querying stats + cdef int n_trims + cdef int n_leaves + cdef int n_splits + cdef int n_calls + + valid_metrics = VALID_METRIC_IDS{{name_suffix}} + + # Use cinit to initialize all arrays to empty: this will prevent memory + # errors and seg-faults in rare cases where __init__ is not called + # A one-elements array is used as a placeholder to prevent + # any problem due to potential access to this attribute + # (e.g. assigning to NULL or a to value in another segment). + def __cinit__(self): + self.data = np.empty((1, 1), dtype={{INPUT_DTYPE}}, order='C') + self.sample_weight = np.empty(1, dtype={{INPUT_DTYPE}}, order='C') + self.idx_array = np.empty(1, dtype=np.intp, order='C') + self.node_data = np.empty(1, dtype=NodeData, order='C') + self.node_bounds = np.empty((1, 1, 1), dtype={{INPUT_DTYPE}}) + + self.leaf_size = 0 + self.n_levels = 0 + self.n_nodes = 0 + + self.euclidean = False + + self.n_trims = 0 + self.n_leaves = 0 + self.n_splits = 0 + self.n_calls = 0 + + def __init__(self, data, + leaf_size=40, metric='minkowski', sample_weight=None, **kwargs): + # validate data + self.data = check_array(data, dtype={{INPUT_DTYPE}}, order='C') + if self.data.size == 0: + raise ValueError("X is an empty array") + + n_samples = self.data.shape[0] + n_features = self.data.shape[1] + + if leaf_size < 1: + raise ValueError("leaf_size must be greater than or equal to 1") + self.leaf_size = leaf_size + + self.dist_metric = DistanceMetric.get_metric(metric, dtype={{INPUT_DTYPE}}, **kwargs) + self.euclidean = (self.dist_metric.__class__.__name__ + == 'EuclideanDistance{{name_suffix}}') + + metric = self.dist_metric.__class__.__name__ + if metric not in VALID_METRICS{{name_suffix}}: + raise ValueError('metric {metric} is not valid for ' + '{BinaryTree}'.format(metric=metric, + **DOC_DICT{{name_suffix}})) + self.dist_metric._validate_data(self.data) + + # determine number of levels in the tree, and from this + # the number of nodes in the tree. 
This results in leaf nodes + # with numbers of points between leaf_size and 2 * leaf_size + self.n_levels = int( + np.log2(fmax(1, (n_samples - 1) / self.leaf_size)) + 1) + self.n_nodes = (2 ** self.n_levels) - 1 + + # allocate arrays for storage + self.idx_array = np.arange(n_samples, dtype=np.intp) + self.node_data = np.zeros(self.n_nodes, dtype=NodeData) + + self._update_sample_weight(n_samples, sample_weight) + + # Allocate tree-specific data + allocate_data{{name_suffix}}(self, self.n_nodes, n_features) + self._recursive_build( + node_data=self.node_data.base, + i_node=0, + idx_start=0, + idx_end=n_samples + ) + + def _update_sample_weight(self, n_samples, sample_weight): + if sample_weight is not None: + self.sample_weight = np.asarray( + sample_weight, dtype={{INPUT_DTYPE}}, order='C') + self.sum_weight = np.sum(self.sample_weight) + else: + self.sample_weight = None + self.sum_weight = n_samples + + def __reduce__(self): + """ + reduce method used for pickling + """ + return (newObj, (type(self),), self.__getstate__()) + + def __getstate__(self): + """ + get state for pickling + """ + if self.sample_weight is not None: + # pass the numpy array + sample_weight = self.sample_weight.base + else: + # pass None to avoid confusion with the empty place holder + # of size 1 from __cinit__ + sample_weight = None + return (self.data.base, + self.idx_array.base, + self.node_data.base, + self.node_bounds.base, + int(self.leaf_size), + int(self.n_levels), + int(self.n_nodes), + int(self.n_trims), + int(self.n_leaves), + int(self.n_splits), + int(self.n_calls), + self.dist_metric, + sample_weight) + + def __setstate__(self, state): + """ + set state for pickling + """ + self.data = state[0] + self.idx_array = state[1] + self.node_data = state[2] + self.node_bounds = state[3] + self.leaf_size = state[4] + self.n_levels = state[5] + self.n_nodes = state[6] + self.n_trims = state[7] + self.n_leaves = state[8] + self.n_splits = state[9] + self.n_calls = state[10] + self.dist_metric = state[11] + sample_weight = state[12] + + self.euclidean = (self.dist_metric.__class__.__name__ + == 'EuclideanDistance64') + n_samples = self.data.shape[0] + self._update_sample_weight(n_samples, sample_weight) + + def get_tree_stats(self): + """ + get_tree_stats() + + Get tree status. + + Returns + ------- + tree_stats: tuple of int + (number of trims, number of leaves, number of splits) + """ + return (self.n_trims, self.n_leaves, self.n_splits) + + def reset_n_calls(self): + """ + reset_n_calls() + + Reset number of calls to 0. + """ + self.n_calls = 0 + + def get_n_calls(self): + """ + get_n_calls() + + Get number of calls. + + Returns + ------- + n_calls: int + number of distance computation calls + """ + return self.n_calls + + def get_arrays(self): + """ + get_arrays() + + Get data and node arrays. + + Returns + ------- + arrays: tuple of array + Arrays for storing tree data, index, node data and node bounds. 
+ """ + return ( + self.data.base, + self.idx_array.base, + self.node_data.base, + self.node_bounds.base, + ) + + cdef inline float64_t dist(self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, + intp_t size) except -1 nogil: + """Compute the distance between arrays x1 and x2""" + self.n_calls += 1 + if self.euclidean: + return euclidean_dist{{name_suffix}}(x1, x2, size) + else: + return self.dist_metric.dist(x1, x2, size) + + cdef inline float64_t rdist(self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, + intp_t size) except -1 nogil: + """Compute the reduced distance between arrays x1 and x2. + + The reduced distance, defined for some metrics, is a quantity which + is more efficient to compute than the distance, but preserves the + relative rankings of the true distance. For example, the reduced + distance for the Euclidean metric is the squared-euclidean distance. + """ + self.n_calls += 1 + if self.euclidean: + return euclidean_rdist{{name_suffix}}(x1, x2, size) + else: + return self.dist_metric.rdist(x1, x2, size) + + cdef int _recursive_build(self, NodeData_t[::1] node_data, intp_t i_node, intp_t idx_start, + intp_t idx_end) except -1: + """Recursively build the tree. + + Parameters + ---------- + i_node : int + the node for the current step + idx_start, idx_end : int + the bounding indices in the idx_array which define the points that + belong to this node. + """ + cdef intp_t imax + cdef intp_t n_features = self.data.shape[1] + cdef intp_t n_points = idx_end - idx_start + cdef intp_t n_mid = n_points / 2 + cdef intp_t* idx_array = &self.idx_array[idx_start] + cdef const {{INPUT_DTYPE_t}}* data = &self.data[0, 0] + + # initialize node data + init_node{{name_suffix}}(self, node_data, i_node, idx_start, idx_end) + + if 2 * i_node + 1 >= self.n_nodes: + node_data[i_node].is_leaf = True + if idx_end - idx_start > 2 * self.leaf_size: + # this shouldn't happen if our memory allocation is correct + # we'll proactively prevent memory errors, but raise a + # warning saying we're doing so. + import warnings + warnings.warn("Internal: memory layout is flawed: " + "not enough nodes allocated") + + elif idx_end - idx_start < 2: + # again, this shouldn't happen if our memory allocation + # is correct. Raise a warning. + import warnings + warnings.warn("Internal: memory layout is flawed: " + "too many nodes allocated") + node_data[i_node].is_leaf = True + + else: + # split node and recursively construct child nodes. 
+ node_data[i_node].is_leaf = False + i_max = find_node_split_dim(data, idx_array, + n_features, n_points) + partition_node_indices(data, idx_array, i_max, n_mid, + n_features, n_points) + self._recursive_build(node_data, 2 * i_node + 1, + idx_start, idx_start + n_mid) + self._recursive_build(node_data, 2 * i_node + 2, + idx_start + n_mid, idx_end) + + def query(self, X, k=1, return_distance=True, + dualtree=False, breadth_first=False, + sort_results=True): + """ + query(X, k=1, return_distance=True, + dualtree=False, breadth_first=False) + + query the tree for the k nearest neighbors + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + An array of points to query + k : int, default=1 + The number of nearest neighbors to return + return_distance : bool, default=True + if True, return a tuple (d, i) of distances and indices + if False, return array i + dualtree : bool, default=False + if True, use the dual tree formalism for the query: a tree is + built for the query points, and the pair of trees is used to + efficiently search this space. This can lead to better + performance as the number of points grows large. + breadth_first : bool, default=False + if True, then query the nodes in a breadth-first manner. + Otherwise, query the nodes in a depth-first manner. + sort_results : bool, default=True + if True, then distances and indices of each point are sorted + on return, so that the first column contains the closest points. + Otherwise, neighbors are returned in an arbitrary order. + + Returns + ------- + i : if return_distance == False + (d,i) : if return_distance == True + + d : ndarray of shape X.shape[:-1] + (k,), dtype=double + Each entry gives the list of distances to the neighbors of the + corresponding point. + + i : ndarray of shape X.shape[:-1] + (k,), dtype=int + Each entry gives the list of indices of neighbors of the + corresponding point. + """ + # XXX: we should allow X to be a pre-built tree. 
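# For illustration, a standalone usage sketch (not part of this module),
# assuming the public KDTree built from this template: the single-tree,
# dual-tree and breadth-first strategies return the same neighbors; only
# the order in which nodes are traversed differs.
#
# >>> import numpy as np
# >>> from sklearn.neighbors import KDTree
# >>> rng = np.random.RandomState(0)
# >>> X = rng.random_sample((50, 3))
# >>> tree = KDTree(X, leaf_size=5)
# >>> d0, i0 = tree.query(X[:5], k=3)
# >>> d1, i1 = tree.query(X[:5], k=3, dualtree=True)
# >>> d2, i2 = tree.query(X[:5], k=3, breadth_first=True)
# >>> bool(np.allclose(d0, d1) and np.allclose(d0, d2))
# True
# >>> bool(np.array_equal(i0, i1) and np.array_equal(i0, i2))
# True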
+ X = check_array(X, dtype={{INPUT_DTYPE}}, order='C') + + if X.shape[X.ndim - 1] != self.data.shape[1]: + raise ValueError("query data dimension must " + "match training data dimension") + + if self.data.shape[0] < k: + raise ValueError("k must be less than or equal " + "to the number of training points") + + # flatten X, and save original shape information + np_Xarr = X.reshape((-1, self.data.shape[1])) + cdef const {{INPUT_DTYPE_t}}[:, ::1] Xarr = np_Xarr + cdef float64_t reduced_dist_LB + cdef intp_t i + cdef const {{INPUT_DTYPE_t}}* pt + + # initialize heap for neighbors + cdef NeighborsHeap{{name_suffix}} heap = NeighborsHeap{{name_suffix}}(Xarr.shape[0], k) + + # node heap for breadth-first queries + cdef NodeHeap nodeheap + if breadth_first: + nodeheap = NodeHeap(self.data.shape[0] // self.leaf_size) + + # bounds is needed for the dual tree algorithm + cdef float64_t[::1] bounds + + self.n_trims = 0 + self.n_leaves = 0 + self.n_splits = 0 + + if dualtree: + other = self.__class__(np_Xarr, metric=self.dist_metric, + leaf_size=self.leaf_size) + if breadth_first: + self._query_dual_breadthfirst(other, heap, nodeheap) + else: + reduced_dist_LB = min_rdist_dual{{name_suffix}}(self, 0, other, 0) + bounds = np.full(other.node_data.shape[0], np.inf) + self._query_dual_depthfirst(0, other, 0, bounds, + heap, reduced_dist_LB) + + else: + pt = &Xarr[0, 0] + if breadth_first: + for i in range(Xarr.shape[0]): + self._query_single_breadthfirst(pt, i, heap, nodeheap) + pt += Xarr.shape[1] + else: + with nogil: + for i in range(Xarr.shape[0]): + reduced_dist_LB = min_rdist{{name_suffix}}(self, 0, pt) + self._query_single_depthfirst(0, pt, i, heap, + reduced_dist_LB) + pt += Xarr.shape[1] + + distances, indices = heap.get_arrays(sort=sort_results) + distances = self.dist_metric.rdist_to_dist(distances) + + # deflatten results + if return_distance: + return (distances.reshape(X.shape[:X.ndim - 1] + (k,)), + indices.reshape(X.shape[:X.ndim - 1] + (k,))) + else: + return indices.reshape(X.shape[:X.ndim - 1] + (k,)) + + def query_radius(self, X, r, int return_distance=False, + int count_only=False, int sort_results=False): + """ + query_radius(X, r, return_distance=False, + count_only=False, sort_results=False) + + query the tree for neighbors within a radius r + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + An array of points to query + r : distance within which neighbors are returned + r can be a single value, or an array of values of shape + x.shape[:-1] if different radii are desired for each point. + return_distance : bool, default=False + if True, return distances to neighbors of each point + if False, return only neighbors + Note that unlike the query() method, setting return_distance=True + here adds to the computation time. Not all distances need to be + calculated explicitly for return_distance=False. Results are + not sorted by default: see ``sort_results`` keyword. + count_only : bool, default=False + if True, return only the count of points within distance r + if False, return the indices of all points within distance r + If return_distance==True, setting count_only=True will + result in an error. + sort_results : bool, default=False + if True, the distances and indices will be sorted before being + returned. If False, the results will not be sorted. If + return_distance == False, setting sort_results = True will + result in an error. 
+ + Returns + ------- + count : if count_only == True + ind : if count_only == False and return_distance == False + (ind, dist) : if count_only == False and return_distance == True + + count : ndarray of shape X.shape[:-1], dtype=int + Each entry gives the number of neighbors within a distance r of the + corresponding point. + + ind : ndarray of shape X.shape[:-1], dtype=object + Each element is a numpy integer array listing the indices of + neighbors of the corresponding point. Note that unlike + the results of a k-neighbors query, the returned neighbors + are not sorted by distance by default. + + dist : ndarray of shape X.shape[:-1], dtype=object + Each element is a numpy double array listing the distances + corresponding to indices in i. + """ + if count_only and return_distance: + raise ValueError("count_only and return_distance " + "cannot both be true") + + if sort_results and not return_distance: + raise ValueError("return_distance must be True " + "if sort_results is True") + + cdef intp_t i, count_i = 0 + cdef intp_t n_features = self.data.shape[1] + cdef {{INPUT_DTYPE_t}}[::1] dist_arr_i + cdef intp_t[::1] idx_arr_i, counts + cdef const {{INPUT_DTYPE_t}}* pt + cdef intp_t** indices = NULL + cdef {{INPUT_DTYPE_t}}** distances = NULL + + # validate X and prepare for query + X = check_array(X, dtype={{INPUT_DTYPE}}, order='C') + + if X.shape[X.ndim - 1] != self.data.shape[1]: + raise ValueError("query data dimension must " + "match training data dimension") + + cdef const {{INPUT_DTYPE_t}}[:, ::1] Xarr = X.reshape((-1, self.data.shape[1])) + + # prepare r for query + r = np.asarray(r, dtype=np.float64, order='C') + r = np.atleast_1d(r) + if r.shape == (1,): + r = np.full(X.shape[:X.ndim - 1], r[0], dtype=np.float64) + else: + if r.shape != X.shape[:X.ndim - 1]: + raise ValueError("r must be broadcastable to X.shape") + + rarr_np = r.reshape(-1) # store explicitly to keep in scope + cdef float64_t[::1] rarr = rarr_np + + if not count_only: + indices = calloc(Xarr.shape[0], sizeof(intp_t*)) + if indices == NULL: + raise MemoryError() + if return_distance: + distances = <{{INPUT_DTYPE_t}}**>calloc(Xarr.shape[0], sizeof({{INPUT_DTYPE_t}}*)) + if distances == NULL: + free(indices) + raise MemoryError() + + np_idx_arr = np.zeros(self.data.shape[0], dtype=np.intp) + idx_arr_i = np_idx_arr + + np_dist_arr = np.zeros(self.data.shape[0], dtype={{INPUT_DTYPE}}) + dist_arr_i = np_dist_arr + + counts_arr = np.zeros(Xarr.shape[0], dtype=np.intp) + counts = counts_arr + + pt = &Xarr[0, 0] + memory_error = False + with nogil: + for i in range(Xarr.shape[0]): + counts[i] = self._query_radius_single(0, pt, rarr[i], + &idx_arr_i[0], + &dist_arr_i[0], + 0, count_only, + return_distance) + pt += n_features + + if count_only: + continue + + if sort_results: + _simultaneous_sort(&dist_arr_i[0], &idx_arr_i[0], + counts[i]) + + # equivalent to: indices[i] = np_idx_arr[:counts[i]].copy() + indices[i] = malloc(counts[i] * sizeof(intp_t)) + if indices[i] == NULL: + memory_error = True + break + memcpy(indices[i], &idx_arr_i[0], counts[i] * sizeof(intp_t)) + + if return_distance: + # equivalent to: distances[i] = np_dist_arr[:counts[i]].copy() + distances[i] = <{{INPUT_DTYPE_t}}*>malloc(counts[i] * sizeof({{INPUT_DTYPE_t}})) + if distances[i] == NULL: + memory_error = True + break + memcpy(distances[i], &dist_arr_i[0], counts[i] * sizeof({{INPUT_DTYPE_t}})) + + try: + if memory_error: + raise MemoryError() + + if count_only: + # deflatten results + return counts_arr.reshape(X.shape[:X.ndim - 1]) + elif 
return_distance: + indices_npy = np.zeros(Xarr.shape[0], dtype='object') + distances_npy = np.zeros(Xarr.shape[0], dtype='object') + for i in range(Xarr.shape[0]): + # make a new numpy array that wraps the existing data + # TODO: remove the explicit cast to cnp.intp_t* when cython min version >= 3.0 + indices_npy[i] = cnp.PyArray_SimpleNewFromData(1, &counts[i], cnp.NPY_INTP, indices[i]) + # make sure the data will be freed when the numpy array is garbage collected + PyArray_ENABLEFLAGS(indices_npy[i], cnp.NPY_ARRAY_OWNDATA) + # make sure the data is not freed twice + indices[i] = NULL + + # make a new numpy array that wraps the existing data + # TODO: remove the explicit cast to cnp.intp_t* when cython min version >= 3.0 + distances_npy[i] = cnp.PyArray_SimpleNewFromData(1, &counts[i], {{NPY_TYPE}}, distances[i]) + # make sure the data will be freed when the numpy array is garbage collected + PyArray_ENABLEFLAGS(distances_npy[i], cnp.NPY_ARRAY_OWNDATA) + # make sure the data is not freed twice + distances[i] = NULL + + # deflatten results + return (indices_npy.reshape(X.shape[:X.ndim - 1]), + distances_npy.reshape(X.shape[:X.ndim - 1])) + else: + indices_npy = np.zeros(Xarr.shape[0], dtype='object') + for i in range(Xarr.shape[0]): + # make a new numpy array that wraps the existing data + # TODO: remove the explicit cast to cnp.intp_t* when cython min version >= 3.0 + indices_npy[i] = cnp.PyArray_SimpleNewFromData(1, &counts[i], cnp.NPY_INTP, indices[i]) + # make sure the data will be freed when the numpy array is garbage collected + PyArray_ENABLEFLAGS(indices_npy[i], cnp.NPY_ARRAY_OWNDATA) + # make sure the data is not freed twice + indices[i] = NULL + + # deflatten results + return indices_npy.reshape(X.shape[:X.ndim - 1]) + except MemoryError: + # free any buffer that is not owned by a numpy array + for i in range(Xarr.shape[0]): + free(indices[i]) + if return_distance: + free(distances[i]) + raise + finally: + free(indices) + free(distances) + + def kernel_density(self, X, h, kernel='gaussian', + atol=0, rtol=1E-8, + breadth_first=True, return_log=False): + """ + kernel_density(X, h, kernel='gaussian', atol=0, rtol=1E-8, + breadth_first=True, return_log=False) + + Compute the kernel density estimate at points X with the given kernel, + using the distance metric specified at tree creation. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + An array of points to query. Last dimension should match dimension + of training data. + h : float + the bandwidth of the kernel + kernel : str, default="gaussian" + specify the kernel to use. Options are + - 'gaussian' + - 'tophat' + - 'epanechnikov' + - 'exponential' + - 'linear' + - 'cosine' + Default is kernel = 'gaussian' + atol : float, default=0 + Specify the desired absolute tolerance of the result. + If the true result is `K_true`, then the returned result `K_ret` + satisfies ``abs(K_true - K_ret) < atol + rtol * K_ret`` + The default is zero (i.e. machine precision). + rtol : float, default=1e-8 + Specify the desired relative tolerance of the result. + If the true result is `K_true`, then the returned result `K_ret` + satisfies ``abs(K_true - K_ret) < atol + rtol * K_ret`` + The default is `1e-8` (i.e. machine precision). + breadth_first : bool, default=False + If True, use a breadth-first search. If False (default) use a + depth-first search. Breadth-first is generally faster for + compact kernels and/or high tolerances. + return_log : bool, default=False + Return the logarithm of the result. 
This can be more accurate + than returning the result itself for narrow kernels. + + Returns + ------- + density : ndarray of shape X.shape[:-1] + The array of (log)-density evaluations + """ + cdef float64_t h_c = h + cdef float64_t log_atol = log(atol) + cdef float64_t log_rtol = log(rtol) + cdef float64_t log_min_bound, log_max_bound, log_bound_spread + cdef float64_t dist_LB = 0, dist_UB = 0 + + cdef intp_t n_samples = self.data.shape[0] + cdef intp_t n_features = self.data.shape[1] + cdef intp_t i + cdef KernelType kernel_c + + # validate kernel + if kernel == 'gaussian': + kernel_c = GAUSSIAN_KERNEL + elif kernel == 'tophat': + kernel_c = TOPHAT_KERNEL + elif kernel == 'epanechnikov': + kernel_c = EPANECHNIKOV_KERNEL + elif kernel == 'exponential': + kernel_c = EXPONENTIAL_KERNEL + elif kernel == 'linear': + kernel_c = LINEAR_KERNEL + elif kernel == 'cosine': + kernel_c = COSINE_KERNEL + else: + raise ValueError("kernel = '%s' not recognized" % kernel) + + cdef float64_t log_knorm = _log_kernel_norm(h_c, n_features, kernel_c) + + # validate X and prepare for query + X = check_array(X, dtype={{INPUT_DTYPE}}, order='C') + + if X.shape[X.ndim - 1] != n_features: + raise ValueError("query data dimension must " + "match training data dimension") + Xarr_np = X.reshape((-1, n_features)) + cdef const {{INPUT_DTYPE_t}}[:, ::1] Xarr = Xarr_np + + log_density_arr = np.zeros(Xarr.shape[0], dtype={{INPUT_DTYPE}}) + cdef {{INPUT_DTYPE_t}}[::1] log_density = log_density_arr + + cdef const {{INPUT_DTYPE_t}}* pt = &Xarr[0, 0] + + cdef NodeHeap nodeheap + if breadth_first: + nodeheap = NodeHeap(self.data.shape[0] // self.leaf_size) + cdef float64_t[::1] node_log_min_bounds + cdef float64_t[::1] node_bound_widths + # TODO: implement dual tree approach. + # this is difficult because of the need to cache values + # computed between node pairs. + if breadth_first: + node_log_min_bounds_arr = np.full(self.n_nodes, -np.inf) + node_log_min_bounds = node_log_min_bounds_arr + node_bound_widths_arr = np.zeros(self.n_nodes) + node_bound_widths = node_bound_widths_arr + for i in range(Xarr.shape[0]): + log_density[i] = self._kde_single_breadthfirst( + pt, kernel_c, h_c, + log_knorm, log_atol, log_rtol, + nodeheap, + &node_log_min_bounds[0], + &node_bound_widths[0]) + pt += n_features + else: + for i in range(Xarr.shape[0]): + min_max_dist{{name_suffix}}(self, 0, pt, &dist_LB, &dist_UB) + # compute max & min bounds on density within top node + log_min_bound = (log(self.sum_weight) + + compute_log_kernel(dist_UB, + h_c, kernel_c)) + log_max_bound = (log(self.sum_weight) + + compute_log_kernel(dist_LB, + h_c, kernel_c)) + log_bound_spread = logsubexp(log_max_bound, log_min_bound) + self._kde_single_depthfirst(0, pt, kernel_c, h_c, + log_knorm, log_atol, log_rtol, + log_min_bound, + log_bound_spread, + &log_min_bound, + &log_bound_spread) + log_density[i] = logaddexp(log_min_bound, + log_bound_spread - log(2)) + pt += n_features + + # normalize the results + for i in range(log_density.shape[0]): + log_density[i] += log_knorm + + log_density_arr = log_density_arr.reshape(X.shape[:X.ndim - 1]) + + if return_log: + return log_density_arr + else: + return np.exp(log_density_arr) + + def two_point_correlation(self, X, r, dualtree=False): + """ + two_point_correlation(X, r, dualtree=False) + + Compute the two-point correlation function + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + An array of points to query. Last dimension should match dimension + of training data. 
+ r : array-like + A one-dimensional array of distances + dualtree : bool, default=False + If True, use a dualtree algorithm. Otherwise, use a single-tree + algorithm. Dual tree algorithms can have better scaling for + large N. + + Returns + ------- + counts : ndarray + counts[i] contains the number of pairs of points with distance + less than or equal to r[i] + """ + cdef intp_t n_features = self.data.shape[1] + cdef intp_t i + + # validate X and prepare for query + X = check_array(X, dtype={{INPUT_DTYPE}}, order='C') + + if X.shape[X.ndim - 1] != self.data.shape[1]: + raise ValueError("query data dimension must " + "match training data dimension") + + np_Xarr = X.reshape((-1, self.data.shape[1])) + cdef {{INPUT_DTYPE_t}}[:, ::1] Xarr = np_Xarr + + # prepare r for query + r = np.asarray(r, dtype=np.float64, order='C') + r = np.atleast_1d(r) + if r.ndim != 1: + raise ValueError("r must be a 1-dimensional array") + i_rsort = np.argsort(r) + rarr_np = r[i_rsort] # needed to keep memory in scope + cdef float64_t[::1] rarr = rarr_np + + # create array to hold counts + count = np.zeros(r.shape[0], dtype=np.intp) + cdef intp_t[::1] carr = count + + cdef const {{INPUT_DTYPE_t}}* pt = &Xarr[0, 0] + + if dualtree: + other = self.__class__(Xarr, metric=self.dist_metric, + leaf_size=self.leaf_size) + self._two_point_dual(0, other, 0, &rarr[0], &carr[0], + 0, rarr.shape[0]) + else: + for i in range(Xarr.shape[0]): + self._two_point_single(0, pt, &rarr[0], &carr[0], + 0, rarr.shape[0]) + pt += n_features + + return count + + cdef int _query_single_depthfirst( + self, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, + intp_t i_pt, + NeighborsHeap{{name_suffix}} heap, + float64_t reduced_dist_LB, + ) except -1 nogil: + """Recursive Single-tree k-neighbors query, depth-first approach""" + cdef NodeData_t node_info = self.node_data[i_node] + + cdef float64_t dist_pt, reduced_dist_LB_1, reduced_dist_LB_2 + cdef intp_t i, i1, i2 + + cdef const {{INPUT_DTYPE_t}}* data = &self.data[0, 0] + + # ------------------------------------------------------------ + # Case 1: query point is outside node radius: + # trim it from the query + if reduced_dist_LB > heap.largest(i_pt): + self.n_trims += 1 + + # ------------------------------------------------------------ + # Case 2: this is a leaf node. Update set of nearby points + elif node_info.is_leaf: + self.n_leaves += 1 + for i in range(node_info.idx_start, node_info.idx_end): + dist_pt = self.rdist(pt, + &self.data[self.idx_array[i], 0], + self.data.shape[1]) + heap._push(i_pt, dist_pt, self.idx_array[i]) + + # ------------------------------------------------------------ + # Case 3: Node is not a leaf. 
Recursively query subnodes + # starting with the closest + else: + self.n_splits += 1 + i1 = 2 * i_node + 1 + i2 = i1 + 1 + reduced_dist_LB_1 = min_rdist{{name_suffix}}(self, i1, pt) + reduced_dist_LB_2 = min_rdist{{name_suffix}}(self, i2, pt) + + # recursively query subnodes + if reduced_dist_LB_1 <= reduced_dist_LB_2: + self._query_single_depthfirst(i1, pt, i_pt, heap, + reduced_dist_LB_1) + self._query_single_depthfirst(i2, pt, i_pt, heap, + reduced_dist_LB_2) + else: + self._query_single_depthfirst(i2, pt, i_pt, heap, + reduced_dist_LB_2) + self._query_single_depthfirst(i1, pt, i_pt, heap, + reduced_dist_LB_1) + return 0 + + cdef int _query_single_breadthfirst( + self, + const {{INPUT_DTYPE_t}}* pt, + intp_t i_pt, + NeighborsHeap{{name_suffix}} heap, + NodeHeap nodeheap, + ) except -1: + """Non-recursive single-tree k-neighbors query, breadth-first search""" + cdef intp_t i, i_node + cdef float64_t dist_pt, reduced_dist_LB + cdef const NodeData_t* node_data = &self.node_data[0] + cdef const {{INPUT_DTYPE_t}}* data = &self.data[0, 0] + + # Set up the node heap and push the head node onto it + cdef NodeHeapData_t nodeheap_item + nodeheap_item.val = min_rdist{{name_suffix}}(self, 0, pt) + nodeheap_item.i1 = 0 + nodeheap.push(nodeheap_item) + + while nodeheap.n > 0: + nodeheap_item = nodeheap.pop() + reduced_dist_LB = nodeheap_item.val + i_node = nodeheap_item.i1 + node_info = node_data[i_node] + + # ------------------------------------------------------------ + # Case 1: query point is outside node radius: + # trim it from the query + if reduced_dist_LB > heap.largest(i_pt): + self.n_trims += 1 + + # ------------------------------------------------------------ + # Case 2: this is a leaf node. Update set of nearby points + elif node_data[i_node].is_leaf: + self.n_leaves += 1 + for i in range(node_data[i_node].idx_start, + node_data[i_node].idx_end): + dist_pt = self.rdist(pt, + &self.data[self.idx_array[i], 0], + self.data.shape[1]) + heap._push(i_pt, dist_pt, self.idx_array[i]) + + # ------------------------------------------------------------ + # Case 3: Node is not a leaf. Add subnodes to the node heap + else: + self.n_splits += 1 + for i in range(2 * i_node + 1, 2 * i_node + 3): + nodeheap_item.i1 = i + nodeheap_item.val = min_rdist{{name_suffix}}(self, i, pt) + nodeheap.push(nodeheap_item) + return 0 + + cdef int _query_dual_depthfirst( + self, + intp_t i_node1, + BinaryTree{{name_suffix}} other, + intp_t i_node2, + float64_t[::1] bounds, + NeighborsHeap{{name_suffix}} heap, + float64_t reduced_dist_LB, + ) except -1: + """Recursive dual-tree k-neighbors query, depth-first""" + # note that the array `bounds` is maintained such that + # bounds[i] is the largest distance among any of the + # current neighbors in node i of the other tree. 
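        # In the heap-ordered node layout used here (children of node i are
        # 2*i + 1 and 2*i + 2), a parent's bound can never be tighter than the
        # larger of its children's bounds.  After a pair of leaves is processed
        # in Case 2 below, the leaf's bound is recomputed from the current
        # k-th neighbor distances of its points and the improvement is pushed
        # toward the root, roughly:
        #
        #     while i_node2 > 0:
        #         i_parent = (i_node2 - 1) // 2
        #         bounds[i_parent] = max(bounds[2 * i_parent + 1],
        #                                bounds[2 * i_parent + 2])
        #         i_node2 = i_parent
        #
        # (a sketch of the propagation loop; the real code below stops early as
        # soon as a parent's bound no longer shrinks).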
+ cdef NodeData_t node_info1 = self.node_data[i_node1] + cdef NodeData_t node_info2 = other.node_data[i_node2] + + cdef const {{INPUT_DTYPE_t}}* data1 = &self.data[0, 0] + cdef const {{INPUT_DTYPE_t}}* data2 = &other.data[0, 0] + cdef intp_t n_features = self.data.shape[1] + + cdef float64_t bound_max, dist_pt, reduced_dist_LB1, reduced_dist_LB2 + cdef intp_t i1, i2, i_pt, i_parent + + # ------------------------------------------------------------ + # Case 1: nodes are further apart than the current bound: + # trim both from the query + if reduced_dist_LB > bounds[i_node2]: + pass + + # ------------------------------------------------------------ + # Case 2: both nodes are leaves: + # do a brute-force search comparing all pairs + elif node_info1.is_leaf and node_info2.is_leaf: + bounds[i_node2] = 0 + + for i2 in range(node_info2.idx_start, node_info2.idx_end): + i_pt = other.idx_array[i2] + + if heap.largest(i_pt) <= reduced_dist_LB: + continue + + for i1 in range(node_info1.idx_start, node_info1.idx_end): + dist_pt = self.rdist( + data1 + n_features * self.idx_array[i1], + data2 + n_features * i_pt, + n_features) + heap._push(i_pt, dist_pt, self.idx_array[i1]) + + # keep track of node bound + bounds[i_node2] = fmax(bounds[i_node2], + heap.largest(i_pt)) + + # update bounds up the tree + while i_node2 > 0: + i_parent = (i_node2 - 1) // 2 + bound_max = fmax(bounds[2 * i_parent + 1], + bounds[2 * i_parent + 2]) + if bound_max < bounds[i_parent]: + bounds[i_parent] = bound_max + i_node2 = i_parent + else: + break + + # ------------------------------------------------------------ + # Case 3a: node 1 is a leaf or is smaller: split node 2 and + # recursively query, starting with the nearest subnode + elif node_info1.is_leaf or (not node_info2.is_leaf + and node_info2.radius > node_info1.radius): + reduced_dist_LB1 = min_rdist_dual{{name_suffix}}(self, i_node1, + other, 2 * i_node2 + 1) + reduced_dist_LB2 = min_rdist_dual{{name_suffix}}(self, i_node1, + other, 2 * i_node2 + 2) + + if reduced_dist_LB1 < reduced_dist_LB2: + self._query_dual_depthfirst(i_node1, other, 2 * i_node2 + 1, + bounds, heap, reduced_dist_LB1) + self._query_dual_depthfirst(i_node1, other, 2 * i_node2 + 2, + bounds, heap, reduced_dist_LB2) + else: + self._query_dual_depthfirst(i_node1, other, 2 * i_node2 + 2, + bounds, heap, reduced_dist_LB2) + self._query_dual_depthfirst(i_node1, other, 2 * i_node2 + 1, + bounds, heap, reduced_dist_LB1) + + # ------------------------------------------------------------ + # Case 3b: node 2 is a leaf or is smaller: split node 1 and + # recursively query, starting with the nearest subnode + else: + reduced_dist_LB1 = min_rdist_dual{{name_suffix}}(self, 2 * i_node1 + 1, + other, i_node2) + reduced_dist_LB2 = min_rdist_dual{{name_suffix}}(self, 2 * i_node1 + 2, + other, i_node2) + + if reduced_dist_LB1 < reduced_dist_LB2: + self._query_dual_depthfirst(2 * i_node1 + 1, other, i_node2, + bounds, heap, reduced_dist_LB1) + self._query_dual_depthfirst(2 * i_node1 + 2, other, i_node2, + bounds, heap, reduced_dist_LB2) + else: + self._query_dual_depthfirst(2 * i_node1 + 2, other, i_node2, + bounds, heap, reduced_dist_LB2) + self._query_dual_depthfirst(2 * i_node1 + 1, other, i_node2, + bounds, heap, reduced_dist_LB1) + return 0 + + cdef int _query_dual_breadthfirst( + self, + BinaryTree{{name_suffix}} other, + NeighborsHeap{{name_suffix}} heap, + NodeHeap nodeheap, + ) except -1: + """Non-recursive dual-tree k-neighbors query, breadth-first""" + cdef intp_t i, i1, i2, i_node1, i_node2, i_pt + cdef 
float64_t dist_pt, reduced_dist_LB + cdef float64_t[::1] bounds = np.full(other.node_data.shape[0], np.inf) + cdef const NodeData_t* node_data1 = &self.node_data[0] + cdef const NodeData_t* node_data2 = &other.node_data[0] + cdef NodeData_t node_info1, node_info2 + cdef const {{INPUT_DTYPE_t}}* data1 = &self.data[0, 0] + cdef const {{INPUT_DTYPE_t}}* data2 = &other.data[0, 0] + cdef intp_t n_features = self.data.shape[1] + + # Set up the node heap and push the head nodes onto it + cdef NodeHeapData_t nodeheap_item + nodeheap_item.val = min_rdist_dual{{name_suffix}}(self, 0, other, 0) + nodeheap_item.i1 = 0 + nodeheap_item.i2 = 0 + nodeheap.push(nodeheap_item) + + while nodeheap.n > 0: + nodeheap_item = nodeheap.pop() + reduced_dist_LB = nodeheap_item.val + i_node1 = nodeheap_item.i1 + i_node2 = nodeheap_item.i2 + + node_info1 = node_data1[i_node1] + node_info2 = node_data2[i_node2] + + # ------------------------------------------------------------ + # Case 1: nodes are further apart than the current bound: + # trim both from the query + if reduced_dist_LB > bounds[i_node2]: + pass + + # ------------------------------------------------------------ + # Case 2: both nodes are leaves: + # do a brute-force search comparing all pairs + elif node_info1.is_leaf and node_info2.is_leaf: + bounds[i_node2] = -1 + + for i2 in range(node_info2.idx_start, node_info2.idx_end): + i_pt = other.idx_array[i2] + + if heap.largest(i_pt) <= reduced_dist_LB: + continue + + for i1 in range(node_info1.idx_start, node_info1.idx_end): + dist_pt = self.rdist( + data1 + n_features * self.idx_array[i1], + data2 + n_features * i_pt, + n_features) + heap._push(i_pt, dist_pt, self.idx_array[i1]) + + # keep track of node bound + bounds[i_node2] = fmax(bounds[i_node2], + heap.largest(i_pt)) + + # ------------------------------------------------------------ + # Case 3a: node 1 is a leaf or is smaller: split node 2 and + # recursively query, starting with the nearest subnode + elif node_info1.is_leaf or (not node_info2.is_leaf + and (node_info2.radius + > node_info1.radius)): + nodeheap_item.i1 = i_node1 + for i2 in range(2 * i_node2 + 1, 2 * i_node2 + 3): + nodeheap_item.i2 = i2 + nodeheap_item.val = min_rdist_dual{{name_suffix}}(self, i_node1, + other, i2) + nodeheap.push(nodeheap_item) + + # ------------------------------------------------------------ + # Case 3b: node 2 is a leaf or is smaller: split node 1 and + # recursively query, starting with the nearest subnode + else: + nodeheap_item.i2 = i_node2 + for i1 in range(2 * i_node1 + 1, 2 * i_node1 + 3): + nodeheap_item.i1 = i1 + nodeheap_item.val = min_rdist_dual{{name_suffix}}(self, i1, + other, i_node2) + nodeheap.push(nodeheap_item) + return 0 + + cdef intp_t _query_radius_single( + self, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, + float64_t r, + intp_t* indices, + {{INPUT_DTYPE_t}}* distances, + intp_t count, + int count_only, + int return_distance, + ) noexcept nogil: + """recursive single-tree radius query, depth-first""" + cdef const {{INPUT_DTYPE_t}}* data = &self.data[0, 0] + cdef intp_t* idx_array = &self.idx_array[0] + cdef intp_t n_features = self.data.shape[1] + cdef NodeData_t node_info = self.node_data[i_node] + + cdef intp_t i + cdef float64_t reduced_r + + cdef float64_t dist_pt, dist_LB = 0, dist_UB = 0 + min_max_dist{{name_suffix}}(self, i_node, pt, &dist_LB, &dist_UB) + + # ------------------------------------------------------------ + # Case 1: all node points are outside distance r. + # prune this branch. 
+ if dist_LB > r: + pass + + # ------------------------------------------------------------ + # Case 2: all node points are within distance r + # add all points to neighbors + elif dist_UB <= r: + if count_only: + count += (node_info.idx_end - node_info.idx_start) + else: + for i in range(node_info.idx_start, node_info.idx_end): + if (count < 0) or (count >= self.data.shape[0]): + return -1 + indices[count] = idx_array[i] + if return_distance: + distances[count] = self.dist(pt, (data + n_features + * idx_array[i]), + n_features) + count += 1 + + # ------------------------------------------------------------ + # Case 3: this is a leaf node. Go through all points to + # determine if they fall within radius + elif node_info.is_leaf: + reduced_r = self.dist_metric._dist_to_rdist(r) + + for i in range(node_info.idx_start, node_info.idx_end): + dist_pt = self.rdist(pt, (data + n_features * idx_array[i]), + n_features) + if dist_pt <= reduced_r: + if (count < 0) or (count >= self.data.shape[0]): + return -1 + if count_only: + pass + else: + indices[count] = idx_array[i] + if return_distance: + distances[count] =\ + self.dist_metric._rdist_to_dist(dist_pt) + count += 1 + + # ------------------------------------------------------------ + # Case 4: Node is not a leaf. Recursively query subnodes + else: + count = self._query_radius_single(2 * i_node + 1, pt, r, + indices, distances, count, + count_only, return_distance) + count = self._query_radius_single(2 * i_node + 2, pt, r, + indices, distances, count, + count_only, return_distance) + + return count + + cdef float64_t _kde_single_breadthfirst( + self, const {{INPUT_DTYPE_t}}* pt, + KernelType kernel, + float64_t h, + float64_t log_knorm, + float64_t log_atol, + float64_t log_rtol, + NodeHeap nodeheap, + float64_t* node_log_min_bounds, + float64_t* node_log_bound_spreads, + ): + """non-recursive single-tree kernel density estimation""" + # For the given point, node_log_min_bounds and node_log_bound_spreads + # will encode the current bounds on the density between the point + # and the associated node. + # The variables global_log_min_bound and global_log_bound_spread + # keep track of the global bounds on density. The procedure here is + # to split nodes, updating these bounds, until the bounds are within + # atol & rtol. 
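        # A small pure-NumPy sketch of the log-domain bookkeeping described
        # above (illustration only, assuming an unnormalized Gaussian kernel;
        # the public entry point is e.g. tree.kernel_density(X, h=0.5)):
        # a node holding N points whose distances to the query lie in
        # [dist_LB, dist_UB] contributes between N * K(dist_UB) and
        # N * K(dist_LB) to the unnormalized density, and the final estimate
        # is the midpoint of whatever interval remains once the tolerances
        # are met.
        #
        #     import numpy as np
        #
        #     def log_gauss_kernel(d, h):      # unnormalized Gaussian kernel
        #         return -0.5 * (d / h) ** 2
        #
        #     def logsubexp(a, b):             # log(exp(a) - exp(b)) for a >= b
        #         return a + np.log1p(-np.exp(b - a))
        #
        #     N, h, dist_LB, dist_UB = 40, 0.5, 0.2, 1.3
        #     log_min = np.log(N) + log_gauss_kernel(dist_UB, h)   # lower bound
        #     log_max = np.log(N) + log_gauss_kernel(dist_LB, h)   # upper bound
        #     log_spread = logsubexp(log_max, log_min)
        #     # midpoint of [min, max] in linear space, computed in log space,
        #     # mirroring logaddexp(global_log_min_bound,
        #     #                     global_log_bound_spread - log(2)) below
        #     log_estimate = np.logaddexp(log_min, log_spread - np.log(2))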
+ cdef intp_t i, i1, i2, i_node + cdef float64_t N1, N2 + cdef float64_t global_log_min_bound, global_log_bound_spread + cdef float64_t global_log_max_bound + + cdef const {{INPUT_DTYPE_t}}* data = &self.data[0, 0] + cdef bint with_sample_weight = self.sample_weight is not None + cdef const {{INPUT_DTYPE_t}}* sample_weight + if with_sample_weight: + sample_weight = &self.sample_weight[0] + cdef intp_t* idx_array = &self.idx_array[0] + cdef const NodeData_t* node_data = &self.node_data[0] + cdef float64_t N + cdef float64_t log_weight + if with_sample_weight: + N = self.sum_weight + else: + N = self.data.shape[0] + cdef intp_t n_features = self.data.shape[1] + + cdef NodeData_t node_info + cdef float64_t dist_pt, log_density + cdef float64_t dist_LB_1 = 0, dist_LB_2 = 0 + cdef float64_t dist_UB_1 = 0, dist_UB_2 = 0 + + cdef float64_t dist_UB, dist_LB + + # push the top node to the heap + cdef NodeHeapData_t nodeheap_item + nodeheap_item.val = min_dist{{name_suffix}}(self, 0, pt) + nodeheap_item.i1 = 0 + nodeheap.push(nodeheap_item) + + global_log_min_bound = log(N) + compute_log_kernel( + max_dist{{name_suffix}}(self, 0, pt), h, kernel + ) + global_log_max_bound = log(N) + compute_log_kernel(nodeheap_item.val, + h, kernel) + global_log_bound_spread = logsubexp(global_log_max_bound, + global_log_min_bound) + + node_log_min_bounds[0] = global_log_min_bound + node_log_bound_spreads[0] = global_log_bound_spread + + while nodeheap.n > 0: + nodeheap_item = nodeheap.pop() + i_node = nodeheap_item.i1 + + node_info = node_data[i_node] + if with_sample_weight: + N1 = _total_node_weight(node_data, sample_weight, + idx_array, i_node) + else: + N1 = node_info.idx_end - node_info.idx_start + + # ------------------------------------------------------------ + # Case 1: local bounds are equal to within per-point tolerance. + if (log_knorm + node_log_bound_spreads[i_node] - log(N1) + log(N) + <= logaddexp(log_atol, (log_rtol + log_knorm + + node_log_min_bounds[i_node]))): + pass + + # ------------------------------------------------------------ + # Case 2: global bounds are within rtol & atol. + elif (log_knorm + global_log_bound_spread + <= logaddexp(log_atol, + log_rtol + log_knorm + global_log_min_bound)): + break + + # ------------------------------------------------------------ + # Case 3: node is a leaf. Count contributions from all points + elif node_info.is_leaf: + global_log_min_bound =\ + logsubexp(global_log_min_bound, + node_log_min_bounds[i_node]) + global_log_bound_spread =\ + logsubexp(global_log_bound_spread, + node_log_bound_spreads[i_node]) + for i in range(node_info.idx_start, node_info.idx_end): + dist_pt = self.dist(pt, data + n_features * idx_array[i], + n_features) + log_density = compute_log_kernel(dist_pt, h, kernel) + if with_sample_weight: + log_weight = np.log(sample_weight[idx_array[i]]) + else: + log_weight = 0. 
+ global_log_min_bound = logaddexp(global_log_min_bound, + log_density + log_weight) + + # ------------------------------------------------------------ + # Case 4: split node and query subnodes + else: + i1 = 2 * i_node + 1 + i2 = 2 * i_node + 2 + + if with_sample_weight: + N1 = _total_node_weight(node_data, sample_weight, + idx_array, i1) + N2 = _total_node_weight(node_data, sample_weight, + idx_array, i2) + else: + N1 = node_data[i1].idx_end - node_data[i1].idx_start + N2 = node_data[i2].idx_end - node_data[i2].idx_start + + min_max_dist{{name_suffix}}(self, i1, pt, &dist_LB_1, &dist_UB_1) + min_max_dist{{name_suffix}}(self, i2, pt, &dist_LB_2, &dist_UB_2) + + node_log_min_bounds[i1] = (log(N1) + + compute_log_kernel(dist_UB_1, + h, kernel)) + node_log_bound_spreads[i1] = (log(N1) + + compute_log_kernel(dist_LB_1, + h, kernel)) + + node_log_min_bounds[i2] = (log(N2) + + compute_log_kernel(dist_UB_2, + h, kernel)) + node_log_bound_spreads[i2] = (log(N2) + + compute_log_kernel(dist_LB_2, + h, kernel)) + + global_log_min_bound = logsubexp(global_log_min_bound, + node_log_min_bounds[i_node]) + global_log_min_bound = logaddexp(global_log_min_bound, + node_log_min_bounds[i1]) + global_log_min_bound = logaddexp(global_log_min_bound, + node_log_min_bounds[i2]) + + global_log_bound_spread =\ + logsubexp(global_log_bound_spread, + node_log_bound_spreads[i_node]) + global_log_bound_spread = logaddexp(global_log_bound_spread, + node_log_bound_spreads[i1]) + global_log_bound_spread = logaddexp(global_log_bound_spread, + node_log_bound_spreads[i2]) + + # TODO: rank by the spread rather than the distance? + nodeheap_item.val = dist_LB_1 + nodeheap_item.i1 = i1 + nodeheap.push(nodeheap_item) + + nodeheap_item.val = dist_LB_2 + nodeheap_item.i1 = i2 + nodeheap.push(nodeheap_item) + + nodeheap.clear() + return logaddexp(global_log_min_bound, + global_log_bound_spread - log(2)) + + cdef int _kde_single_depthfirst( + self, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, + KernelType kernel, + float64_t h, + float64_t log_knorm, + float64_t log_atol, + float64_t log_rtol, + float64_t local_log_min_bound, + float64_t local_log_bound_spread, + float64_t* global_log_min_bound, + float64_t* global_log_bound_spread, + ) except -1: + """recursive single-tree kernel density estimate, depth-first""" + # For the given point, local_min_bound and local_max_bound give the + # minimum and maximum density for the current node, while + # global_min_bound and global_max_bound give the minimum and maximum + # density over the entire tree. We recurse down until global_min_bound + # and global_max_bound are within rtol and atol. 
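        # Concretely, `local_log_min_bound` / `local_log_bound_spread` are this
        # node's current contribution to the global bounds.  When the node is
        # split (Case 4 below) that contribution is removed with logsubexp and
        # replaced, via logaddexp, by the tighter contributions computed for
        # its two children, so the global interval tightens as the recursion
        # proceeds; Cases 1 and 2 stop early once either the local or the
        # global interval is already within the requested tolerances.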
+ cdef intp_t i, i1, i2, iw, start, end + cdef float64_t N1, N2 + + cdef const {{INPUT_DTYPE_t}}* data = &self.data[0, 0] + cdef const NodeData_t* node_data = &self.node_data[0] + cdef bint with_sample_weight = self.sample_weight is not None + cdef const {{INPUT_DTYPE_t}}* sample_weight + cdef float64_t log_weight + if with_sample_weight: + sample_weight = &self.sample_weight[0] + cdef intp_t* idx_array = &self.idx_array[0] + cdef intp_t n_features = self.data.shape[1] + + cdef NodeData_t node_info = self.node_data[i_node] + cdef float64_t dist_pt, log_dens_contribution + + cdef float64_t child1_log_min_bound, child2_log_min_bound + cdef float64_t child1_log_bound_spread, child2_log_bound_spread + cdef float64_t dist_UB = 0, dist_LB = 0 + + if with_sample_weight: + N1 = _total_node_weight(node_data, sample_weight, + idx_array, i_node) + N2 = self.sum_weight + else: + N1 = (node_info.idx_end - node_info.idx_start) + N2 = self.data.shape[0] + + # ------------------------------------------------------------ + # Case 1: local bounds are equal to within errors. Return + if ( + log_knorm + local_log_bound_spread - log(N1) + log(N2) + <= logaddexp(log_atol, (log_rtol + log_knorm + local_log_min_bound)) + ): + pass + + # ------------------------------------------------------------ + # Case 2: global bounds are within rtol & atol. Return + elif ( + log_knorm + global_log_bound_spread[0] + <= logaddexp(log_atol, (log_rtol + log_knorm + global_log_min_bound[0])) + ): + pass + + # ------------------------------------------------------------ + # Case 3: node is a leaf. Count contributions from all points + elif node_info.is_leaf: + global_log_min_bound[0] = logsubexp(global_log_min_bound[0], + local_log_min_bound) + global_log_bound_spread[0] = logsubexp(global_log_bound_spread[0], + local_log_bound_spread) + for i in range(node_info.idx_start, node_info.idx_end): + dist_pt = self.dist(pt, (data + n_features * idx_array[i]), + n_features) + log_dens_contribution = compute_log_kernel(dist_pt, h, kernel) + if with_sample_weight: + log_weight = np.log(sample_weight[idx_array[i]]) + else: + log_weight = 0. 
+ global_log_min_bound[0] = logaddexp(global_log_min_bound[0], + (log_dens_contribution + + log_weight)) + + # ------------------------------------------------------------ + # Case 4: split node and query subnodes + else: + i1 = 2 * i_node + 1 + i2 = 2 * i_node + 2 + + if with_sample_weight: + N1 = _total_node_weight(node_data, sample_weight, + idx_array, i1) + N2 = _total_node_weight(node_data, sample_weight, + idx_array, i2) + else: + N1 = (self.node_data[i1].idx_end - self.node_data[i1].idx_start) + N2 = (self.node_data[i2].idx_end - self.node_data[i2].idx_start) + + min_max_dist{{name_suffix}}(self, i1, pt, &dist_LB, &dist_UB) + child1_log_min_bound = log(N1) + compute_log_kernel(dist_UB, h, + kernel) + child1_log_bound_spread = logsubexp(log(N1) + + compute_log_kernel(dist_LB, h, + kernel), + child1_log_min_bound) + + min_max_dist{{name_suffix}}(self, i2, pt, &dist_LB, &dist_UB) + child2_log_min_bound = log(N2) + compute_log_kernel(dist_UB, h, + kernel) + child2_log_bound_spread = logsubexp(log(N2) + + compute_log_kernel(dist_LB, h, + kernel), + child2_log_min_bound) + + global_log_min_bound[0] = logsubexp(global_log_min_bound[0], + local_log_min_bound) + global_log_min_bound[0] = logaddexp(global_log_min_bound[0], + child1_log_min_bound) + global_log_min_bound[0] = logaddexp(global_log_min_bound[0], + child2_log_min_bound) + + global_log_bound_spread[0] = logsubexp(global_log_bound_spread[0], + local_log_bound_spread) + global_log_bound_spread[0] = logaddexp(global_log_bound_spread[0], + child1_log_bound_spread) + global_log_bound_spread[0] = logaddexp(global_log_bound_spread[0], + child2_log_bound_spread) + + self._kde_single_depthfirst(i1, pt, kernel, h, log_knorm, + log_atol, log_rtol, + child1_log_min_bound, + child1_log_bound_spread, + global_log_min_bound, + global_log_bound_spread) + self._kde_single_depthfirst(i2, pt, kernel, h, log_knorm, + log_atol, log_rtol, + child2_log_min_bound, + child2_log_bound_spread, + global_log_min_bound, + global_log_bound_spread) + return 0 + + cdef int _two_point_single( + self, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, + float64_t* r, + intp_t* count, + intp_t i_min, + intp_t i_max, + ) except -1: + """recursive single-tree two-point correlation function query""" + cdef const {{INPUT_DTYPE_t}}* data = &self.data[0, 0] + cdef intp_t* idx_array = &self.idx_array[0] + cdef intp_t n_features = self.data.shape[1] + cdef NodeData_t node_info = self.node_data[i_node] + + cdef intp_t i, j, Npts + cdef float64_t reduced_r + + cdef float64_t dist_pt, dist_LB = 0, dist_UB = 0 + min_max_dist{{name_suffix}}(self, i_node, pt, &dist_LB, &dist_UB) + + # ------------------------------------------------------------ + # Go through bounds and check for cuts + while i_min < i_max: + if dist_LB > r[i_min]: + i_min += 1 + else: + break + + while i_max > i_min: + Npts = (node_info.idx_end - node_info.idx_start) + if dist_UB <= r[i_max - 1]: + count[i_max - 1] += Npts + i_max -= 1 + else: + break + + if i_min < i_max: + # If node is a leaf, go through all points + if node_info.is_leaf: + for i in range(node_info.idx_start, node_info.idx_end): + dist_pt = self.dist(pt, (data + n_features * idx_array[i]), + n_features) + j = i_max - 1 + while (j >= i_min) and (dist_pt <= r[j]): + count[j] += 1 + j -= 1 + + else: + self._two_point_single(2 * i_node + 1, pt, r, + count, i_min, i_max) + self._two_point_single(2 * i_node + 2, pt, r, + count, i_min, i_max) + return 0 + + cdef int _two_point_dual( + self, + intp_t i_node1, + BinaryTree{{name_suffix}} other, + intp_t 
i_node2, + float64_t* r, + intp_t* count, + intp_t i_min, + intp_t i_max, + ) except -1: + """recursive dual-tree two-point correlation function query""" + cdef const {{INPUT_DTYPE_t}}* data1 = &self.data[0, 0] + cdef const {{INPUT_DTYPE_t}}* data2 = &other.data[0, 0] + cdef intp_t* idx_array1 = &self.idx_array[0] + cdef intp_t* idx_array2 = &other.idx_array[0] + cdef NodeData_t node_info1 = self.node_data[i_node1] + cdef NodeData_t node_info2 = other.node_data[i_node2] + + cdef intp_t n_features = self.data.shape[1] + + cdef intp_t i1, i2, j, Npts + cdef float64_t reduced_r + + cdef float64_t dist_pt, dist_LB = 0, dist_UB = 0 + dist_LB = min_dist_dual{{name_suffix}}(self, i_node1, other, i_node2) + dist_UB = max_dist_dual{{name_suffix}}(self, i_node1, other, i_node2) + + # ------------------------------------------------------------ + # Go through bounds and check for cuts + while i_min < i_max: + if dist_LB > r[i_min]: + i_min += 1 + else: + break + + while i_max > i_min: + Npts = ((node_info1.idx_end - node_info1.idx_start) + * (node_info2.idx_end - node_info2.idx_start)) + if dist_UB <= r[i_max - 1]: + count[i_max - 1] += Npts + i_max -= 1 + else: + break + + if i_min < i_max: + if node_info1.is_leaf and node_info2.is_leaf: + # If both nodes are leaves, go through all points + for i1 in range(node_info1.idx_start, node_info1.idx_end): + for i2 in range(node_info2.idx_start, node_info2.idx_end): + dist_pt = self.dist((data1 + n_features + * idx_array1[i1]), + (data2 + n_features + * idx_array2[i2]), + n_features) + j = i_max - 1 + while (j >= i_min) and (dist_pt <= r[j]): + count[j] += 1 + j -= 1 + + elif node_info1.is_leaf: + # If only one is a leaf, split the other + for i2 in range(2 * i_node2 + 1, 2 * i_node2 + 3): + self._two_point_dual(i_node1, other, i2, + r, count, i_min, i_max) + + elif node_info2.is_leaf: + for i1 in range(2 * i_node1 + 1, 2 * i_node1 + 3): + self._two_point_dual(i1, other, i_node2, + r, count, i_min, i_max) + + else: + # neither is a leaf: split & query both + for i1 in range(2 * i_node1 + 1, 2 * i_node1 + 3): + for i2 in range(2 * i_node2 + 1, 2 * i_node2 + 3): + self._two_point_dual(i1, other, i2, + r, count, i_min, i_max) + return 0 + +{{endfor}} + +###################################################################### +# Python functions for benchmarking and testing C implementations + +def simultaneous_sort(float64_t[:, ::1] distances, intp_t[:, ::1] indices): + """In-place simultaneous sort the given row of the arrays + + This python wrapper exists primarily to enable unit testing + of the _simultaneous_sort C routine. 
+ """ + assert distances.shape[0] == indices.shape[0] + assert distances.shape[1] == indices.shape[1] + cdef intp_t row + for row in range(distances.shape[0]): + _simultaneous_sort(&distances[row, 0], + &indices[row, 0], + distances.shape[1]) + + +def nodeheap_sort(float64_t[::1] vals): + """In-place reverse sort of vals using NodeHeap""" + cdef intp_t[::1] indices = np.zeros(vals.shape[0], dtype=np.intp) + cdef float64_t[::1] vals_sorted = np.zeros_like(vals) + + # use initial size 0 to check corner case + cdef NodeHeap heap = NodeHeap(0) + cdef NodeHeapData_t data + cdef intp_t i + for i in range(vals.shape[0]): + data.val = vals[i] + data.i1 = i + data.i2 = i + 1 + heap.push(data) + + for i in range(vals.shape[0]): + data = heap.pop() + vals_sorted[i] = data.val + indices[i] = data.i1 + + return np.asarray(vals_sorted), np.asarray(indices) + + +cdef inline float64_t _total_node_weight( + const NodeData_t* node_data, + const floating* sample_weight, + const intp_t* idx_array, + intp_t i_node, +): + cdef intp_t i + cdef float64_t N = 0.0 + for i in range(node_data[i_node].idx_start, node_data[i_node].idx_end): + N += sample_weight[idx_array[i]] + return N diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/_classification.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..c70b83cb1d3bdbcab4f241bf19416d410cbaf9e4 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_classification.py @@ -0,0 +1,919 @@ +"""Nearest Neighbor Classification""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from numbers import Integral + +import numpy as np + +from sklearn.neighbors._base import _check_precomputed + +from ..base import ClassifierMixin, _fit_context +from ..metrics._pairwise_distances_reduction import ( + ArgKminClassMode, + RadiusNeighborsClassMode, +) +from ..utils._param_validation import StrOptions +from ..utils.arrayfuncs import _all_with_any_reduction_axis_1 +from ..utils.extmath import weighted_mode +from ..utils.fixes import _mode +from ..utils.validation import ( + _is_arraylike, + _num_samples, + check_is_fitted, + validate_data, +) +from ._base import KNeighborsMixin, NeighborsBase, RadiusNeighborsMixin, _get_weights + + +def _adjusted_metric(metric, metric_kwargs, p=None): + metric_kwargs = metric_kwargs or {} + if metric == "minkowski": + metric_kwargs["p"] = p + if p == 2: + metric = "euclidean" + return metric, metric_kwargs + + +class KNeighborsClassifier(KNeighborsMixin, ClassifierMixin, NeighborsBase): + """Classifier implementing the k-nearest neighbors vote. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_neighbors : int, default=5 + Number of neighbors to use by default for :meth:`kneighbors` queries. + + weights : {'uniform', 'distance'}, callable or None, default='uniform' + Weight function used in prediction. Possible values: + + - 'uniform' : uniform weights. All points in each neighborhood + are weighted equally. + - 'distance' : weight points by the inverse of their distance. + in this case, closer neighbors of a query point will have a + greater influence than neighbors which are further away. + - [callable] : a user-defined function which accepts an + array of distances, and returns an array of the same shape + containing the weights. 
+ + Refer to the example entitled + :ref:`sphx_glr_auto_examples_neighbors_plot_classification.py` + showing the impact of the `weights` parameter on the decision + boundary. + + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' + Algorithm used to compute the nearest neighbors: + + - 'ball_tree' will use :class:`BallTree` + - 'kd_tree' will use :class:`KDTree` + - 'brute' will use a brute-force search. + - 'auto' will attempt to decide the most appropriate algorithm + based on the values passed to :meth:`fit` method. + + Note: fitting on sparse input will override the setting of + this parameter, using brute force. + + leaf_size : int, default=30 + Leaf size passed to BallTree or KDTree. This can affect the + speed of the construction and query, as well as the memory + required to store the tree. The optimal value depends on the + nature of the problem. + + p : float, default=2 + Power parameter for the Minkowski metric. When p = 1, this is equivalent + to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. + For arbitrary p, minkowski_distance (l_p) is used. This parameter is expected + to be positive. + + metric : str or callable, default='minkowski' + Metric to use for distance computation. Default is "minkowski", which + results in the standard Euclidean distance when p = 2. See the + documentation of `scipy.spatial.distance + `_ and + the metrics listed in + :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric + values. + + If metric is "precomputed", X is assumed to be a distance matrix and + must be square during fit. X may be a :term:`sparse graph`, in which + case only "nonzero" elements may be considered neighbors. + + If metric is a callable function, it takes two arrays representing 1D + vectors as inputs and must return one value indicating the distance + between those vectors. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + n_jobs : int, default=None + The number of parallel jobs to run for neighbors search. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + Doesn't affect :meth:`fit` method. + + Attributes + ---------- + classes_ : array of shape (n_classes,) + Class labels known to the classifier + + effective_metric_ : str or callble + The distance metric used. It will be same as the `metric` parameter + or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to + 'minkowski' and `p` parameter set to 2. + + effective_metric_params_ : dict + Additional keyword arguments for the metric function. For most metrics + will be same with `metric_params` parameter, but may also contain the + `p` parameter value if the `effective_metric_` attribute is set to + 'minkowski'. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_samples_fit_ : int + Number of samples in the fitted data. + + outputs_2d_ : bool + False when `y`'s shape is (n_samples, ) or (n_samples, 1) during fit + otherwise True. + + See Also + -------- + RadiusNeighborsClassifier: Classifier based on neighbors within a fixed radius. 
+ KNeighborsRegressor: Regression based on k-nearest neighbors. + RadiusNeighborsRegressor: Regression based on neighbors within a fixed radius. + NearestNeighbors: Unsupervised learner for implementing neighbor searches. + + Notes + ----- + See :ref:`Nearest Neighbors ` in the online documentation + for a discussion of the choice of ``algorithm`` and ``leaf_size``. + + .. warning:: + + Regarding the Nearest Neighbors algorithms, if it is found that two + neighbors, neighbor `k+1` and `k`, have identical distances + but different labels, the results will depend on the ordering of the + training data. + + https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm + + Examples + -------- + >>> X = [[0], [1], [2], [3]] + >>> y = [0, 0, 1, 1] + >>> from sklearn.neighbors import KNeighborsClassifier + >>> neigh = KNeighborsClassifier(n_neighbors=3) + >>> neigh.fit(X, y) + KNeighborsClassifier(...) + >>> print(neigh.predict([[1.1]])) + [0] + >>> print(neigh.predict_proba([[0.9]])) + [[0.666 0.333]] + """ + + _parameter_constraints: dict = {**NeighborsBase._parameter_constraints} + _parameter_constraints.pop("radius") + _parameter_constraints.update( + {"weights": [StrOptions({"uniform", "distance"}), callable, None]} + ) + + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + ): + super().__init__( + n_neighbors=n_neighbors, + algorithm=algorithm, + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ) + self.weights = weights + + @_fit_context( + # KNeighborsClassifier.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y): + """Fit the k-nearest neighbors classifier from the training dataset. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) if metric='precomputed' + Training data. + + y : {array-like, sparse matrix} of shape (n_samples,) or \ + (n_samples, n_outputs) + Target values. + + Returns + ------- + self : KNeighborsClassifier + The fitted k-nearest neighbors classifier. + """ + return self._fit(X, y) + + def predict(self, X): + """Predict the class labels for the provided data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed', or None + Test samples. If `None`, predictions for all indexed points are + returned; in this case, points are not considered their own + neighbors. + + Returns + ------- + y : ndarray of shape (n_queries,) or (n_queries, n_outputs) + Class labels for each data sample. + """ + check_is_fitted(self, "_fit_method") + if self.weights == "uniform": + if self._fit_method == "brute" and ArgKminClassMode.is_usable_for( + X, self._fit_X, self.metric + ): + probabilities = self.predict_proba(X) + if self.outputs_2d_: + return np.stack( + [ + self.classes_[idx][np.argmax(probas, axis=1)] + for idx, probas in enumerate(probabilities) + ], + axis=1, + ) + return self.classes_[np.argmax(probabilities, axis=1)] + # In that case, we do not need the distances to perform + # the weighting so we do not compute them. 
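        # With uniform weights the prediction is simply the majority label
        # among the k nearest neighbors, so only the neighbor indices are
        # needed.  A rough pure-NumPy equivalent of the generic path below
        # (sketch for the single-output case on hypothetical toy data; `_y`
        # holds the label-encoded targets):
        #
        #     import numpy as np
        #     from sklearn.neighbors import KNeighborsClassifier
        #
        #     X_train = [[0], [1], [2], [3]]
        #     y_train = [0, 0, 1, 1]
        #     clf = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
        #
        #     neigh_ind = clf.kneighbors([[1.1], [2.6]], return_distance=False)
        #     votes = clf._y[neigh_ind]                 # neighbor labels, encoded
        #     counts = np.apply_along_axis(np.bincount, 1, votes,
        #                                  minlength=len(clf.classes_))
        #     y_pred = clf.classes_[np.argmax(counts, axis=1)]
        #     # -> array([0, 1]), matching clf.predict([[1.1], [2.6]])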
+ neigh_ind = self.kneighbors(X, return_distance=False) + neigh_dist = None + else: + neigh_dist, neigh_ind = self.kneighbors(X) + + classes_ = self.classes_ + _y = self._y + if not self.outputs_2d_: + _y = self._y.reshape((-1, 1)) + classes_ = [self.classes_] + + n_outputs = len(classes_) + n_queries = _num_samples(self._fit_X if X is None else X) + weights = _get_weights(neigh_dist, self.weights) + if weights is not None and _all_with_any_reduction_axis_1(weights, value=0): + raise ValueError( + "All neighbors of some sample is getting zero weights. " + "Please modify 'weights' to avoid this case if you are " + "using a user-defined function." + ) + + y_pred = np.empty((n_queries, n_outputs), dtype=classes_[0].dtype) + for k, classes_k in enumerate(classes_): + if weights is None: + mode, _ = _mode(_y[neigh_ind, k], axis=1) + else: + mode, _ = weighted_mode(_y[neigh_ind, k], weights, axis=1) + + mode = np.asarray(mode.ravel(), dtype=np.intp) + y_pred[:, k] = classes_k.take(mode) + + if not self.outputs_2d_: + y_pred = y_pred.ravel() + + return y_pred + + def predict_proba(self, X): + """Return probability estimates for the test data X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed', or None + Test samples. If `None`, predictions for all indexed points are + returned; in this case, points are not considered their own + neighbors. + + Returns + ------- + p : ndarray of shape (n_queries, n_classes), or a list of n_outputs \ + of such arrays if n_outputs > 1. + The class probabilities of the input samples. Classes are ordered + by lexicographic order. + """ + check_is_fitted(self, "_fit_method") + if self.weights == "uniform": + # TODO: systematize this mapping of metric for + # PairwiseDistancesReductions. + metric, metric_kwargs = _adjusted_metric( + metric=self.metric, metric_kwargs=self.metric_params, p=self.p + ) + if ( + self._fit_method == "brute" + and ArgKminClassMode.is_usable_for(X, self._fit_X, metric) + # TODO: Implement efficient multi-output solution + and not self.outputs_2d_ + ): + if self.metric == "precomputed": + X = _check_precomputed(X) + else: + X = validate_data( + self, X, accept_sparse="csr", reset=False, order="C" + ) + + probabilities = ArgKminClassMode.compute( + X, + self._fit_X, + k=self.n_neighbors, + weights=self.weights, + Y_labels=self._y, + unique_Y_labels=self.classes_, + metric=metric, + metric_kwargs=metric_kwargs, + # `strategy="parallel_on_X"` has in practice be shown + # to be more efficient than `strategy="parallel_on_Y`` + # on many combination of datasets. + # Hence, we choose to enforce it here. + # For more information, see: + # https://github.com/scikit-learn/scikit-learn/pull/24076#issuecomment-1445258342 + # TODO: adapt the heuristic for `strategy="auto"` for + # `ArgKminClassMode` and use `strategy="auto"`. + strategy="parallel_on_X", + ) + return probabilities + + # In that case, we do not need the distances to perform + # the weighting so we do not compute them. 
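        # The generic path below converts the neighbors' labels into per-class
        # votes and normalizes them into probabilities.  A sketch of the same
        # computation for a single output with inverse-distance weighting
        # (hypothetical toy data chosen so that no query coincides with a
        # training point, avoiding division by zero):
        #
        #     import numpy as np
        #     from sklearn.neighbors import KNeighborsClassifier
        #
        #     X_train = [[0.0], [1.0], [2.0], [3.0]]
        #     y_train = [0, 0, 1, 1]
        #     clf = KNeighborsClassifier(n_neighbors=3, weights="distance")
        #     clf.fit(X_train, y_train)
        #
        #     dist, ind = clf.kneighbors([[1.2], [2.4]])
        #     w = 1.0 / dist                            # inverse-distance weights
        #     proba = np.zeros((len(ind), len(clf.classes_)))
        #     for i in range(ind.shape[1]):             # accumulate the votes
        #         proba[np.arange(len(ind)), clf._y[ind[:, i]]] += w[:, i]
        #     proba /= proba.sum(axis=1, keepdims=True) # normalize to [0, 1]
        #     # proba agrees with clf.predict_proba([[1.2], [2.4]])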
+ neigh_ind = self.kneighbors(X, return_distance=False) + neigh_dist = None + else: + neigh_dist, neigh_ind = self.kneighbors(X) + + classes_ = self.classes_ + _y = self._y + if not self.outputs_2d_: + _y = self._y.reshape((-1, 1)) + classes_ = [self.classes_] + + n_queries = _num_samples(self._fit_X if X is None else X) + + weights = _get_weights(neigh_dist, self.weights) + if weights is None: + weights = np.ones_like(neigh_ind) + elif _all_with_any_reduction_axis_1(weights, value=0): + raise ValueError( + "All neighbors of some sample is getting zero weights. " + "Please modify 'weights' to avoid this case if you are " + "using a user-defined function." + ) + + all_rows = np.arange(n_queries) + probabilities = [] + for k, classes_k in enumerate(classes_): + pred_labels = _y[:, k][neigh_ind] + proba_k = np.zeros((n_queries, classes_k.size)) + + # a simple ':' index doesn't work right + for i, idx in enumerate(pred_labels.T): # loop is O(n_neighbors) + proba_k[all_rows, idx] += weights[:, i] + + # normalize 'votes' into real [0,1] probabilities + normalizer = proba_k.sum(axis=1)[:, np.newaxis] + proba_k /= normalizer + + probabilities.append(proba_k) + + if not self.outputs_2d_: + probabilities = probabilities[0] + + return probabilities + + # This function is defined here only to modify the parent docstring + # and add information about X=None + def score(self, X, y, sample_weight=None): + """ + Return the mean accuracy on the given test data and labels. + + In multi-label classification, this is the subset accuracy + which is a harsh metric since you require for each sample that + each label set be correctly predicted. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features), or None + Test samples. If `None`, predictions for all indexed points are + used; in this case, points are not considered their own + neighbors. This means that `knn.fit(X, y).score(None, y)` + implicitly performs a leave-one-out cross-validation procedure + and is equivalent to `cross_val_score(knn, X, y, cv=LeaveOneOut())` + but typically much faster. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) + True labels for `X`. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + score : float + Mean accuracy of ``self.predict(X)`` w.r.t. `y`. + """ + return super().score(X, y, sample_weight) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.classifier_tags.multi_label = True + tags.input_tags.pairwise = self.metric == "precomputed" + return tags + + +class RadiusNeighborsClassifier(RadiusNeighborsMixin, ClassifierMixin, NeighborsBase): + """Classifier implementing a vote among neighbors within a given radius. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + radius : float, default=1.0 + Range of parameter space to use by default for :meth:`radius_neighbors` + queries. + + weights : {'uniform', 'distance'}, callable or None, default='uniform' + Weight function used in prediction. Possible values: + + - 'uniform' : uniform weights. All points in each neighborhood + are weighted equally. + - 'distance' : weight points by the inverse of their distance. + in this case, closer neighbors of a query point will have a + greater influence than neighbors which are further away. + - [callable] : a user-defined function which accepts an + array of distances, and returns an array of the same shape + containing the weights. + + Uniform weights are used by default. 
+ + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' + Algorithm used to compute the nearest neighbors: + + - 'ball_tree' will use :class:`BallTree` + - 'kd_tree' will use :class:`KDTree` + - 'brute' will use a brute-force search. + - 'auto' will attempt to decide the most appropriate algorithm + based on the values passed to :meth:`fit` method. + + Note: fitting on sparse input will override the setting of + this parameter, using brute force. + + leaf_size : int, default=30 + Leaf size passed to BallTree or KDTree. This can affect the + speed of the construction and query, as well as the memory + required to store the tree. The optimal value depends on the + nature of the problem. + + p : float, default=2 + Power parameter for the Minkowski metric. When p = 1, this is + equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + This parameter is expected to be positive. + + metric : str or callable, default='minkowski' + Metric to use for distance computation. Default is "minkowski", which + results in the standard Euclidean distance when p = 2. See the + documentation of `scipy.spatial.distance + `_ and + the metrics listed in + :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric + values. + + If metric is "precomputed", X is assumed to be a distance matrix and + must be square during fit. X may be a :term:`sparse graph`, in which + case only "nonzero" elements may be considered neighbors. + + If metric is a callable function, it takes two arrays representing 1D + vectors as inputs and must return one value indicating the distance + between those vectors. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. + + outlier_label : {manual label, 'most_frequent'}, default=None + Label for outlier samples (samples with no neighbors in given radius). + + - manual label: str or int label (should be the same type as y) + or list of manual labels if multi-output is used. + - 'most_frequent' : assign the most frequent label of y to outliers. + - None : when any outlier is detected, ValueError will be raised. + + The outlier label should be selected from among the unique 'Y' labels. + If it is specified with a different value a warning will be raised and + all class probabilities of outliers will be assigned to be 0. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + n_jobs : int, default=None + The number of parallel jobs to run for neighbors search. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Attributes + ---------- + classes_ : ndarray of shape (n_classes,) + Class labels known to the classifier. + + effective_metric_ : str or callable + The distance metric used. It will be same as the `metric` parameter + or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to + 'minkowski' and `p` parameter set to 2. + + effective_metric_params_ : dict + Additional keyword arguments for the metric function. For most metrics + will be same with `metric_params` parameter, but may also contain the + `p` parameter value if the `effective_metric_` attribute is set to + 'minkowski'. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. 
Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_samples_fit_ : int + Number of samples in the fitted data. + + outlier_label_ : int or array-like of shape (n_class,) + Label which is given for outlier samples (samples with no neighbors + on given radius). + + outputs_2d_ : bool + False when `y`'s shape is (n_samples, ) or (n_samples, 1) during fit + otherwise True. + + See Also + -------- + KNeighborsClassifier : Classifier implementing the k-nearest neighbors + vote. + RadiusNeighborsRegressor : Regression based on neighbors within a + fixed radius. + KNeighborsRegressor : Regression based on k-nearest neighbors. + NearestNeighbors : Unsupervised learner for implementing neighbor + searches. + + Notes + ----- + See :ref:`Nearest Neighbors ` in the online documentation + for a discussion of the choice of ``algorithm`` and ``leaf_size``. + + https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm + + Examples + -------- + >>> X = [[0], [1], [2], [3]] + >>> y = [0, 0, 1, 1] + >>> from sklearn.neighbors import RadiusNeighborsClassifier + >>> neigh = RadiusNeighborsClassifier(radius=1.0) + >>> neigh.fit(X, y) + RadiusNeighborsClassifier(...) + >>> print(neigh.predict([[1.5]])) + [0] + >>> print(neigh.predict_proba([[1.0]])) + [[0.66666667 0.33333333]] + """ + + _parameter_constraints: dict = { + **NeighborsBase._parameter_constraints, + "weights": [StrOptions({"uniform", "distance"}), callable, None], + "outlier_label": [Integral, str, "array-like", None], + } + _parameter_constraints.pop("n_neighbors") + + def __init__( + self, + radius=1.0, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + outlier_label=None, + metric_params=None, + n_jobs=None, + ): + super().__init__( + radius=radius, + algorithm=algorithm, + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ) + self.weights = weights + self.outlier_label = outlier_label + + @_fit_context( + # RadiusNeighborsClassifier.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y): + """Fit the radius neighbors classifier from the training dataset. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) if metric='precomputed' + Training data. + + y : {array-like, sparse matrix} of shape (n_samples,) or \ + (n_samples, n_outputs) + Target values. + + Returns + ------- + self : RadiusNeighborsClassifier + The fitted radius neighbors classifier. + """ + self._fit(X, y) + + classes_ = self.classes_ + _y = self._y + if not self.outputs_2d_: + _y = self._y.reshape((-1, 1)) + classes_ = [self.classes_] + + if self.outlier_label is None: + outlier_label_ = None + + elif self.outlier_label == "most_frequent": + outlier_label_ = [] + # iterate over multi-output, get the most frequent label for each + # output. 
+ for k, classes_k in enumerate(classes_): + label_count = np.bincount(_y[:, k]) + outlier_label_.append(classes_k[label_count.argmax()]) + + else: + if _is_arraylike(self.outlier_label) and not isinstance( + self.outlier_label, str + ): + if len(self.outlier_label) != len(classes_): + raise ValueError( + "The length of outlier_label: {} is " + "inconsistent with the output " + "length: {}".format(self.outlier_label, len(classes_)) + ) + outlier_label_ = self.outlier_label + else: + outlier_label_ = [self.outlier_label] * len(classes_) + + for classes, label in zip(classes_, outlier_label_): + if _is_arraylike(label) and not isinstance(label, str): + # ensure the outlier label for each output is a scalar. + raise TypeError( + "The outlier_label of classes {} is " + "supposed to be a scalar, got " + "{}.".format(classes, label) + ) + if np.append(classes, label).dtype != classes.dtype: + # ensure the dtype of outlier label is consistent with y. + raise TypeError( + "The dtype of outlier_label {} is " + "inconsistent with classes {} in " + "y.".format(label, classes) + ) + + self.outlier_label_ = outlier_label_ + + return self + + def predict(self, X): + """Predict the class labels for the provided data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed', or None + Test samples. If `None`, predictions for all indexed points are + returned; in this case, points are not considered their own + neighbors. + + Returns + ------- + y : ndarray of shape (n_queries,) or (n_queries, n_outputs) + Class labels for each data sample. + """ + + probs = self.predict_proba(X) + classes_ = self.classes_ + + if not self.outputs_2d_: + probs = [probs] + classes_ = [self.classes_] + + n_outputs = len(classes_) + n_queries = probs[0].shape[0] + y_pred = np.empty((n_queries, n_outputs), dtype=classes_[0].dtype) + + for k, prob in enumerate(probs): + # iterate over multi-output, assign labels based on probabilities + # of each output. + max_prob_index = prob.argmax(axis=1) + y_pred[:, k] = classes_[k].take(max_prob_index) + + outlier_zero_probs = (prob == 0).all(axis=1) + if outlier_zero_probs.any(): + zero_prob_index = np.flatnonzero(outlier_zero_probs) + y_pred[zero_prob_index, k] = self.outlier_label_[k] + + if not self.outputs_2d_: + y_pred = y_pred.ravel() + + return y_pred + + def predict_proba(self, X): + """Return probability estimates for the test data X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed', or None + Test samples. If `None`, predictions for all indexed points are + returned; in this case, points are not considered their own + neighbors. + + Returns + ------- + p : ndarray of shape (n_queries, n_classes), or a list of \ + n_outputs of such arrays if n_outputs > 1. + The class probabilities of the input samples. Classes are ordered + by lexicographic order. 
+ """ + check_is_fitted(self, "_fit_method") + n_queries = _num_samples(self._fit_X if X is None else X) + + metric, metric_kwargs = _adjusted_metric( + metric=self.metric, metric_kwargs=self.metric_params, p=self.p + ) + + if ( + self.weights == "uniform" + and self._fit_method == "brute" + and not self.outputs_2d_ + and RadiusNeighborsClassMode.is_usable_for(X, self._fit_X, metric) + ): + probabilities = RadiusNeighborsClassMode.compute( + X=X, + Y=self._fit_X, + radius=self.radius, + weights=self.weights, + Y_labels=self._y, + unique_Y_labels=self.classes_, + outlier_label=self.outlier_label, + metric=metric, + metric_kwargs=metric_kwargs, + strategy="parallel_on_X", + # `strategy="parallel_on_X"` has in practice be shown + # to be more efficient than `strategy="parallel_on_Y`` + # on many combination of datasets. + # Hence, we choose to enforce it here. + # For more information, see: + # https://github.com/scikit-learn/scikit-learn/pull/26828/files#r1282398471 + ) + return probabilities + + neigh_dist, neigh_ind = self.radius_neighbors(X) + outlier_mask = np.zeros(n_queries, dtype=bool) + outlier_mask[:] = [len(nind) == 0 for nind in neigh_ind] + outliers = np.flatnonzero(outlier_mask) + inliers = np.flatnonzero(~outlier_mask) + + classes_ = self.classes_ + _y = self._y + if not self.outputs_2d_: + _y = self._y.reshape((-1, 1)) + classes_ = [self.classes_] + + if self.outlier_label_ is None and outliers.size > 0: + raise ValueError( + "No neighbors found for test samples %r, " + "you can try using larger radius, " + "giving a label for outliers, " + "or considering removing them from your dataset." % outliers + ) + + weights = _get_weights(neigh_dist, self.weights) + if weights is not None: + weights = weights[inliers] + + probabilities = [] + # iterate over multi-output, measure probabilities of the k-th output. + for k, classes_k in enumerate(classes_): + pred_labels = np.zeros(len(neigh_ind), dtype=object) + pred_labels[:] = [_y[ind, k] for ind in neigh_ind] + + proba_k = np.zeros((n_queries, classes_k.size)) + proba_inl = np.zeros((len(inliers), classes_k.size)) + + # samples have different size of neighbors within the same radius + if weights is None: + for i, idx in enumerate(pred_labels[inliers]): + proba_inl[i, :] = np.bincount(idx, minlength=classes_k.size) + else: + for i, idx in enumerate(pred_labels[inliers]): + proba_inl[i, :] = np.bincount( + idx, weights[i], minlength=classes_k.size + ) + proba_k[inliers, :] = proba_inl + + if outliers.size > 0: + _outlier_label = self.outlier_label_[k] + label_index = np.flatnonzero(classes_k == _outlier_label) + if label_index.size == 1: + proba_k[outliers, label_index[0]] = 1.0 + else: + warnings.warn( + "Outlier label {} is not in training " + "classes. All class probabilities of " + "outliers will be assigned with 0." + "".format(self.outlier_label_[k]) + ) + + # normalize 'votes' into real [0,1] probabilities + normalizer = proba_k.sum(axis=1)[:, np.newaxis] + normalizer[normalizer == 0.0] = 1.0 + proba_k /= normalizer + + probabilities.append(proba_k) + + if not self.outputs_2d_: + probabilities = probabilities[0] + + return probabilities + + # This function is defined here only to modify the parent docstring + # and add information about X=None + def score(self, X, y, sample_weight=None): + """ + Return the mean accuracy on the given test data and labels. + + In multi-label classification, this is the subset accuracy + which is a harsh metric since you require for each sample that + each label set be correctly predicted. 
+ + Parameters + ---------- + X : array-like of shape (n_samples, n_features), or None + Test samples. If `None`, predictions for all indexed points are + used; in this case, points are not considered their own + neighbors. This means that `knn.fit(X, y).score(None, y)` + implicitly performs a leave-one-out cross-validation procedure + and is equivalent to `cross_val_score(knn, X, y, cv=LeaveOneOut())` + but typically much faster. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) + True labels for `X`. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + score : float + Mean accuracy of ``self.predict(X)`` w.r.t. `y`. + """ + return super().score(X, y, sample_weight) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.classifier_tags.multi_label = True + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/_graph.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_graph.py new file mode 100644 index 0000000000000000000000000000000000000000..3562fab1fcf01b5487d210a11d83d203bffd7835 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_graph.py @@ -0,0 +1,704 @@ +"""Nearest Neighbors graph functions""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import itertools + +from ..base import ClassNamePrefixFeaturesOutMixin, TransformerMixin, _fit_context +from ..utils._param_validation import ( + Integral, + Interval, + Real, + StrOptions, + validate_params, +) +from ..utils.validation import check_is_fitted +from ._base import VALID_METRICS, KNeighborsMixin, NeighborsBase, RadiusNeighborsMixin +from ._unsupervised import NearestNeighbors + + +def _check_params(X, metric, p, metric_params): + """Check the validity of the input parameters""" + params = zip(["metric", "p", "metric_params"], [metric, p, metric_params]) + est_params = X.get_params() + for param_name, func_param in params: + if func_param != est_params[param_name]: + raise ValueError( + "Got %s for %s, while the estimator has %s for the same parameter." + % (func_param, param_name, est_params[param_name]) + ) + + +def _query_include_self(X, include_self, mode): + """Return the query based on include_self param""" + if include_self == "auto": + include_self = mode == "connectivity" + + # it does not include each sample as its own neighbors + if not include_self: + X = None + + return X + + +@validate_params( + { + "X": ["array-like", "sparse matrix", KNeighborsMixin], + "n_neighbors": [Interval(Integral, 1, None, closed="left")], + "mode": [StrOptions({"connectivity", "distance"})], + "metric": [StrOptions(set(itertools.chain(*VALID_METRICS.values()))), callable], + "p": [Interval(Real, 0, None, closed="right"), None], + "metric_params": [dict, None], + "include_self": ["boolean", StrOptions({"auto"})], + "n_jobs": [Integral, None], + }, + prefer_skip_nested_validation=False, # metric is not validated yet +) +def kneighbors_graph( + X, + n_neighbors, + *, + mode="connectivity", + metric="minkowski", + p=2, + metric_params=None, + include_self=False, + n_jobs=None, +): + """Compute the (weighted) graph of k-Neighbors for points in X. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Sample data. + + n_neighbors : int + Number of neighbors for each sample. 
+ + mode : {'connectivity', 'distance'}, default='connectivity' + Type of returned matrix: 'connectivity' will return the connectivity + matrix with ones and zeros, and 'distance' will return the distances + between neighbors according to the given metric. + + metric : str, default='minkowski' + Metric to use for distance computation. Default is "minkowski", which + results in the standard Euclidean distance when p = 2. See the + documentation of `scipy.spatial.distance + `_ and + the metrics listed in + :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric + values. + + p : float, default=2 + Power parameter for the Minkowski metric. When p = 1, this is equivalent + to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. + For arbitrary p, minkowski_distance (l_p) is used. This parameter is expected + to be positive. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + include_self : bool or 'auto', default=False + Whether or not to mark each sample as the first nearest neighbor to + itself. If 'auto', then True is used for mode='connectivity' and False + for mode='distance'. + + n_jobs : int, default=None + The number of parallel jobs to run for neighbors search. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Returns + ------- + A : sparse matrix of shape (n_samples, n_samples) + Graph where A[i, j] is assigned the weight of edge that + connects i to j. The matrix is of CSR format. + + See Also + -------- + radius_neighbors_graph: Compute the (weighted) graph of Neighbors for points in X. + + Examples + -------- + >>> X = [[0], [3], [1]] + >>> from sklearn.neighbors import kneighbors_graph + >>> A = kneighbors_graph(X, 2, mode='connectivity', include_self=True) + >>> A.toarray() + array([[1., 0., 1.], + [0., 1., 1.], + [1., 0., 1.]]) + """ + if not isinstance(X, KNeighborsMixin): + X = NearestNeighbors( + n_neighbors=n_neighbors, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ).fit(X) + else: + _check_params(X, metric, p, metric_params) + + query = _query_include_self(X._fit_X, include_self, mode) + return X.kneighbors_graph(X=query, n_neighbors=n_neighbors, mode=mode) + + +@validate_params( + { + "X": ["array-like", "sparse matrix", RadiusNeighborsMixin], + "radius": [Interval(Real, 0, None, closed="both")], + "mode": [StrOptions({"connectivity", "distance"})], + "metric": [StrOptions(set(itertools.chain(*VALID_METRICS.values()))), callable], + "p": [Interval(Real, 0, None, closed="right"), None], + "metric_params": [dict, None], + "include_self": ["boolean", StrOptions({"auto"})], + "n_jobs": [Integral, None], + }, + prefer_skip_nested_validation=False, # metric is not validated yet +) +def radius_neighbors_graph( + X, + radius, + *, + mode="connectivity", + metric="minkowski", + p=2, + metric_params=None, + include_self=False, + n_jobs=None, +): + """Compute the (weighted) graph of Neighbors for points in X. + + Neighborhoods are restricted the points at a distance lower than + radius. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Sample data. + + radius : float + Radius of neighborhoods. 
+ + mode : {'connectivity', 'distance'}, default='connectivity' + Type of returned matrix: 'connectivity' will return the connectivity + matrix with ones and zeros, and 'distance' will return the distances + between neighbors according to the given metric. + + metric : str, default='minkowski' + Metric to use for distance computation. Default is "minkowski", which + results in the standard Euclidean distance when p = 2. See the + documentation of `scipy.spatial.distance + `_ and + the metrics listed in + :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric + values. + + p : float, default=2 + Power parameter for the Minkowski metric. When p = 1, this is + equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + include_self : bool or 'auto', default=False + Whether or not to mark each sample as the first nearest neighbor to + itself. If 'auto', then True is used for mode='connectivity' and False + for mode='distance'. + + n_jobs : int, default=None + The number of parallel jobs to run for neighbors search. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Returns + ------- + A : sparse matrix of shape (n_samples, n_samples) + Graph where A[i, j] is assigned the weight of edge that connects + i to j. The matrix is of CSR format. + + See Also + -------- + kneighbors_graph: Compute the weighted graph of k-neighbors for points in X. + + Examples + -------- + >>> X = [[0], [3], [1]] + >>> from sklearn.neighbors import radius_neighbors_graph + >>> A = radius_neighbors_graph(X, 1.5, mode='connectivity', + ... include_self=True) + >>> A.toarray() + array([[1., 0., 1.], + [0., 1., 0.], + [1., 0., 1.]]) + """ + if not isinstance(X, RadiusNeighborsMixin): + X = NearestNeighbors( + radius=radius, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ).fit(X) + else: + _check_params(X, metric, p, metric_params) + + query = _query_include_self(X._fit_X, include_self, mode) + return X.radius_neighbors_graph(query, radius, mode) + + +class KNeighborsTransformer( + ClassNamePrefixFeaturesOutMixin, KNeighborsMixin, TransformerMixin, NeighborsBase +): + """Transform X into a (weighted) graph of k nearest neighbors. + + The transformed data is a sparse graph as returned by kneighbors_graph. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.22 + + Parameters + ---------- + mode : {'distance', 'connectivity'}, default='distance' + Type of returned matrix: 'connectivity' will return the connectivity + matrix with ones and zeros, and 'distance' will return the distances + between neighbors according to the given metric. + + n_neighbors : int, default=5 + Number of neighbors for each sample in the transformed sparse graph. + For compatibility reasons, as each sample is considered as its own + neighbor, one extra neighbor will be computed when mode == 'distance'. + In this case, the sparse graph contains (n_neighbors + 1) neighbors. + + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' + Algorithm used to compute the nearest neighbors: + + - 'ball_tree' will use :class:`BallTree` + - 'kd_tree' will use :class:`KDTree` + - 'brute' will use a brute-force search. 
+ - 'auto' will attempt to decide the most appropriate algorithm + based on the values passed to :meth:`fit` method. + + Note: fitting on sparse input will override the setting of + this parameter, using brute force. + + leaf_size : int, default=30 + Leaf size passed to BallTree or KDTree. This can affect the + speed of the construction and query, as well as the memory + required to store the tree. The optimal value depends on the + nature of the problem. + + metric : str or callable, default='minkowski' + Metric to use for distance computation. Default is "minkowski", which + results in the standard Euclidean distance when p = 2. See the + documentation of `scipy.spatial.distance + `_ and + the metrics listed in + :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric + values. + + If metric is a callable function, it takes two arrays representing 1D + vectors as inputs and must return one value indicating the distance + between those vectors. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. + + Distance matrices are not supported. + + p : float, default=2 + Parameter for the Minkowski metric from + sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is + equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + This parameter is expected to be positive. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + n_jobs : int, default=None + The number of parallel jobs to run for neighbors search. + If ``-1``, then the number of jobs is set to the number of CPU cores. + + Attributes + ---------- + effective_metric_ : str or callable + The distance metric used. It will be same as the `metric` parameter + or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to + 'minkowski' and `p` parameter set to 2. + + effective_metric_params_ : dict + Additional keyword arguments for the metric function. For most metrics + will be same with `metric_params` parameter, but may also contain the + `p` parameter value if the `effective_metric_` attribute is set to + 'minkowski'. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_samples_fit_ : int + Number of samples in the fitted data. + + See Also + -------- + kneighbors_graph : Compute the weighted graph of k-neighbors for + points in X. + RadiusNeighborsTransformer : Transform X into a weighted graph of + neighbors nearer than a radius. + + Notes + ----- + For an example of using :class:`~sklearn.neighbors.KNeighborsTransformer` + in combination with :class:`~sklearn.manifold.TSNE` see + :ref:`sphx_glr_auto_examples_neighbors_approximate_nearest_neighbors.py`. 
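Editor's aside (a minimal sketch over assumed data, not part of the patch): the intended usage pattern for this transformer is to precompute the sparse neighbors graph once and hand it to a downstream estimator that accepts `metric='precomputed'`, in the spirit of the example referenced in the Notes above. The transformer must store at least as many neighbors as the consumer needs, plus one, because each sample is kept as its own neighbor when `mode='distance'`.

    from sklearn.datasets import load_wine
    from sklearn.model_selection import cross_val_score
    from sklearn.neighbors import KNeighborsClassifier, KNeighborsTransformer
    from sklearn.pipeline import make_pipeline

    X, y = load_wine(return_X_y=True)
    pipe = make_pipeline(
        # Stores 10 + 1 neighbors per sample; enough for the 5 used downstream.
        KNeighborsTransformer(n_neighbors=10, mode="distance"),
        KNeighborsClassifier(n_neighbors=5, metric="precomputed"),
    )
    print(cross_val_score(pipe, X, y, cv=3).mean())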
+ + Examples + -------- + >>> from sklearn.datasets import load_wine + >>> from sklearn.neighbors import KNeighborsTransformer + >>> X, _ = load_wine(return_X_y=True) + >>> X.shape + (178, 13) + >>> transformer = KNeighborsTransformer(n_neighbors=5, mode='distance') + >>> X_dist_graph = transformer.fit_transform(X) + >>> X_dist_graph.shape + (178, 178) + """ + + _parameter_constraints: dict = { + **NeighborsBase._parameter_constraints, + "mode": [StrOptions({"distance", "connectivity"})], + } + _parameter_constraints.pop("radius") + + def __init__( + self, + *, + mode="distance", + n_neighbors=5, + algorithm="auto", + leaf_size=30, + metric="minkowski", + p=2, + metric_params=None, + n_jobs=None, + ): + super().__init__( + n_neighbors=n_neighbors, + radius=None, + algorithm=algorithm, + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ) + self.mode = mode + + @_fit_context( + # KNeighborsTransformer.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y=None): + """Fit the k-nearest neighbors transformer from the training dataset. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) if metric='precomputed' + Training data. + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : KNeighborsTransformer + The fitted k-nearest neighbors transformer. + """ + self._fit(X) + self._n_features_out = self.n_samples_fit_ + return self + + def transform(self, X): + """Compute the (weighted) graph of Neighbors for points in X. + + Parameters + ---------- + X : array-like of shape (n_samples_transform, n_features) + Sample data. + + Returns + ------- + Xt : sparse matrix of shape (n_samples_transform, n_samples_fit) + Xt[i, j] is assigned the weight of edge that connects i to j. + Only the neighbors have an explicit value. + The diagonal is always explicit. + The matrix is of CSR format. + """ + check_is_fitted(self) + add_one = self.mode == "distance" + return self.kneighbors_graph( + X, mode=self.mode, n_neighbors=self.n_neighbors + add_one + ) + + def fit_transform(self, X, y=None): + """Fit to data, then transform it. + + Fits transformer to X and y with optional parameters fit_params + and returns a transformed version of X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training set. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + Xt : sparse matrix of shape (n_samples, n_samples) + Xt[i, j] is assigned the weight of edge that connects i to j. + Only the neighbors have an explicit value. + The diagonal is always explicit. + The matrix is of CSR format. + """ + return self.fit(X).transform(X) + + +class RadiusNeighborsTransformer( + ClassNamePrefixFeaturesOutMixin, + RadiusNeighborsMixin, + TransformerMixin, + NeighborsBase, +): + """Transform X into a (weighted) graph of neighbors nearer than a radius. + + The transformed data is a sparse graph as returned by + `radius_neighbors_graph`. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.22 + + Parameters + ---------- + mode : {'distance', 'connectivity'}, default='distance' + Type of returned matrix: 'connectivity' will return the connectivity + matrix with ones and zeros, and 'distance' will return the distances + between neighbors according to the given metric. + + radius : float, default=1.0 + Radius of neighborhood in the transformed sparse graph. 
+ + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' + Algorithm used to compute the nearest neighbors: + + - 'ball_tree' will use :class:`BallTree` + - 'kd_tree' will use :class:`KDTree` + - 'brute' will use a brute-force search. + - 'auto' will attempt to decide the most appropriate algorithm + based on the values passed to :meth:`fit` method. + + Note: fitting on sparse input will override the setting of + this parameter, using brute force. + + leaf_size : int, default=30 + Leaf size passed to BallTree or KDTree. This can affect the + speed of the construction and query, as well as the memory + required to store the tree. The optimal value depends on the + nature of the problem. + + metric : str or callable, default='minkowski' + Metric to use for distance computation. Default is "minkowski", which + results in the standard Euclidean distance when p = 2. See the + documentation of `scipy.spatial.distance + `_ and + the metrics listed in + :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric + values. + + If metric is a callable function, it takes two arrays representing 1D + vectors as inputs and must return one value indicating the distance + between those vectors. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. + + Distance matrices are not supported. + + p : float, default=2 + Parameter for the Minkowski metric from + sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is + equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + This parameter is expected to be positive. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + n_jobs : int, default=None + The number of parallel jobs to run for neighbors search. + If ``-1``, then the number of jobs is set to the number of CPU cores. + + Attributes + ---------- + effective_metric_ : str or callable + The distance metric used. It will be same as the `metric` parameter + or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to + 'minkowski' and `p` parameter set to 2. + + effective_metric_params_ : dict + Additional keyword arguments for the metric function. For most metrics + will be same with `metric_params` parameter, but may also contain the + `p` parameter value if the `effective_metric_` attribute is set to + 'minkowski'. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_samples_fit_ : int + Number of samples in the fitted data. + + See Also + -------- + kneighbors_graph : Compute the weighted graph of k-neighbors for + points in X. + KNeighborsTransformer : Transform X into a weighted graph of k + nearest neighbors. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.datasets import load_wine + >>> from sklearn.cluster import DBSCAN + >>> from sklearn.neighbors import RadiusNeighborsTransformer + >>> from sklearn.pipeline import make_pipeline + >>> X, _ = load_wine(return_X_y=True) + >>> estimator = make_pipeline( + ... RadiusNeighborsTransformer(radius=42.0, mode='distance'), + ... 
DBSCAN(eps=25.0, metric='precomputed')) + >>> X_clustered = estimator.fit_predict(X) + >>> clusters, counts = np.unique(X_clustered, return_counts=True) + >>> print(counts) + [ 29 15 111 11 12] + """ + + _parameter_constraints: dict = { + **NeighborsBase._parameter_constraints, + "mode": [StrOptions({"distance", "connectivity"})], + } + _parameter_constraints.pop("n_neighbors") + + def __init__( + self, + *, + mode="distance", + radius=1.0, + algorithm="auto", + leaf_size=30, + metric="minkowski", + p=2, + metric_params=None, + n_jobs=None, + ): + super().__init__( + n_neighbors=None, + radius=radius, + algorithm=algorithm, + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ) + self.mode = mode + + @_fit_context( + # RadiusNeighborsTransformer.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y=None): + """Fit the radius neighbors transformer from the training dataset. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) if metric='precomputed' + Training data. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : RadiusNeighborsTransformer + The fitted radius neighbors transformer. + """ + self._fit(X) + self._n_features_out = self.n_samples_fit_ + return self + + def transform(self, X): + """Compute the (weighted) graph of Neighbors for points in X. + + Parameters + ---------- + X : array-like of shape (n_samples_transform, n_features) + Sample data. + + Returns + ------- + Xt : sparse matrix of shape (n_samples_transform, n_samples_fit) + Xt[i, j] is assigned the weight of edge that connects i to j. + Only the neighbors have an explicit value. + The diagonal is always explicit. + The matrix is of CSR format. + """ + check_is_fitted(self) + return self.radius_neighbors_graph(X, mode=self.mode, sort_results=True) + + def fit_transform(self, X, y=None): + """Fit to data, then transform it. + + Fits transformer to X and y with optional parameters fit_params + and returns a transformed version of X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training set. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + Xt : sparse matrix of shape (n_samples, n_samples) + Xt[i, j] is assigned the weight of edge that connects i to j. + Only the neighbors have an explicit value. + The diagonal is always explicit. + The matrix is of CSR format. 
+ """ + return self.fit(X).transform(X) diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/_kd_tree.pyx.tp b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_kd_tree.pyx.tp new file mode 100644 index 0000000000000000000000000000000000000000..d21af05270b9aad33560ea6ff72d55c3fa5c91b4 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_kd_tree.pyx.tp @@ -0,0 +1,336 @@ +{{py: + +# Generated file: _kd_tree.pyx + +implementation_specific_values = [ + # The values are arranged as follows: + # + # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE + # + ('64', 'float64_t', 'np.float64'), + ('32', 'float32_t', 'np.float32') +] + +# By Jake Vanderplas (2013) +# written for the scikit-learn project +# SPDX-License-Identifier: BSD-3-Clause + +}} + + +__all__ = ['KDTree', 'KDTree64', 'KDTree32'] + +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +DOC_DICT{{name_suffix}} = { + 'BinaryTree': 'KDTree{{name_suffix}}', + 'binary_tree': 'kd_tree{{name_suffix}}', +} + +VALID_METRICS{{name_suffix}} = [ + 'EuclideanDistance{{name_suffix}}', + 'ManhattanDistance{{name_suffix}}', + 'ChebyshevDistance{{name_suffix}}', + 'MinkowskiDistance{{name_suffix}}' +] + +{{endfor}} + +include "_binary_tree.pxi" + +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +# Inherit KDTree{{name_suffix}} from BinaryTree{{name_suffix}} +cdef class KDTree{{name_suffix}}(BinaryTree{{name_suffix}}): + __doc__ = CLASS_DOC.format(**DOC_DICT{{name_suffix}}) + pass + +{{endfor}} + + +# ---------------------------------------------------------------------- +# The functions below specialized the Binary Tree as a KD Tree +# +# Note that these functions use the concept of "reduced distance". +# The reduced distance, defined for some metrics, is a quantity which +# is more efficient to compute than the distance, but preserves the +# relative rankings of the true distance. For example, the reduced +# distance for the Euclidean metric is the squared-euclidean distance. +# For some metrics, the reduced distance is simply the distance. + +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +cdef int allocate_data{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t n_nodes, + intp_t n_features, +) except -1: + """Allocate arrays needed for the KD Tree""" + tree.node_bounds = np.zeros((2, n_nodes, n_features), dtype={{INPUT_DTYPE}}) + return 0 + + +cdef int init_node{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + NodeData_t[::1] node_data, + intp_t i_node, + intp_t idx_start, + intp_t idx_end, +) except -1: + """Initialize the node for the dataset stored in tree.data""" + cdef intp_t n_features = tree.data.shape[1] + cdef intp_t i, j + cdef float64_t rad = 0 + + cdef {{INPUT_DTYPE_t}}* lower_bounds = &tree.node_bounds[0, i_node, 0] + cdef {{INPUT_DTYPE_t}}* upper_bounds = &tree.node_bounds[1, i_node, 0] + cdef const {{INPUT_DTYPE_t}}* data = &tree.data[0, 0] + cdef const intp_t* idx_array = &tree.idx_array[0] + + cdef const {{INPUT_DTYPE_t}}* data_row + + # determine Node bounds + for j in range(n_features): + lower_bounds[j] = INF + upper_bounds[j] = -INF + + # Compute the actual data range. At build time, this is slightly + # slower than using the previously-computed bounds of the parent node, + # but leads to more compact trees and thus faster queries. 
+ for i in range(idx_start, idx_end): + data_row = data + idx_array[i] * n_features + for j in range(n_features): + lower_bounds[j] = fmin(lower_bounds[j], data_row[j]) + upper_bounds[j] = fmax(upper_bounds[j], data_row[j]) + + for j in range(n_features): + if tree.dist_metric.p == INF: + rad = fmax(rad, 0.5 * (upper_bounds[j] - lower_bounds[j])) + else: + rad += pow(0.5 * abs(upper_bounds[j] - lower_bounds[j]), + tree.dist_metric.p) + + node_data[i_node].idx_start = idx_start + node_data[i_node].idx_end = idx_end + + # The radius will hold the size of the circumscribed hypersphere measured + # with the specified metric: in querying, this is used as a measure of the + # size of each node when deciding which nodes to split. + node_data[i_node].radius = pow(rad, 1. / tree.dist_metric.p) + return 0 + + +cdef float64_t min_rdist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, +) except -1 nogil: + """Compute the minimum reduced-distance between a point and a node""" + cdef intp_t n_features = tree.data.shape[1] + cdef float64_t d, d_lo, d_hi, rdist=0.0 + cdef intp_t j + + if tree.dist_metric.p == INF: + for j in range(n_features): + d_lo = tree.node_bounds[0, i_node, j] - pt[j] + d_hi = pt[j] - tree.node_bounds[1, i_node, j] + d = (d_lo + fabs(d_lo)) + (d_hi + fabs(d_hi)) + rdist = fmax(rdist, 0.5 * d) + else: + # here we'll use the fact that x + abs(x) = 2 * max(x, 0) + for j in range(n_features): + d_lo = tree.node_bounds[0, i_node, j] - pt[j] + d_hi = pt[j] - tree.node_bounds[1, i_node, j] + d = (d_lo + fabs(d_lo)) + (d_hi + fabs(d_hi)) + rdist += pow(0.5 * d, tree.dist_metric.p) + + return rdist + + +cdef float64_t min_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, +) except -1: + """Compute the minimum distance between a point and a node""" + if tree.dist_metric.p == INF: + return min_rdist{{name_suffix}}(tree, i_node, pt) + else: + return pow( + min_rdist{{name_suffix}}(tree, i_node, pt), + 1. / tree.dist_metric.p + ) + + +cdef float64_t max_rdist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, +) except -1: + """Compute the maximum reduced-distance between a point and a node""" + cdef intp_t n_features = tree.data.shape[1] + + cdef float64_t d_lo, d_hi, rdist=0.0 + cdef intp_t j + + if tree.dist_metric.p == INF: + for j in range(n_features): + rdist = fmax(rdist, fabs(pt[j] - tree.node_bounds[0, i_node, j])) + rdist = fmax(rdist, fabs(pt[j] - tree.node_bounds[1, i_node, j])) + else: + for j in range(n_features): + d_lo = fabs(pt[j] - tree.node_bounds[0, i_node, j]) + d_hi = fabs(pt[j] - tree.node_bounds[1, i_node, j]) + rdist += pow(fmax(d_lo, d_hi), tree.dist_metric.p) + + return rdist + + +cdef float64_t max_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, +) except -1: + """Compute the maximum distance between a point and a node""" + if tree.dist_metric.p == INF: + return max_rdist{{name_suffix}}(tree, i_node, pt) + else: + return pow( + max_rdist{{name_suffix}}(tree, i_node, pt), + 1. 
/ tree.dist_metric.p + ) + + +cdef inline int min_max_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, + float64_t* min_dist, + float64_t* max_dist, +) except -1 nogil: + """Compute the minimum and maximum distance between a point and a node""" + cdef intp_t n_features = tree.data.shape[1] + + cdef float64_t d, d_lo, d_hi + cdef intp_t j + + min_dist[0] = 0.0 + max_dist[0] = 0.0 + + if tree.dist_metric.p == INF: + for j in range(n_features): + d_lo = tree.node_bounds[0, i_node, j] - pt[j] + d_hi = pt[j] - tree.node_bounds[1, i_node, j] + d = (d_lo + fabs(d_lo)) + (d_hi + fabs(d_hi)) + min_dist[0] = fmax(min_dist[0], 0.5 * d) + max_dist[0] = fmax(max_dist[0], fabs(d_lo)) + max_dist[0] = fmax(max_dist[0], fabs(d_hi)) + else: + # as above, use the fact that x + abs(x) = 2 * max(x, 0) + for j in range(n_features): + d_lo = tree.node_bounds[0, i_node, j] - pt[j] + d_hi = pt[j] - tree.node_bounds[1, i_node, j] + d = (d_lo + fabs(d_lo)) + (d_hi + fabs(d_hi)) + min_dist[0] += pow(0.5 * d, tree.dist_metric.p) + max_dist[0] += pow(fmax(fabs(d_lo), fabs(d_hi)), + tree.dist_metric.p) + + min_dist[0] = pow(min_dist[0], 1. / tree.dist_metric.p) + max_dist[0] = pow(max_dist[0], 1. / tree.dist_metric.p) + + return 0 + + +cdef inline float64_t min_rdist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """Compute the minimum reduced distance between two nodes""" + cdef intp_t n_features = tree1.data.shape[1] + + cdef float64_t d, d1, d2, rdist=0.0 + cdef intp_t j + + if tree1.dist_metric.p == INF: + for j in range(n_features): + d1 = (tree1.node_bounds[0, i_node1, j] + - tree2.node_bounds[1, i_node2, j]) + d2 = (tree2.node_bounds[0, i_node2, j] + - tree1.node_bounds[1, i_node1, j]) + d = (d1 + fabs(d1)) + (d2 + fabs(d2)) + + rdist = fmax(rdist, 0.5 * d) + else: + # here we'll use the fact that x + abs(x) = 2 * max(x, 0) + for j in range(n_features): + d1 = (tree1.node_bounds[0, i_node1, j] + - tree2.node_bounds[1, i_node2, j]) + d2 = (tree2.node_bounds[0, i_node2, j] + - tree1.node_bounds[1, i_node1, j]) + d = (d1 + fabs(d1)) + (d2 + fabs(d2)) + + rdist += pow(0.5 * d, tree1.dist_metric.p) + + return rdist + + +cdef inline float64_t min_dist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """Compute the minimum distance between two nodes""" + return tree1.dist_metric._rdist_to_dist( + min_rdist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + + +cdef inline float64_t max_rdist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """Compute the maximum reduced distance between two nodes""" + cdef intp_t n_features = tree1.data.shape[1] + + cdef float64_t d1, d2, rdist=0.0 + cdef intp_t j + + if tree1.dist_metric.p == INF: + for j in range(n_features): + rdist = fmax(rdist, fabs(tree1.node_bounds[0, i_node1, j] + - tree2.node_bounds[1, i_node2, j])) + rdist = fmax(rdist, fabs(tree1.node_bounds[1, i_node1, j] + - tree2.node_bounds[0, i_node2, j])) + else: + for j in range(n_features): + d1 = fabs(tree1.node_bounds[0, i_node1, j] + - tree2.node_bounds[1, i_node2, j]) + d2 = fabs(tree1.node_bounds[1, i_node1, j] + - tree2.node_bounds[0, i_node2, j]) + rdist += pow(fmax(d1, d2), tree1.dist_metric.p) + + return rdist + + +cdef inline float64_t max_dist_dual{{name_suffix}}( + 
BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """Compute the maximum distance between two nodes""" + return tree1.dist_metric._rdist_to_dist( + max_rdist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + +{{endfor}} + + +class KDTree(KDTree64): + __doc__ = CLASS_DOC.format(BinaryTree="KDTree") + pass diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/_kde.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_kde.py new file mode 100644 index 0000000000000000000000000000000000000000..7661308db2e01665c82cf82985586006b7c39a56 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_kde.py @@ -0,0 +1,359 @@ +""" +Kernel Density Estimation +------------------------- +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import itertools +from numbers import Integral, Real + +import numpy as np +from scipy.special import gammainc + +from ..base import BaseEstimator, _fit_context +from ..neighbors._base import VALID_METRICS +from ..utils import check_random_state +from ..utils._param_validation import Interval, StrOptions +from ..utils.extmath import row_norms +from ..utils.validation import _check_sample_weight, check_is_fitted, validate_data +from ._ball_tree import BallTree +from ._kd_tree import KDTree + +VALID_KERNELS = [ + "gaussian", + "tophat", + "epanechnikov", + "exponential", + "linear", + "cosine", +] + +TREE_DICT = {"ball_tree": BallTree, "kd_tree": KDTree} + + +# TODO: implement a brute force version for testing purposes +# TODO: create a density estimation base class? +class KernelDensity(BaseEstimator): + """Kernel Density Estimation. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + bandwidth : float or {"scott", "silverman"}, default=1.0 + The bandwidth of the kernel. If bandwidth is a float, it defines the + bandwidth of the kernel. If bandwidth is a string, one of the estimation + methods is implemented. + + algorithm : {'kd_tree', 'ball_tree', 'auto'}, default='auto' + The tree algorithm to use. + + kernel : {'gaussian', 'tophat', 'epanechnikov', 'exponential', 'linear', \ + 'cosine'}, default='gaussian' + The kernel to use. + + metric : str, default='euclidean' + Metric to use for distance computation. See the + documentation of `scipy.spatial.distance + `_ and + the metrics listed in + :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric + values. + + Not all metrics are valid with all algorithms: refer to the + documentation of :class:`BallTree` and :class:`KDTree`. Note that the + normalization of the density output is correct only for the Euclidean + distance metric. + + atol : float, default=0 + The desired absolute tolerance of the result. A larger tolerance will + generally lead to faster execution. + + rtol : float, default=0 + The desired relative tolerance of the result. A larger tolerance will + generally lead to faster execution. + + breadth_first : bool, default=True + If true (default), use a breadth-first approach to the problem. + Otherwise use a depth-first approach. + + leaf_size : int, default=40 + Specify the leaf size of the underlying tree. See :class:`BallTree` + or :class:`KDTree` for details. + + metric_params : dict, default=None + Additional parameters to be passed to the tree for use with the + metric. For more information, see the documentation of + :class:`BallTree` or :class:`KDTree`. 
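Editor's aside (hedged sketch with assumed synthetic data, not part of the patch): the 'scott' and 'silverman' options described above compute a rule-of-thumb bandwidth from the data shape; a data-driven alternative is to cross-validate the `bandwidth` parameter, since `score` returns the total log-likelihood.

    import numpy as np
    from sklearn.model_selection import GridSearchCV
    from sklearn.neighbors import KernelDensity

    rng = np.random.RandomState(0)
    X = np.concatenate([rng.normal(0, 1, 300), rng.normal(5, 1, 300)])[:, None]

    kde = KernelDensity(kernel="gaussian", bandwidth="silverman").fit(X)
    print(kde.bandwidth_)  # bandwidth chosen by Silverman's rule of thumb

    # GridSearchCV maximizes KernelDensity.score (total log-likelihood).
    search = GridSearchCV(KernelDensity(), {"bandwidth": np.logspace(-1, 1, 20)}, cv=5)
    search.fit(X)
    print(search.best_params_["bandwidth"])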
+ + Attributes + ---------- + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + tree_ : ``BinaryTree`` instance + The tree algorithm for fast generalized N-point problems. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + bandwidth_ : float + Value of the bandwidth, given directly by the bandwidth parameter or + estimated using the 'scott' or 'silverman' method. + + .. versionadded:: 1.0 + + See Also + -------- + sklearn.neighbors.KDTree : K-dimensional tree for fast generalized N-point + problems. + sklearn.neighbors.BallTree : Ball tree for fast generalized N-point + problems. + + Examples + -------- + Compute a gaussian kernel density estimate with a fixed bandwidth. + + >>> from sklearn.neighbors import KernelDensity + >>> import numpy as np + >>> rng = np.random.RandomState(42) + >>> X = rng.random_sample((100, 3)) + >>> kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(X) + >>> log_density = kde.score_samples(X[:3]) + >>> log_density + array([-1.52955942, -1.51462041, -1.60244657]) + """ + + _parameter_constraints: dict = { + "bandwidth": [ + Interval(Real, 0, None, closed="neither"), + StrOptions({"scott", "silverman"}), + ], + "algorithm": [StrOptions(set(TREE_DICT.keys()) | {"auto"})], + "kernel": [StrOptions(set(VALID_KERNELS))], + "metric": [ + StrOptions( + set(itertools.chain(*[VALID_METRICS[alg] for alg in TREE_DICT.keys()])) + ) + ], + "atol": [Interval(Real, 0, None, closed="left")], + "rtol": [Interval(Real, 0, None, closed="left")], + "breadth_first": ["boolean"], + "leaf_size": [Interval(Integral, 1, None, closed="left")], + "metric_params": [None, dict], + } + + def __init__( + self, + *, + bandwidth=1.0, + algorithm="auto", + kernel="gaussian", + metric="euclidean", + atol=0, + rtol=0, + breadth_first=True, + leaf_size=40, + metric_params=None, + ): + self.algorithm = algorithm + self.bandwidth = bandwidth + self.kernel = kernel + self.metric = metric + self.atol = atol + self.rtol = rtol + self.breadth_first = breadth_first + self.leaf_size = leaf_size + self.metric_params = metric_params + + def _choose_algorithm(self, algorithm, metric): + # given the algorithm string + metric string, choose the optimal + # algorithm to compute the result. + if algorithm == "auto": + # use KD Tree if possible + if metric in KDTree.valid_metrics: + return "kd_tree" + elif metric in BallTree.valid_metrics: + return "ball_tree" + else: # kd_tree or ball_tree + if metric not in TREE_DICT[algorithm].valid_metrics: + raise ValueError( + "invalid metric for {0}: '{1}'".format(TREE_DICT[algorithm], metric) + ) + return algorithm + + @_fit_context( + # KernelDensity.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y=None, sample_weight=None): + """Fit the Kernel Density model on the data. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + List of n_features-dimensional data points. Each row + corresponds to a single data point. + + y : None + Ignored. This parameter exists only for compatibility with + :class:`~sklearn.pipeline.Pipeline`. + + sample_weight : array-like of shape (n_samples,), default=None + List of sample weights attached to the data X. + + .. versionadded:: 0.20 + + Returns + ------- + self : object + Returns the instance itself. 
+ """ + algorithm = self._choose_algorithm(self.algorithm, self.metric) + + if isinstance(self.bandwidth, str): + if self.bandwidth == "scott": + self.bandwidth_ = X.shape[0] ** (-1 / (X.shape[1] + 4)) + elif self.bandwidth == "silverman": + self.bandwidth_ = (X.shape[0] * (X.shape[1] + 2) / 4) ** ( + -1 / (X.shape[1] + 4) + ) + else: + self.bandwidth_ = self.bandwidth + + X = validate_data(self, X, order="C", dtype=np.float64) + + if sample_weight is not None: + sample_weight = _check_sample_weight( + sample_weight, X, dtype=np.float64, ensure_non_negative=True + ) + + kwargs = self.metric_params + if kwargs is None: + kwargs = {} + self.tree_ = TREE_DICT[algorithm]( + X, + metric=self.metric, + leaf_size=self.leaf_size, + sample_weight=sample_weight, + **kwargs, + ) + return self + + def score_samples(self, X): + """Compute the log-likelihood of each sample under the model. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + An array of points to query. Last dimension should match dimension + of training data (n_features). + + Returns + ------- + density : ndarray of shape (n_samples,) + Log-likelihood of each sample in `X`. These are normalized to be + probability densities, so values will be low for high-dimensional + data. + """ + check_is_fitted(self) + # The returned density is normalized to the number of points. + # For it to be a probability, we must scale it. For this reason + # we'll also scale atol. + X = validate_data(self, X, order="C", dtype=np.float64, reset=False) + if self.tree_.sample_weight is None: + N = self.tree_.data.shape[0] + else: + N = self.tree_.sum_weight + atol_N = self.atol * N + log_density = self.tree_.kernel_density( + X, + h=self.bandwidth_, + kernel=self.kernel, + atol=atol_N, + rtol=self.rtol, + breadth_first=self.breadth_first, + return_log=True, + ) + log_density -= np.log(N) + return log_density + + def score(self, X, y=None): + """Compute the total log-likelihood under the model. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + List of n_features-dimensional data points. Each row + corresponds to a single data point. + + y : None + Ignored. This parameter exists only for compatibility with + :class:`~sklearn.pipeline.Pipeline`. + + Returns + ------- + logprob : float + Total log-likelihood of the data in X. This is normalized to be a + probability density, so the value will be low for high-dimensional + data. + """ + return np.sum(self.score_samples(X)) + + def sample(self, n_samples=1, random_state=None): + """Generate random samples from the model. + + Currently, this is implemented only for gaussian and tophat kernels. + + Parameters + ---------- + n_samples : int, default=1 + Number of samples to generate. + + random_state : int, RandomState instance or None, default=None + Determines random number generation used to generate + random samples. Pass an int for reproducible results + across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + X : array-like of shape (n_samples, n_features) + List of samples. 
+ """ + check_is_fitted(self) + # TODO: implement sampling for other valid kernel shapes + if self.kernel not in ["gaussian", "tophat"]: + raise NotImplementedError() + + data = np.asarray(self.tree_.data) + + rng = check_random_state(random_state) + u = rng.uniform(0, 1, size=n_samples) + if self.tree_.sample_weight is None: + i = (u * data.shape[0]).astype(np.int64) + else: + cumsum_weight = np.cumsum(np.asarray(self.tree_.sample_weight)) + sum_weight = cumsum_weight[-1] + i = np.searchsorted(cumsum_weight, u * sum_weight) + if self.kernel == "gaussian": + return np.atleast_2d(rng.normal(data[i], self.bandwidth_)) + + elif self.kernel == "tophat": + # we first draw points from a d-dimensional normal distribution, + # then use an incomplete gamma function to map them to a uniform + # d-dimensional tophat distribution. + dim = data.shape[1] + X = rng.normal(size=(n_samples, dim)) + s_sq = row_norms(X, squared=True) + correction = ( + gammainc(0.5 * dim, 0.5 * s_sq) ** (1.0 / dim) + * self.bandwidth_ + / np.sqrt(s_sq) + ) + return data[i] + X * correction[:, np.newaxis] diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/_lof.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_lof.py new file mode 100644 index 0000000000000000000000000000000000000000..d9f00be42570e2841e5445b5fd68e1dec5413c6a --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_lof.py @@ -0,0 +1,518 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from numbers import Real + +import numpy as np + +from ..base import OutlierMixin, _fit_context +from ..utils import check_array +from ..utils._param_validation import Interval, StrOptions +from ..utils.metaestimators import available_if +from ..utils.validation import check_is_fitted +from ._base import KNeighborsMixin, NeighborsBase + +__all__ = ["LocalOutlierFactor"] + + +class LocalOutlierFactor(KNeighborsMixin, OutlierMixin, NeighborsBase): + """Unsupervised Outlier Detection using the Local Outlier Factor (LOF). + + The anomaly score of each sample is called the Local Outlier Factor. + It measures the local deviation of the density of a given sample with respect + to its neighbors. + It is local in that the anomaly score depends on how isolated the object + is with respect to the surrounding neighborhood. + More precisely, locality is given by k-nearest neighbors, whose distance + is used to estimate the local density. + By comparing the local density of a sample to the local densities of its + neighbors, one can identify samples that have a substantially lower density + than their neighbors. These are considered outliers. + + .. versionadded:: 0.19 + + Parameters + ---------- + n_neighbors : int, default=20 + Number of neighbors to use by default for :meth:`kneighbors` queries. + If n_neighbors is larger than the number of samples provided, + all samples will be used. + + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' + Algorithm used to compute the nearest neighbors: + + - 'ball_tree' will use :class:`BallTree` + - 'kd_tree' will use :class:`KDTree` + - 'brute' will use a brute-force search. + - 'auto' will attempt to decide the most appropriate algorithm + based on the values passed to :meth:`fit` method. + + Note: fitting on sparse input will override the setting of + this parameter, using brute force. + + leaf_size : int, default=30 + Leaf is size passed to :class:`BallTree` or :class:`KDTree`. 
This can + affect the speed of the construction and query, as well as the memory + required to store the tree. The optimal value depends on the + nature of the problem. + + metric : str or callable, default='minkowski' + Metric to use for distance computation. Default is "minkowski", which + results in the standard Euclidean distance when p = 2. See the + documentation of `scipy.spatial.distance + `_ and + the metrics listed in + :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric + values. + + If metric is "precomputed", X is assumed to be a distance matrix and + must be square during fit. X may be a :term:`sparse graph`, in which + case only "nonzero" elements may be considered neighbors. + + If metric is a callable function, it takes two arrays representing 1D + vectors as inputs and must return one value indicating the distance + between those vectors. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. + + p : float, default=2 + Parameter for the Minkowski metric from + :func:`sklearn.metrics.pairwise_distances`. When p = 1, this + is equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + contamination : 'auto' or float, default='auto' + The amount of contamination of the data set, i.e. the proportion + of outliers in the data set. When fitting this is used to define the + threshold on the scores of the samples. + + - if 'auto', the threshold is determined as in the + original paper, + - if a float, the contamination should be in the range (0, 0.5]. + + .. versionchanged:: 0.22 + The default value of ``contamination`` changed from 0.1 + to ``'auto'``. + + novelty : bool, default=False + By default, LocalOutlierFactor is only meant to be used for outlier + detection (novelty=False). Set novelty to True if you want to use + LocalOutlierFactor for novelty detection. In this case be aware that + you should only use predict, decision_function and score_samples + on new unseen data and not on the training set; and note that the + results obtained this way may differ from the standard LOF results. + + .. versionadded:: 0.20 + + n_jobs : int, default=None + The number of parallel jobs to run for neighbors search. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Attributes + ---------- + negative_outlier_factor_ : ndarray of shape (n_samples,) + The opposite LOF of the training samples. The higher, the more normal. + Inliers tend to have a LOF score close to 1 + (``negative_outlier_factor_`` close to -1), while outliers tend to have + a larger LOF score. + + The local outlier factor (LOF) of a sample captures its + supposed 'degree of abnormality'. + It is the average of the ratio of the local reachability density of + a sample and those of its k-nearest neighbors. + + n_neighbors_ : int + The actual number of neighbors used for :meth:`kneighbors` queries. + + offset_ : float + Offset used to obtain binary labels from the raw scores. + Observations having a negative_outlier_factor smaller than `offset_` + are detected as abnormal. + The offset is set to -1.5 (inliers score around -1), except when a + contamination parameter different than "auto" is provided. 
In that + case, the offset is defined in such a way we obtain the expected + number of outliers in training. + + .. versionadded:: 0.20 + + effective_metric_ : str + The effective metric used for the distance computation. + + effective_metric_params_ : dict + The effective additional keyword arguments for the metric function. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_samples_fit_ : int + It is the number of samples in the fitted data. + + See Also + -------- + sklearn.svm.OneClassSVM: Unsupervised Outlier Detection using + Support Vector Machine. + + References + ---------- + .. [1] Breunig, M. M., Kriegel, H. P., Ng, R. T., & Sander, J. (2000, May). + LOF: identifying density-based local outliers. In ACM sigmod record. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.neighbors import LocalOutlierFactor + >>> X = [[-1.1], [0.2], [101.1], [0.3]] + >>> clf = LocalOutlierFactor(n_neighbors=2) + >>> clf.fit_predict(X) + array([ 1, 1, -1, 1]) + >>> clf.negative_outlier_factor_ + array([ -0.9821, -1.0370, -73.3697, -0.9821]) + """ + + _parameter_constraints: dict = { + **NeighborsBase._parameter_constraints, + "contamination": [ + StrOptions({"auto"}), + Interval(Real, 0, 0.5, closed="right"), + ], + "novelty": ["boolean"], + } + _parameter_constraints.pop("radius") + + def __init__( + self, + n_neighbors=20, + *, + algorithm="auto", + leaf_size=30, + metric="minkowski", + p=2, + metric_params=None, + contamination="auto", + novelty=False, + n_jobs=None, + ): + super().__init__( + n_neighbors=n_neighbors, + algorithm=algorithm, + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ) + self.contamination = contamination + self.novelty = novelty + + def _check_novelty_fit_predict(self): + if self.novelty: + msg = ( + "fit_predict is not available when novelty=True. Use " + "novelty=False if you want to predict on the training set." + ) + raise AttributeError(msg) + return True + + @available_if(_check_novelty_fit_predict) + def fit_predict(self, X, y=None): + """Fit the model to the training set X and return the labels. + + **Not available for novelty detection (when novelty is set to True).** + Label is 1 for an inlier and -1 for an outlier according to the LOF + score and the contamination parameter. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features), default=None + The query sample or samples to compute the Local Outlier Factor + w.r.t. the training samples. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + is_inlier : ndarray of shape (n_samples,) + Returns -1 for anomalies/outliers and 1 for inliers. + """ + + # As fit_predict would be different from fit.predict, fit_predict is + # only available for outlier detection (novelty=False) + + return self.fit(X)._predict() + + @_fit_context( + # LocalOutlierFactor.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y=None): + """Fit the local outlier factor detector from the training dataset. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) if metric='precomputed' + Training data. 
+ + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : LocalOutlierFactor + The fitted local outlier factor detector. + """ + self._fit(X) + + n_samples = self.n_samples_fit_ + if self.n_neighbors > n_samples: + warnings.warn( + "n_neighbors (%s) is greater than the " + "total number of samples (%s). n_neighbors " + "will be set to (n_samples - 1) for estimation." + % (self.n_neighbors, n_samples) + ) + self.n_neighbors_ = max(1, min(self.n_neighbors, n_samples - 1)) + + self._distances_fit_X_, _neighbors_indices_fit_X_ = self.kneighbors( + n_neighbors=self.n_neighbors_ + ) + + if self._fit_X.dtype == np.float32: + self._distances_fit_X_ = self._distances_fit_X_.astype( + self._fit_X.dtype, + copy=False, + ) + + self._lrd = self._local_reachability_density( + self._distances_fit_X_, _neighbors_indices_fit_X_ + ) + + # Compute lof score over training samples to define offset_: + lrd_ratios_array = ( + self._lrd[_neighbors_indices_fit_X_] / self._lrd[:, np.newaxis] + ) + + self.negative_outlier_factor_ = -np.mean(lrd_ratios_array, axis=1) + + if self.contamination == "auto": + # inliers score around -1 (the higher, the less abnormal). + self.offset_ = -1.5 + else: + self.offset_ = np.percentile( + self.negative_outlier_factor_, 100.0 * self.contamination + ) + + # Verify if negative_outlier_factor_ values are within acceptable range. + # Novelty must also be false to detect outliers + if np.min(self.negative_outlier_factor_) < -1e7 and not self.novelty: + warnings.warn( + "Duplicate values are leading to incorrect results. " + "Increase the number of neighbors for more accurate results." + ) + + return self + + def _check_novelty_predict(self): + if not self.novelty: + msg = ( + "predict is not available when novelty=False, use " + "fit_predict if you want to predict on training data. Use " + "novelty=True if you want to use LOF for novelty detection " + "and predict on new unseen data." + ) + raise AttributeError(msg) + return True + + @available_if(_check_novelty_predict) + def predict(self, X=None): + """Predict the labels (1 inlier, -1 outlier) of X according to LOF. + + **Only available for novelty detection (when novelty is set to True).** + This method allows to generalize prediction to *new observations* (not + in the training set). Note that the result of ``clf.fit(X)`` then + ``clf.predict(X)`` with ``novelty=True`` may differ from the result + obtained by ``clf.fit_predict(X)`` with ``novelty=False``. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The query sample or samples to compute the Local Outlier Factor + w.r.t. the training samples. + + Returns + ------- + is_inlier : ndarray of shape (n_samples,) + Returns -1 for anomalies/outliers and +1 for inliers. + """ + return self._predict(X) + + def _predict(self, X=None): + """Predict the labels (1 inlier, -1 outlier) of X according to LOF. + + If X is None, returns the same as fit_predict(X_train). + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features), default=None + The query sample or samples to compute the Local Outlier Factor + w.r.t. the training samples. If None, makes prediction on the + training data without considering them as their own neighbors. + + Returns + ------- + is_inlier : ndarray of shape (n_samples,) + Returns -1 for anomalies/outliers and +1 for inliers. 
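# Illustrative sketch (not part of the vendored file): the quantities that
# fit() above derives from the k-nearest-neighbor distances, reproduced with
# plain NumPy on a toy dataset (k plays the role of n_neighbors_).
import numpy as np
from sklearn.neighbors import NearestNeighbors

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(size=(30, 2)), [[8.0, 8.0]]])   # one obvious outlier
k = 5

# kneighbors() on the training set excludes each point from its own neighborhood
dist, ind = NearestNeighbors(n_neighbors=k).fit(X).kneighbors()

# local reachability density, as in _local_reachability_density further down
dist_k = dist[ind, -1]                    # k-distance of each neighbor
reach = np.maximum(dist, dist_k)
lrd = 1.0 / (reach.mean(axis=1) + 1e-10)

# negative_outlier_factor_: minus the mean ratio of neighbor densities to own density
neg_lof = -np.mean(lrd[ind] / lrd[:, np.newaxis], axis=1)
print(neg_lof.argmin())                   # 30, the injected outlier
# offset_ would be -1.5 for contamination='auto', otherwise
# np.percentile(neg_lof, 100.0 * contamination)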
+ """ + check_is_fitted(self) + + if X is not None: + shifted_opposite_lof_scores = self.decision_function(X) + is_inlier = np.ones(shifted_opposite_lof_scores.shape[0], dtype=int) + is_inlier[shifted_opposite_lof_scores < 0] = -1 + else: + is_inlier = np.ones(self.n_samples_fit_, dtype=int) + is_inlier[self.negative_outlier_factor_ < self.offset_] = -1 + + return is_inlier + + def _check_novelty_decision_function(self): + if not self.novelty: + msg = ( + "decision_function is not available when novelty=False. " + "Use novelty=True if you want to use LOF for novelty " + "detection and compute decision_function for new unseen " + "data. Note that the opposite LOF of the training samples " + "is always available by considering the " + "negative_outlier_factor_ attribute." + ) + raise AttributeError(msg) + return True + + @available_if(_check_novelty_decision_function) + def decision_function(self, X): + """Shifted opposite of the Local Outlier Factor of X. + + Bigger is better, i.e. large values correspond to inliers. + + **Only available for novelty detection (when novelty is set to True).** + The shift offset allows a zero threshold for being an outlier. + The argument X is supposed to contain *new data*: if X contains a + point from training, it considers the later in its own neighborhood. + Also, the samples in X are not considered in the neighborhood of any + point. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The query sample or samples to compute the Local Outlier Factor + w.r.t. the training samples. + + Returns + ------- + shifted_opposite_lof_scores : ndarray of shape (n_samples,) + The shifted opposite of the Local Outlier Factor of each input + samples. The lower, the more abnormal. Negative scores represent + outliers, positive scores represent inliers. + """ + return self.score_samples(X) - self.offset_ + + def _check_novelty_score_samples(self): + if not self.novelty: + msg = ( + "score_samples is not available when novelty=False. The " + "scores of the training samples are always available " + "through the negative_outlier_factor_ attribute. Use " + "novelty=True if you want to use LOF for novelty detection " + "and compute score_samples for new unseen data." + ) + raise AttributeError(msg) + return True + + @available_if(_check_novelty_score_samples) + def score_samples(self, X): + """Opposite of the Local Outlier Factor of X. + + It is the opposite as bigger is better, i.e. large values correspond + to inliers. + + **Only available for novelty detection (when novelty is set to True).** + The argument X is supposed to contain *new data*: if X contains a + point from training, it considers the later in its own neighborhood. + Also, the samples in X are not considered in the neighborhood of any + point. Because of this, the scores obtained via ``score_samples`` may + differ from the standard LOF scores. + The standard LOF scores for the training data is available via the + ``negative_outlier_factor_`` attribute. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The query sample or samples to compute the Local Outlier Factor + w.r.t. the training samples. + + Returns + ------- + opposite_lof_scores : ndarray of shape (n_samples,) + The opposite of the Local Outlier Factor of each input samples. + The lower, the more abnormal. 
+ """ + check_is_fitted(self) + X = check_array(X, accept_sparse="csr") + + distances_X, neighbors_indices_X = self.kneighbors( + X, n_neighbors=self.n_neighbors_ + ) + + if X.dtype == np.float32: + distances_X = distances_X.astype(X.dtype, copy=False) + + X_lrd = self._local_reachability_density( + distances_X, + neighbors_indices_X, + ) + + lrd_ratios_array = self._lrd[neighbors_indices_X] / X_lrd[:, np.newaxis] + + # as bigger is better: + return -np.mean(lrd_ratios_array, axis=1) + + def _local_reachability_density(self, distances_X, neighbors_indices): + """The local reachability density (LRD) + + The LRD of a sample is the inverse of the average reachability + distance of its k-nearest neighbors. + + Parameters + ---------- + distances_X : ndarray of shape (n_queries, self.n_neighbors) + Distances to the neighbors (in the training samples `self._fit_X`) + of each query point to compute the LRD. + + neighbors_indices : ndarray of shape (n_queries, self.n_neighbors) + Neighbors indices (of each query point) among training samples + self._fit_X. + + Returns + ------- + local_reachability_density : ndarray of shape (n_queries,) + The local reachability density of each sample. + """ + dist_k = self._distances_fit_X_[neighbors_indices, self.n_neighbors_ - 1] + reach_dist_array = np.maximum(distances_X, dist_k) + + # 1e-10 to avoid `nan' when nb of duplicates > n_neighbors_: + return 1.0 / (np.mean(reach_dist_array, axis=1) + 1e-10) diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/_nca.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_nca.py new file mode 100644 index 0000000000000000000000000000000000000000..8383f95338932cd4a5a88fda6e5e5b9211b9ca0a --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_nca.py @@ -0,0 +1,534 @@ +""" +Neighborhood Component Analysis +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import sys +import time +from numbers import Integral, Real +from warnings import warn + +import numpy as np +from scipy.optimize import minimize + +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + TransformerMixin, + _fit_context, +) +from ..decomposition import PCA +from ..exceptions import ConvergenceWarning +from ..metrics import pairwise_distances +from ..preprocessing import LabelEncoder +from ..utils._param_validation import Interval, StrOptions +from ..utils.extmath import softmax +from ..utils.fixes import _get_additional_lbfgs_options_dict +from ..utils.multiclass import check_classification_targets +from ..utils.random import check_random_state +from ..utils.validation import check_array, check_is_fitted, validate_data + + +class NeighborhoodComponentsAnalysis( + ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator +): + """Neighborhood Components Analysis. + + Neighborhood Component Analysis (NCA) is a machine learning algorithm for + metric learning. It learns a linear transformation in a supervised fashion + to improve the classification accuracy of a stochastic nearest neighbors + rule in the transformed space. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int, default=None + Preferred dimensionality of the projected space. + If None it will be set to `n_features`. + + init : {'auto', 'pca', 'lda', 'identity', 'random'} or ndarray of shape \ + (n_features_a, n_features_b), default='auto' + Initialization of the linear transformation. 
Possible options are + `'auto'`, `'pca'`, `'lda'`, `'identity'`, `'random'`, and a numpy + array of shape `(n_features_a, n_features_b)`. + + - `'auto'` + Depending on `n_components`, the most reasonable initialization + is chosen. If `n_components <= min(n_features, n_classes - 1)` + we use `'lda'`, as it uses labels information. If not, but + `n_components < min(n_features, n_samples)`, we use `'pca'`, as + it projects data in meaningful directions (those of higher + variance). Otherwise, we just use `'identity'`. + + - `'pca'` + `n_components` principal components of the inputs passed + to :meth:`fit` will be used to initialize the transformation. + (See :class:`~sklearn.decomposition.PCA`) + + - `'lda'` + `min(n_components, n_classes)` most discriminative + components of the inputs passed to :meth:`fit` will be used to + initialize the transformation. (If `n_components > n_classes`, + the rest of the components will be zero.) (See + :class:`~sklearn.discriminant_analysis.LinearDiscriminantAnalysis`) + + - `'identity'` + If `n_components` is strictly smaller than the + dimensionality of the inputs passed to :meth:`fit`, the identity + matrix will be truncated to the first `n_components` rows. + + - `'random'` + The initial transformation will be a random array of shape + `(n_components, n_features)`. Each value is sampled from the + standard normal distribution. + + - numpy array + `n_features_b` must match the dimensionality of the inputs passed + to :meth:`fit` and n_features_a must be less than or equal to that. + If `n_components` is not `None`, `n_features_a` must match it. + + warm_start : bool, default=False + If `True` and :meth:`fit` has been called before, the solution of the + previous call to :meth:`fit` is used as the initial linear + transformation (`n_components` and `init` will be ignored). + + max_iter : int, default=50 + Maximum number of iterations in the optimization. + + tol : float, default=1e-5 + Convergence tolerance for the optimization. + + callback : callable, default=None + If not `None`, this function is called after every iteration of the + optimizer, taking as arguments the current solution (flattened + transformation matrix) and the number of iterations. This might be + useful in case one wants to examine or store the transformation + found after each iteration. + + verbose : int, default=0 + If 0, no progress messages will be printed. + If 1, progress messages will be printed to stdout. + If > 1, progress messages will be printed and the `disp` + parameter of :func:`scipy.optimize.minimize` will be set to + `verbose - 2`. + + random_state : int or numpy.RandomState, default=None + A pseudo random number generator object or a seed for it if int. If + `init='random'`, `random_state` is used to initialize the random + transformation. If `init='pca'`, `random_state` is passed as an + argument to PCA when initializing the transformation. Pass an int + for reproducible results across multiple function calls. + See :term:`Glossary `. + + Attributes + ---------- + components_ : ndarray of shape (n_components, n_features) + The linear transformation learned during fitting. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + n_iter_ : int + Counts the number of iterations performed by the optimizer. + + random_state_ : numpy.RandomState + Pseudo random number generator object used during initialization. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. 
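# Illustrative sketch (not part of the vendored file): the init='auto'
# selection rule described above and implemented in _initialize further down.
# resolve_auto_init is a hypothetical helper name used only for illustration.
def resolve_auto_init(n_components, n_features, n_samples, n_classes):
    if n_components <= min(n_features, n_classes - 1):
        return "lda"        # supervised initialization fits in this dimensionality
    if n_components < min(n_features, n_samples):
        return "pca"        # fall back to the directions of highest variance
    return "identity"

print(resolve_auto_init(2, 4, 150, 3))   # -> 'lda'
print(resolve_auto_init(3, 4, 150, 3))   # -> 'pca'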
Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + sklearn.discriminant_analysis.LinearDiscriminantAnalysis : Linear + Discriminant Analysis. + sklearn.decomposition.PCA : Principal component analysis (PCA). + + References + ---------- + .. [1] J. Goldberger, G. Hinton, S. Roweis, R. Salakhutdinov. + "Neighbourhood Components Analysis". Advances in Neural Information + Processing Systems. 17, 513-520, 2005. + http://www.cs.nyu.edu/~roweis/papers/ncanips.pdf + + .. [2] Wikipedia entry on Neighborhood Components Analysis + https://en.wikipedia.org/wiki/Neighbourhood_components_analysis + + Examples + -------- + >>> from sklearn.neighbors import NeighborhoodComponentsAnalysis + >>> from sklearn.neighbors import KNeighborsClassifier + >>> from sklearn.datasets import load_iris + >>> from sklearn.model_selection import train_test_split + >>> X, y = load_iris(return_X_y=True) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, + ... stratify=y, test_size=0.7, random_state=42) + >>> nca = NeighborhoodComponentsAnalysis(random_state=42) + >>> nca.fit(X_train, y_train) + NeighborhoodComponentsAnalysis(...) + >>> knn = KNeighborsClassifier(n_neighbors=3) + >>> knn.fit(X_train, y_train) + KNeighborsClassifier(...) + >>> print(knn.score(X_test, y_test)) + 0.933333... + >>> knn.fit(nca.transform(X_train), y_train) + KNeighborsClassifier(...) + >>> print(knn.score(nca.transform(X_test), y_test)) + 0.961904... + """ + + _parameter_constraints: dict = { + "n_components": [ + Interval(Integral, 1, None, closed="left"), + None, + ], + "init": [ + StrOptions({"auto", "pca", "lda", "identity", "random"}), + np.ndarray, + ], + "warm_start": ["boolean"], + "max_iter": [Interval(Integral, 1, None, closed="left")], + "tol": [Interval(Real, 0, None, closed="left")], + "callback": [callable, None], + "verbose": ["verbose"], + "random_state": ["random_state"], + } + + def __init__( + self, + n_components=None, + *, + init="auto", + warm_start=False, + max_iter=50, + tol=1e-5, + callback=None, + verbose=0, + random_state=None, + ): + self.n_components = n_components + self.init = init + self.warm_start = warm_start + self.max_iter = max_iter + self.tol = tol + self.callback = callback + self.verbose = verbose + self.random_state = random_state + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y): + """Fit the model according to the given training data. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The training samples. + + y : array-like of shape (n_samples,) + The corresponding training labels. + + Returns + ------- + self : object + Fitted estimator. + """ + # Validate the inputs X and y, and converts y to numerical classes. + X, y = validate_data(self, X, y, ensure_min_samples=2) + check_classification_targets(y) + y = LabelEncoder().fit_transform(y) + + # Check the preferred dimensionality of the projected space + if self.n_components is not None and self.n_components > X.shape[1]: + raise ValueError( + "The preferred dimensionality of the " + f"projected space `n_components` ({self.n_components}) cannot " + "be greater than the given data " + f"dimensionality ({X.shape[1]})!" 
+ ) + # If warm_start is enabled, check that the inputs are consistent + if ( + self.warm_start + and hasattr(self, "components_") + and self.components_.shape[1] != X.shape[1] + ): + raise ValueError( + f"The new inputs dimensionality ({X.shape[1]}) does not " + "match the input dimensionality of the " + f"previously learned transformation ({self.components_.shape[1]})." + ) + # Check how the linear transformation should be initialized + init = self.init + if isinstance(init, np.ndarray): + init = check_array(init) + # Assert that init.shape[1] = X.shape[1] + if init.shape[1] != X.shape[1]: + raise ValueError( + f"The input dimensionality ({init.shape[1]}) of the given " + "linear transformation `init` must match the " + f"dimensionality of the given inputs `X` ({X.shape[1]})." + ) + # Assert that init.shape[0] <= init.shape[1] + if init.shape[0] > init.shape[1]: + raise ValueError( + f"The output dimensionality ({init.shape[0]}) of the given " + "linear transformation `init` cannot be " + f"greater than its input dimensionality ({init.shape[1]})." + ) + # Assert that self.n_components = init.shape[0] + if self.n_components is not None and self.n_components != init.shape[0]: + raise ValueError( + "The preferred dimensionality of the " + f"projected space `n_components` ({self.n_components}) does" + " not match the output dimensionality of " + "the given linear transformation " + f"`init` ({init.shape[0]})!" + ) + + # Initialize the random generator + self.random_state_ = check_random_state(self.random_state) + + # Measure the total training time + t_train = time.time() + + # Compute a mask that stays fixed during optimization: + same_class_mask = y[:, np.newaxis] == y[np.newaxis, :] + # (n_samples, n_samples) + + # Initialize the transformation + transformation = np.ravel(self._initialize(X, y, init)) + + # Create a dictionary of parameters to be passed to the optimizer + disp = self.verbose - 2 if self.verbose > 1 else -1 + optimizer_params = { + "method": "L-BFGS-B", + "fun": self._loss_grad_lbfgs, + "args": (X, same_class_mask, -1.0), + "jac": True, + "x0": transformation, + "tol": self.tol, + "options": dict( + maxiter=self.max_iter, + **_get_additional_lbfgs_options_dict("disp", disp), + ), + "callback": self._callback, + } + + # Call the optimizer + self.n_iter_ = 0 + opt_result = minimize(**optimizer_params) + + # Reshape the solution found by the optimizer + self.components_ = opt_result.x.reshape(-1, X.shape[1]) + + # Stop timer + t_train = time.time() - t_train + if self.verbose: + cls_name = self.__class__.__name__ + + # Warn the user if the algorithm did not converge + if not opt_result.success: + warn( + "[{}] NCA did not converge: {}".format( + cls_name, opt_result.message + ), + ConvergenceWarning, + ) + + print("[{}] Training took {:8.2f}s.".format(cls_name, t_train)) + + return self + + def transform(self, X): + """Apply the learned transformation to the given data. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Data samples. + + Returns + ------- + X_embedded: ndarray of shape (n_samples, n_components) + The data samples transformed. + + Raises + ------ + NotFittedError + If :meth:`fit` has not been called before. + """ + + check_is_fitted(self) + X = validate_data(self, X, reset=False) + + return np.dot(X, self.components_.T) + + def _initialize(self, X, y, init): + """Initialize the transformation. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The training samples. 
+ + y : array-like of shape (n_samples,) + The training labels. + + init : str or ndarray of shape (n_features_a, n_features_b) + The validated initialization of the linear transformation. + + Returns + ------- + transformation : ndarray of shape (n_components, n_features) + The initialized linear transformation. + + """ + + transformation = init + if self.warm_start and hasattr(self, "components_"): + transformation = self.components_ + elif isinstance(init, np.ndarray): + pass + else: + n_samples, n_features = X.shape + n_components = self.n_components or n_features + if init == "auto": + n_classes = len(np.unique(y)) + if n_components <= min(n_features, n_classes - 1): + init = "lda" + elif n_components < min(n_features, n_samples): + init = "pca" + else: + init = "identity" + if init == "identity": + transformation = np.eye(n_components, X.shape[1]) + elif init == "random": + transformation = self.random_state_.standard_normal( + size=(n_components, X.shape[1]) + ) + elif init in {"pca", "lda"}: + init_time = time.time() + if init == "pca": + pca = PCA( + n_components=n_components, random_state=self.random_state_ + ) + if self.verbose: + print("Finding principal components... ", end="") + sys.stdout.flush() + pca.fit(X) + transformation = pca.components_ + elif init == "lda": + from ..discriminant_analysis import LinearDiscriminantAnalysis + + lda = LinearDiscriminantAnalysis(n_components=n_components) + if self.verbose: + print("Finding most discriminative components... ", end="") + sys.stdout.flush() + lda.fit(X, y) + transformation = lda.scalings_.T[:n_components] + if self.verbose: + print("done in {:5.2f}s".format(time.time() - init_time)) + return transformation + + def _callback(self, transformation): + """Called after each iteration of the optimizer. + + Parameters + ---------- + transformation : ndarray of shape (n_components * n_features,) + The solution computed by the optimizer in this iteration. + """ + if self.callback is not None: + self.callback(transformation, self.n_iter_) + + self.n_iter_ += 1 + + def _loss_grad_lbfgs(self, transformation, X, same_class_mask, sign=1.0): + """Compute the loss and the loss gradient w.r.t. `transformation`. + + Parameters + ---------- + transformation : ndarray of shape (n_components * n_features,) + The raveled linear transformation on which to compute loss and + evaluate gradient. + + X : ndarray of shape (n_samples, n_features) + The training samples. + + same_class_mask : ndarray of shape (n_samples, n_samples) + A mask where `mask[i, j] == 1` if `X[i]` and `X[j]` belong + to the same class, and `0` otherwise. + + Returns + ------- + loss : float + The loss computed for the given transformation. + + gradient : ndarray of shape (n_components * n_features,) + The new (flattened) gradient of the loss. 
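# Illustrative sketch (not part of the vendored file): the NCA objective that
# _loss_grad_lbfgs evaluates below.  p_ij is the probability that sample i
# picks sample j as its neighbor under a softmax over negative squared
# distances in the embedded space; the score to maximize is the expected
# number of correctly classified samples sum_i sum_{j in class(i)} p_ij
# (fit passes sign=-1.0 so the optimizer minimizes its negative).
import numpy as np
from scipy.spatial.distance import pdist, squareform
from scipy.special import softmax

def nca_score(L, X, y):
    X_embedded = X @ L.T
    d_sq = squareform(pdist(X_embedded, "sqeuclidean"))
    np.fill_diagonal(d_sq, np.inf)          # a point never picks itself
    p_ij = softmax(-d_sq, axis=1)
    same_class = y[:, None] == y[None, :]
    return np.sum(p_ij * same_class)

rng = np.random.default_rng(0)
X = rng.normal(size=(20, 3))
y = rng.integers(0, 2, size=20)
print(nca_score(np.eye(3), X, y))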
+ """ + + if self.n_iter_ == 0: + self.n_iter_ += 1 + if self.verbose: + header_fields = ["Iteration", "Objective Value", "Time(s)"] + header_fmt = "{:>10} {:>20} {:>10}" + header = header_fmt.format(*header_fields) + cls_name = self.__class__.__name__ + print("[{}]".format(cls_name)) + print( + "[{}] {}\n[{}] {}".format( + cls_name, header, cls_name, "-" * len(header) + ) + ) + + t_funcall = time.time() + + transformation = transformation.reshape(-1, X.shape[1]) + X_embedded = np.dot(X, transformation.T) # (n_samples, n_components) + + # Compute softmax distances + p_ij = pairwise_distances(X_embedded, squared=True) + np.fill_diagonal(p_ij, np.inf) + p_ij = softmax(-p_ij) # (n_samples, n_samples) + + # Compute loss + masked_p_ij = p_ij * same_class_mask + p = np.sum(masked_p_ij, axis=1, keepdims=True) # (n_samples, 1) + loss = np.sum(p) + + # Compute gradient of loss w.r.t. `transform` + weighted_p_ij = masked_p_ij - p_ij * p + weighted_p_ij_sym = weighted_p_ij + weighted_p_ij.T + np.fill_diagonal(weighted_p_ij_sym, -weighted_p_ij.sum(axis=0)) + gradient = 2 * X_embedded.T.dot(weighted_p_ij_sym).dot(X) + # time complexity of the gradient: O(n_components x n_samples x ( + # n_samples + n_features)) + + if self.verbose: + t_funcall = time.time() - t_funcall + values_fmt = "[{}] {:>10} {:>20.6e} {:>10.2f}" + print( + values_fmt.format( + self.__class__.__name__, self.n_iter_, loss, t_funcall + ) + ) + sys.stdout.flush() + + return sign * loss, sign * gradient.ravel() + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.target_tags.required = True + return tags + + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/_nearest_centroid.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_nearest_centroid.py new file mode 100644 index 0000000000000000000000000000000000000000..a780c27587792478fcef0965127310d35238040d --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_nearest_centroid.py @@ -0,0 +1,359 @@ +""" +Nearest Centroid Classification +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from numbers import Real + +import numpy as np +from scipy import sparse as sp + +from ..base import BaseEstimator, ClassifierMixin, _fit_context +from ..discriminant_analysis import DiscriminantAnalysisPredictionMixin +from ..metrics.pairwise import ( + pairwise_distances, + pairwise_distances_argmin, +) +from ..preprocessing import LabelEncoder +from ..utils import get_tags +from ..utils._available_if import available_if +from ..utils._param_validation import Interval, StrOptions +from ..utils.multiclass import check_classification_targets +from ..utils.sparsefuncs import csc_median_axis_0 +from ..utils.validation import check_is_fitted, validate_data + + +class NearestCentroid( + DiscriminantAnalysisPredictionMixin, ClassifierMixin, BaseEstimator +): + """Nearest centroid classifier. + + Each class is represented by its centroid, with test samples classified to + the class with the nearest centroid. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + metric : {"euclidean", "manhattan"}, default="euclidean" + Metric to use for distance computation. + + If `metric="euclidean"`, the centroid for the samples corresponding to each + class is the arithmetic mean, which minimizes the sum of squared L1 distances. 
+ If `metric="manhattan"`, the centroid is the feature-wise median, which + minimizes the sum of L1 distances. + + .. versionchanged:: 1.5 + All metrics but `"euclidean"` and `"manhattan"` were deprecated and + now raise an error. + + .. versionchanged:: 0.19 + `metric='precomputed'` was deprecated and now raises an error + + shrink_threshold : float, default=None + Threshold for shrinking centroids to remove features. + + priors : {"uniform", "empirical"} or array-like of shape (n_classes,), \ + default="uniform" + The class prior probabilities. By default, the class proportions are + inferred from the training data. + + .. versionadded:: 1.6 + + Attributes + ---------- + centroids_ : array-like of shape (n_classes, n_features) + Centroid of each class. + + classes_ : array of shape (n_classes,) + The unique classes labels. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + deviations_ : ndarray of shape (n_classes, n_features) + Deviations (or shrinkages) of the centroids of each class from the + overall centroid. Equal to eq. (18.4) if `shrink_threshold=None`, + else (18.5) p. 653 of [2]. Can be used to identify features used + for classification. + + .. versionadded:: 1.6 + + within_class_std_dev_ : ndarray of shape (n_features,) + Pooled or within-class standard deviation of input data. + + .. versionadded:: 1.6 + + class_prior_ : ndarray of shape (n_classes,) + The class prior probabilities. + + .. versionadded:: 1.6 + + See Also + -------- + KNeighborsClassifier : Nearest neighbors classifier. + + Notes + ----- + When used for text classification with tf-idf vectors, this classifier is + also known as the Rocchio classifier. + + References + ---------- + [1] Tibshirani, R., Hastie, T., Narasimhan, B., & Chu, G. (2002). Diagnosis of + multiple cancer types by shrunken centroids of gene expression. Proceedings + of the National Academy of Sciences of the United States of America, + 99(10), 6567-6572. The National Academy of Sciences. + + [2] Hastie, T., Tibshirani, R., Friedman, J. (2009). The Elements of Statistical + Learning Data Mining, Inference, and Prediction. 2nd Edition. New York, Springer. + + Examples + -------- + >>> from sklearn.neighbors import NearestCentroid + >>> import numpy as np + >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]) + >>> y = np.array([1, 1, 1, 2, 2, 2]) + >>> clf = NearestCentroid() + >>> clf.fit(X, y) + NearestCentroid() + >>> print(clf.predict([[-0.8, -1]])) + [1] + """ + + _parameter_constraints: dict = { + "metric": [StrOptions({"manhattan", "euclidean"})], + "shrink_threshold": [Interval(Real, 0, None, closed="neither"), None], + "priors": ["array-like", StrOptions({"empirical", "uniform"})], + } + + def __init__( + self, + metric="euclidean", + *, + shrink_threshold=None, + priors="uniform", + ): + self.metric = metric + self.shrink_threshold = shrink_threshold + self.priors = priors + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y): + """ + Fit the NearestCentroid model according to the given training data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the number of features. 
+ Note that centroid shrinking cannot be used with sparse matrices. + y : array-like of shape (n_samples,) + Target values. + + Returns + ------- + self : object + Fitted estimator. + """ + # If X is sparse and the metric is "manhattan", store it in a csc + # format is easier to calculate the median. + if self.metric == "manhattan": + X, y = validate_data(self, X, y, accept_sparse=["csc"]) + else: + ensure_all_finite = ( + "allow-nan" if get_tags(self).input_tags.allow_nan else True + ) + X, y = validate_data( + self, + X, + y, + ensure_all_finite=ensure_all_finite, + accept_sparse=["csr", "csc"], + ) + is_X_sparse = sp.issparse(X) + check_classification_targets(y) + + n_samples, n_features = X.shape + le = LabelEncoder() + y_ind = le.fit_transform(y) + self.classes_ = classes = le.classes_ + n_classes = classes.size + if n_classes < 2: + raise ValueError( + "The number of classes has to be greater than one; got %d class" + % (n_classes) + ) + + if self.priors == "empirical": # estimate priors from sample + _, class_counts = np.unique(y, return_inverse=True) # non-negative ints + self.class_prior_ = np.bincount(class_counts) / float(len(y)) + elif self.priors == "uniform": + self.class_prior_ = np.asarray([1 / n_classes] * n_classes) + else: + self.class_prior_ = np.asarray(self.priors) + + if (self.class_prior_ < 0).any(): + raise ValueError("priors must be non-negative") + if not np.isclose(self.class_prior_.sum(), 1.0): + warnings.warn( + "The priors do not sum to 1. Normalizing such that it sums to one.", + UserWarning, + ) + self.class_prior_ = self.class_prior_ / self.class_prior_.sum() + + # Mask mapping each class to its members. + self.centroids_ = np.empty((n_classes, n_features), dtype=np.float64) + + # Number of clusters in each class. + nk = np.zeros(n_classes) + + for cur_class in range(n_classes): + center_mask = y_ind == cur_class + nk[cur_class] = np.sum(center_mask) + if is_X_sparse: + center_mask = np.where(center_mask)[0] + + if self.metric == "manhattan": + # NumPy does not calculate median of sparse matrices. + if not is_X_sparse: + self.centroids_[cur_class] = np.median(X[center_mask], axis=0) + else: + self.centroids_[cur_class] = csc_median_axis_0(X[center_mask]) + else: # metric == "euclidean" + self.centroids_[cur_class] = X[center_mask].mean(axis=0) + + # Compute within-class std_dev with unshrunked centroids + variance = np.array(X - self.centroids_[y_ind], copy=False) ** 2 + self.within_class_std_dev_ = np.array( + np.sqrt(variance.sum(axis=0) / (n_samples - n_classes)), copy=False + ) + if any(self.within_class_std_dev_ == 0): + warnings.warn( + "self.within_class_std_dev_ has at least 1 zero standard deviation." + "Inputs within the same classes for at least 1 feature are identical." + ) + + err_msg = "All features have zero variance. Division by zero." + if is_X_sparse and np.all((X.max(axis=0) - X.min(axis=0)).toarray() == 0): + raise ValueError(err_msg) + elif not is_X_sparse and np.all(np.ptp(X, axis=0) == 0): + raise ValueError(err_msg) + + dataset_centroid_ = X.mean(axis=0) + # m parameter for determining deviation + m = np.sqrt((1.0 / nk) - (1.0 / n_samples)) + # Calculate deviation using the standard deviation of centroids. + # To deter outliers from affecting the results. + s = self.within_class_std_dev_ + np.median(self.within_class_std_dev_) + mm = m.reshape(len(m), 1) # Reshape to allow broadcasting. 
+ ms = mm * s + self.deviations_ = np.array( + (self.centroids_ - dataset_centroid_) / ms, copy=False + ) + # Soft thresholding: if the deviation crosses 0 during shrinking, + # it becomes zero. + if self.shrink_threshold: + signs = np.sign(self.deviations_) + self.deviations_ = np.abs(self.deviations_) - self.shrink_threshold + np.clip(self.deviations_, 0, None, out=self.deviations_) + self.deviations_ *= signs + # Now adjust the centroids using the deviation + msd = ms * self.deviations_ + self.centroids_ = np.array(dataset_centroid_ + msd, copy=False) + return self + + def predict(self, X): + """Perform classification on an array of test vectors `X`. + + The predicted class `C` for each sample in `X` is returned. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data. + + Returns + ------- + y_pred : ndarray of shape (n_samples,) + The predicted classes. + """ + check_is_fitted(self) + if np.isclose(self.class_prior_, 1 / len(self.classes_)).all(): + # `validate_data` is called here since we are not calling `super()` + ensure_all_finite = ( + "allow-nan" if get_tags(self).input_tags.allow_nan else True + ) + X = validate_data( + self, + X, + ensure_all_finite=ensure_all_finite, + accept_sparse="csr", + reset=False, + ) + return self.classes_[ + pairwise_distances_argmin(X, self.centroids_, metric=self.metric) + ] + else: + return super().predict(X) + + def _decision_function(self, X): + # return discriminant scores, see eq. (18.2) p. 652 of the ESL. + check_is_fitted(self, "centroids_") + + X_normalized = validate_data( + self, X, copy=True, reset=False, accept_sparse="csr", dtype=np.float64 + ) + + discriminant_score = np.empty( + (X_normalized.shape[0], self.classes_.size), dtype=np.float64 + ) + + mask = self.within_class_std_dev_ != 0 + X_normalized[:, mask] /= self.within_class_std_dev_[mask] + centroids_normalized = self.centroids_.copy() + centroids_normalized[:, mask] /= self.within_class_std_dev_[mask] + + for class_idx in range(self.classes_.size): + distances = pairwise_distances( + X_normalized, centroids_normalized[[class_idx]], metric=self.metric + ).ravel() + distances **= 2 + discriminant_score[:, class_idx] = np.squeeze( + -distances + 2.0 * np.log(self.class_prior_[class_idx]) + ) + + return discriminant_score + + def _check_euclidean_metric(self): + return self.metric == "euclidean" + + decision_function = available_if(_check_euclidean_metric)( + DiscriminantAnalysisPredictionMixin.decision_function + ) + + predict_proba = available_if(_check_euclidean_metric)( + DiscriminantAnalysisPredictionMixin.predict_proba + ) + + predict_log_proba = available_if(_check_euclidean_metric)( + DiscriminantAnalysisPredictionMixin.predict_log_proba + ) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = self.metric == "nan_euclidean" + tags.input_tags.sparse = True + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/_partition_nodes.cpython-312-x86_64-linux-gnu.so b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_partition_nodes.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..fae6a33eb2cb132a68f87bfab8e7b3803fd61f70 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_partition_nodes.cpython-312-x86_64-linux-gnu.so differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/_partition_nodes.pxd 
b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_partition_nodes.pxd new file mode 100644 index 0000000000000000000000000000000000000000..bd2160cc3b26f4eaf0821735aeb278fd3a16eb15 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_partition_nodes.pxd @@ -0,0 +1,10 @@ +from cython cimport floating +from ..utils._typedefs cimport float64_t, intp_t + +cdef int partition_node_indices( + const floating *data, + intp_t *node_indices, + intp_t split_dim, + intp_t split_index, + intp_t n_features, + intp_t n_points) except -1 diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/_partition_nodes.pyx b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_partition_nodes.pyx new file mode 100644 index 0000000000000000000000000000000000000000..111353c49a22becb74cf2d3d609241d208784508 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_partition_nodes.pyx @@ -0,0 +1,122 @@ +# BinaryTrees rely on partial sorts to partition their nodes during their +# initialisation. +# +# The C++ std library exposes nth_element, an efficient partial sort for this +# situation which has a linear time complexity as well as the best performances. +# +# To use std::algorithm::nth_element, a few fixture are defined using Cython: +# - partition_node_indices, a Cython function used in BinaryTrees, that calls +# - partition_node_indices_inner, a C++ function that wraps nth_element and uses +# - an IndexComparator to state how to compare KDTrees' indices +# +# IndexComparator has been defined so that partial sorts are stable with +# respect to the nodes initial indices. +# +# See for reference: +# - https://en.cppreference.com/w/cpp/algorithm/nth_element. +# - https://github.com/scikit-learn/scikit-learn/pull/11103 +# - https://github.com/scikit-learn/scikit-learn/pull/19473 +from cython cimport floating + + +cdef extern from *: + """ + #include + + template + class IndexComparator { + private: + const D *data; + I split_dim, n_features; + public: + IndexComparator(const D *data, const I &split_dim, const I &n_features): + data(data), split_dim(split_dim), n_features(n_features) {} + + bool operator()(const I &a, const I &b) const { + D a_value = data[a * n_features + split_dim]; + D b_value = data[b * n_features + split_dim]; + return a_value == b_value ? a < b : a_value < b_value; + } + }; + + template + void partition_node_indices_inner( + const D *data, + I *node_indices, + const I &split_dim, + const I &split_index, + const I &n_features, + const I &n_points) { + IndexComparator index_comparator(data, split_dim, n_features); + std::nth_element( + node_indices, + node_indices + split_index, + node_indices + n_points, + index_comparator); + } + """ + void partition_node_indices_inner[D, I]( + const D *data, + I *node_indices, + I split_dim, + I split_index, + I n_features, + I n_points) except + + + +cdef int partition_node_indices( + const floating *data, + intp_t *node_indices, + intp_t split_dim, + intp_t split_index, + intp_t n_features, + intp_t n_points) except -1: + """Partition points in the node into two equal-sized groups. + + Upon return, the values in node_indices will be rearranged such that + (assuming numpy-style indexing): + + data[node_indices[0:split_index], split_dim] + <= data[node_indices[split_index], split_dim] + + and + + data[node_indices[split_index], split_dim] + <= data[node_indices[split_index:n_points], split_dim] + + The algorithm is essentially a partial in-place quicksort around a + set pivot. 
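# Illustrative NumPy analogue (not part of the vendored file): the same
# post-condition that partition_node_indices establishes can be reproduced
# with np.argpartition along the split dimension -- indices left of
# split_index point at values <= the pivot, indices to the right at values >=.
import numpy as np

rng = np.random.default_rng(0)
data = rng.normal(size=(10, 3))
node_indices = np.arange(10)
split_dim, split_index = 1, 4

order = np.argpartition(data[node_indices, split_dim], split_index)
node_indices = node_indices[order]

pivot = data[node_indices[split_index], split_dim]
assert np.all(data[node_indices[:split_index], split_dim] <= pivot)
assert np.all(data[node_indices[split_index:], split_dim] >= pivot)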
+ + Parameters + ---------- + data : double pointer + Pointer to a 2D array of the training data, of shape [N, n_features]. + N must be greater than any of the values in node_indices. + node_indices : int pointer + Pointer to a 1D array of length n_points. This lists the indices of + each of the points within the current node. This will be modified + in-place. + split_dim : int + the dimension on which to split. This will usually be computed via + the routine ``find_node_split_dim``. + split_index : int + the index within node_indices around which to split the points. + n_features: int + the number of features (i.e columns) in the 2D array pointed by data. + n_points : int + the length of node_indices. This is also the number of points in + the original dataset. + Returns + ------- + status : int + integer exit status. On return, the contents of node_indices are + modified as noted above. + """ + partition_node_indices_inner( + data, + node_indices, + split_dim, + split_index, + n_features, + n_points) + return 0 diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/_quad_tree.pxd b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_quad_tree.pxd new file mode 100644 index 0000000000000000000000000000000000000000..e7e817902f103fe6e42f37516e56ad273884c507 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_quad_tree.pxd @@ -0,0 +1,92 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +# See quad_tree.pyx for details. + +cimport numpy as cnp +from ..utils._typedefs cimport float32_t, intp_t + +# This is effectively an ifdef statement in Cython +# It allows us to write printf debugging lines +# and remove them at compile time +cdef enum: + DEBUGFLAG = 0 + +cdef float EPSILON = 1e-6 + +# XXX: Careful to not change the order of the arguments. It is important to +# have is_leaf and max_width consecutive as it permits to avoid padding by +# the compiler and keep the size coherent for both C and numpy data structures. +cdef struct Cell: + # Base storage structure for cells in a QuadTree object + + # Tree structure + intp_t parent # Parent cell of this cell + intp_t[8] children # Array pointing to children of this cell + + # Cell description + intp_t cell_id # Id of the cell in the cells array in the Tree + intp_t point_index # Index of the point at this cell (only defined + # # in non empty leaf) + bint is_leaf # Does this cell have children? + float32_t squared_max_width # Squared value of the maximum width w + intp_t depth # Depth of the cell in the tree + intp_t cumulative_size # Number of points included in the subtree with + # # this cell as a root. + + # Internal constants + float32_t[3] center # Store the center for quick split of cells + float32_t[3] barycenter # Keep track of the center of mass of the cell + + # Cell boundaries + float32_t[3] min_bounds # Inferior boundaries of this cell (inclusive) + float32_t[3] max_bounds # Superior boundaries of this cell (exclusive) + + +cdef class _QuadTree: + # The QuadTree object is a quad tree structure constructed by inserting + # recursively points in the tree and splitting cells in 4 so that each + # leaf cell contains at most one point. + # This structure also handle 3D data, inserted in trees with 8 children + # for each node. + + # Parameters of the tree + cdef public int n_dimensions # Number of dimensions in X + cdef public int verbose # Verbosity of the output + cdef intp_t n_cells_per_cell # Number of children per node. 
(2 ** n_dimension) + + # Tree inner structure + cdef public intp_t max_depth # Max depth of the tree + cdef public intp_t cell_count # Counter for node IDs + cdef public intp_t capacity # Capacity of tree, in terms of nodes + cdef public intp_t n_points # Total number of points + cdef Cell* cells # Array of nodes + + # Point insertion methods + cdef int insert_point(self, float32_t[3] point, intp_t point_index, + intp_t cell_id=*) except -1 nogil + cdef intp_t _insert_point_in_new_child(self, float32_t[3] point, Cell* cell, + intp_t point_index, intp_t size=* + ) noexcept nogil + cdef intp_t _select_child(self, float32_t[3] point, Cell* cell) noexcept nogil + cdef bint _is_duplicate(self, float32_t[3] point1, float32_t[3] point2) noexcept nogil + + # Create a summary of the Tree compare to a query point + cdef long summarize(self, float32_t[3] point, float32_t* results, + float squared_theta=*, intp_t cell_id=*, long idx=* + ) noexcept nogil + + # Internal cell initialization methods + cdef void _init_cell(self, Cell* cell, intp_t parent, intp_t depth) noexcept nogil + cdef void _init_root(self, float32_t[3] min_bounds, float32_t[3] max_bounds + ) noexcept nogil + + # Private methods + cdef int _check_point_in_cell(self, float32_t[3] point, Cell* cell + ) except -1 nogil + + # Private array manipulation to manage the ``cells`` array + cdef int _resize(self, intp_t capacity) except -1 nogil + cdef int _resize_c(self, intp_t capacity=*) except -1 nogil + cdef int _get_cell(self, float32_t[3] point, intp_t cell_id=*) except -1 nogil + cdef Cell[:] _get_cell_ndarray(self) diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/_quad_tree.pyx b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_quad_tree.pyx new file mode 100644 index 0000000000000000000000000000000000000000..aec79da505f52b9620568b3dd7c329a144259a76 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_quad_tree.pyx @@ -0,0 +1,609 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + + +from cpython cimport Py_INCREF, PyObject, PyTypeObject + +from libc.math cimport fabsf +from libc.stdlib cimport free +from libc.string cimport memcpy +from libc.stdio cimport printf +from libc.stdint cimport SIZE_MAX + +from ..tree._utils cimport safe_realloc + +import numpy as np +cimport numpy as cnp +cnp.import_array() + +cdef extern from "numpy/arrayobject.h": + object PyArray_NewFromDescr(PyTypeObject* subtype, cnp.dtype descr, + int nd, cnp.npy_intp* dims, + cnp.npy_intp* strides, + void* data, int flags, object obj) + int PyArray_SetBaseObject(cnp.ndarray arr, PyObject* obj) + +# Build the corresponding numpy dtype for Cell. +# This works by casting `dummy` to an array of Cell of length 1, which numpy +# can construct a `dtype`-object for. See https://stackoverflow.com/q/62448946 +# for a more detailed explanation. +cdef Cell dummy +CELL_DTYPE = np.asarray((&dummy)).dtype + +assert CELL_DTYPE.itemsize == sizeof(Cell) + + +cdef class _QuadTree: + """Array-based representation of a QuadTree. + + This class is currently working for indexing 2D data (regular QuadTree) and + for indexing 3D data (OcTree). It is planned to split the 2 implementations + using `Cython.Tempita` to save some memory for QuadTree. + + Note that this code is currently internally used only by the Barnes-Hut + method in `sklearn.manifold.TSNE`. It is planned to be refactored and + generalized in the future to be compatible with nearest neighbors API of + `sklearn.neighbors` with 2D and 3D data. 
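# Illustrative sketch (not part of the vendored file): the 2**n_dimensions
# child-indexing scheme used by _select_child and _insert_point_in_new_child
# below -- one bit per dimension, set when the point lies at or above the
# cell center along that axis.
def child_index(point, center):
    idx = 0
    for p, c in zip(point, center):
        idx = 2 * idx + (1 if p >= c else 0)
    return idx

# 2D cell centred at the origin: four quadrants map to indices 0..3
print(child_index((-1.0, -1.0), (0.0, 0.0)))   # 0
print(child_index((-1.0,  1.0), (0.0, 0.0)))   # 1
print(child_index(( 1.0,  1.0), (0.0, 0.0)))   # 3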
+ """ + def __cinit__(self, int n_dimensions, int verbose): + """Constructor.""" + # Parameters of the tree + self.n_dimensions = n_dimensions + self.verbose = verbose + self.n_cells_per_cell = (2 ** self.n_dimensions) + + # Inner structures + self.max_depth = 0 + self.cell_count = 0 + self.capacity = 0 + self.n_points = 0 + self.cells = NULL + + def __dealloc__(self): + """Destructor.""" + # Free all inner structures + free(self.cells) + + @property + def cumulative_size(self): + cdef Cell[:] cell_mem_view = self._get_cell_ndarray() + return cell_mem_view.base['cumulative_size'][:self.cell_count] + + @property + def leafs(self): + cdef Cell[:] cell_mem_view = self._get_cell_ndarray() + return cell_mem_view.base['is_leaf'][:self.cell_count] + + def build_tree(self, X): + """Build a tree from an array of points X.""" + cdef: + int i + float32_t[3] pt + float32_t[3] min_bounds, max_bounds + + # validate X and prepare for query + # X = check_array(X, dtype=float32_t, order='C') + n_samples = X.shape[0] + + capacity = 100 + self._resize(capacity) + m = np.min(X, axis=0) + M = np.max(X, axis=0) + # Scale the maximum to get all points strictly in the tree bounding box + # The 3 bounds are for positive, negative and small values + M = np.maximum(M * (1. + 1e-3 * np.sign(M)), M + 1e-3) + for i in range(self.n_dimensions): + min_bounds[i] = m[i] + max_bounds[i] = M[i] + + if self.verbose > 10: + printf("[QuadTree] bounding box axis %i : [%f, %f]\n", + i, min_bounds[i], max_bounds[i]) + + # Create the initial node with boundaries from the dataset + self._init_root(min_bounds, max_bounds) + + for i in range(n_samples): + for j in range(self.n_dimensions): + pt[j] = X[i, j] + self.insert_point(pt, i) + + # Shrink the cells array to reduce memory usage + self._resize(capacity=self.cell_count) + + cdef int insert_point(self, float32_t[3] point, intp_t point_index, + intp_t cell_id=0) except -1 nogil: + """Insert a point in the QuadTree.""" + cdef int ax + cdef intp_t selected_child + cdef Cell* cell = &self.cells[cell_id] + cdef intp_t n_point = cell.cumulative_size + + if self.verbose > 10: + printf("[QuadTree] Inserting depth %li\n", cell.depth) + + # Assert that the point is in the right range + if DEBUGFLAG: + self._check_point_in_cell(point, cell) + + # If the cell is an empty leaf, insert the point in it + if cell.cumulative_size == 0: + cell.cumulative_size = 1 + self.n_points += 1 + for i in range(self.n_dimensions): + cell.barycenter[i] = point[i] + cell.point_index = point_index + if self.verbose > 10: + printf("[QuadTree] inserted point %li in cell %li\n", + point_index, cell_id) + return cell_id + + # If the cell is not a leaf, update cell internals and + # recurse in selected child + if not cell.is_leaf: + for ax in range(self.n_dimensions): + # barycenter update using a weighted mean + cell.barycenter[ax] = ( + n_point * cell.barycenter[ax] + point[ax]) / (n_point + 1) + + # Increase the size of the subtree starting from this cell + cell.cumulative_size += 1 + + # Insert child in the correct subtree + selected_child = self._select_child(point, cell) + if self.verbose > 49: + printf("[QuadTree] selected child %li\n", selected_child) + if selected_child == -1: + self.n_points += 1 + return self._insert_point_in_new_child(point, cell, point_index) + return self.insert_point(point, point_index, selected_child) + + # Finally, if the cell is a leaf with a point already inserted, + # split the cell in n_cells_per_cell if the point is not a duplicate. 
+ # If it is a duplicate, increase the size of the leaf and return. + if self._is_duplicate(point, cell.barycenter): + if self.verbose > 10: + printf("[QuadTree] found a duplicate!\n") + cell.cumulative_size += 1 + self.n_points += 1 + return cell_id + + # In a leaf, the barycenter correspond to the only point included + # in it. + self._insert_point_in_new_child(cell.barycenter, cell, cell.point_index, + cell.cumulative_size) + return self.insert_point(point, point_index, cell_id) + + # XXX: This operation is not Thread safe + cdef intp_t _insert_point_in_new_child( + self, float32_t[3] point, Cell* cell, intp_t point_index, intp_t size=1 + ) noexcept nogil: + """Create a child of cell which will contain point.""" + + # Local variable definition + cdef: + intp_t cell_id, cell_child_id, parent_id + float32_t[3] save_point + float32_t width + Cell* child + int i + + # If the maximal capacity of the Tree have been reached, double the capacity + # We need to save the current cell id and the current point to retrieve them + # in case the reallocation + if self.cell_count + 1 > self.capacity: + parent_id = cell.cell_id + for i in range(self.n_dimensions): + save_point[i] = point[i] + self._resize(SIZE_MAX) + cell = &self.cells[parent_id] + point = save_point + + # Get an empty cell and initialize it + cell_id = self.cell_count + self.cell_count += 1 + child = &self.cells[cell_id] + + self._init_cell(child, cell.cell_id, cell.depth + 1) + child.cell_id = cell_id + + # Set the cell as an inner cell of the Tree + cell.is_leaf = False + cell.point_index = -1 + + # Set the correct boundary for the cell, store the point in the cell + # and compute its index in the children array. + cell_child_id = 0 + for i in range(self.n_dimensions): + cell_child_id *= 2 + if point[i] >= cell.center[i]: + cell_child_id += 1 + child.min_bounds[i] = cell.center[i] + child.max_bounds[i] = cell.max_bounds[i] + else: + child.min_bounds[i] = cell.min_bounds[i] + child.max_bounds[i] = cell.center[i] + child.center[i] = (child.min_bounds[i] + child.max_bounds[i]) / 2. + width = child.max_bounds[i] - child.min_bounds[i] + + child.barycenter[i] = point[i] + child.squared_max_width = max(child.squared_max_width, width*width) + + # Store the point info and the size to account for duplicated points + child.point_index = point_index + child.cumulative_size = size + + # Store the child cell in the correct place in children + cell.children[cell_child_id] = child.cell_id + + if DEBUGFLAG: + # Assert that the point is in the right range + self._check_point_in_cell(point, child) + if self.verbose > 10: + printf("[QuadTree] inserted point %li in new child %li\n", + point_index, cell_id) + + return cell_id + + cdef bint _is_duplicate(self, float32_t[3] point1, float32_t[3] point2) noexcept nogil: + """Check if the two given points are equals.""" + cdef int i + cdef bint res = True + for i in range(self.n_dimensions): + # Use EPSILON to avoid numerical error that would overgrow the tree + res &= fabsf(point1[i] - point2[i]) <= EPSILON + return res + + cdef intp_t _select_child(self, float32_t[3] point, Cell* cell) noexcept nogil: + """Select the child of cell which contains the given query point.""" + cdef: + int i + intp_t selected_child = 0 + + for i in range(self.n_dimensions): + # Select the correct child cell to insert the point by comparing + # it to the borders of the cells using precomputed center. 
+ selected_child *= 2 + if point[i] >= cell.center[i]: + selected_child += 1 + return cell.children[selected_child] + + cdef void _init_cell(self, Cell* cell, intp_t parent, intp_t depth) noexcept nogil: + """Initialize a cell structure with some constants.""" + cell.parent = parent + cell.is_leaf = True + cell.depth = depth + cell.squared_max_width = 0 + cell.cumulative_size = 0 + for i in range(self.n_cells_per_cell): + cell.children[i] = SIZE_MAX + + cdef void _init_root(self, float32_t[3] min_bounds, float32_t[3] max_bounds + ) noexcept nogil: + """Initialize the root node with the given space boundaries""" + cdef: + int i + float32_t width + Cell* root = &self.cells[0] + + self._init_cell(root, -1, 0) + for i in range(self.n_dimensions): + root.min_bounds[i] = min_bounds[i] + root.max_bounds[i] = max_bounds[i] + root.center[i] = (max_bounds[i] + min_bounds[i]) / 2. + width = max_bounds[i] - min_bounds[i] + root.squared_max_width = max(root.squared_max_width, width*width) + root.cell_id = 0 + + self.cell_count += 1 + + cdef int _check_point_in_cell(self, float32_t[3] point, Cell* cell + ) except -1 nogil: + """Check that the given point is in the cell boundaries.""" + + if self.verbose >= 50: + if self.n_dimensions == 3: + printf("[QuadTree] Checking point (%f, %f, %f) in cell %li " + "([%f/%f, %f/%f, %f/%f], size %li)\n", + point[0], point[1], point[2], cell.cell_id, + cell.min_bounds[0], cell.max_bounds[0], cell.min_bounds[1], + cell.max_bounds[1], cell.min_bounds[2], cell.max_bounds[2], + cell.cumulative_size) + else: + printf("[QuadTree] Checking point (%f, %f) in cell %li " + "([%f/%f, %f/%f], size %li)\n", + point[0], point[1], cell.cell_id, cell.min_bounds[0], + cell.max_bounds[0], cell.min_bounds[1], + cell.max_bounds[1], cell.cumulative_size) + + for i in range(self.n_dimensions): + if (cell.min_bounds[i] > point[i] or + cell.max_bounds[i] <= point[i]): + with gil: + msg = "[QuadTree] InsertionError: point out of cell " + msg += "boundary.\nAxis %li: cell [%f, %f]; point %f\n" + + msg %= i, cell.min_bounds[i], cell.max_bounds[i], point[i] + raise ValueError(msg) + + def _check_coherence(self): + """Check the coherence of the cells of the tree. + + Check that the info stored in each cell is compatible with the info + stored in descendent and sibling cells. Raise a ValueError if this + fails. + """ + for cell in self.cells[:self.cell_count]: + # Check that the barycenter of inserted point is within the cell + # boundaries + self._check_point_in_cell(cell.barycenter, &cell) + + if not cell.is_leaf: + # Compute the number of point in children and compare with + # its cummulative_size. + n_points = 0 + for idx in range(self.n_cells_per_cell): + child_id = cell.children[idx] + if child_id != -1: + child = self.cells[child_id] + n_points += child.cumulative_size + assert child.cell_id == child_id, ( + "Cell id not correctly initialized.") + if n_points != cell.cumulative_size: + raise ValueError( + "Cell {} is incoherent. Size={} but found {} points " + "in children. ({})" + .format(cell.cell_id, cell.cumulative_size, + n_points, cell.children)) + + # Make sure that the number of point in the tree correspond to the + # cumulative size in root cell. + if self.n_points != self.cells[0].cumulative_size: + raise ValueError( + "QuadTree is incoherent. Size={} but found {} points " + "in children." 
+ .format(self.n_points, self.cells[0].cumulative_size)) + + cdef long summarize(self, float32_t[3] point, float32_t* results, + float squared_theta=.5, intp_t cell_id=0, long idx=0 + ) noexcept nogil: + """Summarize the tree compared to a query point. + + Input arguments + --------------- + point : array (n_dimensions) + query point to construct the summary. + cell_id : integer, optional (default: 0) + current cell of the tree summarized. This should be set to 0 for + external calls. + idx : integer, optional (default: 0) + current index in the result array. This should be set to 0 for + external calls + squared_theta: float, optional (default: .5) + threshold to decide whether the node is sufficiently far + from the query point to be a good summary. The formula is such that + the node is a summary if + node_width^2 / dist_node_point^2 < squared_theta. + Note that the argument should be passed as theta^2 to avoid + computing square roots of the distances. + + Output arguments + ---------------- + results : array (n_samples * (n_dimensions+2)) + result will contain a summary of the tree information compared to + the query point: + - results[idx:idx+n_dimensions] contains the coordinate-wise + difference between the query point and the summary cell idx. + This is useful in t-SNE to compute the negative forces. + - result[idx+n_dimensions+1] contains the squared euclidean + distance to the summary cell idx. + - result[idx+n_dimensions+2] contains the number of point of the + tree contained in the summary cell idx. + + Return + ------ + idx : integer + number of elements in the results array. + """ + cdef: + int i, idx_d = idx + self.n_dimensions + bint duplicate = True + Cell* cell = &self.cells[cell_id] + + results[idx_d] = 0. + for i in range(self.n_dimensions): + results[idx + i] = point[i] - cell.barycenter[i] + results[idx_d] += results[idx + i] * results[idx + i] + duplicate &= fabsf(results[idx + i]) <= EPSILON + + # Do not compute self interactions + if duplicate and cell.is_leaf: + return idx + + # Check whether we can use this node as a summary + # It's a summary node if the angular size as measured from the point + # is relatively small (w.r.t. theta) or if it is a leaf node. + # If it can be summarized, we use the cell center of mass + # Otherwise, we go a higher level of resolution and into the leaves. + if cell.is_leaf or ( + (cell.squared_max_width / results[idx_d]) < squared_theta): + results[idx_d + 1] = cell.cumulative_size + return idx + self.n_dimensions + 2 + + else: + # Recursively compute the summary in nodes + for c in range(self.n_cells_per_cell): + child_id = cell.children[c] + if child_id != -1: + idx = self.summarize(point, results, squared_theta, + child_id, idx) + + return idx + + def get_cell(self, point): + """return the id of the cell containing the query point or raise + ValueError if the point is not in the tree + """ + cdef float32_t[3] query_pt + cdef int i + + assert len(point) == self.n_dimensions, ( + "Query point should be a point in dimension {}." + .format(self.n_dimensions)) + + for i in range(self.n_dimensions): + query_pt[i] = point[i] + + return self._get_cell(query_pt, 0) + + cdef int _get_cell(self, float32_t[3] point, intp_t cell_id=0 + ) except -1 nogil: + """guts of get_cell. 
+ + Return the id of the cell containing the query point or raise ValueError + if the point is not in the tree""" + cdef: + intp_t selected_child + Cell* cell = &self.cells[cell_id] + + if cell.is_leaf: + if self._is_duplicate(cell.barycenter, point): + if self.verbose > 99: + printf("[QuadTree] Found point in cell: %li\n", + cell.cell_id) + return cell_id + with gil: + raise ValueError("Query point not in the Tree.") + + selected_child = self._select_child(point, cell) + if selected_child > 0: + if self.verbose > 99: + printf("[QuadTree] Selected_child: %li\n", selected_child) + return self._get_cell(point, selected_child) + with gil: + raise ValueError("Query point not in the Tree.") + + # Pickling primitives + + def __reduce__(self): + """Reduce re-implementation, for pickling.""" + return (_QuadTree, (self.n_dimensions, self.verbose), self.__getstate__()) + + def __getstate__(self): + """Getstate re-implementation, for pickling.""" + d = {} + # capacity is inferred during the __setstate__ using nodes + d["max_depth"] = self.max_depth + d["cell_count"] = self.cell_count + d["capacity"] = self.capacity + d["n_points"] = self.n_points + d["cells"] = self._get_cell_ndarray().base + return d + + def __setstate__(self, d): + """Setstate re-implementation, for unpickling.""" + self.max_depth = d["max_depth"] + self.cell_count = d["cell_count"] + self.capacity = d["capacity"] + self.n_points = d["n_points"] + + if 'cells' not in d: + raise ValueError('You have loaded Tree version which ' + 'cannot be imported') + + cell_ndarray = d['cells'] + + if (cell_ndarray.ndim != 1 or + cell_ndarray.dtype != CELL_DTYPE or + not cell_ndarray.flags.c_contiguous): + raise ValueError('Did not recognise loaded array layout') + + self.capacity = cell_ndarray.shape[0] + if self._resize_c(self.capacity) != 0: + raise MemoryError("resizing tree to %d" % self.capacity) + + cdef Cell[:] cell_mem_view = cell_ndarray + memcpy( + pto=self.cells, + pfrom=&cell_mem_view[0], + size=self.capacity * sizeof(Cell), + ) + + # Array manipulation methods, to convert it to numpy or to resize + # self.cells array + + cdef Cell[:] _get_cell_ndarray(self): + """Wraps nodes as a NumPy struct array. + + The array keeps a reference to this Tree, which manages the underlying + memory. Individual fields are publicly accessible as properties of the + Tree. + """ + cdef cnp.npy_intp shape[1] + shape[0] = self.cell_count + cdef cnp.npy_intp strides[1] + strides[0] = sizeof(Cell) + cdef Cell[:] arr + Py_INCREF(CELL_DTYPE) + arr = PyArray_NewFromDescr( + subtype= np.ndarray, + descr=CELL_DTYPE, + nd=1, + dims=shape, + strides=strides, + data= self.cells, + flags=cnp.NPY_ARRAY_DEFAULT, + obj=None, + ) + Py_INCREF(self) + if PyArray_SetBaseObject(arr.base, self) < 0: + raise ValueError("Can't initialize array!") + return arr + + cdef int _resize(self, intp_t capacity) except -1 nogil: + """Resize all inner arrays to `capacity`, if `capacity` == -1, then + double the size of the inner arrays. + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + """ + if self._resize_c(capacity) != 0: + # Acquire gil only if we need to raise + with gil: + raise MemoryError() + + cdef int _resize_c(self, intp_t capacity=SIZE_MAX) except -1 nogil: + """Guts of _resize + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. 
+ """ + if capacity == self.capacity and self.cells != NULL: + return 0 + + if capacity == SIZE_MAX: + if self.capacity == 0: + capacity = 9 # default initial value to min + else: + capacity = 2 * self.capacity + + safe_realloc(&self.cells, capacity) + + # if capacity smaller than cell_count, adjust the counter + if capacity < self.cell_count: + self.cell_count = capacity + + self.capacity = capacity + return 0 + + def _py_summarize(self, float32_t[:] query_pt, float32_t[:, :] X, float angle): + # Used for testing summarize + cdef: + float32_t[:] summary + int n_samples + + n_samples = X.shape[0] + summary = np.empty(4 * n_samples, dtype=np.float32) + + idx = self.summarize(&query_pt[0], &summary[0], angle * angle) + return idx, summary diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/_regression.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_regression.py new file mode 100644 index 0000000000000000000000000000000000000000..0ee0a340b8153b632fb8174785d53d018545f8ce --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_regression.py @@ -0,0 +1,513 @@ +"""Nearest Neighbor Regression.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings + +import numpy as np + +from ..base import RegressorMixin, _fit_context +from ..metrics import DistanceMetric +from ..utils._param_validation import StrOptions +from ._base import KNeighborsMixin, NeighborsBase, RadiusNeighborsMixin, _get_weights + + +class KNeighborsRegressor(KNeighborsMixin, RegressorMixin, NeighborsBase): + """Regression based on k-nearest neighbors. + + The target is predicted by local interpolation of the targets + associated of the nearest neighbors in the training set. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.9 + + Parameters + ---------- + n_neighbors : int, default=5 + Number of neighbors to use by default for :meth:`kneighbors` queries. + + weights : {'uniform', 'distance'}, callable or None, default='uniform' + Weight function used in prediction. Possible values: + + - 'uniform' : uniform weights. All points in each neighborhood + are weighted equally. + - 'distance' : weight points by the inverse of their distance. + in this case, closer neighbors of a query point will have a + greater influence than neighbors which are further away. + - [callable] : a user-defined function which accepts an + array of distances, and returns an array of the same shape + containing the weights. + + Uniform weights are used by default. + + See the following example for a demonstration of the impact of + different weighting schemes on predictions: + :ref:`sphx_glr_auto_examples_neighbors_plot_regression.py`. + + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' + Algorithm used to compute the nearest neighbors: + + - 'ball_tree' will use :class:`BallTree` + - 'kd_tree' will use :class:`KDTree` + - 'brute' will use a brute-force search. + - 'auto' will attempt to decide the most appropriate algorithm + based on the values passed to :meth:`fit` method. + + Note: fitting on sparse input will override the setting of + this parameter, using brute force. + + leaf_size : int, default=30 + Leaf size passed to BallTree or KDTree. This can affect the + speed of the construction and query, as well as the memory + required to store the tree. The optimal value depends on the + nature of the problem. + + p : float, default=2 + Power parameter for the Minkowski metric. 
When p = 1, this is + equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + + metric : str, DistanceMetric object or callable, default='minkowski' + Metric to use for distance computation. Default is "minkowski", which + results in the standard Euclidean distance when p = 2. See the + documentation of `scipy.spatial.distance + `_ and + the metrics listed in + :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric + values. + + If metric is "precomputed", X is assumed to be a distance matrix and + must be square during fit. X may be a :term:`sparse graph`, in which + case only "nonzero" elements may be considered neighbors. + + If metric is a callable function, it takes two arrays representing 1D + vectors as inputs and must return one value indicating the distance + between those vectors. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. + + If metric is a DistanceMetric object, it will be passed directly to + the underlying computation routines. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + n_jobs : int, default=None + The number of parallel jobs to run for neighbors search. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + Doesn't affect :meth:`fit` method. + + Attributes + ---------- + effective_metric_ : str or callable + The distance metric to use. It will be same as the `metric` parameter + or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to + 'minkowski' and `p` parameter set to 2. + + effective_metric_params_ : dict + Additional keyword arguments for the metric function. For most metrics + will be same with `metric_params` parameter, but may also contain the + `p` parameter value if the `effective_metric_` attribute is set to + 'minkowski'. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_samples_fit_ : int + Number of samples in the fitted data. + + See Also + -------- + NearestNeighbors : Unsupervised learner for implementing neighbor searches. + RadiusNeighborsRegressor : Regression based on neighbors within a fixed radius. + KNeighborsClassifier : Classifier implementing the k-nearest neighbors vote. + RadiusNeighborsClassifier : Classifier implementing + a vote among neighbors within a given radius. + + Notes + ----- + See :ref:`Nearest Neighbors ` in the online documentation + for a discussion of the choice of ``algorithm`` and ``leaf_size``. + + .. warning:: + + Regarding the Nearest Neighbors algorithms, if it is found that two + neighbors, neighbor `k+1` and `k`, have identical distances but + different labels, the results will depend on the ordering of the + training data. + + https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm + + Examples + -------- + >>> X = [[0], [1], [2], [3]] + >>> y = [0, 0, 1, 1] + >>> from sklearn.neighbors import KNeighborsRegressor + >>> neigh = KNeighborsRegressor(n_neighbors=2) + >>> neigh.fit(X, y) + KNeighborsRegressor(...) 
+ >>> print(neigh.predict([[1.5]])) + [0.5] + """ + + _parameter_constraints: dict = { + **NeighborsBase._parameter_constraints, + "weights": [StrOptions({"uniform", "distance"}), callable, None], + } + _parameter_constraints["metric"].append(DistanceMetric) + _parameter_constraints.pop("radius") + + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + ): + super().__init__( + n_neighbors=n_neighbors, + algorithm=algorithm, + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ) + self.weights = weights + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + # For cross-validation routines to split data correctly + tags.input_tags.pairwise = self.metric == "precomputed" + return tags + + @_fit_context( + # KNeighborsRegressor.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y): + """Fit the k-nearest neighbors regressor from the training dataset. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) if metric='precomputed' + Training data. + + y : {array-like, sparse matrix} of shape (n_samples,) or \ + (n_samples, n_outputs) + Target values. + + Returns + ------- + self : KNeighborsRegressor + The fitted k-nearest neighbors regressor. + """ + return self._fit(X, y) + + def predict(self, X): + """Predict the target for the provided data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed', or None + Test samples. If `None`, predictions for all indexed points are + returned; in this case, points are not considered their own + neighbors. + + Returns + ------- + y : ndarray of shape (n_queries,) or (n_queries, n_outputs), dtype=int + Target values. + """ + if self.weights == "uniform": + # In that case, we do not need the distances to perform + # the weighting so we do not compute them. + neigh_ind = self.kneighbors(X, return_distance=False) + neigh_dist = None + else: + neigh_dist, neigh_ind = self.kneighbors(X) + + weights = _get_weights(neigh_dist, self.weights) + + _y = self._y + if _y.ndim == 1: + _y = _y.reshape((-1, 1)) + + if weights is None: + y_pred = np.mean(_y[neigh_ind], axis=1) + else: + y_pred = np.empty((neigh_dist.shape[0], _y.shape[1]), dtype=np.float64) + denom = np.sum(weights, axis=1) + + for j in range(_y.shape[1]): + num = np.sum(_y[neigh_ind, j] * weights, axis=1) + y_pred[:, j] = num / denom + + if self._y.ndim == 1: + y_pred = y_pred.ravel() + + return y_pred + + +class RadiusNeighborsRegressor(RadiusNeighborsMixin, RegressorMixin, NeighborsBase): + """Regression based on neighbors within a fixed radius. + + The target is predicted by local interpolation of the targets + associated of the nearest neighbors in the training set. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.9 + + Parameters + ---------- + radius : float, default=1.0 + Range of parameter space to use by default for :meth:`radius_neighbors` + queries. + + weights : {'uniform', 'distance'}, callable or None, default='uniform' + Weight function used in prediction. Possible values: + + - 'uniform' : uniform weights. All points in each neighborhood + are weighted equally. + - 'distance' : weight points by the inverse of their distance. 
+ in this case, closer neighbors of a query point will have a + greater influence than neighbors which are further away. + - [callable] : a user-defined function which accepts an + array of distances, and returns an array of the same shape + containing the weights. + + Uniform weights are used by default. + + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' + Algorithm used to compute the nearest neighbors: + + - 'ball_tree' will use :class:`BallTree` + - 'kd_tree' will use :class:`KDTree` + - 'brute' will use a brute-force search. + - 'auto' will attempt to decide the most appropriate algorithm + based on the values passed to :meth:`fit` method. + + Note: fitting on sparse input will override the setting of + this parameter, using brute force. + + leaf_size : int, default=30 + Leaf size passed to BallTree or KDTree. This can affect the + speed of the construction and query, as well as the memory + required to store the tree. The optimal value depends on the + nature of the problem. + + p : float, default=2 + Power parameter for the Minkowski metric. When p = 1, this is + equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + + metric : str or callable, default='minkowski' + Metric to use for distance computation. Default is "minkowski", which + results in the standard Euclidean distance when p = 2. See the + documentation of `scipy.spatial.distance + `_ and + the metrics listed in + :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric + values. + + If metric is "precomputed", X is assumed to be a distance matrix and + must be square during fit. X may be a :term:`sparse graph`, in which + case only "nonzero" elements may be considered neighbors. + + If metric is a callable function, it takes two arrays representing 1D + vectors as inputs and must return one value indicating the distance + between those vectors. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + n_jobs : int, default=None + The number of parallel jobs to run for neighbors search. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Attributes + ---------- + effective_metric_ : str or callable + The distance metric to use. It will be same as the `metric` parameter + or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to + 'minkowski' and `p` parameter set to 2. + + effective_metric_params_ : dict + Additional keyword arguments for the metric function. For most metrics + will be same with `metric_params` parameter, but may also contain the + `p` parameter value if the `effective_metric_` attribute is set to + 'minkowski'. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_samples_fit_ : int + Number of samples in the fitted data. + + See Also + -------- + NearestNeighbors : Unsupervised learner for implementing neighbor searches. + KNeighborsRegressor : Regression based on k-nearest neighbors. + KNeighborsClassifier : Classifier based on the k-nearest neighbors. 
+ RadiusNeighborsClassifier : Classifier based on neighbors within a given radius. + + Notes + ----- + See :ref:`Nearest Neighbors ` in the online documentation + for a discussion of the choice of ``algorithm`` and ``leaf_size``. + + https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm + + Examples + -------- + >>> X = [[0], [1], [2], [3]] + >>> y = [0, 0, 1, 1] + >>> from sklearn.neighbors import RadiusNeighborsRegressor + >>> neigh = RadiusNeighborsRegressor(radius=1.0) + >>> neigh.fit(X, y) + RadiusNeighborsRegressor(...) + >>> print(neigh.predict([[1.5]])) + [0.5] + """ + + _parameter_constraints: dict = { + **NeighborsBase._parameter_constraints, + "weights": [StrOptions({"uniform", "distance"}), callable, None], + } + _parameter_constraints.pop("n_neighbors") + + def __init__( + self, + radius=1.0, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + ): + super().__init__( + radius=radius, + algorithm=algorithm, + leaf_size=leaf_size, + p=p, + metric=metric, + metric_params=metric_params, + n_jobs=n_jobs, + ) + self.weights = weights + + @_fit_context( + # RadiusNeighborsRegressor.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y): + """Fit the radius neighbors regressor from the training dataset. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) if metric='precomputed' + Training data. + + y : {array-like, sparse matrix} of shape (n_samples,) or \ + (n_samples, n_outputs) + Target values. + + Returns + ------- + self : RadiusNeighborsRegressor + The fitted radius neighbors regressor. + """ + return self._fit(X, y) + + def predict(self, X): + """Predict the target for the provided data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed', or None + Test samples. If `None`, predictions for all indexed points are + returned; in this case, points are not considered their own + neighbors. + + Returns + ------- + y : ndarray of shape (n_queries,) or (n_queries, n_outputs), \ + dtype=double + Target values. + """ + neigh_dist, neigh_ind = self.radius_neighbors(X) + + weights = _get_weights(neigh_dist, self.weights) + + _y = self._y + if _y.ndim == 1: + _y = _y.reshape((-1, 1)) + + empty_obs = np.full_like(_y[0], np.nan) + + if weights is None: + y_pred = np.array( + [ + np.mean(_y[ind, :], axis=0) if len(ind) else empty_obs + for (i, ind) in enumerate(neigh_ind) + ] + ) + + else: + y_pred = np.array( + [ + ( + np.average(_y[ind, :], axis=0, weights=weights[i]) + if len(ind) + else empty_obs + ) + for (i, ind) in enumerate(neigh_ind) + ] + ) + + if np.any(np.isnan(y_pred)): + empty_warning_msg = ( + "One or more samples have no neighbors " + "within specified radius; predicting NaN." 
+ ) + warnings.warn(empty_warning_msg) + + if self._y.ndim == 1: + y_pred = y_pred.ravel() + + return y_pred diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/_unsupervised.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_unsupervised.py new file mode 100644 index 0000000000000000000000000000000000000000..8888fe18483c6ae5f7008d78b0d6ff97d096a419 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_unsupervised.py @@ -0,0 +1,179 @@ +"""Unsupervised nearest neighbors learner""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ..base import _fit_context +from ._base import KNeighborsMixin, NeighborsBase, RadiusNeighborsMixin + + +class NearestNeighbors(KNeighborsMixin, RadiusNeighborsMixin, NeighborsBase): + """Unsupervised learner for implementing neighbor searches. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.9 + + Parameters + ---------- + n_neighbors : int, default=5 + Number of neighbors to use by default for :meth:`kneighbors` queries. + + radius : float, default=1.0 + Range of parameter space to use by default for :meth:`radius_neighbors` + queries. + + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' + Algorithm used to compute the nearest neighbors: + + - 'ball_tree' will use :class:`BallTree` + - 'kd_tree' will use :class:`KDTree` + - 'brute' will use a brute-force search. + - 'auto' will attempt to decide the most appropriate algorithm + based on the values passed to :meth:`fit` method. + + Note: fitting on sparse input will override the setting of + this parameter, using brute force. + + leaf_size : int, default=30 + Leaf size passed to BallTree or KDTree. This can affect the + speed of the construction and query, as well as the memory + required to store the tree. The optimal value depends on the + nature of the problem. + + metric : str or callable, default='minkowski' + Metric to use for distance computation. Default is "minkowski", which + results in the standard Euclidean distance when p = 2. See the + documentation of `scipy.spatial.distance + `_ and + the metrics listed in + :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric + values. + + If metric is "precomputed", X is assumed to be a distance matrix and + must be square during fit. X may be a :term:`sparse graph`, in which + case only "nonzero" elements may be considered neighbors. + + If metric is a callable function, it takes two arrays representing 1D + vectors as inputs and must return one value indicating the distance + between those vectors. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. + + p : float (positive), default=2 + Parameter for the Minkowski metric from + sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is + equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + n_jobs : int, default=None + The number of parallel jobs to run for neighbors search. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Attributes + ---------- + effective_metric_ : str + Metric used to compute distances to neighbors. + + effective_metric_params_ : dict + Parameters for the metric used to compute distances to neighbors. 
+ + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_samples_fit_ : int + Number of samples in the fitted data. + + See Also + -------- + KNeighborsClassifier : Classifier implementing the k-nearest neighbors + vote. + RadiusNeighborsClassifier : Classifier implementing a vote among neighbors + within a given radius. + KNeighborsRegressor : Regression based on k-nearest neighbors. + RadiusNeighborsRegressor : Regression based on neighbors within a fixed + radius. + BallTree : Space partitioning data structure for organizing points in a + multi-dimensional space, used for nearest neighbor search. + + Notes + ----- + See :ref:`Nearest Neighbors ` in the online documentation + for a discussion of the choice of ``algorithm`` and ``leaf_size``. + + https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm + + Examples + -------- + >>> import numpy as np + >>> from sklearn.neighbors import NearestNeighbors + >>> samples = [[0, 0, 2], [1, 0, 0], [0, 0, 1]] + >>> neigh = NearestNeighbors(n_neighbors=2, radius=0.4) + >>> neigh.fit(samples) + NearestNeighbors(...) + >>> neigh.kneighbors([[0, 0, 1.3]], 2, return_distance=False) + array([[2, 0]]...) + >>> nbrs = neigh.radius_neighbors( + ... [[0, 0, 1.3]], 0.4, return_distance=False + ... ) + >>> np.asarray(nbrs[0][0]) + array(2) + """ + + def __init__( + self, + *, + n_neighbors=5, + radius=1.0, + algorithm="auto", + leaf_size=30, + metric="minkowski", + p=2, + metric_params=None, + n_jobs=None, + ): + super().__init__( + n_neighbors=n_neighbors, + radius=radius, + algorithm=algorithm, + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ) + + @_fit_context( + # NearestNeighbors.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y=None): + """Fit the nearest neighbors estimator from the training dataset. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) if metric='precomputed' + Training data. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : NearestNeighbors + The fitted nearest neighbors estimator. + """ + return self._fit(X) diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/meson.build b/.venv/lib/python3.12/site-packages/sklearn/neighbors/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..7993421896218d3a4c9db8055d2dfd9528ac3746 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/meson.build @@ -0,0 +1,53 @@ +_binary_tree_pxi = custom_target( + '_binary_tree_pxi', + output: '_binary_tree.pxi', + input: '_binary_tree.pxi.tp', + command: [tempita, '@INPUT@', '-o', '@OUTDIR@'], +) + +# .pyx is generated so this is needed to make Cython compilation work. The pxi +# file is included avoid "missing dependency paths" with ninja -t missindeps +neighbors_cython_tree = [ + fs.copyfile('__init__.py'), + fs.copyfile('_partition_nodes.pxd'), + _binary_tree_pxi, +] + +name_list = ['_ball_tree', '_kd_tree'] + +foreach name: name_list + pyx = custom_target( + name + '_pyx', + output: name + '.pyx', + input: name + '.pyx.tp', + command: [tempita, '@INPUT@', '-o', '@OUTDIR@'], + # TODO in principle this should go in py.exension_module below. 
This is + # temporary work-around for dependency issue with .pyx.tp files. For more + # details, see https://github.com/mesonbuild/meson/issues/13212 + depends: [neighbors_cython_tree, utils_cython_tree, metrics_cython_tree], + ) + py.extension_module( + name, + cython_gen.process(pyx), + dependencies: [np_dep], + subdir: 'sklearn/neighbors', + install: true +) +endforeach + +neighbors_extension_metadata = { + '_partition_nodes': + {'sources': [cython_gen_cpp.process('_partition_nodes.pyx')], + 'dependencies': [np_dep]}, + '_quad_tree': {'sources': [cython_gen.process('_quad_tree.pyx')], 'dependencies': [np_dep]}, +} + +foreach ext_name, ext_dict : neighbors_extension_metadata + py.extension_module( + ext_name, + [ext_dict.get('sources'), utils_cython_tree], + dependencies: ext_dict.get('dependencies'), + subdir: 'sklearn/neighbors', + install: true + ) +endforeach diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_ball_tree.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_ball_tree.py new file mode 100644 index 0000000000000000000000000000000000000000..5263f201f320b17ced98fb223e7aaaf624d9271d --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_ball_tree.py @@ -0,0 +1,200 @@ +import itertools + +import numpy as np +import pytest +from numpy.testing import assert_allclose, assert_array_almost_equal, assert_equal + +from sklearn.neighbors._ball_tree import BallTree, BallTree32, BallTree64 +from sklearn.utils import check_random_state +from sklearn.utils._testing import _convert_container +from sklearn.utils.validation import check_array + +rng = np.random.RandomState(10) +V_mahalanobis = rng.rand(3, 3) +V_mahalanobis = np.dot(V_mahalanobis, V_mahalanobis.T) + +DIMENSION = 3 + +METRICS = { + "euclidean": {}, + "manhattan": {}, + "minkowski": dict(p=3), + "chebyshev": {}, +} + +DISCRETE_METRICS = ["hamming", "canberra", "braycurtis"] + +BOOLEAN_METRICS = [ + "jaccard", + "dice", + "rogerstanimoto", + "russellrao", + "sokalmichener", + "sokalsneath", +] + +BALL_TREE_CLASSES = [ + BallTree64, + BallTree32, +] + + +def brute_force_neighbors(X, Y, k, metric, **kwargs): + from sklearn.metrics import DistanceMetric + + X, Y = check_array(X), check_array(Y) + D = DistanceMetric.get_metric(metric, **kwargs).pairwise(Y, X) + ind = np.argsort(D, axis=1)[:, :k] + dist = D[np.arange(Y.shape[0])[:, None], ind] + return dist, ind + + +def test_BallTree_is_BallTree64_subclass(): + assert issubclass(BallTree, BallTree64) + + +@pytest.mark.parametrize("metric", itertools.chain(BOOLEAN_METRICS, DISCRETE_METRICS)) +@pytest.mark.parametrize("array_type", ["list", "array"]) +@pytest.mark.parametrize("BallTreeImplementation", BALL_TREE_CLASSES) +def test_ball_tree_query_metrics(metric, array_type, BallTreeImplementation): + rng = check_random_state(0) + if metric in BOOLEAN_METRICS: + X = rng.random_sample((40, 10)).round(0) + Y = rng.random_sample((10, 10)).round(0) + elif metric in DISCRETE_METRICS: + X = (4 * rng.random_sample((40, 10))).round(0) + Y = (4 * rng.random_sample((10, 10))).round(0) + X = _convert_container(X, array_type) + Y = _convert_container(Y, array_type) + + k = 5 + + bt = BallTreeImplementation(X, leaf_size=1, metric=metric) + dist1, ind1 = bt.query(Y, 
k) + dist2, ind2 = brute_force_neighbors(X, Y, k, metric) + assert_array_almost_equal(dist1, dist2) + + +@pytest.mark.parametrize( + "BallTreeImplementation, decimal_tol", zip(BALL_TREE_CLASSES, [6, 5]) +) +def test_query_haversine(BallTreeImplementation, decimal_tol): + rng = check_random_state(0) + X = 2 * np.pi * rng.random_sample((40, 2)) + bt = BallTreeImplementation(X, leaf_size=1, metric="haversine") + dist1, ind1 = bt.query(X, k=5) + dist2, ind2 = brute_force_neighbors(X, X, k=5, metric="haversine") + + assert_array_almost_equal(dist1, dist2, decimal=decimal_tol) + assert_array_almost_equal(ind1, ind2) + + +@pytest.mark.parametrize("BallTreeImplementation", BALL_TREE_CLASSES) +def test_array_object_type(BallTreeImplementation): + """Check that we do not accept object dtype array.""" + X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object) + with pytest.raises(ValueError, match="setting an array element with a sequence"): + BallTreeImplementation(X) + + +@pytest.mark.parametrize("BallTreeImplementation", BALL_TREE_CLASSES) +def test_bad_pyfunc_metric(BallTreeImplementation): + def wrong_returned_value(x, y): + return "1" + + def one_arg_func(x): + return 1.0 # pragma: no cover + + X = np.ones((5, 2)) + msg = "Custom distance function must accept two vectors and return a float." + with pytest.raises(TypeError, match=msg): + BallTreeImplementation(X, metric=wrong_returned_value) + + msg = "takes 1 positional argument but 2 were given" + with pytest.raises(TypeError, match=msg): + BallTreeImplementation(X, metric=one_arg_func) + + +@pytest.mark.parametrize("metric", itertools.chain(METRICS, BOOLEAN_METRICS)) +def test_ball_tree_numerical_consistency(global_random_seed, metric): + # Results on float64 and float32 versions of a dataset must be + # numerically close. 
+ X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree( + random_seed=global_random_seed, features=50 + ) + + metric_params = METRICS.get(metric, {}) + bt_64 = BallTree64(X_64, leaf_size=1, metric=metric, **metric_params) + bt_32 = BallTree32(X_32, leaf_size=1, metric=metric, **metric_params) + + # Test consistency with respect to the `query` method + k = 5 + dist_64, ind_64 = bt_64.query(Y_64, k=k) + dist_32, ind_32 = bt_32.query(Y_32, k=k) + assert_allclose(dist_64, dist_32, rtol=1e-5) + assert_equal(ind_64, ind_32) + assert dist_64.dtype == np.float64 + assert dist_32.dtype == np.float32 + + # Test consistency with respect to the `query_radius` method + r = 2.38 + ind_64 = bt_64.query_radius(Y_64, r=r) + ind_32 = bt_32.query_radius(Y_32, r=r) + for _ind64, _ind32 in zip(ind_64, ind_32): + assert_equal(_ind64, _ind32) + + # Test consistency with respect to the `query_radius` method + # with return distances being true + ind_64, dist_64 = bt_64.query_radius(Y_64, r=r, return_distance=True) + ind_32, dist_32 = bt_32.query_radius(Y_32, r=r, return_distance=True) + for _ind64, _ind32, _dist_64, _dist_32 in zip(ind_64, ind_32, dist_64, dist_32): + assert_equal(_ind64, _ind32) + assert_allclose(_dist_64, _dist_32, rtol=1e-5) + assert _dist_64.dtype == np.float64 + assert _dist_32.dtype == np.float32 + + +@pytest.mark.parametrize("metric", itertools.chain(METRICS, BOOLEAN_METRICS)) +def test_kernel_density_numerical_consistency(global_random_seed, metric): + # Test consistency with respect to the `kernel_density` method + X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree(random_seed=global_random_seed) + + metric_params = METRICS.get(metric, {}) + bt_64 = BallTree64(X_64, leaf_size=1, metric=metric, **metric_params) + bt_32 = BallTree32(X_32, leaf_size=1, metric=metric, **metric_params) + + kernel = "gaussian" + h = 0.1 + density64 = bt_64.kernel_density(Y_64, h=h, kernel=kernel, breadth_first=True) + density32 = bt_32.kernel_density(Y_32, h=h, kernel=kernel, breadth_first=True) + assert_allclose(density64, density32, rtol=1e-5) + assert density64.dtype == np.float64 + assert density32.dtype == np.float32 + + +def test_two_point_correlation_numerical_consistency(global_random_seed): + # Test consistency with respect to the `two_point_correlation` method + X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree(random_seed=global_random_seed) + + bt_64 = BallTree64(X_64, leaf_size=10) + bt_32 = BallTree32(X_32, leaf_size=10) + + r = np.linspace(0, 1, 10) + + counts_64 = bt_64.two_point_correlation(Y_64, r=r, dualtree=True) + counts_32 = bt_32.two_point_correlation(Y_32, r=r, dualtree=True) + assert_allclose(counts_64, counts_32) + + +def get_dataset_for_binary_tree(random_seed, features=3): + rng = np.random.RandomState(random_seed) + _X = rng.rand(100, features) + _Y = rng.rand(5, features) + + X_64 = _X.astype(dtype=np.float64, copy=False) + Y_64 = _Y.astype(dtype=np.float64, copy=False) + + X_32 = _X.astype(dtype=np.float32, copy=False) + Y_32 = _Y.astype(dtype=np.float32, copy=False) + + return X_64, X_32, Y_64, Y_32 diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_graph.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_graph.py new file mode 100644 index 0000000000000000000000000000000000000000..fb593485d17a8155f784ef881b3868338348e1a8 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_graph.py @@ -0,0 +1,101 @@ +import numpy as np +import pytest + +from sklearn.metrics import euclidean_distances +from 
sklearn.neighbors import KNeighborsTransformer, RadiusNeighborsTransformer +from sklearn.neighbors._base import _is_sorted_by_data +from sklearn.utils._testing import assert_array_equal + + +def test_transformer_result(): + # Test the number of neighbors returned + n_neighbors = 5 + n_samples_fit = 20 + n_queries = 18 + n_features = 10 + + rng = np.random.RandomState(42) + X = rng.randn(n_samples_fit, n_features) + X2 = rng.randn(n_queries, n_features) + radius = np.percentile(euclidean_distances(X), 10) + + # with n_neighbors + for mode in ["distance", "connectivity"]: + add_one = mode == "distance" + nnt = KNeighborsTransformer(n_neighbors=n_neighbors, mode=mode) + Xt = nnt.fit_transform(X) + assert Xt.shape == (n_samples_fit, n_samples_fit) + assert Xt.data.shape == (n_samples_fit * (n_neighbors + add_one),) + assert Xt.format == "csr" + assert _is_sorted_by_data(Xt) + + X2t = nnt.transform(X2) + assert X2t.shape == (n_queries, n_samples_fit) + assert X2t.data.shape == (n_queries * (n_neighbors + add_one),) + assert X2t.format == "csr" + assert _is_sorted_by_data(X2t) + + # with radius + for mode in ["distance", "connectivity"]: + add_one = mode == "distance" + nnt = RadiusNeighborsTransformer(radius=radius, mode=mode) + Xt = nnt.fit_transform(X) + assert Xt.shape == (n_samples_fit, n_samples_fit) + assert not Xt.data.shape == (n_samples_fit * (n_neighbors + add_one),) + assert Xt.format == "csr" + assert _is_sorted_by_data(Xt) + + X2t = nnt.transform(X2) + assert X2t.shape == (n_queries, n_samples_fit) + assert not X2t.data.shape == (n_queries * (n_neighbors + add_one),) + assert X2t.format == "csr" + assert _is_sorted_by_data(X2t) + + +def _has_explicit_diagonal(X): + """Return True if the diagonal is explicitly stored""" + X = X.tocoo() + explicit = X.row[X.row == X.col] + return len(explicit) == X.shape[0] + + +def test_explicit_diagonal(): + # Test that the diagonal is explicitly stored in the sparse graph + n_neighbors = 5 + n_samples_fit, n_samples_transform, n_features = 20, 18, 10 + rng = np.random.RandomState(42) + X = rng.randn(n_samples_fit, n_features) + X2 = rng.randn(n_samples_transform, n_features) + + nnt = KNeighborsTransformer(n_neighbors=n_neighbors) + Xt = nnt.fit_transform(X) + assert _has_explicit_diagonal(Xt) + assert np.all(Xt.data.reshape(n_samples_fit, n_neighbors + 1)[:, 0] == 0) + + Xt = nnt.transform(X) + assert _has_explicit_diagonal(Xt) + assert np.all(Xt.data.reshape(n_samples_fit, n_neighbors + 1)[:, 0] == 0) + + # Using transform on new data should not always have zero diagonal + X2t = nnt.transform(X2) + assert not _has_explicit_diagonal(X2t) + + +@pytest.mark.parametrize("Klass", [KNeighborsTransformer, RadiusNeighborsTransformer]) +def test_graph_feature_names_out(Klass): + """Check `get_feature_names_out` for transformers defined in `_graph.py`.""" + + n_samples_fit = 20 + n_features = 10 + rng = np.random.RandomState(42) + X = rng.randn(n_samples_fit, n_features) + + est = Klass().fit(X) + names_out = est.get_feature_names_out() + + class_name_lower = Klass.__name__.lower() + expected_names_out = np.array( + [f"{class_name_lower}{i}" for i in range(est.n_samples_fit_)], + dtype=object, + ) + assert_array_equal(names_out, expected_names_out) diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_kd_tree.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_kd_tree.py new file mode 100644 index 0000000000000000000000000000000000000000..749601baaf66fdbf96e8396ca1df45c5bdab4a1e --- /dev/null +++ 
b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_kd_tree.py @@ -0,0 +1,100 @@ +import numpy as np +import pytest +from numpy.testing import assert_allclose, assert_equal + +from sklearn.neighbors._kd_tree import KDTree, KDTree32, KDTree64 +from sklearn.neighbors.tests.test_ball_tree import get_dataset_for_binary_tree +from sklearn.utils.parallel import Parallel, delayed + +DIMENSION = 3 + +METRICS = {"euclidean": {}, "manhattan": {}, "chebyshev": {}, "minkowski": dict(p=3)} + +KD_TREE_CLASSES = [ + KDTree64, + KDTree32, +] + + +def test_KDTree_is_KDTree64_subclass(): + assert issubclass(KDTree, KDTree64) + + +@pytest.mark.parametrize("BinarySearchTree", KD_TREE_CLASSES) +def test_array_object_type(BinarySearchTree): + """Check that we do not accept object dtype array.""" + X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object) + with pytest.raises(ValueError, match="setting an array element with a sequence"): + BinarySearchTree(X) + + +@pytest.mark.parametrize("BinarySearchTree", KD_TREE_CLASSES) +def test_kdtree_picklable_with_joblib(BinarySearchTree): + """Make sure that KDTree queries work when joblib memmaps. + + Non-regression test for #21685 and #21228.""" + rng = np.random.RandomState(0) + X = rng.random_sample((10, 3)) + tree = BinarySearchTree(X, leaf_size=2) + + # Call Parallel with max_nbytes=1 to trigger readonly memory mapping that + # use to raise "ValueError: buffer source array is read-only" in a previous + # version of the Cython code. + Parallel(n_jobs=2, max_nbytes=1)(delayed(tree.query)(data) for data in 2 * [X]) + + +@pytest.mark.parametrize("metric", METRICS) +def test_kd_tree_numerical_consistency(global_random_seed, metric): + # Results on float64 and float32 versions of a dataset must be + # numerically close. 
+ X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree( + random_seed=global_random_seed, features=50 + ) + + metric_params = METRICS.get(metric, {}) + kd_64 = KDTree64(X_64, leaf_size=2, metric=metric, **metric_params) + kd_32 = KDTree32(X_32, leaf_size=2, metric=metric, **metric_params) + + # Test consistency with respect to the `query` method + k = 4 + dist_64, ind_64 = kd_64.query(Y_64, k=k) + dist_32, ind_32 = kd_32.query(Y_32, k=k) + assert_allclose(dist_64, dist_32, rtol=1e-5) + assert_equal(ind_64, ind_32) + assert dist_64.dtype == np.float64 + assert dist_32.dtype == np.float32 + + # Test consistency with respect to the `query_radius` method + r = 2.38 + ind_64 = kd_64.query_radius(Y_64, r=r) + ind_32 = kd_32.query_radius(Y_32, r=r) + for _ind64, _ind32 in zip(ind_64, ind_32): + assert_equal(_ind64, _ind32) + + # Test consistency with respect to the `query_radius` method + # with return distances being true + ind_64, dist_64 = kd_64.query_radius(Y_64, r=r, return_distance=True) + ind_32, dist_32 = kd_32.query_radius(Y_32, r=r, return_distance=True) + for _ind64, _ind32, _dist_64, _dist_32 in zip(ind_64, ind_32, dist_64, dist_32): + assert_equal(_ind64, _ind32) + assert_allclose(_dist_64, _dist_32, rtol=1e-5) + assert _dist_64.dtype == np.float64 + assert _dist_32.dtype == np.float32 + + +@pytest.mark.parametrize("metric", METRICS) +def test_kernel_density_numerical_consistency(global_random_seed, metric): + # Test consistency with respect to the `kernel_density` method + X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree(random_seed=global_random_seed) + + metric_params = METRICS.get(metric, {}) + kd_64 = KDTree64(X_64, leaf_size=2, metric=metric, **metric_params) + kd_32 = KDTree32(X_32, leaf_size=2, metric=metric, **metric_params) + + kernel = "gaussian" + h = 0.1 + density64 = kd_64.kernel_density(Y_64, h=h, kernel=kernel, breadth_first=True) + density32 = kd_32.kernel_density(Y_32, h=h, kernel=kernel, breadth_first=True) + assert_allclose(density64, density32, rtol=1e-5) + assert density64.dtype == np.float64 + assert density32.dtype == np.float32 diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_kde.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_kde.py new file mode 100644 index 0000000000000000000000000000000000000000..b6bf09d01b672b7ad5a3abf3506443b0ac620915 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_kde.py @@ -0,0 +1,252 @@ +import joblib +import numpy as np +import pytest + +from sklearn.datasets import make_blobs +from sklearn.exceptions import NotFittedError +from sklearn.model_selection import GridSearchCV +from sklearn.neighbors import KDTree, KernelDensity, NearestNeighbors +from sklearn.neighbors._ball_tree import kernel_norm +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.utils._testing import assert_allclose + + +# XXX Duplicated in test_neighbors_tree, test_kde +def compute_kernel_slow(Y, X, kernel, h): + if h == "scott": + h = X.shape[0] ** (-1 / (X.shape[1] + 4)) + elif h == "silverman": + h = (X.shape[0] * (X.shape[1] + 2) / 4) ** (-1 / (X.shape[1] + 4)) + + d = np.sqrt(((Y[:, None, :] - X) ** 2).sum(-1)) + norm = kernel_norm(h, X.shape[1], kernel) / X.shape[0] + + if kernel == "gaussian": + return norm * np.exp(-0.5 * (d * d) / (h * h)).sum(-1) + elif kernel == "tophat": + return norm * (d < h).sum(-1) + elif kernel == "epanechnikov": + return norm * ((1.0 - (d * d) / (h * h)) * (d < h)).sum(-1) + elif kernel 
== "exponential": + return norm * (np.exp(-d / h)).sum(-1) + elif kernel == "linear": + return norm * ((1 - d / h) * (d < h)).sum(-1) + elif kernel == "cosine": + return norm * (np.cos(0.5 * np.pi * d / h) * (d < h)).sum(-1) + else: + raise ValueError("kernel not recognized") + + +def check_results(kernel, bandwidth, atol, rtol, X, Y, dens_true): + kde = KernelDensity(kernel=kernel, bandwidth=bandwidth, atol=atol, rtol=rtol) + log_dens = kde.fit(X).score_samples(Y) + assert_allclose(np.exp(log_dens), dens_true, atol=atol, rtol=max(1e-7, rtol)) + assert_allclose( + np.exp(kde.score(Y)), np.prod(dens_true), atol=atol, rtol=max(1e-7, rtol) + ) + + +@pytest.mark.parametrize( + "kernel", ["gaussian", "tophat", "epanechnikov", "exponential", "linear", "cosine"] +) +@pytest.mark.parametrize("bandwidth", [0.01, 0.1, 1, "scott", "silverman"]) +def test_kernel_density(kernel, bandwidth): + n_samples, n_features = (100, 3) + + rng = np.random.RandomState(0) + X = rng.randn(n_samples, n_features) + Y = rng.randn(n_samples, n_features) + + dens_true = compute_kernel_slow(Y, X, kernel, bandwidth) + + for rtol in [0, 1e-5]: + for atol in [1e-6, 1e-2]: + for breadth_first in (True, False): + check_results(kernel, bandwidth, atol, rtol, X, Y, dens_true) + + +def test_kernel_density_sampling(n_samples=100, n_features=3): + rng = np.random.RandomState(0) + X = rng.randn(n_samples, n_features) + + bandwidth = 0.2 + + for kernel in ["gaussian", "tophat"]: + # draw a tophat sample + kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X) + samp = kde.sample(100) + assert X.shape == samp.shape + + # check that samples are in the right range + nbrs = NearestNeighbors(n_neighbors=1).fit(X) + dist, ind = nbrs.kneighbors(X, return_distance=True) + + if kernel == "tophat": + assert np.all(dist < bandwidth) + elif kernel == "gaussian": + # 5 standard deviations is safe for 100 samples, but there's a + # very small chance this test could fail. + assert np.all(dist < 5 * bandwidth) + + # check unsupported kernels + for kernel in ["epanechnikov", "exponential", "linear", "cosine"]: + kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X) + with pytest.raises(NotImplementedError): + kde.sample(100) + + # non-regression test: used to return a scalar + X = rng.randn(4, 1) + kde = KernelDensity(kernel="gaussian").fit(X) + assert kde.sample().shape == (1, 1) + + +@pytest.mark.parametrize("algorithm", ["auto", "ball_tree", "kd_tree"]) +@pytest.mark.parametrize( + "metric", ["euclidean", "minkowski", "manhattan", "chebyshev", "haversine"] +) +def test_kde_algorithm_metric_choice(algorithm, metric): + # Smoke test for various metrics and algorithms + rng = np.random.RandomState(0) + X = rng.randn(10, 2) # 2 features required for haversine dist. 
+ Y = rng.randn(10, 2) + + kde = KernelDensity(algorithm=algorithm, metric=metric) + + if algorithm == "kd_tree" and metric not in KDTree.valid_metrics: + with pytest.raises(ValueError, match="invalid metric"): + kde.fit(X) + else: + kde.fit(X) + y_dens = kde.score_samples(Y) + assert y_dens.shape == Y.shape[:1] + + +def test_kde_score(n_samples=100, n_features=3): + pass + # FIXME + # rng = np.random.RandomState(0) + # X = rng.random_sample((n_samples, n_features)) + # Y = rng.random_sample((n_samples, n_features)) + + +def test_kde_sample_weights_error(): + kde = KernelDensity() + with pytest.raises(ValueError): + kde.fit(np.random.random((200, 10)), sample_weight=np.random.random((200, 10))) + with pytest.raises(ValueError): + kde.fit(np.random.random((200, 10)), sample_weight=-np.random.random(200)) + + +def test_kde_pipeline_gridsearch(): + # test that kde plays nice in pipelines and grid-searches + X, _ = make_blobs(cluster_std=0.1, random_state=1, centers=[[0, 1], [1, 0], [0, 0]]) + pipe1 = make_pipeline( + StandardScaler(with_mean=False, with_std=False), + KernelDensity(kernel="gaussian"), + ) + params = dict(kerneldensity__bandwidth=[0.001, 0.01, 0.1, 1, 10]) + search = GridSearchCV(pipe1, param_grid=params) + search.fit(X) + assert search.best_params_["kerneldensity__bandwidth"] == 0.1 + + +def test_kde_sample_weights(): + n_samples = 400 + size_test = 20 + weights_neutral = np.full(n_samples, 3.0) + for d in [1, 2, 10]: + rng = np.random.RandomState(0) + X = rng.rand(n_samples, d) + weights = 1 + (10 * X.sum(axis=1)).astype(np.int8) + X_repetitions = np.repeat(X, weights, axis=0) + n_samples_test = size_test // d + test_points = rng.rand(n_samples_test, d) + for algorithm in ["auto", "ball_tree", "kd_tree"]: + for metric in ["euclidean", "minkowski", "manhattan", "chebyshev"]: + if algorithm != "kd_tree" or metric in KDTree.valid_metrics: + kde = KernelDensity(algorithm=algorithm, metric=metric) + + # Test that adding a constant sample weight has no effect + kde.fit(X, sample_weight=weights_neutral) + scores_const_weight = kde.score_samples(test_points) + sample_const_weight = kde.sample(random_state=1234) + kde.fit(X) + scores_no_weight = kde.score_samples(test_points) + sample_no_weight = kde.sample(random_state=1234) + assert_allclose(scores_const_weight, scores_no_weight) + assert_allclose(sample_const_weight, sample_no_weight) + + # Test equivalence between sampling and (integer) weights + kde.fit(X, sample_weight=weights) + scores_weight = kde.score_samples(test_points) + sample_weight = kde.sample(random_state=1234) + kde.fit(X_repetitions) + scores_ref_sampling = kde.score_samples(test_points) + sample_ref_sampling = kde.sample(random_state=1234) + assert_allclose(scores_weight, scores_ref_sampling) + assert_allclose(sample_weight, sample_ref_sampling) + + # Test that sample weights has a non-trivial effect + diff = np.max(np.abs(scores_no_weight - scores_weight)) + assert diff > 0.001 + + # Test invariance with respect to arbitrary scaling + scale_factor = rng.rand() + kde.fit(X, sample_weight=(scale_factor * weights)) + scores_scaled_weight = kde.score_samples(test_points) + assert_allclose(scores_scaled_weight, scores_weight) + + +@pytest.mark.parametrize("sample_weight", [None, [0.1, 0.2, 0.3]]) +def test_pickling(tmpdir, sample_weight): + # Make sure that predictions are the same before and after pickling. Used + # to be a bug because sample_weights wasn't pickled and the resulting tree + # would miss some info. 
+ + kde = KernelDensity() + data = np.reshape([1.0, 2.0, 3.0], (-1, 1)) + kde.fit(data, sample_weight=sample_weight) + + X = np.reshape([1.1, 2.1], (-1, 1)) + scores = kde.score_samples(X) + + file_path = str(tmpdir.join("dump.pkl")) + joblib.dump(kde, file_path) + kde = joblib.load(file_path) + scores_pickled = kde.score_samples(X) + + assert_allclose(scores, scores_pickled) + + +@pytest.mark.parametrize("method", ["score_samples", "sample"]) +def test_check_is_fitted(method): + # Check that predict raises an exception in an unfitted estimator. + # Unfitted estimators should raise a NotFittedError. + rng = np.random.RandomState(0) + X = rng.randn(10, 2) + kde = KernelDensity() + + with pytest.raises(NotFittedError): + getattr(kde, method)(X) + + +@pytest.mark.parametrize("bandwidth", ["scott", "silverman", 0.1]) +def test_bandwidth(bandwidth): + n_samples, n_features = (100, 3) + rng = np.random.RandomState(0) + X = rng.randn(n_samples, n_features) + kde = KernelDensity(bandwidth=bandwidth).fit(X) + samp = kde.sample(100) + kde_sc = kde.score_samples(X) + assert X.shape == samp.shape + assert kde_sc.shape == (n_samples,) + + # Test that the attribute self.bandwidth_ has the expected value + if bandwidth == "scott": + h = X.shape[0] ** (-1 / (X.shape[1] + 4)) + elif bandwidth == "silverman": + h = (X.shape[0] * (X.shape[1] + 2) / 4) ** (-1 / (X.shape[1] + 4)) + else: + h = bandwidth + assert kde.bandwidth_ == pytest.approx(h) diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_lof.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_lof.py new file mode 100644 index 0000000000000000000000000000000000000000..140d0d9ba6dff1ba15acf54fe769cd526e832c3d --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_lof.py @@ -0,0 +1,394 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import re +from math import sqrt + +import numpy as np +import pytest + +from sklearn import metrics, neighbors +from sklearn.datasets import load_iris +from sklearn.metrics import roc_auc_score +from sklearn.utils import check_random_state +from sklearn.utils._testing import assert_allclose, assert_array_equal +from sklearn.utils.estimator_checks import ( + check_outlier_corruption, + parametrize_with_checks, +) +from sklearn.utils.fixes import CSR_CONTAINERS + +# load the iris dataset +# and randomly permute it +rng = check_random_state(0) +iris = load_iris() +perm = rng.permutation(iris.target.size) +iris.data = iris.data[perm] +iris.target = iris.target[perm] + + +def test_lof(global_dtype): + # Toy sample (the last two samples are outliers): + X = np.asarray( + [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [5, 3], [-4, 2]], + dtype=global_dtype, + ) + + # Test LocalOutlierFactor: + clf = neighbors.LocalOutlierFactor(n_neighbors=5) + score = clf.fit(X).negative_outlier_factor_ + assert_array_equal(clf._fit_X, X) + + # Assert largest outlier score is smaller than smallest inlier score: + assert np.min(score[:-2]) > np.max(score[-2:]) + + # Assert predict() works: + clf = neighbors.LocalOutlierFactor(contamination=0.25, n_neighbors=5).fit(X) + expected_predictions = 6 * [1] + 2 * [-1] + assert_array_equal(clf._predict(), expected_predictions) + assert_array_equal(clf.fit_predict(X), expected_predictions) + + +def test_lof_performance(global_dtype): + # Generate train/test data + rng = check_random_state(2) + X = 0.3 * rng.randn(120, 2).astype(global_dtype, copy=False) + X_train = X[:100] + + # Generate 
some abnormal novel observations + X_outliers = rng.uniform(low=-4, high=4, size=(20, 2)).astype( + global_dtype, copy=False + ) + X_test = np.r_[X[100:], X_outliers] + y_test = np.array([0] * 20 + [1] * 20) + + # fit the model for novelty detection + clf = neighbors.LocalOutlierFactor(novelty=True).fit(X_train) + + # predict scores (the lower, the more normal) + y_pred = -clf.decision_function(X_test) + + # check that roc_auc is good + assert roc_auc_score(y_test, y_pred) > 0.99 + + +def test_lof_values(global_dtype): + # toy samples: + X_train = np.asarray([[1, 1], [1, 2], [2, 1]], dtype=global_dtype) + clf1 = neighbors.LocalOutlierFactor( + n_neighbors=2, contamination=0.1, novelty=True + ).fit(X_train) + clf2 = neighbors.LocalOutlierFactor(n_neighbors=2, novelty=True).fit(X_train) + s_0 = 2.0 * sqrt(2.0) / (1.0 + sqrt(2.0)) + s_1 = (1.0 + sqrt(2)) * (1.0 / (4.0 * sqrt(2.0)) + 1.0 / (2.0 + 2.0 * sqrt(2))) + # check predict() + assert_allclose(-clf1.negative_outlier_factor_, [s_0, s_1, s_1]) + assert_allclose(-clf2.negative_outlier_factor_, [s_0, s_1, s_1]) + # check predict(one sample not in train) + assert_allclose(-clf1.score_samples([[2.0, 2.0]]), [s_0]) + assert_allclose(-clf2.score_samples([[2.0, 2.0]]), [s_0]) + # check predict(one sample already in train) + assert_allclose(-clf1.score_samples([[1.0, 1.0]]), [s_1]) + assert_allclose(-clf2.score_samples([[1.0, 1.0]]), [s_1]) + + +def test_lof_precomputed(global_dtype, random_state=42): + """Tests LOF with a distance matrix.""" + # Note: smaller samples may result in spurious test success + rng = np.random.RandomState(random_state) + X = rng.random_sample((10, 4)).astype(global_dtype, copy=False) + Y = rng.random_sample((3, 4)).astype(global_dtype, copy=False) + DXX = metrics.pairwise_distances(X, metric="euclidean") + DYX = metrics.pairwise_distances(Y, X, metric="euclidean") + # As a feature matrix (n_samples by n_features) + lof_X = neighbors.LocalOutlierFactor(n_neighbors=3, novelty=True) + lof_X.fit(X) + pred_X_X = lof_X._predict() + pred_X_Y = lof_X.predict(Y) + + # As a dense distance matrix (n_samples by n_samples) + lof_D = neighbors.LocalOutlierFactor( + n_neighbors=3, algorithm="brute", metric="precomputed", novelty=True + ) + lof_D.fit(DXX) + pred_D_X = lof_D._predict() + pred_D_Y = lof_D.predict(DYX) + + assert_allclose(pred_X_X, pred_D_X) + assert_allclose(pred_X_Y, pred_D_Y) + + +def test_n_neighbors_attribute(): + X = iris.data + clf = neighbors.LocalOutlierFactor(n_neighbors=500).fit(X) + assert clf.n_neighbors_ == X.shape[0] - 1 + + clf = neighbors.LocalOutlierFactor(n_neighbors=500) + msg = "n_neighbors will be set to (n_samples - 1)" + with pytest.warns(UserWarning, match=re.escape(msg)): + clf.fit(X) + assert clf.n_neighbors_ == X.shape[0] - 1 + + +def test_score_samples(global_dtype): + X_train = np.asarray([[1, 1], [1, 2], [2, 1]], dtype=global_dtype) + X_test = np.asarray([[2.0, 2.0]], dtype=global_dtype) + clf1 = neighbors.LocalOutlierFactor( + n_neighbors=2, contamination=0.1, novelty=True + ).fit(X_train) + clf2 = neighbors.LocalOutlierFactor(n_neighbors=2, novelty=True).fit(X_train) + + clf1_scores = clf1.score_samples(X_test) + clf1_decisions = clf1.decision_function(X_test) + + clf2_scores = clf2.score_samples(X_test) + clf2_decisions = clf2.decision_function(X_test) + + assert_allclose( + clf1_scores, + clf1_decisions + clf1.offset_, + ) + assert_allclose( + clf2_scores, + clf2_decisions + clf2.offset_, + ) + assert_allclose(clf1_scores, clf2_scores) + + +def test_novelty_errors(): + X = iris.data + + 
# check errors for novelty=False + clf = neighbors.LocalOutlierFactor() + clf.fit(X) + # predict, decision_function and score_samples raise ValueError + for method in ["predict", "decision_function", "score_samples"]: + outer_msg = f"'LocalOutlierFactor' has no attribute '{method}'" + inner_msg = "{} is not available when novelty=False".format(method) + with pytest.raises(AttributeError, match=outer_msg) as exec_info: + getattr(clf, method) + + assert isinstance(exec_info.value.__cause__, AttributeError) + assert inner_msg in str(exec_info.value.__cause__) + + # check errors for novelty=True + clf = neighbors.LocalOutlierFactor(novelty=True) + + outer_msg = "'LocalOutlierFactor' has no attribute 'fit_predict'" + inner_msg = "fit_predict is not available when novelty=True" + with pytest.raises(AttributeError, match=outer_msg) as exec_info: + getattr(clf, "fit_predict") + + assert isinstance(exec_info.value.__cause__, AttributeError) + assert inner_msg in str(exec_info.value.__cause__) + + +def test_novelty_training_scores(global_dtype): + # check that the scores of the training samples are still accessible + # when novelty=True through the negative_outlier_factor_ attribute + X = iris.data.astype(global_dtype) + + # fit with novelty=False + clf_1 = neighbors.LocalOutlierFactor() + clf_1.fit(X) + scores_1 = clf_1.negative_outlier_factor_ + + # fit with novelty=True + clf_2 = neighbors.LocalOutlierFactor(novelty=True) + clf_2.fit(X) + scores_2 = clf_2.negative_outlier_factor_ + + assert_allclose(scores_1, scores_2) + + +def test_hasattr_prediction(): + # check availability of prediction methods depending on novelty value. + X = [[1, 1], [1, 2], [2, 1]] + + # when novelty=True + clf = neighbors.LocalOutlierFactor(novelty=True) + clf.fit(X) + assert hasattr(clf, "predict") + assert hasattr(clf, "decision_function") + assert hasattr(clf, "score_samples") + assert not hasattr(clf, "fit_predict") + + # when novelty=False + clf = neighbors.LocalOutlierFactor(novelty=False) + clf.fit(X) + assert hasattr(clf, "fit_predict") + assert not hasattr(clf, "predict") + assert not hasattr(clf, "decision_function") + assert not hasattr(clf, "score_samples") + + +@parametrize_with_checks([neighbors.LocalOutlierFactor(novelty=True)]) +def test_novelty_true_common_tests(estimator, check): + # the common tests are run for the default LOF (novelty=False). + # here we run these common tests for LOF when novelty=True + check(estimator) + + +@pytest.mark.parametrize("expected_outliers", [30, 53]) +def test_predicted_outlier_number(expected_outliers): + # the number of predicted outliers should be equal to the number of + # expected outliers unless there are ties in the abnormality scores. 
+ X = iris.data + n_samples = X.shape[0] + contamination = float(expected_outliers) / n_samples + + clf = neighbors.LocalOutlierFactor(contamination=contamination) + y_pred = clf.fit_predict(X) + + num_outliers = np.sum(y_pred != 1) + if num_outliers != expected_outliers: + y_dec = clf.negative_outlier_factor_ + check_outlier_corruption(num_outliers, expected_outliers, y_dec) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sparse(csr_container): + # LocalOutlierFactor must support CSR inputs + # TODO: compare results on dense and sparse data as proposed in: + # https://github.com/scikit-learn/scikit-learn/pull/23585#discussion_r968388186 + X = csr_container(iris.data) + + lof = neighbors.LocalOutlierFactor(novelty=True) + lof.fit(X) + lof.predict(X) + lof.score_samples(X) + lof.decision_function(X) + + lof = neighbors.LocalOutlierFactor(novelty=False) + lof.fit_predict(X) + + +def test_lof_error_n_neighbors_too_large(): + """Check that we raise a proper error message when n_neighbors == n_samples. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/17207 + """ + X = np.ones((7, 7)) + + msg = ( + "Expected n_neighbors < n_samples_fit, but n_neighbors = 1, " + "n_samples_fit = 1, n_samples = 1" + ) + with pytest.raises(ValueError, match=msg): + lof = neighbors.LocalOutlierFactor(n_neighbors=1).fit(X[:1]) + + lof = neighbors.LocalOutlierFactor(n_neighbors=2).fit(X[:2]) + assert lof.n_samples_fit_ == 2 + + msg = ( + "Expected n_neighbors < n_samples_fit, but n_neighbors = 2, " + "n_samples_fit = 2, n_samples = 2" + ) + with pytest.raises(ValueError, match=msg): + lof.kneighbors(None, n_neighbors=2) + + distances, indices = lof.kneighbors(None, n_neighbors=1) + assert distances.shape == (2, 1) + assert indices.shape == (2, 1) + + msg = ( + "Expected n_neighbors <= n_samples_fit, but n_neighbors = 3, " + "n_samples_fit = 2, n_samples = 7" + ) + with pytest.raises(ValueError, match=msg): + lof.kneighbors(X, n_neighbors=3) + + ( + distances, + indices, + ) = lof.kneighbors(X, n_neighbors=2) + assert distances.shape == (7, 2) + assert indices.shape == (7, 2) + + +@pytest.mark.parametrize("algorithm", ["auto", "ball_tree", "kd_tree", "brute"]) +@pytest.mark.parametrize("novelty", [True, False]) +@pytest.mark.parametrize("contamination", [0.5, "auto"]) +def test_lof_input_dtype_preservation(global_dtype, algorithm, contamination, novelty): + """Check that the fitted attributes are stored using the data type of X.""" + X = iris.data.astype(global_dtype, copy=False) + + iso = neighbors.LocalOutlierFactor( + n_neighbors=5, algorithm=algorithm, contamination=contamination, novelty=novelty + ) + iso.fit(X) + + assert iso.negative_outlier_factor_.dtype == global_dtype + + for method in ("score_samples", "decision_function"): + if hasattr(iso, method): + y_pred = getattr(iso, method)(X) + assert y_pred.dtype == global_dtype + + +@pytest.mark.parametrize("algorithm", ["auto", "ball_tree", "kd_tree", "brute"]) +@pytest.mark.parametrize("novelty", [True, False]) +@pytest.mark.parametrize("contamination", [0.5, "auto"]) +def test_lof_dtype_equivalence(algorithm, novelty, contamination): + """Check the equivalence of the results with 32 and 64 bits input.""" + + inliers = iris.data[:50] # setosa iris are really distinct from others + outliers = iris.data[-5:] # virginica will be considered as outliers + # lower the precision of the input data to check that we have an equivalence when + # making the computation in 32 and 64 bits. 
+ X = np.concatenate([inliers, outliers], axis=0).astype(np.float32) + + lof_32 = neighbors.LocalOutlierFactor( + algorithm=algorithm, novelty=novelty, contamination=contamination + ) + X_32 = X.astype(np.float32, copy=True) + lof_32.fit(X_32) + + lof_64 = neighbors.LocalOutlierFactor( + algorithm=algorithm, novelty=novelty, contamination=contamination + ) + X_64 = X.astype(np.float64, copy=True) + lof_64.fit(X_64) + + assert_allclose(lof_32.negative_outlier_factor_, lof_64.negative_outlier_factor_) + + for method in ("score_samples", "decision_function", "predict", "fit_predict"): + if hasattr(lof_32, method): + y_pred_32 = getattr(lof_32, method)(X_32) + y_pred_64 = getattr(lof_64, method)(X_64) + assert_allclose(y_pred_32, y_pred_64, atol=0.0002) + + +def test_lof_duplicate_samples(): + """ + Check that LocalOutlierFactor raises a warning when duplicate values + in the training data cause inaccurate results. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/27839 + """ + + rng = np.random.default_rng(0) + + x = rng.permutation( + np.hstack( + [ + [0.1] * 1000, # constant values + np.linspace(0.1, 0.3, num=3000), + rng.random(500) * 100, # the clear outliers + ] + ) + ) + X = x.reshape(-1, 1) + + error_msg = ( + "Duplicate values are leading to incorrect results. " + "Increase the number of neighbors for more accurate results." + ) + + lof = neighbors.LocalOutlierFactor(n_neighbors=5, contamination=0.1) + + # Catch the warning + with pytest.warns(UserWarning, match=re.escape(error_msg)): + lof.fit_predict(X) diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_nca.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_nca.py new file mode 100644 index 0000000000000000000000000000000000000000..ebfb01d12e3acbbb31d79a3a0573f39884cac6bb --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_nca.py @@ -0,0 +1,563 @@ +""" +Testing for Neighborhood Component Analysis module (sklearn.neighbors.nca) +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import re + +import numpy as np +import pytest +from numpy.testing import assert_array_almost_equal, assert_array_equal +from scipy.optimize import check_grad + +from sklearn import clone +from sklearn.datasets import load_iris, make_blobs, make_classification +from sklearn.exceptions import ConvergenceWarning +from sklearn.metrics import pairwise_distances +from sklearn.neighbors import NeighborhoodComponentsAnalysis +from sklearn.preprocessing import LabelEncoder +from sklearn.utils import check_random_state +from sklearn.utils.validation import validate_data + +rng = check_random_state(0) +# Load and shuffle the iris dataset. +iris = load_iris() +perm = rng.permutation(iris.target.size) +iris_data = iris.data[perm] +iris_target = iris.target[perm] +# Avoid having test data introducing dependencies between tests. +iris_data.flags.writeable = False +iris_target.flags.writeable = False +EPS = np.finfo(float).eps + + +def test_simple_example(): + """Test on a simple example. + + Puts four points in the input space where the opposite labels points are + next to each other. After transform the samples from the same class + should be next to each other. 
+ + """ + X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) + y = np.array([1, 0, 1, 0]) + nca = NeighborhoodComponentsAnalysis( + n_components=2, init="identity", random_state=42 + ) + nca.fit(X, y) + X_t = nca.transform(X) + assert_array_equal(pairwise_distances(X_t).argsort()[:, 1], np.array([2, 3, 0, 1])) + + +def test_toy_example_collapse_points(): + """Test on a toy example of three points that should collapse + + We build a simple example: two points from the same class and a point from + a different class in the middle of them. On this simple example, the new + (transformed) points should all collapse into one single point. Indeed, the + objective is 2/(1 + exp(d/2)), with d the euclidean distance between the + two samples from the same class. This is maximized for d=0 (because d>=0), + with an objective equal to 1 (loss=-1.). + + """ + rng = np.random.RandomState(42) + input_dim = 5 + two_points = rng.randn(2, input_dim) + X = np.vstack([two_points, two_points.mean(axis=0)[np.newaxis, :]]) + y = [0, 0, 1] + + class LossStorer: + def __init__(self, X, y): + self.loss = np.inf # initialize the loss to very high + # Initialize a fake NCA and variables needed to compute the loss: + self.fake_nca = NeighborhoodComponentsAnalysis() + self.fake_nca.n_iter_ = np.inf + self.X, y = validate_data(self.fake_nca, X, y, ensure_min_samples=2) + y = LabelEncoder().fit_transform(y) + self.same_class_mask = y[:, np.newaxis] == y[np.newaxis, :] + + def callback(self, transformation, n_iter): + """Stores the last value of the loss function""" + self.loss, _ = self.fake_nca._loss_grad_lbfgs( + transformation, self.X, self.same_class_mask, -1.0 + ) + + loss_storer = LossStorer(X, y) + nca = NeighborhoodComponentsAnalysis(random_state=42, callback=loss_storer.callback) + X_t = nca.fit_transform(X, y) + print(X_t) + # test that points are collapsed into one point + assert_array_almost_equal(X_t - X_t[0], 0.0) + assert abs(loss_storer.loss + 1) < 1e-10 + + +def test_finite_differences(global_random_seed): + """Test gradient of loss function + + Assert that the gradient is almost equal to its finite differences + approximation. + """ + # Initialize the transformation `M`, as well as `X` and `y` and `NCA` + rng = np.random.RandomState(global_random_seed) + X, y = make_classification(random_state=global_random_seed) + M = rng.randn(rng.randint(1, X.shape[1] + 1), X.shape[1]) + nca = NeighborhoodComponentsAnalysis() + nca.n_iter_ = 0 + mask = y[:, np.newaxis] == y[np.newaxis, :] + + def fun(M): + return nca._loss_grad_lbfgs(M, X, mask)[0] + + def grad(M): + return nca._loss_grad_lbfgs(M, X, mask)[1] + + # compare the gradient to a finite difference approximation + diff = check_grad(fun, grad, M.ravel()) + assert diff == pytest.approx(0.0, abs=1e-4) + + +def test_params_validation(): + # Test that invalid parameters raise value error + X = np.arange(12).reshape(4, 3) + y = [1, 1, 2, 2] + NCA = NeighborhoodComponentsAnalysis + rng = np.random.RandomState(42) + + init = rng.rand(5, 3) + msg = ( + f"The output dimensionality ({init.shape[0]}) " + "of the given linear transformation `init` cannot be " + f"greater than its input dimensionality ({init.shape[1]})." + ) + with pytest.raises(ValueError, match=re.escape(msg)): + NCA(init=init).fit(X, y) + n_components = 10 + msg = ( + "The preferred dimensionality of the projected space " + f"`n_components` ({n_components}) cannot be greater " + f"than the given data dimensionality ({X.shape[1]})!" 
+ ) + with pytest.raises(ValueError, match=re.escape(msg)): + NCA(n_components=n_components).fit(X, y) + + +def test_transformation_dimensions(): + X = np.arange(12).reshape(4, 3) + y = [1, 1, 2, 2] + + # Fail if transformation input dimension does not match inputs dimensions + transformation = np.array([[1, 2], [3, 4]]) + with pytest.raises(ValueError): + NeighborhoodComponentsAnalysis(init=transformation).fit(X, y) + + # Fail if transformation output dimension is larger than + # transformation input dimension + transformation = np.array([[1, 2], [3, 4], [5, 6]]) + # len(transformation) > len(transformation[0]) + with pytest.raises(ValueError): + NeighborhoodComponentsAnalysis(init=transformation).fit(X, y) + + # Pass otherwise + transformation = np.arange(9).reshape(3, 3) + NeighborhoodComponentsAnalysis(init=transformation).fit(X, y) + + +def test_n_components(): + rng = np.random.RandomState(42) + X = np.arange(12).reshape(4, 3) + y = [1, 1, 2, 2] + + init = rng.rand(X.shape[1] - 1, 3) + + # n_components = X.shape[1] != transformation.shape[0] + n_components = X.shape[1] + nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components) + msg = ( + "The preferred dimensionality of the projected space " + f"`n_components` ({n_components}) does not match the output " + "dimensionality of the given linear transformation " + f"`init` ({init.shape[0]})!" + ) + with pytest.raises(ValueError, match=re.escape(msg)): + nca.fit(X, y) + + # n_components > X.shape[1] + n_components = X.shape[1] + 2 + nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components) + msg = ( + "The preferred dimensionality of the projected space " + f"`n_components` ({n_components}) cannot be greater than " + f"the given data dimensionality ({X.shape[1]})!" + ) + with pytest.raises(ValueError, match=re.escape(msg)): + nca.fit(X, y) + + # n_components < X.shape[1] + nca = NeighborhoodComponentsAnalysis(n_components=2, init="identity") + nca.fit(X, y) + + +def test_init_transformation(): + rng = np.random.RandomState(42) + X, y = make_blobs(n_samples=30, centers=6, n_features=5, random_state=0) + + # Start learning from scratch + nca = NeighborhoodComponentsAnalysis(init="identity") + nca.fit(X, y) + + # Initialize with random + nca_random = NeighborhoodComponentsAnalysis(init="random") + nca_random.fit(X, y) + + # Initialize with auto + nca_auto = NeighborhoodComponentsAnalysis(init="auto") + nca_auto.fit(X, y) + + # Initialize with PCA + nca_pca = NeighborhoodComponentsAnalysis(init="pca") + nca_pca.fit(X, y) + + # Initialize with LDA + nca_lda = NeighborhoodComponentsAnalysis(init="lda") + nca_lda.fit(X, y) + + init = rng.rand(X.shape[1], X.shape[1]) + nca = NeighborhoodComponentsAnalysis(init=init) + nca.fit(X, y) + + # init.shape[1] must match X.shape[1] + init = rng.rand(X.shape[1], X.shape[1] + 1) + nca = NeighborhoodComponentsAnalysis(init=init) + msg = ( + f"The input dimensionality ({init.shape[1]}) of the given " + "linear transformation `init` must match the " + f"dimensionality of the given inputs `X` ({X.shape[1]})." + ) + with pytest.raises(ValueError, match=re.escape(msg)): + nca.fit(X, y) + + # init.shape[0] must be <= init.shape[1] + init = rng.rand(X.shape[1] + 1, X.shape[1]) + nca = NeighborhoodComponentsAnalysis(init=init) + msg = ( + f"The output dimensionality ({init.shape[0]}) of the given " + "linear transformation `init` cannot be " + f"greater than its input dimensionality ({init.shape[1]})." 
+ ) + with pytest.raises(ValueError, match=re.escape(msg)): + nca.fit(X, y) + + # init.shape[0] must match n_components + init = rng.rand(X.shape[1], X.shape[1]) + n_components = X.shape[1] - 2 + nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components) + msg = ( + "The preferred dimensionality of the " + f"projected space `n_components` ({n_components}) " + "does not match the output dimensionality of the given " + f"linear transformation `init` ({init.shape[0]})!" + ) + with pytest.raises(ValueError, match=re.escape(msg)): + nca.fit(X, y) + + +@pytest.mark.parametrize("n_samples", [3, 5, 7, 11]) +@pytest.mark.parametrize("n_features", [3, 5, 7, 11]) +@pytest.mark.parametrize("n_classes", [5, 7, 11]) +@pytest.mark.parametrize("n_components", [3, 5, 7, 11]) +def test_auto_init(n_samples, n_features, n_classes, n_components): + # Test that auto choose the init as expected with every configuration + # of order of n_samples, n_features, n_classes and n_components. + rng = np.random.RandomState(42) + nca_base = NeighborhoodComponentsAnalysis( + init="auto", n_components=n_components, max_iter=1, random_state=rng + ) + if n_classes >= n_samples: + pass + # n_classes > n_samples is impossible, and n_classes == n_samples + # throws an error from lda but is an absurd case + else: + X = rng.randn(n_samples, n_features) + y = np.tile(range(n_classes), n_samples // n_classes + 1)[:n_samples] + if n_components > n_features: + # this would return a ValueError, which is already tested in + # test_params_validation + pass + else: + nca = clone(nca_base) + nca.fit(X, y) + if n_components <= min(n_classes - 1, n_features): + nca_other = clone(nca_base).set_params(init="lda") + elif n_components < min(n_features, n_samples): + nca_other = clone(nca_base).set_params(init="pca") + else: + nca_other = clone(nca_base).set_params(init="identity") + nca_other.fit(X, y) + assert_array_almost_equal(nca.components_, nca_other.components_) + + +def test_warm_start_validation(): + X, y = make_classification( + n_samples=30, + n_features=5, + n_classes=4, + n_redundant=0, + n_informative=5, + random_state=0, + ) + + nca = NeighborhoodComponentsAnalysis(warm_start=True, max_iter=5) + nca.fit(X, y) + + X_less_features, y = make_classification( + n_samples=30, + n_features=4, + n_classes=4, + n_redundant=0, + n_informative=4, + random_state=0, + ) + msg = ( + f"The new inputs dimensionality ({X_less_features.shape[1]}) " + "does not match the input dimensionality of the previously learned " + f"transformation ({nca.components_.shape[1]})." + ) + with pytest.raises(ValueError, match=re.escape(msg)): + nca.fit(X_less_features, y) + + +def test_warm_start_effectiveness(): + # A 1-iteration second fit on same data should give almost same result + # with warm starting, and quite different result without warm starting. 
+ + nca_warm = NeighborhoodComponentsAnalysis(warm_start=True, random_state=0) + nca_warm.fit(iris_data, iris_target) + transformation_warm = nca_warm.components_ + nca_warm.max_iter = 1 + nca_warm.fit(iris_data, iris_target) + transformation_warm_plus_one = nca_warm.components_ + + nca_cold = NeighborhoodComponentsAnalysis(warm_start=False, random_state=0) + nca_cold.fit(iris_data, iris_target) + transformation_cold = nca_cold.components_ + nca_cold.max_iter = 1 + nca_cold.fit(iris_data, iris_target) + transformation_cold_plus_one = nca_cold.components_ + + diff_warm = np.sum(np.abs(transformation_warm_plus_one - transformation_warm)) + diff_cold = np.sum(np.abs(transformation_cold_plus_one - transformation_cold)) + assert diff_warm < 3.0, ( + "Transformer changed significantly after one " + "iteration even though it was warm-started." + ) + + assert diff_cold > diff_warm, ( + "Cold-started transformer changed less " + "significantly than warm-started " + "transformer after one iteration." + ) + + +@pytest.mark.parametrize( + "init_name", ["pca", "lda", "identity", "random", "precomputed"] +) +def test_verbose(init_name, capsys): + # assert there is proper output when verbose = 1, for every initialization + # except auto because auto will call one of the others + rng = np.random.RandomState(42) + X, y = make_blobs(n_samples=30, centers=6, n_features=5, random_state=0) + regexp_init = r"... done in \ *\d+\.\d{2}s" + msgs = { + "pca": "Finding principal components" + regexp_init, + "lda": "Finding most discriminative components" + regexp_init, + } + if init_name == "precomputed": + init = rng.randn(X.shape[1], X.shape[1]) + else: + init = init_name + nca = NeighborhoodComponentsAnalysis(verbose=1, init=init) + nca.fit(X, y) + out, _ = capsys.readouterr() + + # check output + lines = re.split("\n+", out) + # if pca or lda init, an additional line is printed, so we test + # it and remove it to test the rest equally among initializations + if init_name in ["pca", "lda"]: + assert re.match(msgs[init_name], lines[0]) + lines = lines[1:] + assert lines[0] == "[NeighborhoodComponentsAnalysis]" + header = "{:>10} {:>20} {:>10}".format("Iteration", "Objective Value", "Time(s)") + assert lines[1] == "[NeighborhoodComponentsAnalysis] {}".format(header) + assert lines[2] == "[NeighborhoodComponentsAnalysis] {}".format("-" * len(header)) + for line in lines[3:-2]: + # The following regex will match for instance: + # '[NeighborhoodComponentsAnalysis] 0 6.988936e+01 0.01' + assert re.match( + r"\[NeighborhoodComponentsAnalysis\] *\d+ *\d\.\d{6}e" + r"[+|-]\d+\ *\d+\.\d{2}", + line, + ) + assert re.match( + r"\[NeighborhoodComponentsAnalysis\] Training took\ *\d+\.\d{2}s\.", + lines[-2], + ) + assert lines[-1] == "" + + +def test_no_verbose(capsys): + # assert by default there is no output (verbose=0) + nca = NeighborhoodComponentsAnalysis() + nca.fit(iris_data, iris_target) + out, _ = capsys.readouterr() + # check output + assert out == "" + + +def test_singleton_class(): + X = iris_data.copy() + y = iris_target.copy() + + # one singleton class + singleton_class = 1 + (ind_singleton,) = np.where(y == singleton_class) + y[ind_singleton] = 2 + y[ind_singleton[0]] = singleton_class + + nca = NeighborhoodComponentsAnalysis(max_iter=30) + nca.fit(X, y) + + # One non-singleton class + (ind_1,) = np.where(y == 1) + (ind_2,) = np.where(y == 2) + y[ind_1] = 0 + y[ind_1[0]] = 1 + y[ind_2] = 0 + y[ind_2[0]] = 2 + + nca = NeighborhoodComponentsAnalysis(max_iter=30) + nca.fit(X, y) + + # Only singleton classes + 
(ind_0,) = np.where(y == 0) + (ind_1,) = np.where(y == 1) + (ind_2,) = np.where(y == 2) + X = X[[ind_0[0], ind_1[0], ind_2[0]]] + y = y[[ind_0[0], ind_1[0], ind_2[0]]] + + nca = NeighborhoodComponentsAnalysis(init="identity", max_iter=30) + nca.fit(X, y) + assert_array_equal(X, nca.transform(X)) + + +def test_one_class(): + X = iris_data[iris_target == 0] + y = iris_target[iris_target == 0] + + nca = NeighborhoodComponentsAnalysis( + max_iter=30, n_components=X.shape[1], init="identity" + ) + nca.fit(X, y) + assert_array_equal(X, nca.transform(X)) + + +def test_callback(capsys): + max_iter = 10 + + def my_cb(transformation, n_iter): + assert transformation.shape == (iris_data.shape[1] ** 2,) + rem_iter = max_iter - n_iter + print("{} iterations remaining...".format(rem_iter)) + + # assert that my_cb is called + nca = NeighborhoodComponentsAnalysis(max_iter=max_iter, callback=my_cb, verbose=1) + nca.fit(iris_data, iris_target) + out, _ = capsys.readouterr() + + # check output + assert "{} iterations remaining...".format(max_iter - 1) in out + + +def test_expected_transformation_shape(): + """Test that the transformation has the expected shape.""" + X = iris_data + y = iris_target + + class TransformationStorer: + def __init__(self, X, y): + # Initialize a fake NCA and variables needed to call the loss + # function: + self.fake_nca = NeighborhoodComponentsAnalysis() + self.fake_nca.n_iter_ = np.inf + self.X, y = validate_data(self.fake_nca, X, y, ensure_min_samples=2) + y = LabelEncoder().fit_transform(y) + self.same_class_mask = y[:, np.newaxis] == y[np.newaxis, :] + + def callback(self, transformation, n_iter): + """Stores the last value of the transformation taken as input by + the optimizer""" + self.transformation = transformation + + transformation_storer = TransformationStorer(X, y) + cb = transformation_storer.callback + nca = NeighborhoodComponentsAnalysis(max_iter=5, callback=cb) + nca.fit(X, y) + assert transformation_storer.transformation.size == X.shape[1] ** 2 + + +def test_convergence_warning(): + nca = NeighborhoodComponentsAnalysis(max_iter=2, verbose=1) + cls_name = nca.__class__.__name__ + msg = "[{}] NCA did not converge".format(cls_name) + with pytest.warns(ConvergenceWarning, match=re.escape(msg)): + nca.fit(iris_data, iris_target) + + +@pytest.mark.parametrize( + "param, value", + [ + ("n_components", np.int32(3)), + ("max_iter", np.int32(100)), + ("tol", np.float32(0.0001)), + ], +) +def test_parameters_valid_types(param, value): + # check that no error is raised when parameters have numpy integer or + # floating types. + nca = NeighborhoodComponentsAnalysis(**{param: value}) + + X = iris_data + y = iris_target + + nca.fit(X, y) + + +@pytest.mark.parametrize("n_components", [None, 2]) +def test_nca_feature_names_out(n_components): + """Check `get_feature_names_out` for `NeighborhoodComponentsAnalysis`. 
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/28293 + """ + + X = iris_data + y = iris_target + + est = NeighborhoodComponentsAnalysis(n_components=n_components).fit(X, y) + names_out = est.get_feature_names_out() + + class_name_lower = est.__class__.__name__.lower() + + if n_components is not None: + expected_n_features = n_components + else: + expected_n_features = X.shape[1] + + expected_names_out = np.array( + [f"{class_name_lower}{i}" for i in range(expected_n_features)], + dtype=object, + ) + + assert_array_equal(names_out, expected_names_out) diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_nearest_centroid.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_nearest_centroid.py new file mode 100644 index 0000000000000000000000000000000000000000..1aa9274cd28a89be3744f56b6c3f31b80c2252ed --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_nearest_centroid.py @@ -0,0 +1,237 @@ +""" +Testing for the nearest centroid module. +""" + +import numpy as np +import pytest + +from sklearn import datasets +from sklearn.neighbors import NearestCentroid +from sklearn.utils._testing import ( + assert_allclose, + assert_array_almost_equal, + assert_array_equal, +) +from sklearn.utils.fixes import CSR_CONTAINERS + +# toy sample +X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] +y = [-1, -1, -1, 1, 1, 1] +T = [[-1, -1], [2, 2], [3, 2]] +true_result = [-1, 1, 1] +true_result_prior1 = [-1, 1, 1] + +true_discriminant_scores = [-32, 64, 80] +true_proba = [[1, 1.26642e-14], [1.60381e-28, 1], [1.80485e-35, 1]] + + +# also load the iris dataset +# and randomly permute it +iris = datasets.load_iris() +rng = np.random.RandomState(1) +perm = rng.permutation(iris.target.size) +iris.data = iris.data[perm] +iris.target = iris.target[perm] + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_classification_toy(csr_container): + # Check classification on a toy dataset, including sparse versions. + X_csr = csr_container(X) + T_csr = csr_container(T) + + # Check classification on a toy dataset, including sparse versions. + clf = NearestCentroid() + clf.fit(X, y) + assert_array_equal(clf.predict(T), true_result) + assert_array_almost_equal(clf.decision_function(T), true_discriminant_scores) + assert_array_almost_equal(clf.predict_proba(T), true_proba) + + # Test uniform priors + clf = NearestCentroid(priors="uniform") + clf.fit(X, y) + assert_array_equal(clf.predict(T), true_result) + assert_array_almost_equal(clf.decision_function(T), true_discriminant_scores) + assert_array_almost_equal(clf.predict_proba(T), true_proba) + + clf = NearestCentroid(priors="empirical") + clf.fit(X, y) + assert_array_equal(clf.predict(T), true_result) + assert_array_almost_equal(clf.decision_function(T), true_discriminant_scores) + assert_array_almost_equal(clf.predict_proba(T), true_proba) + + # Test custom priors + clf = NearestCentroid(priors=[0.25, 0.75]) + clf.fit(X, y) + assert_array_equal(clf.predict(T), true_result_prior1) + + # Same test, but with a sparse matrix to fit and test. 
+ clf = NearestCentroid() + clf.fit(X_csr, y) + assert_array_equal(clf.predict(T_csr), true_result) + + # Fit with sparse, test with non-sparse + clf = NearestCentroid() + clf.fit(X_csr, y) + assert_array_equal(clf.predict(T), true_result) + + # Fit with non-sparse, test with sparse + clf = NearestCentroid() + clf.fit(X, y) + assert_array_equal(clf.predict(T_csr), true_result) + + # Fit and predict with non-CSR sparse matrices + clf = NearestCentroid() + clf.fit(X_csr.tocoo(), y) + assert_array_equal(clf.predict(T_csr.tolil()), true_result) + + +def test_iris(): + # Check consistency on dataset iris. + for metric in ("euclidean", "manhattan"): + clf = NearestCentroid(metric=metric).fit(iris.data, iris.target) + score = np.mean(clf.predict(iris.data) == iris.target) + assert score > 0.9, "Failed with score = " + str(score) + + +def test_iris_shrinkage(): + # Check consistency on dataset iris, when using shrinkage. + for metric in ("euclidean", "manhattan"): + for shrink_threshold in [None, 0.1, 0.5]: + clf = NearestCentroid(metric=metric, shrink_threshold=shrink_threshold) + clf = clf.fit(iris.data, iris.target) + score = np.mean(clf.predict(iris.data) == iris.target) + assert score > 0.8, "Failed with score = " + str(score) + + +def test_pickle(): + import pickle + + # classification + obj = NearestCentroid() + obj.fit(iris.data, iris.target) + score = obj.score(iris.data, iris.target) + s = pickle.dumps(obj) + + obj2 = pickle.loads(s) + assert type(obj2) == obj.__class__ + score2 = obj2.score(iris.data, iris.target) + assert_array_equal( + score, + score2, + "Failed to generate same score after pickling (classification).", + ) + + +def test_shrinkage_correct(): + # Ensure that the shrinking is correct. + # The expected result is calculated by R (pamr), + # which is implemented by the author of the original paper. + # (One need to modify the code to output the new centroid in pamr.predict) + + X = np.array([[0, 1], [1, 0], [1, 1], [2, 0], [6, 8]]) + y = np.array([1, 1, 2, 2, 2]) + clf = NearestCentroid(shrink_threshold=0.1) + clf.fit(X, y) + expected_result = np.array([[0.7787310, 0.8545292], [2.814179, 2.763647]]) + np.testing.assert_array_almost_equal(clf.centroids_, expected_result) + + +def test_shrinkage_threshold_decoded_y(): + clf = NearestCentroid(shrink_threshold=0.01) + y_ind = np.asarray(y) + y_ind[y_ind == -1] = 0 + clf.fit(X, y_ind) + centroid_encoded = clf.centroids_ + clf.fit(X, y) + assert_array_equal(centroid_encoded, clf.centroids_) + + +def test_predict_translated_data(): + # Test that NearestCentroid gives same results on translated data + + rng = np.random.RandomState(0) + X = rng.rand(50, 50) + y = rng.randint(0, 3, 50) + noise = rng.rand(50) + clf = NearestCentroid(shrink_threshold=0.1) + clf.fit(X, y) + y_init = clf.predict(X) + clf = NearestCentroid(shrink_threshold=0.1) + X_noise = X + noise + clf.fit(X_noise, y) + y_translate = clf.predict(X_noise) + assert_array_equal(y_init, y_translate) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_manhattan_metric(csr_container): + # Test the manhattan metric. 
+ X_csr = csr_container(X) + + clf = NearestCentroid(metric="manhattan") + clf.fit(X, y) + dense_centroid = clf.centroids_ + clf.fit(X_csr, y) + assert_array_equal(clf.centroids_, dense_centroid) + assert_array_equal(dense_centroid, [[-1, -1], [1, 1]]) + + +def test_features_zero_var(): + # Test that features with 0 variance throw error + + X = np.empty((10, 2)) + X[:, 0] = -0.13725701 + X[:, 1] = -0.9853293 + y = np.zeros((10)) + y[0] = 1 + + clf = NearestCentroid(shrink_threshold=0.1) + with pytest.raises(ValueError): + clf.fit(X, y) + + +def test_negative_priors_error(): + """Check that we raise an error when the user-defined priors are negative.""" + clf = NearestCentroid(priors=[-2, 4]) + with pytest.raises(ValueError, match="priors must be non-negative"): + clf.fit(X, y) + + +def test_warn_non_normalized_priors(): + """Check that we raise a warning and normalize the user-defined priors when they + don't sum to 1. + """ + priors = [2, 4] + clf = NearestCentroid(priors=priors) + with pytest.warns( + UserWarning, + match="The priors do not sum to 1. Normalizing such that it sums to one.", + ): + clf.fit(X, y) + + assert_allclose(clf.class_prior_, np.asarray(priors) / np.asarray(priors).sum()) + + +@pytest.mark.parametrize( + "response_method", ["decision_function", "predict_proba", "predict_log_proba"] +) +def test_method_not_available_with_manhattan(response_method): + """Check that we raise an AttributeError with Manhattan metric when trying + to call a non-thresholded response method. + """ + clf = NearestCentroid(metric="manhattan").fit(X, y) + with pytest.raises(AttributeError): + getattr(clf, response_method)(T) + + +@pytest.mark.parametrize("array_constructor", [np.array] + CSR_CONTAINERS) +def test_error_zero_variances(array_constructor): + """Check that we raise an error when the variance for all features is zero.""" + X = np.ones((len(y), 2)) + X[:, 1] *= 2 + X = array_constructor(X) + + clf = NearestCentroid() + with pytest.raises(ValueError, match="All features have zero variance"): + clf.fit(X, y) diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_neighbors.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_neighbors.py new file mode 100644 index 0000000000000000000000000000000000000000..ae589b30dd74369cb8ef242fb86a11e0c75a09a2 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_neighbors.py @@ -0,0 +1,2503 @@ +import re +import warnings +from itertools import product + +import joblib +import numpy as np +import pytest +from scipy.sparse import issparse + +from sklearn import ( + config_context, + datasets, + metrics, + neighbors, +) +from sklearn.base import clone +from sklearn.exceptions import EfficiencyWarning, NotFittedError +from sklearn.metrics._dist_metrics import ( + DistanceMetric, +) +from sklearn.metrics.pairwise import PAIRWISE_BOOLEAN_FUNCTIONS, pairwise_distances +from sklearn.metrics.tests.test_dist_metrics import BOOL_METRICS +from sklearn.metrics.tests.test_pairwise_distances_reduction import ( + assert_compatible_argkmin_results, + assert_compatible_radius_results, +) +from sklearn.model_selection import ( + LeaveOneOut, + cross_val_predict, + cross_val_score, + train_test_split, +) +from sklearn.neighbors import ( + VALID_METRICS_SPARSE, + KNeighborsRegressor, +) +from sklearn.neighbors._base import ( + KNeighborsMixin, + _check_precomputed, + _is_sorted_by_data, + sort_graph_by_row_values, +) +from sklearn.pipeline import make_pipeline +from sklearn.utils._testing import ( + 
assert_allclose, + assert_array_equal, + ignore_warnings, +) +from sklearn.utils.fixes import ( + BSR_CONTAINERS, + COO_CONTAINERS, + CSC_CONTAINERS, + CSR_CONTAINERS, + DIA_CONTAINERS, + DOK_CONTAINERS, + LIL_CONTAINERS, +) +from sklearn.utils.validation import check_random_state + +rng = np.random.RandomState(0) +# load and shuffle iris dataset +iris = datasets.load_iris() +perm = rng.permutation(iris.target.size) +iris.data = iris.data[perm] +iris.target = iris.target[perm] + +# load and shuffle digits +digits = datasets.load_digits() +perm = rng.permutation(digits.target.size) +digits.data = digits.data[perm] +digits.target = digits.target[perm] + +SPARSE_TYPES = tuple( + BSR_CONTAINERS + + COO_CONTAINERS + + CSC_CONTAINERS + + CSR_CONTAINERS + + DOK_CONTAINERS + + LIL_CONTAINERS +) +SPARSE_OR_DENSE = SPARSE_TYPES + (np.asarray,) + +ALGORITHMS = ("ball_tree", "brute", "kd_tree", "auto") +COMMON_VALID_METRICS = sorted( + set.intersection(*map(set, neighbors.VALID_METRICS.values())) +) + +P = (1, 2, 3, 4, np.inf) + +# Filter deprecation warnings. +neighbors.kneighbors_graph = ignore_warnings(neighbors.kneighbors_graph) +neighbors.radius_neighbors_graph = ignore_warnings(neighbors.radius_neighbors_graph) + +# A list containing metrics where the string specifies the use of the +# DistanceMetric object directly (as resolved in _parse_metric) +DISTANCE_METRIC_OBJS = ["DM_euclidean"] + + +def _parse_metric(metric: str, dtype=None): + """ + Helper function for properly building a type-specialized DistanceMetric instances. + + Constructs a type-specialized DistanceMetric instance from a string + beginning with "DM_" while allowing a pass-through for other metric-specifying + strings. This is necessary since we wish to parameterize dtype independent of + metric, yet DistanceMetric requires it for construction. + + """ + if metric[:3] == "DM_": + return DistanceMetric.get_metric(metric[3:], dtype=dtype) + return metric + + +def _generate_test_params_for(metric: str, n_features: int): + """Return list of DistanceMetric kwargs for tests.""" + + # Distinguishing on cases not to compute unneeded datastructures. + rng = np.random.RandomState(1) + + if metric == "minkowski": + return [ + dict(p=1.5), + dict(p=2), + dict(p=3), + dict(p=np.inf), + dict(p=3, w=rng.rand(n_features)), + ] + + if metric == "seuclidean": + return [dict(V=rng.rand(n_features))] + + if metric == "mahalanobis": + A = rng.rand(n_features, n_features) + # Make the matrix symmetric positive definite + VI = A + A.T + 3 * np.eye(n_features) + return [dict(VI=VI)] + + # Case of: "euclidean", "manhattan", "chebyshev", "haversine" or any other metric. + # In those cases, no kwargs are needed. + return [{}] + + +def _weight_func(dist): + """Weight function to replace lambda d: d ** -2. 
+ The lambda function is not valid because: + if d==0 then 0^-2 is not valid.""" + + # Dist could be multidimensional, flatten it so all values + # can be looped + with np.errstate(divide="ignore"): + retval = 1.0 / dist + return retval**2 + + +WEIGHTS = ["uniform", "distance", _weight_func] + + +@pytest.mark.parametrize( + "n_samples, n_features, n_query_pts, n_neighbors", + [ + (100, 100, 10, 100), + (1000, 5, 100, 1), + ], +) +@pytest.mark.parametrize("query_is_train", [False, True]) +@pytest.mark.parametrize("metric", COMMON_VALID_METRICS + DISTANCE_METRIC_OBJS) +def test_unsupervised_kneighbors( + global_dtype, + n_samples, + n_features, + n_query_pts, + n_neighbors, + query_is_train, + metric, +): + # The different algorithms must return identical results + # on their common metrics, with and without returning + # distances + + metric = _parse_metric(metric, global_dtype) + + # Redefining the rng locally to use the same generated X + local_rng = np.random.RandomState(0) + X = local_rng.rand(n_samples, n_features).astype(global_dtype, copy=False) + + query = ( + X + if query_is_train + else local_rng.rand(n_query_pts, n_features).astype(global_dtype, copy=False) + ) + + results_nodist = [] + results = [] + + for algorithm in ALGORITHMS: + if isinstance(metric, DistanceMetric) and global_dtype == np.float32: + if "tree" in algorithm: # pragma: nocover + pytest.skip( + "Neither KDTree nor BallTree support 32-bit distance metric" + " objects." + ) + neigh = neighbors.NearestNeighbors( + n_neighbors=n_neighbors, algorithm=algorithm, metric=metric + ) + neigh.fit(X) + + results_nodist.append(neigh.kneighbors(query, return_distance=False)) + results.append(neigh.kneighbors(query, return_distance=True)) + + for i in range(len(results) - 1): + algorithm = ALGORITHMS[i] + next_algorithm = ALGORITHMS[i + 1] + + indices_no_dist = results_nodist[i] + distances, next_distances = results[i][0], results[i + 1][0] + indices, next_indices = results[i][1], results[i + 1][1] + assert_array_equal( + indices_no_dist, + indices, + err_msg=( + f"The '{algorithm}' algorithm returns different" + "indices depending on 'return_distances'." + ), + ) + assert_array_equal( + indices, + next_indices, + err_msg=( + f"The '{algorithm}' and '{next_algorithm}' " + "algorithms return different indices." + ), + ) + assert_allclose( + distances, + next_distances, + err_msg=( + f"The '{algorithm}' and '{next_algorithm}' " + "algorithms return different distances." + ), + atol=1e-6, + ) + + +@pytest.mark.parametrize( + "n_samples, n_features, n_query_pts", + [ + (100, 100, 10), + (1000, 5, 100), + ], +) +@pytest.mark.parametrize("metric", COMMON_VALID_METRICS + DISTANCE_METRIC_OBJS) +@pytest.mark.parametrize("n_neighbors, radius", [(1, 100), (50, 500), (100, 1000)]) +@pytest.mark.parametrize( + "NeighborsMixinSubclass", + [ + neighbors.KNeighborsClassifier, + neighbors.KNeighborsRegressor, + neighbors.RadiusNeighborsClassifier, + neighbors.RadiusNeighborsRegressor, + ], +) +def test_neigh_predictions_algorithm_agnosticity( + global_dtype, + n_samples, + n_features, + n_query_pts, + metric, + n_neighbors, + radius, + NeighborsMixinSubclass, +): + # The different algorithms must return identical predictions results + # on their common metrics. + + metric = _parse_metric(metric, global_dtype) + if isinstance(metric, DistanceMetric): + if "Classifier" in NeighborsMixinSubclass.__name__: + pytest.skip( + "Metrics of type `DistanceMetric` are not yet supported for" + " classifiers." 
+ ) + if "Radius" in NeighborsMixinSubclass.__name__: + pytest.skip( + "Metrics of type `DistanceMetric` are not yet supported for" + " radius-neighbor estimators." + ) + + # Redefining the rng locally to use the same generated X + local_rng = np.random.RandomState(0) + X = local_rng.rand(n_samples, n_features).astype(global_dtype, copy=False) + y = local_rng.randint(3, size=n_samples) + + query = local_rng.rand(n_query_pts, n_features).astype(global_dtype, copy=False) + + predict_results = [] + + parameter = ( + n_neighbors if issubclass(NeighborsMixinSubclass, KNeighborsMixin) else radius + ) + + for algorithm in ALGORITHMS: + if isinstance(metric, DistanceMetric) and global_dtype == np.float32: + if "tree" in algorithm: # pragma: nocover + pytest.skip( + "Neither KDTree nor BallTree support 32-bit distance metric" + " objects." + ) + neigh = NeighborsMixinSubclass(parameter, algorithm=algorithm, metric=metric) + neigh.fit(X, y) + + predict_results.append(neigh.predict(query)) + + for i in range(len(predict_results) - 1): + algorithm = ALGORITHMS[i] + next_algorithm = ALGORITHMS[i + 1] + + predictions, next_predictions = predict_results[i], predict_results[i + 1] + + assert_allclose( + predictions, + next_predictions, + err_msg=( + f"The '{algorithm}' and '{next_algorithm}' " + "algorithms return different predictions." + ), + ) + + +@pytest.mark.parametrize( + "KNeighborsMixinSubclass", + [ + neighbors.KNeighborsClassifier, + neighbors.KNeighborsRegressor, + neighbors.NearestNeighbors, + ], +) +def test_unsupervised_inputs(global_dtype, KNeighborsMixinSubclass): + # Test unsupervised inputs for neighbors estimators + + X = rng.random_sample((10, 3)).astype(global_dtype, copy=False) + y = rng.randint(3, size=10) + nbrs_fid = neighbors.NearestNeighbors(n_neighbors=1) + nbrs_fid.fit(X) + + dist1, ind1 = nbrs_fid.kneighbors(X) + + nbrs = KNeighborsMixinSubclass(n_neighbors=1) + + for data in (nbrs_fid, neighbors.BallTree(X), neighbors.KDTree(X)): + nbrs.fit(data, y) + + dist2, ind2 = nbrs.kneighbors(X) + + assert_allclose(dist1, dist2) + assert_array_equal(ind1, ind2) + + +def test_not_fitted_error_gets_raised(): + X = [[1]] + neighbors_ = neighbors.NearestNeighbors() + with pytest.raises(NotFittedError): + neighbors_.kneighbors_graph(X) + with pytest.raises(NotFittedError): + neighbors_.radius_neighbors_graph(X) + + +@pytest.mark.filterwarnings("ignore:EfficiencyWarning") +def check_precomputed(make_train_test, estimators): + """Tests unsupervised NearestNeighbors with a distance matrix.""" + # Note: smaller samples may result in spurious test success + rng = np.random.RandomState(42) + X = rng.random_sample((10, 4)) + Y = rng.random_sample((3, 4)) + DXX, DYX = make_train_test(X, Y) + for method in [ + "kneighbors", + ]: + # TODO: also test radius_neighbors, but requires different assertion + + # As a feature matrix (n_samples by n_features) + nbrs_X = neighbors.NearestNeighbors(n_neighbors=3) + nbrs_X.fit(X) + dist_X, ind_X = getattr(nbrs_X, method)(Y) + + # As a dense distance matrix (n_samples by n_samples) + nbrs_D = neighbors.NearestNeighbors( + n_neighbors=3, algorithm="brute", metric="precomputed" + ) + nbrs_D.fit(DXX) + dist_D, ind_D = getattr(nbrs_D, method)(DYX) + assert_allclose(dist_X, dist_D) + assert_array_equal(ind_X, ind_D) + + # Check auto works too + nbrs_D = neighbors.NearestNeighbors( + n_neighbors=3, algorithm="auto", metric="precomputed" + ) + nbrs_D.fit(DXX) + dist_D, ind_D = getattr(nbrs_D, method)(DYX) + assert_allclose(dist_X, dist_D) + assert_array_equal(ind_X, 
ind_D) + + # Check X=None in prediction + dist_X, ind_X = getattr(nbrs_X, method)(None) + dist_D, ind_D = getattr(nbrs_D, method)(None) + assert_allclose(dist_X, dist_D) + assert_array_equal(ind_X, ind_D) + + # Must raise a ValueError if the matrix is not of correct shape + with pytest.raises(ValueError): + getattr(nbrs_D, method)(X) + + target = np.arange(X.shape[0]) + for Est in estimators: + est = Est(metric="euclidean") + est.radius = est.n_neighbors = 1 + pred_X = est.fit(X, target).predict(Y) + est.metric = "precomputed" + pred_D = est.fit(DXX, target).predict(DYX) + assert_allclose(pred_X, pred_D) + + +def test_precomputed_dense(): + def make_train_test(X_train, X_test): + return ( + metrics.pairwise_distances(X_train), + metrics.pairwise_distances(X_test, X_train), + ) + + estimators = [ + neighbors.KNeighborsClassifier, + neighbors.KNeighborsRegressor, + neighbors.RadiusNeighborsClassifier, + neighbors.RadiusNeighborsRegressor, + ] + check_precomputed(make_train_test, estimators) + + +@pytest.mark.parametrize("fmt", ["csr", "lil"]) +def test_precomputed_sparse_knn(fmt): + def make_train_test(X_train, X_test): + nn = neighbors.NearestNeighbors(n_neighbors=3 + 1).fit(X_train) + return ( + nn.kneighbors_graph(X_train, mode="distance").asformat(fmt), + nn.kneighbors_graph(X_test, mode="distance").asformat(fmt), + ) + + # We do not test RadiusNeighborsClassifier and RadiusNeighborsRegressor + # since the precomputed neighbors graph is built with k neighbors only. + estimators = [ + neighbors.KNeighborsClassifier, + neighbors.KNeighborsRegressor, + ] + check_precomputed(make_train_test, estimators) + + +@pytest.mark.parametrize("fmt", ["csr", "lil"]) +def test_precomputed_sparse_radius(fmt): + def make_train_test(X_train, X_test): + nn = neighbors.NearestNeighbors(radius=1).fit(X_train) + return ( + nn.radius_neighbors_graph(X_train, mode="distance").asformat(fmt), + nn.radius_neighbors_graph(X_test, mode="distance").asformat(fmt), + ) + + # We do not test KNeighborsClassifier and KNeighborsRegressor + # since the precomputed neighbors graph is built with a radius. + estimators = [ + neighbors.RadiusNeighborsClassifier, + neighbors.RadiusNeighborsRegressor, + ] + check_precomputed(make_train_test, estimators) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_is_sorted_by_data(csr_container): + # Test that _is_sorted_by_data works as expected. In CSR sparse matrix, + # entries in each row can be sorted by indices, by data, or unsorted. + # _is_sorted_by_data should return True when entries are sorted by data, + # and False in all other cases. 
+ + # Test with sorted single row sparse array + X = csr_container(np.arange(10).reshape(1, 10)) + assert _is_sorted_by_data(X) + # Test with unsorted 1D array + X[0, 2] = 5 + assert not _is_sorted_by_data(X) + + # Test when the data is sorted in each sample, but not necessarily + # between samples + X = csr_container([[0, 1, 2], [3, 0, 0], [3, 4, 0], [1, 0, 2]]) + assert _is_sorted_by_data(X) + + # Test with duplicates entries in X.indptr + data, indices, indptr = [0, 4, 2, 2], [0, 1, 1, 1], [0, 2, 2, 4] + X = csr_container((data, indices, indptr), shape=(3, 3)) + assert _is_sorted_by_data(X) + + +@pytest.mark.filterwarnings("ignore:EfficiencyWarning") +@pytest.mark.parametrize("function", [sort_graph_by_row_values, _check_precomputed]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sort_graph_by_row_values(function, csr_container): + # Test that sort_graph_by_row_values returns a graph sorted by row values + X = csr_container(np.abs(np.random.RandomState(42).randn(10, 10))) + assert not _is_sorted_by_data(X) + Xt = function(X) + assert _is_sorted_by_data(Xt) + + # test with a different number of nonzero entries for each sample + mask = np.random.RandomState(42).randint(2, size=(10, 10)) + X = X.toarray() + X[mask == 1] = 0 + X = csr_container(X) + assert not _is_sorted_by_data(X) + Xt = function(X) + assert _is_sorted_by_data(Xt) + + +@pytest.mark.filterwarnings("ignore:EfficiencyWarning") +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sort_graph_by_row_values_copy(csr_container): + # Test if the sorting is done inplace if X is CSR, so that Xt is X. + X_ = csr_container(np.abs(np.random.RandomState(42).randn(10, 10))) + assert not _is_sorted_by_data(X_) + + # sort_graph_by_row_values is done inplace if copy=False + X = X_.copy() + assert sort_graph_by_row_values(X).data is X.data + + X = X_.copy() + assert sort_graph_by_row_values(X, copy=False).data is X.data + + X = X_.copy() + assert sort_graph_by_row_values(X, copy=True).data is not X.data + + # _check_precomputed is never done inplace + X = X_.copy() + assert _check_precomputed(X).data is not X.data + + # do not raise if X is not CSR and copy=True + sort_graph_by_row_values(X.tocsc(), copy=True) + + # raise if X is not CSR and copy=False + with pytest.raises(ValueError, match="Use copy=True to allow the conversion"): + sort_graph_by_row_values(X.tocsc(), copy=False) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sort_graph_by_row_values_warning(csr_container): + # Test that the parameter warn_when_not_sorted works as expected. 
+ X = csr_container(np.abs(np.random.RandomState(42).randn(10, 10))) + assert not _is_sorted_by_data(X) + + # warning + with pytest.warns(EfficiencyWarning, match="was not sorted by row values"): + sort_graph_by_row_values(X, copy=True) + with pytest.warns(EfficiencyWarning, match="was not sorted by row values"): + sort_graph_by_row_values(X, copy=True, warn_when_not_sorted=True) + with pytest.warns(EfficiencyWarning, match="was not sorted by row values"): + _check_precomputed(X) + + # no warning + with warnings.catch_warnings(): + warnings.simplefilter("error") + sort_graph_by_row_values(X, copy=True, warn_when_not_sorted=False) + + +@pytest.mark.parametrize( + "sparse_container", DOK_CONTAINERS + BSR_CONTAINERS + DIA_CONTAINERS +) +def test_sort_graph_by_row_values_bad_sparse_format(sparse_container): + # Test that sort_graph_by_row_values and _check_precomputed error on bad formats + X = sparse_container(np.abs(np.random.RandomState(42).randn(10, 10))) + with pytest.raises(TypeError, match="format is not supported"): + sort_graph_by_row_values(X) + with pytest.raises(TypeError, match="format is not supported"): + _check_precomputed(X) + + +@pytest.mark.filterwarnings("ignore:EfficiencyWarning") +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_precomputed_sparse_invalid(csr_container): + dist = np.array([[0.0, 2.0, 1.0], [2.0, 0.0, 3.0], [1.0, 3.0, 0.0]]) + dist_csr = csr_container(dist) + neigh = neighbors.NearestNeighbors(n_neighbors=1, metric="precomputed") + neigh.fit(dist_csr) + neigh.kneighbors(None, n_neighbors=1) + neigh.kneighbors(np.array([[0.0, 0.0, 0.0]]), n_neighbors=2) + + # Ensures enough number of nearest neighbors + dist = np.array([[0.0, 2.0, 0.0], [2.0, 0.0, 3.0], [0.0, 3.0, 0.0]]) + dist_csr = csr_container(dist) + neigh.fit(dist_csr) + msg = "2 neighbors per samples are required, but some samples have only 1" + with pytest.raises(ValueError, match=msg): + neigh.kneighbors(None, n_neighbors=1) + + # Checks error with inconsistent distance matrix + dist = np.array([[5.0, 2.0, 1.0], [-2.0, 0.0, 3.0], [1.0, 3.0, 0.0]]) + dist_csr = csr_container(dist) + msg = "Negative values in data passed to precomputed distance matrix." 
+ with pytest.raises(ValueError, match=msg): + neigh.kneighbors(dist_csr, n_neighbors=1) + + +def test_precomputed_cross_validation(): + # Ensure array is split correctly + rng = np.random.RandomState(0) + X = rng.rand(20, 2) + D = pairwise_distances(X, metric="euclidean") + y = rng.randint(3, size=20) + for Est in ( + neighbors.KNeighborsClassifier, + neighbors.RadiusNeighborsClassifier, + neighbors.KNeighborsRegressor, + neighbors.RadiusNeighborsRegressor, + ): + metric_score = cross_val_score(Est(), X, y) + precomp_score = cross_val_score(Est(metric="precomputed"), D, y) + assert_array_equal(metric_score, precomp_score) + + +def test_unsupervised_radius_neighbors( + global_dtype, n_samples=20, n_features=5, n_query_pts=2, radius=0.5, random_state=0 +): + # Test unsupervised radius-based query + rng = np.random.RandomState(random_state) + + X = rng.rand(n_samples, n_features).astype(global_dtype, copy=False) + + test = rng.rand(n_query_pts, n_features).astype(global_dtype, copy=False) + + for p in P: + results = [] + + for algorithm in ALGORITHMS: + neigh = neighbors.NearestNeighbors(radius=radius, algorithm=algorithm, p=p) + neigh.fit(X) + + ind1 = neigh.radius_neighbors(test, return_distance=False) + + # sort the results: this is not done automatically for + # radius searches + dist, ind = neigh.radius_neighbors(test, return_distance=True) + for d, i, i1 in zip(dist, ind, ind1): + j = d.argsort() + d[:] = d[j] + i[:] = i[j] + i1[:] = i1[j] + results.append((dist, ind)) + + assert_allclose(np.concatenate(list(ind)), np.concatenate(list(ind1))) + + for i in range(len(results) - 1): + assert_allclose( + np.concatenate(list(results[i][0])), + np.concatenate(list(results[i + 1][0])), + ) + assert_allclose( + np.concatenate(list(results[i][1])), + np.concatenate(list(results[i + 1][1])), + ) + + +@pytest.mark.parametrize("algorithm", ALGORITHMS) +@pytest.mark.parametrize("weights", WEIGHTS) +def test_kneighbors_classifier( + global_dtype, + algorithm, + weights, + n_samples=40, + n_features=5, + n_test_pts=10, + n_neighbors=5, + random_state=0, +): + # Test k-neighbors classification + rng = np.random.RandomState(random_state) + X = 2 * rng.rand(n_samples, n_features).astype(global_dtype, copy=False) - 1 + y = ((X**2).sum(axis=1) < 0.5).astype(int) + y_str = y.astype(str) + + knn = neighbors.KNeighborsClassifier( + n_neighbors=n_neighbors, weights=weights, algorithm=algorithm + ) + knn.fit(X, y) + epsilon = 1e-5 * (2 * rng.rand(1, n_features) - 1) + y_pred = knn.predict(X[:n_test_pts] + epsilon) + assert_array_equal(y_pred, y[:n_test_pts]) + # Test prediction with y_str + knn.fit(X, y_str) + y_pred = knn.predict(X[:n_test_pts] + epsilon) + assert_array_equal(y_pred, y_str[:n_test_pts]) + + +def test_kneighbors_classifier_float_labels( + global_dtype, + n_samples=40, + n_features=5, + n_test_pts=10, + n_neighbors=5, + random_state=0, +): + # Test k-neighbors classification + rng = np.random.RandomState(random_state) + X = 2 * rng.rand(n_samples, n_features).astype(global_dtype, copy=False) - 1 + y = ((X**2).sum(axis=1) < 0.5).astype(int) + + knn = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors) + knn.fit(X, y.astype(float)) + epsilon = 1e-5 * (2 * rng.rand(1, n_features) - 1) + y_pred = knn.predict(X[:n_test_pts] + epsilon) + assert_array_equal(y_pred, y[:n_test_pts]) + + +def test_kneighbors_classifier_predict_proba(global_dtype): + # Test KNeighborsClassifier.predict_proba() method + X = np.array( + [[0, 2, 0], [0, 2, 1], [2, 0, 0], [2, 2, 0], [0, 0, 2], [0, 0, 1]] + 
).astype(global_dtype, copy=False) + y = np.array([4, 4, 5, 5, 1, 1]) + cls = neighbors.KNeighborsClassifier(n_neighbors=3, p=1) # cityblock dist + cls.fit(X, y) + y_prob = cls.predict_proba(X) + real_prob = ( + np.array( + [ + [0, 2, 1], + [1, 2, 0], + [1, 0, 2], + [0, 1, 2], + [2, 1, 0], + [2, 1, 0], + ] + ) + / 3.0 + ) + assert_array_equal(real_prob, y_prob) + # Check that it also works with non integer labels + cls.fit(X, y.astype(str)) + y_prob = cls.predict_proba(X) + assert_array_equal(real_prob, y_prob) + # Check that it works with weights='distance' + cls = neighbors.KNeighborsClassifier(n_neighbors=2, p=1, weights="distance") + cls.fit(X, y) + y_prob = cls.predict_proba(np.array([[0, 2, 0], [2, 2, 2]])) + real_prob = np.array([[0, 1, 0], [0, 0.4, 0.6]]) + assert_allclose(real_prob, y_prob) + + +@pytest.mark.parametrize("algorithm", ALGORITHMS) +@pytest.mark.parametrize("weights", WEIGHTS) +def test_radius_neighbors_classifier( + global_dtype, + algorithm, + weights, + n_samples=40, + n_features=5, + n_test_pts=10, + radius=0.5, + random_state=0, +): + # Test radius-based classification + rng = np.random.RandomState(random_state) + X = 2 * rng.rand(n_samples, n_features).astype(global_dtype, copy=False) - 1 + y = ((X**2).sum(axis=1) < radius).astype(int) + y_str = y.astype(str) + + neigh = neighbors.RadiusNeighborsClassifier( + radius=radius, weights=weights, algorithm=algorithm + ) + neigh.fit(X, y) + epsilon = 1e-5 * (2 * rng.rand(1, n_features) - 1) + y_pred = neigh.predict(X[:n_test_pts] + epsilon) + assert_array_equal(y_pred, y[:n_test_pts]) + neigh.fit(X, y_str) + y_pred = neigh.predict(X[:n_test_pts] + epsilon) + assert_array_equal(y_pred, y_str[:n_test_pts]) + + +@pytest.mark.parametrize("algorithm", ALGORITHMS) +@pytest.mark.parametrize("weights", WEIGHTS) +@pytest.mark.parametrize("outlier_label", [0, -1, None]) +def test_radius_neighbors_classifier_when_no_neighbors( + global_dtype, algorithm, weights, outlier_label +): + # Test radius-based classifier when no neighbors found. + # In this case it should rise an informative exception + + X = np.array([[1.0, 1.0], [2.0, 2.0]], dtype=global_dtype) + y = np.array([1, 2]) + radius = 0.1 + + # no outliers + z1 = np.array([[1.01, 1.01], [2.01, 2.01]], dtype=global_dtype) + + # one outlier + z2 = np.array([[1.01, 1.01], [1.4, 1.4]], dtype=global_dtype) + + rnc = neighbors.RadiusNeighborsClassifier + clf = rnc( + radius=radius, + weights=weights, + algorithm=algorithm, + outlier_label=outlier_label, + ) + clf.fit(X, y) + assert_array_equal(np.array([1, 2]), clf.predict(z1)) + if outlier_label is None: + with pytest.raises(ValueError): + clf.predict(z2) + + +@pytest.mark.parametrize("algorithm", ALGORITHMS) +@pytest.mark.parametrize("weights", WEIGHTS) +def test_radius_neighbors_classifier_outlier_labeling(global_dtype, algorithm, weights): + # Test radius-based classifier when no neighbors found and outliers + # are labeled. 
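The outlier-labeling behaviour checked around here can be summarised in a few lines: queries with no training neighbors inside the radius receive `outlier_label` instead of raising. A minimal sketch with hypothetical data:

import numpy as np
from sklearn.neighbors import RadiusNeighborsClassifier

X = np.array([[1.0], [1.1], [5.0]])
y = np.array([0, 0, 1])
clf = RadiusNeighborsClassifier(radius=0.5, outlier_label=-1).fit(X, y)
# The second query has no training point within radius 0.5, so it gets the
# outlier label (and a UserWarning, since -1 is not among the training classes).
clf.predict(np.array([[1.05], [3.0]]))  # -> array([ 0, -1])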
+ + X = np.array( + [[1.0, 1.0], [2.0, 2.0], [0.99, 0.99], [0.98, 0.98], [2.01, 2.01]], + dtype=global_dtype, + ) + y = np.array([1, 2, 1, 1, 2]) + radius = 0.1 + + # no outliers + z1 = np.array([[1.01, 1.01], [2.01, 2.01]], dtype=global_dtype) + + # one outlier + z2 = np.array([[1.4, 1.4], [1.01, 1.01], [2.01, 2.01]], dtype=global_dtype) + + correct_labels1 = np.array([1, 2]) + correct_labels2 = np.array([-1, 1, 2]) + outlier_proba = np.array([0, 0]) + + clf = neighbors.RadiusNeighborsClassifier( + radius=radius, weights=weights, algorithm=algorithm, outlier_label=-1 + ) + clf.fit(X, y) + assert_array_equal(correct_labels1, clf.predict(z1)) + with pytest.warns(UserWarning, match="Outlier label -1 is not in training classes"): + assert_array_equal(correct_labels2, clf.predict(z2)) + with pytest.warns(UserWarning, match="Outlier label -1 is not in training classes"): + assert_allclose(outlier_proba, clf.predict_proba(z2)[0]) + + # test outlier_labeling of using predict_proba() + RNC = neighbors.RadiusNeighborsClassifier + X = np.array([[0], [1], [2], [3], [4], [5], [6], [7], [8], [9]], dtype=global_dtype) + y = np.array([0, 2, 2, 1, 1, 1, 3, 3, 3, 3]) + + # test outlier_label scalar verification + def check_array_exception(): + clf = RNC(radius=1, outlier_label=[[5]]) + clf.fit(X, y) + + with pytest.raises(TypeError): + check_array_exception() + + # test invalid outlier_label dtype + def check_dtype_exception(): + clf = RNC(radius=1, outlier_label="a") + clf.fit(X, y) + + with pytest.raises(TypeError): + check_dtype_exception() + + # test most frequent + clf = RNC(radius=1, outlier_label="most_frequent") + clf.fit(X, y) + proba = clf.predict_proba([[1], [15]]) + assert_array_equal(proba[1, :], [0, 0, 0, 1]) + + # test manual label in y + clf = RNC(radius=1, outlier_label=1) + clf.fit(X, y) + proba = clf.predict_proba([[1], [15]]) + assert_array_equal(proba[1, :], [0, 1, 0, 0]) + pred = clf.predict([[1], [15]]) + assert_array_equal(pred, [2, 1]) + + # test manual label out of y warning + def check_warning(): + clf = RNC(radius=1, outlier_label=4) + clf.fit(X, y) + clf.predict_proba([[1], [15]]) + + with pytest.warns(UserWarning): + check_warning() + + # test multi output same outlier label + y_multi = [ + [0, 1], + [2, 1], + [2, 2], + [1, 2], + [1, 2], + [1, 3], + [3, 3], + [3, 3], + [3, 0], + [3, 0], + ] + clf = RNC(radius=1, outlier_label=1) + clf.fit(X, y_multi) + proba = clf.predict_proba([[7], [15]]) + assert_array_equal(proba[1][1, :], [0, 1, 0, 0]) + pred = clf.predict([[7], [15]]) + assert_array_equal(pred[1, :], [1, 1]) + + # test multi output different outlier label + y_multi = [ + [0, 0], + [2, 2], + [2, 2], + [1, 1], + [1, 1], + [1, 1], + [3, 3], + [3, 3], + [3, 3], + [3, 3], + ] + clf = RNC(radius=1, outlier_label=[0, 1]) + clf.fit(X, y_multi) + proba = clf.predict_proba([[7], [15]]) + assert_array_equal(proba[0][1, :], [1, 0, 0, 0]) + assert_array_equal(proba[1][1, :], [0, 1, 0, 0]) + pred = clf.predict([[7], [15]]) + assert_array_equal(pred[1, :], [0, 1]) + + # test inconsistent outlier label list length + def check_exception(): + clf = RNC(radius=1, outlier_label=[0, 1, 2]) + clf.fit(X, y_multi) + + with pytest.raises(ValueError): + check_exception() + + +def test_radius_neighbors_classifier_zero_distance(): + # Test radius-based classifier, when distance to a sample is zero. 
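Several tests in this file pass a callable as `weights` (via the `_weight_func` helper defined earlier in the module, not shown here). The general contract is sketched below with a hypothetical `inverse_square` callable: it receives the distance array and must return weights of the same shape, handling zero distances itself.

import numpy as np
from sklearn.neighbors import KNeighborsRegressor

def inverse_square(dist):
    # dist has shape (n_queries, n_neighbors); return weights of the same shape
    with np.errstate(divide="ignore"):
        weights = 1.0 / dist**2
    weights[np.isinf(weights)] = 1e6  # cap the weight of zero-distance neighbors
    return weights

X = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array([0.0, 1.0, 2.0, 3.0])
reg = KNeighborsRegressor(n_neighbors=2, weights=inverse_square).fit(X, y)
reg.predict([[1.2]])  # weighted average of the two nearest targets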
+ + X = np.array([[1.0, 1.0], [2.0, 2.0]]) + y = np.array([1, 2]) + radius = 0.1 + + z1 = np.array([[1.01, 1.01], [2.0, 2.0]]) + correct_labels1 = np.array([1, 2]) + + weight_func = _weight_func + + for algorithm in ALGORITHMS: + for weights in ["uniform", "distance", weight_func]: + clf = neighbors.RadiusNeighborsClassifier( + radius=radius, weights=weights, algorithm=algorithm + ) + clf.fit(X, y) + with np.errstate(invalid="ignore"): + # Ignore the warning raised in _weight_func when making + # predictions with null distances resulting in np.inf values. + assert_array_equal(correct_labels1, clf.predict(z1)) + + +def test_neighbors_regressors_zero_distance(): + # Test radius-based regressor, when distance to a sample is zero. + + X = np.array([[1.0, 1.0], [1.0, 1.0], [2.0, 2.0], [2.5, 2.5]]) + y = np.array([1.0, 1.5, 2.0, 0.0]) + radius = 0.2 + z = np.array([[1.1, 1.1], [2.0, 2.0]]) + + rnn_correct_labels = np.array([1.25, 2.0]) + + knn_correct_unif = np.array([1.25, 1.0]) + knn_correct_dist = np.array([1.25, 2.0]) + + for algorithm in ALGORITHMS: + # we don't test for weights=_weight_func since user will be expected + # to handle zero distances themselves in the function. + for weights in ["uniform", "distance"]: + rnn = neighbors.RadiusNeighborsRegressor( + radius=radius, weights=weights, algorithm=algorithm + ) + rnn.fit(X, y) + assert_allclose(rnn_correct_labels, rnn.predict(z)) + + for weights, corr_labels in zip( + ["uniform", "distance"], [knn_correct_unif, knn_correct_dist] + ): + knn = neighbors.KNeighborsRegressor( + n_neighbors=2, weights=weights, algorithm=algorithm + ) + knn.fit(X, y) + assert_allclose(corr_labels, knn.predict(z)) + + +def test_radius_neighbors_boundary_handling(): + """Test whether points lying on boundary are handled consistently + + Also ensures that even with only one query point, an object array + is returned rather than a 2d array. 
+ """ + + X = np.array([[1.5], [3.0], [3.01]]) + radius = 3.0 + + for algorithm in ALGORITHMS: + nbrs = neighbors.NearestNeighbors(radius=radius, algorithm=algorithm).fit(X) + results = nbrs.radius_neighbors([[0.0]], return_distance=False) + assert results.shape == (1,) + assert results.dtype == object + assert_array_equal(results[0], [0, 1]) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_radius_neighbors_returns_array_of_objects(csr_container): + # check that we can pass precomputed distances to + # NearestNeighbors.radius_neighbors() + # non-regression test for + # https://github.com/scikit-learn/scikit-learn/issues/16036 + X = csr_container(np.ones((4, 4))) + X.setdiag([0, 0, 0, 0]) + + nbrs = neighbors.NearestNeighbors( + radius=0.5, algorithm="auto", leaf_size=30, metric="precomputed" + ).fit(X) + neigh_dist, neigh_ind = nbrs.radius_neighbors(X, return_distance=True) + + expected_dist = np.empty(X.shape[0], dtype=object) + expected_dist[:] = [np.array([0]), np.array([0]), np.array([0]), np.array([0])] + expected_ind = np.empty(X.shape[0], dtype=object) + expected_ind[:] = [np.array([0]), np.array([1]), np.array([2]), np.array([3])] + + assert_array_equal(neigh_dist, expected_dist) + assert_array_equal(neigh_ind, expected_ind) + + +@pytest.mark.parametrize("algorithm", ["ball_tree", "kd_tree", "brute"]) +def test_query_equidistant_kth_nn(algorithm): + # For several candidates for the k-th nearest neighbor position, + # the first candidate should be chosen + query_point = np.array([[0, 0]]) + equidistant_points = np.array([[1, 0], [0, 1], [-1, 0], [0, -1]]) + # The 3rd and 4th points should not replace the 2nd point + # for the 2th nearest neighbor position + k = 2 + knn_indices = np.array([[0, 1]]) + nn = neighbors.NearestNeighbors(algorithm=algorithm).fit(equidistant_points) + indices = np.sort(nn.kneighbors(query_point, n_neighbors=k, return_distance=False)) + assert_array_equal(indices, knn_indices) + + +@pytest.mark.parametrize( + ["algorithm", "metric"], + list( + product( + ("kd_tree", "ball_tree", "brute"), + ("euclidean", *DISTANCE_METRIC_OBJS), + ) + ) + + [ + ("brute", "euclidean"), + ("brute", "precomputed"), + ], +) +def test_radius_neighbors_sort_results(algorithm, metric): + # Test radius_neighbors[_graph] output when sort_result is True + + metric = _parse_metric(metric, np.float64) + if isinstance(metric, DistanceMetric): + pytest.skip( + "Metrics of type `DistanceMetric` are not yet supported for radius-neighbor" + " estimators." 
+ ) + n_samples = 10 + rng = np.random.RandomState(42) + X = rng.random_sample((n_samples, 4)) + + if metric == "precomputed": + X = neighbors.radius_neighbors_graph(X, radius=np.inf, mode="distance") + model = neighbors.NearestNeighbors(algorithm=algorithm, metric=metric) + model.fit(X) + + # self.radius_neighbors + distances, indices = model.radius_neighbors(X=X, radius=np.inf, sort_results=True) + for ii in range(n_samples): + assert_array_equal(distances[ii], np.sort(distances[ii])) + + # sort_results=True and return_distance=False + if metric != "precomputed": # no need to raise with precomputed graph + with pytest.raises(ValueError, match="return_distance must be True"): + model.radius_neighbors( + X=X, radius=np.inf, sort_results=True, return_distance=False + ) + + # self.radius_neighbors_graph + graph = model.radius_neighbors_graph( + X=X, radius=np.inf, mode="distance", sort_results=True + ) + assert _is_sorted_by_data(graph) + + +def test_RadiusNeighborsClassifier_multioutput(): + # Test k-NN classifier on multioutput data + rng = check_random_state(0) + n_features = 2 + n_samples = 40 + n_output = 3 + + X = rng.rand(n_samples, n_features) + y = rng.randint(0, 3, (n_samples, n_output)) + + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + + weights = [None, "uniform", "distance", _weight_func] + + for algorithm, weights in product(ALGORITHMS, weights): + # Stack single output prediction + y_pred_so = [] + for o in range(n_output): + rnn = neighbors.RadiusNeighborsClassifier( + weights=weights, algorithm=algorithm + ) + rnn.fit(X_train, y_train[:, o]) + y_pred_so.append(rnn.predict(X_test)) + + y_pred_so = np.vstack(y_pred_so).T + assert y_pred_so.shape == y_test.shape + + # Multioutput prediction + rnn_mo = neighbors.RadiusNeighborsClassifier( + weights=weights, algorithm=algorithm + ) + rnn_mo.fit(X_train, y_train) + y_pred_mo = rnn_mo.predict(X_test) + + assert y_pred_mo.shape == y_test.shape + assert_array_equal(y_pred_mo, y_pred_so) + + +def test_kneighbors_classifier_sparse( + n_samples=40, n_features=5, n_test_pts=10, n_neighbors=5, random_state=0 +): + # Test k-NN classifier on sparse matrices + # Like the above, but with various types of sparse matrices + rng = np.random.RandomState(random_state) + X = 2 * rng.rand(n_samples, n_features) - 1 + X *= X > 0.2 + y = ((X**2).sum(axis=1) < 0.5).astype(int) + + for sparsemat in SPARSE_TYPES: + knn = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors, algorithm="auto") + knn.fit(sparsemat(X), y) + epsilon = 1e-5 * (2 * rng.rand(1, n_features) - 1) + for sparsev in SPARSE_TYPES + (np.asarray,): + X_eps = sparsev(X[:n_test_pts] + epsilon) + y_pred = knn.predict(X_eps) + assert_array_equal(y_pred, y[:n_test_pts]) + + +def test_KNeighborsClassifier_multioutput(): + # Test k-NN classifier on multioutput data + rng = check_random_state(0) + n_features = 5 + n_samples = 50 + n_output = 3 + + X = rng.rand(n_samples, n_features) + y = rng.randint(0, 3, (n_samples, n_output)) + + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + + weights = [None, "uniform", "distance", _weight_func] + + for algorithm, weights in product(ALGORITHMS, weights): + # Stack single output prediction + y_pred_so = [] + y_pred_proba_so = [] + for o in range(n_output): + knn = neighbors.KNeighborsClassifier(weights=weights, algorithm=algorithm) + knn.fit(X_train, y_train[:, o]) + y_pred_so.append(knn.predict(X_test)) + y_pred_proba_so.append(knn.predict_proba(X_test)) + + y_pred_so = np.vstack(y_pred_so).T + 
assert y_pred_so.shape == y_test.shape + assert len(y_pred_proba_so) == n_output + + # Multioutput prediction + knn_mo = neighbors.KNeighborsClassifier(weights=weights, algorithm=algorithm) + knn_mo.fit(X_train, y_train) + y_pred_mo = knn_mo.predict(X_test) + + assert y_pred_mo.shape == y_test.shape + assert_array_equal(y_pred_mo, y_pred_so) + + # Check proba + y_pred_proba_mo = knn_mo.predict_proba(X_test) + assert len(y_pred_proba_mo) == n_output + + for proba_mo, proba_so in zip(y_pred_proba_mo, y_pred_proba_so): + assert_array_equal(proba_mo, proba_so) + + +def test_kneighbors_regressor( + n_samples=40, n_features=5, n_test_pts=10, n_neighbors=3, random_state=0 +): + # Test k-neighbors regression + rng = np.random.RandomState(random_state) + X = 2 * rng.rand(n_samples, n_features) - 1 + y = np.sqrt((X**2).sum(1)) + y /= y.max() + + y_target = y[:n_test_pts] + + weight_func = _weight_func + + for algorithm in ALGORITHMS: + for weights in ["uniform", "distance", weight_func]: + knn = neighbors.KNeighborsRegressor( + n_neighbors=n_neighbors, weights=weights, algorithm=algorithm + ) + knn.fit(X, y) + epsilon = 1e-5 * (2 * rng.rand(1, n_features) - 1) + y_pred = knn.predict(X[:n_test_pts] + epsilon) + assert np.all(abs(y_pred - y_target) < 0.3) + + +def test_KNeighborsRegressor_multioutput_uniform_weight(): + # Test k-neighbors in multi-output regression with uniform weight + rng = check_random_state(0) + n_features = 5 + n_samples = 40 + n_output = 4 + + X = rng.rand(n_samples, n_features) + y = rng.rand(n_samples, n_output) + + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + for algorithm, weights in product(ALGORITHMS, [None, "uniform"]): + knn = neighbors.KNeighborsRegressor(weights=weights, algorithm=algorithm) + knn.fit(X_train, y_train) + + neigh_idx = knn.kneighbors(X_test, return_distance=False) + y_pred_idx = np.array([np.mean(y_train[idx], axis=0) for idx in neigh_idx]) + + y_pred = knn.predict(X_test) + + assert y_pred.shape == y_test.shape + assert y_pred_idx.shape == y_test.shape + assert_allclose(y_pred, y_pred_idx) + + +def test_kneighbors_regressor_multioutput( + n_samples=40, n_features=5, n_test_pts=10, n_neighbors=3, random_state=0 +): + # Test k-neighbors in multi-output regression + rng = np.random.RandomState(random_state) + X = 2 * rng.rand(n_samples, n_features) - 1 + y = np.sqrt((X**2).sum(1)) + y /= y.max() + y = np.vstack([y, y]).T + + y_target = y[:n_test_pts] + + weights = ["uniform", "distance", _weight_func] + for algorithm, weights in product(ALGORITHMS, weights): + knn = neighbors.KNeighborsRegressor( + n_neighbors=n_neighbors, weights=weights, algorithm=algorithm + ) + knn.fit(X, y) + epsilon = 1e-5 * (2 * rng.rand(1, n_features) - 1) + y_pred = knn.predict(X[:n_test_pts] + epsilon) + assert y_pred.shape == y_target.shape + + assert np.all(np.abs(y_pred - y_target) < 0.3) + + +def test_radius_neighbors_regressor( + n_samples=40, n_features=3, n_test_pts=10, radius=0.5, random_state=0 +): + # Test radius-based neighbors regression + rng = np.random.RandomState(random_state) + X = 2 * rng.rand(n_samples, n_features) - 1 + y = np.sqrt((X**2).sum(1)) + y /= y.max() + + y_target = y[:n_test_pts] + + weight_func = _weight_func + + for algorithm in ALGORITHMS: + for weights in ["uniform", "distance", weight_func]: + neigh = neighbors.RadiusNeighborsRegressor( + radius=radius, weights=weights, algorithm=algorithm + ) + neigh.fit(X, y) + epsilon = 1e-5 * (2 * rng.rand(1, n_features) - 1) + y_pred = neigh.predict(X[:n_test_pts] + epsilon) 
+ assert np.all(abs(y_pred - y_target) < radius / 2) + + # test that nan is returned when no nearby observations + for weights in ["uniform", "distance"]: + neigh = neighbors.RadiusNeighborsRegressor( + radius=radius, weights=weights, algorithm="auto" + ) + neigh.fit(X, y) + X_test_nan = np.full((1, n_features), -1.0) + empty_warning_msg = ( + "One or more samples have no neighbors " + "within specified radius; predicting NaN." + ) + with pytest.warns(UserWarning, match=re.escape(empty_warning_msg)): + pred = neigh.predict(X_test_nan) + assert np.all(np.isnan(pred)) + + +def test_RadiusNeighborsRegressor_multioutput_with_uniform_weight(): + # Test radius neighbors in multi-output regression (uniform weight) + + rng = check_random_state(0) + n_features = 5 + n_samples = 40 + n_output = 4 + + X = rng.rand(n_samples, n_features) + y = rng.rand(n_samples, n_output) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + + for algorithm, weights in product(ALGORITHMS, [None, "uniform"]): + rnn = neighbors.RadiusNeighborsRegressor(weights=weights, algorithm=algorithm) + rnn.fit(X_train, y_train) + + neigh_idx = rnn.radius_neighbors(X_test, return_distance=False) + y_pred_idx = np.array([np.mean(y_train[idx], axis=0) for idx in neigh_idx]) + + y_pred_idx = np.array(y_pred_idx) + y_pred = rnn.predict(X_test) + + assert y_pred_idx.shape == y_test.shape + assert y_pred.shape == y_test.shape + assert_allclose(y_pred, y_pred_idx) + + +def test_RadiusNeighborsRegressor_multioutput( + n_samples=40, n_features=5, n_test_pts=10, random_state=0 +): + # Test k-neighbors in multi-output regression with various weight + rng = np.random.RandomState(random_state) + X = 2 * rng.rand(n_samples, n_features) - 1 + y = np.sqrt((X**2).sum(1)) + y /= y.max() + y = np.vstack([y, y]).T + + y_target = y[:n_test_pts] + weights = ["uniform", "distance", _weight_func] + + for algorithm, weights in product(ALGORITHMS, weights): + rnn = neighbors.RadiusNeighborsRegressor(weights=weights, algorithm=algorithm) + rnn.fit(X, y) + epsilon = 1e-5 * (2 * rng.rand(1, n_features) - 1) + y_pred = rnn.predict(X[:n_test_pts] + epsilon) + + assert y_pred.shape == y_target.shape + assert np.all(np.abs(y_pred - y_target) < 0.3) + + +@pytest.mark.filterwarnings("ignore:EfficiencyWarning") +def test_kneighbors_regressor_sparse( + n_samples=40, n_features=5, n_test_pts=10, n_neighbors=5, random_state=0 +): + # Test radius-based regression on sparse matrices + # Like the above, but with various types of sparse matrices + rng = np.random.RandomState(random_state) + X = 2 * rng.rand(n_samples, n_features) - 1 + y = ((X**2).sum(axis=1) < 0.25).astype(int) + + for sparsemat in SPARSE_TYPES: + knn = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors, algorithm="auto") + knn.fit(sparsemat(X), y) + + knn_pre = neighbors.KNeighborsRegressor( + n_neighbors=n_neighbors, metric="precomputed" + ) + knn_pre.fit(pairwise_distances(X, metric="euclidean"), y) + + for sparsev in SPARSE_OR_DENSE: + X2 = sparsev(X) + assert np.mean(knn.predict(X2).round() == y) > 0.95 + + X2_pre = sparsev(pairwise_distances(X, metric="euclidean")) + if sparsev in DOK_CONTAINERS + BSR_CONTAINERS: + msg = "not supported due to its handling of explicit zeros" + with pytest.raises(TypeError, match=msg): + knn_pre.predict(X2_pre) + else: + assert np.mean(knn_pre.predict(X2_pre).round() == y) > 0.95 + + +def test_neighbors_iris(): + # Sanity checks on the iris dataset + # Puts three points of each label in the plane and performs a + # nearest neighbor query on 
points near the decision boundary. + + for algorithm in ALGORITHMS: + clf = neighbors.KNeighborsClassifier(n_neighbors=1, algorithm=algorithm) + clf.fit(iris.data, iris.target) + assert_array_equal(clf.predict(iris.data), iris.target) + + clf.set_params(n_neighbors=9, algorithm=algorithm) + clf.fit(iris.data, iris.target) + assert np.mean(clf.predict(iris.data) == iris.target) > 0.95 + + rgs = neighbors.KNeighborsRegressor(n_neighbors=5, algorithm=algorithm) + rgs.fit(iris.data, iris.target) + assert np.mean(rgs.predict(iris.data).round() == iris.target) > 0.95 + + +def test_neighbors_digits(): + # Sanity check on the digits dataset + # the 'brute' algorithm has been observed to fail if the input + # dtype is uint8 due to overflow in distance calculations. + + X = digits.data.astype("uint8") + Y = digits.target + (n_samples, n_features) = X.shape + train_test_boundary = int(n_samples * 0.8) + train = np.arange(0, train_test_boundary) + test = np.arange(train_test_boundary, n_samples) + (X_train, Y_train, X_test, Y_test) = X[train], Y[train], X[test], Y[test] + + clf = neighbors.KNeighborsClassifier(n_neighbors=1, algorithm="brute") + score_uint8 = clf.fit(X_train, Y_train).score(X_test, Y_test) + score_float = clf.fit(X_train.astype(float, copy=False), Y_train).score( + X_test.astype(float, copy=False), Y_test + ) + assert score_uint8 == score_float + + +def test_kneighbors_graph(): + # Test kneighbors_graph to build the k-Nearest Neighbor graph. + X = np.array([[0, 1], [1.01, 1.0], [2, 0]]) + + # n_neighbors = 1 + A = neighbors.kneighbors_graph(X, 1, mode="connectivity", include_self=True) + assert_array_equal(A.toarray(), np.eye(A.shape[0])) + + A = neighbors.kneighbors_graph(X, 1, mode="distance") + assert_allclose( + A.toarray(), [[0.00, 1.01, 0.0], [1.01, 0.0, 0.0], [0.00, 1.40716026, 0.0]] + ) + + # n_neighbors = 2 + A = neighbors.kneighbors_graph(X, 2, mode="connectivity", include_self=True) + assert_array_equal(A.toarray(), [[1.0, 1.0, 0.0], [1.0, 1.0, 0.0], [0.0, 1.0, 1.0]]) + + A = neighbors.kneighbors_graph(X, 2, mode="distance") + assert_allclose( + A.toarray(), + [ + [0.0, 1.01, 2.23606798], + [1.01, 0.0, 1.40716026], + [2.23606798, 1.40716026, 0.0], + ], + ) + + # n_neighbors = 3 + A = neighbors.kneighbors_graph(X, 3, mode="connectivity", include_self=True) + assert_allclose(A.toarray(), [[1, 1, 1], [1, 1, 1], [1, 1, 1]]) + + +@pytest.mark.parametrize("n_neighbors", [1, 2, 3]) +@pytest.mark.parametrize("mode", ["connectivity", "distance"]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_kneighbors_graph_sparse(n_neighbors, mode, csr_container, seed=36): + # Test kneighbors_graph to build the k-Nearest Neighbor graph + # for sparse input. + rng = np.random.RandomState(seed) + X = rng.randn(10, 10) + Xcsr = csr_container(X) + + assert_allclose( + neighbors.kneighbors_graph(X, n_neighbors, mode=mode).toarray(), + neighbors.kneighbors_graph(Xcsr, n_neighbors, mode=mode).toarray(), + ) + + +def test_radius_neighbors_graph(): + # Test radius_neighbors_graph to build the Nearest Neighbor graph. 
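The two graph modes exercised above behave as follows; a tiny sketch with hypothetical data:

import numpy as np
from sklearn.neighbors import kneighbors_graph

X = np.array([[0.0], [1.0], [3.0]])
A_conn = kneighbors_graph(X, n_neighbors=1, mode="connectivity", include_self=True)
A_dist = kneighbors_graph(X, n_neighbors=1, mode="distance")  # self excluded by default
A_conn.toarray()  # 0/1 adjacency; with include_self=True each sample marks itself
A_dist.toarray()  # distance from each sample to its nearest *other* sample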
+ X = np.array([[0, 1], [1.01, 1.0], [2, 0]]) + + A = neighbors.radius_neighbors_graph(X, 1.5, mode="connectivity", include_self=True) + assert_array_equal(A.toarray(), [[1.0, 1.0, 0.0], [1.0, 1.0, 1.0], [0.0, 1.0, 1.0]]) + + A = neighbors.radius_neighbors_graph(X, 1.5, mode="distance") + assert_allclose( + A.toarray(), [[0.0, 1.01, 0.0], [1.01, 0.0, 1.40716026], [0.0, 1.40716026, 0.0]] + ) + + +@pytest.mark.parametrize("n_neighbors", [1, 2, 3]) +@pytest.mark.parametrize("mode", ["connectivity", "distance"]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_radius_neighbors_graph_sparse(n_neighbors, mode, csr_container, seed=36): + # Test radius_neighbors_graph to build the Nearest Neighbor graph + # for sparse input. + rng = np.random.RandomState(seed) + X = rng.randn(10, 10) + Xcsr = csr_container(X) + + assert_allclose( + neighbors.radius_neighbors_graph(X, n_neighbors, mode=mode).toarray(), + neighbors.radius_neighbors_graph(Xcsr, n_neighbors, mode=mode).toarray(), + ) + + +@pytest.mark.parametrize( + "Estimator", + [ + neighbors.KNeighborsClassifier, + neighbors.RadiusNeighborsClassifier, + neighbors.KNeighborsRegressor, + neighbors.RadiusNeighborsRegressor, + ], +) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_neighbors_validate_parameters(Estimator, csr_container): + """Additional parameter validation for *Neighbors* estimators not covered by common + validation.""" + X = rng.random_sample((10, 2)) + Xsparse = csr_container(X) + X3 = rng.random_sample((10, 3)) + y = np.ones(10) + + nbrs = Estimator(algorithm="ball_tree", metric="haversine") + msg = "instance is not fitted yet" + with pytest.raises(ValueError, match=msg): + nbrs.predict(X) + msg = "Metric 'haversine' not valid for sparse input." + with pytest.raises(ValueError, match=msg): + ignore_warnings(nbrs.fit(Xsparse, y)) + + nbrs = Estimator(metric="haversine", algorithm="brute") + nbrs.fit(X3, y) + msg = "Haversine distance only valid in 2 dimensions" + with pytest.raises(ValueError, match=msg): + nbrs.predict(X3) + + nbrs = Estimator() + msg = re.escape("Found array with 0 sample(s)") + with pytest.raises(ValueError, match=msg): + nbrs.fit(np.ones((0, 2)), np.ones(0)) + + msg = "Found array with dim 3" + with pytest.raises(ValueError, match=msg): + nbrs.fit(X[:, :, None], y) + nbrs.fit(X, y) + + msg = re.escape("Found array with 0 feature(s)") + with pytest.raises(ValueError, match=msg): + nbrs.predict([[]]) + + +@pytest.mark.parametrize( + "Estimator", + [ + neighbors.KNeighborsClassifier, + neighbors.RadiusNeighborsClassifier, + neighbors.KNeighborsRegressor, + neighbors.RadiusNeighborsRegressor, + ], +) +@pytest.mark.parametrize("n_features", [2, 100]) +@pytest.mark.parametrize("algorithm", ["auto", "brute"]) +def test_neighbors_minkowski_semimetric_algo_warn(Estimator, n_features, algorithm): + """ + Validation of all classes extending NeighborsBase with + Minkowski semi-metrics (i.e. when 0 < p < 1). That proper + Warning is raised for `algorithm="auto"` and "brute". + """ + X = rng.random_sample((10, n_features)) + y = np.ones(10) + + model = Estimator(p=0.1, algorithm=algorithm) + msg = ( + "Mind that for 0 < p < 1, Minkowski metrics are not distance" + " metrics. Continuing the execution with `algorithm='brute'`." 
+ ) + with pytest.warns(UserWarning, match=msg): + model.fit(X, y) + + assert model._fit_method == "brute" + + +@pytest.mark.parametrize( + "Estimator", + [ + neighbors.KNeighborsClassifier, + neighbors.RadiusNeighborsClassifier, + neighbors.KNeighborsRegressor, + neighbors.RadiusNeighborsRegressor, + ], +) +@pytest.mark.parametrize("n_features", [2, 100]) +@pytest.mark.parametrize("algorithm", ["kd_tree", "ball_tree"]) +def test_neighbors_minkowski_semimetric_algo_error(Estimator, n_features, algorithm): + """Check that we raise a proper error if `algorithm!='brute'` and `p<1`.""" + X = rng.random_sample((10, 2)) + y = np.ones(10) + + model = Estimator(algorithm=algorithm, p=0.1) + msg = ( + f'algorithm="{algorithm}" does not support 0 < p < 1 for ' + "the Minkowski metric. To resolve this problem either " + 'set p >= 1 or algorithm="brute".' + ) + with pytest.raises(ValueError, match=msg): + model.fit(X, y) + + +# TODO: remove when NearestNeighbors methods uses parameter validation mechanism +def test_nearest_neighbors_validate_params(): + """Validate parameter of NearestNeighbors.""" + X = rng.random_sample((10, 2)) + + nbrs = neighbors.NearestNeighbors().fit(X) + msg = ( + 'Unsupported mode, must be one of "connectivity", or "distance" but got "blah"' + " instead" + ) + with pytest.raises(ValueError, match=msg): + nbrs.kneighbors_graph(X, mode="blah") + with pytest.raises(ValueError, match=msg): + nbrs.radius_neighbors_graph(X, mode="blah") + + +@pytest.mark.parametrize( + "metric", + sorted( + set(neighbors.VALID_METRICS["ball_tree"]).intersection( + neighbors.VALID_METRICS["brute"] + ) + - set(["pyfunc", *BOOL_METRICS]) + ) + + DISTANCE_METRIC_OBJS, +) +def test_neighbors_metrics( + global_dtype, + global_random_seed, + metric, + n_samples=20, + n_features=3, + n_query_pts=2, + n_neighbors=5, +): + rng = np.random.RandomState(global_random_seed) + + metric = _parse_metric(metric, global_dtype) + + # Test computing the neighbors for various metrics + algorithms = ["brute", "ball_tree", "kd_tree"] + X_train = rng.rand(n_samples, n_features).astype(global_dtype, copy=False) + X_test = rng.rand(n_query_pts, n_features).astype(global_dtype, copy=False) + + metric_params_list = _generate_test_params_for(metric, n_features) + + for metric_params in metric_params_list: + # Some metric (e.g. Weighted minkowski) are not supported by KDTree + exclude_kd_tree = ( + False + if isinstance(metric, DistanceMetric) + else metric not in neighbors.VALID_METRICS["kd_tree"] + or ("minkowski" in metric and "w" in metric_params) + ) + results = {} + p = metric_params.pop("p", 2) + for algorithm in algorithms: + if isinstance(metric, DistanceMetric) and global_dtype == np.float32: + if "tree" in algorithm: # pragma: nocover + pytest.skip( + "Neither KDTree nor BallTree support 32-bit distance metric" + " objects." 
+ ) + neigh = neighbors.NearestNeighbors( + n_neighbors=n_neighbors, + algorithm=algorithm, + metric=metric, + p=p, + metric_params=metric_params, + ) + + if exclude_kd_tree and algorithm == "kd_tree": + with pytest.raises(ValueError): + neigh.fit(X_train) + continue + + # Haversine distance only accepts 2D data + if metric == "haversine": + feature_sl = slice(None, 2) + X_train = np.ascontiguousarray(X_train[:, feature_sl]) + X_test = np.ascontiguousarray(X_test[:, feature_sl]) + + neigh.fit(X_train) + results[algorithm] = neigh.kneighbors(X_test, return_distance=True) + + brute_dst, brute_idx = results["brute"] + ball_tree_dst, ball_tree_idx = results["ball_tree"] + + # The returned distances are always in float64 regardless of the input dtype + # We need to adjust the tolerance w.r.t the input dtype + rtol = 1e-7 if global_dtype == np.float64 else 1e-4 + + assert_allclose(brute_dst, ball_tree_dst, rtol=rtol) + assert_array_equal(brute_idx, ball_tree_idx) + + if not exclude_kd_tree: + kd_tree_dst, kd_tree_idx = results["kd_tree"] + assert_allclose(brute_dst, kd_tree_dst, rtol=rtol) + assert_array_equal(brute_idx, kd_tree_idx) + + assert_allclose(ball_tree_dst, kd_tree_dst, rtol=rtol) + assert_array_equal(ball_tree_idx, kd_tree_idx) + + +# TODO: Remove ignore_warnings when minimum supported SciPy version is 1.17 +# Some scipy metrics are deprecated (depending on the scipy version) but we +# still want to test them. +@ignore_warnings(category=DeprecationWarning) +@pytest.mark.parametrize( + "metric", sorted(set(neighbors.VALID_METRICS["brute"]) - set(["precomputed"])) +) +def test_kneighbors_brute_backend( + metric, + global_dtype, + global_random_seed, + n_samples=2000, + n_features=30, + n_query_pts=5, + n_neighbors=5, +): + rng = np.random.RandomState(global_random_seed) + # Both backend for the 'brute' algorithm of kneighbors must give identical results. 
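The agreement check used above follows a simple pattern: fit the same data with each tree or brute-force algorithm and compare the `kneighbors` output. A condensed sketch with hypothetical data:

import numpy as np
from sklearn.neighbors import NearestNeighbors

X = np.random.RandomState(0).rand(30, 3)
results = {}
for algorithm in ("brute", "ball_tree", "kd_tree"):
    nn = NearestNeighbors(n_neighbors=4, algorithm=algorithm).fit(X)
    results[algorithm] = nn.kneighbors(X[:5])
np.testing.assert_allclose(results["brute"][0], results["kd_tree"][0])
np.testing.assert_array_equal(results["brute"][1], results["ball_tree"][1])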
+ X_train = rng.rand(n_samples, n_features).astype(global_dtype, copy=False) + X_test = rng.rand(n_query_pts, n_features).astype(global_dtype, copy=False) + + # Haversine distance only accepts 2D data + if metric == "haversine": + feature_sl = slice(None, 2) + X_train = np.ascontiguousarray(X_train[:, feature_sl]) + X_test = np.ascontiguousarray(X_test[:, feature_sl]) + + if metric in PAIRWISE_BOOLEAN_FUNCTIONS: + X_train = X_train > 0.5 + X_test = X_test > 0.5 + + metric_params_list = _generate_test_params_for(metric, n_features) + + for metric_params in metric_params_list: + p = metric_params.pop("p", 2) + + neigh = neighbors.NearestNeighbors( + n_neighbors=n_neighbors, + algorithm="brute", + metric=metric, + p=p, + metric_params=metric_params, + ) + + neigh.fit(X_train) + + with config_context(enable_cython_pairwise_dist=False): + # Use the legacy backend for brute + legacy_brute_dst, legacy_brute_idx = neigh.kneighbors( + X_test, return_distance=True + ) + with config_context(enable_cython_pairwise_dist=True): + # Use the pairwise-distances reduction backend for brute + pdr_brute_dst, pdr_brute_idx = neigh.kneighbors( + X_test, return_distance=True + ) + + assert_compatible_argkmin_results( + legacy_brute_dst, pdr_brute_dst, legacy_brute_idx, pdr_brute_idx + ) + + +def test_callable_metric(): + def custom_metric(x1, x2): + return np.sqrt(np.sum(x1**2 + x2**2)) + + X = np.random.RandomState(42).rand(20, 2) + nbrs1 = neighbors.NearestNeighbors( + n_neighbors=3, algorithm="auto", metric=custom_metric + ) + nbrs2 = neighbors.NearestNeighbors( + n_neighbors=3, algorithm="brute", metric=custom_metric + ) + + nbrs1.fit(X) + nbrs2.fit(X) + + dist1, ind1 = nbrs1.kneighbors(X) + dist2, ind2 = nbrs2.kneighbors(X) + + assert_allclose(dist1, dist2) + + +@pytest.mark.parametrize( + "metric", neighbors.VALID_METRICS["brute"] + DISTANCE_METRIC_OBJS +) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_valid_brute_metric_for_auto_algorithm( + global_dtype, metric, csr_container, n_samples=20, n_features=12 +): + metric = _parse_metric(metric, global_dtype) + + X = rng.rand(n_samples, n_features).astype(global_dtype, copy=False) + Xcsr = csr_container(X) + + metric_params_list = _generate_test_params_for(metric, n_features) + + if metric == "precomputed": + X_precomputed = rng.random_sample((10, 4)) + Y_precomputed = rng.random_sample((3, 4)) + DXX = metrics.pairwise_distances(X_precomputed, metric="euclidean") + DYX = metrics.pairwise_distances( + Y_precomputed, X_precomputed, metric="euclidean" + ) + nb_p = neighbors.NearestNeighbors(n_neighbors=3, metric="precomputed") + nb_p.fit(DXX) + nb_p.kneighbors(DYX) + + else: + for metric_params in metric_params_list: + nn = neighbors.NearestNeighbors( + n_neighbors=3, + algorithm="auto", + metric=metric, + metric_params=metric_params, + ) + # Haversine distance only accepts 2D data + if metric == "haversine": + feature_sl = slice(None, 2) + X = np.ascontiguousarray(X[:, feature_sl]) + + nn.fit(X) + nn.kneighbors(X) + + if metric in VALID_METRICS_SPARSE["brute"]: + nn = neighbors.NearestNeighbors( + n_neighbors=3, algorithm="auto", metric=metric + ).fit(Xcsr) + nn.kneighbors(Xcsr) + + +def test_metric_params_interface(): + X = rng.rand(5, 5) + y = rng.randint(0, 2, 5) + est = neighbors.KNeighborsClassifier(metric_params={"p": 3}) + with pytest.warns(SyntaxWarning): + est.fit(X, y) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_predict_sparse_ball_kd_tree(csr_container): + rng = np.random.RandomState(0) + X = 
rng.rand(5, 5) + y = rng.randint(0, 2, 5) + nbrs1 = neighbors.KNeighborsClassifier(1, algorithm="kd_tree") + nbrs2 = neighbors.KNeighborsRegressor(1, algorithm="ball_tree") + for model in [nbrs1, nbrs2]: + model.fit(X, y) + with pytest.raises(ValueError): + model.predict(csr_container(X)) + + +def test_non_euclidean_kneighbors(): + rng = np.random.RandomState(0) + X = rng.rand(5, 5) + + # Find a reasonable radius. + dist_array = pairwise_distances(X).flatten() + np.sort(dist_array) + radius = dist_array[15] + + # Test kneighbors_graph + for metric in ["manhattan", "chebyshev"]: + nbrs_graph = neighbors.kneighbors_graph( + X, 3, metric=metric, mode="connectivity", include_self=True + ).toarray() + nbrs1 = neighbors.NearestNeighbors(n_neighbors=3, metric=metric).fit(X) + assert_array_equal(nbrs_graph, nbrs1.kneighbors_graph(X).toarray()) + + # Test radiusneighbors_graph + for metric in ["manhattan", "chebyshev"]: + nbrs_graph = neighbors.radius_neighbors_graph( + X, radius, metric=metric, mode="connectivity", include_self=True + ).toarray() + nbrs1 = neighbors.NearestNeighbors(metric=metric, radius=radius).fit(X) + assert_array_equal(nbrs_graph, nbrs1.radius_neighbors_graph(X).toarray()) + + # Raise error when wrong parameters are supplied, + X_nbrs = neighbors.NearestNeighbors(n_neighbors=3, metric="manhattan") + X_nbrs.fit(X) + with pytest.raises(ValueError): + neighbors.kneighbors_graph(X_nbrs, 3, metric="euclidean") + X_nbrs = neighbors.NearestNeighbors(radius=radius, metric="manhattan") + X_nbrs.fit(X) + with pytest.raises(ValueError): + neighbors.radius_neighbors_graph(X_nbrs, radius, metric="euclidean") + + +def check_object_arrays(nparray, list_check): + for ind, ele in enumerate(nparray): + assert_array_equal(ele, list_check[ind]) + + +def test_k_and_radius_neighbors_train_is_not_query(): + # Test kneighbors et.al when query is not training data + + for algorithm in ALGORITHMS: + nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm=algorithm) + + X = [[0], [1]] + nn.fit(X) + test_data = [[2], [1]] + + # Test neighbors. + dist, ind = nn.kneighbors(test_data) + assert_array_equal(dist, [[1], [0]]) + assert_array_equal(ind, [[1], [1]]) + dist, ind = nn.radius_neighbors([[2], [1]], radius=1.5) + check_object_arrays(dist, [[1], [1, 0]]) + check_object_arrays(ind, [[1], [0, 1]]) + + # Test the graph variants. + assert_array_equal( + nn.kneighbors_graph(test_data).toarray(), [[0.0, 1.0], [0.0, 1.0]] + ) + assert_array_equal( + nn.kneighbors_graph([[2], [1]], mode="distance").toarray(), + np.array([[0.0, 1.0], [0.0, 0.0]]), + ) + rng = nn.radius_neighbors_graph([[2], [1]], radius=1.5) + assert_array_equal(rng.toarray(), [[0, 1], [1, 1]]) + + +@pytest.mark.parametrize("algorithm", ALGORITHMS) +def test_k_and_radius_neighbors_X_None(algorithm): + # Test kneighbors et.al when query is None + nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm=algorithm) + + X = [[0], [1]] + nn.fit(X) + + dist, ind = nn.kneighbors() + assert_array_equal(dist, [[1], [1]]) + assert_array_equal(ind, [[1], [0]]) + dist, ind = nn.radius_neighbors(None, radius=1.5) + check_object_arrays(dist, [[1], [1]]) + check_object_arrays(ind, [[1], [0]]) + + # Test the graph variants. 
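The `X=None` query convention tested here is easy to miss: calling `kneighbors()` without a query returns, for each training point, its neighbors among the other training points. A minimal sketch with hypothetical data:

import numpy as np
from sklearn.neighbors import NearestNeighbors

nn = NearestNeighbors(n_neighbors=1).fit(np.array([[0.0], [1.0], [10.0]]))
dist, ind = nn.kneighbors()  # no query -> neighbors of each training point, excluding itself
# ind is [[1], [0], [1]]: the sample at 10.0 is closest to the one at 1.0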
+ rng = nn.radius_neighbors_graph(None, radius=1.5) + kng = nn.kneighbors_graph(None) + for graph in [rng, kng]: + assert_array_equal(graph.toarray(), [[0, 1], [1, 0]]) + assert_array_equal(graph.data, [1, 1]) + assert_array_equal(graph.indices, [1, 0]) + + X = [[0, 1], [0, 1], [1, 1]] + nn = neighbors.NearestNeighbors(n_neighbors=2, algorithm=algorithm) + nn.fit(X) + assert_array_equal( + nn.kneighbors_graph().toarray(), + np.array([[0.0, 1.0, 1.0], [1.0, 0.0, 1.0], [1.0, 1.0, 0]]), + ) + + +@pytest.mark.parametrize("algorithm", ALGORITHMS) +def test_k_and_radius_neighbors_duplicates(algorithm): + # Test behavior of kneighbors when duplicates are present in query + nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm=algorithm) + duplicates = [[0], [1], [3]] + + nn.fit(duplicates) + + # Do not do anything special to duplicates. + kng = nn.kneighbors_graph(duplicates, mode="distance") + assert_allclose( + kng.toarray(), np.array([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]) + ) + assert_allclose(kng.data, [0.0, 0.0, 0.0]) + assert_allclose(kng.indices, [0, 1, 2]) + + dist, ind = nn.radius_neighbors([[0], [1]], radius=1.5) + check_object_arrays(dist, [[0, 1], [1, 0]]) + check_object_arrays(ind, [[0, 1], [0, 1]]) + + rng = nn.radius_neighbors_graph(duplicates, radius=1.5) + assert_allclose( + rng.toarray(), np.array([[1.0, 1.0, 0.0], [1.0, 1.0, 0.0], [0.0, 0.0, 1.0]]) + ) + + rng = nn.radius_neighbors_graph([[0], [1]], radius=1.5, mode="distance") + rng.sort_indices() + assert_allclose(rng.toarray(), [[0, 1, 0], [1, 0, 0]]) + assert_allclose(rng.indices, [0, 1, 0, 1]) + assert_allclose(rng.data, [0, 1, 1, 0]) + + # Mask the first duplicates when n_duplicates > n_neighbors. + X = np.ones((3, 1)) + nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm="brute") + nn.fit(X) + dist, ind = nn.kneighbors() + assert_allclose(dist, np.zeros((3, 1))) + assert_allclose(ind, [[1], [0], [1]]) + + # Test that zeros are explicitly marked in kneighbors_graph. 
+ kng = nn.kneighbors_graph(mode="distance") + assert_allclose(kng.toarray(), np.zeros((3, 3))) + assert_allclose(kng.data, np.zeros(3)) + assert_allclose(kng.indices, [1, 0, 1]) + assert_allclose( + nn.kneighbors_graph().toarray(), + np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]), + ) + + +def test_include_self_neighbors_graph(): + # Test include_self parameter in neighbors_graph + X = [[2, 3], [4, 5]] + kng = neighbors.kneighbors_graph(X, 1, include_self=True).toarray() + kng_not_self = neighbors.kneighbors_graph(X, 1, include_self=False).toarray() + assert_array_equal(kng, [[1.0, 0.0], [0.0, 1.0]]) + assert_array_equal(kng_not_self, [[0.0, 1.0], [1.0, 0.0]]) + + rng = neighbors.radius_neighbors_graph(X, 5.0, include_self=True).toarray() + rng_not_self = neighbors.radius_neighbors_graph( + X, 5.0, include_self=False + ).toarray() + assert_array_equal(rng, [[1.0, 1.0], [1.0, 1.0]]) + assert_array_equal(rng_not_self, [[0.0, 1.0], [1.0, 0.0]]) + + +@pytest.mark.parametrize("algorithm", ALGORITHMS) +def test_same_knn_parallel(algorithm): + X, y = datasets.make_classification( + n_samples=30, n_features=5, n_redundant=0, random_state=0 + ) + X_train, X_test, y_train, y_test = train_test_split(X, y) + + clf = neighbors.KNeighborsClassifier(n_neighbors=3, algorithm=algorithm) + clf.fit(X_train, y_train) + y = clf.predict(X_test) + dist, ind = clf.kneighbors(X_test) + graph = clf.kneighbors_graph(X_test, mode="distance").toarray() + + clf.set_params(n_jobs=3) + clf.fit(X_train, y_train) + y_parallel = clf.predict(X_test) + dist_parallel, ind_parallel = clf.kneighbors(X_test) + graph_parallel = clf.kneighbors_graph(X_test, mode="distance").toarray() + + assert_array_equal(y, y_parallel) + assert_allclose(dist, dist_parallel) + assert_array_equal(ind, ind_parallel) + assert_allclose(graph, graph_parallel) + + +@pytest.mark.parametrize("algorithm", ALGORITHMS) +def test_same_radius_neighbors_parallel(algorithm): + X, y = datasets.make_classification( + n_samples=30, n_features=5, n_redundant=0, random_state=0 + ) + X_train, X_test, y_train, y_test = train_test_split(X, y) + + clf = neighbors.RadiusNeighborsClassifier(radius=10, algorithm=algorithm) + clf.fit(X_train, y_train) + y = clf.predict(X_test) + dist, ind = clf.radius_neighbors(X_test) + graph = clf.radius_neighbors_graph(X_test, mode="distance").toarray() + + clf.set_params(n_jobs=3) + clf.fit(X_train, y_train) + y_parallel = clf.predict(X_test) + dist_parallel, ind_parallel = clf.radius_neighbors(X_test) + graph_parallel = clf.radius_neighbors_graph(X_test, mode="distance").toarray() + + assert_array_equal(y, y_parallel) + for i in range(len(dist)): + assert_allclose(dist[i], dist_parallel[i]) + assert_array_equal(ind[i], ind_parallel[i]) + assert_allclose(graph, graph_parallel) + + +@pytest.mark.parametrize("backend", ["threading", "loky"]) +@pytest.mark.parametrize("algorithm", ALGORITHMS) +def test_knn_forcing_backend(backend, algorithm): + # Non-regression test which ensures the knn methods are properly working + # even when forcing the global joblib backend. 
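The parallelism tests above assert that results do not depend on `n_jobs`; the same check in miniature, with hypothetical data:

import numpy as np
from sklearn.neighbors import KNeighborsClassifier

X = np.random.RandomState(0).rand(50, 4)
y = (X[:, 0] > 0.5).astype(int)
pred_serial = KNeighborsClassifier(n_neighbors=3).fit(X, y).predict(X)
pred_parallel = KNeighborsClassifier(n_neighbors=3, n_jobs=2).fit(X, y).predict(X)
np.testing.assert_array_equal(pred_serial, pred_parallel)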
+ with joblib.parallel_backend(backend): + X, y = datasets.make_classification( + n_samples=30, n_features=5, n_redundant=0, random_state=0 + ) + X_train, X_test, y_train, y_test = train_test_split(X, y) + + clf = neighbors.KNeighborsClassifier( + n_neighbors=3, algorithm=algorithm, n_jobs=2 + ) + clf.fit(X_train, y_train) + clf.predict(X_test) + clf.kneighbors(X_test) + clf.kneighbors_graph(X_test, mode="distance") + + +def test_dtype_convert(): + classifier = neighbors.KNeighborsClassifier(n_neighbors=1) + CLASSES = 15 + X = np.eye(CLASSES) + y = [ch for ch in "ABCDEFGHIJKLMNOPQRSTU"[:CLASSES]] + + result = classifier.fit(X, y).predict(X) + assert_array_equal(result, y) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sparse_metric_callable(csr_container): + def sparse_metric(x, y): # Metric accepting sparse matrix input (only) + assert issparse(x) and issparse(y) + return x.dot(y.T).toarray().item() + + X = csr_container( + [[1, 1, 1, 1, 1], [1, 0, 1, 0, 1], [0, 0, 1, 0, 0]] # Population matrix + ) + + Y = csr_container([[1, 1, 0, 1, 1], [1, 0, 0, 1, 1]]) # Query matrix + + nn = neighbors.NearestNeighbors( + algorithm="brute", n_neighbors=2, metric=sparse_metric + ).fit(X) + N = nn.kneighbors(Y, return_distance=False) + + # GS indices of nearest neighbours in `X` for `sparse_metric` + gold_standard_nn = np.array([[2, 1], [2, 1]]) + + assert_array_equal(N, gold_standard_nn) + + +# ignore conversion to boolean in pairwise_distances +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.DataConversionWarning") +def test_pairwise_boolean_distance(): + # Non-regression test for #4523 + # 'brute': uses scipy.spatial.distance through pairwise_distances + # 'ball_tree': uses sklearn.neighbors._dist_metrics + rng = np.random.RandomState(0) + X = rng.uniform(size=(6, 5)) + NN = neighbors.NearestNeighbors + + nn1 = NN(metric="jaccard", algorithm="brute").fit(X) + nn2 = NN(metric="jaccard", algorithm="ball_tree").fit(X) + assert_array_equal(nn1.kneighbors(X)[0], nn2.kneighbors(X)[0]) + + +def test_radius_neighbors_predict_proba(): + for seed in range(5): + X, y = datasets.make_classification( + n_samples=50, + n_features=5, + n_informative=3, + n_redundant=0, + n_classes=3, + random_state=seed, + ) + X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0) + outlier_label = int(2 - seed) + clf = neighbors.RadiusNeighborsClassifier(radius=2, outlier_label=outlier_label) + clf.fit(X_tr, y_tr) + pred = clf.predict(X_te) + proba = clf.predict_proba(X_te) + proba_label = proba.argmax(axis=1) + proba_label = np.where(proba.sum(axis=1) == 0, outlier_label, proba_label) + assert_array_equal(pred, proba_label) + + +def test_pipeline_with_nearest_neighbors_transformer(): + # Test chaining KNeighborsTransformer and classifiers/regressors + rng = np.random.RandomState(0) + X = 2 * rng.rand(40, 5) - 1 + X2 = 2 * rng.rand(40, 5) - 1 + y = rng.rand(40, 1) + + n_neighbors = 12 + radius = 1.5 + # We precompute more neighbors than necessary, to have equivalence between + # k-neighbors estimator after radius-neighbors transformer, and vice-versa. 
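The transformer-plus-precomputed chaining compared by the pipeline test starting here looks like the following in user code; a sketch with hypothetical data, where the transformer precomputes more neighbors than the downstream estimator needs:

import numpy as np
from sklearn.neighbors import KNeighborsRegressor, KNeighborsTransformer
from sklearn.pipeline import make_pipeline

rng = np.random.RandomState(0)
X, y = rng.rand(40, 5), rng.rand(40)
chained = make_pipeline(
    KNeighborsTransformer(n_neighbors=10, mode="distance"),
    KNeighborsRegressor(n_neighbors=5, metric="precomputed"),
)
chained.fit(X, y).predict(X[:3])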
+ factor = 2 + + k_trans = neighbors.KNeighborsTransformer(n_neighbors=n_neighbors, mode="distance") + k_trans_factor = neighbors.KNeighborsTransformer( + n_neighbors=int(n_neighbors * factor), mode="distance" + ) + + r_trans = neighbors.RadiusNeighborsTransformer(radius=radius, mode="distance") + r_trans_factor = neighbors.RadiusNeighborsTransformer( + radius=int(radius * factor), mode="distance" + ) + + k_reg = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors) + r_reg = neighbors.RadiusNeighborsRegressor(radius=radius) + + test_list = [ + (k_trans, k_reg), + (k_trans_factor, r_reg), + (r_trans, r_reg), + (r_trans_factor, k_reg), + ] + + for trans, reg in test_list: + # compare the chained version and the compact version + reg_compact = clone(reg) + reg_precomp = clone(reg) + reg_precomp.set_params(metric="precomputed") + + reg_chain = make_pipeline(clone(trans), reg_precomp) + + y_pred_chain = reg_chain.fit(X, y).predict(X2) + y_pred_compact = reg_compact.fit(X, y).predict(X2) + assert_allclose(y_pred_chain, y_pred_compact) + + +@pytest.mark.parametrize( + "X, metric, metric_params, expected_algo", + [ + (np.random.randint(10, size=(10, 10)), "precomputed", None, "brute"), + (np.random.randn(10, 20), "euclidean", None, "brute"), + (np.random.randn(8, 5), "euclidean", None, "brute"), + (np.random.randn(10, 5), "euclidean", None, "kd_tree"), + (np.random.randn(10, 5), "seuclidean", {"V": [2] * 5}, "ball_tree"), + (np.random.randn(10, 5), "correlation", None, "brute"), + ], +) +def test_auto_algorithm(X, metric, metric_params, expected_algo): + model = neighbors.NearestNeighbors( + n_neighbors=4, algorithm="auto", metric=metric, metric_params=metric_params + ) + model.fit(X) + assert model._fit_method == expected_algo + + +# TODO: Remove ignore_warnings when minimum supported SciPy version is 1.17 +# Some scipy metrics are deprecated (depending on the scipy version) but we +# still want to test them. +@ignore_warnings(category=DeprecationWarning) +@pytest.mark.parametrize( + "metric", sorted(set(neighbors.VALID_METRICS["brute"]) - set(["precomputed"])) +) +def test_radius_neighbors_brute_backend( + metric, + global_random_seed, + global_dtype, + n_samples=2000, + n_features=30, + n_query_pts=5, + radius=1.0, +): + rng = np.random.RandomState(global_random_seed) + # Both backends for the 'brute' algorithm of radius_neighbors + # must give identical results. 
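The brute-force backend equivalence tests in this file toggle the implementation through the global config; a compressed sketch of that mechanism (the `enable_cython_pairwise_dist` flag is the same internal option these tests use), with hypothetical data:

import numpy as np
from sklearn import config_context
from sklearn.neighbors import NearestNeighbors

X = np.random.RandomState(0).rand(100, 10)
nn = NearestNeighbors(n_neighbors=5, algorithm="brute").fit(X)
with config_context(enable_cython_pairwise_dist=False):   # legacy backend
    dist_legacy, _ = nn.kneighbors(X[:3])
with config_context(enable_cython_pairwise_dist=True):    # pairwise-distances reduction backend
    dist_pdr, _ = nn.kneighbors(X[:3])
np.testing.assert_allclose(dist_legacy, dist_pdr)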
+ X_train = rng.rand(n_samples, n_features).astype(global_dtype, copy=False) + X_test = rng.rand(n_query_pts, n_features).astype(global_dtype, copy=False) + + # Haversine distance only accepts 2D data + if metric == "haversine": + feature_sl = slice(None, 2) + X_train = np.ascontiguousarray(X_train[:, feature_sl]) + X_test = np.ascontiguousarray(X_test[:, feature_sl]) + + metric_params_list = _generate_test_params_for(metric, n_features) + + for metric_params in metric_params_list: + p = metric_params.pop("p", 2) + + neigh = neighbors.NearestNeighbors( + radius=radius, + algorithm="brute", + metric=metric, + p=p, + metric_params=metric_params, + ) + + neigh.fit(X_train) + + with config_context(enable_cython_pairwise_dist=False): + # Use the legacy backend for brute + legacy_brute_dst, legacy_brute_idx = neigh.radius_neighbors( + X_test, return_distance=True + ) + with config_context(enable_cython_pairwise_dist=True): + # Use the pairwise-distances reduction backend for brute + pdr_brute_dst, pdr_brute_idx = neigh.radius_neighbors( + X_test, return_distance=True + ) + + assert_compatible_radius_results( + legacy_brute_dst, + pdr_brute_dst, + legacy_brute_idx, + pdr_brute_idx, + radius=radius, + check_sorted=False, + ) + + +def test_valid_metrics_has_no_duplicate(): + for val in neighbors.VALID_METRICS.values(): + assert len(val) == len(set(val)) + + +def test_regressor_predict_on_arraylikes(): + """Ensures that `predict` works for array-likes when `weights` is a callable. + + Non-regression test for #22687. + """ + X = [[5, 1], [3, 1], [4, 3], [0, 3]] + y = [2, 3, 5, 6] + + def _weights(dist): + return np.ones_like(dist) + + est = KNeighborsRegressor(n_neighbors=1, algorithm="brute", weights=_weights) + est.fit(X, y) + assert_allclose(est.predict([[0, 2.5]]), [6]) + + +@pytest.mark.parametrize( + "Estimator, params", + [ + (neighbors.KNeighborsClassifier, {"n_neighbors": 2}), + (neighbors.KNeighborsRegressor, {"n_neighbors": 2}), + (neighbors.RadiusNeighborsRegressor, {}), + (neighbors.RadiusNeighborsClassifier, {}), + (neighbors.KNeighborsTransformer, {"n_neighbors": 2}), + (neighbors.RadiusNeighborsTransformer, {"radius": 1.5}), + (neighbors.LocalOutlierFactor, {"n_neighbors": 1}), + ], +) +def test_nan_euclidean_support(Estimator, params): + """Check that the different neighbor estimators are lenient towards `nan` + values if using `metric="nan_euclidean"`. + """ + + X = [[0, 1], [1, np.nan], [2, 3], [3, 5]] + y = [0, 0, 1, 1] + + params.update({"metric": "nan_euclidean"}) + estimator = Estimator().set_params(**params).fit(X, y) + + for response_method in ("kneighbors", "predict", "transform", "fit_predict"): + if hasattr(estimator, response_method): + output = getattr(estimator, response_method)(X) + if hasattr(output, "toarray"): + assert not np.isnan(output.data).any() + else: + assert not np.isnan(output).any() + + +def test_predict_dataframe(): + """Check that KNN predict works with dataframes + + non-regression test for issue #26768 + """ + pd = pytest.importorskip("pandas") + + X = pd.DataFrame(np.array([[1, 2], [3, 4], [5, 6], [7, 8]]), columns=["a", "b"]) + y = np.array([1, 2, 3, 4]) + + knn = neighbors.KNeighborsClassifier(n_neighbors=2).fit(X, y) + knn.predict(X) + + +def test_nearest_neighbours_works_with_p_less_than_1(): + """Check that NearestNeighbors works with :math:`p \\in (0,1)` when `algorithm` + is `"auto"` or `"brute"` regardless of the dtype of X. 
+ + Non-regression test for issue #26548 + """ + X = np.array([[1.0, 0.0], [0.0, 0.0], [0.0, 1.0]]) + neigh = neighbors.NearestNeighbors( + n_neighbors=3, algorithm="brute", metric_params={"p": 0.5} + ) + neigh.fit(X) + + y = neigh.radius_neighbors(X[0].reshape(1, -1), radius=4, return_distance=False) + assert_allclose(y[0], [0, 1, 2]) + + y = neigh.kneighbors(X[0].reshape(1, -1), return_distance=False) + assert_allclose(y[0], [0, 1, 2]) + + +def test_KNeighborsClassifier_raise_on_all_zero_weights(): + """Check that `predict` and `predict_proba` raises on sample of all zeros weights. + + Related to Issue #25854. + """ + X = [[0, 1], [1, 2], [2, 3], [3, 4]] + y = [0, 0, 1, 1] + + def _weights(dist): + return np.vectorize(lambda x: 0 if x > 0.5 else 1)(dist) + + est = neighbors.KNeighborsClassifier(n_neighbors=3, weights=_weights) + est.fit(X, y) + + msg = ( + "All neighbors of some sample is getting zero weights. " + "Please modify 'weights' to avoid this case if you are " + "using a user-defined function." + ) + + with pytest.raises(ValueError, match=msg): + est.predict([[1.1, 1.1]]) + + with pytest.raises(ValueError, match=msg): + est.predict_proba([[1.1, 1.1]]) + + +@pytest.mark.parametrize( + "nn_model", + [ + neighbors.KNeighborsClassifier(n_neighbors=10), + neighbors.RadiusNeighborsClassifier(), + ], +) +@pytest.mark.parametrize("algorithm", ALGORITHMS) +def test_neighbor_classifiers_loocv(nn_model, algorithm): + """Check that `predict` and related functions work fine with X=None + + Calling predict with X=None computes a prediction for each training point + from the labels of its neighbors (without the label of the data point being + predicted upon). This is therefore mathematically equivalent to + leave-one-out cross-validation without having do any retraining (rebuilding + a KD-tree or Ball-tree index) or any data reshuffling. + """ + X, y = datasets.make_blobs(n_samples=15, centers=5, n_features=2, random_state=0) + + nn_model = clone(nn_model).set_params(algorithm=algorithm) + + # Set the radius for RadiusNeighborsRegressor to some percentile of the + # empirical pairwise distances to avoid trivial test cases and warnings for + # predictions with no neighbors within the radius. + if "radius" in nn_model.get_params(): + dists = pairwise_distances(X).ravel() + dists = dists[dists > 0] + nn_model.set_params(radius=np.percentile(dists, 80)) + + loocv = cross_val_score(nn_model, X, y, cv=LeaveOneOut()) + nn_model.fit(X, y) + + assert_allclose(loocv, nn_model.predict(None) == y) + assert np.mean(loocv) == pytest.approx(nn_model.score(None, y)) + + # Evaluating `nn_model` on its "training" set should lead to a higher + # accuracy value than leaving out each data point in turn because the + # former can overfit while the latter cannot by construction. + assert nn_model.score(None, y) < nn_model.score(X, y) + + +@pytest.mark.parametrize( + "nn_model", + [ + neighbors.KNeighborsRegressor(n_neighbors=10), + neighbors.RadiusNeighborsRegressor(), + ], +) +@pytest.mark.parametrize("algorithm", ALGORITHMS) +def test_neighbor_regressors_loocv(nn_model, algorithm): + """Check that `predict` and related functions work fine with X=None""" + X, y = datasets.make_regression(n_samples=15, n_features=2, random_state=0) + + # Only checking cross_val_predict and not cross_val_score because + # cross_val_score does not work with LeaveOneOut() for a regressor: the + # default score method implements R2 score which is not well defined for a + # single data point. 
+ # + # TODO: if score is refactored to evaluate models for other scoring + # functions, then this test can be extended to check cross_val_score as + # well. + nn_model = clone(nn_model).set_params(algorithm=algorithm) + + # Set the radius for RadiusNeighborsRegressor to some percentile of the + # empirical pairwise distances to avoid trivial test cases and warnings for + # predictions with no neighbors within the radius. + if "radius" in nn_model.get_params(): + dists = pairwise_distances(X).ravel() + dists = dists[dists > 0] + nn_model.set_params(radius=np.percentile(dists, 80)) + + loocv = cross_val_predict(nn_model, X, y, cv=LeaveOneOut()) + nn_model.fit(X, y) + assert_allclose(loocv, nn_model.predict(None)) diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_neighbors_pipeline.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_neighbors_pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..6ad78824489cada3ad56ccff34d806ba6cf1278a --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_neighbors_pipeline.py @@ -0,0 +1,256 @@ +""" +This is testing the equivalence between some estimators with internal nearest +neighbors computations, and the corresponding pipeline versions with +KNeighborsTransformer or RadiusNeighborsTransformer to precompute the +neighbors. +""" + +import numpy as np + +from sklearn.base import clone +from sklearn.cluster import DBSCAN, SpectralClustering +from sklearn.cluster.tests.common import generate_clustered_data +from sklearn.datasets import make_blobs +from sklearn.manifold import TSNE, Isomap, SpectralEmbedding +from sklearn.neighbors import ( + KNeighborsRegressor, + KNeighborsTransformer, + LocalOutlierFactor, + RadiusNeighborsRegressor, + RadiusNeighborsTransformer, +) +from sklearn.pipeline import make_pipeline +from sklearn.utils._testing import assert_array_almost_equal + + +def test_spectral_clustering(): + # Test chaining KNeighborsTransformer and SpectralClustering + n_neighbors = 5 + X, _ = make_blobs(random_state=0) + + # compare the chained version and the compact version + est_chain = make_pipeline( + KNeighborsTransformer(n_neighbors=n_neighbors, mode="connectivity"), + SpectralClustering( + n_neighbors=n_neighbors, affinity="precomputed", random_state=42 + ), + ) + est_compact = SpectralClustering( + n_neighbors=n_neighbors, affinity="nearest_neighbors", random_state=42 + ) + labels_compact = est_compact.fit_predict(X) + labels_chain = est_chain.fit_predict(X) + assert_array_almost_equal(labels_chain, labels_compact) + + +def test_spectral_embedding(): + # Test chaining KNeighborsTransformer and SpectralEmbedding + n_neighbors = 5 + + n_samples = 1000 + centers = np.array( + [ + [0.0, 5.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 4.0, 0.0, 0.0], + [1.0, 0.0, 0.0, 5.0, 1.0], + ] + ) + S, true_labels = make_blobs( + n_samples=n_samples, centers=centers, cluster_std=1.0, random_state=42 + ) + + # compare the chained version and the compact version + est_chain = make_pipeline( + KNeighborsTransformer(n_neighbors=n_neighbors, mode="connectivity"), + SpectralEmbedding( + n_neighbors=n_neighbors, affinity="precomputed", random_state=42 + ), + ) + est_compact = SpectralEmbedding( + n_neighbors=n_neighbors, affinity="nearest_neighbors", random_state=42 + ) + St_compact = est_compact.fit_transform(S) + St_chain = est_chain.fit_transform(S) + assert_array_almost_equal(St_chain, St_compact) + + +def test_dbscan(): + # Test chaining RadiusNeighborsTransformer and DBSCAN + 
radius = 0.3 + n_clusters = 3 + X = generate_clustered_data(n_clusters=n_clusters) + + # compare the chained version and the compact version + est_chain = make_pipeline( + RadiusNeighborsTransformer(radius=radius, mode="distance"), + DBSCAN(metric="precomputed", eps=radius), + ) + est_compact = DBSCAN(eps=radius) + + labels_chain = est_chain.fit_predict(X) + labels_compact = est_compact.fit_predict(X) + assert_array_almost_equal(labels_chain, labels_compact) + + +def test_isomap(): + # Test chaining KNeighborsTransformer and Isomap with + # neighbors_algorithm='precomputed' + algorithm = "auto" + n_neighbors = 10 + + X, _ = make_blobs(random_state=0) + X2, _ = make_blobs(random_state=1) + + # compare the chained version and the compact version + est_chain = make_pipeline( + KNeighborsTransformer( + n_neighbors=n_neighbors, algorithm=algorithm, mode="distance" + ), + Isomap(n_neighbors=n_neighbors, metric="precomputed"), + ) + est_compact = Isomap(n_neighbors=n_neighbors, neighbors_algorithm=algorithm) + + Xt_chain = est_chain.fit_transform(X) + Xt_compact = est_compact.fit_transform(X) + assert_array_almost_equal(Xt_chain, Xt_compact) + + Xt_chain = est_chain.transform(X2) + Xt_compact = est_compact.transform(X2) + assert_array_almost_equal(Xt_chain, Xt_compact) + + +def test_tsne(): + # Test chaining KNeighborsTransformer and TSNE + max_iter = 250 + perplexity = 5 + n_neighbors = int(3.0 * perplexity + 1) + + rng = np.random.RandomState(0) + X = rng.randn(20, 2) + + for metric in ["minkowski", "sqeuclidean"]: + # compare the chained version and the compact version + est_chain = make_pipeline( + KNeighborsTransformer( + n_neighbors=n_neighbors, mode="distance", metric=metric + ), + TSNE( + init="random", + metric="precomputed", + perplexity=perplexity, + method="barnes_hut", + random_state=42, + max_iter=max_iter, + ), + ) + est_compact = TSNE( + init="random", + metric=metric, + perplexity=perplexity, + max_iter=max_iter, + method="barnes_hut", + random_state=42, + ) + + Xt_chain = est_chain.fit_transform(X) + Xt_compact = est_compact.fit_transform(X) + assert_array_almost_equal(Xt_chain, Xt_compact) + + +def test_lof_novelty_false(): + # Test chaining KNeighborsTransformer and LocalOutlierFactor + n_neighbors = 4 + + rng = np.random.RandomState(0) + X = rng.randn(40, 2) + + # compare the chained version and the compact version + est_chain = make_pipeline( + KNeighborsTransformer(n_neighbors=n_neighbors, mode="distance"), + LocalOutlierFactor( + metric="precomputed", + n_neighbors=n_neighbors, + novelty=False, + contamination="auto", + ), + ) + est_compact = LocalOutlierFactor( + n_neighbors=n_neighbors, novelty=False, contamination="auto" + ) + + pred_chain = est_chain.fit_predict(X) + pred_compact = est_compact.fit_predict(X) + assert_array_almost_equal(pred_chain, pred_compact) + + +def test_lof_novelty_true(): + # Test chaining KNeighborsTransformer and LocalOutlierFactor + n_neighbors = 4 + + rng = np.random.RandomState(0) + X1 = rng.randn(40, 2) + X2 = rng.randn(40, 2) + + # compare the chained version and the compact version + est_chain = make_pipeline( + KNeighborsTransformer(n_neighbors=n_neighbors, mode="distance"), + LocalOutlierFactor( + metric="precomputed", + n_neighbors=n_neighbors, + novelty=True, + contamination="auto", + ), + ) + est_compact = LocalOutlierFactor( + n_neighbors=n_neighbors, novelty=True, contamination="auto" + ) + + pred_chain = est_chain.fit(X1).predict(X2) + pred_compact = est_compact.fit(X1).predict(X2) + assert_array_almost_equal(pred_chain, 
pred_compact) + + +def test_kneighbors_regressor(): + # Test chaining KNeighborsTransformer and classifiers/regressors + rng = np.random.RandomState(0) + X = 2 * rng.rand(40, 5) - 1 + X2 = 2 * rng.rand(40, 5) - 1 + y = rng.rand(40, 1) + + n_neighbors = 12 + radius = 1.5 + # We precompute more neighbors than necessary, to have equivalence between + # k-neighbors estimator after radius-neighbors transformer, and vice-versa. + factor = 2 + + k_trans = KNeighborsTransformer(n_neighbors=n_neighbors, mode="distance") + k_trans_factor = KNeighborsTransformer( + n_neighbors=int(n_neighbors * factor), mode="distance" + ) + + r_trans = RadiusNeighborsTransformer(radius=radius, mode="distance") + r_trans_factor = RadiusNeighborsTransformer( + radius=int(radius * factor), mode="distance" + ) + + k_reg = KNeighborsRegressor(n_neighbors=n_neighbors) + r_reg = RadiusNeighborsRegressor(radius=radius) + + test_list = [ + (k_trans, k_reg), + (k_trans_factor, r_reg), + (r_trans, r_reg), + (r_trans_factor, k_reg), + ] + + for trans, reg in test_list: + # compare the chained version and the compact version + reg_compact = clone(reg) + reg_precomp = clone(reg) + reg_precomp.set_params(metric="precomputed") + + reg_chain = make_pipeline(clone(trans), reg_precomp) + + y_pred_chain = reg_chain.fit(X, y).predict(X2) + y_pred_compact = reg_compact.fit(X, y).predict(X2) + assert_array_almost_equal(y_pred_chain, y_pred_compact) diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_neighbors_tree.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_neighbors_tree.py new file mode 100644 index 0000000000000000000000000000000000000000..de19152e8b7f236d0a524f756ca9c40d48023edb --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_neighbors_tree.py @@ -0,0 +1,296 @@ +# SPDX-License-Identifier: BSD-3-Clause + +import itertools +import pickle + +import numpy as np +import pytest +from numpy.testing import assert_allclose, assert_array_almost_equal + +from sklearn.metrics import DistanceMetric +from sklearn.neighbors._ball_tree import ( + BallTree, + kernel_norm, +) +from sklearn.neighbors._ball_tree import ( + NeighborsHeap64 as NeighborsHeapBT, +) +from sklearn.neighbors._ball_tree import ( + nodeheap_sort as nodeheap_sort_bt, +) +from sklearn.neighbors._ball_tree import ( + simultaneous_sort as simultaneous_sort_bt, +) +from sklearn.neighbors._kd_tree import ( + KDTree, +) +from sklearn.neighbors._kd_tree import ( + NeighborsHeap64 as NeighborsHeapKDT, +) +from sklearn.neighbors._kd_tree import ( + nodeheap_sort as nodeheap_sort_kdt, +) +from sklearn.neighbors._kd_tree import ( + simultaneous_sort as simultaneous_sort_kdt, +) +from sklearn.utils import check_random_state + +rng = np.random.RandomState(42) +V_mahalanobis = rng.rand(3, 3) +V_mahalanobis = np.dot(V_mahalanobis, V_mahalanobis.T) + +DIMENSION = 3 + +METRICS = { + "euclidean": {}, + "manhattan": {}, + "minkowski": dict(p=3), + "chebyshev": {}, + "seuclidean": dict(V=rng.random_sample(DIMENSION)), + "mahalanobis": dict(V=V_mahalanobis), +} + +KD_TREE_METRICS = ["euclidean", "manhattan", "chebyshev", "minkowski"] +BALL_TREE_METRICS = list(METRICS) + + +def dist_func(x1, x2, p): + return np.sum((x1 - x2) ** p) ** (1.0 / p) + + +def compute_kernel_slow(Y, X, kernel, h): + d = np.sqrt(((Y[:, None, :] - X) ** 2).sum(-1)) + norm = kernel_norm(h, X.shape[1], kernel) + + if kernel == "gaussian": + return norm * np.exp(-0.5 * (d * d) / (h * h)).sum(-1) + elif kernel == "tophat": + return norm * (d < 
h).sum(-1) + elif kernel == "epanechnikov": + return norm * ((1.0 - (d * d) / (h * h)) * (d < h)).sum(-1) + elif kernel == "exponential": + return norm * (np.exp(-d / h)).sum(-1) + elif kernel == "linear": + return norm * ((1 - d / h) * (d < h)).sum(-1) + elif kernel == "cosine": + return norm * (np.cos(0.5 * np.pi * d / h) * (d < h)).sum(-1) + else: + raise ValueError("kernel not recognized") + + +def brute_force_neighbors(X, Y, k, metric, **kwargs): + D = DistanceMetric.get_metric(metric, **kwargs).pairwise(Y, X) + ind = np.argsort(D, axis=1)[:, :k] + dist = D[np.arange(Y.shape[0])[:, None], ind] + return dist, ind + + +@pytest.mark.parametrize("Cls", [KDTree, BallTree]) +@pytest.mark.parametrize( + "kernel", ["gaussian", "tophat", "epanechnikov", "exponential", "linear", "cosine"] +) +@pytest.mark.parametrize("h", [0.01, 0.1, 1]) +@pytest.mark.parametrize("rtol", [0, 1e-5]) +@pytest.mark.parametrize("atol", [1e-6, 1e-2]) +@pytest.mark.parametrize("breadth_first", [True, False]) +def test_kernel_density( + Cls, kernel, h, rtol, atol, breadth_first, n_samples=100, n_features=3 +): + rng = check_random_state(1) + X = rng.random_sample((n_samples, n_features)) + Y = rng.random_sample((n_samples, n_features)) + dens_true = compute_kernel_slow(Y, X, kernel, h) + + tree = Cls(X, leaf_size=10) + dens = tree.kernel_density( + Y, h, atol=atol, rtol=rtol, kernel=kernel, breadth_first=breadth_first + ) + assert_allclose(dens, dens_true, atol=atol, rtol=max(rtol, 1e-7)) + + +@pytest.mark.parametrize("Cls", [KDTree, BallTree]) +def test_neighbor_tree_query_radius(Cls, n_samples=100, n_features=10): + rng = check_random_state(0) + X = 2 * rng.random_sample(size=(n_samples, n_features)) - 1 + query_pt = np.zeros(n_features, dtype=float) + + eps = 1e-15 # roundoff error can cause test to fail + tree = Cls(X, leaf_size=5) + rad = np.sqrt(((X - query_pt) ** 2).sum(1)) + + for r in np.linspace(rad[0], rad[-1], 100): + ind = tree.query_radius([query_pt], r + eps)[0] + i = np.where(rad <= r + eps)[0] + + ind.sort() + i.sort() + + assert_array_almost_equal(i, ind) + + +@pytest.mark.parametrize("Cls", [KDTree, BallTree]) +def test_neighbor_tree_query_radius_distance(Cls, n_samples=100, n_features=10): + rng = check_random_state(0) + X = 2 * rng.random_sample(size=(n_samples, n_features)) - 1 + query_pt = np.zeros(n_features, dtype=float) + + eps = 1e-15 # roundoff error can cause test to fail + tree = Cls(X, leaf_size=5) + rad = np.sqrt(((X - query_pt) ** 2).sum(1)) + + for r in np.linspace(rad[0], rad[-1], 100): + ind, dist = tree.query_radius([query_pt], r + eps, return_distance=True) + + ind = ind[0] + dist = dist[0] + + d = np.sqrt(((query_pt - X[ind]) ** 2).sum(1)) + + assert_array_almost_equal(d, dist) + + +@pytest.mark.parametrize("Cls", [KDTree, BallTree]) +@pytest.mark.parametrize("dualtree", (True, False)) +def test_neighbor_tree_two_point(Cls, dualtree, n_samples=100, n_features=3): + rng = check_random_state(0) + X = rng.random_sample((n_samples, n_features)) + Y = rng.random_sample((n_samples, n_features)) + r = np.linspace(0, 1, 10) + tree = Cls(X, leaf_size=10) + + D = DistanceMetric.get_metric("euclidean").pairwise(Y, X) + counts_true = [(D <= ri).sum() for ri in r] + + counts = tree.two_point_correlation(Y, r=r, dualtree=dualtree) + assert_array_almost_equal(counts, counts_true) + + +@pytest.mark.parametrize("NeighborsHeap", [NeighborsHeapBT, NeighborsHeapKDT]) +def test_neighbors_heap(NeighborsHeap, n_pts=5, n_nbrs=10): + heap = NeighborsHeap(n_pts, n_nbrs) + rng = check_random_state(0) + + 
for row in range(n_pts): + d_in = rng.random_sample(2 * n_nbrs).astype(np.float64, copy=False) + i_in = np.arange(2 * n_nbrs, dtype=np.intp) + for d, i in zip(d_in, i_in): + heap.push(row, d, i) + + ind = np.argsort(d_in) + d_in = d_in[ind] + i_in = i_in[ind] + + d_heap, i_heap = heap.get_arrays(sort=True) + + assert_array_almost_equal(d_in[:n_nbrs], d_heap[row]) + assert_array_almost_equal(i_in[:n_nbrs], i_heap[row]) + + +@pytest.mark.parametrize("nodeheap_sort", [nodeheap_sort_bt, nodeheap_sort_kdt]) +def test_node_heap(nodeheap_sort, n_nodes=50): + rng = check_random_state(0) + vals = rng.random_sample(n_nodes).astype(np.float64, copy=False) + + i1 = np.argsort(vals) + vals2, i2 = nodeheap_sort(vals) + + assert_array_almost_equal(i1, i2) + assert_array_almost_equal(vals[i1], vals2) + + +@pytest.mark.parametrize( + "simultaneous_sort", [simultaneous_sort_bt, simultaneous_sort_kdt] +) +def test_simultaneous_sort(simultaneous_sort, n_rows=10, n_pts=201): + rng = check_random_state(0) + dist = rng.random_sample((n_rows, n_pts)).astype(np.float64, copy=False) + ind = (np.arange(n_pts) + np.zeros((n_rows, 1))).astype(np.intp, copy=False) + + dist2 = dist.copy() + ind2 = ind.copy() + + # simultaneous sort rows using function + simultaneous_sort(dist, ind) + + # simultaneous sort rows using numpy + i = np.argsort(dist2, axis=1) + row_ind = np.arange(n_rows)[:, None] + dist2 = dist2[row_ind, i] + ind2 = ind2[row_ind, i] + + assert_array_almost_equal(dist, dist2) + assert_array_almost_equal(ind, ind2) + + +@pytest.mark.parametrize("Cls", [KDTree, BallTree]) +def test_gaussian_kde(Cls, n_samples=1000): + # Compare gaussian KDE results to scipy.stats.gaussian_kde + from scipy.stats import gaussian_kde + + rng = check_random_state(0) + x_in = rng.normal(0, 1, n_samples) + x_out = np.linspace(-5, 5, 30) + + for h in [0.01, 0.1, 1]: + tree = Cls(x_in[:, None]) + gkde = gaussian_kde(x_in, bw_method=h / np.std(x_in)) + + dens_tree = tree.kernel_density(x_out[:, None], h) / n_samples + dens_gkde = gkde.evaluate(x_out) + + assert_array_almost_equal(dens_tree, dens_gkde, decimal=3) + + +@pytest.mark.parametrize( + "Cls, metric", + itertools.chain( + [(KDTree, metric) for metric in KD_TREE_METRICS], + [(BallTree, metric) for metric in BALL_TREE_METRICS], + ), +) +@pytest.mark.parametrize("k", (1, 3, 5)) +@pytest.mark.parametrize("dualtree", (True, False)) +@pytest.mark.parametrize("breadth_first", (True, False)) +def test_nn_tree_query(Cls, metric, k, dualtree, breadth_first): + rng = check_random_state(0) + X = rng.random_sample((40, DIMENSION)) + Y = rng.random_sample((10, DIMENSION)) + + kwargs = METRICS[metric] + + kdt = Cls(X, leaf_size=1, metric=metric, **kwargs) + dist1, ind1 = kdt.query(Y, k, dualtree=dualtree, breadth_first=breadth_first) + dist2, ind2 = brute_force_neighbors(X, Y, k, metric, **kwargs) + + # don't check indices here: if there are any duplicate distances, + # the indices may not match. Distances should not have this problem. 
+ assert_array_almost_equal(dist1, dist2) + + +@pytest.mark.parametrize( + "Cls, metric", + [(KDTree, "euclidean"), (BallTree, "euclidean"), (BallTree, dist_func)], +) +@pytest.mark.parametrize("protocol", (0, 1, 2)) +def test_pickle(Cls, metric, protocol): + rng = check_random_state(0) + X = rng.random_sample((10, 3)) + + if hasattr(metric, "__call__"): + kwargs = {"p": 2} + else: + kwargs = {} + + tree1 = Cls(X, leaf_size=1, metric=metric, **kwargs) + + ind1, dist1 = tree1.query(X) + + s = pickle.dumps(tree1, protocol=protocol) + tree2 = pickle.loads(s) + + ind2, dist2 = tree2.query(X) + + assert_array_almost_equal(ind1, ind2) + assert_array_almost_equal(dist1, dist2) + + assert isinstance(tree2, Cls) diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_quad_tree.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_quad_tree.py new file mode 100644 index 0000000000000000000000000000000000000000..be9a4c5fe549d32a130f9c6a55f6675fa0e42f20 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_quad_tree.py @@ -0,0 +1,144 @@ +import pickle + +import numpy as np +import pytest + +from sklearn.neighbors._quad_tree import _QuadTree +from sklearn.utils import check_random_state + + +def test_quadtree_boundary_computation(): + # Introduce a point into a quad tree with boundaries not easy to compute. + Xs = [] + + # check a random case + Xs.append(np.array([[-1, 1], [-4, -1]], dtype=np.float32)) + # check the case where only 0 are inserted + Xs.append(np.array([[0, 0], [0, 0]], dtype=np.float32)) + # check the case where only negative are inserted + Xs.append(np.array([[-1, -2], [-4, 0]], dtype=np.float32)) + # check the case where only small numbers are inserted + Xs.append(np.array([[-1e-6, 1e-6], [-4e-6, -1e-6]], dtype=np.float32)) + + for X in Xs: + tree = _QuadTree(n_dimensions=2, verbose=0) + tree.build_tree(X) + tree._check_coherence() + + +def test_quadtree_similar_point(): + # Introduce a point into a quad tree where a similar point already exists. + # Test will hang if it doesn't complete. 
+ Xs = [] + + # check the case where points are actually different + Xs.append(np.array([[1, 2], [3, 4]], dtype=np.float32)) + # check the case where points are the same on X axis + Xs.append(np.array([[1.0, 2.0], [1.0, 3.0]], dtype=np.float32)) + # check the case where points are arbitrarily close on X axis + Xs.append(np.array([[1.00001, 2.0], [1.00002, 3.0]], dtype=np.float32)) + # check the case where points are the same on Y axis + Xs.append(np.array([[1.0, 2.0], [3.0, 2.0]], dtype=np.float32)) + # check the case where points are arbitrarily close on Y axis + Xs.append(np.array([[1.0, 2.00001], [3.0, 2.00002]], dtype=np.float32)) + # check the case where points are arbitrarily close on both axes + Xs.append(np.array([[1.00001, 2.00001], [1.00002, 2.00002]], dtype=np.float32)) + + # check the case where points are arbitrarily close on both axes + # close to machine epsilon - x axis + Xs.append(np.array([[1, 0.0003817754041], [2, 0.0003817753750]], dtype=np.float32)) + + # check the case where points are arbitrarily close on both axes + # close to machine epsilon - y axis + Xs.append( + np.array([[0.0003817754041, 1.0], [0.0003817753750, 2.0]], dtype=np.float32) + ) + + for X in Xs: + tree = _QuadTree(n_dimensions=2, verbose=0) + tree.build_tree(X) + tree._check_coherence() + + +@pytest.mark.parametrize("n_dimensions", (2, 3)) +@pytest.mark.parametrize("protocol", (0, 1, 2)) +def test_quad_tree_pickle(n_dimensions, protocol): + rng = check_random_state(0) + + X = rng.random_sample((10, n_dimensions)) + + tree = _QuadTree(n_dimensions=n_dimensions, verbose=0) + tree.build_tree(X) + + s = pickle.dumps(tree, protocol=protocol) + bt2 = pickle.loads(s) + + for x in X: + cell_x_tree = tree.get_cell(x) + cell_x_bt2 = bt2.get_cell(x) + assert cell_x_tree == cell_x_bt2 + + +@pytest.mark.parametrize("n_dimensions", (2, 3)) +def test_qt_insert_duplicate(n_dimensions): + rng = check_random_state(0) + + X = rng.random_sample((10, n_dimensions)) + Xd = np.r_[X, X[:5]] + tree = _QuadTree(n_dimensions=n_dimensions, verbose=0) + tree.build_tree(Xd) + + cumulative_size = tree.cumulative_size + leafs = tree.leafs + + # Assert that the first 5 are indeed duplicated and that the next + # ones are single point leaf + for i, x in enumerate(X): + cell_id = tree.get_cell(x) + assert leafs[cell_id] + assert cumulative_size[cell_id] == 1 + (i < 5) + + +def test_summarize(): + # Simple check for quad tree's summarize + + angle = 0.9 + X = np.array( + [[-10.0, -10.0], [9.0, 10.0], [10.0, 9.0], [10.0, 10.0]], dtype=np.float32 + ) + query_pt = X[0, :] + n_dimensions = X.shape[1] + offset = n_dimensions + 2 + + qt = _QuadTree(n_dimensions, verbose=0) + qt.build_tree(X) + + idx, summary = qt._py_summarize(query_pt, X, angle) + + node_dist = summary[n_dimensions] + node_size = summary[n_dimensions + 1] + + # Summary should contain only 1 node with size 3 and distance to + # X[1:] barycenter + barycenter = X[1:].mean(axis=0) + ds2c = ((X[0] - barycenter) ** 2).sum() + + assert idx == offset + assert node_size == 3, "summary size = {}".format(node_size) + assert np.isclose(node_dist, ds2c) + + # Summary should contain all 3 node with size 1 and distance to + # each point in X[1:] for ``angle=0`` + idx, summary = qt._py_summarize(query_pt, X, 0.0) + barycenter = X[1:].mean(axis=0) + ds2c = ((X[0] - barycenter) ** 2).sum() + + assert idx == 3 * (offset) + for i in range(3): + node_dist = summary[i * offset + n_dimensions] + node_size = summary[i * offset + n_dimensions + 1] + + ds2c = ((X[0] - X[i + 1]) ** 2).sum() + + 
assert node_size == 1, "summary size = {}".format(node_size) + assert np.isclose(node_dist, ds2c) diff --git a/.venv/lib/python3.12/site-packages/sklearn/neural_network/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/neural_network/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..fa5980ce24f5c778f8c1cb505c9e5218b5f30a27 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neural_network/__init__.py @@ -0,0 +1,9 @@ +"""Models based on neural networks.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ._multilayer_perceptron import MLPClassifier, MLPRegressor +from ._rbm import BernoulliRBM + +__all__ = ["BernoulliRBM", "MLPClassifier", "MLPRegressor"] diff --git a/.venv/lib/python3.12/site-packages/sklearn/neural_network/_base.py b/.venv/lib/python3.12/site-packages/sklearn/neural_network/_base.py new file mode 100644 index 0000000000000000000000000000000000000000..25f0b0a18512b71147e292caf5891cf5620fccb6 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neural_network/_base.py @@ -0,0 +1,287 @@ +"""Utilities for the neural network modules""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numpy as np +from scipy.special import expit as logistic_sigmoid +from scipy.special import xlogy + + +def inplace_identity(X): + """Simply leave the input array unchanged. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Data, where `n_samples` is the number of samples + and `n_features` is the number of features. + """ + # Nothing to do + + +def inplace_exp(X): + """Compute the exponential inplace. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The input data. + """ + np.exp(X, out=X) + + +def inplace_logistic(X): + """Compute the logistic function inplace. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The input data. + """ + logistic_sigmoid(X, out=X) + + +def inplace_tanh(X): + """Compute the hyperbolic tan function inplace. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The input data. + """ + np.tanh(X, out=X) + + +def inplace_relu(X): + """Compute the rectified linear unit function inplace. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The input data. + """ + np.maximum(X, 0, out=X) + + +def inplace_softmax(X): + """Compute the K-way softmax function inplace. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The input data. + """ + tmp = X - X.max(axis=1)[:, np.newaxis] + np.exp(tmp, out=X) + X /= X.sum(axis=1)[:, np.newaxis] + + +ACTIVATIONS = { + "identity": inplace_identity, + "exp": inplace_exp, + "tanh": inplace_tanh, + "logistic": inplace_logistic, + "relu": inplace_relu, + "softmax": inplace_softmax, +} + + +def inplace_identity_derivative(Z, delta): + """Apply the derivative of the identity function: do nothing. + + Parameters + ---------- + Z : {array-like, sparse matrix}, shape (n_samples, n_features) + The data which was output from the identity activation function during + the forward pass. + + delta : {array-like}, shape (n_samples, n_features) + The backpropagated error signal to be modified inplace. + """ + # Nothing to do + + +def inplace_logistic_derivative(Z, delta): + """Apply the derivative of the logistic sigmoid function. 
+ + It exploits the fact that the derivative is a simple function of the output + value from logistic function. + + Parameters + ---------- + Z : {array-like, sparse matrix}, shape (n_samples, n_features) + The data which was output from the logistic activation function during + the forward pass. + + delta : {array-like}, shape (n_samples, n_features) + The backpropagated error signal to be modified inplace. + """ + delta *= Z + delta *= 1 - Z + + +def inplace_tanh_derivative(Z, delta): + """Apply the derivative of the hyperbolic tanh function. + + It exploits the fact that the derivative is a simple function of the output + value from hyperbolic tangent. + + Parameters + ---------- + Z : {array-like, sparse matrix}, shape (n_samples, n_features) + The data which was output from the hyperbolic tangent activation + function during the forward pass. + + delta : {array-like}, shape (n_samples, n_features) + The backpropagated error signal to be modified inplace. + """ + delta *= 1 - Z**2 + + +def inplace_relu_derivative(Z, delta): + """Apply the derivative of the relu function. + + It exploits the fact that the derivative is a simple function of the output + value from rectified linear units activation function. + + Parameters + ---------- + Z : {array-like, sparse matrix}, shape (n_samples, n_features) + The data which was output from the rectified linear units activation + function during the forward pass. + + delta : {array-like}, shape (n_samples, n_features) + The backpropagated error signal to be modified inplace. + """ + delta[Z == 0] = 0 + + +DERIVATIVES = { + "identity": inplace_identity_derivative, + "tanh": inplace_tanh_derivative, + "logistic": inplace_logistic_derivative, + "relu": inplace_relu_derivative, +} + + +def squared_loss(y_true, y_pred, sample_weight=None): + """Compute the squared loss for regression. + + Parameters + ---------- + y_true : array-like or label indicator matrix + Ground truth (correct) values. + + y_pred : array-like or label indicator matrix + Predicted values, as returned by a regression estimator. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + loss : float + The degree to which the samples are correctly predicted. + """ + return ( + 0.5 * np.average((y_true - y_pred) ** 2, weights=sample_weight, axis=0).mean() + ) + + +def poisson_loss(y_true, y_pred, sample_weight=None): + """Compute (half of the) Poisson deviance loss for regression. + + Parameters + ---------- + y_true : array-like or label indicator matrix + Ground truth (correct) labels. + + y_pred : array-like or label indicator matrix + Predicted values, as returned by a regression estimator. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + loss : float + The degree to which the samples are correctly predicted. + """ + # TODO: Decide what to do with the term `xlogy(y_true, y_true) - y_true`. For now, + # it is included. But the _loss module doesn't use it (for performance reasons) and + # only adds it as return of constant_to_optimal_zero (mainly for testing). + return np.average( + xlogy(y_true, y_true / y_pred) - y_true + y_pred, weights=sample_weight, axis=0 + ).sum() + + +def log_loss(y_true, y_prob, sample_weight=None): + """Compute Logistic loss for classification. + + Parameters + ---------- + y_true : array-like or label indicator matrix + Ground truth (correct) labels. 
+ + y_prob : array-like of float, shape = (n_samples, n_classes) + Predicted probabilities, as returned by a classifier's + predict_proba method. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + loss : float + The degree to which the samples are correctly predicted. + """ + eps = np.finfo(y_prob.dtype).eps + y_prob = np.clip(y_prob, eps, 1 - eps) + if y_prob.shape[1] == 1: + y_prob = np.append(1 - y_prob, y_prob, axis=1) + + if y_true.shape[1] == 1: + y_true = np.append(1 - y_true, y_true, axis=1) + + return -np.average(xlogy(y_true, y_prob), weights=sample_weight, axis=0).sum() + + +def binary_log_loss(y_true, y_prob, sample_weight=None): + """Compute binary logistic loss for classification. + + This is identical to log_loss in binary classification case, + but is kept for its use in multilabel case. + + Parameters + ---------- + y_true : array-like or label indicator matrix + Ground truth (correct) labels. + + y_prob : array-like of float, shape = (n_samples, 1) + Predicted probabilities, as returned by a classifier's + predict_proba method. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + loss : float + The degree to which the samples are correctly predicted. + """ + eps = np.finfo(y_prob.dtype).eps + y_prob = np.clip(y_prob, eps, 1 - eps) + return -np.average( + xlogy(y_true, y_prob) + xlogy(1 - y_true, 1 - y_prob), + weights=sample_weight, + axis=0, + ).sum() + + +LOSS_FUNCTIONS = { + "squared_error": squared_loss, + "poisson": poisson_loss, + "log_loss": log_loss, + "binary_log_loss": binary_log_loss, +} diff --git a/.venv/lib/python3.12/site-packages/sklearn/neural_network/_multilayer_perceptron.py b/.venv/lib/python3.12/site-packages/sklearn/neural_network/_multilayer_perceptron.py new file mode 100644 index 0000000000000000000000000000000000000000..e8260164202e648385618ff32bd9f3a1e5f21617 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neural_network/_multilayer_perceptron.py @@ -0,0 +1,1797 @@ +"""Multi-layer Perceptron""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from abc import ABC, abstractmethod +from itertools import chain, pairwise +from numbers import Integral, Real + +import numpy as np +import scipy.optimize + +from ..base import ( + BaseEstimator, + ClassifierMixin, + RegressorMixin, + _fit_context, + is_classifier, +) +from ..exceptions import ConvergenceWarning +from ..metrics import accuracy_score, r2_score +from ..model_selection import train_test_split +from ..preprocessing import LabelBinarizer +from ..utils import ( + _safe_indexing, + check_random_state, + column_or_1d, + gen_batches, + shuffle, +) +from ..utils._param_validation import Interval, Options, StrOptions +from ..utils.extmath import safe_sparse_dot +from ..utils.fixes import _get_additional_lbfgs_options_dict +from ..utils.metaestimators import available_if +from ..utils.multiclass import ( + _check_partial_fit_first_call, + type_of_target, + unique_labels, +) +from ..utils.optimize import _check_optimize_result +from ..utils.validation import _check_sample_weight, check_is_fitted, validate_data +from ._base import ACTIVATIONS, DERIVATIVES, LOSS_FUNCTIONS +from ._stochastic_optimizers import AdamOptimizer, SGDOptimizer + +_STOCHASTIC_SOLVERS = ["sgd", "adam"] + + +def _pack(coefs_, intercepts_): + """Pack the parameters into a single vector.""" + return np.hstack([l.ravel() for l in coefs_ + intercepts_]) + 
+ +class BaseMultilayerPerceptron(BaseEstimator, ABC): + """Base class for MLP classification and regression. + + Warning: This class should not be used directly. + Use derived classes instead. + + .. versionadded:: 0.18 + """ + + _parameter_constraints: dict = { + "hidden_layer_sizes": [ + "array-like", + Interval(Integral, 1, None, closed="left"), + ], + "activation": [StrOptions({"identity", "logistic", "tanh", "relu"})], + "solver": [StrOptions({"lbfgs", "sgd", "adam"})], + "alpha": [Interval(Real, 0, None, closed="left")], + "batch_size": [ + StrOptions({"auto"}), + Interval(Integral, 1, None, closed="left"), + ], + "learning_rate": [StrOptions({"constant", "invscaling", "adaptive"})], + "learning_rate_init": [Interval(Real, 0, None, closed="neither")], + "power_t": [Interval(Real, 0, None, closed="left")], + "max_iter": [Interval(Integral, 1, None, closed="left")], + "shuffle": ["boolean"], + "random_state": ["random_state"], + "tol": [Interval(Real, 0, None, closed="left")], + "verbose": ["verbose"], + "warm_start": ["boolean"], + "momentum": [Interval(Real, 0, 1, closed="both")], + "nesterovs_momentum": ["boolean"], + "early_stopping": ["boolean"], + "validation_fraction": [Interval(Real, 0, 1, closed="left")], + "beta_1": [Interval(Real, 0, 1, closed="left")], + "beta_2": [Interval(Real, 0, 1, closed="left")], + "epsilon": [Interval(Real, 0, None, closed="neither")], + "n_iter_no_change": [ + Interval(Integral, 1, None, closed="left"), + Options(Real, {np.inf}), + ], + "max_fun": [Interval(Integral, 1, None, closed="left")], + } + + @abstractmethod + def __init__( + self, + hidden_layer_sizes, + activation, + solver, + alpha, + batch_size, + learning_rate, + learning_rate_init, + power_t, + max_iter, + loss, + shuffle, + random_state, + tol, + verbose, + warm_start, + momentum, + nesterovs_momentum, + early_stopping, + validation_fraction, + beta_1, + beta_2, + epsilon, + n_iter_no_change, + max_fun, + ): + self.activation = activation + self.solver = solver + self.alpha = alpha + self.batch_size = batch_size + self.learning_rate = learning_rate + self.learning_rate_init = learning_rate_init + self.power_t = power_t + self.max_iter = max_iter + self.loss = loss + self.hidden_layer_sizes = hidden_layer_sizes + self.shuffle = shuffle + self.random_state = random_state + self.tol = tol + self.verbose = verbose + self.warm_start = warm_start + self.momentum = momentum + self.nesterovs_momentum = nesterovs_momentum + self.early_stopping = early_stopping + self.validation_fraction = validation_fraction + self.beta_1 = beta_1 + self.beta_2 = beta_2 + self.epsilon = epsilon + self.n_iter_no_change = n_iter_no_change + self.max_fun = max_fun + + def _unpack(self, packed_parameters): + """Extract the coefficients and intercepts from packed_parameters.""" + for i in range(self.n_layers_ - 1): + start, end, shape = self._coef_indptr[i] + self.coefs_[i] = np.reshape(packed_parameters[start:end], shape) + + start, end = self._intercept_indptr[i] + self.intercepts_[i] = packed_parameters[start:end] + + def _forward_pass(self, activations): + """Perform a forward pass on the network by computing the values + of the neurons in the hidden layers and the output layer. + + Parameters + ---------- + activations : list, length = n_layers - 1 + The ith element of the list holds the values of the ith layer. 
+ """ + hidden_activation = ACTIVATIONS[self.activation] + # Iterate over the hidden layers + for i in range(self.n_layers_ - 1): + activations[i + 1] = safe_sparse_dot(activations[i], self.coefs_[i]) + activations[i + 1] += self.intercepts_[i] + + # For the hidden layers + if (i + 1) != (self.n_layers_ - 1): + hidden_activation(activations[i + 1]) + + # For the last layer + output_activation = ACTIVATIONS[self.out_activation_] + output_activation(activations[i + 1]) + + return activations + + def _forward_pass_fast(self, X, check_input=True): + """Predict using the trained model + + This is the same as _forward_pass but does not record the activations + of all layers and only returns the last layer's activation. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input data. + + check_input : bool, default=True + Perform input data validation or not. + + Returns + ------- + y_pred : ndarray of shape (n_samples,) or (n_samples, n_outputs) + The decision function of the samples for each class in the model. + """ + if check_input: + X = validate_data(self, X, accept_sparse=["csr", "csc"], reset=False) + + # Initialize first layer + activation = X + + # Forward propagate + hidden_activation = ACTIVATIONS[self.activation] + for i in range(self.n_layers_ - 1): + activation = safe_sparse_dot(activation, self.coefs_[i]) + activation += self.intercepts_[i] + if i != self.n_layers_ - 2: + hidden_activation(activation) + output_activation = ACTIVATIONS[self.out_activation_] + output_activation(activation) + + return activation + + def _compute_loss_grad( + self, layer, sw_sum, activations, deltas, coef_grads, intercept_grads + ): + """Compute the gradient of loss with respect to coefs and intercept for + specified layer. + + This function does backpropagation for the specified one layer. + """ + coef_grads[layer] = safe_sparse_dot(activations[layer].T, deltas[layer]) + coef_grads[layer] += self.alpha * self.coefs_[layer] + coef_grads[layer] /= sw_sum + + intercept_grads[layer] = np.sum(deltas[layer], axis=0) / sw_sum + + def _loss_grad_lbfgs( + self, + packed_coef_inter, + X, + y, + sample_weight, + activations, + deltas, + coef_grads, + intercept_grads, + ): + """Compute the MLP loss function and its corresponding derivatives + with respect to the different parameters given in the initialization. + + Returned gradients are packed in a single vector so it can be used + in lbfgs + + Parameters + ---------- + packed_coef_inter : ndarray + A vector comprising the flattened coefficients and intercepts. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input data. + + y : ndarray of shape (n_samples,) + The target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + activations : list, length = n_layers - 1 + The ith element of the list holds the values of the ith layer. + + deltas : list, length = n_layers - 1 + The ith element of the list holds the difference between the + activations of the i + 1 layer and the backpropagated error. + More specifically, deltas are gradients of loss with respect to z + in each layer, where z = wx + b is the value of a particular layer + before passing through the activation function + + coef_grads : list, length = n_layers - 1 + The ith element contains the amount of change used to update the + coefficient parameters of the ith layer in an iteration. 
+ + intercept_grads : list, length = n_layers - 1 + The ith element contains the amount of change used to update the + intercept parameters of the ith layer in an iteration. + + Returns + ------- + loss : float + grad : array-like, shape (number of nodes of all layers,) + """ + self._unpack(packed_coef_inter) + loss, coef_grads, intercept_grads = self._backprop( + X, y, sample_weight, activations, deltas, coef_grads, intercept_grads + ) + grad = _pack(coef_grads, intercept_grads) + return loss, grad + + def _backprop( + self, X, y, sample_weight, activations, deltas, coef_grads, intercept_grads + ): + """Compute the MLP loss function and its corresponding derivatives + with respect to each parameter: weights and bias vectors. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input data. + + y : ndarray of shape (n_samples,) + The target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + activations : list, length = n_layers - 1 + The ith element of the list holds the values of the ith layer. + + deltas : list, length = n_layers - 1 + The ith element of the list holds the difference between the + activations of the i + 1 layer and the backpropagated error. + More specifically, deltas are gradients of loss with respect to z + in each layer, where z = wx + b is the value of a particular layer + before passing through the activation function + + coef_grads : list, length = n_layers - 1 + The ith element contains the amount of change used to update the + coefficient parameters of the ith layer in an iteration. + + intercept_grads : list, length = n_layers - 1 + The ith element contains the amount of change used to update the + intercept parameters of the ith layer in an iteration. + + Returns + ------- + loss : float + coef_grads : list, length = n_layers - 1 + intercept_grads : list, length = n_layers - 1 + """ + n_samples = X.shape[0] + + # Forward propagate + activations = self._forward_pass(activations) + + # Get loss + loss_func_name = self.loss + if loss_func_name == "log_loss" and self.out_activation_ == "logistic": + loss_func_name = "binary_log_loss" + loss = LOSS_FUNCTIONS[loss_func_name](y, activations[-1], sample_weight) + # Add L2 regularization term to loss + values = 0 + for s in self.coefs_: + s = s.ravel() + values += np.dot(s, s) + if sample_weight is None: + sw_sum = n_samples + else: + sw_sum = sample_weight.sum() + loss += (0.5 * self.alpha) * values / sw_sum + + # Backward propagate + last = self.n_layers_ - 2 + + # The calculation of delta[last] is as follows: + # delta[last] = d/dz loss(y, act(z)) = act(z) - y + # with z=x@w + b being the output of the last layer before passing through the + # output activation, act(z) = activations[-1]. 
+ # The simple formula for delta[last] here works with following (canonical + # loss-link) combinations of output activation and loss function: + # sigmoid and binary cross entropy, softmax and categorical cross + # entropy, and identity with squared loss + deltas[last] = activations[-1] - y + if sample_weight is not None: + deltas[last] *= sample_weight.reshape(-1, 1) + + # Compute gradient for the last layer + self._compute_loss_grad( + last, sw_sum, activations, deltas, coef_grads, intercept_grads + ) + + inplace_derivative = DERIVATIVES[self.activation] + # Iterate over the hidden layers + for i in range(last, 0, -1): + deltas[i - 1] = safe_sparse_dot(deltas[i], self.coefs_[i].T) + inplace_derivative(activations[i], deltas[i - 1]) + + self._compute_loss_grad( + i - 1, sw_sum, activations, deltas, coef_grads, intercept_grads + ) + + return loss, coef_grads, intercept_grads + + def _initialize(self, y, layer_units, dtype): + # set all attributes, allocate weights etc. for first call + # Initialize parameters + self.n_iter_ = 0 + self.t_ = 0 + self.n_outputs_ = y.shape[1] + + # Compute the number of layers + self.n_layers_ = len(layer_units) + + # Output for regression + if not is_classifier(self): + if self.loss == "poisson": + self.out_activation_ = "exp" + else: + # loss = "squared_error" + self.out_activation_ = "identity" + # Output for multi class + elif self._label_binarizer.y_type_ == "multiclass": + self.out_activation_ = "softmax" + # Output for binary class and multi-label + else: + self.out_activation_ = "logistic" + + # Initialize coefficient and intercept layers + self.coefs_ = [] + self.intercepts_ = [] + + for i in range(self.n_layers_ - 1): + coef_init, intercept_init = self._init_coef( + layer_units[i], layer_units[i + 1], dtype + ) + self.coefs_.append(coef_init) + self.intercepts_.append(intercept_init) + + self._best_coefs = [c.copy() for c in self.coefs_] + self._best_intercepts = [i.copy() for i in self.intercepts_] + + if self.solver in _STOCHASTIC_SOLVERS: + self.loss_curve_ = [] + self._no_improvement_count = 0 + if self.early_stopping: + self.validation_scores_ = [] + self.best_validation_score_ = -np.inf + self.best_loss_ = None + else: + self.best_loss_ = np.inf + self.validation_scores_ = None + self.best_validation_score_ = None + + def _init_coef(self, fan_in, fan_out, dtype): + # Use the initialization method recommended by + # Glorot et al. + factor = 6.0 + if self.activation == "logistic": + factor = 2.0 + init_bound = np.sqrt(factor / (fan_in + fan_out)) + + # Generate weights and bias: + coef_init = self._random_state.uniform( + -init_bound, init_bound, (fan_in, fan_out) + ) + intercept_init = self._random_state.uniform(-init_bound, init_bound, fan_out) + coef_init = coef_init.astype(dtype, copy=False) + intercept_init = intercept_init.astype(dtype, copy=False) + return coef_init, intercept_init + + def _fit(self, X, y, sample_weight=None, incremental=False): + # Make sure self.hidden_layer_sizes is a list + hidden_layer_sizes = self.hidden_layer_sizes + if not hasattr(hidden_layer_sizes, "__iter__"): + hidden_layer_sizes = [hidden_layer_sizes] + hidden_layer_sizes = list(hidden_layer_sizes) + + if np.any(np.array(hidden_layer_sizes) <= 0): + raise ValueError( + "hidden_layer_sizes must be > 0, got %s." 
% hidden_layer_sizes + ) + first_pass = not hasattr(self, "coefs_") or ( + not self.warm_start and not incremental + ) + + X, y = self._validate_input(X, y, incremental, reset=first_pass) + n_samples, n_features = X.shape + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X) + + # Ensure y is 2D + if y.ndim == 1: + y = y.reshape((-1, 1)) + + self.n_outputs_ = y.shape[1] + + layer_units = [n_features] + hidden_layer_sizes + [self.n_outputs_] + + # check random state + self._random_state = check_random_state(self.random_state) + + if first_pass: + # First time training the model + self._initialize(y, layer_units, X.dtype) + + # Initialize lists + activations = [X] + [None] * (len(layer_units) - 1) + deltas = [None] * (len(activations) - 1) + + coef_grads = [ + np.empty((n_fan_in_, n_fan_out_), dtype=X.dtype) + for n_fan_in_, n_fan_out_ in pairwise(layer_units) + ] + + intercept_grads = [ + np.empty(n_fan_out_, dtype=X.dtype) for n_fan_out_ in layer_units[1:] + ] + + # Run the Stochastic optimization solver + if self.solver in _STOCHASTIC_SOLVERS: + self._fit_stochastic( + X, + y, + sample_weight, + activations, + deltas, + coef_grads, + intercept_grads, + layer_units, + incremental, + ) + + # Run the LBFGS solver + elif self.solver == "lbfgs": + self._fit_lbfgs( + X, + y, + sample_weight, + activations, + deltas, + coef_grads, + intercept_grads, + layer_units, + ) + + # validate parameter weights + weights = chain(self.coefs_, self.intercepts_) + if not all(np.isfinite(w).all() for w in weights): + raise ValueError( + "Solver produced non-finite parameter weights. The input data may" + " contain large values and need to be preprocessed." + ) + + return self + + def _fit_lbfgs( + self, + X, + y, + sample_weight, + activations, + deltas, + coef_grads, + intercept_grads, + layer_units, + ): + # Store meta information for the parameters + self._coef_indptr = [] + self._intercept_indptr = [] + start = 0 + + # Save sizes and indices of coefficients for faster unpacking + for i in range(self.n_layers_ - 1): + n_fan_in, n_fan_out = layer_units[i], layer_units[i + 1] + + end = start + (n_fan_in * n_fan_out) + self._coef_indptr.append((start, end, (n_fan_in, n_fan_out))) + start = end + + # Save sizes and indices of intercepts for faster unpacking + for i in range(self.n_layers_ - 1): + end = start + layer_units[i + 1] + self._intercept_indptr.append((start, end)) + start = end + + # Run LBFGS + packed_coef_inter = _pack(self.coefs_, self.intercepts_) + + if self.verbose is True or self.verbose >= 1: + iprint = 1 + else: + iprint = -1 + + opt_res = scipy.optimize.minimize( + self._loss_grad_lbfgs, + packed_coef_inter, + method="L-BFGS-B", + jac=True, + options={ + "maxfun": self.max_fun, + "maxiter": self.max_iter, + "gtol": self.tol, + **_get_additional_lbfgs_options_dict("iprint", iprint), + }, + args=( + X, + y, + sample_weight, + activations, + deltas, + coef_grads, + intercept_grads, + ), + ) + self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter) + self.loss_ = opt_res.fun + self._unpack(opt_res.x) + + def _fit_stochastic( + self, + X, + y, + sample_weight, + activations, + deltas, + coef_grads, + intercept_grads, + layer_units, + incremental, + ): + params = self.coefs_ + self.intercepts_ + if not incremental or not hasattr(self, "_optimizer"): + if self.solver == "sgd": + self._optimizer = SGDOptimizer( + params, + self.learning_rate_init, + self.learning_rate, + self.momentum, + self.nesterovs_momentum, + self.power_t, + ) + elif self.solver 
== "adam": + self._optimizer = AdamOptimizer( + params, + self.learning_rate_init, + self.beta_1, + self.beta_2, + self.epsilon, + ) + + # early_stopping in partial_fit doesn't make sense + if self.early_stopping and incremental: + raise ValueError("partial_fit does not support early_stopping=True") + early_stopping = self.early_stopping + if early_stopping: + # don't stratify in multilabel classification + should_stratify = is_classifier(self) and self.n_outputs_ == 1 + stratify = y if should_stratify else None + if sample_weight is None: + X_train, X_val, y_train, y_val = train_test_split( + X, + y, + random_state=self._random_state, + test_size=self.validation_fraction, + stratify=stratify, + ) + sample_weight_train = sample_weight_val = None + else: + # TODO: incorporate sample_weight in sampling here. + ( + X_train, + X_val, + y_train, + y_val, + sample_weight_train, + sample_weight_val, + ) = train_test_split( + X, + y, + sample_weight, + random_state=self._random_state, + test_size=self.validation_fraction, + stratify=stratify, + ) + if X_val.shape[0] < 2: + raise ValueError( + "The validation set is too small. Increase 'validation_fraction' " + "or the size of your dataset." + ) + + if is_classifier(self): + y_val = self._label_binarizer.inverse_transform(y_val) + else: + X_train, y_train, sample_weight_train = X, y, sample_weight + X_val = y_val = sample_weight_val = None + + n_samples = X_train.shape[0] + sample_idx = np.arange(n_samples, dtype=int) + + if self.batch_size == "auto": + batch_size = min(200, n_samples) + else: + if self.batch_size > n_samples: + warnings.warn( + "Got `batch_size` less than 1 or larger than " + "sample size. It is going to be clipped" + ) + batch_size = np.clip(self.batch_size, 1, n_samples) + + try: + self.n_iter_ = 0 + for it in range(self.max_iter): + if self.shuffle: + # Only shuffle the sample indices instead of X and y to + # reduce the memory footprint. These indices will be used + # to slice the X and y. 
+ sample_idx = shuffle(sample_idx, random_state=self._random_state) + + accumulated_loss = 0.0 + for batch_slice in gen_batches(n_samples, batch_size): + if self.shuffle: + batch_idx = sample_idx[batch_slice] + X_batch = _safe_indexing(X_train, batch_idx) + else: + batch_idx = batch_slice + X_batch = X_train[batch_idx] + y_batch = y_train[batch_idx] + if sample_weight is None: + sample_weight_batch = None + else: + sample_weight_batch = sample_weight_train[batch_idx] + + activations[0] = X_batch + batch_loss, coef_grads, intercept_grads = self._backprop( + X_batch, + y_batch, + sample_weight_batch, + activations, + deltas, + coef_grads, + intercept_grads, + ) + accumulated_loss += batch_loss * ( + batch_slice.stop - batch_slice.start + ) + + # update weights + grads = coef_grads + intercept_grads + self._optimizer.update_params(params, grads) + + self.n_iter_ += 1 + self.loss_ = accumulated_loss / X_train.shape[0] + + self.t_ += n_samples + self.loss_curve_.append(self.loss_) + if self.verbose: + print("Iteration %d, loss = %.8f" % (self.n_iter_, self.loss_)) + + # update no_improvement_count based on training loss or + # validation score according to early_stopping + self._update_no_improvement_count( + early_stopping, X_val, y_val, sample_weight_val + ) + + # for learning rate that needs to be updated at iteration end + self._optimizer.iteration_ends(self.t_) + + if self._no_improvement_count > self.n_iter_no_change: + # not better than last `n_iter_no_change` iterations by tol + # stop or decrease learning rate + if early_stopping: + msg = ( + "Validation score did not improve more than " + "tol=%f for %d consecutive epochs." + % (self.tol, self.n_iter_no_change) + ) + else: + msg = ( + "Training loss did not improve more than tol=%f" + " for %d consecutive epochs." + % (self.tol, self.n_iter_no_change) + ) + + is_stopping = self._optimizer.trigger_stopping(msg, self.verbose) + if is_stopping: + break + else: + self._no_improvement_count = 0 + + if incremental: + break + + if self.n_iter_ == self.max_iter: + warnings.warn( + "Stochastic Optimizer: Maximum iterations (%d) " + "reached and the optimization hasn't converged yet." 
+ % self.max_iter, + ConvergenceWarning, + ) + except KeyboardInterrupt: + warnings.warn("Training interrupted by user.") + + if early_stopping: + # restore best weights + self.coefs_ = self._best_coefs + self.intercepts_ = self._best_intercepts + + def _update_no_improvement_count(self, early_stopping, X, y, sample_weight): + if early_stopping: + # compute validation score (can be NaN), use that for stopping + val_score = self._score(X, y, sample_weight=sample_weight) + + self.validation_scores_.append(val_score) + + if self.verbose: + print("Validation score: %f" % self.validation_scores_[-1]) + # update best parameters + # use validation_scores_, not loss_curve_ + # let's hope no-one overloads .score with mse + last_valid_score = self.validation_scores_[-1] + + if last_valid_score < (self.best_validation_score_ + self.tol): + self._no_improvement_count += 1 + else: + self._no_improvement_count = 0 + + if last_valid_score > self.best_validation_score_: + self.best_validation_score_ = last_valid_score + self._best_coefs = [c.copy() for c in self.coefs_] + self._best_intercepts = [i.copy() for i in self.intercepts_] + else: + if self.loss_curve_[-1] > self.best_loss_ - self.tol: + self._no_improvement_count += 1 + else: + self._no_improvement_count = 0 + if self.loss_curve_[-1] < self.best_loss_: + self.best_loss_ = self.loss_curve_[-1] + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, sample_weight=None): + """Fit the model to data matrix X and target(s) y. + + Parameters + ---------- + X : ndarray or sparse matrix of shape (n_samples, n_features) + The input data. + + y : ndarray of shape (n_samples,) or (n_samples, n_outputs) + The target values (class labels in classification, real numbers in + regression). + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + .. versionadded:: 1.7 + + Returns + ------- + self : object + Returns a trained MLP model. + """ + return self._fit(X, y, sample_weight=sample_weight, incremental=False) + + def _check_solver(self): + if self.solver not in _STOCHASTIC_SOLVERS: + raise AttributeError( + "partial_fit is only available for stochastic" + " optimizers. %s is not stochastic." % self.solver + ) + return True + + def _score_with_function(self, X, y, sample_weight, score_function): + """Private score method without input validation.""" + # Input validation would remove feature names, so we disable it + y_pred = self._predict(X, check_input=False) + + if np.isnan(y_pred).any() or np.isinf(y_pred).any(): + return np.nan + + return score_function(y, y_pred, sample_weight=sample_weight) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + + +class MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron): + """Multi-layer Perceptron classifier. + + This model optimizes the log-loss function using LBFGS or stochastic + gradient descent. + + .. versionadded:: 0.18 + + Parameters + ---------- + hidden_layer_sizes : array-like of shape(n_layers - 2,), default=(100,) + The ith element represents the number of neurons in the ith + hidden layer. + + activation : {'identity', 'logistic', 'tanh', 'relu'}, default='relu' + Activation function for the hidden layer. + + - 'identity', no-op activation, useful to implement linear bottleneck, + returns f(x) = x + + - 'logistic', the logistic sigmoid function, + returns f(x) = 1 / (1 + exp(-x)). + + - 'tanh', the hyperbolic tan function, + returns f(x) = tanh(x). 
+ + - 'relu', the rectified linear unit function, + returns f(x) = max(0, x) + + solver : {'lbfgs', 'sgd', 'adam'}, default='adam' + The solver for weight optimization. + + - 'lbfgs' is an optimizer in the family of quasi-Newton methods. + + - 'sgd' refers to stochastic gradient descent. + + - 'adam' refers to a stochastic gradient-based optimizer proposed + by Kingma, Diederik, and Jimmy Ba + + For a comparison between Adam optimizer and SGD, see + :ref:`sphx_glr_auto_examples_neural_networks_plot_mlp_training_curves.py`. + + Note: The default solver 'adam' works pretty well on relatively + large datasets (with thousands of training samples or more) in terms of + both training time and validation score. + For small datasets, however, 'lbfgs' can converge faster and perform + better. + + alpha : float, default=0.0001 + Strength of the L2 regularization term. The L2 regularization term + is divided by the sample size when added to the loss. + + For an example usage and visualization of varying regularization, see + :ref:`sphx_glr_auto_examples_neural_networks_plot_mlp_alpha.py`. + + batch_size : int, default='auto' + Size of minibatches for stochastic optimizers. + If the solver is 'lbfgs', the classifier will not use minibatch. + When set to "auto", `batch_size=min(200, n_samples)`. + + learning_rate : {'constant', 'invscaling', 'adaptive'}, default='constant' + Learning rate schedule for weight updates. + + - 'constant' is a constant learning rate given by + 'learning_rate_init'. + + - 'invscaling' gradually decreases the learning rate at each + time step 't' using an inverse scaling exponent of 'power_t'. + effective_learning_rate = learning_rate_init / pow(t, power_t) + + - 'adaptive' keeps the learning rate constant to + 'learning_rate_init' as long as training loss keeps decreasing. + Each time two consecutive epochs fail to decrease training loss by at + least tol, or fail to increase validation score by at least tol if + 'early_stopping' is on, the current learning rate is divided by 5. + + Only used when ``solver='sgd'``. + + learning_rate_init : float, default=0.001 + The initial learning rate used. It controls the step-size + in updating the weights. Only used when solver='sgd' or 'adam'. + + power_t : float, default=0.5 + The exponent for inverse scaling learning rate. + It is used in updating effective learning rate when the learning_rate + is set to 'invscaling'. Only used when solver='sgd'. + + max_iter : int, default=200 + Maximum number of iterations. The solver iterates until convergence + (determined by 'tol') or this number of iterations. For stochastic + solvers ('sgd', 'adam'), note that this determines the number of epochs + (how many times each data point will be used), not the number of + gradient steps. + + shuffle : bool, default=True + Whether to shuffle samples in each iteration. Only used when + solver='sgd' or 'adam'. + + random_state : int, RandomState instance, default=None + Determines random number generation for weights and bias + initialization, train-test split if early stopping is used, and batch + sampling when solver='sgd' or 'adam'. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + tol : float, default=1e-4 + Tolerance for the optimization. When the loss or score is not improving + by at least ``tol`` for ``n_iter_no_change`` consecutive iterations, + unless ``learning_rate`` is set to 'adaptive', convergence is + considered to be reached and training stops. 
+ + verbose : bool, default=False + Whether to print progress messages to stdout. + + warm_start : bool, default=False + When set to True, reuse the solution of the previous + call to fit as initialization, otherwise, just erase the + previous solution. See :term:`the Glossary `. + + momentum : float, default=0.9 + Momentum for gradient descent update. Should be between 0 and 1. Only + used when solver='sgd'. + + nesterovs_momentum : bool, default=True + Whether to use Nesterov's momentum. Only used when solver='sgd' and + momentum > 0. + + early_stopping : bool, default=False + Whether to use early stopping to terminate training when validation + score is not improving. If set to true, it will automatically set + aside 10% of training data as validation and terminate training when + validation score is not improving by at least ``tol`` for + ``n_iter_no_change`` consecutive epochs. The split is stratified, + except in a multilabel setting. + If early stopping is False, then the training stops when the training + loss does not improve by more than tol for n_iter_no_change consecutive + passes over the training set. + Only effective when solver='sgd' or 'adam'. + + validation_fraction : float, default=0.1 + The proportion of training data to set aside as validation set for + early stopping. Must be between 0 and 1. + Only used if early_stopping is True. + + beta_1 : float, default=0.9 + Exponential decay rate for estimates of first moment vector in adam, + should be in [0, 1). Only used when solver='adam'. + + beta_2 : float, default=0.999 + Exponential decay rate for estimates of second moment vector in adam, + should be in [0, 1). Only used when solver='adam'. + + epsilon : float, default=1e-8 + Value for numerical stability in adam. Only used when solver='adam'. + + n_iter_no_change : int, default=10 + Maximum number of epochs to not meet ``tol`` improvement. + Only effective when solver='sgd' or 'adam'. + + .. versionadded:: 0.20 + + max_fun : int, default=15000 + Only used when solver='lbfgs'. Maximum number of loss function calls. + The solver iterates until convergence (determined by 'tol'), number + of iterations reaches max_iter, or this number of loss function calls. + Note that number of loss function calls will be greater than or equal + to the number of iterations for the `MLPClassifier`. + + .. versionadded:: 0.22 + + Attributes + ---------- + classes_ : ndarray or list of ndarray of shape (n_classes,) + Class labels for each output. + + loss_ : float + The current loss computed with the loss function. + + best_loss_ : float or None + The minimum loss reached by the solver throughout fitting. + If `early_stopping=True`, this attribute is set to `None`. Refer to + the `best_validation_score_` fitted attribute instead. + + loss_curve_ : list of shape (`n_iter_`,) + The ith element in the list represents the loss at the ith iteration. + + validation_scores_ : list of shape (`n_iter_`,) or None + The score at each iteration on a held-out validation set. The score + reported is the accuracy score. Only available if `early_stopping=True`, + otherwise the attribute is set to `None`. + + best_validation_score_ : float or None + The best validation score (i.e. accuracy score) that triggered the + early stopping. Only available if `early_stopping=True`, otherwise the + attribute is set to `None`. + + t_ : int + The number of training samples seen by the solver during fitting. 
+ + coefs_ : list of shape (n_layers - 1,) + The ith element in the list represents the weight matrix corresponding + to layer i. + + intercepts_ : list of shape (n_layers - 1,) + The ith element in the list represents the bias vector corresponding to + layer i + 1. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + The number of iterations the solver has run. + + n_layers_ : int + Number of layers. + + n_outputs_ : int + Number of outputs. + + out_activation_ : str + Name of the output activation function. + + See Also + -------- + MLPRegressor : Multi-layer Perceptron regressor. + BernoulliRBM : Bernoulli Restricted Boltzmann Machine (RBM). + + Notes + ----- + MLPClassifier trains iteratively since at each time step + the partial derivatives of the loss function with respect to the model + parameters are computed to update the parameters. + + It can also have a regularization term added to the loss function + that shrinks model parameters to prevent overfitting. + + This implementation works with data represented as dense numpy arrays or + sparse scipy arrays of floating point values. + + References + ---------- + Hinton, Geoffrey E. "Connectionist learning procedures." + Artificial intelligence 40.1 (1989): 185-234. + + Glorot, Xavier, and Yoshua Bengio. + "Understanding the difficulty of training deep feedforward neural networks." + International Conference on Artificial Intelligence and Statistics. 2010. + + :arxiv:`He, Kaiming, et al (2015). "Delving deep into rectifiers: + Surpassing human-level performance on imagenet classification." <1502.01852>` + + :arxiv:`Kingma, Diederik, and Jimmy Ba (2014) + "Adam: A method for stochastic optimization." <1412.6980>` + + Examples + -------- + >>> from sklearn.neural_network import MLPClassifier + >>> from sklearn.datasets import make_classification + >>> from sklearn.model_selection import train_test_split + >>> X, y = make_classification(n_samples=100, random_state=1) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, + ... random_state=1) + >>> clf = MLPClassifier(random_state=1, max_iter=300).fit(X_train, y_train) + >>> clf.predict_proba(X_test[:1]) + array([[0.0383, 0.961]]) + >>> clf.predict(X_test[:5, :]) + array([1, 0, 1, 0, 1]) + >>> clf.score(X_test, y_test) + 0.8... 
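    An editorial sketch (not part of the upstream docstring), reusing `clf` and
    `X_test` from the example above: outside the multilabel case, `predict`
    agrees with taking the argmax of `predict_proba` over the columns, which are
    ordered as in `classes_`.

        import numpy as np

        proba = clf.predict_proba(X_test)            # columns follow clf.classes_
        labels = clf.classes_[proba.argmax(axis=1)]
        assert (labels == clf.predict(X_test)).all()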
+ """ + + def __init__( + self, + hidden_layer_sizes=(100,), + activation="relu", + *, + solver="adam", + alpha=0.0001, + batch_size="auto", + learning_rate="constant", + learning_rate_init=0.001, + power_t=0.5, + max_iter=200, + shuffle=True, + random_state=None, + tol=1e-4, + verbose=False, + warm_start=False, + momentum=0.9, + nesterovs_momentum=True, + early_stopping=False, + validation_fraction=0.1, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-8, + n_iter_no_change=10, + max_fun=15000, + ): + super().__init__( + hidden_layer_sizes=hidden_layer_sizes, + activation=activation, + solver=solver, + alpha=alpha, + batch_size=batch_size, + learning_rate=learning_rate, + learning_rate_init=learning_rate_init, + power_t=power_t, + max_iter=max_iter, + loss="log_loss", + shuffle=shuffle, + random_state=random_state, + tol=tol, + verbose=verbose, + warm_start=warm_start, + momentum=momentum, + nesterovs_momentum=nesterovs_momentum, + early_stopping=early_stopping, + validation_fraction=validation_fraction, + beta_1=beta_1, + beta_2=beta_2, + epsilon=epsilon, + n_iter_no_change=n_iter_no_change, + max_fun=max_fun, + ) + + def _validate_input(self, X, y, incremental, reset): + X, y = validate_data( + self, + X, + y, + accept_sparse=["csr", "csc"], + multi_output=True, + dtype=(np.float64, np.float32), + reset=reset, + ) + if y.ndim == 2 and y.shape[1] == 1: + y = column_or_1d(y, warn=True) + + # Matrix of actions to be taken under the possible combinations: + # The case that incremental == True and classes_ not defined is + # already checked by _check_partial_fit_first_call that is called + # in _partial_fit below. + # The cases are already grouped into the respective if blocks below. + # + # incremental warm_start classes_ def action + # 0 0 0 define classes_ + # 0 1 0 define classes_ + # 0 0 1 redefine classes_ + # + # 0 1 1 check compat warm_start + # 1 1 1 check compat warm_start + # + # 1 0 1 check compat last fit + # + # Note the reliance on short-circuiting here, so that the second + # or part implies that classes_ is defined. + if (not hasattr(self, "classes_")) or (not self.warm_start and not incremental): + self._label_binarizer = LabelBinarizer() + self._label_binarizer.fit(y) + self.classes_ = self._label_binarizer.classes_ + else: + classes = unique_labels(y) + if self.warm_start: + if set(classes) != set(self.classes_): + raise ValueError( + "warm_start can only be used where `y` has the same " + "classes as in the previous call to fit. Previously " + f"got {self.classes_}, `y` has {classes}" + ) + elif len(np.setdiff1d(classes, self.classes_, assume_unique=True)): + raise ValueError( + "`y` has classes not in `self.classes_`. " + f"`self.classes_` has {self.classes_}. 'y' has {classes}." + ) + + # This downcast to bool is to prevent upcasting when working with + # float32 data + y = self._label_binarizer.transform(y).astype(bool) + return X, y + + def predict(self, X): + """Predict using the multi-layer perceptron classifier. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input data. + + Returns + ------- + y : ndarray, shape (n_samples,) or (n_samples, n_classes) + The predicted classes. 
+ """ + check_is_fitted(self) + return self._predict(X) + + def _predict(self, X, check_input=True): + """Private predict method with optional input validation""" + y_pred = self._forward_pass_fast(X, check_input=check_input) + + if self.n_outputs_ == 1: + y_pred = y_pred.ravel() + + return self._label_binarizer.inverse_transform(y_pred) + + def _score(self, X, y, sample_weight=None): + return super()._score_with_function( + X, y, sample_weight=sample_weight, score_function=accuracy_score + ) + + @available_if(lambda est: est._check_solver()) + @_fit_context(prefer_skip_nested_validation=True) + def partial_fit(self, X, y, sample_weight=None, classes=None): + """Update the model with a single iteration over the given data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input data. + + y : array-like of shape (n_samples,) + The target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + .. versionadded:: 1.7 + + classes : array of shape (n_classes,), default=None + Classes across all calls to partial_fit. + Can be obtained via `np.unique(y_all)`, where y_all is the + target vector of the entire dataset. + This argument is required for the first call to partial_fit + and can be omitted in the subsequent calls. + Note that y doesn't need to contain all labels in `classes`. + + Returns + ------- + self : object + Trained MLP model. + """ + if _check_partial_fit_first_call(self, classes): + self._label_binarizer = LabelBinarizer() + if type_of_target(y).startswith("multilabel"): + self._label_binarizer.fit(y) + else: + self._label_binarizer.fit(classes) + + return self._fit(X, y, sample_weight=sample_weight, incremental=True) + + def predict_log_proba(self, X): + """Return the log of probability estimates. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + The input data. + + Returns + ------- + log_y_prob : ndarray of shape (n_samples, n_classes) + The predicted log-probability of the sample for each class + in the model, where classes are ordered as they are in + `self.classes_`. Equivalent to `log(predict_proba(X))`. + """ + y_prob = self.predict_proba(X) + return np.log(y_prob, out=y_prob) + + def predict_proba(self, X): + """Probability estimates. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input data. + + Returns + ------- + y_prob : ndarray of shape (n_samples, n_classes) + The predicted probability of the sample for each class in the + model, where classes are ordered as they are in `self.classes_`. + """ + check_is_fitted(self) + y_pred = self._forward_pass_fast(X) + + if self.n_outputs_ == 1: + y_pred = y_pred.ravel() + + if y_pred.ndim == 1: + return np.vstack([1 - y_pred, y_pred]).T + else: + return y_pred + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.classifier_tags.multi_label = True + return tags + + +class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron): + """Multi-layer Perceptron regressor. + + This model optimizes the squared error using LBFGS or stochastic gradient + descent. + + .. versionadded:: 0.18 + + Parameters + ---------- + loss : {'squared_error', 'poisson'}, default='squared_error' + The loss function to use when training the weights. Note that the + "squared error" and "poisson" losses actually implement + "half squares error" and "half poisson deviance" to simplify the + computation of the gradient. 
Furthermore, the "poisson" loss internally uses + a log-link (exponential as the output activation function) and requires + ``y >= 0``. + + .. versionchanged:: 1.7 + Added parameter `loss` and option 'poisson'. + + hidden_layer_sizes : array-like of shape(n_layers - 2,), default=(100,) + The ith element represents the number of neurons in the ith + hidden layer. + + activation : {'identity', 'logistic', 'tanh', 'relu'}, default='relu' + Activation function for the hidden layer. + + - 'identity', no-op activation, useful to implement linear bottleneck, + returns f(x) = x + + - 'logistic', the logistic sigmoid function, + returns f(x) = 1 / (1 + exp(-x)). + + - 'tanh', the hyperbolic tan function, + returns f(x) = tanh(x). + + - 'relu', the rectified linear unit function, + returns f(x) = max(0, x) + + solver : {'lbfgs', 'sgd', 'adam'}, default='adam' + The solver for weight optimization. + + - 'lbfgs' is an optimizer in the family of quasi-Newton methods. + + - 'sgd' refers to stochastic gradient descent. + + - 'adam' refers to a stochastic gradient-based optimizer proposed by + Kingma, Diederik, and Jimmy Ba + + For a comparison between Adam optimizer and SGD, see + :ref:`sphx_glr_auto_examples_neural_networks_plot_mlp_training_curves.py`. + + Note: The default solver 'adam' works pretty well on relatively + large datasets (with thousands of training samples or more) in terms of + both training time and validation score. + For small datasets, however, 'lbfgs' can converge faster and perform + better. + + alpha : float, default=0.0001 + Strength of the L2 regularization term. The L2 regularization term + is divided by the sample size when added to the loss. + + batch_size : int, default='auto' + Size of minibatches for stochastic optimizers. + If the solver is 'lbfgs', the regressor will not use minibatch. + When set to "auto", `batch_size=min(200, n_samples)`. + + learning_rate : {'constant', 'invscaling', 'adaptive'}, default='constant' + Learning rate schedule for weight updates. + + - 'constant' is a constant learning rate given by + 'learning_rate_init'. + + - 'invscaling' gradually decreases the learning rate ``learning_rate_`` + at each time step 't' using an inverse scaling exponent of 'power_t'. + effective_learning_rate = learning_rate_init / pow(t, power_t) + + - 'adaptive' keeps the learning rate constant to + 'learning_rate_init' as long as training loss keeps decreasing. + Each time two consecutive epochs fail to decrease training loss by at + least tol, or fail to increase validation score by at least tol if + 'early_stopping' is on, the current learning rate is divided by 5. + + Only used when solver='sgd'. + + learning_rate_init : float, default=0.001 + The initial learning rate used. It controls the step-size + in updating the weights. Only used when solver='sgd' or 'adam'. + + power_t : float, default=0.5 + The exponent for inverse scaling learning rate. + It is used in updating effective learning rate when the learning_rate + is set to 'invscaling'. Only used when solver='sgd'. + + max_iter : int, default=200 + Maximum number of iterations. The solver iterates until convergence + (determined by 'tol') or this number of iterations. For stochastic + solvers ('sgd', 'adam'), note that this determines the number of epochs + (how many times each data point will be used), not the number of + gradient steps. + + shuffle : bool, default=True + Whether to shuffle samples in each iteration. Only used when + solver='sgd' or 'adam'. 
+ + random_state : int, RandomState instance, default=None + Determines random number generation for weights and bias + initialization, train-test split if early stopping is used, and batch + sampling when solver='sgd' or 'adam'. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + tol : float, default=1e-4 + Tolerance for the optimization. When the loss or score is not improving + by at least ``tol`` for ``n_iter_no_change`` consecutive iterations, + unless ``learning_rate`` is set to 'adaptive', convergence is + considered to be reached and training stops. + + verbose : bool, default=False + Whether to print progress messages to stdout. + + warm_start : bool, default=False + When set to True, reuse the solution of the previous + call to fit as initialization, otherwise, just erase the + previous solution. See :term:`the Glossary `. + + momentum : float, default=0.9 + Momentum for gradient descent update. Should be between 0 and 1. Only + used when solver='sgd'. + + nesterovs_momentum : bool, default=True + Whether to use Nesterov's momentum. Only used when solver='sgd' and + momentum > 0. + + early_stopping : bool, default=False + Whether to use early stopping to terminate training when validation + score is not improving. If set to True, it will automatically set + aside ``validation_fraction`` of training data as validation and + terminate training when validation score is not improving by at + least ``tol`` for ``n_iter_no_change`` consecutive epochs. + Only effective when solver='sgd' or 'adam'. + + validation_fraction : float, default=0.1 + The proportion of training data to set aside as validation set for + early stopping. Must be between 0 and 1. + Only used if early_stopping is True. + + beta_1 : float, default=0.9 + Exponential decay rate for estimates of first moment vector in adam, + should be in [0, 1). Only used when solver='adam'. + + beta_2 : float, default=0.999 + Exponential decay rate for estimates of second moment vector in adam, + should be in [0, 1). Only used when solver='adam'. + + epsilon : float, default=1e-8 + Value for numerical stability in adam. Only used when solver='adam'. + + n_iter_no_change : int, default=10 + Maximum number of epochs to not meet ``tol`` improvement. + Only effective when solver='sgd' or 'adam'. + + .. versionadded:: 0.20 + + max_fun : int, default=15000 + Only used when solver='lbfgs'. Maximum number of function calls. + The solver iterates until convergence (determined by ``tol``), number + of iterations reaches max_iter, or this number of function calls. + Note that number of function calls will be greater than or equal to + the number of iterations for the MLPRegressor. + + .. versionadded:: 0.22 + + Attributes + ---------- + loss_ : float + The current loss computed with the loss function. + + best_loss_ : float + The minimum loss reached by the solver throughout fitting. + If `early_stopping=True`, this attribute is set to `None`. Refer to + the `best_validation_score_` fitted attribute instead. + Only accessible when solver='sgd' or 'adam'. + + loss_curve_ : list of shape (`n_iter_`,) + Loss value evaluated at the end of each training step. + The ith element in the list represents the loss at the ith iteration. + Only accessible when solver='sgd' or 'adam'. + + validation_scores_ : list of shape (`n_iter_`,) or None + The score at each iteration on a held-out validation set. The score + reported is the R2 score. 
Only available if `early_stopping=True`, + otherwise the attribute is set to `None`. + Only accessible when solver='sgd' or 'adam'. + + best_validation_score_ : float or None + The best validation score (i.e. R2 score) that triggered the + early stopping. Only available if `early_stopping=True`, otherwise the + attribute is set to `None`. + Only accessible when solver='sgd' or 'adam'. + + t_ : int + The number of training samples seen by the solver during fitting. + Mathematically equals `n_iters * X.shape[0]`, it means + `time_step` and it is used by optimizer's learning rate scheduler. + + coefs_ : list of shape (n_layers - 1,) + The ith element in the list represents the weight matrix corresponding + to layer i. + + intercepts_ : list of shape (n_layers - 1,) + The ith element in the list represents the bias vector corresponding to + layer i + 1. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + The number of iterations the solver has run. + + n_layers_ : int + Number of layers. + + n_outputs_ : int + Number of outputs. + + out_activation_ : str + Name of the output activation function. + + See Also + -------- + BernoulliRBM : Bernoulli Restricted Boltzmann Machine (RBM). + MLPClassifier : Multi-layer Perceptron classifier. + sklearn.linear_model.SGDRegressor : Linear model fitted by minimizing + a regularized empirical loss with SGD. + + Notes + ----- + MLPRegressor trains iteratively since at each time step + the partial derivatives of the loss function with respect to the model + parameters are computed to update the parameters. + + It can also have a regularization term added to the loss function + that shrinks model parameters to prevent overfitting. + + This implementation works with data represented as dense and sparse numpy + arrays of floating point values. + + References + ---------- + Hinton, Geoffrey E. "Connectionist learning procedures." + Artificial intelligence 40.1 (1989): 185-234. + + Glorot, Xavier, and Yoshua Bengio. + "Understanding the difficulty of training deep feedforward neural networks." + International Conference on Artificial Intelligence and Statistics. 2010. + + :arxiv:`He, Kaiming, et al (2015). "Delving deep into rectifiers: + Surpassing human-level performance on imagenet classification." <1502.01852>` + + :arxiv:`Kingma, Diederik, and Jimmy Ba (2014) + "Adam: A method for stochastic optimization." <1412.6980>` + + Examples + -------- + >>> from sklearn.neural_network import MLPRegressor + >>> from sklearn.datasets import make_regression + >>> from sklearn.model_selection import train_test_split + >>> X, y = make_regression(n_samples=200, n_features=20, random_state=1) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, + ... 
random_state=1) + >>> regr = MLPRegressor(random_state=1, max_iter=2000, tol=0.1) + >>> regr.fit(X_train, y_train) + MLPRegressor(max_iter=2000, random_state=1, tol=0.1) + >>> regr.predict(X_test[:2]) + array([ 28.98, -291]) + >>> regr.score(X_test, y_test) + 0.98 + """ + + _parameter_constraints: dict = { + **BaseMultilayerPerceptron._parameter_constraints, + "loss": [StrOptions({"squared_error", "poisson"})], + } + + def __init__( + self, + loss="squared_error", + hidden_layer_sizes=(100,), + activation="relu", + *, + solver="adam", + alpha=0.0001, + batch_size="auto", + learning_rate="constant", + learning_rate_init=0.001, + power_t=0.5, + max_iter=200, + shuffle=True, + random_state=None, + tol=1e-4, + verbose=False, + warm_start=False, + momentum=0.9, + nesterovs_momentum=True, + early_stopping=False, + validation_fraction=0.1, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-8, + n_iter_no_change=10, + max_fun=15000, + ): + super().__init__( + hidden_layer_sizes=hidden_layer_sizes, + activation=activation, + solver=solver, + alpha=alpha, + batch_size=batch_size, + learning_rate=learning_rate, + learning_rate_init=learning_rate_init, + power_t=power_t, + max_iter=max_iter, + loss=loss, + shuffle=shuffle, + random_state=random_state, + tol=tol, + verbose=verbose, + warm_start=warm_start, + momentum=momentum, + nesterovs_momentum=nesterovs_momentum, + early_stopping=early_stopping, + validation_fraction=validation_fraction, + beta_1=beta_1, + beta_2=beta_2, + epsilon=epsilon, + n_iter_no_change=n_iter_no_change, + max_fun=max_fun, + ) + + def predict(self, X): + """Predict using the multi-layer perceptron model. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input data. + + Returns + ------- + y : ndarray of shape (n_samples, n_outputs) + The predicted values. + """ + check_is_fitted(self) + return self._predict(X) + + def _predict(self, X, check_input=True): + """Private predict method with optional input validation""" + y_pred = self._forward_pass_fast(X, check_input=check_input) + if y_pred.shape[1] == 1: + return y_pred.ravel() + return y_pred + + def _score(self, X, y, sample_weight=None): + return super()._score_with_function( + X, y, sample_weight=sample_weight, score_function=r2_score + ) + + def _validate_input(self, X, y, incremental, reset): + X, y = validate_data( + self, + X, + y, + accept_sparse=["csr", "csc"], + multi_output=True, + y_numeric=True, + dtype=(np.float64, np.float32), + reset=reset, + ) + if y.ndim == 2 and y.shape[1] == 1: + y = column_or_1d(y, warn=True) + return X, y + + @available_if(lambda est: est._check_solver) + @_fit_context(prefer_skip_nested_validation=True) + def partial_fit(self, X, y, sample_weight=None): + """Update the model with a single iteration over the given data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input data. + + y : ndarray of shape (n_samples,) + The target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + .. versionadded:: 1.6 + + Returns + ------- + self : object + Trained MLP model. 
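        An editorial usage sketch (toy data made up purely for illustration):
        `partial_fit` performs a single pass over each supplied chunk, so a model
        can be trained incrementally without holding all data in memory. It
        requires a stochastic solver ('sgd' or 'adam', the default).

            import numpy as np
            from sklearn.neural_network import MLPRegressor

            rng = np.random.RandomState(0)
            X = rng.rand(1000, 20)
            y = X.sum(axis=1)                      # toy target
            reg = MLPRegressor(hidden_layer_sizes=(50,), random_state=0)
            for X_chunk, y_chunk in zip(np.array_split(X, 10), np.array_split(y, 10)):
                reg.partial_fit(X_chunk, y_chunk)  # one pass over this chunk per call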
+ """ + return self._fit(X, y, sample_weight=sample_weight, incremental=True) diff --git a/.venv/lib/python3.12/site-packages/sklearn/neural_network/_rbm.py b/.venv/lib/python3.12/site-packages/sklearn/neural_network/_rbm.py new file mode 100644 index 0000000000000000000000000000000000000000..1e1d3c2e11b7cd8a43b57aefeda4a93903698264 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neural_network/_rbm.py @@ -0,0 +1,445 @@ +"""Restricted Boltzmann Machine""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import time +from numbers import Integral, Real + +import numpy as np +import scipy.sparse as sp +from scipy.special import expit # logistic function + +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + TransformerMixin, + _fit_context, +) +from ..utils import check_random_state, gen_even_slices +from ..utils._param_validation import Interval +from ..utils.extmath import safe_sparse_dot +from ..utils.validation import check_is_fitted, validate_data + + +class BernoulliRBM(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): + """Bernoulli Restricted Boltzmann Machine (RBM). + + A Restricted Boltzmann Machine with binary visible units and + binary hidden units. Parameters are estimated using Stochastic Maximum + Likelihood (SML), also known as Persistent Contrastive Divergence (PCD) + [2]. + + The time complexity of this implementation is ``O(d ** 2)`` assuming + d ~ n_features ~ n_components. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int, default=256 + Number of binary hidden units. + + learning_rate : float, default=0.1 + The learning rate for weight updates. It is *highly* recommended + to tune this hyper-parameter. Reasonable values are in the + 10**[0., -3.] range. + + batch_size : int, default=10 + Number of examples per minibatch. + + n_iter : int, default=10 + Number of iterations/sweeps over the training dataset to perform + during training. + + verbose : int, default=0 + The verbosity level. The default, zero, means silent mode. Range + of values is [0, inf]. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for: + + - Gibbs sampling from visible and hidden layers. + + - Initializing components, sampling from layers during fit. + + - Corrupting the data when scoring samples. + + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + Attributes + ---------- + intercept_hidden_ : array-like of shape (n_components,) + Biases of the hidden units. + + intercept_visible_ : array-like of shape (n_features,) + Biases of the visible units. + + components_ : array-like of shape (n_components, n_features) + Weight matrix, where `n_features` is the number of + visible units and `n_components` is the number of hidden units. + + h_samples_ : array-like of shape (batch_size, n_components) + Hidden Activation sampled from the model distribution, + where `batch_size` is the number of examples per minibatch and + `n_components` is the number of hidden units. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + sklearn.neural_network.MLPRegressor : Multi-layer Perceptron regressor. 
+ sklearn.neural_network.MLPClassifier : Multi-layer Perceptron classifier. + sklearn.decomposition.PCA : An unsupervised linear dimensionality + reduction model. + + References + ---------- + + [1] Hinton, G. E., Osindero, S. and Teh, Y. A fast learning algorithm for + deep belief nets. Neural Computation 18, pp 1527-1554. + https://www.cs.toronto.edu/~hinton/absps/fastnc.pdf + + [2] Tieleman, T. Training Restricted Boltzmann Machines using + Approximations to the Likelihood Gradient. International Conference + on Machine Learning (ICML) 2008 + + Examples + -------- + + >>> import numpy as np + >>> from sklearn.neural_network import BernoulliRBM + >>> X = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]]) + >>> model = BernoulliRBM(n_components=2) + >>> model.fit(X) + BernoulliRBM(n_components=2) + + For a more detailed example usage, see + :ref:`sphx_glr_auto_examples_neural_networks_plot_rbm_logistic_classification.py`. + """ + + _parameter_constraints: dict = { + "n_components": [Interval(Integral, 1, None, closed="left")], + "learning_rate": [Interval(Real, 0, None, closed="neither")], + "batch_size": [Interval(Integral, 1, None, closed="left")], + "n_iter": [Interval(Integral, 0, None, closed="left")], + "verbose": ["verbose"], + "random_state": ["random_state"], + } + + def __init__( + self, + n_components=256, + *, + learning_rate=0.1, + batch_size=10, + n_iter=10, + verbose=0, + random_state=None, + ): + self.n_components = n_components + self.learning_rate = learning_rate + self.batch_size = batch_size + self.n_iter = n_iter + self.verbose = verbose + self.random_state = random_state + + def transform(self, X): + """Compute the hidden layer activation probabilities, P(h=1|v=X). + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to be transformed. + + Returns + ------- + h : ndarray of shape (n_samples, n_components) + Latent representations of the data. + """ + check_is_fitted(self) + + X = validate_data( + self, X, accept_sparse="csr", reset=False, dtype=(np.float64, np.float32) + ) + return self._mean_hiddens(X) + + def _mean_hiddens(self, v): + """Computes the probabilities P(h=1|v). + + Parameters + ---------- + v : ndarray of shape (n_samples, n_features) + Values of the visible layer. + + Returns + ------- + h : ndarray of shape (n_samples, n_components) + Corresponding mean field values for the hidden layer. + """ + p = safe_sparse_dot(v, self.components_.T) + p += self.intercept_hidden_ + return expit(p, out=p) + + def _sample_hiddens(self, v, rng): + """Sample from the distribution P(h|v). + + Parameters + ---------- + v : ndarray of shape (n_samples, n_features) + Values of the visible layer to sample from. + + rng : RandomState instance + Random number generator to use. + + Returns + ------- + h : ndarray of shape (n_samples, n_components) + Values of the hidden layer. + """ + p = self._mean_hiddens(v) + return rng.uniform(size=p.shape) < p + + def _sample_visibles(self, h, rng): + """Sample from the distribution P(v|h). + + Parameters + ---------- + h : ndarray of shape (n_samples, n_components) + Values of the hidden layer to sample from. + + rng : RandomState instance + Random number generator to use. + + Returns + ------- + v : ndarray of shape (n_samples, n_features) + Values of the visible layer. 
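        Notes (editorial): as implemented in the body below, the visible
        probabilities are P(v=1|h) = expit(h @ components_ + intercept_visible_)
        and the returned values are element-wise Bernoulli draws, i.e.
        `rng.uniform(size=p.shape) < p`.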
+ """ + p = np.dot(h, self.components_) + p += self.intercept_visible_ + expit(p, out=p) + return rng.uniform(size=p.shape) < p + + def _free_energy(self, v): + """Computes the free energy F(v) = - log sum_h exp(-E(v,h)). + + Parameters + ---------- + v : ndarray of shape (n_samples, n_features) + Values of the visible layer. + + Returns + ------- + free_energy : ndarray of shape (n_samples,) + The value of the free energy. + """ + return -safe_sparse_dot(v, self.intercept_visible_) - np.logaddexp( + 0, safe_sparse_dot(v, self.components_.T) + self.intercept_hidden_ + ).sum(axis=1) + + def gibbs(self, v): + """Perform one Gibbs sampling step. + + Parameters + ---------- + v : ndarray of shape (n_samples, n_features) + Values of the visible layer to start from. + + Returns + ------- + v_new : ndarray of shape (n_samples, n_features) + Values of the visible layer after one Gibbs step. + """ + check_is_fitted(self) + if not hasattr(self, "random_state_"): + self.random_state_ = check_random_state(self.random_state) + h_ = self._sample_hiddens(v, self.random_state_) + v_ = self._sample_visibles(h_, self.random_state_) + + return v_ + + @_fit_context(prefer_skip_nested_validation=True) + def partial_fit(self, X, y=None): + """Fit the model to the partial segment of the data X. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None + Target values (None for unsupervised transformations). + + Returns + ------- + self : BernoulliRBM + The fitted model. + """ + first_pass = not hasattr(self, "components_") + X = validate_data( + self, X, accept_sparse="csr", dtype=np.float64, reset=first_pass + ) + if not hasattr(self, "random_state_"): + self.random_state_ = check_random_state(self.random_state) + if not hasattr(self, "components_"): + self.components_ = np.asarray( + self.random_state_.normal(0, 0.01, (self.n_components, X.shape[1])), + order="F", + ) + self._n_features_out = self.components_.shape[0] + if not hasattr(self, "intercept_hidden_"): + self.intercept_hidden_ = np.zeros( + self.n_components, + ) + if not hasattr(self, "intercept_visible_"): + self.intercept_visible_ = np.zeros( + X.shape[1], + ) + if not hasattr(self, "h_samples_"): + self.h_samples_ = np.zeros((self.batch_size, self.n_components)) + + self._fit(X, self.random_state_) + + def _fit(self, v_pos, rng): + """Inner fit for one mini-batch. + + Adjust the parameters to maximize the likelihood of v using + Stochastic Maximum Likelihood (SML). + + Parameters + ---------- + v_pos : ndarray of shape (n_samples, n_features) + The data to use for training. + + rng : RandomState instance + Random number generator to use for sampling. + """ + h_pos = self._mean_hiddens(v_pos) + v_neg = self._sample_visibles(self.h_samples_, rng) + h_neg = self._mean_hiddens(v_neg) + + lr = float(self.learning_rate) / v_pos.shape[0] + update = safe_sparse_dot(v_pos.T, h_pos, dense_output=True).T + update -= np.dot(h_neg.T, v_neg) + self.components_ += lr * update + self.intercept_hidden_ += lr * (h_pos.sum(axis=0) - h_neg.sum(axis=0)) + self.intercept_visible_ += lr * ( + np.asarray(v_pos.sum(axis=0)).squeeze() - v_neg.sum(axis=0) + ) + + h_neg[rng.uniform(size=h_neg.shape) < h_neg] = 1.0 # sample binomial + self.h_samples_ = np.floor(h_neg, h_neg) + + def score_samples(self, X): + """Compute the pseudo-likelihood of X. 
+ + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Values of the visible layer. Must be all-boolean (not checked). + + Returns + ------- + pseudo_likelihood : ndarray of shape (n_samples,) + Value of the pseudo-likelihood (proxy for likelihood). + + Notes + ----- + This method is not deterministic: it computes a quantity called the + free energy on X, then on a randomly corrupted version of X, and + returns the log of the logistic function of the difference. + """ + check_is_fitted(self) + + v = validate_data(self, X, accept_sparse="csr", reset=False) + rng = check_random_state(self.random_state) + + # Randomly corrupt one feature in each sample in v. + ind = (np.arange(v.shape[0]), rng.randint(0, v.shape[1], v.shape[0])) + if sp.issparse(v): + data = -2 * v[ind] + 1 + if isinstance(data, np.matrix): # v is a sparse matrix + v_ = v + sp.csr_matrix((data.A.ravel(), ind), shape=v.shape) + else: # v is a sparse array + v_ = v + sp.csr_array((data.ravel(), ind), shape=v.shape) + else: + v_ = v.copy() + v_[ind] = 1 - v_[ind] + + fe = self._free_energy(v) + fe_ = self._free_energy(v_) + # log(expit(x)) = log(1 / (1 + exp(-x)) = -np.logaddexp(0, -x) + return -v.shape[1] * np.logaddexp(0, -(fe_ - fe)) + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit the model to the data X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None + Target values (None for unsupervised transformations). + + Returns + ------- + self : BernoulliRBM + The fitted model. + """ + X = validate_data(self, X, accept_sparse="csr", dtype=(np.float64, np.float32)) + n_samples = X.shape[0] + rng = check_random_state(self.random_state) + + self.components_ = np.asarray( + rng.normal(0, 0.01, (self.n_components, X.shape[1])), + order="F", + dtype=X.dtype, + ) + self._n_features_out = self.components_.shape[0] + self.intercept_hidden_ = np.zeros(self.n_components, dtype=X.dtype) + self.intercept_visible_ = np.zeros(X.shape[1], dtype=X.dtype) + self.h_samples_ = np.zeros((self.batch_size, self.n_components), dtype=X.dtype) + + n_batches = int(np.ceil(float(n_samples) / self.batch_size)) + batch_slices = list( + gen_even_slices(n_batches * self.batch_size, n_batches, n_samples=n_samples) + ) + verbose = self.verbose + begin = time.time() + for iteration in range(1, self.n_iter + 1): + for batch_slice in batch_slices: + self._fit(X[batch_slice], rng) + + if verbose: + end = time.time() + print( + "[%s] Iteration %d, pseudo-likelihood = %.2f, time = %.2fs" + % ( + type(self).__name__, + iteration, + self.score_samples(X).mean(), + end - begin, + ) + ) + begin = end + + return self + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + tags.transformer_tags.preserves_dtype = ["float64", "float32"] + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/neural_network/_stochastic_optimizers.py b/.venv/lib/python3.12/site-packages/sklearn/neural_network/_stochastic_optimizers.py new file mode 100644 index 0000000000000000000000000000000000000000..52641a91ce4d396dfbd1ab65116f7b8a937ff3e9 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neural_network/_stochastic_optimizers.py @@ -0,0 +1,287 @@ +"""Stochastic optimization methods for MLP""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numpy as np + 
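Editorial overview (not part of the module): the optimizer classes below expose a
small interface driven by the MLP training loop — `params` is the concatenated
list `coefs_ + intercepts_`, `grads` is an aligned list of gradients, and
`update_params` mutates the arrays in place. A minimal sketch under that
assumption (the import path is a private, unstable module and is shown purely for
illustration):

    import numpy as np
    from sklearn.neural_network._stochastic_optimizers import SGDOptimizer

    params = [np.zeros((3, 2)), np.zeros(2)]          # stand-ins for coefs_ + intercepts_
    grads = [np.full((3, 2), 0.5), np.full(2, 0.5)]   # gradients aligned with params
    opt = SGDOptimizer(params, learning_rate_init=0.1, momentum=0.9, nesterov=False)
    opt.update_params(params, grads)                  # in place: param += momentum*velocity - lr*grad
    print(params[1])                                  # [-0.05 -0.05] after the first update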
+ +class BaseOptimizer: + """Base (Stochastic) gradient descent optimizer + + Parameters + ---------- + learning_rate_init : float, default=0.1 + The initial learning rate used. It controls the step-size in updating + the weights + + Attributes + ---------- + learning_rate : float + the current learning rate + """ + + def __init__(self, learning_rate_init=0.1): + self.learning_rate_init = learning_rate_init + self.learning_rate = float(learning_rate_init) + + def update_params(self, params, grads): + """Update parameters with given gradients + + Parameters + ---------- + params : list of length = len(coefs_) + len(intercepts_) + The concatenated list containing coefs_ and intercepts_ in MLP + model. Used for initializing velocities and updating params + + grads : list of length = len(params) + Containing gradients with respect to coefs_ and intercepts_ in MLP + model. So length should be aligned with params + """ + updates = self._get_updates(grads) + for param, update in zip((p for p in params), updates): + param += update + + def iteration_ends(self, time_step): + """Perform update to learning rate and potentially other states at the + end of an iteration + """ + pass + + def trigger_stopping(self, msg, verbose): + """Decides whether it is time to stop training + + Parameters + ---------- + msg : str + Message passed in for verbose output + + verbose : bool + Print message to stdin if True + + Returns + ------- + is_stopping : bool + True if training needs to stop + """ + if verbose: + print(msg + " Stopping.") + return True + + +class SGDOptimizer(BaseOptimizer): + """Stochastic gradient descent optimizer with momentum + + Parameters + ---------- + params : list, length = len(coefs_) + len(intercepts_) + The concatenated list containing coefs_ and intercepts_ in MLP model. + Used for initializing velocities and updating params + + learning_rate_init : float, default=0.1 + The initial learning rate used. It controls the step-size in updating + the weights + + lr_schedule : {'constant', 'adaptive', 'invscaling'}, default='constant' + Learning rate schedule for weight updates. + + -'constant', is a constant learning rate given by + 'learning_rate_init'. + + -'invscaling' gradually decreases the learning rate 'learning_rate_' at + each time step 't' using an inverse scaling exponent of 'power_t'. + learning_rate_ = learning_rate_init / pow(t, power_t) + + -'adaptive', keeps the learning rate constant to + 'learning_rate_init' as long as the training keeps decreasing. + Each time 2 consecutive epochs fail to decrease the training loss by + tol, or fail to increase validation score by tol if 'early_stopping' + is on, the current learning rate is divided by 5. + + momentum : float, default=0.9 + Value of momentum used, must be larger than or equal to 0 + + nesterov : bool, default=True + Whether to use nesterov's momentum or not. Use nesterov's if True + + power_t : float, default=0.5 + Power of time step 't' in inverse scaling. See `lr_schedule` for + more details. 
+ + Attributes + ---------- + learning_rate : float + the current learning rate + + velocities : list, length = len(params) + velocities that are used to update params + """ + + def __init__( + self, + params, + learning_rate_init=0.1, + lr_schedule="constant", + momentum=0.9, + nesterov=True, + power_t=0.5, + ): + super().__init__(learning_rate_init) + + self.lr_schedule = lr_schedule + self.momentum = momentum + self.nesterov = nesterov + self.power_t = power_t + self.velocities = [np.zeros_like(param) for param in params] + + def iteration_ends(self, time_step): + """Perform updates to learning rate and potential other states at the + end of an iteration + + Parameters + ---------- + time_step : int + number of training samples trained on so far, used to update + learning rate for 'invscaling' + """ + if self.lr_schedule == "invscaling": + self.learning_rate = ( + float(self.learning_rate_init) / (time_step + 1) ** self.power_t + ) + + def trigger_stopping(self, msg, verbose): + if self.lr_schedule != "adaptive": + if verbose: + print(msg + " Stopping.") + return True + + if self.learning_rate <= 1e-6: + if verbose: + print(msg + " Learning rate too small. Stopping.") + return True + + self.learning_rate /= 5.0 + if verbose: + print(msg + " Setting learning rate to %f" % self.learning_rate) + return False + + def _get_updates(self, grads): + """Get the values used to update params with given gradients + + Parameters + ---------- + grads : list, length = len(coefs_) + len(intercepts_) + Containing gradients with respect to coefs_ and intercepts_ in MLP + model. So length should be aligned with params + + Returns + ------- + updates : list, length = len(grads) + The values to add to params + """ + updates = [ + self.momentum * velocity - self.learning_rate * grad + for velocity, grad in zip(self.velocities, grads) + ] + self.velocities = updates + + if self.nesterov: + updates = [ + self.momentum * velocity - self.learning_rate * grad + for velocity, grad in zip(self.velocities, grads) + ] + + return updates + + +class AdamOptimizer(BaseOptimizer): + """Stochastic gradient descent optimizer with Adam + + Note: All default values are from the original Adam paper + + Parameters + ---------- + params : list, length = len(coefs_) + len(intercepts_) + The concatenated list containing coefs_ and intercepts_ in MLP model. + Used for initializing velocities and updating params + + learning_rate_init : float, default=0.001 + The initial learning rate used. It controls the step-size in updating + the weights + + beta_1 : float, default=0.9 + Exponential decay rate for estimates of first moment vector, should be + in [0, 1) + + beta_2 : float, default=0.999 + Exponential decay rate for estimates of second moment vector, should be + in [0, 1) + + epsilon : float, default=1e-8 + Value for numerical stability + + Attributes + ---------- + learning_rate : float + The current learning rate + + t : int + Timestep + + ms : list, length = len(params) + First moment vectors + + vs : list, length = len(params) + Second moment vectors + + References + ---------- + :arxiv:`Kingma, Diederik, and Jimmy Ba (2014) "Adam: A method for + stochastic optimization." 
<1412.6980> + """ + + def __init__( + self, params, learning_rate_init=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8 + ): + super().__init__(learning_rate_init) + + self.beta_1 = beta_1 + self.beta_2 = beta_2 + self.epsilon = epsilon + self.t = 0 + self.ms = [np.zeros_like(param) for param in params] + self.vs = [np.zeros_like(param) for param in params] + + def _get_updates(self, grads): + """Get the values used to update params with given gradients + + Parameters + ---------- + grads : list, length = len(coefs_) + len(intercepts_) + Containing gradients with respect to coefs_ and intercepts_ in MLP + model. So length should be aligned with params + + Returns + ------- + updates : list, length = len(grads) + The values to add to params + """ + self.t += 1 + self.ms = [ + self.beta_1 * m + (1 - self.beta_1) * grad + for m, grad in zip(self.ms, grads) + ] + self.vs = [ + self.beta_2 * v + (1 - self.beta_2) * (grad**2) + for v, grad in zip(self.vs, grads) + ] + self.learning_rate = ( + self.learning_rate_init + * np.sqrt(1 - self.beta_2**self.t) + / (1 - self.beta_1**self.t) + ) + updates = [ + -self.learning_rate * m / (np.sqrt(v) + self.epsilon) + for m, v in zip(self.ms, self.vs) + ] + return updates diff --git a/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/test_base.py b/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/test_base.py new file mode 100644 index 0000000000000000000000000000000000000000..598b7e6054eead605e47fbf4e067ba2119f8d5b6 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/test_base.py @@ -0,0 +1,52 @@ +import numpy as np +import pytest + +from sklearn._loss import HalfPoissonLoss +from sklearn.neural_network._base import binary_log_loss, log_loss, poisson_loss + + +def test_binary_log_loss_1_prob_finite(): + # y_proba is equal to one should result in a finite logloss + y_true = np.array([[0, 0, 1]]).T + y_prob = np.array([[0.9, 1.0, 1.0]]).T + + loss = binary_log_loss(y_true, y_prob) + assert np.isfinite(loss) + + +@pytest.mark.parametrize( + "y_true, y_prob", + [ + ( + np.array([[1, 0, 0], [0, 1, 0]]), + np.array([[0.0, 1.0, 0.0], [0.9, 0.05, 0.05]]), + ), + (np.array([[0, 0, 1]]).T, np.array([[0.9, 1.0, 1.0]]).T), + ], +) +def test_log_loss_1_prob_finite(y_true, y_prob): + # y_proba is equal to 1 should result in a finite logloss + loss = log_loss(y_true, y_prob) + assert np.isfinite(loss) + + +def test_poisson_loss(global_random_seed): + """Test Poisson loss against well tested HalfPoissonLoss.""" + n = 1000 + rng = np.random.default_rng(global_random_seed) + y_true = rng.integers(low=0, high=10, size=n).astype(float) + y_raw = rng.standard_normal(n) + y_pred = np.exp(y_raw) + sw = rng.uniform(low=0.1, high=10, size=n) + + assert 0 in y_true + + loss = poisson_loss(y_true=y_true, y_pred=y_pred, sample_weight=sw) + pl = HalfPoissonLoss() + loss_ref = ( + pl(y_true=y_true, raw_prediction=y_raw, sample_weight=sw) + + pl.constant_to_optimal_zero(y_true=y_true, sample_weight=sw).mean() + / sw.mean() + ) + + assert loss == pytest.approx(loss_ref, rel=1e-12) diff --git a/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/test_mlp.py b/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/test_mlp.py new file mode 
100644 index 0000000000000000000000000000000000000000..9dddb78223ea71cfdfa9dfa9755fe74efef6a42c --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/test_mlp.py @@ -0,0 +1,1094 @@ +""" +Testing for Multi-layer Perceptron module (sklearn.neural_network) +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import re +import sys +import warnings +from io import StringIO + +import joblib +import numpy as np +import pytest + +from sklearn.datasets import ( + load_digits, + load_iris, + make_multilabel_classification, + make_regression, +) +from sklearn.exceptions import ConvergenceWarning +from sklearn.linear_model import PoissonRegressor +from sklearn.metrics import roc_auc_score +from sklearn.neural_network import MLPClassifier, MLPRegressor +from sklearn.preprocessing import LabelBinarizer, MinMaxScaler, scale +from sklearn.utils._testing import ( + assert_allclose, + assert_almost_equal, + assert_array_equal, + ignore_warnings, +) +from sklearn.utils.fixes import CSR_CONTAINERS + +ACTIVATION_TYPES = ["identity", "logistic", "tanh", "relu"] + +X_digits, y_digits = load_digits(n_class=3, return_X_y=True) + +X_digits_multi = MinMaxScaler().fit_transform(X_digits[:200]) +y_digits_multi = y_digits[:200] + +X_digits, y_digits = load_digits(n_class=2, return_X_y=True) + +X_digits_binary = MinMaxScaler().fit_transform(X_digits[:200]) +y_digits_binary = y_digits[:200] + +classification_datasets = [ + (X_digits_multi, y_digits_multi), + (X_digits_binary, y_digits_binary), +] + +X_reg, y_reg = make_regression( + n_samples=200, n_features=10, bias=20.0, noise=100.0, random_state=7 +) +y_reg = scale(y_reg) +regression_datasets = [(X_reg, y_reg)] + +iris = load_iris() + +X_iris = iris.data +y_iris = iris.target + + +def test_alpha(): + # Test that larger alpha yields weights closer to zero + X = X_digits_binary[:100] + y = y_digits_binary[:100] + + alpha_vectors = [] + alpha_values = np.arange(2) + absolute_sum = lambda x: np.sum(np.abs(x)) + + for alpha in alpha_values: + mlp = MLPClassifier(hidden_layer_sizes=10, alpha=alpha, random_state=1) + with ignore_warnings(category=ConvergenceWarning): + mlp.fit(X, y) + alpha_vectors.append( + np.array([absolute_sum(mlp.coefs_[0]), absolute_sum(mlp.coefs_[1])]) + ) + + for i in range(len(alpha_values) - 1): + assert (alpha_vectors[i] > alpha_vectors[i + 1]).all() + + +def test_fit(): + # Test that the algorithm solution is equal to a worked out example. 
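    # (Editorial summary of what follows: the test pins the initial coefs_ and
    # intercepts_ to fixed values, runs a single partial_fit step with plain SGD
    # (learning_rate_init=0.1, momentum=0, one sample, logistic activations), and
    # then checks the updated weights and predict_proba against the hand-derived
    # backpropagation values reproduced in the long comment below.)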
+ X = np.array([[0.6, 0.8, 0.7]]) + y = np.array([0]) + mlp = MLPClassifier( + solver="sgd", + learning_rate_init=0.1, + alpha=0.1, + activation="logistic", + random_state=1, + max_iter=1, + hidden_layer_sizes=2, + momentum=0, + ) + # set weights + mlp.coefs_ = [0] * 2 + mlp.intercepts_ = [0] * 2 + mlp.n_outputs_ = 1 + mlp.coefs_[0] = np.array([[0.1, 0.2], [0.3, 0.1], [0.5, 0]]) + mlp.coefs_[1] = np.array([[0.1], [0.2]]) + mlp.intercepts_[0] = np.array([0.1, 0.1]) + mlp.intercepts_[1] = np.array([1.0]) + mlp._coef_grads = [] * 2 + mlp._intercept_grads = [] * 2 + mlp.n_features_in_ = 3 + + # Initialize parameters + mlp.n_iter_ = 0 + mlp.learning_rate_ = 0.1 + + # Compute the number of layers + mlp.n_layers_ = 3 + + # Pre-allocate gradient matrices + mlp._coef_grads = [0] * (mlp.n_layers_ - 1) + mlp._intercept_grads = [0] * (mlp.n_layers_ - 1) + + mlp.out_activation_ = "logistic" + mlp.t_ = 0 + mlp.best_loss_ = np.inf + mlp.loss_curve_ = [] + mlp._no_improvement_count = 0 + mlp._intercept_velocity = [ + np.zeros_like(intercepts) for intercepts in mlp.intercepts_ + ] + mlp._coef_velocity = [np.zeros_like(coefs) for coefs in mlp.coefs_] + + mlp.partial_fit(X, y, classes=[0, 1]) + # Manually worked out example + # h1 = g(X1 * W_i1 + b11) = g(0.6 * 0.1 + 0.8 * 0.3 + 0.7 * 0.5 + 0.1) + # = 0.679178699175393 + # h2 = g(X2 * W_i2 + b12) = g(0.6 * 0.2 + 0.8 * 0.1 + 0.7 * 0 + 0.1) + # = 0.574442516811659 + # o1 = g(h * W2 + b21) = g(0.679 * 0.1 + 0.574 * 0.2 + 1) + # = 0.7654329236196236 + # d21 = -(0 - 0.765) = 0.765 + # d11 = (1 - 0.679) * 0.679 * 0.765 * 0.1 = 0.01667 + # d12 = (1 - 0.574) * 0.574 * 0.765 * 0.2 = 0.0374 + # W1grad11 = X1 * d11 + alpha * W11 = 0.6 * 0.01667 + 0.1 * 0.1 = 0.0200 + # W1grad11 = X1 * d12 + alpha * W12 = 0.6 * 0.0374 + 0.1 * 0.2 = 0.04244 + # W1grad21 = X2 * d11 + alpha * W13 = 0.8 * 0.01667 + 0.1 * 0.3 = 0.043336 + # W1grad22 = X2 * d12 + alpha * W14 = 0.8 * 0.0374 + 0.1 * 0.1 = 0.03992 + # W1grad31 = X3 * d11 + alpha * W15 = 0.6 * 0.01667 + 0.1 * 0.5 = 0.060002 + # W1grad32 = X3 * d12 + alpha * W16 = 0.6 * 0.0374 + 0.1 * 0 = 0.02244 + # W2grad1 = h1 * d21 + alpha * W21 = 0.679 * 0.765 + 0.1 * 0.1 = 0.5294 + # W2grad2 = h2 * d21 + alpha * W22 = 0.574 * 0.765 + 0.1 * 0.2 = 0.45911 + # b1grad1 = d11 = 0.01667 + # b1grad2 = d12 = 0.0374 + # b2grad = d21 = 0.765 + # W1 = W1 - eta * [W1grad11, .., W1grad32] = [[0.1, 0.2], [0.3, 0.1], + # [0.5, 0]] - 0.1 * [[0.0200, 0.04244], [0.043336, 0.03992], + # [0.060002, 0.02244]] = [[0.098, 0.195756], [0.2956664, + # 0.096008], [0.4939998, -0.002244]] + # W2 = W2 - eta * [W2grad1, W2grad2] = [[0.1], [0.2]] - 0.1 * + # [[0.5294], [0.45911]] = [[0.04706], [0.154089]] + # b1 = b1 - eta * [b1grad1, b1grad2] = 0.1 - 0.1 * [0.01667, 0.0374] + # = [0.098333, 0.09626] + # b2 = b2 - eta * b2grad = 1.0 - 0.1 * 0.765 = 0.9235 + assert_almost_equal( + mlp.coefs_[0], + np.array([[0.098, 0.195756], [0.2956664, 0.096008], [0.4939998, -0.002244]]), + decimal=3, + ) + assert_almost_equal(mlp.coefs_[1], np.array([[0.04706], [0.154089]]), decimal=3) + assert_almost_equal(mlp.intercepts_[0], np.array([0.098333, 0.09626]), decimal=3) + assert_almost_equal(mlp.intercepts_[1], np.array(0.9235), decimal=3) + # Testing output + # h1 = g(X1 * W_i1 + b11) = g(0.6 * 0.098 + 0.8 * 0.2956664 + + # 0.7 * 0.4939998 + 0.098333) = 0.677 + # h2 = g(X2 * W_i2 + b12) = g(0.6 * 0.195756 + 0.8 * 0.096008 + + # 0.7 * -0.002244 + 0.09626) = 0.572 + # o1 = h * W2 + b21 = 0.677 * 0.04706 + + # 0.572 * 0.154089 + 0.9235 = 1.043 + # prob = sigmoid(o1) = 0.739 + 
assert_almost_equal(mlp.predict_proba(X)[0, 1], 0.739, decimal=3) + + +def test_gradient(): + # Test gradient. + + # This makes sure that the activation functions and their derivatives + # are correct. The numerical and analytical computation of the gradient + # should be close. + for n_labels in [2, 3]: + n_samples = 5 + n_features = 10 + random_state = np.random.RandomState(seed=42) + X = random_state.rand(n_samples, n_features) + y = 1 + np.mod(np.arange(n_samples) + 1, n_labels) + Y = LabelBinarizer().fit_transform(y) + + for activation in ACTIVATION_TYPES: + mlp = MLPClassifier( + activation=activation, + hidden_layer_sizes=10, + solver="lbfgs", + alpha=1e-5, + learning_rate_init=0.2, + max_iter=1, + random_state=1, + ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", ConvergenceWarning) + mlp.fit(X, y) + + theta = np.hstack([l.ravel() for l in mlp.coefs_ + mlp.intercepts_]) + + layer_units = [X.shape[1]] + [mlp.hidden_layer_sizes] + [mlp.n_outputs_] + + activations = [] + deltas = [] + coef_grads = [] + intercept_grads = [] + + activations.append(X) + for i in range(mlp.n_layers_ - 1): + activations.append(np.empty((X.shape[0], layer_units[i + 1]))) + deltas.append(np.empty((X.shape[0], layer_units[i + 1]))) + + fan_in = layer_units[i] + fan_out = layer_units[i + 1] + coef_grads.append(np.empty((fan_in, fan_out))) + intercept_grads.append(np.empty(fan_out)) + + # analytically compute the gradients + def loss_grad_fun(t): + return mlp._loss_grad_lbfgs( + t, X, Y, None, activations, deltas, coef_grads, intercept_grads + ) + + [value, grad] = loss_grad_fun(theta) + numgrad = np.zeros(np.size(theta)) + n = np.size(theta, 0) + E = np.eye(n) + epsilon = 1e-5 + # numerically compute the gradients + for i in range(n): + dtheta = E[:, i] * epsilon + numgrad[i] = ( + loss_grad_fun(theta + dtheta)[0] - loss_grad_fun(theta - dtheta)[0] + ) / (epsilon * 2.0) + assert_almost_equal(numgrad, grad) + + +@pytest.mark.parametrize("X,y", classification_datasets) +def test_lbfgs_classification(X, y): + # Test lbfgs on classification. + # It should achieve a score higher than 0.95 for the binary and multi-class + # versions of the digits dataset. + X_train = X[:150] + y_train = y[:150] + X_test = X[150:] + expected_shape_dtype = (X_test.shape[0], y_train.dtype.kind) + + for activation in ACTIVATION_TYPES: + mlp = MLPClassifier( + solver="lbfgs", + hidden_layer_sizes=50, + max_iter=150, + shuffle=True, + random_state=1, + activation=activation, + ) + mlp.fit(X_train, y_train) + y_predict = mlp.predict(X_test) + assert mlp.score(X_train, y_train) > 0.95 + assert (y_predict.shape[0], y_predict.dtype.kind) == expected_shape_dtype + + +@pytest.mark.parametrize("X,y", regression_datasets) +def test_lbfgs_regression(X, y): + # Test lbfgs on the regression dataset. + for activation in ACTIVATION_TYPES: + mlp = MLPRegressor( + solver="lbfgs", + hidden_layer_sizes=50, + max_iter=200, + tol=1e-3, + shuffle=True, + random_state=1, + activation=activation, + ) + mlp.fit(X, y) + if activation == "identity": + assert mlp.score(X, y) > 0.80 + else: + # Non linear models perform much better than linear bottleneck: + assert mlp.score(X, y) > 0.98 + + +@pytest.mark.parametrize("X,y", classification_datasets) +def test_lbfgs_classification_maxfun(X, y): + # Test lbfgs parameter max_fun. + # It should independently limit the number of iterations for lbfgs. 
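+ # max_fun is expected to be forwarded to SciPy's L-BFGS-B solver as its
+ # `maxfun` option; since every iteration needs at least one function
+ # evaluation, n_iter_ should never exceed it, which is what is asserted below.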
+ max_fun = 10 + # classification tests + for activation in ACTIVATION_TYPES: + mlp = MLPClassifier( + solver="lbfgs", + hidden_layer_sizes=50, + max_iter=150, + max_fun=max_fun, + shuffle=True, + random_state=1, + activation=activation, + ) + with pytest.warns(ConvergenceWarning): + mlp.fit(X, y) + assert max_fun >= mlp.n_iter_ + + +@pytest.mark.parametrize("X,y", regression_datasets) +def test_lbfgs_regression_maxfun(X, y): + # Test lbfgs parameter max_fun. + # It should independently limit the number of iterations for lbfgs. + max_fun = 10 + # regression tests + for activation in ACTIVATION_TYPES: + mlp = MLPRegressor( + solver="lbfgs", + hidden_layer_sizes=50, + tol=0.0, + max_iter=150, + max_fun=max_fun, + shuffle=True, + random_state=1, + activation=activation, + ) + with pytest.warns(ConvergenceWarning): + mlp.fit(X, y) + assert max_fun >= mlp.n_iter_ + + +def test_learning_rate_warmstart(): + # Tests that warm_start reuse past solutions. + X = [[3, 2], [1, 6], [5, 6], [-2, -4]] + y = [1, 1, 1, 0] + for learning_rate in ["invscaling", "constant"]: + mlp = MLPClassifier( + solver="sgd", + hidden_layer_sizes=4, + learning_rate=learning_rate, + max_iter=1, + power_t=0.25, + warm_start=True, + ) + with ignore_warnings(category=ConvergenceWarning): + mlp.fit(X, y) + prev_eta = mlp._optimizer.learning_rate + mlp.fit(X, y) + post_eta = mlp._optimizer.learning_rate + + if learning_rate == "constant": + assert prev_eta == post_eta + elif learning_rate == "invscaling": + assert mlp.learning_rate_init / pow(8 + 1, mlp.power_t) == post_eta + + +def test_multilabel_classification(): + # Test that multi-label classification works as expected. + # test fit method + X, y = make_multilabel_classification( + n_samples=50, random_state=0, return_indicator=True + ) + mlp = MLPClassifier( + solver="lbfgs", + hidden_layer_sizes=50, + alpha=1e-5, + max_iter=150, + random_state=0, + activation="logistic", + learning_rate_init=0.2, + ) + mlp.fit(X, y) + assert mlp.score(X, y) > 0.97 + + # test partial fit method + mlp = MLPClassifier( + solver="sgd", + hidden_layer_sizes=50, + max_iter=150, + random_state=0, + activation="logistic", + alpha=1e-5, + learning_rate_init=0.2, + ) + for i in range(100): + mlp.partial_fit(X, y, classes=[0, 1, 2, 3, 4]) + assert mlp.score(X, y) > 0.9 + + # Make sure early stopping still work now that splitting is stratified by + # default (it is disabled for multilabel classification) + mlp = MLPClassifier(early_stopping=True) + mlp.fit(X, y).predict(X) + + +def test_multioutput_regression(): + # Test that multi-output regression works as expected + X, y = make_regression(n_samples=200, n_targets=5, random_state=11) + mlp = MLPRegressor( + solver="lbfgs", hidden_layer_sizes=50, max_iter=200, tol=1e-2, random_state=1 + ) + mlp.fit(X, y) + assert mlp.score(X, y) > 0.9 + + +def test_partial_fit_classes_error(): + # Tests that passing different classes to partial_fit raises an error + X = [[3, 2]] + y = [0] + clf = MLPClassifier(solver="sgd") + clf.partial_fit(X, y, classes=[0, 1]) + with pytest.raises(ValueError): + clf.partial_fit(X, y, classes=[1, 2]) + + +def test_partial_fit_classification(): + # Test partial_fit on classification. + # `partial_fit` should yield the same results as 'fit' for binary and + # multi-class classification. 
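+ # Rationale: one partial_fit call performs a single pass (epoch) over the data,
+ # so 100 calls with the same random_state should replay the 100 epochs
+ # performed by fit(max_iter=100, tol=0), yielding identical predictions.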
+ for X, y in classification_datasets: + mlp = MLPClassifier( + solver="sgd", + max_iter=100, + random_state=1, + tol=0, + alpha=1e-5, + learning_rate_init=0.2, + ) + + with ignore_warnings(category=ConvergenceWarning): + mlp.fit(X, y) + pred1 = mlp.predict(X) + mlp = MLPClassifier( + solver="sgd", random_state=1, alpha=1e-5, learning_rate_init=0.2 + ) + for i in range(100): + mlp.partial_fit(X, y, classes=np.unique(y)) + pred2 = mlp.predict(X) + assert_array_equal(pred1, pred2) + assert mlp.score(X, y) > 0.95 + + +def test_partial_fit_unseen_classes(): + # Non regression test for bug 6994 + # Tests for labeling errors in partial fit + + clf = MLPClassifier(random_state=0) + clf.partial_fit([[1], [2], [3]], ["a", "b", "c"], classes=["a", "b", "c", "d"]) + clf.partial_fit([[4]], ["d"]) + assert clf.score([[1], [2], [3], [4]], ["a", "b", "c", "d"]) > 0 + + +def test_partial_fit_regression(): + # Test partial_fit on regression. + # `partial_fit` should yield the same results as 'fit' for regression. + X = X_reg + y = y_reg + + for momentum in [0, 0.9]: + mlp = MLPRegressor( + solver="sgd", + max_iter=100, + activation="relu", + random_state=1, + learning_rate_init=0.01, + batch_size=X.shape[0], + momentum=momentum, + ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", ConvergenceWarning) + mlp.fit(X, y) + pred1 = mlp.predict(X) + mlp = MLPRegressor( + solver="sgd", + activation="relu", + learning_rate_init=0.01, + random_state=1, + batch_size=X.shape[0], + momentum=momentum, + ) + for i in range(100): + mlp.partial_fit(X, y) + + pred2 = mlp.predict(X) + assert_allclose(pred1, pred2) + score = mlp.score(X, y) + assert score > 0.65 + + +def test_partial_fit_errors(): + # Test partial_fit error handling. + X = [[3, 2], [1, 6]] + y = [1, 0] + + # no classes passed + with pytest.raises(ValueError): + MLPClassifier(solver="sgd").partial_fit(X, y, classes=[2]) + + # lbfgs doesn't support partial_fit + assert not hasattr(MLPClassifier(solver="lbfgs"), "partial_fit") + + +def test_nonfinite_params(): + # Check that MLPRegressor throws ValueError when dealing with non-finite + # parameter values + rng = np.random.RandomState(0) + n_samples = 10 + fmax = np.finfo(np.float64).max + X = fmax * rng.uniform(size=(n_samples, 2)) + y = rng.standard_normal(size=n_samples) + + clf = MLPRegressor() + msg = ( + "Solver produced non-finite parameter weights. The input data may contain large" + " values and need to be preprocessed." + ) + with pytest.raises(ValueError, match=msg): + with warnings.catch_warnings(): + # RuntimeWarning: overflow encountered in square + warnings.simplefilter("ignore") + clf.fit(X, y) + + +def test_predict_proba_binary(): + # Test that predict_proba works as expected for binary class. + X = X_digits_binary[:50] + y = y_digits_binary[:50] + + clf = MLPClassifier(hidden_layer_sizes=5, activation="logistic", random_state=1) + with ignore_warnings(category=ConvergenceWarning): + clf.fit(X, y) + y_proba = clf.predict_proba(X) + y_log_proba = clf.predict_log_proba(X) + + (n_samples, n_classes) = y.shape[0], 2 + + proba_max = y_proba.argmax(axis=1) + proba_log_max = y_log_proba.argmax(axis=1) + + assert y_proba.shape == (n_samples, n_classes) + assert_array_equal(proba_max, proba_log_max) + assert_allclose(y_log_proba, np.log(y_proba)) + + assert roc_auc_score(y, y_proba[:, 1]) == 1.0 + + +def test_predict_proba_multiclass(): + # Test that predict_proba works as expected for multi class. 
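+ # With more than two classes the output layer uses a softmax, so each row of
+ # predict_proba is expected to sum to 1 and predict_log_proba to be its log.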
+ X = X_digits_multi[:10] + y = y_digits_multi[:10] + + clf = MLPClassifier(hidden_layer_sizes=5) + with ignore_warnings(category=ConvergenceWarning): + clf.fit(X, y) + y_proba = clf.predict_proba(X) + y_log_proba = clf.predict_log_proba(X) + + (n_samples, n_classes) = y.shape[0], np.unique(y).size + + proba_max = y_proba.argmax(axis=1) + proba_log_max = y_log_proba.argmax(axis=1) + + assert y_proba.shape == (n_samples, n_classes) + assert_array_equal(proba_max, proba_log_max) + assert_allclose(y_log_proba, np.log(y_proba)) + + +def test_predict_proba_multilabel(): + # Test that predict_proba works as expected for multilabel. + # Multilabel should not use softmax which makes probabilities sum to 1 + X, Y = make_multilabel_classification( + n_samples=50, random_state=0, return_indicator=True + ) + n_samples, n_classes = Y.shape + + clf = MLPClassifier(solver="lbfgs", hidden_layer_sizes=30, random_state=0) + clf.fit(X, Y) + y_proba = clf.predict_proba(X) + + assert y_proba.shape == (n_samples, n_classes) + assert_array_equal(y_proba > 0.5, Y) + + y_log_proba = clf.predict_log_proba(X) + proba_max = y_proba.argmax(axis=1) + proba_log_max = y_log_proba.argmax(axis=1) + + assert (y_proba.sum(1) - 1).dot(y_proba.sum(1) - 1) > 1e-10 + assert_array_equal(proba_max, proba_log_max) + assert_allclose(y_log_proba, np.log(y_proba)) + + +def test_shuffle(): + # Test that the shuffle parameter affects the training process (it should) + X, y = make_regression(n_samples=50, n_features=5, n_targets=1, random_state=0) + + # The coefficients will be identical if both do or do not shuffle + for shuffle in [True, False]: + mlp1 = MLPRegressor( + hidden_layer_sizes=1, + max_iter=1, + batch_size=1, + random_state=0, + shuffle=shuffle, + ) + mlp2 = MLPRegressor( + hidden_layer_sizes=1, + max_iter=1, + batch_size=1, + random_state=0, + shuffle=shuffle, + ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", ConvergenceWarning) + mlp1.fit(X, y) + mlp2.fit(X, y) + + assert np.array_equal(mlp1.coefs_[0], mlp2.coefs_[0]) + + # The coefficients will be slightly different if shuffle=True + mlp1 = MLPRegressor( + hidden_layer_sizes=1, max_iter=1, batch_size=1, random_state=0, shuffle=True + ) + mlp2 = MLPRegressor( + hidden_layer_sizes=1, max_iter=1, batch_size=1, random_state=0, shuffle=False + ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", ConvergenceWarning) + mlp1.fit(X, y) + mlp2.fit(X, y) + + assert not np.array_equal(mlp1.coefs_[0], mlp2.coefs_[0]) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sparse_matrices(csr_container): + # Test that sparse and dense input matrices output the same results. + X = X_digits_binary[:50] + y = y_digits_binary[:50] + X_sparse = csr_container(X) + mlp = MLPClassifier(solver="lbfgs", hidden_layer_sizes=15, random_state=1) + mlp.fit(X, y) + pred1 = mlp.predict(X) + mlp.fit(X_sparse, y) + pred2 = mlp.predict(X_sparse) + assert_almost_equal(pred1, pred2) + pred1 = mlp.predict(X) + pred2 = mlp.predict(X_sparse) + assert_array_equal(pred1, pred2) + + +def test_tolerance(): + # Test tolerance. + # It should force the solver to exit the loop when it converges. + X = [[3, 2], [1, 6]] + y = [1, 0] + clf = MLPClassifier(tol=0.5, max_iter=3000, solver="sgd") + clf.fit(X, y) + assert clf.max_iter > clf.n_iter_ + + +def test_verbose_sgd(): + # Test verbose. 
+ X = [[3, 2], [1, 6]] + y = [1, 0] + clf = MLPClassifier(solver="sgd", max_iter=2, verbose=10, hidden_layer_sizes=2) + old_stdout = sys.stdout + sys.stdout = output = StringIO() + + with ignore_warnings(category=ConvergenceWarning): + clf.fit(X, y) + clf.partial_fit(X, y) + + sys.stdout = old_stdout + assert "Iteration" in output.getvalue() + + +@pytest.mark.parametrize("MLPEstimator", [MLPClassifier, MLPRegressor]) +def test_early_stopping(MLPEstimator): + X = X_digits_binary[:100] + y = y_digits_binary[:100] + tol = 0.2 + mlp_estimator = MLPEstimator( + tol=tol, max_iter=3000, solver="sgd", early_stopping=True + ) + mlp_estimator.fit(X, y) + assert mlp_estimator.max_iter > mlp_estimator.n_iter_ + + assert mlp_estimator.best_loss_ is None + assert isinstance(mlp_estimator.validation_scores_, list) + + valid_scores = mlp_estimator.validation_scores_ + best_valid_score = mlp_estimator.best_validation_score_ + assert max(valid_scores) == best_valid_score + assert best_valid_score + tol > valid_scores[-2] + assert best_valid_score + tol > valid_scores[-1] + + # check that the attributes `validation_scores_` and `best_validation_score_` + # are set to None when `early_stopping=False` + mlp_estimator = MLPEstimator( + tol=tol, max_iter=3000, solver="sgd", early_stopping=False + ) + mlp_estimator.fit(X, y) + assert mlp_estimator.validation_scores_ is None + assert mlp_estimator.best_validation_score_ is None + assert mlp_estimator.best_loss_ is not None + + +def test_adaptive_learning_rate(): + X = [[3, 2], [1, 6]] + y = [1, 0] + clf = MLPClassifier(tol=0.5, max_iter=3000, solver="sgd", learning_rate="adaptive") + clf.fit(X, y) + assert clf.max_iter > clf.n_iter_ + assert 1e-6 > clf._optimizer.learning_rate + + +def test_warm_start(): + X = X_iris + y = y_iris + + y_2classes = np.array([0] * 75 + [1] * 75) + y_3classes = np.array([0] * 40 + [1] * 40 + [2] * 70) + y_3classes_alt = np.array([0] * 50 + [1] * 50 + [3] * 50) + y_4classes = np.array([0] * 37 + [1] * 37 + [2] * 38 + [3] * 38) + y_5classes = np.array([0] * 30 + [1] * 30 + [2] * 30 + [3] * 30 + [4] * 30) + + # No error raised + clf = MLPClassifier( + hidden_layer_sizes=2, solver="lbfgs", warm_start=True, random_state=42, tol=1e-2 + ).fit(X, y) + clf.fit(X, y) + clf.fit(X, y_3classes) + + for y_i in (y_2classes, y_3classes_alt, y_4classes, y_5classes): + clf = MLPClassifier( + hidden_layer_sizes=2, + solver="lbfgs", + warm_start=True, + random_state=42, + tol=1e-2, + ).fit(X, y) + message = ( + "warm_start can only be used where `y` has the same " + "classes as in the previous call to fit." + " Previously got [0 1 2], `y` has %s" % np.unique(y_i) + ) + with pytest.raises(ValueError, match=re.escape(message)): + clf.fit(X, y_i) + + +@pytest.mark.parametrize("MLPEstimator", [MLPClassifier, MLPRegressor]) +def test_warm_start_full_iteration(MLPEstimator): + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/16812 + # Check that the MLP estimator accomplish `max_iter` with a + # warm started estimator. 
+ X, y = X_iris, y_iris + max_iter = 3 + clf = MLPEstimator( + hidden_layer_sizes=2, solver="sgd", warm_start=True, max_iter=max_iter + ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", ConvergenceWarning) + clf.fit(X, y) + assert max_iter == clf.n_iter_ + clf.fit(X, y) + assert max_iter == clf.n_iter_ + + +def test_n_iter_no_change(): + # test n_iter_no_change using binary data set + # the classifying fitting process is not prone to loss curve fluctuations + X = X_digits_binary[:100] + y = y_digits_binary[:100] + tol = 0.01 + max_iter = 3000 + + # test multiple n_iter_no_change + for n_iter_no_change in [2, 5, 10, 50, 100]: + clf = MLPClassifier( + tol=tol, max_iter=max_iter, solver="sgd", n_iter_no_change=n_iter_no_change + ) + clf.fit(X, y) + + # validate n_iter_no_change + assert clf._no_improvement_count == n_iter_no_change + 1 + assert max_iter > clf.n_iter_ + + +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +def test_n_iter_no_change_inf(): + # test n_iter_no_change using binary data set + # the fitting process should go to max_iter iterations + X = X_digits_binary[:100] + y = y_digits_binary[:100] + + # set a ridiculous tolerance + # this should always trigger _update_no_improvement_count() + tol = 1e9 + + # fit + n_iter_no_change = np.inf + max_iter = 3000 + clf = MLPClassifier( + tol=tol, max_iter=max_iter, solver="sgd", n_iter_no_change=n_iter_no_change + ) + clf.fit(X, y) + + # validate n_iter_no_change doesn't cause early stopping + assert clf.n_iter_ == max_iter + + # validate _update_no_improvement_count() was always triggered + assert clf._no_improvement_count == clf.n_iter_ - 1 + + +def test_early_stopping_stratified(): + # Make sure data splitting for early stopping is stratified + X = [[1, 2], [2, 3], [3, 4], [4, 5]] + y = [0, 0, 0, 1] + + mlp = MLPClassifier(early_stopping=True) + with pytest.raises( + ValueError, match="The least populated class in y has only 1 member" + ): + mlp.fit(X, y) + + +def test_mlp_classifier_dtypes_casting(): + # Compare predictions for different dtypes + mlp_64 = MLPClassifier( + alpha=1e-5, hidden_layer_sizes=(5, 3), random_state=1, max_iter=100, tol=1e-1 + ) + mlp_64.fit(X_digits[:300], y_digits[:300]) + pred_64 = mlp_64.predict(X_digits[300:]) + proba_64 = mlp_64.predict_proba(X_digits[300:]) + + mlp_32 = MLPClassifier( + alpha=1e-5, hidden_layer_sizes=(5, 3), random_state=1, max_iter=100, tol=1e-1 + ) + mlp_32.fit(X_digits[:300].astype(np.float32), y_digits[:300]) + pred_32 = mlp_32.predict(X_digits[300:].astype(np.float32)) + proba_32 = mlp_32.predict_proba(X_digits[300:].astype(np.float32)) + + assert_array_equal(pred_64, pred_32) + assert_allclose(proba_64, proba_32, rtol=1e-02) + + +def test_mlp_regressor_dtypes_casting(): + mlp_64 = MLPRegressor( + alpha=1e-5, hidden_layer_sizes=(5, 3), random_state=1, max_iter=150, tol=1e-3 + ) + mlp_64.fit(X_digits[:300], y_digits[:300]) + pred_64 = mlp_64.predict(X_digits[300:]) + + mlp_32 = MLPRegressor( + alpha=1e-5, hidden_layer_sizes=(5, 3), random_state=1, max_iter=150, tol=1e-3 + ) + mlp_32.fit(X_digits[:300].astype(np.float32), y_digits[:300]) + pred_32 = mlp_32.predict(X_digits[300:].astype(np.float32)) + + assert_allclose(pred_64, pred_32, rtol=5e-04) + + +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("Estimator", [MLPClassifier, MLPRegressor]) +def test_mlp_param_dtypes(dtype, Estimator): + # Checks if input dtype is used for network parameters + # and predictions + X, y = X_digits.astype(dtype), 
y_digits + mlp = Estimator( + alpha=1e-5, hidden_layer_sizes=(5, 3), random_state=1, max_iter=50, tol=1e-1 + ) + mlp.fit(X[:300], y[:300]) + pred = mlp.predict(X[300:]) + + assert all([intercept.dtype == dtype for intercept in mlp.intercepts_]) + + assert all([coef.dtype == dtype for coef in mlp.coefs_]) + + if Estimator == MLPRegressor: + assert pred.dtype == dtype + + +def test_mlp_loading_from_joblib_partial_fit(tmp_path): + """Loading from MLP and partial fitting updates weights. Non-regression + test for #19626.""" + pre_trained_estimator = MLPRegressor( + hidden_layer_sizes=(42,), random_state=42, learning_rate_init=0.01, max_iter=200 + ) + features, target = [[2]], [4] + + # Fit on x=2, y=4 + pre_trained_estimator.fit(features, target) + + # dump and load model + pickled_file = tmp_path / "mlp.pkl" + joblib.dump(pre_trained_estimator, pickled_file) + load_estimator = joblib.load(pickled_file) + + # Train for a more epochs on point x=2, y=1 + fine_tune_features, fine_tune_target = [[2]], [1] + + for _ in range(200): + load_estimator.partial_fit(fine_tune_features, fine_tune_target) + + # finetuned model learned the new target + predicted_value = load_estimator.predict(fine_tune_features) + assert_allclose(predicted_value, fine_tune_target, rtol=1e-4) + + +@pytest.mark.parametrize("Estimator", [MLPClassifier, MLPRegressor]) +def test_preserve_feature_names(Estimator): + """Check that feature names are preserved when early stopping is enabled. + + Feature names are required for consistency checks during scoring. + + Non-regression test for gh-24846 + """ + pd = pytest.importorskip("pandas") + rng = np.random.RandomState(0) + + X = pd.DataFrame(data=rng.randn(10, 2), columns=["colname_a", "colname_b"]) + y = pd.Series(data=np.full(10, 1), name="colname_y") + + model = Estimator(early_stopping=True, validation_fraction=0.2) + + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + model.fit(X, y) + + +@pytest.mark.parametrize("MLPEstimator", [MLPClassifier, MLPRegressor]) +def test_mlp_warm_start_with_early_stopping(MLPEstimator): + """Check that early stopping works with warm start.""" + mlp = MLPEstimator( + max_iter=10, random_state=0, warm_start=True, early_stopping=True + ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", ConvergenceWarning) + mlp.fit(X_iris, y_iris) + n_validation_scores = len(mlp.validation_scores_) + mlp.set_params(max_iter=20) + mlp.fit(X_iris, y_iris) + assert len(mlp.validation_scores_) > n_validation_scores + + +@pytest.mark.parametrize("MLPEstimator", [MLPClassifier, MLPRegressor]) +@pytest.mark.parametrize("solver", ["sgd", "adam", "lbfgs"]) +def test_mlp_warm_start_no_convergence(MLPEstimator, solver): + """Check that we stop the number of iteration at `max_iter` when warm starting. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/24764 + """ + model = MLPEstimator( + solver=solver, + warm_start=True, + early_stopping=False, + max_iter=10, + n_iter_no_change=np.inf, + random_state=0, + ) + + with pytest.warns(ConvergenceWarning): + model.fit(X_iris, y_iris) + assert model.n_iter_ == 10 + + model.set_params(max_iter=20) + with pytest.warns(ConvergenceWarning): + model.fit(X_iris, y_iris) + assert model.n_iter_ == 20 + + +@pytest.mark.parametrize("MLPEstimator", [MLPClassifier, MLPRegressor]) +def test_mlp_partial_fit_after_fit(MLPEstimator): + """Check partial fit does not fail after fit when early_stopping=True. + + Non-regression test for gh-25693. 
+ """ + mlp = MLPEstimator(early_stopping=True, random_state=0).fit(X_iris, y_iris) + + msg = "partial_fit does not support early_stopping=True" + with pytest.raises(ValueError, match=msg): + mlp.partial_fit(X_iris, y_iris) + + +def test_mlp_diverging_loss(): + """Test that a diverging model does not raise errors when early stopping is enabled. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/29504 + """ + mlp = MLPRegressor( + hidden_layer_sizes=100, + activation="identity", + solver="sgd", + alpha=0.0001, + learning_rate="constant", + learning_rate_init=1, + shuffle=True, + max_iter=20, + early_stopping=True, + n_iter_no_change=10, + random_state=0, + ) + + with warnings.catch_warnings(): + # RuntimeWarning: overflow encountered in matmul + # ConvergenceWarning: Stochastic Optimizer: Maximum iteration + warnings.simplefilter("ignore", RuntimeWarning) + warnings.simplefilter("ignore", ConvergenceWarning) + mlp.fit(X_iris, y_iris) + + # In python, float("nan") != float("nan") + assert str(mlp.validation_scores_[-1]) == str(np.nan) + assert isinstance(mlp.validation_scores_[-1], float) + + +def test_mlp_sample_weight_with_early_stopping(): + # Test code path for inner validation set splitting. + X, y = make_regression( + n_samples=100, + n_features=2, + n_informative=2, + random_state=42, + ) + sw = np.ones_like(y) + params = dict( + hidden_layer_sizes=10, + solver="adam", + early_stopping=True, + tol=1e-2, + learning_rate_init=0.01, + batch_size=10, + random_state=42, + ) + m1 = MLPRegressor( + **params, + ) + m1.fit(X, y, sample_weight=sw) + + m2 = MLPRegressor(**params).fit(X, y, sample_weight=None) + assert_allclose(m1.predict(X), m2.predict(X)) + + +def test_mlp_vs_poisson_glm_equivalent(global_random_seed): + """Test MLP with Poisson loss and no hidden layer equals GLM.""" + n = 100 + rng = np.random.default_rng(global_random_seed) + X = np.linspace(0, 1, n) + y = rng.poisson(np.exp(X + 1)) + X = X.reshape(n, -1) + glm = PoissonRegressor(alpha=0, tol=1e-7).fit(X, y) + # Unfortunately, we can't set a zero hidden_layer_size, so we use a trick by using + # just one hidden layer node with an identity activation. Coefficients will + # therefore be different, but predictions are the same. + mlp = MLPRegressor( + loss="poisson", + hidden_layer_sizes=(1,), + activation="identity", + alpha=0, + solver="lbfgs", + tol=1e-7, + random_state=np.random.RandomState(global_random_seed + 1), + ).fit(X, y) + + assert_allclose(mlp.predict(X), glm.predict(X), rtol=1e-4) + + # The same does not work with the squared error because the output activation is + # the identity instead of the exponential. 
+ mlp = MLPRegressor( + loss="squared_error", + hidden_layer_sizes=(1,), + activation="identity", + alpha=0, + solver="lbfgs", + tol=1e-7, + random_state=np.random.RandomState(global_random_seed + 1), + ).fit(X, y) + assert not np.allclose(mlp.predict(X), glm.predict(X), rtol=1e-4) + + +def test_minimum_input_sample_size(): + """Check error message when the validation set is too small.""" + X, y = make_regression(n_samples=2, n_features=5, random_state=0) + model = MLPRegressor(early_stopping=True, random_state=0) + with pytest.raises(ValueError, match="The validation set is too small"): + model.fit(X, y) diff --git a/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/test_rbm.py b/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/test_rbm.py new file mode 100644 index 0000000000000000000000000000000000000000..8211c9735923d650234d4268cb30336ddc3ebbb1 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/test_rbm.py @@ -0,0 +1,251 @@ +import re +import sys +from io import StringIO + +import numpy as np +import pytest + +from sklearn.datasets import load_digits +from sklearn.neural_network import BernoulliRBM +from sklearn.utils._testing import ( + assert_allclose, + assert_almost_equal, + assert_array_equal, +) +from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS, LIL_CONTAINERS +from sklearn.utils.validation import assert_all_finite + +Xdigits, _ = load_digits(return_X_y=True) +Xdigits -= Xdigits.min() +Xdigits /= Xdigits.max() + + +def test_fit(): + X = Xdigits.copy() + + rbm = BernoulliRBM( + n_components=64, learning_rate=0.1, batch_size=10, n_iter=7, random_state=9 + ) + rbm.fit(X) + + assert_almost_equal(rbm.score_samples(X).mean(), -21.0, decimal=0) + + # in-place tricks shouldn't have modified X + assert_array_equal(X, Xdigits) + + +def test_partial_fit(): + X = Xdigits.copy() + rbm = BernoulliRBM( + n_components=64, learning_rate=0.1, batch_size=20, random_state=9 + ) + n_samples = X.shape[0] + n_batches = int(np.ceil(float(n_samples) / rbm.batch_size)) + batch_slices = np.array_split(X, n_batches) + + for i in range(7): + for batch in batch_slices: + rbm.partial_fit(batch) + + assert_almost_equal(rbm.score_samples(X).mean(), -21.0, decimal=0) + assert_array_equal(X, Xdigits) + + +def test_transform(): + X = Xdigits[:100] + rbm1 = BernoulliRBM(n_components=16, batch_size=5, n_iter=5, random_state=42) + rbm1.fit(X) + + Xt1 = rbm1.transform(X) + Xt2 = rbm1._mean_hiddens(X) + + assert_array_equal(Xt1, Xt2) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_small_sparse(csr_container): + # BernoulliRBM should work on small sparse matrices. 
+ X = csr_container(Xdigits[:4]) + BernoulliRBM().fit(X) # no exception + + +@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS) +def test_small_sparse_partial_fit(sparse_container): + X_sparse = sparse_container(Xdigits[:100]) + X = Xdigits[:100].copy() + + rbm1 = BernoulliRBM( + n_components=64, learning_rate=0.1, batch_size=10, random_state=9 + ) + rbm2 = BernoulliRBM( + n_components=64, learning_rate=0.1, batch_size=10, random_state=9 + ) + + rbm1.partial_fit(X_sparse) + rbm2.partial_fit(X) + + assert_almost_equal( + rbm1.score_samples(X).mean(), rbm2.score_samples(X).mean(), decimal=0 + ) + + +def test_sample_hiddens(): + rng = np.random.RandomState(0) + X = Xdigits[:100] + rbm1 = BernoulliRBM(n_components=2, batch_size=5, n_iter=5, random_state=42) + rbm1.fit(X) + + h = rbm1._mean_hiddens(X[0]) + hs = np.mean([rbm1._sample_hiddens(X[0], rng) for i in range(100)], 0) + + assert_almost_equal(h, hs, decimal=1) + + +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_fit_gibbs(csc_container): + # XXX: this test is very seed-dependent! It probably needs to be rewritten. + + # Gibbs on the RBM hidden layer should be able to recreate [[0], [1]] + # from the same input + rng = np.random.RandomState(42) + X = np.array([[0.0], [1.0]]) + rbm1 = BernoulliRBM(n_components=2, batch_size=2, n_iter=42, random_state=rng) + # you need that much iters + rbm1.fit(X) + assert_almost_equal( + rbm1.components_, np.array([[0.02649814], [0.02009084]]), decimal=4 + ) + assert_almost_equal(rbm1.gibbs(X), X) + + # Gibbs on the RBM hidden layer should be able to recreate [[0], [1]] from + # the same input even when the input is sparse, and test against non-sparse + rng = np.random.RandomState(42) + X = csc_container([[0.0], [1.0]]) + rbm2 = BernoulliRBM(n_components=2, batch_size=2, n_iter=42, random_state=rng) + rbm2.fit(X) + assert_almost_equal( + rbm2.components_, np.array([[0.02649814], [0.02009084]]), decimal=4 + ) + assert_almost_equal(rbm2.gibbs(X), X.toarray()) + assert_almost_equal(rbm1.components_, rbm2.components_) + + +def test_gibbs_smoke(): + # Check if we don't get NaNs sampling the full digits dataset. + # Also check that sampling again will yield different results. + X = Xdigits + rbm1 = BernoulliRBM(n_components=42, batch_size=40, n_iter=20, random_state=42) + rbm1.fit(X) + X_sampled = rbm1.gibbs(X) + assert_all_finite(X_sampled) + X_sampled2 = rbm1.gibbs(X) + assert np.all((X_sampled != X_sampled2).max(axis=1)) + + +@pytest.mark.parametrize("lil_containers", LIL_CONTAINERS) +def test_score_samples(lil_containers): + # Test score_samples (pseudo-likelihood) method. + # Assert that pseudo-likelihood is computed without clipping. + # See Fabian's blog, http://bit.ly/1iYefRk + rng = np.random.RandomState(42) + X = np.vstack([np.zeros(1000), np.ones(1000)]) + rbm1 = BernoulliRBM(n_components=10, batch_size=2, n_iter=10, random_state=rng) + rbm1.fit(X) + assert (rbm1.score_samples(X) < -300).all() + + # Sparse vs. dense should not affect the output. Also test sparse input + # validation. + rbm1.random_state = 42 + d_score = rbm1.score_samples(X) + rbm1.random_state = 42 + s_score = rbm1.score_samples(lil_containers(X)) + assert_almost_equal(d_score, s_score) + + # Test numerical stability (#2785): would previously generate infinities + # and crash with an exception. 
+ with np.errstate(under="ignore"): + rbm1.score_samples([np.arange(1000) * 100]) + + +def test_rbm_verbose(): + rbm = BernoulliRBM(n_iter=2, verbose=10) + old_stdout = sys.stdout + sys.stdout = StringIO() + try: + rbm.fit(Xdigits) + finally: + sys.stdout = old_stdout + + +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_sparse_and_verbose(csc_container): + # Make sure RBM works with sparse input when verbose=True + old_stdout = sys.stdout + sys.stdout = StringIO() + + X = csc_container([[0.0], [1.0]]) + rbm = BernoulliRBM( + n_components=2, batch_size=2, n_iter=1, random_state=42, verbose=True + ) + try: + rbm.fit(X) + s = sys.stdout.getvalue() + # make sure output is sound + assert re.match( + r"\[BernoulliRBM\] Iteration 1," + r" pseudo-likelihood = -?(\d)+(\.\d+)?," + r" time = (\d|\.)+s", + s, + ) + finally: + sys.stdout = old_stdout + + +@pytest.mark.parametrize( + "dtype_in, dtype_out", + [(np.float32, np.float32), (np.float64, np.float64), (int, np.float64)], +) +def test_transformer_dtypes_casting(dtype_in, dtype_out): + X = Xdigits[:100].astype(dtype_in) + rbm = BernoulliRBM(n_components=16, batch_size=5, n_iter=5, random_state=42) + Xt = rbm.fit_transform(X) + + # dtype_in and dtype_out should be consistent + assert Xt.dtype == dtype_out, "transform dtype: {} - original dtype: {}".format( + Xt.dtype, X.dtype + ) + + +def test_convergence_dtype_consistency(): + # float 64 transformer + X_64 = Xdigits[:100].astype(np.float64) + rbm_64 = BernoulliRBM(n_components=16, batch_size=5, n_iter=5, random_state=42) + Xt_64 = rbm_64.fit_transform(X_64) + + # float 32 transformer + X_32 = Xdigits[:100].astype(np.float32) + rbm_32 = BernoulliRBM(n_components=16, batch_size=5, n_iter=5, random_state=42) + Xt_32 = rbm_32.fit_transform(X_32) + + # results and attributes should be close enough in 32 bit and 64 bit + assert_allclose(Xt_64, Xt_32, rtol=1e-06, atol=0) + assert_allclose( + rbm_64.intercept_hidden_, rbm_32.intercept_hidden_, rtol=1e-06, atol=0 + ) + assert_allclose( + rbm_64.intercept_visible_, rbm_32.intercept_visible_, rtol=1e-05, atol=0 + ) + assert_allclose(rbm_64.components_, rbm_32.components_, rtol=1e-03, atol=0) + assert_allclose(rbm_64.h_samples_, rbm_32.h_samples_) + + +@pytest.mark.parametrize("method", ["fit", "partial_fit"]) +def test_feature_names_out(method): + """Check `get_feature_names_out` for `BernoulliRBM`.""" + n_components = 10 + rbm = BernoulliRBM(n_components=n_components) + getattr(rbm, method)(Xdigits) + + names = rbm.get_feature_names_out() + expected_names = [f"bernoullirbm{i}" for i in range(n_components)] + assert_array_equal(expected_names, names) diff --git a/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/test_stochastic_optimizers.py b/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/test_stochastic_optimizers.py new file mode 100644 index 0000000000000000000000000000000000000000..58a9f0c7dda13fd288c1c86f6a52fede485787ad --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/test_stochastic_optimizers.py @@ -0,0 +1,112 @@ +import numpy as np + +from sklearn.neural_network._stochastic_optimizers import ( + AdamOptimizer, + BaseOptimizer, + SGDOptimizer, +) +from sklearn.utils._testing import assert_array_equal + +shapes = [(4, 6), (6, 8), (7, 8, 9)] + + +def test_base_optimizer(): + for lr in [10**i for i in range(-3, 4)]: + optimizer = BaseOptimizer(lr) + assert optimizer.trigger_stopping("", False) + + +def test_sgd_optimizer_no_momentum(): + params = [np.zeros(shape) for 
shape in shapes] + rng = np.random.RandomState(0) + + for lr in [10**i for i in range(-3, 4)]: + optimizer = SGDOptimizer(params, lr, momentum=0, nesterov=False) + grads = [rng.random_sample(shape) for shape in shapes] + expected = [param - lr * grad for param, grad in zip(params, grads)] + optimizer.update_params(params, grads) + + for exp, param in zip(expected, params): + assert_array_equal(exp, param) + + +def test_sgd_optimizer_momentum(): + params = [np.zeros(shape) for shape in shapes] + lr = 0.1 + rng = np.random.RandomState(0) + + for momentum in np.arange(0.5, 0.9, 0.1): + optimizer = SGDOptimizer(params, lr, momentum=momentum, nesterov=False) + velocities = [rng.random_sample(shape) for shape in shapes] + optimizer.velocities = velocities + grads = [rng.random_sample(shape) for shape in shapes] + updates = [ + momentum * velocity - lr * grad for velocity, grad in zip(velocities, grads) + ] + expected = [param + update for param, update in zip(params, updates)] + optimizer.update_params(params, grads) + + for exp, param in zip(expected, params): + assert_array_equal(exp, param) + + +def test_sgd_optimizer_trigger_stopping(): + params = [np.zeros(shape) for shape in shapes] + lr = 2e-6 + optimizer = SGDOptimizer(params, lr, lr_schedule="adaptive") + assert not optimizer.trigger_stopping("", False) + assert lr / 5 == optimizer.learning_rate + assert optimizer.trigger_stopping("", False) + + +def test_sgd_optimizer_nesterovs_momentum(): + params = [np.zeros(shape) for shape in shapes] + lr = 0.1 + rng = np.random.RandomState(0) + + for momentum in np.arange(0.5, 0.9, 0.1): + optimizer = SGDOptimizer(params, lr, momentum=momentum, nesterov=True) + velocities = [rng.random_sample(shape) for shape in shapes] + optimizer.velocities = velocities + grads = [rng.random_sample(shape) for shape in shapes] + updates = [ + momentum * velocity - lr * grad for velocity, grad in zip(velocities, grads) + ] + updates = [ + momentum * update - lr * grad for update, grad in zip(updates, grads) + ] + expected = [param + update for param, update in zip(params, updates)] + optimizer.update_params(params, grads) + + for exp, param in zip(expected, params): + assert_array_equal(exp, param) + + +def test_adam_optimizer(): + params = [np.zeros(shape) for shape in shapes] + lr = 0.001 + epsilon = 1e-8 + rng = np.random.RandomState(0) + + for beta_1 in np.arange(0.9, 1.0, 0.05): + for beta_2 in np.arange(0.995, 1.0, 0.001): + optimizer = AdamOptimizer(params, lr, beta_1, beta_2, epsilon) + ms = [rng.random_sample(shape) for shape in shapes] + vs = [rng.random_sample(shape) for shape in shapes] + t = 10 + optimizer.ms = ms + optimizer.vs = vs + optimizer.t = t - 1 + grads = [rng.random_sample(shape) for shape in shapes] + + ms = [beta_1 * m + (1 - beta_1) * grad for m, grad in zip(ms, grads)] + vs = [beta_2 * v + (1 - beta_2) * (grad**2) for v, grad in zip(vs, grads)] + learning_rate = lr * np.sqrt(1 - beta_2**t) / (1 - beta_1**t) + updates = [ + -learning_rate * m / (np.sqrt(v) + epsilon) for m, v in zip(ms, vs) + ] + expected = [param + update for param, update in zip(params, updates)] + + optimizer.update_params(params, grads) + for exp, param in zip(expected, params): + assert_array_equal(exp, param) diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..48bb3aa6a7a4e811f02e13924658858984a21681 --- /dev/null +++ 
b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__init__.py @@ -0,0 +1,63 @@ +"""Methods for scaling, centering, normalization, binarization, and more.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ._data import ( + Binarizer, + KernelCenterer, + MaxAbsScaler, + MinMaxScaler, + Normalizer, + PowerTransformer, + QuantileTransformer, + RobustScaler, + StandardScaler, + add_dummy_feature, + binarize, + maxabs_scale, + minmax_scale, + normalize, + power_transform, + quantile_transform, + robust_scale, + scale, +) +from ._discretization import KBinsDiscretizer +from ._encoders import OneHotEncoder, OrdinalEncoder +from ._function_transformer import FunctionTransformer +from ._label import LabelBinarizer, LabelEncoder, MultiLabelBinarizer, label_binarize +from ._polynomial import PolynomialFeatures, SplineTransformer +from ._target_encoder import TargetEncoder + +__all__ = [ + "Binarizer", + "FunctionTransformer", + "KBinsDiscretizer", + "KernelCenterer", + "LabelBinarizer", + "LabelEncoder", + "MaxAbsScaler", + "MinMaxScaler", + "MultiLabelBinarizer", + "Normalizer", + "OneHotEncoder", + "OrdinalEncoder", + "PolynomialFeatures", + "PowerTransformer", + "QuantileTransformer", + "RobustScaler", + "SplineTransformer", + "StandardScaler", + "TargetEncoder", + "add_dummy_feature", + "binarize", + "label_binarize", + "maxabs_scale", + "minmax_scale", + "normalize", + "power_transform", + "quantile_transform", + "robust_scale", + "scale", +] diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__pycache__/__init__.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..17e15e243e288a4e726cc39c790f924fb72be9b0 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__pycache__/__init__.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__pycache__/_discretization.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__pycache__/_discretization.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c2c5c382922e1e41dd745f2471b2f91e5c41d254 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__pycache__/_discretization.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__pycache__/_encoders.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__pycache__/_encoders.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..08c8341a0166b0d02ddcdf75eddb1ca54d45f53d Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__pycache__/_encoders.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__pycache__/_function_transformer.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__pycache__/_function_transformer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dc5ad9de109e7ee0bd4649989883f9117058eedd Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__pycache__/_function_transformer.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__pycache__/_label.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__pycache__/_label.cpython-312.pyc new 
file mode 100644 index 0000000000000000000000000000000000000000..228020175451be16e55ca1c41da0938173e635bd Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__pycache__/_label.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__pycache__/_polynomial.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__pycache__/_polynomial.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9c992002dbc75cc8be6e8da029f0509ec869f2b5 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__pycache__/_polynomial.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__pycache__/_target_encoder.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__pycache__/_target_encoder.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d111b0de0024befb293fead397d14adceeea9f4f Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__pycache__/_target_encoder.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_csr_polynomial_expansion.pyx b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_csr_polynomial_expansion.pyx new file mode 100644 index 0000000000000000000000000000000000000000..38e5c3069d252c0f31db2fe7b3046390eb30be12 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_csr_polynomial_expansion.pyx @@ -0,0 +1,258 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ..utils._typedefs cimport uint8_t, int64_t, intp_t + +ctypedef uint8_t FLAG_t + +# We use the following verbatim block to determine whether the current +# platform's compiler supports 128-bit integer values intrinsically. +# This should work for GCC and CLANG on 64-bit architectures, but doesn't for +# MSVC on any architecture. We prefer to use 128-bit integers when possible +# because the intermediate calculations have a non-trivial risk of overflow. It +# is, however, very unlikely to come up on an average use case, hence 64-bit +# integers (i.e. `long long`) are "good enough" for most common cases. There is +# not much we can do to efficiently mitigate the overflow risk on the Windows +# platform at this time. Consider this a "best effort" design decision that +# could be revisited later in case someone comes up with a safer option that +# does not hurt the performance of the common cases. +# See `test_sizeof_LARGEST_INT_t()`for more information on exact type expectations. +cdef extern from *: + """ + #ifdef __SIZEOF_INT128__ + typedef __int128 LARGEST_INT_t; + #elif (__clang__ || __EMSCRIPTEN__) && !__i386__ + typedef _BitInt(128) LARGEST_INT_t; + #else + typedef long long LARGEST_INT_t; + #endif + """ + ctypedef long long LARGEST_INT_t + + +# Determine the size of `LARGEST_INT_t` at runtime. +# Used in `test_sizeof_LARGEST_INT_t`. +def _get_sizeof_LARGEST_INT_t(): + return sizeof(LARGEST_INT_t) + + +# TODO: use `{int,float}{32,64}_t` when cython#5230 is resolved: +# https://github.com/cython/cython/issues/5230 +ctypedef fused DATA_t: + float + double + int + long long +# INDEX_{A,B}_t are defined to generate a proper Cartesian product +# of types through Cython fused-type expansion. 
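+# Roughly: INDEX_A_t types the index arrays of the input CSR matrix and INDEX_B_t
+# those of the expanded output, so specializations are generated for every
+# combination of 32-bit and 64-bit index types.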
+ctypedef fused INDEX_A_t: + signed int + signed long long +ctypedef fused INDEX_B_t: + signed int + signed long long + +cdef inline int64_t _deg2_column( + LARGEST_INT_t n_features, + LARGEST_INT_t i, + LARGEST_INT_t j, + FLAG_t interaction_only +) nogil: + """Compute the index of the column for a degree 2 expansion + + n_features is the dimensionality of the input data, i and j are the indices + for the columns involved in the expansion. + """ + if interaction_only: + return n_features * i - i * (i + 3) / 2 - 1 + j + else: + return n_features * i - i* (i + 1) / 2 + j + + +cdef inline int64_t _deg3_column( + LARGEST_INT_t n_features, + LARGEST_INT_t i, + LARGEST_INT_t j, + LARGEST_INT_t k, + FLAG_t interaction_only +) nogil: + """Compute the index of the column for a degree 3 expansion + + n_features is the dimensionality of the input data, i, j and k are the indices + for the columns involved in the expansion. + """ + if interaction_only: + return ( + ( + (3 * n_features) * (n_features * i - i**2) + + i * (i**2 + 11) - (3 * j) * (j + 3) + ) / 6 + i**2 + n_features * (j - 1 - 2 * i) + k + ) + else: + return ( + ( + (3 * n_features) * (n_features * i - i**2) + + i ** 3 - i - (3 * j) * (j + 1) + ) / 6 + n_features * j + k + ) + + +def py_calc_expanded_nnz_deg2(n, interaction_only): + return n * (n + 1) // 2 - interaction_only * n + + +def py_calc_expanded_nnz_deg3(n, interaction_only): + return n * (n**2 + 3 * n + 2) // 6 - interaction_only * n**2 + + +cpdef int64_t _calc_expanded_nnz( + LARGEST_INT_t n, + FLAG_t interaction_only, + LARGEST_INT_t degree +): + """ + Calculates the number of non-zero interaction terms generated by the + non-zero elements of a single row. + """ + # This is the maximum value before the intermediate computation + # d**2 + d overflows + # Solution to d**2 + d = maxint64 + # SymPy: solve(x**2 + x - int64_max, x) + cdef int64_t MAX_SAFE_INDEX_CALC_DEG2 = 3037000499 + + # This is the maximum value before the intermediate computation + # d**3 + 3 * d**2 + 2*d overflows + # Solution to d**3 + 3 * d**2 + 2*d = maxint64 + # SymPy: solve(x * (x**2 + 3 * x + 2) - int64_max, x) + cdef int64_t MAX_SAFE_INDEX_CALC_DEG3 = 2097151 + + if degree == 2: + # Only need to check when not using 128-bit integers + if sizeof(LARGEST_INT_t) < 16 and n <= MAX_SAFE_INDEX_CALC_DEG2: + return n * (n + 1) / 2 - interaction_only * n + return py_calc_expanded_nnz_deg2(n, interaction_only) + else: + # Only need to check when not using 128-bit integers + if sizeof(LARGEST_INT_t) < 16 and n <= MAX_SAFE_INDEX_CALC_DEG3: + return n * (n**2 + 3 * n + 2) / 6 - interaction_only * n**2 + return py_calc_expanded_nnz_deg3(n, interaction_only) + +cpdef int64_t _calc_total_nnz( + INDEX_A_t[:] indptr, + FLAG_t interaction_only, + int64_t degree, +): + """ + Calculates the number of non-zero interaction terms generated by the + non-zero elements across all rows for a single degree. 
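+ For a single row with d stored values this is d * (d + 1) / 2 for degree 2
+ and d * (d**2 + 3 * d + 2) / 6 for degree 3, minus d (respectively d**2)
+ when interaction_only is set; see _calc_expanded_nnz above.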
+ """ + cdef int64_t total_nnz=0 + cdef intp_t row_idx + for row_idx in range(len(indptr) - 1): + total_nnz += _calc_expanded_nnz( + indptr[row_idx + 1] - indptr[row_idx], + interaction_only, + degree + ) + return total_nnz + + +cpdef void _csr_polynomial_expansion( + const DATA_t[:] data, # IN READ-ONLY + const INDEX_A_t[:] indices, # IN READ-ONLY + const INDEX_A_t[:] indptr, # IN READ-ONLY + INDEX_A_t n_features, + DATA_t[:] result_data, # OUT + INDEX_B_t[:] result_indices, # OUT + INDEX_B_t[:] result_indptr, # OUT + FLAG_t interaction_only, + FLAG_t degree +): + """ + Perform a second or third degree polynomial or interaction expansion on a + compressed sparse row (CSR) matrix. The method used only takes products of + non-zero features. For a matrix with density :math:`d`, this results in a + speedup on the order of :math:`(1/d)^k` where :math:`k` is the degree of + the expansion, assuming all rows are of similar density. + + Parameters + ---------- + data : memory view on nd-array + The "data" attribute of the input CSR matrix. + + indices : memory view on nd-array + The "indices" attribute of the input CSR matrix. + + indptr : memory view on nd-array + The "indptr" attribute of the input CSR matrix. + + n_features : int + The dimensionality of the input CSR matrix. + + result_data : nd-array + The output CSR matrix's "data" attribute. + It is modified by this routine. + + result_indices : nd-array + The output CSR matrix's "indices" attribute. + It is modified by this routine. + + result_indptr : nd-array + The output CSR matrix's "indptr" attribute. + It is modified by this routine. + + interaction_only : int + 0 for a polynomial expansion, 1 for an interaction expansion. + + degree : int + The degree of the expansion. This must be either 2 or 3. + + References + ---------- + "Leveraging Sparsity to Speed Up Polynomial Feature Expansions of CSR + Matrices Using K-Simplex Numbers" by Andrew Nystrom and John Hughes. + """ + + # Make the arrays that will form the CSR matrix of the expansion. 
+ cdef INDEX_A_t row_i, row_starts, row_ends, i, j, k, i_ptr, j_ptr, k_ptr + cdef INDEX_B_t expanded_index=0, num_cols_in_row, col + with nogil: + result_indptr[0] = indptr[0] + for row_i in range(indptr.shape[0]-1): + row_starts = indptr[row_i] + row_ends = indptr[row_i + 1] + num_cols_in_row = 0 + for i_ptr in range(row_starts, row_ends): + i = indices[i_ptr] + for j_ptr in range(i_ptr + interaction_only, row_ends): + j = indices[j_ptr] + if degree == 2: + col = _deg2_column( + n_features, + i, j, + interaction_only + ) + result_indices[expanded_index] = col + result_data[expanded_index] = ( + data[i_ptr] * data[j_ptr] + ) + expanded_index += 1 + num_cols_in_row += 1 + else: + # degree == 3 + for k_ptr in range(j_ptr + interaction_only, row_ends): + k = indices[k_ptr] + col = _deg3_column( + n_features, + i, j, k, + interaction_only + ) + result_indices[expanded_index] = col + result_data[expanded_index] = ( + data[i_ptr] * data[j_ptr] * data[k_ptr] + ) + expanded_index += 1 + num_cols_in_row += 1 + + result_indptr[row_i+1] = result_indptr[row_i] + num_cols_in_row + return diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_data.py b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_data.py new file mode 100644 index 0000000000000000000000000000000000000000..fe138cda73803ea7612215b0f9ca3abd11083f23 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_data.py @@ -0,0 +1,3706 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + + +import warnings +from numbers import Integral, Real + +import numpy as np +from scipy import sparse, stats +from scipy.special import boxcox, inv_boxcox + +from sklearn.utils import metadata_routing + +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + OneToOneFeatureMixin, + TransformerMixin, + _fit_context, +) +from ..utils import _array_api, check_array, resample +from ..utils._array_api import ( + _find_matching_floating_dtype, + _modify_in_place_if_numpy, + device, + get_namespace, + get_namespace_and_device, +) +from ..utils._param_validation import Interval, Options, StrOptions, validate_params +from ..utils.extmath import _incremental_mean_and_var, row_norms +from ..utils.fixes import _yeojohnson_lambda +from ..utils.sparsefuncs import ( + incr_mean_variance_axis, + inplace_column_scale, + mean_variance_axis, + min_max_axis, +) +from ..utils.sparsefuncs_fast import ( + inplace_csr_row_normalize_l1, + inplace_csr_row_normalize_l2, +) +from ..utils.validation import ( + FLOAT_DTYPES, + _check_sample_weight, + check_is_fitted, + check_random_state, + validate_data, +) +from ._encoders import OneHotEncoder + +BOUNDS_THRESHOLD = 1e-7 + +__all__ = [ + "Binarizer", + "KernelCenterer", + "MaxAbsScaler", + "MinMaxScaler", + "Normalizer", + "OneHotEncoder", + "PowerTransformer", + "QuantileTransformer", + "RobustScaler", + "StandardScaler", + "add_dummy_feature", + "binarize", + "maxabs_scale", + "minmax_scale", + "normalize", + "power_transform", + "quantile_transform", + "robust_scale", + "scale", +] + + +def _is_constant_feature(var, mean, n_samples): + """Detect if a feature is indistinguishable from a constant feature. + + The detection is based on its computed variance and on the theoretical + error bounds of the '2 pass algorithm' for variance computation. + + See "Algorithms for computing the sample variance: analysis and + recommendations", by Chan, Golub, and LeVeque. 
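+ Concretely, a feature is flagged as constant when its variance satisfies
+ var <= n_samples * eps * var + (n_samples * mean * eps) ** 2, with eps the
+ float64 machine epsilon (see the implementation below).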
+ """ + # In scikit-learn, variance is always computed using float64 accumulators. + eps = np.finfo(np.float64).eps + + upper_bound = n_samples * eps * var + (n_samples * mean * eps) ** 2 + return var <= upper_bound + + +def _handle_zeros_in_scale(scale, copy=True, constant_mask=None): + """Set scales of near constant features to 1. + + The goal is to avoid division by very small or zero values. + + Near constant features are detected automatically by identifying + scales close to machine precision unless they are precomputed by + the caller and passed with the `constant_mask` kwarg. + + Typically for standard scaling, the scales are the standard + deviation while near constant features are better detected on the + computed variances which are closer to machine precision by + construction. + """ + # if we are fitting on 1D arrays, scale might be a scalar + if np.isscalar(scale): + if scale == 0.0: + scale = 1.0 + return scale + # scale is an array + else: + xp, _ = get_namespace(scale) + if constant_mask is None: + # Detect near constant values to avoid dividing by a very small + # value that could lead to surprising results and numerical + # stability issues. + constant_mask = scale < 10 * xp.finfo(scale.dtype).eps + + if copy: + # New array to avoid side-effects + scale = xp.asarray(scale, copy=True) + scale[constant_mask] = 1.0 + return scale + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "axis": [Options(Integral, {0, 1})], + "with_mean": ["boolean"], + "with_std": ["boolean"], + "copy": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def scale(X, *, axis=0, with_mean=True, with_std=True, copy=True): + """Standardize a dataset along any axis. + + Center to the mean and component wise scale to unit variance. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to center and scale. + + axis : {0, 1}, default=0 + Axis used to compute the means and standard deviations along. If 0, + independently standardize each feature, otherwise (if 1) standardize + each sample. + + with_mean : bool, default=True + If True, center the data before scaling. + + with_std : bool, default=True + If True, scale the data to unit variance (or equivalently, + unit standard deviation). + + copy : bool, default=True + If False, try to avoid a copy and scale in place. + This is not guaranteed to always work in place; e.g. if the data is + a numpy array with an int dtype, a copy will be returned even with + copy=False. + + Returns + ------- + X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) + The transformed data. + + See Also + -------- + StandardScaler : Performs scaling to unit variance using the Transformer + API (e.g. as part of a preprocessing + :class:`~sklearn.pipeline.Pipeline`). + + Notes + ----- + This implementation will refuse to center scipy.sparse matrices + since it would make them non-sparse and would potentially crash the + program with memory exhaustion problems. + + Instead the caller is expected to either set explicitly + `with_mean=False` (in that case, only variance scaling will be + performed on the features of the CSC matrix) or to call `X.toarray()` + if he/she expects the materialized dense array to fit in memory. + + To avoid memory copy the caller should pass a CSC matrix. + + NaNs are treated as missing values: disregarded to compute the statistics, + and maintained during the data transformation. 
+ + We use a biased estimator for the standard deviation, equivalent to + `numpy.std(x, ddof=0)`. Note that the choice of `ddof` is unlikely to + affect model performance. + + For a comparison of the different scalers, transformers, and normalizers, + see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`. + + .. warning:: Risk of data leak + + Do not use :func:`~sklearn.preprocessing.scale` unless you know + what you are doing. A common mistake is to apply it to the entire data + *before* splitting into training and test sets. This will bias the + model evaluation because information would have leaked from the test + set to the training set. + In general, we recommend using + :class:`~sklearn.preprocessing.StandardScaler` within a + :ref:`Pipeline ` in order to prevent most risks of data + leaking: `pipe = make_pipeline(StandardScaler(), LogisticRegression())`. + + Examples + -------- + >>> from sklearn.preprocessing import scale + >>> X = [[-2, 1, 2], [-1, 0, 1]] + >>> scale(X, axis=0) # scaling each column independently + array([[-1., 1., 1.], + [ 1., -1., -1.]]) + >>> scale(X, axis=1) # scaling each row independently + array([[-1.37, 0.39, 0.98], + [-1.22, 0. , 1.22]]) + """ + X = check_array( + X, + accept_sparse="csc", + copy=copy, + ensure_2d=False, + estimator="the scale function", + dtype=FLOAT_DTYPES, + ensure_all_finite="allow-nan", + ) + if sparse.issparse(X): + if with_mean: + raise ValueError( + "Cannot center sparse matrices: pass `with_mean=False` instead" + " See docstring for motivation and alternatives." + ) + if axis != 0: + raise ValueError( + "Can only scale sparse matrix on axis=0, got axis=%d" % axis + ) + if with_std: + _, var = mean_variance_axis(X, axis=0) + var = _handle_zeros_in_scale(var, copy=False) + inplace_column_scale(X, 1 / np.sqrt(var)) + else: + X = np.asarray(X) + if with_mean: + mean_ = np.nanmean(X, axis) + if with_std: + scale_ = np.nanstd(X, axis) + # Xr is a view on the original array that enables easy use of + # broadcasting on the axis in which we are interested in + Xr = np.rollaxis(X, axis) + if with_mean: + Xr -= mean_ + mean_1 = np.nanmean(Xr, axis=0) + # Verify that mean_1 is 'close to zero'. If X contains very + # large values, mean_1 can also be very large, due to a lack of + # precision of mean_. In this case, a pre-scaling of the + # concerned feature is efficient, for instance by its mean or + # maximum. + if not np.allclose(mean_1, 0): + warnings.warn( + "Numerical issues were encountered " + "when centering the data " + "and might not be solved. Dataset may " + "contain too large values. You may need " + "to prescale your features." + ) + Xr -= mean_1 + if with_std: + scale_ = _handle_zeros_in_scale(scale_, copy=False) + Xr /= scale_ + if with_mean: + mean_2 = np.nanmean(Xr, axis=0) + # If mean_2 is not 'close to zero', it comes from the fact that + # scale_ is very small so that mean_2 = mean_1/scale_ > 0, even + # if mean_1 was close to zero. The problem is thus essentially + # due to the lack of precision of mean_. A solution is then to + # subtract the mean again: + if not np.allclose(mean_2, 0): + warnings.warn( + "Numerical issues were encountered " + "when scaling the data " + "and might not be solved. The standard " + "deviation of the data is probably " + "very close to 0. " + ) + Xr -= mean_2 + return X + + +class MinMaxScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): + """Transform features by scaling each feature to a given range. 
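Aside, not part of the patched file: before the `MinMaxScaler` definition continues, a small sketch of the sparse-input behaviour of `scale` noted above; centering is refused for sparse matrices, so only variance scaling is applied. The toy matrix is arbitrary.

    import numpy as np
    import scipy.sparse as sp
    from sklearn.preprocessing import scale

    X = sp.csc_matrix(np.array([[1.0, 0.0],
                                [3.0, 0.0],
                                [5.0, 4.0]]))

    # with_mean=True (the default) would raise ValueError for sparse input;
    # with_mean=False divides each column by its standard deviation only.
    X_scaled = scale(X, with_mean=False)
    print(X_scaled.toarray())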
+ + This estimator scales and translates each feature individually such + that it is in the given range on the training set, e.g. between + zero and one. + + The transformation is given by:: + + X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0)) + X_scaled = X_std * (max - min) + min + + where min, max = feature_range. + + This transformation is often used as an alternative to zero mean, + unit variance scaling. + + `MinMaxScaler` doesn't reduce the effect of outliers, but it linearly + scales them down into a fixed range, where the largest occurring data point + corresponds to the maximum value and the smallest one corresponds to the + minimum value. For an example visualization, refer to :ref:`Compare + MinMaxScaler with other scalers `. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + feature_range : tuple (min, max), default=(0, 1) + Desired range of transformed data. + + copy : bool, default=True + Set to False to perform inplace row normalization and avoid a + copy (if the input is already a numpy array). + + clip : bool, default=False + Set to True to clip transformed values of held-out data to + provided `feature range`. + + .. versionadded:: 0.24 + + Attributes + ---------- + min_ : ndarray of shape (n_features,) + Per feature adjustment for minimum. Equivalent to + ``min - X.min(axis=0) * self.scale_`` + + scale_ : ndarray of shape (n_features,) + Per feature relative scaling of the data. Equivalent to + ``(max - min) / (X.max(axis=0) - X.min(axis=0))`` + + .. versionadded:: 0.17 + *scale_* attribute. + + data_min_ : ndarray of shape (n_features,) + Per feature minimum seen in the data + + .. versionadded:: 0.17 + *data_min_* + + data_max_ : ndarray of shape (n_features,) + Per feature maximum seen in the data + + .. versionadded:: 0.17 + *data_max_* + + data_range_ : ndarray of shape (n_features,) + Per feature range ``(data_max_ - data_min_)`` seen in the data + + .. versionadded:: 0.17 + *data_range_* + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + n_samples_seen_ : int + The number of samples processed by the estimator. + It will be reset on new calls to fit, but increments across + ``partial_fit`` calls. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + minmax_scale : Equivalent function without the estimator API. + + Notes + ----- + NaNs are treated as missing values: disregarded in fit, and maintained in + transform. + + Examples + -------- + >>> from sklearn.preprocessing import MinMaxScaler + >>> data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]] + >>> scaler = MinMaxScaler() + >>> print(scaler.fit(data)) + MinMaxScaler() + >>> print(scaler.data_max_) + [ 1. 18.] + >>> print(scaler.transform(data)) + [[0. 0. ] + [0.25 0.25] + [0.5 0.5 ] + [1. 1. ]] + >>> print(scaler.transform([[2, 2]])) + [[1.5 0. ]] + """ + + _parameter_constraints: dict = { + "feature_range": [tuple], + "copy": ["boolean"], + "clip": ["boolean"], + } + + def __init__(self, feature_range=(0, 1), *, copy=True, clip=False): + self.feature_range = feature_range + self.copy = copy + self.clip = clip + + def _reset(self): + """Reset internal data-dependent state of the scaler, if necessary. + + __init__ parameters are not touched. 
+ """ + # Checking one attribute is enough, because they are all set together + # in partial_fit + if hasattr(self, "scale_"): + del self.scale_ + del self.min_ + del self.n_samples_seen_ + del self.data_min_ + del self.data_max_ + del self.data_range_ + + def fit(self, X, y=None): + """Compute the minimum and maximum to be used for later scaling. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data used to compute the per-feature minimum and maximum + used for later scaling along the features axis. + + y : None + Ignored. + + Returns + ------- + self : object + Fitted scaler. + """ + # Reset internal state before fitting + self._reset() + return self.partial_fit(X, y) + + @_fit_context(prefer_skip_nested_validation=True) + def partial_fit(self, X, y=None): + """Online computation of min and max on X for later scaling. + + All of X is processed as a single batch. This is intended for cases + when :meth:`fit` is not feasible due to very large number of + `n_samples` or because X is read from a continuous stream. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data used to compute the mean and standard deviation + used for later scaling along the features axis. + + y : None + Ignored. + + Returns + ------- + self : object + Fitted scaler. + """ + feature_range = self.feature_range + if feature_range[0] >= feature_range[1]: + raise ValueError( + "Minimum of desired feature range must be smaller than maximum. Got %s." + % str(feature_range) + ) + + if sparse.issparse(X): + raise TypeError( + "MinMaxScaler does not support sparse input. " + "Consider using MaxAbsScaler instead." + ) + + xp, _ = get_namespace(X) + + first_pass = not hasattr(self, "n_samples_seen_") + X = validate_data( + self, + X, + reset=first_pass, + dtype=_array_api.supported_float_dtypes(xp), + ensure_all_finite="allow-nan", + ) + + device_ = device(X) + feature_range = ( + xp.asarray(feature_range[0], dtype=X.dtype, device=device_), + xp.asarray(feature_range[1], dtype=X.dtype, device=device_), + ) + + data_min = _array_api._nanmin(X, axis=0, xp=xp) + data_max = _array_api._nanmax(X, axis=0, xp=xp) + + if first_pass: + self.n_samples_seen_ = X.shape[0] + else: + data_min = xp.minimum(self.data_min_, data_min) + data_max = xp.maximum(self.data_max_, data_max) + self.n_samples_seen_ += X.shape[0] + + data_range = data_max - data_min + self.scale_ = (feature_range[1] - feature_range[0]) / _handle_zeros_in_scale( + data_range, copy=True + ) + self.min_ = feature_range[0] - data_min * self.scale_ + self.data_min_ = data_min + self.data_max_ = data_max + self.data_range_ = data_range + return self + + def transform(self, X): + """Scale features of X according to feature_range. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Input data that will be transformed. + + Returns + ------- + Xt : ndarray of shape (n_samples, n_features) + Transformed data. 
+ """ + check_is_fitted(self) + + xp, _ = get_namespace(X) + + X = validate_data( + self, + X, + copy=self.copy, + dtype=_array_api.supported_float_dtypes(xp), + force_writeable=True, + ensure_all_finite="allow-nan", + reset=False, + ) + + X *= self.scale_ + X += self.min_ + if self.clip: + device_ = device(X) + X = _modify_in_place_if_numpy( + xp, + xp.clip, + X, + xp.asarray(self.feature_range[0], dtype=X.dtype, device=device_), + xp.asarray(self.feature_range[1], dtype=X.dtype, device=device_), + out=X, + ) + return X + + def inverse_transform(self, X): + """Undo the scaling of X according to feature_range. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Input data that will be transformed. It cannot be sparse. + + Returns + ------- + X_original : ndarray of shape (n_samples, n_features) + Transformed data. + """ + check_is_fitted(self) + + xp, _ = get_namespace(X) + + X = check_array( + X, + copy=self.copy, + dtype=_array_api.supported_float_dtypes(xp), + force_writeable=True, + ensure_all_finite="allow-nan", + ) + + X -= self.min_ + X /= self.scale_ + return X + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + tags.array_api_support = True + return tags + + +@validate_params( + { + "X": ["array-like"], + "axis": [Options(Integral, {0, 1})], + }, + prefer_skip_nested_validation=False, +) +def minmax_scale(X, feature_range=(0, 1), *, axis=0, copy=True): + """Transform features by scaling each feature to a given range. + + This estimator scales and translates each feature individually such + that it is in the given range on the training set, i.e. between + zero and one. + + The transformation is given by (when ``axis=0``):: + + X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0)) + X_scaled = X_std * (max - min) + min + + where min, max = feature_range. + + The transformation is calculated as (when ``axis=0``):: + + X_scaled = scale * X + min - X.min(axis=0) * scale + where scale = (max - min) / (X.max(axis=0) - X.min(axis=0)) + + This transformation is often used as an alternative to zero mean, + unit variance scaling. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.17 + *minmax_scale* function interface + to :class:`~sklearn.preprocessing.MinMaxScaler`. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data. + + feature_range : tuple (min, max), default=(0, 1) + Desired range of transformed data. + + axis : {0, 1}, default=0 + Axis used to scale along. If 0, independently scale each feature, + otherwise (if 1) scale each sample. + + copy : bool, default=True + If False, try to avoid a copy and scale in place. + This is not guaranteed to always work in place; e.g. if the data is + a numpy array with an int dtype, a copy will be returned even with + copy=False. + + Returns + ------- + X_tr : ndarray of shape (n_samples, n_features) + The transformed data. + + .. warning:: Risk of data leak + + Do not use :func:`~sklearn.preprocessing.minmax_scale` unless you know + what you are doing. A common mistake is to apply it to the entire data + *before* splitting into training and test sets. This will bias the + model evaluation because information would have leaked from the test + set to the training set. + In general, we recommend using + :class:`~sklearn.preprocessing.MinMaxScaler` within a + :ref:`Pipeline ` in order to prevent most risks of data + leaking: `pipe = make_pipeline(MinMaxScaler(), LogisticRegression())`. 
+ + See Also + -------- + MinMaxScaler : Performs scaling to a given range using the Transformer + API (e.g. as part of a preprocessing + :class:`~sklearn.pipeline.Pipeline`). + + Notes + ----- + For a comparison of the different scalers, transformers, and normalizers, + see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`. + + Examples + -------- + >>> from sklearn.preprocessing import minmax_scale + >>> X = [[-2, 1, 2], [-1, 0, 1]] + >>> minmax_scale(X, axis=0) # scale each column independently + array([[0., 1., 1.], + [1., 0., 0.]]) + >>> minmax_scale(X, axis=1) # scale each row independently + array([[0. , 0.75, 1. ], + [0. , 0.5 , 1. ]]) + """ + # Unlike the scaler object, this function allows 1d input. + # If copy is required, it will be done inside the scaler object. + X = check_array( + X, + copy=False, + ensure_2d=False, + dtype=FLOAT_DTYPES, + ensure_all_finite="allow-nan", + ) + original_ndim = X.ndim + + if original_ndim == 1: + X = X.reshape(X.shape[0], 1) + + s = MinMaxScaler(feature_range=feature_range, copy=copy) + if axis == 0: + X = s.fit_transform(X) + else: + X = s.fit_transform(X.T).T + + if original_ndim == 1: + X = X.ravel() + + return X + + +class StandardScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): + """Standardize features by removing the mean and scaling to unit variance. + + The standard score of a sample `x` is calculated as: + + .. code-block:: text + + z = (x - u) / s + + where `u` is the mean of the training samples or zero if `with_mean=False`, + and `s` is the standard deviation of the training samples or one if + `with_std=False`. + + Centering and scaling happen independently on each feature by computing + the relevant statistics on the samples in the training set. Mean and + standard deviation are then stored to be used on later data using + :meth:`transform`. + + Standardization of a dataset is a common requirement for many + machine learning estimators: they might behave badly if the + individual features do not more or less look like standard normally + distributed data (e.g. Gaussian with 0 mean and unit variance). + + For instance many elements used in the objective function of + a learning algorithm (such as the RBF kernel of Support Vector + Machines or the L1 and L2 regularizers of linear models) assume that + all features are centered around 0 and have variance in the same + order. If a feature has a variance that is orders of magnitude larger + than others, it might dominate the objective function and make the + estimator unable to learn from other features correctly as expected. + + `StandardScaler` is sensitive to outliers, and the features may scale + differently from each other in the presence of outliers. For an example + visualization, refer to :ref:`Compare StandardScaler with other scalers + `. + + This scaler can also be applied to sparse CSR or CSC matrices by passing + `with_mean=False` to avoid breaking the sparsity structure of the data. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + copy : bool, default=True + If False, try to avoid a copy and do inplace scaling instead. + This is not guaranteed to always work inplace; e.g. if the data is + not a NumPy array or scipy.sparse CSR matrix, a copy may still be + returned. + + with_mean : bool, default=True + If True, center the data before scaling. 
+ This does not work (and will raise an exception) when attempted on + sparse matrices, because centering them entails building a dense + matrix which in common use cases is likely to be too large to fit in + memory. + + with_std : bool, default=True + If True, scale the data to unit variance (or equivalently, + unit standard deviation). + + Attributes + ---------- + scale_ : ndarray of shape (n_features,) or None + Per feature relative scaling of the data to achieve zero mean and unit + variance. Generally this is calculated using `np.sqrt(var_)`. If a + variance is zero, we can't achieve unit variance, and the data is left + as-is, giving a scaling factor of 1. `scale_` is equal to `None` + when `with_std=False`. + + .. versionadded:: 0.17 + *scale_* + + mean_ : ndarray of shape (n_features,) or None + The mean value for each feature in the training set. + Equal to ``None`` when ``with_mean=False`` and ``with_std=False``. + + var_ : ndarray of shape (n_features,) or None + The variance for each feature in the training set. Used to compute + `scale_`. Equal to ``None`` when ``with_mean=False`` and + ``with_std=False``. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_samples_seen_ : int or ndarray of shape (n_features,) + The number of samples processed by the estimator for each feature. + If there are no missing samples, the ``n_samples_seen`` will be an + integer, otherwise it will be an array of dtype int. If + `sample_weights` are used it will be a float (if no missing data) + or an array of dtype float that sums the weights seen so far. + Will be reset on new calls to fit, but increments across + ``partial_fit`` calls. + + See Also + -------- + scale : Equivalent function without the estimator API. + + :class:`~sklearn.decomposition.PCA` : Further removes the linear + correlation across features with 'whiten=True'. + + Notes + ----- + NaNs are treated as missing values: disregarded in fit, and maintained in + transform. + + We use a biased estimator for the standard deviation, equivalent to + `numpy.std(x, ddof=0)`. Note that the choice of `ddof` is unlikely to + affect model performance. + + Examples + -------- + >>> from sklearn.preprocessing import StandardScaler + >>> data = [[0, 0], [0, 0], [1, 1], [1, 1]] + >>> scaler = StandardScaler() + >>> print(scaler.fit(data)) + StandardScaler() + >>> print(scaler.mean_) + [0.5 0.5] + >>> print(scaler.transform(data)) + [[-1. -1.] + [-1. -1.] + [ 1. 1.] + [ 1. 1.]] + >>> print(scaler.transform([[2, 2]])) + [[3. 3.]] + """ + + _parameter_constraints: dict = { + "copy": ["boolean"], + "with_mean": ["boolean"], + "with_std": ["boolean"], + } + + def __init__(self, *, copy=True, with_mean=True, with_std=True): + self.with_mean = with_mean + self.with_std = with_std + self.copy = copy + + def _reset(self): + """Reset internal data-dependent state of the scaler, if necessary. + + __init__ parameters are not touched. + """ + # Checking one attribute is enough, because they are all set together + # in partial_fit + if hasattr(self, "scale_"): + del self.scale_ + del self.n_samples_seen_ + del self.mean_ + del self.var_ + + def fit(self, X, y=None, sample_weight=None): + """Compute the mean and std to be used for later scaling. 
+ + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data used to compute the mean and standard deviation + used for later scaling along the features axis. + + y : None + Ignored. + + sample_weight : array-like of shape (n_samples,), default=None + Individual weights for each sample. + + .. versionadded:: 0.24 + parameter *sample_weight* support to StandardScaler. + + Returns + ------- + self : object + Fitted scaler. + """ + # Reset internal state before fitting + self._reset() + return self.partial_fit(X, y, sample_weight) + + @_fit_context(prefer_skip_nested_validation=True) + def partial_fit(self, X, y=None, sample_weight=None): + """Online computation of mean and std on X for later scaling. + + All of X is processed as a single batch. This is intended for cases + when :meth:`fit` is not feasible due to very large number of + `n_samples` or because X is read from a continuous stream. + + The algorithm for incremental mean and std is given in Equation 1.5a,b + in Chan, Tony F., Gene H. Golub, and Randall J. LeVeque. "Algorithms + for computing the sample variance: Analysis and recommendations." + The American Statistician 37.3 (1983): 242-247: + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data used to compute the mean and standard deviation + used for later scaling along the features axis. + + y : None + Ignored. + + sample_weight : array-like of shape (n_samples,), default=None + Individual weights for each sample. + + .. versionadded:: 0.24 + parameter *sample_weight* support to StandardScaler. + + Returns + ------- + self : object + Fitted scaler. + """ + first_call = not hasattr(self, "n_samples_seen_") + X = validate_data( + self, + X, + accept_sparse=("csr", "csc"), + dtype=FLOAT_DTYPES, + ensure_all_finite="allow-nan", + reset=first_call, + ) + n_features = X.shape[1] + + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + + # Even in the case of `with_mean=False`, we update the mean anyway + # This is needed for the incremental computation of the var + # See incr_mean_variance_axis and _incremental_mean_variance_axis + + # if n_samples_seen_ is an integer (i.e. no missing values), we need to + # transform it to a NumPy array of shape (n_features,) required by + # incr_mean_variance_axis and _incremental_variance_axis + dtype = np.int64 if sample_weight is None else X.dtype + if not hasattr(self, "n_samples_seen_"): + self.n_samples_seen_ = np.zeros(n_features, dtype=dtype) + elif np.size(self.n_samples_seen_) == 1: + self.n_samples_seen_ = np.repeat(self.n_samples_seen_, X.shape[1]) + self.n_samples_seen_ = self.n_samples_seen_.astype(dtype, copy=False) + + if sparse.issparse(X): + if self.with_mean: + raise ValueError( + "Cannot center sparse matrices: pass `with_mean=False` " + "instead. See docstring for motivation and alternatives." 
+ ) + sparse_constructor = ( + sparse.csr_matrix if X.format == "csr" else sparse.csc_matrix + ) + + if self.with_std: + # First pass + if not hasattr(self, "scale_"): + self.mean_, self.var_, self.n_samples_seen_ = mean_variance_axis( + X, axis=0, weights=sample_weight, return_sum_weights=True + ) + # Next passes + else: + ( + self.mean_, + self.var_, + self.n_samples_seen_, + ) = incr_mean_variance_axis( + X, + axis=0, + last_mean=self.mean_, + last_var=self.var_, + last_n=self.n_samples_seen_, + weights=sample_weight, + ) + # We force the mean and variance to float64 for large arrays + # See https://github.com/scikit-learn/scikit-learn/pull/12338 + self.mean_ = self.mean_.astype(np.float64, copy=False) + self.var_ = self.var_.astype(np.float64, copy=False) + else: + self.mean_ = None # as with_mean must be False for sparse + self.var_ = None + weights = _check_sample_weight(sample_weight, X) + sum_weights_nan = weights @ sparse_constructor( + (np.isnan(X.data), X.indices, X.indptr), shape=X.shape + ) + self.n_samples_seen_ += (np.sum(weights) - sum_weights_nan).astype( + dtype + ) + else: + # First pass + if not hasattr(self, "scale_"): + self.mean_ = 0.0 + if self.with_std: + self.var_ = 0.0 + else: + self.var_ = None + + if not self.with_mean and not self.with_std: + self.mean_ = None + self.var_ = None + self.n_samples_seen_ += X.shape[0] - np.isnan(X).sum(axis=0) + + else: + self.mean_, self.var_, self.n_samples_seen_ = _incremental_mean_and_var( + X, + self.mean_, + self.var_, + self.n_samples_seen_, + sample_weight=sample_weight, + ) + + # for backward-compatibility, reduce n_samples_seen_ to an integer + # if the number of samples is the same for each feature (i.e. no + # missing values) + if np.ptp(self.n_samples_seen_) == 0: + self.n_samples_seen_ = self.n_samples_seen_[0] + + if self.with_std: + # Extract the list of near constant features on the raw variances, + # before taking the square root. + constant_mask = _is_constant_feature( + self.var_, self.mean_, self.n_samples_seen_ + ) + self.scale_ = _handle_zeros_in_scale( + np.sqrt(self.var_), copy=False, constant_mask=constant_mask + ) + else: + self.scale_ = None + + return self + + def transform(self, X, copy=None): + """Perform standardization by centering and scaling. + + Parameters + ---------- + X : {array-like, sparse matrix of shape (n_samples, n_features) + The data used to scale along the features axis. + copy : bool, default=None + Copy the input X or not. + + Returns + ------- + X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) + Transformed array. + """ + check_is_fitted(self) + + copy = copy if copy is not None else self.copy + X = validate_data( + self, + X, + reset=False, + accept_sparse="csr", + copy=copy, + dtype=FLOAT_DTYPES, + force_writeable=True, + ensure_all_finite="allow-nan", + ) + + if sparse.issparse(X): + if self.with_mean: + raise ValueError( + "Cannot center sparse matrices: pass `with_mean=False` " + "instead. See docstring for motivation and alternatives." + ) + if self.scale_ is not None: + inplace_column_scale(X, 1 / self.scale_) + else: + if self.with_mean: + X -= self.mean_ + if self.with_std: + X /= self.scale_ + return X + + def inverse_transform(self, X, copy=None): + """Scale back the data to the original representation. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data used to scale along the features axis. + + copy : bool, default=None + Copy the input `X` or not. 
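Aside, not part of the patched file: a small sketch of the incremental path implemented by `partial_fit` above; fitting in two batches should agree with a single `fit` up to floating-point error. The batch split is arbitrary.

    import numpy as np
    from sklearn.preprocessing import StandardScaler

    rng = np.random.RandomState(0)
    X = rng.normal(size=(200, 3))

    full = StandardScaler().fit(X)

    incremental = StandardScaler()
    incremental.partial_fit(X[:120])    # first batch
    incremental.partial_fit(X[120:])    # remaining samples

    assert np.allclose(full.mean_, incremental.mean_)
    assert np.allclose(full.var_, incremental.var_)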
+ + Returns + ------- + X_original : {ndarray, sparse matrix} of shape (n_samples, n_features) + Transformed array. + """ + check_is_fitted(self) + + copy = copy if copy is not None else self.copy + X = check_array( + X, + accept_sparse="csr", + copy=copy, + dtype=FLOAT_DTYPES, + force_writeable=True, + ensure_all_finite="allow-nan", + ) + + if sparse.issparse(X): + if self.with_mean: + raise ValueError( + "Cannot uncenter sparse matrices: pass `with_mean=False` " + "instead See docstring for motivation and alternatives." + ) + if self.scale_ is not None: + inplace_column_scale(X, self.scale_) + else: + if self.with_std: + X *= self.scale_ + if self.with_mean: + X += self.mean_ + return X + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + tags.input_tags.sparse = not self.with_mean + tags.transformer_tags.preserves_dtype = ["float64", "float32"] + return tags + + +class MaxAbsScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): + """Scale each feature by its maximum absolute value. + + This estimator scales and translates each feature individually such + that the maximal absolute value of each feature in the + training set will be 1.0. It does not shift/center the data, and + thus does not destroy any sparsity. + + This scaler can also be applied to sparse CSR or CSC matrices. + + `MaxAbsScaler` doesn't reduce the effect of outliers; it only linearly + scales them down. For an example visualization, refer to :ref:`Compare + MaxAbsScaler with other scalers `. + + .. versionadded:: 0.17 + + Parameters + ---------- + copy : bool, default=True + Set to False to perform inplace scaling and avoid a copy (if the input + is already a numpy array). + + Attributes + ---------- + scale_ : ndarray of shape (n_features,) + Per feature relative scaling of the data. + + .. versionadded:: 0.17 + *scale_* attribute. + + max_abs_ : ndarray of shape (n_features,) + Per feature maximum absolute value. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_samples_seen_ : int + The number of samples processed by the estimator. Will be reset on + new calls to fit, but increments across ``partial_fit`` calls. + + See Also + -------- + maxabs_scale : Equivalent function without the estimator API. + + Notes + ----- + NaNs are treated as missing values: disregarded in fit, and maintained in + transform. + + Examples + -------- + >>> from sklearn.preprocessing import MaxAbsScaler + >>> X = [[ 1., -1., 2.], + ... [ 2., 0., 0.], + ... [ 0., 1., -1.]] + >>> transformer = MaxAbsScaler().fit(X) + >>> transformer + MaxAbsScaler() + >>> transformer.transform(X) + array([[ 0.5, -1. , 1. ], + [ 1. , 0. , 0. ], + [ 0. , 1. , -0.5]]) + """ + + _parameter_constraints: dict = {"copy": ["boolean"]} + + def __init__(self, *, copy=True): + self.copy = copy + + def _reset(self): + """Reset internal data-dependent state of the scaler, if necessary. + + __init__ parameters are not touched. + """ + # Checking one attribute is enough, because they are all set together + # in partial_fit + if hasattr(self, "scale_"): + del self.scale_ + del self.n_samples_seen_ + del self.max_abs_ + + def fit(self, X, y=None): + """Compute the maximum absolute value to be used for later scaling. 
+ + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data used to compute the per-feature minimum and maximum + used for later scaling along the features axis. + + y : None + Ignored. + + Returns + ------- + self : object + Fitted scaler. + """ + # Reset internal state before fitting + self._reset() + return self.partial_fit(X, y) + + @_fit_context(prefer_skip_nested_validation=True) + def partial_fit(self, X, y=None): + """Online computation of max absolute value of X for later scaling. + + All of X is processed as a single batch. This is intended for cases + when :meth:`fit` is not feasible due to very large number of + `n_samples` or because X is read from a continuous stream. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data used to compute the mean and standard deviation + used for later scaling along the features axis. + + y : None + Ignored. + + Returns + ------- + self : object + Fitted scaler. + """ + xp, _ = get_namespace(X) + + first_pass = not hasattr(self, "n_samples_seen_") + X = validate_data( + self, + X, + reset=first_pass, + accept_sparse=("csr", "csc"), + dtype=_array_api.supported_float_dtypes(xp), + ensure_all_finite="allow-nan", + ) + + if sparse.issparse(X): + mins, maxs = min_max_axis(X, axis=0, ignore_nan=True) + max_abs = np.maximum(np.abs(mins), np.abs(maxs)) + else: + max_abs = _array_api._nanmax(xp.abs(X), axis=0, xp=xp) + + if first_pass: + self.n_samples_seen_ = X.shape[0] + else: + max_abs = xp.maximum(self.max_abs_, max_abs) + self.n_samples_seen_ += X.shape[0] + + self.max_abs_ = max_abs + self.scale_ = _handle_zeros_in_scale(max_abs, copy=True) + return self + + def transform(self, X): + """Scale the data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data that should be scaled. + + Returns + ------- + X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) + Transformed array. + """ + check_is_fitted(self) + + xp, _ = get_namespace(X) + + X = validate_data( + self, + X, + accept_sparse=("csr", "csc"), + copy=self.copy, + reset=False, + dtype=_array_api.supported_float_dtypes(xp), + force_writeable=True, + ensure_all_finite="allow-nan", + ) + + if sparse.issparse(X): + inplace_column_scale(X, 1.0 / self.scale_) + else: + X /= self.scale_ + return X + + def inverse_transform(self, X): + """Scale back the data to the original representation. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data that should be transformed back. + + Returns + ------- + X_original : {ndarray, sparse matrix} of shape (n_samples, n_features) + Transformed array. + """ + check_is_fitted(self) + + xp, _ = get_namespace(X) + + X = check_array( + X, + accept_sparse=("csr", "csc"), + copy=self.copy, + dtype=_array_api.supported_float_dtypes(xp), + force_writeable=True, + ensure_all_finite="allow-nan", + ) + + if sparse.issparse(X): + inplace_column_scale(X, self.scale_) + else: + X *= self.scale_ + return X + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + tags.input_tags.sparse = True + return tags + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "axis": [Options(Integral, {0, 1})], + }, + prefer_skip_nested_validation=False, +) +def maxabs_scale(X, *, axis=0, copy=True): + """Scale each feature to the [-1, 1] range without breaking the sparsity. 
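Aside, not part of the patched file: a small sketch of the point made just above; max-abs scaling only divides each column by a positive constant, so zeros stay zeros and the sparse structure is preserved. The data reuse the MaxAbsScaler docstring example.

    import numpy as np
    import scipy.sparse as sp
    from sklearn.preprocessing import MaxAbsScaler

    X = sp.csr_matrix(np.array([[ 1.0, -1.0,  2.0],
                                [ 2.0,  0.0,  0.0],
                                [ 0.0,  1.0, -1.0]]))

    Xt = MaxAbsScaler().fit_transform(X)
    print(sp.issparse(Xt), Xt.nnz == X.nnz)   # True True: no fill-in, no lost entries
    print(Xt.toarray())                       # each column divided by its max absolute value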
+ + This estimator scales each feature individually such + that the maximal absolute value of each feature in the + training set will be 1.0. + + This scaler can also be applied to sparse CSR or CSC matrices. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data. + + axis : {0, 1}, default=0 + Axis used to scale along. If 0, independently scale each feature, + otherwise (if 1) scale each sample. + + copy : bool, default=True + If False, try to avoid a copy and scale in place. + This is not guaranteed to always work in place; e.g. if the data is + a numpy array with an int dtype, a copy will be returned even with + copy=False. + + Returns + ------- + X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) + The transformed data. + + .. warning:: Risk of data leak + + Do not use :func:`~sklearn.preprocessing.maxabs_scale` unless you know + what you are doing. A common mistake is to apply it to the entire data + *before* splitting into training and test sets. This will bias the + model evaluation because information would have leaked from the test + set to the training set. + In general, we recommend using + :class:`~sklearn.preprocessing.MaxAbsScaler` within a + :ref:`Pipeline ` in order to prevent most risks of data + leaking: `pipe = make_pipeline(MaxAbsScaler(), LogisticRegression())`. + + See Also + -------- + MaxAbsScaler : Performs scaling to the [-1, 1] range using + the Transformer API (e.g. as part of a preprocessing + :class:`~sklearn.pipeline.Pipeline`). + + Notes + ----- + NaNs are treated as missing values: disregarded to compute the statistics, + and maintained during the data transformation. + + For a comparison of the different scalers, transformers, and normalizers, + see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`. + + Examples + -------- + >>> from sklearn.preprocessing import maxabs_scale + >>> X = [[-2, 1, 2], [-1, 0, 1]] + >>> maxabs_scale(X, axis=0) # scale each column independently + array([[-1. , 1. , 1. ], + [-0.5, 0. , 0.5]]) + >>> maxabs_scale(X, axis=1) # scale each row independently + array([[-1. , 0.5, 1. ], + [-1. , 0. , 1. ]]) + """ + # Unlike the scaler object, this function allows 1d input. + + # If copy is required, it will be done inside the scaler object. + X = check_array( + X, + accept_sparse=("csr", "csc"), + copy=False, + ensure_2d=False, + dtype=FLOAT_DTYPES, + ensure_all_finite="allow-nan", + ) + original_ndim = X.ndim + + if original_ndim == 1: + X = X.reshape(X.shape[0], 1) + + s = MaxAbsScaler(copy=copy) + if axis == 0: + X = s.fit_transform(X) + else: + X = s.fit_transform(X.T).T + + if original_ndim == 1: + X = X.ravel() + + return X + + +class RobustScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): + """Scale features using statistics that are robust to outliers. + + This Scaler removes the median and scales the data according to + the quantile range (defaults to IQR: Interquartile Range). + The IQR is the range between the 1st quartile (25th quantile) + and the 3rd quartile (75th quantile). + + Centering and scaling happen independently on each feature by + computing the relevant statistics on the samples in the training + set. Median and interquartile range are then stored to be used on + later data using the :meth:`transform` method. + + Standardization of a dataset is a common preprocessing for many machine + learning estimators. Typically this is done by removing the mean and + scaling to unit variance. 
However, outliers can often influence the sample + mean / variance in a negative way. In such cases, using the median and the + interquartile range often give better results. For an example visualization + and comparison to other scalers, refer to :ref:`Compare RobustScaler with + other scalers `. + + .. versionadded:: 0.17 + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + with_centering : bool, default=True + If `True`, center the data before scaling. + This will cause :meth:`transform` to raise an exception when attempted + on sparse matrices, because centering them entails building a dense + matrix which in common use cases is likely to be too large to fit in + memory. + + with_scaling : bool, default=True + If `True`, scale the data to interquartile range. + + quantile_range : tuple (q_min, q_max), 0.0 < q_min < q_max < 100.0, \ + default=(25.0, 75.0) + Quantile range used to calculate `scale_`. By default this is equal to + the IQR, i.e., `q_min` is the first quantile and `q_max` is the third + quantile. + + .. versionadded:: 0.18 + + copy : bool, default=True + If `False`, try to avoid a copy and do inplace scaling instead. + This is not guaranteed to always work inplace; e.g. if the data is + not a NumPy array or scipy.sparse CSR matrix, a copy may still be + returned. + + unit_variance : bool, default=False + If `True`, scale data so that normally distributed features have a + variance of 1. In general, if the difference between the x-values of + `q_max` and `q_min` for a standard normal distribution is greater + than 1, the dataset will be scaled down. If less than 1, the dataset + will be scaled up. + + .. versionadded:: 0.24 + + Attributes + ---------- + center_ : array of floats + The median value for each feature in the training set. + + scale_ : array of floats + The (scaled) interquartile range for each feature in the training set. + + .. versionadded:: 0.17 + *scale_* attribute. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + robust_scale : Equivalent function without the estimator API. + sklearn.decomposition.PCA : Further removes the linear correlation across + features with 'whiten=True'. + + Notes + ----- + + https://en.wikipedia.org/wiki/Median + https://en.wikipedia.org/wiki/Interquartile_range + + Examples + -------- + >>> from sklearn.preprocessing import RobustScaler + >>> X = [[ 1., -2., 2.], + ... [ -2., 1., 3.], + ... [ 4., 1., -2.]] + >>> transformer = RobustScaler().fit(X) + >>> transformer + RobustScaler() + >>> transformer.transform(X) + array([[ 0. , -2. , 0. ], + [-1. , 0. , 0.4], + [ 1. , 0. , -1.6]]) + """ + + _parameter_constraints: dict = { + "with_centering": ["boolean"], + "with_scaling": ["boolean"], + "quantile_range": [tuple], + "copy": ["boolean"], + "unit_variance": ["boolean"], + } + + def __init__( + self, + *, + with_centering=True, + with_scaling=True, + quantile_range=(25.0, 75.0), + copy=True, + unit_variance=False, + ): + self.with_centering = with_centering + self.with_scaling = with_scaling + self.quantile_range = quantile_range + self.unit_variance = unit_variance + self.copy = copy + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Compute the median and quantiles to be used for scaling. 
+ + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data used to compute the median and quantiles + used for later scaling along the features axis. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + self : object + Fitted scaler. + """ + # at fit, convert sparse matrices to csc for optimized computation of + # the quantiles + X = validate_data( + self, + X, + accept_sparse="csc", + dtype=FLOAT_DTYPES, + ensure_all_finite="allow-nan", + ) + + q_min, q_max = self.quantile_range + if not 0 <= q_min <= q_max <= 100: + raise ValueError("Invalid quantile range: %s" % str(self.quantile_range)) + + if self.with_centering: + if sparse.issparse(X): + raise ValueError( + "Cannot center sparse matrices: use `with_centering=False`" + " instead. See docstring for motivation and alternatives." + ) + self.center_ = np.nanmedian(X, axis=0) + else: + self.center_ = None + + if self.with_scaling: + quantiles = [] + for feature_idx in range(X.shape[1]): + if sparse.issparse(X): + column_nnz_data = X.data[ + X.indptr[feature_idx] : X.indptr[feature_idx + 1] + ] + column_data = np.zeros(shape=X.shape[0], dtype=X.dtype) + column_data[: len(column_nnz_data)] = column_nnz_data + else: + column_data = X[:, feature_idx] + + quantiles.append(np.nanpercentile(column_data, self.quantile_range)) + + quantiles = np.transpose(quantiles) + + self.scale_ = quantiles[1] - quantiles[0] + self.scale_ = _handle_zeros_in_scale(self.scale_, copy=False) + if self.unit_variance: + adjust = stats.norm.ppf(q_max / 100.0) - stats.norm.ppf(q_min / 100.0) + self.scale_ = self.scale_ / adjust + else: + self.scale_ = None + + return self + + def transform(self, X): + """Center and scale the data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data used to scale along the specified axis. + + Returns + ------- + X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) + Transformed array. + """ + check_is_fitted(self) + X = validate_data( + self, + X, + accept_sparse=("csr", "csc"), + copy=self.copy, + dtype=FLOAT_DTYPES, + force_writeable=True, + reset=False, + ensure_all_finite="allow-nan", + ) + + if sparse.issparse(X): + if self.with_scaling: + inplace_column_scale(X, 1.0 / self.scale_) + else: + if self.with_centering: + X -= self.center_ + if self.with_scaling: + X /= self.scale_ + return X + + def inverse_transform(self, X): + """Scale back the data to the original representation. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The rescaled data to be transformed back. + + Returns + ------- + X_original : {ndarray, sparse matrix} of shape (n_samples, n_features) + Transformed array. 
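Aside, not part of the patched file: a small check, on dense data, that the default transform matches centering on the column median and scaling by the 25th-75th percentile range, as described above. The data reuse the RobustScaler docstring example.

    import numpy as np
    from sklearn.preprocessing import RobustScaler

    X = np.array([[ 1.0, -2.0,  2.0],
                  [-2.0,  1.0,  3.0],
                  [ 4.0,  1.0, -2.0]])

    center = np.median(X, axis=0)
    q25, q75 = np.percentile(X, [25.0, 75.0], axis=0)
    manual = (X - center) / (q75 - q25)

    assert np.allclose(RobustScaler().fit_transform(X), manual)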
+ """ + check_is_fitted(self) + X = check_array( + X, + accept_sparse=("csr", "csc"), + copy=self.copy, + dtype=FLOAT_DTYPES, + force_writeable=True, + ensure_all_finite="allow-nan", + ) + + if sparse.issparse(X): + if self.with_scaling: + inplace_column_scale(X, self.scale_) + else: + if self.with_scaling: + X *= self.scale_ + if self.with_centering: + X += self.center_ + return X + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = not self.with_centering + tags.input_tags.allow_nan = True + return tags + + +@validate_params( + {"X": ["array-like", "sparse matrix"], "axis": [Options(Integral, {0, 1})]}, + prefer_skip_nested_validation=False, +) +def robust_scale( + X, + *, + axis=0, + with_centering=True, + with_scaling=True, + quantile_range=(25.0, 75.0), + copy=True, + unit_variance=False, +): + """Standardize a dataset along any axis. + + Center to the median and component wise scale + according to the interquartile range. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_sample, n_features) + The data to center and scale. + + axis : int, default=0 + Axis used to compute the medians and IQR along. If 0, + independently scale each feature, otherwise (if 1) scale + each sample. + + with_centering : bool, default=True + If `True`, center the data before scaling. + + with_scaling : bool, default=True + If `True`, scale the data to unit variance (or equivalently, + unit standard deviation). + + quantile_range : tuple (q_min, q_max), 0.0 < q_min < q_max < 100.0,\ + default=(25.0, 75.0) + Quantile range used to calculate `scale_`. By default this is equal to + the IQR, i.e., `q_min` is the first quantile and `q_max` is the third + quantile. + + .. versionadded:: 0.18 + + copy : bool, default=True + If False, try to avoid a copy and scale in place. + This is not guaranteed to always work in place; e.g. if the data is + a numpy array with an int dtype, a copy will be returned even with + copy=False. + + unit_variance : bool, default=False + If `True`, scale data so that normally distributed features have a + variance of 1. In general, if the difference between the x-values of + `q_max` and `q_min` for a standard normal distribution is greater + than 1, the dataset will be scaled down. If less than 1, the dataset + will be scaled up. + + .. versionadded:: 0.24 + + Returns + ------- + X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) + The transformed data. + + See Also + -------- + RobustScaler : Performs centering and scaling using the Transformer API + (e.g. as part of a preprocessing :class:`~sklearn.pipeline.Pipeline`). + + Notes + ----- + This implementation will refuse to center scipy.sparse matrices + since it would make them non-sparse and would potentially crash the + program with memory exhaustion problems. + + Instead the caller is expected to either set explicitly + `with_centering=False` (in that case, only variance scaling will be + performed on the features of the CSR matrix) or to call `X.toarray()` + if he/she expects the materialized dense array to fit in memory. + + To avoid memory copy the caller should pass a CSR matrix. + + For a comparison of the different scalers, transformers, and normalizers, + see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`. + + .. warning:: Risk of data leak + + Do not use :func:`~sklearn.preprocessing.robust_scale` unless you know + what you are doing. 
A common mistake is to apply it to the entire data + *before* splitting into training and test sets. This will bias the + model evaluation because information would have leaked from the test + set to the training set. + In general, we recommend using + :class:`~sklearn.preprocessing.RobustScaler` within a + :ref:`Pipeline ` in order to prevent most risks of data + leaking: `pipe = make_pipeline(RobustScaler(), LogisticRegression())`. + + Examples + -------- + >>> from sklearn.preprocessing import robust_scale + >>> X = [[-2, 1, 2], [-1, 0, 1]] + >>> robust_scale(X, axis=0) # scale each column independently + array([[-1., 1., 1.], + [ 1., -1., -1.]]) + >>> robust_scale(X, axis=1) # scale each row independently + array([[-1.5, 0. , 0.5], + [-1. , 0. , 1. ]]) + """ + X = check_array( + X, + accept_sparse=("csr", "csc"), + copy=False, + ensure_2d=False, + dtype=FLOAT_DTYPES, + ensure_all_finite="allow-nan", + ) + original_ndim = X.ndim + + if original_ndim == 1: + X = X.reshape(X.shape[0], 1) + + s = RobustScaler( + with_centering=with_centering, + with_scaling=with_scaling, + quantile_range=quantile_range, + unit_variance=unit_variance, + copy=copy, + ) + if axis == 0: + X = s.fit_transform(X) + else: + X = s.fit_transform(X.T).T + + if original_ndim == 1: + X = X.ravel() + + return X + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "norm": [StrOptions({"l1", "l2", "max"})], + "axis": [Options(Integral, {0, 1})], + "copy": ["boolean"], + "return_norm": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def normalize(X, norm="l2", *, axis=1, copy=True, return_norm=False): + """Scale input vectors individually to unit norm (vector length). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to normalize, element by element. + scipy.sparse matrices should be in CSR format to avoid an + un-necessary copy. + + norm : {'l1', 'l2', 'max'}, default='l2' + The norm to use to normalize each non zero sample (or each non-zero + feature if axis is 0). + + axis : {0, 1}, default=1 + Define axis used to normalize the data along. If 1, independently + normalize each sample, otherwise (if 0) normalize each feature. + + copy : bool, default=True + If False, try to avoid a copy and normalize in place. + This is not guaranteed to always work in place; e.g. if the data is + a numpy array with an int dtype, a copy will be returned even with + copy=False. + + return_norm : bool, default=False + Whether to return the computed norms. + + Returns + ------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + Normalized input X. + + norms : ndarray of shape (n_samples, ) if axis=1 else (n_features, ) + An array of norms along given axis for X. + When X is sparse, a NotImplementedError will be raised + for norm 'l1' or 'l2'. + + See Also + -------- + Normalizer : Performs normalization using the Transformer API + (e.g. as part of a preprocessing :class:`~sklearn.pipeline.Pipeline`). + + Notes + ----- + For a comparison of the different scalers, transformers, and normalizers, + see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`. + + Examples + -------- + >>> from sklearn.preprocessing import normalize + >>> X = [[-2, 1, 2], [-1, 0, 1]] + >>> normalize(X, norm="l1") # L1 normalization each row independently + array([[-0.4, 0.2, 0.4], + [-0.5, 0. , 0.5]]) + >>> normalize(X, norm="l2") # L2 normalization each row independently + array([[-0.67, 0.33, 0.67], + [-0.71, 0. 
, 0.71]]) + """ + if axis == 0: + sparse_format = "csc" + else: # axis == 1: + sparse_format = "csr" + + xp, _ = get_namespace(X) + + X = check_array( + X, + accept_sparse=sparse_format, + copy=copy, + estimator="the normalize function", + dtype=_array_api.supported_float_dtypes(xp), + force_writeable=True, + ) + if axis == 0: + X = X.T + + if sparse.issparse(X): + if return_norm and norm in ("l1", "l2"): + raise NotImplementedError( + "return_norm=True is not implemented " + "for sparse matrices with norm 'l1' " + "or norm 'l2'" + ) + if norm == "l1": + inplace_csr_row_normalize_l1(X) + elif norm == "l2": + inplace_csr_row_normalize_l2(X) + elif norm == "max": + mins, maxes = min_max_axis(X, 1) + norms = np.maximum(abs(mins), maxes) + norms_elementwise = norms.repeat(np.diff(X.indptr)) + mask = norms_elementwise != 0 + X.data[mask] /= norms_elementwise[mask] + else: + if norm == "l1": + norms = xp.sum(xp.abs(X), axis=1) + elif norm == "l2": + norms = row_norms(X) + elif norm == "max": + norms = xp.max(xp.abs(X), axis=1) + norms = _handle_zeros_in_scale(norms, copy=False) + X /= norms[:, None] + + if axis == 0: + X = X.T + + if return_norm: + return X, norms + else: + return X + + +class Normalizer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): + """Normalize samples individually to unit norm. + + Each sample (i.e. each row of the data matrix) with at least one + non zero component is rescaled independently of other samples so + that its norm (l1, l2 or inf) equals one. + + This transformer is able to work both with dense numpy arrays and + scipy.sparse matrix (use CSR format if you want to avoid the burden of + a copy / conversion). + + Scaling inputs to unit norms is a common operation for text + classification or clustering for instance. For instance the dot + product of two l2-normalized TF-IDF vectors is the cosine similarity + of the vectors and is the base similarity metric for the Vector + Space Model commonly used by the Information Retrieval community. + + For an example visualization, refer to :ref:`Compare Normalizer with other + scalers `. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + norm : {'l1', 'l2', 'max'}, default='l2' + The norm to use to normalize each non zero sample. If norm='max' + is used, values will be rescaled by the maximum of the absolute + values. + + copy : bool, default=True + Set to False to perform inplace row normalization and avoid a + copy (if the input is already a numpy array or a scipy.sparse + CSR matrix). + + Attributes + ---------- + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + normalize : Equivalent function without the estimator API. + + Notes + ----- + This estimator is :term:`stateless` and does not need to be fitted. + However, we recommend to call :meth:`fit_transform` instead of + :meth:`transform`, as parameter validation is only performed in + :meth:`fit`. + + Examples + -------- + >>> from sklearn.preprocessing import Normalizer + >>> X = [[4, 1, 2, 2], + ... [1, 3, 9, 3], + ... [5, 7, 5, 1]] + >>> transformer = Normalizer().fit(X) # fit does nothing. 
+ >>> transformer + Normalizer() + >>> transformer.transform(X) + array([[0.8, 0.2, 0.4, 0.4], + [0.1, 0.3, 0.9, 0.3], + [0.5, 0.7, 0.5, 0.1]]) + """ + + _parameter_constraints: dict = { + "norm": [StrOptions({"l1", "l2", "max"})], + "copy": ["boolean"], + } + + def __init__(self, norm="l2", *, copy=True): + self.norm = norm + self.copy = copy + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Only validates estimator's parameters. + + This method allows to: (i) validate the estimator's parameters and + (ii) be consistent with the scikit-learn transformer API. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to estimate the normalization parameters. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + self : object + Fitted transformer. + """ + validate_data(self, X, accept_sparse="csr") + return self + + def transform(self, X, copy=None): + """Scale each non zero row of X to unit norm. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to normalize, row by row. scipy.sparse matrices should be + in CSR format to avoid an un-necessary copy. + + copy : bool, default=None + Copy the input X or not. + + Returns + ------- + X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) + Transformed array. + """ + copy = copy if copy is not None else self.copy + X = validate_data( + self, X, accept_sparse="csr", force_writeable=True, copy=copy, reset=False + ) + return normalize(X, norm=self.norm, axis=1, copy=False) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + tags.requires_fit = False + tags.array_api_support = True + return tags + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "threshold": [Interval(Real, None, None, closed="neither")], + "copy": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def binarize(X, *, threshold=0.0, copy=True): + """Boolean thresholding of array-like or scipy.sparse matrix. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to binarize, element by element. + scipy.sparse matrices should be in CSR or CSC format to avoid an + un-necessary copy. + + threshold : float, default=0.0 + Feature values below or equal to this are replaced by 0, above it by 1. + Threshold may not be less than 0 for operations on sparse matrices. + + copy : bool, default=True + If False, try to avoid a copy and binarize in place. + This is not guaranteed to always work in place; e.g. if the data is + a numpy array with an object dtype, a copy will be returned even with + copy=False. + + Returns + ------- + X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) + The transformed data. + + See Also + -------- + Binarizer : Performs binarization using the Transformer API + (e.g. as part of a preprocessing :class:`~sklearn.pipeline.Pipeline`). 
+ + Examples + -------- + >>> from sklearn.preprocessing import binarize + >>> X = [[0.4, 0.6, 0.5], [0.6, 0.1, 0.2]] + >>> binarize(X, threshold=0.5) + array([[0., 1., 0.], + [1., 0., 0.]]) + """ + X = check_array(X, accept_sparse=["csr", "csc"], force_writeable=True, copy=copy) + if sparse.issparse(X): + if threshold < 0: + raise ValueError("Cannot binarize a sparse matrix with threshold < 0") + cond = X.data > threshold + not_cond = np.logical_not(cond) + X.data[cond] = 1 + X.data[not_cond] = 0 + X.eliminate_zeros() + else: + xp, _, device = get_namespace_and_device(X) + float_dtype = _find_matching_floating_dtype(X, threshold, xp=xp) + cond = xp.astype(X, float_dtype, copy=False) > threshold + not_cond = xp.logical_not(cond) + X[cond] = 1 + X[not_cond] = 0 + return X + + +class Binarizer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): + """Binarize data (set feature values to 0 or 1) according to a threshold. + + Values greater than the threshold map to 1, while values less than + or equal to the threshold map to 0. With the default threshold of 0, + only positive values map to 1. + + Binarization is a common operation on text count data where the + analyst can decide to only consider the presence or absence of a + feature rather than a quantified number of occurrences for instance. + + It can also be used as a pre-processing step for estimators that + consider boolean random variables (e.g. modelled using the Bernoulli + distribution in a Bayesian setting). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + threshold : float, default=0.0 + Feature values below or equal to this are replaced by 0, above it by 1. + Threshold may not be less than 0 for operations on sparse matrices. + + copy : bool, default=True + Set to False to perform inplace binarization and avoid a copy (if + the input is already a numpy array or a scipy.sparse CSR matrix). + + Attributes + ---------- + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + binarize : Equivalent function without the estimator API. + KBinsDiscretizer : Bin continuous data into intervals. + OneHotEncoder : Encode categorical features as a one-hot numeric array. + + Notes + ----- + If the input is a sparse matrix, only the non-zero values are subject + to update by the :class:`Binarizer` class. + + This estimator is :term:`stateless` and does not need to be fitted. + However, we recommend to call :meth:`fit_transform` instead of + :meth:`transform`, as parameter validation is only performed in + :meth:`fit`. + + Examples + -------- + >>> from sklearn.preprocessing import Binarizer + >>> X = [[ 1., -1., 2.], + ... [ 2., 0., 0.], + ... [ 0., 1., -1.]] + >>> transformer = Binarizer().fit(X) # fit does nothing. + >>> transformer + Binarizer() + >>> transformer.transform(X) + array([[1., 0., 1.], + [1., 0., 0.], + [0., 1., 0.]]) + """ + + _parameter_constraints: dict = { + "threshold": [Real], + "copy": ["boolean"], + } + + def __init__(self, *, threshold=0.0, copy=True): + self.threshold = threshold + self.copy = copy + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Only validates estimator's parameters. 
+ + This method allows to: (i) validate the estimator's parameters and + (ii) be consistent with the scikit-learn transformer API. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data. + + y : None + Ignored. + + Returns + ------- + self : object + Fitted transformer. + """ + validate_data(self, X, accept_sparse="csr") + return self + + def transform(self, X, copy=None): + """Binarize each element of X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to binarize, element by element. + scipy.sparse matrices should be in CSR format to avoid an + un-necessary copy. + + copy : bool + Copy the input X or not. + + Returns + ------- + X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) + Transformed array. + """ + copy = copy if copy is not None else self.copy + # TODO: This should be refactored because binarize also calls + # check_array + X = validate_data( + self, + X, + accept_sparse=["csr", "csc"], + force_writeable=True, + copy=copy, + reset=False, + ) + return binarize(X, threshold=self.threshold, copy=False) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.requires_fit = False + tags.array_api_support = True + tags.input_tags.sparse = True + return tags + + +class KernelCenterer(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): + r"""Center an arbitrary kernel matrix :math:`K`. + + Let define a kernel :math:`K` such that: + + .. math:: + K(X, Y) = \phi(X) . \phi(Y)^{T} + + :math:`\phi(X)` is a function mapping of rows of :math:`X` to a + Hilbert space and :math:`K` is of shape `(n_samples, n_samples)`. + + This class allows to compute :math:`\tilde{K}(X, Y)` such that: + + .. math:: + \tilde{K(X, Y)} = \tilde{\phi}(X) . \tilde{\phi}(Y)^{T} + + :math:`\tilde{\phi}(X)` is the centered mapped data in the Hilbert + space. + + `KernelCenterer` centers the features without explicitly computing the + mapping :math:`\phi(\cdot)`. Working with centered kernels is sometime + expected when dealing with algebra computation such as eigendecomposition + for :class:`~sklearn.decomposition.KernelPCA` for instance. + + Read more in the :ref:`User Guide `. + + Attributes + ---------- + K_fit_rows_ : ndarray of shape (n_samples,) + Average of each column of kernel matrix. + + K_fit_all_ : float + Average of kernel matrix. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + sklearn.kernel_approximation.Nystroem : Approximate a kernel map + using a subset of the training data. + + References + ---------- + .. [1] `Schölkopf, Bernhard, Alexander Smola, and Klaus-Robert Müller. + "Nonlinear component analysis as a kernel eigenvalue problem." + Neural computation 10.5 (1998): 1299-1319. + `_ + + Examples + -------- + >>> from sklearn.preprocessing import KernelCenterer + >>> from sklearn.metrics.pairwise import pairwise_kernels + >>> X = [[ 1., -2., 2.], + ... [ -2., 1., 3.], + ... 
[ 4., 1., -2.]] + >>> K = pairwise_kernels(X, metric='linear') + >>> K + array([[ 9., 2., -2.], + [ 2., 14., -13.], + [ -2., -13., 21.]]) + >>> transformer = KernelCenterer().fit(K) + >>> transformer + KernelCenterer() + >>> transformer.transform(K) + array([[ 5., 0., -5.], + [ 0., 14., -14.], + [ -5., -14., 19.]]) + """ + + # X is called K in these methods. + __metadata_request__transform = {"K": metadata_routing.UNUSED} + __metadata_request__fit = {"K": metadata_routing.UNUSED} + + def fit(self, K, y=None): + """Fit KernelCenterer. + + Parameters + ---------- + K : ndarray of shape (n_samples, n_samples) + Kernel matrix. + + y : None + Ignored. + + Returns + ------- + self : object + Returns the instance itself. + """ + xp, _ = get_namespace(K) + + K = validate_data(self, K, dtype=_array_api.supported_float_dtypes(xp)) + + if K.shape[0] != K.shape[1]: + raise ValueError( + "Kernel matrix must be a square matrix." + " Input is a {}x{} matrix.".format(K.shape[0], K.shape[1]) + ) + + n_samples = K.shape[0] + self.K_fit_rows_ = xp.sum(K, axis=0) / n_samples + self.K_fit_all_ = xp.sum(self.K_fit_rows_) / n_samples + return self + + def transform(self, K, copy=True): + """Center kernel matrix. + + Parameters + ---------- + K : ndarray of shape (n_samples1, n_samples2) + Kernel matrix. + + copy : bool, default=True + Set to False to perform inplace computation. + + Returns + ------- + K_new : ndarray of shape (n_samples1, n_samples2) + Returns the instance itself. + """ + check_is_fitted(self) + + xp, _ = get_namespace(K) + + K = validate_data( + self, + K, + copy=copy, + force_writeable=True, + dtype=_array_api.supported_float_dtypes(xp), + reset=False, + ) + + K_pred_cols = (xp.sum(K, axis=1) / self.K_fit_rows_.shape[0])[:, None] + + K -= self.K_fit_rows_ + K -= K_pred_cols + K += self.K_fit_all_ + + return K + + @property + def _n_features_out(self): + """Number of transformed output features.""" + # Used by ClassNamePrefixFeaturesOutMixin. This model preserves the + # number of input features but this is not a one-to-one mapping in the + # usual sense. Hence the choice not to use OneToOneFeatureMixin to + # implement get_feature_names_out for this class. + return self.n_features_in_ + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.pairwise = True + tags.array_api_support = True + return tags + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "value": [Interval(Real, None, None, closed="neither")], + }, + prefer_skip_nested_validation=True, +) +def add_dummy_feature(X, value=1.0): + """Augment dataset with an additional dummy feature. + + This is useful for fitting an intercept term with implementations which + cannot otherwise fit it directly. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Data. + + value : float + Value to use for the dummy feature. + + Returns + ------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features + 1) + Same data with dummy feature added as first column. + + Examples + -------- + >>> from sklearn.preprocessing import add_dummy_feature + >>> add_dummy_feature([[0, 1], [1, 0]]) + array([[1., 0., 1.], + [1., 1., 0.]]) + """ + X = check_array(X, accept_sparse=["csc", "csr", "coo"], dtype=FLOAT_DTYPES) + n_samples, n_features = X.shape + shape = (n_samples, n_features + 1) + if sparse.issparse(X): + if X.format == "coo": + # Shift columns to the right. + col = X.col + 1 + # Column indices of dummy feature are 0 everywhere. 
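+ # Illustration (hypothetical; the docstring example above actually passes a
+ # dense list and therefore takes the dense branch): if the same toy matrix
+ # X = [[0, 1], [1, 0]] were passed as a COO sparse matrix with value=1.0,
+ # the arrays assembled below would be
+ #   col  = [0, 0, 2, 1]  (two dummy entries in column 0, then the original
+ #                         column indices shifted by one)
+ #   row  = [0, 1, 0, 1]
+ #   data = [1.0, 1.0, 1.0, 1.0]
+ # which densifies to [[1., 0., 1.], [1., 1., 0.]], the same result as the
+ # docstring output.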
+ col = np.concatenate((np.zeros(n_samples), col)) + # Row indices of dummy feature are 0, ..., n_samples-1. + row = np.concatenate((np.arange(n_samples), X.row)) + # Prepend the dummy feature n_samples times. + data = np.concatenate((np.full(n_samples, value), X.data)) + return sparse.coo_matrix((data, (row, col)), shape) + elif X.format == "csc": + # Shift index pointers since we need to add n_samples elements. + indptr = X.indptr + n_samples + # indptr[0] must be 0. + indptr = np.concatenate((np.array([0]), indptr)) + # Row indices of dummy feature are 0, ..., n_samples-1. + indices = np.concatenate((np.arange(n_samples), X.indices)) + # Prepend the dummy feature n_samples times. + data = np.concatenate((np.full(n_samples, value), X.data)) + return sparse.csc_matrix((data, indices, indptr), shape) + else: + klass = X.__class__ + return klass(add_dummy_feature(X.tocoo(), value)) + else: + return np.hstack((np.full((n_samples, 1), value), X)) + + +class QuantileTransformer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): + """Transform features using quantiles information. + + This method transforms the features to follow a uniform or a normal + distribution. Therefore, for a given feature, this transformation tends + to spread out the most frequent values. It also reduces the impact of + (marginal) outliers: this is therefore a robust preprocessing scheme. + + The transformation is applied on each feature independently. First an + estimate of the cumulative distribution function of a feature is + used to map the original values to a uniform distribution. The obtained + values are then mapped to the desired output distribution using the + associated quantile function. Features values of new/unseen data that fall + below or above the fitted range will be mapped to the bounds of the output + distribution. Note that this transform is non-linear. It may distort linear + correlations between variables measured at the same scale but renders + variables measured at different scales more directly comparable. + + For example visualizations, refer to :ref:`Compare QuantileTransformer with + other scalers `. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.19 + + Parameters + ---------- + n_quantiles : int, default=1000 or n_samples + Number of quantiles to be computed. It corresponds to the number + of landmarks used to discretize the cumulative distribution function. + If n_quantiles is larger than the number of samples, n_quantiles is set + to the number of samples as a larger number of quantiles does not give + a better approximation of the cumulative distribution function + estimator. + + output_distribution : {'uniform', 'normal'}, default='uniform' + Marginal distribution for the transformed data. The choices are + 'uniform' (default) or 'normal'. + + ignore_implicit_zeros : bool, default=False + Only applies to sparse matrices. If True, the sparse entries of the + matrix are discarded to compute the quantile statistics. If False, + these entries are treated as zeros. + + subsample : int or None, default=10_000 + Maximum number of samples used to estimate the quantiles for + computational efficiency. Note that the subsampling procedure may + differ for value-identical sparse and dense matrices. + Disable subsampling by setting `subsample=None`. + + .. versionadded:: 1.5 + The option `None` to disable subsampling was added. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for subsampling and smoothing + noise. 
+ Please see ``subsample`` for more details. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + copy : bool, default=True + Set to False to perform inplace transformation and avoid a copy (if the + input is already a numpy array). + + Attributes + ---------- + n_quantiles_ : int + The actual number of quantiles used to discretize the cumulative + distribution function. + + quantiles_ : ndarray of shape (n_quantiles, n_features) + The values corresponding the quantiles of reference. + + references_ : ndarray of shape (n_quantiles, ) + Quantiles of references. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + quantile_transform : Equivalent function without the estimator API. + PowerTransformer : Perform mapping to a normal distribution using a power + transform. + StandardScaler : Perform standardization that is faster, but less robust + to outliers. + RobustScaler : Perform robust standardization that removes the influence + of outliers but does not put outliers and inliers on the same scale. + + Notes + ----- + NaNs are treated as missing values: disregarded in fit, and maintained in + transform. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.preprocessing import QuantileTransformer + >>> rng = np.random.RandomState(0) + >>> X = np.sort(rng.normal(loc=0.5, scale=0.25, size=(25, 1)), axis=0) + >>> qt = QuantileTransformer(n_quantiles=10, random_state=0) + >>> qt.fit_transform(X) + array([...]) + """ + + _parameter_constraints: dict = { + "n_quantiles": [Interval(Integral, 1, None, closed="left")], + "output_distribution": [StrOptions({"uniform", "normal"})], + "ignore_implicit_zeros": ["boolean"], + "subsample": [Interval(Integral, 1, None, closed="left"), None], + "random_state": ["random_state"], + "copy": ["boolean"], + } + + def __init__( + self, + *, + n_quantiles=1000, + output_distribution="uniform", + ignore_implicit_zeros=False, + subsample=10_000, + random_state=None, + copy=True, + ): + self.n_quantiles = n_quantiles + self.output_distribution = output_distribution + self.ignore_implicit_zeros = ignore_implicit_zeros + self.subsample = subsample + self.random_state = random_state + self.copy = copy + + def _dense_fit(self, X, random_state): + """Compute percentiles for dense matrices. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + The data used to scale along the features axis. + """ + if self.ignore_implicit_zeros: + warnings.warn( + "'ignore_implicit_zeros' takes effect only with" + " sparse matrix. This parameter has no effect." + ) + + n_samples, n_features = X.shape + references = self.references_ * 100 + + if self.subsample is not None and self.subsample < n_samples: + # Take a subsample of `X` + X = resample( + X, replace=False, n_samples=self.subsample, random_state=random_state + ) + + self.quantiles_ = np.nanpercentile(X, references, axis=0) + # Due to floating-point precision error in `np.nanpercentile`, + # make sure that quantiles are monotonically increasing. + # Upstream issue in numpy: + # https://github.com/numpy/numpy/issues/14685 + self.quantiles_ = np.maximum.accumulate(self.quantiles_) + + def _sparse_fit(self, X, random_state): + """Compute percentiles for sparse matrices. 
+ + Parameters + ---------- + X : sparse matrix of shape (n_samples, n_features) + The data used to scale along the features axis. The sparse matrix + needs to be nonnegative. If a sparse matrix is provided, + it will be converted into a sparse ``csc_matrix``. + """ + n_samples, n_features = X.shape + references = self.references_ * 100 + + self.quantiles_ = [] + for feature_idx in range(n_features): + column_nnz_data = X.data[X.indptr[feature_idx] : X.indptr[feature_idx + 1]] + if self.subsample is not None and len(column_nnz_data) > self.subsample: + column_subsample = self.subsample * len(column_nnz_data) // n_samples + if self.ignore_implicit_zeros: + column_data = np.zeros(shape=column_subsample, dtype=X.dtype) + else: + column_data = np.zeros(shape=self.subsample, dtype=X.dtype) + column_data[:column_subsample] = random_state.choice( + column_nnz_data, size=column_subsample, replace=False + ) + else: + if self.ignore_implicit_zeros: + column_data = np.zeros(shape=len(column_nnz_data), dtype=X.dtype) + else: + column_data = np.zeros(shape=n_samples, dtype=X.dtype) + column_data[: len(column_nnz_data)] = column_nnz_data + + if not column_data.size: + # if no nnz, an error will be raised for computing the + # quantiles. Force the quantiles to be zeros. + self.quantiles_.append([0] * len(references)) + else: + self.quantiles_.append(np.nanpercentile(column_data, references)) + self.quantiles_ = np.transpose(self.quantiles_) + # due to floating-point precision error in `np.nanpercentile`, + # make sure the quantiles are monotonically increasing + # Upstream issue in numpy: + # https://github.com/numpy/numpy/issues/14685 + self.quantiles_ = np.maximum.accumulate(self.quantiles_) + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Compute the quantiles used for transforming. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data used to scale along the features axis. If a sparse + matrix is provided, it will be converted into a sparse + ``csc_matrix``. Additionally, the sparse matrix needs to be + nonnegative if `ignore_implicit_zeros` is False. + + y : None + Ignored. + + Returns + ------- + self : object + Fitted transformer. + """ + if self.subsample is not None and self.n_quantiles > self.subsample: + raise ValueError( + "The number of quantiles cannot be greater than" + " the number of samples used. Got {} quantiles" + " and {} samples.".format(self.n_quantiles, self.subsample) + ) + + X = self._check_inputs(X, in_fit=True, copy=False) + n_samples = X.shape[0] + + if self.n_quantiles > n_samples: + warnings.warn( + "n_quantiles (%s) is greater than the total number " + "of samples (%s). n_quantiles is set to " + "n_samples." 
% (self.n_quantiles, n_samples) + ) + self.n_quantiles_ = max(1, min(self.n_quantiles, n_samples)) + + rng = check_random_state(self.random_state) + + # Create the quantiles of reference + self.references_ = np.linspace(0, 1, self.n_quantiles_, endpoint=True) + if sparse.issparse(X): + self._sparse_fit(X, rng) + else: + self._dense_fit(X, rng) + + return self + + def _transform_col(self, X_col, quantiles, inverse): + """Private function to transform a single feature.""" + + output_distribution = self.output_distribution + + if not inverse: + lower_bound_x = quantiles[0] + upper_bound_x = quantiles[-1] + lower_bound_y = 0 + upper_bound_y = 1 + else: + lower_bound_x = 0 + upper_bound_x = 1 + lower_bound_y = quantiles[0] + upper_bound_y = quantiles[-1] + # for inverse transform, match a uniform distribution + with np.errstate(invalid="ignore"): # hide NaN comparison warnings + if output_distribution == "normal": + X_col = stats.norm.cdf(X_col) + # else output distribution is already a uniform distribution + + # find index for lower and higher bounds + with np.errstate(invalid="ignore"): # hide NaN comparison warnings + if output_distribution == "normal": + lower_bounds_idx = X_col - BOUNDS_THRESHOLD < lower_bound_x + upper_bounds_idx = X_col + BOUNDS_THRESHOLD > upper_bound_x + if output_distribution == "uniform": + lower_bounds_idx = X_col == lower_bound_x + upper_bounds_idx = X_col == upper_bound_x + + isfinite_mask = ~np.isnan(X_col) + X_col_finite = X_col[isfinite_mask] + if not inverse: + # Interpolate in one direction and in the other and take the + # mean. This is in case of repeated values in the features + # and hence repeated quantiles + # + # If we don't do this, only one extreme of the duplicated is + # used (the upper when we do ascending, and the + # lower for descending). We take the mean of these two + X_col[isfinite_mask] = 0.5 * ( + np.interp(X_col_finite, quantiles, self.references_) + - np.interp(-X_col_finite, -quantiles[::-1], -self.references_[::-1]) + ) + else: + X_col[isfinite_mask] = np.interp(X_col_finite, self.references_, quantiles) + + X_col[upper_bounds_idx] = upper_bound_y + X_col[lower_bounds_idx] = lower_bound_y + # for forward transform, match the output distribution + if not inverse: + with np.errstate(invalid="ignore"): # hide NaN comparison warnings + if output_distribution == "normal": + X_col = stats.norm.ppf(X_col) + # find the value to clip the data to avoid mapping to + # infinity. Clip such that the inverse transform will be + # consistent + clip_min = stats.norm.ppf(BOUNDS_THRESHOLD - np.spacing(1)) + clip_max = stats.norm.ppf(1 - (BOUNDS_THRESHOLD - np.spacing(1))) + X_col = np.clip(X_col, clip_min, clip_max) + # else output distribution is uniform and the ppf is the + # identity function so we let X_col unchanged + + return X_col + + def _check_inputs(self, X, in_fit, accept_sparse_negative=False, copy=False): + """Check inputs before fit and transform.""" + X = validate_data( + self, + X, + reset=in_fit, + accept_sparse="csc", + copy=copy, + dtype=FLOAT_DTYPES, + # only set force_writeable for the validation at transform time because + # it's the only place where QuantileTransformer performs inplace operations. + force_writeable=True if not in_fit else None, + ensure_all_finite="allow-nan", + ) + # we only accept positive sparse matrix when ignore_implicit_zeros is + # false and that we call fit or transform. 
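+ # `accept_sparse_negative=True` is only passed by `inverse_transform` below:
+ # its input lives in the transformer's output space and may legitimately be
+ # negative (e.g. with output_distribution='normal'), which is presumably why
+ # the non-negativity check for sparse input is skipped in that case.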
+ with np.errstate(invalid="ignore"): # hide NaN comparison warnings + if ( + not accept_sparse_negative + and not self.ignore_implicit_zeros + and (sparse.issparse(X) and np.any(X.data < 0)) + ): + raise ValueError( + "QuantileTransformer only accepts non-negative sparse matrices." + ) + + return X + + def _transform(self, X, inverse=False): + """Forward and inverse transform. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + The data used to scale along the features axis. + + inverse : bool, default=False + If False, apply forward transform. If True, apply + inverse transform. + + Returns + ------- + X : ndarray of shape (n_samples, n_features) + Projected data. + """ + if sparse.issparse(X): + for feature_idx in range(X.shape[1]): + column_slice = slice(X.indptr[feature_idx], X.indptr[feature_idx + 1]) + X.data[column_slice] = self._transform_col( + X.data[column_slice], self.quantiles_[:, feature_idx], inverse + ) + else: + for feature_idx in range(X.shape[1]): + X[:, feature_idx] = self._transform_col( + X[:, feature_idx], self.quantiles_[:, feature_idx], inverse + ) + + return X + + def transform(self, X): + """Feature-wise transformation of the data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data used to scale along the features axis. If a sparse + matrix is provided, it will be converted into a sparse + ``csc_matrix``. Additionally, the sparse matrix needs to be + nonnegative if `ignore_implicit_zeros` is False. + + Returns + ------- + Xt : {ndarray, sparse matrix} of shape (n_samples, n_features) + The projected data. + """ + check_is_fitted(self) + X = self._check_inputs(X, in_fit=False, copy=self.copy) + + return self._transform(X, inverse=False) + + def inverse_transform(self, X): + """Back-projection to the original space. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data used to scale along the features axis. If a sparse + matrix is provided, it will be converted into a sparse + ``csc_matrix``. Additionally, the sparse matrix needs to be + nonnegative if `ignore_implicit_zeros` is False. + + Returns + ------- + X_original : {ndarray, sparse matrix} of (n_samples, n_features) + The projected data. + """ + check_is_fitted(self) + X = self._check_inputs( + X, in_fit=False, accept_sparse_negative=True, copy=self.copy + ) + + return self._transform(X, inverse=True) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + tags.input_tags.allow_nan = True + return tags + + +@validate_params( + {"X": ["array-like", "sparse matrix"], "axis": [Options(Integral, {0, 1})]}, + prefer_skip_nested_validation=False, +) +def quantile_transform( + X, + *, + axis=0, + n_quantiles=1000, + output_distribution="uniform", + ignore_implicit_zeros=False, + subsample=int(1e5), + random_state=None, + copy=True, +): + """Transform features using quantiles information. + + This method transforms the features to follow a uniform or a normal + distribution. Therefore, for a given feature, this transformation tends + to spread out the most frequent values. It also reduces the impact of + (marginal) outliers: this is therefore a robust preprocessing scheme. + + The transformation is applied on each feature independently. First an + estimate of the cumulative distribution function of a feature is + used to map the original values to a uniform distribution. 
The obtained + values are then mapped to the desired output distribution using the + associated quantile function. Features values of new/unseen data that fall + below or above the fitted range will be mapped to the bounds of the output + distribution. Note that this transform is non-linear. It may distort linear + correlations between variables measured at the same scale but renders + variables measured at different scales more directly comparable. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to transform. + + axis : int, default=0 + Axis used to compute the means and standard deviations along. If 0, + transform each feature, otherwise (if 1) transform each sample. + + n_quantiles : int, default=1000 or n_samples + Number of quantiles to be computed. It corresponds to the number + of landmarks used to discretize the cumulative distribution function. + If n_quantiles is larger than the number of samples, n_quantiles is set + to the number of samples as a larger number of quantiles does not give + a better approximation of the cumulative distribution function + estimator. + + output_distribution : {'uniform', 'normal'}, default='uniform' + Marginal distribution for the transformed data. The choices are + 'uniform' (default) or 'normal'. + + ignore_implicit_zeros : bool, default=False + Only applies to sparse matrices. If True, the sparse entries of the + matrix are discarded to compute the quantile statistics. If False, + these entries are treated as zeros. + + subsample : int or None, default=1e5 + Maximum number of samples used to estimate the quantiles for + computational efficiency. Note that the subsampling procedure may + differ for value-identical sparse and dense matrices. + Disable subsampling by setting `subsample=None`. + + .. versionadded:: 1.5 + The option `None` to disable subsampling was added. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for subsampling and smoothing + noise. + Please see ``subsample`` for more details. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + copy : bool, default=True + If False, try to avoid a copy and transform in place. + This is not guaranteed to always work in place; e.g. if the data is + a numpy array with an int dtype, a copy will be returned even with + copy=False. + + .. versionchanged:: 0.23 + The default value of `copy` changed from False to True in 0.23. + + Returns + ------- + Xt : {ndarray, sparse matrix} of shape (n_samples, n_features) + The transformed data. + + See Also + -------- + QuantileTransformer : Performs quantile-based scaling using the + Transformer API (e.g. as part of a preprocessing + :class:`~sklearn.pipeline.Pipeline`). + power_transform : Maps data to a normal distribution using a + power transformation. + scale : Performs standardization that is faster, but less robust + to outliers. + robust_scale : Performs robust standardization that removes the influence + of outliers but does not put outliers and inliers on the same scale. + + Notes + ----- + NaNs are treated as missing values: disregarded in fit, and maintained in + transform. + + .. warning:: Risk of data leak + + Do not use :func:`~sklearn.preprocessing.quantile_transform` unless + you know what you are doing. A common mistake is to apply it + to the entire data *before* splitting into training and + test sets. 
This will bias the model evaluation because + information would have leaked from the test set to the + training set. + In general, we recommend using + :class:`~sklearn.preprocessing.QuantileTransformer` within a + :ref:`Pipeline ` in order to prevent most risks of data + leaking:`pipe = make_pipeline(QuantileTransformer(), + LogisticRegression())`. + + For a comparison of the different scalers, transformers, and normalizers, + see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.preprocessing import quantile_transform + >>> rng = np.random.RandomState(0) + >>> X = np.sort(rng.normal(loc=0.5, scale=0.25, size=(25, 1)), axis=0) + >>> quantile_transform(X, n_quantiles=10, random_state=0, copy=True) + array([...]) + """ + n = QuantileTransformer( + n_quantiles=n_quantiles, + output_distribution=output_distribution, + subsample=subsample, + ignore_implicit_zeros=ignore_implicit_zeros, + random_state=random_state, + copy=copy, + ) + if axis == 0: + X = n.fit_transform(X) + else: # axis == 1 + X = n.fit_transform(X.T).T + return X + + +class PowerTransformer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): + """Apply a power transform featurewise to make data more Gaussian-like. + + Power transforms are a family of parametric, monotonic transformations + that are applied to make data more Gaussian-like. This is useful for + modeling issues related to heteroscedasticity (non-constant variance), + or other situations where normality is desired. + + Currently, PowerTransformer supports the Box-Cox transform and the + Yeo-Johnson transform. The optimal parameter for stabilizing variance and + minimizing skewness is estimated through maximum likelihood. + + Box-Cox requires input data to be strictly positive, while Yeo-Johnson + supports both positive or negative data. + + By default, zero-mean, unit-variance normalization is applied to the + transformed data. + + For an example visualization, refer to :ref:`Compare PowerTransformer with + other scalers `. To see the + effect of Box-Cox and Yeo-Johnson transformations on different + distributions, see: + :ref:`sphx_glr_auto_examples_preprocessing_plot_map_data_to_normal.py`. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.20 + + Parameters + ---------- + method : {'yeo-johnson', 'box-cox'}, default='yeo-johnson' + The power transform method. Available methods are: + + - 'yeo-johnson' [1]_, works with positive and negative values + - 'box-cox' [2]_, only works with strictly positive values + + standardize : bool, default=True + Set to True to apply zero-mean, unit-variance normalization to the + transformed output. + + copy : bool, default=True + Set to False to perform inplace computation during transformation. + + Attributes + ---------- + lambdas_ : ndarray of float of shape (n_features,) + The parameters of the power transformation for the selected features. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + power_transform : Equivalent function without the estimator API. + + QuantileTransformer : Maps data to a standard normal distribution with + the parameter `output_distribution='normal'`. 
+ + Notes + ----- + NaNs are treated as missing values: disregarded in ``fit``, and maintained + in ``transform``. + + References + ---------- + + .. [1] :doi:`I.K. Yeo and R.A. Johnson, "A new family of power + transformations to improve normality or symmetry." Biometrika, + 87(4), pp.954-959, (2000). <10.1093/biomet/87.4.954>` + + .. [2] :doi:`G.E.P. Box and D.R. Cox, "An Analysis of Transformations", + Journal of the Royal Statistical Society B, 26, 211-252 (1964). + <10.1111/j.2517-6161.1964.tb00553.x>` + + Examples + -------- + >>> import numpy as np + >>> from sklearn.preprocessing import PowerTransformer + >>> pt = PowerTransformer() + >>> data = [[1, 2], [3, 2], [4, 5]] + >>> print(pt.fit(data)) + PowerTransformer() + >>> print(pt.lambdas_) + [ 1.386 -3.100] + >>> print(pt.transform(data)) + [[-1.316 -0.707] + [ 0.209 -0.707] + [ 1.106 1.414]] + """ + + _parameter_constraints: dict = { + "method": [StrOptions({"yeo-johnson", "box-cox"})], + "standardize": ["boolean"], + "copy": ["boolean"], + } + + def __init__(self, method="yeo-johnson", *, standardize=True, copy=True): + self.method = method + self.standardize = standardize + self.copy = copy + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Estimate the optimal parameter lambda for each feature. + + The optimal lambda parameter for minimizing skewness is estimated on + each feature independently using maximum likelihood. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data used to estimate the optimal transformation parameters. + + y : None + Ignored. + + Returns + ------- + self : object + Fitted transformer. + """ + self._fit(X, y=y, force_transform=False) + return self + + @_fit_context(prefer_skip_nested_validation=True) + def fit_transform(self, X, y=None): + """Fit `PowerTransformer` to `X`, then transform `X`. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data used to estimate the optimal transformation parameters + and to be transformed using a power transformation. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + X_new : ndarray of shape (n_samples, n_features) + Transformed data. 
+ """ + return self._fit(X, y, force_transform=True) + + def _fit(self, X, y=None, force_transform=False): + X = self._check_input(X, in_fit=True, check_positive=True) + + if not self.copy and not force_transform: # if call from fit() + X = X.copy() # force copy so that fit does not change X inplace + + n_samples = X.shape[0] + mean = np.mean(X, axis=0, dtype=np.float64) + var = np.var(X, axis=0, dtype=np.float64) + + optim_function = { + "box-cox": self._box_cox_optimize, + "yeo-johnson": self._yeo_johnson_optimize, + }[self.method] + + transform_function = { + "box-cox": boxcox, + "yeo-johnson": self._yeo_johnson_transform, + }[self.method] + + with np.errstate(invalid="ignore"): # hide NaN warnings + self.lambdas_ = np.empty(X.shape[1], dtype=X.dtype) + for i, col in enumerate(X.T): + # For yeo-johnson, leave constant features unchanged + # lambda=1 corresponds to the identity transformation + is_constant_feature = _is_constant_feature(var[i], mean[i], n_samples) + if self.method == "yeo-johnson" and is_constant_feature: + self.lambdas_[i] = 1.0 + continue + + self.lambdas_[i] = optim_function(col) + + if self.standardize or force_transform: + X[:, i] = transform_function(X[:, i], self.lambdas_[i]) + + if self.standardize: + self._scaler = StandardScaler(copy=False).set_output(transform="default") + if force_transform: + X = self._scaler.fit_transform(X) + else: + self._scaler.fit(X) + + return X + + def transform(self, X): + """Apply the power transform to each feature using the fitted lambdas. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to be transformed using a power transformation. + + Returns + ------- + X_trans : ndarray of shape (n_samples, n_features) + The transformed data. + """ + check_is_fitted(self) + X = self._check_input(X, in_fit=False, check_positive=True, check_shape=True) + + transform_function = { + "box-cox": boxcox, + "yeo-johnson": self._yeo_johnson_transform, + }[self.method] + for i, lmbda in enumerate(self.lambdas_): + with np.errstate(invalid="ignore"): # hide NaN warnings + X[:, i] = transform_function(X[:, i], lmbda) + + if self.standardize: + X = self._scaler.transform(X) + + return X + + def inverse_transform(self, X): + """Apply the inverse power transformation using the fitted lambdas. + + The inverse of the Box-Cox transformation is given by:: + + if lambda_ == 0: + X_original = exp(X_trans) + else: + X_original = (X * lambda_ + 1) ** (1 / lambda_) + + The inverse of the Yeo-Johnson transformation is given by:: + + if X >= 0 and lambda_ == 0: + X_original = exp(X) - 1 + elif X >= 0 and lambda_ != 0: + X_original = (X * lambda_ + 1) ** (1 / lambda_) - 1 + elif X < 0 and lambda_ != 2: + X_original = 1 - (-(2 - lambda_) * X + 1) ** (1 / (2 - lambda_)) + elif X < 0 and lambda_ == 2: + X_original = 1 - exp(-X) + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The transformed data. + + Returns + ------- + X_original : ndarray of shape (n_samples, n_features) + The original data. 
+ """ + check_is_fitted(self) + X = self._check_input(X, in_fit=False, check_shape=True) + + if self.standardize: + X = self._scaler.inverse_transform(X) + + inv_fun = { + "box-cox": inv_boxcox, + "yeo-johnson": self._yeo_johnson_inverse_transform, + }[self.method] + for i, lmbda in enumerate(self.lambdas_): + with np.errstate(invalid="ignore"): # hide NaN warnings + X[:, i] = inv_fun(X[:, i], lmbda) + + return X + + def _yeo_johnson_inverse_transform(self, x, lmbda): + """Return inverse-transformed input x following Yeo-Johnson inverse + transform with parameter lambda. + """ + x_inv = np.zeros_like(x) + pos = x >= 0 + + # when x >= 0 + if abs(lmbda) < np.spacing(1.0): + x_inv[pos] = np.exp(x[pos]) - 1 + else: # lmbda != 0 + x_inv[pos] = np.power(x[pos] * lmbda + 1, 1 / lmbda) - 1 + + # when x < 0 + if abs(lmbda - 2) > np.spacing(1.0): + x_inv[~pos] = 1 - np.power(-(2 - lmbda) * x[~pos] + 1, 1 / (2 - lmbda)) + else: # lmbda == 2 + x_inv[~pos] = 1 - np.exp(-x[~pos]) + + return x_inv + + def _yeo_johnson_transform(self, x, lmbda): + """Return transformed input x following Yeo-Johnson transform with + parameter lambda. + """ + + out = np.zeros_like(x) + pos = x >= 0 # binary mask + + # when x >= 0 + if abs(lmbda) < np.spacing(1.0): + out[pos] = np.log1p(x[pos]) + else: # lmbda != 0 + out[pos] = (np.power(x[pos] + 1, lmbda) - 1) / lmbda + + # when x < 0 + if abs(lmbda - 2) > np.spacing(1.0): + out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda) + else: # lmbda == 2 + out[~pos] = -np.log1p(-x[~pos]) + + return out + + def _box_cox_optimize(self, x): + """Find and return optimal lambda parameter of the Box-Cox transform by + MLE, for observed data x. + + We here use scipy builtins which uses the brent optimizer. + """ + mask = np.isnan(x) + if np.all(mask): + raise ValueError("Column must not be all nan.") + + # the computation of lambda is influenced by NaNs so we need to + # get rid of them + _, lmbda = stats.boxcox(x[~mask], lmbda=None) + + return lmbda + + def _yeo_johnson_optimize(self, x): + """Find and return optimal lambda parameter of the Yeo-Johnson + transform by MLE, for observed data x. + + Like for Box-Cox, MLE is done via the brent optimizer. + """ + x_tiny = np.finfo(np.float64).tiny + + def _neg_log_likelihood(lmbda): + """Return the negative log likelihood of the observed data x as a + function of lambda.""" + x_trans = self._yeo_johnson_transform(x, lmbda) + n_samples = x.shape[0] + x_trans_var = x_trans.var() + + # Reject transformed data that would raise a RuntimeWarning in np.log + if x_trans_var < x_tiny: + return np.inf + + log_var = np.log(x_trans_var) + loglike = -n_samples / 2 * log_var + loglike += (lmbda - 1) * (np.sign(x) * np.log1p(np.abs(x))).sum() + + return -loglike + + # the computation of lambda is influenced by NaNs so we need to + # get rid of them + x = x[~np.isnan(x)] + + return _yeojohnson_lambda(_neg_log_likelihood, x) + + def _check_input(self, X, in_fit, check_positive=False, check_shape=False): + """Validate the input before fit and transform. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + + in_fit : bool + Whether or not `_check_input` is called from `fit` or other + methods, e.g. `predict`, `transform`, etc. + + check_positive : bool, default=False + If True, check that all data is positive and non-zero (only if + ``self.method=='box-cox'``). 
+ + check_shape : bool, default=False + If True, check that n_features matches the length of self.lambdas_ + """ + X = validate_data( + self, + X, + ensure_2d=True, + dtype=FLOAT_DTYPES, + force_writeable=True, + copy=self.copy, + ensure_all_finite="allow-nan", + reset=in_fit, + ) + + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", r"All-NaN (slice|axis) encountered") + if check_positive and self.method == "box-cox" and np.nanmin(X) <= 0: + raise ValueError( + "The Box-Cox transformation can only be " + "applied to strictly positive data" + ) + + if check_shape and not X.shape[1] == len(self.lambdas_): + raise ValueError( + "Input data has a different number of features " + "than fitting data. Should have {n}, data has {m}".format( + n=len(self.lambdas_), m=X.shape[1] + ) + ) + + return X + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + return tags + + +@validate_params( + {"X": ["array-like"]}, + prefer_skip_nested_validation=False, +) +def power_transform(X, method="yeo-johnson", *, standardize=True, copy=True): + """Parametric, monotonic transformation to make data more Gaussian-like. + + Power transforms are a family of parametric, monotonic transformations + that are applied to make data more Gaussian-like. This is useful for + modeling issues related to heteroscedasticity (non-constant variance), + or other situations where normality is desired. + + Currently, power_transform supports the Box-Cox transform and the + Yeo-Johnson transform. The optimal parameter for stabilizing variance and + minimizing skewness is estimated through maximum likelihood. + + Box-Cox requires input data to be strictly positive, while Yeo-Johnson + supports both positive or negative data. + + By default, zero-mean, unit-variance normalization is applied to the + transformed data. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to be transformed using a power transformation. + + method : {'yeo-johnson', 'box-cox'}, default='yeo-johnson' + The power transform method. Available methods are: + + - 'yeo-johnson' [1]_, works with positive and negative values + - 'box-cox' [2]_, only works with strictly positive values + + .. versionchanged:: 0.23 + The default value of the `method` parameter changed from + 'box-cox' to 'yeo-johnson' in 0.23. + + standardize : bool, default=True + Set to True to apply zero-mean, unit-variance normalization to the + transformed output. + + copy : bool, default=True + If False, try to avoid a copy and transform in place. + This is not guaranteed to always work in place; e.g. if the data is + a numpy array with an int dtype, a copy will be returned even with + copy=False. + + Returns + ------- + X_trans : ndarray of shape (n_samples, n_features) + The transformed data. + + See Also + -------- + PowerTransformer : Equivalent transformation with the + Transformer API (e.g. as part of a preprocessing + :class:`~sklearn.pipeline.Pipeline`). + + quantile_transform : Maps data to a standard normal distribution with + the parameter `output_distribution='normal'`. + + Notes + ----- + NaNs are treated as missing values: disregarded in ``fit``, and maintained + in ``transform``. + + For a comparison of the different scalers, transformers, and normalizers, + see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`. + + References + ---------- + + .. [1] I.K. Yeo and R.A. 
Johnson, "A new family of power transformations to + improve normality or symmetry." Biometrika, 87(4), pp.954-959, + (2000). + + .. [2] G.E.P. Box and D.R. Cox, "An Analysis of Transformations", Journal + of the Royal Statistical Society B, 26, 211-252 (1964). + + Examples + -------- + >>> import numpy as np + >>> from sklearn.preprocessing import power_transform + >>> data = [[1, 2], [3, 2], [4, 5]] + >>> print(power_transform(data, method='box-cox')) + [[-1.332 -0.707] + [ 0.256 -0.707] + [ 1.076 1.414]] + + .. warning:: Risk of data leak. + Do not use :func:`~sklearn.preprocessing.power_transform` unless you + know what you are doing. A common mistake is to apply it to the entire + data *before* splitting into training and test sets. This will bias the + model evaluation because information would have leaked from the test + set to the training set. + In general, we recommend using + :class:`~sklearn.preprocessing.PowerTransformer` within a + :ref:`Pipeline ` in order to prevent most risks of data + leaking, e.g.: `pipe = make_pipeline(PowerTransformer(), + LogisticRegression())`. + """ + pt = PowerTransformer(method=method, standardize=standardize, copy=copy) + return pt.fit_transform(X) diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_discretization.py b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_discretization.py new file mode 100644 index 0000000000000000000000000000000000000000..ef5081080bda1813d4f16b9931dc58cf608c9818 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_discretization.py @@ -0,0 +1,548 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + + +import warnings +from numbers import Integral + +import numpy as np + +from ..base import BaseEstimator, TransformerMixin, _fit_context +from ..utils import resample +from ..utils._param_validation import Interval, Options, StrOptions +from ..utils.stats import _averaged_weighted_percentile, _weighted_percentile +from ..utils.validation import ( + _check_feature_names_in, + _check_sample_weight, + check_array, + check_is_fitted, + validate_data, +) +from ._encoders import OneHotEncoder + + +class KBinsDiscretizer(TransformerMixin, BaseEstimator): + """ + Bin continuous data into intervals. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.20 + + Parameters + ---------- + n_bins : int or array-like of shape (n_features,), default=5 + The number of bins to produce. Raises ValueError if ``n_bins < 2``. + + encode : {'onehot', 'onehot-dense', 'ordinal'}, default='onehot' + Method used to encode the transformed result. + + - 'onehot': Encode the transformed result with one-hot encoding + and return a sparse matrix. Ignored features are always + stacked to the right. + - 'onehot-dense': Encode the transformed result with one-hot encoding + and return a dense array. Ignored features are always + stacked to the right. + - 'ordinal': Return the bin identifier encoded as an integer value. + + strategy : {'uniform', 'quantile', 'kmeans'}, default='quantile' + Strategy used to define the widths of the bins. + + - 'uniform': All bins in each feature have identical widths. + - 'quantile': All bins in each feature have the same number of points. + - 'kmeans': Values in each bin have the same nearest center of a 1D + k-means cluster. + + For an example of the different strategies see: + :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization_strategies.py`. 
+ + quantile_method : {"inverted_cdf", "averaged_inverted_cdf", + "closest_observation", "interpolated_inverted_cdf", "hazen", + "weibull", "linear", "median_unbiased", "normal_unbiased"}, + default="linear" + Method to pass on to np.percentile calculation when using + strategy="quantile". Only `averaged_inverted_cdf` and `inverted_cdf` + support the use of `sample_weight != None` when subsampling is not + active. + + .. versionadded:: 1.7 + + dtype : {np.float32, np.float64}, default=None + The desired data-type for the output. If None, output dtype is + consistent with input dtype. Only np.float32 and np.float64 are + supported. + + .. versionadded:: 0.24 + + subsample : int or None, default=200_000 + Maximum number of samples, used to fit the model, for computational + efficiency. + `subsample=None` means that all the training samples are used when + computing the quantiles that determine the binning thresholds. + Since quantile computation relies on sorting each column of `X` and + that sorting has an `n log(n)` time complexity, + it is recommended to use subsampling on datasets with a + very large number of samples. + + .. versionchanged:: 1.3 + The default value of `subsample` changed from `None` to `200_000` when + `strategy="quantile"`. + + .. versionchanged:: 1.5 + The default value of `subsample` changed from `None` to `200_000` when + `strategy="uniform"` or `strategy="kmeans"`. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for subsampling. + Pass an int for reproducible results across multiple function calls. + See the `subsample` parameter for more details. + See :term:`Glossary `. + + .. versionadded:: 1.1 + + Attributes + ---------- + bin_edges_ : ndarray of ndarray of shape (n_features,) + The edges of each bin. Contain arrays of varying shapes ``(n_bins_, )`` + Ignored features will have empty arrays. + + n_bins_ : ndarray of shape (n_features,), dtype=np.int64 + Number of bins per feature. Bins whose width are too small + (i.e., <= 1e-8) are removed with a warning. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + Binarizer : Class used to bin values as ``0`` or + ``1`` based on a parameter ``threshold``. + + Notes + ----- + + For a visualization of discretization on different datasets refer to + :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization_classification.py`. + On the effect of discretization on linear models see: + :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization.py`. + + In bin edges for feature ``i``, the first and last values are used only for + ``inverse_transform``. During transform, bin edges are extended to:: + + np.concatenate([-np.inf, bin_edges_[i][1:-1], np.inf]) + + You can combine ``KBinsDiscretizer`` with + :class:`~sklearn.compose.ColumnTransformer` if you only want to preprocess + part of the features. + + ``KBinsDiscretizer`` might produce constant features (e.g., when + ``encode = 'onehot'`` and certain bins do not contain any data). + These features can be removed with feature selection algorithms + (e.g., :class:`~sklearn.feature_selection.VarianceThreshold`). + + Examples + -------- + >>> from sklearn.preprocessing import KBinsDiscretizer + >>> X = [[-2, 1, -4, -1], + ... 
[-1, 2, -3, -0.5], + ... [ 0, 3, -2, 0.5], + ... [ 1, 4, -1, 2]] + >>> est = KBinsDiscretizer( + ... n_bins=3, encode='ordinal', strategy='uniform' + ... ) + >>> est.fit(X) + KBinsDiscretizer(...) + >>> Xt = est.transform(X) + >>> Xt # doctest: +SKIP + array([[ 0., 0., 0., 0.], + [ 1., 1., 1., 0.], + [ 2., 2., 2., 1.], + [ 2., 2., 2., 2.]]) + + Sometimes it may be useful to convert the data back into the original + feature space. The ``inverse_transform`` function converts the binned + data into the original feature space. Each value will be equal to the mean + of the two bin edges. + + >>> est.bin_edges_[0] + array([-2., -1., 0., 1.]) + >>> est.inverse_transform(Xt) + array([[-1.5, 1.5, -3.5, -0.5], + [-0.5, 2.5, -2.5, -0.5], + [ 0.5, 3.5, -1.5, 0.5], + [ 0.5, 3.5, -1.5, 1.5]]) + """ + + _parameter_constraints: dict = { + "n_bins": [Interval(Integral, 2, None, closed="left"), "array-like"], + "encode": [StrOptions({"onehot", "onehot-dense", "ordinal"})], + "strategy": [StrOptions({"uniform", "quantile", "kmeans"})], + "quantile_method": [ + StrOptions( + { + "warn", + "inverted_cdf", + "averaged_inverted_cdf", + "closest_observation", + "interpolated_inverted_cdf", + "hazen", + "weibull", + "linear", + "median_unbiased", + "normal_unbiased", + } + ) + ], + "dtype": [Options(type, {np.float64, np.float32}), None], + "subsample": [Interval(Integral, 1, None, closed="left"), None], + "random_state": ["random_state"], + } + + def __init__( + self, + n_bins=5, + *, + encode="onehot", + strategy="quantile", + quantile_method="warn", + dtype=None, + subsample=200_000, + random_state=None, + ): + self.n_bins = n_bins + self.encode = encode + self.strategy = strategy + self.quantile_method = quantile_method + self.dtype = dtype + self.subsample = subsample + self.random_state = random_state + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None, sample_weight=None): + """ + Fit the estimator. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Data to be discretized. + + y : None + Ignored. This parameter exists only for compatibility with + :class:`~sklearn.pipeline.Pipeline`. + + sample_weight : ndarray of shape (n_samples,) + Contains weight values to be associated with each sample. + + .. versionadded:: 1.3 + + .. versionchanged:: 1.7 + Added support for strategy="uniform". + + Returns + ------- + self : object + Returns the instance itself. + """ + X = validate_data(self, X, dtype="numeric") + + if self.dtype in (np.float64, np.float32): + output_dtype = self.dtype + else: # self.dtype is None + output_dtype = X.dtype + + n_samples, n_features = X.shape + + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + + if self.subsample is not None and n_samples > self.subsample: + # Take a subsample of `X` + # When resampling, it is important to subsample **with replacement** to + # preserve the distribution, in particular in the presence of a few data + # points with large weights. You can check this by setting `replace=False` + # in sklearn.utils.test.test_indexing.test_resample_weighted and check that + # it fails as a justification for this claim. 
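+ # In this weighted resampling with replacement, a sample's expected
+ # multiplicity in the subsample is proportional to its weight (e.g. a point
+ # with weight 3 is drawn about three times as often as a unit-weight point),
+ # which is why the weights can safely be dropped right after the call, as
+ # done below.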
+ X = resample( + X, + replace=True, + n_samples=self.subsample, + random_state=self.random_state, + sample_weight=sample_weight, + ) + # Since we already used the weights when resampling when provided, + # we set them back to `None` to avoid accounting for the weights twice + # in subsequent operations to compute weight-aware bin edges with + # quantiles or k-means. + sample_weight = None + + n_features = X.shape[1] + n_bins = self._validate_n_bins(n_features) + + bin_edges = np.zeros(n_features, dtype=object) + + # TODO(1.9): remove and switch to quantile_method="averaged_inverted_cdf" + # by default. + quantile_method = self.quantile_method + if self.strategy == "quantile" and quantile_method == "warn": + warnings.warn( + "The current default behavior, quantile_method='linear', will be " + "changed to quantile_method='averaged_inverted_cdf' in " + "scikit-learn version 1.9 to naturally support sample weight " + "equivalence properties by default. Pass " + "quantile_method='averaged_inverted_cdf' explicitly to silence this " + "warning.", + FutureWarning, + ) + quantile_method = "linear" + + if ( + self.strategy == "quantile" + and quantile_method not in ["inverted_cdf", "averaged_inverted_cdf"] + and sample_weight is not None + ): + raise ValueError( + "When fitting with strategy='quantile' and sample weights, " + "quantile_method should either be set to 'averaged_inverted_cdf' or " + f"'inverted_cdf', got quantile_method='{quantile_method}' instead." + ) + + if self.strategy != "quantile" and sample_weight is not None: + # Prepare a mask to filter out zero-weight samples when extracting + # the min and max values of each columns which are needed for the + # "uniform" and "kmeans" strategies. + nnz_weight_mask = sample_weight != 0 + else: + # Otherwise, all samples are used. Use a slice to avoid creating a + # new array. + nnz_weight_mask = slice(None) + + for jj in range(n_features): + column = X[:, jj] + col_min = column[nnz_weight_mask].min() + col_max = column[nnz_weight_mask].max() + + if col_min == col_max: + warnings.warn( + "Feature %d is constant and will be replaced with 0." % jj + ) + n_bins[jj] = 1 + bin_edges[jj] = np.array([-np.inf, np.inf]) + continue + + if self.strategy == "uniform": + bin_edges[jj] = np.linspace(col_min, col_max, n_bins[jj] + 1) + + elif self.strategy == "quantile": + percentile_levels = np.linspace(0, 100, n_bins[jj] + 1) + + # method="linear" is the implicit default for any numpy + # version. So we keep it version independent in that case by + # using an empty param dict. + percentile_kwargs = {} + if quantile_method != "linear" and sample_weight is None: + percentile_kwargs["method"] = quantile_method + + if sample_weight is None: + bin_edges[jj] = np.asarray( + np.percentile(column, percentile_levels, **percentile_kwargs), + dtype=np.float64, + ) + else: + # TODO: make _weighted_percentile and + # _averaged_weighted_percentile accept an array of + # quantiles instead of calling it multiple times and + # sorting the column multiple times as a result. 
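+ # e.g. quantile_method="averaged_inverted_cdf" selects
+ # _averaged_weighted_percentile below; each requested level in
+ # percentile_levels triggers one call (and hence one sort of `column`).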
+ percentile_func = { + "inverted_cdf": _weighted_percentile, + "averaged_inverted_cdf": _averaged_weighted_percentile, + }[quantile_method] + bin_edges[jj] = np.asarray( + [ + percentile_func(column, sample_weight, percentile_rank=p) + for p in percentile_levels + ], + dtype=np.float64, + ) + elif self.strategy == "kmeans": + from ..cluster import KMeans # fixes import loops + + # Deterministic initialization with uniform spacing + uniform_edges = np.linspace(col_min, col_max, n_bins[jj] + 1) + init = (uniform_edges[1:] + uniform_edges[:-1])[:, None] * 0.5 + + # 1D k-means procedure + km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1) + centers = km.fit( + column[:, None], sample_weight=sample_weight + ).cluster_centers_[:, 0] + # Must sort, centers may be unsorted even with sorted init + centers.sort() + bin_edges[jj] = (centers[1:] + centers[:-1]) * 0.5 + bin_edges[jj] = np.r_[col_min, bin_edges[jj], col_max] + + # Remove bins whose width are too small (i.e., <= 1e-8) + if self.strategy in ("quantile", "kmeans"): + mask = np.ediff1d(bin_edges[jj], to_begin=np.inf) > 1e-8 + bin_edges[jj] = bin_edges[jj][mask] + if len(bin_edges[jj]) - 1 != n_bins[jj]: + warnings.warn( + "Bins whose width are too small (i.e., <= " + "1e-8) in feature %d are removed. Consider " + "decreasing the number of bins." % jj + ) + n_bins[jj] = len(bin_edges[jj]) - 1 + + self.bin_edges_ = bin_edges + self.n_bins_ = n_bins + + if "onehot" in self.encode: + self._encoder = OneHotEncoder( + categories=[np.arange(i) for i in self.n_bins_], + sparse_output=self.encode == "onehot", + dtype=output_dtype, + ) + # Fit the OneHotEncoder with toy datasets + # so that it's ready for use after the KBinsDiscretizer is fitted + self._encoder.fit(np.zeros((1, len(self.n_bins_)))) + + return self + + def _validate_n_bins(self, n_features): + """Returns n_bins_, the number of bins per feature.""" + orig_bins = self.n_bins + if isinstance(orig_bins, Integral): + return np.full(n_features, orig_bins, dtype=int) + + n_bins = check_array(orig_bins, dtype=int, copy=True, ensure_2d=False) + + if n_bins.ndim > 1 or n_bins.shape[0] != n_features: + raise ValueError("n_bins must be a scalar or array of shape (n_features,).") + + bad_nbins_value = (n_bins < 2) | (n_bins != orig_bins) + + violating_indices = np.where(bad_nbins_value)[0] + if violating_indices.shape[0] > 0: + indices = ", ".join(str(i) for i in violating_indices) + raise ValueError( + "{} received an invalid number " + "of bins at indices {}. Number of bins " + "must be at least 2, and must be an int.".format( + KBinsDiscretizer.__name__, indices + ) + ) + return n_bins + + def transform(self, X): + """ + Discretize the data. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Data to be discretized. + + Returns + ------- + Xt : {ndarray, sparse matrix}, dtype={np.float32, np.float64} + Data in the binned space. Will be a sparse matrix if + `self.encode='onehot'` and ndarray otherwise. 
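+ With ``encode='ordinal'``, each output value is the zero-based index of
+ the bin that the corresponding input value falls into.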
+ """ + check_is_fitted(self) + + # check input and attribute dtypes + dtype = (np.float64, np.float32) if self.dtype is None else self.dtype + Xt = validate_data(self, X, copy=True, dtype=dtype, reset=False) + + bin_edges = self.bin_edges_ + for jj in range(Xt.shape[1]): + Xt[:, jj] = np.searchsorted(bin_edges[jj][1:-1], Xt[:, jj], side="right") + + if self.encode == "ordinal": + return Xt + + dtype_init = None + if "onehot" in self.encode: + dtype_init = self._encoder.dtype + self._encoder.dtype = Xt.dtype + try: + Xt_enc = self._encoder.transform(Xt) + finally: + # revert the initial dtype to avoid modifying self. + self._encoder.dtype = dtype_init + return Xt_enc + + def inverse_transform(self, X): + """ + Transform discretized data back to original feature space. + + Note that this function does not regenerate the original data + due to discretization rounding. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Transformed data in the binned space. + + Returns + ------- + X_original : ndarray, dtype={np.float32, np.float64} + Data in the original feature space. + """ + + check_is_fitted(self) + + if "onehot" in self.encode: + X = self._encoder.inverse_transform(X) + + Xinv = check_array(X, copy=True, dtype=(np.float64, np.float32)) + n_features = self.n_bins_.shape[0] + if Xinv.shape[1] != n_features: + raise ValueError( + "Incorrect number of features. Expecting {}, received {}.".format( + n_features, Xinv.shape[1] + ) + ) + + for jj in range(n_features): + bin_edges = self.bin_edges_[jj] + bin_centers = (bin_edges[1:] + bin_edges[:-1]) * 0.5 + Xinv[:, jj] = bin_centers[(Xinv[:, jj]).astype(np.int64)] + + return Xinv + + def get_feature_names_out(self, input_features=None): + """Get output feature names. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. + + - If `input_features` is `None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then the following input feature names are generated: + `["x0", "x1", ..., "x(n_features_in_ - 1)"]`. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. 
+ """ + check_is_fitted(self, "n_features_in_") + input_features = _check_feature_names_in(self, input_features) + if hasattr(self, "_encoder"): + return self._encoder.get_feature_names_out(input_features) + + # ordinal encoding + return input_features diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_encoders.py b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_encoders.py new file mode 100644 index 0000000000000000000000000000000000000000..5f41c9d0c6d22822efd228a94d3c8a8b27b053a3 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_encoders.py @@ -0,0 +1,1698 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numbers +import warnings +from numbers import Integral + +import numpy as np +from scipy import sparse + +from ..base import BaseEstimator, OneToOneFeatureMixin, TransformerMixin, _fit_context +from ..utils import _safe_indexing, check_array +from ..utils._encode import _check_unknown, _encode, _get_counts, _unique +from ..utils._mask import _get_mask +from ..utils._missing import is_scalar_nan +from ..utils._param_validation import Interval, RealNotInt, StrOptions +from ..utils._set_output import _get_output_config +from ..utils.validation import ( + _check_feature_names, + _check_feature_names_in, + _check_n_features, + check_is_fitted, +) + +__all__ = ["OneHotEncoder", "OrdinalEncoder"] + + +class _BaseEncoder(TransformerMixin, BaseEstimator): + """ + Base class for encoders that includes the code to categorize and + transform the input features. + + """ + + def _check_X(self, X, ensure_all_finite=True): + """ + Perform custom check_array: + - convert list of strings to object dtype + - check for missing values for object dtype data (check_array does + not do that) + - return list of features (arrays): this list of features is + constructed feature by feature to preserve the data types + of pandas DataFrame columns, as otherwise information is lost + and cannot be used, e.g. for the `categories_` attribute. + + """ + if not (hasattr(X, "iloc") and getattr(X, "ndim", 0) == 2): + # if not a dataframe, do normal check_array validation + X_temp = check_array(X, dtype=None, ensure_all_finite=ensure_all_finite) + if not hasattr(X, "dtype") and np.issubdtype(X_temp.dtype, np.str_): + X = check_array(X, dtype=object, ensure_all_finite=ensure_all_finite) + else: + X = X_temp + needs_validation = False + else: + # pandas dataframe, do validation later column by column, in order + # to keep the dtype information to be used in the encoder. + needs_validation = ensure_all_finite + + n_samples, n_features = X.shape + X_columns = [] + + for i in range(n_features): + Xi = _safe_indexing(X, indices=i, axis=1) + Xi = check_array( + Xi, ensure_2d=False, dtype=None, ensure_all_finite=needs_validation + ) + X_columns.append(Xi) + + return X_columns, n_samples, n_features + + def _fit( + self, + X, + handle_unknown="error", + ensure_all_finite=True, + return_counts=False, + return_and_ignore_missing_for_infrequent=False, + ): + self._check_infrequent_enabled() + _check_n_features(self, X, reset=True) + _check_feature_names(self, X, reset=True) + X_list, n_samples, n_features = self._check_X( + X, ensure_all_finite=ensure_all_finite + ) + self.n_features_in_ = n_features + + if self.categories != "auto": + if len(self.categories) != n_features: + raise ValueError( + "Shape mismatch: if categories is an array," + " it has to be of shape (n_features,)." 
+ ) + + self.categories_ = [] + category_counts = [] + compute_counts = return_counts or self._infrequent_enabled + + for i in range(n_features): + Xi = X_list[i] + + if self.categories == "auto": + result = _unique(Xi, return_counts=compute_counts) + if compute_counts: + cats, counts = result + category_counts.append(counts) + else: + cats = result + else: + if np.issubdtype(Xi.dtype, np.str_): + # Always convert string categories to objects to avoid + # unexpected string truncation for longer category labels + # passed in the constructor. + Xi_dtype = object + else: + Xi_dtype = Xi.dtype + + cats = np.array(self.categories[i], dtype=Xi_dtype) + if ( + cats.dtype == object + and isinstance(cats[0], bytes) + and Xi.dtype.kind != "S" + ): + msg = ( + f"In column {i}, the predefined categories have type 'bytes'" + " which is incompatible with values of type" + f" '{type(Xi[0]).__name__}'." + ) + raise ValueError(msg) + + # `nan` must be the last stated category + for category in cats[:-1]: + if is_scalar_nan(category): + raise ValueError( + "Nan should be the last element in user" + f" provided categories, see categories {cats}" + f" in column #{i}" + ) + + if cats.size != len(_unique(cats)): + msg = ( + f"In column {i}, the predefined categories" + " contain duplicate elements." + ) + raise ValueError(msg) + + if Xi.dtype.kind not in "OUS": + sorted_cats = np.sort(cats) + error_msg = ( + "Unsorted categories are not supported for numerical categories" + ) + # if there are nans, nan should be the last element + stop_idx = -1 if np.isnan(sorted_cats[-1]) else None + if np.any(sorted_cats[:stop_idx] != cats[:stop_idx]): + raise ValueError(error_msg) + + if handle_unknown == "error": + diff = _check_unknown(Xi, cats) + if diff: + msg = ( + "Found unknown categories {0} in column {1}" + " during fit".format(diff, i) + ) + raise ValueError(msg) + if compute_counts: + category_counts.append(_get_counts(Xi, cats)) + + self.categories_.append(cats) + + output = {"n_samples": n_samples} + if return_counts: + output["category_counts"] = category_counts + + missing_indices = {} + if return_and_ignore_missing_for_infrequent: + for feature_idx, categories_for_idx in enumerate(self.categories_): + if is_scalar_nan(categories_for_idx[-1]): + # `nan` values can only be placed in the latest position + missing_indices[feature_idx] = categories_for_idx.size - 1 + output["missing_indices"] = missing_indices + + if self._infrequent_enabled: + self._fit_infrequent_category_mapping( + n_samples, + category_counts, + missing_indices, + ) + return output + + def _transform( + self, + X, + handle_unknown="error", + ensure_all_finite=True, + warn_on_unknown=False, + ignore_category_indices=None, + ): + X_list, n_samples, n_features = self._check_X( + X, ensure_all_finite=ensure_all_finite + ) + _check_feature_names(self, X, reset=False) + _check_n_features(self, X, reset=False) + + X_int = np.zeros((n_samples, n_features), dtype=int) + X_mask = np.ones((n_samples, n_features), dtype=bool) + + columns_with_unknown = [] + for i in range(n_features): + Xi = X_list[i] + diff, valid_mask = _check_unknown(Xi, self.categories_[i], return_mask=True) + + if not np.all(valid_mask): + if handle_unknown == "error": + msg = ( + "Found unknown categories {0} in column {1}" + " during transform".format(diff, i) + ) + raise ValueError(msg) + else: + if warn_on_unknown: + columns_with_unknown.append(i) + # Set the problematic rows to an acceptable value and + # continue `The rows are marked `X_mask` and will be + # removed later. 
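+ # Note: the placeholder written below (the first known category) is not
+ # trusted downstream; X_mask records which entries were unknown so that,
+ # e.g., OneHotEncoder.transform can zero them out and
+ # OrdinalEncoder.transform can overwrite them with `unknown_value`.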
+ X_mask[:, i] = valid_mask + # cast Xi into the largest string type necessary + # to handle different lengths of numpy strings + if ( + self.categories_[i].dtype.kind in ("U", "S") + and self.categories_[i].itemsize > Xi.itemsize + ): + Xi = Xi.astype(self.categories_[i].dtype) + elif self.categories_[i].dtype.kind == "O" and Xi.dtype.kind == "U": + # categories are objects and Xi are numpy strings. + # Cast Xi to an object dtype to prevent truncation + # when setting invalid values. + Xi = Xi.astype("O") + else: + Xi = Xi.copy() + + Xi[~valid_mask] = self.categories_[i][0] + # We use check_unknown=False, since _check_unknown was + # already called above. + X_int[:, i] = _encode(Xi, uniques=self.categories_[i], check_unknown=False) + if columns_with_unknown: + warnings.warn( + ( + "Found unknown categories in columns " + f"{columns_with_unknown} during transform. These " + "unknown categories will be encoded as all zeros" + ), + UserWarning, + ) + + self._map_infrequent_categories(X_int, X_mask, ignore_category_indices) + return X_int, X_mask + + @property + def infrequent_categories_(self): + """Infrequent categories for each feature.""" + # raises an AttributeError if `_infrequent_indices` is not defined + infrequent_indices = self._infrequent_indices + return [ + None if indices is None else category[indices] + for category, indices in zip(self.categories_, infrequent_indices) + ] + + def _check_infrequent_enabled(self): + """ + This functions checks whether _infrequent_enabled is True or False. + This has to be called after parameter validation in the fit function. + """ + max_categories = getattr(self, "max_categories", None) + min_frequency = getattr(self, "min_frequency", None) + self._infrequent_enabled = ( + max_categories is not None and max_categories >= 1 + ) or min_frequency is not None + + def _identify_infrequent(self, category_count, n_samples, col_idx): + """Compute the infrequent indices. + + Parameters + ---------- + category_count : ndarray of shape (n_cardinality,) + Category counts. + + n_samples : int + Number of samples. + + col_idx : int + Index of the current category. Only used for the error message. + + Returns + ------- + output : ndarray of shape (n_infrequent_categories,) or None + If there are infrequent categories, indices of infrequent + categories. Otherwise None. + """ + if isinstance(self.min_frequency, numbers.Integral): + infrequent_mask = category_count < self.min_frequency + elif isinstance(self.min_frequency, numbers.Real): + min_frequency_abs = n_samples * self.min_frequency + infrequent_mask = category_count < min_frequency_abs + else: + infrequent_mask = np.zeros(category_count.shape[0], dtype=bool) + + n_current_features = category_count.size - infrequent_mask.sum() + 1 + if self.max_categories is not None and self.max_categories < n_current_features: + # max_categories includes the one infrequent category + frequent_category_count = self.max_categories - 1 + if frequent_category_count == 0: + # All categories are infrequent + infrequent_mask[:] = True + else: + # stable sort to preserve original count order + smallest_levels = np.argsort(category_count, kind="mergesort")[ + :-frequent_category_count + ] + infrequent_mask[smallest_levels] = True + + output = np.flatnonzero(infrequent_mask) + return output if output.size > 0 else None + + def _fit_infrequent_category_mapping( + self, n_samples, category_counts, missing_indices + ): + """Fit infrequent categories. + + Defines the private attribute: `_default_to_infrequent_mappings`. 
For + feature `i`, `_default_to_infrequent_mappings[i]` defines the mapping + from the integer encoding returned by `super().transform()` into + infrequent categories. If `_default_to_infrequent_mappings[i]` is None, + there were no infrequent categories in the training set. + + For example if categories 0, 2 and 4 were frequent, while categories + 1, 3, 5 were infrequent for feature 7, then these categories are mapped + to a single output: + `_default_to_infrequent_mappings[7] = array([0, 3, 1, 3, 2, 3])` + + Defines private attribute: `_infrequent_indices`. `_infrequent_indices[i]` + is an array of indices such that + `categories_[i][_infrequent_indices[i]]` are all the infrequent category + labels. If the feature `i` has no infrequent categories + `_infrequent_indices[i]` is None. + + .. versionadded:: 1.1 + + Parameters + ---------- + n_samples : int + Number of samples in training set. + category_counts: list of ndarray + `category_counts[i]` is the category counts corresponding to + `self.categories_[i]`. + missing_indices : dict + Dict mapping from feature_idx to category index with a missing value. + """ + # Remove missing value from counts, so it is not considered as infrequent + if missing_indices: + category_counts_ = [] + for feature_idx, count in enumerate(category_counts): + if feature_idx in missing_indices: + category_counts_.append( + np.delete(count, missing_indices[feature_idx]) + ) + else: + category_counts_.append(count) + else: + category_counts_ = category_counts + + self._infrequent_indices = [ + self._identify_infrequent(category_count, n_samples, col_idx) + for col_idx, category_count in enumerate(category_counts_) + ] + + # compute mapping from default mapping to infrequent mapping + self._default_to_infrequent_mappings = [] + + for feature_idx, infreq_idx in enumerate(self._infrequent_indices): + cats = self.categories_[feature_idx] + # no infrequent categories + if infreq_idx is None: + self._default_to_infrequent_mappings.append(None) + continue + + n_cats = len(cats) + if feature_idx in missing_indices: + # Missing index was removed from this category when computing + # infrequent indices, thus we need to decrease the number of + # total categories when considering the infrequent mapping. + n_cats -= 1 + + # infrequent indices exist + mapping = np.empty(n_cats, dtype=np.int64) + n_infrequent_cats = infreq_idx.size + + # infrequent categories are mapped to the last element. + n_frequent_cats = n_cats - n_infrequent_cats + mapping[infreq_idx] = n_frequent_cats + + frequent_indices = np.setdiff1d(np.arange(n_cats), infreq_idx) + mapping[frequent_indices] = np.arange(n_frequent_cats) + + self._default_to_infrequent_mappings.append(mapping) + + def _map_infrequent_categories(self, X_int, X_mask, ignore_category_indices): + """Map infrequent categories to integer representing the infrequent category. + + This modifies X_int in-place. Values that were invalid based on `X_mask` + are mapped to the infrequent category if there was an infrequent + category for that feature. + + Parameters + ---------- + X_int: ndarray of shape (n_samples, n_features) + Integer encoded categories. + + X_mask: ndarray of shape (n_samples, n_features) + Bool mask for valid values in `X_int`. + + ignore_category_indices : dict + Dictionary mapping from feature_idx to category index to ignore. + Ignored indexes will not be grouped and the original ordinal encoding + will remain. 
+ """ + if not self._infrequent_enabled: + return + + ignore_category_indices = ignore_category_indices or {} + + for col_idx in range(X_int.shape[1]): + infrequent_idx = self._infrequent_indices[col_idx] + if infrequent_idx is None: + continue + + X_int[~X_mask[:, col_idx], col_idx] = infrequent_idx[0] + if self.handle_unknown == "infrequent_if_exist": + # All the unknown values are now mapped to the + # infrequent_idx[0], which makes the unknown values valid + # This is needed in `transform` when the encoding is formed + # using `X_mask`. + X_mask[:, col_idx] = True + + # Remaps encoding in `X_int` where the infrequent categories are + # grouped together. + for i, mapping in enumerate(self._default_to_infrequent_mappings): + if mapping is None: + continue + + if i in ignore_category_indices: + # Update rows that are **not** ignored + rows_to_update = X_int[:, i] != ignore_category_indices[i] + else: + rows_to_update = slice(None) + + X_int[rows_to_update, i] = np.take(mapping, X_int[rows_to_update, i]) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.categorical = True + tags.input_tags.allow_nan = True + return tags + + +class OneHotEncoder(_BaseEncoder): + """ + Encode categorical features as a one-hot numeric array. + + The input to this transformer should be an array-like of integers or + strings, denoting the values taken on by categorical (discrete) features. + The features are encoded using a one-hot (aka 'one-of-K' or 'dummy') + encoding scheme. This creates a binary column for each category and + returns a sparse matrix or dense array (depending on the ``sparse_output`` + parameter). + + By default, the encoder derives the categories based on the unique values + in each feature. Alternatively, you can also specify the `categories` + manually. + + This encoding is needed for feeding categorical data to many scikit-learn + estimators, notably linear models and SVMs with the standard kernels. + + Note: a one-hot encoding of y labels should use a LabelBinarizer + instead. + + Read more in the :ref:`User Guide `. + For a comparison of different encoders, refer to: + :ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder.py`. + + Parameters + ---------- + categories : 'auto' or a list of array-like, default='auto' + Categories (unique values) per feature: + + - 'auto' : Determine categories automatically from the training data. + - list : ``categories[i]`` holds the categories expected in the ith + column. The passed categories should not mix strings and numeric + values within a single feature, and should be sorted in case of + numeric values. + + The used categories can be found in the ``categories_`` attribute. + + .. versionadded:: 0.20 + + drop : {'first', 'if_binary'} or an array-like of shape (n_features,), \ + default=None + Specifies a methodology to use to drop one of the categories per + feature. This is useful in situations where perfectly collinear + features cause problems, such as when feeding the resulting data + into an unregularized linear regression model. + + However, dropping one category breaks the symmetry of the original + representation and can therefore induce a bias in downstream models, + for instance for penalized linear classification or regression models. + + - None : retain all features (the default). + - 'first' : drop the first category in each feature. If only one + category is present, the feature will be dropped entirely. + - 'if_binary' : drop the first category in each feature with two + categories. 
Features with 1 or more than 2 categories are + left intact. + - array : ``drop[i]`` is the category in feature ``X[:, i]`` that + should be dropped. + + When `max_categories` or `min_frequency` is configured to group + infrequent categories, the dropping behavior is handled after the + grouping. + + .. versionadded:: 0.21 + The parameter `drop` was added in 0.21. + + .. versionchanged:: 0.23 + The option `drop='if_binary'` was added in 0.23. + + .. versionchanged:: 1.1 + Support for dropping infrequent categories. + + sparse_output : bool, default=True + When ``True``, it returns a :class:`scipy.sparse.csr_matrix`, + i.e. a sparse matrix in "Compressed Sparse Row" (CSR) format. + + .. versionadded:: 1.2 + `sparse` was renamed to `sparse_output` + + dtype : number type, default=np.float64 + Desired dtype of output. + + handle_unknown : {'error', 'ignore', 'infrequent_if_exist', 'warn'}, \ + default='error' + Specifies the way unknown categories are handled during :meth:`transform`. + + - 'error' : Raise an error if an unknown category is present during transform. + - 'ignore' : When an unknown category is encountered during + transform, the resulting one-hot encoded columns for this feature + will be all zeros. In the inverse transform, an unknown category + will be denoted as None. + - 'infrequent_if_exist' : When an unknown category is encountered + during transform, the resulting one-hot encoded columns for this + feature will map to the infrequent category if it exists. The + infrequent category will be mapped to the last position in the + encoding. During inverse transform, an unknown category will be + mapped to the category denoted `'infrequent'` if it exists. If the + `'infrequent'` category does not exist, then :meth:`transform` and + :meth:`inverse_transform` will handle an unknown category as with + `handle_unknown='ignore'`. Infrequent categories exist based on + `min_frequency` and `max_categories`. Read more in the + :ref:`User Guide `. + - 'warn' : When an unknown category is encountered during transform + a warning is issued, and the encoding then proceeds as described for + `handle_unknown="infrequent_if_exist"`. + + .. versionchanged:: 1.1 + `'infrequent_if_exist'` was added to automatically handle unknown + categories and infrequent categories. + + .. versionadded:: 1.6 + The option `"warn"` was added in 1.6. + + min_frequency : int or float, default=None + Specifies the minimum frequency below which a category will be + considered infrequent. + + - If `int`, categories with a smaller cardinality will be considered + infrequent. + + - If `float`, categories with a smaller cardinality than + `min_frequency * n_samples` will be considered infrequent. + + .. versionadded:: 1.1 + Read more in the :ref:`User Guide `. + + max_categories : int, default=None + Specifies an upper limit to the number of output features for each input + feature when considering infrequent categories. If there are infrequent + categories, `max_categories` includes the category representing the + infrequent categories along with the frequent categories. If `None`, + there is no limit to the number of output features. + + .. versionadded:: 1.1 + Read more in the :ref:`User Guide `. + + feature_name_combiner : "concat" or callable, default="concat" + Callable with signature `def callable(input_feature, category)` that returns a + string. This is used to create feature names to be returned by + :meth:`get_feature_names_out`. 
+ + `"concat"` concatenates encoded feature name and category with + `feature + "_" + str(category)`.E.g. feature X with values 1, 6, 7 create + feature names `X_1, X_6, X_7`. + + .. versionadded:: 1.3 + + Attributes + ---------- + categories_ : list of arrays + The categories of each feature determined during fitting + (in order of the features in X and corresponding with the output + of ``transform``). This includes the category specified in ``drop`` + (if any). + + drop_idx_ : array of shape (n_features,) + - ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category + to be dropped for each feature. + - ``drop_idx_[i] = None`` if no category is to be dropped from the + feature with index ``i``, e.g. when `drop='if_binary'` and the + feature isn't binary. + - ``drop_idx_ = None`` if all the transformed features will be + retained. + + If infrequent categories are enabled by setting `min_frequency` or + `max_categories` to a non-default value and `drop_idx[i]` corresponds + to a infrequent category, then the entire infrequent category is + dropped. + + .. versionchanged:: 0.23 + Added the possibility to contain `None` values. + + infrequent_categories_ : list of ndarray + Defined only if infrequent categories are enabled by setting + `min_frequency` or `max_categories` to a non-default value. + `infrequent_categories_[i]` are the infrequent categories for feature + `i`. If the feature `i` has no infrequent categories + `infrequent_categories_[i]` is None. + + .. versionadded:: 1.1 + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 1.0 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + feature_name_combiner : callable or None + Callable with signature `def callable(input_feature, category)` that returns a + string. This is used to create feature names to be returned by + :meth:`get_feature_names_out`. + + .. versionadded:: 1.3 + + See Also + -------- + OrdinalEncoder : Performs an ordinal (integer) + encoding of the categorical features. + TargetEncoder : Encodes categorical features using the target. + sklearn.feature_extraction.DictVectorizer : Performs a one-hot encoding of + dictionary items (also handles string-valued features). + sklearn.feature_extraction.FeatureHasher : Performs an approximate one-hot + encoding of dictionary items or strings. + LabelBinarizer : Binarizes labels in a one-vs-all + fashion. + MultiLabelBinarizer : Transforms between iterable of + iterables and a multilabel format, e.g. a (samples x classes) binary + matrix indicating the presence of a class label. + + Examples + -------- + Given a dataset with two features, we let the encoder find the unique + values per feature and transform the data to a binary one-hot encoding. 
+ + >>> from sklearn.preprocessing import OneHotEncoder + + One can discard categories not seen during `fit`: + + >>> enc = OneHotEncoder(handle_unknown='ignore') + >>> X = [['Male', 1], ['Female', 3], ['Female', 2]] + >>> enc.fit(X) + OneHotEncoder(handle_unknown='ignore') + >>> enc.categories_ + [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)] + >>> enc.transform([['Female', 1], ['Male', 4]]).toarray() + array([[1., 0., 1., 0., 0.], + [0., 1., 0., 0., 0.]]) + >>> enc.inverse_transform([[0, 1, 1, 0, 0], [0, 0, 0, 1, 0]]) + array([['Male', 1], + [None, 2]], dtype=object) + >>> enc.get_feature_names_out(['gender', 'group']) + array(['gender_Female', 'gender_Male', 'group_1', 'group_2', 'group_3'], ...) + + One can always drop the first column for each feature: + + >>> drop_enc = OneHotEncoder(drop='first').fit(X) + >>> drop_enc.categories_ + [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)] + >>> drop_enc.transform([['Female', 1], ['Male', 2]]).toarray() + array([[0., 0., 0.], + [1., 1., 0.]]) + + Or drop a column for feature only having 2 categories: + + >>> drop_binary_enc = OneHotEncoder(drop='if_binary').fit(X) + >>> drop_binary_enc.transform([['Female', 1], ['Male', 2]]).toarray() + array([[0., 1., 0., 0.], + [1., 0., 1., 0.]]) + + One can change the way feature names are created. + + >>> def custom_combiner(feature, category): + ... return str(feature) + "_" + type(category).__name__ + "_" + str(category) + >>> custom_fnames_enc = OneHotEncoder(feature_name_combiner=custom_combiner).fit(X) + >>> custom_fnames_enc.get_feature_names_out() + array(['x0_str_Female', 'x0_str_Male', 'x1_int_1', 'x1_int_2', 'x1_int_3'], + dtype=object) + + Infrequent categories are enabled by setting `max_categories` or `min_frequency`. + + >>> import numpy as np + >>> X = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object).T + >>> ohe = OneHotEncoder(max_categories=3, sparse_output=False).fit(X) + >>> ohe.infrequent_categories_ + [array(['a', 'd'], dtype=object)] + >>> ohe.transform([["a"], ["b"]]) + array([[0., 0., 1.], + [1., 0., 0.]]) + """ + + _parameter_constraints: dict = { + "categories": [StrOptions({"auto"}), list], + "drop": [StrOptions({"first", "if_binary"}), "array-like", None], + "dtype": "no_validation", # validation delegated to numpy + "handle_unknown": [ + StrOptions({"error", "ignore", "infrequent_if_exist", "warn"}) + ], + "max_categories": [Interval(Integral, 1, None, closed="left"), None], + "min_frequency": [ + Interval(Integral, 1, None, closed="left"), + Interval(RealNotInt, 0, 1, closed="neither"), + None, + ], + "sparse_output": ["boolean"], + "feature_name_combiner": [StrOptions({"concat"}), callable], + } + + def __init__( + self, + *, + categories="auto", + drop=None, + sparse_output=True, + dtype=np.float64, + handle_unknown="error", + min_frequency=None, + max_categories=None, + feature_name_combiner="concat", + ): + self.categories = categories + self.sparse_output = sparse_output + self.dtype = dtype + self.handle_unknown = handle_unknown + self.drop = drop + self.min_frequency = min_frequency + self.max_categories = max_categories + self.feature_name_combiner = feature_name_combiner + + def _map_drop_idx_to_infrequent(self, feature_idx, drop_idx): + """Convert `drop_idx` into the index for infrequent categories. + + If there are no infrequent categories, then `drop_idx` is + returned. This method is called in `_set_drop_idx` when the `drop` + parameter is an array-like. 
+ """ + if not self._infrequent_enabled: + return drop_idx + + default_to_infrequent = self._default_to_infrequent_mappings[feature_idx] + if default_to_infrequent is None: + return drop_idx + + # Raise error when explicitly dropping a category that is infrequent + infrequent_indices = self._infrequent_indices[feature_idx] + if infrequent_indices is not None and drop_idx in infrequent_indices: + categories = self.categories_[feature_idx] + raise ValueError( + f"Unable to drop category {categories[drop_idx].item()!r} from" + f" feature {feature_idx} because it is infrequent" + ) + return default_to_infrequent[drop_idx] + + def _set_drop_idx(self): + """Compute the drop indices associated with `self.categories_`. + + If `self.drop` is: + - `None`, No categories have been dropped. + - `'first'`, All zeros to drop the first category. + - `'if_binary'`, All zeros if the category is binary and `None` + otherwise. + - array-like, The indices of the categories that match the + categories in `self.drop`. If the dropped category is an infrequent + category, then the index for the infrequent category is used. This + means that the entire infrequent category is dropped. + + This methods defines a public `drop_idx_` and a private + `_drop_idx_after_grouping`. + + - `drop_idx_`: Public facing API that references the drop category in + `self.categories_`. + - `_drop_idx_after_grouping`: Used internally to drop categories *after* the + infrequent categories are grouped together. + + If there are no infrequent categories or drop is `None`, then + `drop_idx_=_drop_idx_after_grouping`. + """ + if self.drop is None: + drop_idx_after_grouping = None + elif isinstance(self.drop, str): + if self.drop == "first": + drop_idx_after_grouping = np.zeros(len(self.categories_), dtype=object) + elif self.drop == "if_binary": + n_features_out_no_drop = [len(cat) for cat in self.categories_] + if self._infrequent_enabled: + for i, infreq_idx in enumerate(self._infrequent_indices): + if infreq_idx is None: + continue + n_features_out_no_drop[i] -= infreq_idx.size - 1 + + drop_idx_after_grouping = np.array( + [ + 0 if n_features_out == 2 else None + for n_features_out in n_features_out_no_drop + ], + dtype=object, + ) + + else: + drop_array = np.asarray(self.drop, dtype=object) + droplen = len(drop_array) + + if droplen != len(self.categories_): + msg = ( + "`drop` should have length equal to the number " + "of features ({}), got {}" + ) + raise ValueError(msg.format(len(self.categories_), droplen)) + missing_drops = [] + drop_indices = [] + for feature_idx, (drop_val, cat_list) in enumerate( + zip(drop_array, self.categories_) + ): + if not is_scalar_nan(drop_val): + drop_idx = np.where(cat_list == drop_val)[0] + if drop_idx.size: # found drop idx + drop_indices.append( + self._map_drop_idx_to_infrequent(feature_idx, drop_idx[0]) + ) + else: + missing_drops.append((feature_idx, drop_val)) + continue + + # drop_val is nan, find nan in categories manually + if is_scalar_nan(cat_list[-1]): + drop_indices.append( + self._map_drop_idx_to_infrequent(feature_idx, cat_list.size - 1) + ) + else: # nan is missing + missing_drops.append((feature_idx, drop_val)) + + if any(missing_drops): + msg = ( + "The following categories were supposed to be " + "dropped, but were not found in the training " + "data.\n{}".format( + "\n".join( + [ + "Category: {}, Feature: {}".format(c, v) + for c, v in missing_drops + ] + ) + ) + ) + raise ValueError(msg) + drop_idx_after_grouping = np.array(drop_indices, dtype=object) + + # 
`_drop_idx_after_grouping` are the categories to drop *after* the infrequent + # categories are grouped together. If needed, we remap `drop_idx` back + # to the categories seen in `self.categories_`. + self._drop_idx_after_grouping = drop_idx_after_grouping + + if not self._infrequent_enabled or drop_idx_after_grouping is None: + self.drop_idx_ = self._drop_idx_after_grouping + else: + drop_idx_ = [] + for feature_idx, drop_idx in enumerate(drop_idx_after_grouping): + default_to_infrequent = self._default_to_infrequent_mappings[ + feature_idx + ] + if drop_idx is None or default_to_infrequent is None: + orig_drop_idx = drop_idx + else: + orig_drop_idx = np.flatnonzero(default_to_infrequent == drop_idx)[0] + + drop_idx_.append(orig_drop_idx) + + self.drop_idx_ = np.asarray(drop_idx_, dtype=object) + + def _compute_transformed_categories(self, i, remove_dropped=True): + """Compute the transformed categories used for column `i`. + + 1. If there are infrequent categories, the category is named + 'infrequent_sklearn'. + 2. Dropped columns are removed when remove_dropped=True. + """ + cats = self.categories_[i] + + if self._infrequent_enabled: + infreq_map = self._default_to_infrequent_mappings[i] + if infreq_map is not None: + frequent_mask = infreq_map < infreq_map.max() + infrequent_cat = "infrequent_sklearn" + # infrequent category is always at the end + cats = np.concatenate( + (cats[frequent_mask], np.array([infrequent_cat], dtype=object)) + ) + + if remove_dropped: + cats = self._remove_dropped_categories(cats, i) + return cats + + def _remove_dropped_categories(self, categories, i): + """Remove dropped categories.""" + if ( + self._drop_idx_after_grouping is not None + and self._drop_idx_after_grouping[i] is not None + ): + return np.delete(categories, self._drop_idx_after_grouping[i]) + return categories + + def _compute_n_features_outs(self): + """Compute the n_features_out for each input feature.""" + output = [len(cats) for cats in self.categories_] + + if self._drop_idx_after_grouping is not None: + for i, drop_idx in enumerate(self._drop_idx_after_grouping): + if drop_idx is not None: + output[i] -= 1 + + if not self._infrequent_enabled: + return output + + # infrequent is enabled, the number of features out are reduced + # because the infrequent categories are grouped together + for i, infreq_idx in enumerate(self._infrequent_indices): + if infreq_idx is None: + continue + output[i] -= infreq_idx.size - 1 + + return output + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """ + Fit OneHotEncoder to X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to determine the categories of each feature. + + y : None + Ignored. This parameter exists only for compatibility with + :class:`~sklearn.pipeline.Pipeline`. + + Returns + ------- + self + Fitted encoder. + """ + self._fit( + X, + handle_unknown=self.handle_unknown, + ensure_all_finite="allow-nan", + ) + self._set_drop_idx() + self._n_features_outs = self._compute_n_features_outs() + return self + + def transform(self, X): + """ + Transform X using one-hot encoding. + + If `sparse_output=True` (default), it returns an instance of + :class:`scipy.sparse._csr.csr_matrix` (CSR format). + + If there are infrequent categories for a feature, set by specifying + `max_categories` or `min_frequency`, the infrequent categories are + grouped into a single category. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to encode. 
+ + Returns + ------- + X_out : {ndarray, sparse matrix} of shape \ + (n_samples, n_encoded_features) + Transformed input. If `sparse_output=True`, a sparse matrix will be + returned. + """ + check_is_fitted(self) + transform_output = _get_output_config("transform", estimator=self)["dense"] + if transform_output != "default" and self.sparse_output: + capitalize_transform_output = transform_output.capitalize() + raise ValueError( + f"{capitalize_transform_output} output does not support sparse data." + f" Set sparse_output=False to output {transform_output} dataframes or" + f" disable {capitalize_transform_output} output via" + '` ohe.set_output(transform="default").' + ) + + # validation of X happens in _check_X called by _transform + if self.handle_unknown == "warn": + warn_on_unknown, handle_unknown = True, "infrequent_if_exist" + else: + warn_on_unknown = self.drop is not None and self.handle_unknown in { + "ignore", + "infrequent_if_exist", + } + handle_unknown = self.handle_unknown + X_int, X_mask = self._transform( + X, + handle_unknown=handle_unknown, + ensure_all_finite="allow-nan", + warn_on_unknown=warn_on_unknown, + ) + + n_samples, n_features = X_int.shape + + if self._drop_idx_after_grouping is not None: + to_drop = self._drop_idx_after_grouping.copy() + # We remove all the dropped categories from mask, and decrement all + # categories that occur after them to avoid an empty column. + keep_cells = X_int != to_drop + for i, cats in enumerate(self.categories_): + # drop='if_binary' but feature isn't binary + if to_drop[i] is None: + # set to cardinality to not drop from X_int + to_drop[i] = len(cats) + + to_drop = to_drop.reshape(1, -1) + X_int[X_int > to_drop] -= 1 + X_mask &= keep_cells + + mask = X_mask.ravel() + feature_indices = np.cumsum([0] + self._n_features_outs) + indices = (X_int + feature_indices[:-1]).ravel()[mask] + + indptr = np.empty(n_samples + 1, dtype=int) + indptr[0] = 0 + np.sum(X_mask, axis=1, out=indptr[1:], dtype=indptr.dtype) + np.cumsum(indptr[1:], out=indptr[1:]) + data = np.ones(indptr[-1]) + + out = sparse.csr_matrix( + (data, indices, indptr), + shape=(n_samples, feature_indices[-1]), + dtype=self.dtype, + ) + if not self.sparse_output: + return out.toarray() + else: + return out + + def inverse_transform(self, X): + """ + Convert the data back to the original representation. + + When unknown categories are encountered (all zeros in the + one-hot encoding), ``None`` is used to represent this category. If the + feature with the unknown category has a dropped category, the dropped + category will be its inverse. + + For a given input feature, if there is an infrequent category, + 'infrequent_sklearn' will be used to represent the infrequent category. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape \ + (n_samples, n_encoded_features) + The transformed data. + + Returns + ------- + X_original : ndarray of shape (n_samples, n_features) + Inverse transformed array. + """ + check_is_fitted(self) + X = check_array(X, accept_sparse="csr") + + n_samples, _ = X.shape + n_features = len(self.categories_) + + n_features_out = np.sum(self._n_features_outs) + + # validate shape of passed X + msg = ( + "Shape of the passed X data is not correct. Expected {0} columns, got {1}." 
+ ) + if X.shape[1] != n_features_out: + raise ValueError(msg.format(n_features_out, X.shape[1])) + + transformed_features = [ + self._compute_transformed_categories(i, remove_dropped=False) + for i, _ in enumerate(self.categories_) + ] + + # create resulting array of appropriate dtype + dt = np.result_type(*[cat.dtype for cat in transformed_features]) + X_tr = np.empty((n_samples, n_features), dtype=dt) + + j = 0 + found_unknown = {} + + if self._infrequent_enabled: + infrequent_indices = self._infrequent_indices + else: + infrequent_indices = [None] * n_features + + for i in range(n_features): + cats_wo_dropped = self._remove_dropped_categories( + transformed_features[i], i + ) + n_categories = cats_wo_dropped.shape[0] + + # Only happens if there was a column with a unique + # category. In this case we just fill the column with this + # unique category value. + if n_categories == 0: + X_tr[:, i] = self.categories_[i][self._drop_idx_after_grouping[i]] + j += n_categories + continue + sub = X[:, j : j + n_categories] + # for sparse X argmax returns 2D matrix, ensure 1D array + labels = np.asarray(sub.argmax(axis=1)).flatten() + X_tr[:, i] = cats_wo_dropped[labels] + + if self.handle_unknown == "ignore" or ( + self.handle_unknown in ("infrequent_if_exist", "warn") + and infrequent_indices[i] is None + ): + unknown = np.asarray(sub.sum(axis=1) == 0).flatten() + # ignored unknown categories: we have a row of all zero + if unknown.any(): + # if categories were dropped then unknown categories will + # be mapped to the dropped category + if ( + self._drop_idx_after_grouping is None + or self._drop_idx_after_grouping[i] is None + ): + found_unknown[i] = unknown + else: + X_tr[unknown, i] = self.categories_[i][ + self._drop_idx_after_grouping[i] + ] + else: + dropped = np.asarray(sub.sum(axis=1) == 0).flatten() + if dropped.any(): + if self._drop_idx_after_grouping is None: + all_zero_samples = np.flatnonzero(dropped) + raise ValueError( + f"Samples {all_zero_samples} can not be inverted " + "when drop=None and handle_unknown='error' " + "because they contain all zeros" + ) + # we can safely assume that all of the nulls in each column + # are the dropped value + drop_idx = self._drop_idx_after_grouping[i] + X_tr[dropped, i] = transformed_features[i][drop_idx] + + j += n_categories + + # if ignored are found: potentially need to upcast result to + # insert None values + if found_unknown: + if X_tr.dtype != object: + X_tr = X_tr.astype(object) + + for idx, mask in found_unknown.items(): + X_tr[mask, idx] = None + + return X_tr + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. + + - If `input_features` is `None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then the following input feature names are generated: + `["x0", "x1", ..., "x(n_features_in_ - 1)"]`. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. 
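+ When infrequent categories are grouped together, they appear under the
+ single name ``'infrequent_sklearn'``.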
+ """ + check_is_fitted(self) + input_features = _check_feature_names_in(self, input_features) + cats = [ + self._compute_transformed_categories(i) + for i, _ in enumerate(self.categories_) + ] + + name_combiner = self._check_get_feature_name_combiner() + feature_names = [] + for i in range(len(cats)): + names = [name_combiner(input_features[i], t) for t in cats[i]] + feature_names.extend(names) + + return np.array(feature_names, dtype=object) + + def _check_get_feature_name_combiner(self): + if self.feature_name_combiner == "concat": + return lambda feature, category: feature + "_" + str(category) + else: # callable + dry_run_combiner = self.feature_name_combiner("feature", "category") + if not isinstance(dry_run_combiner, str): + raise TypeError( + "When `feature_name_combiner` is a callable, it should return a " + f"Python string. Got {type(dry_run_combiner)} instead." + ) + return self.feature_name_combiner + + +class OrdinalEncoder(OneToOneFeatureMixin, _BaseEncoder): + """ + Encode categorical features as an integer array. + + The input to this transformer should be an array-like of integers or + strings, denoting the values taken on by categorical (discrete) features. + The features are converted to ordinal integers. This results in + a single column of integers (0 to n_categories - 1) per feature. + + Read more in the :ref:`User Guide `. + For a comparison of different encoders, refer to: + :ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder.py`. + + .. versionadded:: 0.20 + + Parameters + ---------- + categories : 'auto' or a list of array-like, default='auto' + Categories (unique values) per feature: + + - 'auto' : Determine categories automatically from the training data. + - list : ``categories[i]`` holds the categories expected in the ith + column. The passed categories should not mix strings and numeric + values, and should be sorted in case of numeric values. + + The used categories can be found in the ``categories_`` attribute. + + dtype : number type, default=np.float64 + Desired dtype of output. + + handle_unknown : {'error', 'use_encoded_value'}, default='error' + When set to 'error' an error will be raised in case an unknown + categorical feature is present during transform. When set to + 'use_encoded_value', the encoded value of unknown categories will be + set to the value given for the parameter `unknown_value`. In + :meth:`inverse_transform`, an unknown category will be denoted as None. + + .. versionadded:: 0.24 + + unknown_value : int or np.nan, default=None + When the parameter handle_unknown is set to 'use_encoded_value', this + parameter is required and will set the encoded value of unknown + categories. It has to be distinct from the values used to encode any of + the categories in `fit`. If set to np.nan, the `dtype` parameter must + be a float dtype. + + .. versionadded:: 0.24 + + encoded_missing_value : int or np.nan, default=np.nan + Encoded value of missing categories. If set to `np.nan`, then the `dtype` + parameter must be a float dtype. + + .. versionadded:: 1.1 + + min_frequency : int or float, default=None + Specifies the minimum frequency below which a category will be + considered infrequent. + + - If `int`, categories with a smaller cardinality will be considered + infrequent. + + - If `float`, categories with a smaller cardinality than + `min_frequency * n_samples` will be considered infrequent. + + .. versionadded:: 1.3 + Read more in the :ref:`User Guide `. 
+ + max_categories : int, default=None + Specifies an upper limit to the number of output categories for each input + feature when considering infrequent categories. If there are infrequent + categories, `max_categories` includes the category representing the + infrequent categories along with the frequent categories. If `None`, + there is no limit to the number of output features. + + `max_categories` do **not** take into account missing or unknown + categories. Setting `unknown_value` or `encoded_missing_value` to an + integer will increase the number of unique integer codes by one each. + This can result in up to `max_categories + 2` integer codes. + + .. versionadded:: 1.3 + Read more in the :ref:`User Guide `. + + Attributes + ---------- + categories_ : list of arrays + The categories of each feature determined during ``fit`` (in order of + the features in X and corresponding with the output of ``transform``). + This does not include categories that weren't seen during ``fit``. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 1.0 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + infrequent_categories_ : list of ndarray + Defined only if infrequent categories are enabled by setting + `min_frequency` or `max_categories` to a non-default value. + `infrequent_categories_[i]` are the infrequent categories for feature + `i`. If the feature `i` has no infrequent categories + `infrequent_categories_[i]` is None. + + .. versionadded:: 1.3 + + See Also + -------- + OneHotEncoder : Performs a one-hot encoding of categorical features. This encoding + is suitable for low to medium cardinality categorical variables, both in + supervised and unsupervised settings. + TargetEncoder : Encodes categorical features using supervised signal + in a classification or regression pipeline. This encoding is typically + suitable for high cardinality categorical variables. + LabelEncoder : Encodes target labels with values between 0 and + ``n_classes-1``. + + Notes + ----- + With a high proportion of `nan` values, inferring categories becomes slow with + Python versions before 3.10. The handling of `nan` values was improved + from Python 3.10 onwards, (c.f. + `bpo-43475 `_). + + Examples + -------- + Given a dataset with two features, we let the encoder find the unique + values per feature and transform the data to an ordinal encoding. + + >>> from sklearn.preprocessing import OrdinalEncoder + >>> enc = OrdinalEncoder() + >>> X = [['Male', 1], ['Female', 3], ['Female', 2]] + >>> enc.fit(X) + OrdinalEncoder() + >>> enc.categories_ + [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)] + >>> enc.transform([['Female', 3], ['Male', 1]]) + array([[0., 2.], + [1., 0.]]) + + >>> enc.inverse_transform([[1, 0], [0, 1]]) + array([['Male', 1], + ['Female', 2]], dtype=object) + + By default, :class:`OrdinalEncoder` is lenient towards missing values by + propagating them. + + >>> import numpy as np + >>> X = [['Male', 1], ['Female', 3], ['Female', np.nan]] + >>> enc.fit_transform(X) + array([[ 1., 0.], + [ 0., 1.], + [ 0., nan]]) + + You can use the parameter `encoded_missing_value` to encode missing values. + + >>> enc.set_params(encoded_missing_value=-1).fit_transform(X) + array([[ 1., 0.], + [ 0., 1.], + [ 0., -1.]]) + + Infrequent categories are enabled by setting `max_categories` or `min_frequency`. 
+ In the following example, "a" and "d" are considered infrequent and grouped + together into a single category, "b" and "c" are their own categories, unknown + values are encoded as 3 and missing values are encoded as 4. + + >>> X_train = np.array( + ... [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3 + [np.nan]], + ... dtype=object).T + >>> enc = OrdinalEncoder( + ... handle_unknown="use_encoded_value", unknown_value=3, + ... max_categories=3, encoded_missing_value=4) + >>> _ = enc.fit(X_train) + >>> X_test = np.array([["a"], ["b"], ["c"], ["d"], ["e"], [np.nan]], dtype=object) + >>> enc.transform(X_test) + array([[2.], + [0.], + [1.], + [2.], + [3.], + [4.]]) + """ + + _parameter_constraints: dict = { + "categories": [StrOptions({"auto"}), list], + "dtype": "no_validation", # validation delegated to numpy + "encoded_missing_value": [Integral, type(np.nan)], + "handle_unknown": [StrOptions({"error", "use_encoded_value"})], + "unknown_value": [Integral, type(np.nan), None], + "max_categories": [Interval(Integral, 1, None, closed="left"), None], + "min_frequency": [ + Interval(Integral, 1, None, closed="left"), + Interval(RealNotInt, 0, 1, closed="neither"), + None, + ], + } + + def __init__( + self, + *, + categories="auto", + dtype=np.float64, + handle_unknown="error", + unknown_value=None, + encoded_missing_value=np.nan, + min_frequency=None, + max_categories=None, + ): + self.categories = categories + self.dtype = dtype + self.handle_unknown = handle_unknown + self.unknown_value = unknown_value + self.encoded_missing_value = encoded_missing_value + self.min_frequency = min_frequency + self.max_categories = max_categories + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """ + Fit the OrdinalEncoder to X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to determine the categories of each feature. + + y : None + Ignored. This parameter exists only for compatibility with + :class:`~sklearn.pipeline.Pipeline`. + + Returns + ------- + self : object + Fitted encoder. + """ + if self.handle_unknown == "use_encoded_value": + if is_scalar_nan(self.unknown_value): + if np.dtype(self.dtype).kind != "f": + raise ValueError( + "When unknown_value is np.nan, the dtype " + "parameter should be " + f"a float dtype. Got {self.dtype}." + ) + elif not isinstance(self.unknown_value, numbers.Integral): + raise TypeError( + "unknown_value should be an integer or " + "np.nan when " + "handle_unknown is 'use_encoded_value', " + f"got {self.unknown_value}." + ) + elif self.unknown_value is not None: + raise TypeError( + "unknown_value should only be set when " + "handle_unknown is 'use_encoded_value', " + f"got {self.unknown_value}." 
+ ) + + # `_fit` will only raise an error when `self.handle_unknown="error"` + fit_results = self._fit( + X, + handle_unknown=self.handle_unknown, + ensure_all_finite="allow-nan", + return_and_ignore_missing_for_infrequent=True, + ) + self._missing_indices = fit_results["missing_indices"] + + cardinalities = [len(categories) for categories in self.categories_] + if self._infrequent_enabled: + # Cardinality decreases because the infrequent categories are grouped + # together + for feature_idx, infrequent in enumerate(self.infrequent_categories_): + if infrequent is not None: + cardinalities[feature_idx] -= len(infrequent) + + # missing values are not considered part of the cardinality + # when considering unknown categories or encoded_missing_value + for cat_idx, categories_for_idx in enumerate(self.categories_): + if is_scalar_nan(categories_for_idx[-1]): + cardinalities[cat_idx] -= 1 + + if self.handle_unknown == "use_encoded_value": + for cardinality in cardinalities: + if 0 <= self.unknown_value < cardinality: + raise ValueError( + "The used value for unknown_value " + f"{self.unknown_value} is one of the " + "values already used for encoding the " + "seen categories." + ) + + if self._missing_indices: + if np.dtype(self.dtype).kind != "f" and is_scalar_nan( + self.encoded_missing_value + ): + raise ValueError( + "There are missing values in features " + f"{list(self._missing_indices)}. For OrdinalEncoder to " + f"encode missing values with dtype: {self.dtype}, set " + "encoded_missing_value to a non-nan value, or " + "set dtype to a float" + ) + + if not is_scalar_nan(self.encoded_missing_value): + # Features are invalid when they contain a missing category + # and encoded_missing_value was already used to encode a + # known category + invalid_features = [ + cat_idx + for cat_idx, cardinality in enumerate(cardinalities) + if cat_idx in self._missing_indices + and 0 <= self.encoded_missing_value < cardinality + ] + + if invalid_features: + # Use feature names if they are available + if hasattr(self, "feature_names_in_"): + invalid_features = self.feature_names_in_[invalid_features] + raise ValueError( + f"encoded_missing_value ({self.encoded_missing_value}) " + "is already used to encode a known category in features: " + f"{invalid_features}" + ) + + return self + + def transform(self, X): + """ + Transform X to ordinal codes. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to encode. + + Returns + ------- + X_out : ndarray of shape (n_samples, n_features) + Transformed input. + """ + check_is_fitted(self, "categories_") + X_int, X_mask = self._transform( + X, + handle_unknown=self.handle_unknown, + ensure_all_finite="allow-nan", + ignore_category_indices=self._missing_indices, + ) + X_trans = X_int.astype(self.dtype, copy=False) + + for cat_idx, missing_idx in self._missing_indices.items(): + X_missing_mask = X_int[:, cat_idx] == missing_idx + X_trans[X_missing_mask, cat_idx] = self.encoded_missing_value + + # create separate category for unknown values + if self.handle_unknown == "use_encoded_value": + X_trans[~X_mask] = self.unknown_value + return X_trans + + def inverse_transform(self, X): + """ + Convert the data back to the original representation. + + Parameters + ---------- + X : array-like of shape (n_samples, n_encoded_features) + The transformed data. + + Returns + ------- + X_original : ndarray of shape (n_samples, n_features) + Inverse transformed array. 
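+
+ Notes
+ -----
+ Encodings produced for unknown categories (when
+ ``handle_unknown="use_encoded_value"``) are inverted to ``None``, and
+ encodings of infrequent categories are inverted to the string
+ ``"infrequent_sklearn"``. A small illustrative example:
+
+ >>> import numpy as np
+ >>> from sklearn.preprocessing import OrdinalEncoder
+ >>> enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
+ >>> _ = enc.fit([["a"], ["b"]])
+ >>> enc.inverse_transform(np.array([[0.0], [-1.0]]))
+ array([['a'],
+ [None]], dtype=object)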
+ """ + check_is_fitted(self) + X = check_array(X, ensure_all_finite="allow-nan") + + n_samples, _ = X.shape + n_features = len(self.categories_) + + # validate shape of passed X + msg = ( + "Shape of the passed X data is not correct. Expected {0} columns, got {1}." + ) + if X.shape[1] != n_features: + raise ValueError(msg.format(n_features, X.shape[1])) + + # create resulting array of appropriate dtype + dt = np.result_type(*[cat.dtype for cat in self.categories_]) + X_tr = np.empty((n_samples, n_features), dtype=dt) + + found_unknown = {} + infrequent_masks = {} + + infrequent_indices = getattr(self, "_infrequent_indices", None) + + for i in range(n_features): + labels = X[:, i] + + # replace values of X[:, i] that were nan with actual indices + if i in self._missing_indices: + X_i_mask = _get_mask(labels, self.encoded_missing_value) + labels[X_i_mask] = self._missing_indices[i] + + rows_to_update = slice(None) + categories = self.categories_[i] + + if infrequent_indices is not None and infrequent_indices[i] is not None: + # Compute mask for frequent categories + infrequent_encoding_value = len(categories) - len(infrequent_indices[i]) + infrequent_masks[i] = labels == infrequent_encoding_value + rows_to_update = ~infrequent_masks[i] + + # Remap categories to be only frequent categories. The infrequent + # categories will be mapped to "infrequent_sklearn" later + frequent_categories_mask = np.ones_like(categories, dtype=bool) + frequent_categories_mask[infrequent_indices[i]] = False + categories = categories[frequent_categories_mask] + + if self.handle_unknown == "use_encoded_value": + unknown_labels = _get_mask(labels, self.unknown_value) + found_unknown[i] = unknown_labels + + known_labels = ~unknown_labels + if isinstance(rows_to_update, np.ndarray): + rows_to_update &= known_labels + else: + rows_to_update = known_labels + + labels_int = labels[rows_to_update].astype("int64", copy=False) + X_tr[rows_to_update, i] = categories[labels_int] + + if found_unknown or infrequent_masks: + X_tr = X_tr.astype(object, copy=False) + + # insert None values for unknown values + if found_unknown: + for idx, mask in found_unknown.items(): + X_tr[mask, idx] = None + + if infrequent_masks: + for idx, mask in infrequent_masks.items(): + X_tr[mask, idx] = "infrequent_sklearn" + + return X_tr diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_function_transformer.py b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_function_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..f3530f3284dc941f582acd254f563fb29b3215c1 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_function_transformer.py @@ -0,0 +1,449 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from functools import partial + +import numpy as np + +from ..base import BaseEstimator, TransformerMixin, _fit_context +from ..utils._param_validation import StrOptions +from ..utils._repr_html.estimator import _VisualBlock +from ..utils._set_output import ( + _get_adapter_from_container, + _get_output_config, +) +from ..utils.metaestimators import available_if +from ..utils.validation import ( + _allclose_dense_sparse, + _check_feature_names, + _check_feature_names_in, + _check_n_features, + _get_feature_names, + _is_pandas_df, + _is_polars_df, + check_array, + validate_data, +) + + +def _identity(X): + """The identity function.""" + return X + + +class FunctionTransformer(TransformerMixin, BaseEstimator): + 
"""Constructs a transformer from an arbitrary callable. + + A FunctionTransformer forwards its X (and optionally y) arguments to a + user-defined function or function object and returns the result of this + function. This is useful for stateless transformations such as taking the + log of frequencies, doing custom scaling, etc. + + Note: If a lambda is used as the function, then the resulting + transformer will not be pickleable. + + .. versionadded:: 0.17 + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + func : callable, default=None + The callable to use for the transformation. This will be passed + the same arguments as transform, with args and kwargs forwarded. + If func is None, then func will be the identity function. + + inverse_func : callable, default=None + The callable to use for the inverse transformation. This will be + passed the same arguments as inverse transform, with args and + kwargs forwarded. If inverse_func is None, then inverse_func + will be the identity function. + + validate : bool, default=False + Indicate that the input X array should be checked before calling + ``func``. The possibilities are: + + - If False, there is no input validation. + - If True, then X will be converted to a 2-dimensional NumPy array or + sparse matrix. If the conversion is not possible an exception is + raised. + + .. versionchanged:: 0.22 + The default of ``validate`` changed from True to False. + + accept_sparse : bool, default=False + Indicate that func accepts a sparse matrix as input. If validate is + False, this has no effect. Otherwise, if accept_sparse is false, + sparse matrix inputs will cause an exception to be raised. + + check_inverse : bool, default=True + Whether to check that or ``func`` followed by ``inverse_func`` leads to + the original inputs. It can be used for a sanity check, raising a + warning when the condition is not fulfilled. + + .. versionadded:: 0.20 + + feature_names_out : callable, 'one-to-one' or None, default=None + Determines the list of feature names that will be returned by the + `get_feature_names_out` method. If it is 'one-to-one', then the output + feature names will be equal to the input feature names. If it is a + callable, then it must take two positional arguments: this + `FunctionTransformer` (`self`) and an array-like of input feature names + (`input_features`). It must return an array-like of output feature + names. The `get_feature_names_out` method is only defined if + `feature_names_out` is not None. + + See ``get_feature_names_out`` for more details. + + .. versionadded:: 1.1 + + kw_args : dict, default=None + Dictionary of additional keyword arguments to pass to func. + + .. versionadded:: 0.18 + + inv_kw_args : dict, default=None + Dictionary of additional keyword arguments to pass to inverse_func. + + .. versionadded:: 0.18 + + Attributes + ---------- + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` has feature + names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + MaxAbsScaler : Scale each feature by its maximum absolute value. + StandardScaler : Standardize features by removing the mean and + scaling to unit variance. + LabelBinarizer : Binarize labels in a one-vs-all fashion. + MultiLabelBinarizer : Transform between iterable of iterables + and a multilabel format. 
+
+ Notes
+ -----
+ If `func` returns an output with a `columns` attribute, then the columns are enforced
+ to be consistent with the output of `get_feature_names_out`.
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> from sklearn.preprocessing import FunctionTransformer
+ >>> transformer = FunctionTransformer(np.log1p)
+ >>> X = np.array([[0, 1], [2, 3]])
+ >>> transformer.transform(X)
+ array([[0. , 0.6931],
+ [1.0986, 1.3862]])
+ """
+
+ _parameter_constraints: dict = {
+ "func": [callable, None],
+ "inverse_func": [callable, None],
+ "validate": ["boolean"],
+ "accept_sparse": ["boolean"],
+ "check_inverse": ["boolean"],
+ "feature_names_out": [callable, StrOptions({"one-to-one"}), None],
+ "kw_args": [dict, None],
+ "inv_kw_args": [dict, None],
+ }
+
+ def __init__(
+ self,
+ func=None,
+ inverse_func=None,
+ *,
+ validate=False,
+ accept_sparse=False,
+ check_inverse=True,
+ feature_names_out=None,
+ kw_args=None,
+ inv_kw_args=None,
+ ):
+ self.func = func
+ self.inverse_func = inverse_func
+ self.validate = validate
+ self.accept_sparse = accept_sparse
+ self.check_inverse = check_inverse
+ self.feature_names_out = feature_names_out
+ self.kw_args = kw_args
+ self.inv_kw_args = inv_kw_args
+
+ def _check_input(self, X, *, reset):
+ if self.validate:
+ return validate_data(self, X, accept_sparse=self.accept_sparse, reset=reset)
+ elif reset:
+ # Set feature_names_in_ and n_features_in_ even if validate=False
+ # We run this only when reset==True to store the attributes but not
+ # validate them, because validate=False
+ _check_n_features(self, X, reset=reset)
+ _check_feature_names(self, X, reset=reset)
+ return X
+
+ def _check_inverse_transform(self, X):
+ """Check that func and inverse_func are inverses of each other."""
+ idx_selected = slice(None, None, max(1, X.shape[0] // 100))
+ X_round_trip = self.inverse_transform(self.transform(X[idx_selected]))
+
+ if hasattr(X, "dtype"):
+ dtypes = [X.dtype]
+ elif hasattr(X, "dtypes"):
+ # Dataframes can have multiple dtypes
+ dtypes = X.dtypes
+
+ # Not all dtypes are numpy dtypes, they can be pandas dtypes as well
+ if not all(
+ isinstance(d, np.dtype) and np.issubdtype(d, np.number) for d in dtypes
+ ):
+ raise ValueError(
+ "'check_inverse' is only supported when all the elements in `X` are"
+ " numerical."
+ )
+
+ if not _allclose_dense_sparse(X[idx_selected], X_round_trip):
+ warnings.warn(
+ (
+ "The provided functions are not strictly"
+ " inverse of each other. If you are sure you"
+ " want to proceed regardless, set"
+ " 'check_inverse=False'."
+ ),
+ UserWarning,
+ )
+
+ @_fit_context(prefer_skip_nested_validation=True)
+ def fit(self, X, y=None):
+ """Fit transformer by checking X.
+
+ If ``validate`` is ``True``, ``X`` will be checked.
+
+ Parameters
+ ----------
+ X : {array-like, sparse-matrix} of shape (n_samples, n_features) \
+ if `validate=True` else any object that `func` can handle
+ Input array.
+
+ y : Ignored
+ Not used, present here for API consistency by convention.
+
+ Returns
+ -------
+ self : object
+ FunctionTransformer class instance.
+ """
+ X = self._check_input(X, reset=True)
+ if self.check_inverse and not (self.func is None or self.inverse_func is None):
+ self._check_inverse_transform(X)
+ return self
+
+ def transform(self, X):
+ """Transform X using the forward function.
+
+ Parameters
+ ----------
+ X : {array-like, sparse-matrix} of shape (n_samples, n_features) \
+ if `validate=True` else any object that `func` can handle
+ Input array.
+ + Returns + ------- + X_out : array-like, shape (n_samples, n_features) + Transformed input. + """ + X = self._check_input(X, reset=False) + out = self._transform(X, func=self.func, kw_args=self.kw_args) + output_config = _get_output_config("transform", self)["dense"] + + if hasattr(out, "columns") and self.feature_names_out is not None: + # check the consistency between the column provided by `transform` and + # the column names provided by `get_feature_names_out`. + feature_names_out = self.get_feature_names_out() + if list(out.columns) != list(feature_names_out): + # we can override the column names of the output if it is inconsistent + # with the column names provided by `get_feature_names_out` in the + # following cases: + # * `func` preserved the column names between the input and the output + # * the input column names are all numbers + # * the output is requested to be a DataFrame (pandas or polars) + feature_names_in = getattr( + X, "feature_names_in_", _get_feature_names(X) + ) + same_feature_names_in_out = feature_names_in is not None and list( + feature_names_in + ) == list(out.columns) + not_all_str_columns = not all( + isinstance(col, str) for col in out.columns + ) + if same_feature_names_in_out or not_all_str_columns: + adapter = _get_adapter_from_container(out) + out = adapter.create_container( + X_output=out, + X_original=out, + columns=feature_names_out, + inplace=False, + ) + else: + raise ValueError( + "The output generated by `func` have different column names " + "than the ones provided by `get_feature_names_out`. " + f"Got output with columns names: {list(out.columns)} and " + "`get_feature_names_out` returned: " + f"{list(self.get_feature_names_out())}. " + "The column names can be overridden by setting " + "`set_output(transform='pandas')` or " + "`set_output(transform='polars')` such that the column names " + "are set to the names provided by `get_feature_names_out`." + ) + + if self.feature_names_out is None: + warn_msg = ( + "When `set_output` is configured to be '{0}', `func` should return " + "a {0} DataFrame to follow the `set_output` API or `feature_names_out`" + " should be defined." + ) + if output_config == "pandas" and not _is_pandas_df(out): + warnings.warn(warn_msg.format("pandas")) + elif output_config == "polars" and not _is_polars_df(out): + warnings.warn(warn_msg.format("polars")) + + return out + + def inverse_transform(self, X): + """Transform X using the inverse function. + + Parameters + ---------- + X : {array-like, sparse-matrix} of shape (n_samples, n_features) \ + if `validate=True` else any object that `inverse_func` can handle + Input array. + + Returns + ------- + X_original : array-like, shape (n_samples, n_features) + Transformed input. + """ + if self.validate: + X = check_array(X, accept_sparse=self.accept_sparse) + return self._transform(X, func=self.inverse_func, kw_args=self.inv_kw_args) + + @available_if(lambda self: self.feature_names_out is not None) + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + This method is only defined if `feature_names_out` is not None. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input feature names. + + - If `input_features` is None, then `feature_names_in_` is + used as the input feature names. If `feature_names_in_` is not + defined, then names are generated: + `[x0, x1, ..., x(n_features_in_ - 1)]`. 
+ - If `input_features` is array-like, then `input_features` must
+ match `feature_names_in_` if `feature_names_in_` is defined.
+
+ Returns
+ -------
+ feature_names_out : ndarray of str objects
+ Transformed feature names.
+
+ - If `feature_names_out` is 'one-to-one', the input feature names
+ are returned (see `input_features` above). This requires
+ `feature_names_in_` and/or `n_features_in_` to be defined, which
+ is done automatically if `validate=True`. Alternatively, you can
+ set them in `func`.
+ - If `feature_names_out` is a callable, then it is called with two
+ arguments, `self` and `input_features`, and its return value is
+ returned by this method.
+ """
+ if hasattr(self, "n_features_in_") or input_features is not None:
+ input_features = _check_feature_names_in(self, input_features)
+ if self.feature_names_out == "one-to-one":
+ names_out = input_features
+ elif callable(self.feature_names_out):
+ names_out = self.feature_names_out(self, input_features)
+ else:
+ raise ValueError(
+ f"feature_names_out={self.feature_names_out!r} is invalid. "
+ 'It must either be "one-to-one" or a callable with two '
+ "arguments: the function transformer and an array-like of "
+ "input feature names. The callable must return an array-like "
+ "of output feature names."
+ )
+ return np.asarray(names_out, dtype=object)
+
+ def _transform(self, X, func=None, kw_args=None):
+ if func is None:
+ func = _identity
+
+ return func(X, **(kw_args if kw_args else {}))
+
+ def __sklearn_is_fitted__(self):
+ """Return True since FunctionTransformer is stateless."""
+ return True
+
+ def __sklearn_tags__(self):
+ tags = super().__sklearn_tags__()
+ tags.no_validation = not self.validate
+ tags.requires_fit = False
+ tags.input_tags.sparse = not self.validate or self.accept_sparse
+ return tags
+
+ def set_output(self, *, transform=None):
+ """Set output container.
+
+ See :ref:`sphx_glr_auto_examples_miscellaneous_plot_set_output.py`
+ for an example on how to use the API.
+
+ Parameters
+ ----------
+ transform : {"default", "pandas", "polars"}, default=None
+ Configure output of `transform` and `fit_transform`.
+
+ - `"default"`: Default output format of a transformer
+ - `"pandas"`: DataFrame output
+ - `"polars"`: Polars output
+ - `None`: Transform configuration is unchanged
+
+ .. versionadded:: 1.4
+ `"polars"` option was added.
+
+ Returns
+ -------
+ self : estimator instance
+ Estimator instance.
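+
+ For example, to request pandas output from :meth:`transform` (pandas must be
+ installed when `transform` is later called):
+
+ >>> import numpy as np
+ >>> from sklearn.preprocessing import FunctionTransformer
+ >>> transformer = FunctionTransformer(np.log1p, feature_names_out="one-to-one")
+ >>> _ = transformer.set_output(transform="pandas")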
+ """ + if not hasattr(self, "_sklearn_output_config"): + self._sklearn_output_config = {} + + self._sklearn_output_config["transform"] = transform + return self + + def _get_function_name(self): + """Get the name display of the `func` used in HTML representation.""" + if hasattr(self.func, "__name__"): + return self.func.__name__ + if isinstance(self.func, partial): + return self.func.func.__name__ + return f"{self.func.__class__.__name__}(...)" + + def _sk_visual_block_(self): + return _VisualBlock( + "single", + self, + names=self._get_function_name(), + name_details=str(self), + name_caption="FunctionTransformer", + doc_link_label="FunctionTransformer", + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_label.py b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_label.py new file mode 100644 index 0000000000000000000000000000000000000000..dd721b35a35217bc6cb8badfb8ff66e2bdc15c8e --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_label.py @@ -0,0 +1,963 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import array +import itertools +import warnings +from collections import defaultdict +from numbers import Integral + +import numpy as np +import scipy.sparse as sp + +from ..base import BaseEstimator, TransformerMixin, _fit_context +from ..utils import column_or_1d +from ..utils._array_api import device, get_namespace, xpx +from ..utils._encode import _encode, _unique +from ..utils._param_validation import Interval, validate_params +from ..utils.multiclass import type_of_target, unique_labels +from ..utils.sparsefuncs import min_max_axis +from ..utils.validation import _num_samples, check_array, check_is_fitted + +__all__ = [ + "LabelBinarizer", + "LabelEncoder", + "MultiLabelBinarizer", + "label_binarize", +] + + +class LabelEncoder(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None): + """Encode target labels with value between 0 and n_classes-1. + + This transformer should be used to encode target values, *i.e.* `y`, and + not the input `X`. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.12 + + Attributes + ---------- + classes_ : ndarray of shape (n_classes,) + Holds the label for each class. + + See Also + -------- + OrdinalEncoder : Encode categorical features using an ordinal encoding + scheme. + OneHotEncoder : Encode categorical features as a one-hot numeric array. + + Examples + -------- + `LabelEncoder` can be used to normalize labels. + + >>> from sklearn.preprocessing import LabelEncoder + >>> le = LabelEncoder() + >>> le.fit([1, 2, 2, 6]) + LabelEncoder() + >>> le.classes_ + array([1, 2, 6]) + >>> le.transform([1, 1, 2, 6]) + array([0, 0, 1, 2]...) + >>> le.inverse_transform([0, 0, 1, 2]) + array([1, 1, 2, 6]) + + It can also be used to transform non-numerical labels (as long as they are + hashable and comparable) to numerical labels. + + >>> le = LabelEncoder() + >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) + LabelEncoder() + >>> list(le.classes_) + [np.str_('amsterdam'), np.str_('paris'), np.str_('tokyo')] + >>> le.transform(["tokyo", "tokyo", "paris"]) + array([2, 2, 1]...) + >>> list(le.inverse_transform([2, 2, 1])) + [np.str_('tokyo'), np.str_('tokyo'), np.str_('paris')] + """ + + def fit(self, y): + """Fit label encoder. + + Parameters + ---------- + y : array-like of shape (n_samples,) + Target values. + + Returns + ------- + self : returns an instance of self. + Fitted label encoder. 
+ """ + y = column_or_1d(y, warn=True) + self.classes_ = _unique(y) + return self + + def fit_transform(self, y): + """Fit label encoder and return encoded labels. + + Parameters + ---------- + y : array-like of shape (n_samples,) + Target values. + + Returns + ------- + y : array-like of shape (n_samples,) + Encoded labels. + """ + y = column_or_1d(y, warn=True) + self.classes_, y = _unique(y, return_inverse=True) + return y + + def transform(self, y): + """Transform labels to normalized encoding. + + Parameters + ---------- + y : array-like of shape (n_samples,) + Target values. + + Returns + ------- + y : array-like of shape (n_samples,) + Labels as normalized encodings. + """ + check_is_fitted(self) + xp, _ = get_namespace(y) + y = column_or_1d(y, dtype=self.classes_.dtype, warn=True) + # transform of empty array is empty array + if _num_samples(y) == 0: + return xp.asarray([]) + + return _encode(y, uniques=self.classes_) + + def inverse_transform(self, y): + """Transform labels back to original encoding. + + Parameters + ---------- + y : array-like of shape (n_samples,) + Target values. + + Returns + ------- + y_original : ndarray of shape (n_samples,) + Original encoding. + """ + check_is_fitted(self) + xp, _ = get_namespace(y) + y = column_or_1d(y, warn=True) + # inverse transform of empty array is empty array + if _num_samples(y) == 0: + return xp.asarray([]) + + diff = xpx.setdiff1d( + y, + xp.arange(self.classes_.shape[0], device=device(y)), + xp=xp, + ) + if diff.shape[0]: + raise ValueError("y contains previously unseen labels: %s" % str(diff)) + y = xp.asarray(y) + return xp.take(self.classes_, y, axis=0) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.array_api_support = True + tags.input_tags.two_d_array = False + tags.target_tags.one_d_labels = True + return tags + + +class LabelBinarizer(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None): + """Binarize labels in a one-vs-all fashion. + + Several regression and binary classification algorithms are + available in scikit-learn. A simple way to extend these algorithms + to the multi-class classification case is to use the so-called + one-vs-all scheme. + + At learning time, this simply consists in learning one regressor + or binary classifier per class. In doing so, one needs to convert + multi-class labels to binary labels (belong or does not belong + to the class). `LabelBinarizer` makes this process easy with the + transform method. + + At prediction time, one assigns the class for which the corresponding + model gave the greatest confidence. `LabelBinarizer` makes this easy + with the :meth:`inverse_transform` method. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + neg_label : int, default=0 + Value with which negative labels must be encoded. + + pos_label : int, default=1 + Value with which positive labels must be encoded. + + sparse_output : bool, default=False + True if the returned array from transform is desired to be in sparse + CSR format. + + Attributes + ---------- + classes_ : ndarray of shape (n_classes,) + Holds the label for each class. + + y_type_ : str + Represents the type of the target data as evaluated by + :func:`~sklearn.utils.multiclass.type_of_target`. Possible type are + 'continuous', 'continuous-multioutput', 'binary', 'multiclass', + 'multiclass-multioutput', 'multilabel-indicator', and 'unknown'. + + sparse_input_ : bool + `True` if the input data to transform is given as a sparse matrix, + `False` otherwise. 
+ + See Also + -------- + label_binarize : Function to perform the transform operation of + LabelBinarizer with fixed classes. + OneHotEncoder : Encode categorical features using a one-hot aka one-of-K + scheme. + + Examples + -------- + >>> from sklearn.preprocessing import LabelBinarizer + >>> lb = LabelBinarizer() + >>> lb.fit([1, 2, 6, 4, 2]) + LabelBinarizer() + >>> lb.classes_ + array([1, 2, 4, 6]) + >>> lb.transform([1, 6]) + array([[1, 0, 0, 0], + [0, 0, 0, 1]]) + + Binary targets transform to a column vector + + >>> lb = LabelBinarizer() + >>> lb.fit_transform(['yes', 'no', 'no', 'yes']) + array([[1], + [0], + [0], + [1]]) + + Passing a 2D matrix for multilabel classification + + >>> import numpy as np + >>> lb.fit(np.array([[0, 1, 1], [1, 0, 0]])) + LabelBinarizer() + >>> lb.classes_ + array([0, 1, 2]) + >>> lb.transform([0, 1, 2, 1]) + array([[1, 0, 0], + [0, 1, 0], + [0, 0, 1], + [0, 1, 0]]) + """ + + _parameter_constraints: dict = { + "neg_label": [Integral], + "pos_label": [Integral], + "sparse_output": ["boolean"], + } + + def __init__(self, *, neg_label=0, pos_label=1, sparse_output=False): + self.neg_label = neg_label + self.pos_label = pos_label + self.sparse_output = sparse_output + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, y): + """Fit label binarizer. + + Parameters + ---------- + y : ndarray of shape (n_samples,) or (n_samples, n_classes) + Target values. The 2-d matrix should only contain 0 and 1, + represents multilabel classification. + + Returns + ------- + self : object + Returns the instance itself. + """ + if self.neg_label >= self.pos_label: + raise ValueError( + f"neg_label={self.neg_label} must be strictly less than " + f"pos_label={self.pos_label}." + ) + + if self.sparse_output and (self.pos_label == 0 or self.neg_label != 0): + raise ValueError( + "Sparse binarization is only supported with non " + "zero pos_label and zero neg_label, got " + f"pos_label={self.pos_label} and neg_label={self.neg_label}" + ) + + self.y_type_ = type_of_target(y, input_name="y") + + if "multioutput" in self.y_type_: + raise ValueError( + "Multioutput target data is not supported with label binarization" + ) + if _num_samples(y) == 0: + raise ValueError("y has 0 samples: %r" % y) + + self.sparse_input_ = sp.issparse(y) + self.classes_ = unique_labels(y) + return self + + def fit_transform(self, y): + """Fit label binarizer/transform multi-class labels to binary labels. + + The output of transform is sometimes referred to as + the 1-of-K coding scheme. + + Parameters + ---------- + y : {ndarray, sparse matrix} of shape (n_samples,) or \ + (n_samples, n_classes) + Target values. The 2-d matrix should only contain 0 and 1, + represents multilabel classification. Sparse matrix can be + CSR, CSC, COO, DOK, or LIL. + + Returns + ------- + Y : {ndarray, sparse matrix} of shape (n_samples, n_classes) + Shape will be (n_samples, 1) for binary problems. Sparse matrix + will be of CSR format. + """ + return self.fit(y).transform(y) + + def transform(self, y): + """Transform multi-class labels to binary labels. + + The output of transform is sometimes referred to by some authors as + the 1-of-K coding scheme. + + Parameters + ---------- + y : {array, sparse matrix} of shape (n_samples,) or \ + (n_samples, n_classes) + Target values. The 2-d matrix should only contain 0 and 1, + represents multilabel classification. Sparse matrix can be + CSR, CSC, COO, DOK, or LIL. 
+ + Returns + ------- + Y : {ndarray, sparse matrix} of shape (n_samples, n_classes) + Shape will be (n_samples, 1) for binary problems. Sparse matrix + will be of CSR format. + """ + check_is_fitted(self) + + y_is_multilabel = type_of_target(y).startswith("multilabel") + if y_is_multilabel and not self.y_type_.startswith("multilabel"): + raise ValueError("The object was not fitted with multilabel input.") + + return label_binarize( + y, + classes=self.classes_, + pos_label=self.pos_label, + neg_label=self.neg_label, + sparse_output=self.sparse_output, + ) + + def inverse_transform(self, Y, threshold=None): + """Transform binary labels back to multi-class labels. + + Parameters + ---------- + Y : {ndarray, sparse matrix} of shape (n_samples, n_classes) + Target values. All sparse matrices are converted to CSR before + inverse transformation. + + threshold : float, default=None + Threshold used in the binary and multi-label cases. + + Use 0 when ``Y`` contains the output of :term:`decision_function` + (classifier). + Use 0.5 when ``Y`` contains the output of :term:`predict_proba`. + + If None, the threshold is assumed to be half way between + neg_label and pos_label. + + Returns + ------- + y_original : {ndarray, sparse matrix} of shape (n_samples,) + Target values. Sparse matrix will be of CSR format. + + Notes + ----- + In the case when the binary labels are fractional + (probabilistic), :meth:`inverse_transform` chooses the class with the + greatest value. Typically, this allows to use the output of a + linear model's :term:`decision_function` method directly as the input + of :meth:`inverse_transform`. + """ + check_is_fitted(self) + + if threshold is None: + threshold = (self.pos_label + self.neg_label) / 2.0 + + if self.y_type_ == "multiclass": + y_inv = _inverse_binarize_multiclass(Y, self.classes_) + else: + y_inv = _inverse_binarize_thresholding( + Y, self.y_type_, self.classes_, threshold + ) + + if self.sparse_input_: + y_inv = sp.csr_matrix(y_inv) + elif sp.issparse(y_inv): + y_inv = y_inv.toarray() + + return y_inv + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.two_d_array = False + tags.target_tags.one_d_labels = True + return tags + + +@validate_params( + { + "y": ["array-like", "sparse matrix"], + "classes": ["array-like"], + "neg_label": [Interval(Integral, None, None, closed="neither")], + "pos_label": [Interval(Integral, None, None, closed="neither")], + "sparse_output": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def label_binarize(y, *, classes, neg_label=0, pos_label=1, sparse_output=False): + """Binarize labels in a one-vs-all fashion. + + Several regression and binary classification algorithms are + available in scikit-learn. A simple way to extend these algorithms + to the multi-class classification case is to use the so-called + one-vs-all scheme. + + This function makes it possible to compute this transformation for a + fixed set of class labels known ahead of time. + + Parameters + ---------- + y : array-like or sparse matrix + Sequence of integer labels or multilabel data to encode. + + classes : array-like of shape (n_classes,) + Uniquely holds the label for each class. + + neg_label : int, default=0 + Value with which negative labels must be encoded. + + pos_label : int, default=1 + Value with which positive labels must be encoded. + + sparse_output : bool, default=False, + Set to true if output binary array is desired in CSR sparse format. 
+ + Returns + ------- + Y : {ndarray, sparse matrix} of shape (n_samples, n_classes) + Shape will be (n_samples, 1) for binary problems. Sparse matrix will + be of CSR format. + + See Also + -------- + LabelBinarizer : Class used to wrap the functionality of label_binarize and + allow for fitting to classes independently of the transform operation. + + Examples + -------- + >>> from sklearn.preprocessing import label_binarize + >>> label_binarize([1, 6], classes=[1, 2, 4, 6]) + array([[1, 0, 0, 0], + [0, 0, 0, 1]]) + + The class ordering is preserved: + + >>> label_binarize([1, 6], classes=[1, 6, 4, 2]) + array([[1, 0, 0, 0], + [0, 1, 0, 0]]) + + Binary targets transform to a column vector + + >>> label_binarize(['yes', 'no', 'no', 'yes'], classes=['no', 'yes']) + array([[1], + [0], + [0], + [1]]) + """ + if not isinstance(y, list): + # XXX Workaround that will be removed when list of list format is + # dropped + y = check_array( + y, input_name="y", accept_sparse="csr", ensure_2d=False, dtype=None + ) + else: + if _num_samples(y) == 0: + raise ValueError("y has 0 samples: %r" % y) + if neg_label >= pos_label: + raise ValueError( + "neg_label={0} must be strictly less than pos_label={1}.".format( + neg_label, pos_label + ) + ) + + if sparse_output and (pos_label == 0 or neg_label != 0): + raise ValueError( + "Sparse binarization is only supported with non " + "zero pos_label and zero neg_label, got " + "pos_label={0} and neg_label={1}" + "".format(pos_label, neg_label) + ) + + # To account for pos_label == 0 in the dense case + pos_switch = pos_label == 0 + if pos_switch: + pos_label = -neg_label + + y_type = type_of_target(y) + if "multioutput" in y_type: + raise ValueError( + "Multioutput target data is not supported with label binarization" + ) + if y_type == "unknown": + raise ValueError("The type of target data is not known") + + n_samples = y.shape[0] if sp.issparse(y) else len(y) + n_classes = len(classes) + classes = np.asarray(classes) + + if y_type == "binary": + if n_classes == 1: + if sparse_output: + return sp.csr_matrix((n_samples, 1), dtype=int) + else: + Y = np.zeros((len(y), 1), dtype=int) + Y += neg_label + return Y + elif len(classes) >= 3: + y_type = "multiclass" + + sorted_class = np.sort(classes) + if y_type == "multilabel-indicator": + y_n_classes = y.shape[1] if hasattr(y, "shape") else len(y[0]) + if classes.size != y_n_classes: + raise ValueError( + "classes {0} mismatch with the labels {1} found in the data".format( + classes, unique_labels(y) + ) + ) + + if y_type in ("binary", "multiclass"): + y = column_or_1d(y) + + # pick out the known labels from y + y_in_classes = np.isin(y, classes) + y_seen = y[y_in_classes] + indices = np.searchsorted(sorted_class, y_seen) + indptr = np.hstack((0, np.cumsum(y_in_classes))) + + data = np.empty_like(indices) + data.fill(pos_label) + Y = sp.csr_matrix((data, indices, indptr), shape=(n_samples, n_classes)) + elif y_type == "multilabel-indicator": + Y = sp.csr_matrix(y) + if pos_label != 1: + data = np.empty_like(Y.data) + data.fill(pos_label) + Y.data = data + else: + raise ValueError( + "%s target data is not supported with label binarization" % y_type + ) + + if not sparse_output: + Y = Y.toarray() + Y = Y.astype(int, copy=False) + + if neg_label != 0: + Y[Y == 0] = neg_label + + if pos_switch: + Y[Y == pos_label] = 0 + else: + Y.data = Y.data.astype(int, copy=False) + + # preserve label ordering + if np.any(classes != sorted_class): + indices = np.searchsorted(sorted_class, classes) + Y = Y[:, indices] + + if y_type == 
"binary": + if sparse_output: + Y = Y[:, [-1]] + else: + Y = Y[:, -1].reshape((-1, 1)) + + return Y + + +def _inverse_binarize_multiclass(y, classes): + """Inverse label binarization transformation for multiclass. + + Multiclass uses the maximal score instead of a threshold. + """ + classes = np.asarray(classes) + + if sp.issparse(y): + # Find the argmax for each row in y where y is a CSR matrix + + y = y.tocsr() + n_samples, n_outputs = y.shape + outputs = np.arange(n_outputs) + row_max = min_max_axis(y, 1)[1] + row_nnz = np.diff(y.indptr) + + y_data_repeated_max = np.repeat(row_max, row_nnz) + # picks out all indices obtaining the maximum per row + y_i_all_argmax = np.flatnonzero(y_data_repeated_max == y.data) + + # For corner case where last row has a max of 0 + if row_max[-1] == 0: + y_i_all_argmax = np.append(y_i_all_argmax, [len(y.data)]) + + # Gets the index of the first argmax in each row from y_i_all_argmax + index_first_argmax = np.searchsorted(y_i_all_argmax, y.indptr[:-1]) + # first argmax of each row + y_ind_ext = np.append(y.indices, [0]) + y_i_argmax = y_ind_ext[y_i_all_argmax[index_first_argmax]] + # Handle rows of all 0 + y_i_argmax[np.where(row_nnz == 0)[0]] = 0 + + # Handles rows with max of 0 that contain negative numbers + samples = np.arange(n_samples)[(row_nnz > 0) & (row_max.ravel() == 0)] + for i in samples: + ind = y.indices[y.indptr[i] : y.indptr[i + 1]] + y_i_argmax[i] = classes[np.setdiff1d(outputs, ind)][0] + + return classes[y_i_argmax] + else: + return classes.take(y.argmax(axis=1), mode="clip") + + +def _inverse_binarize_thresholding(y, output_type, classes, threshold): + """Inverse label binarization transformation using thresholding.""" + + if output_type == "binary" and y.ndim == 2 and y.shape[1] > 2: + raise ValueError("output_type='binary', but y.shape = {0}".format(y.shape)) + + if output_type != "binary" and y.shape[1] != len(classes): + raise ValueError( + "The number of class is not equal to the number of dimension of y." + ) + + classes = np.asarray(classes) + + # Perform thresholding + if sp.issparse(y): + if threshold > 0: + if y.format not in ("csr", "csc"): + y = y.tocsr() + y.data = np.array(y.data > threshold, dtype=int) + y.eliminate_zeros() + else: + y = np.array(y.toarray() > threshold, dtype=int) + else: + y = np.array(y > threshold, dtype=int) + + # Inverse transform data + if output_type == "binary": + if sp.issparse(y): + y = y.toarray() + if y.ndim == 2 and y.shape[1] == 2: + return classes[y[:, 1]] + else: + if len(classes) == 1: + return np.repeat(classes[0], len(y)) + else: + return classes[y.ravel()] + + elif output_type == "multilabel-indicator": + return y + + else: + raise ValueError("{0} format is not supported".format(output_type)) + + +class MultiLabelBinarizer(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None): + """Transform between iterable of iterables and a multilabel format. + + Although a list of sets or tuples is a very intuitive format for multilabel + data, it is unwieldy to process. This transformer converts between this + intuitive format and the supported multilabel format: a (samples x classes) + binary matrix indicating the presence of a class label. + + Parameters + ---------- + classes : array-like of shape (n_classes,), default=None + Indicates an ordering for the class labels. + All entries should be unique (cannot contain duplicate classes). + + sparse_output : bool, default=False + Set to True if output binary array is desired in CSR sparse format. 
+ + Attributes + ---------- + classes_ : ndarray of shape (n_classes,) + A copy of the `classes` parameter when provided. + Otherwise it corresponds to the sorted set of classes found + when fitting. + + See Also + -------- + OneHotEncoder : Encode categorical features using a one-hot aka one-of-K + scheme. + + Examples + -------- + >>> from sklearn.preprocessing import MultiLabelBinarizer + >>> mlb = MultiLabelBinarizer() + >>> mlb.fit_transform([(1, 2), (3,)]) + array([[1, 1, 0], + [0, 0, 1]]) + >>> mlb.classes_ + array([1, 2, 3]) + + >>> mlb.fit_transform([{'sci-fi', 'thriller'}, {'comedy'}]) + array([[0, 1, 1], + [1, 0, 0]]) + >>> list(mlb.classes_) + ['comedy', 'sci-fi', 'thriller'] + + A common mistake is to pass in a list, which leads to the following issue: + + >>> mlb = MultiLabelBinarizer() + >>> mlb.fit(['sci-fi', 'thriller', 'comedy']) + MultiLabelBinarizer() + >>> mlb.classes_ + array(['-', 'c', 'd', 'e', 'f', 'h', 'i', 'l', 'm', 'o', 'r', 's', 't', + 'y'], dtype=object) + + To correct this, the list of labels should be passed in as: + + >>> mlb = MultiLabelBinarizer() + >>> mlb.fit([['sci-fi', 'thriller', 'comedy']]) + MultiLabelBinarizer() + >>> mlb.classes_ + array(['comedy', 'sci-fi', 'thriller'], dtype=object) + """ + + _parameter_constraints: dict = { + "classes": ["array-like", None], + "sparse_output": ["boolean"], + } + + def __init__(self, *, classes=None, sparse_output=False): + self.classes = classes + self.sparse_output = sparse_output + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, y): + """Fit the label sets binarizer, storing :term:`classes_`. + + Parameters + ---------- + y : iterable of iterables + A set of labels (any orderable and hashable object) for each + sample. If the `classes` parameter is set, `y` will not be + iterated. + + Returns + ------- + self : object + Fitted estimator. + """ + self._cached_dict = None + + if self.classes is None: + classes = sorted(set(itertools.chain.from_iterable(y))) + elif len(set(self.classes)) < len(self.classes): + raise ValueError( + "The classes argument contains duplicate " + "classes. Remove these duplicates before passing " + "them to MultiLabelBinarizer." + ) + else: + classes = self.classes + dtype = int if all(isinstance(c, int) for c in classes) else object + self.classes_ = np.empty(len(classes), dtype=dtype) + self.classes_[:] = classes + return self + + @_fit_context(prefer_skip_nested_validation=True) + def fit_transform(self, y): + """Fit the label sets binarizer and transform the given label sets. + + Parameters + ---------- + y : iterable of iterables + A set of labels (any orderable and hashable object) for each + sample. If the `classes` parameter is set, `y` will not be + iterated. + + Returns + ------- + y_indicator : {ndarray, sparse matrix} of shape (n_samples, n_classes) + A matrix such that `y_indicator[i, j] = 1` iff `classes_[j]` + is in `y[i]`, and 0 otherwise. Sparse matrix will be of CSR + format. 
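+
+ A minimal example, letting the classes be inferred from `y` itself:
+
+ >>> from sklearn.preprocessing import MultiLabelBinarizer
+ >>> MultiLabelBinarizer().fit_transform([{"a", "b"}, {"a"}])
+ array([[1, 1],
+ [1, 0]])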
+ """ + if self.classes is not None: + return self.fit(y).transform(y) + + self._cached_dict = None + + # Automatically increment on new class + class_mapping = defaultdict(int) + class_mapping.default_factory = class_mapping.__len__ + yt = self._transform(y, class_mapping) + + # sort classes and reorder columns + tmp = sorted(class_mapping, key=class_mapping.get) + + # (make safe for tuples) + dtype = int if all(isinstance(c, int) for c in tmp) else object + class_mapping = np.empty(len(tmp), dtype=dtype) + class_mapping[:] = tmp + self.classes_, inverse = np.unique(class_mapping, return_inverse=True) + # ensure yt.indices keeps its current dtype + yt.indices = np.asarray(inverse[yt.indices], dtype=yt.indices.dtype) + + if not self.sparse_output: + yt = yt.toarray() + + return yt + + def transform(self, y): + """Transform the given label sets. + + Parameters + ---------- + y : iterable of iterables + A set of labels (any orderable and hashable object) for each + sample. If the `classes` parameter is set, `y` will not be + iterated. + + Returns + ------- + y_indicator : array or CSR matrix, shape (n_samples, n_classes) + A matrix such that `y_indicator[i, j] = 1` iff `classes_[j]` is in + `y[i]`, and 0 otherwise. + """ + check_is_fitted(self) + + class_to_index = self._build_cache() + yt = self._transform(y, class_to_index) + + if not self.sparse_output: + yt = yt.toarray() + + return yt + + def _build_cache(self): + if self._cached_dict is None: + self._cached_dict = dict(zip(self.classes_, range(len(self.classes_)))) + + return self._cached_dict + + def _transform(self, y, class_mapping): + """Transforms the label sets with a given mapping. + + Parameters + ---------- + y : iterable of iterables + A set of labels (any orderable and hashable object) for each + sample. If the `classes` parameter is set, `y` will not be + iterated. + + class_mapping : Mapping + Maps from label to column index in label indicator matrix. + + Returns + ------- + y_indicator : sparse matrix of shape (n_samples, n_classes) + Label indicator matrix. Will be of CSR format. + """ + indices = array.array("i") + indptr = array.array("i", [0]) + unknown = set() + for labels in y: + index = set() + for label in labels: + try: + index.add(class_mapping[label]) + except KeyError: + unknown.add(label) + indices.extend(index) + indptr.append(len(indices)) + if unknown: + warnings.warn( + "unknown class(es) {0} will be ignored".format(sorted(unknown, key=str)) + ) + data = np.ones(len(indices), dtype=int) + + return sp.csr_matrix( + (data, indices, indptr), shape=(len(indptr) - 1, len(class_mapping)) + ) + + def inverse_transform(self, yt): + """Transform the given indicator matrix into label sets. + + Parameters + ---------- + yt : {ndarray, sparse matrix} of shape (n_samples, n_classes) + A matrix containing only 1s ands 0s. + + Returns + ------- + y_original : list of tuples + The set of labels for each sample such that `y[i]` consists of + `classes_[j]` for each `yt[i, j] == 1`. 
+ """ + check_is_fitted(self) + + if yt.shape[1] != len(self.classes_): + raise ValueError( + "Expected indicator for {0} classes, but got {1}".format( + len(self.classes_), yt.shape[1] + ) + ) + + if sp.issparse(yt): + yt = yt.tocsr() + if len(yt.data) != 0 and len(np.setdiff1d(yt.data, [0, 1])) > 0: + raise ValueError("Expected only 0s and 1s in label indicator.") + return [ + tuple(self.classes_.take(yt.indices[start:end])) + for start, end in zip(yt.indptr[:-1], yt.indptr[1:]) + ] + else: + unexpected = np.setdiff1d(yt, [0, 1]) + if len(unexpected) > 0: + raise ValueError( + "Expected only 0s and 1s in label indicator. Also got {0}".format( + unexpected + ) + ) + return [tuple(self.classes_.compress(indicators)) for indicators in yt] + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.two_d_array = False + tags.target_tags.two_d_labels = True + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_polynomial.py b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_polynomial.py new file mode 100644 index 0000000000000000000000000000000000000000..69bfe7b212bba6b3bfaaa021eed9d26b21b8fd68 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_polynomial.py @@ -0,0 +1,1153 @@ +""" +This file contains preprocessing tools based on polynomials. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import collections +from itertools import chain, combinations +from itertools import combinations_with_replacement as combinations_w_r +from numbers import Integral + +import numpy as np +from scipy import sparse +from scipy.interpolate import BSpline +from scipy.special import comb + +from ..base import BaseEstimator, TransformerMixin, _fit_context +from ..utils import check_array +from ..utils._param_validation import Interval, StrOptions +from ..utils.fixes import parse_version, sp_version +from ..utils.stats import _weighted_percentile +from ..utils.validation import ( + FLOAT_DTYPES, + _check_feature_names_in, + _check_sample_weight, + check_is_fitted, + validate_data, +) +from ._csr_polynomial_expansion import ( + _calc_expanded_nnz, + _calc_total_nnz, + _csr_polynomial_expansion, +) + +__all__ = [ + "PolynomialFeatures", + "SplineTransformer", +] + + +def _create_expansion(X, interaction_only, deg, n_features, cumulative_size=0): + """Helper function for creating and appending sparse expansion matrices""" + + total_nnz = _calc_total_nnz(X.indptr, interaction_only, deg) + expanded_col = _calc_expanded_nnz(n_features, interaction_only, deg) + + if expanded_col == 0: + return None + # This only checks whether each block needs 64bit integers upon + # expansion. We prefer to keep int32 indexing where we can, + # since currently SciPy's CSR construction downcasts when possible, + # so we prefer to avoid an unnecessary cast. The dtype may still + # change in the concatenation process if needed. + # See: https://github.com/scipy/scipy/issues/16569 + max_indices = expanded_col - 1 + max_indptr = total_nnz + max_int32 = np.iinfo(np.int32).max + needs_int64 = max(max_indices, max_indptr) > max_int32 + index_dtype = np.int64 if needs_int64 else np.int32 + + # Result of the expansion, modified in place by the + # `_csr_polynomial_expansion` routine. 
+ expanded_data = np.empty(shape=total_nnz, dtype=X.data.dtype) + expanded_indices = np.empty(shape=total_nnz, dtype=index_dtype) + expanded_indptr = np.empty(shape=X.indptr.shape[0], dtype=index_dtype) + _csr_polynomial_expansion( + X.data, + X.indices, + X.indptr, + X.shape[1], + expanded_data, + expanded_indices, + expanded_indptr, + interaction_only, + deg, + ) + return sparse.csr_matrix( + (expanded_data, expanded_indices, expanded_indptr), + shape=(X.indptr.shape[0] - 1, expanded_col), + dtype=X.dtype, + ) + + +class PolynomialFeatures(TransformerMixin, BaseEstimator): + """Generate polynomial and interaction features. + + Generate a new feature matrix consisting of all polynomial combinations + of the features with degree less than or equal to the specified degree. + For example, if an input sample is two dimensional and of the form + [a, b], the degree-2 polynomial features are [1, a, b, a^2, ab, b^2]. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + degree : int or tuple (min_degree, max_degree), default=2 + If a single int is given, it specifies the maximal degree of the + polynomial features. If a tuple `(min_degree, max_degree)` is passed, + then `min_degree` is the minimum and `max_degree` is the maximum + polynomial degree of the generated features. Note that `min_degree=0` + and `min_degree=1` are equivalent as outputting the degree zero term is + determined by `include_bias`. + + interaction_only : bool, default=False + If `True`, only interaction features are produced: features that are + products of at most `degree` *distinct* input features, i.e. terms with + power of 2 or higher of the same input feature are excluded: + + - included: `x[0]`, `x[1]`, `x[0] * x[1]`, etc. + - excluded: `x[0] ** 2`, `x[0] ** 2 * x[1]`, etc. + + include_bias : bool, default=True + If `True` (default), then include a bias column, the feature in which + all polynomial powers are zero (i.e. a column of ones - acts as an + intercept term in a linear model). + + order : {'C', 'F'}, default='C' + Order of output array in the dense case. `'F'` order is faster to + compute, but may slow down subsequent estimators. + + .. versionadded:: 0.21 + + Attributes + ---------- + powers_ : ndarray of shape (`n_output_features_`, `n_features_in_`) + `powers_[i, j]` is the exponent of the jth input in the ith output. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_output_features_ : int + The total number of polynomial output features. The number of output + features is computed by iterating over all suitably sized combinations + of input features. + + See Also + -------- + SplineTransformer : Transformer that generates univariate B-spline bases + for features. + + Notes + ----- + Be aware that the number of features in the output array scales + polynomially in the number of features of the input array, and + exponentially in the degree. High degrees can cause overfitting. 
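+
+ With `include_bias=True` and `interaction_only=False`, the number of output
+ columns for `n_features` inputs and `degree=d` is `comb(n_features + d, d)`;
+ for instance:
+
+ >>> from scipy.special import comb
+ >>> int(comb(10 + 3, 3)) # 10 features at degree 3 -> 286 output features
+ 286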
+ + See :ref:`examples/linear_model/plot_polynomial_interpolation.py + ` + + Examples + -------- + >>> import numpy as np + >>> from sklearn.preprocessing import PolynomialFeatures + >>> X = np.arange(6).reshape(3, 2) + >>> X + array([[0, 1], + [2, 3], + [4, 5]]) + >>> poly = PolynomialFeatures(2) + >>> poly.fit_transform(X) + array([[ 1., 0., 1., 0., 0., 1.], + [ 1., 2., 3., 4., 6., 9.], + [ 1., 4., 5., 16., 20., 25.]]) + >>> poly = PolynomialFeatures(interaction_only=True) + >>> poly.fit_transform(X) + array([[ 1., 0., 1., 0.], + [ 1., 2., 3., 6.], + [ 1., 4., 5., 20.]]) + """ + + _parameter_constraints: dict = { + "degree": [Interval(Integral, 0, None, closed="left"), "array-like"], + "interaction_only": ["boolean"], + "include_bias": ["boolean"], + "order": [StrOptions({"C", "F"})], + } + + def __init__( + self, degree=2, *, interaction_only=False, include_bias=True, order="C" + ): + self.degree = degree + self.interaction_only = interaction_only + self.include_bias = include_bias + self.order = order + + @staticmethod + def _combinations( + n_features, min_degree, max_degree, interaction_only, include_bias + ): + comb = combinations if interaction_only else combinations_w_r + start = max(1, min_degree) + iter = chain.from_iterable( + comb(range(n_features), i) for i in range(start, max_degree + 1) + ) + if include_bias: + iter = chain(comb(range(n_features), 0), iter) + return iter + + @staticmethod + def _num_combinations( + n_features, min_degree, max_degree, interaction_only, include_bias + ): + """Calculate number of terms in polynomial expansion + + This should be equivalent to counting the number of terms returned by + _combinations(...) but much faster. + """ + + if interaction_only: + combinations = sum( + [ + comb(n_features, i, exact=True) + for i in range(max(1, min_degree), min(max_degree, n_features) + 1) + ] + ) + else: + combinations = comb(n_features + max_degree, max_degree, exact=True) - 1 + if min_degree > 0: + d = min_degree - 1 + combinations -= comb(n_features + d, d, exact=True) - 1 + + if include_bias: + combinations += 1 + + return combinations + + @property + def powers_(self): + """Exponent for each of the inputs in the output.""" + check_is_fitted(self) + + combinations = self._combinations( + n_features=self.n_features_in_, + min_degree=self._min_degree, + max_degree=self._max_degree, + interaction_only=self.interaction_only, + include_bias=self.include_bias, + ) + return np.vstack( + [np.bincount(c, minlength=self.n_features_in_) for c in combinations] + ) + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. + + - If `input_features is None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then the following input feature names are generated: + `["x0", "x1", ..., "x(n_features_in_ - 1)"]`. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. 
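+
+ For example, with two input features named "a" and "b":
+
+ >>> from sklearn.preprocessing import PolynomialFeatures
+ >>> poly = PolynomialFeatures(degree=2, include_bias=False).fit([[0, 1]])
+ >>> poly.get_feature_names_out(["a", "b"])
+ array(['a', 'b', 'a^2', 'a b', 'b^2'], dtype=object)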
+ """ + powers = self.powers_ + input_features = _check_feature_names_in(self, input_features) + feature_names = [] + for row in powers: + inds = np.where(row)[0] + if len(inds): + name = " ".join( + ( + "%s^%d" % (input_features[ind], exp) + if exp != 1 + else input_features[ind] + ) + for ind, exp in zip(inds, row[inds]) + ) + else: + name = "1" + feature_names.append(name) + return np.asarray(feature_names, dtype=object) + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """ + Compute number of output features. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + self : object + Fitted transformer. + """ + _, n_features = validate_data(self, X, accept_sparse=True).shape + + if isinstance(self.degree, Integral): + if self.degree == 0 and not self.include_bias: + raise ValueError( + "Setting degree to zero and include_bias to False would result in" + " an empty output array." + ) + + self._min_degree = 0 + self._max_degree = self.degree + elif ( + isinstance(self.degree, collections.abc.Iterable) and len(self.degree) == 2 + ): + self._min_degree, self._max_degree = self.degree + if not ( + isinstance(self._min_degree, Integral) + and isinstance(self._max_degree, Integral) + and self._min_degree >= 0 + and self._min_degree <= self._max_degree + ): + raise ValueError( + "degree=(min_degree, max_degree) must " + "be non-negative integers that fulfil " + "min_degree <= max_degree, got " + f"{self.degree}." + ) + elif self._max_degree == 0 and not self.include_bias: + raise ValueError( + "Setting both min_degree and max_degree to zero and include_bias to" + " False would result in an empty output array." + ) + else: + raise ValueError( + "degree must be a non-negative int or tuple " + "(min_degree, max_degree), got " + f"{self.degree}." + ) + + self.n_output_features_ = self._num_combinations( + n_features=n_features, + min_degree=self._min_degree, + max_degree=self._max_degree, + interaction_only=self.interaction_only, + include_bias=self.include_bias, + ) + if self.n_output_features_ > np.iinfo(np.intp).max: + msg = ( + "The output that would result from the current configuration would" + f" have {self.n_output_features_} features which is too large to be" + f" indexed by {np.intp().dtype.name}. Please change some or all of the" + " following:\n- The number of features in the input, currently" + f" {n_features=}\n- The range of degrees to calculate, currently" + f" [{self._min_degree}, {self._max_degree}]\n- Whether to include only" + f" interaction terms, currently {self.interaction_only}\n- Whether to" + f" include a bias term, currently {self.include_bias}." + ) + if ( + np.intp == np.int32 + and self.n_output_features_ <= np.iinfo(np.int64).max + ): # pragma: nocover + msg += ( + "\nNote that the current Python runtime has a limited 32 bit " + "address space and that this configuration would have been " + "admissible if run on a 64 bit Python runtime." + ) + raise ValueError(msg) + # We also record the number of output features for + # _min_degree = 0 + self._n_out_full = self._num_combinations( + n_features=n_features, + min_degree=0, + max_degree=self._max_degree, + interaction_only=self.interaction_only, + include_bias=self.include_bias, + ) + + return self + + def transform(self, X): + """Transform data to polynomial features. 
+ + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to transform, row by row. + + Prefer CSR over CSC for sparse input (for speed), but CSC is + required if the degree is 4 or higher. If the degree is less than + 4 and the input format is CSC, it will be converted to CSR, have + its polynomial features generated, then converted back to CSC. + + If the degree is 2 or 3, the method described in "Leveraging + Sparsity to Speed Up Polynomial Feature Expansions of CSR Matrices + Using K-Simplex Numbers" by Andrew Nystrom and John Hughes is + used, which is much faster than the method used on CSC input. For + this reason, a CSC input will be converted to CSR, and the output + will be converted back to CSC prior to being returned, hence the + preference of CSR. + + Returns + ------- + XP : {ndarray, sparse matrix} of shape (n_samples, NP) + The matrix of features, where `NP` is the number of polynomial + features generated from the combination of inputs. If a sparse + matrix is provided, it will be converted into a sparse + `csr_matrix`. + """ + check_is_fitted(self) + + X = validate_data( + self, + X, + order="F", + dtype=FLOAT_DTYPES, + reset=False, + accept_sparse=("csr", "csc"), + ) + + n_samples, n_features = X.shape + max_int32 = np.iinfo(np.int32).max + if sparse.issparse(X) and X.format == "csr": + if self._max_degree > 3: + return self.transform(X.tocsc()).tocsr() + to_stack = [] + if self.include_bias: + to_stack.append( + sparse.csr_matrix(np.ones(shape=(n_samples, 1), dtype=X.dtype)) + ) + if self._min_degree <= 1 and self._max_degree > 0: + to_stack.append(X) + + cumulative_size = sum(mat.shape[1] for mat in to_stack) + for deg in range(max(2, self._min_degree), self._max_degree + 1): + expanded = _create_expansion( + X=X, + interaction_only=self.interaction_only, + deg=deg, + n_features=n_features, + cumulative_size=cumulative_size, + ) + if expanded is not None: + to_stack.append(expanded) + cumulative_size += expanded.shape[1] + if len(to_stack) == 0: + # edge case: deal with empty matrix + XP = sparse.csr_matrix((n_samples, 0), dtype=X.dtype) + else: + # `scipy.sparse.hstack` breaks in scipy<1.9.2 + # when `n_output_features_ > max_int32` + all_int32 = all(mat.indices.dtype == np.int32 for mat in to_stack) + if ( + sp_version < parse_version("1.9.2") + and self.n_output_features_ > max_int32 + and all_int32 + ): + raise ValueError( # pragma: no cover + "In scipy versions `<1.9.2`, the function `scipy.sparse.hstack`" + " produces negative columns when:\n1. The output shape contains" + " `n_cols` too large to be represented by a 32bit signed" + " integer.\n2. 
All sub-matrices to be stacked have indices of" + " dtype `np.int32`.\nTo avoid this error, either use a version" + " of scipy `>=1.9.2` or alter the `PolynomialFeatures`" + " transformer to produce fewer than 2^31 output features" + ) + XP = sparse.hstack(to_stack, dtype=X.dtype, format="csr") + elif sparse.issparse(X) and X.format == "csc" and self._max_degree < 4: + return self.transform(X.tocsr()).tocsc() + elif sparse.issparse(X): + combinations = self._combinations( + n_features=n_features, + min_degree=self._min_degree, + max_degree=self._max_degree, + interaction_only=self.interaction_only, + include_bias=self.include_bias, + ) + columns = [] + for combi in combinations: + if combi: + out_col = 1 + for col_idx in combi: + out_col = X[:, [col_idx]].multiply(out_col) + columns.append(out_col) + else: + bias = sparse.csc_matrix(np.ones((X.shape[0], 1))) + columns.append(bias) + XP = sparse.hstack(columns, dtype=X.dtype).tocsc() + else: + # Do as if _min_degree = 0 and cut down array after the + # computation, i.e. use _n_out_full instead of n_output_features_. + XP = np.empty( + shape=(n_samples, self._n_out_full), dtype=X.dtype, order=self.order + ) + + # What follows is a faster implementation of: + # for i, comb in enumerate(combinations): + # XP[:, i] = X[:, comb].prod(1) + # This implementation uses two optimisations. + # First one is broadcasting, + # multiply ([X1, ..., Xn], X1) -> [X1 X1, ..., Xn X1] + # multiply ([X2, ..., Xn], X2) -> [X2 X2, ..., Xn X2] + # ... + # multiply ([X[:, start:end], X[:, start]) -> ... + # Second optimisation happens for degrees >= 3. + # Xi^3 is computed reusing previous computation: + # Xi^3 = Xi^2 * Xi. + + # degree 0 term + if self.include_bias: + XP[:, 0] = 1 + current_col = 1 + else: + current_col = 0 + + if self._max_degree == 0: + return XP + + # degree 1 term + XP[:, current_col : current_col + n_features] = X + index = list(range(current_col, current_col + n_features)) + current_col += n_features + index.append(current_col) + + # loop over degree >= 2 terms + for _ in range(2, self._max_degree + 1): + new_index = [] + end = index[-1] + for feature_idx in range(n_features): + start = index[feature_idx] + new_index.append(current_col) + if self.interaction_only: + start += index[feature_idx + 1] - index[feature_idx] + next_col = current_col + end - start + if next_col <= current_col: + break + # XP[:, start:end] are terms of degree d - 1 + # that exclude feature #feature_idx. + np.multiply( + XP[:, start:end], + X[:, feature_idx : feature_idx + 1], + out=XP[:, current_col:next_col], + casting="no", + ) + current_col = next_col + + new_index.append(current_col) + index = new_index + + if self._min_degree > 1: + n_XP, n_Xout = self._n_out_full, self.n_output_features_ + if self.include_bias: + Xout = np.empty( + shape=(n_samples, n_Xout), dtype=XP.dtype, order=self.order + ) + Xout[:, 0] = 1 + Xout[:, 1:] = XP[:, n_XP - n_Xout + 1 :] + else: + Xout = XP[:, n_XP - n_Xout :].copy() + XP = Xout + return XP + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + + +class SplineTransformer(TransformerMixin, BaseEstimator): + """Generate univariate B-spline bases for features. + + Generate a new feature matrix consisting of + `n_splines=n_knots + degree - 1` (`n_knots - 1` for + `extrapolation="periodic"`) spline basis functions + (B-splines) of polynomial order=`degree` for each feature. 
+ + In order to learn more about the SplineTransformer class go to: + :ref:`sphx_glr_auto_examples_applications_plot_cyclical_feature_engineering.py` + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.0 + + Parameters + ---------- + n_knots : int, default=5 + Number of knots of the splines if `knots` equals one of + {'uniform', 'quantile'}. Must be larger or equal to 2. Ignored if `knots` + is array-like. + + degree : int, default=3 + The polynomial degree of the spline basis. Must be a non-negative + integer. + + knots : {'uniform', 'quantile'} or array-like of shape \ + (n_knots, n_features), default='uniform' + Set knot positions such that first knot <= features <= last knot. + + - If 'uniform', `n_knots` number of knots are distributed uniformly + from min to max values of the features. + - If 'quantile', they are distributed uniformly along the quantiles of + the features. + - If an array-like is given, it directly specifies the sorted knot + positions including the boundary knots. Note that, internally, + `degree` number of knots are added before the first knot, the same + after the last knot. + + extrapolation : {'error', 'constant', 'linear', 'continue', 'periodic'}, \ + default='constant' + If 'error', values outside the min and max values of the training + features raise a `ValueError`. If 'constant', the value of the + splines at minimum and maximum value of the features is used as + constant extrapolation. If 'linear', a linear extrapolation is used. + If 'continue', the splines are extrapolated as is, i.e. option + `extrapolate=True` in :class:`scipy.interpolate.BSpline`. If + 'periodic', periodic splines with a periodicity equal to the distance + between the first and last knot are used. Periodic splines enforce + equal function values and derivatives at the first and last knot. + For example, this makes it possible to avoid introducing an arbitrary + jump between Dec 31st and Jan 1st in spline features derived from a + naturally periodic "day-of-year" input feature. In this case it is + recommended to manually set the knot values to control the period. + + include_bias : bool, default=True + If False, then the last spline element inside the data range + of a feature is dropped. As B-splines sum to one over the spline basis + functions for each data point, they implicitly include a bias term, + i.e. a column of ones. It acts as an intercept term in a linear model. + + order : {'C', 'F'}, default='C' + Order of output array in the dense case. `'F'` order is faster to compute, but + may slow down subsequent estimators. + + sparse_output : bool, default=False + Will return a sparse CSR matrix if set True, else will return a dense array. + + .. versionadded:: 1.2 + + Attributes + ---------- + bsplines_ : list of shape (n_features,) + List of BSplines objects, one for each feature. + + n_features_in_ : int + The total number of input features. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_features_out_ : int + The total number of output features, which is computed as + `n_features * n_splines`, where `n_splines` is + the number of bases elements of the B-splines, + `n_knots + degree - 1` for non-periodic splines and + `n_knots - 1` for periodic ones. + If `include_bias=False`, then it is only + `n_features * (n_splines - 1)`. + + See Also + -------- + KBinsDiscretizer : Transformer that bins continuous data into intervals.
+ + PolynomialFeatures : Transformer that generates polynomial and interaction + features. + + Notes + ----- + High degrees and a high number of knots can cause overfitting. + + See :ref:`examples/linear_model/plot_polynomial_interpolation.py + `. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.preprocessing import SplineTransformer + >>> X = np.arange(6).reshape(6, 1) + >>> spline = SplineTransformer(degree=2, n_knots=3) + >>> spline.fit_transform(X) + array([[0.5 , 0.5 , 0. , 0. ], + [0.18, 0.74, 0.08, 0. ], + [0.02, 0.66, 0.32, 0. ], + [0. , 0.32, 0.66, 0.02], + [0. , 0.08, 0.74, 0.18], + [0. , 0. , 0.5 , 0.5 ]]) + """ + + _parameter_constraints: dict = { + "n_knots": [Interval(Integral, 2, None, closed="left")], + "degree": [Interval(Integral, 0, None, closed="left")], + "knots": [StrOptions({"uniform", "quantile"}), "array-like"], + "extrapolation": [ + StrOptions({"error", "constant", "linear", "continue", "periodic"}) + ], + "include_bias": ["boolean"], + "order": [StrOptions({"C", "F"})], + "sparse_output": ["boolean"], + } + + def __init__( + self, + n_knots=5, + degree=3, + *, + knots="uniform", + extrapolation="constant", + include_bias=True, + order="C", + sparse_output=False, + ): + self.n_knots = n_knots + self.degree = degree + self.knots = knots + self.extrapolation = extrapolation + self.include_bias = include_bias + self.order = order + self.sparse_output = sparse_output + + @staticmethod + def _get_base_knot_positions(X, n_knots=10, knots="uniform", sample_weight=None): + """Calculate base knot positions. + + Base knots such that first knot <= feature <= last knot. For the + B-spline construction with scipy.interpolate.BSpline, 2*degree knots + beyond the base interval are added. + + Returns + ------- + knots : ndarray of shape (n_knots, n_features), dtype=np.float64 + Knot positions (points) of base interval. + """ + if knots == "quantile": + percentile_ranks = 100 * np.linspace( + start=0, stop=1, num=n_knots, dtype=np.float64 + ) + + if sample_weight is None: + knots = np.percentile(X, percentile_ranks, axis=0) + else: + knots = np.array( + [ + _weighted_percentile(X, sample_weight, percentile_rank) + for percentile_rank in percentile_ranks + ] + ) + + else: + # knots == 'uniform': + # Note that the variable `knots` has already been validated and + # `else` is therefore safe. + # Disregard observations with zero weight. + mask = slice(None, None, 1) if sample_weight is None else sample_weight > 0 + x_min = np.amin(X[mask], axis=0) + x_max = np.amax(X[mask], axis=0) + + knots = np.linspace( + start=x_min, + stop=x_max, + num=n_knots, + endpoint=True, + dtype=np.float64, + ) + + return knots + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. + + - If `input_features` is `None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then the following input feature names are generated: + `["x0", "x1", ..., "x(n_features_in_ - 1)"]`. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. 
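+
+ Examples
+ --------
+ For instance, a single input feature expanded with `n_knots=3` and
+ `degree=2` gives `n_splines = n_knots + degree - 1 = 4` basis columns, so
+ the generated names should look as follows:
+
+ >>> import numpy as np
+ >>> from sklearn.preprocessing import SplineTransformer
+ >>> spline = SplineTransformer(degree=2, n_knots=3).fit(np.arange(6).reshape(6, 1))
+ >>> spline.get_feature_names_out()
+ array(['x0_sp_0', 'x0_sp_1', 'x0_sp_2', 'x0_sp_3'], dtype=object)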
+ """ + check_is_fitted(self, "n_features_in_") + n_splines = self.bsplines_[0].c.shape[1] + + input_features = _check_feature_names_in(self, input_features) + feature_names = [] + for i in range(self.n_features_in_): + for j in range(n_splines - 1 + self.include_bias): + feature_names.append(f"{input_features[i]}_sp_{j}") + return np.asarray(feature_names, dtype=object) + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None, sample_weight=None): + """Compute knot positions of splines. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data. + + y : None + Ignored. + + sample_weight : array-like of shape (n_samples,), default = None + Individual weights for each sample. Used to calculate quantiles if + `knots="quantile"`. For `knots="uniform"`, zero weighted + observations are ignored for finding the min and max of `X`. + + Returns + ------- + self : object + Fitted transformer. + """ + X = validate_data( + self, + X, + reset=True, + accept_sparse=False, + ensure_min_samples=2, + ensure_2d=True, + ) + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + + _, n_features = X.shape + + if isinstance(self.knots, str): + base_knots = self._get_base_knot_positions( + X, n_knots=self.n_knots, knots=self.knots, sample_weight=sample_weight + ) + else: + base_knots = check_array(self.knots, dtype=np.float64) + if base_knots.shape[0] < 2: + raise ValueError("Number of knots, knots.shape[0], must be >= 2.") + elif base_knots.shape[1] != n_features: + raise ValueError("knots.shape[1] == n_features is violated.") + elif not np.all(np.diff(base_knots, axis=0) > 0): + raise ValueError("knots must be sorted without duplicates.") + + # number of knots for base interval + n_knots = base_knots.shape[0] + + if self.extrapolation == "periodic" and n_knots <= self.degree: + raise ValueError( + "Periodic splines require degree < n_knots. Got n_knots=" + f"{n_knots} and degree={self.degree}." + ) + + # number of splines basis functions + if self.extrapolation != "periodic": + n_splines = n_knots + self.degree - 1 + else: + # periodic splines have self.degree less degrees of freedom + n_splines = n_knots - 1 + + degree = self.degree + n_out = n_features * n_splines + # We have to add degree number of knots below, and degree number knots + # above the base knots in order to make the spline basis complete. + if self.extrapolation == "periodic": + # For periodic splines the spacing of the first / last degree knots + # needs to be a continuation of the spacing of the last / first + # base knots. + period = base_knots[-1] - base_knots[0] + knots = np.r_[ + base_knots[-(degree + 1) : -1] - period, + base_knots, + base_knots[1 : (degree + 1)] + period, + ] + + else: + # Eilers & Marx in "Flexible smoothing with B-splines and + # penalties" https://doi.org/10.1214/ss/1038425655 advice + # against repeating first and last knot several times, which + # would have inferior behaviour at boundaries if combined with + # a penalty (hence P-Spline). We follow this advice even if our + # splines are unpenalized. Meaning we do not: + # knots = np.r_[ + # np.tile(base_knots.min(axis=0), reps=[degree, 1]), + # base_knots, + # np.tile(base_knots.max(axis=0), reps=[degree, 1]) + # ] + # Instead, we reuse the distance of the 2 fist/last knots. 
+ dist_min = base_knots[1] - base_knots[0] + dist_max = base_knots[-1] - base_knots[-2] + + knots = np.r_[ + np.linspace( + base_knots[0] - degree * dist_min, + base_knots[0] - dist_min, + num=degree, + ), + base_knots, + np.linspace( + base_knots[-1] + dist_max, + base_knots[-1] + degree * dist_max, + num=degree, + ), + ] + + # With a diagonal coefficient matrix, we get back the spline basis + # elements, i.e. the design matrix of the spline. + # Note, BSpline appreciates C-contiguous float64 arrays as c=coef. + coef = np.eye(n_splines, dtype=np.float64) + if self.extrapolation == "periodic": + coef = np.concatenate((coef, coef[:degree, :])) + + extrapolate = self.extrapolation in ["periodic", "continue"] + + bsplines = [ + BSpline.construct_fast( + knots[:, i], coef, self.degree, extrapolate=extrapolate + ) + for i in range(n_features) + ] + self.bsplines_ = bsplines + + self.n_features_out_ = n_out - n_features * (1 - self.include_bias) + return self + + def transform(self, X): + """Transform each feature data to B-splines. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to transform. + + Returns + ------- + XBS : {ndarray, sparse matrix} of shape (n_samples, n_features * n_splines) + The matrix of features, where n_splines is the number of bases + elements of the B-splines, n_knots + degree - 1. + """ + check_is_fitted(self) + + X = validate_data(self, X, reset=False, accept_sparse=False, ensure_2d=True) + + n_samples, n_features = X.shape + n_splines = self.bsplines_[0].c.shape[1] + degree = self.degree + + # TODO: Remove this condition, once scipy 1.10 is the minimum version. + # Only scipy => 1.10 supports design_matrix(.., extrapolate=..). + # The default (implicit in scipy < 1.10) is extrapolate=False. + scipy_1_10 = sp_version >= parse_version("1.10.0") + # Note: self.bsplines_[0].extrapolate is True for extrapolation in + # ["periodic", "continue"] + if scipy_1_10: + use_sparse = self.sparse_output + kwargs_extrapolate = {"extrapolate": self.bsplines_[0].extrapolate} + else: + use_sparse = self.sparse_output and not self.bsplines_[0].extrapolate + kwargs_extrapolate = dict() + + # Note that scipy BSpline returns float64 arrays and converts input + # x=X[:, i] to c-contiguous float64. + n_out = self.n_features_out_ + n_features * (1 - self.include_bias) + if X.dtype in FLOAT_DTYPES: + dtype = X.dtype + else: + dtype = np.float64 + if use_sparse: + output_list = [] + else: + XBS = np.zeros((n_samples, n_out), dtype=dtype, order=self.order) + + for i in range(n_features): + spl = self.bsplines_[i] + + if self.extrapolation in ("continue", "error", "periodic"): + if self.extrapolation == "periodic": + # With periodic extrapolation we map x to the segment + # [spl.t[k], spl.t[n]]. + # This is equivalent to BSpline(.., extrapolate="periodic") + # for scipy>=1.0.0. + n = spl.t.size - spl.k - 1 + # Assign to new array to avoid inplace operation + x = spl.t[spl.k] + (X[:, i] - spl.t[spl.k]) % ( + spl.t[n] - spl.t[spl.k] + ) + else: + x = X[:, i] + + if use_sparse: + XBS_sparse = BSpline.design_matrix( + x, spl.t, spl.k, **kwargs_extrapolate + ) + if self.extrapolation == "periodic": + # See the construction of coef in fit. We need to add the last + # degree spline basis function to the first degree ones and + # then drop the last ones. + # Note: See comment about SparseEfficiencyWarning below. 
+ XBS_sparse = XBS_sparse.tolil() + XBS_sparse[:, :degree] += XBS_sparse[:, -degree:] + XBS_sparse = XBS_sparse[:, :-degree] + else: + XBS[:, (i * n_splines) : ((i + 1) * n_splines)] = spl(x) + else: # extrapolation in ("constant", "linear") + xmin, xmax = spl.t[degree], spl.t[-degree - 1] + # spline values at boundaries + f_min, f_max = spl(xmin), spl(xmax) + mask = (xmin <= X[:, i]) & (X[:, i] <= xmax) + if use_sparse: + mask_inv = ~mask + x = X[:, i].copy() + # Set some arbitrary values outside boundary that will be reassigned + # later. + x[mask_inv] = spl.t[self.degree] + XBS_sparse = BSpline.design_matrix(x, spl.t, spl.k) + # Note: Without converting to lil_matrix we would get: + # scipy.sparse._base.SparseEfficiencyWarning: Changing the sparsity + # structure of a csr_matrix is expensive. lil_matrix is more + # efficient. + if np.any(mask_inv): + XBS_sparse = XBS_sparse.tolil() + XBS_sparse[mask_inv, :] = 0 + else: + XBS[mask, (i * n_splines) : ((i + 1) * n_splines)] = spl(X[mask, i]) + + # Note for extrapolation: + # 'continue' is already returned as is by scipy BSplines + if self.extrapolation == "error": + # BSpline with extrapolate=False does not raise an error, but + # outputs np.nan. + if (use_sparse and np.any(np.isnan(XBS_sparse.data))) or ( + not use_sparse + and np.any( + np.isnan(XBS[:, (i * n_splines) : ((i + 1) * n_splines)]) + ) + ): + raise ValueError( + "X contains values beyond the limits of the knots." + ) + elif self.extrapolation == "constant": + # Set all values beyond xmin and xmax to the value of the + # spline basis functions at those two positions. + # Only the first degree and last degree number of splines + # have non-zero values at the boundaries. + + mask = X[:, i] < xmin + if np.any(mask): + if use_sparse: + # Note: See comment about SparseEfficiencyWarning above. + XBS_sparse = XBS_sparse.tolil() + XBS_sparse[mask, :degree] = f_min[:degree] + + else: + XBS[mask, (i * n_splines) : (i * n_splines + degree)] = f_min[ + :degree + ] + + mask = X[:, i] > xmax + if np.any(mask): + if use_sparse: + # Note: See comment about SparseEfficiencyWarning above. + XBS_sparse = XBS_sparse.tolil() + XBS_sparse[mask, -degree:] = f_max[-degree:] + else: + XBS[ + mask, + ((i + 1) * n_splines - degree) : ((i + 1) * n_splines), + ] = f_max[-degree:] + + elif self.extrapolation == "linear": + # Continue the degree first and degree last spline bases + # linearly beyond the boundaries, with slope = derivative at + # the boundary. + # Note that all others have derivative = value = 0 at the + # boundaries. + + # spline derivatives = slopes at boundaries + fp_min, fp_max = spl(xmin, nu=1), spl(xmax, nu=1) + # Compute the linear continuation. + if degree <= 1: + # For degree=1, the derivative of 2nd spline is not zero at + # boundary. For degree=0 it is the same as 'constant'. + degree += 1 + for j in range(degree): + mask = X[:, i] < xmin + if np.any(mask): + linear_extr = f_min[j] + (X[mask, i] - xmin) * fp_min[j] + if use_sparse: + # Note: See comment about SparseEfficiencyWarning above. + XBS_sparse = XBS_sparse.tolil() + XBS_sparse[mask, j] = linear_extr + else: + XBS[mask, i * n_splines + j] = linear_extr + + mask = X[:, i] > xmax + if np.any(mask): + k = n_splines - 1 - j + linear_extr = f_max[k] + (X[mask, i] - xmax) * fp_max[k] + if use_sparse: + # Note: See comment about SparseEfficiencyWarning above. 
+ XBS_sparse = XBS_sparse.tolil() + XBS_sparse[mask, k : k + 1] = linear_extr[:, None] + else: + XBS[mask, i * n_splines + k] = linear_extr + + if use_sparse: + XBS_sparse = XBS_sparse.tocsr() + output_list.append(XBS_sparse) + + if use_sparse: + # TODO: Remove this conditional error when the minimum supported version of + # SciPy is 1.9.2 + # `scipy.sparse.hstack` breaks in scipy<1.9.2 + # when `n_features_out_ > max_int32` + max_int32 = np.iinfo(np.int32).max + all_int32 = True + for mat in output_list: + all_int32 &= mat.indices.dtype == np.int32 + if ( + sp_version < parse_version("1.9.2") + and self.n_features_out_ > max_int32 + and all_int32 + ): + raise ValueError( + "In scipy versions `<1.9.2`, the function `scipy.sparse.hstack`" + " produces negative columns when:\n1. The output shape contains" + " `n_cols` too large to be represented by a 32bit signed" + " integer.\n2. All sub-matrices to be stacked have indices of" + " dtype `np.int32`.\nTo avoid this error, either use a version" + " of scipy `>=1.9.2` or alter the `SplineTransformer`" + " transformer to produce fewer than 2^31 output features" + ) + XBS = sparse.hstack(output_list, format="csr") + elif self.sparse_output: + # TODO: Remove once scipy 1.10 is the minimum version. See comments above. + XBS = sparse.csr_matrix(XBS) + + if self.include_bias: + return XBS + else: + # We throw away one spline basis per feature. + # We chose the last one. + indices = [j for j in range(XBS.shape[1]) if (j + 1) % n_splines != 0] + return XBS[:, indices] diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_target_encoder.py b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_target_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..77b404e3e39e9e7173d995f207ae1fca30f19f15 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_target_encoder.py @@ -0,0 +1,534 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from numbers import Integral, Real + +import numpy as np + +from ..base import OneToOneFeatureMixin, _fit_context +from ..utils._param_validation import Interval, StrOptions +from ..utils.multiclass import type_of_target +from ..utils.validation import ( + _check_feature_names_in, + _check_y, + check_consistent_length, + check_is_fitted, +) +from ._encoders import _BaseEncoder +from ._target_encoder_fast import _fit_encoding_fast, _fit_encoding_fast_auto_smooth + + +class TargetEncoder(OneToOneFeatureMixin, _BaseEncoder): + """Target Encoder for regression and classification targets. + + Each category is encoded based on a shrunk estimate of the average target + values for observations belonging to the category. The encoding scheme mixes + the global target mean with the target mean conditioned on the value of the + category (see [MIC]_). + + When the target type is "multiclass", encodings are based + on the conditional probability estimate for each class. The target is first + binarized using the "one-vs-all" scheme via + :class:`~sklearn.preprocessing.LabelBinarizer`, then the average target + value for each class and each category is used for encoding, resulting in + `n_features` * `n_classes` encoded output features. + + :class:`TargetEncoder` considers missing values, such as `np.nan` or `None`, + as another category and encodes them like any other category. Categories + that are not seen during :meth:`fit` are encoded with the target mean, i.e. + `target_mean_`.
+ + For a demo on the importance of the `TargetEncoder` internal cross-fitting, + see + :ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder_cross_val.py`. + For a comparison of different encoders, refer to + :ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder.py`. Read + more in the :ref:`User Guide `. + + .. note:: + `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a + :term:`cross fitting` scheme is used in `fit_transform` for encoding. + See the :ref:`User Guide ` for details. + + .. versionadded:: 1.3 + + Parameters + ---------- + categories : "auto" or list of shape (n_features,) of array-like, default="auto" + Categories (unique values) per feature: + + - `"auto"` : Determine categories automatically from the training data. + - list : `categories[i]` holds the categories expected in the i-th column. The + passed categories should not mix strings and numeric values within a single + feature, and should be sorted in case of numeric values. + + The used categories are stored in the `categories_` fitted attribute. + + target_type : {"auto", "continuous", "binary", "multiclass"}, default="auto" + Type of target. + + - `"auto"` : Type of target is inferred with + :func:`~sklearn.utils.multiclass.type_of_target`. + - `"continuous"` : Continuous target + - `"binary"` : Binary target + - `"multiclass"` : Multiclass target + + .. note:: + The type of target inferred with `"auto"` may not be the desired target + type used for modeling. For example, if the target consisted of integers + between 0 and 100, then :func:`~sklearn.utils.multiclass.type_of_target` + will infer the target as `"multiclass"`. In this case, setting + `target_type="continuous"` will specify the target as a regression + problem. The `target_type_` attribute gives the target type used by the + encoder. + + .. versionchanged:: 1.4 + Added the option 'multiclass'. + + smooth : "auto" or float, default="auto" + The amount of mixing of the target mean conditioned on the value of the + category with the global target mean. A larger `smooth` value will put + more weight on the global target mean. + If `"auto"`, then `smooth` is set to an empirical Bayes estimate. + + cv : int, default=5 + Determines the number of folds in the :term:`cross fitting` strategy used in + :meth:`fit_transform`. For classification targets, `StratifiedKFold` is used + and for continuous targets, `KFold` is used. + + shuffle : bool, default=True + Whether to shuffle the data in :meth:`fit_transform` before splitting into + folds. Note that the samples within each split will not be shuffled. + + random_state : int, RandomState instance or None, default=None + When `shuffle` is True, `random_state` affects the ordering of the + indices, which controls the randomness of each fold. Otherwise, this + parameter has no effect. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Attributes + ---------- + encodings_ : list of shape (n_features,) or (n_features * n_classes) of \ + ndarray + Encodings learnt on all of `X`. + For feature `i`, `encodings_[i]` are the encodings matching the + categories listed in `categories_[i]`. When `target_type_` is + "multiclass", the encoding for feature `i` and class `j` is stored in + `encodings_[j + (i * len(classes_))]`. 
E.g., for 2 features (f) and + 3 classes (c), encodings are ordered: + f0_c0, f0_c1, f0_c2, f1_c0, f1_c1, f1_c2, + + categories_ : list of shape (n_features,) of ndarray + The categories of each input feature determined during fitting or + specified in `categories` + (in order of the features in `X` and corresponding with the output + of :meth:`transform`). + + target_type_ : str + Type of target. + + target_mean_ : float + The overall mean of the target. This value is only used in :meth:`transform` + to encode categories. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + classes_ : ndarray or None + If `target_type_` is 'binary' or 'multiclass', holds the label for each class, + otherwise `None`. + + See Also + -------- + OrdinalEncoder : Performs an ordinal (integer) encoding of the categorical features. + Contrary to TargetEncoder, this encoding is not supervised. Treating the + resulting encoding as numerical features therefore leads to arbitrarily + ordered values and typically to lower predictive performance + when used as preprocessing for a classifier or regressor. + OneHotEncoder : Performs a one-hot encoding of categorical features. This + unsupervised encoding is better suited for low cardinality categorical + variables as it generates one new feature per unique category. + + References + ---------- + .. [MIC] :doi:`Micci-Barreca, Daniele. "A preprocessing scheme for high-cardinality + categorical attributes in classification and prediction problems" + SIGKDD Explor. Newsl. 3, 1 (July 2001), 27–32. <10.1145/507533.507538>` + + Examples + -------- + With `smooth="auto"`, the smoothing parameter is set to an empirical Bayes estimate: + + >>> import numpy as np + >>> from sklearn.preprocessing import TargetEncoder + >>> X = np.array([["dog"] * 20 + ["cat"] * 30 + ["snake"] * 38], dtype=object).T + >>> y = [90.3] * 5 + [80.1] * 15 + [20.4] * 5 + [20.1] * 25 + [21.2] * 8 + [49] * 30 + >>> enc_auto = TargetEncoder(smooth="auto") + >>> X_trans = enc_auto.fit_transform(X, y) + + >>> # A high `smooth` parameter puts more weight on the global mean in the categorical + >>> # encodings: + >>> enc_high_smooth = TargetEncoder(smooth=5000.0).fit(X, y) + >>> enc_high_smooth.target_mean_ + np.float64(44.3) + >>> enc_high_smooth.encodings_ + [array([44.1, 44.4, 44.3])] + + >>> # On the other hand, a low `smooth` parameter puts more weight on the target + >>> # conditioned on the value of the category: + >>> enc_low_smooth = TargetEncoder(smooth=1.0).fit(X, y) + >>> enc_low_smooth.encodings_ + [array([21, 80.8, 43.2])] + """ + + _parameter_constraints: dict = { + "categories": [StrOptions({"auto"}), list], + "target_type": [StrOptions({"auto", "continuous", "binary", "multiclass"})], + "smooth": [StrOptions({"auto"}), Interval(Real, 0, None, closed="left")], + "cv": [Interval(Integral, 2, None, closed="left")], + "shuffle": ["boolean"], + "random_state": ["random_state"], + } + + def __init__( + self, + categories="auto", + target_type="auto", + smooth="auto", + cv=5, + shuffle=True, + random_state=None, + ): + self.categories = categories + self.smooth = smooth + self.target_type = target_type + self.cv = cv + self.shuffle = shuffle + self.random_state = random_state + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y): + """Fit the :class:`TargetEncoder` to X and y.
+ + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to determine the categories of each feature. + + y : array-like of shape (n_samples,) + The target data used to encode the categories. + + Returns + ------- + self : object + Fitted encoder. + """ + self._fit_encodings_all(X, y) + return self + + @_fit_context(prefer_skip_nested_validation=True) + def fit_transform(self, X, y): + """Fit :class:`TargetEncoder` and transform X with the target encoding. + + .. note:: + `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a + :term:`cross fitting` scheme is used in `fit_transform` for encoding. + See the :ref:`User Guide `. for details. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to determine the categories of each feature. + + y : array-like of shape (n_samples,) + The target data used to encode the categories. + + Returns + ------- + X_trans : ndarray of shape (n_samples, n_features) or \ + (n_samples, (n_features * n_classes)) + Transformed input. + """ + from ..model_selection import KFold, StratifiedKFold # avoid circular import + + X_ordinal, X_known_mask, y_encoded, n_categories = self._fit_encodings_all(X, y) + + # The cv splitter is voluntarily restricted to *KFold to enforce non + # overlapping validation folds, otherwise the fit_transform output will + # not be well-specified. + if self.target_type_ == "continuous": + cv = KFold(self.cv, shuffle=self.shuffle, random_state=self.random_state) + else: + cv = StratifiedKFold( + self.cv, shuffle=self.shuffle, random_state=self.random_state + ) + + # If 'multiclass' multiply axis=1 by num classes else keep shape the same + if self.target_type_ == "multiclass": + X_out = np.empty( + (X_ordinal.shape[0], X_ordinal.shape[1] * len(self.classes_)), + dtype=np.float64, + ) + else: + X_out = np.empty_like(X_ordinal, dtype=np.float64) + + for train_idx, test_idx in cv.split(X, y): + X_train, y_train = X_ordinal[train_idx, :], y_encoded[train_idx] + y_train_mean = np.mean(y_train, axis=0) + + if self.target_type_ == "multiclass": + encodings = self._fit_encoding_multiclass( + X_train, + y_train, + n_categories, + y_train_mean, + ) + else: + encodings = self._fit_encoding_binary_or_continuous( + X_train, + y_train, + n_categories, + y_train_mean, + ) + self._transform_X_ordinal( + X_out, + X_ordinal, + ~X_known_mask, + test_idx, + encodings, + y_train_mean, + ) + return X_out + + def transform(self, X): + """Transform X with the target encoding. + + .. note:: + `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a + :term:`cross fitting` scheme is used in `fit_transform` for encoding. + See the :ref:`User Guide `. for details. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to determine the categories of each feature. + + Returns + ------- + X_trans : ndarray of shape (n_samples, n_features) or \ + (n_samples, (n_features * n_classes)) + Transformed input. 
+ """ + X_ordinal, X_known_mask = self._transform( + X, handle_unknown="ignore", ensure_all_finite="allow-nan" + ) + + # If 'multiclass' multiply axis=1 by num of classes else keep shape the same + if self.target_type_ == "multiclass": + X_out = np.empty( + (X_ordinal.shape[0], X_ordinal.shape[1] * len(self.classes_)), + dtype=np.float64, + ) + else: + X_out = np.empty_like(X_ordinal, dtype=np.float64) + + self._transform_X_ordinal( + X_out, + X_ordinal, + ~X_known_mask, + slice(None), + self.encodings_, + self.target_mean_, + ) + return X_out + + def _fit_encodings_all(self, X, y): + """Fit a target encoding with all the data.""" + # avoid circular import + from ..preprocessing import ( + LabelBinarizer, + LabelEncoder, + ) + + check_consistent_length(X, y) + self._fit(X, handle_unknown="ignore", ensure_all_finite="allow-nan") + + if self.target_type == "auto": + accepted_target_types = ("binary", "multiclass", "continuous") + inferred_type_of_target = type_of_target(y, input_name="y") + if inferred_type_of_target not in accepted_target_types: + raise ValueError( + "Unknown label type: Target type was inferred to be " + f"{inferred_type_of_target!r}. Only {accepted_target_types} are " + "supported." + ) + self.target_type_ = inferred_type_of_target + else: + self.target_type_ = self.target_type + + self.classes_ = None + if self.target_type_ == "binary": + label_encoder = LabelEncoder() + y = label_encoder.fit_transform(y) + self.classes_ = label_encoder.classes_ + elif self.target_type_ == "multiclass": + label_binarizer = LabelBinarizer() + y = label_binarizer.fit_transform(y) + self.classes_ = label_binarizer.classes_ + else: # continuous + y = _check_y(y, y_numeric=True, estimator=self) + + self.target_mean_ = np.mean(y, axis=0) + + X_ordinal, X_known_mask = self._transform( + X, handle_unknown="ignore", ensure_all_finite="allow-nan" + ) + n_categories = np.fromiter( + (len(category_for_feature) for category_for_feature in self.categories_), + dtype=np.int64, + count=len(self.categories_), + ) + if self.target_type_ == "multiclass": + encodings = self._fit_encoding_multiclass( + X_ordinal, + y, + n_categories, + self.target_mean_, + ) + else: + encodings = self._fit_encoding_binary_or_continuous( + X_ordinal, + y, + n_categories, + self.target_mean_, + ) + self.encodings_ = encodings + + return X_ordinal, X_known_mask, y, n_categories + + def _fit_encoding_binary_or_continuous( + self, X_ordinal, y, n_categories, target_mean + ): + """Learn target encodings.""" + if self.smooth == "auto": + y_variance = np.var(y) + encodings = _fit_encoding_fast_auto_smooth( + X_ordinal, + y, + n_categories, + target_mean, + y_variance, + ) + else: + encodings = _fit_encoding_fast( + X_ordinal, + y, + n_categories, + self.smooth, + target_mean, + ) + return encodings + + def _fit_encoding_multiclass(self, X_ordinal, y, n_categories, target_mean): + """Learn multiclass encodings. + + Learn encodings for each class (c) then reorder encodings such that + the same features (f) are grouped together. 
`reorder_index` enables + converting from: + f0_c0, f1_c0, f0_c1, f1_c1, f0_c2, f1_c2 + to: + f0_c0, f0_c1, f0_c2, f1_c0, f1_c1, f1_c2 + """ + n_features = self.n_features_in_ + n_classes = len(self.classes_) + + encodings = [] + for i in range(n_classes): + y_class = y[:, i] + encoding = self._fit_encoding_binary_or_continuous( + X_ordinal, + y_class, + n_categories, + target_mean[i], + ) + encodings.extend(encoding) + + reorder_index = ( + idx + for start in range(n_features) + for idx in range(start, (n_classes * n_features), n_features) + ) + return [encodings[idx] for idx in reorder_index] + + def _transform_X_ordinal( + self, + X_out, + X_ordinal, + X_unknown_mask, + row_indices, + encodings, + target_mean, + ): + """Transform X_ordinal using encodings. + + In the multiclass case, `X_ordinal` and `X_unknown_mask` have column + (axis=1) size `n_features`, while `encodings` has length of size + `n_features * n_classes`. `feat_idx` deals with this by repeating + feature indices by `n_classes`. E.g., for 3 features, 2 classes: + 0,0,1,1,2,2 + + Additionally, `target_mean` is of shape (`n_classes`,) so `mean_idx` + cycles through 0 to `n_classes` - 1, `n_features` times. + """ + if self.target_type_ == "multiclass": + n_classes = len(self.classes_) + for e_idx, encoding in enumerate(encodings): + # Repeat feature indices by n_classes + feat_idx = e_idx // n_classes + # Cycle through each class + mean_idx = e_idx % n_classes + X_out[row_indices, e_idx] = encoding[X_ordinal[row_indices, feat_idx]] + X_out[X_unknown_mask[:, feat_idx], e_idx] = target_mean[mean_idx] + else: + for e_idx, encoding in enumerate(encodings): + X_out[row_indices, e_idx] = encoding[X_ordinal[row_indices, e_idx]] + X_out[X_unknown_mask[:, e_idx], e_idx] = target_mean + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. `feature_names_in_` is used unless it is + not defined, in which case the following input feature names are + generated: `["x0", "x1", ..., "x(n_features_in_ - 1)"]`. + When `target_type_` is "multiclass", the names are of the format + `<feature_name>_<class_name>`.
+ """ + check_is_fitted(self, "n_features_in_") + feature_names = _check_feature_names_in(self, input_features) + if self.target_type_ == "multiclass": + feature_names = [ + f"{feature_name}_{class_name}" + for feature_name in feature_names + for class_name in self.classes_ + ] + return np.asarray(feature_names, dtype=object) + else: + return feature_names + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.target_tags.required = True + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_target_encoder_fast.pyx b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_target_encoder_fast.pyx new file mode 100644 index 0000000000000000000000000000000000000000..dca5f78e8d60fd70906b63cc434309b832e68d57 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_target_encoder_fast.pyx @@ -0,0 +1,167 @@ +from libc.math cimport isnan +from libcpp.vector cimport vector + +from ..utils._typedefs cimport float32_t, float64_t, int32_t, int64_t + +import numpy as np + + +ctypedef fused INT_DTYPE: + int64_t + int32_t + +ctypedef fused Y_DTYPE: + int64_t + int32_t + float64_t + float32_t + + +def _fit_encoding_fast( + INT_DTYPE[:, ::1] X_int, + const Y_DTYPE[:] y, + int64_t[::1] n_categories, + double smooth, + double y_mean, +): + """Fit a target encoding on X_int and y. + + This implementation uses Eq 7 from [1] to compute the encoding. + As stated in the paper, Eq 7 is the same as Eq 3. + + [1]: Micci-Barreca, Daniele. "A preprocessing scheme for high-cardinality + categorical attributes in classification and prediction problems" + """ + cdef: + int64_t sample_idx, feat_idx, cat_idx, n_cats + INT_DTYPE X_int_tmp + int n_samples = X_int.shape[0] + int n_features = X_int.shape[1] + double smooth_sum = smooth * y_mean + int64_t max_n_cats = np.max(n_categories) + double[::1] sums = np.empty(max_n_cats, dtype=np.float64) + double[::1] counts = np.empty(max_n_cats, dtype=np.float64) + list encodings = [] + double[::1] current_encoding + # Gives access to encodings without gil + vector[double*] encoding_vec + + encoding_vec.resize(n_features) + for feat_idx in range(n_features): + current_encoding = np.empty(shape=n_categories[feat_idx], dtype=np.float64) + encoding_vec[feat_idx] = ¤t_encoding[0] + encodings.append(np.asarray(current_encoding)) + + with nogil: + for feat_idx in range(n_features): + n_cats = n_categories[feat_idx] + + for cat_idx in range(n_cats): + sums[cat_idx] = smooth_sum + counts[cat_idx] = smooth + + for sample_idx in range(n_samples): + X_int_tmp = X_int[sample_idx, feat_idx] + # -1 are unknown categories, which are not counted + if X_int_tmp == -1: + continue + sums[X_int_tmp] += y[sample_idx] + counts[X_int_tmp] += 1.0 + + for cat_idx in range(n_cats): + if counts[cat_idx] == 0: + encoding_vec[feat_idx][cat_idx] = y_mean + else: + encoding_vec[feat_idx][cat_idx] = sums[cat_idx] / counts[cat_idx] + + return encodings + + +def _fit_encoding_fast_auto_smooth( + INT_DTYPE[:, ::1] X_int, + const Y_DTYPE[:] y, + int64_t[::1] n_categories, + double y_mean, + double y_variance, +): + """Fit a target encoding on X_int and y with auto smoothing. + + This implementation uses Eq 5 and 6 from [1]. + + [1]: Micci-Barreca, Daniele. 
"A preprocessing scheme for high-cardinality + categorical attributes in classification and prediction problems" + """ + cdef: + int64_t sample_idx, feat_idx, cat_idx, n_cats + INT_DTYPE X_int_tmp + double diff + int n_samples = X_int.shape[0] + int n_features = X_int.shape[1] + int64_t max_n_cats = np.max(n_categories) + double[::1] means = np.empty(max_n_cats, dtype=np.float64) + int64_t[::1] counts = np.empty(max_n_cats, dtype=np.int64) + double[::1] sum_of_squared_diffs = np.empty(max_n_cats, dtype=np.float64) + double lambda_ + list encodings = [] + double[::1] current_encoding + # Gives access to encodings without gil + vector[double*] encoding_vec + + encoding_vec.resize(n_features) + for feat_idx in range(n_features): + current_encoding = np.empty(shape=n_categories[feat_idx], dtype=np.float64) + encoding_vec[feat_idx] = ¤t_encoding[0] + encodings.append(np.asarray(current_encoding)) + + # TODO: parallelize this with OpenMP prange. When n_features >= n_threads, it's + # probably good to parallelize the outer loop. When n_features is too small, + # then it would probably better to parallelize the nested loops on n_samples and + # n_cats, but the code to handle thread-local temporary variables might be + # significantly more complex. + with nogil: + for feat_idx in range(n_features): + n_cats = n_categories[feat_idx] + + for cat_idx in range(n_cats): + means[cat_idx] = 0.0 + counts[cat_idx] = 0 + sum_of_squared_diffs[cat_idx] = 0.0 + + # first pass to compute the mean + for sample_idx in range(n_samples): + X_int_tmp = X_int[sample_idx, feat_idx] + + # -1 are unknown categories, which are not counted + if X_int_tmp == -1: + continue + counts[X_int_tmp] += 1 + means[X_int_tmp] += y[sample_idx] + + for cat_idx in range(n_cats): + means[cat_idx] /= counts[cat_idx] + + # second pass to compute the sum of squared differences + for sample_idx in range(n_samples): + X_int_tmp = X_int[sample_idx, feat_idx] + if X_int_tmp == -1: + continue + diff = y[sample_idx] - means[X_int_tmp] + sum_of_squared_diffs[X_int_tmp] += diff * diff + + for cat_idx in range(n_cats): + lambda_ = ( + y_variance * counts[cat_idx] / + (y_variance * counts[cat_idx] + sum_of_squared_diffs[cat_idx] / + counts[cat_idx]) + ) + if isnan(lambda_): + # A nan can happen when: + # 1. counts[cat_idx] == 0 + # 2. 
y_variance == 0 and sum_of_squared_diffs[cat_idx] == 0 + encoding_vec[feat_idx][cat_idx] = y_mean + else: + encoding_vec[feat_idx][cat_idx] = ( + lambda_ * means[cat_idx] + (1 - lambda_) * y_mean + ) + + return encodings diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/meson.build b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..052c4a6766ad4ed409a08ffe3c8ff31a7412d3dd --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/meson.build @@ -0,0 +1,13 @@ +py.extension_module( + '_csr_polynomial_expansion', + [cython_gen.process('_csr_polynomial_expansion.pyx'), utils_cython_tree], + subdir: 'sklearn/preprocessing', + install: true +) + +py.extension_module( + '_target_encoder_fast', + [cython_gen_cpp.process('_target_encoder_fast.pyx'), utils_cython_tree], + subdir: 'sklearn/preprocessing', + install: true +) diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_common.py b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_common.py new file mode 100644 index 0000000000000000000000000000000000000000..09f702f64ce2367ef6fe47fdb789e0475bf11def --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_common.py @@ -0,0 +1,187 @@ +import warnings + +import numpy as np +import pytest + +from sklearn.base import clone +from sklearn.datasets import load_iris +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import ( + MaxAbsScaler, + MinMaxScaler, + PowerTransformer, + QuantileTransformer, + RobustScaler, + StandardScaler, + maxabs_scale, + minmax_scale, + power_transform, + quantile_transform, + robust_scale, + scale, +) +from sklearn.utils._testing import assert_allclose, assert_array_equal +from sklearn.utils.fixes import ( + BSR_CONTAINERS, + COO_CONTAINERS, + CSC_CONTAINERS, + CSR_CONTAINERS, + DIA_CONTAINERS, + DOK_CONTAINERS, + LIL_CONTAINERS, +) + +iris = load_iris() + + +def _get_valid_samples_by_column(X, col): + """Get non NaN samples in column of X""" + return X[:, [col]][~np.isnan(X[:, col])] + + +@pytest.mark.parametrize( + "est, func, support_sparse, strictly_positive, omit_kwargs", + [ + (MaxAbsScaler(), maxabs_scale, True, False, []), + (MinMaxScaler(), minmax_scale, False, False, ["clip"]), + (StandardScaler(), scale, False, False, []), + (StandardScaler(with_mean=False), scale, True, False, []), + (PowerTransformer("yeo-johnson"), power_transform, False, False, []), + (PowerTransformer("box-cox"), power_transform, False, True, []), + (QuantileTransformer(n_quantiles=10), quantile_transform, True, False, []), + (RobustScaler(), robust_scale, False, False, []), + (RobustScaler(with_centering=False), robust_scale, True, False, []), + ], +) +def test_missing_value_handling( + est, func, support_sparse, strictly_positive, omit_kwargs +): + # check that the preprocessing method let pass nan + rng = np.random.RandomState(42) + X = iris.data.copy() + n_missing = 50 + X[ + rng.randint(X.shape[0], size=n_missing), rng.randint(X.shape[1], size=n_missing) + ] = np.nan + if strictly_positive: + X += np.nanmin(X) + 0.1 + X_train, X_test = train_test_split(X, random_state=1) + # sanity 
check + assert not np.all(np.isnan(X_train), axis=0).any() + assert np.any(np.isnan(X_train), axis=0).all() + assert np.any(np.isnan(X_test), axis=0).all() + X_test[:, 0] = np.nan # make sure this boundary case is tested + + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + Xt = est.fit(X_train).transform(X_test) + # ensure no warnings are raised + # missing values should still be missing, and only them + assert_array_equal(np.isnan(Xt), np.isnan(X_test)) + + # check that the function leads to the same results as the class + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + Xt_class = est.transform(X_train) + kwargs = est.get_params() + # remove the parameters which should be omitted because they + # are not defined in the counterpart function of the preprocessing class + for kwarg in omit_kwargs: + _ = kwargs.pop(kwarg) + Xt_func = func(X_train, **kwargs) + assert_array_equal(np.isnan(Xt_func), np.isnan(Xt_class)) + assert_allclose(Xt_func[~np.isnan(Xt_func)], Xt_class[~np.isnan(Xt_class)]) + + # check that the inverse transform keep NaN + Xt_inv = est.inverse_transform(Xt) + assert_array_equal(np.isnan(Xt_inv), np.isnan(X_test)) + # FIXME: we can introduce equal_nan=True in recent version of numpy. + # For the moment which just check that non-NaN values are almost equal. + assert_allclose(Xt_inv[~np.isnan(Xt_inv)], X_test[~np.isnan(X_test)]) + + for i in range(X.shape[1]): + # train only on non-NaN + est.fit(_get_valid_samples_by_column(X_train, i)) + # check transforming with NaN works even when training without NaN + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + Xt_col = est.transform(X_test[:, [i]]) + assert_allclose(Xt_col, Xt[:, [i]]) + # check non-NaN is handled as before - the 1st column is all nan + if not np.isnan(X_test[:, i]).all(): + Xt_col_nonan = est.transform(_get_valid_samples_by_column(X_test, i)) + assert_array_equal(Xt_col_nonan, Xt_col[~np.isnan(Xt_col.squeeze())]) + + if support_sparse: + est_dense = clone(est) + est_sparse = clone(est) + + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + Xt_dense = est_dense.fit(X_train).transform(X_test) + Xt_inv_dense = est_dense.inverse_transform(Xt_dense) + + for sparse_container in ( + BSR_CONTAINERS + + COO_CONTAINERS + + CSC_CONTAINERS + + CSR_CONTAINERS + + DIA_CONTAINERS + + DOK_CONTAINERS + + LIL_CONTAINERS + ): + # check that the dense and sparse inputs lead to the same results + # precompute the matrix to avoid catching side warnings + X_train_sp = sparse_container(X_train) + X_test_sp = sparse_container(X_test) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", PendingDeprecationWarning) + warnings.simplefilter("error", RuntimeWarning) + Xt_sp = est_sparse.fit(X_train_sp).transform(X_test_sp) + + assert_allclose(Xt_sp.toarray(), Xt_dense) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", PendingDeprecationWarning) + warnings.simplefilter("error", RuntimeWarning) + Xt_inv_sp = est_sparse.inverse_transform(Xt_sp) + + assert_allclose(Xt_inv_sp.toarray(), Xt_inv_dense) + + +@pytest.mark.parametrize( + "est, func", + [ + (MaxAbsScaler(), maxabs_scale), + (MinMaxScaler(), minmax_scale), + (StandardScaler(), scale), + (StandardScaler(with_mean=False), scale), + (PowerTransformer("yeo-johnson"), power_transform), + ( + PowerTransformer("box-cox"), + power_transform, + ), + (QuantileTransformer(n_quantiles=3), quantile_transform), + (RobustScaler(), 
robust_scale), + (RobustScaler(with_centering=False), robust_scale), + ], +) +def test_missing_value_pandas_na_support(est, func): + # Test pandas IntegerArray with pd.NA + pd = pytest.importorskip("pandas") + + X = np.array( + [ + [1, 2, 3, np.nan, np.nan, 4, 5, 1], + [np.nan, np.nan, 8, 4, 6, np.nan, np.nan, 8], + [1, 2, 3, 4, 5, 6, 7, 8], + ] + ).T + + # Creates dataframe with IntegerArrays with pd.NA + X_df = pd.DataFrame(X, dtype="Int16", columns=["a", "b", "c"]) + X_df["c"] = X_df["c"].astype("int") + + X_trans = est.fit_transform(X) + X_df_trans = est.fit_transform(X_df) + + assert_allclose(X_trans, X_df_trans) diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_data.py b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_data.py new file mode 100644 index 0000000000000000000000000000000000000000..a618d426a7dcb28da4ea858bec03dd957de5eb0c --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_data.py @@ -0,0 +1,2693 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import re +import warnings + +import numpy as np +import numpy.linalg as la +import pytest +from scipy import sparse, stats + +from sklearn import config_context, datasets +from sklearn.base import clone +from sklearn.exceptions import NotFittedError +from sklearn.externals._packaging.version import parse as parse_version +from sklearn.metrics.pairwise import linear_kernel +from sklearn.model_selection import cross_val_predict +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import ( + Binarizer, + KernelCenterer, + MaxAbsScaler, + MinMaxScaler, + Normalizer, + PowerTransformer, + QuantileTransformer, + RobustScaler, + StandardScaler, + add_dummy_feature, + maxabs_scale, + minmax_scale, + normalize, + power_transform, + quantile_transform, + robust_scale, + scale, +) +from sklearn.preprocessing._data import BOUNDS_THRESHOLD, _handle_zeros_in_scale +from sklearn.svm import SVR +from sklearn.utils import gen_batches, shuffle +from sklearn.utils._array_api import ( + _convert_to_numpy, + _get_namespace_device_dtype_ids, + yield_namespace_device_dtype_combinations, +) +from sklearn.utils._test_common.instance_generator import _get_check_estimator_ids +from sklearn.utils._testing import ( + _array_api_for_tests, + _convert_container, + assert_allclose, + assert_allclose_dense_sparse, + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, + assert_array_less, + skip_if_32bit, +) +from sklearn.utils.estimator_checks import ( + check_array_api_input_and_values, +) +from sklearn.utils.fixes import ( + COO_CONTAINERS, + CSC_CONTAINERS, + CSR_CONTAINERS, + LIL_CONTAINERS, + sp_version, +) +from sklearn.utils.sparsefuncs import mean_variance_axis + +iris = datasets.load_iris() + +# Make some data to be used many times +rng = np.random.RandomState(0) +n_features = 30 +n_samples = 1000 +offsets = rng.uniform(-1, 1, size=n_features) +scales = rng.uniform(1, 10, size=n_features) +X_2d = rng.randn(n_samples, n_features) * scales + offsets +X_1row = X_2d[0, :].reshape(1, n_features) +X_1col = X_2d[:, 0].reshape(n_samples, 1) +X_list_1row = X_1row.tolist() +X_list_1col = X_1col.tolist() + + +def toarray(a): + if hasattr(a, "toarray"): + a = a.toarray() + return a + + +def _check_dim_1axis(a): + return np.asarray(a).shape[0] + + +def assert_correct_incr(i, batch_start, batch_stop, n, chunk_size, n_samples_seen): + if batch_stop != n: + assert (i + 1) * chunk_size == n_samples_seen + else: + 
assert i * chunk_size + (batch_stop - batch_start) == n_samples_seen + + +def test_raises_value_error_if_sample_weights_greater_than_1d(): + # Sample weights must be either scalar or 1D + + n_sampless = [2, 3] + n_featuress = [3, 2] + + for n_samples, n_features in zip(n_sampless, n_featuress): + X = rng.randn(n_samples, n_features) + y = rng.randn(n_samples) + + scaler = StandardScaler() + + # make sure Error is raised the sample weights greater than 1d + sample_weight_notOK = rng.randn(n_samples, 1) ** 2 + with pytest.raises(ValueError): + scaler.fit(X, y, sample_weight=sample_weight_notOK) + + +@pytest.mark.parametrize( + ["Xw", "X", "sample_weight"], + [ + ([[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [1, 2, 3], [4, 5, 6]], [2.0, 1.0]), + ( + [[1, 0, 1], [0, 0, 1]], + [[1, 0, 1], [0, 0, 1], [0, 0, 1], [0, 0, 1]], + np.array([1, 3]), + ), + ( + [[1, np.nan, 1], [np.nan, np.nan, 1]], + [ + [1, np.nan, 1], + [np.nan, np.nan, 1], + [np.nan, np.nan, 1], + [np.nan, np.nan, 1], + ], + np.array([1, 3]), + ), + ], +) +@pytest.mark.parametrize("array_constructor", ["array", "sparse_csr", "sparse_csc"]) +def test_standard_scaler_sample_weight(Xw, X, sample_weight, array_constructor): + with_mean = not array_constructor.startswith("sparse") + X = _convert_container(X, array_constructor) + Xw = _convert_container(Xw, array_constructor) + + # weighted StandardScaler + yw = np.ones(Xw.shape[0]) + scaler_w = StandardScaler(with_mean=with_mean) + scaler_w.fit(Xw, yw, sample_weight=sample_weight) + + # unweighted, but with repeated samples + y = np.ones(X.shape[0]) + scaler = StandardScaler(with_mean=with_mean) + scaler.fit(X, y) + + X_test = [[1.5, 2.5, 3.5], [3.5, 4.5, 5.5]] + + assert_almost_equal(scaler.mean_, scaler_w.mean_) + assert_almost_equal(scaler.var_, scaler_w.var_) + assert_almost_equal(scaler.transform(X_test), scaler_w.transform(X_test)) + + +def test_standard_scaler_1d(): + # Test scaling of dataset along single axis + for X in [X_1row, X_1col, X_list_1row, X_list_1row]: + scaler = StandardScaler() + X_scaled = scaler.fit(X).transform(X, copy=True) + + if isinstance(X, list): + X = np.array(X) # cast only after scaling done + + if _check_dim_1axis(X) == 1: + assert_almost_equal(scaler.mean_, X.ravel()) + assert_almost_equal(scaler.scale_, np.ones(n_features)) + assert_array_almost_equal(X_scaled.mean(axis=0), np.zeros_like(n_features)) + assert_array_almost_equal(X_scaled.std(axis=0), np.zeros_like(n_features)) + else: + assert_almost_equal(scaler.mean_, X.mean()) + assert_almost_equal(scaler.scale_, X.std()) + assert_array_almost_equal(X_scaled.mean(axis=0), np.zeros_like(n_features)) + assert_array_almost_equal(X_scaled.mean(axis=0), 0.0) + assert_array_almost_equal(X_scaled.std(axis=0), 1.0) + assert scaler.n_samples_seen_ == X.shape[0] + + # check inverse transform + X_scaled_back = scaler.inverse_transform(X_scaled) + assert_array_almost_equal(X_scaled_back, X) + + # Constant feature + X = np.ones((5, 1)) + scaler = StandardScaler() + X_scaled = scaler.fit(X).transform(X, copy=True) + assert_almost_equal(scaler.mean_, 1.0) + assert_almost_equal(scaler.scale_, 1.0) + assert_array_almost_equal(X_scaled.mean(axis=0), 0.0) + assert_array_almost_equal(X_scaled.std(axis=0), 0.0) + assert scaler.n_samples_seen_ == X.shape[0] + + +@pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS + CSR_CONTAINERS) +@pytest.mark.parametrize("add_sample_weight", [False, True]) +def test_standard_scaler_dtype(add_sample_weight, sparse_container): + # Ensure scaling does not affect dtype + rng = 
np.random.RandomState(0) + n_samples = 10 + n_features = 3 + if add_sample_weight: + sample_weight = np.ones(n_samples) + else: + sample_weight = None + with_mean = True + if sparse_container is not None: + # scipy sparse containers do not support float16, see + # https://github.com/scipy/scipy/issues/7408 for more details. + supported_dtype = [np.float64, np.float32] + else: + supported_dtype = [np.float64, np.float32, np.float16] + for dtype in supported_dtype: + X = rng.randn(n_samples, n_features).astype(dtype) + if sparse_container is not None: + X = sparse_container(X) + with_mean = False + + scaler = StandardScaler(with_mean=with_mean) + X_scaled = scaler.fit(X, sample_weight=sample_weight).transform(X) + assert X.dtype == X_scaled.dtype + assert scaler.mean_.dtype == np.float64 + assert scaler.scale_.dtype == np.float64 + + +@pytest.mark.parametrize( + "scaler", + [ + StandardScaler(with_mean=False), + RobustScaler(with_centering=False), + ], +) +@pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS + CSR_CONTAINERS) +@pytest.mark.parametrize("add_sample_weight", [False, True]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("constant", [0, 1.0, 100.0]) +def test_standard_scaler_constant_features( + scaler, add_sample_weight, sparse_container, dtype, constant +): + if isinstance(scaler, RobustScaler) and add_sample_weight: + pytest.skip(f"{scaler.__class__.__name__} does not yet support sample_weight") + + rng = np.random.RandomState(0) + n_samples = 100 + n_features = 1 + if add_sample_weight: + fit_params = dict(sample_weight=rng.uniform(size=n_samples) * 2) + else: + fit_params = {} + X_array = np.full(shape=(n_samples, n_features), fill_value=constant, dtype=dtype) + X = X_array if sparse_container is None else sparse_container(X_array) + X_scaled = scaler.fit(X, **fit_params).transform(X) + + if isinstance(scaler, StandardScaler): + # The variance info should be close to zero for constant features. + assert_allclose(scaler.var_, np.zeros(X.shape[1]), atol=1e-7) + + # Constant features should not be scaled (scale of 1.): + assert_allclose(scaler.scale_, np.ones(X.shape[1])) + + assert X_scaled is not X # make sure we make a copy + assert_allclose_dense_sparse(X_scaled, X) + + if isinstance(scaler, StandardScaler) and not add_sample_weight: + # Also check consistency with the standard scale function. + X_scaled_2 = scale(X, with_mean=scaler.with_mean) + assert X_scaled_2 is not X # make sure we did a copy + assert_allclose_dense_sparse(X_scaled_2, X) + + +@pytest.mark.parametrize("n_samples", [10, 100, 10_000]) +@pytest.mark.parametrize("average", [1e-10, 1, 1e10]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS + CSR_CONTAINERS) +def test_standard_scaler_near_constant_features( + n_samples, sparse_container, average, dtype +): + # Check that when the variance is too small (var << mean**2) the feature + # is considered constant and not scaled. 
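As a standalone illustration of the comment above (a sketch added for readability, not part of the patch): when a feature's variance is far below float64 precision relative to its squared mean, StandardScaler leaves its scale_ at 1.0 rather than rescaling.

import numpy as np
from sklearn.preprocessing import StandardScaler

# Feature with mean ~1e10 and spread ~1e-5: var << mean**2, so it falls
# under the precision bound and is treated as constant.
average, spread = 1e10, 1e-5
X = np.empty((100, 1))
X[:50, 0] = average + spread
X[50:, 0] = average - spread
scaler = StandardScaler(with_mean=False).fit(X)
print(scaler.scale_)  # expected: [1.] -- the near-constant feature is not rescaled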
+ + scale_min, scale_max = -30, 19 + scales = np.array([10**i for i in range(scale_min, scale_max + 1)], dtype=dtype) + + n_features = scales.shape[0] + X = np.empty((n_samples, n_features), dtype=dtype) + # Make a dataset of known var = scales**2 and mean = average + X[: n_samples // 2, :] = average + scales + X[n_samples // 2 :, :] = average - scales + X_array = X if sparse_container is None else sparse_container(X) + + scaler = StandardScaler(with_mean=False).fit(X_array) + + # StandardScaler uses float64 accumulators even if the data has a float32 + # dtype. + eps = np.finfo(np.float64).eps + + # if var < bound = N.eps.var + N².eps².mean², the feature is considered + # constant and the scale_ attribute is set to 1. + bounds = n_samples * eps * scales**2 + n_samples**2 * eps**2 * average**2 + within_bounds = scales**2 <= bounds + + # Check that scale_min is small enough to have some scales below the + # bound and therefore detected as constant: + assert np.any(within_bounds) + + # Check that such features are actually treated as constant by the scaler: + assert all(scaler.var_[within_bounds] <= bounds[within_bounds]) + assert_allclose(scaler.scale_[within_bounds], 1.0) + + # Depending the on the dtype of X, some features might not actually be + # representable as non constant for small scales (even if above the + # precision bound of the float64 variance estimate). Such feature should + # be correctly detected as constants with 0 variance by StandardScaler. + representable_diff = X[0, :] - X[-1, :] != 0 + assert_allclose(scaler.var_[np.logical_not(representable_diff)], 0) + assert_allclose(scaler.scale_[np.logical_not(representable_diff)], 1) + + # The other features are scaled and scale_ is equal to sqrt(var_) assuming + # that scales are large enough for average + scale and average - scale to + # be distinct in X (depending on X's dtype). + common_mask = np.logical_and(scales**2 > bounds, representable_diff) + assert_allclose(scaler.scale_[common_mask], np.sqrt(scaler.var_)[common_mask]) + + +def test_scale_1d(): + # 1-d inputs + X_list = [1.0, 3.0, 5.0, 0.0] + X_arr = np.array(X_list) + + for X in [X_list, X_arr]: + X_scaled = scale(X) + assert_array_almost_equal(X_scaled.mean(), 0.0) + assert_array_almost_equal(X_scaled.std(), 1.0) + assert_array_equal(scale(X, with_mean=False, with_std=False), X) + + +@skip_if_32bit +def test_standard_scaler_numerical_stability(): + # Test numerical stability of scaling + # np.log(1e-5) is taken because of its floating point representation + # was empirically found to cause numerical problems with np.mean & np.std. 
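A quick sketch (illustrative only) of the failure mode this test guards against: scaling a vector of identical values should give (nearly) all zeros, and with enough samples the near-zero standard deviation can trigger a UserWarning, depending on the numpy version.

import warnings
import numpy as np
from sklearn.preprocessing import scale

x = np.full(10, np.log(1e-5))
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    x_scaled = scale(x)                       # result should be ~0 everywhere
print(np.abs(x_scaled).max())                 # tiny value close to 0
print([str(w.message) for w in caught])       # may mention a std close to 0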
+ x = np.full(8, np.log(1e-5), dtype=np.float64) + # This does not raise a warning as the number of samples is too low + # to trigger the problem in recent numpy + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + scale(x) + assert_array_almost_equal(scale(x), np.zeros(8)) + + # with 2 more samples, the std computation run into numerical issues: + x = np.full(10, np.log(1e-5), dtype=np.float64) + warning_message = "standard deviation of the data is probably very close to 0" + with pytest.warns(UserWarning, match=warning_message): + x_scaled = scale(x) + assert_array_almost_equal(x_scaled, np.zeros(10)) + + x = np.full(10, 1e-100, dtype=np.float64) + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + x_small_scaled = scale(x) + assert_array_almost_equal(x_small_scaled, np.zeros(10)) + + # Large values can cause (often recoverable) numerical stability issues: + x_big = np.full(10, 1e100, dtype=np.float64) + warning_message = "Dataset may contain too large values" + with pytest.warns(UserWarning, match=warning_message): + x_big_scaled = scale(x_big) + assert_array_almost_equal(x_big_scaled, np.zeros(10)) + assert_array_almost_equal(x_big_scaled, x_small_scaled) + with pytest.warns(UserWarning, match=warning_message): + x_big_centered = scale(x_big, with_std=False) + assert_array_almost_equal(x_big_centered, np.zeros(10)) + assert_array_almost_equal(x_big_centered, x_small_scaled) + + +def test_scaler_2d_arrays(): + # Test scaling of 2d array along first axis + rng = np.random.RandomState(0) + n_features = 5 + n_samples = 4 + X = rng.randn(n_samples, n_features) + X[:, 0] = 0.0 # first feature is always of zero + + scaler = StandardScaler() + X_scaled = scaler.fit(X).transform(X, copy=True) + assert not np.any(np.isnan(X_scaled)) + assert scaler.n_samples_seen_ == n_samples + + assert_array_almost_equal(X_scaled.mean(axis=0), n_features * [0.0]) + assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0]) + # Check that X has been copied + assert X_scaled is not X + + # check inverse transform + X_scaled_back = scaler.inverse_transform(X_scaled) + assert X_scaled_back is not X + assert X_scaled_back is not X_scaled + assert_array_almost_equal(X_scaled_back, X) + + X_scaled = scale(X, axis=1, with_std=False) + assert not np.any(np.isnan(X_scaled)) + assert_array_almost_equal(X_scaled.mean(axis=1), n_samples * [0.0]) + X_scaled = scale(X, axis=1, with_std=True) + assert not np.any(np.isnan(X_scaled)) + assert_array_almost_equal(X_scaled.mean(axis=1), n_samples * [0.0]) + assert_array_almost_equal(X_scaled.std(axis=1), n_samples * [1.0]) + # Check that the data hasn't been modified + assert X_scaled is not X + + X_scaled = scaler.fit(X).transform(X, copy=False) + assert not np.any(np.isnan(X_scaled)) + assert_array_almost_equal(X_scaled.mean(axis=0), n_features * [0.0]) + assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0]) + # Check that X has not been copied + assert X_scaled is X + + X = rng.randn(4, 5) + X[:, 0] = 1.0 # first feature is a constant, non zero feature + scaler = StandardScaler() + X_scaled = scaler.fit(X).transform(X, copy=True) + assert not np.any(np.isnan(X_scaled)) + assert_array_almost_equal(X_scaled.mean(axis=0), n_features * [0.0]) + assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0]) + # Check that X has not been copied + assert X_scaled is not X + + +def test_scaler_float16_overflow(): + # Test if the scaler will not overflow on float16 numpy arrays + 
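For context, a small sketch of why float16 needs care here (illustrative, not part of the patch): the dtype saturates near 65504, so naive accumulation overflows while a wider accumulator does not.

import numpy as np

x = np.full(200_000, 7.5, dtype=np.float16)
# float16 arithmetic overflows to inf very quickly ...
print(np.float16(65504) * np.float16(2))  # inf (may emit an overflow RuntimeWarning)
# ... so summing many float16 values in float16 also overflows,
print(x.sum(dtype=np.float16))            # inf
# while accumulating in float64 stays finite.
print(x.mean(dtype=np.float64))           # 7.5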
rng = np.random.RandomState(0) + # float16 has a maximum of 65500.0. On the worst case 5 * 200000 is 100000 + # which is enough to overflow the data type + X = rng.uniform(5, 10, [200000, 1]).astype(np.float16) + + with np.errstate(over="raise"): + scaler = StandardScaler().fit(X) + X_scaled = scaler.transform(X) + + # Calculate the float64 equivalent to verify result + X_scaled_f64 = StandardScaler().fit_transform(X.astype(np.float64)) + + # Overflow calculations may cause -inf, inf, or nan. Since there is no nan + # input, all of the outputs should be finite. This may be redundant since a + # FloatingPointError exception will be thrown on overflow above. + assert np.all(np.isfinite(X_scaled)) + + # The normal distribution is very unlikely to go above 4. At 4.0-8.0 the + # float16 precision is 2^-8 which is around 0.004. Thus only 2 decimals are + # checked to account for precision differences. + assert_array_almost_equal(X_scaled, X_scaled_f64, decimal=2) + + +def test_handle_zeros_in_scale(): + s1 = np.array([0, 1e-16, 1, 2, 3]) + s2 = _handle_zeros_in_scale(s1, copy=True) + + assert_allclose(s1, np.array([0, 1e-16, 1, 2, 3])) + assert_allclose(s2, np.array([1, 1, 1, 2, 3])) + + +def test_minmax_scaler_partial_fit(): + # Test if partial_fit run over many batches of size 1 and 50 + # gives the same results as fit + X = X_2d + n = X.shape[0] + + for chunk_size in [1, 2, 50, n, n + 42]: + # Test mean at the end of the process + scaler_batch = MinMaxScaler().fit(X) + + scaler_incr = MinMaxScaler() + for batch in gen_batches(n_samples, chunk_size): + scaler_incr = scaler_incr.partial_fit(X[batch]) + + assert_array_almost_equal(scaler_batch.data_min_, scaler_incr.data_min_) + assert_array_almost_equal(scaler_batch.data_max_, scaler_incr.data_max_) + assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_ + assert_array_almost_equal(scaler_batch.data_range_, scaler_incr.data_range_) + assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_) + assert_array_almost_equal(scaler_batch.min_, scaler_incr.min_) + + # Test std after 1 step + batch0 = slice(0, chunk_size) + scaler_batch = MinMaxScaler().fit(X[batch0]) + scaler_incr = MinMaxScaler().partial_fit(X[batch0]) + + assert_array_almost_equal(scaler_batch.data_min_, scaler_incr.data_min_) + assert_array_almost_equal(scaler_batch.data_max_, scaler_incr.data_max_) + assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_ + assert_array_almost_equal(scaler_batch.data_range_, scaler_incr.data_range_) + assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_) + assert_array_almost_equal(scaler_batch.min_, scaler_incr.min_) + + # Test std until the end of partial fits, and + scaler_batch = MinMaxScaler().fit(X) + scaler_incr = MinMaxScaler() # Clean estimator + for i, batch in enumerate(gen_batches(n_samples, chunk_size)): + scaler_incr = scaler_incr.partial_fit(X[batch]) + assert_correct_incr( + i, + batch_start=batch.start, + batch_stop=batch.stop, + n=n, + chunk_size=chunk_size, + n_samples_seen=scaler_incr.n_samples_seen_, + ) + + +def test_standard_scaler_partial_fit(): + # Test if partial_fit run over many batches of size 1 and 50 + # gives the same results as fit + X = X_2d + n = X.shape[0] + + for chunk_size in [1, 2, 50, n, n + 42]: + # Test mean at the end of the process + scaler_batch = StandardScaler(with_std=False).fit(X) + + scaler_incr = StandardScaler(with_std=False) + for batch in gen_batches(n_samples, chunk_size): + scaler_incr = scaler_incr.partial_fit(X[batch]) + 
assert_array_almost_equal(scaler_batch.mean_, scaler_incr.mean_) + assert scaler_batch.var_ == scaler_incr.var_ # Nones + assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_ + + # Test std after 1 step + batch0 = slice(0, chunk_size) + scaler_incr = StandardScaler().partial_fit(X[batch0]) + if chunk_size == 1: + assert_array_almost_equal( + np.zeros(n_features, dtype=np.float64), scaler_incr.var_ + ) + assert_array_almost_equal( + np.ones(n_features, dtype=np.float64), scaler_incr.scale_ + ) + else: + assert_array_almost_equal(np.var(X[batch0], axis=0), scaler_incr.var_) + assert_array_almost_equal( + np.std(X[batch0], axis=0), scaler_incr.scale_ + ) # no constants + + # Test std until the end of partial fits, and + scaler_batch = StandardScaler().fit(X) + scaler_incr = StandardScaler() # Clean estimator + for i, batch in enumerate(gen_batches(n_samples, chunk_size)): + scaler_incr = scaler_incr.partial_fit(X[batch]) + assert_correct_incr( + i, + batch_start=batch.start, + batch_stop=batch.stop, + n=n, + chunk_size=chunk_size, + n_samples_seen=scaler_incr.n_samples_seen_, + ) + + assert_array_almost_equal(scaler_batch.var_, scaler_incr.var_) + assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_ + + +@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS) +def test_standard_scaler_partial_fit_numerical_stability(sparse_container): + # Test if the incremental computation introduces significative errors + # for large datasets with values of large magniture + rng = np.random.RandomState(0) + n_features = 2 + n_samples = 100 + offsets = rng.uniform(-1e15, 1e15, size=n_features) + scales = rng.uniform(1e3, 1e6, size=n_features) + X = rng.randn(n_samples, n_features) * scales + offsets + + scaler_batch = StandardScaler().fit(X) + scaler_incr = StandardScaler() + for chunk in X: + scaler_incr = scaler_incr.partial_fit(chunk.reshape(1, n_features)) + + # Regardless of abs values, they must not be more diff 6 significant digits + tol = 10 ** (-6) + assert_allclose(scaler_incr.mean_, scaler_batch.mean_, rtol=tol) + assert_allclose(scaler_incr.var_, scaler_batch.var_, rtol=tol) + assert_allclose(scaler_incr.scale_, scaler_batch.scale_, rtol=tol) + # NOTE Be aware that for much larger offsets std is very unstable (last + # assert) while mean is OK. + + # Sparse input + size = (100, 3) + scale = 1e20 + X = sparse_container(rng.randint(0, 2, size).astype(np.float64) * scale) + + # with_mean=False is required with sparse input + scaler = StandardScaler(with_mean=False).fit(X) + scaler_incr = StandardScaler(with_mean=False) + + for chunk in X: + if chunk.ndim == 1: + # Sparse arrays can be 1D (in scipy 1.14 and later) while old + # sparse matrix instances are always 2D. 
+ chunk = chunk.reshape(1, -1) + scaler_incr = scaler_incr.partial_fit(chunk) + + # Regardless of magnitude, they must not differ more than of 6 digits + tol = 10 ** (-6) + assert scaler.mean_ is not None + assert_allclose(scaler_incr.var_, scaler.var_, rtol=tol) + assert_allclose(scaler_incr.scale_, scaler.scale_, rtol=tol) + + +@pytest.mark.parametrize("sample_weight", [True, None]) +@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS) +def test_partial_fit_sparse_input(sample_weight, sparse_container): + # Check that sparsity is not destroyed + X = sparse_container(np.array([[1.0], [0.0], [0.0], [5.0]])) + + if sample_weight: + sample_weight = rng.rand(X.shape[0]) + + null_transform = StandardScaler(with_mean=False, with_std=False, copy=True) + X_null = null_transform.partial_fit(X, sample_weight=sample_weight).transform(X) + assert_array_equal(X_null.toarray(), X.toarray()) + X_orig = null_transform.inverse_transform(X_null) + assert_array_equal(X_orig.toarray(), X_null.toarray()) + assert_array_equal(X_orig.toarray(), X.toarray()) + + +@pytest.mark.parametrize("sample_weight", [True, None]) +def test_standard_scaler_trasform_with_partial_fit(sample_weight): + # Check some postconditions after applying partial_fit and transform + X = X_2d[:100, :] + + if sample_weight: + sample_weight = rng.rand(X.shape[0]) + + scaler_incr = StandardScaler() + for i, batch in enumerate(gen_batches(X.shape[0], 1)): + X_sofar = X[: (i + 1), :] + chunks_copy = X_sofar.copy() + if sample_weight is None: + scaled_batch = StandardScaler().fit_transform(X_sofar) + scaler_incr = scaler_incr.partial_fit(X[batch]) + else: + scaled_batch = StandardScaler().fit_transform( + X_sofar, sample_weight=sample_weight[: i + 1] + ) + scaler_incr = scaler_incr.partial_fit( + X[batch], sample_weight=sample_weight[batch] + ) + scaled_incr = scaler_incr.transform(X_sofar) + + assert_array_almost_equal(scaled_batch, scaled_incr) + assert_array_almost_equal(X_sofar, chunks_copy) # No change + right_input = scaler_incr.inverse_transform(scaled_incr) + assert_array_almost_equal(X_sofar, right_input) + + zero = np.zeros(X.shape[1]) + epsilon = np.finfo(float).eps + assert_array_less(zero, scaler_incr.var_ + epsilon) # as less or equal + assert_array_less(zero, scaler_incr.scale_ + epsilon) + if sample_weight is None: + # (i+1) because the Scaler has been already fitted + assert (i + 1) == scaler_incr.n_samples_seen_ + else: + assert np.sum(sample_weight[: i + 1]) == pytest.approx( + scaler_incr.n_samples_seen_ + ) + + +def test_standard_check_array_of_inverse_transform(): + # Check if StandardScaler inverse_transform is + # converting the integer array to float + x = np.array( + [ + [1, 1, 1, 0, 1, 0], + [1, 1, 1, 0, 1, 0], + [0, 8, 0, 1, 0, 0], + [1, 4, 1, 1, 0, 0], + [0, 1, 0, 0, 1, 0], + [0, 4, 0, 1, 0, 1], + ], + dtype=np.int32, + ) + + scaler = StandardScaler() + scaler.fit(x) + + # The of inverse_transform should be converted + # to a float array. + # If not X *= self.scale_ will fail. 
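A minimal sketch of the comment above (illustrative only): numpy refuses an in-place multiply of an integer array by a float scale, which is why the input is first converted to a float array.

import numpy as np

x_int = np.array([1, 2, 3], dtype=np.int32)
try:
    x_int *= 0.5                      # in-place int *= float is rejected
except TypeError as exc:              # numpy raises a casting (UFuncType) error
    print(exc)
x_float = x_int.astype(np.float64)    # after casting to float it works
x_float *= 0.5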
+ scaler.inverse_transform(x) + + +@pytest.mark.parametrize( + "array_namespace, device, dtype_name", + yield_namespace_device_dtype_combinations(), + ids=_get_namespace_device_dtype_ids, +) +@pytest.mark.parametrize( + "check", + [check_array_api_input_and_values], + ids=_get_check_estimator_ids, +) +@pytest.mark.parametrize( + "estimator", + [ + MaxAbsScaler(), + MinMaxScaler(), + MinMaxScaler(clip=True), + KernelCenterer(), + Normalizer(norm="l1"), + Normalizer(norm="l2"), + Normalizer(norm="max"), + Binarizer(), + ], + ids=_get_check_estimator_ids, +) +def test_preprocessing_array_api_compliance( + estimator, check, array_namespace, device, dtype_name +): + name = estimator.__class__.__name__ + check(name, estimator, array_namespace, device=device, dtype_name=dtype_name) + + +def test_min_max_scaler_iris(): + X = iris.data + scaler = MinMaxScaler() + # default params + X_trans = scaler.fit_transform(X) + assert_array_almost_equal(X_trans.min(axis=0), 0) + assert_array_almost_equal(X_trans.max(axis=0), 1) + X_trans_inv = scaler.inverse_transform(X_trans) + assert_array_almost_equal(X, X_trans_inv) + + # not default params: min=1, max=2 + scaler = MinMaxScaler(feature_range=(1, 2)) + X_trans = scaler.fit_transform(X) + assert_array_almost_equal(X_trans.min(axis=0), 1) + assert_array_almost_equal(X_trans.max(axis=0), 2) + X_trans_inv = scaler.inverse_transform(X_trans) + assert_array_almost_equal(X, X_trans_inv) + + # min=-.5, max=.6 + scaler = MinMaxScaler(feature_range=(-0.5, 0.6)) + X_trans = scaler.fit_transform(X) + assert_array_almost_equal(X_trans.min(axis=0), -0.5) + assert_array_almost_equal(X_trans.max(axis=0), 0.6) + X_trans_inv = scaler.inverse_transform(X_trans) + assert_array_almost_equal(X, X_trans_inv) + + # raises on invalid range + scaler = MinMaxScaler(feature_range=(2, 1)) + with pytest.raises(ValueError): + scaler.fit(X) + + +def test_min_max_scaler_zero_variance_features(): + # Check min max scaler on toy data with zero variance features + X = [[0.0, 1.0, +0.5], [0.0, 1.0, -0.1], [0.0, 1.0, +1.1]] + + X_new = [[+0.0, 2.0, 0.5], [-1.0, 1.0, 0.0], [+0.0, 1.0, 1.5]] + + # default params + scaler = MinMaxScaler() + X_trans = scaler.fit_transform(X) + X_expected_0_1 = [[0.0, 0.0, 0.5], [0.0, 0.0, 0.0], [0.0, 0.0, 1.0]] + assert_array_almost_equal(X_trans, X_expected_0_1) + X_trans_inv = scaler.inverse_transform(X_trans) + assert_array_almost_equal(X, X_trans_inv) + + X_trans_new = scaler.transform(X_new) + X_expected_0_1_new = [[+0.0, 1.0, 0.500], [-1.0, 0.0, 0.083], [+0.0, 0.0, 1.333]] + assert_array_almost_equal(X_trans_new, X_expected_0_1_new, decimal=2) + + # not default params + scaler = MinMaxScaler(feature_range=(1, 2)) + X_trans = scaler.fit_transform(X) + X_expected_1_2 = [[1.0, 1.0, 1.5], [1.0, 1.0, 1.0], [1.0, 1.0, 2.0]] + assert_array_almost_equal(X_trans, X_expected_1_2) + + # function interface + X_trans = minmax_scale(X) + assert_array_almost_equal(X_trans, X_expected_0_1) + X_trans = minmax_scale(X, feature_range=(1, 2)) + assert_array_almost_equal(X_trans, X_expected_1_2) + + +def test_minmax_scale_axis1(): + X = iris.data + X_trans = minmax_scale(X, axis=1) + assert_array_almost_equal(np.min(X_trans, axis=1), 0) + assert_array_almost_equal(np.max(X_trans, axis=1), 1) + + +def test_min_max_scaler_1d(): + # Test scaling of dataset along single axis + for X in [X_1row, X_1col, X_list_1row, X_list_1row]: + scaler = MinMaxScaler(copy=True) + X_scaled = scaler.fit(X).transform(X) + + if isinstance(X, list): + X = np.array(X) # cast only after scaling done + + 
if _check_dim_1axis(X) == 1: + assert_array_almost_equal(X_scaled.min(axis=0), np.zeros(n_features)) + assert_array_almost_equal(X_scaled.max(axis=0), np.zeros(n_features)) + else: + assert_array_almost_equal(X_scaled.min(axis=0), 0.0) + assert_array_almost_equal(X_scaled.max(axis=0), 1.0) + assert scaler.n_samples_seen_ == X.shape[0] + + # check inverse transform + X_scaled_back = scaler.inverse_transform(X_scaled) + assert_array_almost_equal(X_scaled_back, X) + + # Constant feature + X = np.ones((5, 1)) + scaler = MinMaxScaler() + X_scaled = scaler.fit(X).transform(X) + assert X_scaled.min() >= 0.0 + assert X_scaled.max() <= 1.0 + assert scaler.n_samples_seen_ == X.shape[0] + + # Function interface + X_1d = X_1row.ravel() + min_ = X_1d.min() + max_ = X_1d.max() + assert_array_almost_equal( + (X_1d - min_) / (max_ - min_), minmax_scale(X_1d, copy=True) + ) + + +@pytest.mark.parametrize("sample_weight", [True, None]) +@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS) +def test_scaler_without_centering(sample_weight, sparse_container): + rng = np.random.RandomState(42) + X = rng.randn(4, 5) + X[:, 0] = 0.0 # first feature is always of zero + X_sparse = sparse_container(X) + + if sample_weight: + sample_weight = rng.rand(X.shape[0]) + + with pytest.raises(ValueError): + StandardScaler().fit(X_sparse) + + scaler = StandardScaler(with_mean=False).fit(X, sample_weight=sample_weight) + X_scaled = scaler.transform(X, copy=True) + assert not np.any(np.isnan(X_scaled)) + + scaler_sparse = StandardScaler(with_mean=False).fit( + X_sparse, sample_weight=sample_weight + ) + X_sparse_scaled = scaler_sparse.transform(X_sparse, copy=True) + assert not np.any(np.isnan(X_sparse_scaled.data)) + + assert_array_almost_equal(scaler.mean_, scaler_sparse.mean_) + assert_array_almost_equal(scaler.var_, scaler_sparse.var_) + assert_array_almost_equal(scaler.scale_, scaler_sparse.scale_) + assert_array_almost_equal(scaler.n_samples_seen_, scaler_sparse.n_samples_seen_) + + if sample_weight is None: + assert_array_almost_equal( + X_scaled.mean(axis=0), [0.0, -0.01, 2.24, -0.35, -0.78], 2 + ) + assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0]) + + X_sparse_scaled_mean, X_sparse_scaled_var = mean_variance_axis(X_sparse_scaled, 0) + assert_array_almost_equal(X_sparse_scaled_mean, X_scaled.mean(axis=0)) + assert_array_almost_equal(X_sparse_scaled_var, X_scaled.var(axis=0)) + + # Check that X has not been modified (copy) + assert X_scaled is not X + assert X_sparse_scaled is not X_sparse + + X_scaled_back = scaler.inverse_transform(X_scaled) + assert X_scaled_back is not X + assert X_scaled_back is not X_scaled + assert_array_almost_equal(X_scaled_back, X) + + X_sparse_scaled_back = scaler_sparse.inverse_transform(X_sparse_scaled) + assert X_sparse_scaled_back is not X_sparse + assert X_sparse_scaled_back is not X_sparse_scaled + assert_array_almost_equal(X_sparse_scaled_back.toarray(), X) + + if sparse_container in CSR_CONTAINERS: + null_transform = StandardScaler(with_mean=False, with_std=False, copy=True) + X_null = null_transform.fit_transform(X_sparse) + assert_array_equal(X_null.data, X_sparse.data) + X_orig = null_transform.inverse_transform(X_null) + assert_array_equal(X_orig.data, X_sparse.data) + + +@pytest.mark.parametrize("with_mean", [True, False]) +@pytest.mark.parametrize("with_std", [True, False]) +@pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS + CSR_CONTAINERS) +def test_scaler_n_samples_seen_with_nan(with_mean, with_std, 
sparse_container): + X = np.array( + [[0, 1, 3], [np.nan, 6, 10], [5, 4, np.nan], [8, 0, np.nan]], dtype=np.float64 + ) + if sparse_container is not None: + X = sparse_container(X) + + if sparse.issparse(X) and with_mean: + pytest.skip("'with_mean=True' cannot be used with sparse matrix.") + + transformer = StandardScaler(with_mean=with_mean, with_std=with_std) + transformer.fit(X) + + assert_array_equal(transformer.n_samples_seen_, np.array([3, 4, 2])) + + +def _check_identity_scalers_attributes(scaler_1, scaler_2): + assert scaler_1.mean_ is scaler_2.mean_ is None + assert scaler_1.var_ is scaler_2.var_ is None + assert scaler_1.scale_ is scaler_2.scale_ is None + assert scaler_1.n_samples_seen_ == scaler_2.n_samples_seen_ + + +@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS) +def test_scaler_return_identity(sparse_container): + # test that the scaler return identity when with_mean and with_std are + # False + X_dense = np.array([[0, 1, 3], [5, 6, 0], [8, 0, 10]], dtype=np.float64) + X_sparse = sparse_container(X_dense) + + transformer_dense = StandardScaler(with_mean=False, with_std=False) + X_trans_dense = transformer_dense.fit_transform(X_dense) + assert_allclose(X_trans_dense, X_dense) + + transformer_sparse = clone(transformer_dense) + X_trans_sparse = transformer_sparse.fit_transform(X_sparse) + assert_allclose_dense_sparse(X_trans_sparse, X_sparse) + + _check_identity_scalers_attributes(transformer_dense, transformer_sparse) + + transformer_dense.partial_fit(X_dense) + transformer_sparse.partial_fit(X_sparse) + _check_identity_scalers_attributes(transformer_dense, transformer_sparse) + + transformer_dense.fit(X_dense) + transformer_sparse.fit(X_sparse) + _check_identity_scalers_attributes(transformer_dense, transformer_sparse) + + +@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS) +def test_scaler_int(sparse_container): + # test that scaler converts integer input to floating + # for both sparse and dense matrices + rng = np.random.RandomState(42) + X = rng.randint(20, size=(4, 5)) + X[:, 0] = 0 # first feature is always of zero + X_sparse = sparse_container(X) + + with warnings.catch_warnings(record=True): + scaler = StandardScaler(with_mean=False).fit(X) + X_scaled = scaler.transform(X, copy=True) + assert not np.any(np.isnan(X_scaled)) + + with warnings.catch_warnings(record=True): + scaler_sparse = StandardScaler(with_mean=False).fit(X_sparse) + X_sparse_scaled = scaler_sparse.transform(X_sparse, copy=True) + assert not np.any(np.isnan(X_sparse_scaled.data)) + + assert_array_almost_equal(scaler.mean_, scaler_sparse.mean_) + assert_array_almost_equal(scaler.var_, scaler_sparse.var_) + assert_array_almost_equal(scaler.scale_, scaler_sparse.scale_) + + assert_array_almost_equal( + X_scaled.mean(axis=0), [0.0, 1.109, 1.856, 21.0, 1.559], 2 + ) + assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0]) + + X_sparse_scaled_mean, X_sparse_scaled_std = mean_variance_axis( + X_sparse_scaled.astype(float), 0 + ) + assert_array_almost_equal(X_sparse_scaled_mean, X_scaled.mean(axis=0)) + assert_array_almost_equal(X_sparse_scaled_std, X_scaled.std(axis=0)) + + # Check that X has not been modified (copy) + assert X_scaled is not X + assert X_sparse_scaled is not X_sparse + + X_scaled_back = scaler.inverse_transform(X_scaled) + assert X_scaled_back is not X + assert X_scaled_back is not X_scaled + assert_array_almost_equal(X_scaled_back, X) + + X_sparse_scaled_back = scaler_sparse.inverse_transform(X_sparse_scaled) + 
assert X_sparse_scaled_back is not X_sparse + assert X_sparse_scaled_back is not X_sparse_scaled + assert_array_almost_equal(X_sparse_scaled_back.toarray(), X) + + if sparse_container in CSR_CONTAINERS: + null_transform = StandardScaler(with_mean=False, with_std=False, copy=True) + with warnings.catch_warnings(record=True): + X_null = null_transform.fit_transform(X_sparse) + assert_array_equal(X_null.data, X_sparse.data) + X_orig = null_transform.inverse_transform(X_null) + assert_array_equal(X_orig.data, X_sparse.data) + + +@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS) +def test_scaler_without_copy(sparse_container): + # Check that StandardScaler.fit does not change input + rng = np.random.RandomState(42) + X = rng.randn(4, 5) + X[:, 0] = 0.0 # first feature is always of zero + X_sparse = sparse_container(X) + + X_copy = X.copy() + StandardScaler(copy=False).fit(X) + assert_array_equal(X, X_copy) + + X_sparse_copy = X_sparse.copy() + StandardScaler(with_mean=False, copy=False).fit(X_sparse) + assert_array_equal(X_sparse.toarray(), X_sparse_copy.toarray()) + + +@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS) +def test_scale_sparse_with_mean_raise_exception(sparse_container): + rng = np.random.RandomState(42) + X = rng.randn(4, 5) + X_sparse = sparse_container(X) + + # check scaling and fit with direct calls on sparse data + with pytest.raises(ValueError): + scale(X_sparse, with_mean=True) + with pytest.raises(ValueError): + StandardScaler(with_mean=True).fit(X_sparse) + + # check transform and inverse_transform after a fit on a dense array + scaler = StandardScaler(with_mean=True).fit(X) + with pytest.raises(ValueError): + scaler.transform(X_sparse) + + X_transformed_sparse = sparse_container(scaler.transform(X)) + with pytest.raises(ValueError): + scaler.inverse_transform(X_transformed_sparse) + + +def test_scale_input_finiteness_validation(): + # Check if non finite inputs raise ValueError + X = [[np.inf, 5, 6, 7, 8]] + with pytest.raises( + ValueError, match="Input contains infinity or a value too large" + ): + scale(X) + + +def test_robust_scaler_error_sparse(): + X_sparse = sparse.rand(1000, 10) + scaler = RobustScaler(with_centering=True) + err_msg = "Cannot center sparse matrices" + with pytest.raises(ValueError, match=err_msg): + scaler.fit(X_sparse) + + +@pytest.mark.parametrize("with_centering", [True, False]) +@pytest.mark.parametrize("with_scaling", [True, False]) +@pytest.mark.parametrize("X", [np.random.randn(10, 3), sparse.rand(10, 3, density=0.5)]) +def test_robust_scaler_attributes(X, with_centering, with_scaling): + # check consistent type of attributes + if with_centering and sparse.issparse(X): + pytest.skip("RobustScaler cannot center sparse matrix") + + scaler = RobustScaler(with_centering=with_centering, with_scaling=with_scaling) + scaler.fit(X) + + if with_centering: + assert isinstance(scaler.center_, np.ndarray) + else: + assert scaler.center_ is None + if with_scaling: + assert isinstance(scaler.scale_, np.ndarray) + else: + assert scaler.scale_ is None + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_robust_scaler_col_zero_sparse(csr_container): + # check that the scaler is working when there is not data materialized in a + # column of a sparse matrix + X = np.random.randn(10, 5) + X[:, 0] = 0 + X = csr_container(X) + + scaler = RobustScaler(with_centering=False) + scaler.fit(X) + assert scaler.scale_[0] == pytest.approx(1) + + X_trans = scaler.transform(X) + 
assert_allclose(X[:, [0]].toarray(), X_trans[:, [0]].toarray()) + + +def test_robust_scaler_2d_arrays(): + # Test robust scaling of 2d array along first axis + rng = np.random.RandomState(0) + X = rng.randn(4, 5) + X[:, 0] = 0.0 # first feature is always of zero + + scaler = RobustScaler() + X_scaled = scaler.fit(X).transform(X) + + assert_array_almost_equal(np.median(X_scaled, axis=0), 5 * [0.0]) + assert_array_almost_equal(X_scaled.std(axis=0)[0], 0) + + +@pytest.mark.parametrize("density", [0, 0.05, 0.1, 0.5, 1]) +@pytest.mark.parametrize("strictly_signed", ["positive", "negative", "zeros", None]) +def test_robust_scaler_equivalence_dense_sparse(density, strictly_signed): + # Check the equivalence of the fitting with dense and sparse matrices + X_sparse = sparse.rand(1000, 5, density=density).tocsc() + if strictly_signed == "positive": + X_sparse.data = np.abs(X_sparse.data) + elif strictly_signed == "negative": + X_sparse.data = -np.abs(X_sparse.data) + elif strictly_signed == "zeros": + X_sparse.data = np.zeros(X_sparse.data.shape, dtype=np.float64) + X_dense = X_sparse.toarray() + + scaler_sparse = RobustScaler(with_centering=False) + scaler_dense = RobustScaler(with_centering=False) + + scaler_sparse.fit(X_sparse) + scaler_dense.fit(X_dense) + + assert_allclose(scaler_sparse.scale_, scaler_dense.scale_) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_robust_scaler_transform_one_row_csr(csr_container): + # Check RobustScaler on transforming csr matrix with one row + rng = np.random.RandomState(0) + X = rng.randn(4, 5) + single_row = np.array([[0.1, 1.0, 2.0, 0.0, -1.0]]) + scaler = RobustScaler(with_centering=False) + scaler = scaler.fit(X) + row_trans = scaler.transform(csr_container(single_row)) + row_expected = single_row / scaler.scale_ + assert_array_almost_equal(row_trans.toarray(), row_expected) + row_scaled_back = scaler.inverse_transform(row_trans) + assert_array_almost_equal(single_row, row_scaled_back.toarray()) + + +def test_robust_scaler_iris(): + X = iris.data + scaler = RobustScaler() + X_trans = scaler.fit_transform(X) + assert_array_almost_equal(np.median(X_trans, axis=0), 0) + X_trans_inv = scaler.inverse_transform(X_trans) + assert_array_almost_equal(X, X_trans_inv) + q = np.percentile(X_trans, q=(25, 75), axis=0) + iqr = q[1] - q[0] + assert_array_almost_equal(iqr, 1) + + +def test_robust_scaler_iris_quantiles(): + X = iris.data + scaler = RobustScaler(quantile_range=(10, 90)) + X_trans = scaler.fit_transform(X) + assert_array_almost_equal(np.median(X_trans, axis=0), 0) + X_trans_inv = scaler.inverse_transform(X_trans) + assert_array_almost_equal(X, X_trans_inv) + q = np.percentile(X_trans, q=(10, 90), axis=0) + q_range = q[1] - q[0] + assert_array_almost_equal(q_range, 1) + + +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_quantile_transform_iris(csc_container): + X = iris.data + # uniform output distribution + transformer = QuantileTransformer(n_quantiles=30) + X_trans = transformer.fit_transform(X) + X_trans_inv = transformer.inverse_transform(X_trans) + assert_array_almost_equal(X, X_trans_inv) + # normal output distribution + transformer = QuantileTransformer(n_quantiles=30, output_distribution="normal") + X_trans = transformer.fit_transform(X) + X_trans_inv = transformer.inverse_transform(X_trans) + assert_array_almost_equal(X, X_trans_inv) + # make sure it is possible to take the inverse of a sparse matrix + # which contain negative value; this is the case in the iris dataset + X_sparse = csc_container(X) + 
X_sparse_tran = transformer.fit_transform(X_sparse) + X_sparse_tran_inv = transformer.inverse_transform(X_sparse_tran) + assert_array_almost_equal(X_sparse.toarray(), X_sparse_tran_inv.toarray()) + + +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_quantile_transform_check_error(csc_container): + X = np.transpose( + [ + [0, 25, 50, 0, 0, 0, 75, 0, 0, 100], + [2, 4, 0, 0, 6, 8, 0, 10, 0, 0], + [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1], + ] + ) + X = csc_container(X) + X_neg = np.transpose( + [ + [0, 25, 50, 0, 0, 0, 75, 0, 0, 100], + [-2, 4, 0, 0, 6, 8, 0, 10, 0, 0], + [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1], + ] + ) + X_neg = csc_container(X_neg) + + err_msg = ( + "The number of quantiles cannot be greater than " + "the number of samples used. Got 1000 quantiles " + "and 10 samples." + ) + with pytest.raises(ValueError, match=err_msg): + QuantileTransformer(subsample=10).fit(X) + + transformer = QuantileTransformer(n_quantiles=10) + err_msg = "QuantileTransformer only accepts non-negative sparse matrices." + with pytest.raises(ValueError, match=err_msg): + transformer.fit(X_neg) + transformer.fit(X) + err_msg = "QuantileTransformer only accepts non-negative sparse matrices." + with pytest.raises(ValueError, match=err_msg): + transformer.transform(X_neg) + + X_bad_feat = np.transpose( + [[0, 25, 50, 0, 0, 0, 75, 0, 0, 100], [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]] + ) + err_msg = ( + "X has 2 features, but QuantileTransformer is expecting 3 features as input." + ) + with pytest.raises(ValueError, match=err_msg): + transformer.inverse_transform(X_bad_feat) + + transformer = QuantileTransformer(n_quantiles=10).fit(X) + # check that an error is raised if input is scalar + with pytest.raises(ValueError, match="Expected 2D array, got scalar array instead"): + transformer.transform(10) + # check that a warning is raised is n_quantiles > n_samples + transformer = QuantileTransformer(n_quantiles=100) + warn_msg = "n_quantiles is set to n_samples" + with pytest.warns(UserWarning, match=warn_msg) as record: + transformer.fit(X) + assert len(record) == 1 + assert transformer.n_quantiles_ == X.shape[0] + + +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_quantile_transform_sparse_ignore_zeros(csc_container): + X = np.array([[0, 1], [0, 0], [0, 2], [0, 2], [0, 1]]) + X_sparse = csc_container(X) + transformer = QuantileTransformer(ignore_implicit_zeros=True, n_quantiles=5) + + # dense case -> warning raise + warning_message = ( + "'ignore_implicit_zeros' takes effect" + " only with sparse matrix. This parameter has no" + " effect." 
+ ) + with pytest.warns(UserWarning, match=warning_message): + transformer.fit(X) + + X_expected = np.array([[0, 0], [0, 0], [0, 1], [0, 1], [0, 0]]) + X_trans = transformer.fit_transform(X_sparse) + assert_almost_equal(X_expected, X_trans.toarray()) + + # consider the case where sparse entries are missing values and user-given + # zeros are to be considered + X_data = np.array([0, 0, 1, 0, 2, 2, 1, 0, 1, 2, 0]) + X_col = np.array([0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]) + X_row = np.array([0, 4, 0, 1, 2, 3, 4, 5, 6, 7, 8]) + X_sparse = csc_container((X_data, (X_row, X_col))) + X_trans = transformer.fit_transform(X_sparse) + X_expected = np.array( + [ + [0.0, 0.5], + [0.0, 0.0], + [0.0, 1.0], + [0.0, 1.0], + [0.0, 0.5], + [0.0, 0.0], + [0.0, 0.5], + [0.0, 1.0], + [0.0, 0.0], + ] + ) + assert_almost_equal(X_expected, X_trans.toarray()) + + transformer = QuantileTransformer(ignore_implicit_zeros=True, n_quantiles=5) + X_data = np.array([-1, -1, 1, 0, 0, 0, 1, -1, 1]) + X_col = np.array([0, 0, 1, 1, 1, 1, 1, 1, 1]) + X_row = np.array([0, 4, 0, 1, 2, 3, 4, 5, 6]) + X_sparse = csc_container((X_data, (X_row, X_col))) + X_trans = transformer.fit_transform(X_sparse) + X_expected = np.array( + [[0, 1], [0, 0.375], [0, 0.375], [0, 0.375], [0, 1], [0, 0], [0, 1]] + ) + assert_almost_equal(X_expected, X_trans.toarray()) + assert_almost_equal( + X_sparse.toarray(), transformer.inverse_transform(X_trans).toarray() + ) + + # check in conjunction with subsampling + transformer = QuantileTransformer( + ignore_implicit_zeros=True, n_quantiles=5, subsample=8, random_state=0 + ) + X_trans = transformer.fit_transform(X_sparse) + assert_almost_equal(X_expected, X_trans.toarray()) + assert_almost_equal( + X_sparse.toarray(), transformer.inverse_transform(X_trans).toarray() + ) + + +def test_quantile_transform_dense_toy(): + X = np.array( + [[0, 2, 2.6], [25, 4, 4.1], [50, 6, 2.3], [75, 8, 9.5], [100, 10, 0.1]] + ) + + transformer = QuantileTransformer(n_quantiles=5) + transformer.fit(X) + + # using a uniform output, each entry of X should be map between 0 and 1 + # and equally spaced + X_trans = transformer.fit_transform(X) + X_expected = np.tile(np.linspace(0, 1, num=5), (3, 1)).T + assert_almost_equal(np.sort(X_trans, axis=0), X_expected) + + X_test = np.array( + [ + [-1, 1, 0], + [101, 11, 10], + ] + ) + X_expected = np.array( + [ + [0, 0, 0], + [1, 1, 1], + ] + ) + assert_array_almost_equal(transformer.transform(X_test), X_expected) + + X_trans_inv = transformer.inverse_transform(X_trans) + assert_array_almost_equal(X, X_trans_inv) + + +def test_quantile_transform_subsampling(): + # Test that subsampling the input yield to a consistent results We check + # that the computed quantiles are almost mapped to a [0, 1] vector where + # values are equally spaced. The infinite norm is checked to be smaller + # than a given threshold. This is repeated 5 times. 
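For reference, a condensed sketch of the check described above (smaller sizes than the test, so the exact error magnitude will differ): fit on a subsample of uniform data and measure the infinity norm between the learned quantiles and an evenly spaced grid on [0, 1].

import numpy as np
from sklearn.preprocessing import QuantileTransformer

rng = np.random.RandomState(0)
X = rng.random_sample((100_000, 1))          # uniform data on [0, 1]
qt = QuantileTransformer(
    n_quantiles=1000, subsample=10_000, random_state=0
).fit(X)
diff = np.linspace(0, 1, 1000) - qt.quantiles_.ravel()
print(np.max(np.abs(diff)))                  # small, roughly 1e-2 or below here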
+ + # dense support + n_samples = 1000000 + n_quantiles = 1000 + X = np.sort(np.random.sample((n_samples, 1)), axis=0) + ROUND = 5 + inf_norm_arr = [] + for random_state in range(ROUND): + transformer = QuantileTransformer( + random_state=random_state, + n_quantiles=n_quantiles, + subsample=n_samples // 10, + ) + transformer.fit(X) + diff = np.linspace(0, 1, n_quantiles) - np.ravel(transformer.quantiles_) + inf_norm = np.max(np.abs(diff)) + assert inf_norm < 1e-2 + inf_norm_arr.append(inf_norm) + # each random subsampling yield a unique approximation to the expected + # linspace CDF + assert len(np.unique(inf_norm_arr)) == len(inf_norm_arr) + + # sparse support + + X = sparse.rand(n_samples, 1, density=0.99, format="csc", random_state=0) + inf_norm_arr = [] + for random_state in range(ROUND): + transformer = QuantileTransformer( + random_state=random_state, + n_quantiles=n_quantiles, + subsample=n_samples // 10, + ) + transformer.fit(X) + diff = np.linspace(0, 1, n_quantiles) - np.ravel(transformer.quantiles_) + inf_norm = np.max(np.abs(diff)) + assert inf_norm < 1e-1 + inf_norm_arr.append(inf_norm) + # each random subsampling yield a unique approximation to the expected + # linspace CDF + assert len(np.unique(inf_norm_arr)) == len(inf_norm_arr) + + +def test_quantile_transform_subsampling_disabled(): + """Check the behaviour of `QuantileTransformer` when `subsample=None`.""" + X = np.random.RandomState(0).normal(size=(200, 1)) + + n_quantiles = 5 + transformer = QuantileTransformer(n_quantiles=n_quantiles, subsample=None).fit(X) + + expected_references = np.linspace(0, 1, n_quantiles) + assert_allclose(transformer.references_, expected_references) + expected_quantiles = np.quantile(X.ravel(), expected_references) + assert_allclose(transformer.quantiles_.ravel(), expected_quantiles) + + +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_quantile_transform_sparse_toy(csc_container): + X = np.array( + [ + [0.0, 2.0, 0.0], + [25.0, 4.0, 0.0], + [50.0, 0.0, 2.6], + [0.0, 0.0, 4.1], + [0.0, 6.0, 0.0], + [0.0, 8.0, 0.0], + [75.0, 0.0, 2.3], + [0.0, 10.0, 0.0], + [0.0, 0.0, 9.5], + [100.0, 0.0, 0.1], + ] + ) + + X = csc_container(X) + + transformer = QuantileTransformer(n_quantiles=10) + transformer.fit(X) + + X_trans = transformer.fit_transform(X) + assert_array_almost_equal(np.min(X_trans.toarray(), axis=0), 0.0) + assert_array_almost_equal(np.max(X_trans.toarray(), axis=0), 1.0) + + X_trans_inv = transformer.inverse_transform(X_trans) + assert_array_almost_equal(X.toarray(), X_trans_inv.toarray()) + + transformer_dense = QuantileTransformer(n_quantiles=10).fit(X.toarray()) + + X_trans = transformer_dense.transform(X) + assert_array_almost_equal(np.min(X_trans.toarray(), axis=0), 0.0) + assert_array_almost_equal(np.max(X_trans.toarray(), axis=0), 1.0) + + X_trans_inv = transformer_dense.inverse_transform(X_trans) + assert_array_almost_equal(X.toarray(), X_trans_inv.toarray()) + + +def test_quantile_transform_axis1(): + X = np.array([[0, 25, 50, 75, 100], [2, 4, 6, 8, 10], [2.6, 4.1, 2.3, 9.5, 0.1]]) + + X_trans_a0 = quantile_transform(X.T, axis=0, n_quantiles=5) + X_trans_a1 = quantile_transform(X, axis=1, n_quantiles=5) + assert_array_almost_equal(X_trans_a0, X_trans_a1.T) + + +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_quantile_transform_bounds(csc_container): + # Lower and upper bounds are manually mapped. We checked that in the case + # of a constant feature and binary feature, the bounds are properly mapped. 
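A short sketch of the boundary behaviour checked below (illustrative only): values outside the range seen during fit are clipped, so they transform exactly like the training minimum and maximum.

import numpy as np
from sklearn.preprocessing import QuantileTransformer

rng = np.random.RandomState(0)
X = rng.random_sample((1000, 1))
qt = QuantileTransformer(n_quantiles=100).fit(X)
assert qt.transform([[-10.0]]) == qt.transform([[X.min()]])
assert qt.transform([[10.0]]) == qt.transform([[X.max()]])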
+ X_dense = np.array([[0, 0], [0, 0], [1, 0]]) + X_sparse = csc_container(X_dense) + + # check sparse and dense are consistent + X_trans = QuantileTransformer(n_quantiles=3, random_state=0).fit_transform(X_dense) + assert_array_almost_equal(X_trans, X_dense) + X_trans_sp = QuantileTransformer(n_quantiles=3, random_state=0).fit_transform( + X_sparse + ) + assert_array_almost_equal(X_trans_sp.toarray(), X_dense) + assert_array_almost_equal(X_trans, X_trans_sp.toarray()) + + # check the consistency of the bounds by learning on 1 matrix + # and transforming another + X = np.array([[0, 1], [0, 0.5], [1, 0]]) + X1 = np.array([[0, 0.1], [0, 0.5], [1, 0.1]]) + transformer = QuantileTransformer(n_quantiles=3).fit(X) + X_trans = transformer.transform(X1) + assert_array_almost_equal(X_trans, X1) + + # check that values outside of the range learned will be mapped properly. + X = np.random.random((1000, 1)) + transformer = QuantileTransformer() + transformer.fit(X) + assert transformer.transform([[-10]]) == transformer.transform([[np.min(X)]]) + assert transformer.transform([[10]]) == transformer.transform([[np.max(X)]]) + assert transformer.inverse_transform([[-10]]) == transformer.inverse_transform( + [[np.min(transformer.references_)]] + ) + assert transformer.inverse_transform([[10]]) == transformer.inverse_transform( + [[np.max(transformer.references_)]] + ) + + +def test_quantile_transform_and_inverse(): + X_1 = iris.data + X_2 = np.array([[0.0], [BOUNDS_THRESHOLD / 10], [1.5], [2], [3], [3], [4]]) + for X in [X_1, X_2]: + transformer = QuantileTransformer(n_quantiles=1000, random_state=0) + X_trans = transformer.fit_transform(X) + X_trans_inv = transformer.inverse_transform(X_trans) + assert_array_almost_equal(X, X_trans_inv, decimal=9) + + +def test_quantile_transform_nan(): + X = np.array([[np.nan, 0, 0, 1], [np.nan, np.nan, 0, 0.5], [np.nan, 1, 1, 0]]) + + transformer = QuantileTransformer(n_quantiles=10, random_state=42) + transformer.fit_transform(X) + + # check that the quantile of the first column is all NaN + assert np.isnan(transformer.quantiles_[:, 0]).all() + # all other column should not contain NaN + assert not np.isnan(transformer.quantiles_[:, 1:]).any() + + +@pytest.mark.parametrize("array_type", ["array", "sparse"]) +def test_quantile_transformer_sorted_quantiles(array_type): + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/15733 + # Taken from upstream bug report: + # https://github.com/numpy/numpy/issues/14685 + X = np.array([0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 1, 1, 9, 9, 9, 8, 8, 7] * 10) + X = 0.1 * X.reshape(-1, 1) + X = _convert_container(X, array_type) + + n_quantiles = 100 + qt = QuantileTransformer(n_quantiles=n_quantiles).fit(X) + + # Check that the estimated quantile thresholds are monotically + # increasing: + quantiles = qt.quantiles_[:, 0] + assert len(quantiles) == 100 + assert all(np.diff(quantiles) >= 0) + + +def test_robust_scaler_invalid_range(): + for range_ in [ + (-1, 90), + (-2, -3), + (10, 101), + (100.5, 101), + (90, 50), + ]: + scaler = RobustScaler(quantile_range=range_) + + with pytest.raises(ValueError, match=r"Invalid quantile range: \("): + scaler.fit(iris.data) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_scale_function_without_centering(csr_container): + rng = np.random.RandomState(42) + X = rng.randn(4, 5) + X[:, 0] = 0.0 # first feature is always of zero + X_csr = csr_container(X) + + X_scaled = scale(X, with_mean=False) + assert not np.any(np.isnan(X_scaled)) + + X_csr_scaled = 
scale(X_csr, with_mean=False) + assert not np.any(np.isnan(X_csr_scaled.data)) + + # test csc has same outcome + X_csc_scaled = scale(X_csr.tocsc(), with_mean=False) + assert_array_almost_equal(X_scaled, X_csc_scaled.toarray()) + + # raises value error on axis != 0 + with pytest.raises(ValueError): + scale(X_csr, with_mean=False, axis=1) + + assert_array_almost_equal( + X_scaled.mean(axis=0), [0.0, -0.01, 2.24, -0.35, -0.78], 2 + ) + assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0]) + # Check that X has not been copied + assert X_scaled is not X + + X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis(X_csr_scaled, 0) + assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0)) + assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0)) + + # null scale + X_csr_scaled = scale(X_csr, with_mean=False, with_std=False, copy=True) + assert_array_almost_equal(X_csr.toarray(), X_csr_scaled.toarray()) + + +def test_robust_scale_axis1(): + X = iris.data + X_trans = robust_scale(X, axis=1) + assert_array_almost_equal(np.median(X_trans, axis=1), 0) + q = np.percentile(X_trans, q=(25, 75), axis=1) + iqr = q[1] - q[0] + assert_array_almost_equal(iqr, 1) + + +def test_robust_scale_1d_array(): + X = iris.data[:, 1] + X_trans = robust_scale(X) + assert_array_almost_equal(np.median(X_trans), 0) + q = np.percentile(X_trans, q=(25, 75)) + iqr = q[1] - q[0] + assert_array_almost_equal(iqr, 1) + + +def test_robust_scaler_zero_variance_features(): + # Check RobustScaler on toy data with zero variance features + X = [[0.0, 1.0, +0.5], [0.0, 1.0, -0.1], [0.0, 1.0, +1.1]] + + scaler = RobustScaler() + X_trans = scaler.fit_transform(X) + + # NOTE: for such a small sample size, what we expect in the third column + # depends HEAVILY on the method used to calculate quantiles. The values + # here were calculated to fit the quantiles produces by np.percentile + # using numpy 1.9 Calculating quantiles with + # scipy.stats.mstats.scoreatquantile or scipy.stats.mstats.mquantiles + # would yield very different results! 
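The expected third-column values can be reproduced by hand with np.percentile's default linear-interpolation rule (a small sketch for reference):

import numpy as np

col = np.array([0.5, -0.1, 1.1])              # third column of X above
center = np.median(col)                       # 0.5
q25, q75 = np.percentile(col, [25, 75])       # 0.2 and 0.8 with linear interpolation
print((col - center) / (q75 - q25))           # [ 0. -1.  1.]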
+ X_expected = [[0.0, 0.0, +0.0], [0.0, 0.0, -1.0], [0.0, 0.0, +1.0]] + assert_array_almost_equal(X_trans, X_expected) + X_trans_inv = scaler.inverse_transform(X_trans) + assert_array_almost_equal(X, X_trans_inv) + + # make sure new data gets transformed correctly + X_new = [[+0.0, 2.0, 0.5], [-1.0, 1.0, 0.0], [+0.0, 1.0, 1.5]] + X_trans_new = scaler.transform(X_new) + X_expected_new = [[+0.0, 1.0, +0.0], [-1.0, 0.0, -0.83333], [+0.0, 0.0, +1.66667]] + assert_array_almost_equal(X_trans_new, X_expected_new, decimal=3) + + +def test_robust_scaler_unit_variance(): + # Check RobustScaler with unit_variance=True on standard normal data with + # outliers + rng = np.random.RandomState(42) + X = rng.randn(1000000, 1) + X_with_outliers = np.vstack([X, np.ones((100, 1)) * 100, np.ones((100, 1)) * -100]) + + quantile_range = (1, 99) + robust_scaler = RobustScaler(quantile_range=quantile_range, unit_variance=True).fit( + X_with_outliers + ) + X_trans = robust_scaler.transform(X) + + assert robust_scaler.center_ == pytest.approx(0, abs=1e-3) + assert robust_scaler.scale_ == pytest.approx(1, abs=1e-2) + assert X_trans.std() == pytest.approx(1, abs=1e-2) + + +@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS) +def test_maxabs_scaler_zero_variance_features(sparse_container): + # Check MaxAbsScaler on toy data with zero variance features + X = [[0.0, 1.0, +0.5], [0.0, 1.0, -0.3], [0.0, 1.0, +1.5], [0.0, 0.0, +0.0]] + + scaler = MaxAbsScaler() + X_trans = scaler.fit_transform(X) + X_expected = [ + [0.0, 1.0, 1.0 / 3.0], + [0.0, 1.0, -0.2], + [0.0, 1.0, 1.0], + [0.0, 0.0, 0.0], + ] + assert_array_almost_equal(X_trans, X_expected) + X_trans_inv = scaler.inverse_transform(X_trans) + assert_array_almost_equal(X, X_trans_inv) + + # make sure new data gets transformed correctly + X_new = [[+0.0, 2.0, 0.5], [-1.0, 1.0, 0.0], [+0.0, 1.0, 1.5]] + X_trans_new = scaler.transform(X_new) + X_expected_new = [[+0.0, 2.0, 1.0 / 3.0], [-1.0, 1.0, 0.0], [+0.0, 1.0, 1.0]] + + assert_array_almost_equal(X_trans_new, X_expected_new, decimal=2) + + # function interface + X_trans = maxabs_scale(X) + assert_array_almost_equal(X_trans, X_expected) + + # sparse data + X_sparse = sparse_container(X) + X_trans_sparse = scaler.fit_transform(X_sparse) + X_expected = [ + [0.0, 1.0, 1.0 / 3.0], + [0.0, 1.0, -0.2], + [0.0, 1.0, 1.0], + [0.0, 0.0, 0.0], + ] + assert_array_almost_equal(X_trans_sparse.toarray(), X_expected) + X_trans_sparse_inv = scaler.inverse_transform(X_trans_sparse) + assert_array_almost_equal(X, X_trans_sparse_inv.toarray()) + + +def test_maxabs_scaler_large_negative_value(): + # Check MaxAbsScaler on toy data with a large negative value + X = [ + [0.0, 1.0, +0.5, -1.0], + [0.0, 1.0, -0.3, -0.5], + [0.0, 1.0, -100.0, 0.0], + [0.0, 0.0, +0.0, -2.0], + ] + + scaler = MaxAbsScaler() + X_trans = scaler.fit_transform(X) + X_expected = [ + [0.0, 1.0, 0.005, -0.5], + [0.0, 1.0, -0.003, -0.25], + [0.0, 1.0, -1.0, 0.0], + [0.0, 0.0, 0.0, -1.0], + ] + assert_array_almost_equal(X_trans, X_expected) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_maxabs_scaler_transform_one_row_csr(csr_container): + # Check MaxAbsScaler on transforming csr matrix with one row + X = csr_container([[0.5, 1.0, 1.0]]) + scaler = MaxAbsScaler() + scaler = scaler.fit(X) + X_trans = scaler.transform(X) + X_expected = csr_container([[1.0, 1.0, 1.0]]) + assert_array_almost_equal(X_trans.toarray(), X_expected.toarray()) + X_scaled_back = scaler.inverse_transform(X_trans) + 
assert_array_almost_equal(X.toarray(), X_scaled_back.toarray()) + + +def test_maxabs_scaler_1d(): + # Test scaling of dataset along single axis + for X in [X_1row, X_1col, X_list_1row, X_list_1row]: + scaler = MaxAbsScaler(copy=True) + X_scaled = scaler.fit(X).transform(X) + + if isinstance(X, list): + X = np.array(X) # cast only after scaling done + + if _check_dim_1axis(X) == 1: + assert_array_almost_equal(np.abs(X_scaled.max(axis=0)), np.ones(n_features)) + else: + assert_array_almost_equal(np.abs(X_scaled.max(axis=0)), 1.0) + assert scaler.n_samples_seen_ == X.shape[0] + + # check inverse transform + X_scaled_back = scaler.inverse_transform(X_scaled) + assert_array_almost_equal(X_scaled_back, X) + + # Constant feature + X = np.ones((5, 1)) + scaler = MaxAbsScaler() + X_scaled = scaler.fit(X).transform(X) + assert_array_almost_equal(np.abs(X_scaled.max(axis=0)), 1.0) + assert scaler.n_samples_seen_ == X.shape[0] + + # function interface + X_1d = X_1row.ravel() + max_abs = np.abs(X_1d).max() + assert_array_almost_equal(X_1d / max_abs, maxabs_scale(X_1d, copy=True)) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_maxabs_scaler_partial_fit(csr_container): + # Test if partial_fit run over many batches of size 1 and 50 + # gives the same results as fit + X = X_2d[:100, :] + n = X.shape[0] + + for chunk_size in [1, 2, 50, n, n + 42]: + # Test mean at the end of the process + scaler_batch = MaxAbsScaler().fit(X) + + scaler_incr = MaxAbsScaler() + scaler_incr_csr = MaxAbsScaler() + scaler_incr_csc = MaxAbsScaler() + for batch in gen_batches(n, chunk_size): + scaler_incr = scaler_incr.partial_fit(X[batch]) + X_csr = csr_container(X[batch]) + scaler_incr_csr = scaler_incr_csr.partial_fit(X_csr) + X_csc = csr_container(X[batch]) + scaler_incr_csc = scaler_incr_csc.partial_fit(X_csc) + + assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr.max_abs_) + assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr_csr.max_abs_) + assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr_csc.max_abs_) + assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_ + assert scaler_batch.n_samples_seen_ == scaler_incr_csr.n_samples_seen_ + assert scaler_batch.n_samples_seen_ == scaler_incr_csc.n_samples_seen_ + assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_) + assert_array_almost_equal(scaler_batch.scale_, scaler_incr_csr.scale_) + assert_array_almost_equal(scaler_batch.scale_, scaler_incr_csc.scale_) + assert_array_almost_equal(scaler_batch.transform(X), scaler_incr.transform(X)) + + # Test std after 1 step + batch0 = slice(0, chunk_size) + scaler_batch = MaxAbsScaler().fit(X[batch0]) + scaler_incr = MaxAbsScaler().partial_fit(X[batch0]) + + assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr.max_abs_) + assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_ + assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_) + assert_array_almost_equal(scaler_batch.transform(X), scaler_incr.transform(X)) + + # Test std until the end of partial fits, and + scaler_batch = MaxAbsScaler().fit(X) + scaler_incr = MaxAbsScaler() # Clean estimator + for i, batch in enumerate(gen_batches(n, chunk_size)): + scaler_incr = scaler_incr.partial_fit(X[batch]) + assert_correct_incr( + i, + batch_start=batch.start, + batch_stop=batch.stop, + n=n, + chunk_size=chunk_size, + n_samples_seen=scaler_incr.n_samples_seen_, + ) + + +def check_normalizer(norm, X_norm): + """ + Convenient checking function for `test_normalizer_l1_l2_max` and + 
`test_normalizer_l1_l2_max_non_csr` + """ + if norm == "l1": + row_sums = np.abs(X_norm).sum(axis=1) + for i in range(3): + assert_almost_equal(row_sums[i], 1.0) + assert_almost_equal(row_sums[3], 0.0) + elif norm == "l2": + for i in range(3): + assert_almost_equal(la.norm(X_norm[i]), 1.0) + assert_almost_equal(la.norm(X_norm[3]), 0.0) + elif norm == "max": + row_maxs = abs(X_norm).max(axis=1) + for i in range(3): + assert_almost_equal(row_maxs[i], 1.0) + assert_almost_equal(row_maxs[3], 0.0) + + +@pytest.mark.parametrize("norm", ["l1", "l2", "max"]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_normalizer_l1_l2_max(norm, csr_container): + rng = np.random.RandomState(0) + X_dense = rng.randn(4, 5) + X_sparse_unpruned = csr_container(X_dense) + + # set the row number 3 to zero + X_dense[3, :] = 0.0 + + # set the row number 3 to zero without pruning (can happen in real life) + indptr_3 = X_sparse_unpruned.indptr[3] + indptr_4 = X_sparse_unpruned.indptr[4] + X_sparse_unpruned.data[indptr_3:indptr_4] = 0.0 + + # build the pruned variant using the regular constructor + X_sparse_pruned = csr_container(X_dense) + + # check inputs that support the no-copy optim + for X in (X_dense, X_sparse_pruned, X_sparse_unpruned): + normalizer = Normalizer(norm=norm, copy=True) + X_norm1 = normalizer.transform(X) + assert X_norm1 is not X + X_norm1 = toarray(X_norm1) + + normalizer = Normalizer(norm=norm, copy=False) + X_norm2 = normalizer.transform(X) + assert X_norm2 is X + X_norm2 = toarray(X_norm2) + + for X_norm in (X_norm1, X_norm2): + check_normalizer(norm, X_norm) + + +@pytest.mark.parametrize("norm", ["l1", "l2", "max"]) +@pytest.mark.parametrize( + "sparse_container", COO_CONTAINERS + CSC_CONTAINERS + LIL_CONTAINERS +) +def test_normalizer_l1_l2_max_non_csr(norm, sparse_container): + rng = np.random.RandomState(0) + X_dense = rng.randn(4, 5) + + # set the row number 3 to zero + X_dense[3, :] = 0.0 + + X = sparse_container(X_dense) + X_norm = Normalizer(norm=norm, copy=False).transform(X) + + assert X_norm is not X + assert sparse.issparse(X_norm) and X_norm.format == "csr" + + X_norm = toarray(X_norm) + check_normalizer(norm, X_norm) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_normalizer_max_sign(csr_container): + # check that we normalize by a positive number even for negative data + rng = np.random.RandomState(0) + X_dense = rng.randn(4, 5) + # set the row number 3 to zero + X_dense[3, :] = 0.0 + # check for mixed data where the value with + # largest magnitude is negative + X_dense[2, abs(X_dense[2, :]).argmax()] *= -1 + X_all_neg = -np.abs(X_dense) + X_all_neg_sparse = csr_container(X_all_neg) + + for X in (X_dense, X_all_neg, X_all_neg_sparse): + normalizer = Normalizer(norm="max") + X_norm = normalizer.transform(X) + assert X_norm is not X + X_norm = toarray(X_norm) + assert_array_equal(np.sign(X_norm), np.sign(toarray(X))) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_normalize(csr_container): + # Test normalize function + # Only tests functionality not used by the tests for Normalizer. 
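+    # Editor's note -- illustrative sketch, not part of the upstream test: for
+    # a dense row, l2 normalization simply divides by the row's Euclidean
+    # norm, e.g. [[3, 4]] -> [[0.6, 0.8]]; the checks below additionally cover
+    # the axis=0 / transpose equivalence and the sparse code path.
+    assert_allclose(normalize(np.array([[3.0, 4.0]])), [[0.6, 0.8]])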
+ X = np.random.RandomState(37).randn(3, 2) + assert_array_equal(normalize(X, copy=False), normalize(X.T, axis=0, copy=False).T) + + rs = np.random.RandomState(0) + X_dense = rs.randn(10, 5) + X_sparse = csr_container(X_dense) + ones = np.ones((10)) + for X in (X_dense, X_sparse): + for dtype in (np.float32, np.float64): + for norm in ("l1", "l2"): + X = X.astype(dtype) + X_norm = normalize(X, norm=norm) + assert X_norm.dtype == dtype + + X_norm = toarray(X_norm) + if norm == "l1": + row_sums = np.abs(X_norm).sum(axis=1) + else: + X_norm_squared = X_norm**2 + row_sums = X_norm_squared.sum(axis=1) + + assert_array_almost_equal(row_sums, ones) + + # Test return_norm + X_dense = np.array([[3.0, 0, 4.0], [1.0, 0.0, 0.0], [2.0, 3.0, 0.0]]) + for norm in ("l1", "l2", "max"): + _, norms = normalize(X_dense, norm=norm, return_norm=True) + if norm == "l1": + assert_array_almost_equal(norms, np.array([7.0, 1.0, 5.0])) + elif norm == "l2": + assert_array_almost_equal(norms, np.array([5.0, 1.0, 3.60555127])) + else: + assert_array_almost_equal(norms, np.array([4.0, 1.0, 3.0])) + + X_sparse = csr_container(X_dense) + for norm in ("l1", "l2"): + with pytest.raises(NotImplementedError): + normalize(X_sparse, norm=norm, return_norm=True) + _, norms = normalize(X_sparse, norm="max", return_norm=True) + assert_array_almost_equal(norms, np.array([4.0, 1.0, 3.0])) + + +@pytest.mark.parametrize( + "constructor", [np.array, list] + CSC_CONTAINERS + CSR_CONTAINERS +) +def test_binarizer(constructor): + X_ = np.array([[1, 0, 5], [2, 3, -1]]) + X = constructor(X_.copy()) + + binarizer = Binarizer(threshold=2.0, copy=True) + X_bin = toarray(binarizer.transform(X)) + assert np.sum(X_bin == 0) == 4 + assert np.sum(X_bin == 1) == 2 + X_bin = binarizer.transform(X) + assert sparse.issparse(X) == sparse.issparse(X_bin) + + binarizer = Binarizer(copy=True).fit(X) + X_bin = toarray(binarizer.transform(X)) + assert X_bin is not X + assert np.sum(X_bin == 0) == 2 + assert np.sum(X_bin == 1) == 4 + + binarizer = Binarizer(copy=True) + X_bin = binarizer.transform(X) + assert X_bin is not X + X_bin = toarray(X_bin) + assert np.sum(X_bin == 0) == 2 + assert np.sum(X_bin == 1) == 4 + + binarizer = Binarizer(copy=False) + X_bin = binarizer.transform(X) + if constructor is not list: + assert X_bin is X + + binarizer = Binarizer(copy=False) + X_float = np.array([[1, 0, 5], [2, 3, -1]], dtype=np.float64) + X_bin = binarizer.transform(X_float) + if constructor is not list: + assert X_bin is X_float + + X_bin = toarray(X_bin) + assert np.sum(X_bin == 0) == 2 + assert np.sum(X_bin == 1) == 4 + + binarizer = Binarizer(threshold=-0.5, copy=True) + if constructor in (np.array, list): + X = constructor(X_.copy()) + + X_bin = toarray(binarizer.transform(X)) + assert np.sum(X_bin == 0) == 1 + assert np.sum(X_bin == 1) == 5 + X_bin = binarizer.transform(X) + + # Cannot use threshold < 0 for sparse + if constructor in CSC_CONTAINERS: + with pytest.raises(ValueError): + binarizer.transform(constructor(X)) + + +@pytest.mark.parametrize( + "array_namespace, device, dtype_name", yield_namespace_device_dtype_combinations() +) +def test_binarizer_array_api_int(array_namespace, device, dtype_name): + # Checks that Binarizer works with integer elements and float threshold + xp = _array_api_for_tests(array_namespace, device) + for dtype_name_ in [dtype_name, "int32", "int64"]: + X_np = np.reshape(np.asarray([0, 1, 2, 3, 4], dtype=dtype_name_), (-1, 1)) + X_xp = xp.asarray(X_np, device=device) + binarized_np = 
Binarizer(threshold=2.5).fit_transform(X_np) + with config_context(array_api_dispatch=True): + binarized_xp = Binarizer(threshold=2.5).fit_transform(X_xp) + assert_array_equal(_convert_to_numpy(binarized_xp, xp), binarized_np) + + +def test_center_kernel(): + # Test that KernelCenterer is equivalent to StandardScaler + # in feature space + rng = np.random.RandomState(0) + X_fit = rng.random_sample((5, 4)) + scaler = StandardScaler(with_std=False) + scaler.fit(X_fit) + X_fit_centered = scaler.transform(X_fit) + K_fit = np.dot(X_fit, X_fit.T) + + # center fit time matrix + centerer = KernelCenterer() + K_fit_centered = np.dot(X_fit_centered, X_fit_centered.T) + K_fit_centered2 = centerer.fit_transform(K_fit) + assert_array_almost_equal(K_fit_centered, K_fit_centered2) + + # center predict time matrix + X_pred = rng.random_sample((2, 4)) + K_pred = np.dot(X_pred, X_fit.T) + X_pred_centered = scaler.transform(X_pred) + K_pred_centered = np.dot(X_pred_centered, X_fit_centered.T) + K_pred_centered2 = centerer.transform(K_pred) + assert_array_almost_equal(K_pred_centered, K_pred_centered2) + + # check the results coherence with the method proposed in: + # B. Schölkopf, A. Smola, and K.R. Müller, + # "Nonlinear component analysis as a kernel eigenvalue problem" + # equation (B.3) + + # K_centered3 = (I - 1_M) K (I - 1_M) + # = K - 1_M K - K 1_M + 1_M K 1_M + ones_M = np.ones_like(K_fit) / K_fit.shape[0] + K_fit_centered3 = K_fit - ones_M @ K_fit - K_fit @ ones_M + ones_M @ K_fit @ ones_M + assert_allclose(K_fit_centered, K_fit_centered3) + + # K_test_centered3 = (K_test - 1'_M K)(I - 1_M) + # = K_test - 1'_M K - K_test 1_M + 1'_M K 1_M + ones_prime_M = np.ones_like(K_pred) / K_fit.shape[0] + K_pred_centered3 = ( + K_pred - ones_prime_M @ K_fit - K_pred @ ones_M + ones_prime_M @ K_fit @ ones_M + ) + assert_allclose(K_pred_centered, K_pred_centered3) + + +def test_kernelcenterer_non_linear_kernel(): + """Check kernel centering for non-linear kernel.""" + rng = np.random.RandomState(0) + X, X_test = rng.randn(100, 50), rng.randn(20, 50) + + def phi(X): + """Our mapping function phi.""" + return np.vstack( + [ + np.clip(X, a_min=0, a_max=None), + -np.clip(X, a_min=None, a_max=0), + ] + ) + + phi_X = phi(X) + phi_X_test = phi(X_test) + + # centered the projection + scaler = StandardScaler(with_std=False) + phi_X_center = scaler.fit_transform(phi_X) + phi_X_test_center = scaler.transform(phi_X_test) + + # create the different kernel + K = phi_X @ phi_X.T + K_test = phi_X_test @ phi_X.T + K_center = phi_X_center @ phi_X_center.T + K_test_center = phi_X_test_center @ phi_X_center.T + + kernel_centerer = KernelCenterer() + kernel_centerer.fit(K) + + assert_allclose(kernel_centerer.transform(K), K_center) + assert_allclose(kernel_centerer.transform(K_test), K_test_center) + + # check the results coherence with the method proposed in: + # B. Schölkopf, A. Smola, and K.R. 
Müller, + # "Nonlinear component analysis as a kernel eigenvalue problem" + # equation (B.3) + + # K_centered = (I - 1_M) K (I - 1_M) + # = K - 1_M K - K 1_M + 1_M K 1_M + ones_M = np.ones_like(K) / K.shape[0] + K_centered = K - ones_M @ K - K @ ones_M + ones_M @ K @ ones_M + assert_allclose(kernel_centerer.transform(K), K_centered) + + # K_test_centered = (K_test - 1'_M K)(I - 1_M) + # = K_test - 1'_M K - K_test 1_M + 1'_M K 1_M + ones_prime_M = np.ones_like(K_test) / K.shape[0] + K_test_centered = ( + K_test - ones_prime_M @ K - K_test @ ones_M + ones_prime_M @ K @ ones_M + ) + assert_allclose(kernel_centerer.transform(K_test), K_test_centered) + + +def test_cv_pipeline_precomputed(): + # Cross-validate a regression on four coplanar points with the same + # value. Use precomputed kernel to ensure Pipeline with KernelCenterer + # is treated as a pairwise operation. + X = np.array([[3, 0, 0], [0, 3, 0], [0, 0, 3], [1, 1, 1]]) + y_true = np.ones((4,)) + K = X.dot(X.T) + kcent = KernelCenterer() + pipeline = Pipeline([("kernel_centerer", kcent), ("svr", SVR())]) + + # did the pipeline set the pairwise attribute? + assert pipeline.__sklearn_tags__().input_tags.pairwise + + # test cross-validation, score should be almost perfect + # NB: this test is pretty vacuous -- it's mainly to test integration + # of Pipeline and KernelCenterer + y_pred = cross_val_predict(pipeline, K, y_true, cv=2) + assert_array_almost_equal(y_true, y_pred) + + +def test_fit_transform(): + rng = np.random.RandomState(0) + X = rng.random_sample((5, 4)) + for obj in (StandardScaler(), Normalizer(), Binarizer()): + X_transformed = obj.fit(X).transform(X) + X_transformed2 = obj.fit_transform(X) + assert_array_equal(X_transformed, X_transformed2) + + +def test_add_dummy_feature(): + X = [[1, 0], [0, 1], [0, 1]] + X = add_dummy_feature(X) + assert_array_equal(X, [[1, 1, 0], [1, 0, 1], [1, 0, 1]]) + + +@pytest.mark.parametrize( + "sparse_container", COO_CONTAINERS + CSC_CONTAINERS + CSR_CONTAINERS +) +def test_add_dummy_feature_sparse(sparse_container): + X = sparse_container([[1, 0], [0, 1], [0, 1]]) + desired_format = X.format + X = add_dummy_feature(X) + assert sparse.issparse(X) and X.format == desired_format, X + assert_array_equal(X.toarray(), [[1, 1, 0], [1, 0, 1], [1, 0, 1]]) + + +def test_fit_cold_start(): + X = iris.data + X_2d = X[:, :2] + + # Scalers that have a partial_fit method + scalers = [ + StandardScaler(with_mean=False, with_std=False), + MinMaxScaler(), + MaxAbsScaler(), + ] + + for scaler in scalers: + scaler.fit_transform(X) + # with a different shape, this may break the scaler unless the internal + # state is reset + scaler.fit_transform(X_2d) + + +@pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"]) +def test_power_transformer_notfitted(method): + pt = PowerTransformer(method=method) + X = np.abs(X_1col) + with pytest.raises(NotFittedError): + pt.transform(X) + with pytest.raises(NotFittedError): + pt.inverse_transform(X) + + +@pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"]) +@pytest.mark.parametrize("standardize", [True, False]) +@pytest.mark.parametrize("X", [X_1col, X_2d]) +def test_power_transformer_inverse(method, standardize, X): + # Make sure we get the original input when applying transform and then + # inverse transform + X = np.abs(X) if method == "box-cox" else X + pt = PowerTransformer(method=method, standardize=standardize) + X_trans = pt.fit_transform(X) + assert_almost_equal(X, pt.inverse_transform(X_trans)) + + +def test_power_transformer_1d(): + X = 
np.abs(X_1col) + + for standardize in [True, False]: + pt = PowerTransformer(method="box-cox", standardize=standardize) + + X_trans = pt.fit_transform(X) + X_trans_func = power_transform(X, method="box-cox", standardize=standardize) + + X_expected, lambda_expected = stats.boxcox(X.flatten()) + + if standardize: + X_expected = scale(X_expected) + + assert_almost_equal(X_expected.reshape(-1, 1), X_trans) + assert_almost_equal(X_expected.reshape(-1, 1), X_trans_func) + + assert_almost_equal(X, pt.inverse_transform(X_trans)) + assert_almost_equal(lambda_expected, pt.lambdas_[0]) + + assert len(pt.lambdas_) == X.shape[1] + assert isinstance(pt.lambdas_, np.ndarray) + + +def test_power_transformer_2d(): + X = np.abs(X_2d) + + for standardize in [True, False]: + pt = PowerTransformer(method="box-cox", standardize=standardize) + + X_trans_class = pt.fit_transform(X) + X_trans_func = power_transform(X, method="box-cox", standardize=standardize) + + for X_trans in [X_trans_class, X_trans_func]: + for j in range(X_trans.shape[1]): + X_expected, lmbda = stats.boxcox(X[:, j].flatten()) + + if standardize: + X_expected = scale(X_expected) + + assert_almost_equal(X_trans[:, j], X_expected) + assert_almost_equal(lmbda, pt.lambdas_[j]) + + # Test inverse transformation + X_inv = pt.inverse_transform(X_trans) + assert_array_almost_equal(X_inv, X) + + assert len(pt.lambdas_) == X.shape[1] + assert isinstance(pt.lambdas_, np.ndarray) + + +def test_power_transformer_boxcox_strictly_positive_exception(): + # Exceptions should be raised for negative arrays and zero arrays when + # method is boxcox + + pt = PowerTransformer(method="box-cox") + pt.fit(np.abs(X_2d)) + X_with_negatives = X_2d + not_positive_message = "strictly positive" + + with pytest.raises(ValueError, match=not_positive_message): + pt.transform(X_with_negatives) + + with pytest.raises(ValueError, match=not_positive_message): + pt.fit(X_with_negatives) + + with pytest.raises(ValueError, match=not_positive_message): + power_transform(X_with_negatives, method="box-cox") + + with pytest.raises(ValueError, match=not_positive_message): + pt.transform(np.zeros(X_2d.shape)) + + with pytest.raises(ValueError, match=not_positive_message): + pt.fit(np.zeros(X_2d.shape)) + + with pytest.raises(ValueError, match=not_positive_message): + power_transform(np.zeros(X_2d.shape), method="box-cox") + + +@pytest.mark.parametrize("X", [X_2d, np.abs(X_2d), -np.abs(X_2d), np.zeros(X_2d.shape)]) +def test_power_transformer_yeojohnson_any_input(X): + # Yeo-Johnson method should support any kind of input + power_transform(X, method="yeo-johnson") + + +@pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"]) +def test_power_transformer_shape_exception(method): + pt = PowerTransformer(method=method) + X = np.abs(X_2d) + pt.fit(X) + + # Exceptions should be raised for arrays with different num_columns + # than during fitting + wrong_shape_message = ( + r"X has \d+ features, but PowerTransformer is expecting \d+ features" + ) + + with pytest.raises(ValueError, match=wrong_shape_message): + pt.transform(X[:, 0:1]) + + with pytest.raises(ValueError, match=wrong_shape_message): + pt.inverse_transform(X[:, 0:1]) + + +def test_power_transformer_lambda_zero(): + pt = PowerTransformer(method="box-cox", standardize=False) + X = np.abs(X_2d)[:, 0:1] + + # Test the lambda = 0 case + pt.lambdas_ = np.array([0]) + X_trans = pt.transform(X) + assert_array_almost_equal(pt.inverse_transform(X_trans), X) + + +def test_power_transformer_lambda_one(): + # Make sure lambda = 1 corresponds 
to the identity for yeo-johnson + pt = PowerTransformer(method="yeo-johnson", standardize=False) + X = np.abs(X_2d)[:, 0:1] + + pt.lambdas_ = np.array([1]) + X_trans = pt.transform(X) + assert_array_almost_equal(X_trans, X) + + +@pytest.mark.parametrize( + "method, lmbda", + [ + ("box-cox", 0.1), + ("box-cox", 0.5), + ("yeo-johnson", 0.1), + ("yeo-johnson", 0.5), + ("yeo-johnson", 1.0), + ], +) +def test_optimization_power_transformer(method, lmbda): + # Test the optimization procedure: + # - set a predefined value for lambda + # - apply inverse_transform to a normal dist (we get X_inv) + # - apply fit_transform to X_inv (we get X_inv_trans) + # - check that X_inv_trans is roughly equal to X + + rng = np.random.RandomState(0) + n_samples = 20000 + X = rng.normal(loc=0, scale=1, size=(n_samples, 1)) + + if method == "box-cox": + # For box-cox, means that lmbda * y + 1 > 0 or y > - 1 / lmbda + # Clip the data here to make sure the inequality is valid. + X = np.clip(X, -1 / lmbda + 1e-5, None) + + pt = PowerTransformer(method=method, standardize=False) + pt.lambdas_ = [lmbda] + X_inv = pt.inverse_transform(X) + + pt = PowerTransformer(method=method, standardize=False) + X_inv_trans = pt.fit_transform(X_inv) + + assert_almost_equal(0, np.linalg.norm(X - X_inv_trans) / n_samples, decimal=2) + assert_almost_equal(0, X_inv_trans.mean(), decimal=1) + assert_almost_equal(1, X_inv_trans.std(), decimal=1) + + +def test_invserse_box_cox(): + # output nan if the input is invalid + pt = PowerTransformer(method="box-cox", standardize=False) + pt.lambdas_ = [0.5] + X_inv = pt.inverse_transform([[-2.1]]) + assert np.isnan(X_inv) + + +def test_yeo_johnson_darwin_example(): + # test from original paper "A new family of power transformations to + # improve normality or symmetry" by Yeo and Johnson. 
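+    # Editor's note -- illustrative sketch, not part of the upstream test: the
+    # Yeo-Johnson transform fitted below maps a value x, for parameter lmbda,
+    # to
+    #     ((x + 1) ** lmbda - 1) / lmbda                 if x >= 0, lmbda != 0
+    #     log(x + 1)                                     if x >= 0, lmbda == 0
+    #     -((1 - x) ** (2 - lmbda) - 1) / (2 - lmbda)    if x < 0,  lmbda != 2
+    #     -log(1 - x)                                    if x < 0,  lmbda == 2
+    # so, unlike Box-Cox, it is defined for the negative observations in the
+    # Darwin data below.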
+ X = [6.1, -8.4, 1.0, 2.0, 0.7, 2.9, 3.5, 5.1, 1.8, 3.6, 7.0, 3.0, 9.3, 7.5, -6.0] + X = np.array(X).reshape(-1, 1) + lmbda = PowerTransformer(method="yeo-johnson").fit(X).lambdas_ + assert np.allclose(lmbda, 1.305, atol=1e-3) + + +@pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"]) +def test_power_transformer_nans(method): + # Make sure lambda estimation is not influenced by NaN values + # and that transform() supports NaN silently + + X = np.abs(X_1col) + pt = PowerTransformer(method=method) + pt.fit(X) + lmbda_no_nans = pt.lambdas_[0] + + # concat nans at the end and check lambda stays the same + X = np.concatenate([X, np.full_like(X, np.nan)]) + X = shuffle(X, random_state=0) + + pt.fit(X) + lmbda_nans = pt.lambdas_[0] + + assert_almost_equal(lmbda_no_nans, lmbda_nans, decimal=5) + + X_trans = pt.transform(X) + assert_array_equal(np.isnan(X_trans), np.isnan(X)) + + +@pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"]) +@pytest.mark.parametrize("standardize", [True, False]) +def test_power_transformer_fit_transform(method, standardize): + # check that fit_transform() and fit().transform() return the same values + X = X_1col + if method == "box-cox": + X = np.abs(X) + + pt = PowerTransformer(method, standardize=standardize) + assert_array_almost_equal(pt.fit(X).transform(X), pt.fit_transform(X)) + + +@pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"]) +@pytest.mark.parametrize("standardize", [True, False]) +def test_power_transformer_copy_True(method, standardize): + # Check that neither fit, transform, fit_transform nor inverse_transform + # modify X inplace when copy=True + X = X_1col + if method == "box-cox": + X = np.abs(X) + + X_original = X.copy() + assert X is not X_original # sanity checks + assert_array_almost_equal(X, X_original) + + pt = PowerTransformer(method, standardize=standardize, copy=True) + + pt.fit(X) + assert_array_almost_equal(X, X_original) + X_trans = pt.transform(X) + assert X_trans is not X + + X_trans = pt.fit_transform(X) + assert_array_almost_equal(X, X_original) + assert X_trans is not X + + X_inv_trans = pt.inverse_transform(X_trans) + assert X_trans is not X_inv_trans + + +@pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"]) +@pytest.mark.parametrize("standardize", [True, False]) +def test_power_transformer_copy_False(method, standardize): + # check that when copy=False fit doesn't change X inplace but transform, + # fit_transform and inverse_transform do. + X = X_1col + if method == "box-cox": + X = np.abs(X) + + X_original = X.copy() + assert X is not X_original # sanity checks + assert_array_almost_equal(X, X_original) + + pt = PowerTransformer(method, standardize=standardize, copy=False) + + pt.fit(X) + assert_array_almost_equal(X, X_original) # fit didn't change X + + X_trans = pt.transform(X) + assert X_trans is X + + if method == "box-cox": + X = np.abs(X) + X_trans = pt.fit_transform(X) + assert X_trans is X + + X_inv_trans = pt.inverse_transform(X_trans) + assert X_trans is X_inv_trans + + +def test_power_transformer_box_cox_raise_all_nans_col(): + """Check that box-cox raises informative when a column contains all nans. + + Non-regression test for gh-26303 + """ + X = rng.random_sample((4, 5)) + X[:, 0] = np.nan + + err_msg = "Column must not be all nan." 
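+    # Editor's note -- illustrative clarification, not part of the upstream
+    # test: the per-column Box-Cox lambda is estimated from that column's
+    # non-NaN values only, so a column containing nothing but NaN leaves no
+    # data to estimate from and the explicit error matched below is raised.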
+ + pt = PowerTransformer(method="box-cox") + with pytest.raises(ValueError, match=err_msg): + pt.fit_transform(X) + + +@pytest.mark.parametrize( + "X_2", + [sparse.random(10, 1, density=0.8, random_state=0)] + + [ + csr_container(np.full((10, 1), fill_value=np.nan)) + for csr_container in CSR_CONTAINERS + ], +) +def test_standard_scaler_sparse_partial_fit_finite_variance(X_2): + # non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/16448 + X_1 = sparse.random(5, 1, density=0.8) + scaler = StandardScaler(with_mean=False) + scaler.fit(X_1).partial_fit(X_2) + assert np.isfinite(scaler.var_[0]) + + +@pytest.mark.parametrize("feature_range", [(0, 1), (-10, 10)]) +def test_minmax_scaler_clip(feature_range): + # test behaviour of the parameter 'clip' in MinMaxScaler + X = iris.data + scaler = MinMaxScaler(feature_range=feature_range, clip=True).fit(X) + X_min, X_max = np.min(X, axis=0), np.max(X, axis=0) + X_test = [np.r_[X_min[:2] - 10, X_max[2:] + 10]] + X_transformed = scaler.transform(X_test) + assert_allclose( + X_transformed, + [[feature_range[0], feature_range[0], feature_range[1], feature_range[1]]], + ) + + +def test_standard_scaler_raise_error_for_1d_input(): + """Check that `inverse_transform` from `StandardScaler` raises an error + with 1D array. + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/19518 + """ + scaler = StandardScaler().fit(X_2d) + err_msg = "Expected 2D array, got 1D array instead" + with pytest.raises(ValueError, match=err_msg): + scaler.inverse_transform(X_2d[:, 0]) + + +def test_power_transformer_significantly_non_gaussian(): + """Check that significantly non-Gaussian data before transforms correctly. + + For some explored lambdas, the transformed data may be constant and will + be rejected. 
Non-regression test for + https://github.com/scikit-learn/scikit-learn/issues/14959 + """ + + X_non_gaussian = 1e6 * np.array( + [0.6, 2.0, 3.0, 4.0] * 4 + [11, 12, 12, 16, 17, 20, 85, 90], dtype=np.float64 + ).reshape(-1, 1) + pt = PowerTransformer() + + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + X_trans = pt.fit_transform(X_non_gaussian) + + assert not np.any(np.isnan(X_trans)) + assert X_trans.mean() == pytest.approx(0.0) + assert X_trans.std() == pytest.approx(1.0) + assert X_trans.min() > -2 + assert X_trans.max() < 2 + + +@pytest.mark.parametrize( + "Transformer", + [ + MinMaxScaler, + MaxAbsScaler, + RobustScaler, + StandardScaler, + QuantileTransformer, + PowerTransformer, + ], +) +def test_one_to_one_features(Transformer): + """Check one-to-one transformers give correct feature names.""" + tr = Transformer().fit(iris.data) + names_out = tr.get_feature_names_out(iris.feature_names) + assert_array_equal(names_out, iris.feature_names) + + +@pytest.mark.parametrize( + "Transformer", + [ + MinMaxScaler, + MaxAbsScaler, + RobustScaler, + StandardScaler, + QuantileTransformer, + PowerTransformer, + Normalizer, + Binarizer, + ], +) +def test_one_to_one_features_pandas(Transformer): + """Check one-to-one transformers give correct feature names.""" + pd = pytest.importorskip("pandas") + + df = pd.DataFrame(iris.data, columns=iris.feature_names) + tr = Transformer().fit(df) + + names_out_df_default = tr.get_feature_names_out() + assert_array_equal(names_out_df_default, iris.feature_names) + + names_out_df_valid_in = tr.get_feature_names_out(iris.feature_names) + assert_array_equal(names_out_df_valid_in, iris.feature_names) + + msg = re.escape("input_features is not equal to feature_names_in_") + with pytest.raises(ValueError, match=msg): + invalid_names = list("abcd") + tr.get_feature_names_out(invalid_names) + + +def test_kernel_centerer_feature_names_out(): + """Test that kernel centerer `feature_names_out`.""" + + rng = np.random.RandomState(0) + X = rng.random_sample((6, 4)) + X_pairwise = linear_kernel(X) + centerer = KernelCenterer().fit(X_pairwise) + + names_out = centerer.get_feature_names_out() + samples_out2 = X_pairwise.shape[1] + assert_array_equal(names_out, [f"kernelcenterer{i}" for i in range(samples_out2)]) + + +@pytest.mark.parametrize("standardize", [True, False]) +def test_power_transformer_constant_feature(standardize): + """Check that PowerTransfomer leaves constant features unchanged.""" + X = [[-2, 0, 2], [-2, 0, 2], [-2, 0, 2]] + + pt = PowerTransformer(method="yeo-johnson", standardize=standardize).fit(X) + + assert_allclose(pt.lambdas_, [1, 1, 1]) + + Xft = pt.fit_transform(X) + Xt = pt.transform(X) + + for Xt_ in [Xft, Xt]: + if standardize: + assert_allclose(Xt_, np.zeros_like(X)) + else: + assert_allclose(Xt_, X) + + +@pytest.mark.skipif( + sp_version < parse_version("1.12"), + reason="scipy version 1.12 required for stable yeo-johnson", +) +def test_power_transformer_no_warnings(): + """Verify that PowerTransformer operates without raising any warnings on valid data. 
+ + This test addresses numerical issues with floating point numbers (mostly + overflows) with the Yeo-Johnson transform, see + https://github.com/scikit-learn/scikit-learn/issues/23319#issuecomment-1464933635 + """ + x = np.array( + [ + 2003.0, + 1950.0, + 1997.0, + 2000.0, + 2009.0, + 2009.0, + 1980.0, + 1999.0, + 2007.0, + 1991.0, + ] + ) + + def _test_no_warnings(data): + """Internal helper to test for unexpected warnings.""" + with warnings.catch_warnings(record=True) as caught_warnings: + warnings.simplefilter("always") # Ensure all warnings are captured + PowerTransformer(method="yeo-johnson", standardize=True).fit_transform(data) + + assert not caught_warnings, "Unexpected warnings were raised:\n" + "\n".join( + str(w.message) for w in caught_warnings + ) + + # Full dataset: Should not trigger overflow in variance calculation. + _test_no_warnings(x.reshape(-1, 1)) + + # Subset of data: Should not trigger overflow in power calculation. + _test_no_warnings(x[:5].reshape(-1, 1)) + + +def test_yeojohnson_for_different_scipy_version(): + """Check that the results are consistent across different SciPy versions.""" + pt = PowerTransformer(method="yeo-johnson").fit(X_1col) + pt.lambdas_[0] == pytest.approx(0.99546157, rel=1e-7) diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_discretization.py b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_discretization.py new file mode 100644 index 0000000000000000000000000000000000000000..7463a8608291c9e9f580a3afe8a774a1b3f7e665 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_discretization.py @@ -0,0 +1,665 @@ +import warnings + +import numpy as np +import pytest +import scipy.sparse as sp + +from sklearn import clone +from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder +from sklearn.utils._testing import ( + assert_allclose, + assert_allclose_dense_sparse, + assert_array_almost_equal, + assert_array_equal, + ignore_warnings, +) + +X = [[-2, 1.5, -4, -1], [-1, 2.5, -3, -0.5], [0, 3.5, -2, 0.5], [1, 4.5, -1, 2]] + + +@pytest.mark.parametrize( + "strategy, quantile_method, expected, sample_weight", + [ + ( + "uniform", + "warn", # default, will not warn when strategy != "quantile" + [[0, 0, 0, 0], [1, 1, 1, 0], [2, 2, 2, 1], [2, 2, 2, 2]], + None, + ), + ( + "kmeans", + "warn", # default, will not warn when strategy != "quantile" + [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]], + None, + ), + ( + "quantile", + "averaged_inverted_cdf", + [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]], + None, + ), + ( + "uniform", + "warn", # default, will not warn when strategy != "quantile" + [[0, 0, 0, 0], [1, 1, 1, 0], [2, 2, 2, 1], [2, 2, 2, 2]], + [1, 1, 2, 1], + ), + ( + "uniform", + "warn", # default, will not warn when strategy != "quantile" + [[0, 0, 0, 0], [1, 1, 1, 0], [2, 2, 2, 1], [2, 2, 2, 2]], + [1, 1, 1, 1], + ), + ( + "quantile", + "averaged_inverted_cdf", + [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]], + [1, 1, 2, 1], + ), + ( + "quantile", + "averaged_inverted_cdf", + [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]], + [1, 1, 1, 1], + ), + ( + "quantile", + "averaged_inverted_cdf", + [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]], + [0, 1, 1, 1], + ), + ( + "kmeans", + "warn", # default, will not warn when strategy != "quantile" + [[0, 0, 0, 0], [1, 1, 1, 0], [1, 1, 1, 1], [2, 2, 2, 2]], + [1, 0, 3, 1], + ), + ( + "kmeans", + "warn", # default, will not warn when strategy != "quantile" + [[0, 0, 
0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]], + [1, 1, 1, 1], + ), + ], +) +def test_fit_transform(strategy, quantile_method, expected, sample_weight): + est = KBinsDiscretizer( + n_bins=3, encode="ordinal", strategy=strategy, quantile_method=quantile_method + ) + with ignore_warnings(category=UserWarning): + # Ignore the warning on removed small bins. + est.fit(X, sample_weight=sample_weight) + assert_array_equal(est.transform(X), expected) + + +def test_valid_n_bins(): + KBinsDiscretizer(n_bins=2, quantile_method="averaged_inverted_cdf").fit_transform(X) + KBinsDiscretizer( + n_bins=np.array([2])[0], quantile_method="averaged_inverted_cdf" + ).fit_transform(X) + assert KBinsDiscretizer(n_bins=2, quantile_method="averaged_inverted_cdf").fit( + X + ).n_bins_.dtype == np.dtype(int) + + +def test_invalid_n_bins_array(): + # Bad shape + n_bins = np.full((2, 4), 2.0) + est = KBinsDiscretizer(n_bins=n_bins, quantile_method="averaged_inverted_cdf") + err_msg = r"n_bins must be a scalar or array of shape \(n_features,\)." + with pytest.raises(ValueError, match=err_msg): + est.fit_transform(X) + + # Incorrect number of features + n_bins = [1, 2, 2] + est = KBinsDiscretizer(n_bins=n_bins, quantile_method="averaged_inverted_cdf") + err_msg = r"n_bins must be a scalar or array of shape \(n_features,\)." + with pytest.raises(ValueError, match=err_msg): + est.fit_transform(X) + + # Bad bin values + n_bins = [1, 2, 2, 1] + est = KBinsDiscretizer(n_bins=n_bins, quantile_method="averaged_inverted_cdf") + err_msg = ( + "KBinsDiscretizer received an invalid number of bins " + "at indices 0, 3. Number of bins must be at least 2, " + "and must be an int." + ) + with pytest.raises(ValueError, match=err_msg): + est.fit_transform(X) + + # Float bin values + n_bins = [2.1, 2, 2.1, 2] + est = KBinsDiscretizer(n_bins=n_bins, quantile_method="averaged_inverted_cdf") + err_msg = ( + "KBinsDiscretizer received an invalid number of bins " + "at indices 0, 2. Number of bins must be at least 2, " + "and must be an int." 
+ ) + with pytest.raises(ValueError, match=err_msg): + est.fit_transform(X) + + +@pytest.mark.parametrize( + "strategy, quantile_method, expected, sample_weight", + [ + ( + "uniform", + "warn", # default, will not warn when strategy != "quantile" + [[0, 0, 0, 0], [0, 1, 1, 0], [1, 2, 2, 1], [1, 2, 2, 2]], + None, + ), + ( + "kmeans", + "warn", # default, will not warn when strategy != "quantile" + [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 2, 2, 2]], + None, + ), + ( + "quantile", + "linear", + [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]], + None, + ), + ( + "quantile", + "averaged_inverted_cdf", + [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]], + None, + ), + ( + "quantile", + "averaged_inverted_cdf", + [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]], + [1, 1, 1, 1], + ), + ( + "quantile", + "averaged_inverted_cdf", + [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 1, 1, 1]], + [0, 1, 3, 1], + ), + ( + "quantile", + "averaged_inverted_cdf", + [[0, 0, 0, 0], [0, 0, 0, 0], [1, 2, 2, 2], [1, 2, 2, 2]], + [1, 1, 3, 1], + ), + ( + "kmeans", + "warn", # default, will not warn when strategy != "quantile" + [[0, 0, 0, 0], [0, 1, 1, 0], [1, 1, 1, 1], [1, 2, 2, 2]], + [1, 0, 3, 1], + ), + ], +) +def test_fit_transform_n_bins_array(strategy, quantile_method, expected, sample_weight): + est = KBinsDiscretizer( + n_bins=[2, 3, 3, 3], + encode="ordinal", + strategy=strategy, + quantile_method=quantile_method, + ).fit(X, sample_weight=sample_weight) + assert_array_equal(est.transform(X), expected) + + # test the shape of bin_edges_ + n_features = np.array(X).shape[1] + assert est.bin_edges_.shape == (n_features,) + for bin_edges, n_bins in zip(est.bin_edges_, est.n_bins_): + assert bin_edges.shape == (n_bins + 1,) + + +@pytest.mark.filterwarnings("ignore: Bins whose width are too small") +def test_kbinsdiscretizer_effect_sample_weight(): + """Check the impact of `sample_weight` one computed quantiles.""" + X = np.array([[-2], [-1], [1], [3], [500], [1000]]) + # add a large number of bins such that each sample with a non-null weight + # will be used as bin edge + est = KBinsDiscretizer( + n_bins=10, + encode="ordinal", + strategy="quantile", + quantile_method="averaged_inverted_cdf", + ) + est.fit(X, sample_weight=[1, 1, 1, 1, 0, 0]) + assert_allclose(est.bin_edges_[0], [-2, -1, 0, 1, 3]) + assert_allclose(est.transform(X), [[0.0], [1.0], [3.0], [3.0], [3.0], [3.0]]) + + +@pytest.mark.parametrize("strategy", ["kmeans", "quantile"]) +def test_kbinsdiscretizer_no_mutating_sample_weight(strategy): + """Make sure that `sample_weight` is not changed in place.""" + + if strategy == "quantile": + est = KBinsDiscretizer( + n_bins=3, + encode="ordinal", + strategy=strategy, + quantile_method="averaged_inverted_cdf", + ) + else: + est = KBinsDiscretizer(n_bins=3, encode="ordinal", strategy=strategy) + sample_weight = np.array([1, 3, 1, 2], dtype=np.float64) + sample_weight_copy = np.copy(sample_weight) + est.fit(X, sample_weight=sample_weight) + assert_allclose(sample_weight, sample_weight_copy) + + +@pytest.mark.parametrize("strategy", ["uniform", "kmeans", "quantile"]) +def test_same_min_max(strategy): + warnings.simplefilter("always") + X = np.array([[1, -2], [1, -1], [1, 0], [1, 1]]) + if strategy == "quantile": + est = KBinsDiscretizer( + strategy=strategy, + n_bins=3, + encode="ordinal", + quantile_method="averaged_inverted_cdf", + ) + else: + est = KBinsDiscretizer(strategy=strategy, n_bins=3, encode="ordinal") + warning_message = "Feature 0 is constant and will be replaced 
with 0." + with pytest.warns(UserWarning, match=warning_message): + est.fit(X) + assert est.n_bins_[0] == 1 + # replace the feature with zeros + Xt = est.transform(X) + assert_array_equal(Xt[:, 0], np.zeros(X.shape[0])) + + +def test_transform_1d_behavior(): + X = np.arange(4) + est = KBinsDiscretizer(n_bins=2, quantile_method="averaged_inverted_cdf") + with pytest.raises(ValueError): + est.fit(X) + + est = KBinsDiscretizer(n_bins=2, quantile_method="averaged_inverted_cdf") + est.fit(X.reshape(-1, 1)) + with pytest.raises(ValueError): + est.transform(X) + + +@pytest.mark.parametrize("i", range(1, 9)) +def test_numeric_stability(i): + X_init = np.array([2.0, 4.0, 6.0, 8.0, 10.0]).reshape(-1, 1) + Xt_expected = np.array([0, 0, 1, 1, 1]).reshape(-1, 1) + + # Test up to discretizing nano units + X = X_init / 10**i + Xt = KBinsDiscretizer( + n_bins=2, encode="ordinal", quantile_method="averaged_inverted_cdf" + ).fit_transform(X) + assert_array_equal(Xt_expected, Xt) + + +def test_encode_options(): + est = KBinsDiscretizer( + n_bins=[2, 3, 3, 3], encode="ordinal", quantile_method="averaged_inverted_cdf" + ).fit(X) + Xt_1 = est.transform(X) + est = KBinsDiscretizer( + n_bins=[2, 3, 3, 3], + encode="onehot-dense", + quantile_method="averaged_inverted_cdf", + ).fit(X) + Xt_2 = est.transform(X) + assert not sp.issparse(Xt_2) + assert_array_equal( + OneHotEncoder( + categories=[np.arange(i) for i in [2, 3, 3, 3]], sparse_output=False + ).fit_transform(Xt_1), + Xt_2, + ) + est = KBinsDiscretizer( + n_bins=[2, 3, 3, 3], encode="onehot", quantile_method="averaged_inverted_cdf" + ).fit(X) + Xt_3 = est.transform(X) + assert sp.issparse(Xt_3) + assert_array_equal( + OneHotEncoder( + categories=[np.arange(i) for i in [2, 3, 3, 3]], sparse_output=True + ) + .fit_transform(Xt_1) + .toarray(), + Xt_3.toarray(), + ) + + +@pytest.mark.parametrize( + "strategy, quantile_method, expected_2bins, expected_3bins, expected_5bins", + [ + ("uniform", "warn", [0, 0, 0, 0, 1, 1], [0, 0, 0, 0, 2, 2], [0, 0, 1, 1, 4, 4]), + ("kmeans", "warn", [0, 0, 0, 0, 1, 1], [0, 0, 1, 1, 2, 2], [0, 0, 1, 2, 3, 4]), + ( + "quantile", + "averaged_inverted_cdf", + [0, 0, 0, 1, 1, 1], + [0, 0, 1, 1, 2, 2], + [0, 1, 2, 3, 4, 4], + ), + ], +) +def test_nonuniform_strategies( + strategy, quantile_method, expected_2bins, expected_3bins, expected_5bins +): + X = np.array([0, 0.5, 2, 3, 9, 10]).reshape(-1, 1) + + # with 2 bins + est = KBinsDiscretizer( + n_bins=2, strategy=strategy, quantile_method=quantile_method, encode="ordinal" + ) + Xt = est.fit_transform(X) + assert_array_equal(expected_2bins, Xt.ravel()) + + # with 3 bins + est = KBinsDiscretizer( + n_bins=3, strategy=strategy, quantile_method=quantile_method, encode="ordinal" + ) + Xt = est.fit_transform(X) + assert_array_equal(expected_3bins, Xt.ravel()) + + # with 5 bins + est = KBinsDiscretizer( + n_bins=5, strategy=strategy, quantile_method=quantile_method, encode="ordinal" + ) + Xt = est.fit_transform(X) + assert_array_equal(expected_5bins, Xt.ravel()) + + +@pytest.mark.parametrize( + "strategy, expected_inv,quantile_method", + [ + ( + "uniform", + [ + [-1.5, 2.0, -3.5, -0.5], + [-0.5, 3.0, -2.5, -0.5], + [0.5, 4.0, -1.5, 0.5], + [0.5, 4.0, -1.5, 1.5], + ], + "warn", # default, will not warn when strategy != "quantile" + ), + ( + "kmeans", + [ + [-1.375, 2.125, -3.375, -0.5625], + [-1.375, 2.125, -3.375, -0.5625], + [-0.125, 3.375, -2.125, 0.5625], + [0.75, 4.25, -1.25, 1.625], + ], + "warn", # default, will not warn when strategy != "quantile" + ), + ( + "quantile", + [ + [-1.5, 
2.0, -3.5, -0.75], + [-0.5, 3.0, -2.5, 0.0], + [0.5, 4.0, -1.5, 1.25], + [0.5, 4.0, -1.5, 1.25], + ], + "averaged_inverted_cdf", + ), + ], +) +@pytest.mark.parametrize("encode", ["ordinal", "onehot", "onehot-dense"]) +def test_inverse_transform(strategy, encode, expected_inv, quantile_method): + kbd = KBinsDiscretizer( + n_bins=3, strategy=strategy, quantile_method=quantile_method, encode=encode + ) + Xt = kbd.fit_transform(X) + Xinv = kbd.inverse_transform(Xt) + assert_array_almost_equal(expected_inv, Xinv) + + +@pytest.mark.parametrize("strategy", ["uniform", "kmeans", "quantile"]) +def test_transform_outside_fit_range(strategy): + X = np.array([0, 1, 2, 3])[:, None] + + if strategy == "quantile": + kbd = KBinsDiscretizer( + n_bins=4, + strategy=strategy, + encode="ordinal", + quantile_method="averaged_inverted_cdf", + ) + else: + kbd = KBinsDiscretizer(n_bins=4, strategy=strategy, encode="ordinal") + kbd.fit(X) + + X2 = np.array([-2, 5])[:, None] + X2t = kbd.transform(X2) + assert_array_equal(X2t.max(axis=0) + 1, kbd.n_bins_) + assert_array_equal(X2t.min(axis=0), [0]) + + +def test_overwrite(): + X = np.array([0, 1, 2, 3])[:, None] + X_before = X.copy() + + est = KBinsDiscretizer( + n_bins=3, quantile_method="averaged_inverted_cdf", encode="ordinal" + ) + Xt = est.fit_transform(X) + assert_array_equal(X, X_before) + + Xt_before = Xt.copy() + Xinv = est.inverse_transform(Xt) + assert_array_equal(Xt, Xt_before) + assert_array_equal(Xinv, np.array([[0.5], [1.5], [2.5], [2.5]])) + + +@pytest.mark.parametrize( + "strategy, expected_bin_edges, quantile_method", + [ + ("quantile", [0, 1.5, 3], "averaged_inverted_cdf"), + ("kmeans", [0, 1.5, 3], "warn"), + ], +) +def test_redundant_bins(strategy, expected_bin_edges, quantile_method): + X = [[0], [0], [0], [0], [3], [3]] + kbd = KBinsDiscretizer( + n_bins=3, strategy=strategy, quantile_method=quantile_method, subsample=None + ) + warning_message = "Consider decreasing the number of bins." + with pytest.warns(UserWarning, match=warning_message): + kbd.fit(X) + + assert_array_almost_equal(kbd.bin_edges_[0], expected_bin_edges) + + +def test_percentile_numeric_stability(): + X = np.array([0.05, 0.05, 0.95]).reshape(-1, 1) + bin_edges = np.array([0.05, 0.23, 0.41, 0.59, 0.77, 0.95]) + Xt = np.array([0, 0, 4]).reshape(-1, 1) + kbd = KBinsDiscretizer( + n_bins=10, + encode="ordinal", + strategy="quantile", + quantile_method="linear", + ) + ## TODO: change to averaged inverted cdf, but that means we only get bin + ## edges of 0.05 and 0.95 and nothing in between + + warning_message = "Consider decreasing the number of bins." 
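+    # Editor's note -- illustrative sketch, not part of the upstream test:
+    # with quantile_method="linear", the candidate edges for this column are
+    #     np.percentile([0.05, 0.05, 0.95], np.linspace(0, 100, 11))
+    # i.e. six repeats of 0.05 followed by 0.23, 0.41, 0.59, 0.77, 0.95;
+    # collapsing the repeated 0.05 edges leaves the six `bin_edges` asserted
+    # below and triggers the warning matched here.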
+ with pytest.warns(UserWarning, match=warning_message): + kbd.fit(X) + + assert_array_almost_equal(kbd.bin_edges_[0], bin_edges) + assert_array_almost_equal(kbd.transform(X), Xt) + + +@pytest.mark.parametrize("in_dtype", [np.float16, np.float32, np.float64]) +@pytest.mark.parametrize("out_dtype", [None, np.float32, np.float64]) +@pytest.mark.parametrize("encode", ["ordinal", "onehot", "onehot-dense"]) +def test_consistent_dtype(in_dtype, out_dtype, encode): + X_input = np.array(X, dtype=in_dtype) + kbd = KBinsDiscretizer( + n_bins=3, + encode=encode, + quantile_method="averaged_inverted_cdf", + dtype=out_dtype, + ) + kbd.fit(X_input) + + # test output dtype + if out_dtype is not None: + expected_dtype = out_dtype + elif out_dtype is None and X_input.dtype == np.float16: + # wrong numeric input dtype are cast in np.float64 + expected_dtype = np.float64 + else: + expected_dtype = X_input.dtype + Xt = kbd.transform(X_input) + assert Xt.dtype == expected_dtype + + +@pytest.mark.parametrize("input_dtype", [np.float16, np.float32, np.float64]) +@pytest.mark.parametrize("encode", ["ordinal", "onehot", "onehot-dense"]) +def test_32_equal_64(input_dtype, encode): + # TODO this check is redundant with common checks and can be removed + # once #16290 is merged + X_input = np.array(X, dtype=input_dtype) + + # 32 bit output + kbd_32 = KBinsDiscretizer( + n_bins=3, + encode=encode, + quantile_method="averaged_inverted_cdf", + dtype=np.float32, + ) + kbd_32.fit(X_input) + Xt_32 = kbd_32.transform(X_input) + + # 64 bit output + kbd_64 = KBinsDiscretizer( + n_bins=3, + encode=encode, + quantile_method="averaged_inverted_cdf", + dtype=np.float64, + ) + kbd_64.fit(X_input) + Xt_64 = kbd_64.transform(X_input) + + assert_allclose_dense_sparse(Xt_32, Xt_64) + + +def test_kbinsdiscretizer_subsample_default(): + # Since the size of X is small (< 2e5), subsampling will not take place. + X = np.array([-2, 1.5, -4, -1]).reshape(-1, 1) + kbd_default = KBinsDiscretizer( + n_bins=10, + encode="ordinal", + strategy="quantile", + quantile_method="averaged_inverted_cdf", + ) + kbd_default.fit(X) + + kbd_without_subsampling = clone(kbd_default) + kbd_without_subsampling.set_params(subsample=None) + kbd_without_subsampling.fit(X) + + for bin_kbd_default, bin_kbd_with_subsampling in zip( + kbd_default.bin_edges_[0], kbd_without_subsampling.bin_edges_[0] + ): + np.testing.assert_allclose(bin_kbd_default, bin_kbd_with_subsampling) + assert kbd_default.bin_edges_.shape == kbd_without_subsampling.bin_edges_.shape + + +@pytest.mark.parametrize( + "encode, expected_names", + [ + ( + "onehot", + [ + f"feat{col_id}_{float(bin_id)}" + for col_id in range(3) + for bin_id in range(4) + ], + ), + ( + "onehot-dense", + [ + f"feat{col_id}_{float(bin_id)}" + for col_id in range(3) + for bin_id in range(4) + ], + ), + ("ordinal", [f"feat{col_id}" for col_id in range(3)]), + ], +) +def test_kbinsdiscrtizer_get_feature_names_out(encode, expected_names): + """Check get_feature_names_out for different settings. 
+ Non-regression test for #22731 + """ + X = [[-2, 1, -4], [-1, 2, -3], [0, 3, -2], [1, 4, -1]] + + kbd = KBinsDiscretizer( + n_bins=4, encode=encode, quantile_method="averaged_inverted_cdf" + ).fit(X) + Xt = kbd.transform(X) + + input_features = [f"feat{i}" for i in range(3)] + output_names = kbd.get_feature_names_out(input_features) + assert Xt.shape[1] == output_names.shape[0] + + assert_array_equal(output_names, expected_names) + + +@pytest.mark.parametrize("strategy", ["uniform", "kmeans", "quantile"]) +def test_kbinsdiscretizer_subsample(strategy, global_random_seed): + # Check that the bin edges are almost the same when subsampling is used. + X = np.random.RandomState(global_random_seed).random_sample((100000, 1)) + 1 + + if strategy == "quantile": + kbd_subsampling = KBinsDiscretizer( + strategy=strategy, + subsample=50000, + random_state=global_random_seed, + quantile_method="averaged_inverted_cdf", + ) + else: + kbd_subsampling = KBinsDiscretizer( + strategy=strategy, subsample=50000, random_state=global_random_seed + ) + kbd_subsampling.fit(X) + + kbd_no_subsampling = clone(kbd_subsampling) + kbd_no_subsampling.set_params(subsample=None) + kbd_no_subsampling.fit(X) + + # We use a large tolerance because we can't expect the bin edges to be exactly the + # same when subsampling is used. + assert_allclose( + kbd_subsampling.bin_edges_[0], kbd_no_subsampling.bin_edges_[0], rtol=1e-2 + ) + + +def test_quantile_method_future_warnings(): + X = [[-2, 1, -4], [-1, 2, -3], [0, 3, -2], [1, 4, -1]] + with pytest.warns( + FutureWarning, + match="The current default behavior, quantile_method='linear', will be " + "changed to quantile_method='averaged_inverted_cdf' in " + "scikit-learn version 1.9 to naturally support sample weight " + "equivalence properties by default. Pass " + "quantile_method='averaged_inverted_cdf' explicitly to silence this " + "warning.", + ): + KBinsDiscretizer(strategy="quantile").fit(X) + + +def test_invalid_quantile_method_with_sample_weight(): + X = [[-2, 1, -4], [-1, 2, -3], [0, 3, -2], [1, 4, -1]] + expected_msg = ( + "When fitting with strategy='quantile' and sample weights, " + "quantile_method should either be set to 'averaged_inverted_cdf' or " + "'inverted_cdf', got quantile_method='linear' instead." 
+ ) + with pytest.raises( + ValueError, + match=expected_msg, + ): + KBinsDiscretizer(strategy="quantile", quantile_method="linear").fit( + X, + sample_weight=[1, 1, 2, 2], + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_encoders.py b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_encoders.py new file mode 100644 index 0000000000000000000000000000000000000000..f843a4f16d17074c7f9414bbc4733a8cd49a7ac8 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_encoders.py @@ -0,0 +1,2367 @@ +import re +import warnings + +import numpy as np +import pytest +from scipy import sparse + +from sklearn.exceptions import NotFittedError +from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder +from sklearn.utils._missing import is_scalar_nan +from sklearn.utils._testing import ( + _convert_container, + assert_allclose, + assert_array_equal, +) +from sklearn.utils.fixes import CSR_CONTAINERS + + +def test_one_hot_encoder_sparse_dense(): + # check that sparse and dense will give the same results + + X = np.array([[3, 2, 1], [0, 1, 1]]) + enc_sparse = OneHotEncoder() + enc_dense = OneHotEncoder(sparse_output=False) + + X_trans_sparse = enc_sparse.fit_transform(X) + X_trans_dense = enc_dense.fit_transform(X) + + assert X_trans_sparse.shape == (2, 5) + assert X_trans_dense.shape == (2, 5) + + assert sparse.issparse(X_trans_sparse) + assert not sparse.issparse(X_trans_dense) + + # check outcome + assert_array_equal( + X_trans_sparse.toarray(), [[0.0, 1.0, 0.0, 1.0, 1.0], [1.0, 0.0, 1.0, 0.0, 1.0]] + ) + assert_array_equal(X_trans_sparse.toarray(), X_trans_dense) + + +@pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist", "warn"]) +def test_one_hot_encoder_handle_unknown(handle_unknown): + X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]]) + X2 = np.array([[4, 1, 1]]) + + # Test that one hot encoder raises error for unknown features + # present during transform. 
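+    # Editor's note -- illustrative clarification, not part of the upstream
+    # test: after fitting, the 7 output columns are
+    #     [x0_0, x0_1, x1_0, x1_2, x2_1, x2_2, x2_3].
+    # For X2 = [[4, 1, 1]] the first two values are unknown categories, so in
+    # the non-error branches below their blocks stay all-zero and only x2_1 is
+    # set, giving the expected row [0, 0, 0, 0, 1, 0, 0].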
+ oh = OneHotEncoder(handle_unknown="error") + oh.fit(X) + with pytest.raises(ValueError, match="Found unknown categories"): + oh.transform(X2) + + # Test the ignore option, ignores unknown features (giving all 0's) + oh = OneHotEncoder(handle_unknown=handle_unknown) + oh.fit(X) + X2_passed = X2.copy() + assert_array_equal( + oh.transform(X2_passed).toarray(), + np.array([[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]]), + ) + # ensure transformed data was not modified in place + assert_allclose(X2, X2_passed) + + +@pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist", "warn"]) +def test_one_hot_encoder_handle_unknown_strings(handle_unknown): + X = np.array(["11111111", "22", "333", "4444"]).reshape((-1, 1)) + X2 = np.array(["55555", "22"]).reshape((-1, 1)) + # Non Regression test for the issue #12470 + # Test the ignore option, when categories are numpy string dtype + # particularly when the known category strings are larger + # than the unknown category strings + oh = OneHotEncoder(handle_unknown=handle_unknown) + oh.fit(X) + X2_passed = X2.copy() + assert_array_equal( + oh.transform(X2_passed).toarray(), + np.array([[0.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0]]), + ) + # ensure transformed data was not modified in place + assert_array_equal(X2, X2_passed) + + +@pytest.mark.parametrize("output_dtype", [np.int32, np.float32, np.float64]) +@pytest.mark.parametrize("input_dtype", [np.int32, np.float32, np.float64]) +def test_one_hot_encoder_dtype(input_dtype, output_dtype): + X = np.asarray([[0, 1]], dtype=input_dtype).T + X_expected = np.asarray([[1, 0], [0, 1]], dtype=output_dtype) + + oh = OneHotEncoder(categories="auto", dtype=output_dtype) + assert_array_equal(oh.fit_transform(X).toarray(), X_expected) + assert_array_equal(oh.fit(X).transform(X).toarray(), X_expected) + + oh = OneHotEncoder(categories="auto", dtype=output_dtype, sparse_output=False) + assert_array_equal(oh.fit_transform(X), X_expected) + assert_array_equal(oh.fit(X).transform(X), X_expected) + + +@pytest.mark.parametrize("output_dtype", [np.int32, np.float32, np.float64]) +def test_one_hot_encoder_dtype_pandas(output_dtype): + pd = pytest.importorskip("pandas") + + X_df = pd.DataFrame({"A": ["a", "b"], "B": [1, 2]}) + X_expected = np.array([[1, 0, 1, 0], [0, 1, 0, 1]], dtype=output_dtype) + + oh = OneHotEncoder(dtype=output_dtype) + assert_array_equal(oh.fit_transform(X_df).toarray(), X_expected) + assert_array_equal(oh.fit(X_df).transform(X_df).toarray(), X_expected) + + oh = OneHotEncoder(dtype=output_dtype, sparse_output=False) + assert_array_equal(oh.fit_transform(X_df), X_expected) + assert_array_equal(oh.fit(X_df).transform(X_df), X_expected) + + +def test_one_hot_encoder_feature_names(): + enc = OneHotEncoder() + X = [ + ["Male", 1, "girl", 2, 3], + ["Female", 41, "girl", 1, 10], + ["Male", 51, "boy", 12, 3], + ["Male", 91, "girl", 21, 30], + ] + + enc.fit(X) + feature_names = enc.get_feature_names_out() + + assert_array_equal( + [ + "x0_Female", + "x0_Male", + "x1_1", + "x1_41", + "x1_51", + "x1_91", + "x2_boy", + "x2_girl", + "x3_1", + "x3_2", + "x3_12", + "x3_21", + "x4_3", + "x4_10", + "x4_30", + ], + feature_names, + ) + + feature_names2 = enc.get_feature_names_out(["one", "two", "three", "four", "five"]) + + assert_array_equal( + [ + "one_Female", + "one_Male", + "two_1", + "two_41", + "two_51", + "two_91", + "three_boy", + "three_girl", + "four_1", + "four_2", + "four_12", + "four_21", + "five_3", + "five_10", + "five_30", + ], + feature_names2, + ) + + with pytest.raises(ValueError, 
match="input_features should have length"): + enc.get_feature_names_out(["one", "two"]) + + +def test_one_hot_encoder_feature_names_unicode(): + enc = OneHotEncoder() + X = np.array([["c❤t1", "dat2"]], dtype=object).T + enc.fit(X) + feature_names = enc.get_feature_names_out() + assert_array_equal(["x0_c❤t1", "x0_dat2"], feature_names) + feature_names = enc.get_feature_names_out(input_features=["n👍me"]) + assert_array_equal(["n👍me_c❤t1", "n👍me_dat2"], feature_names) + + +def test_one_hot_encoder_custom_feature_name_combiner(): + """Check the behaviour of `feature_name_combiner` as a callable.""" + + def name_combiner(feature, category): + return feature + "_" + repr(category) + + enc = OneHotEncoder(feature_name_combiner=name_combiner) + X = np.array([["None", None]], dtype=object).T + enc.fit(X) + feature_names = enc.get_feature_names_out() + assert_array_equal(["x0_'None'", "x0_None"], feature_names) + feature_names = enc.get_feature_names_out(input_features=["a"]) + assert_array_equal(["a_'None'", "a_None"], feature_names) + + def wrong_combiner(feature, category): + # we should be returning a Python string + return 0 + + enc = OneHotEncoder(feature_name_combiner=wrong_combiner).fit(X) + err_msg = ( + "When `feature_name_combiner` is a callable, it should return a Python string." + ) + with pytest.raises(TypeError, match=err_msg): + enc.get_feature_names_out() + + +def test_one_hot_encoder_set_params(): + X = np.array([[1, 2]]).T + oh = OneHotEncoder() + # set params on not yet fitted object + oh.set_params(categories=[[0, 1, 2, 3]]) + assert oh.get_params()["categories"] == [[0, 1, 2, 3]] + assert oh.fit_transform(X).toarray().shape == (2, 4) + # set params on already fitted object + oh.set_params(categories=[[0, 1, 2, 3, 4]]) + assert oh.fit_transform(X).toarray().shape == (2, 5) + + +def check_categorical_onehot(X): + enc = OneHotEncoder(categories="auto") + Xtr1 = enc.fit_transform(X) + + enc = OneHotEncoder(categories="auto", sparse_output=False) + Xtr2 = enc.fit_transform(X) + + assert_allclose(Xtr1.toarray(), Xtr2) + + assert sparse.issparse(Xtr1) and Xtr1.format == "csr" + return Xtr1.toarray() + + +@pytest.mark.parametrize( + "X", + [ + [["def", 1, 55], ["abc", 2, 55]], + np.array([[10, 1, 55], [5, 2, 55]]), + np.array([["b", "A", "cat"], ["a", "B", "cat"]], dtype=object), + np.array([["b", 1, "cat"], ["a", np.nan, "cat"]], dtype=object), + np.array([["b", 1, "cat"], ["a", float("nan"), "cat"]], dtype=object), + np.array([[None, 1, "cat"], ["a", 2, "cat"]], dtype=object), + np.array([[None, 1, None], ["a", np.nan, None]], dtype=object), + np.array([[None, 1, None], ["a", float("nan"), None]], dtype=object), + ], + ids=[ + "mixed", + "numeric", + "object", + "mixed-nan", + "mixed-float-nan", + "mixed-None", + "mixed-None-nan", + "mixed-None-float-nan", + ], +) +def test_one_hot_encoder(X): + Xtr = check_categorical_onehot(np.array(X)[:, [0]]) + assert_allclose(Xtr, [[0, 1], [1, 0]]) + + Xtr = check_categorical_onehot(np.array(X)[:, [0, 1]]) + assert_allclose(Xtr, [[0, 1, 1, 0], [1, 0, 0, 1]]) + + Xtr = OneHotEncoder(categories="auto").fit_transform(X) + assert_allclose(Xtr.toarray(), [[0, 1, 1, 0, 1], [1, 0, 0, 1, 1]]) + + +@pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist", "warn"]) +@pytest.mark.parametrize("sparse_", [False, True]) +@pytest.mark.parametrize("drop", [None, "first"]) +def test_one_hot_encoder_inverse(handle_unknown, sparse_, drop): + X = [["abc", 2, 55], ["def", 1, 55], ["abc", 3, 55]] + enc = OneHotEncoder(sparse_output=sparse_, 
drop=drop)
+    X_tr = enc.fit_transform(X)
+    exp = np.array(X, dtype=object)
+    assert_array_equal(enc.inverse_transform(X_tr), exp)
+
+    X = [[2, 55], [1, 55], [3, 55]]
+    enc = OneHotEncoder(sparse_output=sparse_, categories="auto", drop=drop)
+    X_tr = enc.fit_transform(X)
+    exp = np.array(X)
+    assert_array_equal(enc.inverse_transform(X_tr), exp)
+
+    if drop is None:
+        # with unknown categories
+        # drop is incompatible with handle_unknown=ignore
+        X = [["abc", 2, 55], ["def", 1, 55], ["abc", 3, 55]]
+        enc = OneHotEncoder(
+            sparse_output=sparse_,
+            handle_unknown=handle_unknown,
+            categories=[["abc", "def"], [1, 2], [54, 55, 56]],
+        )
+        X_tr = enc.fit_transform(X)
+        exp = np.array(X, dtype=object)
+        exp[2, 1] = None
+        assert_array_equal(enc.inverse_transform(X_tr), exp)
+
+        # with an otherwise numerical output, still object if unknown
+        X = [[2, 55], [1, 55], [3, 55]]
+        enc = OneHotEncoder(
+            sparse_output=sparse_,
+            categories=[[1, 2], [54, 56]],
+            handle_unknown=handle_unknown,
+        )
+        X_tr = enc.fit_transform(X)
+        exp = np.array(X, dtype=object)
+        exp[2, 0] = None
+        exp[:, 1] = None
+        assert_array_equal(enc.inverse_transform(X_tr), exp)
+
+    # incorrect shape raises
+    X_tr = np.array([[0, 1, 1], [1, 0, 1]])
+    msg = re.escape("Shape of the passed X data is not correct")
+    with pytest.raises(ValueError, match=msg):
+        enc.inverse_transform(X_tr)
+
+
+@pytest.mark.parametrize("sparse_", [False, True])
+@pytest.mark.parametrize(
+    "X, X_trans",
+    [
+        ([[2, 55], [1, 55], [2, 55]], [[0, 1, 1], [0, 0, 0], [0, 1, 1]]),
+        (
+            [["one", "a"], ["two", "a"], ["three", "b"], ["two", "a"]],
+            [[0, 0, 0, 0, 0], [0, 0, 0, 0, 1], [0, 1, 0, 0, 0]],
+        ),
+    ],
+)
+def test_one_hot_encoder_inverse_transform_raise_error_with_unknown(
+    X, X_trans, sparse_
+):
+    """Check that `inverse_transform` raises an error with unknown samples, no
+    dropped feature, and `handle_unknown="error"`.
+    Non-regression test for:
+    https://github.com/scikit-learn/scikit-learn/issues/14934
+    """
+    enc = OneHotEncoder(sparse_output=sparse_).fit(X)
+    msg = (
+        r"Samples \[(\d )*\d\] can not be inverted when drop=None and "
+        r"handle_unknown='error' because they contain all zeros"
+    )
+
+    if sparse_:
+        # emulate the sparse output produced by OneHotEncoder with sparse_output=True.
+ X_trans = _convert_container(X_trans, "sparse") + with pytest.raises(ValueError, match=msg): + enc.inverse_transform(X_trans) + + +def test_one_hot_encoder_inverse_if_binary(): + X = np.array([["Male", 1], ["Female", 3], ["Female", 2]], dtype=object) + ohe = OneHotEncoder(drop="if_binary", sparse_output=False) + X_tr = ohe.fit_transform(X) + assert_array_equal(ohe.inverse_transform(X_tr), X) + + +@pytest.mark.parametrize("drop", ["if_binary", "first", None]) +@pytest.mark.parametrize("reset_drop", ["if_binary", "first", None]) +def test_one_hot_encoder_drop_reset(drop, reset_drop): + # check that resetting drop option without refitting does not throw an error + X = np.array([["Male", 1], ["Female", 3], ["Female", 2]], dtype=object) + ohe = OneHotEncoder(drop=drop, sparse_output=False) + ohe.fit(X) + X_tr = ohe.transform(X) + feature_names = ohe.get_feature_names_out() + ohe.set_params(drop=reset_drop) + assert_array_equal(ohe.inverse_transform(X_tr), X) + assert_allclose(ohe.transform(X), X_tr) + assert_array_equal(ohe.get_feature_names_out(), feature_names) + + +@pytest.mark.parametrize("method", ["fit", "fit_transform"]) +@pytest.mark.parametrize("X", [[1, 2], np.array([3.0, 4.0])]) +def test_X_is_not_1D(X, method): + oh = OneHotEncoder() + + msg = "Expected 2D array, got 1D array instead" + with pytest.raises(ValueError, match=msg): + getattr(oh, method)(X) + + +@pytest.mark.parametrize("method", ["fit", "fit_transform"]) +def test_X_is_not_1D_pandas(method): + pd = pytest.importorskip("pandas") + X = pd.Series([6, 3, 4, 6]) + oh = OneHotEncoder() + + msg = f"Expected a 2-dimensional container but got {type(X)} instead." + with pytest.raises(ValueError, match=msg): + getattr(oh, method)(X) + + +@pytest.mark.parametrize( + "X, cat_exp, cat_dtype", + [ + ([["abc", 55], ["def", 55]], [["abc", "def"], [55]], np.object_), + (np.array([[1, 2], [3, 2]]), [[1, 3], [2]], np.integer), + ( + np.array([["A", "cat"], ["B", "cat"]], dtype=object), + [["A", "B"], ["cat"]], + np.object_, + ), + (np.array([["A", "cat"], ["B", "cat"]]), [["A", "B"], ["cat"]], np.str_), + (np.array([[1, 2], [np.nan, 2]]), [[1, np.nan], [2]], np.float64), + ( + np.array([["A", np.nan], [None, np.nan]], dtype=object), + [["A", None], [np.nan]], + np.object_, + ), + ( + np.array([["A", float("nan")], [None, float("nan")]], dtype=object), + [["A", None], [float("nan")]], + np.object_, + ), + ], + ids=[ + "mixed", + "numeric", + "object", + "string", + "missing-float", + "missing-np.nan-object", + "missing-float-nan-object", + ], +) +def test_one_hot_encoder_categories(X, cat_exp, cat_dtype): + # order of categories should not depend on order of samples + for Xi in [X, X[::-1]]: + enc = OneHotEncoder(categories="auto") + enc.fit(Xi) + # assert enc.categories == 'auto' + assert isinstance(enc.categories_, list) + for res, exp in zip(enc.categories_, cat_exp): + res_list = res.tolist() + if is_scalar_nan(exp[-1]): + assert is_scalar_nan(res_list[-1]) + assert res_list[:-1] == exp[:-1] + else: + assert res.tolist() == exp + assert np.issubdtype(res.dtype, cat_dtype) + + +@pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist", "warn"]) +@pytest.mark.parametrize( + "X, X2, cats, cat_dtype", + [ + ( + np.array([["a", "b"]], dtype=object).T, + np.array([["a", "d"]], dtype=object).T, + [["a", "b", "c"]], + np.object_, + ), + ( + np.array([[1, 2]], dtype="int64").T, + np.array([[1, 4]], dtype="int64").T, + [[1, 2, 3]], + np.int64, + ), + ( + np.array([["a", "b"]], dtype=object).T, + np.array([["a", "d"]], 
dtype=object).T, + [np.array(["a", "b", "c"])], + np.object_, + ), + ( + np.array([[None, "a"]], dtype=object).T, + np.array([[None, "b"]], dtype=object).T, + [[None, "a", "z"]], + object, + ), + ( + np.array([["a", "b"]], dtype=object).T, + np.array([["a", np.nan]], dtype=object).T, + [["a", "b", "z"]], + object, + ), + ( + np.array([["a", None]], dtype=object).T, + np.array([["a", np.nan]], dtype=object).T, + [["a", None, "z"]], + object, + ), + ], + ids=[ + "object", + "numeric", + "object-string", + "object-string-none", + "object-string-nan", + "object-None-and-nan", + ], +) +def test_one_hot_encoder_specified_categories(X, X2, cats, cat_dtype, handle_unknown): + enc = OneHotEncoder(categories=cats) + exp = np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]) + assert_array_equal(enc.fit_transform(X).toarray(), exp) + assert list(enc.categories[0]) == list(cats[0]) + assert enc.categories_[0].tolist() == list(cats[0]) + # manually specified categories should have same dtype as + # the data when coerced from lists + assert enc.categories_[0].dtype == cat_dtype + + # when specifying categories manually, unknown categories should already + # raise when fitting + enc = OneHotEncoder(categories=cats) + with pytest.raises(ValueError, match="Found unknown categories"): + enc.fit(X2) + enc = OneHotEncoder(categories=cats, handle_unknown=handle_unknown) + exp = np.array([[1.0, 0.0, 0.0], [0.0, 0.0, 0.0]]) + assert_array_equal(enc.fit(X2).transform(X2).toarray(), exp) + + +def test_one_hot_encoder_unsorted_categories(): + X = np.array([["a", "b"]], dtype=object).T + + enc = OneHotEncoder(categories=[["b", "a", "c"]]) + exp = np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 0.0]]) + assert_array_equal(enc.fit(X).transform(X).toarray(), exp) + assert_array_equal(enc.fit_transform(X).toarray(), exp) + assert enc.categories_[0].tolist() == ["b", "a", "c"] + assert np.issubdtype(enc.categories_[0].dtype, np.object_) + + # unsorted passed categories still raise for numerical values + X = np.array([[1, 2]]).T + enc = OneHotEncoder(categories=[[2, 1, 3]]) + msg = "Unsorted categories are not supported" + with pytest.raises(ValueError, match=msg): + enc.fit_transform(X) + + +@pytest.mark.parametrize("Encoder", [OneHotEncoder, OrdinalEncoder]) +def test_encoder_nan_ending_specified_categories(Encoder): + """Test encoder for specified categories that nan is at the end. 
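+    Here np.nan is placed in the middle of the user-provided categories, so
+    fitting is expected to raise a ValueError.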
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/27088 + """ + cats = [np.array([0, np.nan, 1])] + enc = Encoder(categories=cats) + X = np.array([[0, 1]], dtype=object).T + with pytest.raises(ValueError, match="Nan should be the last element"): + enc.fit(X) + + +def test_one_hot_encoder_specified_categories_mixed_columns(): + # multiple columns + X = np.array([["a", "b"], [0, 2]], dtype=object).T + enc = OneHotEncoder(categories=[["a", "b", "c"], [0, 1, 2]]) + exp = np.array([[1.0, 0.0, 0.0, 1.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0, 0.0, 1.0]]) + assert_array_equal(enc.fit_transform(X).toarray(), exp) + assert enc.categories_[0].tolist() == ["a", "b", "c"] + assert np.issubdtype(enc.categories_[0].dtype, np.object_) + assert enc.categories_[1].tolist() == [0, 1, 2] + # integer categories but from object dtype data + assert np.issubdtype(enc.categories_[1].dtype, np.object_) + + +def test_one_hot_encoder_pandas(): + pd = pytest.importorskip("pandas") + + X_df = pd.DataFrame({"A": ["a", "b"], "B": [1, 2]}) + + Xtr = check_categorical_onehot(X_df) + assert_allclose(Xtr, [[1, 0, 1, 0], [0, 1, 0, 1]]) + + +@pytest.mark.parametrize( + "drop, expected_names", + [ + ("first", ["x0_c", "x2_b"]), + ("if_binary", ["x0_c", "x1_2", "x2_b"]), + (["c", 2, "b"], ["x0_b", "x2_a"]), + ], + ids=["first", "binary", "manual"], +) +def test_one_hot_encoder_feature_names_drop(drop, expected_names): + X = [["c", 2, "a"], ["b", 2, "b"]] + + ohe = OneHotEncoder(drop=drop) + ohe.fit(X) + feature_names = ohe.get_feature_names_out() + assert_array_equal(expected_names, feature_names) + + +def test_one_hot_encoder_drop_equals_if_binary(): + # Canonical case + X = [[10, "yes"], [20, "no"], [30, "yes"]] + expected = np.array( + [[1.0, 0.0, 0.0, 1.0], [0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 1.0, 1.0]] + ) + expected_drop_idx = np.array([None, 0]) + + ohe = OneHotEncoder(drop="if_binary", sparse_output=False) + result = ohe.fit_transform(X) + assert_array_equal(ohe.drop_idx_, expected_drop_idx) + assert_allclose(result, expected) + + # with only one cat, the behaviour is equivalent to drop=None + X = [["true", "a"], ["false", "a"], ["false", "a"]] + expected = np.array([[1.0, 1.0], [0.0, 1.0], [0.0, 1.0]]) + expected_drop_idx = np.array([0, None]) + + ohe = OneHotEncoder(drop="if_binary", sparse_output=False) + result = ohe.fit_transform(X) + assert_array_equal(ohe.drop_idx_, expected_drop_idx) + assert_allclose(result, expected) + + +@pytest.mark.parametrize( + "X", + [ + [["abc", 2, 55], ["def", 1, 55]], + np.array([[10, 2, 55], [20, 1, 55]]), + np.array([["a", "B", "cat"], ["b", "A", "cat"]], dtype=object), + ], + ids=["mixed", "numeric", "object"], +) +def test_ordinal_encoder(X): + enc = OrdinalEncoder() + exp = np.array([[0, 1, 0], [1, 0, 0]], dtype="int64") + assert_array_equal(enc.fit_transform(X), exp.astype("float64")) + enc = OrdinalEncoder(dtype="int64") + assert_array_equal(enc.fit_transform(X), exp) + + +@pytest.mark.parametrize( + "X, X2, cats, cat_dtype", + [ + ( + np.array([["a", "b"]], dtype=object).T, + np.array([["a", "d"]], dtype=object).T, + [["a", "b", "c"]], + np.object_, + ), + ( + np.array([[1, 2]], dtype="int64").T, + np.array([[1, 4]], dtype="int64").T, + [[1, 2, 3]], + np.int64, + ), + ( + np.array([["a", "b"]], dtype=object).T, + np.array([["a", "d"]], dtype=object).T, + [np.array(["a", "b", "c"])], + np.object_, + ), + ], + ids=["object", "numeric", "object-string-cat"], +) +def test_ordinal_encoder_specified_categories(X, X2, cats, cat_dtype): + enc = 
OrdinalEncoder(categories=cats) + exp = np.array([[0.0], [1.0]]) + assert_array_equal(enc.fit_transform(X), exp) + assert list(enc.categories[0]) == list(cats[0]) + assert enc.categories_[0].tolist() == list(cats[0]) + # manually specified categories should have same dtype as + # the data when coerced from lists + assert enc.categories_[0].dtype == cat_dtype + + # when specifying categories manually, unknown categories should already + # raise when fitting + enc = OrdinalEncoder(categories=cats) + with pytest.raises(ValueError, match="Found unknown categories"): + enc.fit(X2) + + +def test_ordinal_encoder_inverse(): + X = [["abc", 2, 55], ["def", 1, 55]] + enc = OrdinalEncoder() + X_tr = enc.fit_transform(X) + exp = np.array(X, dtype=object) + assert_array_equal(enc.inverse_transform(X_tr), exp) + + # incorrect shape raises + X_tr = np.array([[0, 1, 1, 2], [1, 0, 1, 0]]) + msg = re.escape("Shape of the passed X data is not correct") + with pytest.raises(ValueError, match=msg): + enc.inverse_transform(X_tr) + + +def test_ordinal_encoder_handle_unknowns_string(): + enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-2) + X_fit = np.array([["a", "x"], ["b", "y"], ["c", "z"]], dtype=object) + X_trans = np.array([["c", "xy"], ["bla", "y"], ["a", "x"]], dtype=object) + enc.fit(X_fit) + + X_trans_enc = enc.transform(X_trans) + exp = np.array([[2, -2], [-2, 1], [0, 0]], dtype="int64") + assert_array_equal(X_trans_enc, exp) + + X_trans_inv = enc.inverse_transform(X_trans_enc) + inv_exp = np.array([["c", None], [None, "y"], ["a", "x"]], dtype=object) + assert_array_equal(X_trans_inv, inv_exp) + + +@pytest.mark.parametrize("dtype", [float, int]) +def test_ordinal_encoder_handle_unknowns_numeric(dtype): + enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-999) + X_fit = np.array([[1, 7], [2, 8], [3, 9]], dtype=dtype) + X_trans = np.array([[3, 12], [23, 8], [1, 7]], dtype=dtype) + enc.fit(X_fit) + + X_trans_enc = enc.transform(X_trans) + exp = np.array([[2, -999], [-999, 1], [0, 0]], dtype="int64") + assert_array_equal(X_trans_enc, exp) + + X_trans_inv = enc.inverse_transform(X_trans_enc) + inv_exp = np.array([[3, None], [None, 8], [1, 7]], dtype=object) + assert_array_equal(X_trans_inv, inv_exp) + + +def test_ordinal_encoder_handle_unknowns_nan(): + # Make sure unknown_value=np.nan properly works + + enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan) + + X_fit = np.array([[1], [2], [3]]) + enc.fit(X_fit) + X_trans = enc.transform([[1], [2], [4]]) + assert_array_equal(X_trans, [[0], [1], [np.nan]]) + + +def test_ordinal_encoder_handle_unknowns_nan_non_float_dtype(): + # Make sure an error is raised when unknown_value=np.nan and the dtype + # isn't a float dtype + enc = OrdinalEncoder( + handle_unknown="use_encoded_value", unknown_value=np.nan, dtype=int + ) + + X_fit = np.array([[1], [2], [3]]) + with pytest.raises(ValueError, match="dtype parameter should be a float dtype"): + enc.fit(X_fit) + + +def test_ordinal_encoder_raise_categories_shape(): + X = np.array([["Low", "Medium", "High", "Medium", "Low"]], dtype=object).T + cats = ["Low", "Medium", "High"] + enc = OrdinalEncoder(categories=cats) + msg = "Shape mismatch: if categories is an array," + + with pytest.raises(ValueError, match=msg): + enc.fit(X) + + +def test_encoder_dtypes(): + # check that dtypes are preserved when determining categories + enc = OneHotEncoder(categories="auto") + exp = np.array([[1.0, 0.0, 1.0, 0.0], [0.0, 1.0, 0.0, 1.0]], dtype="float64") + + for X in [ 
+ np.array([[1, 2], [3, 4]], dtype="int64"), + np.array([[1, 2], [3, 4]], dtype="float64"), + np.array([["a", "b"], ["c", "d"]]), # str dtype + np.array([[b"a", b"b"], [b"c", b"d"]]), # bytes dtype + np.array([[1, "a"], [3, "b"]], dtype="object"), + ]: + enc.fit(X) + assert all([enc.categories_[i].dtype == X.dtype for i in range(2)]) + assert_array_equal(enc.transform(X).toarray(), exp) + + X = [[1, 2], [3, 4]] + enc.fit(X) + assert all([np.issubdtype(enc.categories_[i].dtype, np.integer) for i in range(2)]) + assert_array_equal(enc.transform(X).toarray(), exp) + + X = [[1, "a"], [3, "b"]] + enc.fit(X) + assert all([enc.categories_[i].dtype == "object" for i in range(2)]) + assert_array_equal(enc.transform(X).toarray(), exp) + + +def test_encoder_dtypes_pandas(): + # check dtype (similar to test_categorical_encoder_dtypes for dataframes) + pd = pytest.importorskip("pandas") + + enc = OneHotEncoder(categories="auto") + exp = np.array( + [[1.0, 0.0, 1.0, 0.0, 1.0, 0.0], [0.0, 1.0, 0.0, 1.0, 0.0, 1.0]], + dtype="float64", + ) + + X = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}, dtype="int64") + enc.fit(X) + assert all([enc.categories_[i].dtype == "int64" for i in range(2)]) + assert_array_equal(enc.transform(X).toarray(), exp) + + X = pd.DataFrame({"A": [1, 2], "B": ["a", "b"], "C": [3.0, 4.0]}) + expected_cat_type = ["int64", "object", "float64"] + enc.fit(X) + assert all([enc.categories_[i].dtype == expected_cat_type[i] for i in range(3)]) + assert_array_equal(enc.transform(X).toarray(), exp) + + +def test_one_hot_encoder_warning(): + enc = OneHotEncoder() + X = [["Male", 1], ["Female", 3]] + with warnings.catch_warnings(): + warnings.simplefilter("error") + enc.fit_transform(X) + + +@pytest.mark.parametrize("drop", ["if_binary", "first"]) +def test_ohe_handle_unknown_warn(drop): + """Check handle_unknown='warn' works correctly.""" + + X = [["a", 0], ["b", 2], ["b", 1]] + + ohe = OneHotEncoder( + drop=drop, + sparse_output=False, + handle_unknown="warn", + categories=[["b", "a"], [1, 2]], + ) + ohe.fit(X) + + X_test = [["c", 1]] + X_expected = np.array([[0, 0]]) + + warn_msg = ( + r"Found unknown categories in columns \[0\] during transform. 
" + r"These unknown categories will be encoded as all zeros" + ) + with pytest.warns(UserWarning, match=warn_msg): + X_trans = ohe.transform(X_test) + assert_allclose(X_trans, X_expected) + + +@pytest.mark.parametrize("missing_value", [np.nan, None, float("nan")]) +def test_one_hot_encoder_drop_manual(missing_value): + cats_to_drop = ["def", 12, 3, 56, missing_value] + enc = OneHotEncoder(drop=cats_to_drop) + X = [ + ["abc", 12, 2, 55, "a"], + ["def", 12, 1, 55, "a"], + ["def", 12, 3, 56, missing_value], + ] + trans = enc.fit_transform(X).toarray() + exp = [[1, 0, 1, 1, 1], [0, 1, 0, 1, 1], [0, 0, 0, 0, 0]] + assert_array_equal(trans, exp) + assert enc.drop is cats_to_drop + + dropped_cats = [ + cat[feature] for cat, feature in zip(enc.categories_, enc.drop_idx_) + ] + X_inv_trans = enc.inverse_transform(trans) + X_array = np.array(X, dtype=object) + + # last value is np.nan + if is_scalar_nan(cats_to_drop[-1]): + assert_array_equal(dropped_cats[:-1], cats_to_drop[:-1]) + assert is_scalar_nan(dropped_cats[-1]) + assert is_scalar_nan(cats_to_drop[-1]) + # do not include the last column which includes missing values + assert_array_equal(X_array[:, :-1], X_inv_trans[:, :-1]) + + # check last column is the missing value + assert_array_equal(X_array[-1, :-1], X_inv_trans[-1, :-1]) + assert is_scalar_nan(X_array[-1, -1]) + assert is_scalar_nan(X_inv_trans[-1, -1]) + else: + assert_array_equal(dropped_cats, cats_to_drop) + assert_array_equal(X_array, X_inv_trans) + + +@pytest.mark.parametrize("drop", [["abc", 3], ["abc", 3, 41, "a"]]) +def test_invalid_drop_length(drop): + enc = OneHotEncoder(drop=drop) + err_msg = "`drop` should have length equal to the number" + with pytest.raises(ValueError, match=err_msg): + enc.fit([["abc", 2, 55], ["def", 1, 55], ["def", 3, 59]]) + + +@pytest.mark.parametrize("density", [True, False], ids=["sparse", "dense"]) +@pytest.mark.parametrize("drop", ["first", ["a", 2, "b"]], ids=["first", "manual"]) +def test_categories(density, drop): + ohe_base = OneHotEncoder(sparse_output=density) + ohe_test = OneHotEncoder(sparse_output=density, drop=drop) + X = [["c", 1, "a"], ["a", 2, "b"]] + ohe_base.fit(X) + ohe_test.fit(X) + assert_array_equal(ohe_base.categories_, ohe_test.categories_) + if drop == "first": + assert_array_equal(ohe_test.drop_idx_, 0) + else: + for drop_cat, drop_idx, cat_list in zip( + drop, ohe_test.drop_idx_, ohe_test.categories_ + ): + assert cat_list[int(drop_idx)] == drop_cat + assert isinstance(ohe_test.drop_idx_, np.ndarray) + assert ohe_test.drop_idx_.dtype == object + + +@pytest.mark.parametrize("Encoder", [OneHotEncoder, OrdinalEncoder]) +def test_encoders_has_categorical_tags(Encoder): + assert Encoder().__sklearn_tags__().input_tags.categorical + + +@pytest.mark.parametrize( + "kwargs", + [ + {"max_categories": 2}, + {"min_frequency": 11}, + {"min_frequency": 0.29}, + {"max_categories": 2, "min_frequency": 6}, + {"max_categories": 4, "min_frequency": 12}, + ], +) +@pytest.mark.parametrize("categories", ["auto", [["a", "b", "c", "d"]]]) +def test_ohe_infrequent_two_levels(kwargs, categories): + """Test that different parameters for combine 'a', 'c', and 'd' into + the infrequent category works as expected.""" + + X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T + ohe = OneHotEncoder( + categories=categories, + handle_unknown="infrequent_if_exist", + sparse_output=False, + **kwargs, + ).fit(X_train) + assert_array_equal(ohe.infrequent_categories_, [["a", "c", "d"]]) + + X_test = [["b"], ["a"], ["c"], ["d"], ["e"]] + 
expected = np.array([[1, 0], [0, 1], [0, 1], [0, 1], [0, 1]]) + + X_trans = ohe.transform(X_test) + assert_allclose(expected, X_trans) + + expected_inv = [[col] for col in ["b"] + ["infrequent_sklearn"] * 4] + X_inv = ohe.inverse_transform(X_trans) + assert_array_equal(expected_inv, X_inv) + + feature_names = ohe.get_feature_names_out() + assert_array_equal(["x0_b", "x0_infrequent_sklearn"], feature_names) + + +@pytest.mark.parametrize("drop", ["if_binary", "first", ["b"]]) +def test_ohe_infrequent_two_levels_drop_frequent(drop): + """Test two levels and dropping the frequent category.""" + + X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T + ohe = OneHotEncoder( + handle_unknown="infrequent_if_exist", + sparse_output=False, + max_categories=2, + drop=drop, + ).fit(X_train) + assert ohe.categories_[0][ohe.drop_idx_[0]] == "b" + + X_test = np.array([["b"], ["c"]]) + X_trans = ohe.transform(X_test) + assert_allclose([[0], [1]], X_trans) + + feature_names = ohe.get_feature_names_out() + assert_array_equal(["x0_infrequent_sklearn"], feature_names) + + X_inverse = ohe.inverse_transform(X_trans) + assert_array_equal([["b"], ["infrequent_sklearn"]], X_inverse) + + +@pytest.mark.parametrize("drop", [["a"], ["d"]]) +def test_ohe_infrequent_two_levels_drop_infrequent_errors(drop): + """Test two levels and dropping any infrequent category removes the + whole infrequent category.""" + + X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T + ohe = OneHotEncoder( + handle_unknown="infrequent_if_exist", + sparse_output=False, + max_categories=2, + drop=drop, + ) + + msg = f"Unable to drop category {drop[0]!r} from feature 0 because it is infrequent" + with pytest.raises(ValueError, match=msg): + ohe.fit(X_train) + + +@pytest.mark.parametrize( + "kwargs", + [ + {"max_categories": 3}, + {"min_frequency": 6}, + {"min_frequency": 9}, + {"min_frequency": 0.24}, + {"min_frequency": 0.16}, + {"max_categories": 3, "min_frequency": 8}, + {"max_categories": 4, "min_frequency": 6}, + ], +) +def test_ohe_infrequent_three_levels(kwargs): + """Test that different parameters for combing 'a', and 'd' into + the infrequent category works as expected.""" + + X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T + ohe = OneHotEncoder( + handle_unknown="infrequent_if_exist", sparse_output=False, **kwargs + ).fit(X_train) + assert_array_equal(ohe.infrequent_categories_, [["a", "d"]]) + + X_test = [["b"], ["a"], ["c"], ["d"], ["e"]] + expected = np.array([[1, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 1], [0, 0, 1]]) + + X_trans = ohe.transform(X_test) + assert_allclose(expected, X_trans) + + expected_inv = [ + ["b"], + ["infrequent_sklearn"], + ["c"], + ["infrequent_sklearn"], + ["infrequent_sklearn"], + ] + X_inv = ohe.inverse_transform(X_trans) + assert_array_equal(expected_inv, X_inv) + + feature_names = ohe.get_feature_names_out() + assert_array_equal(["x0_b", "x0_c", "x0_infrequent_sklearn"], feature_names) + + +@pytest.mark.parametrize("drop", ["first", ["b"]]) +def test_ohe_infrequent_three_levels_drop_frequent(drop): + """Test three levels and dropping the frequent category.""" + + X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T + ohe = OneHotEncoder( + handle_unknown="infrequent_if_exist", + sparse_output=False, + max_categories=3, + drop=drop, + ).fit(X_train) + + X_test = np.array([["b"], ["c"], ["d"]]) + assert_allclose([[0, 0], [1, 0], [0, 1]], ohe.transform(X_test)) + + # Check handle_unknown="ignore" + 
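+    # With a category dropped, unknown values are encoded as all zeros, the same
+    # encoding as the dropped category itself, hence the warning checked below.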
ohe.set_params(handle_unknown="ignore").fit(X_train) + msg = "Found unknown categories" + with pytest.warns(UserWarning, match=msg): + X_trans = ohe.transform([["b"], ["e"]]) + + assert_allclose([[0, 0], [0, 0]], X_trans) + + +@pytest.mark.parametrize("drop", [["a"], ["d"]]) +def test_ohe_infrequent_three_levels_drop_infrequent_errors(drop): + """Test three levels and dropping the infrequent category.""" + X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T + ohe = OneHotEncoder( + handle_unknown="infrequent_if_exist", + sparse_output=False, + max_categories=3, + drop=drop, + ) + + msg = f"Unable to drop category {drop[0]!r} from feature 0 because it is infrequent" + with pytest.raises(ValueError, match=msg): + ohe.fit(X_train) + + +def test_ohe_infrequent_handle_unknown_error(): + """Test that different parameters for combining 'a', and 'd' into + the infrequent category works as expected.""" + + X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T + ohe = OneHotEncoder( + handle_unknown="error", sparse_output=False, max_categories=3 + ).fit(X_train) + assert_array_equal(ohe.infrequent_categories_, [["a", "d"]]) + + # all categories are known + X_test = [["b"], ["a"], ["c"], ["d"]] + expected = np.array([[1, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 1]]) + + X_trans = ohe.transform(X_test) + assert_allclose(expected, X_trans) + + # 'bad' is not known and will error + X_test = [["bad"]] + msg = r"Found unknown categories \['bad'\] in column 0" + with pytest.raises(ValueError, match=msg): + ohe.transform(X_test) + + +@pytest.mark.parametrize( + "kwargs", [{"max_categories": 3, "min_frequency": 1}, {"min_frequency": 4}] +) +def test_ohe_infrequent_two_levels_user_cats_one_frequent(kwargs): + """'a' is the only frequent category, all other categories are infrequent.""" + + X_train = np.array([["a"] * 5 + ["e"] * 30], dtype=object).T + ohe = OneHotEncoder( + categories=[["c", "d", "a", "b"]], + sparse_output=False, + handle_unknown="infrequent_if_exist", + **kwargs, + ).fit(X_train) + + X_test = [["a"], ["b"], ["c"], ["d"], ["e"]] + expected = np.array([[1, 0], [0, 1], [0, 1], [0, 1], [0, 1]]) + + X_trans = ohe.transform(X_test) + assert_allclose(expected, X_trans) + + # 'a' is dropped + drops = ["first", "if_binary", ["a"]] + X_test = [["a"], ["c"]] + for drop in drops: + ohe.set_params(drop=drop).fit(X_train) + assert_allclose([[0], [1]], ohe.transform(X_test)) + + +def test_ohe_infrequent_two_levels_user_cats(): + """Test that the order of the categories provided by a user is respected.""" + X_train = np.array( + [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object + ).T + ohe = OneHotEncoder( + categories=[["c", "d", "a", "b"]], + sparse_output=False, + handle_unknown="infrequent_if_exist", + max_categories=2, + ).fit(X_train) + + assert_array_equal(ohe.infrequent_categories_, [["c", "d", "a"]]) + + X_test = [["b"], ["a"], ["c"], ["d"], ["e"]] + expected = np.array([[1, 0], [0, 1], [0, 1], [0, 1], [0, 1]]) + + X_trans = ohe.transform(X_test) + assert_allclose(expected, X_trans) + + # 'infrequent' is used to denote the infrequent categories for + # `inverse_transform` + expected_inv = [[col] for col in ["b"] + ["infrequent_sklearn"] * 4] + X_inv = ohe.inverse_transform(X_trans) + assert_array_equal(expected_inv, X_inv) + + +def test_ohe_infrequent_three_levels_user_cats(): + """Test that the order of the categories provided by a user is respected. 
+ In this case 'c' is encoded as the first category and 'b' is encoded + as the second one.""" + + X_train = np.array( + [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object + ).T + ohe = OneHotEncoder( + categories=[["c", "d", "b", "a"]], + sparse_output=False, + handle_unknown="infrequent_if_exist", + max_categories=3, + ).fit(X_train) + + assert_array_equal(ohe.infrequent_categories_, [["d", "a"]]) + + X_test = [["b"], ["a"], ["c"], ["d"], ["e"]] + expected = np.array([[0, 1, 0], [0, 0, 1], [1, 0, 0], [0, 0, 1], [0, 0, 1]]) + + X_trans = ohe.transform(X_test) + assert_allclose(expected, X_trans) + + # 'infrequent' is used to denote the infrequent categories for + # `inverse_transform` + expected_inv = [ + ["b"], + ["infrequent_sklearn"], + ["c"], + ["infrequent_sklearn"], + ["infrequent_sklearn"], + ] + X_inv = ohe.inverse_transform(X_trans) + assert_array_equal(expected_inv, X_inv) + + +def test_ohe_infrequent_mixed(): + """Test infrequent categories where feature 0 has infrequent categories, + and feature 1 does not.""" + + # X[:, 0] 1 and 2 are infrequent + # X[:, 1] nothing is infrequent + X = np.c_[[0, 1, 3, 3, 3, 3, 2, 0, 3], [0, 0, 0, 0, 1, 1, 1, 1, 1]] + + ohe = OneHotEncoder(max_categories=3, drop="if_binary", sparse_output=False) + ohe.fit(X) + + X_test = [[3, 0], [1, 1]] + X_trans = ohe.transform(X_test) + + # feature 1 is binary so it drops a category 0 + assert_allclose(X_trans, [[0, 1, 0, 0], [0, 0, 1, 1]]) + + +def test_ohe_infrequent_multiple_categories(): + """Test infrequent categories with feature matrix with 3 features.""" + + X = np.c_[ + [0, 1, 3, 3, 3, 3, 2, 0, 3], + [0, 0, 5, 1, 1, 10, 5, 5, 0], + [1, 0, 1, 0, 1, 0, 1, 0, 1], + ] + + ohe = OneHotEncoder( + categories="auto", max_categories=3, handle_unknown="infrequent_if_exist" + ) + # X[:, 0] 1 and 2 are infrequent + # X[:, 1] 1 and 10 are infrequent + # X[:, 2] nothing is infrequent + + X_trans = ohe.fit_transform(X).toarray() + assert_array_equal(ohe.infrequent_categories_[0], [1, 2]) + assert_array_equal(ohe.infrequent_categories_[1], [1, 10]) + assert_array_equal(ohe.infrequent_categories_[2], None) + + # 'infrequent' is used to denote the infrequent categories + # For the first column, 1 and 2 have the same frequency. 
In this case, + # 1 will be chosen to be the feature name because is smaller lexiconically + feature_names = ohe.get_feature_names_out() + assert_array_equal( + [ + "x0_0", + "x0_3", + "x0_infrequent_sklearn", + "x1_0", + "x1_5", + "x1_infrequent_sklearn", + "x2_0", + "x2_1", + ], + feature_names, + ) + + expected = [ + [1, 0, 0, 1, 0, 0, 0, 1], + [0, 0, 1, 1, 0, 0, 1, 0], + [0, 1, 0, 0, 1, 0, 0, 1], + [0, 1, 0, 0, 0, 1, 1, 0], + [0, 1, 0, 0, 0, 1, 0, 1], + [0, 1, 0, 0, 0, 1, 1, 0], + [0, 0, 1, 0, 1, 0, 0, 1], + [1, 0, 0, 0, 1, 0, 1, 0], + [0, 1, 0, 1, 0, 0, 0, 1], + ] + + assert_allclose(expected, X_trans) + + X_test = [[3, 1, 2], [4, 0, 3]] + + X_test_trans = ohe.transform(X_test) + + # X[:, 2] does not have an infrequent category, thus it is encoded as all + # zeros + expected = [[0, 1, 0, 0, 0, 1, 0, 0], [0, 0, 1, 1, 0, 0, 0, 0]] + assert_allclose(expected, X_test_trans.toarray()) + + X_inv = ohe.inverse_transform(X_test_trans) + expected_inv = np.array( + [[3, "infrequent_sklearn", None], ["infrequent_sklearn", 0, None]], dtype=object + ) + assert_array_equal(expected_inv, X_inv) + + # error for unknown categories + ohe = OneHotEncoder( + categories="auto", max_categories=3, handle_unknown="error" + ).fit(X) + with pytest.raises(ValueError, match="Found unknown categories"): + ohe.transform(X_test) + + # only infrequent or known categories + X_test = [[1, 1, 1], [3, 10, 0]] + X_test_trans = ohe.transform(X_test) + + expected = [[0, 0, 1, 0, 0, 1, 0, 1], [0, 1, 0, 0, 0, 1, 1, 0]] + assert_allclose(expected, X_test_trans.toarray()) + + X_inv = ohe.inverse_transform(X_test_trans) + + expected_inv = np.array( + [["infrequent_sklearn", "infrequent_sklearn", 1], [3, "infrequent_sklearn", 0]], + dtype=object, + ) + assert_array_equal(expected_inv, X_inv) + + +def test_ohe_infrequent_multiple_categories_dtypes(): + """Test infrequent categories with a pandas dataframe with multiple dtypes.""" + + pd = pytest.importorskip("pandas") + X = pd.DataFrame( + { + "str": ["a", "f", "c", "f", "f", "a", "c", "b", "b"], + "int": [5, 3, 0, 10, 10, 12, 0, 3, 5], + }, + columns=["str", "int"], + ) + + ohe = OneHotEncoder( + categories="auto", max_categories=3, handle_unknown="infrequent_if_exist" + ) + # X[:, 0] 'a', 'b', 'c' have the same frequency. 'a' and 'b' will be + # considered infrequent because they are greater + + # X[:, 1] 0, 3, 5, 10 has frequency 2 and 12 has frequency 1. 
+ # 0, 3, 12 will be considered infrequent + + X_trans = ohe.fit_transform(X).toarray() + assert_array_equal(ohe.infrequent_categories_[0], ["a", "b"]) + assert_array_equal(ohe.infrequent_categories_[1], [0, 3, 12]) + + expected = [ + [0, 0, 1, 1, 0, 0], + [0, 1, 0, 0, 0, 1], + [1, 0, 0, 0, 0, 1], + [0, 1, 0, 0, 1, 0], + [0, 1, 0, 0, 1, 0], + [0, 0, 1, 0, 0, 1], + [1, 0, 0, 0, 0, 1], + [0, 0, 1, 0, 0, 1], + [0, 0, 1, 1, 0, 0], + ] + + assert_allclose(expected, X_trans) + + X_test = pd.DataFrame({"str": ["b", "f"], "int": [14, 12]}, columns=["str", "int"]) + + expected = [[0, 0, 1, 0, 0, 1], [0, 1, 0, 0, 0, 1]] + X_test_trans = ohe.transform(X_test) + assert_allclose(expected, X_test_trans.toarray()) + + X_inv = ohe.inverse_transform(X_test_trans) + expected_inv = np.array( + [["infrequent_sklearn", "infrequent_sklearn"], ["f", "infrequent_sklearn"]], + dtype=object, + ) + assert_array_equal(expected_inv, X_inv) + + # only infrequent or known categories + X_test = pd.DataFrame({"str": ["c", "b"], "int": [12, 5]}, columns=["str", "int"]) + X_test_trans = ohe.transform(X_test).toarray() + expected = [[1, 0, 0, 0, 0, 1], [0, 0, 1, 1, 0, 0]] + assert_allclose(expected, X_test_trans) + + X_inv = ohe.inverse_transform(X_test_trans) + expected_inv = np.array( + [["c", "infrequent_sklearn"], ["infrequent_sklearn", 5]], dtype=object + ) + assert_array_equal(expected_inv, X_inv) + + +@pytest.mark.parametrize("kwargs", [{"min_frequency": 21, "max_categories": 1}]) +def test_ohe_infrequent_one_level_errors(kwargs): + """All user provided categories are infrequent.""" + X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 2]).T + + ohe = OneHotEncoder( + handle_unknown="infrequent_if_exist", sparse_output=False, **kwargs + ) + ohe.fit(X_train) + + X_trans = ohe.transform([["a"]]) + assert_allclose(X_trans, [[1]]) + + +@pytest.mark.parametrize("kwargs", [{"min_frequency": 2, "max_categories": 3}]) +def test_ohe_infrequent_user_cats_unknown_training_errors(kwargs): + """All user provided categories are infrequent.""" + + X_train = np.array([["e"] * 3], dtype=object).T + ohe = OneHotEncoder( + categories=[["c", "d", "a", "b"]], + sparse_output=False, + handle_unknown="infrequent_if_exist", + **kwargs, + ).fit(X_train) + + X_trans = ohe.transform([["a"], ["e"]]) + assert_allclose(X_trans, [[1], [1]]) + + +# deliberately omit 'OS' as an invalid combo +@pytest.mark.parametrize( + "input_dtype, category_dtype", ["OO", "OU", "UO", "UU", "SO", "SU", "SS"] +) +@pytest.mark.parametrize("array_type", ["list", "array", "dataframe"]) +def test_encoders_string_categories(input_dtype, category_dtype, array_type): + """Check that encoding work with object, unicode, and byte string dtypes. 
+ Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/15616 + https://github.com/scikit-learn/scikit-learn/issues/15726 + https://github.com/scikit-learn/scikit-learn/issues/19677 + """ + + X = np.array([["b"], ["a"]], dtype=input_dtype) + categories = [np.array(["b", "a"], dtype=category_dtype)] + ohe = OneHotEncoder(categories=categories, sparse_output=False).fit(X) + + X_test = _convert_container( + [["a"], ["a"], ["b"], ["a"]], array_type, dtype=input_dtype + ) + X_trans = ohe.transform(X_test) + + expected = np.array([[0, 1], [0, 1], [1, 0], [0, 1]]) + assert_allclose(X_trans, expected) + + oe = OrdinalEncoder(categories=categories).fit(X) + X_trans = oe.transform(X_test) + + expected = np.array([[1], [1], [0], [1]]) + assert_array_equal(X_trans, expected) + + +def test_mixed_string_bytes_categoricals(): + """Check that this mixture of predefined categories and X raises an error. + + Categories defined as bytes can not easily be compared to data that is + a string. + """ + # data as unicode + X = np.array([["b"], ["a"]], dtype="U") + # predefined categories as bytes + categories = [np.array(["b", "a"], dtype="S")] + ohe = OneHotEncoder(categories=categories, sparse_output=False) + + msg = re.escape( + "In column 0, the predefined categories have type 'bytes' which is incompatible" + " with values of type 'str_'." + ) + + with pytest.raises(ValueError, match=msg): + ohe.fit(X) + + +@pytest.mark.parametrize("missing_value", [np.nan, None]) +def test_ohe_missing_values_get_feature_names(missing_value): + # encoder with missing values with object dtypes + X = np.array([["a", "b", missing_value, "a", missing_value]], dtype=object).T + ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore").fit(X) + names = ohe.get_feature_names_out() + assert_array_equal(names, ["x0_a", "x0_b", f"x0_{missing_value}"]) + + +def test_ohe_missing_value_support_pandas(): + # check support for pandas with mixed dtypes and missing values + pd = pytest.importorskip("pandas") + df = pd.DataFrame( + { + "col1": ["dog", "cat", None, "cat"], + "col2": np.array([3, 0, 4, np.nan], dtype=float), + }, + columns=["col1", "col2"], + ) + expected_df_trans = np.array( + [ + [0, 1, 0, 0, 1, 0, 0], + [1, 0, 0, 1, 0, 0, 0], + [0, 0, 1, 0, 0, 1, 0], + [1, 0, 0, 0, 0, 0, 1], + ] + ) + + Xtr = check_categorical_onehot(df) + assert_allclose(Xtr, expected_df_trans) + + +@pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist", "warn"]) +@pytest.mark.parametrize("pd_nan_type", ["pd.NA", "np.nan"]) +def test_ohe_missing_value_support_pandas_categorical(pd_nan_type, handle_unknown): + # checks pandas dataframe with categorical features + pd = pytest.importorskip("pandas") + + pd_missing_value = pd.NA if pd_nan_type == "pd.NA" else np.nan + + df = pd.DataFrame( + { + "col1": pd.Series(["c", "a", pd_missing_value, "b", "a"], dtype="category"), + } + ) + expected_df_trans = np.array( + [ + [0, 0, 1, 0], + [1, 0, 0, 0], + [0, 0, 0, 1], + [0, 1, 0, 0], + [1, 0, 0, 0], + ] + ) + + ohe = OneHotEncoder(sparse_output=False, handle_unknown=handle_unknown) + df_trans = ohe.fit_transform(df) + assert_allclose(expected_df_trans, df_trans) + + assert len(ohe.categories_) == 1 + assert_array_equal(ohe.categories_[0][:-1], ["a", "b", "c"]) + assert np.isnan(ohe.categories_[0][-1]) + + +@pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist", "warn"]) +def test_ohe_drop_first_handle_unknown_ignore_warns(handle_unknown): + """Check drop='first' and 
handle_unknown='ignore'/'infrequent_if_exist' + during transform.""" + X = [["a", 0], ["b", 2], ["b", 1]] + + ohe = OneHotEncoder( + drop="first", sparse_output=False, handle_unknown=handle_unknown + ) + X_trans = ohe.fit_transform(X) + + X_expected = np.array( + [ + [0, 0, 0], + [1, 0, 1], + [1, 1, 0], + ] + ) + assert_allclose(X_trans, X_expected) + + # Both categories are unknown + X_test = [["c", 3]] + X_expected = np.array([[0, 0, 0]]) + + warn_msg = ( + r"Found unknown categories in columns \[0, 1\] during " + "transform. These unknown categories will be encoded as all " + "zeros" + ) + with pytest.warns(UserWarning, match=warn_msg): + X_trans = ohe.transform(X_test) + assert_allclose(X_trans, X_expected) + + # inverse_transform maps to None + X_inv = ohe.inverse_transform(X_expected) + assert_array_equal(X_inv, np.array([["a", 0]], dtype=object)) + + +@pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist", "warn"]) +def test_ohe_drop_if_binary_handle_unknown_ignore_warns(handle_unknown): + """Check drop='if_binary' and handle_unknown='ignore' during transform.""" + X = [["a", 0], ["b", 2], ["b", 1]] + + ohe = OneHotEncoder( + drop="if_binary", sparse_output=False, handle_unknown=handle_unknown + ) + X_trans = ohe.fit_transform(X) + + X_expected = np.array( + [ + [0, 1, 0, 0], + [1, 0, 0, 1], + [1, 0, 1, 0], + ] + ) + assert_allclose(X_trans, X_expected) + + # Both categories are unknown + X_test = [["c", 3]] + X_expected = np.array([[0, 0, 0, 0]]) + + warn_msg = ( + r"Found unknown categories in columns \[0, 1\] during " + "transform. These unknown categories will be encoded as all " + "zeros" + ) + with pytest.warns(UserWarning, match=warn_msg): + X_trans = ohe.transform(X_test) + assert_allclose(X_trans, X_expected) + + # inverse_transform maps to None + X_inv = ohe.inverse_transform(X_expected) + assert_array_equal(X_inv, np.array([["a", None]], dtype=object)) + + +@pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist", "warn"]) +def test_ohe_drop_first_explicit_categories(handle_unknown): + """Check drop='first' and handle_unknown='ignore'/'infrequent_if_exist' + during fit with categories passed in.""" + + X = [["a", 0], ["b", 2], ["b", 1]] + + ohe = OneHotEncoder( + drop="first", + sparse_output=False, + handle_unknown=handle_unknown, + categories=[["b", "a"], [1, 2]], + ) + ohe.fit(X) + + X_test = [["c", 1]] + X_expected = np.array([[0, 0]]) + + warn_msg = ( + r"Found unknown categories in columns \[0\] during transform. " + r"These unknown categories will be encoded as all zeros" + ) + with pytest.warns(UserWarning, match=warn_msg): + X_trans = ohe.transform(X_test) + assert_allclose(X_trans, X_expected) + + +def test_ohe_more_informative_error_message(): + """Raise informative error message when pandas output and sparse_output=True.""" + pd = pytest.importorskip("pandas") + df = pd.DataFrame({"a": [1, 2, 3], "b": ["z", "b", "b"]}, columns=["a", "b"]) + + ohe = OneHotEncoder(sparse_output=True) + ohe.set_output(transform="pandas") + + msg = ( + "Pandas output does not support sparse data. 
Set " + "sparse_output=False to output pandas dataframes or disable Pandas output" + ) + with pytest.raises(ValueError, match=msg): + ohe.fit_transform(df) + + ohe.fit(df) + with pytest.raises(ValueError, match=msg): + ohe.transform(df) + + +def test_ordinal_encoder_passthrough_missing_values_float_errors_dtype(): + """Test ordinal encoder with nan passthrough fails when dtype=np.int32.""" + + X = np.array([[np.nan, 3.0, 1.0, 3.0]]).T + oe = OrdinalEncoder(dtype=np.int32) + + msg = ( + r"There are missing values in features \[0\]. For OrdinalEncoder " + f"to encode missing values with dtype: {np.int32}" + ) + with pytest.raises(ValueError, match=msg): + oe.fit(X) + + +@pytest.mark.parametrize("encoded_missing_value", [np.nan, -2]) +def test_ordinal_encoder_passthrough_missing_values_float(encoded_missing_value): + """Test ordinal encoder with nan on float dtypes.""" + + X = np.array([[np.nan, 3.0, 1.0, 3.0]], dtype=np.float64).T + oe = OrdinalEncoder(encoded_missing_value=encoded_missing_value).fit(X) + + assert len(oe.categories_) == 1 + + assert_allclose(oe.categories_[0], [1.0, 3.0, np.nan]) + + X_trans = oe.transform(X) + assert_allclose(X_trans, [[encoded_missing_value], [1.0], [0.0], [1.0]]) + + X_inverse = oe.inverse_transform(X_trans) + assert_allclose(X_inverse, X) + + +@pytest.mark.parametrize("pd_nan_type", ["pd.NA", "np.nan"]) +@pytest.mark.parametrize("encoded_missing_value", [np.nan, -2]) +def test_ordinal_encoder_missing_value_support_pandas_categorical( + pd_nan_type, encoded_missing_value +): + """Check ordinal encoder is compatible with pandas.""" + # checks pandas dataframe with categorical features + pd = pytest.importorskip("pandas") + + pd_missing_value = pd.NA if pd_nan_type == "pd.NA" else np.nan + + df = pd.DataFrame( + { + "col1": pd.Series(["c", "a", pd_missing_value, "b", "a"], dtype="category"), + } + ) + + oe = OrdinalEncoder(encoded_missing_value=encoded_missing_value).fit(df) + assert len(oe.categories_) == 1 + assert_array_equal(oe.categories_[0][:3], ["a", "b", "c"]) + assert np.isnan(oe.categories_[0][-1]) + + df_trans = oe.transform(df) + + assert_allclose(df_trans, [[2.0], [0.0], [encoded_missing_value], [1.0], [0.0]]) + + X_inverse = oe.inverse_transform(df_trans) + assert X_inverse.shape == (5, 1) + assert_array_equal(X_inverse[:2, 0], ["c", "a"]) + assert_array_equal(X_inverse[3:, 0], ["b", "a"]) + assert np.isnan(X_inverse[2, 0]) + + +@pytest.mark.parametrize( + "X, X2, cats, cat_dtype", + [ + ( + ( + np.array([["a", np.nan]], dtype=object).T, + np.array([["a", "b"]], dtype=object).T, + [np.array(["a", "d", np.nan], dtype=object)], + np.object_, + ) + ), + ( + ( + np.array([["a", np.nan]], dtype=object).T, + np.array([["a", "b"]], dtype=object).T, + [np.array(["a", "d", np.nan], dtype=object)], + np.object_, + ) + ), + ( + ( + np.array([[2.0, np.nan]], dtype=np.float64).T, + np.array([[3.0]], dtype=np.float64).T, + [np.array([2.0, 4.0, np.nan])], + np.float64, + ) + ), + ], + ids=[ + "object-None-missing-value", + "object-nan-missing_value", + "numeric-missing-value", + ], +) +def test_ordinal_encoder_specified_categories_missing_passthrough( + X, X2, cats, cat_dtype +): + """Test ordinal encoder for specified categories.""" + oe = OrdinalEncoder(categories=cats) + exp = np.array([[0.0], [np.nan]]) + assert_array_equal(oe.fit_transform(X), exp) + # manually specified categories should have same dtype as + # the data when coerced from lists + assert oe.categories_[0].dtype == cat_dtype + + # when specifying categories manually, unknown categories 
should already + # raise when fitting + oe = OrdinalEncoder(categories=cats) + with pytest.raises(ValueError, match="Found unknown categories"): + oe.fit(X2) + + +@pytest.mark.parametrize("Encoder", [OneHotEncoder, OrdinalEncoder]) +def test_encoder_duplicate_specified_categories(Encoder): + """Test encoder for specified categories have duplicate values. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/27088 + """ + cats = [np.array(["a", "b", "a"], dtype=object)] + enc = Encoder(categories=cats) + X = np.array([["a", "b"]], dtype=object).T + with pytest.raises( + ValueError, match="the predefined categories contain duplicate elements." + ): + enc.fit(X) + + +@pytest.mark.parametrize( + "X, expected_X_trans, X_test", + [ + ( + np.array([[1.0, np.nan, 3.0]]).T, + np.array([[0.0, np.nan, 1.0]]).T, + np.array([[4.0]]), + ), + ( + np.array([[1.0, 4.0, 3.0]]).T, + np.array([[0.0, 2.0, 1.0]]).T, + np.array([[np.nan]]), + ), + ( + np.array([["c", np.nan, "b"]], dtype=object).T, + np.array([[1.0, np.nan, 0.0]]).T, + np.array([["d"]], dtype=object), + ), + ( + np.array([["c", "a", "b"]], dtype=object).T, + np.array([[2.0, 0.0, 1.0]]).T, + np.array([[np.nan]], dtype=object), + ), + ], +) +def test_ordinal_encoder_handle_missing_and_unknown(X, expected_X_trans, X_test): + """Test the interaction between missing values and handle_unknown""" + + oe = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1) + + X_trans = oe.fit_transform(X) + assert_allclose(X_trans, expected_X_trans) + + assert_allclose(oe.transform(X_test), [[-1.0]]) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_ordinal_encoder_sparse(csr_container): + """Check that we raise proper error with sparse input in OrdinalEncoder. + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/19878 + """ + X = np.array([[3, 2, 1], [0, 1, 1]]) + X_sparse = csr_container(X) + + encoder = OrdinalEncoder() + + err_msg = "Sparse data was passed, but dense data is required" + with pytest.raises(TypeError, match=err_msg): + encoder.fit(X_sparse) + with pytest.raises(TypeError, match=err_msg): + encoder.fit_transform(X_sparse) + + X_trans = encoder.fit_transform(X) + X_trans_sparse = csr_container(X_trans) + with pytest.raises(TypeError, match=err_msg): + encoder.inverse_transform(X_trans_sparse) + + +def test_ordinal_encoder_fit_with_unseen_category(): + """Check OrdinalEncoder.fit works with unseen category when + `handle_unknown="use_encoded_value"`. + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/19872 + """ + X = np.array([0, 0, 1, 0, 2, 5])[:, np.newaxis] + oe = OrdinalEncoder( + categories=[[-1, 0, 1]], handle_unknown="use_encoded_value", unknown_value=-999 + ) + oe.fit(X) + + oe = OrdinalEncoder(categories=[[-1, 0, 1]], handle_unknown="error") + with pytest.raises(ValueError, match="Found unknown categories"): + oe.fit(X) + + +@pytest.mark.parametrize( + "X_train", + [ + [["AA", "B"]], + np.array([["AA", "B"]], dtype="O"), + np.array([["AA", "B"]], dtype="U"), + ], +) +@pytest.mark.parametrize( + "X_test", + [ + [["A", "B"]], + np.array([["A", "B"]], dtype="O"), + np.array([["A", "B"]], dtype="U"), + ], +) +def test_ordinal_encoder_handle_unknown_string_dtypes(X_train, X_test): + """Checks that `OrdinalEncoder` transforms string dtypes. 
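+    The unknown test string 'A' is shorter than the training string 'AA', and
+    list, object-dtype and fixed-width unicode containers are all covered.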
+ Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/19872 + """ + enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-9) + enc.fit(X_train) + + X_trans = enc.transform(X_test) + assert_allclose(X_trans, [[-9, 0]]) + + +def test_ordinal_encoder_python_integer(): + """Check that `OrdinalEncoder` accepts Python integers that are potentially + larger than 64 bits. + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/20721 + """ + X = np.array( + [ + 44253463435747313673, + 9867966753463435747313673, + 44253462342215747313673, + 442534634357764313673, + ] + ).reshape(-1, 1) + encoder = OrdinalEncoder().fit(X) + assert_array_equal(encoder.categories_, np.sort(X, axis=0).T) + X_trans = encoder.transform(X) + assert_array_equal(X_trans, [[0], [3], [2], [1]]) + + +def test_ordinal_encoder_features_names_out_pandas(): + """Check feature names out is same as the input.""" + pd = pytest.importorskip("pandas") + + names = ["b", "c", "a"] + X = pd.DataFrame([[1, 2, 3]], columns=names) + enc = OrdinalEncoder().fit(X) + + feature_names_out = enc.get_feature_names_out() + assert_array_equal(names, feature_names_out) + + +def test_ordinal_encoder_unknown_missing_interaction(): + """Check interactions between encode_unknown and missing value encoding.""" + + X = np.array([["a"], ["b"], [np.nan]], dtype=object) + + oe = OrdinalEncoder( + handle_unknown="use_encoded_value", + unknown_value=np.nan, + encoded_missing_value=-3, + ).fit(X) + + X_trans = oe.transform(X) + assert_allclose(X_trans, [[0], [1], [-3]]) + + # "c" is unknown and is mapped to np.nan + # "None" is a missing value and is set to -3 + X_test = np.array([["c"], [np.nan]], dtype=object) + X_test_trans = oe.transform(X_test) + assert_allclose(X_test_trans, [[np.nan], [-3]]) + + # Non-regression test for #24082 + X_roundtrip = oe.inverse_transform(X_test_trans) + + # np.nan is unknown so it maps to None + assert X_roundtrip[0][0] is None + + # -3 is the encoded missing value so it maps back to nan + assert np.isnan(X_roundtrip[1][0]) + + +@pytest.mark.parametrize("with_pandas", [True, False]) +def test_ordinal_encoder_encoded_missing_value_error(with_pandas): + """Check OrdinalEncoder errors when encoded_missing_value is used by + an known category.""" + X = np.array([["a", "dog"], ["b", "cat"], ["c", np.nan]], dtype=object) + + # The 0-th feature has no missing values so it is not included in the list of + # features + error_msg = ( + r"encoded_missing_value \(1\) is already used to encode a known category " + r"in features: " + ) + + if with_pandas: + pd = pytest.importorskip("pandas") + X = pd.DataFrame(X, columns=["letter", "pet"]) + error_msg = error_msg + r"\['pet'\]" + else: + error_msg = error_msg + r"\[1\]" + + oe = OrdinalEncoder(encoded_missing_value=1) + + with pytest.raises(ValueError, match=error_msg): + oe.fit(X) + + +@pytest.mark.parametrize( + "X_train, X_test_trans_expected, X_roundtrip_expected", + [ + ( + # missing value is not in training set + # inverse transform will considering encoded nan as unknown + np.array([["a"], ["1"]], dtype=object), + [[0], [np.nan], [np.nan]], + np.asarray([["1"], [None], [None]], dtype=object), + ), + ( + # missing value in training set, + # inverse transform will considering encoded nan as missing + np.array([[np.nan], ["1"], ["a"]], dtype=object), + [[0], [np.nan], [np.nan]], + np.asarray([["1"], [np.nan], [np.nan]], dtype=object), + ), + ], +) +def test_ordinal_encoder_unknown_missing_interaction_both_nan( + X_train, 
X_test_trans_expected, X_roundtrip_expected +): + """Check transform when unknown_value and encoded_missing_value is nan. + + Non-regression test for #24082. + """ + oe = OrdinalEncoder( + handle_unknown="use_encoded_value", + unknown_value=np.nan, + encoded_missing_value=np.nan, + ).fit(X_train) + + X_test = np.array([["1"], [np.nan], ["b"]]) + X_test_trans = oe.transform(X_test) + + # both nan and unknown are encoded as nan + assert_allclose(X_test_trans, X_test_trans_expected) + X_roundtrip = oe.inverse_transform(X_test_trans) + + n_samples = X_roundtrip_expected.shape[0] + for i in range(n_samples): + expected_val = X_roundtrip_expected[i, 0] + val = X_roundtrip[i, 0] + + if expected_val is None: + assert val is None + elif is_scalar_nan(expected_val): + assert np.isnan(val) + else: + assert val == expected_val + + +def test_one_hot_encoder_set_output(): + """Check OneHotEncoder works with set_output.""" + pd = pytest.importorskip("pandas") + + X_df = pd.DataFrame({"A": ["a", "b"], "B": [1, 2]}) + ohe = OneHotEncoder() + + ohe.set_output(transform="pandas") + + match = "Pandas output does not support sparse data. Set sparse_output=False" + with pytest.raises(ValueError, match=match): + ohe.fit_transform(X_df) + + ohe_default = OneHotEncoder(sparse_output=False).set_output(transform="default") + ohe_pandas = OneHotEncoder(sparse_output=False).set_output(transform="pandas") + + X_default = ohe_default.fit_transform(X_df) + X_pandas = ohe_pandas.fit_transform(X_df) + + assert_allclose(X_pandas.to_numpy(), X_default) + assert_array_equal(ohe_pandas.get_feature_names_out(), X_pandas.columns) + + +def test_ordinal_set_output(): + """Check OrdinalEncoder works with set_output.""" + pd = pytest.importorskip("pandas") + + X_df = pd.DataFrame({"A": ["a", "b"], "B": [1, 2]}) + + ord_default = OrdinalEncoder().set_output(transform="default") + ord_pandas = OrdinalEncoder().set_output(transform="pandas") + + X_default = ord_default.fit_transform(X_df) + X_pandas = ord_pandas.fit_transform(X_df) + + assert_allclose(X_pandas.to_numpy(), X_default) + assert_array_equal(ord_pandas.get_feature_names_out(), X_pandas.columns) + + +def test_predefined_categories_dtype(): + """Check that the categories_ dtype is `object` for string categories + + Regression test for gh-25171. + """ + categories = [["as", "mmas", "eas", "ras", "acs"], ["1", "2"]] + + enc = OneHotEncoder(categories=categories) + + enc.fit([["as", "1"]]) + + assert len(categories) == len(enc.categories_) + for n, cat in enumerate(enc.categories_): + assert cat.dtype == object + assert_array_equal(categories[n], cat) + + +def test_ordinal_encoder_missing_unknown_encoding_max(): + """Check missing value or unknown encoding can equal the cardinality.""" + X = np.array([["dog"], ["cat"], [np.nan]], dtype=object) + X_trans = OrdinalEncoder(encoded_missing_value=2).fit_transform(X) + assert_allclose(X_trans, [[1], [0], [2]]) + + enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=2).fit(X) + X_test = np.array([["snake"]]) + X_trans = enc.transform(X_test) + assert_allclose(X_trans, [[2]]) + + +def test_drop_idx_infrequent_categories(): + """Check drop_idx is defined correctly with infrequent categories. + + Non-regression test for gh-25550. 
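+    `drop_idx_` indexes into `categories_`, and only a category that remains
+    frequent can be dropped when infrequent grouping is active.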
+ """ + X = np.array( + [["a"] * 2 + ["b"] * 4 + ["c"] * 4 + ["d"] * 4 + ["e"] * 4], dtype=object + ).T + ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop="first").fit(X) + assert_array_equal( + ohe.get_feature_names_out(), ["x0_c", "x0_d", "x0_e", "x0_infrequent_sklearn"] + ) + assert ohe.categories_[0][ohe.drop_idx_[0]] == "b" + + X = np.array([["a"] * 2 + ["b"] * 2 + ["c"] * 10], dtype=object).T + ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop="if_binary").fit(X) + assert_array_equal(ohe.get_feature_names_out(), ["x0_infrequent_sklearn"]) + assert ohe.categories_[0][ohe.drop_idx_[0]] == "c" + + X = np.array( + [["a"] * 2 + ["b"] * 4 + ["c"] * 4 + ["d"] * 4 + ["e"] * 4], dtype=object + ).T + ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop=["d"]).fit(X) + assert_array_equal( + ohe.get_feature_names_out(), ["x0_b", "x0_c", "x0_e", "x0_infrequent_sklearn"] + ) + assert ohe.categories_[0][ohe.drop_idx_[0]] == "d" + + ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop=None).fit(X) + assert_array_equal( + ohe.get_feature_names_out(), + ["x0_b", "x0_c", "x0_d", "x0_e", "x0_infrequent_sklearn"], + ) + assert ohe.drop_idx_ is None + + +@pytest.mark.parametrize( + "kwargs", + [ + {"max_categories": 3}, + {"min_frequency": 6}, + {"min_frequency": 9}, + {"min_frequency": 0.24}, + {"min_frequency": 0.16}, + {"max_categories": 3, "min_frequency": 8}, + {"max_categories": 4, "min_frequency": 6}, + ], +) +def test_ordinal_encoder_infrequent_three_levels(kwargs): + """Test parameters for grouping 'a', and 'd' into the infrequent category.""" + + X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T + ordinal = OrdinalEncoder( + handle_unknown="use_encoded_value", unknown_value=-1, **kwargs + ).fit(X_train) + assert_array_equal(ordinal.categories_, [["a", "b", "c", "d"]]) + assert_array_equal(ordinal.infrequent_categories_, [["a", "d"]]) + + X_test = [["a"], ["b"], ["c"], ["d"], ["z"]] + expected_trans = [[2], [0], [1], [2], [-1]] + + X_trans = ordinal.transform(X_test) + assert_allclose(X_trans, expected_trans) + + X_inverse = ordinal.inverse_transform(X_trans) + expected_inverse = [ + ["infrequent_sklearn"], + ["b"], + ["c"], + ["infrequent_sklearn"], + [None], + ] + assert_array_equal(X_inverse, expected_inverse) + + +def test_ordinal_encoder_infrequent_three_levels_user_cats(): + """Test that the order of the categories provided by a user is respected. + + In this case 'c' is encoded as the first category and 'b' is encoded + as the second one. 
+ """ + + X_train = np.array( + [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object + ).T + ordinal = OrdinalEncoder( + categories=[["c", "d", "b", "a"]], + max_categories=3, + handle_unknown="use_encoded_value", + unknown_value=-1, + ).fit(X_train) + assert_array_equal(ordinal.categories_, [["c", "d", "b", "a"]]) + assert_array_equal(ordinal.infrequent_categories_, [["d", "a"]]) + + X_test = [["a"], ["b"], ["c"], ["d"], ["z"]] + expected_trans = [[2], [1], [0], [2], [-1]] + + X_trans = ordinal.transform(X_test) + assert_allclose(X_trans, expected_trans) + + X_inverse = ordinal.inverse_transform(X_trans) + expected_inverse = [ + ["infrequent_sklearn"], + ["b"], + ["c"], + ["infrequent_sklearn"], + [None], + ] + assert_array_equal(X_inverse, expected_inverse) + + +def test_ordinal_encoder_infrequent_mixed(): + """Test when feature 0 has infrequent categories and feature 1 does not.""" + + X = np.column_stack(([0, 1, 3, 3, 3, 3, 2, 0, 3], [0, 0, 0, 0, 1, 1, 1, 1, 1])) + + ordinal = OrdinalEncoder(max_categories=3).fit(X) + + assert_array_equal(ordinal.infrequent_categories_[0], [1, 2]) + assert ordinal.infrequent_categories_[1] is None + + X_test = [[3, 0], [1, 1]] + expected_trans = [[1, 0], [2, 1]] + + X_trans = ordinal.transform(X_test) + assert_allclose(X_trans, expected_trans) + + X_inverse = ordinal.inverse_transform(X_trans) + expected_inverse = np.array([[3, 0], ["infrequent_sklearn", 1]], dtype=object) + assert_array_equal(X_inverse, expected_inverse) + + +def test_ordinal_encoder_infrequent_multiple_categories_dtypes(): + """Test infrequent categories with a pandas DataFrame with multiple dtypes.""" + + pd = pytest.importorskip("pandas") + categorical_dtype = pd.CategoricalDtype(["bird", "cat", "dog", "snake"]) + X = pd.DataFrame( + { + "str": ["a", "f", "c", "f", "f", "a", "c", "b", "b"], + "int": [5, 3, 0, 10, 10, 12, 0, 3, 5], + "categorical": pd.Series( + ["dog"] * 4 + ["cat"] * 3 + ["snake"] + ["bird"], + dtype=categorical_dtype, + ), + }, + columns=["str", "int", "categorical"], + ) + + ordinal = OrdinalEncoder(max_categories=3).fit(X) + # X[:, 0] 'a', 'b', 'c' have the same frequency. 'a' and 'b' will be + # considered infrequent because they appear first when sorted + + # X[:, 1] 0, 3, 5, 10 has frequency 2 and 12 has frequency 1. + # 0, 3, 12 will be considered infrequent because they appear first when + # sorted. 
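+    # (max_categories=3 counts the infrequent bucket itself, so each feature
+    # keeps its 2 most frequent categories and groups the rest together)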
+ + # X[:, 2] "snake" and "bird" or infrequent + + assert_array_equal(ordinal.infrequent_categories_[0], ["a", "b"]) + assert_array_equal(ordinal.infrequent_categories_[1], [0, 3, 12]) + assert_array_equal(ordinal.infrequent_categories_[2], ["bird", "snake"]) + + X_test = pd.DataFrame( + { + "str": ["a", "b", "f", "c"], + "int": [12, 0, 10, 5], + "categorical": pd.Series( + ["cat"] + ["snake"] + ["bird"] + ["dog"], + dtype=categorical_dtype, + ), + }, + columns=["str", "int", "categorical"], + ) + expected_trans = [[2, 2, 0], [2, 2, 2], [1, 1, 2], [0, 0, 1]] + + X_trans = ordinal.transform(X_test) + assert_allclose(X_trans, expected_trans) + + +def test_ordinal_encoder_infrequent_custom_mapping(): + """Check behavior of unknown_value and encoded_missing_value with infrequent.""" + X_train = np.array( + [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3 + [np.nan]], dtype=object + ).T + + ordinal = OrdinalEncoder( + handle_unknown="use_encoded_value", + unknown_value=2, + max_categories=2, + encoded_missing_value=3, + ).fit(X_train) + assert_array_equal(ordinal.infrequent_categories_, [["a", "c", "d"]]) + + X_test = np.array([["a"], ["b"], ["c"], ["d"], ["e"], [np.nan]], dtype=object) + expected_trans = [[1], [0], [1], [1], [2], [3]] + + X_trans = ordinal.transform(X_test) + assert_allclose(X_trans, expected_trans) + + +@pytest.mark.parametrize( + "kwargs", + [ + {"max_categories": 6}, + {"min_frequency": 2}, + ], +) +def test_ordinal_encoder_all_frequent(kwargs): + """All categories are considered frequent have same encoding as default encoder.""" + X_train = np.array( + [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object + ).T + + adjusted_encoder = OrdinalEncoder( + **kwargs, handle_unknown="use_encoded_value", unknown_value=-1 + ).fit(X_train) + default_encoder = OrdinalEncoder( + handle_unknown="use_encoded_value", unknown_value=-1 + ).fit(X_train) + + X_test = [["a"], ["b"], ["c"], ["d"], ["e"]] + + assert_allclose( + adjusted_encoder.transform(X_test), default_encoder.transform(X_test) + ) + + +@pytest.mark.parametrize( + "kwargs", + [ + {"max_categories": 1}, + {"min_frequency": 100}, + ], +) +def test_ordinal_encoder_all_infrequent(kwargs): + """When all categories are infrequent, they are all encoded as zero.""" + X_train = np.array( + [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object + ).T + encoder = OrdinalEncoder( + **kwargs, handle_unknown="use_encoded_value", unknown_value=-1 + ).fit(X_train) + + X_test = [["a"], ["b"], ["c"], ["d"], ["e"]] + assert_allclose(encoder.transform(X_test), [[0], [0], [0], [0], [-1]]) + + +def test_ordinal_encoder_missing_appears_frequent(): + """Check behavior when missing value appears frequently.""" + X = np.array( + [[np.nan] * 20 + ["dog"] * 10 + ["cat"] * 5 + ["snake"] + ["deer"]], + dtype=object, + ).T + ordinal = OrdinalEncoder(max_categories=3).fit(X) + + X_test = np.array([["snake", "cat", "dog", np.nan]], dtype=object).T + X_trans = ordinal.transform(X_test) + assert_allclose(X_trans, [[2], [0], [1], [np.nan]]) + + +def test_ordinal_encoder_missing_appears_infrequent(): + """Check behavior when missing value appears infrequently.""" + + # feature 0 has infrequent categories + # feature 1 has no infrequent categories + X = np.array( + [ + [np.nan] + ["dog"] * 10 + ["cat"] * 5 + ["snake"] + ["deer"], + ["red"] * 9 + ["green"] * 9, + ], + dtype=object, + ).T + ordinal = OrdinalEncoder(min_frequency=4).fit(X) + + X_test = np.array( + [ + ["snake", "red"], + ["deer", "green"], + [np.nan, "green"], + ["dog", "green"], 
+ ["cat", "red"], + ], + dtype=object, + ) + X_trans = ordinal.transform(X_test) + assert_allclose(X_trans, [[2, 1], [2, 0], [np.nan, 0], [1, 0], [0, 1]]) + + +@pytest.mark.parametrize("Encoder", [OneHotEncoder, OrdinalEncoder]) +def test_encoder_not_fitted(Encoder): + """Check that we raise a `NotFittedError` by calling transform before fit with + the encoders. + + One could expect that the passing the `categories` argument to the encoder + would make it stateless. However, `fit` is making a couple of check, such as the + position of `np.nan`. + """ + X = np.array([["A"], ["B"], ["C"]], dtype=object) + encoder = Encoder(categories=[["A", "B", "C"]]) + with pytest.raises(NotFittedError): + encoder.transform(X) diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_function_transformer.py b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_function_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..6bfb5d1367c8da36e2ce829a28a02ff253b34801 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_function_transformer.py @@ -0,0 +1,579 @@ +import warnings + +import numpy as np +import pytest + +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import FunctionTransformer, StandardScaler +from sklearn.utils._testing import ( + _convert_container, + assert_allclose_dense_sparse, + assert_array_equal, +) +from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS + + +def _make_func(args_store, kwargs_store, func=lambda X, *a, **k: X): + def _func(X, *args, **kwargs): + args_store.append(X) + args_store.extend(args) + kwargs_store.update(kwargs) + return func(X) + + return _func + + +def test_delegate_to_func(): + # (args|kwargs)_store will hold the positional and keyword arguments + # passed to the function inside the FunctionTransformer. + args_store = [] + kwargs_store = {} + X = np.arange(10).reshape((5, 2)) + assert_array_equal( + FunctionTransformer(_make_func(args_store, kwargs_store)).transform(X), + X, + "transform should have returned X unchanged", + ) + + # The function should only have received X. + assert args_store == [X], ( + "Incorrect positional arguments passed to func: {args}".format(args=args_store) + ) + + assert not kwargs_store, ( + "Unexpected keyword arguments passed to func: {args}".format(args=kwargs_store) + ) + + # reset the argument stores. + args_store[:] = [] + kwargs_store.clear() + transformed = FunctionTransformer( + _make_func(args_store, kwargs_store), + ).transform(X) + + assert_array_equal( + transformed, X, err_msg="transform should have returned X unchanged" + ) + + # The function should have received X + assert args_store == [X], ( + "Incorrect positional arguments passed to func: {args}".format(args=args_store) + ) + + assert not kwargs_store, ( + "Unexpected keyword arguments passed to func: {args}".format(args=kwargs_store) + ) + + +def test_np_log(): + X = np.arange(10).reshape((5, 2)) + + # Test that the numpy.log example still works. 
+ assert_array_equal( + FunctionTransformer(np.log1p).transform(X), + np.log1p(X), + ) + + +def test_kw_arg(): + X = np.linspace(0, 1, num=10).reshape((5, 2)) + + F = FunctionTransformer(np.around, kw_args=dict(decimals=3)) + + # Test that rounding is correct + assert_array_equal(F.transform(X), np.around(X, decimals=3)) + + +def test_kw_arg_update(): + X = np.linspace(0, 1, num=10).reshape((5, 2)) + + F = FunctionTransformer(np.around, kw_args=dict(decimals=3)) + + F.kw_args["decimals"] = 1 + + # Test that rounding is correct + assert_array_equal(F.transform(X), np.around(X, decimals=1)) + + +def test_kw_arg_reset(): + X = np.linspace(0, 1, num=10).reshape((5, 2)) + + F = FunctionTransformer(np.around, kw_args=dict(decimals=3)) + + F.kw_args = dict(decimals=1) + + # Test that rounding is correct + assert_array_equal(F.transform(X), np.around(X, decimals=1)) + + +def test_inverse_transform(): + X = np.array([1, 4, 9, 16]).reshape((2, 2)) + + # Test that inverse_transform works correctly + F = FunctionTransformer( + func=np.sqrt, + inverse_func=np.around, + inv_kw_args=dict(decimals=3), + ) + assert_array_equal( + F.inverse_transform(F.transform(X)), + np.around(np.sqrt(X), decimals=3), + ) + + +@pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS + CSR_CONTAINERS) +def test_check_inverse(sparse_container): + X = np.array([1, 4, 9, 16], dtype=np.float64).reshape((2, 2)) + if sparse_container is not None: + X = sparse_container(X) + + trans = FunctionTransformer( + func=np.sqrt, + inverse_func=np.around, + accept_sparse=sparse_container is not None, + check_inverse=True, + validate=True, + ) + warning_message = ( + "The provided functions are not strictly" + " inverse of each other. If you are sure you" + " want to proceed regardless, set" + " 'check_inverse=False'." + ) + with pytest.warns(UserWarning, match=warning_message): + trans.fit(X) + + trans = FunctionTransformer( + func=np.expm1, + inverse_func=np.log1p, + accept_sparse=sparse_container is not None, + check_inverse=True, + validate=True, + ) + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + Xt = trans.fit_transform(X) + + assert_allclose_dense_sparse(X, trans.inverse_transform(Xt)) + + +def test_check_inverse_func_or_inverse_not_provided(): + # check that we don't check inverse when one of the func or inverse is not + # provided. 
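+    # (warnings.simplefilter("error", UserWarning) below turns any such warning
+    # into a test failure)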
+ X = np.array([1, 4, 9, 16], dtype=np.float64).reshape((2, 2)) + + trans = FunctionTransformer( + func=np.expm1, inverse_func=None, check_inverse=True, validate=True + ) + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + trans.fit(X) + trans = FunctionTransformer( + func=None, inverse_func=np.expm1, check_inverse=True, validate=True + ) + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + trans.fit(X) + + +def test_function_transformer_frame(): + pd = pytest.importorskip("pandas") + X_df = pd.DataFrame(np.random.randn(100, 10)) + transformer = FunctionTransformer() + X_df_trans = transformer.fit_transform(X_df) + assert hasattr(X_df_trans, "loc") + + +@pytest.mark.parametrize("X_type", ["array", "series"]) +def test_function_transformer_raise_error_with_mixed_dtype(X_type): + """Check that `FunctionTransformer.check_inverse` raises error on mixed dtype.""" + mapping = {"one": 1, "two": 2, "three": 3, 5: "five", 6: "six"} + inverse_mapping = {value: key for key, value in mapping.items()} + dtype = "object" + + data = ["one", "two", "three", "one", "one", 5, 6] + data = _convert_container(data, X_type, columns_name=["value"], dtype=dtype) + + def func(X): + return np.array([mapping[X[i]] for i in range(X.size)], dtype=object) + + def inverse_func(X): + return _convert_container( + [inverse_mapping[x] for x in X], + X_type, + columns_name=["value"], + dtype=dtype, + ) + + transformer = FunctionTransformer( + func=func, inverse_func=inverse_func, validate=False, check_inverse=True + ) + + msg = "'check_inverse' is only supported when all the elements in `X` is numerical." + with pytest.raises(ValueError, match=msg): + transformer.fit(data) + + +def test_function_transformer_support_all_nummerical_dataframes_check_inverse_True(): + """Check support for dataframes with only numerical values.""" + pd = pytest.importorskip("pandas") + + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + transformer = FunctionTransformer( + func=lambda x: x + 2, inverse_func=lambda x: x - 2, check_inverse=True + ) + + # Does not raise an error + df_out = transformer.fit_transform(df) + assert_allclose_dense_sparse(df_out, df + 2) + + +def test_function_transformer_with_dataframe_and_check_inverse_True(): + """Check error is raised when check_inverse=True. + + Non-regresion test for gh-25261. + """ + pd = pytest.importorskip("pandas") + transformer = FunctionTransformer( + func=lambda x: x, inverse_func=lambda x: x, check_inverse=True + ) + + df_mixed = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) + msg = "'check_inverse' is only supported when all the elements in `X` is numerical." 
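+    # the mixed numeric/string frame cannot be validated numerically, so fit must raise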
+ with pytest.raises(ValueError, match=msg): + transformer.fit(df_mixed) + + +@pytest.mark.parametrize( + "X, feature_names_out, input_features, expected", + [ + ( + # NumPy inputs, default behavior: generate names + np.random.rand(100, 3), + "one-to-one", + None, + ("x0", "x1", "x2"), + ), + ( + # Pandas input, default behavior: use input feature names + {"a": np.random.rand(100), "b": np.random.rand(100)}, + "one-to-one", + None, + ("a", "b"), + ), + ( + # NumPy input, feature_names_out=callable + np.random.rand(100, 3), + lambda transformer, input_features: ("a", "b"), + None, + ("a", "b"), + ), + ( + # Pandas input, feature_names_out=callable + {"a": np.random.rand(100), "b": np.random.rand(100)}, + lambda transformer, input_features: ("c", "d", "e"), + None, + ("c", "d", "e"), + ), + ( + # NumPy input, feature_names_out=callable – default input_features + np.random.rand(100, 3), + lambda transformer, input_features: tuple(input_features) + ("a",), + None, + ("x0", "x1", "x2", "a"), + ), + ( + # Pandas input, feature_names_out=callable – default input_features + {"a": np.random.rand(100), "b": np.random.rand(100)}, + lambda transformer, input_features: tuple(input_features) + ("c",), + None, + ("a", "b", "c"), + ), + ( + # NumPy input, input_features=list of names + np.random.rand(100, 3), + "one-to-one", + ("a", "b", "c"), + ("a", "b", "c"), + ), + ( + # Pandas input, input_features=list of names + {"a": np.random.rand(100), "b": np.random.rand(100)}, + "one-to-one", + ("a", "b"), # must match feature_names_in_ + ("a", "b"), + ), + ( + # NumPy input, feature_names_out=callable, input_features=list + np.random.rand(100, 3), + lambda transformer, input_features: tuple(input_features) + ("d",), + ("a", "b", "c"), + ("a", "b", "c", "d"), + ), + ( + # Pandas input, feature_names_out=callable, input_features=list + {"a": np.random.rand(100), "b": np.random.rand(100)}, + lambda transformer, input_features: tuple(input_features) + ("c",), + ("a", "b"), # must match feature_names_in_ + ("a", "b", "c"), + ), + ], +) +@pytest.mark.parametrize("validate", [True, False]) +def test_function_transformer_get_feature_names_out( + X, feature_names_out, input_features, expected, validate +): + if isinstance(X, dict): + pd = pytest.importorskip("pandas") + X = pd.DataFrame(X) + + transformer = FunctionTransformer( + feature_names_out=feature_names_out, validate=validate + ) + transformer.fit(X) + names = transformer.get_feature_names_out(input_features) + assert isinstance(names, np.ndarray) + assert names.dtype == object + assert_array_equal(names, expected) + + +def test_function_transformer_get_feature_names_out_without_validation(): + transformer = FunctionTransformer(feature_names_out="one-to-one", validate=False) + X = np.random.rand(100, 2) + transformer.fit_transform(X) + + names = transformer.get_feature_names_out(("a", "b")) + assert isinstance(names, np.ndarray) + assert names.dtype == object + assert_array_equal(names, ("a", "b")) + + +def test_function_transformer_feature_names_out_is_None(): + transformer = FunctionTransformer() + X = np.random.rand(100, 2) + transformer.fit_transform(X) + + msg = "This 'FunctionTransformer' has no attribute 'get_feature_names_out'" + with pytest.raises(AttributeError, match=msg): + transformer.get_feature_names_out() + + +def test_function_transformer_feature_names_out_uses_estimator(): + def add_n_random_features(X, n): + return np.concatenate([X, np.random.rand(len(X), n)], axis=1) + + def feature_names_out(transformer, input_features): + n = 
transformer.kw_args["n"] + return list(input_features) + [f"rnd{i}" for i in range(n)] + + transformer = FunctionTransformer( + func=add_n_random_features, + feature_names_out=feature_names_out, + kw_args=dict(n=3), + validate=True, + ) + pd = pytest.importorskip("pandas") + df = pd.DataFrame({"a": np.random.rand(100), "b": np.random.rand(100)}) + transformer.fit_transform(df) + names = transformer.get_feature_names_out() + + assert isinstance(names, np.ndarray) + assert names.dtype == object + assert_array_equal(names, ("a", "b", "rnd0", "rnd1", "rnd2")) + + +def test_function_transformer_validate_inverse(): + """Test that function transformer does not reset estimator in + `inverse_transform`.""" + + def add_constant_feature(X): + X_one = np.ones((X.shape[0], 1)) + return np.concatenate((X, X_one), axis=1) + + def inverse_add_constant(X): + return X[:, :-1] + + X = np.array([[1, 2], [3, 4], [3, 4]]) + trans = FunctionTransformer( + func=add_constant_feature, + inverse_func=inverse_add_constant, + validate=True, + ) + X_trans = trans.fit_transform(X) + assert trans.n_features_in_ == X.shape[1] + + trans.inverse_transform(X_trans) + assert trans.n_features_in_ == X.shape[1] + + +@pytest.mark.parametrize( + "feature_names_out, expected", + [ + ("one-to-one", ["pet", "color"]), + [lambda est, names: [f"{n}_out" for n in names], ["pet_out", "color_out"]], + ], +) +@pytest.mark.parametrize("in_pipeline", [True, False]) +def test_get_feature_names_out_dataframe_with_string_data( + feature_names_out, expected, in_pipeline +): + """Check that get_feature_names_out works with DataFrames with string data.""" + pd = pytest.importorskip("pandas") + X = pd.DataFrame({"pet": ["dog", "cat"], "color": ["red", "green"]}) + + def func(X): + if feature_names_out == "one-to-one": + return X + else: + name = feature_names_out(None, X.columns) + return X.rename(columns=dict(zip(X.columns, name))) + + transformer = FunctionTransformer(func=func, feature_names_out=feature_names_out) + if in_pipeline: + transformer = make_pipeline(transformer) + + X_trans = transformer.fit_transform(X) + assert isinstance(X_trans, pd.DataFrame) + + names = transformer.get_feature_names_out() + assert isinstance(names, np.ndarray) + assert names.dtype == object + assert_array_equal(names, expected) + + +def test_set_output_func(): + """Check behavior of set_output with different settings.""" + pd = pytest.importorskip("pandas") + + X = pd.DataFrame({"a": [1, 2, 3], "b": [10, 20, 100]}) + + ft = FunctionTransformer(np.log, feature_names_out="one-to-one") + + # no warning is raised when feature_names_out is defined + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + ft.set_output(transform="pandas") + + X_trans = ft.fit_transform(X) + assert isinstance(X_trans, pd.DataFrame) + assert_array_equal(X_trans.columns, ["a", "b"]) + + ft = FunctionTransformer(lambda x: 2 * x) + ft.set_output(transform="pandas") + + # no warning is raised when func returns a panda dataframe + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + X_trans = ft.fit_transform(X) + assert isinstance(X_trans, pd.DataFrame) + assert_array_equal(X_trans.columns, ["a", "b"]) + + # Warning is raised when func returns a ndarray + ft_np = FunctionTransformer(lambda x: np.asarray(x)) + + for transform in ("pandas", "polars"): + ft_np.set_output(transform=transform) + msg = ( + f"When `set_output` is configured to be '{transform}'.*{transform} " + "DataFrame.*" + ) + with pytest.warns(UserWarning, match=msg): + 
ft_np.fit_transform(X) + + # default transform does not warn + ft_np.set_output(transform="default") + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + ft_np.fit_transform(X) + + +def test_consistence_column_name_between_steps(): + """Check that we have a consistence between the feature names out of + `FunctionTransformer` and the feature names in of the next step in the pipeline. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/27695 + """ + pd = pytest.importorskip("pandas") + + def with_suffix(_, names): + return [name + "__log" for name in names] + + pipeline = make_pipeline( + FunctionTransformer(np.log1p, feature_names_out=with_suffix), StandardScaler() + ) + + df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], columns=["a", "b"]) + X_trans = pipeline.fit_transform(df) + assert pipeline.get_feature_names_out().tolist() == ["a__log", "b__log"] + # StandardScaler will convert to a numpy array + assert isinstance(X_trans, np.ndarray) + + +@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"]) +@pytest.mark.parametrize("transform_output", ["default", "pandas", "polars"]) +def test_function_transformer_overwrite_column_names(dataframe_lib, transform_output): + """Check that we overwrite the column names when we should.""" + lib = pytest.importorskip(dataframe_lib) + if transform_output != "numpy": + pytest.importorskip(transform_output) + + df = lib.DataFrame({"a": [1, 2, 3], "b": [10, 20, 100]}) + + def with_suffix(_, names): + return [name + "__log" for name in names] + + transformer = FunctionTransformer(feature_names_out=with_suffix).set_output( + transform=transform_output + ) + X_trans = transformer.fit_transform(df) + assert_array_equal(np.asarray(X_trans), np.asarray(df)) + + feature_names = transformer.get_feature_names_out() + assert list(X_trans.columns) == with_suffix(None, df.columns) + assert feature_names.tolist() == with_suffix(None, df.columns) + + +@pytest.mark.parametrize( + "feature_names_out", + ["one-to-one", lambda _, names: [f"{name}_log" for name in names]], +) +def test_function_transformer_overwrite_column_names_numerical(feature_names_out): + """Check the same as `test_function_transformer_overwrite_column_names` + but for the specific case of pandas where column names can be numerical.""" + pd = pytest.importorskip("pandas") + + df = pd.DataFrame({0: [1, 2, 3], 1: [10, 20, 100]}) + + transformer = FunctionTransformer(feature_names_out=feature_names_out) + X_trans = transformer.fit_transform(df) + assert_array_equal(np.asarray(X_trans), np.asarray(df)) + + feature_names = transformer.get_feature_names_out() + assert list(X_trans.columns) == list(feature_names) + + +@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"]) +@pytest.mark.parametrize( + "feature_names_out", + ["one-to-one", lambda _, names: [f"{name}_log" for name in names]], +) +def test_function_transformer_error_column_inconsistent( + dataframe_lib, feature_names_out +): + """Check that we raise an error when `func` returns a dataframe with new + column names that become inconsistent with `get_feature_names_out`.""" + lib = pytest.importorskip(dataframe_lib) + + df = lib.DataFrame({"a": [1, 2, 3], "b": [10, 20, 100]}) + + def func(df): + if dataframe_lib == "pandas": + return df.rename(columns={"a": "c"}) + else: + return df.rename({"a": "c"}) + + transformer = FunctionTransformer(func=func, feature_names_out=feature_names_out) + err_msg = "The output generated by `func` have different column names" + with 
pytest.raises(ValueError, match=err_msg): + transformer.fit_transform(df).columns diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_label.py b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_label.py new file mode 100644 index 0000000000000000000000000000000000000000..053b474e675bca761b035953b30c495892e2d46a --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_label.py @@ -0,0 +1,748 @@ +import numpy as np +import pytest +from scipy.sparse import issparse + +from sklearn import config_context, datasets +from sklearn.preprocessing._label import ( + LabelBinarizer, + LabelEncoder, + MultiLabelBinarizer, + _inverse_binarize_multiclass, + _inverse_binarize_thresholding, + label_binarize, +) +from sklearn.utils._array_api import ( + _convert_to_numpy, + _get_namespace_device_dtype_ids, + get_namespace, + yield_namespace_device_dtype_combinations, +) +from sklearn.utils._testing import ( + _array_api_for_tests, + assert_array_equal, +) +from sklearn.utils.fixes import ( + COO_CONTAINERS, + CSC_CONTAINERS, + CSR_CONTAINERS, + DOK_CONTAINERS, + LIL_CONTAINERS, +) +from sklearn.utils.multiclass import type_of_target +from sklearn.utils.validation import _to_object_array + +iris = datasets.load_iris() + + +def toarray(a): + if hasattr(a, "toarray"): + a = a.toarray() + return a + + +def test_label_binarizer(): + # one-class case defaults to negative label + # For dense case: + inp = ["pos", "pos", "pos", "pos"] + lb = LabelBinarizer(sparse_output=False) + expected = np.array([[0, 0, 0, 0]]).T + got = lb.fit_transform(inp) + assert_array_equal(lb.classes_, ["pos"]) + assert_array_equal(expected, got) + assert_array_equal(lb.inverse_transform(got), inp) + + # For sparse case: + lb = LabelBinarizer(sparse_output=True) + got = lb.fit_transform(inp) + assert issparse(got) + assert_array_equal(lb.classes_, ["pos"]) + assert_array_equal(expected, got.toarray()) + assert_array_equal(lb.inverse_transform(got.toarray()), inp) + + lb = LabelBinarizer(sparse_output=False) + # two-class case + inp = ["neg", "pos", "pos", "neg"] + expected = np.array([[0, 1, 1, 0]]).T + got = lb.fit_transform(inp) + assert_array_equal(lb.classes_, ["neg", "pos"]) + assert_array_equal(expected, got) + + to_invert = np.array([[1, 0], [0, 1], [0, 1], [1, 0]]) + assert_array_equal(lb.inverse_transform(to_invert), inp) + + # multi-class case + inp = ["spam", "ham", "eggs", "ham", "0"] + expected = np.array( + [[0, 0, 0, 1], [0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 1, 0], [1, 0, 0, 0]] + ) + got = lb.fit_transform(inp) + assert_array_equal(lb.classes_, ["0", "eggs", "ham", "spam"]) + assert_array_equal(expected, got) + assert_array_equal(lb.inverse_transform(got), inp) + + +def test_label_binarizer_unseen_labels(): + lb = LabelBinarizer() + + expected = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) + got = lb.fit_transform(["b", "d", "e"]) + assert_array_equal(expected, got) + + expected = np.array( + [[0, 0, 0], [1, 0, 0], [0, 0, 0], [0, 1, 0], [0, 0, 1], [0, 0, 0]] + ) + got = lb.transform(["a", "b", "c", "d", "e", "f"]) + assert_array_equal(expected, got) + + +def test_label_binarizer_set_label_encoding(): + lb = LabelBinarizer(neg_label=-2, pos_label=0) + + # two-class case with pos_label=0 + inp = np.array([0, 1, 1, 0]) + expected = np.array([[-2, 0, 0, -2]]).T + got = lb.fit_transform(inp) + assert_array_equal(expected, got) + assert_array_equal(lb.inverse_transform(got), inp) + + lb = LabelBinarizer(neg_label=-2, pos_label=2) + + # multi-class case + 
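+    # each row is filled with neg_label (-2) except for pos_label (+2) in the
+    # column of the sample's class (classes are sorted: 0, 1, 2, 3)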
inp = np.array([3, 2, 1, 2, 0]) + expected = np.array( + [ + [-2, -2, -2, +2], + [-2, -2, +2, -2], + [-2, +2, -2, -2], + [-2, -2, +2, -2], + [+2, -2, -2, -2], + ] + ) + got = lb.fit_transform(inp) + assert_array_equal(expected, got) + assert_array_equal(lb.inverse_transform(got), inp) + + +@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"]) +@pytest.mark.parametrize("unique_first", [True, False]) +def test_label_binarizer_pandas_nullable(dtype, unique_first): + """Checks that LabelBinarizer works with pandas nullable dtypes. + + Non-regression test for gh-25637. + """ + pd = pytest.importorskip("pandas") + + y_true = pd.Series([1, 0, 0, 1, 0, 1, 1, 0, 1], dtype=dtype) + if unique_first: + # Calling unique creates a pandas array which has a different interface + # compared to a pandas Series. Specifically, pandas arrays do not have "iloc". + y_true = y_true.unique() + lb = LabelBinarizer().fit(y_true) + y_out = lb.transform([1, 0]) + + assert_array_equal(y_out, [[1], [0]]) + + +def test_label_binarizer_errors(): + # Check that invalid arguments yield ValueError + one_class = np.array([0, 0, 0, 0]) + lb = LabelBinarizer().fit(one_class) + + multi_label = [(2, 3), (0,), (0, 2)] + err_msg = "You appear to be using a legacy multi-label data representation." + with pytest.raises(ValueError, match=err_msg): + lb.transform(multi_label) + + lb = LabelBinarizer() + err_msg = "This LabelBinarizer instance is not fitted yet" + with pytest.raises(ValueError, match=err_msg): + lb.transform([]) + with pytest.raises(ValueError, match=err_msg): + lb.inverse_transform([]) + + input_labels = [0, 1, 0, 1] + err_msg = "neg_label=2 must be strictly less than pos_label=1." + lb = LabelBinarizer(neg_label=2, pos_label=1) + with pytest.raises(ValueError, match=err_msg): + lb.fit(input_labels) + err_msg = "neg_label=2 must be strictly less than pos_label=2." 
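+    # equal neg_label and pos_label are rejected as well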
+ lb = LabelBinarizer(neg_label=2, pos_label=2) + with pytest.raises(ValueError, match=err_msg): + lb.fit(input_labels) + err_msg = ( + "Sparse binarization is only supported with non zero pos_label and zero " + "neg_label, got pos_label=2 and neg_label=1" + ) + lb = LabelBinarizer(neg_label=1, pos_label=2, sparse_output=True) + with pytest.raises(ValueError, match=err_msg): + lb.fit(input_labels) + + # Sequence of seq type should raise ValueError + y_seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]] + err_msg = "You appear to be using a legacy multi-label data representation" + with pytest.raises(ValueError, match=err_msg): + LabelBinarizer().fit_transform(y_seq_of_seqs) + + # Fail on the dimension of 'binary' + err_msg = "output_type='binary', but y.shape" + with pytest.raises(ValueError, match=err_msg): + _inverse_binarize_thresholding( + y=np.array([[1, 2, 3], [2, 1, 3]]), + output_type="binary", + classes=[1, 2, 3], + threshold=0, + ) + + # Fail on multioutput data + err_msg = "Multioutput target data is not supported with label binarization" + with pytest.raises(ValueError, match=err_msg): + LabelBinarizer().fit(np.array([[1, 3], [2, 1]])) + with pytest.raises(ValueError, match=err_msg): + label_binarize(np.array([[1, 3], [2, 1]]), classes=[1, 2, 3]) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_label_binarizer_sparse_errors(csr_container): + # Fail on y_type + err_msg = "foo format is not supported" + with pytest.raises(ValueError, match=err_msg): + _inverse_binarize_thresholding( + y=csr_container([[1, 2], [2, 1]]), + output_type="foo", + classes=[1, 2], + threshold=0, + ) + + # Fail on the number of classes + err_msg = "The number of class is not equal to the number of dimension of y." + with pytest.raises(ValueError, match=err_msg): + _inverse_binarize_thresholding( + y=csr_container([[1, 2], [2, 1]]), + output_type="foo", + classes=[1, 2, 3], + threshold=0, + ) + + +@pytest.mark.parametrize( + "values, classes, unknown", + [ + ( + np.array([2, 1, 3, 1, 3], dtype="int64"), + np.array([1, 2, 3], dtype="int64"), + np.array([4], dtype="int64"), + ), + ( + np.array(["b", "a", "c", "a", "c"], dtype=object), + np.array(["a", "b", "c"], dtype=object), + np.array(["d"], dtype=object), + ), + ( + np.array(["b", "a", "c", "a", "c"]), + np.array(["a", "b", "c"]), + np.array(["d"]), + ), + ], + ids=["int64", "object", "str"], +) +def test_label_encoder(values, classes, unknown): + # Test LabelEncoder's transform, fit_transform and + # inverse_transform methods + le = LabelEncoder() + le.fit(values) + assert_array_equal(le.classes_, classes) + assert_array_equal(le.transform(values), [1, 0, 2, 0, 2]) + assert_array_equal(le.inverse_transform([1, 0, 2, 0, 2]), values) + le = LabelEncoder() + ret = le.fit_transform(values) + assert_array_equal(ret, [1, 0, 2, 0, 2]) + + with pytest.raises(ValueError, match="unseen labels"): + le.transform(unknown) + + +def test_label_encoder_negative_ints(): + le = LabelEncoder() + le.fit([1, 1, 4, 5, -1, 0]) + assert_array_equal(le.classes_, [-1, 0, 1, 4, 5]) + assert_array_equal(le.transform([0, 1, 4, 4, 5, -1, -1]), [1, 2, 3, 3, 4, 0, 0]) + assert_array_equal( + le.inverse_transform([1, 2, 3, 3, 4, 0, 0]), [0, 1, 4, 4, 5, -1, -1] + ) + with pytest.raises(ValueError): + le.transform([0, 6]) + + +@pytest.mark.parametrize("dtype", ["str", "object"]) +def test_label_encoder_str_bad_shape(dtype): + le = LabelEncoder() + le.fit(np.array(["apple", "orange"], dtype=dtype)) + msg = "should be a 1d array" + with pytest.raises(ValueError, 
match=msg): + le.transform("apple") + + +def test_label_encoder_errors(): + # Check that invalid arguments yield ValueError + le = LabelEncoder() + with pytest.raises(ValueError): + le.transform([]) + with pytest.raises(ValueError): + le.inverse_transform([]) + + # Fail on unseen labels + le = LabelEncoder() + le.fit([1, 2, 3, -1, 1]) + msg = "contains previously unseen labels" + with pytest.raises(ValueError, match=msg): + le.inverse_transform([-2]) + with pytest.raises(ValueError, match=msg): + le.inverse_transform([-2, -3, -4]) + + # Fail on inverse_transform("") + msg = r"should be a 1d array.+shape \(\)" + with pytest.raises(ValueError, match=msg): + le.inverse_transform("") + + +@pytest.mark.parametrize( + "values", + [ + np.array([2, 1, 3, 1, 3], dtype="int64"), + np.array(["b", "a", "c", "a", "c"], dtype=object), + np.array(["b", "a", "c", "a", "c"]), + ], + ids=["int64", "object", "str"], +) +def test_label_encoder_empty_array(values): + le = LabelEncoder() + le.fit(values) + # test empty transform + transformed = le.transform([]) + assert_array_equal(np.array([]), transformed) + # test empty inverse transform + inverse_transformed = le.inverse_transform([]) + assert_array_equal(np.array([]), inverse_transformed) + + +def test_sparse_output_multilabel_binarizer(): + # test input as iterable of iterables + inputs = [ + lambda: [(2, 3), (1,), (1, 2)], + lambda: ({2, 3}, {1}, {1, 2}), + lambda: iter([iter((2, 3)), iter((1,)), {1, 2}]), + ] + indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]]) + + inverse = inputs[0]() + for sparse_output in [True, False]: + for inp in inputs: + # With fit_transform + mlb = MultiLabelBinarizer(sparse_output=sparse_output) + got = mlb.fit_transform(inp()) + assert issparse(got) == sparse_output + if sparse_output: + # verify CSR assumption that indices and indptr have same dtype + assert got.indices.dtype == got.indptr.dtype + got = got.toarray() + assert_array_equal(indicator_mat, got) + assert_array_equal([1, 2, 3], mlb.classes_) + assert mlb.inverse_transform(got) == inverse + + # With fit + mlb = MultiLabelBinarizer(sparse_output=sparse_output) + got = mlb.fit(inp()).transform(inp()) + assert issparse(got) == sparse_output + if sparse_output: + # verify CSR assumption that indices and indptr have same dtype + assert got.indices.dtype == got.indptr.dtype + got = got.toarray() + assert_array_equal(indicator_mat, got) + assert_array_equal([1, 2, 3], mlb.classes_) + assert mlb.inverse_transform(got) == inverse + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sparse_output_multilabel_binarizer_errors(csr_container): + inp = iter([iter((2, 3)), iter((1,)), {1, 2}]) + mlb = MultiLabelBinarizer(sparse_output=False) + mlb.fit(inp) + with pytest.raises(ValueError): + mlb.inverse_transform( + csr_container(np.array([[0, 1, 1], [2, 0, 0], [1, 1, 0]])) + ) + + +def test_multilabel_binarizer(): + # test input as iterable of iterables + inputs = [ + lambda: [(2, 3), (1,), (1, 2)], + lambda: ({2, 3}, {1}, {1, 2}), + lambda: iter([iter((2, 3)), iter((1,)), {1, 2}]), + ] + indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]]) + inverse = inputs[0]() + for inp in inputs: + # With fit_transform + mlb = MultiLabelBinarizer() + got = mlb.fit_transform(inp()) + assert_array_equal(indicator_mat, got) + assert_array_equal([1, 2, 3], mlb.classes_) + assert mlb.inverse_transform(got) == inverse + + # With fit + mlb = MultiLabelBinarizer() + got = mlb.fit(inp()).transform(inp()) + assert_array_equal(indicator_mat, got) + 
assert_array_equal([1, 2, 3], mlb.classes_) + assert mlb.inverse_transform(got) == inverse + + +def test_multilabel_binarizer_empty_sample(): + mlb = MultiLabelBinarizer() + y = [[1, 2], [1], []] + Y = np.array([[1, 1], [1, 0], [0, 0]]) + assert_array_equal(mlb.fit_transform(y), Y) + + +def test_multilabel_binarizer_unknown_class(): + mlb = MultiLabelBinarizer() + y = [[1, 2]] + Y = np.array([[1, 0], [0, 1]]) + warning_message = "unknown class.* will be ignored" + with pytest.warns(UserWarning, match=warning_message): + matrix = mlb.fit(y).transform([[4, 1], [2, 0]]) + + Y = np.array([[1, 0, 0], [0, 1, 0]]) + mlb = MultiLabelBinarizer(classes=[1, 2, 3]) + with pytest.warns(UserWarning, match=warning_message): + matrix = mlb.fit(y).transform([[4, 1], [2, 0]]) + assert_array_equal(matrix, Y) + + +def test_multilabel_binarizer_given_classes(): + inp = [(2, 3), (1,), (1, 2)] + indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 0, 1]]) + # fit_transform() + mlb = MultiLabelBinarizer(classes=[1, 3, 2]) + assert_array_equal(mlb.fit_transform(inp), indicator_mat) + assert_array_equal(mlb.classes_, [1, 3, 2]) + + # fit().transform() + mlb = MultiLabelBinarizer(classes=[1, 3, 2]) + assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat) + assert_array_equal(mlb.classes_, [1, 3, 2]) + + # ensure works with extra class + mlb = MultiLabelBinarizer(classes=[4, 1, 3, 2]) + assert_array_equal( + mlb.fit_transform(inp), np.hstack(([[0], [0], [0]], indicator_mat)) + ) + assert_array_equal(mlb.classes_, [4, 1, 3, 2]) + + # ensure fit is no-op as iterable is not consumed + inp = iter(inp) + mlb = MultiLabelBinarizer(classes=[1, 3, 2]) + assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat) + + # ensure a ValueError is thrown if given duplicate classes + err_msg = ( + "The classes argument contains duplicate classes. Remove " + "these duplicates before passing them to MultiLabelBinarizer." 
+ ) + mlb = MultiLabelBinarizer(classes=[1, 3, 2, 3]) + with pytest.raises(ValueError, match=err_msg): + mlb.fit(inp) + + +def test_multilabel_binarizer_multiple_calls(): + inp = [(2, 3), (1,), (1, 2)] + indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 0, 1]]) + + indicator_mat2 = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]]) + + # first call + mlb = MultiLabelBinarizer(classes=[1, 3, 2]) + assert_array_equal(mlb.fit_transform(inp), indicator_mat) + # second call change class + mlb.classes = [1, 2, 3] + assert_array_equal(mlb.fit_transform(inp), indicator_mat2) + + +def test_multilabel_binarizer_same_length_sequence(): + # Ensure sequences of the same length are not interpreted as a 2-d array + inp = [[1], [0], [2]] + indicator_mat = np.array([[0, 1, 0], [1, 0, 0], [0, 0, 1]]) + # fit_transform() + mlb = MultiLabelBinarizer() + assert_array_equal(mlb.fit_transform(inp), indicator_mat) + assert_array_equal(mlb.inverse_transform(indicator_mat), inp) + + # fit().transform() + mlb = MultiLabelBinarizer() + assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat) + assert_array_equal(mlb.inverse_transform(indicator_mat), inp) + + +def test_multilabel_binarizer_non_integer_labels(): + tuple_classes = _to_object_array([(1,), (2,), (3,)]) + inputs = [ + ([("2", "3"), ("1",), ("1", "2")], ["1", "2", "3"]), + ([("b", "c"), ("a",), ("a", "b")], ["a", "b", "c"]), + ([((2,), (3,)), ((1,),), ((1,), (2,))], tuple_classes), + ] + indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]]) + for inp, classes in inputs: + # fit_transform() + mlb = MultiLabelBinarizer() + inp = np.array(inp, dtype=object) + assert_array_equal(mlb.fit_transform(inp), indicator_mat) + assert_array_equal(mlb.classes_, classes) + indicator_mat_inv = np.array(mlb.inverse_transform(indicator_mat), dtype=object) + assert_array_equal(indicator_mat_inv, inp) + + # fit().transform() + mlb = MultiLabelBinarizer() + assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat) + assert_array_equal(mlb.classes_, classes) + indicator_mat_inv = np.array(mlb.inverse_transform(indicator_mat), dtype=object) + assert_array_equal(indicator_mat_inv, inp) + + mlb = MultiLabelBinarizer() + with pytest.raises(TypeError): + mlb.fit_transform([({}), ({}, {"a": "b"})]) + + +def test_multilabel_binarizer_non_unique(): + inp = [(1, 1, 1, 0)] + indicator_mat = np.array([[1, 1]]) + mlb = MultiLabelBinarizer() + assert_array_equal(mlb.fit_transform(inp), indicator_mat) + + +def test_multilabel_binarizer_inverse_validation(): + inp = [(1, 1, 1, 0)] + mlb = MultiLabelBinarizer() + mlb.fit_transform(inp) + # Not binary + with pytest.raises(ValueError): + mlb.inverse_transform(np.array([[1, 3]])) + # The following binary cases are fine, however + mlb.inverse_transform(np.array([[0, 0]])) + mlb.inverse_transform(np.array([[1, 1]])) + mlb.inverse_transform(np.array([[1, 0]])) + + # Wrong shape + with pytest.raises(ValueError): + mlb.inverse_transform(np.array([[1]])) + with pytest.raises(ValueError): + mlb.inverse_transform(np.array([[1, 1, 1]])) + + +def test_label_binarize_with_class_order(): + out = label_binarize([1, 6], classes=[1, 2, 4, 6]) + expected = np.array([[1, 0, 0, 0], [0, 0, 0, 1]]) + assert_array_equal(out, expected) + + # Modified class order + out = label_binarize([1, 6], classes=[1, 6, 4, 2]) + expected = np.array([[1, 0, 0, 0], [0, 1, 0, 0]]) + assert_array_equal(out, expected) + + out = label_binarize([0, 1, 2, 3], classes=[3, 2, 0, 1]) + expected = np.array([[0, 0, 1, 0], [0, 0, 0, 1], [0, 1, 0, 0], [1, 0, 0, 0]]) + 
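+    # columns follow the user-provided classes order [3, 2, 0, 1], not the sorted labels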
assert_array_equal(out, expected) + + +def check_binarized_results(y, classes, pos_label, neg_label, expected): + for sparse_output in [True, False]: + if (pos_label == 0 or neg_label != 0) and sparse_output: + with pytest.raises(ValueError): + label_binarize( + y, + classes=classes, + neg_label=neg_label, + pos_label=pos_label, + sparse_output=sparse_output, + ) + continue + + # check label_binarize + binarized = label_binarize( + y, + classes=classes, + neg_label=neg_label, + pos_label=pos_label, + sparse_output=sparse_output, + ) + assert_array_equal(toarray(binarized), expected) + assert issparse(binarized) == sparse_output + + # check inverse + y_type = type_of_target(y) + if y_type == "multiclass": + inversed = _inverse_binarize_multiclass(binarized, classes=classes) + + else: + inversed = _inverse_binarize_thresholding( + binarized, + output_type=y_type, + classes=classes, + threshold=((neg_label + pos_label) / 2.0), + ) + + assert_array_equal(toarray(inversed), toarray(y)) + + # Check label binarizer + lb = LabelBinarizer( + neg_label=neg_label, pos_label=pos_label, sparse_output=sparse_output + ) + binarized = lb.fit_transform(y) + assert_array_equal(toarray(binarized), expected) + assert issparse(binarized) == sparse_output + inverse_output = lb.inverse_transform(binarized) + assert_array_equal(toarray(inverse_output), toarray(y)) + assert issparse(inverse_output) == issparse(y) + + +def test_label_binarize_binary(): + y = [0, 1, 0] + classes = [0, 1] + pos_label = 2 + neg_label = -1 + expected = np.array([[2, -1], [-1, 2], [2, -1]])[:, 1].reshape((-1, 1)) + + check_binarized_results(y, classes, pos_label, neg_label, expected) + + # Binary case where sparse_output = True will not result in a ValueError + y = [0, 1, 0] + classes = [0, 1] + pos_label = 3 + neg_label = 0 + expected = np.array([[3, 0], [0, 3], [3, 0]])[:, 1].reshape((-1, 1)) + + check_binarized_results(y, classes, pos_label, neg_label, expected) + + +def test_label_binarize_multiclass(): + y = [0, 1, 2] + classes = [0, 1, 2] + pos_label = 2 + neg_label = 0 + expected = 2 * np.eye(3) + + check_binarized_results(y, classes, pos_label, neg_label, expected) + + with pytest.raises(ValueError): + label_binarize( + y, classes=classes, neg_label=-1, pos_label=pos_label, sparse_output=True + ) + + +@pytest.mark.parametrize( + "arr_type", + [np.array] + + COO_CONTAINERS + + CSC_CONTAINERS + + CSR_CONTAINERS + + DOK_CONTAINERS + + LIL_CONTAINERS, +) +def test_label_binarize_multilabel(arr_type): + y_ind = np.array([[0, 1, 0], [1, 1, 1], [0, 0, 0]]) + classes = [0, 1, 2] + pos_label = 2 + neg_label = 0 + expected = pos_label * y_ind + y = arr_type(y_ind) + + check_binarized_results(y, classes, pos_label, neg_label, expected) + + with pytest.raises(ValueError): + label_binarize( + y, classes=classes, neg_label=-1, pos_label=pos_label, sparse_output=True + ) + + +def test_invalid_input_label_binarize(): + with pytest.raises(ValueError): + label_binarize([0, 2], classes=[0, 2], pos_label=0, neg_label=1) + with pytest.raises(ValueError, match="continuous target data is not "): + label_binarize([1.2, 2.7], classes=[0, 1]) + with pytest.raises(ValueError, match="mismatch with the labels"): + label_binarize([[1, 3]], classes=[1, 2, 3]) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_inverse_binarize_multiclass(csr_container): + got = _inverse_binarize_multiclass( + csr_container([[0, 1, 0], [-1, 0, -1], [0, 0, 0]]), np.arange(3) + ) + assert_array_equal(got, np.array([1, 1, 0])) + + +def 
test_nan_label_encoder(): + """Check that label encoder encodes nans in transform. + + Non-regression test for #22628. + """ + le = LabelEncoder() + le.fit(["a", "a", "b", np.nan]) + + y_trans = le.transform([np.nan]) + assert_array_equal(y_trans, [2]) + + +@pytest.mark.parametrize( + "encoder", [LabelEncoder(), LabelBinarizer(), MultiLabelBinarizer()] +) +def test_label_encoders_do_not_have_set_output(encoder): + """Check that label encoders do not define set_output and work with y as a kwarg. + + Non-regression test for #26854. + """ + assert not hasattr(encoder, "set_output") + y_encoded_with_kwarg = encoder.fit_transform(y=["a", "b", "c"]) + y_encoded_positional = encoder.fit_transform(["a", "b", "c"]) + assert_array_equal(y_encoded_with_kwarg, y_encoded_positional) + + +@pytest.mark.parametrize( + "array_namespace, device, dtype", + yield_namespace_device_dtype_combinations(), + ids=_get_namespace_device_dtype_ids, +) +@pytest.mark.parametrize( + "y", + [ + np.array([2, 1, 3, 1, 3]), + np.array([1, 1, 4, 5, -1, 0]), + np.array([3, 5, 9, 5, 9, 3]), + ], +) +def test_label_encoder_array_api_compliance(y, array_namespace, device, dtype): + xp = _array_api_for_tests(array_namespace, device) + xp_y = xp.asarray(y, device=device) + with config_context(array_api_dispatch=True): + xp_label = LabelEncoder() + np_label = LabelEncoder() + xp_label = xp_label.fit(xp_y) + xp_transformed = xp_label.transform(xp_y) + xp_inv_transformed = xp_label.inverse_transform(xp_transformed) + np_label = np_label.fit(y) + np_transformed = np_label.transform(y) + assert get_namespace(xp_transformed)[0].__name__ == xp.__name__ + assert get_namespace(xp_inv_transformed)[0].__name__ == xp.__name__ + assert get_namespace(xp_label.classes_)[0].__name__ == xp.__name__ + assert_array_equal(_convert_to_numpy(xp_transformed, xp), np_transformed) + assert_array_equal(_convert_to_numpy(xp_inv_transformed, xp), y) + assert_array_equal(_convert_to_numpy(xp_label.classes_, xp), np_label.classes_) + + xp_label = LabelEncoder() + np_label = LabelEncoder() + xp_transformed = xp_label.fit_transform(xp_y) + np_transformed = np_label.fit_transform(y) + assert get_namespace(xp_transformed)[0].__name__ == xp.__name__ + assert get_namespace(xp_label.classes_)[0].__name__ == xp.__name__ + assert_array_equal(_convert_to_numpy(xp_transformed, xp), np_transformed) + assert_array_equal(_convert_to_numpy(xp_label.classes_, xp), np_label.classes_) diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_polynomial.py b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_polynomial.py new file mode 100644 index 0000000000000000000000000000000000000000..640bf5705baad6ee644ba81942791864f9587f60 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_polynomial.py @@ -0,0 +1,1230 @@ +import sys + +import numpy as np +import pytest +from numpy.testing import assert_allclose, assert_array_equal +from scipy import sparse +from scipy.interpolate import BSpline +from scipy.sparse import random as sparse_random + +from sklearn.linear_model import LinearRegression +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import ( + KBinsDiscretizer, + PolynomialFeatures, + SplineTransformer, +) +from sklearn.preprocessing._csr_polynomial_expansion import ( + _get_sizeof_LARGEST_INT_t, +) +from sklearn.utils._testing import assert_array_almost_equal +from sklearn.utils.fixes import ( + CSC_CONTAINERS, + CSR_CONTAINERS, + parse_version, + sp_version, +) + + 
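+# A quick orientation for the two transformers exercised below (a minimal
+# sketch, assuming scikit-learn's default parameters):
+#
+#     import numpy as np
+#     from sklearn.preprocessing import PolynomialFeatures
+#
+#     PolynomialFeatures(degree=2).fit_transform(np.array([[2.0]]))
+#     # -> array([[1., 2., 4.]])  # bias term, x, x**2
+#
+# SplineTransformer instead expands each feature into B-spline basis functions
+# which, inside the knot range, sum to 1 per row (see
+# test_spline_transformer_unity_decomposition below).
+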
+@pytest.mark.parametrize("est", (PolynomialFeatures, SplineTransformer)) +def test_polynomial_and_spline_array_order(est): + """Test that output array has the given order.""" + X = np.arange(10).reshape(5, 2) + + def is_c_contiguous(a): + return np.isfortran(a.T) + + assert is_c_contiguous(est().fit_transform(X)) + assert is_c_contiguous(est(order="C").fit_transform(X)) + assert np.isfortran(est(order="F").fit_transform(X)) + + +@pytest.mark.parametrize( + "params, err_msg", + [ + ({"knots": [[1]]}, r"Number of knots, knots.shape\[0\], must be >= 2."), + ({"knots": [[1, 1], [2, 2]]}, r"knots.shape\[1\] == n_features is violated"), + ({"knots": [[1], [0]]}, "knots must be sorted without duplicates."), + ], +) +def test_spline_transformer_input_validation(params, err_msg): + """Test that we raise errors for invalid input in SplineTransformer.""" + X = [[1], [2]] + + with pytest.raises(ValueError, match=err_msg): + SplineTransformer(**params).fit(X) + + +@pytest.mark.parametrize("extrapolation", ["continue", "periodic"]) +def test_spline_transformer_integer_knots(extrapolation): + """Test that SplineTransformer accepts integer value knot positions.""" + X = np.arange(20).reshape(10, 2) + knots = [[0, 1], [1, 2], [5, 5], [11, 10], [12, 11]] + _ = SplineTransformer( + degree=3, knots=knots, extrapolation=extrapolation + ).fit_transform(X) + + +def test_spline_transformer_feature_names(): + """Test that SplineTransformer generates correct features name.""" + X = np.arange(20).reshape(10, 2) + splt = SplineTransformer(n_knots=3, degree=3, include_bias=True).fit(X) + feature_names = splt.get_feature_names_out() + assert_array_equal( + feature_names, + [ + "x0_sp_0", + "x0_sp_1", + "x0_sp_2", + "x0_sp_3", + "x0_sp_4", + "x1_sp_0", + "x1_sp_1", + "x1_sp_2", + "x1_sp_3", + "x1_sp_4", + ], + ) + + splt = SplineTransformer(n_knots=3, degree=3, include_bias=False).fit(X) + feature_names = splt.get_feature_names_out(["a", "b"]) + assert_array_equal( + feature_names, + [ + "a_sp_0", + "a_sp_1", + "a_sp_2", + "a_sp_3", + "b_sp_0", + "b_sp_1", + "b_sp_2", + "b_sp_3", + ], + ) + + +@pytest.mark.parametrize( + "extrapolation", + ["constant", "linear", "continue", "periodic"], +) +@pytest.mark.parametrize("degree", [2, 3]) +def test_split_transform_feature_names_extrapolation_degree(extrapolation, degree): + """Test feature names are correct for different extrapolations and degree. + + Non-regression test for gh-25292. + """ + X = np.arange(20).reshape(10, 2) + splt = SplineTransformer(degree=degree, extrapolation=extrapolation).fit(X) + feature_names = splt.get_feature_names_out(["a", "b"]) + assert len(feature_names) == splt.n_features_out_ + + X_trans = splt.transform(X) + assert X_trans.shape[1] == len(feature_names) + + +@pytest.mark.parametrize("degree", range(1, 5)) +@pytest.mark.parametrize("n_knots", range(3, 5)) +@pytest.mark.parametrize("knots", ["uniform", "quantile"]) +@pytest.mark.parametrize("extrapolation", ["constant", "periodic"]) +def test_spline_transformer_unity_decomposition(degree, n_knots, knots, extrapolation): + """Test that B-splines are indeed a decomposition of unity. + + Splines basis functions must sum up to 1 per row, if we stay in between boundaries. + """ + X = np.linspace(0, 1, 100)[:, None] + # make the boundaries 0 and 1 part of X_train, for sure. 
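+    # np.r_ stacks the rows: [[0]], every second row of X, then [[1]], so the
+    # knots computed from X_train span the whole [0, 1] range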
+ X_train = np.r_[[[0]], X[::2, :], [[1]]] + X_test = X[1::2, :] + + if extrapolation == "periodic": + n_knots = n_knots + degree # periodic splines require degree < n_knots + + splt = SplineTransformer( + n_knots=n_knots, + degree=degree, + knots=knots, + include_bias=True, + extrapolation=extrapolation, + ) + splt.fit(X_train) + for X in [X_train, X_test]: + assert_allclose(np.sum(splt.transform(X), axis=1), 1) + + +@pytest.mark.parametrize(["bias", "intercept"], [(True, False), (False, True)]) +def test_spline_transformer_linear_regression(bias, intercept): + """Test that B-splines fit a sinusodial curve pretty well.""" + X = np.linspace(0, 10, 100)[:, None] + y = np.sin(X[:, 0]) + 2 # +2 to avoid the value 0 in assert_allclose + pipe = Pipeline( + steps=[ + ( + "spline", + SplineTransformer( + n_knots=15, + degree=3, + include_bias=bias, + extrapolation="constant", + ), + ), + ("ols", LinearRegression(fit_intercept=intercept)), + ] + ) + pipe.fit(X, y) + assert_allclose(pipe.predict(X), y, rtol=1e-3) + + +@pytest.mark.parametrize( + ["knots", "n_knots", "sample_weight", "expected_knots"], + [ + ("uniform", 3, None, np.array([[0, 2], [3, 8], [6, 14]])), + ( + "uniform", + 3, + np.array([0, 0, 1, 1, 0, 3, 1]), + np.array([[2, 2], [4, 8], [6, 14]]), + ), + ("uniform", 4, None, np.array([[0, 2], [2, 6], [4, 10], [6, 14]])), + ("quantile", 3, None, np.array([[0, 2], [3, 3], [6, 14]])), + ( + "quantile", + 3, + np.array([0, 0, 1, 1, 0, 3, 1]), + np.array([[2, 2], [5, 8], [6, 14]]), + ), + ], +) +def test_spline_transformer_get_base_knot_positions( + knots, n_knots, sample_weight, expected_knots +): + """Check the behaviour to find knot positions with and without sample_weight.""" + X = np.array([[0, 2], [0, 2], [2, 2], [3, 3], [4, 6], [5, 8], [6, 14]]) + base_knots = SplineTransformer._get_base_knot_positions( + X=X, knots=knots, n_knots=n_knots, sample_weight=sample_weight + ) + assert_allclose(base_knots, expected_knots) + + +@pytest.mark.parametrize(["bias", "intercept"], [(True, False), (False, True)]) +def test_spline_transformer_periodic_linear_regression(bias, intercept): + """Test that B-splines fit a periodic curve pretty well.""" + + # "+ 3" to avoid the value 0 in assert_allclose + def f(x): + return np.sin(2 * np.pi * x) - np.sin(8 * np.pi * x) + 3 + + X = np.linspace(0, 1, 101)[:, None] + pipe = Pipeline( + steps=[ + ( + "spline", + SplineTransformer( + n_knots=20, + degree=3, + include_bias=bias, + extrapolation="periodic", + ), + ), + ("ols", LinearRegression(fit_intercept=intercept)), + ] + ) + pipe.fit(X, f(X[:, 0])) + + # Generate larger array to check periodic extrapolation + X_ = np.linspace(-1, 2, 301)[:, None] + predictions = pipe.predict(X_) + assert_allclose(predictions, f(X_[:, 0]), atol=0.01, rtol=0.01) + assert_allclose(predictions[0:100], predictions[100:200], rtol=1e-3) + + +def test_spline_transformer_periodic_spline_backport(): + """Test that the backport of extrapolate="periodic" works correctly""" + X = np.linspace(-2, 3.5, 10)[:, None] + degree = 2 + + # Use periodic extrapolation backport in SplineTransformer + transformer = SplineTransformer( + degree=degree, extrapolation="periodic", knots=[[-1.0], [0.0], [1.0]] + ) + Xt = transformer.fit_transform(X) + + # Use periodic extrapolation in BSpline + coef = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0]]) + spl = BSpline(np.arange(-3, 4), coef, degree, "periodic") + Xspl = spl(X[:, 0]) + assert_allclose(Xt, Xspl) + + +def test_spline_transformer_periodic_splines_periodicity(): + """Test if shifted 
knots result in the same transformation up to permutation.""" + X = np.linspace(0, 10, 101)[:, None] + + transformer_1 = SplineTransformer( + degree=3, + extrapolation="periodic", + knots=[[0.0], [1.0], [3.0], [4.0], [5.0], [8.0]], + ) + + transformer_2 = SplineTransformer( + degree=3, + extrapolation="periodic", + knots=[[1.0], [3.0], [4.0], [5.0], [8.0], [9.0]], + ) + + Xt_1 = transformer_1.fit_transform(X) + Xt_2 = transformer_2.fit_transform(X) + + assert_allclose(Xt_1, Xt_2[:, [4, 0, 1, 2, 3]]) + + +@pytest.mark.parametrize("degree", [3, 5]) +def test_spline_transformer_periodic_splines_smoothness(degree): + """Test that spline transformation is smooth at first / last knot.""" + X = np.linspace(-2, 10, 10_000)[:, None] + + transformer = SplineTransformer( + degree=degree, + extrapolation="periodic", + knots=[[0.0], [1.0], [3.0], [4.0], [5.0], [8.0]], + ) + Xt = transformer.fit_transform(X) + + delta = (X.max() - X.min()) / len(X) + tol = 10 * delta + + dXt = Xt + # We expect splines of degree `degree` to be (`degree`-1) times + # continuously differentiable. I.e. for d = 0, ..., `degree` - 1 the d-th + # derivative should be continuous. This is the case if the (d+1)-th + # numerical derivative is reasonably small (smaller than `tol` in absolute + # value). We thus compute d-th numeric derivatives for d = 1, ..., `degree` + # and compare them to `tol`. + # + # Note that the 0-th derivative is the function itself, such that we are + # also checking its continuity. + for d in range(1, degree + 1): + # Check continuity of the (d-1)-th derivative + diff = np.diff(dXt, axis=0) + assert np.abs(diff).max() < tol + # Compute d-th numeric derivative + dXt = diff / delta + + # As degree `degree` splines are not `degree` times continuously + # differentiable at the knots, the `degree + 1`-th numeric derivative + # should have spikes at the knots. 
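+    # after the loop dXt holds the degree-th numerical derivative; one more
+    # finite difference exposes the jumps at the knots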
+ diff = np.diff(dXt, axis=0) + assert np.abs(diff).max() > 1 + + +@pytest.mark.parametrize(["bias", "intercept"], [(True, False), (False, True)]) +@pytest.mark.parametrize("degree", [1, 2, 3, 4, 5]) +def test_spline_transformer_extrapolation(bias, intercept, degree): + """Test that B-spline extrapolation works correctly.""" + # we use a straight line for that + X = np.linspace(-1, 1, 100)[:, None] + y = X.squeeze() + + # 'constant' + pipe = Pipeline( + [ + [ + "spline", + SplineTransformer( + n_knots=4, + degree=degree, + include_bias=bias, + extrapolation="constant", + ), + ], + ["ols", LinearRegression(fit_intercept=intercept)], + ] + ) + pipe.fit(X, y) + assert_allclose(pipe.predict([[-10], [5]]), [-1, 1]) + + # 'linear' + pipe = Pipeline( + [ + [ + "spline", + SplineTransformer( + n_knots=4, + degree=degree, + include_bias=bias, + extrapolation="linear", + ), + ], + ["ols", LinearRegression(fit_intercept=intercept)], + ] + ) + pipe.fit(X, y) + assert_allclose(pipe.predict([[-10], [5]]), [-10, 5]) + + # 'error' + splt = SplineTransformer( + n_knots=4, degree=degree, include_bias=bias, extrapolation="error" + ) + splt.fit(X) + msg = "X contains values beyond the limits of the knots" + with pytest.raises(ValueError, match=msg): + splt.transform([[-10]]) + with pytest.raises(ValueError, match=msg): + splt.transform([[5]]) + + +def test_spline_transformer_kbindiscretizer(global_random_seed): + """Test that a B-spline of degree=0 is equivalent to KBinsDiscretizer.""" + rng = np.random.RandomState(global_random_seed) + X = rng.randn(200).reshape(200, 1) + n_bins = 5 + n_knots = n_bins + 1 + + splt = SplineTransformer( + n_knots=n_knots, degree=0, knots="quantile", include_bias=True + ) + splines = splt.fit_transform(X) + + kbd = KBinsDiscretizer( + n_bins=n_bins, + encode="onehot-dense", + strategy="quantile", + quantile_method="averaged_inverted_cdf", + ) + kbins = kbd.fit_transform(X) + + # Though they should be exactly equal, we test approximately with high + # accuracy. 
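+    # The equivalence holds because each degree-0 B-spline basis function is the
+    # indicator of one inter-knot interval, and with knots="quantile" the
+    # n_knots - 1 = n_bins intervals coincide with the quantile bin edges used by
+    # KBinsDiscretizer here, so both transforms yield the same one-hot indicators.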
+ assert_allclose(splines, kbins, rtol=1e-13) + + +@pytest.mark.parametrize("degree", range(1, 3)) +@pytest.mark.parametrize("knots", ["uniform", "quantile"]) +@pytest.mark.parametrize( + "extrapolation", ["error", "constant", "linear", "continue", "periodic"] +) +@pytest.mark.parametrize("include_bias", [False, True]) +def test_spline_transformer_sparse_output( + degree, knots, extrapolation, include_bias, global_random_seed +): + rng = np.random.RandomState(global_random_seed) + X = rng.randn(200).reshape(40, 5) + + splt_dense = SplineTransformer( + degree=degree, + knots=knots, + extrapolation=extrapolation, + include_bias=include_bias, + sparse_output=False, + ) + splt_sparse = SplineTransformer( + degree=degree, + knots=knots, + extrapolation=extrapolation, + include_bias=include_bias, + sparse_output=True, + ) + + splt_dense.fit(X) + splt_sparse.fit(X) + + X_trans_sparse = splt_sparse.transform(X) + X_trans_dense = splt_dense.transform(X) + assert sparse.issparse(X_trans_sparse) and X_trans_sparse.format == "csr" + assert_allclose(X_trans_dense, X_trans_sparse.toarray()) + + # extrapolation regime + X_min = np.amin(X, axis=0) + X_max = np.amax(X, axis=0) + X_extra = np.r_[ + np.linspace(X_min - 5, X_min, 10), np.linspace(X_max, X_max + 5, 10) + ] + if extrapolation == "error": + msg = "X contains values beyond the limits of the knots" + with pytest.raises(ValueError, match=msg): + splt_dense.transform(X_extra) + msg = "Out of bounds" + with pytest.raises(ValueError, match=msg): + splt_sparse.transform(X_extra) + else: + assert_allclose( + splt_dense.transform(X_extra), splt_sparse.transform(X_extra).toarray() + ) + + +@pytest.mark.parametrize("n_knots", [5, 10]) +@pytest.mark.parametrize("include_bias", [True, False]) +@pytest.mark.parametrize("degree", [3, 4]) +@pytest.mark.parametrize( + "extrapolation", ["error", "constant", "linear", "continue", "periodic"] +) +@pytest.mark.parametrize("sparse_output", [False, True]) +def test_spline_transformer_n_features_out( + n_knots, include_bias, degree, extrapolation, sparse_output +): + """Test that transform results in n_features_out_ features.""" + splt = SplineTransformer( + n_knots=n_knots, + degree=degree, + include_bias=include_bias, + extrapolation=extrapolation, + sparse_output=sparse_output, + ) + X = np.linspace(0, 1, 10)[:, None] + splt.fit(X) + + assert splt.transform(X).shape[1] == splt.n_features_out_ + + +@pytest.mark.parametrize( + "params, err_msg", + [ + ({"degree": (-1, 2)}, r"degree=\(min_degree, max_degree\) must"), + ({"degree": (0, 1.5)}, r"degree=\(min_degree, max_degree\) must"), + ({"degree": (3, 2)}, r"degree=\(min_degree, max_degree\) must"), + ({"degree": (1, 2, 3)}, r"int or tuple \(min_degree, max_degree\)"), + ], +) +def test_polynomial_features_input_validation(params, err_msg): + """Test that we raise errors for invalid input in PolynomialFeatures.""" + X = [[1], [2]] + + with pytest.raises(ValueError, match=err_msg): + PolynomialFeatures(**params).fit(X) + + +@pytest.fixture() +def single_feature_degree3(): + X = np.arange(6)[:, np.newaxis] + P = np.hstack([np.ones_like(X), X, X**2, X**3]) + return X, P + + +@pytest.mark.parametrize( + "degree, include_bias, interaction_only, indices", + [ + (3, True, False, slice(None, None)), + (3, False, False, slice(1, None)), + (3, True, True, [0, 1]), + (3, False, True, [1]), + ((2, 3), True, False, [0, 2, 3]), + ((2, 3), False, False, [2, 3]), + ((2, 3), True, True, [0]), + ((2, 3), False, True, []), + ], +) +@pytest.mark.parametrize("X_container", [None] + 
CSR_CONTAINERS + CSC_CONTAINERS) +def test_polynomial_features_one_feature( + single_feature_degree3, + degree, + include_bias, + interaction_only, + indices, + X_container, +): + """Test PolynomialFeatures on single feature up to degree 3.""" + X, P = single_feature_degree3 + if X_container is not None: + X = X_container(X) + tf = PolynomialFeatures( + degree=degree, include_bias=include_bias, interaction_only=interaction_only + ).fit(X) + out = tf.transform(X) + if X_container is not None: + out = out.toarray() + assert_allclose(out, P[:, indices]) + if tf.n_output_features_ > 0: + assert tf.powers_.shape == (tf.n_output_features_, tf.n_features_in_) + + +@pytest.fixture() +def two_features_degree3(): + X = np.arange(6).reshape((3, 2)) + x1 = X[:, :1] + x2 = X[:, 1:] + P = np.hstack( + [ + x1**0 * x2**0, # 0 + x1**1 * x2**0, # 1 + x1**0 * x2**1, # 2 + x1**2 * x2**0, # 3 + x1**1 * x2**1, # 4 + x1**0 * x2**2, # 5 + x1**3 * x2**0, # 6 + x1**2 * x2**1, # 7 + x1**1 * x2**2, # 8 + x1**0 * x2**3, # 9 + ] + ) + return X, P + + +@pytest.mark.parametrize( + "degree, include_bias, interaction_only, indices", + [ + (2, True, False, slice(0, 6)), + (2, False, False, slice(1, 6)), + (2, True, True, [0, 1, 2, 4]), + (2, False, True, [1, 2, 4]), + ((2, 2), True, False, [0, 3, 4, 5]), + ((2, 2), False, False, [3, 4, 5]), + ((2, 2), True, True, [0, 4]), + ((2, 2), False, True, [4]), + (3, True, False, slice(None, None)), + (3, False, False, slice(1, None)), + (3, True, True, [0, 1, 2, 4]), + (3, False, True, [1, 2, 4]), + ((2, 3), True, False, [0, 3, 4, 5, 6, 7, 8, 9]), + ((2, 3), False, False, slice(3, None)), + ((2, 3), True, True, [0, 4]), + ((2, 3), False, True, [4]), + ((3, 3), True, False, [0, 6, 7, 8, 9]), + ((3, 3), False, False, [6, 7, 8, 9]), + ((3, 3), True, True, [0]), + ((3, 3), False, True, []), # would need 3 input features + ], +) +@pytest.mark.parametrize("X_container", [None] + CSR_CONTAINERS + CSC_CONTAINERS) +def test_polynomial_features_two_features( + two_features_degree3, + degree, + include_bias, + interaction_only, + indices, + X_container, +): + """Test PolynomialFeatures on 2 features up to degree 3.""" + X, P = two_features_degree3 + if X_container is not None: + X = X_container(X) + tf = PolynomialFeatures( + degree=degree, include_bias=include_bias, interaction_only=interaction_only + ).fit(X) + out = tf.transform(X) + if X_container is not None: + out = out.toarray() + assert_allclose(out, P[:, indices]) + if tf.n_output_features_ > 0: + assert tf.powers_.shape == (tf.n_output_features_, tf.n_features_in_) + + +def test_polynomial_feature_names(): + X = np.arange(30).reshape(10, 3) + poly = PolynomialFeatures(degree=2, include_bias=True).fit(X) + feature_names = poly.get_feature_names_out() + assert_array_equal( + ["1", "x0", "x1", "x2", "x0^2", "x0 x1", "x0 x2", "x1^2", "x1 x2", "x2^2"], + feature_names, + ) + assert len(feature_names) == poly.transform(X).shape[1] + + poly = PolynomialFeatures(degree=3, include_bias=False).fit(X) + feature_names = poly.get_feature_names_out(["a", "b", "c"]) + assert_array_equal( + [ + "a", + "b", + "c", + "a^2", + "a b", + "a c", + "b^2", + "b c", + "c^2", + "a^3", + "a^2 b", + "a^2 c", + "a b^2", + "a b c", + "a c^2", + "b^3", + "b^2 c", + "b c^2", + "c^3", + ], + feature_names, + ) + assert len(feature_names) == poly.transform(X).shape[1] + + poly = PolynomialFeatures(degree=(2, 3), include_bias=False).fit(X) + feature_names = poly.get_feature_names_out(["a", "b", "c"]) + assert_array_equal( + [ + "a^2", + "a b", + "a c", + "b^2", + "b 
c", + "c^2", + "a^3", + "a^2 b", + "a^2 c", + "a b^2", + "a b c", + "a c^2", + "b^3", + "b^2 c", + "b c^2", + "c^3", + ], + feature_names, + ) + assert len(feature_names) == poly.transform(X).shape[1] + + poly = PolynomialFeatures( + degree=(3, 3), include_bias=True, interaction_only=True + ).fit(X) + feature_names = poly.get_feature_names_out(["a", "b", "c"]) + assert_array_equal(["1", "a b c"], feature_names) + assert len(feature_names) == poly.transform(X).shape[1] + + # test some unicode + poly = PolynomialFeatures(degree=1, include_bias=True).fit(X) + feature_names = poly.get_feature_names_out(["\u0001F40D", "\u262e", "\u05d0"]) + assert_array_equal(["1", "\u0001F40D", "\u262e", "\u05d0"], feature_names) + + +@pytest.mark.parametrize( + ["deg", "include_bias", "interaction_only", "dtype"], + [ + (1, True, False, int), + (2, True, False, int), + (2, True, False, np.float32), + (2, True, False, np.float64), + (3, False, False, np.float64), + (3, False, True, np.float64), + (4, False, False, np.float64), + (4, False, True, np.float64), + ], +) +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_polynomial_features_csc_X( + deg, include_bias, interaction_only, dtype, csc_container, global_random_seed +): + rng = np.random.RandomState(global_random_seed) + X = rng.randint(0, 2, (100, 2)) + X_csc = csc_container(X) + + est = PolynomialFeatures( + deg, include_bias=include_bias, interaction_only=interaction_only + ) + Xt_csc = est.fit_transform(X_csc.astype(dtype)) + Xt_dense = est.fit_transform(X.astype(dtype)) + + assert sparse.issparse(Xt_csc) and Xt_csc.format == "csc" + assert Xt_csc.dtype == Xt_dense.dtype + assert_array_almost_equal(Xt_csc.toarray(), Xt_dense) + + +@pytest.mark.parametrize( + ["deg", "include_bias", "interaction_only", "dtype"], + [ + (1, True, False, int), + (2, True, False, int), + (2, True, False, np.float32), + (2, True, False, np.float64), + (3, False, False, np.float64), + (3, False, True, np.float64), + ], +) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_polynomial_features_csr_X( + deg, include_bias, interaction_only, dtype, csr_container, global_random_seed +): + rng = np.random.RandomState(global_random_seed) + X = rng.randint(0, 2, (100, 2)) + X_csr = csr_container(X) + + est = PolynomialFeatures( + deg, include_bias=include_bias, interaction_only=interaction_only + ) + Xt_csr = est.fit_transform(X_csr.astype(dtype)) + Xt_dense = est.fit_transform(X.astype(dtype, copy=False)) + + assert sparse.issparse(Xt_csr) and Xt_csr.format == "csr" + assert Xt_csr.dtype == Xt_dense.dtype + assert_array_almost_equal(Xt_csr.toarray(), Xt_dense) + + +@pytest.mark.parametrize("n_features", [1, 4, 5]) +@pytest.mark.parametrize( + "min_degree, max_degree", [(0, 1), (0, 2), (1, 3), (0, 4), (3, 4)] +) +@pytest.mark.parametrize("interaction_only", [True, False]) +@pytest.mark.parametrize("include_bias", [True, False]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_num_combinations( + n_features, min_degree, max_degree, interaction_only, include_bias, csr_container +): + """ + Test that n_output_features_ is calculated correctly. 
+ """ + x = csr_container(([1], ([0], [n_features - 1]))) + est = PolynomialFeatures( + degree=max_degree, + interaction_only=interaction_only, + include_bias=include_bias, + ) + est.fit(x) + num_combos = est.n_output_features_ + + combos = PolynomialFeatures._combinations( + n_features=n_features, + min_degree=0, + max_degree=max_degree, + interaction_only=interaction_only, + include_bias=include_bias, + ) + assert num_combos == sum([1 for _ in combos]) + + +@pytest.mark.parametrize( + ["deg", "include_bias", "interaction_only", "dtype"], + [ + (2, True, False, np.float32), + (2, True, False, np.float64), + (3, False, False, np.float64), + (3, False, True, np.float64), + ], +) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_polynomial_features_csr_X_floats( + deg, include_bias, interaction_only, dtype, csr_container, global_random_seed +): + X_csr = csr_container(sparse_random(1000, 10, 0.5, random_state=global_random_seed)) + X = X_csr.toarray() + + est = PolynomialFeatures( + deg, include_bias=include_bias, interaction_only=interaction_only + ) + Xt_csr = est.fit_transform(X_csr.astype(dtype)) + Xt_dense = est.fit_transform(X.astype(dtype)) + + assert sparse.issparse(Xt_csr) and Xt_csr.format == "csr" + assert Xt_csr.dtype == Xt_dense.dtype + assert_array_almost_equal(Xt_csr.toarray(), Xt_dense) + + +@pytest.mark.parametrize( + ["zero_row_index", "deg", "interaction_only"], + [ + (0, 2, True), + (1, 2, True), + (2, 2, True), + (0, 3, True), + (1, 3, True), + (2, 3, True), + (0, 2, False), + (1, 2, False), + (2, 2, False), + (0, 3, False), + (1, 3, False), + (2, 3, False), + ], +) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_polynomial_features_csr_X_zero_row( + zero_row_index, deg, interaction_only, csr_container, global_random_seed +): + X_csr = csr_container(sparse_random(3, 10, 1.0, random_state=global_random_seed)) + X_csr[zero_row_index, :] = 0.0 + X = X_csr.toarray() + + est = PolynomialFeatures(deg, include_bias=False, interaction_only=interaction_only) + Xt_csr = est.fit_transform(X_csr) + Xt_dense = est.fit_transform(X) + + assert sparse.issparse(Xt_csr) and Xt_csr.format == "csr" + assert Xt_csr.dtype == Xt_dense.dtype + assert_array_almost_equal(Xt_csr.toarray(), Xt_dense) + + +# This degree should always be one more than the highest degree supported by +# _csr_expansion. 
+@pytest.mark.parametrize( + ["include_bias", "interaction_only"], + [(True, True), (True, False), (False, True), (False, False)], +) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_polynomial_features_csr_X_degree_4( + include_bias, interaction_only, csr_container, global_random_seed +): + X_csr = csr_container(sparse_random(1000, 10, 0.5, random_state=global_random_seed)) + X = X_csr.toarray() + + est = PolynomialFeatures( + 4, include_bias=include_bias, interaction_only=interaction_only + ) + Xt_csr = est.fit_transform(X_csr) + Xt_dense = est.fit_transform(X) + + assert sparse.issparse(Xt_csr) and Xt_csr.format == "csr" + assert Xt_csr.dtype == Xt_dense.dtype + assert_array_almost_equal(Xt_csr.toarray(), Xt_dense) + + +@pytest.mark.parametrize( + ["deg", "dim", "interaction_only"], + [ + (2, 1, True), + (2, 2, True), + (3, 1, True), + (3, 2, True), + (3, 3, True), + (2, 1, False), + (2, 2, False), + (3, 1, False), + (3, 2, False), + (3, 3, False), + ], +) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_polynomial_features_csr_X_dim_edges( + deg, dim, interaction_only, csr_container, global_random_seed +): + X_csr = csr_container( + sparse_random(1000, dim, 0.5, random_state=global_random_seed) + ) + X = X_csr.toarray() + + est = PolynomialFeatures(deg, interaction_only=interaction_only) + Xt_csr = est.fit_transform(X_csr) + Xt_dense = est.fit_transform(X) + + assert sparse.issparse(Xt_csr) and Xt_csr.format == "csr" + assert Xt_csr.dtype == Xt_dense.dtype + assert_array_almost_equal(Xt_csr.toarray(), Xt_dense) + + +@pytest.mark.parametrize("interaction_only", [True, False]) +@pytest.mark.parametrize("include_bias", [True, False]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_csr_polynomial_expansion_index_overflow_non_regression( + interaction_only, include_bias, csr_container +): + """Check the automatic index dtype promotion to `np.int64` when needed. + + This ensures that sufficiently large input configurations get + properly promoted to use `np.int64` for index and indptr representation + while preserving data integrity. Non-regression test for gh-16803. + + Note that this is only possible for Python runtimes with a 64 bit address + space. On 32 bit platforms, a `ValueError` is raised instead. + """ + + def degree_2_calc(d, i, j): + if interaction_only: + return d * i - (i**2 + 3 * i) // 2 - 1 + j + else: + return d * i - (i**2 + i) // 2 + j + + n_samples = 13 + n_features = 120001 + data_dtype = np.float32 + data = np.arange(1, 5, dtype=np.int64) + row = np.array([n_samples - 2, n_samples - 2, n_samples - 1, n_samples - 1]) + # An int64 dtype is required to avoid overflow error on Windows within the + # `degree_2_calc` function. + col = np.array( + [n_features - 2, n_features - 1, n_features - 2, n_features - 1], dtype=np.int64 + ) + X = csr_container( + (data, (row, col)), + shape=(n_samples, n_features), + dtype=data_dtype, + ) + pf = PolynomialFeatures( + interaction_only=interaction_only, include_bias=include_bias, degree=2 + ) + + # Calculate the number of combinations a-priori, and if needed check for + # the correct ValueError and terminate the test early. 
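+    # With degree=2 and n_features=120001 the full expansion has roughly
+    # comb(n_features + 2, 2) ~= 7.2e9 output columns (slightly fewer with
+    # interaction_only or without the bias column). This exceeds the int32 index
+    # range, which is what forces the promotion to int64, and it also exceeds
+    # np.intp on 32-bit platforms, where the ValueError branch below is taken.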
+ num_combinations = pf._num_combinations( + n_features=n_features, + min_degree=0, + max_degree=2, + interaction_only=pf.interaction_only, + include_bias=pf.include_bias, + ) + if num_combinations > np.iinfo(np.intp).max: + msg = ( + r"The output that would result from the current configuration would have" + r" \d* features which is too large to be indexed" + ) + with pytest.raises(ValueError, match=msg): + pf.fit(X) + return + X_trans = pf.fit_transform(X) + row_nonzero, col_nonzero = X_trans.nonzero() + n_degree_1_features_out = n_features + include_bias + max_degree_2_idx = ( + degree_2_calc(n_features, col[int(not interaction_only)], col[1]) + + n_degree_1_features_out + ) + + # Account for bias of all samples except last one which will be handled + # separately since there are distinct data values before it + data_target = [1] * (n_samples - 2) if include_bias else [] + col_nonzero_target = [0] * (n_samples - 2) if include_bias else [] + + for i in range(2): + x = data[2 * i] + y = data[2 * i + 1] + x_idx = col[2 * i] + y_idx = col[2 * i + 1] + if include_bias: + data_target.append(1) + col_nonzero_target.append(0) + data_target.extend([x, y]) + col_nonzero_target.extend( + [x_idx + int(include_bias), y_idx + int(include_bias)] + ) + if not interaction_only: + data_target.extend([x * x, x * y, y * y]) + col_nonzero_target.extend( + [ + degree_2_calc(n_features, x_idx, x_idx) + n_degree_1_features_out, + degree_2_calc(n_features, x_idx, y_idx) + n_degree_1_features_out, + degree_2_calc(n_features, y_idx, y_idx) + n_degree_1_features_out, + ] + ) + else: + data_target.extend([x * y]) + col_nonzero_target.append( + degree_2_calc(n_features, x_idx, y_idx) + n_degree_1_features_out + ) + + nnz_per_row = int(include_bias) + 3 + 2 * int(not interaction_only) + + assert pf.n_output_features_ == max_degree_2_idx + 1 + assert X_trans.dtype == data_dtype + assert X_trans.shape == (n_samples, max_degree_2_idx + 1) + assert X_trans.indptr.dtype == X_trans.indices.dtype == np.int64 + # Ensure that dtype promotion was actually required: + assert X_trans.indices.max() > np.iinfo(np.int32).max + + row_nonzero_target = list(range(n_samples - 2)) if include_bias else [] + row_nonzero_target.extend( + [n_samples - 2] * nnz_per_row + [n_samples - 1] * nnz_per_row + ) + + assert_allclose(X_trans.data, data_target) + assert_array_equal(row_nonzero, row_nonzero_target) + assert_array_equal(col_nonzero, col_nonzero_target) + + +@pytest.mark.parametrize( + "degree, n_features", + [ + # Needs promotion to int64 when interaction_only=False + (2, 65535), + (3, 2344), + # This guarantees that the intermediate operation when calculating + # output columns would overflow a C-long, hence checks that python- + # longs are being used. + (2, int(np.sqrt(np.iinfo(np.int64).max) + 1)), + (3, 65535), + # This case tests the second clause of the overflow check which + # takes into account the value of `n_features` itself. + (2, int(np.sqrt(np.iinfo(np.int64).max))), + ], +) +@pytest.mark.parametrize("interaction_only", [True, False]) +@pytest.mark.parametrize("include_bias", [True, False]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_csr_polynomial_expansion_index_overflow( + degree, n_features, interaction_only, include_bias, csr_container +): + """Tests known edge-cases to the dtype promotion strategy and custom + Cython code, including a current bug in the upstream + `scipy.sparse.hstack`. 
+ """ + data = [1.0] + # Use int32 indices as much as we can + indices_dtype = np.int32 if n_features - 1 <= np.iinfo(np.int32).max else np.int64 + row = np.array([0], dtype=indices_dtype) + col = np.array([n_features - 1], dtype=indices_dtype) + + # First degree index + expected_indices = [ + n_features - 1 + int(include_bias), + ] + # Second degree index + expected_indices.append(n_features * (n_features + 1) // 2 + expected_indices[0]) + # Third degree index + expected_indices.append( + n_features * (n_features + 1) * (n_features + 2) // 6 + expected_indices[1] + ) + + X = csr_container((data, (row, col))) + pf = PolynomialFeatures( + interaction_only=interaction_only, include_bias=include_bias, degree=degree + ) + + # Calculate the number of combinations a-priori, and if needed check for + # the correct ValueError and terminate the test early. + num_combinations = pf._num_combinations( + n_features=n_features, + min_degree=0, + max_degree=degree, + interaction_only=pf.interaction_only, + include_bias=pf.include_bias, + ) + if num_combinations > np.iinfo(np.intp).max: + msg = ( + r"The output that would result from the current configuration would have" + r" \d* features which is too large to be indexed" + ) + with pytest.raises(ValueError, match=msg): + pf.fit(X) + return + + # When `n_features>=65535`, `scipy.sparse.hstack` may not use the right + # dtype for representing indices and indptr if `n_features` is still + # small enough so that each block matrix's indices and indptr arrays + # can be represented with `np.int32`. We test `n_features==65535` + # since it is guaranteed to run into this bug. + if ( + sp_version < parse_version("1.9.2") + and n_features == 65535 + and degree == 2 + and not interaction_only + ): # pragma: no cover + msg = r"In scipy versions `<1.9.2`, the function `scipy.sparse.hstack`" + with pytest.raises(ValueError, match=msg): + X_trans = pf.fit_transform(X) + return + X_trans = pf.fit_transform(X) + + expected_dtype = np.int64 if num_combinations > np.iinfo(np.int32).max else np.int32 + # Terms higher than first degree + non_bias_terms = 1 + (degree - 1) * int(not interaction_only) + expected_nnz = int(include_bias) + non_bias_terms + assert X_trans.dtype == X.dtype + assert X_trans.shape == (1, pf.n_output_features_) + assert X_trans.indptr.dtype == X_trans.indices.dtype == expected_dtype + assert X_trans.nnz == expected_nnz + + if include_bias: + assert X_trans[0, 0] == pytest.approx(1.0) + for idx in range(non_bias_terms): + assert X_trans[0, expected_indices[idx]] == pytest.approx(1.0) + + offset = interaction_only * n_features + if degree == 3: + offset *= 1 + n_features + assert pf.n_output_features_ == expected_indices[degree - 1] + 1 - offset + + +@pytest.mark.parametrize("interaction_only", [True, False]) +@pytest.mark.parametrize("include_bias", [True, False]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_csr_polynomial_expansion_too_large_to_index( + interaction_only, include_bias, csr_container +): + n_features = np.iinfo(np.int64).max // 2 + data = [1.0] + row = [0] + col = [n_features - 1] + X = csr_container((data, (row, col))) + pf = PolynomialFeatures( + interaction_only=interaction_only, include_bias=include_bias, degree=(2, 2) + ) + msg = ( + r"The output that would result from the current configuration would have \d*" + r" features which is too large to be indexed" + ) + with pytest.raises(ValueError, match=msg): + pf.fit(X) + with pytest.raises(ValueError, match=msg): + pf.fit_transform(X) + + 
+@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS) +def test_polynomial_features_behaviour_on_zero_degree(sparse_container): + """Check that PolynomialFeatures raises error when degree=0 and include_bias=False, + and output a single constant column when include_bias=True + """ + X = np.ones((10, 2)) + poly = PolynomialFeatures(degree=0, include_bias=False) + err_msg = ( + "Setting degree to zero and include_bias to False would result in" + " an empty output array." + ) + with pytest.raises(ValueError, match=err_msg): + poly.fit_transform(X) + + poly = PolynomialFeatures(degree=(0, 0), include_bias=False) + err_msg = ( + "Setting both min_degree and max_degree to zero and include_bias to" + " False would result in an empty output array." + ) + with pytest.raises(ValueError, match=err_msg): + poly.fit_transform(X) + + for _X in [X, sparse_container(X)]: + poly = PolynomialFeatures(degree=0, include_bias=True) + output = poly.fit_transform(_X) + # convert to dense array if needed + if sparse.issparse(output): + output = output.toarray() + assert_array_equal(output, np.ones((X.shape[0], 1))) + + +def test_sizeof_LARGEST_INT_t(): + # On Windows, scikit-learn is typically compiled with MSVC that + # does not support int128 arithmetic (at the time of writing): + # https://stackoverflow.com/a/6761962/163740 + if sys.platform == "win32" or ( + sys.maxsize <= 2**32 and sys.platform != "emscripten" + ): + expected_size = 8 + else: + expected_size = 16 + + assert _get_sizeof_LARGEST_INT_t() == expected_size + + +@pytest.mark.xfail( + sys.platform == "win32", + reason=( + "On Windows, scikit-learn is typically compiled with MSVC that does not support" + " int128 arithmetic (at the time of writing)" + ), + run=True, +) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_csr_polynomial_expansion_windows_fail(csr_container): + # Minimum needed to ensure integer overflow occurs while guaranteeing an + # int64-indexable output. 
+ n_features = int(np.iinfo(np.int64).max ** (1 / 3) + 3) + data = [1.0] + row = [0] + col = [n_features - 1] + + # First degree index + expected_indices = [ + n_features - 1, + ] + # Second degree index + expected_indices.append( + int(n_features * (n_features + 1) // 2 + expected_indices[0]) + ) + # Third degree index + expected_indices.append( + int(n_features * (n_features + 1) * (n_features + 2) // 6 + expected_indices[1]) + ) + + X = csr_container((data, (row, col))) + pf = PolynomialFeatures(interaction_only=False, include_bias=False, degree=3) + if sys.maxsize <= 2**32: + msg = ( + r"The output that would result from the current configuration would" + r" have \d*" + r" features which is too large to be indexed" + ) + with pytest.raises(ValueError, match=msg): + pf.fit_transform(X) + else: + X_trans = pf.fit_transform(X) + for idx in range(3): + assert X_trans[0, expected_indices[idx]] == pytest.approx(1.0) diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_target_encoder.py b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_target_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..536f2e031bf771dab7d73b7f4d5447b155c53ec3 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_target_encoder.py @@ -0,0 +1,714 @@ +import re + +import numpy as np +import pytest +from numpy.testing import assert_allclose, assert_array_equal + +from sklearn.ensemble import RandomForestRegressor +from sklearn.linear_model import Ridge +from sklearn.model_selection import ( + KFold, + ShuffleSplit, + StratifiedKFold, + cross_val_score, + train_test_split, +) +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import ( + KBinsDiscretizer, + LabelBinarizer, + LabelEncoder, + TargetEncoder, +) + + +def _encode_target(X_ordinal, y_numeric, n_categories, smooth): + """Simple Python implementation of target encoding.""" + cur_encodings = np.zeros(n_categories, dtype=np.float64) + y_mean = np.mean(y_numeric) + + if smooth == "auto": + y_variance = np.var(y_numeric) + for c in range(n_categories): + y_subset = y_numeric[X_ordinal == c] + n_i = y_subset.shape[0] + + if n_i == 0: + cur_encodings[c] = y_mean + continue + + y_subset_variance = np.var(y_subset) + m = y_subset_variance / y_variance + lambda_ = n_i / (n_i + m) + + cur_encodings[c] = lambda_ * np.mean(y_subset) + (1 - lambda_) * y_mean + return cur_encodings + else: # float + for c in range(n_categories): + y_subset = y_numeric[X_ordinal == c] + current_sum = np.sum(y_subset) + y_mean * smooth + current_cnt = y_subset.shape[0] + smooth + cur_encodings[c] = current_sum / current_cnt + return cur_encodings + + +@pytest.mark.parametrize( + "categories, unknown_value", + [ + ([np.array([0, 1, 2], dtype=np.int64)], 4), + ([np.array([1.0, 3.0, np.nan], dtype=np.float64)], 6.0), + ([np.array(["cat", "dog", "snake"], dtype=object)], "bear"), + ("auto", 3), + ], +) +@pytest.mark.parametrize("smooth", [5.0, "auto"]) +@pytest.mark.parametrize("target_type", ["binary", "continuous"]) +def test_encoding(categories, unknown_value, global_random_seed, smooth, target_type): + """Check encoding for binary and continuous targets. + + Compare the values returned by `TargetEncoder.fit_transform` against the + expected encodings for cv splits from a naive reference Python + implementation in _encode_target. 
+ """ + + n_categories = 3 + X_train_int_array = np.array([[0] * 20 + [1] * 30 + [2] * 40], dtype=np.int64).T + X_test_int_array = np.array([[0, 1, 2]], dtype=np.int64).T + n_samples = X_train_int_array.shape[0] + + if categories == "auto": + X_train = X_train_int_array + X_test = X_test_int_array + else: + X_train = categories[0][X_train_int_array] + X_test = categories[0][X_test_int_array] + + X_test = np.concatenate((X_test, [[unknown_value]])) + + data_rng = np.random.RandomState(global_random_seed) + n_splits = 3 + if target_type == "binary": + y_numeric = data_rng.randint(low=0, high=2, size=n_samples) + target_names = np.array(["cat", "dog"], dtype=object) + y_train = target_names[y_numeric] + + else: + assert target_type == "continuous" + y_numeric = data_rng.uniform(low=-10, high=20, size=n_samples) + y_train = y_numeric + + shuffled_idx = data_rng.permutation(n_samples) + X_train_int_array = X_train_int_array[shuffled_idx] + X_train = X_train[shuffled_idx] + y_train = y_train[shuffled_idx] + y_numeric = y_numeric[shuffled_idx] + + # Define our CV splitting strategy + if target_type == "binary": + cv = StratifiedKFold( + n_splits=n_splits, random_state=global_random_seed, shuffle=True + ) + else: + cv = KFold(n_splits=n_splits, random_state=global_random_seed, shuffle=True) + + # Compute the expected values using our reference Python implementation of + # target encoding: + expected_X_fit_transform = np.empty_like(X_train_int_array, dtype=np.float64) + + for train_idx, test_idx in cv.split(X_train_int_array, y_train): + X_, y_ = X_train_int_array[train_idx, 0], y_numeric[train_idx] + cur_encodings = _encode_target(X_, y_, n_categories, smooth) + expected_X_fit_transform[test_idx, 0] = cur_encodings[ + X_train_int_array[test_idx, 0] + ] + + # Check that we can obtain the same encodings by calling `fit_transform` on + # the estimator with the same CV parameters: + target_encoder = TargetEncoder( + smooth=smooth, + categories=categories, + cv=n_splits, + random_state=global_random_seed, + ) + + X_fit_transform = target_encoder.fit_transform(X_train, y_train) + + assert target_encoder.target_type_ == target_type + assert_allclose(X_fit_transform, expected_X_fit_transform) + assert len(target_encoder.encodings_) == 1 + if target_type == "binary": + assert_array_equal(target_encoder.classes_, target_names) + else: + assert target_encoder.classes_ is None + + # compute encodings for all data to validate `transform` + y_mean = np.mean(y_numeric) + expected_encodings = _encode_target( + X_train_int_array[:, 0], y_numeric, n_categories, smooth + ) + assert_allclose(target_encoder.encodings_[0], expected_encodings) + assert target_encoder.target_mean_ == pytest.approx(y_mean) + + # Transform on test data, the last value is unknown so it is encoded as the target + # mean + expected_X_test_transform = np.concatenate( + (expected_encodings, np.array([y_mean])) + ).reshape(-1, 1) + + X_test_transform = target_encoder.transform(X_test) + assert_allclose(X_test_transform, expected_X_test_transform) + + +@pytest.mark.parametrize( + "categories, unknown_values", + [ + ([np.array([0, 1, 2], dtype=np.int64)], "auto"), + ([np.array(["cat", "dog", "snake"], dtype=object)], ["bear", "rabbit"]), + ], +) +@pytest.mark.parametrize( + "target_labels", [np.array([1, 2, 3]), np.array(["a", "b", "c"])] +) +@pytest.mark.parametrize("smooth", [5.0, "auto"]) +def test_encoding_multiclass( + global_random_seed, categories, unknown_values, target_labels, smooth +): + """Check encoding for multiclass targets.""" + 
rng = np.random.RandomState(global_random_seed) + + n_samples = 80 + n_features = 2 + feat_1_int = np.array(rng.randint(low=0, high=2, size=n_samples)) + feat_2_int = np.array(rng.randint(low=0, high=3, size=n_samples)) + feat_1 = categories[0][feat_1_int] + feat_2 = categories[0][feat_2_int] + X_train = np.column_stack((feat_1, feat_2)) + X_train_int = np.column_stack((feat_1_int, feat_2_int)) + categories_ = [[0, 1], [0, 1, 2]] + + n_classes = 3 + y_train_int = np.array(rng.randint(low=0, high=n_classes, size=n_samples)) + y_train = target_labels[y_train_int] + y_train_enc = LabelBinarizer().fit_transform(y_train) + + n_splits = 3 + cv = StratifiedKFold( + n_splits=n_splits, random_state=global_random_seed, shuffle=True + ) + + # Manually compute encodings for cv splits to validate `fit_transform` + expected_X_fit_transform = np.empty( + (X_train_int.shape[0], X_train_int.shape[1] * n_classes), + dtype=np.float64, + ) + for f_idx, cats in enumerate(categories_): + for c_idx in range(n_classes): + for train_idx, test_idx in cv.split(X_train, y_train): + y_class = y_train_enc[:, c_idx] + X_, y_ = X_train_int[train_idx, f_idx], y_class[train_idx] + current_encoding = _encode_target(X_, y_, len(cats), smooth) + # f_idx: 0, 0, 0, 1, 1, 1 + # c_idx: 0, 1, 2, 0, 1, 2 + # exp_idx: 0, 1, 2, 3, 4, 5 + exp_idx = c_idx + (f_idx * n_classes) + expected_X_fit_transform[test_idx, exp_idx] = current_encoding[ + X_train_int[test_idx, f_idx] + ] + + target_encoder = TargetEncoder( + smooth=smooth, + cv=n_splits, + random_state=global_random_seed, + ) + X_fit_transform = target_encoder.fit_transform(X_train, y_train) + + assert target_encoder.target_type_ == "multiclass" + assert_allclose(X_fit_transform, expected_X_fit_transform) + + # Manually compute encoding to validate `transform` + expected_encodings = [] + for f_idx, cats in enumerate(categories_): + for c_idx in range(n_classes): + y_class = y_train_enc[:, c_idx] + current_encoding = _encode_target( + X_train_int[:, f_idx], y_class, len(cats), smooth + ) + expected_encodings.append(current_encoding) + + assert len(target_encoder.encodings_) == n_features * n_classes + for i in range(n_features * n_classes): + assert_allclose(target_encoder.encodings_[i], expected_encodings[i]) + assert_array_equal(target_encoder.classes_, target_labels) + + # Include unknown values at the end + X_test_int = np.array([[0, 1], [1, 2], [4, 5]]) + if unknown_values == "auto": + X_test = X_test_int + else: + X_test = np.empty_like(X_test_int[:-1, :], dtype=object) + for column_idx in range(X_test_int.shape[1]): + X_test[:, column_idx] = categories[0][X_test_int[:-1, column_idx]] + # Add unknown values at end + X_test = np.vstack((X_test, unknown_values)) + + y_mean = np.mean(y_train_enc, axis=0) + expected_X_test_transform = np.empty( + (X_test_int.shape[0], X_test_int.shape[1] * n_classes), + dtype=np.float64, + ) + n_rows = X_test_int.shape[0] + f_idx = [0, 0, 0, 1, 1, 1] + # Last row are unknowns, dealt with later + for row_idx in range(n_rows - 1): + for i, enc in enumerate(expected_encodings): + expected_X_test_transform[row_idx, i] = enc[X_test_int[row_idx, f_idx[i]]] + + # Unknowns encoded as target mean for each class + # `y_mean` contains target mean for each class, thus cycle through mean of + # each class, `n_features` times + mean_idx = [0, 1, 2, 0, 1, 2] + for i in range(n_classes * n_features): + expected_X_test_transform[n_rows - 1, i] = y_mean[mean_idx[i]] + + X_test_transform = target_encoder.transform(X_test) + assert_allclose(X_test_transform, 
expected_X_test_transform) + + +@pytest.mark.parametrize( + "X, categories", + [ + ( + np.array([[0] * 10 + [1] * 10 + [3]], dtype=np.int64).T, # 3 is unknown + [[0, 1, 2]], + ), + ( + np.array( + [["cat"] * 10 + ["dog"] * 10 + ["snake"]], dtype=object + ).T, # snake is unknown + [["dog", "cat", "cow"]], + ), + ], +) +@pytest.mark.parametrize("smooth", [4.0, "auto"]) +def test_custom_categories(X, categories, smooth): + """Custom categories with unknown categories that are not in training data.""" + rng = np.random.RandomState(0) + y = rng.uniform(low=-10, high=20, size=X.shape[0]) + enc = TargetEncoder(categories=categories, smooth=smooth, random_state=0).fit(X, y) + + # The last element is unknown and encoded as the mean + y_mean = y.mean() + X_trans = enc.transform(X[-1:]) + assert X_trans[0, 0] == pytest.approx(y_mean) + + assert len(enc.encodings_) == 1 + # custom category that is not in training data + assert enc.encodings_[0][-1] == pytest.approx(y_mean) + + +@pytest.mark.parametrize( + "y, msg", + [ + ([1, 2, 0, 1], "Found input variables with inconsistent"), + ( + np.array([[1, 2, 0], [1, 2, 3]]).T, + "Target type was inferred to be 'multiclass-multioutput'", + ), + ], +) +def test_errors(y, msg): + """Check invalidate input.""" + X = np.array([[1, 0, 1]]).T + + enc = TargetEncoder() + with pytest.raises(ValueError, match=msg): + enc.fit_transform(X, y) + + +def test_use_regression_target(): + """Check inferred and specified `target_type` on regression target.""" + X = np.array([[0, 1, 0, 1, 0, 1]]).T + y = np.array([1.0, 2.0, 3.0, 2.0, 3.0, 4.0]) + + enc = TargetEncoder(cv=2) + with pytest.warns( + UserWarning, + match=re.escape( + "The least populated class in y has only 1 members, which is less than" + " n_splits=2." + ), + ): + enc.fit_transform(X, y) + assert enc.target_type_ == "multiclass" + + enc = TargetEncoder(cv=2, target_type="continuous") + enc.fit_transform(X, y) + assert enc.target_type_ == "continuous" + + +@pytest.mark.parametrize( + "y, feature_names", + [ + ([1, 2] * 10, ["A", "B"]), + ([1, 2, 3] * 6 + [1, 2], ["A_1", "A_2", "A_3", "B_1", "B_2", "B_3"]), + ( + ["y1", "y2", "y3"] * 6 + ["y1", "y2"], + ["A_y1", "A_y2", "A_y3", "B_y1", "B_y2", "B_y3"], + ), + ], +) +def test_feature_names_out_set_output(y, feature_names): + """Check TargetEncoder works with set_output.""" + pd = pytest.importorskip("pandas") + + X_df = pd.DataFrame({"A": ["a", "b"] * 10, "B": [1, 2] * 10}) + + enc_default = TargetEncoder(cv=2, smooth=3.0, random_state=0) + enc_default.set_output(transform="default") + enc_pandas = TargetEncoder(cv=2, smooth=3.0, random_state=0) + enc_pandas.set_output(transform="pandas") + + X_default = enc_default.fit_transform(X_df, y) + X_pandas = enc_pandas.fit_transform(X_df, y) + + assert_allclose(X_pandas.to_numpy(), X_default) + assert_array_equal(enc_pandas.get_feature_names_out(), feature_names) + assert_array_equal(enc_pandas.get_feature_names_out(), X_pandas.columns) + + +@pytest.mark.parametrize("to_pandas", [True, False]) +@pytest.mark.parametrize("smooth", [1.0, "auto"]) +@pytest.mark.parametrize("target_type", ["binary-ints", "binary-str", "continuous"]) +def test_multiple_features_quick(to_pandas, smooth, target_type): + """Check target encoder with multiple features.""" + X_ordinal = np.array( + [[1, 1], [0, 1], [1, 1], [2, 1], [1, 0], [0, 1], [1, 0], [0, 0]], dtype=np.int64 + ) + if target_type == "binary-str": + y_train = np.array(["a", "b", "a", "a", "b", "b", "a", "b"]) + y_integer = LabelEncoder().fit_transform(y_train) + cv = 
StratifiedKFold(2, random_state=0, shuffle=True) + elif target_type == "binary-ints": + y_train = np.array([3, 4, 3, 3, 3, 4, 4, 4]) + y_integer = LabelEncoder().fit_transform(y_train) + cv = StratifiedKFold(2, random_state=0, shuffle=True) + else: + y_train = np.array([3.0, 5.1, 2.4, 3.5, 4.1, 5.5, 10.3, 7.3], dtype=np.float32) + y_integer = y_train + cv = KFold(2, random_state=0, shuffle=True) + y_mean = np.mean(y_integer) + categories = [[0, 1, 2], [0, 1]] + + X_test = np.array( + [ + [0, 1], + [3, 0], # 3 is unknown + [1, 10], # 10 is unknown + ], + dtype=np.int64, + ) + + if to_pandas: + pd = pytest.importorskip("pandas") + # convert second feature to an object + X_train = pd.DataFrame( + { + "feat0": X_ordinal[:, 0], + "feat1": np.array(["cat", "dog"], dtype=object)[X_ordinal[:, 1]], + } + ) + # "snake" is unknown + X_test = pd.DataFrame({"feat0": X_test[:, 0], "feat1": ["dog", "cat", "snake"]}) + else: + X_train = X_ordinal + + # manually compute encoding for fit_transform + expected_X_fit_transform = np.empty_like(X_ordinal, dtype=np.float64) + for f_idx, cats in enumerate(categories): + for train_idx, test_idx in cv.split(X_ordinal, y_integer): + X_, y_ = X_ordinal[train_idx, f_idx], y_integer[train_idx] + current_encoding = _encode_target(X_, y_, len(cats), smooth) + expected_X_fit_transform[test_idx, f_idx] = current_encoding[ + X_ordinal[test_idx, f_idx] + ] + + # manually compute encoding for transform + expected_encodings = [] + for f_idx, cats in enumerate(categories): + current_encoding = _encode_target( + X_ordinal[:, f_idx], y_integer, len(cats), smooth + ) + expected_encodings.append(current_encoding) + + expected_X_test_transform = np.array( + [ + [expected_encodings[0][0], expected_encodings[1][1]], + [y_mean, expected_encodings[1][0]], + [expected_encodings[0][1], y_mean], + ], + dtype=np.float64, + ) + + enc = TargetEncoder(smooth=smooth, cv=2, random_state=0) + X_fit_transform = enc.fit_transform(X_train, y_train) + assert_allclose(X_fit_transform, expected_X_fit_transform) + + assert len(enc.encodings_) == 2 + for i in range(2): + assert_allclose(enc.encodings_[i], expected_encodings[i]) + + X_test_transform = enc.transform(X_test) + assert_allclose(X_test_transform, expected_X_test_transform) + + +@pytest.mark.parametrize( + "y, y_mean", + [ + (np.array([3.4] * 20), 3.4), + (np.array([0] * 20), 0), + (np.array(["a"] * 20, dtype=object), 0), + ], + ids=["continuous", "binary", "binary-string"], +) +@pytest.mark.parametrize("smooth", ["auto", 4.0, 0.0]) +def test_constant_target_and_feature(y, y_mean, smooth): + """Check edge case where feature and target is constant.""" + X = np.array([[1] * 20]).T + n_samples = X.shape[0] + + enc = TargetEncoder(cv=2, smooth=smooth, random_state=0) + X_trans = enc.fit_transform(X, y) + assert_allclose(X_trans, np.repeat([[y_mean]], n_samples, axis=0)) + assert enc.encodings_[0][0] == pytest.approx(y_mean) + assert enc.target_mean_ == pytest.approx(y_mean) + + X_test = np.array([[1], [0]]) + X_test_trans = enc.transform(X_test) + assert_allclose(X_test_trans, np.repeat([[y_mean]], 2, axis=0)) + + +def test_fit_transform_not_associated_with_y_if_ordinal_categorical_is_not( + global_random_seed, +): + cardinality = 30 # not too large, otherwise we need a very large n_samples + n_samples = 3000 + rng = np.random.RandomState(global_random_seed) + y_train = rng.normal(size=n_samples) + X_train = rng.randint(0, cardinality, size=n_samples).reshape(-1, 1) + + # Sort by y_train to attempt to cause a leak + y_sorted_indices = 
y_train.argsort() + y_train = y_train[y_sorted_indices] + X_train = X_train[y_sorted_indices] + + target_encoder = TargetEncoder(shuffle=True, random_state=global_random_seed) + X_encoded_train_shuffled = target_encoder.fit_transform(X_train, y_train) + + target_encoder = TargetEncoder(shuffle=False) + X_encoded_train_no_shuffled = target_encoder.fit_transform(X_train, y_train) + + # Check that no information about y_train has leaked into X_train: + regressor = RandomForestRegressor( + n_estimators=10, min_samples_leaf=20, random_state=global_random_seed + ) + + # It's impossible to learn a good predictive model on the training set when + # using the original representation X_train or the target encoded + # representation with shuffled inner CV. For the latter, no information + # about y_train has inadvertently leaked into the prior used to generate + # `X_encoded_train_shuffled`: + cv = ShuffleSplit(n_splits=50, random_state=global_random_seed) + assert cross_val_score(regressor, X_train, y_train, cv=cv).mean() < 0.1 + assert ( + cross_val_score(regressor, X_encoded_train_shuffled, y_train, cv=cv).mean() + < 0.1 + ) + + # Without the inner CV shuffling, a lot of information about y_train goes into the + # the per-fold y_train.mean() priors: shrinkage is no longer effective in this + # case and would no longer be able to prevent downstream over-fitting. + assert ( + cross_val_score(regressor, X_encoded_train_no_shuffled, y_train, cv=cv).mean() + > 0.5 + ) + + +def test_smooth_zero(): + """Check edge case with zero smoothing and cv does not contain category.""" + X = np.array([[0, 0, 0, 0, 0, 1, 1, 1, 1, 1]]).T + y = np.array([2.1, 4.3, 1.2, 3.1, 1.0, 9.0, 10.3, 14.2, 13.3, 15.0]) + + enc = TargetEncoder(smooth=0.0, shuffle=False, cv=2) + X_trans = enc.fit_transform(X, y) + + # With cv = 2, category 0 does not exist in the second half, thus + # it will be encoded as the mean of the second half + assert_allclose(X_trans[0], np.mean(y[5:])) + + # category 1 does not exist in the first half, thus it will be encoded as + # the mean of the first half + assert_allclose(X_trans[-1], np.mean(y[:5])) + + +@pytest.mark.parametrize("smooth", [0.0, 1e3, "auto"]) +def test_invariance_of_encoding_under_label_permutation(smooth, global_random_seed): + # Check that the encoding does not depend on the integer of the value of + # the integer labels. This is quite a trivial property but it is helpful + # to understand the following test. + rng = np.random.RandomState(global_random_seed) + + # Random y and informative categorical X to make the test non-trivial when + # using smoothing. 
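+    # X is built by quantile-binning y itself, so each ordinal category maps to a
+    # narrow range of y values; the labels are then shuffled further below and the
+    # resulting encoding must be unchanged up to that relabeling.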
+ y = rng.normal(size=1000) + n_categories = 30 + X = KBinsDiscretizer( + n_bins=n_categories, quantile_method="averaged_inverted_cdf", encode="ordinal" + ).fit_transform(y.reshape(-1, 1)) + + X_train, X_test, y_train, y_test = train_test_split( + X, y, random_state=global_random_seed + ) + + # Shuffle the labels to make sure that the encoding is invariant to the + # permutation of the labels + permutated_labels = rng.permutation(n_categories) + X_train_permuted = permutated_labels[X_train.astype(np.int32)] + X_test_permuted = permutated_labels[X_test.astype(np.int32)] + + target_encoder = TargetEncoder(smooth=smooth, random_state=global_random_seed) + X_train_encoded = target_encoder.fit_transform(X_train, y_train) + X_test_encoded = target_encoder.transform(X_test) + + X_train_permuted_encoded = target_encoder.fit_transform(X_train_permuted, y_train) + X_test_permuted_encoded = target_encoder.transform(X_test_permuted) + + assert_allclose(X_train_encoded, X_train_permuted_encoded) + assert_allclose(X_test_encoded, X_test_permuted_encoded) + + +@pytest.mark.parametrize("smooth", [0.0, "auto"]) +def test_target_encoding_for_linear_regression(smooth, global_random_seed): + # Check some expected statistical properties when fitting a linear + # regression model on target encoded features depending on their relation + # with that target. + + # In this test, we use the Ridge class with the "lsqr" solver and a little + # bit of regularization to implement a linear regression model that + # converges quickly for large `n_samples` and robustly in case of + # correlated features. Since we will fit this model on a mean centered + # target, we do not need to fit an intercept and this will help simplify + # the analysis with respect to the expected coefficients. + linear_regression = Ridge(alpha=1e-6, solver="lsqr", fit_intercept=False) + + # Construct a random target variable. We need a large number of samples for + # this test to be stable across all values of the random seed. + n_samples = 50_000 + rng = np.random.RandomState(global_random_seed) + y = rng.randn(n_samples) + + # Generate a single informative ordinal feature with medium cardinality. + # Inject some irreducible noise to make it harder for a multivariate model + # to identify the informative feature from other pure noise features. + noise = 0.8 * rng.randn(n_samples) + n_categories = 100 + X_informative = KBinsDiscretizer( + n_bins=n_categories, + encode="ordinal", + strategy="uniform", + random_state=rng, + ).fit_transform((y + noise).reshape(-1, 1)) + + # Let's permute the labels to hide the fact that this feature is + # informative to naive linear regression model trained on the raw ordinal + # values. As highlighted in the previous test, the target encoding should be + # invariant to such a permutation. + permutated_labels = rng.permutation(n_categories) + X_informative = permutated_labels[X_informative.astype(np.int32)] + + # Generate a shuffled copy of the informative feature to destroy the + # relationship with the target. + X_shuffled = rng.permutation(X_informative) + + # Also include a very high cardinality categorical feature that is by + # itself independent of the target variable: target encoding such a feature + # without internal cross-validation should cause catastrophic overfitting + # for the downstream regressor, even with shrinkage. This kind of features + # typically represents near unique identifiers of samples. 
In general they + # should be removed from a machine learning datasets but here we want to + # study the ability of the default behavior of TargetEncoder to mitigate + # them automatically. + X_near_unique_categories = rng.choice( + int(0.9 * n_samples), size=n_samples, replace=True + ).reshape(-1, 1) + + # Assemble the dataset and do a train-test split: + X = np.concatenate( + [X_informative, X_shuffled, X_near_unique_categories], + axis=1, + ) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + + # Let's first check that a linear regression model trained on the raw + # features underfits because of the meaning-less ordinal encoding of the + # labels. + raw_model = linear_regression.fit(X_train, y_train) + assert raw_model.score(X_train, y_train) < 0.1 + assert raw_model.score(X_test, y_test) < 0.1 + + # Now do the same with target encoding using the internal CV mechanism + # implemented when using fit_transform. + model_with_cv = make_pipeline( + TargetEncoder(smooth=smooth, random_state=rng), linear_regression + ).fit(X_train, y_train) + + # This model should be able to fit the data well and also generalise to the + # test data (assuming that the binning is fine-grained enough). The R2 + # scores are not perfect because of the noise injected during the + # generation of the unique informative feature. + coef = model_with_cv[-1].coef_ + assert model_with_cv.score(X_train, y_train) > 0.5, coef + assert model_with_cv.score(X_test, y_test) > 0.5, coef + + # The target encoder recovers the linear relationship with slope 1 between + # the target encoded unique informative predictor and the target. Since the + # target encoding of the 2 other features is not informative thanks to the + # use of internal cross-validation, the multivariate linear regressor + # assigns a coef of 1 to the first feature and 0 to the other 2. + assert coef[0] == pytest.approx(1, abs=1e-2) + assert (np.abs(coef[1:]) < 0.2).all() + + # Let's now disable the internal cross-validation by calling fit and then + # transform separately on the training set: + target_encoder = TargetEncoder(smooth=smooth, random_state=rng).fit( + X_train, y_train + ) + X_enc_no_cv_train = target_encoder.transform(X_train) + X_enc_no_cv_test = target_encoder.transform(X_test) + model_no_cv = linear_regression.fit(X_enc_no_cv_train, y_train) + + # The linear regression model should always overfit because it assigns + # too much weight to the extremely high cardinality feature relatively to + # the informative feature. Note that this is the case even when using + # the empirical Bayes smoothing which is not enough to prevent such + # overfitting alone. + coef = model_no_cv.coef_ + assert model_no_cv.score(X_enc_no_cv_train, y_train) > 0.7, coef + assert model_no_cv.score(X_enc_no_cv_test, y_test) < 0.5, coef + + # The model overfits because it assigns too much weight to the high + # cardinality yet non-informative feature instead of the lower + # cardinality yet informative feature: + assert abs(coef[0]) < abs(coef[2]) + + +def test_pandas_copy_on_write(): + """ + Test target-encoder cython code when y is read-only. + + The numpy array underlying df["y"] is read-only when copy-on-write is enabled. + Non-regression test for gh-27879. 
+ """ + pd = pytest.importorskip("pandas", minversion="2.0") + with pd.option_context("mode.copy_on_write", True): + df = pd.DataFrame({"x": ["a", "b", "b"], "y": [4.0, 5.0, 6.0]}) + TargetEncoder(target_type="continuous").fit(df[["x"]], df["y"]) diff --git a/.venv/lib/python3.12/site-packages/sklearn/semi_supervised/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/semi_supervised/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..453cd5edc348bf1a0d957e011cd2fa85fee9b34a --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/semi_supervised/__init__.py @@ -0,0 +1,13 @@ +"""Semi-supervised learning algorithms. + +These algorithms utilize small amounts of labeled data and large amounts of unlabeled +data for classification tasks. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ._label_propagation import LabelPropagation, LabelSpreading +from ._self_training import SelfTrainingClassifier + +__all__ = ["LabelPropagation", "LabelSpreading", "SelfTrainingClassifier"] diff --git a/.venv/lib/python3.12/site-packages/sklearn/semi_supervised/_label_propagation.py b/.venv/lib/python3.12/site-packages/sklearn/semi_supervised/_label_propagation.py new file mode 100644 index 0000000000000000000000000000000000000000..559a17a13d6ae35f4a97a008d6e4c07e4dc77923 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/semi_supervised/_label_propagation.py @@ -0,0 +1,630 @@ +# coding=utf8 +""" +Label propagation in the context of this module refers to a set of +semi-supervised classification algorithms. At a high level, these algorithms +work by forming a fully-connected graph between all points given and solving +for the steady-state distribution of labels at each point. + +These algorithms perform very well in practice. The cost of running can be very +expensive, at approximately O(N^3) where N is the number of (labeled and +unlabeled) points. The theory (why they perform so well) is motivated by +intuitions from random walk algorithms and geometric relationships in the data. +For more information see the references below. + +Model Features +-------------- +Label clamping: + The algorithm tries to learn distributions of labels over the dataset given + label assignments over an initial subset. In one variant, the algorithm does + not allow for any errors in the initial assignment (hard-clamping) while + in another variant, the algorithm allows for some wiggle room for the initial + assignments, allowing them to change by a fraction alpha in each iteration + (soft-clamping). + +Kernel: + A function which projects a vector into some higher dimensional space. This + implementation supports RBF and KNN kernels. Using the RBF kernel generates + a dense matrix of size O(N^2). KNN kernel will generate a sparse matrix of + size O(k*N) which will run much faster. See the documentation for SVMs for + more info on kernels. + +Examples +-------- +>>> import numpy as np +>>> from sklearn import datasets +>>> from sklearn.semi_supervised import LabelPropagation +>>> label_prop_model = LabelPropagation() +>>> iris = datasets.load_iris() +>>> rng = np.random.RandomState(42) +>>> random_unlabeled_points = rng.rand(len(iris.target)) < 0.3 +>>> labels = np.copy(iris.target) +>>> labels[random_unlabeled_points] = -1 +>>> label_prop_model.fit(iris.data, labels) +LabelPropagation(...) + +Notes +----- +References: +[1] Yoshua Bengio, Olivier Delalleau, Nicolas Le Roux. In Semi-Supervised +Learning (2006), pp. 
193-216 + +[2] Olivier Delalleau, Yoshua Bengio, Nicolas Le Roux. Efficient +Non-Parametric Function Induction in Semi-Supervised Learning. AISTAT 2005 +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from abc import ABCMeta, abstractmethod +from numbers import Integral, Real + +import numpy as np +from scipy import sparse + +from ..base import BaseEstimator, ClassifierMixin, _fit_context +from ..exceptions import ConvergenceWarning +from ..metrics.pairwise import rbf_kernel +from ..neighbors import NearestNeighbors +from ..utils._param_validation import Interval, StrOptions +from ..utils.extmath import safe_sparse_dot +from ..utils.fixes import laplacian as csgraph_laplacian +from ..utils.multiclass import check_classification_targets +from ..utils.validation import check_is_fitted, validate_data + + +class BaseLabelPropagation(ClassifierMixin, BaseEstimator, metaclass=ABCMeta): + """Base class for label propagation module. + + Parameters + ---------- + kernel : {'knn', 'rbf'} or callable, default='rbf' + String identifier for kernel function to use or the kernel function + itself. Only 'rbf' and 'knn' strings are valid inputs. The function + passed should take two inputs, each of shape (n_samples, n_features), + and return a (n_samples, n_samples) shaped weight matrix. + + gamma : float, default=20 + Parameter for rbf kernel. + + n_neighbors : int, default=7 + Parameter for knn kernel. Need to be strictly positive. + + alpha : float, default=1.0 + Clamping factor. + + max_iter : int, default=30 + Change maximum number of iterations allowed. + + tol : float, default=1e-3 + Convergence tolerance: threshold to consider the system at steady + state. + + n_jobs : int, default=None + The number of parallel jobs to run. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + """ + + _parameter_constraints: dict = { + "kernel": [StrOptions({"knn", "rbf"}), callable], + "gamma": [Interval(Real, 0, None, closed="left")], + "n_neighbors": [Interval(Integral, 0, None, closed="neither")], + "alpha": [None, Interval(Real, 0, 1, closed="neither")], + "max_iter": [Interval(Integral, 0, None, closed="neither")], + "tol": [Interval(Real, 0, None, closed="left")], + "n_jobs": [None, Integral], + } + + def __init__( + self, + kernel="rbf", + *, + gamma=20, + n_neighbors=7, + alpha=1, + max_iter=30, + tol=1e-3, + n_jobs=None, + ): + self.max_iter = max_iter + self.tol = tol + + # kernel parameters + self.kernel = kernel + self.gamma = gamma + self.n_neighbors = n_neighbors + + # clamping factor + self.alpha = alpha + + self.n_jobs = n_jobs + + def _get_kernel(self, X, y=None): + if self.kernel == "rbf": + if y is None: + return rbf_kernel(X, X, gamma=self.gamma) + else: + return rbf_kernel(X, y, gamma=self.gamma) + elif self.kernel == "knn": + if self.nn_fit is None: + self.nn_fit = NearestNeighbors( + n_neighbors=self.n_neighbors, n_jobs=self.n_jobs + ).fit(X) + if y is None: + return self.nn_fit.kneighbors_graph( + self.nn_fit._fit_X, self.n_neighbors, mode="connectivity" + ) + else: + return self.nn_fit.kneighbors(y, return_distance=False) + elif callable(self.kernel): + if y is None: + return self.kernel(X, X) + else: + return self.kernel(X, y) + + @abstractmethod + def _build_graph(self): + raise NotImplementedError( + "Graph construction must be implemented to fit a label propagation model." 
+ ) + + def predict(self, X): + """Perform inductive inference across the model. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data matrix. + + Returns + ------- + y : ndarray of shape (n_samples,) + Predictions for input data. + """ + # Note: since `predict` does not accept semi-supervised labels as input, + # `fit(X, y).predict(X) != fit(X, y).transduction_`. + # Hence, `fit_predict` is not implemented. + # See https://github.com/scikit-learn/scikit-learn/pull/24898 + probas = self.predict_proba(X) + return self.classes_[np.argmax(probas, axis=1)].ravel() + + def predict_proba(self, X): + """Predict probability for each possible outcome. + + Compute the probability estimates for each single sample in X + and each possible outcome seen during training (categorical + distribution). + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data matrix. + + Returns + ------- + probabilities : ndarray of shape (n_samples, n_classes) + Normalized probability distributions across + class labels. + """ + check_is_fitted(self) + + X_2d = validate_data( + self, + X, + accept_sparse=["csc", "csr", "coo", "dok", "bsr", "lil", "dia"], + reset=False, + ) + weight_matrices = self._get_kernel(self.X_, X_2d) + if self.kernel == "knn": + probabilities = np.array( + [ + np.sum(self.label_distributions_[weight_matrix], axis=0) + for weight_matrix in weight_matrices + ] + ) + else: + weight_matrices = weight_matrices.T + probabilities = safe_sparse_dot(weight_matrices, self.label_distributions_) + normalizer = np.atleast_2d(np.sum(probabilities, axis=1)).T + probabilities /= normalizer + return probabilities + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y): + """Fit a semi-supervised label propagation model to X. + + The input samples (labeled and unlabeled) are provided by matrix X, + and target labels are provided by matrix y. We conventionally apply the + label -1 to unlabeled samples in matrix y in a semi-supervised + classification. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,) + Target class values with unlabeled points marked as -1. + All unlabeled samples will be transductively assigned labels + internally, which are stored in `transduction_`. + + Returns + ------- + self : object + Returns the instance itself. 
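For the rbf kernel, the predict_proba method above reduces to a kernel-weighted average of the fitted per-sample label distributions followed by row normalization. A rough numpy restatement under that assumption (function and argument names are hypothetical):

import numpy as np
from sklearn.metrics.pairwise import rbf_kernel

def predict_proba_sketch(X_new, X_train, label_distributions, gamma=20):
    # (n_new, n_train) kernel weights between new and training points
    W = rbf_kernel(X_new, X_train, gamma=gamma)
    # kernel-weighted average of the transduced class distributions
    probs = W @ label_distributions
    return probs / probs.sum(axis=1, keepdims=True)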
+ """ + X, y = validate_data( + self, + X, + y, + accept_sparse=["csr", "csc"], + reset=True, + ) + self.X_ = X + check_classification_targets(y) + + # actual graph construction (implementations should override this) + graph_matrix = self._build_graph() + + # label construction + # construct a categorical distribution for classification only + classes = np.unique(y) + classes = classes[classes != -1] + self.classes_ = classes + + n_samples, n_classes = len(y), len(classes) + + y = np.asarray(y) + unlabeled = y == -1 + + # initialize distributions + self.label_distributions_ = np.zeros((n_samples, n_classes)) + for label in classes: + self.label_distributions_[y == label, classes == label] = 1 + + y_static = np.copy(self.label_distributions_) + if self._variant == "propagation": + # LabelPropagation + y_static[unlabeled] = 0 + else: + # LabelSpreading + y_static *= 1 - self.alpha + + l_previous = np.zeros((self.X_.shape[0], n_classes)) + + unlabeled = unlabeled[:, np.newaxis] + if sparse.issparse(graph_matrix): + graph_matrix = graph_matrix.tocsr() + + for self.n_iter_ in range(self.max_iter): + if np.abs(self.label_distributions_ - l_previous).sum() < self.tol: + break + + l_previous = self.label_distributions_ + self.label_distributions_ = safe_sparse_dot( + graph_matrix, self.label_distributions_ + ) + + if self._variant == "propagation": + normalizer = np.sum(self.label_distributions_, axis=1)[:, np.newaxis] + normalizer[normalizer == 0] = 1 + self.label_distributions_ /= normalizer + self.label_distributions_ = np.where( + unlabeled, self.label_distributions_, y_static + ) + else: + # clamp + self.label_distributions_ = ( + np.multiply(self.alpha, self.label_distributions_) + y_static + ) + else: + warnings.warn( + "max_iter=%d was reached without convergence." % self.max_iter, + category=ConvergenceWarning, + ) + self.n_iter_ += 1 + + normalizer = np.sum(self.label_distributions_, axis=1)[:, np.newaxis] + normalizer[normalizer == 0] = 1 + self.label_distributions_ /= normalizer + + # set the transduction item + transduction = self.classes_[np.argmax(self.label_distributions_, axis=1)] + self.transduction_ = transduction.ravel() + return self + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + + +class LabelPropagation(BaseLabelPropagation): + """Label Propagation classifier. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + kernel : {'knn', 'rbf'} or callable, default='rbf' + String identifier for kernel function to use or the kernel function + itself. Only 'rbf' and 'knn' strings are valid inputs. The function + passed should take two inputs, each of shape (n_samples, n_features), + and return a (n_samples, n_samples) shaped weight matrix. + + gamma : float, default=20 + Parameter for rbf kernel. + + n_neighbors : int, default=7 + Parameter for knn kernel which need to be strictly positive. + + max_iter : int, default=1000 + Change maximum number of iterations allowed. + + tol : float, default=1e-3 + Convergence tolerance: threshold to consider the system at steady + state. + + n_jobs : int, default=None + The number of parallel jobs to run. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Attributes + ---------- + X_ : {array-like, sparse matrix} of shape (n_samples, n_features) + Input array. + + classes_ : ndarray of shape (n_classes,) + The distinct labels used in classifying instances. 
+ + label_distributions_ : ndarray of shape (n_samples, n_classes) + Categorical distribution for each item. + + transduction_ : ndarray of shape (n_samples) + Label assigned to each item during :term:`fit`. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + Number of iterations run. + + See Also + -------- + LabelSpreading : Alternate label propagation strategy more robust to noise. + + References + ---------- + Xiaojin Zhu and Zoubin Ghahramani. Learning from labeled and unlabeled data + with label propagation. Technical Report CMU-CALD-02-107, Carnegie Mellon + University, 2002 http://pages.cs.wisc.edu/~jerryzhu/pub/CMU-CALD-02-107.pdf + + Examples + -------- + >>> import numpy as np + >>> from sklearn import datasets + >>> from sklearn.semi_supervised import LabelPropagation + >>> label_prop_model = LabelPropagation() + >>> iris = datasets.load_iris() + >>> rng = np.random.RandomState(42) + >>> random_unlabeled_points = rng.rand(len(iris.target)) < 0.3 + >>> labels = np.copy(iris.target) + >>> labels[random_unlabeled_points] = -1 + >>> label_prop_model.fit(iris.data, labels) + LabelPropagation(...) + """ + + _variant = "propagation" + + _parameter_constraints: dict = {**BaseLabelPropagation._parameter_constraints} + _parameter_constraints.pop("alpha") + + def __init__( + self, + kernel="rbf", + *, + gamma=20, + n_neighbors=7, + max_iter=1000, + tol=1e-3, + n_jobs=None, + ): + super().__init__( + kernel=kernel, + gamma=gamma, + n_neighbors=n_neighbors, + max_iter=max_iter, + tol=tol, + n_jobs=n_jobs, + alpha=None, + ) + + def _build_graph(self): + """Matrix representing a fully connected graph between each sample + + This basic implementation creates a non-stochastic affinity matrix, so + class distributions will exceed 1 (normalization may be desired). + """ + if self.kernel == "knn": + self.nn_fit = None + affinity_matrix = self._get_kernel(self.X_) + normalizer = affinity_matrix.sum(axis=0) + if sparse.issparse(affinity_matrix): + affinity_matrix.data /= np.diag(np.array(normalizer)) + else: + affinity_matrix /= normalizer[:, np.newaxis] + return affinity_matrix + + def fit(self, X, y): + """Fit a semi-supervised label propagation model to X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,) + Target class values with unlabeled points marked as -1. + All unlabeled samples will be transductively assigned labels + internally, which are stored in `transduction_`. + + Returns + ------- + self : object + Returns the instance itself. + """ + return super().fit(X, y) + + +class LabelSpreading(BaseLabelPropagation): + """LabelSpreading model for semi-supervised learning. + + This model is similar to the basic Label Propagation algorithm, + but uses affinity matrix based on the normalized graph Laplacian + and soft clamping across the labels. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + kernel : {'knn', 'rbf'} or callable, default='rbf' + String identifier for kernel function to use or the kernel function + itself. Only 'rbf' and 'knn' strings are valid inputs. 
The function + passed should take two inputs, each of shape (n_samples, n_features), + and return a (n_samples, n_samples) shaped weight matrix. + + gamma : float, default=20 + Parameter for rbf kernel. + + n_neighbors : int, default=7 + Parameter for knn kernel which is a strictly positive integer. + + alpha : float, default=0.2 + Clamping factor. A value in (0, 1) that specifies the relative amount + that an instance should adopt the information from its neighbors as + opposed to its initial label. + alpha=0 means keeping the initial label information; alpha=1 means + replacing all initial information. + + max_iter : int, default=30 + Maximum number of iterations allowed. + + tol : float, default=1e-3 + Convergence tolerance: threshold to consider the system at steady + state. + + n_jobs : int, default=None + The number of parallel jobs to run. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Attributes + ---------- + X_ : ndarray of shape (n_samples, n_features) + Input array. + + classes_ : ndarray of shape (n_classes,) + The distinct labels used in classifying instances. + + label_distributions_ : ndarray of shape (n_samples, n_classes) + Categorical distribution for each item. + + transduction_ : ndarray of shape (n_samples,) + Label assigned to each item during :term:`fit`. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + Number of iterations run. + + See Also + -------- + LabelPropagation : Unregularized graph based semi-supervised learning. + + References + ---------- + `Dengyong Zhou, Olivier Bousquet, Thomas Navin Lal, Jason Weston, + Bernhard Schoelkopf. Learning with local and global consistency (2004) + `_ + + Examples + -------- + >>> import numpy as np + >>> from sklearn import datasets + >>> from sklearn.semi_supervised import LabelSpreading + >>> label_prop_model = LabelSpreading() + >>> iris = datasets.load_iris() + >>> rng = np.random.RandomState(42) + >>> random_unlabeled_points = rng.rand(len(iris.target)) < 0.3 + >>> labels = np.copy(iris.target) + >>> labels[random_unlabeled_points] = -1 + >>> label_prop_model.fit(iris.data, labels) + LabelSpreading(...) 
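As a concrete reading of the alpha parameter documented above: values near 0 keep the seeded labels nearly fixed, values near 1 let the graph overwrite them. A small usage sketch with illustrative toy data:

import numpy as np
from sklearn.semi_supervised import LabelSpreading

X = np.array([[0.0, 0.0], [0.1, 0.0], [5.0, 5.0], [5.1, 5.0], [2.5, 2.5]])
y = np.array([0, -1, 1, -1, -1])  # -1 marks the unlabeled points

conservative = LabelSpreading(alpha=0.05, gamma=1.0).fit(X, y)
flexible = LabelSpreading(alpha=0.9, gamma=1.0).fit(X, y)

print(conservative.transduction_)            # labels assigned to every sample
print(flexible.label_distributions_.round(2))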
+ """ + + _variant = "spreading" + + _parameter_constraints: dict = {**BaseLabelPropagation._parameter_constraints} + _parameter_constraints["alpha"] = [Interval(Real, 0, 1, closed="neither")] + + def __init__( + self, + kernel="rbf", + *, + gamma=20, + n_neighbors=7, + alpha=0.2, + max_iter=30, + tol=1e-3, + n_jobs=None, + ): + # this one has different base parameters + super().__init__( + kernel=kernel, + gamma=gamma, + n_neighbors=n_neighbors, + alpha=alpha, + max_iter=max_iter, + tol=tol, + n_jobs=n_jobs, + ) + + def _build_graph(self): + """Graph matrix for Label Spreading computes the graph laplacian""" + # compute affinity matrix (or gram matrix) + if self.kernel == "knn": + self.nn_fit = None + n_samples = self.X_.shape[0] + affinity_matrix = self._get_kernel(self.X_) + laplacian = csgraph_laplacian(affinity_matrix, normed=True) + laplacian = -laplacian + if sparse.issparse(laplacian): + diag_mask = laplacian.row == laplacian.col + laplacian.data[diag_mask] = 0.0 + else: + laplacian.flat[:: n_samples + 1] = 0.0 # set diag to 0.0 + return laplacian diff --git a/.venv/lib/python3.12/site-packages/sklearn/semi_supervised/_self_training.py b/.venv/lib/python3.12/site-packages/sklearn/semi_supervised/_self_training.py new file mode 100644 index 0000000000000000000000000000000000000000..0fe6f57d6c1ed281748e7223554a103a52a01334 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/semi_supervised/_self_training.py @@ -0,0 +1,625 @@ +import warnings +from numbers import Integral, Real +from warnings import warn + +import numpy as np + +from ..base import ( + BaseEstimator, + ClassifierMixin, + MetaEstimatorMixin, + _fit_context, + clone, +) +from ..utils import Bunch, get_tags, safe_mask +from ..utils._param_validation import HasMethods, Hidden, Interval, StrOptions +from ..utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + _raise_for_params, + _routing_enabled, + process_routing, +) +from ..utils.metaestimators import available_if +from ..utils.validation import _estimator_has, check_is_fitted, validate_data + +__all__ = ["SelfTrainingClassifier"] + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + + +class SelfTrainingClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): + """Self-training classifier. + + This :term:`metaestimator` allows a given supervised classifier to function as a + semi-supervised classifier, allowing it to learn from unlabeled data. It + does this by iteratively predicting pseudo-labels for the unlabeled data + and adding them to the training set. + + The classifier will continue iterating until either max_iter is reached, or + no pseudo-labels were added to the training set in the previous iteration. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimator : estimator object + An estimator object implementing `fit` and `predict_proba`. + Invoking the `fit` method will fit a clone of the passed estimator, + which will be stored in the `estimator_` attribute. + + .. versionadded:: 1.6 + `estimator` was added to replace `base_estimator`. + + base_estimator : estimator object + An estimator object implementing `fit` and `predict_proba`. + Invoking the `fit` method will fit a clone of the passed estimator, + which will be stored in the `estimator_` attribute. + + .. deprecated:: 1.6 + `base_estimator` was deprecated in 1.6 and will be removed in 1.8. + Use `estimator` instead. + + threshold : float, default=0.75 + The decision threshold for use with `criterion='threshold'`. 
+ Should be in [0, 1). When using the `'threshold'` criterion, a + :ref:`well calibrated classifier ` should be used. + + criterion : {'threshold', 'k_best'}, default='threshold' + The selection criterion used to select which labels to add to the + training set. If `'threshold'`, pseudo-labels with prediction + probabilities above `threshold` are added to the dataset. If `'k_best'`, + the `k_best` pseudo-labels with highest prediction probabilities are + added to the dataset. When using the 'threshold' criterion, a + :ref:`well calibrated classifier ` should be used. + + k_best : int, default=10 + The amount of samples to add in each iteration. Only used when + `criterion='k_best'`. + + max_iter : int or None, default=10 + Maximum number of iterations allowed. Should be greater than or equal + to 0. If it is `None`, the classifier will continue to predict labels + until no new pseudo-labels are added, or all unlabeled samples have + been labeled. + + verbose : bool, default=False + Enable verbose output. + + Attributes + ---------- + estimator_ : estimator object + The fitted estimator. + + classes_ : ndarray or list of ndarray of shape (n_classes,) + Class labels for each output. (Taken from the trained + `estimator_`). + + transduction_ : ndarray of shape (n_samples,) + The labels used for the final fit of the classifier, including + pseudo-labels added during fit. + + labeled_iter_ : ndarray of shape (n_samples,) + The iteration in which each sample was labeled. When a sample has + iteration 0, the sample was already labeled in the original dataset. + When a sample has iteration -1, the sample was not labeled in any + iteration. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + The number of rounds of self-training, that is the number of times the + base estimator is fitted on relabeled variants of the training set. + + termination_condition_ : {'max_iter', 'no_change', 'all_labeled'} + The reason that fitting was stopped. + + - `'max_iter'`: `n_iter_` reached `max_iter`. + - `'no_change'`: no new labels were predicted. + - `'all_labeled'`: all unlabeled samples were labeled before `max_iter` + was reached. + + See Also + -------- + LabelPropagation : Label propagation classifier. + LabelSpreading : Label spreading model for semi-supervised learning. + + References + ---------- + :doi:`David Yarowsky. 1995. Unsupervised word sense disambiguation rivaling + supervised methods. In Proceedings of the 33rd annual meeting on + Association for Computational Linguistics (ACL '95). Association for + Computational Linguistics, Stroudsburg, PA, USA, 189-196. + <10.3115/981658.981684>` + + Examples + -------- + >>> import numpy as np + >>> from sklearn import datasets + >>> from sklearn.semi_supervised import SelfTrainingClassifier + >>> from sklearn.svm import SVC + >>> rng = np.random.RandomState(42) + >>> iris = datasets.load_iris() + >>> random_unlabeled_points = rng.rand(iris.target.shape[0]) < 0.3 + >>> iris.target[random_unlabeled_points] = -1 + >>> svc = SVC(probability=True, gamma="auto") + >>> self_training_model = SelfTrainingClassifier(svc) + >>> self_training_model.fit(iris.data, iris.target) + SelfTrainingClassifier(...) 
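A hedged usage sketch of the criterion='k_best' mode and the bookkeeping attributes (labeled_iter_, termination_condition_) described above; the data and the fraction of hidden labels are illustrative:

import numpy as np
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.semi_supervised import SelfTrainingClassifier

X, y = load_iris(return_X_y=True)
rng = np.random.RandomState(42)
y_semi = np.copy(y)
y_semi[rng.rand(len(y)) < 0.7] = -1  # hide most of the labels

st = SelfTrainingClassifier(
    KNeighborsClassifier(), criterion="k_best", k_best=20, max_iter=None
)
st.fit(X, y_semi)

print(st.termination_condition_)          # e.g. 'all_labeled'
print(np.bincount(st.labeled_iter_ + 1))  # samples labeled per iteration (index 0 = never labeled)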
+ """ + + _parameter_constraints: dict = { + # We don't require `predic_proba` here to allow passing a meta-estimator + # that only exposes `predict_proba` after fitting. + # TODO(1.8) remove None option + "estimator": [None, HasMethods(["fit"])], + # TODO(1.8) remove + "base_estimator": [ + HasMethods(["fit"]), + Hidden(StrOptions({"deprecated"})), + ], + "threshold": [Interval(Real, 0.0, 1.0, closed="left")], + "criterion": [StrOptions({"threshold", "k_best"})], + "k_best": [Interval(Integral, 1, None, closed="left")], + "max_iter": [Interval(Integral, 0, None, closed="left"), None], + "verbose": ["verbose"], + } + + def __init__( + self, + estimator=None, + base_estimator="deprecated", + threshold=0.75, + criterion="threshold", + k_best=10, + max_iter=10, + verbose=False, + ): + self.estimator = estimator + self.threshold = threshold + self.criterion = criterion + self.k_best = k_best + self.max_iter = max_iter + self.verbose = verbose + + # TODO(1.8) remove + self.base_estimator = base_estimator + + def _get_estimator(self): + """Get the estimator. + + Returns + ------- + estimator_ : estimator object + The cloned estimator object. + """ + # TODO(1.8): remove and only keep clone(self.estimator) + if self.estimator is None and self.base_estimator != "deprecated": + estimator_ = clone(self.base_estimator) + + warn( + ( + "`base_estimator` has been deprecated in 1.6 and will be removed" + " in 1.8. Please use `estimator` instead." + ), + FutureWarning, + ) + # TODO(1.8) remove + elif self.estimator is None and self.base_estimator == "deprecated": + raise ValueError( + "You must pass an estimator to SelfTrainingClassifier. Use `estimator`." + ) + elif self.estimator is not None and self.base_estimator != "deprecated": + raise ValueError( + "You must pass only one estimator to SelfTrainingClassifier." + " Use `estimator`." + ) + else: + estimator_ = clone(self.estimator) + return estimator_ + + @_fit_context( + # SelfTrainingClassifier.estimator is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y, **params): + """ + Fit self-training classifier using `X`, `y` as training data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Array representing the data. + + y : {array-like, sparse matrix} of shape (n_samples,) + Array representing the labels. Unlabeled samples should have the + label -1. + + **params : dict + Parameters to pass to the underlying estimators. + + .. versionadded:: 1.6 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + Fitted estimator. + """ + _raise_for_params(params, self, "fit") + + self.estimator_ = self._get_estimator() + + # we need row slicing support for sparse matrices, but costly finiteness check + # can be delegated to the base estimator. + X, y = validate_data( + self, + X, + y, + accept_sparse=["csr", "csc", "lil", "dok"], + ensure_all_finite=False, + ) + + if y.dtype.kind in ["U", "S"]: + raise ValueError( + "y has dtype string. If you wish to predict on " + "string targets, use dtype object, and use -1" + " as the label for unlabeled samples." 
+ ) + + has_label = y != -1 + + if np.all(has_label): + warnings.warn("y contains no unlabeled samples", UserWarning) + + if self.criterion == "k_best" and ( + self.k_best > X.shape[0] - np.sum(has_label) + ): + warnings.warn( + ( + "k_best is larger than the amount of unlabeled " + "samples. All unlabeled samples will be labeled in " + "the first iteration" + ), + UserWarning, + ) + + if _routing_enabled(): + routed_params = process_routing(self, "fit", **params) + else: + routed_params = Bunch(estimator=Bunch(fit={})) + + self.transduction_ = np.copy(y) + self.labeled_iter_ = np.full_like(y, -1) + self.labeled_iter_[has_label] = 0 + + self.n_iter_ = 0 + + while not np.all(has_label) and ( + self.max_iter is None or self.n_iter_ < self.max_iter + ): + self.n_iter_ += 1 + self.estimator_.fit( + X[safe_mask(X, has_label)], + self.transduction_[has_label], + **routed_params.estimator.fit, + ) + + # Predict on the unlabeled samples + prob = self.estimator_.predict_proba(X[safe_mask(X, ~has_label)]) + pred = self.estimator_.classes_[np.argmax(prob, axis=1)] + max_proba = np.max(prob, axis=1) + + # Select new labeled samples + if self.criterion == "threshold": + selected = max_proba > self.threshold + else: + n_to_select = min(self.k_best, max_proba.shape[0]) + if n_to_select == max_proba.shape[0]: + selected = np.ones_like(max_proba, dtype=bool) + else: + # NB these are indices, not a mask + selected = np.argpartition(-max_proba, n_to_select)[:n_to_select] + + # Map selected indices into original array + selected_full = np.nonzero(~has_label)[0][selected] + + # Add newly labeled confident predictions to the dataset + self.transduction_[selected_full] = pred[selected] + has_label[selected_full] = True + self.labeled_iter_[selected_full] = self.n_iter_ + + if selected_full.shape[0] == 0: + # no changed labels + self.termination_condition_ = "no_change" + break + + if self.verbose: + print( + f"End of iteration {self.n_iter_}," + f" added {selected_full.shape[0]} new labels." + ) + + if self.n_iter_ == self.max_iter: + self.termination_condition_ = "max_iter" + if np.all(has_label): + self.termination_condition_ = "all_labeled" + + self.estimator_.fit( + X[safe_mask(X, has_label)], + self.transduction_[has_label], + **routed_params.estimator.fit, + ) + self.classes_ = self.estimator_.classes_ + return self + + @available_if(_estimator_has("predict")) + def predict(self, X, **params): + """Predict the classes of `X`. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Array representing the data. + + **params : dict of str -> object + Parameters to pass to the underlying estimator's ``predict`` method. + + .. versionadded:: 1.6 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + y : ndarray of shape (n_samples,) + Array with predicted labels. + """ + check_is_fitted(self) + _raise_for_params(params, self, "predict") + + if _routing_enabled(): + # metadata routing is enabled. 
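The selection step above either thresholds the best predicted probability or keeps the k_best most confident pseudo-labels via np.argpartition, and then maps those positions (given relative to the unlabeled subset) back to row indices of the full training set. A standalone numpy illustration with made-up arrays:

import numpy as np

has_label = np.array([True, False, False, True, False])
max_proba = np.array([0.9, 0.4, 0.8])  # confidences for the three unlabeled rows

k_best = 2
selected = np.argpartition(-max_proba, k_best)[:k_best]  # positions within the unlabeled rows
selected_full = np.nonzero(~has_label)[0][selected]      # row indices in the full training set

print(selected_full)  # contains 1 and 4, the rows with confidences 0.9 and 0.8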
+ routed_params = process_routing(self, "predict", **params) + else: + routed_params = Bunch(estimator=Bunch(predict={})) + + X = validate_data( + self, + X, + accept_sparse=True, + ensure_all_finite=False, + reset=False, + ) + return self.estimator_.predict(X, **routed_params.estimator.predict) + + @available_if(_estimator_has("predict_proba")) + def predict_proba(self, X, **params): + """Predict probability for each possible outcome. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Array representing the data. + + **params : dict of str -> object + Parameters to pass to the underlying estimator's + ``predict_proba`` method. + + .. versionadded:: 1.6 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + y : ndarray of shape (n_samples, n_features) + Array with prediction probabilities. + """ + check_is_fitted(self) + _raise_for_params(params, self, "predict_proba") + + if _routing_enabled(): + # metadata routing is enabled. + routed_params = process_routing(self, "predict_proba", **params) + else: + routed_params = Bunch(estimator=Bunch(predict_proba={})) + + X = validate_data( + self, + X, + accept_sparse=True, + ensure_all_finite=False, + reset=False, + ) + return self.estimator_.predict_proba(X, **routed_params.estimator.predict_proba) + + @available_if(_estimator_has("decision_function")) + def decision_function(self, X, **params): + """Call decision function of the `estimator`. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Array representing the data. + + **params : dict of str -> object + Parameters to pass to the underlying estimator's + ``decision_function`` method. + + .. versionadded:: 1.6 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + y : ndarray of shape (n_samples, n_features) + Result of the decision function of the `estimator`. + """ + check_is_fitted(self) + _raise_for_params(params, self, "decision_function") + + if _routing_enabled(): + # metadata routing is enabled. + routed_params = process_routing(self, "decision_function", **params) + else: + routed_params = Bunch(estimator=Bunch(decision_function={})) + + X = validate_data( + self, + X, + accept_sparse=True, + ensure_all_finite=False, + reset=False, + ) + return self.estimator_.decision_function( + X, **routed_params.estimator.decision_function + ) + + @available_if(_estimator_has("predict_log_proba")) + def predict_log_proba(self, X, **params): + """Predict log probability for each possible outcome. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Array representing the data. + + **params : dict of str -> object + Parameters to pass to the underlying estimator's + ``predict_log_proba`` method. + + .. versionadded:: 1.6 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + y : ndarray of shape (n_samples, n_features) + Array with log prediction probabilities. 
+ """ + check_is_fitted(self) + _raise_for_params(params, self, "predict_log_proba") + + if _routing_enabled(): + # metadata routing is enabled. + routed_params = process_routing(self, "predict_log_proba", **params) + else: + routed_params = Bunch(estimator=Bunch(predict_log_proba={})) + + X = validate_data( + self, + X, + accept_sparse=True, + ensure_all_finite=False, + reset=False, + ) + return self.estimator_.predict_log_proba( + X, **routed_params.estimator.predict_log_proba + ) + + @available_if(_estimator_has("score")) + def score(self, X, y, **params): + """Call score on the `estimator`. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Array representing the data. + + y : array-like of shape (n_samples,) + Array representing the labels. + + **params : dict of str -> object + Parameters to pass to the underlying estimator's ``score`` method. + + .. versionadded:: 1.6 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + score : float + Result of calling score on the `estimator`. + """ + check_is_fitted(self) + _raise_for_params(params, self, "score") + + if _routing_enabled(): + # metadata routing is enabled. + routed_params = process_routing(self, "score", **params) + else: + routed_params = Bunch(estimator=Bunch(score={})) + + X = validate_data( + self, + X, + accept_sparse=True, + ensure_all_finite=False, + reset=False, + ) + return self.estimator_.score(X, y, **routed_params.estimator.score) + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.6 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. 
+ """ + router = MetadataRouter(owner=self.__class__.__name__) + router.add( + estimator=self.estimator, + method_mapping=( + MethodMapping() + .add(callee="fit", caller="fit") + .add(callee="score", caller="fit") + .add(callee="predict", caller="predict") + .add(callee="predict_proba", caller="predict_proba") + .add(callee="decision_function", caller="decision_function") + .add(callee="predict_log_proba", caller="predict_log_proba") + .add(callee="score", caller="score") + ), + ) + return router + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + # TODO(1.8): remove the condition check together with base_estimator + if self.estimator is not None: + tags.input_tags.sparse = get_tags(self.estimator).input_tags.sparse + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/semi_supervised/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/semi_supervised/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/semi_supervised/tests/test_label_propagation.py b/.venv/lib/python3.12/site-packages/sklearn/semi_supervised/tests/test_label_propagation.py new file mode 100644 index 0000000000000000000000000000000000000000..4b046aa11125032a706b5c984c5dec5caba72594 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/semi_supervised/tests/test_label_propagation.py @@ -0,0 +1,238 @@ +"""test the label propagation module""" + +import warnings + +import numpy as np +import pytest +from scipy.sparse import issparse + +from sklearn.datasets import make_classification +from sklearn.exceptions import ConvergenceWarning +from sklearn.metrics.pairwise import rbf_kernel +from sklearn.model_selection import train_test_split +from sklearn.neighbors import NearestNeighbors +from sklearn.semi_supervised import _label_propagation as label_propagation +from sklearn.utils._testing import ( + _convert_container, + assert_allclose, + assert_array_equal, +) + +CONSTRUCTOR_TYPES = ("array", "sparse_csr", "sparse_csc") + +ESTIMATORS = [ + (label_propagation.LabelPropagation, {"kernel": "rbf"}), + (label_propagation.LabelPropagation, {"kernel": "knn", "n_neighbors": 2}), + ( + label_propagation.LabelPropagation, + {"kernel": lambda x, y: rbf_kernel(x, y, gamma=20)}, + ), + (label_propagation.LabelSpreading, {"kernel": "rbf"}), + (label_propagation.LabelSpreading, {"kernel": "knn", "n_neighbors": 2}), + ( + label_propagation.LabelSpreading, + {"kernel": lambda x, y: rbf_kernel(x, y, gamma=20)}, + ), +] + + +@pytest.mark.parametrize("Estimator, parameters", ESTIMATORS) +def test_fit_transduction(global_dtype, Estimator, parameters): + samples = np.asarray([[1.0, 0.0], [0.0, 2.0], [1.0, 3.0]], dtype=global_dtype) + labels = [0, 1, -1] + clf = Estimator(**parameters).fit(samples, labels) + assert clf.transduction_[2] == 1 + + +@pytest.mark.parametrize("Estimator, parameters", ESTIMATORS) +def test_distribution(global_dtype, Estimator, parameters): + if parameters["kernel"] == "knn": + pytest.skip( + "Unstable test for this configuration: changes in k-NN ordering break it." 
+ ) + samples = np.asarray([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]], dtype=global_dtype) + labels = [0, 1, -1] + clf = Estimator(**parameters).fit(samples, labels) + assert_allclose(clf.label_distributions_[2], [0.5, 0.5], atol=1e-2) + + +@pytest.mark.parametrize("Estimator, parameters", ESTIMATORS) +def test_predict(global_dtype, Estimator, parameters): + samples = np.asarray([[1.0, 0.0], [0.0, 2.0], [1.0, 3.0]], dtype=global_dtype) + labels = [0, 1, -1] + clf = Estimator(**parameters).fit(samples, labels) + assert_array_equal(clf.predict([[0.5, 2.5]]), np.array([1])) + + +@pytest.mark.parametrize("Estimator, parameters", ESTIMATORS) +def test_predict_proba(global_dtype, Estimator, parameters): + samples = np.asarray([[1.0, 0.0], [0.0, 1.0], [1.0, 2.5]], dtype=global_dtype) + labels = [0, 1, -1] + clf = Estimator(**parameters).fit(samples, labels) + assert_allclose(clf.predict_proba([[1.0, 1.0]]), np.array([[0.5, 0.5]])) + + +@pytest.mark.parametrize("alpha", [0.1, 0.3, 0.5, 0.7, 0.9]) +@pytest.mark.parametrize("Estimator, parameters", ESTIMATORS) +def test_label_spreading_closed_form(global_dtype, Estimator, parameters, alpha): + n_classes = 2 + X, y = make_classification(n_classes=n_classes, n_samples=200, random_state=0) + X = X.astype(global_dtype, copy=False) + y[::3] = -1 + + gamma = 0.1 + clf = label_propagation.LabelSpreading(gamma=gamma).fit(X, y) + # adopting notation from Zhou et al (2004): + S = clf._build_graph() + Y = np.zeros((len(y), n_classes + 1), dtype=X.dtype) + Y[np.arange(len(y)), y] = 1 + Y = Y[:, :-1] + + expected = np.dot(np.linalg.inv(np.eye(len(S), dtype=S.dtype) - alpha * S), Y) + expected /= expected.sum(axis=1)[:, np.newaxis] + + clf = label_propagation.LabelSpreading( + max_iter=100, alpha=alpha, tol=1e-10, gamma=gamma + ) + clf.fit(X, y) + + assert_allclose(expected, clf.label_distributions_) + + +def test_label_propagation_closed_form(global_dtype): + n_classes = 2 + X, y = make_classification(n_classes=n_classes, n_samples=200, random_state=0) + X = X.astype(global_dtype, copy=False) + y[::3] = -1 + Y = np.zeros((len(y), n_classes + 1)) + Y[np.arange(len(y)), y] = 1 + unlabelled_idx = Y[:, (-1,)].nonzero()[0] + labelled_idx = (Y[:, (-1,)] == 0).nonzero()[0] + + clf = label_propagation.LabelPropagation(max_iter=100, tol=1e-10, gamma=0.1) + clf.fit(X, y) + # adopting notation from Zhu et al 2002 + T_bar = clf._build_graph() + Tuu = T_bar[tuple(np.meshgrid(unlabelled_idx, unlabelled_idx, indexing="ij"))] + Tul = T_bar[tuple(np.meshgrid(unlabelled_idx, labelled_idx, indexing="ij"))] + Y = Y[:, :-1] + Y_l = Y[labelled_idx, :] + Y_u = np.dot(np.dot(np.linalg.inv(np.eye(Tuu.shape[0]) - Tuu), Tul), Y_l) + + expected = Y.copy() + expected[unlabelled_idx, :] = Y_u + expected /= expected.sum(axis=1)[:, np.newaxis] + + assert_allclose(expected, clf.label_distributions_, atol=1e-4) + + +@pytest.mark.parametrize("accepted_sparse_type", ["sparse_csr", "sparse_csc"]) +@pytest.mark.parametrize("index_dtype", [np.int32, np.int64]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("Estimator, parameters", ESTIMATORS) +def test_sparse_input_types( + accepted_sparse_type, index_dtype, dtype, Estimator, parameters +): + # This is non-regression test for #17085 + X = _convert_container([[1.0, 0.0], [0.0, 2.0], [1.0, 3.0]], accepted_sparse_type) + X.data = X.data.astype(dtype, copy=False) + X.indices = X.indices.astype(index_dtype, copy=False) + X.indptr = X.indptr.astype(index_dtype, copy=False) + labels = [0, 1, -1] + clf = 
Estimator(**parameters).fit(X, labels) + assert_array_equal(clf.predict([[0.5, 2.5]]), np.array([1])) + + +@pytest.mark.parametrize("constructor_type", CONSTRUCTOR_TYPES) +def test_convergence_speed(constructor_type): + # This is a non-regression test for #5774 + X = _convert_container([[1.0, 0.0], [0.0, 1.0], [1.0, 2.5]], constructor_type) + y = np.array([0, 1, -1]) + mdl = label_propagation.LabelSpreading(kernel="rbf", max_iter=5000) + mdl.fit(X, y) + + # this should converge quickly: + assert mdl.n_iter_ < 10 + assert_array_equal(mdl.predict(X), [0, 1, 1]) + + +def test_convergence_warning(): + # This is a non-regression test for #5774 + X = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 2.5]]) + y = np.array([0, 1, -1]) + mdl = label_propagation.LabelSpreading(kernel="rbf", max_iter=1) + warn_msg = "max_iter=1 was reached without convergence." + with pytest.warns(ConvergenceWarning, match=warn_msg): + mdl.fit(X, y) + assert mdl.n_iter_ == mdl.max_iter + + mdl = label_propagation.LabelPropagation(kernel="rbf", max_iter=1) + with pytest.warns(ConvergenceWarning, match=warn_msg): + mdl.fit(X, y) + assert mdl.n_iter_ == mdl.max_iter + + mdl = label_propagation.LabelSpreading(kernel="rbf", max_iter=500) + with warnings.catch_warnings(): + warnings.simplefilter("error", ConvergenceWarning) + mdl.fit(X, y) + + mdl = label_propagation.LabelPropagation(kernel="rbf", max_iter=500) + with warnings.catch_warnings(): + warnings.simplefilter("error", ConvergenceWarning) + mdl.fit(X, y) + + +@pytest.mark.parametrize( + "LabelPropagationCls", + [label_propagation.LabelSpreading, label_propagation.LabelPropagation], +) +def test_label_propagation_non_zero_normalizer(LabelPropagationCls): + # check that we don't divide by zero in case of null normalizer + # non-regression test for + # https://github.com/scikit-learn/scikit-learn/pull/15946 + # https://github.com/scikit-learn/scikit-learn/issues/9292 + X = np.array([[100.0, 100.0], [100.0, 100.0], [0.0, 0.0], [0.0, 0.0]]) + y = np.array([0, 1, -1, -1]) + mdl = LabelPropagationCls(kernel="knn", max_iter=100, n_neighbors=1) + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + mdl.fit(X, y) + + +def test_predict_sparse_callable_kernel(global_dtype): + # This is a non-regression test for #15866 + + # Custom sparse kernel (top-K RBF) + def topk_rbf(X, Y=None, n_neighbors=10, gamma=1e-5): + nn = NearestNeighbors(n_neighbors=10, metric="euclidean", n_jobs=2) + nn.fit(X) + W = -1 * nn.kneighbors_graph(Y, mode="distance").power(2) * gamma + np.exp(W.data, out=W.data) + assert issparse(W) + return W.T + + n_classes = 4 + n_samples = 500 + n_test = 10 + X, y = make_classification( + n_classes=n_classes, + n_samples=n_samples, + n_features=20, + n_informative=20, + n_redundant=0, + n_repeated=0, + random_state=0, + ) + X = X.astype(global_dtype) + + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=n_test, random_state=0 + ) + + model = label_propagation.LabelSpreading(kernel=topk_rbf) + model.fit(X_train, y_train) + assert model.score(X_test, y_test) >= 0.9 + + model = label_propagation.LabelPropagation(kernel=topk_rbf) + model.fit(X_train, y_train) + assert model.score(X_test, y_test) >= 0.9 diff --git a/.venv/lib/python3.12/site-packages/sklearn/semi_supervised/tests/test_self_training.py b/.venv/lib/python3.12/site-packages/sklearn/semi_supervised/tests/test_self_training.py new file mode 100644 index 0000000000000000000000000000000000000000..02244063994d573537d7194c2837f8e80ffad0c6 --- /dev/null +++ 
b/.venv/lib/python3.12/site-packages/sklearn/semi_supervised/tests/test_self_training.py @@ -0,0 +1,395 @@ +from math import ceil + +import numpy as np +import pytest +from numpy.testing import assert_array_equal + +from sklearn.datasets import load_iris, make_blobs +from sklearn.ensemble import StackingClassifier +from sklearn.exceptions import NotFittedError +from sklearn.metrics import accuracy_score +from sklearn.model_selection import train_test_split +from sklearn.neighbors import KNeighborsClassifier +from sklearn.semi_supervised import SelfTrainingClassifier +from sklearn.svm import SVC +from sklearn.tests.test_pipeline import SimpleEstimator +from sklearn.tree import DecisionTreeClassifier + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +# load the iris dataset and randomly permute it +iris = load_iris() +X_train, X_test, y_train, y_test = train_test_split( + iris.data, iris.target, random_state=0 +) + +n_labeled_samples = 50 + +y_train_missing_labels = y_train.copy() +y_train_missing_labels[n_labeled_samples:] = -1 +mapping = {0: "A", 1: "B", 2: "C", -1: "-1"} +y_train_missing_strings = np.vectorize(mapping.get)(y_train_missing_labels).astype( + object +) +y_train_missing_strings[y_train_missing_labels == -1] = -1 + + +def test_warns_k_best(): + st = SelfTrainingClassifier(KNeighborsClassifier(), criterion="k_best", k_best=1000) + with pytest.warns(UserWarning, match="k_best is larger than"): + st.fit(X_train, y_train_missing_labels) + + assert st.termination_condition_ == "all_labeled" + + +@pytest.mark.parametrize( + "estimator", + [KNeighborsClassifier(), SVC(gamma="scale", probability=True, random_state=0)], +) +@pytest.mark.parametrize("selection_crit", ["threshold", "k_best"]) +def test_classification(estimator, selection_crit): + # Check classification for various parameter settings. + # Also assert that predictions for strings and numerical labels are equal. 
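The fixtures above follow the module's labeling convention: unlabeled samples carry the integer label -1, and string targets therefore have to use object dtype so the -1 marker can coexist with the class names. In short (values illustrative):

import numpy as np

y_numeric = np.array([0, 1, 2, -1, -1])                      # -1 marks unlabeled rows
y_strings = np.array(["A", "B", "C", -1, -1], dtype=object)  # object dtype, not '<U' strings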
+ # Also test for multioutput classification + threshold = 0.75 + max_iter = 10 + st = SelfTrainingClassifier( + estimator, max_iter=max_iter, threshold=threshold, criterion=selection_crit + ) + st.fit(X_train, y_train_missing_labels) + pred = st.predict(X_test) + proba = st.predict_proba(X_test) + + st_string = SelfTrainingClassifier( + estimator, max_iter=max_iter, criterion=selection_crit, threshold=threshold + ) + st_string.fit(X_train, y_train_missing_strings) + pred_string = st_string.predict(X_test) + proba_string = st_string.predict_proba(X_test) + + assert_array_equal(np.vectorize(mapping.get)(pred), pred_string) + assert_array_equal(proba, proba_string) + + assert st.termination_condition_ == st_string.termination_condition_ + # Check consistency between labeled_iter, n_iter and max_iter + labeled = y_train_missing_labels != -1 + # assert that labeled samples have labeled_iter = 0 + assert_array_equal(st.labeled_iter_ == 0, labeled) + # assert that labeled samples do not change label during training + assert_array_equal(y_train_missing_labels[labeled], st.transduction_[labeled]) + + # assert that the max of the iterations is less than the total amount of + # iterations + assert np.max(st.labeled_iter_) <= st.n_iter_ <= max_iter + assert np.max(st_string.labeled_iter_) <= st_string.n_iter_ <= max_iter + + # check shapes + assert st.labeled_iter_.shape == st.transduction_.shape + assert st_string.labeled_iter_.shape == st_string.transduction_.shape + + +def test_k_best(): + st = SelfTrainingClassifier( + KNeighborsClassifier(n_neighbors=1), + criterion="k_best", + k_best=10, + max_iter=None, + ) + y_train_only_one_label = np.copy(y_train) + y_train_only_one_label[1:] = -1 + n_samples = y_train.shape[0] + + n_expected_iter = ceil((n_samples - 1) / 10) + st.fit(X_train, y_train_only_one_label) + assert st.n_iter_ == n_expected_iter + + # Check labeled_iter_ + assert np.sum(st.labeled_iter_ == 0) == 1 + for i in range(1, n_expected_iter): + assert np.sum(st.labeled_iter_ == i) == 10 + assert np.sum(st.labeled_iter_ == n_expected_iter) == (n_samples - 1) % 10 + assert st.termination_condition_ == "all_labeled" + + +def test_sanity_classification(): + estimator = SVC(gamma="scale", probability=True) + estimator.fit(X_train[n_labeled_samples:], y_train[n_labeled_samples:]) + + st = SelfTrainingClassifier(estimator) + st.fit(X_train, y_train_missing_labels) + + pred1, pred2 = estimator.predict(X_test), st.predict(X_test) + assert not np.array_equal(pred1, pred2) + score_supervised = accuracy_score(estimator.predict(X_test), y_test) + score_self_training = accuracy_score(st.predict(X_test), y_test) + + assert score_self_training > score_supervised + + +def test_none_iter(): + # Check that the all samples were labeled after a 'reasonable' number of + # iterations. + st = SelfTrainingClassifier(KNeighborsClassifier(), threshold=0.55, max_iter=None) + st.fit(X_train, y_train_missing_labels) + + assert st.n_iter_ < 10 + assert st.termination_condition_ == "all_labeled" + + +@pytest.mark.parametrize( + "estimator", + [KNeighborsClassifier(), SVC(gamma="scale", probability=True, random_state=0)], +) +@pytest.mark.parametrize("y", [y_train_missing_labels, y_train_missing_strings]) +def test_zero_iterations(estimator, y): + # Check classification for zero iterations. + # Fitting a SelfTrainingClassifier with zero iterations should give the + # same results as fitting a supervised classifier. + # This also asserts that string arrays work as expected. 
+ + clf1 = SelfTrainingClassifier(estimator, max_iter=0) + + clf1.fit(X_train, y) + + clf2 = estimator.fit(X_train[:n_labeled_samples], y[:n_labeled_samples]) + + assert_array_equal(clf1.predict(X_test), clf2.predict(X_test)) + assert clf1.termination_condition_ == "max_iter" + + +def test_prefitted_throws_error(): + # Test that passing a pre-fitted classifier and calling predict throws an + # error + knn = KNeighborsClassifier() + knn.fit(X_train, y_train) + st = SelfTrainingClassifier(knn) + with pytest.raises( + NotFittedError, + match="This SelfTrainingClassifier instance is not fitted yet", + ): + st.predict(X_train) + + +@pytest.mark.parametrize("max_iter", range(1, 5)) +def test_labeled_iter(max_iter): + # Check that the amount of datapoints labeled in iteration 0 is equal to + # the amount of labeled datapoints we passed. + st = SelfTrainingClassifier(KNeighborsClassifier(), max_iter=max_iter) + + st.fit(X_train, y_train_missing_labels) + amount_iter_0 = len(st.labeled_iter_[st.labeled_iter_ == 0]) + assert amount_iter_0 == n_labeled_samples + # Check that the max of the iterations is less than the total amount of + # iterations + assert np.max(st.labeled_iter_) <= st.n_iter_ <= max_iter + + +def test_no_unlabeled(): + # Test that training on a fully labeled dataset produces the same results + # as training the classifier by itself. + knn = KNeighborsClassifier() + knn.fit(X_train, y_train) + st = SelfTrainingClassifier(knn) + with pytest.warns(UserWarning, match="y contains no unlabeled samples"): + st.fit(X_train, y_train) + assert_array_equal(knn.predict(X_test), st.predict(X_test)) + # Assert that all samples were labeled in iteration 0 (since there were no + # unlabeled samples). + assert np.all(st.labeled_iter_ == 0) + assert st.termination_condition_ == "all_labeled" + + +def test_early_stopping(): + svc = SVC(gamma="scale", probability=True) + st = SelfTrainingClassifier(svc) + X_train_easy = [[1], [0], [1], [0.5]] + y_train_easy = [1, 0, -1, -1] + # X = [[0.5]] cannot be predicted on with a high confidence, so training + # stops early + st.fit(X_train_easy, y_train_easy) + assert st.n_iter_ == 1 + assert st.termination_condition_ == "no_change" + + +def test_strings_dtype(): + clf = SelfTrainingClassifier(KNeighborsClassifier()) + X, y = make_blobs(n_samples=30, random_state=0, cluster_std=0.1) + labels_multiclass = ["one", "two", "three"] + + y_strings = np.take(labels_multiclass, y) + + with pytest.raises(ValueError, match="dtype"): + clf.fit(X, y_strings) + + +@pytest.mark.parametrize("verbose", [True, False]) +def test_verbose(capsys, verbose): + clf = SelfTrainingClassifier(KNeighborsClassifier(), verbose=verbose) + clf.fit(X_train, y_train_missing_labels) + + captured = capsys.readouterr() + + if verbose: + assert "iteration" in captured.out + else: + assert "iteration" not in captured.out + + +def test_verbose_k_best(capsys): + st = SelfTrainingClassifier( + KNeighborsClassifier(n_neighbors=1), + criterion="k_best", + k_best=10, + verbose=True, + max_iter=None, + ) + + y_train_only_one_label = np.copy(y_train) + y_train_only_one_label[1:] = -1 + n_samples = y_train.shape[0] + + n_expected_iter = ceil((n_samples - 1) / 10) + st.fit(X_train, y_train_only_one_label) + + captured = capsys.readouterr() + + msg = "End of iteration {}, added {} new labels." 
+ for i in range(1, n_expected_iter): + assert msg.format(i, 10) in captured.out + + assert msg.format(n_expected_iter, (n_samples - 1) % 10) in captured.out + + +def test_k_best_selects_best(): + # Tests that the labels added by st really are the 10 best labels. + svc = SVC(gamma="scale", probability=True, random_state=0) + st = SelfTrainingClassifier(svc, criterion="k_best", max_iter=1, k_best=10) + has_label = y_train_missing_labels != -1 + st.fit(X_train, y_train_missing_labels) + + got_label = ~has_label & (st.transduction_ != -1) + + svc.fit(X_train[has_label], y_train_missing_labels[has_label]) + pred = svc.predict_proba(X_train[~has_label]) + max_proba = np.max(pred, axis=1) + + most_confident_svc = X_train[~has_label][np.argsort(max_proba)[-10:]] + added_by_st = X_train[np.where(got_label)].tolist() + + for row in most_confident_svc.tolist(): + assert row in added_by_st + + +def test_estimator_meta_estimator(): + # Check that a meta-estimator relying on an estimator implementing + # `predict_proba` will work even if it does not expose this method before being + # fitted. + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/19119 + + estimator = StackingClassifier( + estimators=[ + ("svc_1", SVC(probability=True)), + ("svc_2", SVC(probability=True)), + ], + final_estimator=SVC(probability=True), + cv=2, + ) + + assert hasattr(estimator, "predict_proba") + clf = SelfTrainingClassifier(estimator=estimator) + clf.fit(X_train, y_train_missing_labels) + clf.predict_proba(X_test) + + estimator = StackingClassifier( + estimators=[ + ("svc_1", SVC(probability=False)), + ("svc_2", SVC(probability=False)), + ], + final_estimator=SVC(probability=False), + cv=2, + ) + + assert not hasattr(estimator, "predict_proba") + clf = SelfTrainingClassifier(estimator=estimator) + with pytest.raises(AttributeError): + clf.fit(X_train, y_train_missing_labels) + + +def test_self_training_estimator_attribute_error(): + """Check that we raise the proper AttributeErrors when the `estimator` + does not implement the `predict_proba` method, which is called from within + `fit`, or `decision_function`, which is decorated with `available_if`. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/28108 + """ + # `SVC` with `probability=False` does not implement 'predict_proba' that + # is required internally in `fit` of `SelfTrainingClassifier`. We expect + # an AttributeError to be raised. 
+ estimator = SVC(probability=False, gamma="scale") + self_training = SelfTrainingClassifier(estimator) + + with pytest.raises(AttributeError, match="has no attribute 'predict_proba'"): + self_training.fit(X_train, y_train_missing_labels) + + # `DecisionTreeClassifier` does not implement 'decision_function' and + # should raise an AttributeError + self_training = SelfTrainingClassifier(estimator=DecisionTreeClassifier()) + + outer_msg = "This 'SelfTrainingClassifier' has no attribute 'decision_function'" + inner_msg = "'DecisionTreeClassifier' object has no attribute 'decision_function'" + with pytest.raises(AttributeError, match=outer_msg) as exec_info: + self_training.fit(X_train, y_train_missing_labels).decision_function(X_train) + assert isinstance(exec_info.value.__cause__, AttributeError) + assert inner_msg in str(exec_info.value.__cause__) + + +# TODO(1.8): remove in 1.8 +def test_deprecation_warning_base_estimator(): + warn_msg = "`base_estimator` has been deprecated in 1.6 and will be removed" + with pytest.warns(FutureWarning, match=warn_msg): + SelfTrainingClassifier(base_estimator=DecisionTreeClassifier()).fit( + X_train, y_train_missing_labels + ) + + error_msg = "You must pass an estimator to SelfTrainingClassifier" + with pytest.raises(ValueError, match=error_msg): + SelfTrainingClassifier().fit(X_train, y_train_missing_labels) + + error_msg = "You must pass only one estimator to SelfTrainingClassifier." + with pytest.raises(ValueError, match=error_msg): + SelfTrainingClassifier( + base_estimator=DecisionTreeClassifier(), estimator=DecisionTreeClassifier() + ).fit(X_train, y_train_missing_labels) + + +# Metadata routing tests +# ================================================================= + + +@pytest.mark.filterwarnings("ignore:y contains no unlabeled samples:UserWarning") +@pytest.mark.parametrize( + "method", ["decision_function", "predict_log_proba", "predict_proba", "predict"] +) +def test_routing_passed_metadata_not_supported(method): + """Test that the right error message is raised when metadata is passed while + not supported when `enable_metadata_routing=False`.""" + est = SelfTrainingClassifier(estimator=SimpleEstimator()) + with pytest.raises( + ValueError, match="is only supported if enable_metadata_routing=True" + ): + est.fit([[1], [1]], [1, 1], sample_weight=[1], prop="a") + + est = SelfTrainingClassifier(estimator=SimpleEstimator()) + with pytest.raises( + ValueError, match="is only supported if enable_metadata_routing=True" + ): + # make sure that the estimator thinks it is already fitted + est.fitted_params_ = True + getattr(est, method)([[1]], sample_weight=[1], prop="a") + + +# End of routing tests +# ==================== diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/svm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a039d2e15abddf5aaca8faad462b1b951ec6e18a --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/__init__.py @@ -0,0 +1,21 @@ +"""Support vector machine algorithms.""" + +# See http://scikit-learn.sourceforge.net/modules/svm.html for complete +# documentation. 
+ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ._bounds import l1_min_c +from ._classes import SVC, SVR, LinearSVC, LinearSVR, NuSVC, NuSVR, OneClassSVM + +__all__ = [ + "SVC", + "SVR", + "LinearSVC", + "LinearSVR", + "NuSVC", + "NuSVR", + "OneClassSVM", + "l1_min_c", +] diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/_base.py b/.venv/lib/python3.12/site-packages/sklearn/svm/_base.py new file mode 100644 index 0000000000000000000000000000000000000000..db295e4e877b50e7dff639de4dd6bb98c95d7b91 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/_base.py @@ -0,0 +1,1262 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from abc import ABCMeta, abstractmethod +from numbers import Integral, Real + +import numpy as np +import scipy.sparse as sp + +from ..base import BaseEstimator, ClassifierMixin, _fit_context +from ..exceptions import ConvergenceWarning, NotFittedError +from ..preprocessing import LabelEncoder +from ..utils import check_array, check_random_state, column_or_1d, compute_class_weight +from ..utils._param_validation import Interval, StrOptions +from ..utils.extmath import safe_sparse_dot +from ..utils.metaestimators import available_if +from ..utils.multiclass import _ovr_decision_function, check_classification_targets +from ..utils.validation import ( + _check_large_sparse, + _check_sample_weight, + _num_samples, + check_consistent_length, + check_is_fitted, + validate_data, +) +from . import _liblinear as liblinear # type: ignore[attr-defined] + +# mypy error: error: Module 'sklearn.svm' has no attribute '_libsvm' +# (and same for other imports) +from . import _libsvm as libsvm # type: ignore[attr-defined] +from . import _libsvm_sparse as libsvm_sparse # type: ignore[attr-defined] + +LIBSVM_IMPL = ["c_svc", "nu_svc", "one_class", "epsilon_svr", "nu_svr"] + + +def _one_vs_one_coef(dual_coef, n_support, support_vectors): + """Generate primal coefficients from dual coefficients + for the one-vs-one multi class LibSVM in the case + of a linear kernel.""" + + # get 1vs1 weights for all n*(n-1) classifiers. + # this is somewhat messy. + # shape of dual_coef_ is nSV * (n_classes -1) + # see docs for details + n_class = dual_coef.shape[0] + 1 + + # XXX we could do preallocation of coef but + # would have to take care in the sparse case + coef = [] + sv_locs = np.cumsum(np.hstack([[0], n_support])) + for class1 in range(n_class): + # SVs for class1: + sv1 = support_vectors[sv_locs[class1] : sv_locs[class1 + 1], :] + for class2 in range(class1 + 1, n_class): + # SVs for class1: + sv2 = support_vectors[sv_locs[class2] : sv_locs[class2 + 1], :] + + # dual coef for class1 SVs: + alpha1 = dual_coef[class2 - 1, sv_locs[class1] : sv_locs[class1 + 1]] + # dual coef for class2 SVs: + alpha2 = dual_coef[class1, sv_locs[class2] : sv_locs[class2 + 1]] + # build weight for class1 vs class2 + + coef.append(safe_sparse_dot(alpha1, sv1) + safe_sparse_dot(alpha2, sv2)) + return coef + + +class BaseLibSVM(BaseEstimator, metaclass=ABCMeta): + """Base class for estimators that use libsvm as backing library. + + This implements support vector machine classification and regression. + + Parameter documentation is in the derived `SVC` class. 
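A small sketch (not part of the patch) of what `_one_vs_one_coef` produces for users: a multiclass linear-kernel SVC keeps one primal weight vector per one-vs-one pair, built from the dual coefficients. The synthetic dataset is only for illustration.

from sklearn.datasets import make_classification
from sklearn.svm import SVC

X, y = make_classification(n_samples=200, n_features=8, n_informative=6,
                           n_classes=4, random_state=0)
clf = SVC(kernel="linear").fit(X, y)
print(clf.coef_.shape)       # (6, 8): 4 * (4 - 1) / 2 one-vs-one weight vectors
print(clf.dual_coef_.shape)  # (3, n_SV): n_classes - 1 rows of dual coefficients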
+ """ + + _parameter_constraints: dict = { + "kernel": [ + StrOptions({"linear", "poly", "rbf", "sigmoid", "precomputed"}), + callable, + ], + "degree": [Interval(Integral, 0, None, closed="left")], + "gamma": [ + StrOptions({"scale", "auto"}), + Interval(Real, 0.0, None, closed="left"), + ], + "coef0": [Interval(Real, None, None, closed="neither")], + "tol": [Interval(Real, 0.0, None, closed="neither")], + "C": [Interval(Real, 0.0, None, closed="right")], + "nu": [Interval(Real, 0.0, 1.0, closed="right")], + "epsilon": [Interval(Real, 0.0, None, closed="left")], + "shrinking": ["boolean"], + "probability": ["boolean"], + "cache_size": [Interval(Real, 0, None, closed="neither")], + "class_weight": [StrOptions({"balanced"}), dict, None], + "verbose": ["verbose"], + "max_iter": [Interval(Integral, -1, None, closed="left")], + "random_state": ["random_state"], + } + + # The order of these must match the integer values in LibSVM. + # XXX These are actually the same in the dense case. Need to factor + # this out. + _sparse_kernels = ["linear", "poly", "rbf", "sigmoid", "precomputed"] + + @abstractmethod + def __init__( + self, + kernel, + degree, + gamma, + coef0, + tol, + C, + nu, + epsilon, + shrinking, + probability, + cache_size, + class_weight, + verbose, + max_iter, + random_state, + ): + if self._impl not in LIBSVM_IMPL: + raise ValueError( + "impl should be one of %s, %s was given" % (LIBSVM_IMPL, self._impl) + ) + + self.kernel = kernel + self.degree = degree + self.gamma = gamma + self.coef0 = coef0 + self.tol = tol + self.C = C + self.nu = nu + self.epsilon = epsilon + self.shrinking = shrinking + self.probability = probability + self.cache_size = cache_size + self.class_weight = class_weight + self.verbose = verbose + self.max_iter = max_iter + self.random_state = random_state + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + # Used by cross_val_score. + tags.input_tags.pairwise = self.kernel == "precomputed" + tags.input_tags.sparse = self.kernel != "precomputed" + return tags + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, sample_weight=None): + """Fit the SVM model according to the given training data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) \ + or (n_samples, n_samples) + Training vectors, where `n_samples` is the number of samples + and `n_features` is the number of features. + For kernel="precomputed", the expected shape of X is + (n_samples, n_samples). + + y : array-like of shape (n_samples,) + Target values (class labels in classification, real numbers in + regression). + + sample_weight : array-like of shape (n_samples,), default=None + Per-sample weights. Rescale C per sample. Higher weights + force the classifier to put more emphasis on these points. + + Returns + ------- + self : object + Fitted estimator. + + Notes + ----- + If X and y are not C-ordered and contiguous arrays of np.float64 and + X is not a scipy.sparse.csr_matrix, X and/or y may be copied. + + If X is a dense array, then the other methods will not support sparse + matrices as input. 
+ """ + rnd = check_random_state(self.random_state) + + sparse = sp.issparse(X) + if sparse and self.kernel == "precomputed": + raise TypeError("Sparse precomputed kernels are not supported.") + self._sparse = sparse and not callable(self.kernel) + + if callable(self.kernel): + check_consistent_length(X, y) + else: + X, y = validate_data( + self, + X, + y, + dtype=np.float64, + order="C", + accept_sparse="csr", + accept_large_sparse=False, + ) + + y = self._validate_targets(y) + + sample_weight = np.asarray( + [] if sample_weight is None else sample_weight, dtype=np.float64 + ) + solver_type = LIBSVM_IMPL.index(self._impl) + + # input validation + n_samples = _num_samples(X) + if solver_type != 2 and n_samples != y.shape[0]: + raise ValueError( + "X and y have incompatible shapes.\n" + + "X has %s samples, but y has %s." % (n_samples, y.shape[0]) + ) + + if self.kernel == "precomputed" and n_samples != X.shape[1]: + raise ValueError( + "Precomputed matrix must be a square matrix." + " Input is a {}x{} matrix.".format(X.shape[0], X.shape[1]) + ) + + if sample_weight.shape[0] > 0 and sample_weight.shape[0] != n_samples: + raise ValueError( + "sample_weight and X have incompatible shapes: " + "%r vs %r\n" + "Note: Sparse matrices cannot be indexed w/" + "boolean masks (use `indices=True` in CV)." + % (sample_weight.shape, X.shape) + ) + + kernel = "precomputed" if callable(self.kernel) else self.kernel + + if kernel == "precomputed": + # unused but needs to be a float for cython code that ignores + # it anyway + self._gamma = 0.0 + elif isinstance(self.gamma, str): + if self.gamma == "scale": + # var = E[X^2] - E[X]^2 if sparse + X_var = (X.multiply(X)).mean() - (X.mean()) ** 2 if sparse else X.var() + self._gamma = 1.0 / (X.shape[1] * X_var) if X_var != 0 else 1.0 + elif self.gamma == "auto": + self._gamma = 1.0 / X.shape[1] + elif isinstance(self.gamma, Real): + self._gamma = self.gamma + + fit = self._sparse_fit if self._sparse else self._dense_fit + if self.verbose: + print("[LibSVM]", end="") + + seed = rnd.randint(np.iinfo("i").max) + fit(X, y, sample_weight, solver_type, kernel, random_seed=seed) + # see comment on the other call to np.iinfo in this file + + self.shape_fit_ = X.shape if hasattr(X, "shape") else (n_samples,) + + # In binary case, we need to flip the sign of coef, intercept and + # decision function. Use self._intercept_ and self._dual_coef_ + # internally. + self._intercept_ = self.intercept_.copy() + self._dual_coef_ = self.dual_coef_ + if self._impl in ["c_svc", "nu_svc"] and len(self.classes_) == 2: + self.intercept_ *= -1 + self.dual_coef_ = -self.dual_coef_ + + dual_coef = self._dual_coef_.data if self._sparse else self._dual_coef_ + intercept_finiteness = np.isfinite(self._intercept_).all() + dual_coef_finiteness = np.isfinite(dual_coef).all() + if not (intercept_finiteness and dual_coef_finiteness): + raise ValueError( + "The dual coefficients or intercepts are not finite." + " The input data may contain large values and need to be" + " preprocessed." + ) + + # Since, in the case of SVC and NuSVC, the number of models optimized by + # libSVM could be greater than one (depending on the input), `n_iter_` + # stores an ndarray. + # For the other sub-classes (SVR, NuSVR, and OneClassSVM), the number of + # models optimized by libSVM is always one, so `n_iter_` stores an + # integer. 
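A sketch (not part of the patch) of the `gamma="scale"` branch in `fit` above: for dense input it resolves to 1 / (n_features * X.var()), so passing that value explicitly should give an identical model on the same data.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.svm import SVC

X, y = make_classification(n_samples=100, random_state=0)
gamma_scale = 1.0 / (X.shape[1] * X.var())     # what gamma="scale" amounts to here
a = SVC(gamma="scale").fit(X, y)
b = SVC(gamma=gamma_scale).fit(X, y)
print(np.allclose(a.decision_function(X), b.decision_function(X)))  # True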
+ if self._impl in ["c_svc", "nu_svc"]: + self.n_iter_ = self._num_iter + else: + self.n_iter_ = self._num_iter.item() + + return self + + def _validate_targets(self, y): + """Validation of y and class_weight. + + Default implementation for SVR and one-class; overridden in BaseSVC. + """ + return column_or_1d(y, warn=True).astype(np.float64, copy=False) + + def _warn_from_fit_status(self): + assert self.fit_status_ in (0, 1) + if self.fit_status_ == 1: + warnings.warn( + "Solver terminated early (max_iter=%i)." + " Consider pre-processing your data with" + " StandardScaler or MinMaxScaler." % self.max_iter, + ConvergenceWarning, + ) + + def _dense_fit(self, X, y, sample_weight, solver_type, kernel, random_seed): + if callable(self.kernel): + # you must store a reference to X to compute the kernel in predict + # TODO: add keyword copy to copy on demand + self.__Xfit = X + X = self._compute_kernel(X) + + if X.shape[0] != X.shape[1]: + raise ValueError("X.shape[0] should be equal to X.shape[1]") + + libsvm.set_verbosity_wrap(self.verbose) + + # we don't pass **self.get_params() to allow subclasses to + # add other parameters to __init__ + ( + self.support_, + self.support_vectors_, + self._n_support, + self.dual_coef_, + self.intercept_, + self._probA, + self._probB, + self.fit_status_, + self._num_iter, + ) = libsvm.fit( + X, + y, + svm_type=solver_type, + sample_weight=sample_weight, + class_weight=getattr(self, "class_weight_", np.empty(0)), + kernel=kernel, + C=self.C, + nu=self.nu, + probability=self.probability, + degree=self.degree, + shrinking=self.shrinking, + tol=self.tol, + cache_size=self.cache_size, + coef0=self.coef0, + gamma=self._gamma, + epsilon=self.epsilon, + max_iter=self.max_iter, + random_seed=random_seed, + ) + + self._warn_from_fit_status() + + def _sparse_fit(self, X, y, sample_weight, solver_type, kernel, random_seed): + X.data = np.asarray(X.data, dtype=np.float64, order="C") + X.sort_indices() + + kernel_type = self._sparse_kernels.index(kernel) + + libsvm_sparse.set_verbosity_wrap(self.verbose) + + ( + self.support_, + self.support_vectors_, + dual_coef_data, + self.intercept_, + self._n_support, + self._probA, + self._probB, + self.fit_status_, + self._num_iter, + ) = libsvm_sparse.libsvm_sparse_train( + X.shape[1], + X.data, + X.indices, + X.indptr, + y, + solver_type, + kernel_type, + self.degree, + self._gamma, + self.coef0, + self.tol, + self.C, + getattr(self, "class_weight_", np.empty(0)), + sample_weight, + self.nu, + self.cache_size, + self.epsilon, + int(self.shrinking), + int(self.probability), + self.max_iter, + random_seed, + ) + + self._warn_from_fit_status() + + if hasattr(self, "classes_"): + n_class = len(self.classes_) - 1 + else: # regression + n_class = 1 + n_SV = self.support_vectors_.shape[0] + + dual_coef_indices = np.tile(np.arange(n_SV), n_class) + if not n_SV: + self.dual_coef_ = sp.csr_matrix([]) + else: + dual_coef_indptr = np.arange( + 0, dual_coef_indices.size + 1, dual_coef_indices.size / n_class + ) + self.dual_coef_ = sp.csr_matrix( + (dual_coef_data, dual_coef_indices, dual_coef_indptr), (n_class, n_SV) + ) + + def predict(self, X): + """Perform regression on samples in X. + + For an one-class model, +1 (inlier) or -1 (outlier) is returned. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + For kernel="precomputed", the expected shape of X is + (n_samples_test, n_samples_train). + + Returns + ------- + y_pred : ndarray of shape (n_samples,) + The predicted values. 
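A short sketch (not part of the patch) of the precomputed-kernel shape contract stated in the `predict` docstring above: `fit` takes an (n_train, n_train) Gram matrix, `predict` an (n_test, n_train) one. The linear Gram matrices here are only an example choice of kernel.

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

X, y = make_classification(n_samples=80, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
clf = SVC(kernel="precomputed").fit(X_tr @ X_tr.T, y_tr)  # square (n_train, n_train) Gram matrix
print(clf.predict(X_te @ X_tr.T).shape)                   # (n_test,)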
+ """ + X = self._validate_for_predict(X) + predict = self._sparse_predict if self._sparse else self._dense_predict + return predict(X) + + def _dense_predict(self, X): + X = self._compute_kernel(X) + if X.ndim == 1: + X = check_array(X, order="C", accept_large_sparse=False) + + kernel = self.kernel + if callable(self.kernel): + kernel = "precomputed" + if X.shape[1] != self.shape_fit_[0]: + raise ValueError( + "X.shape[1] = %d should be equal to %d, " + "the number of samples at training time" + % (X.shape[1], self.shape_fit_[0]) + ) + + svm_type = LIBSVM_IMPL.index(self._impl) + + return libsvm.predict( + X, + self.support_, + self.support_vectors_, + self._n_support, + self._dual_coef_, + self._intercept_, + self._probA, + self._probB, + svm_type=svm_type, + kernel=kernel, + degree=self.degree, + coef0=self.coef0, + gamma=self._gamma, + cache_size=self.cache_size, + ) + + def _sparse_predict(self, X): + # Precondition: X is a csr_matrix of dtype np.float64. + kernel = self.kernel + if callable(kernel): + kernel = "precomputed" + + kernel_type = self._sparse_kernels.index(kernel) + + C = 0.0 # C is not useful here + + return libsvm_sparse.libsvm_sparse_predict( + X.data, + X.indices, + X.indptr, + self.support_vectors_.data, + self.support_vectors_.indices, + self.support_vectors_.indptr, + self._dual_coef_.data, + self._intercept_, + LIBSVM_IMPL.index(self._impl), + kernel_type, + self.degree, + self._gamma, + self.coef0, + self.tol, + C, + getattr(self, "class_weight_", np.empty(0)), + self.nu, + self.epsilon, + self.shrinking, + self.probability, + self._n_support, + self._probA, + self._probB, + ) + + def _compute_kernel(self, X): + """Return the data transformed by a callable kernel""" + if callable(self.kernel): + # in the case of precomputed kernel given as a function, we + # have to compute explicitly the kernel matrix + kernel = self.kernel(X, self.__Xfit) + if sp.issparse(kernel): + kernel = kernel.toarray() + X = np.asarray(kernel, dtype=np.float64, order="C") + return X + + def _decision_function(self, X): + """Evaluates the decision function for the samples in X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + + Returns + ------- + X : array-like of shape (n_samples, n_class * (n_class-1) / 2) + Returns the decision function of the sample for each class + in the model. + """ + # NOTE: _validate_for_predict contains check for is_fitted + # hence must be placed before any other attributes are used. + X = self._validate_for_predict(X) + X = self._compute_kernel(X) + + if self._sparse: + dec_func = self._sparse_decision_function(X) + else: + dec_func = self._dense_decision_function(X) + + # In binary case, we need to flip the sign of coef, intercept and + # decision function. 
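A sketch (not part of the patch, and assuming the usual scikit-learn convention) of what the binary sign flip discussed above achieves: positive values of the public decision_function vote for classes_[1], consistent with predict.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.svm import SVC

X, y = make_classification(n_samples=100, random_state=0)
clf = SVC(kernel="linear").fit(X, y)
dec = clf.decision_function(X)  # shape (n_samples,) in the binary case
print(np.array_equal(clf.predict(X), clf.classes_[(dec > 0).astype(int)]))  # True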
+ if self._impl in ["c_svc", "nu_svc"] and len(self.classes_) == 2: + return -dec_func.ravel() + + return dec_func + + def _dense_decision_function(self, X): + X = check_array(X, dtype=np.float64, order="C", accept_large_sparse=False) + + kernel = self.kernel + if callable(kernel): + kernel = "precomputed" + + return libsvm.decision_function( + X, + self.support_, + self.support_vectors_, + self._n_support, + self._dual_coef_, + self._intercept_, + self._probA, + self._probB, + svm_type=LIBSVM_IMPL.index(self._impl), + kernel=kernel, + degree=self.degree, + cache_size=self.cache_size, + coef0=self.coef0, + gamma=self._gamma, + ) + + def _sparse_decision_function(self, X): + X.data = np.asarray(X.data, dtype=np.float64, order="C") + + kernel = self.kernel + if hasattr(kernel, "__call__"): + kernel = "precomputed" + + kernel_type = self._sparse_kernels.index(kernel) + + return libsvm_sparse.libsvm_sparse_decision_function( + X.data, + X.indices, + X.indptr, + self.support_vectors_.data, + self.support_vectors_.indices, + self.support_vectors_.indptr, + self._dual_coef_.data, + self._intercept_, + LIBSVM_IMPL.index(self._impl), + kernel_type, + self.degree, + self._gamma, + self.coef0, + self.tol, + self.C, + getattr(self, "class_weight_", np.empty(0)), + self.nu, + self.epsilon, + self.shrinking, + self.probability, + self._n_support, + self._probA, + self._probB, + ) + + def _validate_for_predict(self, X): + check_is_fitted(self) + + if not callable(self.kernel): + X = validate_data( + self, + X, + accept_sparse="csr", + dtype=np.float64, + order="C", + accept_large_sparse=False, + reset=False, + ) + + if self._sparse and not sp.issparse(X): + X = sp.csr_matrix(X) + if self._sparse: + X.sort_indices() + + if sp.issparse(X) and not self._sparse and not callable(self.kernel): + raise ValueError( + "cannot use sparse input in %r trained on dense data" + % type(self).__name__ + ) + + if self.kernel == "precomputed": + if X.shape[1] != self.shape_fit_[0]: + raise ValueError( + "X.shape[1] = %d should be equal to %d, " + "the number of samples at training time" + % (X.shape[1], self.shape_fit_[0]) + ) + # Fixes https://nvd.nist.gov/vuln/detail/CVE-2020-28975 + # Check that _n_support is consistent with support_vectors + sv = self.support_vectors_ + if not self._sparse and sv.size > 0 and self.n_support_.sum() != sv.shape[0]: + raise ValueError( + f"The internal representation of {self.__class__.__name__} was altered" + ) + return X + + @property + def coef_(self): + """Weights assigned to the features when `kernel="linear"`. + + Returns + ------- + ndarray of shape (n_features, n_classes) + """ + if self.kernel != "linear": + raise AttributeError("coef_ is only available when using a linear kernel") + + coef = self._get_coef() + + # coef_ being a read-only property, it's better to mark the value as + # immutable to avoid hiding potential bugs for the unsuspecting user. 
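A sketch (not part of the patch) of the `coef_` property above: it only exists for the linear kernel, the returned array is flagged read-only, and in the binary case it is simply dual_coef_ @ support_vectors_.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.svm import SVC

X, y = make_classification(n_samples=100, n_features=5, random_state=0)
clf = SVC(kernel="linear").fit(X, y)
print(np.allclose(clf.coef_, clf.dual_coef_ @ clf.support_vectors_))  # True in the binary case
try:
    clf.coef_[0, 0] = 0.0          # the returned array is marked non-writeable
except ValueError as exc:
    print(type(exc).__name__)      # ValueError
print(hasattr(SVC().fit(X, y), "coef_"))  # False: the rbf kernel exposes no primal coef_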
+ if sp.issparse(coef): + # sparse matrix do not have global flags + coef.data.flags.writeable = False + else: + # regular dense array + coef.flags.writeable = False + return coef + + def _get_coef(self): + return safe_sparse_dot(self._dual_coef_, self.support_vectors_) + + @property + def n_support_(self): + """Number of support vectors for each class.""" + try: + check_is_fitted(self) + except NotFittedError: + raise AttributeError + + svm_type = LIBSVM_IMPL.index(self._impl) + if svm_type in (0, 1): + return self._n_support + else: + # SVR and OneClass + # _n_support has size 2, we make it size 1 + return np.array([self._n_support[0]]) + + +class BaseSVC(ClassifierMixin, BaseLibSVM, metaclass=ABCMeta): + """ABC for LibSVM-based classifiers.""" + + _parameter_constraints: dict = { + **BaseLibSVM._parameter_constraints, + "decision_function_shape": [StrOptions({"ovr", "ovo"})], + "break_ties": ["boolean"], + } + for unused_param in ["epsilon", "nu"]: + _parameter_constraints.pop(unused_param) + + @abstractmethod + def __init__( + self, + kernel, + degree, + gamma, + coef0, + tol, + C, + nu, + shrinking, + probability, + cache_size, + class_weight, + verbose, + max_iter, + decision_function_shape, + random_state, + break_ties, + ): + self.decision_function_shape = decision_function_shape + self.break_ties = break_ties + super().__init__( + kernel=kernel, + degree=degree, + gamma=gamma, + coef0=coef0, + tol=tol, + C=C, + nu=nu, + epsilon=0.0, + shrinking=shrinking, + probability=probability, + cache_size=cache_size, + class_weight=class_weight, + verbose=verbose, + max_iter=max_iter, + random_state=random_state, + ) + + def _validate_targets(self, y): + y_ = column_or_1d(y, warn=True) + check_classification_targets(y) + cls, y = np.unique(y_, return_inverse=True) + self.class_weight_ = compute_class_weight(self.class_weight, classes=cls, y=y_) + if len(cls) < 2: + raise ValueError( + "The number of classes has to be greater than one; got %d class" + % len(cls) + ) + + self.classes_ = cls + + return np.asarray(y, dtype=np.float64, order="C") + + def decision_function(self, X): + """Evaluate the decision function for the samples in X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The input samples. + + Returns + ------- + X : ndarray of shape (n_samples, n_classes * (n_classes-1) / 2) + Returns the decision function of the sample for each class + in the model. + If decision_function_shape='ovr', the shape is (n_samples, + n_classes). + + Notes + ----- + If decision_function_shape='ovo', the function values are proportional + to the distance of the samples X to the separating hyperplane. If the + exact distances are required, divide the function values by the norm of + the weight vector (``coef_``). See also `this question + `_ for further details. + If decision_function_shape='ovr', the decision function is a monotonic + transformation of ovo decision function. + """ + dec = self._decision_function(X) + if self.decision_function_shape == "ovr" and len(self.classes_) > 2: + return _ovr_decision_function(dec < 0, -dec, len(self.classes_)) + return dec + + def predict(self, X): + """Perform classification on samples in X. + + For an one-class model, +1 or -1 is returned. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples_test, n_samples_train) + For kernel="precomputed", the expected shape of X is + (n_samples_test, n_samples_train). 
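A sketch (not part of the patch) of the `decision_function_shape` behaviour documented above, on an illustrative 4-class toy problem: 'ovo' yields n_classes * (n_classes - 1) / 2 columns, 'ovr' yields n_classes columns.

from sklearn.datasets import make_classification
from sklearn.svm import SVC

X, y = make_classification(n_samples=120, n_features=8, n_informative=6,
                           n_classes=4, random_state=0)
print(SVC(decision_function_shape="ovo").fit(X, y).decision_function(X).shape)  # (120, 6)
print(SVC(decision_function_shape="ovr").fit(X, y).decision_function(X).shape)  # (120, 4)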
+ + Returns + ------- + y_pred : ndarray of shape (n_samples,) + Class labels for samples in X. + """ + check_is_fitted(self) + if self.break_ties and self.decision_function_shape == "ovo": + raise ValueError( + "break_ties must be False when decision_function_shape is 'ovo'" + ) + + if ( + self.break_ties + and self.decision_function_shape == "ovr" + and len(self.classes_) > 2 + ): + y = np.argmax(self.decision_function(X), axis=1) + else: + y = super().predict(X) + return self.classes_.take(np.asarray(y, dtype=np.intp)) + + # Hacky way of getting predict_proba to raise an AttributeError when + # probability=False using properties. Do not use this in new code; when + # probabilities are not available depending on a setting, introduce two + # estimators. + def _check_proba(self): + if not self.probability: + raise AttributeError( + "predict_proba is not available when probability=False" + ) + if self._impl not in ("c_svc", "nu_svc"): + raise AttributeError("predict_proba only implemented for SVC and NuSVC") + return True + + @available_if(_check_proba) + def predict_proba(self, X): + """Compute probabilities of possible outcomes for samples in X. + + The model needs to have probability information computed at training + time: fit with attribute `probability` set to True. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + For kernel="precomputed", the expected shape of X is + (n_samples_test, n_samples_train). + + Returns + ------- + T : ndarray of shape (n_samples, n_classes) + Returns the probability of the sample for each class in + the model. The columns correspond to the classes in sorted + order, as they appear in the attribute :term:`classes_`. + + Notes + ----- + The probability model is created using cross validation, so + the results can be slightly different than those obtained by + predict. Also, it will produce meaningless results on very small + datasets. + """ + X = self._validate_for_predict(X) + if self.probA_.size == 0 or self.probB_.size == 0: + raise NotFittedError( + "predict_proba is not available when fitted with probability=False" + ) + pred_proba = ( + self._sparse_predict_proba if self._sparse else self._dense_predict_proba + ) + return pred_proba(X) + + @available_if(_check_proba) + def predict_log_proba(self, X): + """Compute log probabilities of possible outcomes for samples in X. + + The model need to have probability information computed at training + time: fit with attribute `probability` set to True. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) or \ + (n_samples_test, n_samples_train) + For kernel="precomputed", the expected shape of X is + (n_samples_test, n_samples_train). + + Returns + ------- + T : ndarray of shape (n_samples, n_classes) + Returns the log-probabilities of the sample for each class in + the model. The columns correspond to the classes in sorted + order, as they appear in the attribute :term:`classes_`. + + Notes + ----- + The probability model is created using cross validation, so + the results can be slightly different than those obtained by + predict. Also, it will produce meaningless results on very small + datasets. 
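A sketch (not part of the patch) of what the `available_if` gating and the `probability` flag described above mean in practice for users of `SVC`.

from sklearn.datasets import make_classification
from sklearn.svm import SVC

X, y = make_classification(n_samples=100, random_state=0)
print(hasattr(SVC(probability=False), "predict_proba"))  # False: hidden by available_if
clf = SVC(probability=True, random_state=0).fit(X, y)    # enables internal cross-validated Platt scaling
print(clf.predict_proba(X[:3]).sum(axis=1))              # each row sums to 1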
+ """ + return np.log(self.predict_proba(X)) + + def _dense_predict_proba(self, X): + X = self._compute_kernel(X) + + kernel = self.kernel + if callable(kernel): + kernel = "precomputed" + + svm_type = LIBSVM_IMPL.index(self._impl) + pprob = libsvm.predict_proba( + X, + self.support_, + self.support_vectors_, + self._n_support, + self._dual_coef_, + self._intercept_, + self._probA, + self._probB, + svm_type=svm_type, + kernel=kernel, + degree=self.degree, + cache_size=self.cache_size, + coef0=self.coef0, + gamma=self._gamma, + ) + + return pprob + + def _sparse_predict_proba(self, X): + X.data = np.asarray(X.data, dtype=np.float64, order="C") + + kernel = self.kernel + if callable(kernel): + kernel = "precomputed" + + kernel_type = self._sparse_kernels.index(kernel) + + return libsvm_sparse.libsvm_sparse_predict_proba( + X.data, + X.indices, + X.indptr, + self.support_vectors_.data, + self.support_vectors_.indices, + self.support_vectors_.indptr, + self._dual_coef_.data, + self._intercept_, + LIBSVM_IMPL.index(self._impl), + kernel_type, + self.degree, + self._gamma, + self.coef0, + self.tol, + self.C, + getattr(self, "class_weight_", np.empty(0)), + self.nu, + self.epsilon, + self.shrinking, + self.probability, + self._n_support, + self._probA, + self._probB, + ) + + def _get_coef(self): + if self.dual_coef_.shape[0] == 1: + # binary classifier + coef = safe_sparse_dot(self.dual_coef_, self.support_vectors_) + else: + # 1vs1 classifier + coef = _one_vs_one_coef( + self.dual_coef_, self._n_support, self.support_vectors_ + ) + if sp.issparse(coef[0]): + coef = sp.vstack(coef).tocsr() + else: + coef = np.vstack(coef) + + return coef + + @property + def probA_(self): + """Parameter learned in Platt scaling when `probability=True`. + + Returns + ------- + ndarray of shape (n_classes * (n_classes - 1) / 2) + """ + return self._probA + + @property + def probB_(self): + """Parameter learned in Platt scaling when `probability=True`. + + Returns + ------- + ndarray of shape (n_classes * (n_classes - 1) / 2) + """ + return self._probB + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = self.kernel != "precomputed" + return tags + + +def _get_liblinear_solver_type(multi_class, penalty, loss, dual): + """Find the liblinear magic number for the solver. + + This number depends on the values of the following attributes: + - multi_class + - penalty + - loss + - dual + + The same number is also internally used by LibLinear to determine + which solver to use. 
+ """ + # nested dicts containing level 1: available loss functions, + # level2: available penalties for the given loss function, + # level3: whether the dual solver is available for the specified + # combination of loss function and penalty + _solver_type_dict = { + "logistic_regression": {"l1": {False: 6}, "l2": {False: 0, True: 7}}, + "hinge": {"l2": {True: 3}}, + "squared_hinge": {"l1": {False: 5}, "l2": {False: 2, True: 1}}, + "epsilon_insensitive": {"l2": {True: 13}}, + "squared_epsilon_insensitive": {"l2": {False: 11, True: 12}}, + "crammer_singer": 4, + } + + if multi_class == "crammer_singer": + return _solver_type_dict[multi_class] + elif multi_class != "ovr": + raise ValueError( + "`multi_class` must be one of `ovr`, `crammer_singer`, got %r" % multi_class + ) + + _solver_pen = _solver_type_dict.get(loss, None) + if _solver_pen is None: + error_string = "loss='%s' is not supported" % loss + else: + _solver_dual = _solver_pen.get(penalty, None) + if _solver_dual is None: + error_string = ( + "The combination of penalty='%s' and loss='%s' is not supported" + % (penalty, loss) + ) + else: + solver_num = _solver_dual.get(dual, None) + if solver_num is None: + error_string = ( + "The combination of penalty='%s' and " + "loss='%s' are not supported when dual=%s" % (penalty, loss, dual) + ) + else: + return solver_num + raise ValueError( + "Unsupported set of arguments: %s, Parameters: penalty=%r, loss=%r, dual=%r" + % (error_string, penalty, loss, dual) + ) + + +def _fit_liblinear( + X, + y, + C, + fit_intercept, + intercept_scaling, + class_weight, + penalty, + dual, + verbose, + max_iter, + tol, + random_state=None, + multi_class="ovr", + loss="logistic_regression", + epsilon=0.1, + sample_weight=None, +): + """Used by Logistic Regression (and CV) and LinearSVC/LinearSVR. + + Preprocessing is done in this function before supplying it to liblinear. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) + Target vector relative to X + + C : float + Inverse of cross-validation parameter. The lower the C, the higher + the penalization. + + fit_intercept : bool + Whether or not to fit an intercept. If set to True, the feature vector + is extended to include an intercept term: ``[x_1, ..., x_n, 1]``, where + 1 corresponds to the intercept. If set to False, no intercept will be + used in calculations (i.e. data is expected to be already centered). + + intercept_scaling : float + Liblinear internally penalizes the intercept, treating it like any + other term in the feature vector. To reduce the impact of the + regularization on the intercept, the `intercept_scaling` parameter can + be set to a value greater than 1; the higher the value of + `intercept_scaling`, the lower the impact of regularization on it. + Then, the weights become `[w_x_1, ..., w_x_n, + w_intercept*intercept_scaling]`, where `w_x_1, ..., w_x_n` represent + the feature weights and the intercept weight is scaled by + `intercept_scaling`. This scaling allows the intercept term to have a + different regularization behavior compared to the other features. + + class_weight : dict or 'balanced', default=None + Weights associated with classes in the form ``{class_label: weight}``. + If not given, all classes are supposed to have weight one. For + multi-output problems, a list of dicts can be provided in the same + order as the columns of y. 
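A sketch (not part of the patch) of how the solver table above surfaces in `LinearSVC`: unsupported penalty/loss combinations raise a ValueError, e.g. the l1 penalty is only available together with the squared hinge loss.

from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=100, random_state=0)
LinearSVC(penalty="l1", loss="squared_hinge", dual=False).fit(X, y)  # supported (solver 5 in the table)
try:
    LinearSVC(penalty="l1", loss="hinge").fit(X, y)                  # no entry in the table
except ValueError as exc:
    print(exc)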
+ + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))`` + + penalty : {'l1', 'l2'} + The norm of the penalty used in regularization. + + dual : bool + Dual or primal formulation, + + verbose : int + Set verbose to any positive number for verbosity. + + max_iter : int + Number of iterations. + + tol : float + Stopping condition. + + random_state : int, RandomState instance or None, default=None + Controls the pseudo random number generation for shuffling the data. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + multi_class : {'ovr', 'crammer_singer'}, default='ovr' + `ovr` trains n_classes one-vs-rest classifiers, while `crammer_singer` + optimizes a joint objective over all classes. + While `crammer_singer` is interesting from an theoretical perspective + as it is consistent it is seldom used in practice and rarely leads to + better accuracy and is more expensive to compute. + If `crammer_singer` is chosen, the options loss, penalty and dual will + be ignored. + + loss : {'logistic_regression', 'hinge', 'squared_hinge', \ + 'epsilon_insensitive', 'squared_epsilon_insensitive}, \ + default='logistic_regression' + The loss function used to fit the model. + + epsilon : float, default=0.1 + Epsilon parameter in the epsilon-insensitive loss function. Note + that the value of this parameter depends on the scale of the target + variable y. If unsure, set epsilon=0. + + sample_weight : array-like of shape (n_samples,), default=None + Weights assigned to each sample. + + Returns + ------- + coef_ : ndarray of shape (n_features, n_features + 1) + The coefficient vector got by minimizing the objective function. + + intercept_ : float + The intercept term added to the vector. + + n_iter_ : array of int + Number of iterations run across for each class. + """ + if loss not in ["epsilon_insensitive", "squared_epsilon_insensitive"]: + enc = LabelEncoder() + y_ind = enc.fit_transform(y) + classes_ = enc.classes_ + if len(classes_) < 2: + raise ValueError( + "This solver needs samples of at least 2 classes" + " in the data, but the data contains only one" + " class: %r" % classes_[0] + ) + class_weight_ = compute_class_weight( + class_weight, classes=classes_, y=y, sample_weight=sample_weight + ) + else: + class_weight_ = np.empty(0, dtype=np.float64) + y_ind = y + liblinear.set_verbosity_wrap(verbose) + rnd = check_random_state(random_state) + if verbose: + print("[LibLinear]", end="") + + # LinearSVC breaks when intercept_scaling is <= 0 + bias = -1.0 + if fit_intercept: + if intercept_scaling <= 0: + raise ValueError( + "Intercept scaling is %r but needs to be greater " + "than 0. To disable fitting an intercept," + " set fit_intercept=False." 
% intercept_scaling + ) + else: + bias = intercept_scaling + + libsvm.set_verbosity_wrap(verbose) + libsvm_sparse.set_verbosity_wrap(verbose) + liblinear.set_verbosity_wrap(verbose) + + # Liblinear doesn't support 64bit sparse matrix indices yet + if sp.issparse(X): + _check_large_sparse(X) + + # LibLinear wants targets as doubles, even for classification + y_ind = np.asarray(y_ind, dtype=np.float64).ravel() + y_ind = np.require(y_ind, requirements="W") + + sample_weight = _check_sample_weight(sample_weight, X, dtype=np.float64) + + solver_type = _get_liblinear_solver_type(multi_class, penalty, loss, dual) + raw_coef_, n_iter_ = liblinear.train_wrap( + X, + y_ind, + sp.issparse(X), + solver_type, + tol, + bias, + C, + class_weight_, + max_iter, + rnd.randint(np.iinfo("i").max), + epsilon, + sample_weight, + ) + # Regarding rnd.randint(..) in the above signature: + # seed for srand in range [0..INT_MAX); due to limitations in Numpy + # on 32-bit platforms, we can't get to the UINT_MAX limit that + # srand supports + n_iter_max = max(n_iter_) + if n_iter_max >= max_iter: + warnings.warn( + "Liblinear failed to converge, increase the number of iterations.", + ConvergenceWarning, + ) + + if fit_intercept: + coef_ = raw_coef_[:, :-1] + intercept_ = intercept_scaling * raw_coef_[:, -1] + else: + coef_ = raw_coef_ + intercept_ = 0.0 + + return coef_, intercept_, n_iter_ diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/_bounds.py b/.venv/lib/python3.12/site-packages/sklearn/svm/_bounds.py new file mode 100644 index 0000000000000000000000000000000000000000..44923cb12976776507a9dc02502424832158391c --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/_bounds.py @@ -0,0 +1,98 @@ +"""Determination of parameter bounds""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from numbers import Real + +import numpy as np + +from ..preprocessing import LabelBinarizer +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.extmath import safe_sparse_dot +from ..utils.validation import check_array, check_consistent_length + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "y": ["array-like"], + "loss": [StrOptions({"squared_hinge", "log"})], + "fit_intercept": ["boolean"], + "intercept_scaling": [Interval(Real, 0, None, closed="neither")], + }, + prefer_skip_nested_validation=True, +) +def l1_min_c(X, y, *, loss="squared_hinge", fit_intercept=True, intercept_scaling=1.0): + """Return the lowest bound for `C`. + + The lower bound for `C` is computed such that for `C` in `(l1_min_C, infinity)` + the model is guaranteed not to be empty. This applies to l1 penalized + classifiers, such as :class:`sklearn.svm.LinearSVC` with penalty='l1' and + :class:`sklearn.linear_model.LogisticRegression` with penalty='l1'. + + This value is valid if `class_weight` parameter in `fit()` is not set. + + For an example of how to use this function, see + :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_path.py`. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) + Target vector relative to X. + + loss : {'squared_hinge', 'log'}, default='squared_hinge' + Specifies the loss function. + With 'squared_hinge' it is the squared hinge loss (a.k.a. L2 loss). + With 'log' it is the loss of logistic regression models. 
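A sketch (not part of the patch, assuming scikit-learn >= 1.3 for dual="auto") of the convergence check right after `liblinear.train_wrap` above: exhausting the iteration budget emits a ConvergenceWarning.

import warnings
from sklearn.datasets import make_classification
from sklearn.exceptions import ConvergenceWarning
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=200, random_state=0)
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    LinearSVC(max_iter=1, dual="auto").fit(X, y)   # deliberately far too few iterations
print(any(issubclass(w.category, ConvergenceWarning) for w in caught))  # True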
+ + fit_intercept : bool, default=True + Specifies if the intercept should be fitted by the model. + It must match the fit() method parameter. + + intercept_scaling : float, default=1.0 + When fit_intercept is True, instance vector x becomes + [x, intercept_scaling], + i.e. a "synthetic" feature with constant value equals to + intercept_scaling is appended to the instance vector. + It must match the fit() method parameter. + + Returns + ------- + l1_min_c : float + Minimum value for C. + + Examples + -------- + >>> from sklearn.svm import l1_min_c + >>> from sklearn.datasets import make_classification + >>> X, y = make_classification(n_samples=100, n_features=20, random_state=42) + >>> print(f"{l1_min_c(X, y, loss='squared_hinge', fit_intercept=True):.4f}") + 0.0044 + """ + + X = check_array(X, accept_sparse="csc") + check_consistent_length(X, y) + + Y = LabelBinarizer(neg_label=-1).fit_transform(y).T + # maximum absolute value over classes and features + den = np.max(np.abs(safe_sparse_dot(Y, X))) + if fit_intercept: + bias = np.full( + (np.size(y), 1), intercept_scaling, dtype=np.array(intercept_scaling).dtype + ) + den = max(den, abs(np.dot(Y, bias)).max()) + + if den == 0.0: + raise ValueError( + "Ill-posed l1_min_c calculation: l1 will always " + "select zero coefficients for this data" + ) + if loss == "squared_hinge": + return 0.5 / den + else: # loss == 'log': + return 2.0 / den diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/_classes.py b/.venv/lib/python3.12/site-packages/sklearn/svm/_classes.py new file mode 100644 index 0000000000000000000000000000000000000000..277da42893eaff6737f32fea006e719a2f00e4d0 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/_classes.py @@ -0,0 +1,1789 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from numbers import Integral, Real + +import numpy as np + +from ..base import BaseEstimator, OutlierMixin, RegressorMixin, _fit_context +from ..linear_model._base import LinearClassifierMixin, LinearModel, SparseCoefMixin +from ..utils._param_validation import Interval, StrOptions +from ..utils.multiclass import check_classification_targets +from ..utils.validation import _num_samples, validate_data +from ._base import BaseLibSVM, BaseSVC, _fit_liblinear, _get_liblinear_solver_type + + +def _validate_dual_parameter(dual, loss, penalty, multi_class, X): + """Helper function to assign the value of dual parameter.""" + if dual == "auto": + if X.shape[0] < X.shape[1]: + try: + _get_liblinear_solver_type(multi_class, penalty, loss, True) + return True + except ValueError: # dual not supported for the combination + return False + else: + try: + _get_liblinear_solver_type(multi_class, penalty, loss, False) + return False + except ValueError: # primal not supported by the combination + return True + else: + return dual + + +class LinearSVC(LinearClassifierMixin, SparseCoefMixin, BaseEstimator): + """Linear Support Vector Classification. + + Similar to SVC with parameter kernel='linear', but implemented in terms of + liblinear rather than libsvm, so it has more flexibility in the choice of + penalties and loss functions and should scale better to large numbers of + samples. + + The main differences between :class:`~sklearn.svm.LinearSVC` and + :class:`~sklearn.svm.SVC` lie in the loss function used by default, and in + the handling of intercept regularization between those two implementations. 
+ + This class supports both dense and sparse input and the multiclass support + is handled according to a one-vs-the-rest scheme. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + penalty : {'l1', 'l2'}, default='l2' + Specifies the norm used in the penalization. The 'l2' + penalty is the standard used in SVC. The 'l1' leads to ``coef_`` + vectors that are sparse. + + loss : {'hinge', 'squared_hinge'}, default='squared_hinge' + Specifies the loss function. 'hinge' is the standard SVM loss + (used e.g. by the SVC class) while 'squared_hinge' is the + square of the hinge loss. The combination of ``penalty='l1'`` + and ``loss='hinge'`` is not supported. + + dual : "auto" or bool, default="auto" + Select the algorithm to either solve the dual or primal + optimization problem. Prefer dual=False when n_samples > n_features. + `dual="auto"` will choose the value of the parameter automatically, + based on the values of `n_samples`, `n_features`, `loss`, `multi_class` + and `penalty`. If `n_samples` < `n_features` and optimizer supports + chosen `loss`, `multi_class` and `penalty`, then dual will be set to True, + otherwise it will be set to False. + + .. versionchanged:: 1.3 + The `"auto"` option is added in version 1.3 and will be the default + in version 1.5. + + tol : float, default=1e-4 + Tolerance for stopping criteria. + + C : float, default=1.0 + Regularization parameter. The strength of the regularization is + inversely proportional to C. Must be strictly positive. + For an intuitive visualization of the effects of scaling + the regularization parameter C, see + :ref:`sphx_glr_auto_examples_svm_plot_svm_scale_c.py`. + + multi_class : {'ovr', 'crammer_singer'}, default='ovr' + Determines the multi-class strategy if `y` contains more than + two classes. + ``"ovr"`` trains n_classes one-vs-rest classifiers, while + ``"crammer_singer"`` optimizes a joint objective over all classes. + While `crammer_singer` is interesting from a theoretical perspective + as it is consistent, it is seldom used in practice as it rarely leads + to better accuracy and is more expensive to compute. + If ``"crammer_singer"`` is chosen, the options loss, penalty and dual + will be ignored. + + fit_intercept : bool, default=True + Whether or not to fit an intercept. If set to True, the feature vector + is extended to include an intercept term: `[x_1, ..., x_n, 1]`, where + 1 corresponds to the intercept. If set to False, no intercept will be + used in calculations (i.e. data is expected to be already centered). + + intercept_scaling : float, default=1.0 + When `fit_intercept` is True, the instance vector x becomes ``[x_1, + ..., x_n, intercept_scaling]``, i.e. a "synthetic" feature with a + constant value equal to `intercept_scaling` is appended to the instance + vector. The intercept becomes intercept_scaling * synthetic feature + weight. Note that liblinear internally penalizes the intercept, + treating it like any other term in the feature vector. To reduce the + impact of the regularization on the intercept, the `intercept_scaling` + parameter can be set to a value greater than 1; the higher the value of + `intercept_scaling`, the lower the impact of regularization on it. + Then, the weights become `[w_x_1, ..., w_x_n, + w_intercept*intercept_scaling]`, where `w_x_1, ..., w_x_n` represent + the feature weights and the intercept weight is scaled by + `intercept_scaling`. This scaling allows the intercept term to have a + different regularization behavior compared to the other features. 
+ + class_weight : dict or 'balanced', default=None + Set the parameter C of class i to ``class_weight[i]*C`` for + SVC. If not given, all classes are supposed to have + weight one. + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))``. + + verbose : int, default=0 + Enable verbose output. Note that this setting takes advantage of a + per-process runtime setting in liblinear that, if enabled, may not work + properly in a multithreaded context. + + random_state : int, RandomState instance or None, default=None + Controls the pseudo random number generation for shuffling the data for + the dual coordinate descent (if ``dual=True``). When ``dual=False`` the + underlying implementation of :class:`LinearSVC` is not random and + ``random_state`` has no effect on the results. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + max_iter : int, default=1000 + The maximum number of iterations to be run. + + Attributes + ---------- + coef_ : ndarray of shape (1, n_features) if n_classes == 2 \ + else (n_classes, n_features) + Weights assigned to the features (coefficients in the primal + problem). + + ``coef_`` is a readonly property derived from ``raw_coef_`` that + follows the internal memory layout of liblinear. + + intercept_ : ndarray of shape (1,) if n_classes == 2 else (n_classes,) + Constants in decision function. + + classes_ : ndarray of shape (n_classes,) + The unique classes labels. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + Maximum number of iterations run across all classes. + + See Also + -------- + SVC : Implementation of Support Vector Machine classifier using libsvm: + the kernel can be non-linear but its SMO algorithm does not + scale to large number of samples as LinearSVC does. + + Furthermore SVC multi-class mode is implemented using one + vs one scheme while LinearSVC uses one vs the rest. It is + possible to implement one vs the rest with SVC by using the + :class:`~sklearn.multiclass.OneVsRestClassifier` wrapper. + + Finally SVC can fit dense data without memory copy if the input + is C-contiguous. Sparse data will still incur memory copy though. + + sklearn.linear_model.SGDClassifier : SGDClassifier can optimize the same + cost function as LinearSVC + by adjusting the penalty and loss parameters. In addition it requires + less memory, allows incremental (online) learning, and implements + various loss functions and regularization regimes. + + Notes + ----- + The underlying C implementation uses a random number generator to + select features when fitting the model. It is thus not uncommon + to have slightly different results for the same input data. If + that happens, try with a smaller ``tol`` parameter. + + The underlying implementation, liblinear, uses a sparse internal + representation for the data that will incur a memory copy. + + Predict output may not match that of standalone liblinear in certain + cases. See :ref:`differences from liblinear ` + in the narrative documentation. 
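A sketch (not part of the patch) of the multiclass contrast called out in the See Also section above: `LinearSVC` fits one-vs-rest weight vectors, while `SVC(kernel="linear")` fits one-vs-one pairs, which is directly visible in the fitted shapes.

from sklearn.datasets import make_classification
from sklearn.svm import SVC, LinearSVC

X, y = make_classification(n_samples=200, n_features=8, n_informative=6,
                           n_classes=4, random_state=0)
print(LinearSVC(dual="auto").fit(X, y).coef_.shape)  # (4, 8): one row per class (OvR)
print(SVC(kernel="linear").fit(X, y).coef_.shape)    # (6, 8): one row per class pair (OvO)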
+ + References + ---------- + `LIBLINEAR: A Library for Large Linear Classification + `__ + + Examples + -------- + >>> from sklearn.svm import LinearSVC + >>> from sklearn.pipeline import make_pipeline + >>> from sklearn.preprocessing import StandardScaler + >>> from sklearn.datasets import make_classification + >>> X, y = make_classification(n_features=4, random_state=0) + >>> clf = make_pipeline(StandardScaler(), + ... LinearSVC(random_state=0, tol=1e-5)) + >>> clf.fit(X, y) + Pipeline(steps=[('standardscaler', StandardScaler()), + ('linearsvc', LinearSVC(random_state=0, tol=1e-05))]) + + >>> print(clf.named_steps['linearsvc'].coef_) + [[0.141 0.526 0.679 0.493]] + + >>> print(clf.named_steps['linearsvc'].intercept_) + [0.1693] + >>> print(clf.predict([[0, 0, 0, 0]])) + [1] + """ + + _parameter_constraints: dict = { + "penalty": [StrOptions({"l1", "l2"})], + "loss": [StrOptions({"hinge", "squared_hinge"})], + "dual": ["boolean", StrOptions({"auto"})], + "tol": [Interval(Real, 0.0, None, closed="neither")], + "C": [Interval(Real, 0.0, None, closed="neither")], + "multi_class": [StrOptions({"ovr", "crammer_singer"})], + "fit_intercept": ["boolean"], + "intercept_scaling": [Interval(Real, 0, None, closed="neither")], + "class_weight": [None, dict, StrOptions({"balanced"})], + "verbose": ["verbose"], + "random_state": ["random_state"], + "max_iter": [Interval(Integral, 0, None, closed="left")], + } + + def __init__( + self, + penalty="l2", + loss="squared_hinge", + *, + dual="auto", + tol=1e-4, + C=1.0, + multi_class="ovr", + fit_intercept=True, + intercept_scaling=1, + class_weight=None, + verbose=0, + random_state=None, + max_iter=1000, + ): + self.dual = dual + self.tol = tol + self.C = C + self.multi_class = multi_class + self.fit_intercept = fit_intercept + self.intercept_scaling = intercept_scaling + self.class_weight = class_weight + self.verbose = verbose + self.random_state = random_state + self.max_iter = max_iter + self.penalty = penalty + self.loss = loss + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, sample_weight=None): + """Fit the model according to the given training data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) + Target vector relative to X. + + sample_weight : array-like of shape (n_samples,), default=None + Array of weights that are assigned to individual + samples. If not provided, + then each sample is given unit weight. + + .. versionadded:: 0.18 + + Returns + ------- + self : object + An instance of the estimator. 
+ """ + X, y = validate_data( + self, + X, + y, + accept_sparse="csr", + dtype=np.float64, + order="C", + accept_large_sparse=False, + ) + check_classification_targets(y) + self.classes_ = np.unique(y) + + _dual = _validate_dual_parameter( + self.dual, self.loss, self.penalty, self.multi_class, X + ) + + self.coef_, self.intercept_, n_iter_ = _fit_liblinear( + X, + y, + self.C, + self.fit_intercept, + self.intercept_scaling, + self.class_weight, + self.penalty, + _dual, + self.verbose, + self.max_iter, + self.tol, + self.random_state, + self.multi_class, + self.loss, + sample_weight=sample_weight, + ) + # Backward compatibility: _fit_liblinear is used both by LinearSVC/R + # and LogisticRegression but LogisticRegression sets a structured + # `n_iter_` attribute with information about the underlying OvR fits + # while LinearSVC/R only reports the maximum value. + self.n_iter_ = n_iter_.max().item() + + if self.multi_class == "crammer_singer" and len(self.classes_) == 2: + self.coef_ = (self.coef_[1] - self.coef_[0]).reshape(1, -1) + if self.fit_intercept: + intercept = self.intercept_[1] - self.intercept_[0] + self.intercept_ = np.array([intercept]) + + return self + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + + +class LinearSVR(RegressorMixin, LinearModel): + """Linear Support Vector Regression. + + Similar to SVR with parameter kernel='linear', but implemented in terms of + liblinear rather than libsvm, so it has more flexibility in the choice of + penalties and loss functions and should scale better to large numbers of + samples. + + The main differences between :class:`~sklearn.svm.LinearSVR` and + :class:`~sklearn.svm.SVR` lie in the loss function used by default, and in + the handling of intercept regularization between those two implementations. + + This class supports both dense and sparse input. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.16 + + Parameters + ---------- + epsilon : float, default=0.0 + Epsilon parameter in the epsilon-insensitive loss function. Note + that the value of this parameter depends on the scale of the target + variable y. If unsure, set ``epsilon=0``. + + tol : float, default=1e-4 + Tolerance for stopping criteria. + + C : float, default=1.0 + Regularization parameter. The strength of the regularization is + inversely proportional to C. Must be strictly positive. + + loss : {'epsilon_insensitive', 'squared_epsilon_insensitive'}, \ + default='epsilon_insensitive' + Specifies the loss function. The epsilon-insensitive loss + (standard SVR) is the L1 loss, while the squared epsilon-insensitive + loss ('squared_epsilon_insensitive') is the L2 loss. + + fit_intercept : bool, default=True + Whether or not to fit an intercept. If set to True, the feature vector + is extended to include an intercept term: `[x_1, ..., x_n, 1]`, where + 1 corresponds to the intercept. If set to False, no intercept will be + used in calculations (i.e. data is expected to be already centered). + + intercept_scaling : float, default=1.0 + When `fit_intercept` is True, the instance vector x becomes `[x_1, ..., + x_n, intercept_scaling]`, i.e. a "synthetic" feature with a constant + value equal to `intercept_scaling` is appended to the instance vector. + The intercept becomes intercept_scaling * synthetic feature weight. + Note that liblinear internally penalizes the intercept, treating it + like any other term in the feature vector. 
To reduce the impact of the + regularization on the intercept, the `intercept_scaling` parameter can + be set to a value greater than 1; the higher the value of + `intercept_scaling`, the lower the impact of regularization on it. + Then, the weights become `[w_x_1, ..., w_x_n, + w_intercept*intercept_scaling]`, where `w_x_1, ..., w_x_n` represent + the feature weights and the intercept weight is scaled by + `intercept_scaling`. This scaling allows the intercept term to have a + different regularization behavior compared to the other features. + + dual : "auto" or bool, default="auto" + Select the algorithm to either solve the dual or primal + optimization problem. Prefer dual=False when n_samples > n_features. + `dual="auto"` will choose the value of the parameter automatically, + based on the values of `n_samples`, `n_features` and `loss`. If + `n_samples` < `n_features` and optimizer supports chosen `loss`, + then dual will be set to True, otherwise it will be set to False. + + .. versionchanged:: 1.3 + The `"auto"` option is added in version 1.3 and will be the default + in version 1.5. + + verbose : int, default=0 + Enable verbose output. Note that this setting takes advantage of a + per-process runtime setting in liblinear that, if enabled, may not work + properly in a multithreaded context. + + random_state : int, RandomState instance or None, default=None + Controls the pseudo random number generation for shuffling the data. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + max_iter : int, default=1000 + The maximum number of iterations to be run. + + Attributes + ---------- + coef_ : ndarray of shape (n_features) if n_classes == 2 \ + else (n_classes, n_features) + Weights assigned to the features (coefficients in the primal + problem). + + `coef_` is a readonly property derived from `raw_coef_` that + follows the internal memory layout of liblinear. + + intercept_ : ndarray of shape (1) if n_classes == 2 else (n_classes) + Constants in decision function. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + Maximum number of iterations run across all classes. + + See Also + -------- + LinearSVC : Implementation of Support Vector Machine classifier using the + same library as this class (liblinear). + + SVR : Implementation of Support Vector Machine regression using libsvm: + the kernel can be non-linear but its SMO algorithm does not scale to + large number of samples as :class:`~sklearn.svm.LinearSVR` does. + + sklearn.linear_model.SGDRegressor : SGDRegressor can optimize the same cost + function as LinearSVR + by adjusting the penalty and loss parameters. In addition it requires + less memory, allows incremental (online) learning, and implements + various loss functions and regularization regimes. + + Examples + -------- + >>> from sklearn.svm import LinearSVR + >>> from sklearn.pipeline import make_pipeline + >>> from sklearn.preprocessing import StandardScaler + >>> from sklearn.datasets import make_regression + >>> X, y = make_regression(n_features=4, random_state=0) + >>> regr = make_pipeline(StandardScaler(), + ... 
LinearSVR(random_state=0, tol=1e-5)) + >>> regr.fit(X, y) + Pipeline(steps=[('standardscaler', StandardScaler()), + ('linearsvr', LinearSVR(random_state=0, tol=1e-05))]) + + >>> print(regr.named_steps['linearsvr'].coef_) + [18.582 27.023 44.357 64.522] + >>> print(regr.named_steps['linearsvr'].intercept_) + [-4.] + >>> print(regr.predict([[0, 0, 0, 0]])) + [-2.384] + """ + + _parameter_constraints: dict = { + "epsilon": [Real], + "tol": [Interval(Real, 0.0, None, closed="neither")], + "C": [Interval(Real, 0.0, None, closed="neither")], + "loss": [StrOptions({"epsilon_insensitive", "squared_epsilon_insensitive"})], + "fit_intercept": ["boolean"], + "intercept_scaling": [Interval(Real, 0, None, closed="neither")], + "dual": ["boolean", StrOptions({"auto"})], + "verbose": ["verbose"], + "random_state": ["random_state"], + "max_iter": [Interval(Integral, 0, None, closed="left")], + } + + def __init__( + self, + *, + epsilon=0.0, + tol=1e-4, + C=1.0, + loss="epsilon_insensitive", + fit_intercept=True, + intercept_scaling=1.0, + dual="auto", + verbose=0, + random_state=None, + max_iter=1000, + ): + self.tol = tol + self.C = C + self.epsilon = epsilon + self.fit_intercept = fit_intercept + self.intercept_scaling = intercept_scaling + self.verbose = verbose + self.random_state = random_state + self.max_iter = max_iter + self.dual = dual + self.loss = loss + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, sample_weight=None): + """Fit the model according to the given training data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) + Target vector relative to X. + + sample_weight : array-like of shape (n_samples,), default=None + Array of weights that are assigned to individual + samples. If not provided, + then each sample is given unit weight. + + .. versionadded:: 0.18 + + Returns + ------- + self : object + An instance of the estimator. + """ + X, y = validate_data( + self, + X, + y, + accept_sparse="csr", + dtype=np.float64, + order="C", + accept_large_sparse=False, + ) + penalty = "l2" # SVR only accepts l2 penalty + + _dual = _validate_dual_parameter(self.dual, self.loss, penalty, "ovr", X) + + self.coef_, self.intercept_, n_iter_ = _fit_liblinear( + X, + y, + self.C, + self.fit_intercept, + self.intercept_scaling, + None, + penalty, + _dual, + self.verbose, + self.max_iter, + self.tol, + self.random_state, + loss=self.loss, + epsilon=self.epsilon, + sample_weight=sample_weight, + ) + self.coef_ = self.coef_.ravel() + # Backward compatibility: _fit_liblinear is used both by LinearSVC/R + # and LogisticRegression but LogisticRegression sets a structured + # `n_iter_` attribute with information about the underlying OvR fits + # while LinearSVC/R only reports the maximum value. + self.n_iter_ = n_iter_.max().item() + + return self + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + + +class SVC(BaseSVC): + """C-Support Vector Classification. + + The implementation is based on libsvm. The fit time scales at least + quadratically with the number of samples and may be impractical + beyond tens of thousands of samples. 
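# Editor's note -- illustrative sketch, not library code: the `dual="auto"` rule
# documented for LinearSVC/LinearSVR above is resolved internally by
# `_validate_dual_parameter`, whose implementation is not part of this hunk.
# Based only on the documented behaviour ("If n_samples < n_features and optimizer
# supports chosen loss, then dual will be set to True"), a rough stand-in could
# look like this; the names `_choose_dual_sketch` and `DUAL_ONLY_LOSSES` are
# hypothetical.
DUAL_ONLY_LOSSES = {"hinge", "epsilon_insensitive"}  # assumption: losses liblinear only solves in the dual

def _choose_dual_sketch(n_samples, n_features, loss):
    """Approximate the documented dual='auto' decision (illustrative only)."""
    if loss in DUAL_ONLY_LOSSES:
        return True
    return n_samples < n_features

# e.g. _choose_dual_sketch(100_000, 20, "squared_epsilon_insensitive") -> False (prefer the primal)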
For large datasets + consider using :class:`~sklearn.svm.LinearSVC` or + :class:`~sklearn.linear_model.SGDClassifier` instead, possibly after a + :class:`~sklearn.kernel_approximation.Nystroem` transformer or + other :ref:`kernel_approximation`. + + The multiclass support is handled according to a one-vs-one scheme. + + For details on the precise mathematical formulation of the provided + kernel functions and how `gamma`, `coef0` and `degree` affect each + other, see the corresponding section in the narrative documentation: + :ref:`svm_kernels`. + + To learn how to tune SVC's hyperparameters, see the following example: + :ref:`sphx_glr_auto_examples_model_selection_plot_nested_cross_validation_iris.py` + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + C : float, default=1.0 + Regularization parameter. The strength of the regularization is + inversely proportional to C. Must be strictly positive. The penalty + is a squared l2 penalty. For an intuitive visualization of the effects + of scaling the regularization parameter C, see + :ref:`sphx_glr_auto_examples_svm_plot_svm_scale_c.py`. + + kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, \ + default='rbf' + Specifies the kernel type to be used in the algorithm. If + none is given, 'rbf' will be used. If a callable is given it is used to + pre-compute the kernel matrix from data matrices; that matrix should be + an array of shape ``(n_samples, n_samples)``. For an intuitive + visualization of different kernel types see + :ref:`sphx_glr_auto_examples_svm_plot_svm_kernels.py`. + + degree : int, default=3 + Degree of the polynomial kernel function ('poly'). + Must be non-negative. Ignored by all other kernels. + + gamma : {'scale', 'auto'} or float, default='scale' + Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. + + - if ``gamma='scale'`` (default) is passed then it uses + 1 / (n_features * X.var()) as value of gamma, + - if 'auto', uses 1 / n_features + - if float, must be non-negative. + + .. versionchanged:: 0.22 + The default value of ``gamma`` changed from 'auto' to 'scale'. + + coef0 : float, default=0.0 + Independent term in kernel function. + It is only significant in 'poly' and 'sigmoid'. + + shrinking : bool, default=True + Whether to use the shrinking heuristic. + See the :ref:`User Guide `. + + probability : bool, default=False + Whether to enable probability estimates. This must be enabled prior + to calling `fit`, will slow down that method as it internally uses + 5-fold cross-validation, and `predict_proba` may be inconsistent with + `predict`. Read more in the :ref:`User Guide `. + + tol : float, default=1e-3 + Tolerance for stopping criterion. + + cache_size : float, default=200 + Specify the size of the kernel cache (in MB). + + class_weight : dict or 'balanced', default=None + Set the parameter C of class i to class_weight[i]*C for + SVC. If not given, all classes are supposed to have + weight one. + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))``. + + verbose : bool, default=False + Enable verbose output. Note that this setting takes advantage of a + per-process runtime setting in libsvm that, if enabled, may not work + properly in a multithreaded context. + + max_iter : int, default=-1 + Hard limit on iterations within solver, or -1 for no limit. 
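# Editor's note -- worked example for the "balanced" class_weight mode documented
# above: the per-class multipliers of C follow n_samples / (n_classes * np.bincount(y)).
# Plain numpy reproduces the numbers (sklearn.utils.class_weight.compute_class_weight
# gives the same values for integer-encoded labels).
import numpy as np

y = np.array([0, 0, 0, 0, 1, 1])              # 4 samples of class 0, 2 of class 1
weights = y.shape[0] / (2 * np.bincount(y))    # n_samples / (n_classes * counts)
print(weights)                                 # [0.75 1.5 ] -> the minority class gets the larger C multiplier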
+ + decision_function_shape : {'ovo', 'ovr'}, default='ovr' + Whether to return a one-vs-rest ('ovr') decision function of shape + (n_samples, n_classes) as all other classifiers, or the original + one-vs-one ('ovo') decision function of libsvm which has shape + (n_samples, n_classes * (n_classes - 1) / 2). However, note that + internally, one-vs-one ('ovo') is always used as a multi-class strategy + to train models; an ovr matrix is only constructed from the ovo matrix. + The parameter is ignored for binary classification. + + .. versionchanged:: 0.19 + decision_function_shape is 'ovr' by default. + + .. versionadded:: 0.17 + *decision_function_shape='ovr'* is recommended. + + .. versionchanged:: 0.17 + Deprecated *decision_function_shape='ovo' and None*. + + break_ties : bool, default=False + If true, ``decision_function_shape='ovr'``, and number of classes > 2, + :term:`predict` will break ties according to the confidence values of + :term:`decision_function`; otherwise the first class among the tied + classes is returned. Please note that breaking ties comes at a + relatively high computational cost compared to a simple predict. See + :ref:`sphx_glr_auto_examples_svm_plot_svm_tie_breaking.py` for an + example of its usage with ``decision_function_shape='ovr'``. + + .. versionadded:: 0.22 + + random_state : int, RandomState instance or None, default=None + Controls the pseudo random number generation for shuffling the data for + probability estimates. Ignored when `probability` is False. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Attributes + ---------- + class_weight_ : ndarray of shape (n_classes,) + Multipliers of parameter C for each class. + Computed based on the ``class_weight`` parameter. + + classes_ : ndarray of shape (n_classes,) + The classes labels. + + coef_ : ndarray of shape (n_classes * (n_classes - 1) / 2, n_features) + Weights assigned to the features (coefficients in the primal + problem). This is only available in the case of a linear kernel. + + `coef_` is a readonly property derived from `dual_coef_` and + `support_vectors_`. + + dual_coef_ : ndarray of shape (n_classes -1, n_SV) + Dual coefficients of the support vector in the decision + function (see :ref:`sgd_mathematical_formulation`), multiplied by + their targets. + For multiclass, coefficient for all 1-vs-1 classifiers. + The layout of the coefficients in the multiclass case is somewhat + non-trivial. See the :ref:`multi-class section of the User Guide + ` for details. + + fit_status_ : int + 0 if correctly fitted, 1 otherwise (will raise warning) + + intercept_ : ndarray of shape (n_classes * (n_classes - 1) / 2,) + Constants in decision function. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : ndarray of shape (n_classes * (n_classes - 1) // 2,) + Number of iterations run by the optimization routine to fit the model. + The shape of this attribute depends on the number of models optimized + which in turn depends on the number of classes. + + .. versionadded:: 1.1 + + support_ : ndarray of shape (n_SV) + Indices of support vectors. + + support_vectors_ : ndarray of shape (n_SV, n_features) + Support vectors. An empty array if kernel is precomputed. 
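# Editor's note -- small check of the shapes documented for decision_function_shape
# above: the one-vs-one ('ovo') decision function has n_classes * (n_classes - 1) / 2
# columns, while 'ovr' has n_classes columns. A 4-class toy problem makes the
# difference visible (6 vs 4 columns).
import numpy as np
from sklearn.svm import SVC

rng = np.random.RandomState(0)
X = rng.randn(40, 2)
y = np.repeat([0, 1, 2, 3], 10)

print(SVC(decision_function_shape="ovr").fit(X, y).decision_function(X[:2]).shape)  # (2, 4)
print(SVC(decision_function_shape="ovo").fit(X, y).decision_function(X[:2]).shape)  # (2, 6)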
+ + n_support_ : ndarray of shape (n_classes,), dtype=int32 + Number of support vectors for each class. + + probA_ : ndarray of shape (n_classes * (n_classes - 1) / 2) + probB_ : ndarray of shape (n_classes * (n_classes - 1) / 2) + If `probability=True`, it corresponds to the parameters learned in + Platt scaling to produce probability estimates from decision values. + If `probability=False`, it's an empty array. Platt scaling uses the + logistic function + ``1 / (1 + exp(decision_value * probA_ + probB_))`` + where ``probA_`` and ``probB_`` are learned from the dataset [2]_. For + more information on the multiclass case and training procedure see + section 8 of [1]_. + + shape_fit_ : tuple of int of shape (n_dimensions_of_X,) + Array dimensions of training vector ``X``. + + See Also + -------- + SVR : Support Vector Machine for Regression implemented using libsvm. + + LinearSVC : Scalable Linear Support Vector Machine for classification + implemented using liblinear. Check the See Also section of + LinearSVC for more comparison element. + + References + ---------- + .. [1] `LIBSVM: A Library for Support Vector Machines + `_ + + .. [2] `Platt, John (1999). "Probabilistic Outputs for Support Vector + Machines and Comparisons to Regularized Likelihood Methods" + `_ + + Examples + -------- + >>> import numpy as np + >>> from sklearn.pipeline import make_pipeline + >>> from sklearn.preprocessing import StandardScaler + >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]]) + >>> y = np.array([1, 1, 2, 2]) + >>> from sklearn.svm import SVC + >>> clf = make_pipeline(StandardScaler(), SVC(gamma='auto')) + >>> clf.fit(X, y) + Pipeline(steps=[('standardscaler', StandardScaler()), + ('svc', SVC(gamma='auto'))]) + + >>> print(clf.predict([[-0.8, -1]])) + [1] + + For a comparison of the SVC with other classifiers see: + :ref:`sphx_glr_auto_examples_classification_plot_classification_probability.py`. + """ + + _impl = "c_svc" + + def __init__( + self, + *, + C=1.0, + kernel="rbf", + degree=3, + gamma="scale", + coef0=0.0, + shrinking=True, + probability=False, + tol=1e-3, + cache_size=200, + class_weight=None, + verbose=False, + max_iter=-1, + decision_function_shape="ovr", + break_ties=False, + random_state=None, + ): + super().__init__( + kernel=kernel, + degree=degree, + gamma=gamma, + coef0=coef0, + tol=tol, + C=C, + nu=0.0, + shrinking=shrinking, + probability=probability, + cache_size=cache_size, + class_weight=class_weight, + verbose=verbose, + max_iter=max_iter, + decision_function_shape=decision_function_shape, + break_ties=break_ties, + random_state=random_state, + ) + + +class NuSVC(BaseSVC): + """Nu-Support Vector Classification. + + Similar to SVC but uses a parameter to control the number of support + vectors. + + The implementation is based on libsvm. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + nu : float, default=0.5 + An upper bound on the fraction of margin errors (see :ref:`User Guide + `) and a lower bound of the fraction of support vectors. + Should be in the interval (0, 1]. + + kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, \ + default='rbf' + Specifies the kernel type to be used in the algorithm. + If none is given, 'rbf' will be used. If a callable is given it is + used to precompute the kernel matrix. For an intuitive + visualization of different kernel types see + :ref:`sphx_glr_auto_examples_svm_plot_svm_kernels.py`. + + degree : int, default=3 + Degree of the polynomial kernel function ('poly'). + Must be non-negative. 
Ignored by all other kernels. + + gamma : {'scale', 'auto'} or float, default='scale' + Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. + + - if ``gamma='scale'`` (default) is passed then it uses + 1 / (n_features * X.var()) as value of gamma, + - if 'auto', uses 1 / n_features + - if float, must be non-negative. + + .. versionchanged:: 0.22 + The default value of ``gamma`` changed from 'auto' to 'scale'. + + coef0 : float, default=0.0 + Independent term in kernel function. + It is only significant in 'poly' and 'sigmoid'. + + shrinking : bool, default=True + Whether to use the shrinking heuristic. + See the :ref:`User Guide `. + + probability : bool, default=False + Whether to enable probability estimates. This must be enabled prior + to calling `fit`, will slow down that method as it internally uses + 5-fold cross-validation, and `predict_proba` may be inconsistent with + `predict`. Read more in the :ref:`User Guide `. + + tol : float, default=1e-3 + Tolerance for stopping criterion. + + cache_size : float, default=200 + Specify the size of the kernel cache (in MB). + + class_weight : {dict, 'balanced'}, default=None + Set the parameter C of class i to class_weight[i]*C for + SVC. If not given, all classes are supposed to have + weight one. The "balanced" mode uses the values of y to automatically + adjust weights inversely proportional to class frequencies as + ``n_samples / (n_classes * np.bincount(y))``. + + verbose : bool, default=False + Enable verbose output. Note that this setting takes advantage of a + per-process runtime setting in libsvm that, if enabled, may not work + properly in a multithreaded context. + + max_iter : int, default=-1 + Hard limit on iterations within solver, or -1 for no limit. + + decision_function_shape : {'ovo', 'ovr'}, default='ovr' + Whether to return a one-vs-rest ('ovr') decision function of shape + (n_samples, n_classes) as all other classifiers, or the original + one-vs-one ('ovo') decision function of libsvm which has shape + (n_samples, n_classes * (n_classes - 1) / 2). However, one-vs-one + ('ovo') is always used as multi-class strategy. The parameter is + ignored for binary classification. + + .. versionchanged:: 0.19 + decision_function_shape is 'ovr' by default. + + .. versionadded:: 0.17 + *decision_function_shape='ovr'* is recommended. + + .. versionchanged:: 0.17 + Deprecated *decision_function_shape='ovo' and None*. + + break_ties : bool, default=False + If true, ``decision_function_shape='ovr'``, and number of classes > 2, + :term:`predict` will break ties according to the confidence values of + :term:`decision_function`; otherwise the first class among the tied + classes is returned. Please note that breaking ties comes at a + relatively high computational cost compared to a simple predict. + See :ref:`sphx_glr_auto_examples_svm_plot_svm_tie_breaking.py` for an + example of its usage with ``decision_function_shape='ovr'``. + + .. versionadded:: 0.22 + + random_state : int, RandomState instance or None, default=None + Controls the pseudo random number generation for shuffling the data for + probability estimates. Ignored when `probability` is False. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Attributes + ---------- + class_weight_ : ndarray of shape (n_classes,) + Multipliers of parameter C of each class. + Computed based on the ``class_weight`` parameter. + + classes_ : ndarray of shape (n_classes,) + The unique classes labels. 
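# Editor's note -- illustrative check of the bound documented for `nu` above
# (an upper bound on the fraction of margin errors and a lower bound on the
# fraction of support vectors): on a toy two-blob problem the total number of
# support vectors should be at least about nu * n_samples.
import numpy as np
from sklearn.svm import NuSVC

rng = np.random.RandomState(0)
X = np.vstack([rng.randn(50, 2) - 2, rng.randn(50, 2) + 2])
y = np.repeat([0, 1], 50)

clf = NuSVC(nu=0.3).fit(X, y)
print(clf.n_support_.sum() / X.shape[0])   # expected to be >= ~0.3, up to solver tolerance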
+ + coef_ : ndarray of shape (n_classes * (n_classes -1) / 2, n_features) + Weights assigned to the features (coefficients in the primal + problem). This is only available in the case of a linear kernel. + + `coef_` is readonly property derived from `dual_coef_` and + `support_vectors_`. + + dual_coef_ : ndarray of shape (n_classes - 1, n_SV) + Dual coefficients of the support vector in the decision + function (see :ref:`sgd_mathematical_formulation`), multiplied by + their targets. + For multiclass, coefficient for all 1-vs-1 classifiers. + The layout of the coefficients in the multiclass case is somewhat + non-trivial. See the :ref:`multi-class section of the User Guide + ` for details. + + fit_status_ : int + 0 if correctly fitted, 1 if the algorithm did not converge. + + intercept_ : ndarray of shape (n_classes * (n_classes - 1) / 2,) + Constants in decision function. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : ndarray of shape (n_classes * (n_classes - 1) // 2,) + Number of iterations run by the optimization routine to fit the model. + The shape of this attribute depends on the number of models optimized + which in turn depends on the number of classes. + + .. versionadded:: 1.1 + + support_ : ndarray of shape (n_SV,) + Indices of support vectors. + + support_vectors_ : ndarray of shape (n_SV, n_features) + Support vectors. + + n_support_ : ndarray of shape (n_classes,), dtype=int32 + Number of support vectors for each class. + + fit_status_ : int + 0 if correctly fitted, 1 if the algorithm did not converge. + + probA_ : ndarray of shape (n_classes * (n_classes - 1) / 2,) + + probB_ : ndarray of shape (n_classes * (n_classes - 1) / 2,) + If `probability=True`, it corresponds to the parameters learned in + Platt scaling to produce probability estimates from decision values. + If `probability=False`, it's an empty array. Platt scaling uses the + logistic function + ``1 / (1 + exp(decision_value * probA_ + probB_))`` + where ``probA_`` and ``probB_`` are learned from the dataset [2]_. For + more information on the multiclass case and training procedure see + section 8 of [1]_. + + shape_fit_ : tuple of int of shape (n_dimensions_of_X,) + Array dimensions of training vector ``X``. + + See Also + -------- + SVC : Support Vector Machine for classification using libsvm. + + LinearSVC : Scalable linear Support Vector Machine for classification using + liblinear. + + References + ---------- + .. [1] `LIBSVM: A Library for Support Vector Machines + `_ + + .. [2] `Platt, John (1999). 
"Probabilistic Outputs for Support Vector + Machines and Comparisons to Regularized Likelihood Methods" + `_ + + Examples + -------- + >>> import numpy as np + >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]]) + >>> y = np.array([1, 1, 2, 2]) + >>> from sklearn.pipeline import make_pipeline + >>> from sklearn.preprocessing import StandardScaler + >>> from sklearn.svm import NuSVC + >>> clf = make_pipeline(StandardScaler(), NuSVC()) + >>> clf.fit(X, y) + Pipeline(steps=[('standardscaler', StandardScaler()), ('nusvc', NuSVC())]) + >>> print(clf.predict([[-0.8, -1]])) + [1] + """ + + _impl = "nu_svc" + + _parameter_constraints: dict = { + **BaseSVC._parameter_constraints, + "nu": [Interval(Real, 0.0, 1.0, closed="right")], + } + _parameter_constraints.pop("C") + + def __init__( + self, + *, + nu=0.5, + kernel="rbf", + degree=3, + gamma="scale", + coef0=0.0, + shrinking=True, + probability=False, + tol=1e-3, + cache_size=200, + class_weight=None, + verbose=False, + max_iter=-1, + decision_function_shape="ovr", + break_ties=False, + random_state=None, + ): + super().__init__( + kernel=kernel, + degree=degree, + gamma=gamma, + coef0=coef0, + tol=tol, + C=0.0, + nu=nu, + shrinking=shrinking, + probability=probability, + cache_size=cache_size, + class_weight=class_weight, + verbose=verbose, + max_iter=max_iter, + decision_function_shape=decision_function_shape, + break_ties=break_ties, + random_state=random_state, + ) + + +class SVR(RegressorMixin, BaseLibSVM): + """Epsilon-Support Vector Regression. + + The free parameters in the model are C and epsilon. + + The implementation is based on libsvm. The fit time complexity + is more than quadratic with the number of samples which makes it hard + to scale to datasets with more than a couple of 10000 samples. For large + datasets consider using :class:`~sklearn.svm.LinearSVR` or + :class:`~sklearn.linear_model.SGDRegressor` instead, possibly after a + :class:`~sklearn.kernel_approximation.Nystroem` transformer or + other :ref:`kernel_approximation`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, \ + default='rbf' + Specifies the kernel type to be used in the algorithm. + If none is given, 'rbf' will be used. If a callable is given it is + used to precompute the kernel matrix. + For an intuitive visualization of different kernel types + see :ref:`sphx_glr_auto_examples_svm_plot_svm_regression.py` + + degree : int, default=3 + Degree of the polynomial kernel function ('poly'). + Must be non-negative. Ignored by all other kernels. + + gamma : {'scale', 'auto'} or float, default='scale' + Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. + + - if ``gamma='scale'`` (default) is passed then it uses + 1 / (n_features * X.var()) as value of gamma, + - if 'auto', uses 1 / n_features + - if float, must be non-negative. + + .. versionchanged:: 0.22 + The default value of ``gamma`` changed from 'auto' to 'scale'. + + coef0 : float, default=0.0 + Independent term in kernel function. + It is only significant in 'poly' and 'sigmoid'. + + tol : float, default=1e-3 + Tolerance for stopping criterion. + + C : float, default=1.0 + Regularization parameter. The strength of the regularization is + inversely proportional to C. Must be strictly positive. + The penalty is a squared l2. For an intuitive visualization of the + effects of scaling the regularization parameter C, see + :ref:`sphx_glr_auto_examples_svm_plot_svm_scale_c.py`. 
+ + epsilon : float, default=0.1 + Epsilon in the epsilon-SVR model. It specifies the epsilon-tube + within which no penalty is associated in the training loss function + with points predicted within a distance epsilon from the actual + value. Must be non-negative. + + shrinking : bool, default=True + Whether to use the shrinking heuristic. + See the :ref:`User Guide `. + + cache_size : float, default=200 + Specify the size of the kernel cache (in MB). + + verbose : bool, default=False + Enable verbose output. Note that this setting takes advantage of a + per-process runtime setting in libsvm that, if enabled, may not work + properly in a multithreaded context. + + max_iter : int, default=-1 + Hard limit on iterations within solver, or -1 for no limit. + + Attributes + ---------- + coef_ : ndarray of shape (1, n_features) + Weights assigned to the features (coefficients in the primal + problem). This is only available in the case of a linear kernel. + + `coef_` is readonly property derived from `dual_coef_` and + `support_vectors_`. + + dual_coef_ : ndarray of shape (1, n_SV) + Coefficients of the support vector in the decision function. + + fit_status_ : int + 0 if correctly fitted, 1 otherwise (will raise warning) + + intercept_ : ndarray of shape (1,) + Constants in decision function. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + Number of iterations run by the optimization routine to fit the model. + + .. versionadded:: 1.1 + + n_support_ : ndarray of shape (1,), dtype=int32 + Number of support vectors. + + shape_fit_ : tuple of int of shape (n_dimensions_of_X,) + Array dimensions of training vector ``X``. + + support_ : ndarray of shape (n_SV,) + Indices of support vectors. + + support_vectors_ : ndarray of shape (n_SV, n_features) + Support vectors. + + See Also + -------- + NuSVR : Support Vector Machine for regression implemented using libsvm + using a parameter to control the number of support vectors. + + LinearSVR : Scalable Linear Support Vector Machine for regression + implemented using liblinear. + + References + ---------- + .. [1] `LIBSVM: A Library for Support Vector Machines + `_ + + .. [2] `Platt, John (1999). 
"Probabilistic Outputs for Support Vector + Machines and Comparisons to Regularized Likelihood Methods" + `_ + + Examples + -------- + >>> from sklearn.svm import SVR + >>> from sklearn.pipeline import make_pipeline + >>> from sklearn.preprocessing import StandardScaler + >>> import numpy as np + >>> n_samples, n_features = 10, 5 + >>> rng = np.random.RandomState(0) + >>> y = rng.randn(n_samples) + >>> X = rng.randn(n_samples, n_features) + >>> regr = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2)) + >>> regr.fit(X, y) + Pipeline(steps=[('standardscaler', StandardScaler()), + ('svr', SVR(epsilon=0.2))]) + """ + + _impl = "epsilon_svr" + + _parameter_constraints: dict = {**BaseLibSVM._parameter_constraints} + for unused_param in ["class_weight", "nu", "probability", "random_state"]: + _parameter_constraints.pop(unused_param) + + def __init__( + self, + *, + kernel="rbf", + degree=3, + gamma="scale", + coef0=0.0, + tol=1e-3, + C=1.0, + epsilon=0.1, + shrinking=True, + cache_size=200, + verbose=False, + max_iter=-1, + ): + super().__init__( + kernel=kernel, + degree=degree, + gamma=gamma, + coef0=coef0, + tol=tol, + C=C, + nu=0.0, + epsilon=epsilon, + verbose=verbose, + shrinking=shrinking, + probability=False, + cache_size=cache_size, + class_weight=None, + max_iter=max_iter, + random_state=None, + ) + + +class NuSVR(RegressorMixin, BaseLibSVM): + """Nu Support Vector Regression. + + Similar to NuSVC, for regression, uses a parameter nu to control + the number of support vectors. However, unlike NuSVC, where nu + replaces C, here nu replaces the parameter epsilon of epsilon-SVR. + + The implementation is based on libsvm. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + nu : float, default=0.5 + An upper bound on the fraction of training errors and a lower bound of + the fraction of support vectors. Should be in the interval (0, 1]. By + default 0.5 will be taken. + + C : float, default=1.0 + Penalty parameter C of the error term. For an intuitive visualization + of the effects of scaling the regularization parameter C, see + :ref:`sphx_glr_auto_examples_svm_plot_svm_scale_c.py`. + + kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, \ + default='rbf' + Specifies the kernel type to be used in the algorithm. + If none is given, 'rbf' will be used. If a callable is given it is + used to precompute the kernel matrix. + For an intuitive visualization of different kernel types see + See :ref:`sphx_glr_auto_examples_svm_plot_svm_regression.py` + + degree : int, default=3 + Degree of the polynomial kernel function ('poly'). + Must be non-negative. Ignored by all other kernels. + + gamma : {'scale', 'auto'} or float, default='scale' + Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. + + - if ``gamma='scale'`` (default) is passed then it uses + 1 / (n_features * X.var()) as value of gamma, + - if 'auto', uses 1 / n_features + - if float, must be non-negative. + + .. versionchanged:: 0.22 + The default value of ``gamma`` changed from 'auto' to 'scale'. + + coef0 : float, default=0.0 + Independent term in kernel function. + It is only significant in 'poly' and 'sigmoid'. + + shrinking : bool, default=True + Whether to use the shrinking heuristic. + See the :ref:`User Guide `. + + tol : float, default=1e-3 + Tolerance for stopping criterion. + + cache_size : float, default=200 + Specify the size of the kernel cache (in MB). + + verbose : bool, default=False + Enable verbose output. 
Note that this setting takes advantage of a + per-process runtime setting in libsvm that, if enabled, may not work + properly in a multithreaded context. + + max_iter : int, default=-1 + Hard limit on iterations within solver, or -1 for no limit. + + Attributes + ---------- + coef_ : ndarray of shape (1, n_features) + Weights assigned to the features (coefficients in the primal + problem). This is only available in the case of a linear kernel. + + `coef_` is readonly property derived from `dual_coef_` and + `support_vectors_`. + + dual_coef_ : ndarray of shape (1, n_SV) + Coefficients of the support vector in the decision function. + + fit_status_ : int + 0 if correctly fitted, 1 otherwise (will raise warning) + + intercept_ : ndarray of shape (1,) + Constants in decision function. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + Number of iterations run by the optimization routine to fit the model. + + .. versionadded:: 1.1 + + n_support_ : ndarray of shape (1,), dtype=int32 + Number of support vectors. + + shape_fit_ : tuple of int of shape (n_dimensions_of_X,) + Array dimensions of training vector ``X``. + + support_ : ndarray of shape (n_SV,) + Indices of support vectors. + + support_vectors_ : ndarray of shape (n_SV, n_features) + Support vectors. + + See Also + -------- + NuSVC : Support Vector Machine for classification implemented with libsvm + with a parameter to control the number of support vectors. + + SVR : Epsilon Support Vector Machine for regression implemented with + libsvm. + + References + ---------- + .. [1] `LIBSVM: A Library for Support Vector Machines + `_ + + .. [2] `Platt, John (1999). "Probabilistic Outputs for Support Vector + Machines and Comparisons to Regularized Likelihood Methods" + `_ + + Examples + -------- + >>> from sklearn.svm import NuSVR + >>> from sklearn.pipeline import make_pipeline + >>> from sklearn.preprocessing import StandardScaler + >>> import numpy as np + >>> n_samples, n_features = 10, 5 + >>> np.random.seed(0) + >>> y = np.random.randn(n_samples) + >>> X = np.random.randn(n_samples, n_features) + >>> regr = make_pipeline(StandardScaler(), NuSVR(C=1.0, nu=0.1)) + >>> regr.fit(X, y) + Pipeline(steps=[('standardscaler', StandardScaler()), + ('nusvr', NuSVR(nu=0.1))]) + """ + + _impl = "nu_svr" + + _parameter_constraints: dict = {**BaseLibSVM._parameter_constraints} + for unused_param in ["class_weight", "epsilon", "probability", "random_state"]: + _parameter_constraints.pop(unused_param) + + def __init__( + self, + *, + nu=0.5, + C=1.0, + kernel="rbf", + degree=3, + gamma="scale", + coef0=0.0, + shrinking=True, + tol=1e-3, + cache_size=200, + verbose=False, + max_iter=-1, + ): + super().__init__( + kernel=kernel, + degree=degree, + gamma=gamma, + coef0=coef0, + tol=tol, + C=C, + nu=nu, + epsilon=0.0, + shrinking=shrinking, + probability=False, + cache_size=cache_size, + class_weight=None, + verbose=verbose, + max_iter=max_iter, + random_state=None, + ) + + +class OneClassSVM(OutlierMixin, BaseLibSVM): + """Unsupervised Outlier Detection. + + Estimate the support of a high-dimensional distribution. + + The implementation is based on libsvm. + + Read more in the :ref:`User Guide `. 
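# Editor's note -- illustrative sketch of the NuSVR behaviour described above:
# nu replaces SVR's epsilon and acts as a lower bound on the fraction of support
# vectors, so raising nu should not shrink the support set on a fixed dataset.
import numpy as np
from sklearn.svm import NuSVR

rng = np.random.RandomState(0)
X = rng.randn(200, 3)
y = X @ np.array([1.0, -2.0, 0.5]) + 0.1 * rng.randn(200)

for nu in (0.1, 0.5, 0.9):
    print(nu, NuSVR(nu=nu).fit(X, y).support_.shape[0])   # number of support vectors grows with nu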
+ + Parameters + ---------- + kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, \ + default='rbf' + Specifies the kernel type to be used in the algorithm. + If none is given, 'rbf' will be used. If a callable is given it is + used to precompute the kernel matrix. + + degree : int, default=3 + Degree of the polynomial kernel function ('poly'). + Must be non-negative. Ignored by all other kernels. + + gamma : {'scale', 'auto'} or float, default='scale' + Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. + + - if ``gamma='scale'`` (default) is passed then it uses + 1 / (n_features * X.var()) as value of gamma, + - if 'auto', uses 1 / n_features + - if float, must be non-negative. + + .. versionchanged:: 0.22 + The default value of ``gamma`` changed from 'auto' to 'scale'. + + coef0 : float, default=0.0 + Independent term in kernel function. + It is only significant in 'poly' and 'sigmoid'. + + tol : float, default=1e-3 + Tolerance for stopping criterion. + + nu : float, default=0.5 + An upper bound on the fraction of training + errors and a lower bound of the fraction of support + vectors. Should be in the interval (0, 1]. By default 0.5 + will be taken. + + shrinking : bool, default=True + Whether to use the shrinking heuristic. + See the :ref:`User Guide `. + + cache_size : float, default=200 + Specify the size of the kernel cache (in MB). + + verbose : bool, default=False + Enable verbose output. Note that this setting takes advantage of a + per-process runtime setting in libsvm that, if enabled, may not work + properly in a multithreaded context. + + max_iter : int, default=-1 + Hard limit on iterations within solver, or -1 for no limit. + + Attributes + ---------- + coef_ : ndarray of shape (1, n_features) + Weights assigned to the features (coefficients in the primal + problem). This is only available in the case of a linear kernel. + + `coef_` is readonly property derived from `dual_coef_` and + `support_vectors_`. + + dual_coef_ : ndarray of shape (1, n_SV) + Coefficients of the support vectors in the decision function. + + fit_status_ : int + 0 if correctly fitted, 1 otherwise (will raise warning) + + intercept_ : ndarray of shape (1,) + Constant in the decision function. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + Number of iterations run by the optimization routine to fit the model. + + .. versionadded:: 1.1 + + n_support_ : ndarray of shape (n_classes,), dtype=int32 + Number of support vectors for each class. + + offset_ : float + Offset used to define the decision function from the raw scores. + We have the relation: decision_function = score_samples - `offset_`. + The offset is the opposite of `intercept_` and is provided for + consistency with other outlier detection algorithms. + + .. versionadded:: 0.20 + + shape_fit_ : tuple of int of shape (n_dimensions_of_X,) + Array dimensions of training vector ``X``. + + support_ : ndarray of shape (n_SV,) + Indices of support vectors. + + support_vectors_ : ndarray of shape (n_SV, n_features) + Support vectors. + + See Also + -------- + sklearn.linear_model.SGDOneClassSVM : Solves linear One-Class SVM using + Stochastic Gradient Descent. + sklearn.neighbors.LocalOutlierFactor : Unsupervised Outlier Detection using + Local Outlier Factor (LOF). 
+ sklearn.ensemble.IsolationForest : Isolation Forest Algorithm. + + Examples + -------- + >>> from sklearn.svm import OneClassSVM + >>> X = [[0], [0.44], [0.45], [0.46], [1]] + >>> clf = OneClassSVM(gamma='auto').fit(X) + >>> clf.predict(X) + array([-1, 1, 1, 1, -1]) + >>> clf.score_samples(X) + array([1.7798, 2.0547, 2.0556, 2.0561, 1.7332]) + + For a more extended example, + see :ref:`sphx_glr_auto_examples_applications_plot_species_distribution_modeling.py` + """ + + _impl = "one_class" + + _parameter_constraints: dict = {**BaseLibSVM._parameter_constraints} + for unused_param in ["C", "class_weight", "epsilon", "probability", "random_state"]: + _parameter_constraints.pop(unused_param) + + def __init__( + self, + *, + kernel="rbf", + degree=3, + gamma="scale", + coef0=0.0, + tol=1e-3, + nu=0.5, + shrinking=True, + cache_size=200, + verbose=False, + max_iter=-1, + ): + super().__init__( + kernel, + degree, + gamma, + coef0, + tol, + 0.0, + nu, + 0.0, + shrinking, + False, + cache_size, + None, + verbose, + max_iter, + random_state=None, + ) + + def fit(self, X, y=None, sample_weight=None): + """Detect the soft boundary of the set of samples X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Set of samples, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : Ignored + Not used, present for API consistency by convention. + + sample_weight : array-like of shape (n_samples,), default=None + Per-sample weights. Rescale C per sample. Higher weights + force the classifier to put more emphasis on these points. + + Returns + ------- + self : object + Fitted estimator. + + Notes + ----- + If X is not a C-ordered contiguous array it is copied. + """ + super().fit(X, np.ones(_num_samples(X)), sample_weight=sample_weight) + self.offset_ = -self._intercept_ + return self + + def decision_function(self, X): + """Signed distance to the separating hyperplane. + + Signed distance is positive for an inlier and negative for an outlier. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data matrix. + + Returns + ------- + dec : ndarray of shape (n_samples,) + Returns the decision function of the samples. + """ + dec = self._decision_function(X).ravel() + return dec + + def score_samples(self, X): + """Raw scoring function of the samples. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data matrix. + + Returns + ------- + score_samples : ndarray of shape (n_samples,) + Returns the (unshifted) scoring function of the samples. + """ + return self.decision_function(X) + self.offset_ + + def predict(self, X): + """Perform classification on samples in X. + + For a one-class model, +1 or -1 is returned. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples_test, n_samples_train) + For kernel="precomputed", the expected shape of X is + (n_samples_test, n_samples_train). + + Returns + ------- + y_pred : ndarray of shape (n_samples,) + Class labels for samples in X. 
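# Editor's note -- quick check of the relations documented above for OneClassSVM:
# decision_function(X) equals score_samples(X) - offset_, and predict returns the
# sign of the decision function mapped to {+1, -1} (ties at exactly zero aside).
import numpy as np
from sklearn.svm import OneClassSVM

X = np.array([[0.0], [0.44], [0.45], [0.46], [1.0]])
clf = OneClassSVM(gamma="auto").fit(X)

print(np.allclose(clf.decision_function(X), clf.score_samples(X) - clf.offset_))  # True
print(clf.predict(X))                                                             # [-1  1  1  1 -1]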
+ """ + y = super().predict(X) + return np.asarray(y, dtype=np.intp) diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/_liblinear.pxi b/.venv/lib/python3.12/site-packages/sklearn/svm/_liblinear.pxi new file mode 100644 index 0000000000000000000000000000000000000000..0df269b070f5cad415cbfcd3d3ccf8f30c75fe4d --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/_liblinear.pxi @@ -0,0 +1,43 @@ +from ..utils._typedefs cimport intp_t + +cdef extern from "_cython_blas_helpers.h": + ctypedef double (*dot_func)(int, const double*, int, const double*, int) + ctypedef void (*axpy_func)(int, double, const double*, int, double*, int) + ctypedef void (*scal_func)(int, double, const double*, int) + ctypedef double (*nrm2_func)(int, const double*, int) + cdef struct BlasFunctions: + dot_func dot + axpy_func axpy + scal_func scal + nrm2_func nrm2 + + +cdef extern from "linear.h": + cdef struct feature_node + cdef struct problem + cdef struct model + cdef struct parameter + ctypedef problem* problem_const_ptr "problem const *" + ctypedef parameter* parameter_const_ptr "parameter const *" + ctypedef char* char_const_ptr "char const *" + char_const_ptr check_parameter(problem_const_ptr prob, parameter_const_ptr param) + model *train(problem_const_ptr prob, parameter_const_ptr param, BlasFunctions *blas_functions) nogil + int get_nr_feature (model *model) + int get_nr_class (model *model) + void get_n_iter (model *model, int *n_iter) + void free_and_destroy_model (model **) + void destroy_param (parameter *) + + +cdef extern from "liblinear_helper.c": + void copy_w(void *, model *, int) + parameter *set_parameter(int, double, double, int, char *, char *, int, int, double) + problem *set_problem (char *, int, int, int, int, double, char *, char *) + problem *csr_set_problem (char *, int, char *, char *, int, int, int, double, char *, char *) + + model *set_model(parameter *, char *, intp_t *, char *, double) + + double get_bias(model *) + void free_problem (problem *) + void free_parameter (parameter *) + void set_verbosity(int) diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/_liblinear.pyx b/.venv/lib/python3.12/site-packages/sklearn/svm/_liblinear.pyx new file mode 100644 index 0000000000000000000000000000000000000000..6d5347e746384d34876ca1d569204afa3573ac76 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/_liblinear.pyx @@ -0,0 +1,147 @@ +""" +Wrapper for liblinear + +Author: fabian.pedregosa@inria.fr +""" + +import numpy as np + +from ..utils._cython_blas cimport _dot, _axpy, _scal, _nrm2 +from ..utils._typedefs cimport float32_t, float64_t, int32_t + +include "_liblinear.pxi" + + +def train_wrap( + object X, + const float64_t[::1] Y, + bint is_sparse, + int solver_type, + double eps, + double bias, + double C, + const float64_t[:] class_weight, + int max_iter, + unsigned random_seed, + double epsilon, + const float64_t[::1] sample_weight +): + cdef parameter *param + cdef problem *problem + cdef model *model + cdef char_const_ptr error_msg + cdef int len_w + cdef bint X_has_type_float64 = X.dtype == np.float64 + cdef char * X_data_bytes_ptr + cdef const float64_t[::1] X_data_64 + cdef const float32_t[::1] X_data_32 + cdef const int32_t[::1] X_indices + cdef const int32_t[::1] X_indptr + + if is_sparse: + X_indices = X.indices + X_indptr = X.indptr + if X_has_type_float64: + X_data_64 = X.data + X_data_bytes_ptr = &X_data_64[0] + else: + X_data_32 = X.data + X_data_bytes_ptr = &X_data_32[0] + + problem = csr_set_problem( + X_data_bytes_ptr, + 
X_has_type_float64, + &X_indices[0], + &X_indptr[0], + (X.shape[0]), + (X.shape[1]), + (X.nnz), + bias, + &sample_weight[0], + &Y[0] + ) + else: + X_as_1d_array = X.reshape(-1) + if X_has_type_float64: + X_data_64 = X_as_1d_array + X_data_bytes_ptr = &X_data_64[0] + else: + X_data_32 = X_as_1d_array + X_data_bytes_ptr = &X_data_32[0] + + problem = set_problem( + X_data_bytes_ptr, + X_has_type_float64, + (X.shape[0]), + (X.shape[1]), + (np.count_nonzero(X)), + bias, + &sample_weight[0], + &Y[0] + ) + + cdef int32_t[::1] class_weight_label = np.arange(class_weight.shape[0], dtype=np.intc) + param = set_parameter( + solver_type, + eps, + C, + class_weight.shape[0], + &class_weight_label[0] if class_weight_label.size > 0 else NULL, + &class_weight[0] if class_weight.size > 0 else NULL, + max_iter, + random_seed, + epsilon + ) + + error_msg = check_parameter(problem, param) + if error_msg: + free_problem(problem) + free_parameter(param) + raise ValueError(error_msg) + + cdef BlasFunctions blas_functions + blas_functions.dot = _dot[double] + blas_functions.axpy = _axpy[double] + blas_functions.scal = _scal[double] + blas_functions.nrm2 = _nrm2[double] + + # early return + with nogil: + model = train(problem, param, &blas_functions) + + # FREE + free_problem(problem) + free_parameter(param) + # destroy_param(param) don't call this or it will destroy class_weight_label and class_weight + + # coef matrix holder created as fortran since that's what's used in liblinear + cdef float64_t[::1, :] w + cdef int nr_class = get_nr_class(model) + + cdef int labels_ = nr_class + if nr_class == 2: + labels_ = 1 + cdef int32_t[::1] n_iter = np.zeros(labels_, dtype=np.intc) + get_n_iter(model, &n_iter[0]) + + cdef int nr_feature = get_nr_feature(model) + if bias > 0: + nr_feature = nr_feature + 1 + if nr_class == 2 and solver_type != 4: # solver is not Crammer-Singer + w = np.empty((1, nr_feature), order='F') + copy_w(&w[0, 0], model, nr_feature) + else: + len_w = (nr_class) * nr_feature + w = np.empty((nr_class, nr_feature), order='F') + copy_w(&w[0, 0], model, len_w) + + free_and_destroy_model(&model) + + return w.base, n_iter.base + + +def set_verbosity_wrap(int verbosity): + """ + Control verbosity of libsvm library + """ + set_verbosity(verbosity) diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/_libsvm.pxi b/.venv/lib/python3.12/site-packages/sklearn/svm/_libsvm.pxi new file mode 100644 index 0000000000000000000000000000000000000000..74ddfd66c538e712e95ba183bcf34695f5b85a14 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/_libsvm.pxi @@ -0,0 +1,75 @@ +################################################################################ +# Includes +from ..utils._typedefs cimport intp_t + +cdef extern from "_svm_cython_blas_helpers.h": + ctypedef double (*dot_func)(int, const double*, int, const double*, int) + cdef struct BlasFunctions: + dot_func dot + + +cdef extern from "svm.h": + cdef struct svm_node + cdef struct svm_model + cdef struct svm_parameter: + int svm_type + int kernel_type + int degree # for poly + double gamma # for poly/rbf/sigmoid + double coef0 # for poly/sigmoid + + # these are for training only + double cache_size # in MB + double eps # stopping criteria + double C # for C_SVC, EPSILON_SVR and NU_SVR + int nr_weight # for C_SVC + int *weight_label # for C_SVC + double* weight # for C_SVC + double nu # for NU_SVC, ONE_CLASS, and NU_SVR + double p # for EPSILON_SVR + int shrinking # use the shrinking heuristics + int probability # do probability estimates + int 
max_iter # ceiling on Solver runtime + int random_seed # seed for random generator in probability estimation + + cdef struct svm_problem: + int l + double *y + svm_node *x + double *W # instance weights + + char *svm_check_parameter(svm_problem *, svm_parameter *) + svm_model *svm_train(svm_problem *, svm_parameter *, int *, BlasFunctions *) nogil + void svm_free_and_destroy_model(svm_model** model_ptr_ptr) + void svm_cross_validation(svm_problem *, svm_parameter *, int nr_fold, double *target, BlasFunctions *) nogil + + +cdef extern from "libsvm_helper.c": + # this file contains methods for accessing libsvm 'hidden' fields + svm_node **dense_to_sparse (char *, intp_t *) + void set_parameter (svm_parameter *, int , int , int , double, double , + double , double , double , double, + double, int, int, int, char *, char *, int, + int) + void set_problem (svm_problem *, char *, char *, char *, intp_t *, int) + + svm_model *set_model (svm_parameter *, int, char *, intp_t *, + char *, intp_t *, intp_t *, char *, + char *, char *, char *, char *) + + void copy_sv_coef (char *, svm_model *) + void copy_n_iter (char *, svm_model *) + void copy_intercept (char *, svm_model *, intp_t *) + void copy_SV (char *, svm_model *, intp_t *) + int copy_support (char *data, svm_model *model) + int copy_predict (char *, svm_model *, intp_t *, char *, BlasFunctions *) nogil + int copy_predict_proba (char *, svm_model *, intp_t *, char *, BlasFunctions *) nogil + int copy_predict_values(char *, svm_model *, intp_t *, char *, int, BlasFunctions *) nogil + void copy_nSV (char *, svm_model *) + void copy_probA (char *, svm_model *, intp_t *) + void copy_probB (char *, svm_model *, intp_t *) + intp_t get_l (svm_model *) + intp_t get_nr (svm_model *) + int free_problem (svm_problem *) + int free_model (svm_model *) + void set_verbosity(int) diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/_libsvm.pyx b/.venv/lib/python3.12/site-packages/sklearn/svm/_libsvm.pyx new file mode 100644 index 0000000000000000000000000000000000000000..be0a0826c3736469fdafbf5f42bff39d1205a6ec --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/_libsvm.pyx @@ -0,0 +1,917 @@ +""" +Binding for libsvm_skl +---------------------- + +These are the bindings for libsvm_skl, which is a fork of libsvm[1] +that adds to libsvm some capabilities, like index of support vectors +and efficient representation of dense matrices. + +These are low-level routines, but can be used for flexibility or +performance reasons. See sklearn.svm for a higher-level API. + +Low-level memory management is done in libsvm_helper.c. If we happen +to run out of memory a MemoryError will be raised. In practice this is +not very helpful since high chances are malloc fails inside svm.cpp, +where no sort of memory checks are done. 
+ +[1] https://www.csie.ntu.edu.tw/~cjlin/libsvm/ + +Notes +----- +The signature mode='c' is somewhat superficial, since we already +check that arrays are C-contiguous in svm.py + +Authors +------- +2010: Fabian Pedregosa + Gael Varoquaux +""" + +import numpy as np +from libc.stdlib cimport free +from ..utils._cython_blas cimport _dot +from ..utils._typedefs cimport float64_t, int32_t, intp_t + +include "_libsvm.pxi" + +cdef extern from *: + ctypedef struct svm_parameter: + pass + + +################################################################################ +# Internal variables +LIBSVM_KERNEL_TYPES = ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'] + + +################################################################################ +# Wrapper functions + +def fit( + const float64_t[:, ::1] X, + const float64_t[::1] Y, + int svm_type=0, + kernel='rbf', + int degree=3, + double gamma=0.1, + double coef0=0.0, + double tol=1e-3, + double C=1.0, + double nu=0.5, + double epsilon=0.1, + const float64_t[::1] class_weight=np.empty(0), + const float64_t[::1] sample_weight=np.empty(0), + int shrinking=1, + int probability=0, + double cache_size=100., + int max_iter=-1, + int random_seed=0, +): + """ + Train the model using libsvm (low-level method) + + Parameters + ---------- + X : array-like, dtype=float64 of shape (n_samples, n_features) + + Y : array, dtype=float64 of shape (n_samples,) + target vector + + svm_type : {0, 1, 2, 3, 4}, default=0 + Type of SVM: C_SVC, NuSVC, OneClassSVM, EpsilonSVR or NuSVR + respectively. + + kernel : {'linear', 'rbf', 'poly', 'sigmoid', 'precomputed'}, default="rbf" + Kernel to use in the model: linear, polynomial, RBF, sigmoid + or precomputed. + + degree : int32, default=3 + Degree of the polynomial kernel (only relevant if kernel is + set to polynomial). + + gamma : float64, default=0.1 + Gamma parameter in rbf, poly and sigmoid kernels. Ignored by other + kernels. + + coef0 : float64, default=0 + Independent parameter in poly/sigmoid kernel. + + tol : float64, default=1e-3 + Numeric stopping criterion (WRITEME). + + C : float64, default=1 + C parameter in C-Support Vector Classification. + + nu : float64, default=0.5 + An upper bound on the fraction of training errors and a lower bound of + the fraction of support vectors. Should be in the interval (0, 1]. + + epsilon : double, default=0.1 + Epsilon parameter in the epsilon-insensitive loss function. + + class_weight : array, dtype=float64, shape (n_classes,), \ + default=np.empty(0) + Set the parameter C of class i to class_weight[i]*C for + SVC. If not given, all classes are supposed to have + weight one. + + sample_weight : array, dtype=float64, shape (n_samples,), \ + default=np.empty(0) + Weights assigned to each sample. + + shrinking : int, default=1 + Whether to use the shrinking heuristic. + + probability : int, default=0 + Whether to enable probability estimates. + + cache_size : float64, default=100 + Cache size for gram matrix columns (in megabytes). + + max_iter : int (-1 for no limit), default=-1 + Stop solver after this many iterations regardless of accuracy + (XXX Currently there is no API to know whether this kicked in.) + + random_seed : int, default=0 + Seed for the random number generator used for probability estimates. + + Returns + ------- + support : array of shape (n_support,) + Index of support vectors. + + support_vectors : array of shape (n_support, n_features) + Support vectors (equivalent to X[support]). Will return an + empty array in the case of precomputed kernel. 
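# Editor's note -- illustrative check tying the quantities returned by the
# low-level fit above (sv_coef, intercept, support vectors) to the decision
# function of the high-level estimator: for a binary SVC,
#   f(x) = sum_i dual_coef_[0, i] * K(sv_i, x) + intercept_[0]
# which is (roughly) what the copy_predict_values path evaluates in C.
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics.pairwise import rbf_kernel

rng = np.random.RandomState(0)
X = np.vstack([rng.randn(20, 2) - 1, rng.randn(20, 2) + 1])
y = np.repeat([0, 1], 20)

clf = SVC(kernel="rbf", gamma=0.5).fit(X, y)
K = rbf_kernel(X, clf.support_vectors_, gamma=0.5)        # shape (n_samples, n_SV)
manual = K @ clf.dual_coef_.ravel() + clf.intercept_[0]
print(np.allclose(manual, clf.decision_function(X)))       # True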
+ + n_class_SV : array of shape (n_class,) + Number of support vectors in each class. + + sv_coef : array of shape (n_class-1, n_support) + Coefficients of support vectors in decision function. + + intercept : array of shape (n_class*(n_class-1)/2,) + Intercept in decision function. + + probA, probB : array of shape (n_class*(n_class-1)/2,) + Probability estimates, empty array for probability=False. + + n_iter : ndarray of shape (max(1, (n_class * (n_class - 1) // 2)),) + Number of iterations run by the optimization routine to fit the model. + """ + + cdef svm_parameter param + cdef svm_problem problem + cdef svm_model *model + cdef const char *error_msg + cdef intp_t SV_len + + if len(sample_weight) == 0: + sample_weight = np.ones(X.shape[0], dtype=np.float64) + else: + assert sample_weight.shape[0] == X.shape[0], ( + f"sample_weight and X have incompatible shapes: sample_weight has " + f"{sample_weight.shape[0]} samples while X has {X.shape[0]}" + ) + + kernel_index = LIBSVM_KERNEL_TYPES.index(kernel) + set_problem( + &problem, + &X[0, 0], + &Y[0], + &sample_weight[0], + X.shape, + kernel_index, + ) + if problem.x == NULL: + raise MemoryError("Seems we've run out of memory") + cdef int32_t[::1] class_weight_label = np.arange( + class_weight.shape[0], dtype=np.int32 + ) + set_parameter( + ¶m, + svm_type, + kernel_index, + degree, + gamma, + coef0, + nu, + cache_size, + C, + tol, + epsilon, + shrinking, + probability, + class_weight.shape[0], + &class_weight_label[0] if class_weight_label.size > 0 else NULL, + &class_weight[0] if class_weight.size > 0 else NULL, + max_iter, + random_seed, + ) + + error_msg = svm_check_parameter(&problem, ¶m) + if error_msg: + # for SVR: epsilon is called p in libsvm + error_repl = error_msg.decode('utf-8').replace("p < 0", "epsilon < 0") + raise ValueError(error_repl) + cdef BlasFunctions blas_functions + blas_functions.dot = _dot[double] + # this does the real work + cdef int fit_status = 0 + with nogil: + model = svm_train(&problem, ¶m, &fit_status, &blas_functions) + + # from here until the end, we just copy the data returned by + # svm_train + SV_len = get_l(model) + n_class = get_nr(model) + + cdef int[::1] n_iter = np.empty(max(1, n_class * (n_class - 1) // 2), dtype=np.intc) + copy_n_iter( &n_iter[0], model) + + cdef float64_t[:, ::1] sv_coef = np.empty((n_class-1, SV_len), dtype=np.float64) + copy_sv_coef( &sv_coef[0, 0] if sv_coef.size > 0 else NULL, model) + + # the intercept is just model.rho but with sign changed + cdef float64_t[::1] intercept = np.empty( + int((n_class*(n_class-1))/2), dtype=np.float64 + ) + copy_intercept( &intercept[0], model, intercept.shape) + + cdef int32_t[::1] support = np.empty(SV_len, dtype=np.int32) + copy_support( &support[0] if support.size > 0 else NULL, model) + + # copy model.SV + cdef float64_t[:, ::1] support_vectors + if kernel_index == 4: + # precomputed kernel + support_vectors = np.empty((0, 0), dtype=np.float64) + else: + support_vectors = np.empty((SV_len, X.shape[1]), dtype=np.float64) + copy_SV( + &support_vectors[0, 0] if support_vectors.size > 0 else NULL, + model, + support_vectors.shape, + ) + + cdef int32_t[::1] n_class_SV + if svm_type == 0 or svm_type == 1: + n_class_SV = np.empty(n_class, dtype=np.int32) + copy_nSV( &n_class_SV[0] if n_class_SV.size > 0 else NULL, model) + else: + # OneClass and SVR are considered to have 2 classes + n_class_SV = np.array([SV_len, SV_len], dtype=np.int32) + + cdef float64_t[::1] probA + cdef float64_t[::1] probB + if probability != 0: + if svm_type < 2: # SVC 
and NuSVC + probA = np.empty(int(n_class*(n_class-1)/2), dtype=np.float64) + probB = np.empty(int(n_class*(n_class-1)/2), dtype=np.float64) + copy_probB( &probB[0], model, probB.shape) + else: + probA = np.empty(1, dtype=np.float64) + probB = np.empty(0, dtype=np.float64) + copy_probA( &probA[0], model, probA.shape) + else: + probA = np.empty(0, dtype=np.float64) + probB = np.empty(0, dtype=np.float64) + + svm_free_and_destroy_model(&model) + free(problem.x) + + return ( + support.base, + support_vectors.base, + n_class_SV.base, + sv_coef.base, + intercept.base, + probA.base, + probB.base, + fit_status, + n_iter.base, + ) + + +cdef void set_predict_params( + svm_parameter *param, + int svm_type, + kernel, + int degree, + double gamma, + double coef0, + double cache_size, + int probability, + int nr_weight, + char *weight_label, + char *weight, +) except *: + """Fill param with prediction time-only parameters.""" + + # training-time only parameters + cdef double C = 0.0 + cdef double epsilon = 0.1 + cdef int max_iter = 0 + cdef double nu = 0.5 + cdef int shrinking = 0 + cdef double tol = 0.1 + cdef int random_seed = -1 + + kernel_index = LIBSVM_KERNEL_TYPES.index(kernel) + + set_parameter( + param, + svm_type, + kernel_index, + degree, + gamma, + coef0, + nu, + cache_size, + C, + tol, + epsilon, + shrinking, + probability, + nr_weight, + weight_label, + weight, + max_iter, + random_seed, + ) + + +def predict( + const float64_t[:, ::1] X, + const int32_t[::1] support, + const float64_t[:, ::1] SV, + const int32_t[::1] nSV, + const float64_t[:, ::1] sv_coef, + const float64_t[::1] intercept, + const float64_t[::1] probA=np.empty(0), + const float64_t[::1] probB=np.empty(0), + int svm_type=0, + kernel='rbf', + int degree=3, + double gamma=0.1, + double coef0=0.0, + const float64_t[::1] class_weight=np.empty(0), + const float64_t[::1] sample_weight=np.empty(0), + double cache_size=100.0, +): + """ + Predict target values of X given a model (low-level method) + + Parameters + ---------- + X : array-like, dtype=float of shape (n_samples, n_features) + + support : array of shape (n_support,) + Index of support vectors in training set. + + SV : array of shape (n_support, n_features) + Support vectors. + + nSV : array of shape (n_class,) + Number of support vectors in each class. + + sv_coef : array of shape (n_class-1, n_support) + Coefficients of support vectors in decision function. + + intercept : array of shape (n_class*(n_class-1)/2) + Intercept in decision function. + + probA, probB : array of shape (n_class*(n_class-1)/2,) + Probability estimates. + + svm_type : {0, 1, 2, 3, 4}, default=0 + Type of SVM: C_SVC, NuSVC, OneClassSVM, EpsilonSVR or NuSVR + respectively. + + kernel : {'linear', 'rbf', 'poly', 'sigmoid', 'precomputed'}, default="rbf" + Kernel to use in the model: linear, polynomial, RBF, sigmoid + or precomputed. + + degree : int32, default=3 + Degree of the polynomial kernel (only relevant if kernel is + set to polynomial). + + gamma : float64, default=0.1 + Gamma parameter in rbf, poly and sigmoid kernels. Ignored by other + kernels. + + coef0 : float64, default=0.0 + Independent parameter in poly/sigmoid kernel. + + Returns + ------- + dec_values : array + Predicted values. 
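# Editor's note -- usage sketch for kernel='precomputed' as documented in the
# estimator docstrings above: fit receives a Gram matrix of shape
# (n_samples_train, n_samples_train) and predict receives one of shape
# (n_samples_test, n_samples_train); no support vectors are stored in that case.
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics.pairwise import rbf_kernel

rng = np.random.RandomState(0)
X_train, X_test = rng.randn(30, 4), rng.randn(5, 4)
y_train = (X_train[:, 0] > 0).astype(int)

clf = SVC(kernel="precomputed").fit(rbf_kernel(X_train, X_train), y_train)
print(clf.predict(rbf_kernel(X_test, X_train)).shape)   # (5,)
print(clf.support_vectors_.shape)                        # (0, 0): empty, as documented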
+ """ + cdef float64_t[::1] dec_values + cdef svm_parameter param + cdef svm_model *model + cdef int rv + + cdef int32_t[::1] class_weight_label = np.arange( + class_weight.shape[0], dtype=np.int32 + ) + + set_predict_params( + ¶m, + svm_type, + kernel, + degree, + gamma, + coef0, + cache_size, + 0, + class_weight.shape[0], + &class_weight_label[0] if class_weight_label.size > 0 else NULL, + &class_weight[0] if class_weight.size > 0 else NULL, + ) + model = set_model( + ¶m, + nSV.shape[0], + &SV[0, 0] if SV.size > 0 else NULL, + SV.shape, + &support[0] if support.size > 0 else NULL, + support.shape, + sv_coef.strides, + &sv_coef[0, 0] if sv_coef.size > 0 else NULL, + &intercept[0], + &nSV[0], + &probA[0] if probA.size > 0 else NULL, + &probB[0] if probB.size > 0 else NULL, + ) + cdef BlasFunctions blas_functions + blas_functions.dot = _dot[double] + # TODO: use check_model + try: + dec_values = np.empty(X.shape[0]) + with nogil: + rv = copy_predict( + &X[0, 0], + model, + X.shape, + &dec_values[0], + &blas_functions, + ) + if rv < 0: + raise MemoryError("We've run out of memory") + finally: + free_model(model) + + return dec_values.base + + +def predict_proba( + const float64_t[:, ::1] X, + const int32_t[::1] support, + const float64_t[:, ::1] SV, + const int32_t[::1] nSV, + float64_t[:, ::1] sv_coef, + float64_t[::1] intercept, + float64_t[::1] probA=np.empty(0), + float64_t[::1] probB=np.empty(0), + int svm_type=0, + kernel='rbf', + int degree=3, + double gamma=0.1, + double coef0=0.0, + float64_t[::1] class_weight=np.empty(0), + float64_t[::1] sample_weight=np.empty(0), + double cache_size=100.0, +): + """ + Predict probabilities + + svm_model stores all parameters needed to predict a given value. + + For speed, all real work is done at the C level in function + copy_predict (libsvm_helper.c). + + We have to reconstruct model and parameters to make sure we stay + in sync with the python object. + + See sklearn.svm.predict for a complete list of parameters. + + Parameters + ---------- + X : array-like, dtype=float of shape (n_samples, n_features) + + support : array of shape (n_support,) + Index of support vectors in training set. + + SV : array of shape (n_support, n_features) + Support vectors. + + nSV : array of shape (n_class,) + Number of support vectors in each class. + + sv_coef : array of shape (n_class-1, n_support) + Coefficients of support vectors in decision function. + + intercept : array of shape (n_class*(n_class-1)/2,) + Intercept in decision function. + + probA, probB : array of shape (n_class*(n_class-1)/2,) + Probability estimates. + + svm_type : {0, 1, 2, 3, 4}, default=0 + Type of SVM: C_SVC, NuSVC, OneClassSVM, EpsilonSVR or NuSVR + respectively. + + kernel : {'linear', 'rbf', 'poly', 'sigmoid', 'precomputed'}, default="rbf" + Kernel to use in the model: linear, polynomial, RBF, sigmoid + or precomputed. + + degree : int32, default=3 + Degree of the polynomial kernel (only relevant if kernel is + set to polynomial). + + gamma : float64, default=0.1 + Gamma parameter in rbf, poly and sigmoid kernels. Ignored by other + kernels. + + coef0 : float64, default=0.0 + Independent parameter in poly/sigmoid kernel. + + Returns + ------- + dec_values : array + Predicted values. 
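    A minimal sketch under the same assumptions as the ``predict`` example
    above (module importable as ``sklearn.svm._libsvm``, trainer exposed as
    ``fit``). Probability estimates require fitting with ``probability=1`` so
    that ``probA``/``probB`` are populated:

        import numpy as np
        from sklearn.svm import _libsvm  # assumed module path

        X = np.arange(8, dtype=np.float64).reshape(8, 1)
        y = np.array([0., 0., 0., 0., 1., 1., 1., 1.])

        (support, SV, nSV, sv_coef, intercept,
         probA, probB, fit_status, n_iter) = _libsvm.fit(
            X, y, svm_type=0, kernel='linear', probability=1, random_seed=0)

        # one column per class; each row holds the probability estimates
        proba = _libsvm.predict_proba(X, support, SV, nSV, sv_coef, intercept,
                                      probA=probA, probB=probB,
                                      svm_type=0, kernel='linear')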
+ """ + cdef float64_t[:, ::1] dec_values + cdef svm_parameter param + cdef svm_model *model + cdef int32_t[::1] class_weight_label = np.arange( + class_weight.shape[0], dtype=np.int32 + ) + cdef int rv + + set_predict_params( + ¶m, + svm_type, + kernel, + degree, + gamma, + coef0, + cache_size, + 1, + class_weight.shape[0], + &class_weight_label[0] if class_weight_label.size > 0 else NULL, + &class_weight[0] if class_weight.size > 0 else NULL, + ) + model = set_model( + ¶m, + nSV.shape[0], + &SV[0, 0] if SV.size > 0 else NULL, + SV.shape, + &support[0], + support.shape, + sv_coef.strides, + &sv_coef[0, 0], + &intercept[0], + &nSV[0], + &probA[0] if probA.size > 0 else NULL, + &probB[0] if probB.size > 0 else NULL, + ) + + cdef intp_t n_class = get_nr(model) + cdef BlasFunctions blas_functions + blas_functions.dot = _dot[double] + try: + dec_values = np.empty((X.shape[0], n_class), dtype=np.float64) + with nogil: + rv = copy_predict_proba( + &X[0, 0], + model, + X.shape, + &dec_values[0, 0], + &blas_functions, + ) + if rv < 0: + raise MemoryError("We've run out of memory") + finally: + free_model(model) + + return dec_values.base + + +def decision_function( + const float64_t[:, ::1] X, + const int32_t[::1] support, + const float64_t[:, ::1] SV, + const int32_t[::1] nSV, + const float64_t[:, ::1] sv_coef, + const float64_t[::1] intercept, + const float64_t[::1] probA=np.empty(0), + const float64_t[::1] probB=np.empty(0), + int svm_type=0, + kernel='rbf', + int degree=3, + double gamma=0.1, + double coef0=0.0, + const float64_t[::1] class_weight=np.empty(0), + const float64_t[::1] sample_weight=np.empty(0), + double cache_size=100.0, +): + """ + Predict margin (libsvm name for this is predict_values) + + We have to reconstruct model and parameters to make sure we stay + in sync with the python object. + + Parameters + ---------- + X : array-like, dtype=float, size=[n_samples, n_features] + + support : array, shape=[n_support] + Index of support vectors in training set. + + SV : array, shape=[n_support, n_features] + Support vectors. + + nSV : array, shape=[n_class] + Number of support vectors in each class. + + sv_coef : array, shape=[n_class-1, n_support] + Coefficients of support vectors in decision function. + + intercept : array, shape=[n_class*(n_class-1)/2] + Intercept in decision function. + + probA, probB : array, shape=[n_class*(n_class-1)/2] + Probability estimates. + + svm_type : {0, 1, 2, 3, 4}, optional + Type of SVM: C_SVC, NuSVC, OneClassSVM, EpsilonSVR or NuSVR + respectively. 0 by default. + + kernel : {'linear', 'rbf', 'poly', 'sigmoid', 'precomputed'}, optional + Kernel to use in the model: linear, polynomial, RBF, sigmoid + or precomputed. 'rbf' by default. + + degree : int32, optional + Degree of the polynomial kernel (only relevant if kernel is + set to polynomial), 3 by default. + + gamma : float64, optional + Gamma parameter in rbf, poly and sigmoid kernels. Ignored by other + kernels. 0.1 by default. + + coef0 : float64, optional + Independent parameter in poly/sigmoid kernel. 0 by default. + + Returns + ------- + dec_values : array + Predicted values. 
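    For classifiers (svm_type 0 or 1) the returned matrix has one column per
    one-vs-one pair, i.e. ``n_class * (n_class - 1) / 2`` columns; for
    OneClassSVM and the SVR variants it has a single column (see the body
    below). A minimal three-class sketch under the same assumptions as the
    ``predict`` example (module ``sklearn.svm._libsvm``, trainer ``fit``):

        import numpy as np
        from sklearn.svm import _libsvm  # assumed module path

        X = np.array([[0.], [.1], [1.], [1.1], [2.], [2.1]], dtype=np.float64)
        y = np.array([0., 0., 1., 1., 2., 2.])

        (support, SV, nSV, sv_coef, intercept,
         probA, probB, fit_status, n_iter) = _libsvm.fit(
            X, y, svm_type=0, kernel='linear')

        margins = _libsvm.decision_function(X, support, SV, nSV, sv_coef,
                                            intercept, svm_type=0,
                                            kernel='linear')
        assert margins.shape == (6, 3)  # 3 * (3 - 1) / 2 one-vs-one columns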
+ """ + cdef float64_t[:, ::1] dec_values + cdef svm_parameter param + cdef svm_model *model + cdef intp_t n_class + + cdef int32_t[::1] class_weight_label = np.arange( + class_weight.shape[0], dtype=np.int32 + ) + + cdef int rv + + set_predict_params( + ¶m, + svm_type, + kernel, + degree, + gamma, + coef0, + cache_size, + 0, + class_weight.shape[0], + &class_weight_label[0] if class_weight_label.size > 0 else NULL, + &class_weight[0] if class_weight.size > 0 else NULL, + ) + + model = set_model( + ¶m, + nSV.shape[0], + &SV[0, 0] if SV.size > 0 else NULL, + SV.shape, + &support[0], + support.shape, + sv_coef.strides, + &sv_coef[0, 0], + &intercept[0], + &nSV[0], + &probA[0] if probA.size > 0 else NULL, + &probB[0] if probB.size > 0 else NULL, + ) + + if svm_type > 1: + n_class = 1 + else: + n_class = get_nr(model) + n_class = n_class * (n_class - 1) // 2 + cdef BlasFunctions blas_functions + blas_functions.dot = _dot[double] + try: + dec_values = np.empty((X.shape[0], n_class), dtype=np.float64) + with nogil: + rv = copy_predict_values( + &X[0, 0], + model, + X.shape, + &dec_values[0, 0], + n_class, + &blas_functions, + ) + if rv < 0: + raise MemoryError("We've run out of memory") + finally: + free_model(model) + + return dec_values.base + + +def cross_validation( + const float64_t[:, ::1] X, + const float64_t[::1] Y, + int n_fold, + int svm_type=0, + kernel='rbf', + int degree=3, + double gamma=0.1, + double coef0=0.0, + double tol=1e-3, + double C=1.0, + double nu=0.5, + double epsilon=0.1, + float64_t[::1] class_weight=np.empty(0), + float64_t[::1] sample_weight=np.empty(0), + int shrinking=0, + int probability=0, + double cache_size=100.0, + int max_iter=-1, + int random_seed=0, +): + """ + Binding of the cross-validation routine (low-level routine) + + Parameters + ---------- + + X : array-like, dtype=float of shape (n_samples, n_features) + + Y : array, dtype=float of shape (n_samples,) + target vector + + n_fold : int32 + Number of folds for cross validation. + + svm_type : {0, 1, 2, 3, 4}, default=0 + Type of SVM: C_SVC, NuSVC, OneClassSVM, EpsilonSVR or NuSVR + respectively. + + kernel : {'linear', 'rbf', 'poly', 'sigmoid', 'precomputed'}, default='rbf' + Kernel to use in the model: linear, polynomial, RBF, sigmoid + or precomputed. + + degree : int32, default=3 + Degree of the polynomial kernel (only relevant if kernel is + set to polynomial). + + gamma : float64, default=0.1 + Gamma parameter in rbf, poly and sigmoid kernels. Ignored by other + kernels. + + coef0 : float64, default=0.0 + Independent parameter in poly/sigmoid kernel. + + tol : float64, default=1e-3 + Numeric stopping criterion (WRITEME). + + C : float64, default=1 + C parameter in C-Support Vector Classification. + + nu : float64, default=0.5 + An upper bound on the fraction of training errors and a lower bound of + the fraction of support vectors. Should be in the interval (0, 1]. + + epsilon : double, default=0.1 + Epsilon parameter in the epsilon-insensitive loss function. + + class_weight : array, dtype=float64, shape (n_classes,), \ + default=np.empty(0) + Set the parameter C of class i to class_weight[i]*C for + SVC. If not given, all classes are supposed to have + weight one. + + sample_weight : array, dtype=float64, shape (n_samples,), \ + default=np.empty(0) + Weights assigned to each sample. + + shrinking : int, default=1 + Whether to use the shrinking heuristic. + + probability : int, default=0 + Whether to enable probability estimates. 
+ + cache_size : float64, default=100 + Cache size for gram matrix columns (in megabytes). + + max_iter : int (-1 for no limit), default=-1 + Stop solver after this many iterations regardless of accuracy + (XXX Currently there is no API to know whether this kicked in.) + + random_seed : int, default=0 + Seed for the random number generator used for probability estimates. + + Returns + ------- + target : array, float + + """ + + cdef svm_parameter param + cdef svm_problem problem + cdef const char *error_msg + + if len(sample_weight) == 0: + sample_weight = np.ones(X.shape[0], dtype=np.float64) + else: + assert sample_weight.shape[0] == X.shape[0], ( + f"sample_weight and X have incompatible shapes: sample_weight has " + f"{sample_weight.shape[0]} samples while X has {X.shape[0]}" + ) + + if X.shape[0] < n_fold: + raise ValueError("Number of samples is less than number of folds") + + # set problem + kernel_index = LIBSVM_KERNEL_TYPES.index(kernel) + set_problem( + &problem, + &X[0, 0], + &Y[0], + &sample_weight[0] if sample_weight.size > 0 else NULL, + X.shape, + kernel_index, + ) + if problem.x == NULL: + raise MemoryError("Seems we've run out of memory") + cdef int32_t[::1] class_weight_label = np.arange( + class_weight.shape[0], dtype=np.int32 + ) + + # set parameters + set_parameter( + ¶m, + svm_type, + kernel_index, + degree, + gamma, + coef0, + nu, + cache_size, + C, + tol, + tol, + shrinking, + probability, + class_weight.shape[0], + &class_weight_label[0] if class_weight_label.size > 0 else NULL, + &class_weight[0] if class_weight.size > 0 else NULL, + max_iter, + random_seed, + ) + + error_msg = svm_check_parameter(&problem, ¶m) + if error_msg: + raise ValueError(error_msg) + + cdef float64_t[::1] target + cdef BlasFunctions blas_functions + blas_functions.dot = _dot[double] + try: + target = np.empty((X.shape[0]), dtype=np.float64) + with nogil: + svm_cross_validation( + &problem, + ¶m, + n_fold, + &target[0], + &blas_functions, + ) + finally: + free(problem.x) + + return target.base + + +def set_verbosity_wrap(int verbosity): + """ + Control verbosity of libsvm library + """ + set_verbosity(verbosity) diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/_libsvm_sparse.pyx b/.venv/lib/python3.12/site-packages/sklearn/svm/_libsvm_sparse.pyx new file mode 100644 index 0000000000000000000000000000000000000000..529758061d299f095bbe3834d85e3f10e475c537 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/_libsvm_sparse.pyx @@ -0,0 +1,550 @@ +import numpy as np +from scipy import sparse +from ..utils._cython_blas cimport _dot +from ..utils._typedefs cimport float64_t, int32_t, intp_t + +cdef extern from *: + ctypedef char* const_char_p "const char*" + +################################################################################ +# Includes + +cdef extern from "_svm_cython_blas_helpers.h": + ctypedef double (*dot_func)(int, const double*, int, const double*, int) + cdef struct BlasFunctions: + dot_func dot + +cdef extern from "svm.h": + cdef struct svm_csr_node + cdef struct svm_csr_model + cdef struct svm_parameter + cdef struct svm_csr_problem + char *svm_csr_check_parameter(svm_csr_problem *, svm_parameter *) + svm_csr_model *svm_csr_train(svm_csr_problem *, svm_parameter *, int *, BlasFunctions *) nogil + void svm_csr_free_and_destroy_model(svm_csr_model** model_ptr_ptr) + +cdef extern from "libsvm_sparse_helper.c": + # this file contains methods for accessing libsvm 'hidden' fields + svm_csr_problem * csr_set_problem ( + char *, intp_t *, char *, intp_t *, 
char *, char *, char *, int) + svm_csr_model *csr_set_model(svm_parameter *param, int nr_class, + char *SV_data, intp_t *SV_indices_dims, + char *SV_indices, intp_t *SV_intptr_dims, + char *SV_intptr, + char *sv_coef, char *rho, char *nSV, + char *probA, char *probB) + svm_parameter *set_parameter (int , int , int , double, double , + double , double , double , double, + double, int, int, int, char *, char *, int, + int) + void copy_sv_coef (char *, svm_csr_model *) + void copy_n_iter (char *, svm_csr_model *) + void copy_support (char *, svm_csr_model *) + void copy_intercept (char *, svm_csr_model *, intp_t *) + int copy_predict (char *, svm_csr_model *, intp_t *, char *, BlasFunctions *) + int csr_copy_predict_values (intp_t *data_size, char *data, intp_t *index_size, + char *index, intp_t *intptr_size, char *size, + svm_csr_model *model, char *dec_values, int nr_class, BlasFunctions *) + int csr_copy_predict (intp_t *data_size, char *data, intp_t *index_size, + char *index, intp_t *intptr_size, char *size, + svm_csr_model *model, char *dec_values, BlasFunctions *) nogil + int csr_copy_predict_proba (intp_t *data_size, char *data, intp_t *index_size, + char *index, intp_t *intptr_size, char *size, + svm_csr_model *model, char *dec_values, BlasFunctions *) nogil + + int copy_predict_values(char *, svm_csr_model *, intp_t *, char *, int, BlasFunctions *) + int csr_copy_SV (char *values, intp_t *n_indices, + char *indices, intp_t *n_indptr, char *indptr, + svm_csr_model *model, int n_features) + intp_t get_nonzero_SV (svm_csr_model *) + void copy_nSV (char *, svm_csr_model *) + void copy_probA (char *, svm_csr_model *, intp_t *) + void copy_probB (char *, svm_csr_model *, intp_t *) + intp_t get_l (svm_csr_model *) + intp_t get_nr (svm_csr_model *) + int free_problem (svm_csr_problem *) + int free_model (svm_csr_model *) + int free_param (svm_parameter *) + int free_model_SV(svm_csr_model *model) + void set_verbosity(int) + + +def libsvm_sparse_train (int n_features, + const float64_t[::1] values, + const int32_t[::1] indices, + const int32_t[::1] indptr, + const float64_t[::1] Y, + int svm_type, int kernel_type, int degree, double gamma, + double coef0, double eps, double C, + const float64_t[::1] class_weight, + const float64_t[::1] sample_weight, + double nu, double cache_size, double p, int + shrinking, int probability, int max_iter, + int random_seed): + """ + Wrap svm_train from libsvm using a scipy.sparse.csr matrix + + Work in progress. + + Parameters + ---------- + n_features : number of features. + XXX: can we retrieve this from any other parameter ? + + X : array-like, dtype=float, size=[N, D] + + Y : array, dtype=float, size=[N] + target vector + + ... + + Notes + ------------------- + See sklearn.svm.predict for a complete list of parameters. + + """ + + cdef svm_parameter *param + cdef svm_csr_problem *problem + cdef svm_csr_model *model + cdef const_char_p error_msg + + if len(sample_weight) == 0: + sample_weight = np.ones(Y.shape[0], dtype=np.float64) + else: + assert sample_weight.shape[0] == indptr.shape[0] - 1, \ + "sample_weight and X have incompatible shapes: " + \ + "sample_weight has %s samples while X has %s" % \ + (sample_weight.shape[0], indptr.shape[0] - 1) + + # we should never end up here with a precomputed kernel matrix, + # as this is always dense. 
+ assert(kernel_type != 4) + + # set libsvm problem + problem = csr_set_problem( + &values[0], + indices.shape, + &indices[0], + indptr.shape, + &indptr[0], + &Y[0], + &sample_weight[0], + kernel_type, + ) + + cdef int32_t[::1] \ + class_weight_label = np.arange(class_weight.shape[0], dtype=np.int32) + + # set parameters + param = set_parameter( + svm_type, + kernel_type, + degree, + gamma, + coef0, + nu, + cache_size, + C, + eps, + p, + shrinking, + probability, + class_weight.shape[0], + &class_weight_label[0] if class_weight_label.size > 0 else NULL, + &class_weight[0] if class_weight.size > 0 else NULL, max_iter, + random_seed, + ) + + # check parameters + if (param == NULL or problem == NULL): + raise MemoryError("Seems we've run out of memory") + error_msg = svm_csr_check_parameter(problem, param) + if error_msg: + free_problem(problem) + free_param(param) + raise ValueError(error_msg) + cdef BlasFunctions blas_functions + blas_functions.dot = _dot[double] + # call svm_train, this does the real work + cdef int fit_status = 0 + with nogil: + model = svm_csr_train(problem, param, &fit_status, &blas_functions) + + cdef intp_t SV_len = get_l(model) + cdef intp_t n_class = get_nr(model) + + cdef int[::1] n_iter + n_iter = np.empty(max(1, n_class * (n_class - 1) // 2), dtype=np.intc) + copy_n_iter( &n_iter[0], model) + + # copy model.sv_coef + # we create a new array instead of resizing, otherwise + # it would not erase previous information + cdef float64_t[::1] sv_coef_data + sv_coef_data = np.empty((n_class-1)*SV_len, dtype=np.float64) + copy_sv_coef ( &sv_coef_data[0] if sv_coef_data.size > 0 else NULL, model) + + cdef int32_t[::1] support + support = np.empty(SV_len, dtype=np.int32) + copy_support( &support[0] if support.size > 0 else NULL, model) + + # copy model.rho into the intercept + # the intercept is just model.rho but with sign changed + cdef float64_t[::1]intercept + intercept = np.empty(n_class*(n_class-1)//2, dtype=np.float64) + copy_intercept ( &intercept[0], model, intercept.shape) + + # copy model.SV + # we erase any previous information in SV + # TODO: custom kernel + cdef intp_t nonzero_SV + nonzero_SV = get_nonzero_SV (model) + + cdef float64_t[::1] SV_data + cdef int32_t[::1] SV_indices, SV_indptr + SV_data = np.empty(nonzero_SV, dtype=np.float64) + SV_indices = np.empty(nonzero_SV, dtype=np.int32) + SV_indptr = np.empty(SV_len + 1, dtype=np.int32) + csr_copy_SV( + &SV_data[0] if SV_data.size > 0 else NULL, + SV_indices.shape, + &SV_indices[0] if SV_indices.size > 0 else NULL, + SV_indptr.shape, + &SV_indptr[0] if SV_indptr.size > 0 else NULL, + model, + n_features, + ) + support_vectors_ = sparse.csr_matrix( + (SV_data, SV_indices, SV_indptr), (SV_len, n_features) + ) + + # copy model.nSV + # TODO: do only in classification + cdef int32_t[::1]n_class_SV + n_class_SV = np.empty(n_class, dtype=np.int32) + copy_nSV( &n_class_SV[0], model) + + # # copy probabilities + cdef float64_t[::1] probA, probB + if probability != 0: + if svm_type < 2: # SVC and NuSVC + probA = np.empty(n_class*(n_class-1)//2, dtype=np.float64) + probB = np.empty(n_class*(n_class-1)//2, dtype=np.float64) + copy_probB( &probB[0], model, probB.shape) + else: + probA = np.empty(1, dtype=np.float64) + probB = np.empty(0, dtype=np.float64) + copy_probA( &probA[0], model, probA.shape) + else: + probA = np.empty(0, dtype=np.float64) + probB = np.empty(0, dtype=np.float64) + + svm_csr_free_and_destroy_model (&model) + free_problem(problem) + free_param(param) + + return ( + support.base, + 
support_vectors_, + sv_coef_data.base, + intercept.base, + n_class_SV.base, + probA.base, + probB.base, + fit_status, + n_iter.base, + ) + + +def libsvm_sparse_predict (const float64_t[::1] T_data, + const int32_t[::1] T_indices, + const int32_t[::1] T_indptr, + const float64_t[::1] SV_data, + const int32_t[::1] SV_indices, + const int32_t[::1] SV_indptr, + const float64_t[::1] sv_coef, + const float64_t[::1] + intercept, int svm_type, int kernel_type, int + degree, double gamma, double coef0, double + eps, double C, + const float64_t[:] class_weight, + double nu, double p, int + shrinking, int probability, + const int32_t[::1] nSV, + const float64_t[::1] probA, + const float64_t[::1] probB): + """ + Predict values T given a model. + + For speed, all real work is done at the C level in function + copy_predict (libsvm_helper.c). + + We have to reconstruct model and parameters to make sure we stay + in sync with the python object. + + See sklearn.svm.predict for a complete list of parameters. + + Parameters + ---------- + X : array-like, dtype=float + Y : array + target vector + + Returns + ------- + dec_values : array + predicted values. + """ + cdef float64_t[::1] dec_values + cdef svm_parameter *param + cdef svm_csr_model *model + cdef int32_t[::1] \ + class_weight_label = np.arange(class_weight.shape[0], dtype=np.int32) + cdef int rv + param = set_parameter( + svm_type, + kernel_type, + degree, + gamma, + coef0, + nu, + 100.0, # cache size has no effect on predict + C, + eps, + p, + shrinking, + probability, + class_weight.shape[0], + &class_weight_label[0] if class_weight_label.size > 0 else NULL, + &class_weight[0] if class_weight.size > 0 else NULL, + -1, + -1, # random seed has no effect on predict either + ) + + model = csr_set_model( + param, nSV.shape[0], + &SV_data[0] if SV_data.size > 0 else NULL, + SV_indices.shape, + &SV_indices[0] if SV_indices.size > 0 else NULL, + SV_indptr.shape, + &SV_indptr[0] if SV_indptr.size > 0 else NULL, + &sv_coef[0] if sv_coef.size > 0 else NULL, + &intercept[0], + &nSV[0], + &probA[0] if probA.size > 0 else NULL, + &probB[0] if probB.size > 0 else NULL, + ) + # TODO: use check_model + dec_values = np.empty(T_indptr.shape[0]-1) + cdef BlasFunctions blas_functions + blas_functions.dot = _dot[double] + with nogil: + rv = csr_copy_predict( + T_data.shape, + &T_data[0], + T_indices.shape, + &T_indices[0], + T_indptr.shape, + &T_indptr[0], + model, + &dec_values[0], + &blas_functions, + ) + if rv < 0: + raise MemoryError("We've run out of memory") + # free model and param + free_model_SV(model) + free_model(model) + free_param(param) + return dec_values.base + + +def libsvm_sparse_predict_proba( + const float64_t[::1] T_data, + const int32_t[::1] T_indices, + const int32_t[::1] T_indptr, + const float64_t[::1] SV_data, + const int32_t[::1] SV_indices, + const int32_t[::1] SV_indptr, + const float64_t[::1] sv_coef, + const float64_t[::1] + intercept, int svm_type, int kernel_type, int + degree, double gamma, double coef0, double + eps, double C, + const float64_t[:] class_weight, + double nu, double p, int shrinking, int probability, + const int32_t[::1] nSV, + const float64_t[::1] probA, + const float64_t[::1] probB, +): + """ + Predict values T given a model. 
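    The test matrix is passed as its three CSR components (``T_data``,
    ``T_indices``, ``T_indptr``); the result has ``len(T_indptr) - 1`` rows
    (one per sample) and one column per class. A minimal sketch of preparing
    those arrays from a scipy CSR matrix, with the dtype casts made explicit
    because the signatures above require float64/int32:

        import numpy as np
        from scipy import sparse

        T = sparse.csr_matrix(np.array([[0., 1.], [2., 0.]], dtype=np.float64))
        T_data = np.asarray(T.data, dtype=np.float64)
        T_indices = np.asarray(T.indices, dtype=np.int32)
        T_indptr = np.asarray(T.indptr, dtype=np.int32)
        # T_data, T_indices, T_indptr are the first three arguments here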
+ """ + cdef float64_t[:, ::1] dec_values + cdef svm_parameter *param + cdef svm_csr_model *model + cdef int32_t[::1] \ + class_weight_label = np.arange(class_weight.shape[0], dtype=np.int32) + param = set_parameter( + svm_type, + kernel_type, + degree, + gamma, + coef0, + nu, + 100.0, # cache size has no effect on predict + C, + eps, + p, + shrinking, + probability, + class_weight.shape[0], + &class_weight_label[0] if class_weight_label.size > 0 else NULL, + &class_weight[0] if class_weight.size > 0 else NULL, + -1, + -1, # random seed has no effect on predict either + ) + + model = csr_set_model( + param, + nSV.shape[0], + &SV_data[0] if SV_data.size > 0 else NULL, + SV_indices.shape, + &SV_indices[0] if SV_indices.size > 0 else NULL, + SV_indptr.shape, + &SV_indptr[0] if SV_indptr.size > 0 else NULL, + &sv_coef[0] if sv_coef.size > 0 else NULL, + &intercept[0], + &nSV[0], + &probA[0] if probA.size > 0 else NULL, + &probB[0] if probB.size > 0 else NULL, + ) + # TODO: use check_model + cdef intp_t n_class = get_nr(model) + cdef int rv + dec_values = np.empty((T_indptr.shape[0]-1, n_class), dtype=np.float64) + cdef BlasFunctions blas_functions + blas_functions.dot = _dot[double] + with nogil: + rv = csr_copy_predict_proba( + T_data.shape, + &T_data[0], + T_indices.shape, + &T_indices[0], + T_indptr.shape, + &T_indptr[0], + model, + &dec_values[0, 0], + &blas_functions, + ) + if rv < 0: + raise MemoryError("We've run out of memory") + # free model and param + free_model_SV(model) + free_model(model) + free_param(param) + return dec_values.base + + +def libsvm_sparse_decision_function( + const float64_t[::1] T_data, + const int32_t[::1] T_indices, + const int32_t[::1] T_indptr, + const float64_t[::1] SV_data, + const int32_t[::1] SV_indices, + const int32_t[::1] SV_indptr, + const float64_t[::1] sv_coef, + const float64_t[::1] + intercept, int svm_type, int kernel_type, int + degree, double gamma, double coef0, double + eps, double C, + const float64_t[:] class_weight, + double nu, double p, int shrinking, int probability, + const int32_t[::1] nSV, + const float64_t[::1] probA, + const float64_t[::1] probB, +): + """ + Predict margin (libsvm name for this is predict_values) + + We have to reconstruct model and parameters to make sure we stay + in sync with the python object. 
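    The support vectors are passed back as the CSR components of the
    ``support_vectors_`` matrix returned by ``libsvm_sparse_train`` above. A
    minimal sketch, assuming ``support_vectors_`` came from a previous
    ``libsvm_sparse_train`` call:

        import numpy as np

        # support_vectors_: scipy CSR matrix returned by libsvm_sparse_train
        SV_data = np.asarray(support_vectors_.data, dtype=np.float64)
        SV_indices = np.asarray(support_vectors_.indices, dtype=np.int32)
        SV_indptr = np.asarray(support_vectors_.indptr, dtype=np.int32)
        # these fill the SV_data / SV_indices / SV_indptr parameters above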
+ """ + cdef float64_t[:, ::1] dec_values + cdef svm_parameter *param + cdef intp_t n_class + + cdef svm_csr_model *model + cdef int32_t[::1] \ + class_weight_label = np.arange(class_weight.shape[0], dtype=np.int32) + param = set_parameter( + svm_type, + kernel_type, + degree, + gamma, + coef0, + nu, + 100.0, # cache size has no effect on predict + C, + eps, + p, + shrinking, + probability, + class_weight.shape[0], + &class_weight_label[0] if class_weight_label.size > 0 else NULL, + &class_weight[0] if class_weight.size > 0 else NULL, + -1, + -1, + ) + + model = csr_set_model( + param, + nSV.shape[0], + &SV_data[0] if SV_data.size > 0 else NULL, + SV_indices.shape, + &SV_indices[0] if SV_indices.size > 0 else NULL, + SV_indptr.shape, + &SV_indptr[0] if SV_indptr.size > 0 else NULL, + &sv_coef[0] if sv_coef.size > 0 else NULL, + &intercept[0], + &nSV[0], + &probA[0] if probA.size > 0 else NULL, + &probB[0] if probB.size > 0 else NULL, + ) + + if svm_type > 1: + n_class = 1 + else: + n_class = get_nr(model) + n_class = n_class * (n_class - 1) // 2 + + dec_values = np.empty((T_indptr.shape[0] - 1, n_class), dtype=np.float64) + cdef BlasFunctions blas_functions + blas_functions.dot = _dot[double] + if csr_copy_predict_values( + T_data.shape, + &T_data[0], + T_indices.shape, + &T_indices[0], + T_indptr.shape, + &T_indptr[0], + model, + &dec_values[0, 0], + n_class, + &blas_functions, + ) < 0: + raise MemoryError("We've run out of memory") + # free model and param + free_model_SV(model) + free_model(model) + free_param(param) + + return dec_values.base + + +def set_verbosity_wrap(int verbosity): + """ + Control verbosity of libsvm library + """ + set_verbosity(verbosity) diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/_newrand.cpython-312-x86_64-linux-gnu.so b/.venv/lib/python3.12/site-packages/sklearn/svm/_newrand.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..c7d9391102a41eeab0c670e966fcc2234b0b1af3 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/svm/_newrand.cpython-312-x86_64-linux-gnu.so differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/_newrand.pyx b/.venv/lib/python3.12/site-packages/sklearn/svm/_newrand.pyx new file mode 100644 index 0000000000000000000000000000000000000000..af543ed73286a06bfb0053807bc8b8c39bfc53c0 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/_newrand.pyx @@ -0,0 +1,13 @@ +"""Wrapper for newrand.h""" + +cdef extern from "newrand.h": + void set_seed(unsigned int) + unsigned int bounded_rand_int(unsigned int) + + +def set_seed_wrap(unsigned int custom_seed): + set_seed(custom_seed) + + +def bounded_rand_int_wrap(unsigned int range_): + return bounded_rand_int(range_) diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/meson.build b/.venv/lib/python3.12/site-packages/sklearn/svm/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..6232d747d1feb220eb4656396314d7caddac9c52 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/meson.build @@ -0,0 +1,48 @@ +newrand_include = include_directories('src/newrand') +libsvm_include = include_directories('src/libsvm') +liblinear_include = include_directories('src/liblinear') + +_newrand = py.extension_module( + '_newrand', + cython_gen_cpp.process('_newrand.pyx'), + include_directories: [newrand_include], + subdir: 'sklearn/svm', + install: true +) + +libsvm_skl = static_library( + 'libsvm-skl', + ['src/libsvm/libsvm_template.cpp'], +) + +py.extension_module( + 
'_libsvm', + [cython_gen.process('_libsvm.pyx'), utils_cython_tree], + include_directories: [newrand_include, libsvm_include], + link_with: libsvm_skl, + subdir: 'sklearn/svm', + install: true +) + +py.extension_module( + '_libsvm_sparse', + [cython_gen.process('_libsvm_sparse.pyx'), utils_cython_tree], + include_directories: [newrand_include, libsvm_include], + link_with: libsvm_skl, + subdir: 'sklearn/svm', + install: true +) + +liblinear_skl = static_library( + 'liblinear-skl', + ['src/liblinear/linear.cpp', 'src/liblinear/tron.cpp'], +) + +py.extension_module( + '_liblinear', + [cython_gen.process('_liblinear.pyx'), utils_cython_tree], + include_directories: [newrand_include, liblinear_include], + link_with: [liblinear_skl], + subdir: 'sklearn/svm', + install: true +) diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/src/liblinear/COPYRIGHT b/.venv/lib/python3.12/site-packages/sklearn/svm/src/liblinear/COPYRIGHT new file mode 100644 index 0000000000000000000000000000000000000000..94371bb4cfd3a117775792c38e8354e62c46dc8f --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/src/liblinear/COPYRIGHT @@ -0,0 +1,31 @@ + +Copyright (c) 2007-2014 The LIBLINEAR Project. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither name of copyright holders nor the names of its contributors +may be used to endorse or promote products derived from this software +without specific prior written permission. + + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
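The meson.build above declares four Cython extension modules under
sklearn/svm: _newrand, _libsvm and _libsvm_sparse (the latter two linked
against the static libsvm-skl helper library), and _liblinear (linked against
liblinear-skl). A minimal smoke test of an installed build (a sketch only; the
import paths are assumed from the subdir: 'sklearn/svm' declarations):

    # import the built extensions and exercise the newrand wrapper
    from sklearn.svm import _libsvm, _libsvm_sparse, _liblinear, _newrand

    _newrand.set_seed_wrap(0)
    print(_newrand.bounded_rand_int_wrap(10))  # bounded pseudo-random int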
diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/src/liblinear/_cython_blas_helpers.h b/.venv/lib/python3.12/site-packages/sklearn/svm/src/liblinear/_cython_blas_helpers.h
new file mode 100644
index 0000000000000000000000000000000000000000..bdec1a2f99eb9c0cd57f4e588e9b277ab5f93a6a
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/sklearn/svm/src/liblinear/_cython_blas_helpers.h
@@ -0,0 +1,16 @@
+#ifndef _CYTHON_BLAS_HELPERS_H
+#define _CYTHON_BLAS_HELPERS_H
+
+typedef double (*dot_func)(int, const double*, int, const double*, int);
+typedef void (*axpy_func)(int, double, const double*, int, double*, int);
+typedef void (*scal_func)(int, double, const double*, int);
+typedef double (*nrm2_func)(int, const double*, int);
+
+typedef struct BlasFunctions{
+    dot_func dot;
+    axpy_func axpy;
+    scal_func scal;
+    nrm2_func nrm2;
+} BlasFunctions;
+
+#endif
diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/src/liblinear/liblinear_helper.c b/.venv/lib/python3.12/site-packages/sklearn/svm/src/liblinear/liblinear_helper.c
new file mode 100644
index 0000000000000000000000000000000000000000..b66f08413e11b6af16d72a35d1e8e85a5addfd43
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/sklearn/svm/src/liblinear/liblinear_helper.c
@@ -0,0 +1,236 @@
+#include <stdlib.h>
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+#include "linear.h"
+
+
+/*
+ * Convert matrix to sparse representation suitable for liblinear. x is
+ * expected to be an array of length n_samples*n_features.
+ *
+ * Whether the matrix is densely or sparsely populated, the fastest way to
+ * convert it to liblinear's sparse format is to calculate the amount of memory
+ * needed and allocate a single big block.
+ *
+ * Special care must be taken with indices, since liblinear indices start at 1
+ * and not at 0.
+ *
+ * If bias is > 0, we append an item at the end.
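 *
 * For example (illustrative values), with n_features = 3 and bias = 1.0 the
 * dense row [0.5, 0.0, 2.0] maps to the feature_node sequence
 *     (index=1, value=0.5), (index=3, value=2.0)   zeros skipped, 1-based index
 *     (index=4, value=1.0)                         appended bias term
 *     (index=-1)                                   sentinel terminating the row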
+ */ +static struct feature_node **dense_to_sparse(char *x, int double_precision, + int n_samples, int n_features, int n_nonzero, double bias) +{ + float *x32 = (float *)x; + double *x64 = (double *)x; + struct feature_node **sparse; + int i, j; /* number of nonzero elements in row i */ + struct feature_node *T; /* pointer to the top of the stack */ + int have_bias = (bias > 0); + + sparse = malloc (n_samples * sizeof(struct feature_node *)); + if (sparse == NULL) + return NULL; + + n_nonzero += (have_bias+1) * n_samples; + T = malloc (n_nonzero * sizeof(struct feature_node)); + if (T == NULL) { + free(sparse); + return NULL; + } + + for (i=0; ivalue = *x64; + T->index = j; + ++ T; + } + ++ x64; /* go to next element */ + } else { + if (*x32 != 0) { + T->value = *x32; + T->index = j; + ++ T; + } + ++ x32; /* go to next element */ + } + } + + /* set bias element */ + if (have_bias) { + T->value = bias; + T->index = j; + ++ T; + } + + /* set sentinel */ + T->index = -1; + ++ T; + } + + return sparse; +} + + +/* + * Convert scipy.sparse.csr to liblinear's sparse data structure + */ +static struct feature_node **csr_to_sparse(char *x, int double_precision, + int *indices, int *indptr, int n_samples, int n_features, int n_nonzero, + double bias) +{ + float *x32 = (float *)x; + double *x64 = (double *)x; + struct feature_node **sparse; + int i, j=0, k=0, n; + struct feature_node *T; + int have_bias = (bias > 0); + + sparse = malloc (n_samples * sizeof(struct feature_node *)); + if (sparse == NULL) + return NULL; + + n_nonzero += (have_bias+1) * n_samples; + T = malloc (n_nonzero * sizeof(struct feature_node)); + if (T == NULL) { + free(sparse); + return NULL; + } + + for (i=0; ivalue = double_precision ? x64[k] : x32[k]; + T->index = indices[k] + 1; /* liblinear uses 1-based indexing */ + ++T; + ++k; + } + + if (have_bias) { + T->value = bias; + T->index = n_features + 1; + ++T; + ++j; + } + + /* set sentinel */ + T->index = -1; + ++T; + } + + return sparse; +} + +struct problem * set_problem(char *X, int double_precision_X, int n_samples, + int n_features, int n_nonzero, double bias, char* sample_weight, + char *Y) +{ + struct problem *problem; + /* not performant but simple */ + problem = malloc(sizeof(struct problem)); + if (problem == NULL) return NULL; + problem->l = n_samples; + problem->n = n_features + (bias > 0); + problem->y = (double *) Y; + problem->W = (double *) sample_weight; + problem->x = dense_to_sparse(X, double_precision_X, n_samples, n_features, + n_nonzero, bias); + problem->bias = bias; + + if (problem->x == NULL) { + free(problem); + return NULL; + } + + return problem; +} + +struct problem * csr_set_problem (char *X, int double_precision_X, + char *indices, char *indptr, int n_samples, int n_features, + int n_nonzero, double bias, char *sample_weight, char *Y) +{ + struct problem *problem; + problem = malloc (sizeof (struct problem)); + if (problem == NULL) return NULL; + problem->l = n_samples; + problem->n = n_features + (bias > 0); + problem->y = (double *) Y; + problem->W = (double *) sample_weight; + problem->x = csr_to_sparse(X, double_precision_X, (int *) indices, + (int *) indptr, n_samples, n_features, n_nonzero, bias); + problem->bias = bias; + + if (problem->x == NULL) { + free(problem); + return NULL; + } + + return problem; +} + + +/* Create a parameter struct with and return it */ +struct parameter *set_parameter(int solver_type, double eps, double C, + Py_ssize_t nr_weight, char *weight_label, + char *weight, int max_iter, unsigned seed, + double epsilon) 
+{ + struct parameter *param = malloc(sizeof(struct parameter)); + if (param == NULL) + return NULL; + + set_seed(seed); + param->solver_type = solver_type; + param->eps = eps; + param->C = C; + param->p = epsilon; // epsilon for epsilon-SVR + param->nr_weight = (int) nr_weight; + param->weight_label = (int *) weight_label; + param->weight = (double *) weight; + param->max_iter = max_iter; + return param; +} + +void copy_w(void *data, struct model *model, int len) +{ + memcpy(data, model->w, len * sizeof(double)); +} + +double get_bias(struct model *model) +{ + return model->bias; +} + +void free_problem(struct problem *problem) +{ + free(problem->x[0]); + free(problem->x); + free(problem); +} + +void free_parameter(struct parameter *param) +{ + free(param); +} + +/* rely on built-in facility to control verbose output */ +static void print_null(const char *s) {} + +static void print_string_stdout(const char *s) +{ + fputs(s ,stdout); + fflush(stdout); +} + +/* provide convenience wrapper */ +void set_verbosity(int verbosity_flag){ + if (verbosity_flag) + set_print_string_function(&print_string_stdout); + else + set_print_string_function(&print_null); +} diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/src/liblinear/linear.cpp b/.venv/lib/python3.12/site-packages/sklearn/svm/src/liblinear/linear.cpp new file mode 100644 index 0000000000000000000000000000000000000000..63648adbe2947de03449580f060a795fd4eb3cb6 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/src/liblinear/linear.cpp @@ -0,0 +1,3075 @@ +/* + Modified 2011: + + - Make labels sorted in group_classes, Dan Yamins. + + Modified 2012: + + - Changes roles of +1 and -1 to match scikit API, Andreas Mueller + See issue 546: https://github.com/scikit-learn/scikit-learn/pull/546 + - Also changed roles for pairwise class weights, Andreas Mueller + See issue 1491: https://github.com/scikit-learn/scikit-learn/pull/1491 + + Modified 2014: + + - Remove the hard-coded value of max_iter (1000), that allows max_iter + to be passed as a parameter from the classes LogisticRegression and + LinearSVC, Manoj Kumar + - Added function get_n_iter that exposes the number of iterations. + See issue 3499: https://github.com/scikit-learn/scikit-learn/issues/3499 + See pull 3501: https://github.com/scikit-learn/scikit-learn/pull/3501 + + Modified 2015: + - Patched liblinear for sample_weights - Manoj Kumar + See https://github.com/scikit-learn/scikit-learn/pull/5274 + + Modified 2020: + - Improved random number generator by using a mersenne twister + tweaked + lemire postprocessor. This fixed a convergence issue on windows targets. + Sylvain Marie, Schneider Electric + See + + */ + +#include +#include +#include +#include +#include +#include +#include "linear.h" +#include "tron.h" +#include +#include +#include "../newrand/newrand.h" + +typedef signed char schar; +template static inline void swap(T& x, T& y) { T t=x; x=y; y=t; } +#ifndef min +template static inline T min(T x,T y) { return (x static inline T max(T x,T y) { return (x>y)?x:y; } +#endif +template static inline void clone(T*& dst, S* src, int n) +{ + dst = new T[n]; + memcpy((void *)dst,(void *)src,sizeof(T)*n); +} +#define Malloc(type,n) (type *)malloc((n)*sizeof(type)) +#define INF HUGE_VAL + +static void print_string_stdout(const char *s) +{ + fputs(s,stdout); + fflush(stdout); +} + +static void (*liblinear_print_string) (const char *) = &print_string_stdout; + +#if 1 +static void info(const char *fmt,...) 
+{ + char buf[BUFSIZ]; + va_list ap; + va_start(ap,fmt); + vsprintf(buf,fmt,ap); + va_end(ap); + (*liblinear_print_string)(buf); +} +#else +static void info(const char *fmt,...) {} +#endif + +class l2r_lr_fun: public function +{ +public: + l2r_lr_fun(const problem *prob, double *C); + ~l2r_lr_fun(); + + double fun(double *w); + void grad(double *w, double *g); + void Hv(double *s, double *Hs); + + int get_nr_variable(void); + +private: + void Xv(double *v, double *Xv); + void XTv(double *v, double *XTv); + + double *C; + double *z; + double *D; + const problem *prob; +}; + +l2r_lr_fun::l2r_lr_fun(const problem *prob, double *C) +{ + int l=prob->l; + + this->prob = prob; + + z = new double[l]; + D = new double[l]; + this->C = C; +} + +l2r_lr_fun::~l2r_lr_fun() +{ + delete[] z; + delete[] D; +} + + +double l2r_lr_fun::fun(double *w) +{ + int i; + double f=0; + double *y=prob->y; + int l=prob->l; + int w_size=get_nr_variable(); + + Xv(w, z); + + for(i=0;i= 0) + f += C[i]*log(1 + exp(-yz)); + else + f += C[i]*(-yz+log(1 + exp(yz))); + } + + return(f); +} + +void l2r_lr_fun::grad(double *w, double *g) +{ + int i; + double *y=prob->y; + int l=prob->l; + int w_size=get_nr_variable(); + + for(i=0;in; +} + +void l2r_lr_fun::Hv(double *s, double *Hs) +{ + int i; + int l=prob->l; + int w_size=get_nr_variable(); + double *wa = new double[l]; + + Xv(s, wa); + for(i=0;il; + feature_node **x=prob->x; + + for(i=0;iindex!=-1) + { + Xv[i]+=v[s->index-1]*s->value; + s++; + } + } +} + +void l2r_lr_fun::XTv(double *v, double *XTv) +{ + int i; + int l=prob->l; + int w_size=get_nr_variable(); + feature_node **x=prob->x; + + for(i=0;iindex!=-1) + { + XTv[s->index-1]+=v[i]*s->value; + s++; + } + } +} + +class l2r_l2_svc_fun: public function +{ +public: + l2r_l2_svc_fun(const problem *prob, double *C); + ~l2r_l2_svc_fun(); + + double fun(double *w); + void grad(double *w, double *g); + void Hv(double *s, double *Hs); + + int get_nr_variable(void); + +protected: + void Xv(double *v, double *Xv); + void subXv(double *v, double *Xv); + void subXTv(double *v, double *XTv); + + double *C; + double *z; + double *D; + int *I; + int sizeI; + const problem *prob; +}; + +l2r_l2_svc_fun::l2r_l2_svc_fun(const problem *prob, double *C) +{ + int l=prob->l; + + this->prob = prob; + + z = new double[l]; + D = new double[l]; + I = new int[l]; + this->C = C; +} + +l2r_l2_svc_fun::~l2r_l2_svc_fun() +{ + delete[] z; + delete[] D; + delete[] I; +} + +double l2r_l2_svc_fun::fun(double *w) +{ + int i; + double f=0; + double *y=prob->y; + int l=prob->l; + int w_size=get_nr_variable(); + + Xv(w, z); + + for(i=0;i 0) + f += C[i]*d*d; + } + + return(f); +} + +void l2r_l2_svc_fun::grad(double *w, double *g) +{ + int i; + double *y=prob->y; + int l=prob->l; + int w_size=get_nr_variable(); + + sizeI = 0; + for (i=0;in; +} + +void l2r_l2_svc_fun::Hv(double *s, double *Hs) +{ + int i; + int w_size=get_nr_variable(); + double *wa = new double[sizeI]; + + subXv(s, wa); + for(i=0;il; + feature_node **x=prob->x; + + for(i=0;iindex!=-1) + { + Xv[i]+=v[s->index-1]*s->value; + s++; + } + } +} + +void l2r_l2_svc_fun::subXv(double *v, double *Xv) +{ + int i; + feature_node **x=prob->x; + + for(i=0;iindex!=-1) + { + Xv[i]+=v[s->index-1]*s->value; + s++; + } + } +} + +void l2r_l2_svc_fun::subXTv(double *v, double *XTv) +{ + int i; + int w_size=get_nr_variable(); + feature_node **x=prob->x; + + for(i=0;iindex!=-1) + { + XTv[s->index-1]+=v[i]*s->value; + s++; + } + } +} + +class l2r_l2_svr_fun: public l2r_l2_svc_fun +{ +public: + l2r_l2_svr_fun(const problem 
*prob, double *C, double p); + + double fun(double *w); + void grad(double *w, double *g); + +private: + double p; +}; + +l2r_l2_svr_fun::l2r_l2_svr_fun(const problem *prob, double *C, double p): + l2r_l2_svc_fun(prob, C) +{ + this->p = p; +} + +double l2r_l2_svr_fun::fun(double *w) +{ + int i; + double f=0; + double *y=prob->y; + int l=prob->l; + int w_size=get_nr_variable(); + double d; + + Xv(w, z); + + for(i=0;i p) + f += C[i]*(d-p)*(d-p); + } + + return(f); +} + +void l2r_l2_svr_fun::grad(double *w, double *g) +{ + int i; + double *y=prob->y; + int l=prob->l; + int w_size=get_nr_variable(); + double d; + + sizeI = 0; + for(i=0;i p) + { + z[sizeI] = C[i]*(d-p); + I[sizeI] = i; + sizeI++; + } + + } + subXTv(z, g); + + for(i=0;iw_size = prob->n; + this->l = prob->l; + this->nr_class = nr_class; + this->eps = eps; + this->max_iter = max_iter; + this->prob = prob; + this->B = new double[nr_class]; + this->G = new double[nr_class]; + this->C = new double[prob->l]; + for(int i = 0; i < prob->l; i++) + this->C[i] = prob->W[i] * weighted_C[(int)prob->y[i]]; +} + +Solver_MCSVM_CS::~Solver_MCSVM_CS() +{ + delete[] B; + delete[] G; + delete[] C; +} + +int compare_double(const void *a, const void *b) +{ + if(*(double *)a > *(double *)b) + return -1; + if(*(double *)a < *(double *)b) + return 1; + return 0; +} + +void Solver_MCSVM_CS::solve_sub_problem(double A_i, int yi, double C_yi, int active_i, double *alpha_new) +{ + int r; + double *D; + + clone(D, B, active_i); + if(yi < active_i) + D[yi] += A_i*C_yi; + qsort(D, active_i, sizeof(double), compare_double); + + double beta = D[0] - A_i*C_yi; + for(r=1;ry[i] == m + // alpha[i*nr_class+m] <= 0 if prob->y[i] != m + // If initial alpha isn't zero, uncomment the for loop below to initialize w + for(i=0;ix[i]; + QD[i] = 0; + while(xi->index != -1) + { + double val = xi->value; + QD[i] += val*val; + + // Uncomment the for loop if initial alpha isn't zero + // for(m=0; mindex-1)*nr_class+m] += alpha[i*nr_class+m]*val; + xi++; + } + active_size_i[i] = nr_class; + y_index[i] = (int)prob->y[i]; + index[i] = i; + } + + while(iter < max_iter) + { + double stopping = -INF; + for(i=0;i 0) + { + for(m=0;mx[i]; + while(xi->index!= -1) + { + double *w_i = &w[(xi->index-1)*nr_class]; + for(m=0;mvalue); + xi++; + } + + double minG = INF; + double maxG = -INF; + for(m=0;m maxG) + maxG = G[m]; + } + if(y_index[i] < active_size_i[i]) + if(alpha_i[(int) prob->y[i]] < C[GETI(i)] && G[y_index[i]] < minG) + minG = G[y_index[i]]; + + for(m=0;mm) + { + if(!be_shrunk(i, active_size_i[i], y_index[i], + alpha_i[alpha_index_i[active_size_i[i]]], minG)) + { + swap(alpha_index_i[m], alpha_index_i[active_size_i[i]]); + swap(G[m], G[active_size_i[i]]); + if(y_index[i] == active_size_i[i]) + y_index[i] = m; + else if(y_index[i] == m) + y_index[i] = active_size_i[i]; + break; + } + active_size_i[i]--; + } + } + } + + if(active_size_i[i] <= 1) + { + active_size--; + swap(index[s], index[active_size]); + s--; + continue; + } + + if(maxG-minG <= 1e-12) + continue; + else + stopping = max(maxG - minG, stopping); + + for(m=0;m= 1e-12) + { + d_ind[nz_d] = alpha_index_i[m]; + d_val[nz_d] = d; + nz_d++; + } + } + + xi = prob->x[i]; + while(xi->index != -1) + { + double *w_i = &w[(xi->index-1)*nr_class]; + for(m=0;mvalue; + xi++; + } + } + } + + iter++; + if(iter % 10 == 0) + { + info("."); + } + + if(stopping < eps_shrink) + { + if(stopping < eps && start_from_all == true) + break; + else + { + active_size = l; + for(i=0;i= max_iter) + info("\nWARNING: reaching max number of iterations\n"); 
+ + // calculate objective value + double v = 0; + int nSV = 0; + for(i=0;i 0) + nSV++; + } + for(i=0;iy[i]]; + info("Objective value = %lf\n",v); + info("nSV = %d\n",nSV); + + delete [] alpha; + delete [] alpha_new; + delete [] index; + delete [] QD; + delete [] d_ind; + delete [] d_val; + delete [] alpha_index; + delete [] y_index; + delete [] active_size_i; + return iter; +} + +// A coordinate descent algorithm for +// L1-loss and L2-loss SVM dual problems +// +// min_\alpha 0.5(\alpha^T (Q + D)\alpha) - e^T \alpha, +// s.t. 0 <= \alpha_i <= upper_bound_i, +// +// where Qij = yi yj xi^T xj and +// D is a diagonal matrix +// +// In L1-SVM case: +// upper_bound_i = Cp if y_i = 1 +// upper_bound_i = Cn if y_i = -1 +// D_ii = 0 +// In L2-SVM case: +// upper_bound_i = INF +// D_ii = 1/(2*Cp) if y_i = 1 +// D_ii = 1/(2*Cn) if y_i = -1 +// +// Given: +// x, y, Cp, Cn +// eps is the stopping tolerance +// +// solution will be put in w +// +// See Algorithm 3 of Hsieh et al., ICML 2008 + +#undef GETI +#define GETI(i) (i) +// To support weights for instances, use GETI(i) (i) + +static int solve_l2r_l1l2_svc( + const problem *prob, double *w, double eps, + double Cp, double Cn, int solver_type, int max_iter) +{ + int l = prob->l; + int w_size = prob->n; + int i, s, iter = 0; + double C, d, G; + double *QD = new double[l]; + int *index = new int[l]; + double *alpha = new double[l]; + schar *y = new schar[l]; + int active_size = l; + + // PG: projected gradient, for shrinking and stopping + double PG; + double PGmax_old = INF; + double PGmin_old = -INF; + double PGmax_new, PGmin_new; + + // default solver_type: L2R_L2LOSS_SVC_DUAL + double *diag = new double[l]; + double *upper_bound = new double[l]; + double *C_ = new double[l]; + for(i=0; iy[i]>0) + C_[i] = prob->W[i] * Cp; + else + C_[i] = prob->W[i] * Cn; + diag[i] = 0.5/C_[i]; + upper_bound[i] = INF; + } + if(solver_type == L2R_L1LOSS_SVC_DUAL) + { + for(i=0; iy[i] > 0) + { + y[i] = +1; + } + else + { + y[i] = -1; + } + } + + // Initial alpha can be set here. 
Note that + // 0 <= alpha[i] <= upper_bound[GETI(i)] + for(i=0; ix[i]; + while (xi->index != -1) + { + double val = xi->value; + QD[i] += val*val; + w[xi->index-1] += y[i]*alpha[i]*val; + xi++; + } + index[i] = i; + } + + while (iter < max_iter) + { + PGmax_new = -INF; + PGmin_new = INF; + + for (i=0; ix[i]; + while(xi->index!= -1) + { + G += w[xi->index-1]*(xi->value); + xi++; + } + G = G*yi-1; + + C = upper_bound[GETI(i)]; + G += alpha[i]*diag[GETI(i)]; + + PG = 0; + if (alpha[i] == 0) + { + if (G > PGmax_old) + { + active_size--; + swap(index[s], index[active_size]); + s--; + continue; + } + else if (G < 0) + PG = G; + } + else if (alpha[i] == C) + { + if (G < PGmin_old) + { + active_size--; + swap(index[s], index[active_size]); + s--; + continue; + } + else if (G > 0) + PG = G; + } + else + PG = G; + + PGmax_new = max(PGmax_new, PG); + PGmin_new = min(PGmin_new, PG); + + if(fabs(PG) > 1.0e-12) + { + double alpha_old = alpha[i]; + alpha[i] = min(max(alpha[i] - G/QD[i], 0.0), C); + d = (alpha[i] - alpha_old)*yi; + xi = prob->x[i]; + while (xi->index != -1) + { + w[xi->index-1] += d*xi->value; + xi++; + } + } + } + + iter++; + if(iter % 10 == 0) + info("."); + + if(PGmax_new - PGmin_new <= eps) + { + if(active_size == l) + break; + else + { + active_size = l; + info("*"); + PGmax_old = INF; + PGmin_old = -INF; + continue; + } + } + PGmax_old = PGmax_new; + PGmin_old = PGmin_new; + if (PGmax_old <= 0) + PGmax_old = INF; + if (PGmin_old >= 0) + PGmin_old = -INF; + } + + info("\noptimization finished, #iter = %d\n",iter); + if (iter >= max_iter) + info("\nWARNING: reaching max number of iterations\nUsing -s 2 may be faster (also see FAQ)\n\n"); + + // calculate objective value + + double v = 0; + int nSV = 0; + for(i=0; i 0) + ++nSV; + } + info("Objective value = %lf\n",v/2); + info("nSV = %d\n",nSV); + + delete [] QD; + delete [] alpha; + delete [] y; + delete [] index; + delete [] diag; + delete [] upper_bound; + delete [] C_; + return iter; +} + + +// A coordinate descent algorithm for +// L1-loss and L2-loss epsilon-SVR dual problem +// +// min_\beta 0.5\beta^T (Q + diag(lambda)) \beta - p \sum_{i=1}^l|\beta_i| + \sum_{i=1}^l yi\beta_i, +// s.t. 
-upper_bound_i <= \beta_i <= upper_bound_i, +// +// where Qij = xi^T xj and +// D is a diagonal matrix +// +// In L1-SVM case: +// upper_bound_i = C +// lambda_i = 0 +// In L2-SVM case: +// upper_bound_i = INF +// lambda_i = 1/(2*C) +// +// Given: +// x, y, p, C +// eps is the stopping tolerance +// +// solution will be put in w +// +// See Algorithm 4 of Ho and Lin, 2012 + +#undef GETI +#define GETI(i) (i) +// To support weights for instances, use GETI(i) (i) + +static int solve_l2r_l1l2_svr( + const problem *prob, double *w, const parameter *param, + int solver_type, int max_iter) +{ + int l = prob->l; + double C = param->C; + double p = param->p; + int w_size = prob->n; + double eps = param->eps; + int i, s, iter = 0; + int active_size = l; + int *index = new int[l]; + + double d, G, H; + double Gmax_old = INF; + double Gmax_new, Gnorm1_new; + double Gnorm1_init = -1.0; // Gnorm1_init is initialized at the first iteration + double *beta = new double[l]; + double *QD = new double[l]; + double *y = prob->y; + + // L2R_L2LOSS_SVR_DUAL + double *lambda = new double[l]; + double *upper_bound = new double[l]; + double *C_ = new double[l]; + for (i=0; iW[i] * C; + lambda[i] = 0.5/C_[i]; + upper_bound[i] = INF; + } + if(solver_type == L2R_L1LOSS_SVR_DUAL) + { + for (i=0; ix[i]; + while(xi->index != -1) + { + double val = xi->value; + QD[i] += val*val; + w[xi->index-1] += beta[i]*val; + xi++; + } + + index[i] = i; + } + + + while(iter < max_iter) + { + Gmax_new = 0; + Gnorm1_new = 0; + + for(i=0; ix[i]; + while(xi->index != -1) + { + int ind = xi->index-1; + double val = xi->value; + G += val*w[ind]; + xi++; + } + + double Gp = G+p; + double Gn = G-p; + double violation = 0; + if(beta[i] == 0) + { + if(Gp < 0) + violation = -Gp; + else if(Gn > 0) + violation = Gn; + else if(Gp>Gmax_old && Gn<-Gmax_old) + { + active_size--; + swap(index[s], index[active_size]); + s--; + continue; + } + } + else if(beta[i] >= upper_bound[GETI(i)]) + { + if(Gp > 0) + violation = Gp; + else if(Gp < -Gmax_old) + { + active_size--; + swap(index[s], index[active_size]); + s--; + continue; + } + } + else if(beta[i] <= -upper_bound[GETI(i)]) + { + if(Gn < 0) + violation = -Gn; + else if(Gn > Gmax_old) + { + active_size--; + swap(index[s], index[active_size]); + s--; + continue; + } + } + else if(beta[i] > 0) + violation = fabs(Gp); + else + violation = fabs(Gn); + + Gmax_new = max(Gmax_new, violation); + Gnorm1_new += violation; + + // obtain Newton direction d + if(Gp < H*beta[i]) + d = -Gp/H; + else if(Gn > H*beta[i]) + d = -Gn/H; + else + d = -beta[i]; + + if(fabs(d) < 1.0e-12) + continue; + + double beta_old = beta[i]; + beta[i] = min(max(beta[i]+d, -upper_bound[GETI(i)]), upper_bound[GETI(i)]); + d = beta[i]-beta_old; + + if(d != 0) + { + xi = prob->x[i]; + while(xi->index != -1) + { + w[xi->index-1] += d*xi->value; + xi++; + } + } + } + + if(iter == 0) + Gnorm1_init = Gnorm1_new; + iter++; + if(iter % 10 == 0) + info("."); + + if(Gnorm1_new <= eps*Gnorm1_init) + { + if(active_size == l) + break; + else + { + active_size = l; + info("*"); + Gmax_old = INF; + continue; + } + } + + Gmax_old = Gmax_new; + } + + info("\noptimization finished, #iter = %d\n", iter); + if(iter >= max_iter) + info("\nWARNING: reaching max number of iterations\nUsing -s 11 may be faster\n\n"); + + // calculate objective value + double v = 0; + int nSV = 0; + for(i=0; il; + int w_size = prob->n; + int i, s, iter = 0; + double *xTx = new double[l]; + int *index = new int[l]; + double *alpha = new double[2*l]; // store alpha and C - alpha + 
schar *y = new schar[l]; + int max_inner_iter = 100; // for inner Newton + double innereps = 1e-2; + double innereps_min = min(1e-8, eps); + double *upper_bound = new double [l]; + + for(i=0; iy[i] > 0) + { + upper_bound[i] = prob->W[i] * Cp; + y[i] = +1; + } + else + { + upper_bound[i] = prob->W[i] * Cn; + y[i] = -1; + } + } + + // Initial alpha can be set here. Note that + // 0 < alpha[i] < upper_bound[GETI(i)] + // alpha[2*i] + alpha[2*i+1] = upper_bound[GETI(i)] + for(i=0; ix[i]; + while (xi->index != -1) + { + double val = xi->value; + xTx[i] += val*val; + w[xi->index-1] += y[i]*alpha[2*i]*val; + xi++; + } + index[i] = i; + } + + while (iter < max_iter) + { + for (i=0; ix[i]; + while (xi->index != -1) + { + ywTx += w[xi->index-1]*xi->value; + xi++; + } + ywTx *= y[i]; + double a = xisq, b = ywTx; + + // Decide to minimize g_1(z) or g_2(z) + int ind1 = 2*i, ind2 = 2*i+1, sign = 1; + if(0.5*a*(alpha[ind2]-alpha[ind1])+b < 0) + { + ind1 = 2*i+1; + ind2 = 2*i; + sign = -1; + } + + // g_t(z) = z*log(z) + (C-z)*log(C-z) + 0.5a(z-alpha_old)^2 + sign*b(z-alpha_old) + double alpha_old = alpha[ind1]; + double z = alpha_old; + if(C - z < 0.5 * C) + z = 0.1*z; + double gp = a*(z-alpha_old)+sign*b+log(z/(C-z)); + Gmax = max(Gmax, fabs(gp)); + + // Newton method on the sub-problem + const double eta = 0.1; // xi in the paper + int inner_iter = 0; + while (inner_iter <= max_inner_iter) + { + if(fabs(gp) < innereps) + break; + double gpp = a + C/(C-z)/z; + double tmpz = z - gp/gpp; + if(tmpz <= 0) + z *= eta; + else // tmpz in (0, C) + z = tmpz; + gp = a*(z-alpha_old)+sign*b+log(z/(C-z)); + newton_iter++; + inner_iter++; + } + + if(inner_iter > 0) // update w + { + alpha[ind1] = z; + alpha[ind2] = C-z; + xi = prob->x[i]; + while (xi->index != -1) + { + w[xi->index-1] += sign*(z-alpha_old)*yi*xi->value; + xi++; + } + } + } + + iter++; + if(iter % 10 == 0) + info("."); + + if(Gmax < eps) + break; + + if(newton_iter <= l/10) + innereps = max(innereps_min, 0.1*innereps); + + } + + info("\noptimization finished, #iter = %d\n",iter); + if (iter >= max_iter) + info("\nWARNING: reaching max number of iterations\nUsing -s 0 may be faster (also see FAQ)\n\n"); + + // calculate objective value + + double v = 0; + for(i=0; il; + int w_size = prob_col->n; + int j, s, iter = 0; + int active_size = w_size; + int max_num_linesearch = 20; + + double sigma = 0.01; + double d, G_loss, G, H; + double Gmax_old = INF; + double Gmax_new, Gnorm1_new; + double Gnorm1_init = -1.0; // Gnorm1_init is initialized at the first iteration + double d_old, d_diff; + double loss_old, loss_new; + double appxcond, cond; + + int *index = new int[w_size]; + schar *y = new schar[l]; + double *b = new double[l]; // b = 1-ywTx + double *xj_sq = new double[w_size]; + feature_node *x; + + double *C = new double[l]; + + // Initial w can be set here. 
+ for(j=0; jy[j] > 0) + { + y[j] = 1; + C[j] = prob_col->W[j] * Cp; + } + else + { + y[j] = -1; + C[j] = prob_col->W[j] * Cn; + } + } + for(j=0; jx[j]; + while(x->index != -1) + { + int ind = x->index-1; + x->value *= y[ind]; // x->value stores yi*xij + double val = x->value; + b[ind] -= w[j]*val; + xj_sq[j] += C[GETI(ind)]*val*val; + x++; + } + } + + while(iter < max_iter) + { + Gmax_new = 0; + Gnorm1_new = 0; + + for(j=0; jx[j]; + while(x->index != -1) + { + int ind = x->index-1; + if(b[ind] > 0) + { + double val = x->value; + double tmp = C[GETI(ind)]*val; + G_loss -= tmp*b[ind]; + H += tmp*val; + } + x++; + } + G_loss *= 2; + + G = G_loss; + H *= 2; + H = max(H, 1e-12); + + double Gp = G+1; + double Gn = G-1; + double violation = 0; + if(w[j] == 0) + { + if(Gp < 0) + violation = -Gp; + else if(Gn > 0) + violation = Gn; + else if(Gp>Gmax_old/l && Gn<-Gmax_old/l) + { + active_size--; + swap(index[s], index[active_size]); + s--; + continue; + } + } + else if(w[j] > 0) + violation = fabs(Gp); + else + violation = fabs(Gn); + + Gmax_new = max(Gmax_new, violation); + Gnorm1_new += violation; + + // obtain Newton direction d + if(Gp < H*w[j]) + d = -Gp/H; + else if(Gn > H*w[j]) + d = -Gn/H; + else + d = -w[j]; + + if(fabs(d) < 1.0e-12) + continue; + + double delta = fabs(w[j]+d)-fabs(w[j]) + G*d; + d_old = 0; + int num_linesearch; + for(num_linesearch=0; num_linesearch < max_num_linesearch; num_linesearch++) + { + d_diff = d_old - d; + cond = fabs(w[j]+d)-fabs(w[j]) - sigma*delta; + + appxcond = xj_sq[j]*d*d + G_loss*d + cond; + if(appxcond <= 0) + { + x = prob_col->x[j]; + while(x->index != -1) + { + b[x->index-1] += d_diff*x->value; + x++; + } + break; + } + + if(num_linesearch == 0) + { + loss_old = 0; + loss_new = 0; + x = prob_col->x[j]; + while(x->index != -1) + { + int ind = x->index-1; + if(b[ind] > 0) + loss_old += C[GETI(ind)]*b[ind]*b[ind]; + double b_new = b[ind] + d_diff*x->value; + b[ind] = b_new; + if(b_new > 0) + loss_new += C[GETI(ind)]*b_new*b_new; + x++; + } + } + else + { + loss_new = 0; + x = prob_col->x[j]; + while(x->index != -1) + { + int ind = x->index-1; + double b_new = b[ind] + d_diff*x->value; + b[ind] = b_new; + if(b_new > 0) + loss_new += C[GETI(ind)]*b_new*b_new; + x++; + } + } + + cond = cond + loss_new - loss_old; + if(cond <= 0) + break; + else + { + d_old = d; + d *= 0.5; + delta *= 0.5; + } + } + + w[j] += d; + + // recompute b[] if line search takes too many steps + if(num_linesearch >= max_num_linesearch) + { + info("#"); + for(int i=0; ix[i]; + while(x->index != -1) + { + b[x->index-1] -= w[i]*x->value; + x++; + } + } + } + } + + if(iter == 0) + Gnorm1_init = Gnorm1_new; + iter++; + if(iter % 10 == 0) + info("."); + + if(Gnorm1_new <= eps*Gnorm1_init) + { + if(active_size == w_size) + break; + else + { + active_size = w_size; + info("*"); + Gmax_old = INF; + continue; + } + } + + Gmax_old = Gmax_new; + } + + info("\noptimization finished, #iter = %d\n", iter); + if(iter >= max_iter) + info("\nWARNING: reaching max number of iterations\n"); + + // calculate objective value + + double v = 0; + int nnz = 0; + for(j=0; jx[j]; + while(x->index != -1) + { + x->value *= prob_col->y[x->index-1]; // restore x->value + x++; + } + if(w[j] != 0) + { + v += fabs(w[j]); + nnz++; + } + } + for(j=0; j 0) + v += C[GETI(j)]*b[j]*b[j]; + + info("Objective value = %lf\n", v); + info("#nonzeros/#features = %d/%d\n", nnz, w_size); + + delete [] index; + delete [] y; + delete [] b; + delete [] xj_sq; + delete [] C; + return iter; +} + +// A coordinate descent algorithm for 
+// L1-regularized logistic regression problems +// +// min_w \sum |wj| + C \sum log(1+exp(-yi w^T xi)), +// +// Given: +// x, y, Cp, Cn +// eps is the stopping tolerance +// +// solution will be put in w +// +// See Yuan et al. (2011) and appendix of LIBLINEAR paper, Fan et al. (2008) + +#undef GETI +#define GETI(i) (i) +// To support weights for instances, use GETI(i) (i) + +static int solve_l1r_lr( + const problem *prob_col, double *w, double eps, + double Cp, double Cn, int max_newton_iter) +{ + int l = prob_col->l; + int w_size = prob_col->n; + int j, s, newton_iter=0, iter=0; + int max_iter = 1000; + int max_num_linesearch = 20; + int active_size; + int QP_active_size; + int QP_no_change = 0; + + double nu = 1e-12; + double inner_eps = 1; + double sigma = 0.01; + double w_norm, w_norm_new; + double z, G, H; + double Gnorm1_init = -1.0; // Gnorm1_init is initialized at the first iteration + double Gmax_old = INF; + double Gmax_new, Gnorm1_new; + double QP_Gmax_old = INF; + double QP_Gmax_new, QP_Gnorm1_new; + double delta, negsum_xTd, cond; + + int *index = new int[w_size]; + schar *y = new schar[l]; + double *Hdiag = new double[w_size]; + double *Grad = new double[w_size]; + double *wpd = new double[w_size]; + double *xjneg_sum = new double[w_size]; + double *xTd = new double[l]; + double *exp_wTx = new double[l]; + double *exp_wTx_new = new double[l]; + double *tau = new double[l]; + double *D = new double[l]; + feature_node *x; + + double *C = new double[l]; + + // Initial w can be set here. + for(j=0; jy[j] > 0) + { + y[j] = 1; + C[j] = prob_col->W[j] * Cp; + } + else + { + y[j] = -1; + C[j] = prob_col->W[j] * Cn; + } + + exp_wTx[j] = 0; + } + + w_norm = 0; + for(j=0; jx[j]; + while(x->index != -1) + { + int ind = x->index-1; + double val = x->value; + exp_wTx[ind] += w[j]*val; + if(y[ind] == -1) + xjneg_sum[j] += C[GETI(ind)]*val; + x++; + } + } + for(j=0; jx[j]; + while(x->index != -1) + { + int ind = x->index-1; + Hdiag[j] += x->value*x->value*D[ind]; + tmp += x->value*tau[ind]; + x++; + } + Grad[j] = -tmp + xjneg_sum[j]; + + double Gp = Grad[j]+1; + double Gn = Grad[j]-1; + double violation = 0; + if(w[j] == 0) + { + if(Gp < 0) + violation = -Gp; + else if(Gn > 0) + violation = Gn; + //outer-level shrinking + else if(Gp>Gmax_old/l && Gn<-Gmax_old/l) + { + active_size--; + swap(index[s], index[active_size]); + s--; + continue; + } + } + else if(w[j] > 0) + violation = fabs(Gp); + else + violation = fabs(Gn); + + Gmax_new = max(Gmax_new, violation); + Gnorm1_new += violation; + } + + if(newton_iter == 0) + Gnorm1_init = Gnorm1_new; + + // Break outer-loop if the accumulated violation is small. + // Also break if no update in QP inner-loop ten times in a row. 
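+		// The test is relative: Gnorm1_init is the total violation measured
+		// at the first Newton iteration, so eps controls how much the
+		// optimality violation must shrink from its initial value before
+		// stopping.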
+ if(Gnorm1_new <= eps*Gnorm1_init || QP_no_change >= 10) + break; + + QP_no_change++; + + iter = 0; + QP_Gmax_old = INF; + QP_active_size = active_size; + + for(int i=0; ix[j]; + G = Grad[j] + (wpd[j]-w[j])*nu; + while(x->index != -1) + { + int ind = x->index-1; + G += x->value*D[ind]*xTd[ind]; + x++; + } + + double Gp = G+1; + double Gn = G-1; + double violation = 0; + if(wpd[j] == 0) + { + if(Gp < 0) + violation = -Gp; + else if(Gn > 0) + violation = Gn; + //inner-level shrinking + else if(Gp>QP_Gmax_old/l && Gn<-QP_Gmax_old/l) + { + QP_active_size--; + swap(index[s], index[QP_active_size]); + s--; + continue; + } + } + else if(wpd[j] > 0) + violation = fabs(Gp); + else + violation = fabs(Gn); + + // obtain solution of one-variable problem + if(Gp < H*wpd[j]) + z = -Gp/H; + else if(Gn > H*wpd[j]) + z = -Gn/H; + else + z = -wpd[j]; + + if(fabs(z) < 1.0e-12) + continue; + z = min(max(z,-10.0),10.0); + + QP_no_change = 0; + QP_Gmax_new = max(QP_Gmax_new, violation); + QP_Gnorm1_new += violation; + + wpd[j] += z; + + x = prob_col->x[j]; + while(x->index != -1) + { + int ind = x->index-1; + xTd[ind] += x->value*z; + x++; + } + } + + iter++; + + if(QP_Gnorm1_new <= inner_eps*Gnorm1_init) + { + //inner stopping + if(QP_active_size == active_size) + break; + //active set reactivation + else + { + QP_active_size = active_size; + QP_Gmax_old = INF; + continue; + } + } + + QP_Gmax_old = QP_Gmax_new; + } + + if(iter >= max_iter) + info("WARNING: reaching max number of inner iterations\n"); + + delta = 0; + w_norm_new = 0; + for(j=0; j= max_num_linesearch) + { + for(int i=0; ix[i]; + while(x->index != -1) + { + exp_wTx[x->index-1] += w[i]*x->value; + x++; + } + } + + for(int i=0; i= max_newton_iter) + info("WARNING: reaching max number of iterations\n"); + + // calculate objective value + + double v = 0; + int nnz = 0; + for(j=0; jl; + int n = prob->n; + size_t nnz = 0; + size_t *col_ptr = new size_t [n+1]; + feature_node *x_space; + prob_col->l = l; + prob_col->n = n; + prob_col->y = new double[l]; + prob_col->x = new feature_node*[n]; + prob_col->W = new double[l]; + + for(i=0; iy[i] = prob->y[i]; + prob_col->W[i] = prob->W[i]; + } + + for(i=0; ix[i]; + while(x->index != -1) + { + nnz++; + col_ptr[x->index]++; + x++; + } + } + for(i=1; ix[i] = &x_space[col_ptr[i]]; + + for(i=0; ix[i]; + while(x->index != -1) + { + int ind = x->index-1; + x_space[col_ptr[ind]].index = i+1; // starts from 1 + x_space[col_ptr[ind]].value = x->value; + col_ptr[ind]++; + x++; + } + } + for(i=0; il; + int max_nr_class = 16; + int nr_class = 0; + int *label = Malloc(int,max_nr_class); + int *count = Malloc(int,max_nr_class); + int *data_label = Malloc(int,l); + int i; + + for(i=0;iy[i]; + int j; + for(j=0;j=0 && label[i] > this_label) + { + label[i+1] = label[i]; + count[i+1] = count[i]; + i--; + } + label[i+1] = this_label; + count[i+1] = this_count; + } + + for (i=0; i y[i]; + while(this_label != label[j]) + { + j++; + } + data_label[i] = j; + + } + + /* END MOD */ + +#if 0 + // + // Labels are ordered by their first occurrence in the training set. + // However, for two-class sets with -1/+1 labels and -1 appears first, + // we swap labels to ensure that internally the binary SVM has positive data corresponding to the +1 instances. 
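+	// (This block is compiled out here: the modification above keeps the
+	// labels sorted, so the class order no longer depends on which label
+	// happens to appear first in the data.)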
+ // + if (nr_class == 2 && label[0] == -1 && label[1] == 1) + { + swap(label[0],label[1]); + swap(count[0],count[1]); + for(i=0;ieps; + int max_iter=param->max_iter; + int pos = 0; + int neg = 0; + int n_iter = -1; + for(int i=0;il;i++) + if(prob->y[i] > 0) + pos++; + neg = prob->l - pos; + + double primal_solver_tol = eps*max(min(pos,neg), 1)/prob->l; + + function *fun_obj=NULL; + switch(param->solver_type) + { + case L2R_LR: + { + double *C = new double[prob->l]; + for(int i = 0; i < prob->l; i++) + { + if(prob->y[i] > 0) + C[i] = prob->W[i] * Cp; + else + C[i] = prob->W[i] * Cn; + } + + fun_obj=new l2r_lr_fun(prob, C); + TRON tron_obj(fun_obj, primal_solver_tol, max_iter, blas_functions); + tron_obj.set_print_string(liblinear_print_string); + n_iter=tron_obj.tron(w); + delete fun_obj; + delete[] C; + break; + } + case L2R_L2LOSS_SVC: + { + double *C = new double[prob->l]; + for(int i = 0; i < prob->l; i++) + { + if(prob->y[i] > 0) + C[i] = prob->W[i] * Cp; + else + C[i] = prob->W[i] * Cn; + } + fun_obj=new l2r_l2_svc_fun(prob, C); + TRON tron_obj(fun_obj, primal_solver_tol, max_iter, blas_functions); + tron_obj.set_print_string(liblinear_print_string); + n_iter=tron_obj.tron(w); + delete fun_obj; + delete[] C; + break; + } + case L2R_L2LOSS_SVC_DUAL: + n_iter=solve_l2r_l1l2_svc(prob, w, eps, Cp, Cn, L2R_L2LOSS_SVC_DUAL, max_iter); + break; + case L2R_L1LOSS_SVC_DUAL: + n_iter=solve_l2r_l1l2_svc(prob, w, eps, Cp, Cn, L2R_L1LOSS_SVC_DUAL, max_iter); + break; + case L1R_L2LOSS_SVC: + { + problem prob_col; + feature_node *x_space = NULL; + transpose(prob, &x_space ,&prob_col); + n_iter=solve_l1r_l2_svc(&prob_col, w, primal_solver_tol, Cp, Cn, max_iter); + delete [] prob_col.y; + delete [] prob_col.x; + delete [] prob_col.W; + delete [] x_space; + break; + } + case L1R_LR: + { + problem prob_col; + feature_node *x_space = NULL; + transpose(prob, &x_space ,&prob_col); + n_iter=solve_l1r_lr(&prob_col, w, primal_solver_tol, Cp, Cn, max_iter); + delete [] prob_col.y; + delete [] prob_col.x; + delete [] prob_col.W; + delete [] x_space; + break; + } + case L2R_LR_DUAL: + n_iter=solve_l2r_lr_dual(prob, w, eps, Cp, Cn, max_iter); + break; + case L2R_L2LOSS_SVR: + { + double *C = new double[prob->l]; + for(int i = 0; i < prob->l; i++) + C[i] = prob->W[i] * param->C; + + fun_obj=new l2r_l2_svr_fun(prob, C, param->p); + TRON tron_obj(fun_obj, param->eps, max_iter, blas_functions); + tron_obj.set_print_string(liblinear_print_string); + n_iter=tron_obj.tron(w); + delete fun_obj; + delete[] C; + break; + + } + case L2R_L1LOSS_SVR_DUAL: + n_iter=solve_l2r_l1l2_svr(prob, w, param, L2R_L1LOSS_SVR_DUAL, max_iter); + break; + case L2R_L2LOSS_SVR_DUAL: + n_iter=solve_l2r_l1l2_svr(prob, w, param, L2R_L2LOSS_SVR_DUAL, max_iter); + break; + default: + fprintf(stderr, "ERROR: unknown solver_type\n"); + break; + } + return n_iter; +} + +// +// Remove zero weighed data as libsvm and some liblinear solvers require C > 0. 
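+// Instances with W[i] <= 0 are dropped into a shallow copy of the problem;
+// the feature rows x[i] themselves are shared with the original problem.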
+// +static void remove_zero_weight(problem *newprob, const problem *prob) +{ + int i; + int l = 0; + for(i=0;il;i++) + if(prob->W[i] > 0) l++; + *newprob = *prob; + newprob->l = l; + newprob->x = Malloc(feature_node*,l); + newprob->y = Malloc(double,l); + newprob->W = Malloc(double,l); + + int j = 0; + for(i=0;il;i++) + if(prob->W[i] > 0) + { + newprob->x[j] = prob->x[i]; + newprob->y[j] = prob->y[i]; + newprob->W[j] = prob->W[i]; + j++; + } +} + +// +// Interface functions +// +model* train(const problem *prob, const parameter *param, BlasFunctions *blas_functions) +{ + problem newprob; + remove_zero_weight(&newprob, prob); + prob = &newprob; + int i,j; + int l = prob->l; + int n = prob->n; + int w_size = prob->n; + model *model_ = Malloc(model,1); + + if(prob->bias>=0) + model_->nr_feature=n-1; + else + model_->nr_feature=n; + model_->param = *param; + model_->bias = prob->bias; + + if(check_regression_model(model_)) + { + model_->w = Malloc(double, w_size); + model_->n_iter = Malloc(int, 1); + model_->nr_class = 2; + model_->label = NULL; + model_->n_iter[0] =train_one(prob, param, &model_->w[0], 0, 0, blas_functions); + } + else + { + int nr_class; + int *label = NULL; + int *start = NULL; + int *count = NULL; + int *perm = Malloc(int,l); + + // group training data of the same class + group_classes(prob,&nr_class,&label,&start,&count,perm); + + model_->nr_class=nr_class; + model_->label = Malloc(int,nr_class); + for(i=0;ilabel[i] = label[i]; + + // calculate weighted C + double *weighted_C = Malloc(double, nr_class); + for(i=0;iC; + for(i=0;inr_weight;i++) + { + for(j=0;jweight_label[i] == label[j]) + break; + if(j == nr_class) + fprintf(stderr,"WARNING: class label %d specified in weight is not found\n", param->weight_label[i]); + else + weighted_C[j] *= param->weight[i]; + } + + // constructing the subproblem + feature_node **x = Malloc(feature_node *,l); + for(i=0;ix[perm[i]]; + + int k; + problem sub_prob; + sub_prob.l = l; + sub_prob.n = n; + sub_prob.x = Malloc(feature_node *,sub_prob.l); + sub_prob.y = Malloc(double,sub_prob.l); + sub_prob.W = Malloc(double,sub_prob.l); + for(k=0; kW[perm[k]]; + } + + // multi-class svm by Crammer and Singer + if(param->solver_type == MCSVM_CS) + { + model_->w=Malloc(double, n*nr_class); + model_->n_iter=Malloc(int, 1); + for(i=0;ieps); + model_->n_iter[0]=Solver.Solve(model_->w); + } + else + { + if(nr_class == 2) + { + model_->w=Malloc(double, w_size); + model_->n_iter=Malloc(int, 1); + int e0 = start[0]+count[0]; + k=0; + for(; kn_iter[0]=train_one(&sub_prob, param, &model_->w[0], weighted_C[1], weighted_C[0], blas_functions); + } + else + { + model_->w=Malloc(double, w_size*nr_class); + double *w=Malloc(double, w_size); + model_->n_iter=Malloc(int, nr_class); + for(i=0;in_iter[i]=train_one(&sub_prob, param, w, weighted_C[i], param->C, blas_functions); + + for(int j=0;jw[j*nr_class+i] = w[j]; + } + free(w); + } + + } + + free(x); + free(label); + free(start); + free(count); + free(perm); + free(sub_prob.x); + free(sub_prob.y); + free(sub_prob.W); + free(weighted_C); + free(newprob.x); + free(newprob.y); + free(newprob.W); + } + return model_; +} + +#if 0 +void cross_validation(const problem *prob, const parameter *param, int nr_fold, double *target) +{ + int i; + int *fold_start; + int l = prob->l; + int *perm = Malloc(int,l); + if (nr_fold > l) + { + nr_fold = l; + fprintf(stderr,"WARNING: # folds > # data. 
Will use # folds = # data instead (i.e., leave-one-out cross validation)\n"); + } + fold_start = Malloc(int,nr_fold+1); + for(i=0;ibias; + subprob.n = prob->n; + subprob.l = l-(end-begin); + subprob.x = Malloc(struct feature_node*,subprob.l); + subprob.y = Malloc(double,subprob.l); + + k=0; + for(j=0;jx[perm[j]]; + subprob.y[k] = prob->y[perm[j]]; + ++k; + } + for(j=end;jx[perm[j]]; + subprob.y[k] = prob->y[perm[j]]; + ++k; + } + struct model *submodel = train(&subprob,param); + for(j=begin;jx[perm[j]]); + free_and_destroy_model(&submodel); + free(subprob.x); + free(subprob.y); + } + free(fold_start); + free(perm); +} + +double predict_values(const struct model *model_, const struct feature_node *x, double *dec_values) +{ + int idx; + int n; + if(model_->bias>=0) + n=model_->nr_feature+1; + else + n=model_->nr_feature; + double *w=model_->w; + int nr_class=model_->nr_class; + int i; + int nr_w; + if(nr_class==2 && model_->param.solver_type != MCSVM_CS) + nr_w = 1; + else + nr_w = nr_class; + + const feature_node *lx=x; + for(i=0;iindex)!=-1; lx++) + { + // the dimension of testing data may exceed that of training + if(idx<=n) + for(i=0;ivalue; + } + + if(nr_class==2) + { + if(check_regression_model(model_)) + return dec_values[0]; + else + return (dec_values[0]>0)?model_->label[0]:model_->label[1]; + } + else + { + int dec_max_idx = 0; + for(i=1;i dec_values[dec_max_idx]) + dec_max_idx = i; + } + return model_->label[dec_max_idx]; + } +} + +double predict(const model *model_, const feature_node *x) +{ + double *dec_values = Malloc(double, model_->nr_class); + double label=predict_values(model_, x, dec_values); + free(dec_values); + return label; +} + +double predict_probability(const struct model *model_, const struct feature_node *x, double* prob_estimates) +{ + if(check_probability_model(model_)) + { + int i; + int nr_class=model_->nr_class; + int nr_w; + if(nr_class==2) + nr_w = 1; + else + nr_w = nr_class; + + double label=predict_values(model_, x, prob_estimates); + for(i=0;inr_feature; + int n; + const parameter& param = model_->param; + + if(model_->bias>=0) + n=nr_feature+1; + else + n=nr_feature; + int w_size = n; + FILE *fp = fopen(model_file_name,"w"); + if(fp==NULL) return -1; + + char *old_locale = strdup(setlocale(LC_ALL, NULL)); + setlocale(LC_ALL, "C"); + + int nr_w; + if(model_->nr_class==2 && model_->param.solver_type != MCSVM_CS) + nr_w=1; + else + nr_w=model_->nr_class; + + fprintf(fp, "solver_type %s\n", solver_type_table[param.solver_type]); + fprintf(fp, "nr_class %d\n", model_->nr_class); + + if(model_->label) + { + fprintf(fp, "label"); + for(i=0; inr_class; i++) + fprintf(fp, " %d", model_->label[i]); + fprintf(fp, "\n"); + } + + fprintf(fp, "nr_feature %d\n", nr_feature); + + fprintf(fp, "bias %.16g\n", model_->bias); + + fprintf(fp, "w\n"); + for(i=0; iw[i*nr_w+j]); + fprintf(fp, "\n"); + } + + setlocale(LC_ALL, old_locale); + free(old_locale); + + if (ferror(fp) != 0 || fclose(fp) != 0) return -1; + else return 0; +} + +struct model *load_model(const char *model_file_name) +{ + FILE *fp = fopen(model_file_name,"r"); + if(fp==NULL) return NULL; + + int i; + int nr_feature; + int n; + int nr_class; + double bias; + model *model_ = Malloc(model,1); + parameter& param = model_->param; + + model_->label = NULL; + + char *old_locale = strdup(setlocale(LC_ALL, NULL)); + setlocale(LC_ALL, "C"); + + char cmd[81]; + while(1) + { + fscanf(fp,"%80s",cmd); + if(strcmp(cmd,"solver_type")==0) + { + fscanf(fp,"%80s",cmd); + int i; + for(i=0;solver_type_table[i];i++) + { + 
if(strcmp(solver_type_table[i],cmd)==0) + { + param.solver_type=i; + break; + } + } + if(solver_type_table[i] == NULL) + { + fprintf(stderr,"unknown solver type.\n"); + + setlocale(LC_ALL, old_locale); + free(model_->label); + free(model_); + free(old_locale); + return NULL; + } + } + else if(strcmp(cmd,"nr_class")==0) + { + fscanf(fp,"%d",&nr_class); + model_->nr_class=nr_class; + } + else if(strcmp(cmd,"nr_feature")==0) + { + fscanf(fp,"%d",&nr_feature); + model_->nr_feature=nr_feature; + } + else if(strcmp(cmd,"bias")==0) + { + fscanf(fp,"%lf",&bias); + model_->bias=bias; + } + else if(strcmp(cmd,"w")==0) + { + break; + } + else if(strcmp(cmd,"label")==0) + { + int nr_class = model_->nr_class; + model_->label = Malloc(int,nr_class); + for(int i=0;ilabel[i]); + } + else + { + fprintf(stderr,"unknown text in model file: [%s]\n",cmd); + setlocale(LC_ALL, old_locale); + free(model_->label); + free(model_); + free(old_locale); + return NULL; + } + } + + nr_feature=model_->nr_feature; + if(model_->bias>=0) + n=nr_feature+1; + else + n=nr_feature; + int w_size = n; + int nr_w; + if(nr_class==2 && param.solver_type != MCSVM_CS) + nr_w = 1; + else + nr_w = nr_class; + + model_->w=Malloc(double, w_size*nr_w); + for(i=0; iw[i*nr_w+j]); + fscanf(fp, "\n"); + } + + setlocale(LC_ALL, old_locale); + free(old_locale); + + if (ferror(fp) != 0 || fclose(fp) != 0) return NULL; + + return model_; +} +#endif + +int get_nr_feature(const model *model_) +{ + return model_->nr_feature; +} + +int get_nr_class(const model *model_) +{ + return model_->nr_class; +} + +void get_labels(const model *model_, int* label) +{ + if (model_->label != NULL) + for(int i=0;inr_class;i++) + label[i] = model_->label[i]; +} + +void get_n_iter(const model *model_, int* n_iter) +{ + int labels; + labels = model_->nr_class; + if (labels == 2) + labels = 1; + + if (model_->n_iter != NULL) + for(int i=0;in_iter[i]; +} + +#if 0 +// use inline here for better performance (around 20% faster than the non-inline one) +static inline double get_w_value(const struct model *model_, int idx, int label_idx) +{ + int nr_class = model_->nr_class; + int solver_type = model_->param.solver_type; + const double *w = model_->w; + + if(idx < 0 || idx > model_->nr_feature) + return 0; + if(check_regression_model(model_)) + return w[idx]; + else + { + if(label_idx < 0 || label_idx >= nr_class) + return 0; + if(nr_class == 2 && solver_type != MCSVM_CS) + { + if(label_idx == 0) + return w[idx]; + else + return -w[idx]; + } + else + return w[idx*nr_class+label_idx]; + } +} + +// feat_idx: starting from 1 to nr_feature +// label_idx: starting from 0 to nr_class-1 for classification models; +// for regression models, label_idx is ignored. 
+double get_decfun_coef(const struct model *model_, int feat_idx, int label_idx) +{ + if(feat_idx > model_->nr_feature) + return 0; + return get_w_value(model_, feat_idx-1, label_idx); +} + +double get_decfun_bias(const struct model *model_, int label_idx) +{ + int bias_idx = model_->nr_feature; + double bias = model_->bias; + if(bias <= 0) + return 0; + else + return bias*get_w_value(model_, bias_idx, label_idx); +} +#endif + +void free_model_content(struct model *model_ptr) +{ + if(model_ptr->w != NULL) + free(model_ptr->w); + if(model_ptr->label != NULL) + free(model_ptr->label); + if(model_ptr->n_iter != NULL) + free(model_ptr->n_iter); +} + +void free_and_destroy_model(struct model **model_ptr_ptr) +{ + struct model *model_ptr = *model_ptr_ptr; + if(model_ptr != NULL) + { + free_model_content(model_ptr); + free(model_ptr); + } +} + +void destroy_param(parameter* param) +{ + if(param->weight_label != NULL) + free(param->weight_label); + if(param->weight != NULL) + free(param->weight); +} + +const char *check_parameter(const problem *prob, const parameter *param) +{ + if(param->eps <= 0) + return "eps <= 0"; + + if(param->C <= 0) + return "C <= 0"; + + if(param->p < 0) + return "p < 0"; + + if(param->solver_type != L2R_LR + && param->solver_type != L2R_L2LOSS_SVC_DUAL + && param->solver_type != L2R_L2LOSS_SVC + && param->solver_type != L2R_L1LOSS_SVC_DUAL + && param->solver_type != MCSVM_CS + && param->solver_type != L1R_L2LOSS_SVC + && param->solver_type != L1R_LR + && param->solver_type != L2R_LR_DUAL + && param->solver_type != L2R_L2LOSS_SVR + && param->solver_type != L2R_L2LOSS_SVR_DUAL + && param->solver_type != L2R_L1LOSS_SVR_DUAL) + return "unknown solver type"; + + return NULL; +} + +#if 0 +int check_probability_model(const struct model *model_) +{ + return (model_->param.solver_type==L2R_LR || + model_->param.solver_type==L2R_LR_DUAL || + model_->param.solver_type==L1R_LR); +} +#endif + +int check_regression_model(const struct model *model_) +{ + return (model_->param.solver_type==L2R_L2LOSS_SVR || + model_->param.solver_type==L2R_L1LOSS_SVR_DUAL || + model_->param.solver_type==L2R_L2LOSS_SVR_DUAL); +} + +void set_print_string_function(void (*print_func)(const char*)) +{ + if (print_func == NULL) + liblinear_print_string = &print_string_stdout; + else + liblinear_print_string = print_func; +} diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/src/liblinear/linear.h b/.venv/lib/python3.12/site-packages/sklearn/svm/src/liblinear/linear.h new file mode 100644 index 0000000000000000000000000000000000000000..1dfc1c0ed014943bc797cd89689237761f41568b --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/src/liblinear/linear.h @@ -0,0 +1,86 @@ +#ifndef _LIBLINEAR_H +#define _LIBLINEAR_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "_cython_blas_helpers.h" + +struct feature_node +{ + int index; + double value; +}; + +struct problem +{ + int l, n; + double *y; + struct feature_node **x; + double bias; /* < 0 if no bias term */ + double *W; +}; + +enum { L2R_LR, L2R_L2LOSS_SVC_DUAL, L2R_L2LOSS_SVC, L2R_L1LOSS_SVC_DUAL, MCSVM_CS, L1R_L2LOSS_SVC, L1R_LR, L2R_LR_DUAL, L2R_L2LOSS_SVR = 11, L2R_L2LOSS_SVR_DUAL, L2R_L1LOSS_SVR_DUAL }; /* solver_type */ + +struct parameter +{ + int solver_type; + + /* these are for training only */ + double eps; /* stopping criteria */ + double C; + int nr_weight; + int *weight_label; + double* weight; + int max_iter; + double p; +}; + +struct model +{ + struct parameter param; + int nr_class; /* number of classes */ + int 
nr_feature; + double *w; + int *label; /* label of each class */ + double bias; + int *n_iter; /* no. of iterations of each class */ +}; + +void set_seed(unsigned seed); + +struct model* train(const struct problem *prob, const struct parameter *param, BlasFunctions *blas_functions); +void cross_validation(const struct problem *prob, const struct parameter *param, int nr_fold, double *target); + +double predict_values(const struct model *model_, const struct feature_node *x, double* dec_values); +double predict(const struct model *model_, const struct feature_node *x); +double predict_probability(const struct model *model_, const struct feature_node *x, double* prob_estimates); + +int save_model(const char *model_file_name, const struct model *model_); +struct model *load_model(const char *model_file_name); + +int get_nr_feature(const struct model *model_); +int get_nr_class(const struct model *model_); +void get_labels(const struct model *model_, int* label); +void get_n_iter(const struct model *model_, int* n_iter); +#if 0 +double get_decfun_coef(const struct model *model_, int feat_idx, int label_idx); +double get_decfun_bias(const struct model *model_, int label_idx); +#endif + +void free_model_content(struct model *model_ptr); +void free_and_destroy_model(struct model **model_ptr_ptr); +void destroy_param(struct parameter *param); + +const char *check_parameter(const struct problem *prob, const struct parameter *param); +int check_probability_model(const struct model *model); +int check_regression_model(const struct model *model); +void set_print_string_function(void (*print_func) (const char*)); + +#ifdef __cplusplus +} +#endif + +#endif /* _LIBLINEAR_H */ diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/src/liblinear/tron.cpp b/.venv/lib/python3.12/site-packages/sklearn/svm/src/liblinear/tron.cpp new file mode 100644 index 0000000000000000000000000000000000000000..168a62ca47a2f4850508f6a0130eee3b8bd09194 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/src/liblinear/tron.cpp @@ -0,0 +1,223 @@ +#include +#include +#include +#include +#include "tron.h" + +#ifndef min +template static inline T min(T x,T y) { return (x static inline T max(T x,T y) { return (x>y)?x:y; } +#endif + +static void default_print(const char *buf) +{ + fputs(buf,stdout); + fflush(stdout); +} + +void TRON::info(const char *fmt,...) +{ + char buf[BUFSIZ]; + va_list ap; + va_start(ap,fmt); + vsprintf(buf,fmt,ap); + va_end(ap); + (*tron_print_string)(buf); +} + +TRON::TRON(const function *fun_obj, double eps, int max_iter, BlasFunctions *blas) +{ + this->fun_obj=const_cast(fun_obj); + this->eps=eps; + this->max_iter=max_iter; + this->blas=blas; + tron_print_string = default_print; +} + +TRON::~TRON() +{ +} + +int TRON::tron(double *w) +{ + // Parameters for updating the iterates. + double eta0 = 1e-4, eta1 = 0.25, eta2 = 0.75; + + // Parameters for updating the trust region size delta. 
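+	// delta is shrunk when the actual reduction actred falls well short of
+	// the predicted reduction prered (the eta0/eta1 tests below), and is
+	// enlarged by at most a factor of sigma3 when the model predicts well.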
+ double sigma1 = 0.25, sigma2 = 0.5, sigma3 = 4; + + int n = fun_obj->get_nr_variable(); + int i, cg_iter; + double delta, snorm; + double alpha, f, fnew, prered, actred, gs; + int search = 1, iter = 1, inc = 1; + double *s = new double[n]; + double *r = new double[n]; + double *w_new = new double[n]; + double *g = new double[n]; + + for (i=0; ifun(w); + fun_obj->grad(w, g); + delta = blas->nrm2(n, g, inc); + double gnorm1 = delta; + double gnorm = gnorm1; + + if (gnorm <= eps*gnorm1) + search = 0; + + iter = 1; + + while (iter <= max_iter && search) + { + cg_iter = trcg(delta, g, s, r); + + memcpy(w_new, w, sizeof(double)*n); + blas->axpy(n, 1.0, s, inc, w_new, inc); + + gs = blas->dot(n, g, inc, s, inc); + prered = -0.5*(gs - blas->dot(n, s, inc, r, inc)); + fnew = fun_obj->fun(w_new); + + // Compute the actual reduction. + actred = f - fnew; + + // On the first iteration, adjust the initial step bound. + snorm = blas->nrm2(n, s, inc); + if (iter == 1) + delta = min(delta, snorm); + + // Compute prediction alpha*snorm of the step. + if (fnew - f - gs <= 0) + alpha = sigma3; + else + alpha = max(sigma1, -0.5*(gs/(fnew - f - gs))); + + // Update the trust region bound according to the ratio of actual to predicted reduction. + if (actred < eta0*prered) + delta = min(max(alpha, sigma1)*snorm, sigma2*delta); + else if (actred < eta1*prered) + delta = max(sigma1*delta, min(alpha*snorm, sigma2*delta)); + else if (actred < eta2*prered) + delta = max(sigma1*delta, min(alpha*snorm, sigma3*delta)); + else + delta = max(delta, min(alpha*snorm, sigma3*delta)); + + info("iter %2d act %5.3e pre %5.3e delta %5.3e f %5.3e |g| %5.3e CG %3d\n", iter, actred, prered, delta, f, gnorm, cg_iter); + + if (actred > eta0*prered) + { + iter++; + memcpy(w, w_new, sizeof(double)*n); + f = fnew; + fun_obj->grad(w, g); + + gnorm = blas->nrm2(n, g, inc); + if (gnorm <= eps*gnorm1) + break; + } + if (f < -1.0e+32) + { + info("WARNING: f < -1.0e+32\n"); + break; + } + if (fabs(actred) <= 0 && prered <= 0) + { + info("WARNING: actred and prered <= 0\n"); + break; + } + if (fabs(actred) <= 1.0e-12*fabs(f) && + fabs(prered) <= 1.0e-12*fabs(f)) + { + info("WARNING: actred and prered too small\n"); + break; + } + } + + delete[] g; + delete[] r; + delete[] w_new; + delete[] s; + return --iter; +} + +int TRON::trcg(double delta, double *g, double *s, double *r) +{ + int i, inc = 1; + int n = fun_obj->get_nr_variable(); + double *d = new double[n]; + double *Hd = new double[n]; + double rTr, rnewTrnew, alpha, beta, cgtol; + + for (i=0; inrm2(n, g, inc); + + int cg_iter = 0; + rTr = blas->dot(n, r, inc, r, inc); + while (1) + { + if (blas->nrm2(n, r, inc) <= cgtol) + break; + cg_iter++; + fun_obj->Hv(d, Hd); + + alpha = rTr / blas->dot(n, d, inc, Hd, inc); + blas->axpy(n, alpha, d, inc, s, inc); + if (blas->nrm2(n, s, inc) > delta) + { + info("cg reaches trust region boundary\n"); + alpha = -alpha; + blas->axpy(n, alpha, d, inc, s, inc); + + double std = blas->dot(n, s, inc, d, inc); + double sts = blas->dot(n, s, inc, s, inc); + double dtd = blas->dot(n, d, inc, d, inc); + double dsq = delta*delta; + double rad = sqrt(std*std + dtd*(dsq-sts)); + if (std >= 0) + alpha = (dsq - sts)/(std + rad); + else + alpha = (rad - std)/dtd; + blas->axpy(n, alpha, d, inc, s, inc); + alpha = -alpha; + blas->axpy(n, alpha, Hd, inc, r, inc); + break; + } + alpha = -alpha; + blas->axpy(n, alpha, Hd, inc, r, inc); + rnewTrnew = blas->dot(n, r, inc, r, inc); + beta = rnewTrnew/rTr; + blas->scal(n, beta, d, inc); + blas->axpy(n, 1.0, r, inc, d, 
inc); + rTr = rnewTrnew; + } + + delete[] d; + delete[] Hd; + + return(cg_iter); +} + +double TRON::norm_inf(int n, double *x) +{ + double dmax = fabs(x[0]); + for (int i=1; i= dmax) + dmax = fabs(x[i]); + return(dmax); +} + +void TRON::set_print_string(void (*print_string) (const char *buf)) +{ + tron_print_string = print_string; +} diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/src/liblinear/tron.h b/.venv/lib/python3.12/site-packages/sklearn/svm/src/liblinear/tron.h new file mode 100644 index 0000000000000000000000000000000000000000..735304ed16b6fc28c5900d2be2f41f47a32ccc9a --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/src/liblinear/tron.h @@ -0,0 +1,37 @@ +#ifndef _TRON_H +#define _TRON_H + +#include "_cython_blas_helpers.h" + +class function +{ +public: + virtual double fun(double *w) = 0 ; + virtual void grad(double *w, double *g) = 0 ; + virtual void Hv(double *s, double *Hs) = 0 ; + + virtual int get_nr_variable(void) = 0 ; + virtual ~function(void){} +}; + +class TRON +{ +public: + TRON(const function *fun_obj, double eps = 0.1, int max_iter = 1000, BlasFunctions *blas = 0); + ~TRON(); + + int tron(double *w); + void set_print_string(void (*i_print) (const char *buf)); + +private: + int trcg(double delta, double *g, double *s, double *r); + double norm_inf(int n, double *x); + + double eps; + int max_iter; + function *fun_obj; + BlasFunctions *blas; + void info(const char *fmt,...); + void (*tron_print_string)(const char *buf); +}; +#endif diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/src/libsvm/LIBSVM_CHANGES b/.venv/lib/python3.12/site-packages/sklearn/svm/src/libsvm/LIBSVM_CHANGES new file mode 100644 index 0000000000000000000000000000000000000000..663550b8ddd6fa905d3cec6e02be50faa43859c3 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/src/libsvm/LIBSVM_CHANGES @@ -0,0 +1,11 @@ +Changes to Libsvm + +This is here mainly as checklist for incorporation of new versions of libsvm. + + * Add copyright to files svm.cpp and svm.h + * Add random_seed support and call to srand in fit function + * Improved random number generator (fix on windows, enhancement on other + platforms). See + * invoke scipy blas api for svm kernel function to improve performance with speedup rate of 1.5X to 2X for dense data only. See + * Expose the number of iterations run in optimization. 
See +The changes made with respect to upstream are detailed in the heading of svm.cpp diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/src/libsvm/_svm_cython_blas_helpers.h b/.venv/lib/python3.12/site-packages/sklearn/svm/src/libsvm/_svm_cython_blas_helpers.h new file mode 100644 index 0000000000000000000000000000000000000000..2548c7844d267ec631102ae1f44e48cab2b0a729 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/src/libsvm/_svm_cython_blas_helpers.h @@ -0,0 +1,9 @@ +#ifndef _SVM_CYTHON_BLAS_HELPERS_H +#define _SVM_CYTHON_BLAS_HELPERS_H + +typedef double (*dot_func)(int, const double*, int, const double*, int); +typedef struct BlasFunctions{ + dot_func dot; +} BlasFunctions; + +#endif diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/src/libsvm/libsvm_helper.c b/.venv/lib/python3.12/site-packages/sklearn/svm/src/libsvm/libsvm_helper.c new file mode 100644 index 0000000000000000000000000000000000000000..b87b52a6fbdc244df315c6f03f80b3321c852fdc --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/src/libsvm/libsvm_helper.c @@ -0,0 +1,425 @@ +#include +#define PY_SSIZE_T_CLEAN +#include +#include "svm.h" +#include "_svm_cython_blas_helpers.h" + + +#ifndef MAX + #define MAX(x, y) (((x) > (y)) ? (x) : (y)) +#endif + + +/* + * Some helper methods for libsvm bindings. + * + * We need to access from python some parameters stored in svm_model + * but libsvm does not expose this structure, so we define it here + * along some utilities to convert from numpy arrays. + * + * Authors: The scikit-learn developers + * SPDX-License-Identifier: BSD-3-Clause + * + */ + + +/* + * Convert matrix to sparse representation suitable for libsvm. x is + * expected to be an array of length nrow*ncol. + * + * Typically the matrix will be dense, so we speed up the routine for + * this case. We create a temporary array temp that collects non-zero + * elements and after we just memcpy that to the proper array. + * + * Special care must be taken with indinces, since libsvm indices start + * at 1 and not at 0. + * + * Strictly speaking, the C standard does not require that structs are + * contiguous, but in practice its a reasonable assumption. + * + */ +struct svm_node *dense_to_libsvm (double *x, Py_ssize_t *dims) +{ + struct svm_node *node; + Py_ssize_t len_row = dims[1]; + double *tx = x; + int i; + + node = malloc (dims[0] * sizeof(struct svm_node)); + + if (node == NULL) return NULL; + for (i=0; isvm_type = svm_type; + param->kernel_type = kernel_type; + param->degree = degree; + param->coef0 = coef0; + param->nu = nu; + param->cache_size = cache_size; + param->C = C; + param->eps = eps; + param->p = p; + param->shrinking = shrinking; + param->probability = probability; + param->nr_weight = nr_weight; + param->weight_label = (int *) weight_label; + param->weight = (double *) weight; + param->gamma = gamma; + param->max_iter = max_iter; + param->random_seed = random_seed; +} + +/* + * Fill an svm_problem struct. problem->x will be malloc'd. + */ +void set_problem(struct svm_problem *problem, char *X, char *Y, char *sample_weight, Py_ssize_t *dims, int kernel_type) +{ + if (problem == NULL) return; + problem->l = (int) dims[0]; /* number of samples */ + problem->y = (double *) Y; + problem->x = dense_to_libsvm((double *) X, dims); /* implicit call to malloc */ + problem->W = (double *) sample_weight; +} + +/* + * Create and return an instance of svm_model. 
+ * + * The copy of model->sv_coef should be straightforward, but + * unfortunately to represent a matrix numpy and libsvm use different + * approaches, so it requires some iteration. + * + * Possible issue: on 64 bits, the number of columns that numpy can + * store is a long, but libsvm enforces this number (model->l) to be + * an int, so we might have numpy matrices that do not fit into libsvm's + * data structure. + * + */ +struct svm_model *set_model(struct svm_parameter *param, int nr_class, + char *SV, Py_ssize_t *SV_dims, + char *support, Py_ssize_t *support_dims, + Py_ssize_t *sv_coef_strides, + char *sv_coef, char *rho, char *nSV, + char *probA, char *probB) +{ + struct svm_model *model; + double *dsv_coef = (double *) sv_coef; + int i, m; + + m = nr_class * (nr_class-1)/2; + + if ((model = malloc(sizeof(struct svm_model))) == NULL) + goto model_error; + if ((model->nSV = malloc(nr_class * sizeof(int))) == NULL) + goto nsv_error; + if ((model->label = malloc(nr_class * sizeof(int))) == NULL) + goto label_error; + if ((model->sv_coef = malloc((nr_class-1)*sizeof(double *))) == NULL) + goto sv_coef_error; + if ((model->rho = malloc( m * sizeof(double))) == NULL) + goto rho_error; + + // This is only allocated in dynamic memory while training. + model->n_iter = NULL; + + model->nr_class = nr_class; + model->param = *param; + model->l = (int) support_dims[0]; + + if (param->kernel_type == PRECOMPUTED) { + if ((model->SV = malloc ((model->l) * sizeof(struct svm_node))) == NULL) + goto SV_error; + for (i=0; il; ++i) { + model->SV[i].ind = ((int *) support)[i]; + model->SV[i].values = NULL; + } + } else { + model->SV = dense_to_libsvm((double *) SV, SV_dims); + } + /* + * regression and one-class does not use nSV, label. + * TODO: does this provoke memory leaks (we just malloc'ed them)? + */ + if (param->svm_type < 2) { + memcpy(model->nSV, nSV, model->nr_class * sizeof(int)); + for(i=0; i < model->nr_class; i++) + model->label[i] = i; + } + + for (i=0; i < model->nr_class-1; i++) { + model->sv_coef[i] = dsv_coef + i*(model->l); + } + + for (i=0; irho)[i] = -((double *) rho)[i]; + } + + /* + * just to avoid segfaults, these features are not wrapped but + * svm_destroy_model will try to free them. + */ + + if (param->probability) { + if ((model->probA = malloc(m * sizeof(double))) == NULL) + goto probA_error; + memcpy(model->probA, probA, m * sizeof(double)); + if ((model->probB = malloc(m * sizeof(double))) == NULL) + goto probB_error; + memcpy(model->probB, probB, m * sizeof(double)); + } else { + model->probA = NULL; + model->probB = NULL; + } + + /* We'll free SV ourselves */ + model->free_sv = 0; + return model; + +probB_error: + free(model->probA); +probA_error: + free(model->SV); +SV_error: + free(model->rho); +rho_error: + free(model->sv_coef); +sv_coef_error: + free(model->label); +label_error: + free(model->nSV); +nsv_error: + free(model); +model_error: + return NULL; +} + + + +/* + * Get the number of support vectors in a model. + */ +Py_ssize_t get_l(struct svm_model *model) +{ + return (Py_ssize_t) model->l; +} + +/* + * Get the number of classes in a model, = 2 in regression/one class + * svm. 
+ */ +Py_ssize_t get_nr(struct svm_model *model) +{ + return (Py_ssize_t) model->nr_class; +} + +/* + * Get the number of iterations run in optimization + */ +void copy_n_iter(char *data, struct svm_model *model) +{ + const int n_models = MAX(1, model->nr_class * (model->nr_class-1) / 2); + memcpy(data, model->n_iter, n_models * sizeof(int)); +} + +/* + * Some helpers to convert from libsvm sparse data structures + * model->sv_coef is a double **, whereas data is just a double *, + * so we have to do some stupid copying. + */ +void copy_sv_coef(char *data, struct svm_model *model) +{ + int i, len = model->nr_class-1; + double *temp = (double *) data; + for(i=0; isv_coef[i], sizeof(double) * model->l); + temp += model->l; + } +} + +void copy_intercept(char *data, struct svm_model *model, Py_ssize_t *dims) +{ + /* intercept = -rho */ + Py_ssize_t i, n = dims[0]; + double t, *ddata = (double *) data; + for (i=0; irho[i]; + /* we do this to avoid ugly -0.0 */ + *ddata = (t != 0) ? -t : 0; + ++ddata; + } +} + +/* + * This is a bit more complex since SV are stored as sparse + * structures, so we have to do the conversion on the fly and also + * iterate fast over data. + */ +void copy_SV(char *data, struct svm_model *model, Py_ssize_t *dims) +{ + int i, n = model->l; + double *tdata = (double *) data; + int dim = model->SV[0].dim; + for (i=0; iSV[i].values, dim * sizeof(double)); + tdata += dim; + } +} + +void copy_support (char *data, struct svm_model *model) +{ + memcpy (data, model->sv_ind, (model->l) * sizeof(int)); +} + +/* + * copy svm_model.nSV, an array with the number of SV for each class + * will be NULL in the case of SVR, OneClass + */ +void copy_nSV(char *data, struct svm_model *model) +{ + if (model->label == NULL) return; + memcpy(data, model->nSV, model->nr_class * sizeof(int)); +} + +void copy_probA(char *data, struct svm_model *model, Py_ssize_t * dims) +{ + memcpy(data, model->probA, dims[0] * sizeof(double)); +} + +void copy_probB(char *data, struct svm_model *model, Py_ssize_t * dims) +{ + memcpy(data, model->probB, dims[0] * sizeof(double)); +} + +/* + * Predict using model. + * + * It will return -1 if we run out of memory. 
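+ * The dense input is first converted to libsvm's node representation;
+ * dec_values then receives one prediction per row of predict.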
+ */ +int copy_predict(char *predict, struct svm_model *model, Py_ssize_t *predict_dims, + char *dec_values, BlasFunctions *blas_functions) +{ + double *t = (double *) dec_values; + struct svm_node *predict_nodes; + Py_ssize_t i; + + predict_nodes = dense_to_libsvm((double *) predict, predict_dims); + + if (predict_nodes == NULL) + return -1; + for(i=0; inr_class; + predict_nodes = dense_to_libsvm((double *) predict, predict_dims); + if (predict_nodes == NULL) + return -1; + for(i=0; iSV); + + /* We don't free sv_ind and n_iter, since we did not create them in + set_model */ + /* free(model->sv_ind); + * free(model->n_iter); + */ + free(model->sv_coef); + free(model->rho); + free(model->label); + free(model->probA); + free(model->probB); + free(model->nSV); + free(model); + + return 0; +} + +int free_param(struct svm_parameter *param) +{ + if (param == NULL) return -1; + free(param); + return 0; +} + + +/* borrowed from original libsvm code */ +static void print_null(const char *s) {} + +static void print_string_stdout(const char *s) +{ + fputs(s,stdout); + fflush(stdout); +} + +/* provide convenience wrapper */ +void set_verbosity(int verbosity_flag){ + if (verbosity_flag) + svm_set_print_string_function(&print_string_stdout); + else + svm_set_print_string_function(&print_null); +} diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/src/libsvm/libsvm_sparse_helper.c b/.venv/lib/python3.12/site-packages/sklearn/svm/src/libsvm/libsvm_sparse_helper.c new file mode 100644 index 0000000000000000000000000000000000000000..0ba153647cb8c158de24cb41e69fad90f44b1fc8 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/src/libsvm/libsvm_sparse_helper.c @@ -0,0 +1,472 @@ +#include +#define PY_SSIZE_T_CLEAN +#include +#include "svm.h" +#include "_svm_cython_blas_helpers.h" + + +#ifndef MAX + #define MAX(x, y) (((x) > (y)) ? (x) : (y)) +#endif + + +/* + * Convert scipy.sparse.csr to libsvm's sparse data structure + */ +struct svm_csr_node **csr_to_libsvm (double *values, int* indices, int* indptr, int n_samples) +{ + struct svm_csr_node **sparse, *temp; + int i, j=0, k=0, n; + sparse = malloc (n_samples * sizeof(struct svm_csr_node *)); + + if (sparse == NULL) + return NULL; + + for (i=0; isvm_type = svm_type; + param->kernel_type = kernel_type; + param->degree = degree; + param->coef0 = coef0; + param->nu = nu; + param->cache_size = cache_size; + param->C = C; + param->eps = eps; + param->p = p; + param->shrinking = shrinking; + param->probability = probability; + param->nr_weight = nr_weight; + param->weight_label = (int *) weight_label; + param->weight = (double *) weight; + param->gamma = gamma; + param->max_iter = max_iter; + param->random_seed = random_seed; + return param; +} + + +/* + * Create and return a svm_csr_problem struct from a scipy.sparse.csr matrix. It is + * up to the user to free resulting structure. + * + * TODO: precomputed kernel. 
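+ * The number of samples is taken as n_indptr[0] - 1, i.e. one less than
+ * the length of the CSR indptr array.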
+ */ +struct svm_csr_problem * csr_set_problem (char *values, Py_ssize_t *n_indices, + char *indices, Py_ssize_t *n_indptr, char *indptr, char *Y, + char *sample_weight, int kernel_type) { + + struct svm_csr_problem *problem; + problem = malloc (sizeof (struct svm_csr_problem)); + if (problem == NULL) return NULL; + problem->l = (int) n_indptr[0] - 1; + problem->y = (double *) Y; + problem->x = csr_to_libsvm((double *) values, (int *) indices, + (int *) indptr, problem->l); + /* should be removed once we implement weighted samples */ + problem->W = (double *) sample_weight; + + if (problem->x == NULL) { + free(problem); + return NULL; + } + return problem; +} + + +struct svm_csr_model *csr_set_model(struct svm_parameter *param, int nr_class, + char *SV_data, Py_ssize_t *SV_indices_dims, + char *SV_indices, Py_ssize_t *SV_indptr_dims, + char *SV_intptr, + char *sv_coef, char *rho, char *nSV, + char *probA, char *probB) +{ + struct svm_csr_model *model; + double *dsv_coef = (double *) sv_coef; + int i, m; + + m = nr_class * (nr_class-1)/2; + + if ((model = malloc(sizeof(struct svm_csr_model))) == NULL) + goto model_error; + if ((model->nSV = malloc(nr_class * sizeof(int))) == NULL) + goto nsv_error; + if ((model->label = malloc(nr_class * sizeof(int))) == NULL) + goto label_error; + if ((model->sv_coef = malloc((nr_class-1)*sizeof(double *))) == NULL) + goto sv_coef_error; + if ((model->rho = malloc( m * sizeof(double))) == NULL) + goto rho_error; + + // This is only allocated in dynamic memory while training. + model->n_iter = NULL; + + /* in the case of precomputed kernels we do not use + dense_to_precomputed because we don't want the leading 0. As + indices start at 1 (not at 0) this will work */ + model->l = (int) SV_indptr_dims[0] - 1; + model->SV = csr_to_libsvm((double *) SV_data, (int *) SV_indices, + (int *) SV_intptr, model->l); + model->nr_class = nr_class; + model->param = *param; + + /* + * regression and one-class does not use nSV, label. + */ + if (param->svm_type < 2) { + memcpy(model->nSV, nSV, model->nr_class * sizeof(int)); + for(i=0; i < model->nr_class; i++) + model->label[i] = i; + } + + for (i=0; i < model->nr_class-1; i++) { + /* + * We cannot squash all this mallocs in a single call since + * svm_destroy_model will free each element of the array. + */ + if ((model->sv_coef[i] = malloc((model->l) * sizeof(double))) == NULL) { + int j; + for (j=0; jsv_coef[j]); + goto sv_coef_i_error; + } + memcpy(model->sv_coef[i], dsv_coef, (model->l) * sizeof(double)); + dsv_coef += model->l; + } + + for (i=0; irho)[i] = -((double *) rho)[i]; + } + + /* + * just to avoid segfaults, these features are not wrapped but + * svm_destroy_model will try to free them. 
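+ * probA/probB are therefore either copied here or set to NULL, so that
+ * freeing the model never touches memory it does not own.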
+ */ + + if (param->probability) { + if ((model->probA = malloc(m * sizeof(double))) == NULL) + goto probA_error; + memcpy(model->probA, probA, m * sizeof(double)); + if ((model->probB = malloc(m * sizeof(double))) == NULL) + goto probB_error; + memcpy(model->probB, probB, m * sizeof(double)); + } else { + model->probA = NULL; + model->probB = NULL; + } + + /* We'll free SV ourselves */ + model->free_sv = 0; + return model; + +probB_error: + free(model->probA); +probA_error: + for (i=0; i < model->nr_class-1; i++) + free(model->sv_coef[i]); +sv_coef_i_error: + free(model->rho); +rho_error: + free(model->sv_coef); +sv_coef_error: + free(model->label); +label_error: + free(model->nSV); +nsv_error: + free(model); +model_error: + return NULL; +} + + +/* + * Copy support vectors into a scipy.sparse.csr matrix + */ +int csr_copy_SV (char *data, Py_ssize_t *n_indices, + char *indices, Py_ssize_t *n_indptr, char *indptr, + struct svm_csr_model *model, int n_features) +{ + int i, j, k=0, index; + double *dvalues = (double *) data; + int *iindices = (int *) indices; + int *iindptr = (int *) indptr; + iindptr[0] = 0; + for (i=0; il; ++i) { /* iterate over support vectors */ + index = model->SV[i][0].index; + for(j=0; index >=0 ; ++j) { + iindices[k] = index - 1; + dvalues[k] = model->SV[i][j].value; + index = model->SV[i][j+1].index; + ++k; + } + iindptr[i+1] = k; + } + + return 0; +} + +/* get number of nonzero coefficients in support vectors */ +Py_ssize_t get_nonzero_SV (struct svm_csr_model *model) { + int i, j; + Py_ssize_t count=0; + for (i=0; il; ++i) { + j = 0; + while (model->SV[i][j].index != -1) { + ++j; + ++count; + } + } + return count; +} + + +/* + * Predict using a model, where data is expected to be encoded into a csr matrix. + */ +int csr_copy_predict (Py_ssize_t *data_size, char *data, Py_ssize_t *index_size, + char *index, Py_ssize_t *intptr_size, char *intptr, struct svm_csr_model *model, + char *dec_values, BlasFunctions *blas_functions) { + double *t = (double *) dec_values; + struct svm_csr_node **predict_nodes; + Py_ssize_t i; + + predict_nodes = csr_to_libsvm((double *) data, (int *) index, + (int *) intptr, intptr_size[0]-1); + + if (predict_nodes == NULL) + return -1; + for(i=0; i < intptr_size[0] - 1; ++i) { + *t = svm_csr_predict(model, predict_nodes[i], blas_functions); + free(predict_nodes[i]); + ++t; + } + free(predict_nodes); + return 0; +} + +int csr_copy_predict_values (Py_ssize_t *data_size, char *data, Py_ssize_t *index_size, + char *index, Py_ssize_t *intptr_size, char *intptr, struct svm_csr_model *model, + char *dec_values, int nr_class, BlasFunctions *blas_functions) { + struct svm_csr_node **predict_nodes; + Py_ssize_t i; + + predict_nodes = csr_to_libsvm((double *) data, (int *) index, + (int *) intptr, intptr_size[0]-1); + + if (predict_nodes == NULL) + return -1; + for(i=0; i < intptr_size[0] - 1; ++i) { + svm_csr_predict_values(model, predict_nodes[i], + ((double *) dec_values) + i*nr_class, + blas_functions); + free(predict_nodes[i]); + } + free(predict_nodes); + + return 0; +} + +int csr_copy_predict_proba (Py_ssize_t *data_size, char *data, Py_ssize_t *index_size, + char *index, Py_ssize_t *intptr_size, char *intptr, struct svm_csr_model *model, + char *dec_values, BlasFunctions *blas_functions) { + + struct svm_csr_node **predict_nodes; + Py_ssize_t i; + int m = model->nr_class; + + predict_nodes = csr_to_libsvm((double *) data, (int *) index, + (int *) intptr, intptr_size[0]-1); + + if (predict_nodes == NULL) + return -1; + for(i=0; i < 
intptr_size[0] - 1; ++i) { + svm_csr_predict_probability( + model, predict_nodes[i], ((double *) dec_values) + i*m, blas_functions); + free(predict_nodes[i]); + } + free(predict_nodes); + return 0; +} + + +Py_ssize_t get_nr(struct svm_csr_model *model) +{ + return (Py_ssize_t) model->nr_class; +} + +void copy_intercept(char *data, struct svm_csr_model *model, Py_ssize_t *dims) +{ + /* intercept = -rho */ + Py_ssize_t i, n = dims[0]; + double t, *ddata = (double *) data; + for (i=0; irho[i]; + /* we do this to avoid ugly -0.0 */ + *ddata = (t != 0) ? -t : 0; + ++ddata; + } +} + +void copy_support (char *data, struct svm_csr_model *model) +{ + memcpy (data, model->sv_ind, (model->l) * sizeof(int)); +} + +/* + * Some helpers to convert from libsvm sparse data structures + * model->sv_coef is a double **, whereas data is just a double *, + * so we have to do some stupid copying. + */ +void copy_sv_coef(char *data, struct svm_csr_model *model) +{ + int i, len = model->nr_class-1; + double *temp = (double *) data; + for(i=0; isv_coef[i], sizeof(double) * model->l); + temp += model->l; + } +} + +/* + * Get the number of iterations run in optimization + */ +void copy_n_iter(char *data, struct svm_csr_model *model) +{ + const int n_models = MAX(1, model->nr_class * (model->nr_class-1) / 2); + memcpy(data, model->n_iter, n_models * sizeof(int)); +} + +/* + * Get the number of support vectors in a model. + */ +Py_ssize_t get_l(struct svm_csr_model *model) +{ + return (Py_ssize_t) model->l; +} + +void copy_nSV(char *data, struct svm_csr_model *model) +{ + if (model->label == NULL) return; + memcpy(data, model->nSV, model->nr_class * sizeof(int)); +} + +/* + * same as above with model->label + * TODO: merge in the cython layer + */ +void copy_label(char *data, struct svm_csr_model *model) +{ + if (model->label == NULL) return; + memcpy(data, model->label, model->nr_class * sizeof(int)); +} + +void copy_probA(char *data, struct svm_csr_model *model, Py_ssize_t * dims) +{ + memcpy(data, model->probA, dims[0] * sizeof(double)); +} + +void copy_probB(char *data, struct svm_csr_model *model, Py_ssize_t * dims) +{ + memcpy(data, model->probB, dims[0] * sizeof(double)); +} + + +/* + * Some free routines. Some of them are nontrivial since a lot of + * sharing happens across objects (they *must* be called in the + * correct order) + */ +int free_problem(struct svm_csr_problem *problem) +{ + int i; + if (problem == NULL) return -1; + for (i=0; il; ++i) + free (problem->x[i]); + free (problem->x); + free (problem); + return 0; +} + +int free_model(struct svm_csr_model *model) +{ + /* like svm_free_and_destroy_model, but does not free sv_coef[i] */ + /* We don't free n_iter, since we did not create them in set_model. 
*/ + if (model == NULL) return -1; + free(model->SV); + free(model->sv_coef); + free(model->rho); + free(model->label); + free(model->probA); + free(model->probB); + free(model->nSV); + free(model); + + return 0; +} + +int free_param(struct svm_parameter *param) +{ + if (param == NULL) return -1; + free(param); + return 0; +} + + +int free_model_SV(struct svm_csr_model *model) +{ + int i; + for (i=model->l-1; i>=0; --i) free(model->SV[i]); + /* svn_destroy_model frees model->SV */ + for (i=0; i < model->nr_class-1 ; ++i) free(model->sv_coef[i]); + /* svn_destroy_model frees model->sv_coef */ + return 0; +} + + +/* borrowed from original libsvm code */ +static void print_null(const char *s) {} + +static void print_string_stdout(const char *s) +{ + fputs(s,stdout); + fflush(stdout); +} + +/* provide convenience wrapper */ +void set_verbosity(int verbosity_flag){ + if (verbosity_flag) + svm_set_print_string_function(&print_string_stdout); + else + svm_set_print_string_function(&print_null); +} diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/src/libsvm/libsvm_template.cpp b/.venv/lib/python3.12/site-packages/sklearn/svm/src/libsvm/libsvm_template.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8f6dbd0dfd9ecd81bdd79c74a19d7299e179389d --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/src/libsvm/libsvm_template.cpp @@ -0,0 +1,8 @@ + +/* this is a hack to generate libsvm with both sparse and dense + methods in the same binary*/ + +#define _DENSE_REP +#include "svm.cpp" +#undef _DENSE_REP +#include "svm.cpp" diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/src/libsvm/svm.cpp b/.venv/lib/python3.12/site-packages/sklearn/svm/src/libsvm/svm.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a6f191d6616c968e4e2a31e24a23536da329d873 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/src/libsvm/svm.cpp @@ -0,0 +1,3187 @@ +/* +Copyright (c) 2000-2009 Chih-Chung Chang and Chih-Jen Lin +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither name of copyright holders nor the names of its contributors +may be used to endorse or promote products derived from this software +without specific prior written permission. + + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +/* + Modified 2010: + + - Support for dense data by Ming-Fang Weng + + - Return indices for support vectors, Fabian Pedregosa + + + - Fixes to avoid name collision, Fabian Pedregosa + + - Add support for instance weights, Fabian Pedregosa based on work + by Ming-Wei Chang, Hsuan-Tien Lin, Ming-Hen Tsai, Chia-Hua Ho and + Hsiang-Fu Yu, + . + + - Make labels sorted in svm_group_classes, Fabian Pedregosa. + + Modified 2020: + + - Improved random number generator by using a mersenne twister + tweaked + lemire postprocessor. This fixed a convergence issue on windows targets. + Sylvain Marie, Schneider Electric + see + + Modified 2021: + + - Exposed number of iterations run in optimization, Juan Martín Loyola. + See + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "svm.h" +#include "_svm_cython_blas_helpers.h" +#include "../newrand/newrand.h" + + +#ifndef _LIBSVM_CPP +typedef float Qfloat; +typedef signed char schar; +#ifndef min +template static inline T min(T x,T y) { return (x static inline T max(T x,T y) { return (x>y)?x:y; } +#endif +template static inline void swap(T& x, T& y) { T t=x; x=y; y=t; } +template static inline void clone(T*& dst, S* src, int n) +{ + dst = new T[n]; + memcpy((void *)dst,(void *)src,sizeof(T)*n); +} +static inline double powi(double base, int times) +{ + double tmp = base, ret = 1.0; + + for(int t=times; t>0; t/=2) + { + if(t%2==1) ret*=tmp; + tmp = tmp * tmp; + } + return ret; +} +#define INF HUGE_VAL +#define TAU 1e-12 +#define Malloc(type,n) (type *)malloc((n)*sizeof(type)) + +static void print_string_stdout(const char *s) +{ + fputs(s,stdout); + fflush(stdout); +} +static void (*svm_print_string) (const char *) = &print_string_stdout; + +static void info(const char *fmt,...) +{ + char buf[BUFSIZ]; + va_list ap; + va_start(ap,fmt); + vsprintf(buf,fmt,ap); + va_end(ap); + (*svm_print_string)(buf); +} +#endif +#define _LIBSVM_CPP + + +/* yeah, this is ugly. 
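+(With _DENSE_REP defined, PREFIX(name) below expands to svm_##name inside
+namespace svm; without it, to svm_csr_##name inside namespace svm_csr. That is
+what lets libsvm_template.cpp include this file twice in one binary.)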
It helps us to have unique names for both sparse +and dense versions of this library */ +#ifdef _DENSE_REP + #ifdef PREFIX + #undef PREFIX + #endif + #ifdef NAMESPACE + #undef NAMESPACE + #endif + #define PREFIX(name) svm_##name + #define NAMESPACE svm + namespace svm { +#else + /* sparse representation */ + #ifdef PREFIX + #undef PREFIX + #endif + #ifdef NAMESPACE + #undef NAMESPACE + #endif + #define PREFIX(name) svm_csr_##name + #define NAMESPACE svm_csr + namespace svm_csr { +#endif + + +// +// Kernel Cache +// +// l is the number of total data items +// size is the cache size limit in bytes +// +class Cache +{ +public: + Cache(int l,long int size); + ~Cache(); + + // request data [0,len) + // return some position p where [p,len) need to be filled + // (p >= len if nothing needs to be filled) + int get_data(const int index, Qfloat **data, int len); + void swap_index(int i, int j); +private: + int l; + long int size; + struct head_t + { + head_t *prev, *next; // a circular list + Qfloat *data; + int len; // data[0,len) is cached in this entry + }; + + head_t *head; + head_t lru_head; + void lru_delete(head_t *h); + void lru_insert(head_t *h); +}; + +Cache::Cache(int l_,long int size_):l(l_),size(size_) +{ + head = (head_t *)calloc(l,sizeof(head_t)); // initialized to 0 + size /= sizeof(Qfloat); + size -= l * sizeof(head_t) / sizeof(Qfloat); + size = max(size, 2 * (long int) l); // cache must be large enough for two columns + lru_head.next = lru_head.prev = &lru_head; +} + +Cache::~Cache() +{ + for(head_t *h = lru_head.next; h != &lru_head; h=h->next) + free(h->data); + free(head); +} + +void Cache::lru_delete(head_t *h) +{ + // delete from current location + h->prev->next = h->next; + h->next->prev = h->prev; +} + +void Cache::lru_insert(head_t *h) +{ + // insert to last position + h->next = &lru_head; + h->prev = lru_head.prev; + h->prev->next = h; + h->next->prev = h; +} + +int Cache::get_data(const int index, Qfloat **data, int len) +{ + head_t *h = &head[index]; + if(h->len) lru_delete(h); + int more = len - h->len; + + if(more > 0) + { + // free old space + while(size < more) + { + head_t *old = lru_head.next; + lru_delete(old); + free(old->data); + size += old->len; + old->data = 0; + old->len = 0; + } + + // allocate new space + h->data = (Qfloat *)realloc(h->data,sizeof(Qfloat)*len); + size -= more; + swap(h->len,len); + } + + lru_insert(h); + *data = h->data; + return len; +} + +void Cache::swap_index(int i, int j) +{ + if(i==j) return; + + if(head[i].len) lru_delete(&head[i]); + if(head[j].len) lru_delete(&head[j]); + swap(head[i].data,head[j].data); + swap(head[i].len,head[j].len); + if(head[i].len) lru_insert(&head[i]); + if(head[j].len) lru_insert(&head[j]); + + if(i>j) swap(i,j); + for(head_t *h = lru_head.next; h!=&lru_head; h=h->next) + { + if(h->len > i) + { + if(h->len > j) + swap(h->data[i],h->data[j]); + else + { + // give up + lru_delete(h); + free(h->data); + size += h->len; + h->data = 0; + h->len = 0; + } + } + } +} + +// +// Kernel evaluation +// +// the static method k_function is for doing single kernel evaluation +// the constructor of Kernel prepares to calculate the l*l kernel matrix +// the member function get_Q is for getting one column from the Q Matrix +// +class QMatrix { +public: + virtual Qfloat *get_Q(int column, int len) const = 0; + virtual double *get_QD() const = 0; + virtual void swap_index(int i, int j) const = 0; + virtual ~QMatrix() {} +}; + +class Kernel: public QMatrix { +public: +#ifdef _DENSE_REP + Kernel(int l, PREFIX(node) * x, const 
svm_parameter& param, BlasFunctions *blas_functions); +#else + Kernel(int l, PREFIX(node) * const * x, const svm_parameter& param, BlasFunctions *blas_functions); +#endif + virtual ~Kernel(); + + static double k_function(const PREFIX(node) *x, const PREFIX(node) *y, + const svm_parameter& param, BlasFunctions *blas_functions); + virtual Qfloat *get_Q(int column, int len) const = 0; + virtual double *get_QD() const = 0; + virtual void swap_index(int i, int j) const // no so const... + { + swap(x[i],x[j]); + if(x_square) swap(x_square[i],x_square[j]); + } +protected: + + double (Kernel::*kernel_function)(int i, int j) const; + +private: +#ifdef _DENSE_REP + PREFIX(node) *x; +#else + const PREFIX(node) **x; +#endif + double *x_square; + // scipy blas pointer + BlasFunctions *m_blas; + + // svm_parameter + const int kernel_type; + const int degree; + const double gamma; + const double coef0; + + static double dot(const PREFIX(node) *px, const PREFIX(node) *py, BlasFunctions *blas_functions); +#ifdef _DENSE_REP + static double dot(const PREFIX(node) &px, const PREFIX(node) &py, BlasFunctions *blas_functions); +#endif + + double kernel_linear(int i, int j) const + { + return dot(x[i],x[j],m_blas); + } + double kernel_poly(int i, int j) const + { + return powi(gamma*dot(x[i],x[j],m_blas)+coef0,degree); + } + double kernel_rbf(int i, int j) const + { + return exp(-gamma*(x_square[i]+x_square[j]-2*dot(x[i],x[j],m_blas))); + } + double kernel_sigmoid(int i, int j) const + { + return tanh(gamma*dot(x[i],x[j],m_blas)+coef0); + } + double kernel_precomputed(int i, int j) const + { +#ifdef _DENSE_REP + return (x+i)->values[x[j].ind]; +#else + return x[i][(int)(x[j][0].value)].value; +#endif + } +}; + +#ifdef _DENSE_REP +Kernel::Kernel(int l, PREFIX(node) * x_, const svm_parameter& param, BlasFunctions *blas_functions) +#else +Kernel::Kernel(int l, PREFIX(node) * const * x_, const svm_parameter& param, BlasFunctions *blas_functions) +#endif +:kernel_type(param.kernel_type), degree(param.degree), + gamma(param.gamma), coef0(param.coef0) +{ + m_blas = blas_functions; + switch(kernel_type) + { + case LINEAR: + kernel_function = &Kernel::kernel_linear; + break; + case POLY: + kernel_function = &Kernel::kernel_poly; + break; + case RBF: + kernel_function = &Kernel::kernel_rbf; + break; + case SIGMOID: + kernel_function = &Kernel::kernel_sigmoid; + break; + case PRECOMPUTED: + kernel_function = &Kernel::kernel_precomputed; + break; + } + + clone(x,x_,l); + + if(kernel_type == RBF) + { + x_square = new double[l]; + for(int i=0;idim, py->dim); + sum = blas_functions->dot(dim, px->values, 1, py->values, 1); + return sum; +} + +double Kernel::dot(const PREFIX(node) &px, const PREFIX(node) &py, BlasFunctions *blas_functions) +{ + double sum = 0; + + int dim = min(px.dim, py.dim); + sum = blas_functions->dot(dim, px.values, 1, py.values, 1); + return sum; +} +#else +double Kernel::dot(const PREFIX(node) *px, const PREFIX(node) *py, BlasFunctions *blas_functions) +{ + double sum = 0; + while(px->index != -1 && py->index != -1) + { + if(px->index == py->index) + { + sum += px->value * py->value; + ++px; + ++py; + } + else + { + if(px->index > py->index) + ++py; + else + ++px; + } + } + return sum; +} +#endif + +double Kernel::k_function(const PREFIX(node) *x, const PREFIX(node) *y, + const svm_parameter& param, BlasFunctions *blas_functions) +{ + switch(param.kernel_type) + { + case LINEAR: + return dot(x,y,blas_functions); + case POLY: + return powi(param.gamma*dot(x,y,blas_functions)+param.coef0,param.degree); + 
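+		/* RBF: k(x,y) = exp(-gamma*||x-y||^2). The dense branch builds the
+		   difference vector and squares it with one BLAS dot (plus the tails
+		   when the dims differ); the sparse branch walks both index lists. */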
case RBF: + { + double sum = 0; +#ifdef _DENSE_REP + int dim = min(x->dim, y->dim), i; + double* m_array = (double*)malloc(sizeof(double)*dim); + for (i = 0; i < dim; i++) + { + m_array[i] = x->values[i] - y->values[i]; + } + sum = blas_functions->dot(dim, m_array, 1, m_array, 1); + free(m_array); + for (; i < x->dim; i++) + sum += x->values[i] * x->values[i]; + for (; i < y->dim; i++) + sum += y->values[i] * y->values[i]; +#else + while(x->index != -1 && y->index !=-1) + { + if(x->index == y->index) + { + double d = x->value - y->value; + sum += d*d; + ++x; + ++y; + } + else + { + if(x->index > y->index) + { + sum += y->value * y->value; + ++y; + } + else + { + sum += x->value * x->value; + ++x; + } + } + } + + while(x->index != -1) + { + sum += x->value * x->value; + ++x; + } + + while(y->index != -1) + { + sum += y->value * y->value; + ++y; + } +#endif + return exp(-param.gamma*sum); + } + case SIGMOID: + return tanh(param.gamma*dot(x,y,blas_functions)+param.coef0); + case PRECOMPUTED: //x: test (validation), y: SV + { +#ifdef _DENSE_REP + return x->values[y->ind]; +#else + return x[(int)(y->value)].value; +#endif + } + default: + return 0; // Unreachable + } +} +// An SMO algorithm in Fan et al., JMLR 6(2005), p. 1889--1918 +// Solves: +// +// min 0.5(\alpha^T Q \alpha) + p^T \alpha +// +// y^T \alpha = \delta +// y_i = +1 or -1 +// 0 <= alpha_i <= Cp for y_i = 1 +// 0 <= alpha_i <= Cn for y_i = -1 +// +// Given: +// +// Q, p, y, Cp, Cn, and an initial feasible point \alpha +// l is the size of vectors and matrices +// eps is the stopping tolerance +// +// solution will be put in \alpha, objective value will be put in obj +// + +class Solver { +public: + Solver() {}; + virtual ~Solver() {}; + + struct SolutionInfo { + double obj; + double rho; + double *upper_bound; + double r; // for Solver_NU + bool solve_timed_out; + int n_iter; + }; + + void Solve(int l, const QMatrix& Q, const double *p_, const schar *y_, + double *alpha_, const double *C_, double eps, + SolutionInfo* si, int shrinking, int max_iter); +protected: + int active_size; + schar *y; + double *G; // gradient of objective function + enum { LOWER_BOUND, UPPER_BOUND, FREE }; + char *alpha_status; // LOWER_BOUND, UPPER_BOUND, FREE + double *alpha; + const QMatrix *Q; + const double *QD; + double eps; + double Cp,Cn; + double *C; + double *p; + int *active_set; + double *G_bar; // gradient, if we treat free variables as 0 + int l; + bool unshrink; // XXX + + double get_C(int i) + { + return C[i]; + } + void update_alpha_status(int i) + { + if(alpha[i] >= get_C(i)) + alpha_status[i] = UPPER_BOUND; + else if(alpha[i] <= 0) + alpha_status[i] = LOWER_BOUND; + else alpha_status[i] = FREE; + } + bool is_upper_bound(int i) { return alpha_status[i] == UPPER_BOUND; } + bool is_lower_bound(int i) { return alpha_status[i] == LOWER_BOUND; } + bool is_free(int i) { return alpha_status[i] == FREE; } + void swap_index(int i, int j); + void reconstruct_gradient(); + virtual int select_working_set(int &i, int &j); + virtual double calculate_rho(); + virtual void do_shrinking(); +private: + bool be_shrunk(int i, double Gmax1, double Gmax2); +}; + +void Solver::swap_index(int i, int j) +{ + Q->swap_index(i,j); + swap(y[i],y[j]); + swap(G[i],G[j]); + swap(alpha_status[i],alpha_status[j]); + swap(alpha[i],alpha[j]); + swap(p[i],p[j]); + swap(active_set[i],active_set[j]); + swap(G_bar[i],G_bar[j]); + swap(C[i], C[j]); +} + +void Solver::reconstruct_gradient() +{ + // reconstruct inactive elements of G from G_bar and free variables + + 
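+	// for an inactive i this recomputes
+	//     G[i] = G_bar[i] + p[i] + sum over free j of alpha[j]*Q_ij
+	// (upper-bounded variables are already accounted for in G_bar)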
if(active_size == l) return; + + int i,j; + int nr_free = 0; + + for(j=active_size;j 2*active_size*(l-active_size)) + { + for(i=active_size;iget_Q(i,active_size); + for(j=0;jget_Q(i,l); + double alpha_i = alpha[i]; + for(j=active_size;jl = l; + this->Q = &Q; + QD=Q.get_QD(); + clone(p, p_,l); + clone(y, y_,l); + clone(alpha,alpha_,l); + clone(C, C_, l); + this->eps = eps; + unshrink = false; + si->solve_timed_out = false; + + // initialize alpha_status + { + alpha_status = new char[l]; + for(int i=0;i= max_iter)) { + info("WARN: libsvm Solver reached max_iter"); + si->solve_timed_out = true; + break; + } + + // show progress and do shrinking + + if(--counter == 0) + { + counter = min(l,1000); + if(shrinking) do_shrinking(); + info("."); + } + + int i,j; + if(select_working_set(i,j)!=0) + { + // reconstruct the whole gradient + reconstruct_gradient(); + // reset active set size and check + active_size = l; + info("*"); + if(select_working_set(i,j)!=0) + break; + else + counter = 1; // do shrinking next iteration + } + + ++iter; + + // update alpha[i] and alpha[j], handle bounds carefully + + const Qfloat *Q_i = Q.get_Q(i,active_size); + const Qfloat *Q_j = Q.get_Q(j,active_size); + + double C_i = get_C(i); + double C_j = get_C(j); + + double old_alpha_i = alpha[i]; + double old_alpha_j = alpha[j]; + + if(y[i]!=y[j]) + { + double quad_coef = QD[i]+QD[j]+2*Q_i[j]; + if (quad_coef <= 0) + quad_coef = TAU; + double delta = (-G[i]-G[j])/quad_coef; + double diff = alpha[i] - alpha[j]; + alpha[i] += delta; + alpha[j] += delta; + + if(diff > 0) + { + if(alpha[j] < 0) + { + alpha[j] = 0; + alpha[i] = diff; + } + } + else + { + if(alpha[i] < 0) + { + alpha[i] = 0; + alpha[j] = -diff; + } + } + if(diff > C_i - C_j) + { + if(alpha[i] > C_i) + { + alpha[i] = C_i; + alpha[j] = C_i - diff; + } + } + else + { + if(alpha[j] > C_j) + { + alpha[j] = C_j; + alpha[i] = C_j + diff; + } + } + } + else + { + double quad_coef = QD[i]+QD[j]-2*Q_i[j]; + if (quad_coef <= 0) + quad_coef = TAU; + double delta = (G[i]-G[j])/quad_coef; + double sum = alpha[i] + alpha[j]; + alpha[i] -= delta; + alpha[j] += delta; + + if(sum > C_i) + { + if(alpha[i] > C_i) + { + alpha[i] = C_i; + alpha[j] = sum - C_i; + } + } + else + { + if(alpha[j] < 0) + { + alpha[j] = 0; + alpha[i] = sum; + } + } + if(sum > C_j) + { + if(alpha[j] > C_j) + { + alpha[j] = C_j; + alpha[i] = sum - C_j; + } + } + else + { + if(alpha[i] < 0) + { + alpha[i] = 0; + alpha[j] = sum; + } + } + } + + // update G + + double delta_alpha_i = alpha[i] - old_alpha_i; + double delta_alpha_j = alpha[j] - old_alpha_j; + + for(int k=0;krho = calculate_rho(); + + // calculate objective value + { + double v = 0; + int i; + for(i=0;iobj = v/2; + } + + // put back the solution + { + for(int i=0;iupper_bound[i] = C[i]; + + // store number of iterations + si->n_iter = iter; + + info("\noptimization finished, #iter = %d\n",iter); + + delete[] p; + delete[] y; + delete[] alpha; + delete[] alpha_status; + delete[] active_set; + delete[] G; + delete[] G_bar; + delete[] C; +} + +// return 1 if already optimal, return 0 otherwise +int Solver::select_working_set(int &out_i, int &out_j) +{ + // return i,j such that + // i: maximizes -y_i * grad(f)_i, i in I_up(\alpha) + // j: minimizes the decrease of obj value + // (if quadratic coefficient <= 0, replace it with tau) + // -y_j*grad(f)_j < -y_i*grad(f)_i, j in I_low(\alpha) + + double Gmax = -INF; + double Gmax2 = -INF; + int Gmax_idx = -1; + int Gmin_idx = -1; + double obj_diff_min = INF; + + for(int t=0;t= Gmax) + { + Gmax = -G[t]; + 
Gmax_idx = t; + } + } + else + { + if(!is_lower_bound(t)) + if(G[t] >= Gmax) + { + Gmax = G[t]; + Gmax_idx = t; + } + } + + int i = Gmax_idx; + const Qfloat *Q_i = NULL; + if(i != -1) // NULL Q_i not accessed: Gmax=-INF if i=-1 + Q_i = Q->get_Q(i,active_size); + + for(int j=0;j= Gmax2) + Gmax2 = G[j]; + if (grad_diff > 0) + { + double obj_diff; + double quad_coef = QD[i]+QD[j]-2.0*y[i]*Q_i[j]; + if (quad_coef > 0) + obj_diff = -(grad_diff*grad_diff)/quad_coef; + else + obj_diff = -(grad_diff*grad_diff)/TAU; + + if (obj_diff <= obj_diff_min) + { + Gmin_idx=j; + obj_diff_min = obj_diff; + } + } + } + } + else + { + if (!is_upper_bound(j)) + { + double grad_diff= Gmax-G[j]; + if (-G[j] >= Gmax2) + Gmax2 = -G[j]; + if (grad_diff > 0) + { + double obj_diff; + double quad_coef = QD[i]+QD[j]+2.0*y[i]*Q_i[j]; + if (quad_coef > 0) + obj_diff = -(grad_diff*grad_diff)/quad_coef; + else + obj_diff = -(grad_diff*grad_diff)/TAU; + + if (obj_diff <= obj_diff_min) + { + Gmin_idx=j; + obj_diff_min = obj_diff; + } + } + } + } + } + + if(Gmax+Gmax2 < eps || Gmin_idx == -1) + return 1; + + out_i = Gmax_idx; + out_j = Gmin_idx; + return 0; +} + +bool Solver::be_shrunk(int i, double Gmax1, double Gmax2) +{ + if(is_upper_bound(i)) + { + if(y[i]==+1) + return(-G[i] > Gmax1); + else + return(-G[i] > Gmax2); + } + else if(is_lower_bound(i)) + { + if(y[i]==+1) + return(G[i] > Gmax2); + else + return(G[i] > Gmax1); + } + else + return(false); +} + +void Solver::do_shrinking() +{ + int i; + double Gmax1 = -INF; // max { -y_i * grad(f)_i | i in I_up(\alpha) } + double Gmax2 = -INF; // max { y_i * grad(f)_i | i in I_low(\alpha) } + + // find maximal violating pair first + for(i=0;i= Gmax1) + Gmax1 = -G[i]; + } + if(!is_lower_bound(i)) + { + if(G[i] >= Gmax2) + Gmax2 = G[i]; + } + } + else + { + if(!is_upper_bound(i)) + { + if(-G[i] >= Gmax2) + Gmax2 = -G[i]; + } + if(!is_lower_bound(i)) + { + if(G[i] >= Gmax1) + Gmax1 = G[i]; + } + } + } + + if(unshrink == false && Gmax1 + Gmax2 <= eps*10) + { + unshrink = true; + reconstruct_gradient(); + active_size = l; + info("*"); + } + + for(i=0;i i) + { + if (!be_shrunk(active_size, Gmax1, Gmax2)) + { + swap_index(i,active_size); + break; + } + active_size--; + } + } +} + +double Solver::calculate_rho() +{ + double r; + int nr_free = 0; + double ub = INF, lb = -INF, sum_free = 0; + for(int i=0;i0) + r = sum_free/nr_free; + else + r = (ub+lb)/2; + + return r; +} + +// +// Solver for nu-svm classification and regression +// +// additional constraint: e^T \alpha = constant +// +class Solver_NU : public Solver +{ +public: + Solver_NU() {} + void Solve(int l, const QMatrix& Q, const double *p, const schar *y, + double *alpha, const double *C_, double eps, + SolutionInfo* si, int shrinking, int max_iter) + { + this->si = si; + Solver::Solve(l,Q,p,y,alpha,C_,eps,si,shrinking,max_iter); + } +private: + SolutionInfo *si; + int select_working_set(int &i, int &j); + double calculate_rho(); + bool be_shrunk(int i, double Gmax1, double Gmax2, double Gmax3, double Gmax4); + void do_shrinking(); +}; + +// return 1 if already optimal, return 0 otherwise +int Solver_NU::select_working_set(int &out_i, int &out_j) +{ + // return i,j such that y_i = y_j and + // i: maximizes -y_i * grad(f)_i, i in I_up(\alpha) + // j: minimizes the decrease of obj value + // (if quadratic coefficient <= 0, replace it with tau) + // -y_j*grad(f)_j < -y_i*grad(f)_i, j in I_low(\alpha) + + double Gmaxp = -INF; + double Gmaxp2 = -INF; + int Gmaxp_idx = -1; + + double Gmaxn = -INF; + double Gmaxn2 = -INF; + int Gmaxn_idx 
= -1; + + int Gmin_idx = -1; + double obj_diff_min = INF; + + for(int t=0;t= Gmaxp) + { + Gmaxp = -G[t]; + Gmaxp_idx = t; + } + } + else + { + if(!is_lower_bound(t)) + if(G[t] >= Gmaxn) + { + Gmaxn = G[t]; + Gmaxn_idx = t; + } + } + + int ip = Gmaxp_idx; + int in = Gmaxn_idx; + const Qfloat *Q_ip = NULL; + const Qfloat *Q_in = NULL; + if(ip != -1) // NULL Q_ip not accessed: Gmaxp=-INF if ip=-1 + Q_ip = Q->get_Q(ip,active_size); + if(in != -1) + Q_in = Q->get_Q(in,active_size); + + for(int j=0;j= Gmaxp2) + Gmaxp2 = G[j]; + if (grad_diff > 0) + { + double obj_diff; + double quad_coef = QD[ip]+QD[j]-2*Q_ip[j]; + if (quad_coef > 0) + obj_diff = -(grad_diff*grad_diff)/quad_coef; + else + obj_diff = -(grad_diff*grad_diff)/TAU; + + if (obj_diff <= obj_diff_min) + { + Gmin_idx=j; + obj_diff_min = obj_diff; + } + } + } + } + else + { + if (!is_upper_bound(j)) + { + double grad_diff=Gmaxn-G[j]; + if (-G[j] >= Gmaxn2) + Gmaxn2 = -G[j]; + if (grad_diff > 0) + { + double obj_diff; + double quad_coef = QD[in]+QD[j]-2*Q_in[j]; + if (quad_coef > 0) + obj_diff = -(grad_diff*grad_diff)/quad_coef; + else + obj_diff = -(grad_diff*grad_diff)/TAU; + + if (obj_diff <= obj_diff_min) + { + Gmin_idx=j; + obj_diff_min = obj_diff; + } + } + } + } + } + + if(max(Gmaxp+Gmaxp2,Gmaxn+Gmaxn2) < eps || Gmin_idx == -1) + return 1; + + if (y[Gmin_idx] == +1) + out_i = Gmaxp_idx; + else + out_i = Gmaxn_idx; + out_j = Gmin_idx; + + return 0; +} + +bool Solver_NU::be_shrunk(int i, double Gmax1, double Gmax2, double Gmax3, double Gmax4) +{ + if(is_upper_bound(i)) + { + if(y[i]==+1) + return(-G[i] > Gmax1); + else + return(-G[i] > Gmax4); + } + else if(is_lower_bound(i)) + { + if(y[i]==+1) + return(G[i] > Gmax2); + else + return(G[i] > Gmax3); + } + else + return(false); +} + +void Solver_NU::do_shrinking() +{ + double Gmax1 = -INF; // max { -y_i * grad(f)_i | y_i = +1, i in I_up(\alpha) } + double Gmax2 = -INF; // max { y_i * grad(f)_i | y_i = +1, i in I_low(\alpha) } + double Gmax3 = -INF; // max { -y_i * grad(f)_i | y_i = -1, i in I_up(\alpha) } + double Gmax4 = -INF; // max { y_i * grad(f)_i | y_i = -1, i in I_low(\alpha) } + + // find maximal violating pair first + int i; + for(i=0;i Gmax1) Gmax1 = -G[i]; + } + else if(-G[i] > Gmax4) Gmax4 = -G[i]; + } + if(!is_lower_bound(i)) + { + if(y[i]==+1) + { + if(G[i] > Gmax2) Gmax2 = G[i]; + } + else if(G[i] > Gmax3) Gmax3 = G[i]; + } + } + + if(unshrink == false && max(Gmax1+Gmax2,Gmax3+Gmax4) <= eps*10) + { + unshrink = true; + reconstruct_gradient(); + active_size = l; + } + + for(i=0;i i) + { + if (!be_shrunk(active_size, Gmax1, Gmax2, Gmax3, Gmax4)) + { + swap_index(i,active_size); + break; + } + active_size--; + } + } +} + +double Solver_NU::calculate_rho() +{ + int nr_free1 = 0,nr_free2 = 0; + double ub1 = INF, ub2 = INF; + double lb1 = -INF, lb2 = -INF; + double sum_free1 = 0, sum_free2 = 0; + + for(int i=0;i 0) + r1 = sum_free1/nr_free1; + else + r1 = (ub1+lb1)/2; + + if(nr_free2 > 0) + r2 = sum_free2/nr_free2; + else + r2 = (ub2+lb2)/2; + + si->r = (r1+r2)/2; + return (r1-r2)/2; +} + +// +// Q matrices for various formulations +// +class SVC_Q: public Kernel +{ +public: + SVC_Q(const PREFIX(problem)& prob, const svm_parameter& param, const schar *y_, BlasFunctions *blas_functions) + :Kernel(prob.l, prob.x, param, blas_functions) + { + clone(y,y_,prob.l); + cache = new Cache(prob.l,(long int)(param.cache_size*(1<<20))); + QD = new double[prob.l]; + for(int i=0;i*kernel_function)(i,i); + } + + Qfloat *get_Q(int i, int len) const + { + Qfloat *data; + int start, j; + 
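+		// get_data returns the first entry of column i that is not cached yet;
+		// positions [start, len) are filled with y_i*y_j*K(i,j) below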
if((start = cache->get_data(i,&data,len)) < len) + { + for(j=start;j*kernel_function)(i,j)); + } + return data; + } + + double *get_QD() const + { + return QD; + } + + void swap_index(int i, int j) const + { + cache->swap_index(i,j); + Kernel::swap_index(i,j); + swap(y[i],y[j]); + swap(QD[i],QD[j]); + } + + ~SVC_Q() + { + delete[] y; + delete cache; + delete[] QD; + } +private: + schar *y; + Cache *cache; + double *QD; +}; + +class ONE_CLASS_Q: public Kernel +{ +public: + ONE_CLASS_Q(const PREFIX(problem)& prob, const svm_parameter& param, BlasFunctions *blas_functions) + :Kernel(prob.l, prob.x, param, blas_functions) + { + cache = new Cache(prob.l,(long int)(param.cache_size*(1<<20))); + QD = new double[prob.l]; + for(int i=0;i*kernel_function)(i,i); + } + + Qfloat *get_Q(int i, int len) const + { + Qfloat *data; + int start, j; + if((start = cache->get_data(i,&data,len)) < len) + { + for(j=start;j*kernel_function)(i,j); + } + return data; + } + + double *get_QD() const + { + return QD; + } + + void swap_index(int i, int j) const + { + cache->swap_index(i,j); + Kernel::swap_index(i,j); + swap(QD[i],QD[j]); + } + + ~ONE_CLASS_Q() + { + delete cache; + delete[] QD; + } +private: + Cache *cache; + double *QD; +}; + +class SVR_Q: public Kernel +{ +public: + SVR_Q(const PREFIX(problem)& prob, const svm_parameter& param, BlasFunctions *blas_functions) + :Kernel(prob.l, prob.x, param, blas_functions) + { + l = prob.l; + cache = new Cache(l,(long int)(param.cache_size*(1<<20))); + QD = new double[2*l]; + sign = new schar[2*l]; + index = new int[2*l]; + for(int k=0;k*kernel_function)(k,k); + QD[k+l] = QD[k]; + } + buffer[0] = new Qfloat[2*l]; + buffer[1] = new Qfloat[2*l]; + next_buffer = 0; + } + + void swap_index(int i, int j) const + { + swap(sign[i],sign[j]); + swap(index[i],index[j]); + swap(QD[i],QD[j]); + } + + Qfloat *get_Q(int i, int len) const + { + Qfloat *data; + int j, real_i = index[i]; + if(cache->get_data(real_i,&data,l) < l) + { + for(j=0;j*kernel_function)(real_i,j); + } + + // reorder and copy + Qfloat *buf = buffer[next_buffer]; + next_buffer = 1 - next_buffer; + schar si = sign[i]; + for(j=0;jl; + double *minus_ones = new double[l]; + schar *y = new schar[l]; + double *C = new double[l]; + + int i; + + for(i=0;iy[i] > 0) + { + y[i] = +1; + C[i] = prob->W[i]*Cp; + } + else + { + y[i] = -1; + C[i] = prob->W[i]*Cn; + } + } + + Solver s; + s.Solve(l, SVC_Q(*prob,*param,y, blas_functions), minus_ones, y, + alpha, C, param->eps, si, param->shrinking, + param->max_iter); + + /* + double sum_alpha=0; + for(i=0;il)); + */ + + for(i=0;il; + double nu = param->nu; + + schar *y = new schar[l]; + double *C = new double[l]; + + for(i=0;iy[i]>0) + y[i] = +1; + else + y[i] = -1; + + C[i] = prob->W[i]; + } + + double nu_l = 0; + for(i=0;ieps, si, param->shrinking, param->max_iter); + double r = si->r; + + info("C = %f\n",1/r); + + for(i=0;iupper_bound[i] /= r; + } + + si->rho /= r; + si->obj /= (r*r); + + delete[] C; + delete[] y; + delete[] zeros; +} + +static void solve_one_class( + const PREFIX(problem) *prob, const svm_parameter *param, + double *alpha, Solver::SolutionInfo* si, BlasFunctions *blas_functions) +{ + int l = prob->l; + double *zeros = new double[l]; + schar *ones = new schar[l]; + double *C = new double[l]; + int i; + + double nu_l = 0; + + for(i=0;iW[i]; + nu_l += C[i] * param->nu; + } + + i = 0; + while(nu_l > 0) + { + alpha[i] = min(C[i],nu_l); + nu_l -= alpha[i]; + ++i; + } + for(;ieps, si, param->shrinking, param->max_iter); + + delete[] C; + delete[] zeros; + delete[] 
ones; +} + +static void solve_epsilon_svr( + const PREFIX(problem) *prob, const svm_parameter *param, + double *alpha, Solver::SolutionInfo* si, BlasFunctions *blas_functions) +{ + int l = prob->l; + double *alpha2 = new double[2*l]; + double *linear_term = new double[2*l]; + schar *y = new schar[2*l]; + double *C = new double[2*l]; + int i; + + for(i=0;ip - prob->y[i]; + y[i] = 1; + C[i] = prob->W[i]*param->C; + + alpha2[i+l] = 0; + linear_term[i+l] = param->p + prob->y[i]; + y[i+l] = -1; + C[i+l] = prob->W[i]*param->C; + } + + Solver s; + s.Solve(2*l, SVR_Q(*prob,*param,blas_functions), linear_term, y, + alpha2, C, param->eps, si, param->shrinking, param->max_iter); + + double sum_alpha = 0; + for(i=0;il; + double *C = new double[2*l]; + double *alpha2 = new double[2*l]; + double *linear_term = new double[2*l]; + schar *y = new schar[2*l]; + int i; + + double sum = 0; + for(i=0;iW[i]*param->C; + sum += C[i] * param->nu; + } + sum /= 2; + + for(i=0;iy[i]; + y[i] = 1; + + linear_term[i+l] = prob->y[i]; + y[i+l] = -1; + } + + Solver_NU s; + s.Solve(2*l, SVR_Q(*prob,*param,blas_functions), linear_term, y, + alpha2, C, param->eps, si, param->shrinking, param->max_iter); + + info("epsilon = %f\n",-si->r); + + for(i=0;il); + Solver::SolutionInfo si; + switch(param->svm_type) + { + case C_SVC: + si.upper_bound = Malloc(double,prob->l); + solve_c_svc(prob,param,alpha,&si,Cp,Cn,blas_functions); + break; + case NU_SVC: + si.upper_bound = Malloc(double,prob->l); + solve_nu_svc(prob,param,alpha,&si,blas_functions); + break; + case ONE_CLASS: + si.upper_bound = Malloc(double,prob->l); + solve_one_class(prob,param,alpha,&si,blas_functions); + break; + case EPSILON_SVR: + si.upper_bound = Malloc(double,2*prob->l); + solve_epsilon_svr(prob,param,alpha,&si,blas_functions); + break; + case NU_SVR: + si.upper_bound = Malloc(double,2*prob->l); + solve_nu_svr(prob,param,alpha,&si,blas_functions); + break; + } + + *status |= si.solve_timed_out; + + info("obj = %f, rho = %f\n",si.obj,si.rho); + + // output SVs + + int nSV = 0; + int nBSV = 0; + for(int i=0;il;i++) + { + if(fabs(alpha[i]) > 0) + { + ++nSV; + if(prob->y[i] > 0) + { + if(fabs(alpha[i]) >= si.upper_bound[i]) + ++nBSV; + } + else + { + if(fabs(alpha[i]) >= si.upper_bound[i]) + ++nBSV; + } + } + } + + free(si.upper_bound); + + info("nSV = %d, nBSV = %d\n",nSV,nBSV); + + decision_function f; + f.alpha = alpha; + f.rho = si.rho; + f.n_iter = si.n_iter; + return f; +} + +// Platt's binary SVM Probabilistic Output: an improvement from Lin et al. 
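+//
+// sigmoid_train fits P(y=1|f) = 1/(1+exp(A*f+B)) to the decision values f by
+// maximum likelihood: Newton steps on (A,B) with a backtracking line search,
+// a small ridge (sigma) on the Hessian for numerical stability, and smoothed
+// targets (N+ + 1)/(N+ + 2) and 1/(N- + 2) in place of hard 1/0 labels.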
+static void sigmoid_train( + int l, const double *dec_values, const double *labels, + double& A, double& B) +{ + double prior1=0, prior0 = 0; + int i; + + for (i=0;i 0) prior1+=1; + else prior0+=1; + + int max_iter=100; // Maximal number of iterations + double min_step=1e-10; // Minimal step taken in line search + double sigma=1e-12; // For numerically strict PD of Hessian + double eps=1e-5; + double hiTarget=(prior1+1.0)/(prior1+2.0); + double loTarget=1/(prior0+2.0); + double *t=Malloc(double,l); + double fApB,p,q,h11,h22,h21,g1,g2,det,dA,dB,gd,stepsize; + double newA,newB,newf,d1,d2; + int iter; + + // Initial Point and Initial Fun Value + A=0.0; B=log((prior0+1.0)/(prior1+1.0)); + double fval = 0.0; + + for (i=0;i0) t[i]=hiTarget; + else t[i]=loTarget; + fApB = dec_values[i]*A+B; + if (fApB>=0) + fval += t[i]*fApB + log(1+exp(-fApB)); + else + fval += (t[i] - 1)*fApB +log(1+exp(fApB)); + } + for (iter=0;iter= 0) + { + p=exp(-fApB)/(1.0+exp(-fApB)); + q=1.0/(1.0+exp(-fApB)); + } + else + { + p=1.0/(1.0+exp(fApB)); + q=exp(fApB)/(1.0+exp(fApB)); + } + d2=p*q; + h11+=dec_values[i]*dec_values[i]*d2; + h22+=d2; + h21+=dec_values[i]*d2; + d1=t[i]-p; + g1+=dec_values[i]*d1; + g2+=d1; + } + + // Stopping Criteria + if (fabs(g1)= min_step) + { + newA = A + stepsize * dA; + newB = B + stepsize * dB; + + // New function value + newf = 0.0; + for (i=0;i= 0) + newf += t[i]*fApB + log(1+exp(-fApB)); + else + newf += (t[i] - 1)*fApB +log(1+exp(fApB)); + } + // Check sufficient decrease + if (newf=max_iter) + info("Reaching maximal iterations in two-class probability estimates\n"); + free(t); +} + +static double sigmoid_predict(double decision_value, double A, double B) +{ + double fApB = decision_value*A+B; + // 1-p used later; avoid catastrophic cancellation + if (fApB >= 0) + return exp(-fApB)/(1.0+exp(-fApB)); + else + return 1.0/(1+exp(fApB)) ; +} + +// Method 2 from the multiclass_prob paper by Wu, Lin, and Weng +static void multiclass_probability(int k, double **r, double *p) +{ + int t,j; + int iter = 0, max_iter=max(100,k); + double **Q=Malloc(double *,k); + double *Qp=Malloc(double,k); + double pQp, eps=0.005/k; + + for (t=0;tmax_error) + max_error=error; + } + if (max_error=max_iter) + info("Exceeds max_iter in multiclass_prob\n"); + for(t=0;tl); + double *dec_values = Malloc(double,prob->l); + + // random shuffle + for(i=0;il;i++) perm[i]=i; + for(i=0;il;i++) + { + int j = i+bounded_rand_int(prob->l-i); + swap(perm[i],perm[j]); + } + for(i=0;il/nr_fold; + int end = (i+1)*prob->l/nr_fold; + int j,k; + struct PREFIX(problem) subprob; + + subprob.l = prob->l-(end-begin); +#ifdef _DENSE_REP + subprob.x = Malloc(struct PREFIX(node),subprob.l); +#else + subprob.x = Malloc(struct PREFIX(node)*,subprob.l); +#endif + subprob.y = Malloc(double,subprob.l); + subprob.W = Malloc(double,subprob.l); + + k=0; + for(j=0;jx[perm[j]]; + subprob.y[k] = prob->y[perm[j]]; + subprob.W[k] = prob->W[perm[j]]; + ++k; + } + for(j=end;jl;j++) + { + subprob.x[k] = prob->x[perm[j]]; + subprob.y[k] = prob->y[perm[j]]; + subprob.W[k] = prob->W[perm[j]]; + ++k; + } + int p_count=0,n_count=0; + for(j=0;j0) + p_count++; + else + n_count++; + + if(p_count==0 && n_count==0) + for(j=begin;j 0 && n_count == 0) + for(j=begin;j 0) + for(j=begin;jx+perm[j]),&(dec_values[perm[j]]), blas_functions); +#else + PREFIX(predict_values)(submodel,prob->x[perm[j]],&(dec_values[perm[j]]), blas_functions); +#endif + // ensure +1 -1 order; reason not using CV subroutine + dec_values[perm[j]] *= submodel->label[0]; + } + 
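+			// out-of-fold decision values accumulate in dec_values[]; after the
+			// fold loop, sigmoid_train fits (probA, probB) on all of them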
PREFIX(free_and_destroy_model)(&submodel); + PREFIX(destroy_param)(&subparam); + } + free(subprob.x); + free(subprob.y); + free(subprob.W); + } + sigmoid_train(prob->l,dec_values,prob->y,probA,probB); + free(dec_values); + free(perm); +} + +// Return parameter of a Laplace distribution +static double svm_svr_probability( + const PREFIX(problem) *prob, const svm_parameter *param, BlasFunctions *blas_functions) +{ + int i; + int nr_fold = 5; + double *ymv = Malloc(double,prob->l); + double mae = 0; + + svm_parameter newparam = *param; + newparam.probability = 0; + newparam.random_seed = -1; // This is called from train, which already sets + // the seed. + PREFIX(cross_validation)(prob,&newparam,nr_fold,ymv, blas_functions); + for(i=0;il;i++) + { + ymv[i]=prob->y[i]-ymv[i]; + mae += fabs(ymv[i]); + } + mae /= prob->l; + double std=sqrt(2*mae*mae); + int count=0; + mae=0; + for(i=0;il;i++) + if (fabs(ymv[i]) > 5*std) + count=count+1; + else + mae+=fabs(ymv[i]); + mae /= (prob->l-count); + info("Prob. model for test data: target value = predicted value + z,\nz: Laplace distribution e^(-|z|/sigma)/(2sigma),sigma= %g\n",mae); + free(ymv); + return mae; +} + + + +// label: label name, start: begin of each class, count: #data of classes, perm: indices to the original data +// perm, length l, must be allocated before calling this subroutine +static void svm_group_classes(const PREFIX(problem) *prob, int *nr_class_ret, int **label_ret, int **start_ret, int **count_ret, int *perm) +{ + int l = prob->l; + int max_nr_class = 16; + int nr_class = 0; + int *label = Malloc(int,max_nr_class); + int *count = Malloc(int,max_nr_class); + int *data_label = Malloc(int,l); + int i, j, this_label, this_count; + + for(i=0;iy[i]; + for(j=0;j=0 && label[i] > this_label) + { + label[i+1] = label[i]; + count[i+1] = count[i]; + i--; + } + label[i+1] = this_label; + count[i+1] = this_count; + } + + for (i=0; iy[i]; + while(this_label != label[j]){ + j ++; + } + data_label[i] = j; + } + + int *start = Malloc(int,nr_class); + start[0] = 0; + for(i=1;i 0. 
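+// (it keeps only samples with W[i] > 0 and copies their x/y/W entries into
+// freshly allocated arrays for the reduced problem)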
+// +static void remove_zero_weight(PREFIX(problem) *newprob, const PREFIX(problem) *prob) +{ + int i; + int l = 0; + for(i=0;il;i++) + if(prob->W[i] > 0) l++; + *newprob = *prob; + newprob->l = l; +#ifdef _DENSE_REP + newprob->x = Malloc(PREFIX(node),l); +#else + newprob->x = Malloc(PREFIX(node) *,l); +#endif + newprob->y = Malloc(double,l); + newprob->W = Malloc(double,l); + + int j = 0; + for(i=0;il;i++) + if(prob->W[i] > 0) + { + newprob->x[j] = prob->x[i]; + newprob->y[j] = prob->y[i]; + newprob->W[j] = prob->W[i]; + j++; + } +} + +// +// Interface functions +// +PREFIX(model) *PREFIX(train)(const PREFIX(problem) *prob, const svm_parameter *param, + int *status, BlasFunctions *blas_functions) +{ + PREFIX(problem) newprob; + remove_zero_weight(&newprob, prob); + prob = &newprob; + + PREFIX(model) *model = Malloc(PREFIX(model),1); + model->param = *param; + model->free_sv = 0; // XXX + + if(param->random_seed >= 0) + { + set_seed(param->random_seed); + } + + if(param->svm_type == ONE_CLASS || + param->svm_type == EPSILON_SVR || + param->svm_type == NU_SVR) + { + // regression or one-class-svm + model->nr_class = 2; + model->label = NULL; + model->nSV = NULL; + model->probA = NULL; model->probB = NULL; + model->sv_coef = Malloc(double *,1); + + if(param->probability && + (param->svm_type == EPSILON_SVR || + param->svm_type == NU_SVR)) + { + model->probA = Malloc(double,1); + model->probA[0] = NAMESPACE::svm_svr_probability(prob,param,blas_functions); + } + + NAMESPACE::decision_function f = NAMESPACE::svm_train_one(prob,param,0,0, status,blas_functions); + model->rho = Malloc(double,1); + model->rho[0] = f.rho; + model->n_iter = Malloc(int,1); + model->n_iter[0] = f.n_iter; + + int nSV = 0; + int i; + for(i=0;il;i++) + if(fabs(f.alpha[i]) > 0) ++nSV; + model->l = nSV; +#ifdef _DENSE_REP + model->SV = Malloc(PREFIX(node),nSV); +#else + model->SV = Malloc(PREFIX(node) *,nSV); +#endif + model->sv_ind = Malloc(int, nSV); + model->sv_coef[0] = Malloc(double, nSV); + int j = 0; + for(i=0;il;i++) + if(fabs(f.alpha[i]) > 0) + { + model->SV[j] = prob->x[i]; + model->sv_ind[j] = i; + model->sv_coef[0][j] = f.alpha[i]; + ++j; + } + + free(f.alpha); + } + else + { + // classification + int l = prob->l; + int nr_class; + int *label = NULL; + int *start = NULL; + int *count = NULL; + int *perm = Malloc(int,l); + + // group training data of the same class + NAMESPACE::svm_group_classes(prob,&nr_class,&label,&start,&count,perm); +#ifdef _DENSE_REP + PREFIX(node) *x = Malloc(PREFIX(node),l); +#else + PREFIX(node) **x = Malloc(PREFIX(node) *,l); +#endif + double *W = Malloc(double, l); + + int i; + for(i=0;ix[perm[i]]; + W[i] = prob->W[perm[i]]; + } + + // calculate weighted C + + double *weighted_C = Malloc(double, nr_class); + for(i=0;iC; + for(i=0;inr_weight;i++) + { + int j; + for(j=0;jweight_label[i] == label[j]) + break; + if(j == nr_class) + fprintf(stderr,"warning: class label %d specified in weight is not found\n", param->weight_label[i]); + else + weighted_C[j] *= param->weight[i]; + } + + // train k*(k-1)/2 models + + bool *nonzero = Malloc(bool,l); + for(i=0;iprobability) + { + probA=Malloc(double,nr_class*(nr_class-1)/2); + probB=Malloc(double,nr_class*(nr_class-1)/2); + } + + int p = 0; + for(i=0;iprobability) + NAMESPACE::svm_binary_svc_probability(&sub_prob,param,weighted_C[i],weighted_C[j],probA[p],probB[p], status, blas_functions); + + f[p] = NAMESPACE::svm_train_one(&sub_prob,param,weighted_C[i],weighted_C[j], status, blas_functions); + for(k=0;k 0) + nonzero[si+k] = true; + for(k=0;k 
0) + nonzero[sj+k] = true; + free(sub_prob.x); + free(sub_prob.y); + free(sub_prob.W); + ++p; + } + + // build output + + model->nr_class = nr_class; + + model->label = Malloc(int,nr_class); + for(i=0;ilabel[i] = label[i]; + + model->rho = Malloc(double,nr_class*(nr_class-1)/2); + model->n_iter = Malloc(int,nr_class*(nr_class-1)/2); + for(i=0;irho[i] = f[i].rho; + model->n_iter[i] = f[i].n_iter; + } + + if(param->probability) + { + model->probA = Malloc(double,nr_class*(nr_class-1)/2); + model->probB = Malloc(double,nr_class*(nr_class-1)/2); + for(i=0;iprobA[i] = probA[i]; + model->probB[i] = probB[i]; + } + } + else + { + model->probA=NULL; + model->probB=NULL; + } + + int total_sv = 0; + int *nz_count = Malloc(int,nr_class); + model->nSV = Malloc(int,nr_class); + for(i=0;inSV[i] = nSV; + nz_count[i] = nSV; + } + + info("Total nSV = %d\n",total_sv); + + model->l = total_sv; + model->sv_ind = Malloc(int, total_sv); +#ifdef _DENSE_REP + model->SV = Malloc(PREFIX(node),total_sv); +#else + model->SV = Malloc(PREFIX(node) *,total_sv); +#endif + p = 0; + for(i=0;iSV[p] = x[i]; + model->sv_ind[p] = perm[i]; + ++p; + } + } + + int *nz_start = Malloc(int,nr_class); + nz_start[0] = 0; + for(i=1;isv_coef = Malloc(double *,nr_class-1); + for(i=0;isv_coef[i] = Malloc(double,total_sv); + + p = 0; + for(i=0;isv_coef[j-1][q++] = f[p].alpha[k]; + q = nz_start[j]; + for(k=0;ksv_coef[i][q++] = f[p].alpha[ci+k]; + ++p; + } + + free(label); + free(probA); + free(probB); + free(count); + free(perm); + free(start); + free(W); + free(x); + free(weighted_C); + free(nonzero); + for(i=0;il; + int *perm = Malloc(int,l); + int nr_class; + if(param->random_seed >= 0) + { + set_seed(param->random_seed); + } + + // stratified cv may not give leave-one-out rate + // Each class to l folds -> some folds may have zero elements + if((param->svm_type == C_SVC || + param->svm_type == NU_SVC) && nr_fold < l) + { + int *start = NULL; + int *label = NULL; + int *count = NULL; + NAMESPACE::svm_group_classes(prob,&nr_class,&label,&start,&count,perm); + + // random shuffle and then data grouped by fold using the array perm + int *fold_count = Malloc(int,nr_fold); + int c; + int *index = Malloc(int,l); + for(i=0;ix[perm[j]]; + subprob.y[k] = prob->y[perm[j]]; + subprob.W[k] = prob->W[perm[j]]; + ++k; + } + for(j=end;jx[perm[j]]; + subprob.y[k] = prob->y[perm[j]]; + subprob.W[k] = prob->W[perm[j]]; + ++k; + } + int dummy_status = 0; // IGNORES TIMEOUT ERRORS + struct PREFIX(model) *submodel = PREFIX(train)(&subprob,param, &dummy_status, blas_functions); + if(param->probability && + (param->svm_type == C_SVC || param->svm_type == NU_SVC)) + { + double *prob_estimates=Malloc(double, PREFIX(get_nr_class)(submodel)); + for(j=begin;jx + perm[j]),prob_estimates, blas_functions); +#else + target[perm[j]] = PREFIX(predict_probability)(submodel,prob->x[perm[j]],prob_estimates, blas_functions); +#endif + free(prob_estimates); + } + else + for(j=begin;jx+perm[j],blas_functions); +#else + target[perm[j]] = PREFIX(predict)(submodel,prob->x[perm[j]],blas_functions); +#endif + PREFIX(free_and_destroy_model)(&submodel); + free(subprob.x); + free(subprob.y); + free(subprob.W); + } + free(fold_start); + free(perm); +} + + +int PREFIX(get_svm_type)(const PREFIX(model) *model) +{ + return model->param.svm_type; +} + +int PREFIX(get_nr_class)(const PREFIX(model) *model) +{ + return model->nr_class; +} + +void PREFIX(get_labels)(const PREFIX(model) *model, int* label) +{ + if (model->label != NULL) + for(int i=0;inr_class;i++) + label[i] = model->label[i]; 
+} + +double PREFIX(get_svr_probability)(const PREFIX(model) *model) +{ + if ((model->param.svm_type == EPSILON_SVR || model->param.svm_type == NU_SVR) && + model->probA!=NULL) + return model->probA[0]; + else + { + fprintf(stderr,"Model doesn't contain information for SVR probability inference\n"); + return 0; + } +} + +double PREFIX(predict_values)(const PREFIX(model) *model, const PREFIX(node) *x, double* dec_values, BlasFunctions *blas_functions) +{ + int i; + if(model->param.svm_type == ONE_CLASS || + model->param.svm_type == EPSILON_SVR || + model->param.svm_type == NU_SVR) + { + double *sv_coef = model->sv_coef[0]; + double sum = 0; + + for(i=0;il;i++) +#ifdef _DENSE_REP + sum += sv_coef[i] * NAMESPACE::Kernel::k_function(x,model->SV+i,model->param,blas_functions); +#else + sum += sv_coef[i] * NAMESPACE::Kernel::k_function(x,model->SV[i],model->param,blas_functions); +#endif + sum -= model->rho[0]; + *dec_values = sum; + + if(model->param.svm_type == ONE_CLASS) + return (sum>0)?1:-1; + else + return sum; + } + else + { + int nr_class = model->nr_class; + int l = model->l; + + double *kvalue = Malloc(double,l); + for(i=0;iSV+i,model->param,blas_functions); +#else + kvalue[i] = NAMESPACE::Kernel::k_function(x,model->SV[i],model->param,blas_functions); +#endif + + int *start = Malloc(int,nr_class); + start[0] = 0; + for(i=1;inSV[i-1]; + + int *vote = Malloc(int,nr_class); + for(i=0;inSV[i]; + int cj = model->nSV[j]; + + int k; + double *coef1 = model->sv_coef[j-1]; + double *coef2 = model->sv_coef[i]; + for(k=0;krho[p]; + dec_values[p] = sum; + + if(dec_values[p] > 0) + ++vote[i]; + else + ++vote[j]; + p++; + } + + int vote_max_idx = 0; + for(i=1;i vote[vote_max_idx]) + vote_max_idx = i; + + free(kvalue); + free(start); + free(vote); + return model->label[vote_max_idx]; + } +} + +double PREFIX(predict)(const PREFIX(model) *model, const PREFIX(node) *x, BlasFunctions *blas_functions) +{ + int nr_class = model->nr_class; + double *dec_values; + if(model->param.svm_type == ONE_CLASS || + model->param.svm_type == EPSILON_SVR || + model->param.svm_type == NU_SVR) + dec_values = Malloc(double, 1); + else + dec_values = Malloc(double, nr_class*(nr_class-1)/2); + double pred_result = PREFIX(predict_values)(model, x, dec_values, blas_functions); + free(dec_values); + return pred_result; +} + +double PREFIX(predict_probability)( + const PREFIX(model) *model, const PREFIX(node) *x, double *prob_estimates, BlasFunctions *blas_functions) +{ + if ((model->param.svm_type == C_SVC || model->param.svm_type == NU_SVC) && + model->probA!=NULL && model->probB!=NULL) + { + int i; + int nr_class = model->nr_class; + double *dec_values = Malloc(double, nr_class*(nr_class-1)/2); + PREFIX(predict_values)(model, x, dec_values, blas_functions); + + double min_prob=1e-7; + double **pairwise_prob=Malloc(double *,nr_class); + for(i=0;iprobA[k],model->probB[k]),min_prob),1-min_prob); + pairwise_prob[j][i]=1-pairwise_prob[i][j]; + k++; + } + NAMESPACE::multiclass_probability(nr_class,pairwise_prob,prob_estimates); + + int prob_max_idx = 0; + for(i=1;i prob_estimates[prob_max_idx]) + prob_max_idx = i; + for(i=0;ilabel[prob_max_idx]; + } + else + return PREFIX(predict)(model, x, blas_functions); +} + + +void PREFIX(free_model_content)(PREFIX(model)* model_ptr) +{ + if(model_ptr->free_sv && model_ptr->l > 0 && model_ptr->SV != NULL) +#ifdef _DENSE_REP + for (int i = 0; i < model_ptr->l; i++) + free(model_ptr->SV[i].values); +#else + free((void *)(model_ptr->SV[0])); +#endif + + if(model_ptr->sv_coef) + { + for(int 
i=0;inr_class-1;i++) + free(model_ptr->sv_coef[i]); + } + + free(model_ptr->SV); + model_ptr->SV = NULL; + + free(model_ptr->sv_coef); + model_ptr->sv_coef = NULL; + + free(model_ptr->sv_ind); + model_ptr->sv_ind = NULL; + + free(model_ptr->rho); + model_ptr->rho = NULL; + + free(model_ptr->label); + model_ptr->label= NULL; + + free(model_ptr->probA); + model_ptr->probA = NULL; + + free(model_ptr->probB); + model_ptr->probB= NULL; + + free(model_ptr->nSV); + model_ptr->nSV = NULL; + + free(model_ptr->n_iter); + model_ptr->n_iter = NULL; +} + +void PREFIX(free_and_destroy_model)(PREFIX(model)** model_ptr_ptr) +{ + if(model_ptr_ptr != NULL && *model_ptr_ptr != NULL) + { + PREFIX(free_model_content)(*model_ptr_ptr); + free(*model_ptr_ptr); + *model_ptr_ptr = NULL; + } +} + +void PREFIX(destroy_param)(svm_parameter* param) +{ + free(param->weight_label); + free(param->weight); +} + +const char *PREFIX(check_parameter)(const PREFIX(problem) *prob, const svm_parameter *param) +{ + // svm_type + + int svm_type = param->svm_type; + if(svm_type != C_SVC && + svm_type != NU_SVC && + svm_type != ONE_CLASS && + svm_type != EPSILON_SVR && + svm_type != NU_SVR) + return "unknown svm type"; + + // kernel_type, degree + + int kernel_type = param->kernel_type; + if(kernel_type != LINEAR && + kernel_type != POLY && + kernel_type != RBF && + kernel_type != SIGMOID && + kernel_type != PRECOMPUTED) + return "unknown kernel type"; + + if(param->gamma < 0) + return "gamma < 0"; + + if(param->degree < 0) + return "degree of polynomial kernel < 0"; + + // cache_size,eps,C,nu,p,shrinking + + if(param->cache_size <= 0) + return "cache_size <= 0"; + + if(param->eps <= 0) + return "eps <= 0"; + + if(svm_type == C_SVC || + svm_type == EPSILON_SVR || + svm_type == NU_SVR) + if(param->C <= 0) + return "C <= 0"; + + if(svm_type == NU_SVC || + svm_type == ONE_CLASS || + svm_type == NU_SVR) + if(param->nu <= 0 || param->nu > 1) + return "nu <= 0 or nu > 1"; + + if(svm_type == EPSILON_SVR) + if(param->p < 0) + return "p < 0"; + + if(param->shrinking != 0 && + param->shrinking != 1) + return "shrinking != 0 and shrinking != 1"; + + if(param->probability != 0 && + param->probability != 1) + return "probability != 0 and probability != 1"; + + if(param->probability == 1 && + svm_type == ONE_CLASS) + return "one-class SVM probability output not supported yet"; + + + // check whether nu-svc is feasible + + if(svm_type == NU_SVC) + { + int l = prob->l; + int max_nr_class = 16; + int nr_class = 0; + int *label = Malloc(int,max_nr_class); + double *count = Malloc(double,max_nr_class); + + int i; + for(i=0;iy[i]; + int j; + for(j=0;jW[i]; + break; + } + if(j == nr_class) + { + if(nr_class == max_nr_class) + { + max_nr_class *= 2; + label = (int *)realloc(label,max_nr_class*sizeof(int)); + count = (double *)realloc(count,max_nr_class*sizeof(double)); + + } + label[nr_class] = this_label; + count[nr_class] = prob->W[i]; + ++nr_class; + } + } + + for(i=0;inu*(n1+n2)/2 > min(n1,n2)) + { + free(label); + free(count); + return "specified nu is infeasible"; + } + } + } + free(label); + free(count); + } + + if(svm_type == C_SVC || + svm_type == EPSILON_SVR || + svm_type == NU_SVR || + svm_type == ONE_CLASS) + { + PREFIX(problem) newprob; + // filter samples with negative and null weights + remove_zero_weight(&newprob, prob); + + // all samples were removed + if(newprob.l == 0) { + free(newprob.x); + free(newprob.y); + free(newprob.W); + return "Invalid input - all samples have zero or negative weights."; + } + else if(prob->l != newprob.l && 
+ svm_type == C_SVC) + { + bool only_one_label = true; + int first_label = newprob.y[0]; + for(int i=1;i + */ +#ifndef _NEWRAND_H +#define _NEWRAND_H + +#ifdef __cplusplus +#include // needed for cython to generate a .cpp file from newrand.h +extern "C" { +#endif + +// Scikit-Learn-specific random number generator replacing `rand()` originally +// used in LibSVM / LibLinear, to ensure the same behaviour on windows-linux, +// with increased speed +// - (1) Init a `mt_rand` object +std::mt19937 mt_rand(std::mt19937::default_seed); + +// - (2) public `set_seed()` function that should be used instead of `srand()` to set a new seed. +void set_seed(unsigned custom_seed) { + mt_rand.seed(custom_seed); +} + +// - (3) New internal `bounded_rand_int` function, used instead of rand() everywhere. +inline uint32_t bounded_rand_int(uint32_t range) { + // "LibSVM / LibLinear Original way" - make a 31bit positive + // random number and use modulo to make it fit in the range + // return abs( (int)mt_rand()) % range; + + // "Better way": tweaked Lemire post-processor + // from http://www.pcg-random.org/posts/bounded-rands.html + uint32_t x = mt_rand(); + uint64_t m = uint64_t(x) * uint64_t(range); + uint32_t l = uint32_t(m); + if (l < range) { + uint32_t t = -range; + if (t >= range) { + t -= range; + if (t >= range) + t %= range; + } + while (l < t) { + x = mt_rand(); + m = uint64_t(x) * uint64_t(range); + l = uint32_t(m); + } + } + return m >> 32; +} + +#ifdef __cplusplus +} +#endif + +#endif /* _NEWRAND_H */ diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/svm/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/tests/test_bounds.py b/.venv/lib/python3.12/site-packages/sklearn/svm/tests/test_bounds.py new file mode 100644 index 0000000000000000000000000000000000000000..af7e8cfb1159d1c7520d4b506015727c80391cad --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/tests/test_bounds.py @@ -0,0 +1,147 @@ +import numpy as np +import pytest +from scipy import stats + +from sklearn.linear_model import LogisticRegression +from sklearn.svm import LinearSVC +from sklearn.svm._bounds import l1_min_c +from sklearn.svm._newrand import bounded_rand_int_wrap, set_seed_wrap +from sklearn.utils.fixes import CSR_CONTAINERS + +dense_X = [[-1, 0], [0, 1], [1, 1], [1, 1]] + +Y1 = [0, 1, 1, 1] +Y2 = [2, 1, 0, 0] + + +# TODO(1.8): remove filterwarnings after the deprecation of liblinear multiclass +# and maybe remove LogisticRegression from this test +@pytest.mark.filterwarnings( + "ignore:.*'liblinear' solver for multiclass classification is deprecated.*" +) +@pytest.mark.parametrize("X_container", CSR_CONTAINERS + [np.array]) +@pytest.mark.parametrize("loss", ["squared_hinge", "log"]) +@pytest.mark.parametrize("Y_label", ["two-classes", "multi-class"]) +@pytest.mark.parametrize("intercept_label", ["no-intercept", "fit-intercept"]) +def test_l1_min_c(X_container, loss, Y_label, intercept_label): + Ys = {"two-classes": Y1, "multi-class": Y2} + intercepts = { + "no-intercept": {"fit_intercept": False}, + "fit-intercept": {"fit_intercept": True, "intercept_scaling": 10}, + } + + X = X_container(dense_X) + Y = Ys[Y_label] + intercept_params = intercepts[intercept_label] + check_l1_min_c(X, Y, loss, **intercept_params) + + +def check_l1_min_c(X, y, loss, fit_intercept=True, intercept_scaling=1.0): + min_c = 
l1_min_c( + X, + y, + loss=loss, + fit_intercept=fit_intercept, + intercept_scaling=intercept_scaling, + ) + + clf = { + "log": LogisticRegression(penalty="l1", solver="liblinear"), + "squared_hinge": LinearSVC(loss="squared_hinge", penalty="l1", dual=False), + }[loss] + + clf.fit_intercept = fit_intercept + clf.intercept_scaling = intercept_scaling + + clf.C = min_c + clf.fit(X, y) + assert (np.asarray(clf.coef_) == 0).all() + assert (np.asarray(clf.intercept_) == 0).all() + + clf.C = min_c * 1.01 + clf.fit(X, y) + assert (np.asarray(clf.coef_) != 0).any() or (np.asarray(clf.intercept_) != 0).any() + + +def test_ill_posed_min_c(): + X = [[0, 0], [0, 0]] + y = [0, 1] + with pytest.raises(ValueError): + l1_min_c(X, y) + + +_MAX_UNSIGNED_INT = 4294967295 + + +def test_newrand_default(): + """Test that bounded_rand_int_wrap without seeding respects the range + + Note this test should pass either if executed alone, or in conjunctions + with other tests that call set_seed explicit in any order: it checks + invariants on the RNG instead of specific values. + """ + generated = [bounded_rand_int_wrap(100) for _ in range(10)] + assert all(0 <= x < 100 for x in generated) + assert not all(x == generated[0] for x in generated) + + +@pytest.mark.parametrize("seed, expected", [(0, 54), (_MAX_UNSIGNED_INT, 9)]) +def test_newrand_set_seed(seed, expected): + """Test that `set_seed` produces deterministic results""" + set_seed_wrap(seed) + generated = bounded_rand_int_wrap(100) + assert generated == expected + + +@pytest.mark.parametrize("seed", [-1, _MAX_UNSIGNED_INT + 1]) +def test_newrand_set_seed_overflow(seed): + """Test that `set_seed_wrap` is defined for unsigned 32bits ints""" + with pytest.raises(OverflowError): + set_seed_wrap(seed) + + +@pytest.mark.parametrize("range_, n_pts", [(_MAX_UNSIGNED_INT, 10000), (100, 25)]) +def test_newrand_bounded_rand_int(range_, n_pts): + """Test that `bounded_rand_int` follows a uniform distribution""" + # XXX: this test is very seed sensitive: either it is wrong (too strict?) + # or the wrapped RNG is not uniform enough, at least on some platforms. + set_seed_wrap(42) + n_iter = 100 + ks_pvals = [] + uniform_dist = stats.uniform(loc=0, scale=range_) + # perform multiple samplings to make chance of outlier sampling negligible + for _ in range(n_iter): + # Deterministic random sampling + sample = [bounded_rand_int_wrap(range_) for _ in range(n_pts)] + res = stats.kstest(sample, uniform_dist.cdf) + ks_pvals.append(res.pvalue) + # Null hypothesis = samples come from an uniform distribution. + # Under the null hypothesis, p-values should be uniformly distributed + # and not concentrated on low values + # (this may seem counter-intuitive but is backed by multiple refs) + # So we can do two checks: + + # (1) check uniformity of p-values + uniform_p_vals_dist = stats.uniform(loc=0, scale=1) + res_pvals = stats.kstest(ks_pvals, uniform_p_vals_dist.cdf) + assert res_pvals.pvalue > 0.05, ( + "Null hypothesis rejected: generated random numbers are not uniform." + " Details: the (meta) p-value of the test of uniform distribution" + f" of p-values is {res_pvals.pvalue} which is not > 0.05" + ) + + # (2) (safety belt) check that 90% of p-values are above 0.05 + min_10pct_pval = np.percentile(ks_pvals, q=10) + # lower 10th quantile pvalue <= 0.05 means that the test rejects the + # null hypothesis that the sample came from the uniform distribution + assert min_10pct_pval > 0.05, ( + "Null hypothesis rejected: generated random numbers are not uniform. 
" + f"Details: lower 10th quantile p-value of {min_10pct_pval} not > 0.05." + ) + + +@pytest.mark.parametrize("range_", [-1, _MAX_UNSIGNED_INT + 1]) +def test_newrand_bounded_rand_int_limits(range_): + """Test that `bounded_rand_int_wrap` is defined for unsigned 32bits ints""" + with pytest.raises(OverflowError): + bounded_rand_int_wrap(range_) diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/tests/test_sparse.py b/.venv/lib/python3.12/site-packages/sklearn/svm/tests/test_sparse.py new file mode 100644 index 0000000000000000000000000000000000000000..4e22c86a66cd8b5625f100990e441675c7f62e34 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/tests/test_sparse.py @@ -0,0 +1,496 @@ +import numpy as np +import pytest +from scipy import sparse + +from sklearn import base, datasets, linear_model, svm +from sklearn.datasets import load_digits, make_blobs, make_classification +from sklearn.exceptions import ConvergenceWarning +from sklearn.svm.tests import test_svm +from sklearn.utils._testing import ( + assert_allclose, + assert_array_almost_equal, + assert_array_equal, + ignore_warnings, + skip_if_32bit, +) +from sklearn.utils.extmath import safe_sparse_dot +from sklearn.utils.fixes import ( + CSR_CONTAINERS, + DOK_CONTAINERS, + LIL_CONTAINERS, +) + +# test sample 1 +X = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]) +Y = [1, 1, 1, 2, 2, 2] +T = np.array([[-1, -1], [2, 2], [3, 2]]) +true_result = [1, 2, 2] + +# test sample 2 +X2 = np.array( + [ + [0, 0, 0], + [1, 1, 1], + [2, 0, 0], + [0, 0, 2], + [3, 3, 3], + ] +) +Y2 = [1, 2, 2, 2, 3] +T2 = np.array([[-1, -1, -1], [1, 1, 1], [2, 2, 2]]) +true_result2 = [1, 2, 3] + +iris = datasets.load_iris() +rng = np.random.RandomState(0) +perm = rng.permutation(iris.target.size) +iris.data = iris.data[perm] +iris.target = iris.target[perm] + +X_blobs, y_blobs = make_blobs(n_samples=100, centers=10, random_state=0) + + +def check_svm_model_equal(dense_svm, X_train, y_train, X_test): + # Use the original svm model for dense fit and clone an exactly same + # svm model for sparse fit + sparse_svm = base.clone(dense_svm) + + dense_svm.fit(X_train.toarray(), y_train) + if sparse.issparse(X_test): + X_test_dense = X_test.toarray() + else: + X_test_dense = X_test + sparse_svm.fit(X_train, y_train) + assert sparse.issparse(sparse_svm.support_vectors_) + assert sparse.issparse(sparse_svm.dual_coef_) + assert_allclose(dense_svm.support_vectors_, sparse_svm.support_vectors_.toarray()) + assert_allclose(dense_svm.dual_coef_, sparse_svm.dual_coef_.toarray()) + if dense_svm.kernel == "linear": + assert sparse.issparse(sparse_svm.coef_) + assert_array_almost_equal(dense_svm.coef_, sparse_svm.coef_.toarray()) + assert_allclose(dense_svm.support_, sparse_svm.support_) + assert_allclose(dense_svm.predict(X_test_dense), sparse_svm.predict(X_test)) + + assert_array_almost_equal( + dense_svm.decision_function(X_test_dense), sparse_svm.decision_function(X_test) + ) + assert_array_almost_equal( + dense_svm.decision_function(X_test_dense), + sparse_svm.decision_function(X_test_dense), + ) + if isinstance(dense_svm, svm.OneClassSVM): + msg = "cannot use sparse input in 'OneClassSVM' trained on dense data" + else: + assert_array_almost_equal( + dense_svm.predict_proba(X_test_dense), + sparse_svm.predict_proba(X_test), + decimal=4, + ) + msg = "cannot use sparse input in 'SVC' trained on dense data" + if sparse.issparse(X_test): + with pytest.raises(ValueError, match=msg): + dense_svm.predict(X_test) + + +@skip_if_32bit 
+@pytest.mark.parametrize( + "X_train, y_train, X_test", + [ + [X, Y, T], + [X2, Y2, T2], + [X_blobs[:80], y_blobs[:80], X_blobs[80:]], + [iris.data, iris.target, iris.data], + ], +) +@pytest.mark.parametrize("kernel", ["linear", "poly", "rbf", "sigmoid"]) +@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + LIL_CONTAINERS) +def test_svc(X_train, y_train, X_test, kernel, sparse_container): + """Check that sparse SVC gives the same result as SVC.""" + X_train = sparse_container(X_train) + + clf = svm.SVC( + gamma=1, + kernel=kernel, + probability=True, + random_state=0, + decision_function_shape="ovo", + ) + check_svm_model_equal(clf, X_train, y_train, X_test) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_unsorted_indices(csr_container): + # test that the result with sorted and unsorted indices in csr is the same + # we use a subset of digits as iris, blobs or make_classification didn't + # show the problem + X, y = load_digits(return_X_y=True) + X_test = csr_container(X[50:100]) + X, y = X[:50], y[:50] + tols = dict(rtol=1e-12, atol=1e-14) + + X_sparse = csr_container(X) + coef_dense = ( + svm.SVC(kernel="linear", probability=True, random_state=0).fit(X, y).coef_ + ) + sparse_svc = svm.SVC(kernel="linear", probability=True, random_state=0).fit( + X_sparse, y + ) + coef_sorted = sparse_svc.coef_ + # make sure dense and sparse SVM give the same result + assert_allclose(coef_dense, coef_sorted.toarray(), **tols) + + # reverse each row's indices + def scramble_indices(X): + new_data = [] + new_indices = [] + for i in range(1, len(X.indptr)): + row_slice = slice(*X.indptr[i - 1 : i + 1]) + new_data.extend(X.data[row_slice][::-1]) + new_indices.extend(X.indices[row_slice][::-1]) + return csr_container((new_data, new_indices, X.indptr), shape=X.shape) + + X_sparse_unsorted = scramble_indices(X_sparse) + X_test_unsorted = scramble_indices(X_test) + + assert not X_sparse_unsorted.has_sorted_indices + assert not X_test_unsorted.has_sorted_indices + + unsorted_svc = svm.SVC(kernel="linear", probability=True, random_state=0).fit( + X_sparse_unsorted, y + ) + coef_unsorted = unsorted_svc.coef_ + # make sure unsorted indices give same result + assert_allclose(coef_unsorted.toarray(), coef_sorted.toarray(), **tols) + assert_allclose( + sparse_svc.predict_proba(X_test_unsorted), + sparse_svc.predict_proba(X_test), + **tols, + ) + + +@pytest.mark.parametrize("lil_container", LIL_CONTAINERS) +def test_svc_with_custom_kernel(lil_container): + def kfunc(x, y): + return safe_sparse_dot(x, y.T) + + X_sp = lil_container(X) + clf_lin = svm.SVC(kernel="linear").fit(X_sp, Y) + clf_mylin = svm.SVC(kernel=kfunc).fit(X_sp, Y) + assert_array_equal(clf_lin.predict(X_sp), clf_mylin.predict(X_sp)) + + +@skip_if_32bit +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +@pytest.mark.parametrize("kernel", ["linear", "poly", "rbf"]) +def test_svc_iris(csr_container, kernel): + # Test the sparse SVC with the iris dataset + iris_data_sp = csr_container(iris.data) + + sp_clf = svm.SVC(kernel=kernel).fit(iris_data_sp, iris.target) + clf = svm.SVC(kernel=kernel).fit(iris.data, iris.target) + + assert_allclose(clf.support_vectors_, sp_clf.support_vectors_.toarray()) + assert_allclose(clf.dual_coef_, sp_clf.dual_coef_.toarray()) + assert_allclose(clf.predict(iris.data), sp_clf.predict(iris_data_sp)) + if kernel == "linear": + assert_allclose(clf.coef_, sp_clf.coef_.toarray()) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sparse_decision_function(csr_container): 
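+    # For a linear kernel the (ovo) decision function is an affine map of the
+    # input, so it can be recomputed directly from the fitted attributes as
+    # safe_sparse_dot(X, coef_.T) + intercept_, which is what this test checks
+    # against the libsvm implementation.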
+ # Test decision_function + + # Sanity check, test that decision_function implemented in python + # returns the same as the one in libsvm + + # multi class: + iris_data_sp = csr_container(iris.data) + svc = svm.SVC(kernel="linear", C=0.1, decision_function_shape="ovo") + clf = svc.fit(iris_data_sp, iris.target) + + dec = safe_sparse_dot(iris_data_sp, clf.coef_.T) + clf.intercept_ + + assert_allclose(dec, clf.decision_function(iris_data_sp)) + + # binary: + clf.fit(X, Y) + dec = np.dot(X, clf.coef_.T) + clf.intercept_ + prediction = clf.predict(X) + assert_allclose(dec.ravel(), clf.decision_function(X)) + assert_allclose( + prediction, clf.classes_[(clf.decision_function(X) > 0).astype(int).ravel()] + ) + expected = np.array([-1.0, -0.66, -1.0, 0.66, 1.0, 1.0]) + assert_array_almost_equal(clf.decision_function(X), expected, decimal=2) + + +@pytest.mark.parametrize("lil_container", LIL_CONTAINERS) +def test_error(lil_container): + # Test that it gives proper exception on deficient input + clf = svm.SVC() + X_sp = lil_container(X) + + Y2 = Y[:-1] # wrong dimensions for labels + with pytest.raises(ValueError): + clf.fit(X_sp, Y2) + + clf.fit(X_sp, Y) + assert_array_equal(clf.predict(T), true_result) + + +@pytest.mark.parametrize( + "lil_container, dok_container", zip(LIL_CONTAINERS, DOK_CONTAINERS) +) +def test_linearsvc(lil_container, dok_container): + # Similar to test_SVC + X_sp = lil_container(X) + X2_sp = dok_container(X2) + + clf = svm.LinearSVC(random_state=0).fit(X, Y) + sp_clf = svm.LinearSVC(random_state=0).fit(X_sp, Y) + + assert sp_clf.fit_intercept + + assert_array_almost_equal(clf.coef_, sp_clf.coef_, decimal=4) + assert_array_almost_equal(clf.intercept_, sp_clf.intercept_, decimal=4) + + assert_allclose(clf.predict(X), sp_clf.predict(X_sp)) + + clf.fit(X2, Y2) + sp_clf.fit(X2_sp, Y2) + + assert_array_almost_equal(clf.coef_, sp_clf.coef_, decimal=4) + assert_array_almost_equal(clf.intercept_, sp_clf.intercept_, decimal=4) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_linearsvc_iris(csr_container): + # Test the sparse LinearSVC with the iris dataset + iris_data_sp = csr_container(iris.data) + + sp_clf = svm.LinearSVC(random_state=0).fit(iris_data_sp, iris.target) + clf = svm.LinearSVC(random_state=0).fit(iris.data, iris.target) + + assert clf.fit_intercept == sp_clf.fit_intercept + + assert_array_almost_equal(clf.coef_, sp_clf.coef_, decimal=1) + assert_array_almost_equal(clf.intercept_, sp_clf.intercept_, decimal=1) + assert_allclose(clf.predict(iris.data), sp_clf.predict(iris_data_sp)) + + # check decision_function + pred = np.argmax(sp_clf.decision_function(iris_data_sp), axis=1) + assert_allclose(pred, clf.predict(iris.data)) + + # sparsify the coefficients on both models and check that they still + # produce the same results + clf.sparsify() + assert_array_equal(pred, clf.predict(iris_data_sp)) + sp_clf.sparsify() + assert_array_equal(pred, sp_clf.predict(iris_data_sp)) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_weight(csr_container): + # Test class weights + X_, y_ = make_classification( + n_samples=200, n_features=100, weights=[0.833, 0.167], random_state=0 + ) + + X_ = csr_container(X_) + for clf in ( + linear_model.LogisticRegression(), + svm.LinearSVC(random_state=0), + svm.SVC(), + ): + clf.set_params(class_weight={0: 5}) + clf.fit(X_[:180], y_[:180]) + y_pred = clf.predict(X_[180:]) + assert np.sum(y_pred == y_[180:]) >= 11 + + +@pytest.mark.parametrize("lil_container", LIL_CONTAINERS) +def 
test_sample_weights(lil_container): + # Test weights on individual samples + X_sp = lil_container(X) + + clf = svm.SVC() + clf.fit(X_sp, Y) + assert_array_equal(clf.predict([X[2]]), [1.0]) + + sample_weight = [0.1] * 3 + [10] * 3 + clf.fit(X_sp, Y, sample_weight=sample_weight) + assert_array_equal(clf.predict([X[2]]), [2.0]) + + +def test_sparse_liblinear_intercept_handling(): + # Test that sparse liblinear honours intercept_scaling param + test_svm.test_dense_liblinear_intercept_handling(svm.LinearSVC) + + +@pytest.mark.parametrize( + "X_train, y_train, X_test", + [ + [X, None, T], + [X2, None, T2], + [X_blobs[:80], None, X_blobs[80:]], + [iris.data, None, iris.data], + ], +) +@pytest.mark.parametrize("kernel", ["linear", "poly", "rbf", "sigmoid"]) +@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + LIL_CONTAINERS) +@skip_if_32bit +def test_sparse_oneclasssvm(X_train, y_train, X_test, kernel, sparse_container): + # Check that sparse OneClassSVM gives the same result as dense OneClassSVM + X_train = sparse_container(X_train) + + clf = svm.OneClassSVM(gamma=1, kernel=kernel) + check_svm_model_equal(clf, X_train, y_train, X_test) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sparse_realdata(csr_container): + # Test on a subset from the 20newsgroups dataset. + # This catches some bugs if input is not correctly converted into + # sparse format or weights are not correctly initialized. + data = np.array([0.03771744, 0.1003567, 0.01174647, 0.027069]) + + # SVC does not support large sparse, so we specify int32 indices + # In this case, `csr_matrix` automatically uses int32 regardless of the dtypes of + # `indices` and `indptr` but `csr_array` may or may not use the same dtype as + # `indices` and `indptr`, which would be int64 if not specified + indices = np.array([6, 5, 35, 31], dtype=np.int32) + indptr = np.array([0] * 8 + [1] * 32 + [2] * 38 + [4] * 3, dtype=np.int32) + + X = csr_container((data, indices, indptr)) + y = np.array( + [ + 1.0, + 0.0, + 2.0, + 2.0, + 1.0, + 1.0, + 1.0, + 2.0, + 2.0, + 0.0, + 1.0, + 2.0, + 2.0, + 0.0, + 2.0, + 0.0, + 3.0, + 0.0, + 3.0, + 0.0, + 1.0, + 1.0, + 3.0, + 2.0, + 3.0, + 2.0, + 0.0, + 3.0, + 1.0, + 0.0, + 2.0, + 1.0, + 2.0, + 0.0, + 1.0, + 0.0, + 2.0, + 3.0, + 1.0, + 3.0, + 0.0, + 1.0, + 0.0, + 0.0, + 2.0, + 0.0, + 1.0, + 2.0, + 2.0, + 2.0, + 3.0, + 2.0, + 0.0, + 3.0, + 2.0, + 1.0, + 2.0, + 3.0, + 2.0, + 2.0, + 0.0, + 1.0, + 0.0, + 1.0, + 2.0, + 3.0, + 0.0, + 0.0, + 2.0, + 2.0, + 1.0, + 3.0, + 1.0, + 1.0, + 0.0, + 1.0, + 2.0, + 1.0, + 1.0, + 3.0, + ] + ) + + clf = svm.SVC(kernel="linear").fit(X.toarray(), y) + sp_clf = svm.SVC(kernel="linear").fit(X.tocoo(), y) + + assert_array_equal(clf.support_vectors_, sp_clf.support_vectors_.toarray()) + assert_array_equal(clf.dual_coef_, sp_clf.dual_coef_.toarray()) + + +@pytest.mark.parametrize("lil_container", LIL_CONTAINERS) +def test_sparse_svc_clone_with_callable_kernel(lil_container): + # Test that the "dense_fit" is called even though we use sparse input + # meaning that everything works fine. 
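+    # With a callable kernel the sparse input is expected to be handled by the
+    # dense code path internally, so cloning and refitting on a sparse matrix
+    # should match the dense estimator fitted below.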
+ a = svm.SVC(C=1, kernel=lambda x, y: x @ y.T, probability=True, random_state=0) + b = base.clone(a) + + X_sp = lil_container(X) + b.fit(X_sp, Y) + pred = b.predict(X_sp) + b.predict_proba(X_sp) + + dense_svm = svm.SVC( + C=1, kernel=lambda x, y: np.dot(x, y.T), probability=True, random_state=0 + ) + pred_dense = dense_svm.fit(X, Y).predict(X) + assert_array_equal(pred_dense, pred) + # b.decision_function(X_sp) # XXX : should be supported + + +@pytest.mark.parametrize("lil_container", LIL_CONTAINERS) +def test_timeout(lil_container): + sp = svm.SVC( + C=1, kernel=lambda x, y: x @ y.T, probability=True, random_state=0, max_iter=1 + ) + warning_msg = ( + r"Solver terminated early \(max_iter=1\). Consider pre-processing " + r"your data with StandardScaler or MinMaxScaler." + ) + with pytest.warns(ConvergenceWarning, match=warning_msg): + sp.fit(lil_container(X), Y) + + +def test_consistent_proba(): + a = svm.SVC(probability=True, max_iter=1, random_state=0) + with ignore_warnings(category=ConvergenceWarning): + proba_1 = a.fit(X, Y).predict_proba(X) + a = svm.SVC(probability=True, max_iter=1, random_state=0) + with ignore_warnings(category=ConvergenceWarning): + proba_2 = a.fit(X, Y).predict_proba(X) + assert_allclose(proba_1, proba_2) diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/tests/test_svm.py b/.venv/lib/python3.12/site-packages/sklearn/svm/tests/test_svm.py new file mode 100644 index 0000000000000000000000000000000000000000..62396451e736d02fffce21dd1f7219eba2614199 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/tests/test_svm.py @@ -0,0 +1,1440 @@ +""" +Testing for Support Vector Machine module (sklearn.svm) + +TODO: remove hard coded numerical results when possible +""" + +import numpy as np +import pytest +from numpy.testing import ( + assert_allclose, + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) + +from sklearn import base, datasets, linear_model, metrics, svm +from sklearn.datasets import make_blobs, make_classification, make_regression +from sklearn.exceptions import ( + ConvergenceWarning, + NotFittedError, +) +from sklearn.metrics import f1_score +from sklearn.metrics.pairwise import rbf_kernel +from sklearn.model_selection import train_test_split +from sklearn.multiclass import OneVsRestClassifier + +# mypy error: Module 'sklearn.svm' has no attribute '_libsvm' +from sklearn.svm import ( # type: ignore[attr-defined] + SVR, + LinearSVC, + LinearSVR, + NuSVR, + OneClassSVM, + _libsvm, +) +from sklearn.svm._classes import _validate_dual_parameter +from sklearn.utils import check_random_state, shuffle +from sklearn.utils.fixes import _IS_32BIT, CSR_CONTAINERS, LIL_CONTAINERS +from sklearn.utils.validation import _num_samples + +# toy sample +X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] +Y = [1, 1, 1, 2, 2, 2] +T = [[-1, -1], [2, 2], [3, 2]] +true_result = [1, 2, 2] + +# also load the iris dataset +iris = datasets.load_iris() +rng = check_random_state(42) +perm = rng.permutation(iris.target.size) +iris.data = iris.data[perm] +iris.target = iris.target[perm] + + +def test_libsvm_parameters(): + # Test parameters on classes that make use of libsvm. + clf = svm.SVC(kernel="linear").fit(X, Y) + assert_array_equal(clf.dual_coef_, [[-0.25, 0.25]]) + assert_array_equal(clf.support_, [1, 3]) + assert_array_equal(clf.support_vectors_, (X[1], X[3])) + assert_array_equal(clf.intercept_, [0.0]) + assert_array_equal(clf.predict(X), Y) + + +def test_libsvm_iris(): + # Check consistency on dataset iris. 
+ + # shuffle the dataset so that labels are not ordered + for k in ("linear", "rbf"): + clf = svm.SVC(kernel=k).fit(iris.data, iris.target) + assert np.mean(clf.predict(iris.data) == iris.target) > 0.9 + assert hasattr(clf, "coef_") == (k == "linear") + + assert_array_equal(clf.classes_, np.sort(clf.classes_)) + + # check also the low-level API + # We unpack the values to create a dictionary with some of the return values + # from Libsvm's fit. + ( + libsvm_support, + libsvm_support_vectors, + libsvm_n_class_SV, + libsvm_sv_coef, + libsvm_intercept, + libsvm_probA, + libsvm_probB, + # libsvm_fit_status and libsvm_n_iter won't be used below. + libsvm_fit_status, + libsvm_n_iter, + ) = _libsvm.fit(iris.data, iris.target.astype(np.float64)) + + model_params = { + "support": libsvm_support, + "SV": libsvm_support_vectors, + "nSV": libsvm_n_class_SV, + "sv_coef": libsvm_sv_coef, + "intercept": libsvm_intercept, + "probA": libsvm_probA, + "probB": libsvm_probB, + } + pred = _libsvm.predict(iris.data, **model_params) + assert np.mean(pred == iris.target) > 0.95 + + # We unpack the values to create a dictionary with some of the return values + # from Libsvm's fit. + ( + libsvm_support, + libsvm_support_vectors, + libsvm_n_class_SV, + libsvm_sv_coef, + libsvm_intercept, + libsvm_probA, + libsvm_probB, + # libsvm_fit_status and libsvm_n_iter won't be used below. + libsvm_fit_status, + libsvm_n_iter, + ) = _libsvm.fit(iris.data, iris.target.astype(np.float64), kernel="linear") + + model_params = { + "support": libsvm_support, + "SV": libsvm_support_vectors, + "nSV": libsvm_n_class_SV, + "sv_coef": libsvm_sv_coef, + "intercept": libsvm_intercept, + "probA": libsvm_probA, + "probB": libsvm_probB, + } + pred = _libsvm.predict(iris.data, **model_params, kernel="linear") + assert np.mean(pred == iris.target) > 0.95 + + pred = _libsvm.cross_validation( + iris.data, iris.target.astype(np.float64), 5, kernel="linear", random_seed=0 + ) + assert np.mean(pred == iris.target) > 0.95 + + # If random_seed >= 0, the libsvm rng is seeded (by calling `srand`), hence + # we should get deterministic results (assuming that there is no other + # thread calling this wrapper calling `srand` concurrently). + pred2 = _libsvm.cross_validation( + iris.data, iris.target.astype(np.float64), 5, kernel="linear", random_seed=0 + ) + assert_array_equal(pred, pred2) + + +def test_precomputed(): + # SVC with a precomputed kernel. + # We test it with a toy dataset and with iris. + clf = svm.SVC(kernel="precomputed") + # Gram matrix for train data (square matrix) + # (we use just a linear kernel) + K = np.dot(X, np.array(X).T) + clf.fit(K, Y) + # Gram matrix for test data (rectangular matrix) + KT = np.dot(T, np.array(X).T) + pred = clf.predict(KT) + with pytest.raises(ValueError): + clf.predict(KT.T) + + assert_array_equal(clf.dual_coef_, [[-0.25, 0.25]]) + assert_array_equal(clf.support_, [1, 3]) + assert_array_equal(clf.intercept_, [0]) + assert_array_almost_equal(clf.support_, [1, 3]) + assert_array_equal(pred, true_result) + + # Gram matrix for test data but compute KT[i,j] + # for support vectors j only. + KT = np.zeros_like(KT) + for i in range(len(T)): + for j in clf.support_: + KT[i, j] = np.dot(T[i], X[j]) + + pred = clf.predict(KT) + assert_array_equal(pred, true_result) + + # same as before, but using a callable function instead of the kernel + # matrix. 
kernel is just a linear kernel + + def kfunc(x, y): + return np.dot(x, y.T) + + clf = svm.SVC(kernel=kfunc) + clf.fit(np.array(X), Y) + pred = clf.predict(T) + + assert_array_equal(clf.dual_coef_, [[-0.25, 0.25]]) + assert_array_equal(clf.intercept_, [0]) + assert_array_almost_equal(clf.support_, [1, 3]) + assert_array_equal(pred, true_result) + + # test a precomputed kernel with the iris dataset + # and check parameters against a linear SVC + clf = svm.SVC(kernel="precomputed") + clf2 = svm.SVC(kernel="linear") + K = np.dot(iris.data, iris.data.T) + clf.fit(K, iris.target) + clf2.fit(iris.data, iris.target) + pred = clf.predict(K) + assert_array_almost_equal(clf.support_, clf2.support_) + assert_array_almost_equal(clf.dual_coef_, clf2.dual_coef_) + assert_array_almost_equal(clf.intercept_, clf2.intercept_) + assert_almost_equal(np.mean(pred == iris.target), 0.99, decimal=2) + + # Gram matrix for test data but compute KT[i,j] + # for support vectors j only. + K = np.zeros_like(K) + for i in range(len(iris.data)): + for j in clf.support_: + K[i, j] = np.dot(iris.data[i], iris.data[j]) + + pred = clf.predict(K) + assert_almost_equal(np.mean(pred == iris.target), 0.99, decimal=2) + + clf = svm.SVC(kernel=kfunc) + clf.fit(iris.data, iris.target) + assert_almost_equal(np.mean(pred == iris.target), 0.99, decimal=2) + + +def test_svr(): + # Test Support Vector Regression + + diabetes = datasets.load_diabetes() + for clf in ( + svm.NuSVR(kernel="linear", nu=0.4, C=1.0), + svm.NuSVR(kernel="linear", nu=0.4, C=10.0), + svm.SVR(kernel="linear", C=10.0), + svm.LinearSVR(C=10.0), + svm.LinearSVR(C=10.0), + ): + clf.fit(diabetes.data, diabetes.target) + assert clf.score(diabetes.data, diabetes.target) > 0.02 + + # non-regression test; previously, BaseLibSVM would check that + # len(np.unique(y)) < 2, which must only be done for SVC + svm.SVR().fit(diabetes.data, np.ones(len(diabetes.data))) + svm.LinearSVR().fit(diabetes.data, np.ones(len(diabetes.data))) + + +def test_linearsvr(): + # check that SVR(kernel='linear') and LinearSVC() give + # comparable results + diabetes = datasets.load_diabetes() + lsvr = svm.LinearSVR(C=1e3).fit(diabetes.data, diabetes.target) + score1 = lsvr.score(diabetes.data, diabetes.target) + + svr = svm.SVR(kernel="linear", C=1e3).fit(diabetes.data, diabetes.target) + score2 = svr.score(diabetes.data, diabetes.target) + + assert_allclose(np.linalg.norm(lsvr.coef_), np.linalg.norm(svr.coef_), 1, 0.0001) + assert_almost_equal(score1, score2, 2) + + +def test_linearsvr_fit_sampleweight(): + # check correct result when sample_weight is 1 + # check that SVR(kernel='linear') and LinearSVC() give + # comparable results + diabetes = datasets.load_diabetes() + n_samples = len(diabetes.target) + unit_weight = np.ones(n_samples) + lsvr = svm.LinearSVR(C=1e3, tol=1e-12, max_iter=10000).fit( + diabetes.data, diabetes.target, sample_weight=unit_weight + ) + score1 = lsvr.score(diabetes.data, diabetes.target) + + lsvr_no_weight = svm.LinearSVR(C=1e3, tol=1e-12, max_iter=10000).fit( + diabetes.data, diabetes.target + ) + score2 = lsvr_no_weight.score(diabetes.data, diabetes.target) + + assert_allclose( + np.linalg.norm(lsvr.coef_), np.linalg.norm(lsvr_no_weight.coef_), 1, 0.0001 + ) + assert_almost_equal(score1, score2, 2) + + # check that fit(X) = fit([X1, X2, X3], sample_weight = [n1, n2, n3]) where + # X = X1 repeated n1 times, X2 repeated n2 times and so forth + random_state = check_random_state(0) + random_weight = random_state.randint(0, 10, n_samples) + lsvr_unflat = 
svm.LinearSVR(C=1e3, tol=1e-12, max_iter=10000).fit( + diabetes.data, diabetes.target, sample_weight=random_weight + ) + score3 = lsvr_unflat.score( + diabetes.data, diabetes.target, sample_weight=random_weight + ) + + X_flat = np.repeat(diabetes.data, random_weight, axis=0) + y_flat = np.repeat(diabetes.target, random_weight, axis=0) + lsvr_flat = svm.LinearSVR(C=1e3, tol=1e-12, max_iter=10000).fit(X_flat, y_flat) + score4 = lsvr_flat.score(X_flat, y_flat) + + assert_almost_equal(score3, score4, 2) + + +def test_svr_errors(): + X = [[0.0], [1.0]] + y = [0.0, 0.5] + + # Bad kernel + clf = svm.SVR(kernel=lambda x, y: np.array([[1.0]])) + clf.fit(X, y) + with pytest.raises(ValueError): + clf.predict(X) + + +def test_oneclass(): + # Test OneClassSVM + clf = svm.OneClassSVM() + clf.fit(X) + pred = clf.predict(T) + + assert_array_equal(pred, [1, -1, -1]) + assert pred.dtype == np.dtype("intp") + assert_array_almost_equal(clf.intercept_, [-1.218], decimal=3) + assert_array_almost_equal(clf.dual_coef_, [[0.750, 0.750, 0.750, 0.750]], decimal=3) + with pytest.raises(AttributeError): + (lambda: clf.coef_)() + + +def test_oneclass_decision_function(): + # Test OneClassSVM decision function + clf = svm.OneClassSVM() + rnd = check_random_state(2) + + # Generate train data + X = 0.3 * rnd.randn(100, 2) + X_train = np.r_[X + 2, X - 2] + + # Generate some regular novel observations + X = 0.3 * rnd.randn(20, 2) + X_test = np.r_[X + 2, X - 2] + # Generate some abnormal novel observations + X_outliers = rnd.uniform(low=-4, high=4, size=(20, 2)) + + # fit the model + clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1) + clf.fit(X_train) + + # predict things + y_pred_test = clf.predict(X_test) + assert np.mean(y_pred_test == 1) > 0.9 + y_pred_outliers = clf.predict(X_outliers) + assert np.mean(y_pred_outliers == -1) > 0.9 + dec_func_test = clf.decision_function(X_test) + assert_array_equal((dec_func_test > 0).ravel(), y_pred_test == 1) + dec_func_outliers = clf.decision_function(X_outliers) + assert_array_equal((dec_func_outliers > 0).ravel(), y_pred_outliers == 1) + + +def test_oneclass_score_samples(): + X_train = [[1, 1], [1, 2], [2, 1]] + clf = svm.OneClassSVM(gamma=1).fit(X_train) + assert_array_equal( + clf.score_samples([[2.0, 2.0]]), + clf.decision_function([[2.0, 2.0]]) + clf.offset_, + ) + + +def test_tweak_params(): + # Make sure some tweaking of parameters works. + # We change clf.dual_coef_ at run time and expect .predict() to change + # accordingly. Notice that this is not trivial since it involves a lot + # of C/Python copying in the libsvm bindings. + # The success of this test ensures that the mapping between libsvm and + # the python classifier is complete. + clf = svm.SVC(kernel="linear", C=1.0) + clf.fit(X, Y) + assert_array_equal(clf.dual_coef_, [[-0.25, 0.25]]) + assert_array_equal(clf.predict([[-0.1, -0.1]]), [1]) + clf._dual_coef_ = np.array([[0.0, 1.0]]) + assert_array_equal(clf.predict([[-0.1, -0.1]]), [2]) + + +def test_probability(): + # Predict probabilities using SVC + # This uses cross validation, so we use a slightly bigger testing set. 
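+    # With probability=True, libsvm fits a Platt-style probability calibration
+    # using internal cross-validation, so predict_proba is not guaranteed to
+    # agree exactly with predict; the checks below therefore only require a
+    # high (> 0.9) agreement between argmax(predict_proba) and predict.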
+ + for clf in ( + svm.SVC(probability=True, random_state=0, C=1.0), + svm.NuSVC(probability=True, random_state=0), + ): + clf.fit(iris.data, iris.target) + + prob_predict = clf.predict_proba(iris.data) + assert_array_almost_equal(np.sum(prob_predict, 1), np.ones(iris.data.shape[0])) + assert np.mean(np.argmax(prob_predict, 1) == clf.predict(iris.data)) > 0.9 + + assert_almost_equal( + clf.predict_proba(iris.data), np.exp(clf.predict_log_proba(iris.data)), 8 + ) + + +def test_decision_function(): + # Test decision_function + # Sanity check, test that decision_function implemented in python + # returns the same as the one in libsvm + # multi class: + clf = svm.SVC(kernel="linear", C=0.1, decision_function_shape="ovo").fit( + iris.data, iris.target + ) + + dec = np.dot(iris.data, clf.coef_.T) + clf.intercept_ + + assert_array_almost_equal(dec, clf.decision_function(iris.data)) + + # binary: + clf.fit(X, Y) + dec = np.dot(X, clf.coef_.T) + clf.intercept_ + prediction = clf.predict(X) + assert_array_almost_equal(dec.ravel(), clf.decision_function(X)) + assert_array_almost_equal( + prediction, clf.classes_[(clf.decision_function(X) > 0).astype(int)] + ) + expected = np.array([-1.0, -0.66, -1.0, 0.66, 1.0, 1.0]) + assert_array_almost_equal(clf.decision_function(X), expected, 2) + + # kernel binary: + clf = svm.SVC(kernel="rbf", gamma=1, decision_function_shape="ovo") + clf.fit(X, Y) + + rbfs = rbf_kernel(X, clf.support_vectors_, gamma=clf.gamma) + dec = np.dot(rbfs, clf.dual_coef_.T) + clf.intercept_ + assert_array_almost_equal(dec.ravel(), clf.decision_function(X)) + + +@pytest.mark.parametrize("SVM", (svm.SVC, svm.NuSVC)) +def test_decision_function_shape(SVM): + # check that decision_function_shape='ovr' or 'ovo' gives + # correct shape and is consistent with predict + + clf = SVM(kernel="linear", decision_function_shape="ovr").fit( + iris.data, iris.target + ) + dec = clf.decision_function(iris.data) + assert dec.shape == (len(iris.data), 3) + assert_array_equal(clf.predict(iris.data), np.argmax(dec, axis=1)) + + # with five classes: + X, y = make_blobs(n_samples=80, centers=5, random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + + clf = SVM(kernel="linear", decision_function_shape="ovr").fit(X_train, y_train) + dec = clf.decision_function(X_test) + assert dec.shape == (len(X_test), 5) + assert_array_equal(clf.predict(X_test), np.argmax(dec, axis=1)) + + # check shape of ovo_decition_function=True + clf = SVM(kernel="linear", decision_function_shape="ovo").fit(X_train, y_train) + dec = clf.decision_function(X_train) + assert dec.shape == (len(X_train), 10) + + +def test_svr_predict(): + # Test SVR's decision_function + # Sanity check, test that predict implemented in python + # returns the same as the one in libsvm + + X = iris.data + y = iris.target + + # linear kernel + reg = svm.SVR(kernel="linear", C=0.1).fit(X, y) + + dec = np.dot(X, reg.coef_.T) + reg.intercept_ + assert_array_almost_equal(dec.ravel(), reg.predict(X).ravel()) + + # rbf kernel + reg = svm.SVR(kernel="rbf", gamma=1).fit(X, y) + + rbfs = rbf_kernel(X, reg.support_vectors_, gamma=reg.gamma) + dec = np.dot(rbfs, reg.dual_coef_.T) + reg.intercept_ + assert_array_almost_equal(dec.ravel(), reg.predict(X).ravel()) + + +def test_weight(): + # Test class weights + clf = svm.SVC(class_weight={1: 0.1}) + # we give a small weights to class 1 + clf.fit(X, Y) + # so all predicted values belong to class 2 + assert_array_almost_equal(clf.predict(X), [2] * 6) + + X_, y_ = make_classification( + 
n_samples=200, n_features=10, weights=[0.833, 0.167], random_state=2 + ) + + for clf in ( + linear_model.LogisticRegression(), + svm.LinearSVC(random_state=0), + svm.SVC(), + ): + clf.set_params(class_weight={0: 0.1, 1: 10}) + clf.fit(X_[:100], y_[:100]) + y_pred = clf.predict(X_[100:]) + assert f1_score(y_[100:], y_pred) > 0.3 + + +@pytest.mark.parametrize("estimator", [svm.SVC(C=1e-2), svm.NuSVC()]) +def test_svm_classifier_sided_sample_weight(estimator): + # fit a linear SVM and check that giving more weight to opposed samples + # in the space will flip the decision toward these samples. + X = [[-2, 0], [-1, -1], [0, -2], [0, 2], [1, 1], [2, 0]] + estimator.set_params(kernel="linear") + + # check that with unit weights, a sample is supposed to be predicted on + # the boundary + sample_weight = [1] * 6 + estimator.fit(X, Y, sample_weight=sample_weight) + y_pred = estimator.decision_function([[-1.0, 1.0]]) + assert y_pred == pytest.approx(0) + + # give more weights to opposed samples + sample_weight = [10.0, 0.1, 0.1, 0.1, 0.1, 10] + estimator.fit(X, Y, sample_weight=sample_weight) + y_pred = estimator.decision_function([[-1.0, 1.0]]) + assert y_pred < 0 + + sample_weight = [1.0, 0.1, 10.0, 10.0, 0.1, 0.1] + estimator.fit(X, Y, sample_weight=sample_weight) + y_pred = estimator.decision_function([[-1.0, 1.0]]) + assert y_pred > 0 + + +@pytest.mark.parametrize("estimator", [svm.SVR(C=1e-2), svm.NuSVR(C=1e-2)]) +def test_svm_regressor_sided_sample_weight(estimator): + # similar test to test_svm_classifier_sided_sample_weight but for + # SVM regressors + X = [[-2, 0], [-1, -1], [0, -2], [0, 2], [1, 1], [2, 0]] + estimator.set_params(kernel="linear") + + # check that with unit weights, a sample is supposed to be predicted on + # the boundary + sample_weight = [1] * 6 + estimator.fit(X, Y, sample_weight=sample_weight) + y_pred = estimator.predict([[-1.0, 1.0]]) + assert y_pred == pytest.approx(1.5) + + # give more weights to opposed samples + sample_weight = [10.0, 0.1, 0.1, 0.1, 0.1, 10] + estimator.fit(X, Y, sample_weight=sample_weight) + y_pred = estimator.predict([[-1.0, 1.0]]) + assert y_pred < 1.5 + + sample_weight = [1.0, 0.1, 10.0, 10.0, 0.1, 0.1] + estimator.fit(X, Y, sample_weight=sample_weight) + y_pred = estimator.predict([[-1.0, 1.0]]) + assert y_pred > 1.5 + + +def test_svm_equivalence_sample_weight_C(): + # test that rescaling all samples is the same as changing C + clf = svm.SVC() + clf.fit(X, Y) + dual_coef_no_weight = clf.dual_coef_ + clf.set_params(C=100) + clf.fit(X, Y, sample_weight=np.repeat(0.01, len(X))) + assert_allclose(dual_coef_no_weight, clf.dual_coef_) + + +@pytest.mark.parametrize( + "Estimator, err_msg", + [ + (svm.SVC, "Invalid input - all samples have zero or negative weights."), + (svm.NuSVC, "(negative dimensions are not allowed|nu is infeasible)"), + (svm.SVR, "Invalid input - all samples have zero or negative weights."), + (svm.NuSVR, "Invalid input - all samples have zero or negative weights."), + (svm.OneClassSVM, "Invalid input - all samples have zero or negative weights."), + ], + ids=["SVC", "NuSVC", "SVR", "NuSVR", "OneClassSVM"], +) +@pytest.mark.parametrize( + "sample_weight", + [[0] * len(Y), [-0.3] * len(Y)], + ids=["weights-are-zero", "weights-are-negative"], +) +def test_negative_sample_weights_mask_all_samples(Estimator, err_msg, sample_weight): + est = Estimator(kernel="linear") + with pytest.raises(ValueError, match=err_msg): + est.fit(X, Y, sample_weight=sample_weight) + + +@pytest.mark.parametrize( + "Classifier, err_msg", + [ + ( + 
svm.SVC, + ( + "Invalid input - all samples with positive weights belong to the same" + " class" + ), + ), + (svm.NuSVC, "specified nu is infeasible"), + ], + ids=["SVC", "NuSVC"], +) +@pytest.mark.parametrize( + "sample_weight", + [[0, -0.5, 0, 1, 1, 1], [1, 1, 1, 0, -0.1, -0.3]], + ids=["mask-label-1", "mask-label-2"], +) +def test_negative_weights_svc_leave_just_one_label(Classifier, err_msg, sample_weight): + clf = Classifier(kernel="linear") + with pytest.raises(ValueError, match=err_msg): + clf.fit(X, Y, sample_weight=sample_weight) + + +@pytest.mark.parametrize( + "Classifier, model", + [ + (svm.SVC, {"when-left": [0.3998, 0.4], "when-right": [0.4, 0.3999]}), + (svm.NuSVC, {"when-left": [0.3333, 0.3333], "when-right": [0.3333, 0.3333]}), + ], + ids=["SVC", "NuSVC"], +) +@pytest.mark.parametrize( + "sample_weight, mask_side", + [([1, -0.5, 1, 1, 1, 1], "when-left"), ([1, 1, 1, 0, 1, 1], "when-right")], + ids=["partial-mask-label-1", "partial-mask-label-2"], +) +def test_negative_weights_svc_leave_two_labels( + Classifier, model, sample_weight, mask_side +): + clf = Classifier(kernel="linear") + clf.fit(X, Y, sample_weight=sample_weight) + assert_allclose(clf.coef_, [model[mask_side]], rtol=1e-3) + + +@pytest.mark.parametrize( + "Estimator", [svm.SVC, svm.NuSVC, svm.NuSVR], ids=["SVC", "NuSVC", "NuSVR"] +) +@pytest.mark.parametrize( + "sample_weight", + [[1, -0.5, 1, 1, 1, 1], [1, 1, 1, 0, 1, 1]], + ids=["partial-mask-label-1", "partial-mask-label-2"], +) +def test_negative_weight_equal_coeffs(Estimator, sample_weight): + # model generates equal coefficients + est = Estimator(kernel="linear") + est.fit(X, Y, sample_weight=sample_weight) + coef = np.abs(est.coef_).ravel() + assert coef[0] == pytest.approx(coef[1], rel=1e-3) + + +def test_auto_weight(): + # Test class weights for imbalanced data + from sklearn.linear_model import LogisticRegression + + # We take as dataset the two-dimensional projection of iris so + # that it is not separable and remove half of predictors from + # class 1. + # We add one to the targets as a non-regression test: + # class_weight="balanced" + # used to work only when the labels where a range [0..K). + from sklearn.utils import compute_class_weight + + X, y = iris.data[:, :2], iris.target + 1 + unbalanced = np.delete(np.arange(y.size), np.where(y > 2)[0][::2]) + + classes = np.unique(y[unbalanced]) + class_weights = compute_class_weight("balanced", classes=classes, y=y[unbalanced]) + assert np.argmax(class_weights) == 2 + + for clf in ( + svm.SVC(kernel="linear"), + svm.LinearSVC(random_state=0), + LogisticRegression(), + ): + # check that score is better when class='balanced' is set. + y_pred = clf.fit(X[unbalanced], y[unbalanced]).predict(X) + clf.set_params(class_weight="balanced") + y_pred_balanced = clf.fit( + X[unbalanced], + y[unbalanced], + ).predict(X) + assert metrics.f1_score(y, y_pred, average="macro") <= metrics.f1_score( + y, y_pred_balanced, average="macro" + ) + + +@pytest.mark.parametrize("lil_container", LIL_CONTAINERS) +def test_bad_input(lil_container): + # Test dimensions for labels + Y2 = Y[:-1] # wrong dimensions for labels + with pytest.raises(ValueError): + svm.SVC().fit(X, Y2) + + # Test with arrays that are non-contiguous. 
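+    # np.asfortranarray yields a Fortran-ordered (non C-contiguous) X, and the
+    # column sliced out of the tiled array below is neither C- nor F-contiguous;
+    # the estimators are expected to cope with such inputs (copying if needed).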
+ for clf in (svm.SVC(), svm.LinearSVC(random_state=0)): + Xf = np.asfortranarray(X) + assert not Xf.flags["C_CONTIGUOUS"] + yf = np.ascontiguousarray(np.tile(Y, (2, 1)).T) + yf = yf[:, -1] + assert not yf.flags["F_CONTIGUOUS"] + assert not yf.flags["C_CONTIGUOUS"] + clf.fit(Xf, yf) + assert_array_equal(clf.predict(T), true_result) + + # error for precomputed kernelsx + clf = svm.SVC(kernel="precomputed") + with pytest.raises(ValueError): + clf.fit(X, Y) + + # predict with sparse input when trained with dense + clf = svm.SVC().fit(X, Y) + with pytest.raises(ValueError): + clf.predict(lil_container(X)) + + Xt = np.array(X).T + clf.fit(np.dot(X, Xt), Y) + with pytest.raises(ValueError): + clf.predict(X) + + clf = svm.SVC() + clf.fit(X, Y) + with pytest.raises(ValueError): + clf.predict(Xt) + + +def test_svc_nonfinite_params(): + # Check SVC throws ValueError when dealing with non-finite parameter values + rng = np.random.RandomState(0) + n_samples = 10 + fmax = np.finfo(np.float64).max + X = fmax * rng.uniform(size=(n_samples, 2)) + y = rng.randint(0, 2, size=n_samples) + + clf = svm.SVC() + msg = "The dual coefficients or intercepts are not finite" + with pytest.raises(ValueError, match=msg): + clf.fit(X, y) + + +def test_unicode_kernel(): + # Test that a unicode kernel name does not cause a TypeError + clf = svm.SVC(kernel="linear", probability=True) + clf.fit(X, Y) + clf.predict_proba(T) + _libsvm.cross_validation( + iris.data, iris.target.astype(np.float64), 5, kernel="linear", random_seed=0 + ) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sparse_precomputed(csr_container): + clf = svm.SVC(kernel="precomputed") + sparse_gram = csr_container([[1, 0], [0, 1]]) + with pytest.raises(TypeError, match="Sparse precomputed"): + clf.fit(sparse_gram, [0, 1]) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sparse_fit_support_vectors_empty(csr_container): + # Regression test for #14893 + X_train = csr_container([[0, 1, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0], [0, 0, 0, 1]]) + y_train = np.array([0.04, 0.04, 0.10, 0.16]) + model = svm.SVR(kernel="linear") + model.fit(X_train, y_train) + assert not model.support_vectors_.data.size + assert not model.dual_coef_.data.size + + +@pytest.mark.parametrize("loss", ["hinge", "squared_hinge"]) +@pytest.mark.parametrize("penalty", ["l1", "l2"]) +@pytest.mark.parametrize("dual", [True, False]) +def test_linearsvc_parameters(loss, penalty, dual): + # Test possible parameter combinations in LinearSVC + # Generate list of possible parameter combinations + X, y = make_classification(n_samples=5, n_features=5, random_state=0) + + clf = svm.LinearSVC(penalty=penalty, loss=loss, dual=dual, random_state=0) + if ( + (loss, penalty) == ("hinge", "l1") + or (loss, penalty, dual) == ("hinge", "l2", False) + or (penalty, dual) == ("l1", True) + ): + with pytest.raises( + ValueError, + match="Unsupported set of arguments.*penalty='%s.*loss='%s.*dual=%s" + % (penalty, loss, dual), + ): + clf.fit(X, y) + else: + clf.fit(X, y) + + +def test_linearsvc(): + # Test basic routines using LinearSVC + clf = svm.LinearSVC(random_state=0).fit(X, Y) + + # by default should have intercept + assert clf.fit_intercept + + assert_array_equal(clf.predict(T), true_result) + assert_array_almost_equal(clf.intercept_, [0], decimal=3) + + # the same with l1 penalty + clf = svm.LinearSVC( + penalty="l1", loss="squared_hinge", dual=False, random_state=0 + ).fit(X, Y) + assert_array_equal(clf.predict(T), true_result) + + # l2 penalty with dual formulation 
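+    # (with the l2 penalty, the squared hinge loss can be solved in either the
+    # primal or the dual; the plain hinge loss used next is only supported with
+    # dual=True)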
+ clf = svm.LinearSVC(penalty="l2", dual=True, random_state=0).fit(X, Y) + assert_array_equal(clf.predict(T), true_result) + + # l2 penalty, l1 loss + clf = svm.LinearSVC(penalty="l2", loss="hinge", dual=True, random_state=0) + clf.fit(X, Y) + assert_array_equal(clf.predict(T), true_result) + + # test also decision function + dec = clf.decision_function(T) + res = (dec > 0).astype(int) + 1 + assert_array_equal(res, true_result) + + +def test_linearsvc_crammer_singer(): + # Test LinearSVC with crammer_singer multi-class svm + ovr_clf = svm.LinearSVC(random_state=0).fit(iris.data, iris.target) + cs_clf = svm.LinearSVC(multi_class="crammer_singer", random_state=0) + cs_clf.fit(iris.data, iris.target) + + # similar prediction for ovr and crammer-singer: + assert (ovr_clf.predict(iris.data) == cs_clf.predict(iris.data)).mean() > 0.9 + + # classifiers shouldn't be the same + assert (ovr_clf.coef_ != cs_clf.coef_).all() + + # test decision function + assert_array_equal( + cs_clf.predict(iris.data), + np.argmax(cs_clf.decision_function(iris.data), axis=1), + ) + dec_func = np.dot(iris.data, cs_clf.coef_.T) + cs_clf.intercept_ + assert_array_almost_equal(dec_func, cs_clf.decision_function(iris.data)) + + +def test_linearsvc_fit_sampleweight(): + # check correct result when sample_weight is 1 + n_samples = len(X) + unit_weight = np.ones(n_samples) + clf = svm.LinearSVC(random_state=0).fit(X, Y) + clf_unitweight = svm.LinearSVC(random_state=0, tol=1e-12, max_iter=1000).fit( + X, Y, sample_weight=unit_weight + ) + + # check if same as sample_weight=None + assert_array_equal(clf_unitweight.predict(T), clf.predict(T)) + assert_allclose(clf.coef_, clf_unitweight.coef_, 1, 0.0001) + + # check that fit(X) = fit([X1, X2, X3],sample_weight = [n1, n2, n3]) where + # X = X1 repeated n1 times, X2 repeated n2 times and so forth + + random_state = check_random_state(0) + random_weight = random_state.randint(0, 10, n_samples) + lsvc_unflat = svm.LinearSVC(random_state=0, tol=1e-12, max_iter=1000).fit( + X, Y, sample_weight=random_weight + ) + + pred1 = lsvc_unflat.predict(T) + + X_flat = np.repeat(X, random_weight, axis=0) + y_flat = np.repeat(Y, random_weight, axis=0) + lsvc_flat = svm.LinearSVC(random_state=0, tol=1e-12, max_iter=1000).fit( + X_flat, y_flat + ) + pred2 = lsvc_flat.predict(T) + + assert_array_equal(pred1, pred2) + assert_allclose(lsvc_unflat.coef_, lsvc_flat.coef_, 1, 0.0001) + + +def test_crammer_singer_binary(): + # Test Crammer-Singer formulation in the binary case + X, y = make_classification(n_classes=2, random_state=0) + + for fit_intercept in (True, False): + acc = ( + svm.LinearSVC( + fit_intercept=fit_intercept, + multi_class="crammer_singer", + random_state=0, + ) + .fit(X, y) + .score(X, y) + ) + assert acc > 0.9 + + +def test_linearsvc_iris(): + # Test that LinearSVC gives plausible predictions on the iris dataset + # Also, test symbolic class names (classes_). 
+ target = iris.target_names[iris.target] + clf = svm.LinearSVC(random_state=0).fit(iris.data, target) + assert set(clf.classes_) == set(iris.target_names) + assert np.mean(clf.predict(iris.data) == target) > 0.8 + + dec = clf.decision_function(iris.data) + pred = iris.target_names[np.argmax(dec, 1)] + assert_array_equal(pred, clf.predict(iris.data)) + + +def test_dense_liblinear_intercept_handling(classifier=svm.LinearSVC): + # Test that dense liblinear honours intercept_scaling param + X = [[2, 1], [3, 1], [1, 3], [2, 3]] + y = [0, 0, 1, 1] + clf = classifier( + fit_intercept=True, + penalty="l1", + loss="squared_hinge", + dual=False, + C=4, + tol=1e-7, + random_state=0, + ) + assert clf.intercept_scaling == 1, clf.intercept_scaling + assert clf.fit_intercept + + # when intercept_scaling is low the intercept value is highly "penalized" + # by regularization + clf.intercept_scaling = 1 + clf.fit(X, y) + assert_almost_equal(clf.intercept_, 0, decimal=5) + + # when intercept_scaling is sufficiently high, the intercept value + # is not affected by regularization + clf.intercept_scaling = 100 + clf.fit(X, y) + intercept1 = clf.intercept_ + assert intercept1 < -1 + + # when intercept_scaling is sufficiently high, the intercept value + # doesn't depend on intercept_scaling value + clf.intercept_scaling = 1000 + clf.fit(X, y) + intercept2 = clf.intercept_ + assert_array_almost_equal(intercept1, intercept2, decimal=2) + + +def test_liblinear_set_coef(): + # multi-class case + clf = svm.LinearSVC().fit(iris.data, iris.target) + values = clf.decision_function(iris.data) + clf.coef_ = clf.coef_.copy() + clf.intercept_ = clf.intercept_.copy() + values2 = clf.decision_function(iris.data) + assert_array_almost_equal(values, values2) + + # binary-class case + X = [[2, 1], [3, 1], [1, 3], [2, 3]] + y = [0, 0, 1, 1] + + clf = svm.LinearSVC().fit(X, y) + values = clf.decision_function(X) + clf.coef_ = clf.coef_.copy() + clf.intercept_ = clf.intercept_.copy() + values2 = clf.decision_function(X) + assert_array_equal(values, values2) + + +def test_immutable_coef_property(): + # Check that primal coef modification are not silently ignored + svms = [ + svm.SVC(kernel="linear").fit(iris.data, iris.target), + svm.NuSVC(kernel="linear").fit(iris.data, iris.target), + svm.SVR(kernel="linear").fit(iris.data, iris.target), + svm.NuSVR(kernel="linear").fit(iris.data, iris.target), + svm.OneClassSVM(kernel="linear").fit(iris.data), + ] + for clf in svms: + with pytest.raises(AttributeError): + clf.__setattr__("coef_", np.arange(3)) + with pytest.raises((RuntimeError, ValueError)): + clf.coef_.__setitem__((0, 0), 0) + + +def test_linearsvc_verbose(): + # stdout: redirect + import os + + stdout = os.dup(1) # save original stdout + os.dup2(os.pipe()[1], 1) # replace it + + # actual call + clf = svm.LinearSVC(verbose=1) + clf.fit(X, Y) + + # stdout: restore + os.dup2(stdout, 1) # restore original stdout + + +def test_svc_clone_with_callable_kernel(): + # create SVM with callable linear kernel, check that results are the same + # as with built-in linear kernel + svm_callable = svm.SVC( + kernel=lambda x, y: np.dot(x, y.T), + probability=True, + random_state=0, + decision_function_shape="ovr", + ) + # clone for checking clonability with lambda functions.. 
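+    # base.clone only copies constructor parameters via get_params/set_params,
+    # carrying over no fitted state, so the lambda kernel has to survive that
+    # round trip for the cloned estimator to match the built-in linear kernel.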
+ svm_cloned = base.clone(svm_callable) + svm_cloned.fit(iris.data, iris.target) + + svm_builtin = svm.SVC( + kernel="linear", probability=True, random_state=0, decision_function_shape="ovr" + ) + svm_builtin.fit(iris.data, iris.target) + + assert_array_almost_equal(svm_cloned.dual_coef_, svm_builtin.dual_coef_) + assert_array_almost_equal(svm_cloned.intercept_, svm_builtin.intercept_) + assert_array_equal(svm_cloned.predict(iris.data), svm_builtin.predict(iris.data)) + + assert_array_almost_equal( + svm_cloned.predict_proba(iris.data), + svm_builtin.predict_proba(iris.data), + decimal=4, + ) + assert_array_almost_equal( + svm_cloned.decision_function(iris.data), + svm_builtin.decision_function(iris.data), + ) + + +def test_svc_bad_kernel(): + svc = svm.SVC(kernel=lambda x, y: x) + with pytest.raises(ValueError): + svc.fit(X, Y) + + +def test_libsvm_convergence_warnings(): + a = svm.SVC( + kernel=lambda x, y: np.dot(x, y.T), probability=True, random_state=0, max_iter=2 + ) + warning_msg = ( + r"Solver terminated early \(max_iter=2\). Consider pre-processing " + r"your data with StandardScaler or MinMaxScaler." + ) + with pytest.warns(ConvergenceWarning, match=warning_msg): + a.fit(np.array(X), Y) + assert np.all(a.n_iter_ == 2) + + +def test_unfitted(): + X = "foo!" # input validation not required when SVM not fitted + + clf = svm.SVC() + with pytest.raises(Exception, match=r".*\bSVC\b.*\bnot\b.*\bfitted\b"): + clf.predict(X) + + clf = svm.NuSVR() + with pytest.raises(Exception, match=r".*\bNuSVR\b.*\bnot\b.*\bfitted\b"): + clf.predict(X) + + +# ignore convergence warnings from max_iter=1 +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +def test_consistent_proba(): + a = svm.SVC(probability=True, max_iter=1, random_state=0) + proba_1 = a.fit(X, Y).predict_proba(X) + a = svm.SVC(probability=True, max_iter=1, random_state=0) + proba_2 = a.fit(X, Y).predict_proba(X) + assert_array_almost_equal(proba_1, proba_2) + + +def test_linear_svm_convergence_warnings(): + # Test that warnings are raised if model does not converge + + lsvc = svm.LinearSVC(random_state=0, max_iter=2) + warning_msg = "Liblinear failed to converge, increase the number of iterations." + with pytest.warns(ConvergenceWarning, match=warning_msg): + lsvc.fit(X, Y) + # Check that we have an n_iter_ attribute with int type as opposed to a + # numpy array or an np.int32 so as to match the docstring. + assert isinstance(lsvc.n_iter_, int) + assert lsvc.n_iter_ == 2 + + lsvr = svm.LinearSVR(random_state=0, max_iter=2) + with pytest.warns(ConvergenceWarning, match=warning_msg): + lsvr.fit(iris.data, iris.target) + assert isinstance(lsvr.n_iter_, int) + assert lsvr.n_iter_ == 2 + + +def test_svr_coef_sign(): + # Test that SVR(kernel="linear") has coef_ with the right sign. + # Non-regression test for #2933. 
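+    # For linear-kernel regressors, predictions should match the explicit
+    # affine form X @ coef_.ravel() + intercept_; a sign flip in coef_ would
+    # break this identity.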
+ X = np.random.RandomState(21).randn(10, 3) + y = np.random.RandomState(12).randn(10) + + for svr in [ + svm.SVR(kernel="linear"), + svm.NuSVR(kernel="linear"), + svm.LinearSVR(), + ]: + svr.fit(X, y) + assert_array_almost_equal( + svr.predict(X), np.dot(X, svr.coef_.ravel()) + svr.intercept_ + ) + + +def test_lsvc_intercept_scaling_zero(): + # Test that intercept_scaling is ignored when fit_intercept is False + + lsvc = svm.LinearSVC(fit_intercept=False) + lsvc.fit(X, Y) + assert lsvc.intercept_ == 0.0 + + +def test_hasattr_predict_proba(): + # Method must be (un)available before or after fit, switched by + # `probability` param + + G = svm.SVC(probability=True) + assert hasattr(G, "predict_proba") + G.fit(iris.data, iris.target) + assert hasattr(G, "predict_proba") + + G = svm.SVC(probability=False) + assert not hasattr(G, "predict_proba") + G.fit(iris.data, iris.target) + assert not hasattr(G, "predict_proba") + + # Switching to `probability=True` after fitting should make + # predict_proba available, but calling it must not work: + G.probability = True + assert hasattr(G, "predict_proba") + msg = "predict_proba is not available when fitted with probability=False" + + with pytest.raises(NotFittedError, match=msg): + G.predict_proba(iris.data) + + +def test_decision_function_shape_two_class(): + for n_classes in [2, 3]: + X, y = make_blobs(centers=n_classes, random_state=0) + for estimator in [svm.SVC, svm.NuSVC]: + clf = OneVsRestClassifier(estimator(decision_function_shape="ovr")).fit( + X, y + ) + assert len(clf.predict(X)) == len(y) + + +def test_ovr_decision_function(): + # One point from each quadrant represents one class + X_train = np.array([[1, 1], [-1, 1], [-1, -1], [1, -1]]) + y_train = [0, 1, 2, 3] + + # First point is closer to the decision boundaries than the second point + base_points = np.array([[5, 5], [10, 10]]) + + # For all the quadrants (classes) + X_test = np.vstack( + ( + base_points * [1, 1], # Q1 + base_points * [-1, 1], # Q2 + base_points * [-1, -1], # Q3 + base_points * [1, -1], # Q4 + ) + ) + + y_test = [0] * 2 + [1] * 2 + [2] * 2 + [3] * 2 + + clf = svm.SVC(kernel="linear", decision_function_shape="ovr") + clf.fit(X_train, y_train) + + y_pred = clf.predict(X_test) + + # Test if the prediction is the same as y + assert_array_equal(y_pred, y_test) + + deci_val = clf.decision_function(X_test) + + # Assert that the predicted class has the maximum value + assert_array_equal(np.argmax(deci_val, axis=1), y_pred) + + # Get decision value at test points for the predicted class + pred_class_deci_val = deci_val[range(8), y_pred].reshape((4, 2)) + + # Assert pred_class_deci_val > 0 here + assert np.min(pred_class_deci_val) > 0.0 + + # Test if the first point has lower decision value on every quadrant + # compared to the second point + assert np.all(pred_class_deci_val[:, 0] < pred_class_deci_val[:, 1]) + + +@pytest.mark.parametrize("SVCClass", [svm.SVC, svm.NuSVC]) +def test_svc_invalid_break_ties_param(SVCClass): + X, y = make_blobs(random_state=42) + + svm = SVCClass( + kernel="linear", decision_function_shape="ovo", break_ties=True, random_state=42 + ).fit(X, y) + + with pytest.raises(ValueError, match="break_ties must be False"): + svm.predict(y) + + +@pytest.mark.parametrize("SVCClass", [svm.SVC, svm.NuSVC]) +def test_svc_ovr_tie_breaking(SVCClass): + """Test if predict breaks ties in OVR mode. + Related issue: https://github.com/scikit-learn/scikit-learn/issues/8277 + """ + if SVCClass.__name__ == "NuSVC" and _IS_32BIT: + # XXX: known failure to be investigated. 
Either the code needs to be + # fixed or the test itself might need to be made less sensitive to + # random changes in test data and rounding errors more generally. + # https://github.com/scikit-learn/scikit-learn/issues/29633 + pytest.xfail("Failing test on 32bit OS") + + X, y = make_blobs(random_state=0, n_samples=20, n_features=2) + + xs = np.linspace(X[:, 0].min(), X[:, 0].max(), 100) + ys = np.linspace(X[:, 1].min(), X[:, 1].max(), 100) + xx, yy = np.meshgrid(xs, ys) + + common_params = dict( + kernel="rbf", gamma=1e6, random_state=42, decision_function_shape="ovr" + ) + svm = SVCClass( + break_ties=False, + **common_params, + ).fit(X, y) + pred = svm.predict(np.c_[xx.ravel(), yy.ravel()]) + dv = svm.decision_function(np.c_[xx.ravel(), yy.ravel()]) + assert not np.all(pred == np.argmax(dv, axis=1)) + + svm = SVCClass( + break_ties=True, + **common_params, + ).fit(X, y) + pred = svm.predict(np.c_[xx.ravel(), yy.ravel()]) + dv = svm.decision_function(np.c_[xx.ravel(), yy.ravel()]) + assert np.all(pred == np.argmax(dv, axis=1)) + + +def test_gamma_scale(): + X, y = [[0.0], [1.0]], [0, 1] + + clf = svm.SVC() + clf.fit(X, y) + assert_almost_equal(clf._gamma, 4) + + +@pytest.mark.parametrize( + "SVM, params", + [ + (LinearSVC, {"penalty": "l1", "loss": "squared_hinge", "dual": False}), + (LinearSVC, {"penalty": "l2", "loss": "squared_hinge", "dual": True}), + (LinearSVC, {"penalty": "l2", "loss": "squared_hinge", "dual": False}), + (LinearSVC, {"penalty": "l2", "loss": "hinge", "dual": True}), + (LinearSVR, {"loss": "epsilon_insensitive", "dual": True}), + (LinearSVR, {"loss": "squared_epsilon_insensitive", "dual": True}), + (LinearSVR, {"loss": "squared_epsilon_insensitive", "dual": True}), + ], +) +def test_linearsvm_liblinear_sample_weight(SVM, params): + X = np.array( + [ + [1, 3], + [1, 3], + [1, 3], + [1, 3], + [2, 1], + [2, 1], + [2, 1], + [2, 1], + [3, 3], + [3, 3], + [3, 3], + [3, 3], + [4, 1], + [4, 1], + [4, 1], + [4, 1], + ], + dtype=np.dtype("float"), + ) + y = np.array( + [1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2], dtype=np.dtype("int") + ) + + X2 = np.vstack([X, X]) + y2 = np.hstack([y, 3 - y]) + sample_weight = np.ones(shape=len(y) * 2) + sample_weight[len(y) :] = 0 + X2, y2, sample_weight = shuffle(X2, y2, sample_weight, random_state=0) + + base_estimator = SVM(random_state=42) + base_estimator.set_params(**params) + base_estimator.set_params(tol=1e-12, max_iter=1000) + est_no_weight = base.clone(base_estimator).fit(X, y) + est_with_weight = base.clone(base_estimator).fit( + X2, y2, sample_weight=sample_weight + ) + + for method in ("predict", "decision_function"): + if hasattr(base_estimator, method): + X_est_no_weight = getattr(est_no_weight, method)(X) + X_est_with_weight = getattr(est_with_weight, method)(X) + assert_allclose(X_est_no_weight, X_est_with_weight) + + +@pytest.mark.parametrize("Klass", (OneClassSVM, SVR, NuSVR)) +def test_n_support(Klass): + # Make n_support is correct for oneclass and SVR (used to be + # non-initialized) + # this is a non regression test for issue #14774 + X = np.array([[0], [0.44], [0.45], [0.46], [1]]) + y = np.arange(X.shape[0]) + est = Klass() + assert not hasattr(est, "n_support_") + est.fit(X, y) + assert est.n_support_[0] == est.support_vectors_.shape[0] + assert est.n_support_.size == 1 + + +@pytest.mark.parametrize("Estimator", [svm.SVC, svm.SVR]) +def test_custom_kernel_not_array_input(Estimator): + """Test using a custom kernel that is not fed with array-like for floats""" + data = ["A A", "A", "B", "B B", "A B"] + X = 
np.array([[2, 0], [1, 0], [0, 1], [0, 2], [1, 1]]) # count encoding + y = np.array([1, 1, 2, 2, 1]) + + def string_kernel(X1, X2): + assert isinstance(X1[0], str) + n_samples1 = _num_samples(X1) + n_samples2 = _num_samples(X2) + K = np.zeros((n_samples1, n_samples2)) + for ii in range(n_samples1): + for jj in range(ii, n_samples2): + K[ii, jj] = X1[ii].count("A") * X2[jj].count("A") + K[ii, jj] += X1[ii].count("B") * X2[jj].count("B") + K[jj, ii] = K[ii, jj] + return K + + K = string_kernel(data, data) + assert_array_equal(np.dot(X, X.T), K) + + svc1 = Estimator(kernel=string_kernel).fit(data, y) + svc2 = Estimator(kernel="linear").fit(X, y) + svc3 = Estimator(kernel="precomputed").fit(K, y) + + assert svc1.score(data, y) == svc3.score(K, y) + assert svc1.score(data, y) == svc2.score(X, y) + if hasattr(svc1, "decision_function"): # classifier + assert_allclose(svc1.decision_function(data), svc2.decision_function(X)) + assert_allclose(svc1.decision_function(data), svc3.decision_function(K)) + assert_array_equal(svc1.predict(data), svc2.predict(X)) + assert_array_equal(svc1.predict(data), svc3.predict(K)) + else: # regressor + assert_allclose(svc1.predict(data), svc2.predict(X)) + assert_allclose(svc1.predict(data), svc3.predict(K)) + + +def test_svc_raises_error_internal_representation(): + """Check that SVC raises error when internal representation is altered. + + Non-regression test for #18891 and https://nvd.nist.gov/vuln/detail/CVE-2020-28975 + """ + clf = svm.SVC(kernel="linear").fit(X, Y) + clf._n_support[0] = 1000000 + + msg = "The internal representation of SVC was altered" + with pytest.raises(ValueError, match=msg): + clf.predict(X) + + +@pytest.mark.parametrize( + "estimator, expected_n_iter_type", + [ + (svm.SVC, np.ndarray), + (svm.NuSVC, np.ndarray), + (svm.SVR, int), + (svm.NuSVR, int), + (svm.OneClassSVM, int), + ], +) +@pytest.mark.parametrize( + "dataset", + [ + make_classification(n_classes=2, n_informative=2, random_state=0), + make_classification(n_classes=3, n_informative=3, random_state=0), + make_classification(n_classes=4, n_informative=4, random_state=0), + ], +) +def test_n_iter_libsvm(estimator, expected_n_iter_type, dataset): + # Check that the type of n_iter_ is correct for the classes that inherit + # from BaseSVC. + # Note that for SVC, and NuSVC this is an ndarray; while for SVR, NuSVR, and + # OneClassSVM, it is an int. + # For SVC and NuSVC also check the shape of n_iter_. 
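+    # In the one-vs-one scheme libsvm solves one binary problem per pair of
+    # classes, so n_iter_ holds n_classes * (n_classes - 1) / 2 entries for the
+    # classifiers, while the single-problem estimators report a plain int.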
+ X, y = dataset + n_iter = estimator(kernel="linear").fit(X, y).n_iter_ + assert type(n_iter) == expected_n_iter_type + if estimator in [svm.SVC, svm.NuSVC]: + n_classes = len(np.unique(y)) + assert n_iter.shape == (n_classes * (n_classes - 1) // 2,) + + +@pytest.mark.parametrize("loss", ["squared_hinge", "squared_epsilon_insensitive"]) +def test_dual_auto(loss): + # OvR, L2, N > M (6,2) + dual = _validate_dual_parameter("auto", loss, "l2", "ovr", np.asarray(X)) + assert dual is False + # OvR, L2, N < M (2,6) + dual = _validate_dual_parameter("auto", loss, "l2", "ovr", np.asarray(X).T) + assert dual is True + + +def test_dual_auto_edge_cases(): + # Hinge, OvR, L2, N > M (6,2) + dual = _validate_dual_parameter("auto", "hinge", "l2", "ovr", np.asarray(X)) + assert dual is True # only supports True + dual = _validate_dual_parameter( + "auto", "epsilon_insensitive", "l2", "ovr", np.asarray(X) + ) + assert dual is True # only supports True + # SqHinge, OvR, L1, N < M (2,6) + dual = _validate_dual_parameter( + "auto", "squared_hinge", "l1", "ovr", np.asarray(X).T + ) + assert dual is False # only supports False + + +@pytest.mark.parametrize( + "Estimator, make_dataset", + [(svm.SVC, make_classification), (svm.SVR, make_regression)], +) +@pytest.mark.parametrize("C_inf", [np.inf, float("inf")]) +def test_svm_with_infinite_C(Estimator, make_dataset, C_inf, global_random_seed): + """Check that we can pass `C=inf` that is equivalent to a very large C value. + + Non-regression test for + https://github.com/scikit-learn/scikit-learn/issues/29772 + """ + X, y = make_dataset(random_state=global_random_seed) + estimator_C_inf = Estimator(C=C_inf).fit(X, y) + estimator_C_large = Estimator(C=1e10).fit(X, y) + + assert_allclose(estimator_C_large.predict(X), estimator_C_inf.predict(X)) diff --git a/.venv/lib/python3.12/site-packages/sklearn/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/tests/metadata_routing_common.py b/.venv/lib/python3.12/site-packages/sklearn/tests/metadata_routing_common.py new file mode 100644 index 0000000000000000000000000000000000000000..f4dd79581db9097bd45d99b2f11e80f90862d58f --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/tests/metadata_routing_common.py @@ -0,0 +1,584 @@ +import inspect +from collections import defaultdict +from functools import partial + +import numpy as np +from numpy.testing import assert_array_equal + +from sklearn.base import ( + BaseEstimator, + ClassifierMixin, + MetaEstimatorMixin, + RegressorMixin, + TransformerMixin, + clone, +) +from sklearn.metrics._scorer import _Scorer, mean_squared_error +from sklearn.model_selection import BaseCrossValidator +from sklearn.model_selection._split import GroupsConsumerMixin +from sklearn.utils._metadata_requests import ( + SIMPLE_METHODS, +) +from sklearn.utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + process_routing, +) +from sklearn.utils.multiclass import _check_partial_fit_first_call + + +def record_metadata(obj, record_default=True, **kwargs): + """Utility function to store passed metadata to a method of obj. + + If record_default is False, kwargs whose values are "default" are skipped. + This is so that checks on keyword arguments whose default was not changed + are skipped. 
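+
+    Records are appended to ``obj._records[callee][caller]`` as one dict of
+    kwargs per call, where ``callee`` is the method that received the metadata
+    and ``caller`` is the function that invoked it; ``check_recorded_metadata``
+    reads them back for the assertions.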
+ + """ + stack = inspect.stack() + callee = stack[1].function + caller = stack[2].function + if not hasattr(obj, "_records"): + obj._records = defaultdict(lambda: defaultdict(list)) + if not record_default: + kwargs = { + key: val + for key, val in kwargs.items() + if not isinstance(val, str) or (val != "default") + } + obj._records[callee][caller].append(kwargs) + + +def check_recorded_metadata(obj, method, parent, split_params=tuple(), **kwargs): + """Check whether the expected metadata is passed to the object's method. + + Parameters + ---------- + obj : estimator object + sub-estimator to check routed params for + method : str + sub-estimator's method where metadata is routed to, or otherwise in + the context of metadata routing referred to as 'callee' + parent : str + the parent method which should have called `method`, or otherwise in + the context of metadata routing referred to as 'caller' + split_params : tuple, default=empty + specifies any parameters which are to be checked as being a subset + of the original values + **kwargs : dict + passed metadata + """ + all_records = ( + getattr(obj, "_records", dict()).get(method, dict()).get(parent, list()) + ) + for record in all_records: + # first check that the names of the metadata passed are the same as + # expected. The names are stored as keys in `record`. + assert set(kwargs.keys()) == set(record.keys()), ( + f"Expected {kwargs.keys()} vs {record.keys()}" + ) + for key, value in kwargs.items(): + recorded_value = record[key] + # The following condition is used to check for any specified parameters + # being a subset of the original values + if key in split_params and recorded_value is not None: + assert np.isin(recorded_value, value).all() + else: + if isinstance(recorded_value, np.ndarray): + assert_array_equal(recorded_value, value) + else: + assert recorded_value is value, ( + f"Expected {recorded_value} vs {value}. Method: {method}" + ) + + +record_metadata_not_default = partial(record_metadata, record_default=False) + + +def assert_request_is_empty(metadata_request, exclude=None): + """Check if a metadata request dict is empty. + + One can exclude a method or a list of methods from the check using the + ``exclude`` parameter. If metadata_request is a MetadataRouter, then + ``exclude`` can be of the form ``{"object" : [method, ...]}``. + """ + if isinstance(metadata_request, MetadataRouter): + for name, route_mapping in metadata_request: + if exclude is not None and name in exclude: + _exclude = exclude[name] + else: + _exclude = None + assert_request_is_empty(route_mapping.router, exclude=_exclude) + return + + exclude = [] if exclude is None else exclude + for method in SIMPLE_METHODS: + if method in exclude: + continue + mmr = getattr(metadata_request, method) + props = [ + prop + for prop, alias in mmr.requests.items() + if isinstance(alias, str) or alias is not None + ] + assert not props + + +def assert_request_equal(request, dictionary): + for method, requests in dictionary.items(): + mmr = getattr(request, method) + assert mmr.requests == requests + + empty_methods = [method for method in SIMPLE_METHODS if method not in dictionary] + for method in empty_methods: + assert not len(getattr(request, method).requests) + + +class _Registry(list): + # This list is used to get a reference to the sub-estimators, which are not + # necessarily stored on the metaestimator. 
We need to override __deepcopy__ + # because the sub-estimators are probably cloned, which would result in a + # new copy of the list, but we need copy and deep copy both to return the + # same instance. + def __deepcopy__(self, memo): + return self + + def __copy__(self): + return self + + +class ConsumingRegressor(RegressorMixin, BaseEstimator): + """A regressor consuming metadata. + + Parameters + ---------- + registry : list, default=None + If a list, the estimator will append itself to the list in order to have + a reference to the estimator later on. Since that reference is not + required in all tests, registration can be skipped by leaving this value + as None. + """ + + def __init__(self, registry=None): + self.registry = registry + + def partial_fit(self, X, y, sample_weight="default", metadata="default"): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + return self + + def fit(self, X, y, sample_weight="default", metadata="default"): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + return self + + def predict(self, X, y=None, sample_weight="default", metadata="default"): + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + return np.zeros(shape=(len(X),)) + + def score(self, X, y, sample_weight="default", metadata="default"): + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + return 1 + + +class NonConsumingClassifier(ClassifierMixin, BaseEstimator): + """A classifier which accepts no metadata on any method.""" + + def __init__(self, alpha=0.0): + self.alpha = alpha + + def fit(self, X, y): + self.classes_ = np.unique(y) + self.coef_ = np.ones_like(X) + return self + + def partial_fit(self, X, y, classes=None): + return self + + def decision_function(self, X): + return self.predict(X) + + def predict(self, X): + y_pred = np.empty(shape=(len(X),)) + y_pred[: len(X) // 2] = 0 + y_pred[len(X) // 2 :] = 1 + return y_pred + + def predict_proba(self, X): + # dummy probabilities to support predict_proba + y_proba = np.empty(shape=(len(X), len(self.classes_)), dtype=np.float32) + # each row sums up to 1.0: + y_proba[:] = np.random.dirichlet(alpha=np.ones(len(self.classes_)), size=len(X)) + return y_proba + + def predict_log_proba(self, X): + # dummy probabilities to support predict_log_proba + return self.predict_proba(X) + + +class NonConsumingRegressor(RegressorMixin, BaseEstimator): + """A classifier which accepts no metadata on any method.""" + + def fit(self, X, y): + return self + + def partial_fit(self, X, y): + return self + + def predict(self, X): + return np.ones(len(X)) # pragma: no cover + + +class ConsumingClassifier(ClassifierMixin, BaseEstimator): + """A classifier consuming metadata. + + Parameters + ---------- + registry : list, default=None + If a list, the estimator will append itself to the list in order to have + a reference to the estimator later on. Since that reference is not + required in all tests, registration can be skipped by leaving this value + as None. + + alpha : float, default=0 + This parameter is only used to test the ``*SearchCV`` objects, and + doesn't do anything. 
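+
+    A rough usage sketch (illustrative only, mirroring how these tests exercise
+    consuming estimators)::
+
+        registry = _Registry()
+        est = ConsumingClassifier(registry=registry)
+        est.fit(X, y, sample_weight=sw, metadata=md)
+        assert registry[-1] is est  # the estimator registered itself
+        # whatever ``fit`` received is stored on ``est._records`` and can be
+        # verified with ``check_recorded_metadata``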
+ """ + + def __init__(self, registry=None, alpha=0.0): + self.alpha = alpha + self.registry = registry + + def partial_fit( + self, X, y, classes=None, sample_weight="default", metadata="default" + ): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + _check_partial_fit_first_call(self, classes) + return self + + def fit(self, X, y, sample_weight="default", metadata="default"): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + + self.classes_ = np.unique(y) + self.coef_ = np.ones_like(X) + return self + + def predict(self, X, sample_weight="default", metadata="default"): + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + y_score = np.empty(shape=(len(X),), dtype="int8") + y_score[len(X) // 2 :] = 0 + y_score[: len(X) // 2] = 1 + return y_score + + def predict_proba(self, X, sample_weight="default", metadata="default"): + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + y_proba = np.empty(shape=(len(X), len(self.classes_)), dtype=np.float32) + # each row sums up to 1.0: + y_proba[:] = np.random.dirichlet(alpha=np.ones(len(self.classes_)), size=len(X)) + return y_proba + + def predict_log_proba(self, X, sample_weight="default", metadata="default"): + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + return self.predict_proba(X) + + def decision_function(self, X, sample_weight="default", metadata="default"): + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + y_score = np.empty(shape=(len(X),)) + y_score[len(X) // 2 :] = 0 + y_score[: len(X) // 2] = 1 + return y_score + + def score(self, X, y, sample_weight="default", metadata="default"): + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + return 1 + + +class ConsumingClassifierWithoutPredictProba(ConsumingClassifier): + """ConsumingClassifier without a predict_proba method, but with predict_log_proba. + + Used to mimic dynamic method selection such as in the `_parallel_predict_proba()` + function called by `BaggingClassifier`. + """ + + @property + def predict_proba(self): + raise AttributeError("This estimator does not support predict_proba") + + +class ConsumingClassifierWithoutPredictLogProba(ConsumingClassifier): + """ConsumingClassifier without a predict_log_proba method, but with predict_proba. + + Used to mimic dynamic method selection such as in + `BaggingClassifier.predict_log_proba()`. + """ + + @property + def predict_log_proba(self): + raise AttributeError("This estimator does not support predict_log_proba") + + +class ConsumingClassifierWithOnlyPredict(ConsumingClassifier): + """ConsumingClassifier with only a predict method. + + Used to mimic dynamic method selection such as in + `BaggingClassifier.predict_log_proba()`. + """ + + @property + def predict_proba(self): + raise AttributeError("This estimator does not support predict_proba") + + @property + def predict_log_proba(self): + raise AttributeError("This estimator does not support predict_log_proba") + + +class ConsumingTransformer(TransformerMixin, BaseEstimator): + """A transformer which accepts metadata on fit and transform. 
+ + Parameters + ---------- + registry : list, default=None + If a list, the estimator will append itself to the list in order to have + a reference to the estimator later on. Since that reference is not + required in all tests, registration can be skipped by leaving this value + as None. + """ + + def __init__(self, registry=None): + self.registry = registry + + def fit(self, X, y=None, sample_weight="default", metadata="default"): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + self.fitted_ = True + return self + + def transform(self, X, sample_weight="default", metadata="default"): + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + return X + 1 + + def fit_transform(self, X, y, sample_weight="default", metadata="default"): + # implementing ``fit_transform`` is necessary since + # ``TransformerMixin.fit_transform`` doesn't route any metadata to + # ``transform``, while here we want ``transform`` to receive + # ``sample_weight`` and ``metadata``. + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + return self.fit(X, y, sample_weight=sample_weight, metadata=metadata).transform( + X, sample_weight=sample_weight, metadata=metadata + ) + + def inverse_transform(self, X, sample_weight=None, metadata=None): + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + return X - 1 + + +class ConsumingNoFitTransformTransformer(BaseEstimator): + """A metadata consuming transformer that doesn't inherit from + TransformerMixin, and thus doesn't implement `fit_transform`. Note that + TransformerMixin's `fit_transform` doesn't route metadata to `transform`.""" + + def __init__(self, registry=None): + self.registry = registry + + def fit(self, X, y=None, sample_weight=None, metadata=None): + if self.registry is not None: + self.registry.append(self) + + record_metadata(self, sample_weight=sample_weight, metadata=metadata) + + return self + + def transform(self, X, sample_weight=None, metadata=None): + record_metadata(self, sample_weight=sample_weight, metadata=metadata) + return X + + +class ConsumingScorer(_Scorer): + def __init__(self, registry=None): + super().__init__( + score_func=mean_squared_error, sign=1, kwargs={}, response_method="predict" + ) + self.registry = registry + + def _score(self, method_caller, clf, X, y, **kwargs): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default(self, **kwargs) + + sample_weight = kwargs.get("sample_weight", None) + return super()._score(method_caller, clf, X, y, sample_weight=sample_weight) + + +class ConsumingSplitter(GroupsConsumerMixin, BaseCrossValidator): + def __init__(self, registry=None): + self.registry = registry + + def split(self, X, y=None, groups="default", metadata="default"): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default(self, groups=groups, metadata=metadata) + + split_index = len(X) // 2 + train_indices = list(range(0, split_index)) + test_indices = list(range(split_index, len(X))) + yield test_indices, train_indices + yield train_indices, test_indices + + def get_n_splits(self, X=None, y=None, groups=None, metadata=None): + return 2 + + def _iter_test_indices(self, X=None, y=None, groups=None): + split_index = len(X) // 2 + train_indices = list(range(0, split_index)) + test_indices = list(range(split_index, len(X))) + yield test_indices + 
yield train_indices + + +class MetaRegressor(MetaEstimatorMixin, RegressorMixin, BaseEstimator): + """A meta-regressor which is only a router.""" + + def __init__(self, estimator): + self.estimator = estimator + + def fit(self, X, y, **fit_params): + params = process_routing(self, "fit", **fit_params) + self.estimator_ = clone(self.estimator).fit(X, y, **params.estimator.fit) + + def get_metadata_routing(self): + router = MetadataRouter(owner=self.__class__.__name__).add( + estimator=self.estimator, + method_mapping=MethodMapping().add(caller="fit", callee="fit"), + ) + return router + + +class WeightedMetaRegressor(MetaEstimatorMixin, RegressorMixin, BaseEstimator): + """A meta-regressor which is also a consumer.""" + + def __init__(self, estimator, registry=None): + self.estimator = estimator + self.registry = registry + + def fit(self, X, y, sample_weight=None, **fit_params): + if self.registry is not None: + self.registry.append(self) + + record_metadata(self, sample_weight=sample_weight) + params = process_routing(self, "fit", sample_weight=sample_weight, **fit_params) + self.estimator_ = clone(self.estimator).fit(X, y, **params.estimator.fit) + return self + + def predict(self, X, **predict_params): + params = process_routing(self, "predict", **predict_params) + return self.estimator_.predict(X, **params.estimator.predict) + + def get_metadata_routing(self): + router = ( + MetadataRouter(owner=self.__class__.__name__) + .add_self_request(self) + .add( + estimator=self.estimator, + method_mapping=MethodMapping() + .add(caller="fit", callee="fit") + .add(caller="predict", callee="predict"), + ) + ) + return router + + +class WeightedMetaClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator): + """A meta-estimator which also consumes sample_weight itself in ``fit``.""" + + def __init__(self, estimator, registry=None): + self.estimator = estimator + self.registry = registry + + def fit(self, X, y, sample_weight=None, **kwargs): + if self.registry is not None: + self.registry.append(self) + + record_metadata(self, sample_weight=sample_weight) + params = process_routing(self, "fit", sample_weight=sample_weight, **kwargs) + self.estimator_ = clone(self.estimator).fit(X, y, **params.estimator.fit) + return self + + def get_metadata_routing(self): + router = ( + MetadataRouter(owner=self.__class__.__name__) + .add_self_request(self) + .add( + estimator=self.estimator, + method_mapping=MethodMapping().add(caller="fit", callee="fit"), + ) + ) + return router + + +class MetaTransformer(MetaEstimatorMixin, TransformerMixin, BaseEstimator): + """A simple meta-transformer.""" + + def __init__(self, transformer): + self.transformer = transformer + + def fit(self, X, y=None, **fit_params): + params = process_routing(self, "fit", **fit_params) + self.transformer_ = clone(self.transformer).fit(X, y, **params.transformer.fit) + return self + + def transform(self, X, y=None, **transform_params): + params = process_routing(self, "transform", **transform_params) + return self.transformer_.transform(X, **params.transformer.transform) + + def get_metadata_routing(self): + return MetadataRouter(owner=self.__class__.__name__).add( + transformer=self.transformer, + method_mapping=MethodMapping() + .add(caller="fit", callee="fit") + .add(caller="transform", callee="transform"), + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/tests/test_base.py b/.venv/lib/python3.12/site-packages/sklearn/tests/test_base.py new file mode 100644 index 
0000000000000000000000000000000000000000..0842cf0c82b485b16717ac19c78b4d51098769eb --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/tests/test_base.py @@ -0,0 +1,1081 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import pickle +import re +import warnings + +import numpy as np +import pytest +import scipy.sparse as sp +from numpy.testing import assert_allclose + +import sklearn +from sklearn import config_context, datasets +from sklearn.base import ( + BaseEstimator, + OutlierMixin, + TransformerMixin, + clone, + is_classifier, + is_clusterer, + is_outlier_detector, + is_regressor, +) +from sklearn.cluster import KMeans +from sklearn.decomposition import PCA +from sklearn.ensemble import IsolationForest +from sklearn.exceptions import InconsistentVersionWarning +from sklearn.metrics import get_scorer +from sklearn.model_selection import GridSearchCV, KFold +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.svm import SVC, SVR +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor +from sklearn.utils._mocking import MockDataFrame +from sklearn.utils._set_output import _get_output_config +from sklearn.utils._testing import ( + _convert_container, + assert_array_equal, +) +from sklearn.utils.validation import _check_n_features, validate_data + + +############################################################################# +# A few test classes +class MyEstimator(BaseEstimator): + def __init__(self, l1=0, empty=None): + self.l1 = l1 + self.empty = empty + + +class K(BaseEstimator): + def __init__(self, c=None, d=None): + self.c = c + self.d = d + + +class T(BaseEstimator): + def __init__(self, a=None, b=None): + self.a = a + self.b = b + + +class NaNTag(BaseEstimator): + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + return tags + + +class NoNaNTag(BaseEstimator): + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = False + return tags + + +class OverrideTag(NaNTag): + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = False + return tags + + +class DiamondOverwriteTag(NaNTag, NoNaNTag): + pass + + +class InheritDiamondOverwriteTag(DiamondOverwriteTag): + pass + + +class ModifyInitParams(BaseEstimator): + """Deprecated behavior. + Equal parameters but with a type cast. + Doesn't fulfill a is a + """ + + def __init__(self, a=np.array([0])): + self.a = a.copy() + + +class Buggy(BaseEstimator): + "A buggy estimator that does not set its parameters right." + + def __init__(self, a=None): + self.a = 1 + + +class NoEstimator: + def __init__(self): + pass + + def fit(self, X=None, y=None): + return self + + def predict(self, X=None): + return None + + +class VargEstimator(BaseEstimator): + """scikit-learn estimators shouldn't have vargs.""" + + def __init__(self, *vargs): + pass + + +############################################################################# +# The tests + + +def test_clone(): + # Tests that clone creates a correct deep copy. + # We create an estimator, make a copy of its original state + # (which, in this case, is the current state of the estimator), + # and check that the obtained copy is a correct deep copy. 
+ + from sklearn.feature_selection import SelectFpr, f_classif + + selector = SelectFpr(f_classif, alpha=0.1) + new_selector = clone(selector) + assert selector is not new_selector + assert selector.get_params() == new_selector.get_params() + + selector = SelectFpr(f_classif, alpha=np.zeros((10, 2))) + new_selector = clone(selector) + assert selector is not new_selector + + +def test_clone_2(): + # Tests that clone doesn't copy everything. + # We first create an estimator, give it an own attribute, and + # make a copy of its original state. Then we check that the copy doesn't + # have the specific attribute we manually added to the initial estimator. + + from sklearn.feature_selection import SelectFpr, f_classif + + selector = SelectFpr(f_classif, alpha=0.1) + selector.own_attribute = "test" + new_selector = clone(selector) + assert not hasattr(new_selector, "own_attribute") + + +def test_clone_buggy(): + # Check that clone raises an error on buggy estimators. + buggy = Buggy() + buggy.a = 2 + with pytest.raises(RuntimeError): + clone(buggy) + + no_estimator = NoEstimator() + with pytest.raises(TypeError): + clone(no_estimator) + + varg_est = VargEstimator() + with pytest.raises(RuntimeError): + clone(varg_est) + + est = ModifyInitParams() + with pytest.raises(RuntimeError): + clone(est) + + +def test_clone_empty_array(): + # Regression test for cloning estimators with empty arrays + clf = MyEstimator(empty=np.array([])) + clf2 = clone(clf) + assert_array_equal(clf.empty, clf2.empty) + + clf = MyEstimator(empty=sp.csr_matrix(np.array([[0]]))) + clf2 = clone(clf) + assert_array_equal(clf.empty.data, clf2.empty.data) + + +def test_clone_nan(): + # Regression test for cloning estimators with default parameter as np.nan + clf = MyEstimator(empty=np.nan) + clf2 = clone(clf) + + assert clf.empty is clf2.empty + + +def test_clone_dict(): + # test that clone creates a clone of a dict + orig = {"a": MyEstimator()} + cloned = clone(orig) + assert orig["a"] is not cloned["a"] + + +def test_clone_sparse_matrices(): + sparse_matrix_classes = [ + cls + for name in dir(sp) + if name.endswith("_matrix") and type(cls := getattr(sp, name)) is type + ] + + for cls in sparse_matrix_classes: + sparse_matrix = cls(np.eye(5)) + clf = MyEstimator(empty=sparse_matrix) + clf_cloned = clone(clf) + assert clf.empty.__class__ is clf_cloned.empty.__class__ + assert_array_equal(clf.empty.toarray(), clf_cloned.empty.toarray()) + + +def test_clone_estimator_types(): + # Check that clone works for parameters that are types rather than + # instances + clf = MyEstimator(empty=MyEstimator) + clf2 = clone(clf) + + assert clf.empty is clf2.empty + + +def test_clone_class_rather_than_instance(): + # Check that clone raises expected error message when + # cloning class rather than instance + msg = "You should provide an instance of scikit-learn estimator" + with pytest.raises(TypeError, match=msg): + clone(MyEstimator) + + +def test_repr(): + # Smoke test the repr of the base estimator. 
+ my_estimator = MyEstimator() + repr(my_estimator) + test = T(K(), K()) + assert repr(test) == "T(a=K(), b=K())" + + some_est = T(a=["long_params"] * 1000) + assert len(repr(some_est)) == 485 + + +def test_str(): + # Smoke test the str of the base estimator + my_estimator = MyEstimator() + str(my_estimator) + + +def test_get_params(): + test = T(K(), K) + + assert "a__d" in test.get_params(deep=True) + assert "a__d" not in test.get_params(deep=False) + + test.set_params(a__d=2) + assert test.a.d == 2 + + with pytest.raises(ValueError): + test.set_params(a__a=2) + + +# TODO(1.8): Remove this test when the deprecation is removed +def test_is_estimator_type_class(): + with pytest.warns(FutureWarning, match="passing a class to.*is deprecated"): + assert is_classifier(SVC) + + with pytest.warns(FutureWarning, match="passing a class to.*is deprecated"): + assert is_regressor(SVR) + + with pytest.warns(FutureWarning, match="passing a class to.*is deprecated"): + assert is_clusterer(KMeans) + + with pytest.warns(FutureWarning, match="passing a class to.*is deprecated"): + assert is_outlier_detector(IsolationForest) + + +@pytest.mark.parametrize( + "estimator, expected_result", + [ + (SVC(), True), + (GridSearchCV(SVC(), {"C": [0.1, 1]}), True), + (Pipeline([("svc", SVC())]), True), + (Pipeline([("svc_cv", GridSearchCV(SVC(), {"C": [0.1, 1]}))]), True), + (SVR(), False), + (GridSearchCV(SVR(), {"C": [0.1, 1]}), False), + (Pipeline([("svr", SVR())]), False), + (Pipeline([("svr_cv", GridSearchCV(SVR(), {"C": [0.1, 1]}))]), False), + ], +) +def test_is_classifier(estimator, expected_result): + assert is_classifier(estimator) == expected_result + + +@pytest.mark.parametrize( + "estimator, expected_result", + [ + (SVR(), True), + (GridSearchCV(SVR(), {"C": [0.1, 1]}), True), + (Pipeline([("svr", SVR())]), True), + (Pipeline([("svr_cv", GridSearchCV(SVR(), {"C": [0.1, 1]}))]), True), + (SVC(), False), + (GridSearchCV(SVC(), {"C": [0.1, 1]}), False), + (Pipeline([("svc", SVC())]), False), + (Pipeline([("svc_cv", GridSearchCV(SVC(), {"C": [0.1, 1]}))]), False), + ], +) +def test_is_regressor(estimator, expected_result): + assert is_regressor(estimator) == expected_result + + +@pytest.mark.parametrize( + "estimator, expected_result", + [ + (KMeans(), True), + (GridSearchCV(KMeans(), {"n_clusters": [3, 8]}), True), + (Pipeline([("km", KMeans())]), True), + (Pipeline([("km_cv", GridSearchCV(KMeans(), {"n_clusters": [3, 8]}))]), True), + (SVC(), False), + (GridSearchCV(SVC(), {"C": [0.1, 1]}), False), + (Pipeline([("svc", SVC())]), False), + (Pipeline([("svc_cv", GridSearchCV(SVC(), {"C": [0.1, 1]}))]), False), + ], +) +def test_is_clusterer(estimator, expected_result): + assert is_clusterer(estimator) == expected_result + + +def test_set_params(): + # test nested estimator parameter setting + clf = Pipeline([("svc", SVC())]) + + # non-existing parameter in svc + with pytest.raises(ValueError): + clf.set_params(svc__stupid_param=True) + + # non-existing parameter of pipeline + with pytest.raises(ValueError): + clf.set_params(svm__stupid_param=True) + + # we don't currently catch if the things in pipeline are estimators + # bad_pipeline = Pipeline([("bad", NoEstimator())]) + # with pytest.raises(AttributeError): + # bad_pipeline.set_params(bad__stupid_param=True) + + +def test_set_params_passes_all_parameters(): + # Make sure all parameters are passed together to set_params + # of nested estimator. 
Regression test for #9944 + + class TestDecisionTree(DecisionTreeClassifier): + def set_params(self, **kwargs): + super().set_params(**kwargs) + # expected_kwargs is in test scope + assert kwargs == expected_kwargs + return self + + expected_kwargs = {"max_depth": 5, "min_samples_leaf": 2} + for est in [ + Pipeline([("estimator", TestDecisionTree())]), + GridSearchCV(TestDecisionTree(), {}), + ]: + est.set_params(estimator__max_depth=5, estimator__min_samples_leaf=2) + + +def test_set_params_updates_valid_params(): + # Check that set_params tries to set SVC().C, not + # DecisionTreeClassifier().C + gscv = GridSearchCV(DecisionTreeClassifier(), {}) + gscv.set_params(estimator=SVC(), estimator__C=42.0) + assert gscv.estimator.C == 42.0 + + +@pytest.mark.parametrize( + "tree,dataset", + [ + ( + DecisionTreeClassifier(max_depth=2, random_state=0), + datasets.make_classification(random_state=0), + ), + ( + DecisionTreeRegressor(max_depth=2, random_state=0), + datasets.make_regression(random_state=0), + ), + ], +) +def test_score_sample_weight(tree, dataset): + rng = np.random.RandomState(0) + # check that the score with and without sample weights are different + X, y = dataset + + tree.fit(X, y) + # generate random sample weights + sample_weight = rng.randint(1, 10, size=len(y)) + score_unweighted = tree.score(X, y) + score_weighted = tree.score(X, y, sample_weight=sample_weight) + msg = "Unweighted and weighted scores are unexpectedly equal" + assert score_unweighted != score_weighted, msg + + +def test_clone_pandas_dataframe(): + class DummyEstimator(TransformerMixin, BaseEstimator): + """This is a dummy class for generating numerical features + + This feature extractor extracts numerical features from pandas data + frame. + + Parameters + ---------- + + df: pandas data frame + The pandas data frame parameter. 
+ + Notes + ----- + """ + + def __init__(self, df=None, scalar_param=1): + self.df = df + self.scalar_param = scalar_param + + def fit(self, X, y=None): + pass + + def transform(self, X): + pass + + # build and clone estimator + d = np.arange(10) + df = MockDataFrame(d) + e = DummyEstimator(df, scalar_param=1) + cloned_e = clone(e) + + # the test + assert (e.df == cloned_e.df).values.all() + assert e.scalar_param == cloned_e.scalar_param + + +def test_clone_protocol(): + """Checks that clone works with `__sklearn_clone__` protocol.""" + + class FrozenEstimator(BaseEstimator): + def __init__(self, fitted_estimator): + self.fitted_estimator = fitted_estimator + + def __getattr__(self, name): + return getattr(self.fitted_estimator, name) + + def __sklearn_clone__(self): + return self + + def fit(self, *args, **kwargs): + return self + + def fit_transform(self, *args, **kwargs): + return self.fitted_estimator.transform(*args, **kwargs) + + X = np.array([[-1, -1], [-2, -1], [-3, -2]]) + pca = PCA().fit(X) + components = pca.components_ + + frozen_pca = FrozenEstimator(pca) + assert_allclose(frozen_pca.components_, components) + + # Calling PCA methods such as `get_feature_names_out` still works + assert_array_equal(frozen_pca.get_feature_names_out(), pca.get_feature_names_out()) + + # Fitting on a new data does not alter `components_` + X_new = np.asarray([[-1, 2], [3, 4], [1, 2]]) + frozen_pca.fit(X_new) + assert_allclose(frozen_pca.components_, components) + + # `fit_transform` does not alter state + frozen_pca.fit_transform(X_new) + assert_allclose(frozen_pca.components_, components) + + # Cloning estimator is a no-op + clone_frozen_pca = clone(frozen_pca) + assert clone_frozen_pca is frozen_pca + assert_allclose(clone_frozen_pca.components_, components) + + +def test_pickle_version_warning_is_not_raised_with_matching_version(): + iris = datasets.load_iris() + tree = DecisionTreeClassifier().fit(iris.data, iris.target) + tree_pickle = pickle.dumps(tree) + assert b"_sklearn_version" in tree_pickle + + with warnings.catch_warnings(): + warnings.simplefilter("error") + tree_restored = pickle.loads(tree_pickle) + + # test that we can predict with the restored decision tree classifier + score_of_original = tree.score(iris.data, iris.target) + score_of_restored = tree_restored.score(iris.data, iris.target) + assert score_of_original == score_of_restored + + +class TreeBadVersion(DecisionTreeClassifier): + def __getstate__(self): + return dict(self.__dict__.items(), _sklearn_version="something") + + +pickle_error_message = ( + "Trying to unpickle estimator {estimator} from " + "version {old_version} when using version " + "{current_version}. This might " + "lead to breaking code or invalid results. " + "Use at your own risk." 
+) + + +def test_pickle_version_warning_is_issued_upon_different_version(): + iris = datasets.load_iris() + tree = TreeBadVersion().fit(iris.data, iris.target) + tree_pickle_other = pickle.dumps(tree) + message = pickle_error_message.format( + estimator="TreeBadVersion", + old_version="something", + current_version=sklearn.__version__, + ) + with pytest.warns(UserWarning, match=message) as warning_record: + pickle.loads(tree_pickle_other) + + message = warning_record.list[0].message + assert isinstance(message, InconsistentVersionWarning) + assert message.estimator_name == "TreeBadVersion" + assert message.original_sklearn_version == "something" + assert message.current_sklearn_version == sklearn.__version__ + + +class TreeNoVersion(DecisionTreeClassifier): + def __getstate__(self): + return self.__dict__ + + +def test_pickle_version_warning_is_issued_when_no_version_info_in_pickle(): + iris = datasets.load_iris() + # TreeNoVersion has no getstate, like pre-0.18 + tree = TreeNoVersion().fit(iris.data, iris.target) + + tree_pickle_noversion = pickle.dumps(tree) + assert b"_sklearn_version" not in tree_pickle_noversion + message = pickle_error_message.format( + estimator="TreeNoVersion", + old_version="pre-0.18", + current_version=sklearn.__version__, + ) + # check we got the warning about using pre-0.18 pickle + with pytest.warns(UserWarning, match=message): + pickle.loads(tree_pickle_noversion) + + +def test_pickle_version_no_warning_is_issued_with_non_sklearn_estimator(): + iris = datasets.load_iris() + tree = TreeNoVersion().fit(iris.data, iris.target) + tree_pickle_noversion = pickle.dumps(tree) + try: + module_backup = TreeNoVersion.__module__ + TreeNoVersion.__module__ = "notsklearn" + + with warnings.catch_warnings(): + warnings.simplefilter("error") + + pickle.loads(tree_pickle_noversion) + finally: + TreeNoVersion.__module__ = module_backup + + +class DontPickleAttributeMixin: + def __getstate__(self): + data = self.__dict__.copy() + data["_attribute_not_pickled"] = None + return data + + def __setstate__(self, state): + state["_restored"] = True + self.__dict__.update(state) + + +class MultiInheritanceEstimator(DontPickleAttributeMixin, BaseEstimator): + def __init__(self, attribute_pickled=5): + self.attribute_pickled = attribute_pickled + self._attribute_not_pickled = None + + +def test_pickling_when_getstate_is_overwritten_by_mixin(): + estimator = MultiInheritanceEstimator() + estimator._attribute_not_pickled = "this attribute should not be pickled" + + serialized = pickle.dumps(estimator) + estimator_restored = pickle.loads(serialized) + assert estimator_restored.attribute_pickled == 5 + assert estimator_restored._attribute_not_pickled is None + assert estimator_restored._restored + + +def test_pickling_when_getstate_is_overwritten_by_mixin_outside_of_sklearn(): + try: + estimator = MultiInheritanceEstimator() + text = "this attribute should not be pickled" + estimator._attribute_not_pickled = text + old_mod = type(estimator).__module__ + type(estimator).__module__ = "notsklearn" + + serialized = estimator.__getstate__() + assert serialized == {"_attribute_not_pickled": None, "attribute_pickled": 5} + + serialized["attribute_pickled"] = 4 + estimator.__setstate__(serialized) + assert estimator.attribute_pickled == 4 + assert estimator._restored + finally: + type(estimator).__module__ = old_mod + + +class SingleInheritanceEstimator(BaseEstimator): + def __init__(self, attribute_pickled=5): + self.attribute_pickled = attribute_pickled + self._attribute_not_pickled = None + + 
def __getstate__(self): + state = super().__getstate__() + state["_attribute_not_pickled"] = None + return state + + +def test_pickling_works_when_getstate_is_overwritten_in_the_child_class(): + estimator = SingleInheritanceEstimator() + estimator._attribute_not_pickled = "this attribute should not be pickled" + + serialized = pickle.dumps(estimator) + estimator_restored = pickle.loads(serialized) + assert estimator_restored.attribute_pickled == 5 + assert estimator_restored._attribute_not_pickled is None + + +def test_tag_inheritance(): + # test that changing tags by inheritance is not allowed + + nan_tag_est = NaNTag() + no_nan_tag_est = NoNaNTag() + assert nan_tag_est.__sklearn_tags__().input_tags.allow_nan + assert not no_nan_tag_est.__sklearn_tags__().input_tags.allow_nan + + redefine_tags_est = OverrideTag() + assert not redefine_tags_est.__sklearn_tags__().input_tags.allow_nan + + diamond_tag_est = DiamondOverwriteTag() + assert diamond_tag_est.__sklearn_tags__().input_tags.allow_nan + + inherit_diamond_tag_est = InheritDiamondOverwriteTag() + assert inherit_diamond_tag_est.__sklearn_tags__().input_tags.allow_nan + + +def test_raises_on_get_params_non_attribute(): + class MyEstimator(BaseEstimator): + def __init__(self, param=5): + pass + + def fit(self, X, y=None): + return self + + est = MyEstimator() + msg = "'MyEstimator' object has no attribute 'param'" + + with pytest.raises(AttributeError, match=msg): + est.get_params() + + +def test_repr_mimebundle_(): + # Checks the display configuration flag controls the json output + tree = DecisionTreeClassifier() + output = tree._repr_mimebundle_() + assert "text/plain" in output + assert "text/html" in output + + with config_context(display="text"): + output = tree._repr_mimebundle_() + assert "text/plain" in output + assert "text/html" not in output + + +def test_repr_html_wraps(): + # Checks the display configuration flag controls the html output + tree = DecisionTreeClassifier() + + output = tree._repr_html_() + assert "" + f"" + f'
+            f"<pre>{html.escape(estimator_str)}</pre><b>{fallback_msg}</b>"
" + + out.write(html_end) + + html_output = out.getvalue() + return html_output diff --git a/.venv/lib/python3.12/site-packages/sklearn/utils/_repr_html/params.css b/.venv/lib/python3.12/site-packages/sklearn/utils/_repr_html/params.css new file mode 100644 index 0000000000000000000000000000000000000000..df815f966ffcfe3544b5c73a9c87c0f7f2d256f5 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/utils/_repr_html/params.css @@ -0,0 +1,63 @@ +.estimator-table summary { + padding: .5rem; + font-family: monospace; + cursor: pointer; +} + +.estimator-table details[open] { + padding-left: 0.1rem; + padding-right: 0.1rem; + padding-bottom: 0.3rem; +} + +.estimator-table .parameters-table { + margin-left: auto !important; + margin-right: auto !important; +} + +.estimator-table .parameters-table tr:nth-child(odd) { + background-color: #fff; +} + +.estimator-table .parameters-table tr:nth-child(even) { + background-color: #f6f6f6; +} + +.estimator-table .parameters-table tr:hover { + background-color: #e0e0e0; +} + +.estimator-table table td { + border: 1px solid rgba(106, 105, 104, 0.232); +} + +.user-set td { + color:rgb(255, 94, 0); + text-align: left; +} + +.user-set td.value pre { + color:rgb(255, 94, 0) !important; + background-color: transparent !important; +} + +.default td { + color: black; + text-align: left; +} + +.user-set td i, +.default td i { + color: black; +} + +.copy-paste-icon { + background-image: url(data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgMCA0NDggNTEyIj48IS0tIUZvbnQgQXdlc29tZSBGcmVlIDYuNy4yIGJ5IEBmb250YXdlc29tZSAtIGh0dHBzOi8vZm9udGF3ZXNvbWUuY29tIExpY2Vuc2UgLSBodHRwczovL2ZvbnRhd2Vzb21lLmNvbS9saWNlbnNlL2ZyZWUgQ29weXJpZ2h0IDIwMjUgRm9udGljb25zLCBJbmMuLS0+PHBhdGggZD0iTTIwOCAwTDMzMi4xIDBjMTIuNyAwIDI0LjkgNS4xIDMzLjkgMTQuMWw2Ny45IDY3LjljOSA5IDE0LjEgMjEuMiAxNC4xIDMzLjlMNDQ4IDMzNmMwIDI2LjUtMjEuNSA0OC00OCA0OGwtMTkyIDBjLTI2LjUgMC00OC0yMS41LTQ4LTQ4bDAtMjg4YzAtMjYuNSAyMS41LTQ4IDQ4LTQ4ek00OCAxMjhsODAgMCAwIDY0LTY0IDAgMCAyNTYgMTkyIDAgMC0zMiA2NCAwIDAgNDhjMCAyNi41LTIxLjUgNDgtNDggNDhMNDggNTEyYy0yNi41IDAtNDgtMjEuNS00OC00OEwwIDE3NmMwLTI2LjUgMjEuNS00OCA0OC00OHoiLz48L3N2Zz4=); + background-repeat: no-repeat; + background-size: 14px 14px; + background-position: 0; + display: inline-block; + width: 14px; + height: 14px; + cursor: pointer; +} diff --git a/.venv/lib/python3.12/site-packages/sklearn/utils/_repr_html/params.py b/.venv/lib/python3.12/site-packages/sklearn/utils/_repr_html/params.py new file mode 100644 index 0000000000000000000000000000000000000000..d85bf1280a8fcb16bd61895a96d9959d735fc5e0 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/utils/_repr_html/params.py @@ -0,0 +1,83 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import html +import reprlib +from collections import UserDict + +from sklearn.utils._repr_html.base import ReprHTMLMixin + + +def _read_params(name, value, non_default_params): + """Categorizes parameters as 'default' or 'user-set' and formats their values. + Escapes or truncates parameter values for display safety and readability. 
+ """ + r = reprlib.Repr() + r.maxlist = 2 # Show only first 2 items of lists + r.maxtuple = 1 # Show only first item of tuples + r.maxstring = 50 # Limit string length + cleaned_value = html.escape(r.repr(value)) + + param_type = "user-set" if name in non_default_params else "default" + + return {"param_type": param_type, "param_name": name, "param_value": cleaned_value} + + +def _params_html_repr(params): + """Generate HTML representation of estimator parameters. + + Creates an HTML table with parameter names and values, wrapped in a + collapsible details element. Parameters are styled differently based + on whether they are default or user-set values. + """ + HTML_TEMPLATE = """ +
        <div class="estimator-table">
+            <details>
+                <summary>Parameters</summary>
+                <table class="parameters-table">
+                  <tbody>
+                    {rows}
+                  </tbody>
+                </table>
+            </details>
+        </div>
+ """ + ROW_TEMPLATE = """ + + + {param_name}  + {param_value} + + """ + + rows = [ + ROW_TEMPLATE.format(**_read_params(name, value, params.non_default)) + for name, value in params.items() + ] + + return HTML_TEMPLATE.format(rows="\n".join(rows)) + + +class ParamsDict(ReprHTMLMixin, UserDict): + """Dictionary-like class to store and provide an HTML representation. + + It builds an HTML structure to be used with Jupyter notebooks or similar + environments. It allows storing metadata to track non-default parameters. + + Parameters + ---------- + params : dict, default=None + The original dictionary of parameters and their values. + + non_default : tuple + The list of non-default parameters. + """ + + _html_repr = _params_html_repr + + def __init__(self, params=None, non_default=tuple()): + super().__init__(params or {}) + self.non_default = non_default diff --git a/.venv/lib/python3.12/site-packages/sklearn/utils/_repr_html/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/utils/_repr_html/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/utils/_repr_html/tests/test_estimator.py b/.venv/lib/python3.12/site-packages/sklearn/utils/_repr_html/tests/test_estimator.py new file mode 100644 index 0000000000000000000000000000000000000000..cc975d854ed8f416d3f48e16dfd0fbaa10a9352a --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/utils/_repr_html/tests/test_estimator.py @@ -0,0 +1,616 @@ +import html +import locale +import re +import types +from contextlib import closing +from functools import partial +from io import StringIO +from unittest.mock import patch + +import numpy as np +import pytest + +from sklearn import config_context +from sklearn.base import BaseEstimator +from sklearn.cluster import AgglomerativeClustering, Birch +from sklearn.compose import ColumnTransformer, make_column_transformer +from sklearn.datasets import load_iris +from sklearn.decomposition import PCA, TruncatedSVD +from sklearn.ensemble import StackingClassifier, StackingRegressor, VotingClassifier +from sklearn.feature_selection import SelectPercentile +from sklearn.gaussian_process.kernels import ExpSineSquared +from sklearn.impute import SimpleImputer +from sklearn.kernel_ridge import KernelRidge +from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import RandomizedSearchCV +from sklearn.multiclass import OneVsOneClassifier +from sklearn.neural_network import MLPClassifier +from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline +from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler +from sklearn.svm import LinearSVC, LinearSVR +from sklearn.tree import DecisionTreeClassifier +from sklearn.utils._repr_html.base import _HTMLDocumentationLinkMixin +from sklearn.utils._repr_html.estimator import ( + _get_css_style, + _get_visual_block, + _write_label_html, + estimator_html_repr, +) +from sklearn.utils.fixes import parse_version + + +def dummy_function(x, y): + return x + y # pragma: nocover + + +@pytest.mark.parametrize("checked", [True, False]) +def test_write_label_html(checked): + # Test checking logic and labeling + name = "LogisticRegression" + params = "" + tool_tip = "hello-world" + + with closing(StringIO()) as out: + _write_label_html(out, params, name, tool_tip, checked=checked) + html_label = out.getvalue() + + p = ( + r'